justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- justhtml/__init__.py +6 -0
- justhtml/__main__.py +49 -16
- justhtml/entities.py +45 -7
- justhtml/errors.py +9 -0
- justhtml/node.py +358 -89
- justhtml/parser.py +70 -14
- justhtml/sanitize.py +763 -0
- justhtml/selector.py +114 -18
- justhtml/serialize.py +332 -28
- justhtml/tokenizer.py +249 -179
- justhtml/tokens.py +8 -3
- justhtml/treebuilder.py +50 -14
- justhtml/treebuilder_modes.py +100 -36
- justhtml-0.24.0.dist-info/METADATA +192 -0
- justhtml-0.24.0.dist-info/RECORD +24 -0
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.12.0.dist-info/METADATA +0 -164
- justhtml-0.12.0.dist-info/RECORD +0 -23
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/WHEEL +0 -0
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/entry_points.txt +0 -0
justhtml/tokenizer.py
CHANGED
@@ -11,7 +11,7 @@ from .entities import decode_entities_in_text
 from .errors import generate_error_message
 from .tokens import CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag

-_ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\
+_ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\0"
 _ASCII_LOWER_TABLE = str.maketrans({chr(code): chr(code + 32) for code in range(65, 91)})
 _RCDATA_ELEMENTS = {"title", "textarea"}
 _RAWTEXT_SWITCH_TAGS = {
@@ -29,8 +29,8 @@ _ATTR_VALUE_DOUBLE_PATTERN = re.compile(r'["&\0]')
 _ATTR_VALUE_SINGLE_PATTERN = re.compile(r"['&\0]")
 _ATTR_VALUE_UNQUOTED_PATTERN = re.compile(f"[{re.escape(_ATTR_VALUE_UNQUOTED_TERMINATORS)}]")

-_TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0
-_ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'
+_TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0]+")
+_ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'<]+")
 _COMMENT_RUN_PATTERN = re.compile(r"[^-\0]+")
 _WHITESPACE_PATTERN = re.compile(r"[ \t\n\f]+")

@@ -44,6 +44,13 @@ for _plane in range(17):
 _XML_COERCION_PATTERN = re.compile(r"[\f\uFDD0-\uFDEF" + "".join(_xml_invalid_single_chars) + "]")


+def _is_noncharacter_codepoint(codepoint: int) -> bool:
+    if 0xFDD0 <= codepoint <= 0xFDEF:
+        return True
+    last = codepoint & 0xFFFF
+    return last == 0xFFFE or last == 0xFFFF
+
+
 def _xml_coercion_callback(match: re.Match[str]) -> str:
     if match.group(0) == "\f":
         return " "
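Note: the new _is_noncharacter_codepoint helper encodes the Unicode definition of noncharacters, U+FDD0..U+FDEF plus the last two code points of every plane (code points ending in FFFE or FFFF). A standalone sketch of the same check, for illustration only (the function name here is hypothetical, not part of the package):

    def is_noncharacter(codepoint: int) -> bool:
        if 0xFDD0 <= codepoint <= 0xFDEF:
            return True
        return (codepoint & 0xFFFF) in (0xFFFE, 0xFFFF)

    assert is_noncharacter(0xFDD0)
    assert is_noncharacter(0xFFFE)
    assert is_noncharacter(0x10FFFF)    # last code point of plane 16
    assert not is_noncharacter(0x0041)  # "A"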
@@ -178,11 +185,12 @@ class Tokenizer:
         "current_tag_kind",
         "current_tag_name",
         "current_tag_self_closing",
+        "current_token_start_pos",
         "errors",
-        "ignore_lf",
         "last_start_tag_name",
         "last_token_column",
         "last_token_line",
+        "last_token_start_pos",
         "length",
         "opts",
         "original_tag_name",
@@ -194,6 +202,7 @@ class Tokenizer:
         "temp_buffer",
         "text_buffer",
         "text_start_pos",
+        "track_node_locations",
     )

     _comment_token: CommentToken
@@ -202,6 +211,7 @@ class Tokenizer:
     _tag_token: Tag
     buffer: str
     collect_errors: bool
+    track_node_locations: bool
     current_attr_name: list[str]
     current_attr_value: list[str]
     current_attr_value_has_amp: bool
@@ -215,11 +225,12 @@ class Tokenizer:
     current_tag_kind: int
     current_tag_name: list[str]
     current_tag_self_closing: bool
+    current_token_start_pos: int
     errors: list[ParseError]
-    ignore_lf: bool
     last_start_tag_name: str | None
     last_token_column: int
     last_token_line: int
+    last_token_start_pos: int | None
     length: int
     opts: TokenizerOpts
     original_tag_name: list[str]
@@ -234,10 +245,18 @@ class Tokenizer:

     # _STATE_HANDLERS is defined at the end of the file

-    def __init__(
+    def __init__(
+        self,
+        sink: Any,
+        opts: TokenizerOpts | None = None,
+        *,
+        collect_errors: bool = False,
+        track_node_locations: bool = False,
+    ) -> None:
         self.sink = sink
         self.opts = opts or TokenizerOpts()
         self.collect_errors = collect_errors
+        self.track_node_locations = bool(track_node_locations)
         self.errors = []

         self.state = self.DATA
@@ -246,9 +265,10 @@ class Tokenizer:
         self.pos = 0
         self.reconsume = False
         self.current_char = ""
-        self.ignore_lf = False
         self.last_token_line = 1
         self.last_token_column = 0
+        self.current_token_start_pos = 0
+        self.last_token_start_pos = None

         # Reusable buffers to avoid per-token allocations.
         self.text_buffer = []
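Note: the constructor now takes keyword-only collect_errors and track_node_locations flags. A minimal sketch of constructing the tokenizer with them; the sink protocol (process_token / process_characters) is inferred from calls visible elsewhere in this diff, and PrintSink itself is hypothetical:

    from justhtml.tokenizer import Tokenizer, TokenizerOpts  # import path assumed from this diff

    class PrintSink:
        # Method names taken from the sink calls shown in this diff.
        def process_token(self, token):
            print("token:", token)

        def process_characters(self, data):
            print("text:", data)

    tok = Tokenizer(PrintSink(), TokenizerOpts(), collect_errors=True, track_node_locations=True)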
@@ -276,14 +296,20 @@ class Tokenizer:
         if html and html[0] == "\ufeff" and self.opts.discard_bom:
             html = html[1:]

+        # Normalize newlines per §13.2.2.5
+        if html:
+            if "\r" in html:
+                html = html.replace("\r\n", "\n").replace("\r", "\n")
+
         self.buffer = html or ""
         self.length = len(self.buffer)
         self.pos = 0
         self.reconsume = False
         self.current_char = ""
-        self.ignore_lf = False
         self.last_token_line = 1
         self.last_token_column = 0
+        self.current_token_start_pos = 0
+        self.last_token_start_pos = None
         self.errors = []
         self.text_buffer.clear()
         self.text_start_pos = 0
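Note: the up-front CRLF normalization replaces the old per-chunk ignore_lf bookkeeping (removed throughout this diff) with a single pass at feed time, matching the input-stream preprocessing in HTML §13.2.2.5. The transform in isolation, with an illustrative helper name:

    def normalize_newlines(html: str) -> str:
        # Same two-step replace as above: CRLF pairs first, then lone CRs.
        if "\r" in html:
            html = html.replace("\r\n", "\n").replace("\r", "\n")
        return html

    assert normalize_newlines("a\r\nb\rc\n") == "a\nb\nc\n"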
@@ -313,8 +339,9 @@ class Tokenizer:
         else:
             self.state = self.DATA

-        # Pre-compute newline positions for O(log n) line lookups
-        if self.collect_errors:
+        # Pre-compute newline positions for O(log n) line lookups.
+        # Only do this when errors are collected or when node locations are requested.
+        if self.collect_errors or self.track_node_locations:
             self._newline_positions = []
             pos = -1
             buffer = self.buffer
@@ -334,6 +361,34 @@ class Tokenizer:
                 return 1
         return bisect_right(newline_positions, pos - 1) + 1

+    def location_at_pos(self, pos: int) -> tuple[int, int]:
+        """Return (line, column) for a 0-indexed offset in the current buffer.
+
+        Column is 1-indexed. Newline positions are computed lazily when needed.
+        """
+        newline_positions = self._newline_positions
+        if newline_positions is None:
+            newline_positions = []
+            scan = -1
+            buffer = self.buffer
+            while True:
+                scan = buffer.find("\n", scan + 1)
+                if scan == -1:
+                    break
+                newline_positions.append(scan)
+            self._newline_positions = newline_positions
+
+        line_index = bisect_right(newline_positions, pos - 1)
+        line = line_index + 1
+
+        # Compute column using newline index rather than rfind() to avoid O(n) scans.
+        if line_index == 0:
+            last_newline = -1
+        else:
+            last_newline = newline_positions[line_index - 1]
+        column = pos - last_newline
+        return line, column
+
     def step(self) -> bool:
         """Run one step of the tokenizer state machine. Returns True if EOF reached."""
         handler = self._STATE_HANDLERS[self.state]  # type: ignore[attr-defined]
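Note: location_at_pos resolves an offset to (line, column) with one bisect_right over the cached newline offsets instead of rescanning the buffer. The same arithmetic as a self-contained sketch (the helper name is illustrative):

    from bisect import bisect_right

    def location_at(text: str, pos: int) -> tuple[int, int]:
        newlines = [i for i, ch in enumerate(text) if ch == "\n"]  # cached once in the real code
        line_index = bisect_right(newlines, pos - 1)
        last_newline = newlines[line_index - 1] if line_index else -1
        return line_index + 1, pos - last_newline  # 1-indexed line and column

    assert location_at("ab\ncd", 0) == (1, 1)
    assert location_at("ab\ncd", 3) == (2, 1)  # the "c"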
@@ -356,9 +411,8 @@ class Tokenizer:
             return self.buffer[peek_pos]
         return None

-    def _append_text_chunk(self, chunk: str
+    def _append_text_chunk(self, chunk: str) -> None:
         self._append_text(chunk)
-        self.ignore_lf = ends_with_cr

     # ---------------------
     # State handlers
@@ -392,12 +446,12 @@ class Tokenizer:

             if end > pos:
                 chunk = buffer[pos:end]
-
-
-
-
+                if self.collect_errors and not chunk.isascii():
+                    base_pos = pos
+                    for offset, ch in enumerate(chunk):
+                        if _is_noncharacter_codepoint(ord(ch)):
+                            self._emit_error_at_pos("noncharacter-in-input-stream", base_pos + offset)
                 self._append_text(chunk)
-                self.ignore_lf = chunk.endswith("\r")

             pos = end
             self.pos = pos
@@ -410,8 +464,8 @@ class Tokenizer:
             pos += 1
             self.pos = pos
             self.current_char = c
-            self.ignore_lf = False
             # c is always '<' here due to find() optimization above
+            self.current_token_start_pos = pos - 1
             # Optimization: Peek ahead for common tag starts
             if pos < length:
                 nc = buffer[pos]
@@ -518,15 +572,15 @@ class Tokenizer:
         append_tag_char = self.current_tag_name.append
         buffer = self.buffer
         length = self.length
+        pos = self.pos

         while True:
             # Inline _consume_tag_name_run
-            # Note: reconsume
-            pos = self.pos
+            # Note: reconsume is never True when entering TAG_NAME
             if pos < length:
                 # Optimization: Check for common terminators before regex
                 match = None
-                if buffer[pos] not in "\t\n\f />\0
+                if buffer[pos] not in "\t\n\f />\0":
                     match = _TAG_NAME_RUN_PATTERN.match(buffer, pos)

             if match:
@@ -534,46 +588,59 @@ class Tokenizer:
                 if not chunk.islower():
                     chunk = chunk.translate(_ASCII_LOWER_TABLE)
                 append_tag_char(chunk)
-
-
-                if
-
-                if
-
-
-                    self.ignore_lf = True
+                pos = match.end()
+
+                if pos < length:
+                    next_char = buffer[pos]
+                    if next_char in (" ", "\t", "\n", "\f"):
+                        pos += 1
+                        self.pos = pos
                         self.state = self.BEFORE_ATTRIBUTE_NAME
                         return self._state_before_attribute_name()
-                if
-
+                    if next_char == ">":
+                        pos += 1
+                        self.pos = pos
                         if not self._emit_current_tag():
                             self.state = self.DATA
                         return False
-                if
-
+                    if next_char == "/":
+                        pos += 1
+                        self.pos = pos
                         self.state = self.SELF_CLOSING_START_TAG
                         return self._state_self_closing_start_tag()

-
+            # Inline _get_char
+            # Note: reconsume is never True in this state.
+            if pos >= length:
+                c: str | None = None
+            else:
+                c = buffer[pos]
+                pos += 1
+            self.current_char = c
             if c is None:
+                self.pos = pos
                 self._emit_error("eof-in-tag")
                 # Per HTML5 spec: EOF in tag name is a parse error, emit EOF token only
                 # The incomplete tag is discarded (not emitted as text)
                 self._emit_token(EOFToken())
                 return True
             if c in ("\t", "\n", "\f", " "):
+                self.pos = pos
                 self.state = self.BEFORE_ATTRIBUTE_NAME
                 return self._state_before_attribute_name()
             if c == "/":
+                self.pos = pos
                 self.state = self.SELF_CLOSING_START_TAG
                 return self._state_self_closing_start_tag()
             if c == ">":
                 # In slow path, tag name is only first char (from DATA),
                 # so no rawtext elements possible - always set DATA state
+                self.pos = pos
                 self._emit_current_tag()
                 self.state = self.DATA
                 return False
             # c == "\0" - the only remaining possibility after fast-path
+            self.pos = pos
             self._emit_error("unexpected-null-character")
             append_tag_char(replacement)

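Note: the rewritten tag-name state is a run-based scanner: a cheap single-character membership test guards a regex match(buffer, pos) that consumes a whole run of name characters in one step, and pos is carried in a local instead of round-tripping through self.pos. The core trick in isolation (variable names here are illustrative):

    import re

    TAG_NAME_RUN = re.compile(r"[^\t\n\f />\0]+")  # same character class as _TAG_NAME_RUN_PATTERN

    buffer, pos = "<div class=x>", 1
    if buffer[pos] not in "\t\n\f />\0":        # cheap pre-check before the regex
        m = TAG_NAME_RUN.match(buffer, pos)     # consume the whole name run at once
        if m:
            print(m.group(0), m.end())          # prints: div 4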
@@ -583,7 +650,7 @@ class Tokenizer:

         while True:
             # Optimization: Skip whitespace
-            if not self.reconsume
+            if not self.reconsume:
                 if self.pos < length:
                     # Check if current char is whitespace before running regex
                     if buffer[self.pos] in " \t\n\f":
@@ -603,21 +670,7 @@ class Tokenizer:

             self.current_char = c

-            if c == " ":
-                self.ignore_lf = False
-                continue
-            if c == "\n":
-                if self.ignore_lf:
-                    self.ignore_lf = False
-                # Line tracking now computed on-demand via _get_line_at_pos()
-                continue
-            if c == "\t" or c == "\f":
-                self.ignore_lf = False
-                continue
-            if c == "\r":
-                self.ignore_lf = False
-                if self.pos < length and buffer[self.pos] == "\n":
-                    self.pos += 1
+            if c in (" ", "\n", "\t", "\f"):
                 continue

             if c is None:
@@ -661,50 +714,54 @@ class Tokenizer:
         append_attr_char = self.current_attr_name.append
         buffer = self.buffer
         length = self.length
+        pos = self.pos

         while True:
             # Inline _consume_attribute_name_run
-
-
-
-
-
-
-            match = _ATTR_NAME_RUN_PATTERN.match(buffer, pos)
-
-            if match:
-                chunk = match.group(0)
-                if not chunk.islower():
-                    chunk = chunk.translate(_ASCII_LOWER_TABLE)
-                append_attr_char(chunk)
-                self.pos = match.end()
+            # Note: reconsume is never True in this state.
+            if pos < length:
+                # Optimization: Check for common terminators before regex
+                match = None
+                if buffer[pos] not in "\t\n\f />=\0\"'<":
+                    match = _ATTR_NAME_RUN_PATTERN.match(buffer, pos)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if match:
+                chunk = match.group(0)
+                if not chunk.islower():
+                    chunk = chunk.translate(_ASCII_LOWER_TABLE)
+                append_attr_char(chunk)
+                pos = match.end()
+
+                if pos < length:
+                    c = buffer[pos]
+                    if c == "=":
+                        pos += 1
+                        self.pos = pos
+                        self.state = self.BEFORE_ATTRIBUTE_VALUE
+                        return self._state_before_attribute_value()
+                    if c in (" ", "\t", "\n", "\f"):
+                        pos += 1
+                        self.pos = pos
+                        self._finish_attribute()
+                        self.state = self.AFTER_ATTRIBUTE_NAME
+                        return False  # Let main loop dispatch to avoid recursion
+                    if c == ">":
+                        pos += 1
+                        self.pos = pos
+                        self._finish_attribute()
+                        if not self._emit_current_tag():
+                            self.state = self.DATA
+                        return False
+                    if c == "/":
+                        pos += 1
+                        self.pos = pos
+                        self._finish_attribute()
+                        self.state = self.SELF_CLOSING_START_TAG
+                        return self._state_self_closing_start_tag()

+            self.pos = pos
             c = self._get_char()  # type: ignore[assignment]
+            pos = self.pos
             if c is None:
                 self._emit_error("eof-in-tag")
                 self._flush_text()
@@ -730,8 +787,7 @@ class Tokenizer:
                 self._emit_error("unexpected-null-character")
                 append_attr_char(replacement)
                 continue
-
-            self._emit_error("unexpected-character-in-attribute-name")
+            self._emit_error("unexpected-character-in-attribute-name")
             append_attr_char(c)

     def _state_after_attribute_name(self) -> bool:
@@ -740,7 +796,7 @@ class Tokenizer:

         while True:
             # Optimization: Skip whitespace
-            if not self.reconsume
+            if not self.reconsume:
                 if self.pos < length:
                     match = _WHITESPACE_PATTERN.match(buffer, self.pos)
                     if match:
@@ -755,23 +811,9 @@ class Tokenizer:

             self.current_char = c

-            if c == " ":
-                self.ignore_lf = False
-                continue
-            if c == "\n":
-                # Note: Only reachable when ignore_lf=True (CR-LF handling)
-                # Standalone \n is caught by whitespace optimization
-                self.ignore_lf = False
-                continue
-            if c == "\r":
-                self.ignore_lf = True
-                continue
-            if c == "\t" or c == "\f":
-                self.ignore_lf = False
+            if c in (" ", "\n", "\t", "\f"):
                 continue

-            self.ignore_lf = False
-
             if c is None:
                 self._emit_error("eof-in-tag")
                 self._flush_text()
@@ -857,10 +899,6 @@ class Tokenizer:
             if end != next_quote:
                 chunk = buffer[pos:end]

-                # Normalize chunk for value if needed
-                if "\r" in chunk:
-                    chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
-
                 self.current_attr_value.append(chunk)
                 self.pos = end

@@ -916,10 +954,6 @@ class Tokenizer:
             if end != next_quote:
                 chunk = buffer[pos:end]

-                # Normalize chunk for value if needed
-                if "\r" in chunk:
-                    chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
-
                 self.current_attr_value.append(chunk)
                 self.pos = end

@@ -965,7 +999,17 @@ class Tokenizer:
             self.current_attr_value.append(buffer[pos:end])
             self.pos = end

-
+            # Inline _get_char
+            if self.reconsume:
+                self.reconsume = False
+                c = self.current_char
+            elif self.pos >= length:
+                c = None
+            else:
+                c = buffer[self.pos]
+                self.pos += 1
+            self.current_char = c
+
             if c is None:
                 # Per HTML5 spec: EOF in attribute value is a parse error
                 # The incomplete tag is discarded (not emitted)
@@ -995,7 +1039,14 @@ class Tokenizer:

     def _state_after_attribute_value_quoted(self) -> bool:
         """After attribute value (quoted) state per HTML5 spec §13.2.5.42"""
-
+        # Inline _get_char
+        if self.pos >= self.length:
+            c: str | None = None
+        else:
+            c = self.buffer[self.pos]
+            self.pos += 1
+        self.current_char = c
+
         if c is None:
             self._emit_error("eof-in-tag")
             self._flush_text()
@@ -1125,7 +1176,14 @@ class Tokenizer:
         while True:
             if self._consume_comment_run():
                 continue
-
+            # Inline _get_char
+            if self.pos >= self.length:
+                c: str | None = None
+            else:
+                c = self.buffer[self.pos]
+                self.pos += 1
+            self.current_char = c
+
             if c is None:
                 self._emit_error("eof-in-comment")
                 self._emit_comment()
@@ -1264,7 +1322,7 @@ class Tokenizer:
         while True:
             c = self._get_char()
             if c is None:
-                self._emit_error("eof-in-doctype
+                self._emit_error("eof-in-doctype")
                 self.current_doctype_force_quirks = True
                 self._emit_doctype()
                 self._emit_token(EOFToken())
@@ -1291,7 +1349,7 @@ class Tokenizer:
         while True:
             c = self._get_char()
             if c is None:
-                self._emit_error("eof-in-doctype
+                self._emit_error("eof-in-doctype")
                 self.current_doctype_force_quirks = True
                 self._emit_doctype()
                 self._emit_token(EOFToken())
@@ -1675,36 +1733,19 @@ class Tokenizer:
             self.reconsume = False
             return self.current_char

-        buffer = self.buffer
         pos = self.pos
-
-
-
-            self.pos = pos
-            self.current_char = None
-            return None
-
-        c = buffer[pos]
-        pos += 1
+        if pos >= self.length:
+            self.current_char = None
+            return None

-
-
-
-
-
-
-
-
-                self.ignore_lf = False
-                continue
-            # Line tracking now computed on-demand via _get_line_at_pos()
-
-            else:
-                self.ignore_lf = False
-
-        self.current_char = c
-        self.pos = pos
-        return c
+        c = self.buffer[pos]
+        self.pos = pos + 1
+        self.current_char = c
+        if c == "<":
+            self.current_token_start_pos = pos
+        if self.collect_errors and not c.isascii() and _is_noncharacter_codepoint(ord(c)):
+            self._emit_error_at_pos("noncharacter-in-input-stream", pos)
+        return c

     def _reconsume_current(self) -> None:
         self.reconsume = True
@@ -1731,10 +1772,37 @@ class Tokenizer:
         raw_len = len(data)

         self.text_buffer.clear()
-
-
-
-
+        # U+0000 NULL is a parse error in text.
+        # Emit one error per NULL at the *actual* character position.
+        if "\0" in data:
+            base_pos = self.text_start_pos
+            search_from = 0
+            while True:
+                idx = data.find("\0", search_from)
+                if idx == -1:
+                    break
+                error_pos = base_pos + idx
+
+                # Compute column at error_pos (1-indexed).
+                last_newline = self.buffer.rfind("\n", 0, error_pos + 1)
+                if last_newline == -1:
+                    column = error_pos + 1
+                else:
+                    column = error_pos - last_newline
+                line = self._get_line_at_pos(error_pos)
+
+                message = generate_error_message("unexpected-null-character")
+                self.errors.append(
+                    ParseError(
+                        "unexpected-null-character",
+                        line=line,
+                        column=column,
+                        message=message,
+                        source_html=self.buffer,
+                    )
+                )
+
+                search_from = idx + 1

         # Per HTML5 spec:
         # - RCDATA state (title, textarea): decode character references
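Note: each NULL in text is reported at its actual offset; the column comes from rfind("\n", 0, error_pos + 1), i.e. the distance to the preceding newline, or pos + 1 on the first line. The same arithmetic in isolation (the helper name is illustrative):

    def column_at(text: str, pos: int) -> int:
        # 1-indexed column of text[pos], mirroring the rfind arithmetic above.
        last_newline = text.rfind("\n", 0, pos + 1)
        return pos + 1 if last_newline == -1 else pos - last_newline

    assert column_at("ab\0", 2) == 3     # first line
    assert column_at("ab\nc\0", 4) == 2  # after the newline at index 2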
@@ -1747,13 +1815,16 @@ class Tokenizer:
             pass
         else:
             if "&" in data:
-
+                report_error = self._emit_error if self.collect_errors else None
+                data = decode_entities_in_text(data, report_error=report_error)
         # Apply XML coercion if enabled
         if self.opts.xml_coercion:
             data = _coerce_text_for_xml(data)

         # Record position at END of raw text (1-indexed column = raw_len)
-        self._record_text_end_position(raw_len)
+        if self.collect_errors:
+            self._record_text_end_position(raw_len)
+        self.last_token_start_pos = self.text_start_pos
         self.sink.process_characters(data)
         # Note: process_characters never returns Plaintext or RawData
         # State switches happen via _emit_current_tag instead
@@ -1785,7 +1856,8 @@ class Tokenizer:
         else:
             value = "".join(attr_value_buffer)
             if self.current_attr_value_has_amp:
-
+                report_error = self._emit_error if self.collect_errors else None
+                value = decode_entities_in_text(value, in_attribute=True, report_error=report_error)
         attrs[name] = value
         attr_value_buffer.clear()
         self.current_attr_value_has_amp = False
@@ -1806,6 +1878,8 @@ class Tokenizer:
         tag.name = name
         tag.attrs = attrs
         tag.self_closing = self.current_tag_self_closing
+        tag.start_pos = self.current_token_start_pos
+        self.last_token_start_pos = tag.start_pos

         switched_to_rawtext = False
         if self.current_tag_kind == Tag.START:
@@ -1831,7 +1905,8 @@ class Tokenizer:
         # Remember current state before emitting

         # Emit token to sink
-        self._record_token_position()
+        if self.collect_errors:
+            self._record_token_position()
         result = self.sink.process_token(tag)
         if result == 1:  # TokenSinkResult.Plaintext
             self.state = self.PLAINTEXT
@@ -1851,6 +1926,8 @@ class Tokenizer:
         if self.opts.xml_coercion:
             data = _coerce_comment_for_xml(data)
         self._comment_token.data = data
+        self._comment_token.start_pos = self.current_token_start_pos
+        self.last_token_start_pos = self._comment_token.start_pos
         self._emit_token(self._comment_token)

     def _emit_doctype(self) -> None:
@@ -1871,7 +1948,8 @@ class Tokenizer:
         self._emit_token(DoctypeToken(doctype))

     def _emit_token(self, token: Any) -> None:
-        self._record_token_position()
+        if self.collect_errors:
+            self._record_token_position()
         self.sink.process_token(token)
         # Note: process_token never returns Plaintext or RawData for state switches
         # State switches happen via _emit_current_tag checking sink response
@@ -1881,8 +1959,6 @@ class Tokenizer:

         Per the spec, the position should be at the end of the token (after the last char).
         """
-        if not self.collect_errors:
-            return
         # pos points after the last consumed character, which is exactly what we want
         pos = self.pos
         last_newline = self.buffer.rfind("\n", 0, pos)
@@ -1899,8 +1975,6 @@ class Tokenizer:
         Uses text_start_pos + raw_len to compute where text ends, matching html5lib's
         behavior of reporting the column of the last character (1-indexed).
         """
-        if not self.collect_errors:
-            return
         # Position of last character of text (0-indexed)
         end_pos = self.text_start_pos + raw_len
         last_newline = self.buffer.rfind("\n", 0, end_pos)
@@ -1926,6 +2000,17 @@ class Tokenizer:
         line = self._get_line_at_pos(self.pos)
         self.errors.append(ParseError(code, line=line, column=column, message=message, source_html=self.buffer))

+    def _emit_error_at_pos(self, code: str, pos: int) -> None:
+        last_newline = self.buffer.rfind("\n", 0, pos + 1)
+        if last_newline == -1:
+            column = pos + 1
+        else:
+            column = pos - last_newline
+
+        message = generate_error_message(code)
+        line = self._get_line_at_pos(pos)
+        self.errors.append(ParseError(code, line=line, column=column, message=message, source_html=self.buffer))
+
     def _consume_if(self, literal: str) -> bool:
         end = self.pos + len(literal)
         if end > self.length:
@@ -1953,21 +2038,9 @@ class Tokenizer:
         if pos >= length:
             return False

-        # Handle ignore_lf for CRLF sequences
-        if self.ignore_lf and pos < length and self.buffer[pos] == "\n":
-            self.ignore_lf = False
-            pos += 1
-            self.pos = pos
-            if pos >= length:
-                return False
-
         match = _COMMENT_RUN_PATTERN.match(self.buffer, pos)
         if match:
             chunk = match.group(0)
-            # Handle CRLF normalization for comments
-            if "\r" in chunk:
-                chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
-            self.ignore_lf = chunk.endswith("\r")
             self.current_comment.append(chunk)
             self.pos = match.end()
             return True
@@ -2061,7 +2134,7 @@ class Tokenizer:
             # Consume everything up to the special character
             if next_special > pos:
                 chunk = buffer[pos:next_special]
-                self._append_text_chunk(chunk
+                self._append_text_chunk(chunk)
                 pos = next_special
                 self.pos = pos
@@ -2073,7 +2146,6 @@ class Tokenizer:

             # Handle special characters - we're at one of them after find()
             if null_index == pos:
-                self.ignore_lf = False
                 self._emit_error("unexpected-null-character")
                 self._append_text("\ufffd")
                 pos += 1
@@ -2188,9 +2260,7 @@ class Tokenizer:
             if null_index != -1 and null_index < next_special:
                 if null_index > pos:
                     chunk = buffer[pos:null_index]
-                    self._append_text_chunk(chunk
-                else:
-                    self.ignore_lf = False
+                    self._append_text_chunk(chunk)
                 self._emit_error("unexpected-null-character")
                 self._append_text("\ufffd")
                 pos = null_index + 1
@@ -2199,14 +2269,14 @@ class Tokenizer:
             if lt_index == -1:
                 if pos < length:
                     chunk = buffer[pos:length]
-                    self._append_text_chunk(chunk
+                    self._append_text_chunk(chunk)
                 self.pos = length
                 self._flush_text()
                 self._emit_token(EOFToken())
                 return True
             if lt_index > pos:
                 chunk = buffer[pos:lt_index]
-                self._append_text_chunk(chunk
+                self._append_text_chunk(chunk)
                 pos = lt_index + 1
                 self.pos = pos
                 # Handle script escaped transition before treating '<' as markup boundary