justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl
- justhtml/__init__.py +48 -0
- justhtml/__main__.py +86 -17
- justhtml/constants.py +12 -0
- justhtml/entities.py +45 -7
- justhtml/errors.py +17 -3
- justhtml/linkify.py +438 -0
- justhtml/node.py +385 -97
- justhtml/parser.py +139 -16
- justhtml/sanitize.py +992 -0
- justhtml/selector.py +117 -19
- justhtml/serialize.py +671 -41
- justhtml/tokenizer.py +364 -194
- justhtml/tokens.py +28 -5
- justhtml/transforms.py +2568 -0
- justhtml/treebuilder.py +297 -204
- justhtml/treebuilder_modes.py +208 -138
- justhtml-0.38.0.dist-info/METADATA +213 -0
- justhtml-0.38.0.dist-info/RECORD +26 -0
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.12.0.dist-info/METADATA +0 -164
- justhtml-0.12.0.dist-info/RECORD +0 -23
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0
justhtml/tokenizer.py
CHANGED
@@ -9,9 +9,9 @@ if TYPE_CHECKING:
 
 from .entities import decode_entities_in_text
 from .errors import generate_error_message
-from .tokens import CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag
+from .tokens import AnyToken, CharacterTokens, CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag
 
-_ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\…
+_ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\0"
 _ASCII_LOWER_TABLE = str.maketrans({chr(code): chr(code + 32) for code in range(65, 91)})
 _RCDATA_ELEMENTS = {"title", "textarea"}
 _RAWTEXT_SWITCH_TAGS = {
@@ -29,8 +29,8 @@ _ATTR_VALUE_DOUBLE_PATTERN = re.compile(r'["&\0]')
 _ATTR_VALUE_SINGLE_PATTERN = re.compile(r"['&\0]")
 _ATTR_VALUE_UNQUOTED_PATTERN = re.compile(f"[{re.escape(_ATTR_VALUE_UNQUOTED_TERMINATORS)}]")
 
-_TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0…
-_ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'…
+_TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0]+")
+_ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'<]+")
 _COMMENT_RUN_PATTERN = re.compile(r"[^-\0]+")
 _WHITESPACE_PATTERN = re.compile(r"[ \t\n\f]+")
 
@@ -44,6 +44,13 @@ for _plane in range(17):
 _XML_COERCION_PATTERN = re.compile(r"[\f\uFDD0-\uFDEF" + "".join(_xml_invalid_single_chars) + "]")
 
 
+def _is_noncharacter_codepoint(codepoint: int) -> bool:
+    if 0xFDD0 <= codepoint <= 0xFDEF:
+        return True
+    last = codepoint & 0xFFFF
+    return last == 0xFFFE or last == 0xFFFF
+
+
 def _xml_coercion_callback(match: re.Match[str]) -> str:
     if match.group(0) == "\f":
         return " "
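
Note: the new predicate mirrors Unicode's definition of noncharacters: U+FDD0..U+FDEF plus the last two codepoints (U+xxFFFE, U+xxFFFF) of each of the 17 planes. Illustrative checks (the assertions are mine, not from the package's tests):

    assert _is_noncharacter_codepoint(0xFDD0)        # inside the U+FDD0..U+FDEF block
    assert _is_noncharacter_codepoint(0xFFFE)        # plane 0 ends in FFFE/FFFF
    assert _is_noncharacter_codepoint(0x10FFFF)      # ...and so does every other plane
    assert not _is_noncharacter_codepoint(0x0041)    # ordinary "A"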
@@ -72,7 +79,14 @@ def _coerce_comment_for_xml(text: str) -> str:
 
 
 class TokenizerOpts:
-    __slots__ = (…
+    __slots__ = (
+        "discard_bom",
+        "emit_bogus_markup_as_text",
+        "exact_errors",
+        "initial_rawtext_tag",
+        "initial_state",
+        "xml_coercion",
+    )
 
     discard_bom: bool
     exact_errors: bool
@@ -84,12 +98,14 @@ class TokenizerOpts:
         self,
         exact_errors: bool = False,
         discard_bom: bool = True,
+        emit_bogus_markup_as_text: bool = False,
         initial_state: int | None = None,
         initial_rawtext_tag: str | None = None,
         xml_coercion: bool = False,
     ) -> None:
         self.exact_errors = bool(exact_errors)
         self.discard_bom = bool(discard_bom)
+        self.emit_bogus_markup_as_text = bool(emit_bogus_markup_as_text)
         self.initial_state = initial_state
         self.initial_rawtext_tag = initial_rawtext_tag
         self.xml_coercion = bool(xml_coercion)
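
Note: 0.38.0 adds one tokenizer option, `emit_bogus_markup_as_text`. When set, malformed markup (`<!…`, `<?…`, broken or EOF-truncated tags) is re-emitted as character data instead of going through HTML5's bogus-comment/discard recovery; the branches appear throughout the state handlers below. Constructing the options, using only the parameters visible in this diff:

    opts = TokenizerOpts(
        exact_errors=False,
        discard_bom=True,
        emit_bogus_markup_as_text=True,  # new in 0.38.0
        xml_coercion=False,
    )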
@@ -178,11 +194,12 @@ class Tokenizer:
         "current_tag_kind",
         "current_tag_name",
         "current_tag_self_closing",
+        "current_token_start_pos",
         "errors",
-        "ignore_lf",
         "last_start_tag_name",
         "last_token_column",
         "last_token_line",
+        "last_token_start_pos",
         "length",
         "opts",
         "original_tag_name",
@@ -194,6 +211,8 @@ class Tokenizer:
         "temp_buffer",
         "text_buffer",
         "text_start_pos",
+        "track_node_locations",
+        "track_tag_positions",
     )
 
     _comment_token: CommentToken
@@ -202,6 +221,8 @@ class Tokenizer:
     _tag_token: Tag
     buffer: str
     collect_errors: bool
+    track_tag_positions: bool
+    track_node_locations: bool
     current_attr_name: list[str]
     current_attr_value: list[str]
     current_attr_value_has_amp: bool
@@ -215,11 +236,12 @@ class Tokenizer:
     current_tag_kind: int
     current_tag_name: list[str]
     current_tag_self_closing: bool
+    current_token_start_pos: int
     errors: list[ParseError]
-    ignore_lf: bool
     last_start_tag_name: str | None
     last_token_column: int
     last_token_line: int
+    last_token_start_pos: int | None
     length: int
     opts: TokenizerOpts
     original_tag_name: list[str]
@@ -234,10 +256,20 @@ class Tokenizer:
 
     # _STATE_HANDLERS is defined at the end of the file
 
-    def __init__(…
+    def __init__(
+        self,
+        sink: Any,
+        opts: TokenizerOpts | None = None,
+        *,
+        collect_errors: bool = False,
+        track_node_locations: bool = False,
+        track_tag_positions: bool = False,
+    ) -> None:
         self.sink = sink
         self.opts = opts or TokenizerOpts()
         self.collect_errors = collect_errors
+        self.track_node_locations = bool(track_node_locations)
+        self.track_tag_positions = bool(track_tag_positions)
         self.errors = []
 
         self.state = self.DATA
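
Note: the constructor gains two keyword-only flags, `track_node_locations` (drives newline-index precomputation) and `track_tag_positions` (stamps tags with source offsets). A minimal usage sketch; the sink protocol is inferred from the `process_token`/`process_characters` calls later in this file, and `RecordingSink` is an invented name:

    class RecordingSink:
        """Hypothetical sink that records everything the tokenizer emits."""
        def __init__(self) -> None:
            self.tokens = []
        def process_token(self, token):
            self.tokens.append(token)
        def process_characters(self, data: str) -> None:
            self.tokens.append(data)

    tok = Tokenizer(RecordingSink(), collect_errors=True, track_tag_positions=True)
    tok.run("<p class=x>hi</p>")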
@@ -246,9 +278,10 @@ class Tokenizer:
         self.pos = 0
         self.reconsume = False
         self.current_char = ""
-        self.ignore_lf = False
         self.last_token_line = 1
         self.last_token_column = 0
+        self.current_token_start_pos = 0
+        self.last_token_start_pos = None
 
         # Reusable buffers to avoid per-token allocations.
         self.text_buffer = []
@@ -276,14 +309,20 @@ class Tokenizer:
         if html and html[0] == "\ufeff" and self.opts.discard_bom:
             html = html[1:]
 
+        # Normalize newlines per §13.2.2.5
+        if html:
+            if "\r" in html:
+                html = html.replace("\r\n", "\n").replace("\r", "\n")
+
         self.buffer = html or ""
         self.length = len(self.buffer)
         self.pos = 0
         self.reconsume = False
         self.current_char = ""
-        self.ignore_lf = False
         self.last_token_line = 1
         self.last_token_column = 0
+        self.current_token_start_pos = 0
+        self.last_token_start_pos = None
         self.errors = []
         self.text_buffer.clear()
         self.text_start_pos = 0
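
Note: this is the change that retires the old `ignore_lf` machinery seen in deletions throughout this diff: CR and CRLF are normalized to LF once, before tokenization, exactly as HTML5's input-stream preprocessing (§13.2.2.5) prescribes. The transformation itself, in isolation:

    # CRLF first, then any remaining lone CR.
    assert "a\r\nb\rc\n".replace("\r\n", "\n").replace("\r", "\n") == "a\nb\nc\n"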
@@ -313,8 +352,9 @@ class Tokenizer:
         else:
             self.state = self.DATA
 
-        # Pre-compute newline positions for O(log n) line lookups
-…
+        # Pre-compute newline positions for O(log n) line lookups.
+        # Only do this when errors are collected or when node locations are requested.
+        if self.collect_errors or self.track_node_locations:
             self._newline_positions = []
             pos = -1
             buffer = self.buffer
@@ -334,6 +374,34 @@ class Tokenizer:
             return 1
         return bisect_right(newline_positions, pos - 1) + 1
 
+    def location_at_pos(self, pos: int) -> tuple[int, int]:
+        """Return (line, column) for a 0-indexed offset in the current buffer.
+
+        Column is 1-indexed. Newline positions are computed lazily when needed.
+        """
+        newline_positions = self._newline_positions
+        if newline_positions is None:
+            newline_positions = []
+            scan = -1
+            buffer = self.buffer
+            while True:
+                scan = buffer.find("\n", scan + 1)
+                if scan == -1:
+                    break
+                newline_positions.append(scan)
+            self._newline_positions = newline_positions
+
+        line_index = bisect_right(newline_positions, pos - 1)
+        line = line_index + 1
+
+        # Compute column using newline index rather than rfind() to avoid O(n) scans.
+        if line_index == 0:
+            last_newline = -1
+        else:
+            last_newline = newline_positions[line_index - 1]
+        column = pos - last_newline
+        return line, column
+
     def step(self) -> bool:
         """Run one step of the tokenizer state machine. Returns True if EOF reached."""
         handler = self._STATE_HANDLERS[self.state]  # type: ignore[attr-defined]
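
Note: `location_at_pos` is the lazily-initialized public counterpart of `_get_line_at_pos`: binary-search the sorted newline offsets for the line, then take the column from the preceding newline. The same idea, standalone (helper name and assertion are mine):

    from bisect import bisect_right

    def location(text: str, pos: int) -> tuple[int, int]:
        newlines = [i for i, ch in enumerate(text) if ch == "\n"]  # sorted by construction
        line_index = bisect_right(newlines, pos - 1)   # newlines strictly before pos
        last_newline = newlines[line_index - 1] if line_index else -1
        return line_index + 1, pos - last_newline      # 1-indexed line and column

    assert location("ab\ncd", 3) == (2, 1)  # "c" opens line 2 at column 1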
@@ -341,8 +409,9 @@ class Tokenizer:
 
     def run(self, html: str | None) -> None:
         self.initialize(html)
+        handlers = self._STATE_HANDLERS  # type: ignore[attr-defined]
         while True:
-            if self.…
+            if handlers[self.state](self):  # type: ignore[no-any-return]
                 break
 
     # ---------------------
@@ -356,9 +425,8 @@ class Tokenizer:
             return self.buffer[peek_pos]
         return None
 
-    def _append_text_chunk(self, chunk: str…
+    def _append_text_chunk(self, chunk: str) -> None:
         self._append_text(chunk)
-        self.ignore_lf = ends_with_cr
 
     # ---------------------
     # State handlers
@@ -392,12 +460,12 @@ class Tokenizer:
 
         if end > pos:
             chunk = buffer[pos:end]
-…
+            if self.collect_errors and not chunk.isascii():
+                base_pos = pos
+                for offset, ch in enumerate(chunk):
+                    if _is_noncharacter_codepoint(ord(ch)):
+                        self._emit_error_at_pos("noncharacter-in-input-stream", base_pos + offset)
             self._append_text(chunk)
-            self.ignore_lf = chunk.endswith("\r")
 
             pos = end
             self.pos = pos
@@ -410,8 +478,8 @@ class Tokenizer:
             pos += 1
             self.pos = pos
             self.current_char = c
-            self.ignore_lf = False
             # c is always '<' here due to find() optimization above
+            self.current_token_start_pos = pos - 1
             # Optimization: Peek ahead for common tag starts
             if pos < length:
                 nc = buffer[pos]
@@ -432,7 +500,7 @@ class Tokenizer:
                     self.state = self.TAG_NAME
                     return self._state_tag_name()
 
-                if nc == "!":
+                if nc == "!" and not self.opts.emit_bogus_markup_as_text:
                     # Optimization: Peek ahead for comments
                     if pos + 2 < length and buffer[pos + 1] == "-" and buffer[pos + 2] == "-":
                         self._flush_text()
@@ -475,12 +543,20 @@ class Tokenizer:
             self._emit_token(EOFToken())
             return True
         if c == "!":
+            if self.opts.emit_bogus_markup_as_text:
+                self._append_text("<!")
+                self.state = self.DATA
+                return False
             self.state = self.MARKUP_DECLARATION_OPEN
             return False
         if c == "/":
             self.state = self.END_TAG_OPEN
             return False
         if c == "?":
+            if self.opts.emit_bogus_markup_as_text:
+                self._append_text("<?")
+                self.state = self.DATA
+                return False
             self._emit_error("unexpected-question-mark-instead-of-tag-name")
             self.current_comment.clear()
             self._reconsume_current()
@@ -497,6 +573,11 @@ class Tokenizer:
         c = self._get_char()
         if c is None:
             self._emit_error("eof-before-tag-name")
+            if self.opts.emit_bogus_markup_as_text:
+                self._append_text("</")
+                self._flush_text()
+                self._emit_token(EOFToken())
+                return True
             self._append_text("<")
             self._append_text("/")
             self._flush_text()
@@ -504,6 +585,16 @@ class Tokenizer:
             return True
         if c == ">":
             self._emit_error("empty-end-tag")
+            if self.opts.emit_bogus_markup_as_text:
+                self._append_text("</>")
+                self.state = self.DATA
+                return False
+            self.state = self.DATA
+            return False
+
+        if self.opts.emit_bogus_markup_as_text:
+            self._append_text("</")
+            self._append_text(c)
             self.state = self.DATA
             return False
 
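
Note: taken together, the `emit_bogus_markup_as_text` branches above bypass HTML5's bogus-comment recovery: stray `<!`, `<?`, and `</>` sequences are re-emitted as character data instead of becoming comment tokens or being dropped. A sketch of the expected effect, reusing the illustrative RecordingSink from the constructor note above (expectations are read off the branches, not from running the package):

    lenient = Tokenizer(RecordingSink(), TokenizerOpts(emit_bogus_markup_as_text=True))
    lenient.run("a <? b </> c")
    # With the option on, "<?" and "</>" survive as text, so the sink sees the
    # input essentially unchanged; with it off, "<?" would open a bogus comment
    # and "</>" would be dropped with an empty-end-tag error.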
@@ -518,15 +609,15 @@ class Tokenizer:
         append_tag_char = self.current_tag_name.append
         buffer = self.buffer
         length = self.length
+        pos = self.pos
 
         while True:
             # Inline _consume_tag_name_run
-            # Note: reconsume…
-            pos = self.pos
+            # Note: reconsume is never True when entering TAG_NAME
             if pos < length:
                 # Optimization: Check for common terminators before regex
                 match = None
-                if buffer[pos] not in "\t\n\f />\0…
+                if buffer[pos] not in "\t\n\f />\0":
                     match = _TAG_NAME_RUN_PATTERN.match(buffer, pos)
 
             if match:
@@ -534,46 +625,68 @@ class Tokenizer:
                 if not chunk.islower():
                     chunk = chunk.translate(_ASCII_LOWER_TABLE)
                 append_tag_char(chunk)
-…
-                if …
-…
-                if …
-                    self.…
-…
+                pos = match.end()
+
+                if pos < length:
+                    next_char = buffer[pos]
+                    if next_char in (" ", "\t", "\n", "\f"):
+                        if self.current_tag_kind == Tag.END and self.opts.emit_bogus_markup_as_text:
+                            return self._emit_raw_end_tag_as_text(pos)
+                        pos += 1
+                        self.pos = pos
                         self.state = self.BEFORE_ATTRIBUTE_NAME
                         return self._state_before_attribute_name()
-                if …
-…
+                    if next_char == ">":
+                        pos += 1
+                        self.pos = pos
                         if not self._emit_current_tag():
                             self.state = self.DATA
                         return False
-                if …
-                    self.…
+                    if next_char == "/":
+                        if self.current_tag_kind == Tag.END and self.opts.emit_bogus_markup_as_text:
+                            return self._emit_raw_end_tag_as_text(pos)
+                        pos += 1
+                        self.pos = pos
                         self.state = self.SELF_CLOSING_START_TAG
                         return self._state_self_closing_start_tag()
 
-…
+            # Inline _get_char
+            # Note: reconsume is never True in this state.
+            if pos >= length:
+                c: str | None = None
+            else:
+                c = buffer[pos]
+                pos += 1
+            self.current_char = c
             if c is None:
+                self.pos = pos
                 self._emit_error("eof-in-tag")
-…
-                # The incomplete tag is discarded (not emitted as text)
+                self._emit_incomplete_tag_as_text()
                 self._emit_token(EOFToken())
                 return True
             if c in ("\t", "\n", "\f", " "):
+                if self.current_tag_kind == Tag.END and self.opts.emit_bogus_markup_as_text:
+                    self.pos = pos
+                    return self._emit_raw_end_tag_as_text(pos)
+                self.pos = pos
                 self.state = self.BEFORE_ATTRIBUTE_NAME
                 return self._state_before_attribute_name()
             if c == "/":
+                if self.current_tag_kind == Tag.END and self.opts.emit_bogus_markup_as_text:
+                    self.pos = pos
+                    return self._emit_raw_end_tag_as_text(pos)
+                self.pos = pos
                 self.state = self.SELF_CLOSING_START_TAG
                 return self._state_self_closing_start_tag()
             if c == ">":
                 # In slow path, tag name is only first char (from DATA),
                 # so no rawtext elements possible - always set DATA state
+                self.pos = pos
                 self._emit_current_tag()
                 self.state = self.DATA
                 return False
             # c == "\0" - the only remaining possibility after fast-path
+            self.pos = pos
             self._emit_error("unexpected-null-character")
             append_tag_char(replacement)
@@ -583,7 +696,7 @@ class Tokenizer:
 
         while True:
             # Optimization: Skip whitespace
-            if not self.reconsume…
+            if not self.reconsume:
                 if self.pos < length:
                     # Check if current char is whitespace before running regex
                     if buffer[self.pos] in " \t\n\f":
@@ -603,25 +716,12 @@ class Tokenizer:
 
             self.current_char = c
 
-            if c…
-                self.ignore_lf = False
-                continue
-            if c == "\n":
-                if self.ignore_lf:
-                    self.ignore_lf = False
-                    # Line tracking now computed on-demand via _get_line_at_pos()
-                continue
-            if c == "\t" or c == "\f":
-                self.ignore_lf = False
-                continue
-            if c == "\r":
-                self.ignore_lf = False
-                if self.pos < length and buffer[self.pos] == "\n":
-                    self.pos += 1
+            if c in (" ", "\n", "\t", "\f"):
                 continue
 
             if c is None:
                 self._emit_error("eof-in-tag")
+                self._emit_incomplete_tag_as_text()
                 self._flush_text()
                 self._emit_token(EOFToken())
                 return True
@@ -661,52 +761,62 @@ class Tokenizer:
         append_attr_char = self.current_attr_name.append
         buffer = self.buffer
         length = self.length
+        pos = self.pos
 
         while True:
             # Inline _consume_attribute_name_run
-…
+            # Note: reconsume is never True in this state.
+            if pos < length:
+                # Optimization: Check for common terminators before regex
+                match = None
+                if buffer[pos] not in "\t\n\f />=\0\"'<":
+                    match = _ATTR_NAME_RUN_PATTERN.match(buffer, pos)
+
+            if match:
+                chunk = match.group(0)
+                if not chunk.islower():
+                    chunk = chunk.translate(_ASCII_LOWER_TABLE)
+                append_attr_char(chunk)
+                pos = match.end()
+
+                if pos < length:
+                    next_char = buffer[pos]
+                    if next_char == "=":
+                        pos += 1
+                        self.pos = pos
+                        self.state = self.BEFORE_ATTRIBUTE_VALUE
+                        return self._state_before_attribute_value()
+                    if next_char in (" ", "\t", "\n", "\f"):
+                        pos += 1
+                        self.pos = pos
+                        self._finish_attribute()
+                        self.state = self.AFTER_ATTRIBUTE_NAME
+                        return False  # Let main loop dispatch to avoid recursion
+                    if next_char == ">":
+                        pos += 1
+                        self.pos = pos
+                        self._finish_attribute()
+                        if not self._emit_current_tag():
+                            self.state = self.DATA
+                        return False
+                    if next_char == "/":
+                        pos += 1
+                        self.pos = pos
+                        self._finish_attribute()
+                        self.state = self.SELF_CLOSING_START_TAG
+                        return self._state_self_closing_start_tag()
+
+            # Inline _get_char (reconsume is never True in this state)
+            if pos >= length:
+                c: str | None = None
+            else:
+                c = buffer[pos]
+                pos += 1
+            self.current_char = c
+            self.pos = pos
             if c is None:
                 self._emit_error("eof-in-tag")
+                self._emit_incomplete_tag_as_text()
                 self._flush_text()
                 self._emit_token(EOFToken())
                 return True
@@ -730,8 +840,7 @@ class Tokenizer:
                 self._emit_error("unexpected-null-character")
                 append_attr_char(replacement)
                 continue
-…
-            self._emit_error("unexpected-character-in-attribute-name")
+            self._emit_error("unexpected-character-in-attribute-name")
             append_attr_char(c)
 
     def _state_after_attribute_name(self) -> bool:
@@ -740,11 +849,10 @@ class Tokenizer:
 
         while True:
             # Optimization: Skip whitespace
-            if not self.reconsume…
+            if not self.reconsume:
                 if self.pos < length:
-…
-                    self.pos = match.end()
+                    if buffer[self.pos] in " \t\n\f":
+                        self.pos = _WHITESPACE_PATTERN.match(buffer, self.pos).end()  # type: ignore[union-attr]
 
             # Inline _get_char
             if self.pos >= length:
@@ -755,25 +863,12 @@ class Tokenizer:
 
             self.current_char = c
 
-            if c…
-                self.ignore_lf = False
-                continue
-            if c == "\n":
-                # Note: Only reachable when ignore_lf=True (CR-LF handling)
-                # Standalone \n is caught by whitespace optimization
-                self.ignore_lf = False
+            if c in (" ", "\n", "\t", "\f"):
                 continue
-            if c == "\r":
-                self.ignore_lf = True
-                continue
-            if c == "\t" or c == "\f":
-                self.ignore_lf = False
-                continue
-…
-            self.ignore_lf = False
 
             if c is None:
                 self._emit_error("eof-in-tag")
+                self._emit_incomplete_tag_as_text()
                 self._flush_text()
                 self._emit_token(EOFToken())
                 return True
@@ -804,9 +899,17 @@ class Tokenizer:
 
     def _state_before_attribute_value(self) -> bool:
         while True:
-…
+            # Inline _get_char (reconsume is never True in this state)
+            pos = self.pos
+            if pos >= self.length:
+                c: str | None = None
+            else:
+                c = self.buffer[pos]
+                self.pos = pos + 1
+            self.current_char = c
             if c is None:
                 self._emit_error("eof-in-tag")
+                self._emit_incomplete_tag_as_text()
                 self._flush_text()
                 self._emit_token(EOFToken())
                 return True
@@ -857,10 +960,6 @@ class Tokenizer:
             if end != next_quote:
                 chunk = buffer[pos:end]
 
-                # Normalize chunk for value if needed
-                if "\r" in chunk:
-                    chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
-
                 self.current_attr_value.append(chunk)
                 self.pos = end
 
@@ -868,6 +967,7 @@ class Tokenizer:
             if self.pos >= length:
                 self.current_char = None
                 self._emit_error("eof-in-tag")
+                self._emit_incomplete_tag_as_text()
                 self._emit_token(EOFToken())
                 return True
 
@@ -916,10 +1016,6 @@ class Tokenizer:
             if end != next_quote:
                 chunk = buffer[pos:end]
 
-                # Normalize chunk for value if needed
-                if "\r" in chunk:
-                    chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
-
                 self.current_attr_value.append(chunk)
                 self.pos = end
 
@@ -927,6 +1023,7 @@ class Tokenizer:
             if self.pos >= length:
                 self.current_char = None
                 self._emit_error("eof-in-tag")
+                self._emit_incomplete_tag_as_text()
                 self._emit_token(EOFToken())
                 return True
 
@@ -965,11 +1062,22 @@ class Tokenizer:
             self.current_attr_value.append(buffer[pos:end])
             self.pos = end
 
-…
+            # Inline _get_char
+            if self.reconsume:
+                self.reconsume = False
+                c = self.current_char
+            elif self.pos >= length:
+                c = None
+            else:
+                c = buffer[self.pos]
+                self.pos += 1
+            self.current_char = c
+
             if c is None:
                 # Per HTML5 spec: EOF in attribute value is a parse error
                 # The incomplete tag is discarded (not emitted)
                 self._emit_error("eof-in-tag")
+                self._emit_incomplete_tag_as_text()
                 self._emit_token(EOFToken())
                 return True
             if c in ("\t", "\n", "\f", " "):
@@ -995,9 +1103,17 @@ class Tokenizer:
 
     def _state_after_attribute_value_quoted(self) -> bool:
         """After attribute value (quoted) state per HTML5 spec §13.2.5.42"""
-…
+        # Inline _get_char
+        if self.pos >= self.length:
+            c: str | None = None
+        else:
+            c = self.buffer[self.pos]
+            self.pos += 1
+        self.current_char = c
+
         if c is None:
             self._emit_error("eof-in-tag")
+            self._emit_incomplete_tag_as_text()
             self._flush_text()
             self._emit_token(EOFToken())
             return True
@@ -1025,6 +1141,7 @@ class Tokenizer:
         c = self._get_char()
         if c is None:
             self._emit_error("eof-in-tag")
+            self._emit_incomplete_tag_as_text()
             self._flush_text()
             self._emit_token(EOFToken())
             return True
@@ -1125,7 +1242,14 @@ class Tokenizer:
         while True:
             if self._consume_comment_run():
                 continue
-…
+            # Inline _get_char
+            if self.pos >= self.length:
+                c: str | None = None
+            else:
+                c = self.buffer[self.pos]
+                self.pos += 1
+            self.current_char = c
+
             if c is None:
                 self._emit_error("eof-in-comment")
                 self._emit_comment()
@@ -1264,7 +1388,7 @@ class Tokenizer:
         while True:
             c = self._get_char()
             if c is None:
-                self._emit_error("eof-in-doctype…
+                self._emit_error("eof-in-doctype")
                 self.current_doctype_force_quirks = True
                 self._emit_doctype()
                 self._emit_token(EOFToken())
@@ -1291,7 +1415,7 @@ class Tokenizer:
         while True:
             c = self._get_char()
             if c is None:
-                self._emit_error("eof-in-doctype…
+                self._emit_error("eof-in-doctype")
                 self.current_doctype_force_quirks = True
                 self._emit_doctype()
                 self._emit_token(EOFToken())
@@ -1675,36 +1799,19 @@ class Tokenizer:
             self.reconsume = False
             return self.current_char
 
-        buffer = self.buffer
         pos = self.pos
-…
-            self.pos = pos
-            self.current_char = None
-            return None
-
-        c = buffer[pos]
-        pos += 1
-
-        if c == "\r":
-            self.ignore_lf = True
-            self.current_char = "\n"
-            self.pos = pos
-            return "\n"
-
-        if c == "\n":
-            if self.ignore_lf:
-                self.ignore_lf = False
-                continue
-            # Line tracking now computed on-demand via _get_line_at_pos()
+        if pos >= self.length:
+            self.current_char = None
+            return None
 
-…
-        self.…
-…
+        c = self.buffer[pos]
+        self.pos = pos + 1
+        self.current_char = c
+        if c == "<":
+            self.current_token_start_pos = pos
+        if self.collect_errors and not c.isascii() and _is_noncharacter_codepoint(ord(c)):
+            self._emit_error_at_pos("noncharacter-in-input-stream", pos)
+        return c
 
     def _reconsume_current(self) -> None:
         self.reconsume = True
@@ -1731,10 +1838,38 @@ class Tokenizer:
         raw_len = len(data)
 
         self.text_buffer.clear()
-…
+        # U+0000 NULL is a parse error in text.
+        # Emit one error per NULL at the *actual* character position.
+        if "\0" in data:
+            base_pos = self.text_start_pos
+            search_from = 0
+            while True:
+                idx = data.find("\0", search_from)
+                if idx == -1:
+                    break
+                error_pos = base_pos + idx
+
+                # Compute column at error_pos (1-indexed).
+                last_newline = self.buffer.rfind("\n", 0, error_pos + 1)
+                if last_newline == -1:
+                    column = error_pos + 1
+                else:
+                    column = error_pos - last_newline
+                line = self._get_line_at_pos(error_pos)
+
+                message = generate_error_message("unexpected-null-character")
+                self.errors.append(
+                    ParseError(
+                        "unexpected-null-character",
+                        line=line,
+                        column=column,
+                        category="tokenizer",
+                        message=message,
+                        source_html=self.buffer,
+                    )
+                )
+
+                search_from = idx + 1
 
         # Per HTML5 spec:
         # - RCDATA state (title, textarea): decode character references
@@ -1747,13 +1882,16 @@ class Tokenizer:
             pass
         else:
             if "&" in data:
-…
+                report_error = self._emit_error if self.collect_errors else None
+                data = decode_entities_in_text(data, report_error=report_error)
         # Apply XML coercion if enabled
         if self.opts.xml_coercion:
             data = _coerce_text_for_xml(data)
 
         # Record position at END of raw text (1-indexed column = raw_len)
-        self.…
+        if self.collect_errors:
+            self._record_text_end_position(raw_len)
+        self.last_token_start_pos = self.text_start_pos
         self.sink.process_characters(data)
         # Note: process_characters never returns Plaintext or RawData
         # State switches happen via _emit_current_tag instead
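
Note: NULL bytes in a text run now produce one `ParseError` per occurrence, positioned at the character itself rather than at the end of the run. The column arithmetic is the same `rfind`-based pattern used throughout this file; in isolation (values are mine):

    buffer = "ab\ncd\0e"
    error_pos = buffer.index("\0")                        # 5
    last_newline = buffer.rfind("\n", 0, error_pos + 1)   # 2
    column = error_pos + 1 if last_newline == -1 else error_pos - last_newline
    assert column == 3  # "\0" is the 3rd character on line 2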
@@ -1785,7 +1923,8 @@ class Tokenizer:
         else:
             value = "".join(attr_value_buffer)
             if self.current_attr_value_has_amp:
-…
+                report_error = self._emit_error if self.collect_errors else None
+                value = decode_entities_in_text(value, in_attribute=True, report_error=report_error)
             attrs[name] = value
         attr_value_buffer.clear()
         self.current_attr_value_has_amp = False
@@ -1806,6 +1945,13 @@ class Tokenizer:
         tag.name = name
         tag.attrs = attrs
         tag.self_closing = self.current_tag_self_closing
+        if self.track_tag_positions:
+            tag.start_pos = self.current_token_start_pos
+            tag.end_pos = self.pos
+        else:
+            tag.start_pos = None
+            tag.end_pos = None
+        self.last_token_start_pos = tag.start_pos
 
         switched_to_rawtext = False
         if self.current_tag_kind == Tag.START:
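
Note: with `track_tag_positions=True`, `_emit_current_tag` stamps each emitted `Tag` with the half-open source span `[start_pos, end_pos)`; with the flag off the fields are explicitly `None`. A hedged sketch of the invariant this establishes (sink as in the constructor note above):

    html = "<p class=x>hi</p>"
    tok = Tokenizer(RecordingSink(), track_tag_positions=True)
    tok.run(html)
    # For the opening tag the assignments above imply
    # html[tag.start_pos:tag.end_pos] == "<p class=x>", i.e. html[0:11].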
@@ -1831,7 +1977,8 @@ class Tokenizer:
         # Remember current state before emitting
 
         # Emit token to sink
-        self.…
+        if self.collect_errors:
+            self._record_token_position()
         result = self.sink.process_token(tag)
         if result == 1:  # TokenSinkResult.Plaintext
             self.state = self.PLAINTEXT
@@ -1844,6 +1991,30 @@ class Tokenizer:
         self.current_tag_kind = Tag.START
         return switched_to_rawtext
 
+    def _emit_incomplete_tag_as_text(self) -> None:
+        if not self.opts.emit_bogus_markup_as_text:
+            return
+        start = self.current_token_start_pos
+        if start is None:  # pragma: no cover
+            return
+        raw = self.buffer[start : self.pos]
+        if raw:  # pragma: no branch
+            self._emit_token(CharacterTokens(raw))
+
+    def _emit_raw_end_tag_as_text(self, pos: int) -> bool:
+        end = self.buffer.find(">", pos)
+        if end == -1:
+            self.pos = self.length
+            self._emit_incomplete_tag_as_text()
+            self._emit_token(EOFToken())
+            return True
+        self.pos = end + 1
+        raw = self.buffer[self.current_token_start_pos : self.pos]
+        if raw:  # pragma: no branch
+            self._emit_token(CharacterTokens(raw))
+        self.state = self.DATA
+        return False
+
     def _emit_comment(self) -> None:
         data = "".join(self.current_comment)
         self.current_comment.clear()
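
Note: these two helpers back every `emit_bogus_markup_as_text` branch added earlier in the file. `_emit_incomplete_tag_as_text` replays a tag cut off by EOF as a `CharacterTokens` run; `_emit_raw_end_tag_as_text` handles end tags that cannot be tokenized cleanly (an end tag carrying attributes or junk) by scanning forward to the next `>` and emitting the raw slice. Sketch, again with the illustrative RecordingSink (expected behavior inferred from the code above):

    sink = RecordingSink()
    tok = Tokenizer(sink, TokenizerOpts(emit_bogus_markup_as_text=True))
    tok.run("x </b c> y")
    # "</b c>" is not a valid end tag; rather than HTML5 recovery, the raw
    # "</b c>" slice should come back through the sink as character data.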
@@ -1851,6 +2022,8 @@ class Tokenizer:
         if self.opts.xml_coercion:
             data = _coerce_comment_for_xml(data)
         self._comment_token.data = data
+        self._comment_token.start_pos = self.current_token_start_pos
+        self.last_token_start_pos = self._comment_token.start_pos
         self._emit_token(self._comment_token)
 
     def _emit_doctype(self) -> None:
@@ -1870,8 +2043,9 @@ class Tokenizer:
         self.current_doctype_force_quirks = False
         self._emit_token(DoctypeToken(doctype))
 
-    def _emit_token(self, token:…
-        self.…
+    def _emit_token(self, token: AnyToken) -> None:
+        if self.collect_errors:
+            self._record_token_position()
         self.sink.process_token(token)
         # Note: process_token never returns Plaintext or RawData for state switches
         # State switches happen via _emit_current_tag checking sink response
@@ -1881,8 +2055,6 @@ class Tokenizer:
 
         Per the spec, the position should be at the end of the token (after the last char).
         """
-        if not self.collect_errors:
-            return
         # pos points after the last consumed character, which is exactly what we want
         pos = self.pos
         last_newline = self.buffer.rfind("\n", 0, pos)
@@ -1899,8 +2071,6 @@ class Tokenizer:
         Uses text_start_pos + raw_len to compute where text ends, matching html5lib's
         behavior of reporting the column of the last character (1-indexed).
         """
-        if not self.collect_errors:
-            return
         # Position of last character of text (0-indexed)
         end_pos = self.text_start_pos + raw_len
         last_newline = self.buffer.rfind("\n", 0, end_pos)
@@ -1924,7 +2094,22 @@ class Tokenizer:
 
         message = generate_error_message(code)
         line = self._get_line_at_pos(self.pos)
-        self.errors.append(…
+        self.errors.append(
+            ParseError(code, line=line, column=column, category="tokenizer", message=message, source_html=self.buffer)
+        )
+
+    def _emit_error_at_pos(self, code: str, pos: int) -> None:
+        last_newline = self.buffer.rfind("\n", 0, pos + 1)
+        if last_newline == -1:
+            column = pos + 1
+        else:
+            column = pos - last_newline
+
+        message = generate_error_message(code)
+        line = self._get_line_at_pos(pos)
+        self.errors.append(
+            ParseError(code, line=line, column=column, category="tokenizer", message=message, source_html=self.buffer)
+        )
 
     def _consume_if(self, literal: str) -> bool:
         end = self.pos + len(literal)
@@ -1953,21 +2138,9 @@ class Tokenizer:
         if pos >= length:
            return False
 
-        # Handle ignore_lf for CRLF sequences
-        if self.ignore_lf and pos < length and self.buffer[pos] == "\n":
-            self.ignore_lf = False
-            pos += 1
-            self.pos = pos
-            if pos >= length:
-                return False
-
         match = _COMMENT_RUN_PATTERN.match(self.buffer, pos)
         if match:
             chunk = match.group(0)
-            # Handle CRLF normalization for comments
-            if "\r" in chunk:
-                chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
-                self.ignore_lf = chunk.endswith("\r")
             self.current_comment.append(chunk)
             self.pos = match.end()
             return True
@@ -2061,7 +2234,7 @@ class Tokenizer:
             # Consume everything up to the special character
             if next_special > pos:
                 chunk = buffer[pos:next_special]
-                self._append_text_chunk(chunk…
+                self._append_text_chunk(chunk)
                 pos = next_special
                 self.pos = pos
@@ -2073,7 +2246,6 @@ class Tokenizer:
 
             # Handle special characters - we're at one of them after find()
             if null_index == pos:
-                self.ignore_lf = False
                 self._emit_error("unexpected-null-character")
                 self._append_text("\ufffd")
                 pos += 1
@@ -2188,9 +2360,7 @@ class Tokenizer:
             if null_index != -1 and null_index < next_special:
                 if null_index > pos:
                     chunk = buffer[pos:null_index]
-                    self._append_text_chunk(chunk…
-                else:
-                    self.ignore_lf = False
+                    self._append_text_chunk(chunk)
                 self._emit_error("unexpected-null-character")
                 self._append_text("\ufffd")
                 pos = null_index + 1
@@ -2199,14 +2369,14 @@ class Tokenizer:
             if lt_index == -1:
                 if pos < length:
                     chunk = buffer[pos:length]
-                    self._append_text_chunk(chunk…
+                    self._append_text_chunk(chunk)
                     self.pos = length
                 self._flush_text()
                 self._emit_token(EOFToken())
                 return True
             if lt_index > pos:
                 chunk = buffer[pos:lt_index]
-                self._append_text_chunk(chunk…
+                self._append_text_chunk(chunk)
                 pos = lt_index + 1
                 self.pos = pos
                 # Handle script escaped transition before treating '<' as markup boundary