justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- justhtml/__init__.py +28 -0
- justhtml/__main__.py +161 -13
- justhtml/constants.py +17 -1
- justhtml/context.py +7 -1
- justhtml/encoding.py +405 -0
- justhtml/entities.py +57 -17
- justhtml/errors.py +20 -4
- justhtml/linkify.py +438 -0
- justhtml/node.py +738 -41
- justhtml/parser.py +188 -21
- justhtml/py.typed +0 -0
- justhtml/sanitize.py +1141 -0
- justhtml/selector.py +240 -104
- justhtml/serialize.py +418 -57
- justhtml/stream.py +34 -10
- justhtml/tokenizer.py +433 -289
- justhtml/tokens.py +91 -23
- justhtml/transforms.py +690 -0
- justhtml/treebuilder.py +196 -111
- justhtml/treebuilder_modes.py +191 -117
- justhtml/treebuilder_utils.py +11 -4
- justhtml-0.33.0.dist-info/METADATA +196 -0
- justhtml-0.33.0.dist-info/RECORD +26 -0
- justhtml-0.33.0.dist-info/entry_points.txt +2 -0
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.6.0.dist-info/METADATA +0 -126
- justhtml-0.6.0.dist-info/RECORD +0 -20
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/WHEEL +0 -0
justhtml/tokenizer.py
CHANGED
@@ -1,11 +1,17 @@
+from __future__ import annotations
+
 import re
 from bisect import bisect_right
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
 
 from .entities import decode_entities_in_text
 from .errors import generate_error_message
-from .tokens import CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag
+from .tokens import AnyToken, CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag
 
-_ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\
+_ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\0"
 _ASCII_LOWER_TABLE = str.maketrans({chr(code): chr(code + 32) for code in range(65, 91)})
 _RCDATA_ELEMENTS = {"title", "textarea"}
 _RAWTEXT_SWITCH_TAGS = {
@@ -23,8 +29,8 @@ _ATTR_VALUE_DOUBLE_PATTERN = re.compile(r'["&\0]')
 _ATTR_VALUE_SINGLE_PATTERN = re.compile(r"['&\0]")
 _ATTR_VALUE_UNQUOTED_PATTERN = re.compile(f"[{re.escape(_ATTR_VALUE_UNQUOTED_TERMINATORS)}]")
 
-_TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0
-_ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'
+_TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0]+")
+_ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'<]+")
 _COMMENT_RUN_PATTERN = re.compile(r"[^-\0]+")
 _WHITESPACE_PATTERN = re.compile(r"[ \t\n\f]+")
 
@@ -38,13 +44,20 @@ for _plane in range(17):
 _XML_COERCION_PATTERN = re.compile(r"[\f\uFDD0-\uFDEF" + "".join(_xml_invalid_single_chars) + "]")
 
 
-def
+def _is_noncharacter_codepoint(codepoint: int) -> bool:
+    if 0xFDD0 <= codepoint <= 0xFDEF:
+        return True
+    last = codepoint & 0xFFFF
+    return last == 0xFFFE or last == 0xFFFF
+
+
+def _xml_coercion_callback(match: re.Match[str]) -> str:
     if match.group(0) == "\f":
         return " "
     return "\ufffd"
 
 
-def _coerce_text_for_xml(text):
+def _coerce_text_for_xml(text: str) -> str:
     """Apply XML coercion to text content."""
     # Fast path for ASCII
     if text.isascii():
@@ -57,7 +70,7 @@ def _coerce_text_for_xml(text):
     return _XML_COERCION_PATTERN.sub(_xml_coercion_callback, text)
 
 
-def _coerce_comment_for_xml(text):
+def _coerce_comment_for_xml(text: str) -> str:
     """Apply XML coercion to comment content - handle double hyphens."""
    # Replace -- with - - (with space)
     if "--" in text:
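
Note on the new `_is_noncharacter_codepoint` helper above: it encodes the Unicode definition of noncharacters, i.e. the contiguous U+FDD0..U+FDEF block plus the last two code points of every plane. A standalone sketch of the same check (illustrative only; the names here are not part of the package API):

    def is_noncharacter(codepoint: int) -> bool:
        # U+FDD0..U+FDEF is a contiguous block of noncharacters.
        if 0xFDD0 <= codepoint <= 0xFDEF:
            return True
        # The last two code points of each of the 17 planes
        # (U+FFFE/U+FFFF, U+1FFFE/U+1FFFF, ..., U+10FFFE/U+10FFFF).
        return (codepoint & 0xFFFF) in (0xFFFE, 0xFFFF)

    assert is_noncharacter(0xFDD0)
    assert is_noncharacter(0x1FFFE)
    assert not is_noncharacter(0x1F600)  # an ordinary emoji code point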
@@ -68,14 +81,20 @@ def _coerce_comment_for_xml(text):
 class TokenizerOpts:
     __slots__ = ("discard_bom", "exact_errors", "initial_rawtext_tag", "initial_state", "xml_coercion")
 
+    discard_bom: bool
+    exact_errors: bool
+    initial_rawtext_tag: str | None
+    initial_state: int | None
+    xml_coercion: bool
+
     def __init__(
         self,
-        exact_errors=False,
-        discard_bom=True,
-        initial_state=None,
-        initial_rawtext_tag=None,
-        xml_coercion=False,
-    ):
+        exact_errors: bool = False,
+        discard_bom: bool = True,
+        initial_state: int | None = None,
+        initial_rawtext_tag: str | None = None,
+        xml_coercion: bool = False,
+    ) -> None:
         self.exact_errors = bool(exact_errors)
         self.discard_bom = bool(discard_bom)
         self.initial_state = initial_state
@@ -166,11 +185,12 @@ class Tokenizer:
         "current_tag_kind",
         "current_tag_name",
         "current_tag_self_closing",
+        "current_token_start_pos",
         "errors",
-        "ignore_lf",
         "last_start_tag_name",
         "last_token_column",
         "last_token_line",
+        "last_token_start_pos",
         "length",
         "opts",
         "original_tag_name",
@@ -182,14 +202,61 @@ class Tokenizer:
         "temp_buffer",
         "text_buffer",
         "text_start_pos",
+        "track_node_locations",
     )
 
+    _comment_token: CommentToken
+    _newline_positions: list[int] | None
+    _state_handlers: list[Callable[[Tokenizer], bool]]
+    _tag_token: Tag
+    buffer: str
+    collect_errors: bool
+    track_node_locations: bool
+    current_attr_name: list[str]
+    current_attr_value: list[str]
+    current_attr_value_has_amp: bool
+    current_char: str | None
+    current_comment: list[str]
+    current_doctype_force_quirks: bool
+    current_doctype_name: list[str]
+    current_doctype_public: list[str] | None
+    current_doctype_system: list[str] | None
+    current_tag_attrs: dict[str, str | None]
+    current_tag_kind: int
+    current_tag_name: list[str]
+    current_tag_self_closing: bool
+    current_token_start_pos: int
+    errors: list[ParseError]
+    last_start_tag_name: str | None
+    last_token_column: int
+    last_token_line: int
+    last_token_start_pos: int | None
+    length: int
+    opts: TokenizerOpts
+    original_tag_name: list[str]
+    pos: int
+    rawtext_tag_name: str | None
+    reconsume: bool
+    sink: Any
+    state: int
+    temp_buffer: list[str]
+    text_buffer: list[str]
+    text_start_pos: int
+
     # _STATE_HANDLERS is defined at the end of the file
 
-    def __init__(
+    def __init__(
+        self,
+        sink: Any,
+        opts: TokenizerOpts | None = None,
+        *,
+        collect_errors: bool = False,
+        track_node_locations: bool = False,
+    ) -> None:
         self.sink = sink
         self.opts = opts or TokenizerOpts()
         self.collect_errors = collect_errors
+        self.track_node_locations = bool(track_node_locations)
         self.errors = []
 
         self.state = self.DATA
@@ -198,9 +265,10 @@ class Tokenizer:
         self.pos = 0
         self.reconsume = False
         self.current_char = ""
-        self.ignore_lf = False
         self.last_token_line = 1
         self.last_token_column = 0
+        self.current_token_start_pos = 0
+        self.last_token_start_pos = None
 
         # Reusable buffers to avoid per-token allocations.
         self.text_buffer = []
@@ -224,18 +292,24 @@ class Tokenizer:
         self._tag_token = Tag(Tag.START, "", {}, False)
         self._comment_token = CommentToken("")
 
-    def initialize(self, html):
+    def initialize(self, html: str | None) -> None:
         if html and html[0] == "\ufeff" and self.opts.discard_bom:
             html = html[1:]
 
+        # Normalize newlines per §13.2.2.5
+        if html:
+            if "\r" in html:
+                html = html.replace("\r\n", "\n").replace("\r", "\n")
+
         self.buffer = html or ""
         self.length = len(self.buffer)
         self.pos = 0
         self.reconsume = False
         self.current_char = ""
-        self.ignore_lf = False
         self.last_token_line = 1
         self.last_token_column = 0
+        self.current_token_start_pos = 0
+        self.last_token_start_pos = None
         self.errors = []
         self.text_buffer.clear()
         self.text_start_pos = 0
@@ -265,8 +339,9 @@ class Tokenizer:
         else:
             self.state = self.DATA
 
-        # Pre-compute newline positions for O(log n) line lookups
-
+        # Pre-compute newline positions for O(log n) line lookups.
+        # Only do this when errors are collected or when node locations are requested.
+        if self.collect_errors or self.track_node_locations:
            self._newline_positions = []
             pos = -1
             buffer = self.buffer
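
The `initialize()` change above is the central simplification in this release: instead of tracking CR/LF pairs character by character with an `ignore_lf` flag (removed throughout this diff), the input is normalized once, up front, as HTML §13.2.2.5 requires. A minimal sketch of the normalization, assuming plain Python strings:

    def normalize_newlines(html: str) -> str:
        # Order matters: collapsing "\r\n" first prevents a CRLF pair
        # from becoming two "\n" characters.
        if "\r" in html:
            html = html.replace("\r\n", "\n").replace("\r", "\n")
        return html

    assert normalize_newlines("a\r\nb\rc\n") == "a\nb\nc\n"

After this, every downstream state handler can assume the buffer contains only "\n", which is what lets the CR-handling branches be deleted below.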
@@ -278,42 +353,73 @@ class Tokenizer:
         else:
             self._newline_positions = None
 
-    def _get_line_at_pos(self, pos):
+    def _get_line_at_pos(self, pos: int) -> int:
         """Get line number (1-indexed) for a position using binary search."""
         # Line number = count of newlines before pos + 1
-
+        newline_positions = self._newline_positions
+        if newline_positions is None:  # pragma: no cover
+            return 1
+        return bisect_right(newline_positions, pos - 1) + 1
+
+    def location_at_pos(self, pos: int) -> tuple[int, int]:
+        """Return (line, column) for a 0-indexed offset in the current buffer.
+
+        Column is 1-indexed. Newline positions are computed lazily when needed.
+        """
+        newline_positions = self._newline_positions
+        if newline_positions is None:
+            newline_positions = []
+            scan = -1
+            buffer = self.buffer
+            while True:
+                scan = buffer.find("\n", scan + 1)
+                if scan == -1:
+                    break
+                newline_positions.append(scan)
+            self._newline_positions = newline_positions
+
+        line_index = bisect_right(newline_positions, pos - 1)
+        line = line_index + 1
 
-
+        # Compute column using newline index rather than rfind() to avoid O(n) scans.
+        if line_index == 0:
+            last_newline = -1
+        else:
+            last_newline = newline_positions[line_index - 1]
+        column = pos - last_newline
+        return line, column
+
+    def step(self) -> bool:
         """Run one step of the tokenizer state machine. Returns True if EOF reached."""
-        handler = self._STATE_HANDLERS[self.state]
-        return handler(self)
+        handler = self._STATE_HANDLERS[self.state]  # type: ignore[attr-defined]
+        return handler(self)  # type: ignore[no-any-return]
 
-    def run(self, html):
+    def run(self, html: str | None) -> None:
         self.initialize(html)
+        handlers = self._STATE_HANDLERS  # type: ignore[attr-defined]
         while True:
-            if self.
+            if handlers[self.state](self):  # type: ignore[no-any-return]
                 break
 
     # ---------------------
     # Helper methods
     # ---------------------
 
-    def _peek_char(self, offset):
+    def _peek_char(self, offset: int) -> str | None:
         """Peek ahead at character at current position + offset without consuming"""
         peek_pos = self.pos + offset
         if peek_pos < self.length:
             return self.buffer[peek_pos]
         return None
 
-    def _append_text_chunk(self, chunk
+    def _append_text_chunk(self, chunk: str) -> None:
         self._append_text(chunk)
-        self.ignore_lf = ends_with_cr
 
     # ---------------------
     # State handlers
     # ---------------------
 
-    def _state_data(self):
+    def _state_data(self) -> bool:
         buffer = self.buffer
         length = self.length
         pos = self.pos
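
The new `location_at_pos` above turns a buffer offset into a (line, column) pair with one binary search over precomputed newline offsets. A self-contained sketch of the same idea (hypothetical helper, not the package API):

    from bisect import bisect_right

    def location_at(newline_positions: list[int], pos: int) -> tuple[int, int]:
        # newline_positions holds the 0-indexed offset of every "\n",
        # ascending. Counting newlines strictly before pos gives the line;
        # the previous newline gives the 1-indexed column, all in O(log n).
        line_index = bisect_right(newline_positions, pos - 1)
        last_newline = newline_positions[line_index - 1] if line_index else -1
        return line_index + 1, pos - last_newline

    buf = "ab\ncd\nef"
    newlines = [i for i, ch in enumerate(buf) if ch == "\n"]  # [2, 5]
    assert location_at(newlines, 0) == (1, 1)  # 'a'
    assert location_at(newlines, 3) == (2, 1)  # 'c'
    assert location_at(newlines, 7) == (3, 2)  # 'f'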
@@ -341,12 +447,12 @@ class Tokenizer:
 
         if end > pos:
             chunk = buffer[pos:end]
-
-
-
-
+            if self.collect_errors and not chunk.isascii():
+                base_pos = pos
+                for offset, ch in enumerate(chunk):
+                    if _is_noncharacter_codepoint(ord(ch)):
+                        self._emit_error_at_pos("noncharacter-in-input-stream", base_pos + offset)
             self._append_text(chunk)
-            self.ignore_lf = chunk.endswith("\r")
 
         pos = end
         self.pos = pos
@@ -359,8 +465,8 @@ class Tokenizer:
             pos += 1
             self.pos = pos
             self.current_char = c
-            self.ignore_lf = False
             # c is always '<' here due to find() optimization above
+            self.current_token_start_pos = pos - 1
             # Optimization: Peek ahead for common tag starts
             if pos < length:
                 nc = buffer[pos]
@@ -415,7 +521,7 @@ class Tokenizer:
         self.state = self.TAG_OPEN
         return self._state_tag_open()
 
-    def _state_tag_open(self):
+    def _state_tag_open(self) -> bool:
         c = self._get_char()
         if c is None:
             self._emit_error("eof-before-tag-name")
@@ -442,7 +548,7 @@ class Tokenizer:
             self.state = self.DATA
             return False
 
-    def _state_end_tag_open(self):
+    def _state_end_tag_open(self) -> bool:
         c = self._get_char()
         if c is None:
             self._emit_error("eof-before-tag-name")
@@ -462,20 +568,20 @@ class Tokenizer:
             self.state = self.BOGUS_COMMENT
             return False
 
-    def _state_tag_name(self):
+    def _state_tag_name(self) -> bool:
         replacement = "\ufffd"
         append_tag_char = self.current_tag_name.append
         buffer = self.buffer
         length = self.length
+        pos = self.pos
 
         while True:
             # Inline _consume_tag_name_run
-            # Note: reconsume
-            pos = self.pos
+            # Note: reconsume is never True when entering TAG_NAME
             if pos < length:
                 # Optimization: Check for common terminators before regex
                 match = None
-                if buffer[pos] not in "\t\n\f />\0
+                if buffer[pos] not in "\t\n\f />\0":
                     match = _TAG_NAME_RUN_PATTERN.match(buffer, pos)
 
                 if match:
@@ -483,56 +589,69 @@ class Tokenizer:
                     if not chunk.islower():
                         chunk = chunk.translate(_ASCII_LOWER_TABLE)
                     append_tag_char(chunk)
-
-
-                    if
-
-                    if
-
-
-                        self.ignore_lf = True
+                    pos = match.end()
+
+                if pos < length:
+                    next_char = buffer[pos]
+                    if next_char in (" ", "\t", "\n", "\f"):
+                        pos += 1
+                        self.pos = pos
                         self.state = self.BEFORE_ATTRIBUTE_NAME
                         return self._state_before_attribute_name()
-                    if
-
+                    if next_char == ">":
+                        pos += 1
+                        self.pos = pos
                         if not self._emit_current_tag():
                             self.state = self.DATA
                         return False
-                    if
-
+                    if next_char == "/":
+                        pos += 1
+                        self.pos = pos
                         self.state = self.SELF_CLOSING_START_TAG
                         return self._state_self_closing_start_tag()
 
-
+            # Inline _get_char
+            # Note: reconsume is never True in this state.
+            if pos >= length:
+                c: str | None = None
+            else:
+                c = buffer[pos]
+                pos += 1
+            self.current_char = c
             if c is None:
+                self.pos = pos
                 self._emit_error("eof-in-tag")
                 # Per HTML5 spec: EOF in tag name is a parse error, emit EOF token only
                 # The incomplete tag is discarded (not emitted as text)
                 self._emit_token(EOFToken())
                 return True
             if c in ("\t", "\n", "\f", " "):
+                self.pos = pos
                 self.state = self.BEFORE_ATTRIBUTE_NAME
                 return self._state_before_attribute_name()
             if c == "/":
+                self.pos = pos
                 self.state = self.SELF_CLOSING_START_TAG
                 return self._state_self_closing_start_tag()
             if c == ">":
                 # In slow path, tag name is only first char (from DATA),
                 # so no rawtext elements possible - always set DATA state
+                self.pos = pos
                 self._emit_current_tag()
                 self.state = self.DATA
                 return False
             # c == "\0" - the only remaining possibility after fast-path
+            self.pos = pos
             self._emit_error("unexpected-null-character")
             append_tag_char(replacement)
 
-    def _state_before_attribute_name(self):
+    def _state_before_attribute_name(self) -> bool:
         buffer = self.buffer
         length = self.length
 
         while True:
             # Optimization: Skip whitespace
-            if not self.reconsume
+            if not self.reconsume:
                 if self.pos < length:
                     # Check if current char is whitespace before running regex
                     if buffer[self.pos] in " \t\n\f":
@@ -552,21 +671,7 @@ class Tokenizer:
 
             self.current_char = c
 
-            if c
-                self.ignore_lf = False
-                continue
-            if c == "\n":
-                if self.ignore_lf:
-                    self.ignore_lf = False
-                    # Line tracking now computed on-demand via _get_line_at_pos()
-                continue
-            if c == "\t" or c == "\f":
-                self.ignore_lf = False
-                continue
-            if c == "\r":
-                self.ignore_lf = False
-                if self.pos < length and buffer[self.pos] == "\n":
-                    self.pos += 1
+            if c in (" ", "\n", "\t", "\f"):
                 continue
 
             if c is None:
@@ -605,55 +710,64 @@ class Tokenizer:
             self.state = self.ATTRIBUTE_NAME
             return False  # Let main loop dispatch to avoid recursion
 
-    def _state_attribute_name(self):
+    def _state_attribute_name(self) -> bool:
         replacement = "\ufffd"
         append_attr_char = self.current_attr_name.append
         buffer = self.buffer
         length = self.length
+        pos = self.pos
 
         while True:
             # Inline _consume_attribute_name_run
-
-
-
-
-
-
-            match = _ATTR_NAME_RUN_PATTERN.match(buffer, pos)
-
-            if match:
-                chunk = match.group(0)
-                if not chunk.islower():
-                    chunk = chunk.translate(_ASCII_LOWER_TABLE)
-                append_attr_char(chunk)
-                self.pos = match.end()
-
-            if self.pos < length:
-                c = buffer[self.pos]
-                if c == "=":
-                    self.pos += 1
-                    self.state = self.BEFORE_ATTRIBUTE_VALUE
-                    return self._state_before_attribute_value()
-                if c in (" ", "\t", "\n", "\f", "\r"):
-                    self.pos += 1
-                    if c == "\r":
-                        self.ignore_lf = True
-                    self._finish_attribute()
-                    self.state = self.AFTER_ATTRIBUTE_NAME
-                    return False  # Let main loop dispatch to avoid recursion
-                if c == ">":
-                    self.pos += 1
-                    self._finish_attribute()
-                    if not self._emit_current_tag():
-                        self.state = self.DATA
-                    return False
-                if c == "/":
-                    self.pos += 1
-                    self._finish_attribute()
-                    self.state = self.SELF_CLOSING_START_TAG
-                    return self._state_self_closing_start_tag()
+            # Note: reconsume is never True in this state.
+            if pos < length:
+                # Optimization: Check for common terminators before regex
+                match = None
+                if buffer[pos] not in "\t\n\f />=\0\"'<":
+                    match = _ATTR_NAME_RUN_PATTERN.match(buffer, pos)
 
-
+                if match:
+                    chunk = match.group(0)
+                    if not chunk.islower():
+                        chunk = chunk.translate(_ASCII_LOWER_TABLE)
+                    append_attr_char(chunk)
+                    pos = match.end()
+
+                if pos < length:
+                    next_char = buffer[pos]
+                    if next_char == "=":
+                        pos += 1
+                        self.pos = pos
+                        self.state = self.BEFORE_ATTRIBUTE_VALUE
+                        return self._state_before_attribute_value()
+                    if next_char in (" ", "\t", "\n", "\f"):
+                        pos += 1
+                        self.pos = pos
+                        self._finish_attribute()
+                        self.state = self.AFTER_ATTRIBUTE_NAME
+                        return False  # Let main loop dispatch to avoid recursion
+                    if next_char == ">":
+                        pos += 1
+                        self.pos = pos
+                        self._finish_attribute()
+                        if not self._emit_current_tag():
+                            self.state = self.DATA
+                        return False
+                    if next_char == "/":
+                        pos += 1
+                        self.pos = pos
+                        self._finish_attribute()
+                        self.state = self.SELF_CLOSING_START_TAG
+                        return self._state_self_closing_start_tag()
+
+            # Inline _get_char (reconsume is never True in this state)
+            if pos >= length:
+                c: str | None = None
+            else:
+                c = buffer[pos]
+                pos += 1
+            self.current_char = c
+            self.pos = pos
             if c is None:
                 self._emit_error("eof-in-tag")
                 self._flush_text()
@@ -679,21 +793,19 @@ class Tokenizer:
                 self._emit_error("unexpected-null-character")
                 append_attr_char(replacement)
                 continue
-
-            self._emit_error("unexpected-character-in-attribute-name")
+            self._emit_error("unexpected-character-in-attribute-name")
             append_attr_char(c)
 
-    def _state_after_attribute_name(self):
+    def _state_after_attribute_name(self) -> bool:
         buffer = self.buffer
         length = self.length
 
         while True:
             # Optimization: Skip whitespace
-            if not self.reconsume
+            if not self.reconsume:
                 if self.pos < length:
-
-
-                        self.pos = match.end()
+                    if buffer[self.pos] in " \t\n\f":
+                        self.pos = _WHITESPACE_PATTERN.match(buffer, self.pos).end()  # type: ignore[union-attr]
 
             # Inline _get_char
             if self.pos >= length:
@@ -704,23 +816,9 @@ class Tokenizer:
 
             self.current_char = c
 
-            if c
-                self.ignore_lf = False
-                continue
-            if c == "\n":
-                # Note: Only reachable when ignore_lf=True (CR-LF handling)
-                # Standalone \n is caught by whitespace optimization
-                self.ignore_lf = False
-                continue
-            if c == "\r":
-                self.ignore_lf = True
-                continue
-            if c == "\t" or c == "\f":
-                self.ignore_lf = False
+            if c in (" ", "\n", "\t", "\f"):
                 continue
 
-            self.ignore_lf = False
-
             if c is None:
                 self._emit_error("eof-in-tag")
                 self._flush_text()
@@ -751,9 +849,16 @@ class Tokenizer:
             self.state = self.ATTRIBUTE_NAME
             return False  # Let main loop dispatch to avoid recursion
 
-    def _state_before_attribute_value(self):
+    def _state_before_attribute_value(self) -> bool:
         while True:
-
+            # Inline _get_char (reconsume is never True in this state)
+            pos = self.pos
+            if pos >= self.length:
+                c: str | None = None
+            else:
+                c = self.buffer[pos]
+                self.pos = pos + 1
+            self.current_char = c
             if c is None:
                 self._emit_error("eof-in-tag")
                 self._flush_text()
@@ -777,7 +882,7 @@ class Tokenizer:
             self.state = self.ATTRIBUTE_VALUE_UNQUOTED
             return self._state_attribute_value_unquoted()
 
-    def _state_attribute_value_double(self):
+    def _state_attribute_value_double(self) -> bool:
         replacement = "\ufffd"
         stop_pattern = _ATTR_VALUE_DOUBLE_PATTERN
         buffer = self.buffer
@@ -797,8 +902,7 @@ class Tokenizer:
             if "&" in chunk or "\0" in chunk:
                 # Fallback to regex if complex chars present
                 match = stop_pattern.search(buffer, pos)
-
-                end = match.start()
+                end = length if match is None else match.start()
             else:
                 end = next_quote
 
@@ -807,10 +911,6 @@ class Tokenizer:
             if end != next_quote:
                 chunk = buffer[pos:end]
 
-                # Normalize chunk for value if needed
-                if "\r" in chunk:
-                    chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
-
             self.current_attr_value.append(chunk)
             self.pos = end
 
@@ -837,7 +937,7 @@ class Tokenizer:
             self._emit_error("unexpected-null-character")
             self._append_attr_value_char(replacement)
 
-    def _state_attribute_value_single(self):
+    def _state_attribute_value_single(self) -> bool:
         replacement = "\ufffd"
         stop_pattern = _ATTR_VALUE_SINGLE_PATTERN
         buffer = self.buffer
@@ -857,8 +957,7 @@ class Tokenizer:
             if "&" in chunk or "\0" in chunk:
                 # Fallback to regex if complex chars present
                 match = stop_pattern.search(buffer, pos)
-
-                end = match.start()
+                end = length if match is None else match.start()
             else:
                 end = next_quote
 
@@ -867,10 +966,6 @@ class Tokenizer:
             if end != next_quote:
                 chunk = buffer[pos:end]
 
-                # Normalize chunk for value if needed
-                if "\r" in chunk:
-                    chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
-
             self.current_attr_value.append(chunk)
             self.pos = end
 
@@ -897,7 +992,7 @@ class Tokenizer:
             self._emit_error("unexpected-null-character")
             self._append_attr_value_char(replacement)
 
-    def _state_attribute_value_unquoted(self):
+    def _state_attribute_value_unquoted(self) -> bool:
         replacement = "\ufffd"
         stop_pattern = _ATTR_VALUE_UNQUOTED_PATTERN
         buffer = self.buffer
@@ -916,7 +1011,17 @@ class Tokenizer:
             self.current_attr_value.append(buffer[pos:end])
             self.pos = end
 
-
+            # Inline _get_char
+            if self.reconsume:
+                self.reconsume = False
+                c = self.current_char
+            elif self.pos >= length:
+                c = None
+            else:
+                c = buffer[self.pos]
+                self.pos += 1
+            self.current_char = c
+
             if c is None:
                 # Per HTML5 spec: EOF in attribute value is a parse error
                 # The incomplete tag is discarded (not emitted)
@@ -944,9 +1049,16 @@ class Tokenizer:
                 continue
             self._append_attr_value_char(c)
 
-    def _state_after_attribute_value_quoted(self):
+    def _state_after_attribute_value_quoted(self) -> bool:
         """After attribute value (quoted) state per HTML5 spec §13.2.5.42"""
-
+        # Inline _get_char
+        if self.pos >= self.length:
+            c: str | None = None
+        else:
+            c = self.buffer[self.pos]
+            self.pos += 1
+        self.current_char = c
+
         if c is None:
             self._emit_error("eof-in-tag")
             self._flush_text()
@@ -972,7 +1084,7 @@ class Tokenizer:
             self.state = self.BEFORE_ATTRIBUTE_NAME
             return False
 
-    def _state_self_closing_start_tag(self):
+    def _state_self_closing_start_tag(self) -> bool:
         c = self._get_char()
         if c is None:
             self._emit_error("eof-in-tag")
@@ -989,7 +1101,7 @@ class Tokenizer:
             self.state = self.BEFORE_ATTRIBUTE_NAME
             return False
 
-    def _state_markup_declaration_open(self):
+    def _state_markup_declaration_open(self) -> bool:
         # Note: Comment handling (<!--) is optimized in DATA state fast-path
         # This code only handles DOCTYPE and CDATA, or malformed markup
         if self._consume_case_insensitive("DOCTYPE"):
@@ -1023,7 +1135,7 @@ class Tokenizer:
             self.state = self.BOGUS_COMMENT
             return False
 
-    def _state_comment_start(self):
+    def _state_comment_start(self) -> bool:
         replacement = "\ufffd"
         c = self._get_char()
         if c is None:
@@ -1047,7 +1159,7 @@ class Tokenizer:
             self.state = self.COMMENT
             return False
 
-    def _state_comment_start_dash(self):
+    def _state_comment_start_dash(self) -> bool:
         replacement = "\ufffd"
         c = self._get_char()
         if c is None:
@@ -1071,12 +1183,19 @@ class Tokenizer:
             self.state = self.COMMENT
             return False
 
-    def _state_comment(self):
+    def _state_comment(self) -> bool:
         replacement = "\ufffd"
         while True:
             if self._consume_comment_run():
                 continue
-
+            # Inline _get_char
+            if self.pos >= self.length:
+                c: str | None = None
+            else:
+                c = self.buffer[self.pos]
+                self.pos += 1
+            self.current_char = c
+
             if c is None:
                 self._emit_error("eof-in-comment")
                 self._emit_comment()
@@ -1089,7 +1208,7 @@ class Tokenizer:
             self._emit_error("unexpected-null-character")
             self.current_comment.append(replacement)
 
-    def _state_comment_end_dash(self):
+    def _state_comment_end_dash(self) -> bool:
         replacement = "\ufffd"
         c = self._get_char()
         if c is None:
@@ -1110,7 +1229,7 @@ class Tokenizer:
             self.state = self.COMMENT
             return False
 
-    def _state_comment_end(self):
+    def _state_comment_end(self) -> bool:
         replacement = "\ufffd"
         c = self._get_char()
         if c is None:
@@ -1138,7 +1257,7 @@ class Tokenizer:
             self.state = self.COMMENT
             return False
 
-    def _state_comment_end_bang(self):
+    def _state_comment_end_bang(self) -> bool:
         replacement = "\ufffd"
         c = self._get_char()
         if c is None:
@@ -1172,7 +1291,7 @@ class Tokenizer:
             self.state = self.COMMENT
             return False
 
-    def _state_bogus_comment(self):
+    def _state_bogus_comment(self) -> bool:
         replacement = "\ufffd"
         while True:
             c = self._get_char()
@@ -1189,7 +1308,7 @@ class Tokenizer:
             else:
                 self.current_comment.append(c)
 
-    def _state_doctype(self):
+    def _state_doctype(self) -> bool:
         c = self._get_char()
         if c is None:
             self._emit_error("eof-in-doctype")
@@ -1211,11 +1330,11 @@ class Tokenizer:
             self.state = self.BEFORE_DOCTYPE_NAME
             return False
 
-    def _state_before_doctype_name(self):
+    def _state_before_doctype_name(self) -> bool:
         while True:
             c = self._get_char()
             if c is None:
-                self._emit_error("eof-in-doctype
+                self._emit_error("eof-in-doctype")
                 self.current_doctype_force_quirks = True
                 self._emit_doctype()
                 self._emit_token(EOFToken())
@@ -1238,11 +1357,11 @@ class Tokenizer:
             self.state = self.DOCTYPE_NAME
             return False
 
-    def _state_doctype_name(self):
+    def _state_doctype_name(self) -> bool:
         while True:
             c = self._get_char()
             if c is None:
-                self._emit_error("eof-in-doctype
+                self._emit_error("eof-in-doctype")
                 self.current_doctype_force_quirks = True
                 self._emit_doctype()
                 self._emit_token(EOFToken())
@@ -1263,7 +1382,7 @@ class Tokenizer:
                 continue
             self.current_doctype_name.append(c)
 
-    def _state_after_doctype_name(self):
+    def _state_after_doctype_name(self) -> bool:
         if self._consume_case_insensitive("PUBLIC"):
             self.state = self.AFTER_DOCTYPE_PUBLIC_KEYWORD
             return False
@@ -1290,7 +1409,7 @@ class Tokenizer:
             self.state = self.BOGUS_DOCTYPE
             return False
 
-    def _state_after_doctype_public_keyword(self):
+    def _state_after_doctype_public_keyword(self) -> bool:
         while True:
             c = self._get_char()
             if c is None:
@@ -1324,7 +1443,7 @@ class Tokenizer:
             self.state = self.BOGUS_DOCTYPE
             return False
 
-    def _state_after_doctype_system_keyword(self):
+    def _state_after_doctype_system_keyword(self) -> bool:
         while True:
             c = self._get_char()
             if c is None:
@@ -1358,7 +1477,7 @@ class Tokenizer:
             self.state = self.BOGUS_DOCTYPE
             return False
 
-    def _state_before_doctype_public_identifier(self):
+    def _state_before_doctype_public_identifier(self) -> bool:
         while True:
             c = self._get_char()
             if c is None:
@@ -1389,7 +1508,9 @@ class Tokenizer:
             self.state = self.BOGUS_DOCTYPE
             return False
 
-    def _state_doctype_public_identifier_double_quoted(self):
+    def _state_doctype_public_identifier_double_quoted(self) -> bool:
+        if self.current_doctype_public is None:  # pragma: no cover
+            self.current_doctype_public = []
         while True:
             c = self._get_char()
             if c is None:
@@ -1413,7 +1534,9 @@ class Tokenizer:
                 return False
             self.current_doctype_public.append(c)
 
-    def _state_doctype_public_identifier_single_quoted(self):
+    def _state_doctype_public_identifier_single_quoted(self) -> bool:
+        if self.current_doctype_public is None:  # pragma: no cover
+            self.current_doctype_public = []
         while True:
             c = self._get_char()
             if c is None:
@@ -1437,7 +1560,7 @@ class Tokenizer:
                 return False
             self.current_doctype_public.append(c)
 
-    def _state_after_doctype_public_identifier(self):
+    def _state_after_doctype_public_identifier(self) -> bool:
         while True:
             c = self._get_char()
             if c is None:
@@ -1469,7 +1592,7 @@ class Tokenizer:
             self.state = self.BOGUS_DOCTYPE
             return False
 
-    def _state_between_doctype_public_and_system_identifiers(self):
+    def _state_between_doctype_public_and_system_identifiers(self) -> bool:
         while True:
             c = self._get_char()
             if c is None:
@@ -1498,7 +1621,7 @@ class Tokenizer:
             self.state = self.BOGUS_DOCTYPE
             return False
 
-    def _state_before_doctype_system_identifier(self):
+    def _state_before_doctype_system_identifier(self) -> bool:
         while True:
             c = self._get_char()
             if c is None:
@@ -1529,7 +1652,9 @@ class Tokenizer:
             self.state = self.BOGUS_DOCTYPE
             return False
 
-    def _state_doctype_system_identifier_double_quoted(self):
+    def _state_doctype_system_identifier_double_quoted(self) -> bool:
+        if self.current_doctype_system is None:  # pragma: no cover
+            self.current_doctype_system = []
         while True:
             c = self._get_char()
             if c is None:
@@ -1553,7 +1678,9 @@ class Tokenizer:
                 return False
             self.current_doctype_system.append(c)
 
-    def _state_doctype_system_identifier_single_quoted(self):
+    def _state_doctype_system_identifier_single_quoted(self) -> bool:
+        if self.current_doctype_system is None:  # pragma: no cover
+            self.current_doctype_system = []
         while True:
             c = self._get_char()
             if c is None:
@@ -1577,7 +1704,7 @@ class Tokenizer:
                 return False
             self.current_doctype_system.append(c)
 
-    def _state_after_doctype_system_identifier(self):
+    def _state_after_doctype_system_identifier(self) -> bool:
         while True:
             c = self._get_char()
             if c is None:
@@ -1597,7 +1724,7 @@ class Tokenizer:
             self.state = self.BOGUS_DOCTYPE
             return False
 
-    def _state_bogus_doctype(self):
+    def _state_bogus_doctype(self) -> bool:
         while True:
             c = self._get_char()
             if c is None:
@@ -1613,53 +1740,36 @@ class Tokenizer:
     # Low-level helpers
     # ---------------------
 
-    def _get_char(self):
+    def _get_char(self) -> str | None:
         if self.reconsume:
             self.reconsume = False
             return self.current_char
 
-        buffer = self.buffer
         pos = self.pos
-
-
-
-            self.pos = pos
-            self.current_char = None
-            return None
-
-        c = buffer[pos]
-        pos += 1
+        if pos >= self.length:
+            self.current_char = None
+            return None
 
-
-
-
-
-
-
-
-
-            self.ignore_lf = False
-            continue
-        # Line tracking now computed on-demand via _get_line_at_pos()
-
-        else:
-            self.ignore_lf = False
-
-        self.current_char = c
-        self.pos = pos
-        return c
+        c = self.buffer[pos]
+        self.pos = pos + 1
+        self.current_char = c
+        if c == "<":
+            self.current_token_start_pos = pos
+        if self.collect_errors and not c.isascii() and _is_noncharacter_codepoint(ord(c)):
+            self._emit_error_at_pos("noncharacter-in-input-stream", pos)
+        return c
 
-    def _reconsume_current(self):
+    def _reconsume_current(self) -> None:
         self.reconsume = True
 
-    def _append_text(self, text):
+    def _append_text(self, text: str) -> None:
         """Append text to buffer, recording start position if this is the first chunk."""
         if not self.text_buffer:
             # Record where text started (current position before this chunk)
             self.text_start_pos = self.pos
         self.text_buffer.append(text)
 
-    def _flush_text(self):
+    def _flush_text(self) -> None:
         if not self.text_buffer:
             return
 
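
With newline handling moved into `initialize()`, `_get_char` above reduces to a bounds-checked cursor read that also records where a token starts (on `<`) and reports noncharacters. A toy model of the new read contract (illustrative, standalone):

    def get_char(buffer: str, pos: int) -> tuple[str | None, int]:
        # EOF: return None and leave the cursor in place.
        if pos >= len(buffer):
            return None, pos
        # Otherwise consume exactly one character.
        return buffer[pos], pos + 1

    assert get_char("<p>", 0) == ("<", 1)
    assert get_char("", 0) == (None, 0)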
@@ -1674,10 +1784,38 @@ class Tokenizer:
         raw_len = len(data)
 
         self.text_buffer.clear()
-
-
-
-
+        # U+0000 NULL is a parse error in text.
+        # Emit one error per NULL at the *actual* character position.
+        if "\0" in data:
+            base_pos = self.text_start_pos
+            search_from = 0
+            while True:
+                idx = data.find("\0", search_from)
+                if idx == -1:
+                    break
+                error_pos = base_pos + idx
+
+                # Compute column at error_pos (1-indexed).
+                last_newline = self.buffer.rfind("\n", 0, error_pos + 1)
+                if last_newline == -1:
+                    column = error_pos + 1
+                else:
+                    column = error_pos - last_newline
+                line = self._get_line_at_pos(error_pos)
+
+                message = generate_error_message("unexpected-null-character")
+                self.errors.append(
+                    ParseError(
+                        "unexpected-null-character",
+                        line=line,
+                        column=column,
+                        category="tokenizer",
+                        message=message,
+                        source_html=self.buffer,
+                    )
+                )
+
+                search_from = idx + 1
 
         # Per HTML5 spec:
         # - RCDATA state (title, textarea): decode character references
@@ -1690,21 +1828,24 @@ class Tokenizer:
             pass
         else:
             if "&" in data:
-
+                report_error = self._emit_error if self.collect_errors else None
+                data = decode_entities_in_text(data, report_error=report_error)
         # Apply XML coercion if enabled
         if self.opts.xml_coercion:
             data = _coerce_text_for_xml(data)
 
         # Record position at END of raw text (1-indexed column = raw_len)
-        self.
+        if self.collect_errors:
+            self._record_text_end_position(raw_len)
+        self.last_token_start_pos = self.text_start_pos
         self.sink.process_characters(data)
         # Note: process_characters never returns Plaintext or RawData
         # State switches happen via _emit_current_tag instead
 
-    def _append_attr_value_char(self, c):
+    def _append_attr_value_char(self, c: str) -> None:
         self.current_attr_value.append(c)
 
-    def _finish_attribute(self):
+    def _finish_attribute(self) -> None:
         attr_name_buffer = self.current_attr_name
         if not attr_name_buffer:
             return
@@ -1728,12 +1869,13 @@ class Tokenizer:
         else:
             value = "".join(attr_value_buffer)
             if self.current_attr_value_has_amp:
-
+                report_error = self._emit_error if self.collect_errors else None
+                value = decode_entities_in_text(value, in_attribute=True, report_error=report_error)
         attrs[name] = value
         attr_value_buffer.clear()
         self.current_attr_value_has_amp = False
 
-    def _emit_current_tag(self):
+    def _emit_current_tag(self) -> bool:
         name_parts = self.current_tag_name
         part_count = len(name_parts)
         # Note: part_count is always >= 1 because fast-path appends before entering TAG_NAME
@@ -1749,6 +1891,8 @@ class Tokenizer:
         tag.name = name
         tag.attrs = attrs
         tag.self_closing = self.current_tag_self_closing
+        tag.start_pos = self.current_token_start_pos
+        self.last_token_start_pos = tag.start_pos
 
         switched_to_rawtext = False
         if self.current_tag_kind == Tag.START:
@@ -1774,7 +1918,8 @@ class Tokenizer:
         # Remember current state before emitting
 
         # Emit token to sink
-        self.
+        if self.collect_errors:
+            self._record_token_position()
         result = self.sink.process_token(tag)
         if result == 1:  # TokenSinkResult.Plaintext
             self.state = self.PLAINTEXT
@@ -1787,16 +1932,18 @@ class Tokenizer:
         self.current_tag_kind = Tag.START
         return switched_to_rawtext
 
-    def _emit_comment(self):
+    def _emit_comment(self) -> None:
         data = "".join(self.current_comment)
         self.current_comment.clear()
         # Apply XML coercion if enabled
         if self.opts.xml_coercion:
             data = _coerce_comment_for_xml(data)
         self._comment_token.data = data
+        self._comment_token.start_pos = self.current_token_start_pos
+        self.last_token_start_pos = self._comment_token.start_pos
         self._emit_token(self._comment_token)
 
-    def _emit_doctype(self):
+    def _emit_doctype(self) -> None:
         name = "".join(self.current_doctype_name) if self.current_doctype_name else None
         # If public_id/system_id is a list (even empty), join it; if None, keep None
         public_id = "".join(self.current_doctype_public) if self.current_doctype_public is not None else None
@@ -1813,19 +1960,18 @@ class Tokenizer:
         self.current_doctype_force_quirks = False
         self._emit_token(DoctypeToken(doctype))
 
-    def _emit_token(self, token):
-        self.
+    def _emit_token(self, token: AnyToken) -> None:
+        if self.collect_errors:
+            self._record_token_position()
         self.sink.process_token(token)
         # Note: process_token never returns Plaintext or RawData for state switches
         # State switches happen via _emit_current_tag checking sink response
 
-    def _record_token_position(self):
+    def _record_token_position(self) -> None:
         """Record current position as 0-indexed column for the last emitted token.
 
         Per the spec, the position should be at the end of the token (after the last char).
         """
-        if not self.collect_errors:
-            return
         # pos points after the last consumed character, which is exactly what we want
         pos = self.pos
         last_newline = self.buffer.rfind("\n", 0, pos)
@@ -1836,14 +1982,12 @@ class Tokenizer:
         self.last_token_line = self._get_line_at_pos(pos)
         self.last_token_column = column
 
-    def _record_text_end_position(self, raw_len):
+    def _record_text_end_position(self, raw_len: int) -> None:
         """Record position at end of text token (after last character).
 
         Uses text_start_pos + raw_len to compute where text ends, matching html5lib's
         behavior of reporting the column of the last character (1-indexed).
         """
-        if not self.collect_errors:
-            return
         # Position of last character of text (0-indexed)
         end_pos = self.text_start_pos + raw_len
         last_newline = self.buffer.rfind("\n", 0, end_pos)
@@ -1854,7 +1998,7 @@ class Tokenizer:
         self.last_token_line = self._get_line_at_pos(end_pos)
         self.last_token_column = column
 
-    def _emit_error(self, code):
+    def _emit_error(self, code: str) -> None:
         if not self.collect_errors:
             return
         # Compute column on-demand: scan backwards to find last newline
@@ -1867,9 +2011,24 @@ class Tokenizer:
 
         message = generate_error_message(code)
         line = self._get_line_at_pos(self.pos)
-        self.errors.append(
+        self.errors.append(
+            ParseError(code, line=line, column=column, category="tokenizer", message=message, source_html=self.buffer)
+        )
 
-    def
+    def _emit_error_at_pos(self, code: str, pos: int) -> None:
+        last_newline = self.buffer.rfind("\n", 0, pos + 1)
+        if last_newline == -1:
+            column = pos + 1
+        else:
+            column = pos - last_newline
+
+        message = generate_error_message(code)
+        line = self._get_line_at_pos(pos)
+        self.errors.append(
+            ParseError(code, line=line, column=column, category="tokenizer", message=message, source_html=self.buffer)
+        )
+
+    def _consume_if(self, literal: str) -> bool:
         end = self.pos + len(literal)
         if end > self.length:
             return False
@@ -1879,7 +2038,7 @@ class Tokenizer:
         self.pos = end
         return True
 
-    def _consume_case_insensitive(self, literal):
+    def _consume_case_insensitive(self, literal: str) -> bool:
         end = self.pos + len(literal)
         if end > self.length:
             return False
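
`_emit_error_at_pos` above derives the 1-indexed column by scanning back to the previous newline with `rfind`. A minimal sketch of that computation (standalone, assuming the same convention as the new code):

    def column_at(buffer: str, pos: int) -> int:
        # Distance from the last "\n" at or before pos; on the first line
        # the column is simply pos + 1.
        last_newline = buffer.rfind("\n", 0, pos + 1)
        return pos + 1 if last_newline == -1 else pos - last_newline

    assert column_at("ab\ncd", 3) == 1  # 'c' starts line 2
    assert column_at("ab\ncd", 4) == 2  # 'd' is column 2 of line 2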
@@ -1889,34 +2048,22 @@ class Tokenizer:
         self.pos = end
         return True
 
-    def _consume_comment_run(self):
+    def _consume_comment_run(self) -> bool:
         # Note: Comments are never reconsumed
         pos = self.pos
         length = self.length
         if pos >= length:
             return False
 
-        # Handle ignore_lf for CRLF sequences
-        if self.ignore_lf and pos < length and self.buffer[pos] == "\n":
-            self.ignore_lf = False
-            pos += 1
-            self.pos = pos
-            if pos >= length:
-                return False
-
         match = _COMMENT_RUN_PATTERN.match(self.buffer, pos)
         if match:
             chunk = match.group(0)
-            # Handle CRLF normalization for comments
-            if "\r" in chunk:
-                chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
-            self.ignore_lf = chunk.endswith("\r")
             self.current_comment.append(chunk)
             self.pos = match.end()
             return True
         return False
 
-    def _state_cdata_section(self):
+    def _state_cdata_section(self) -> bool:
         # CDATA section state - consume characters until we see ']'
         while True:
             c = self._get_char()
@@ -1930,7 +2077,7 @@ class Tokenizer:
                 return False
             self._append_text(c)
 
-    def _state_cdata_section_bracket(self):
+    def _state_cdata_section_bracket(self) -> bool:
         # Seen one ']', check for second ']'
         c = self._get_char()
         if c == "]":
@@ -1947,7 +2094,7 @@ class Tokenizer:
             self.state = self.CDATA_SECTION
             return False
 
-    def _state_cdata_section_end(self):
+    def _state_cdata_section_end(self) -> bool:
         # Seen ']]', check for '>'
         c = self._get_char()
         if c == ">":
@@ -1973,7 +2120,7 @@ class Tokenizer:
             self.state = self.CDATA_SECTION
             return False
 
-    def _state_rcdata(self):
+    def _state_rcdata(self) -> bool:
         buffer = self.buffer
         length = self.length
         pos = self.pos
@@ -2004,7 +2151,7 @@ class Tokenizer:
             # Consume everything up to the special character
             if next_special > pos:
                 chunk = buffer[pos:next_special]
-                self._append_text_chunk(chunk
+                self._append_text_chunk(chunk)
                 pos = next_special
                 self.pos = pos
 
@@ -2016,7 +2163,6 @@ class Tokenizer:
 
             # Handle special characters - we're at one of them after find()
             if null_index == pos:
-                self.ignore_lf = False
                 self._emit_error("unexpected-null-character")
                 self._append_text("\ufffd")
                 pos += 1
@@ -2034,7 +2180,7 @@ class Tokenizer:
             self.state = self.RCDATA_LESS_THAN_SIGN
             return False
 
-    def _state_rcdata_less_than_sign(self):
+    def _state_rcdata_less_than_sign(self) -> bool:
         c = self._get_char()
         if c == "/":
             self.current_tag_name.clear()
@@ -2045,7 +2191,7 @@ class Tokenizer:
             self.state = self.RCDATA
             return False
 
-    def _state_rcdata_end_tag_open(self):
+    def _state_rcdata_end_tag_open(self) -> bool:
         c = self._get_char()
         if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
             self.current_tag_name.append(c.lower())
@@ -2057,7 +2203,7 @@ class Tokenizer:
             self.state = self.RCDATA
             return False
 
-    def _state_rcdata_end_tag_name(self):
+    def _state_rcdata_end_tag_name(self) -> bool:
         # Check if this matches the opening tag name
         while True:
             c = self._get_char()
@@ -2069,7 +2215,7 @@ class Tokenizer:
             tag_name = "".join(self.current_tag_name)
             if tag_name == self.rawtext_tag_name:
                 if c == ">":
-                    attrs =
+                    attrs: dict[str, str | None] = {}
                     tag = Tag(Tag.END, tag_name, attrs, False)
                     self._flush_text()
                     self._emit_token(tag)
@@ -2110,7 +2256,7 @@ class Tokenizer:
             self.state = self.RCDATA
             return False
 
-    def _state_rawtext(self):
+    def _state_rawtext(self) -> bool:
         buffer = self.buffer
         length = self.length
         pos = self.pos
@@ -2131,9 +2277,7 @@ class Tokenizer:
             if null_index != -1 and null_index < next_special:
                 if null_index > pos:
                     chunk = buffer[pos:null_index]
-                    self._append_text_chunk(chunk
-                else:
-                    self.ignore_lf = False
+                    self._append_text_chunk(chunk)
                 self._emit_error("unexpected-null-character")
                 self._append_text("\ufffd")
                 pos = null_index + 1
@@ -2142,14 +2286,14 @@ class Tokenizer:
             if lt_index == -1:
                 if pos < length:
                     chunk = buffer[pos:length]
-                    self._append_text_chunk(chunk
+                    self._append_text_chunk(chunk)
                 self.pos = length
                 self._flush_text()
                 self._emit_token(EOFToken())
                 return True
             if lt_index > pos:
                 chunk = buffer[pos:lt_index]
-                self._append_text_chunk(chunk
+                self._append_text_chunk(chunk)
             pos = lt_index + 1
             self.pos = pos
             # Handle script escaped transition before treating '<' as markup boundary
@@ -2167,7 +2311,7 @@ class Tokenizer:
             self.state = self.RAWTEXT_LESS_THAN_SIGN
             return False
 
-    def _state_rawtext_less_than_sign(self):
+    def _state_rawtext_less_than_sign(self) -> bool:
         c = self._get_char()
         if c == "/":
             self.current_tag_name.clear()
@@ -2178,7 +2322,7 @@ class Tokenizer:
             self.state = self.RAWTEXT
             return False
 
-    def _state_rawtext_end_tag_open(self):
+    def _state_rawtext_end_tag_open(self) -> bool:
         c = self._get_char()
         if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
             self.current_tag_name.append(c.lower())
@@ -2190,7 +2334,7 @@ class Tokenizer:
             self.state = self.RAWTEXT
             return False
 
-    def _state_rawtext_end_tag_name(self):
+    def _state_rawtext_end_tag_name(self) -> bool:
         # Check if this matches the opening tag name
         while True:
             c = self._get_char()
@@ -2202,7 +2346,7 @@ class Tokenizer:
             tag_name = "".join(self.current_tag_name)
             if tag_name == self.rawtext_tag_name:
                 if c == ">":
-                    attrs =
+                    attrs: dict[str, str | None] = {}
                     tag = Tag(Tag.END, tag_name, attrs, False)
                     self._flush_text()
                     self._emit_token(tag)
@@ -2243,7 +2387,7 @@ class Tokenizer:
             self.state = self.RAWTEXT
             return False
 
-    def _state_plaintext(self):
|
|
2390
|
+
def _state_plaintext(self) -> bool:
|
|
2247
2391
|
# PLAINTEXT state - consume everything as text, no end tag
|
|
2248
2392
|
if self.pos < self.length:
|
|
2249
2393
|
remaining = self.buffer[self.pos :]
|
|
@@ -2257,7 +2401,7 @@ class Tokenizer:
|
|
|
2257
2401
|
self._emit_token(EOFToken())
|
|
2258
2402
|
return True
|
|
2259
2403
|
|
|
2260
|
-
def _state_script_data_escaped(self):
|
|
2404
|
+
def _state_script_data_escaped(self) -> bool:
|
|
2261
2405
|
c = self._get_char()
|
|
2262
2406
|
if c is None:
|
|
2263
2407
|
self._flush_text()
|
|
@@ -2277,7 +2421,7 @@ class Tokenizer:
|
|
|
2277
2421
|
self._append_text(c)
|
|
2278
2422
|
return False
|
|
2279
2423
|
|
|
2280
|
-
def _state_script_data_escaped_dash(self):
|
|
2424
|
+
def _state_script_data_escaped_dash(self) -> bool:
|
|
2281
2425
|
c = self._get_char()
|
|
2282
2426
|
if c is None:
|
|
2283
2427
|
self._flush_text()
|
|
@@ -2299,7 +2443,7 @@ class Tokenizer:
|
|
|
2299
2443
|
self.state = self.SCRIPT_DATA_ESCAPED
|
|
2300
2444
|
return False
|
|
2301
2445
|
|
|
2302
|
-
def _state_script_data_escaped_dash_dash(self):
|
|
2446
|
+
def _state_script_data_escaped_dash_dash(self) -> bool:
|
|
2303
2447
|
c = self._get_char()
|
|
2304
2448
|
if c is None:
|
|
2305
2449
|
self._flush_text()
|
|
@@ -2325,7 +2469,7 @@ class Tokenizer:
|
|
|
2325
2469
|
self.state = self.SCRIPT_DATA_ESCAPED
|
|
2326
2470
|
return False
|
|
2327
2471
|
|
|
2328
|
-
def _state_script_data_escaped_less_than_sign(self):
|
|
2472
|
+
def _state_script_data_escaped_less_than_sign(self) -> bool:
|
|
2329
2473
|
c = self._get_char()
|
|
2330
2474
|
if c == "/":
|
|
2331
2475
|
self.temp_buffer.clear()
|
|
@@ -2343,7 +2487,7 @@ class Tokenizer:
|
|
|
2343
2487
|
|
|
2344
2488
|
return False
|
|
2345
2489
|
|
|
2346
|
-
def _state_script_data_escaped_end_tag_open(self):
|
|
2490
|
+
def _state_script_data_escaped_end_tag_open(self) -> bool:
|
|
2347
2491
|
c = self._get_char()
|
|
2348
2492
|
if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
|
|
2349
2493
|
self.current_tag_name.clear()
|
|
@@ -2356,7 +2500,7 @@ class Tokenizer:
|
|
|
2356
2500
|
self.state = self.SCRIPT_DATA_ESCAPED
|
|
2357
2501
|
return False
|
|
2358
2502
|
|
|
2359
|
-
def _state_script_data_escaped_end_tag_name(self):
|
|
2503
|
+
def _state_script_data_escaped_end_tag_name(self) -> bool:
|
|
2360
2504
|
c = self._get_char()
|
|
2361
2505
|
if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
|
|
2362
2506
|
self.current_tag_name.append(c.lower())
|
|
@@ -2381,7 +2525,7 @@ class Tokenizer:
|
|
|
2381
2525
|
return False
|
|
2382
2526
|
if c == ">":
|
|
2383
2527
|
self._flush_text()
|
|
2384
|
-
attrs =
|
|
2528
|
+
attrs: dict[str, str | None] = {}
|
|
2385
2529
|
tag = Tag(Tag.END, tag_name, attrs, False)
|
|
2386
2530
|
self._emit_token(tag)
|
|
2387
2531
|
self.state = self.DATA
|
|
@@ -2397,7 +2541,7 @@ class Tokenizer:
|
|
|
2397
2541
|
self.state = self.SCRIPT_DATA_ESCAPED
|
|
2398
2542
|
return False
|
|
2399
2543
|
|
|
2400
|
-
def _state_script_data_double_escape_start(self):
|
|
2544
|
+
def _state_script_data_double_escape_start(self) -> bool:
|
|
2401
2545
|
c = self._get_char()
|
|
2402
2546
|
if c in (" ", "\t", "\n", "\r", "\f", "/", ">"):
|
|
2403
2547
|
# Check if temp_buffer contains "script"
|
|
@@ -2416,7 +2560,7 @@ class Tokenizer:
|
|
|
2416
2560
|
self.state = self.SCRIPT_DATA_ESCAPED
|
|
2417
2561
|
return False
|
|
2418
2562
|
|
|
2419
|
-
def _state_script_data_double_escaped(self):
|
|
2563
|
+
def _state_script_data_double_escaped(self) -> bool:
|
|
2420
2564
|
c = self._get_char()
|
|
2421
2565
|
if c is None:
|
|
2422
2566
|
self._flush_text()
|
|
@@ -2437,7 +2581,7 @@ class Tokenizer:
|
|
|
2437
2581
|
self._append_text(c)
|
|
2438
2582
|
return False
|
|
2439
2583
|
|
|
2440
|
-
def _state_script_data_double_escaped_dash(self):
|
|
2584
|
+
def _state_script_data_double_escaped_dash(self) -> bool:
|
|
2441
2585
|
c = self._get_char()
|
|
2442
2586
|
if c is None:
|
|
2443
2587
|
self._flush_text()
|
|
@@ -2460,7 +2604,7 @@ class Tokenizer:
|
|
|
2460
2604
|
self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
|
|
2461
2605
|
return False
|
|
2462
2606
|
|
|
2463
|
-
def _state_script_data_double_escaped_dash_dash(self):
|
|
2607
|
+
def _state_script_data_double_escaped_dash_dash(self) -> bool:
|
|
2464
2608
|
c = self._get_char()
|
|
2465
2609
|
if c is None:
|
|
2466
2610
|
self._flush_text()
|
|
@@ -2488,7 +2632,7 @@ class Tokenizer:
|
|
|
2488
2632
|
self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
|
|
2489
2633
|
return False
|
|
2490
2634
|
|
|
2491
|
-
def _state_script_data_double_escaped_less_than_sign(self):
|
|
2635
|
+
def _state_script_data_double_escaped_less_than_sign(self) -> bool:
|
|
2492
2636
|
c = self._get_char()
|
|
2493
2637
|
if c == "/":
|
|
2494
2638
|
self.temp_buffer.clear()
|
|
@@ -2504,7 +2648,7 @@ class Tokenizer:
|
|
|
2504
2648
|
self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
|
|
2505
2649
|
return False
|
|
2506
2650
|
|
|
2507
|
-
def _state_script_data_double_escape_end(self):
|
|
2651
|
+
def _state_script_data_double_escape_end(self) -> bool:
|
|
2508
2652
|
c = self._get_char()
|
|
2509
2653
|
if c in (" ", "\t", "\n", "\r", "\f", "/", ">"):
|
|
2510
2654
|
# Check if temp_buffer contains "script"
|
|
@@ -2525,7 +2669,7 @@ class Tokenizer:
|
|
|
2525
2669
|
return False
|
|
2526
2670
|
|
|
2527
2671
|
|
|
2528
|
-
Tokenizer._STATE_HANDLERS = [
|
|
2672
|
+
Tokenizer._STATE_HANDLERS = [ # type: ignore[attr-defined]
|
|
2529
2673
|
Tokenizer._state_data,
|
|
2530
2674
|
Tokenizer._state_tag_open,
|
|
2531
2675
|
Tokenizer._state_end_tag_open,
|