justhtml 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- justhtml/__init__.py +17 -0
- justhtml/__main__.py +29 -0
- justhtml/constants.py +441 -0
- justhtml/context.py +6 -0
- justhtml/entities.py +342 -0
- justhtml/errors.py +138 -0
- justhtml/node.py +208 -0
- justhtml/parser.py +86 -0
- justhtml/selector.py +925 -0
- justhtml/serialize.py +201 -0
- justhtml/stream.py +83 -0
- justhtml/tokenizer.py +2590 -0
- justhtml/tokens.py +175 -0
- justhtml/treebuilder.py +1231 -0
- justhtml/treebuilder_modes.py +2012 -0
- justhtml/treebuilder_utils.py +86 -0
- justhtml-0.6.0.dist-info/METADATA +126 -0
- justhtml-0.6.0.dist-info/RECORD +20 -0
- justhtml-0.6.0.dist-info/WHEEL +4 -0
- justhtml-0.6.0.dist-info/licenses/LICENSE +21 -0
justhtml/tokenizer.py
ADDED
|
@@ -0,0 +1,2590 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from bisect import bisect_right
|
|
3
|
+
|
|
4
|
+
from .entities import decode_entities_in_text
|
|
5
|
+
from .errors import generate_error_message
|
|
6
|
+
from .tokens import CommentToken, Doctype, DoctypeToken, EOFToken, ParseError, Tag
|
|
7
|
+
|
|
8
|
+
# Character set that _ATTR_VALUE_UNQUOTED_PATTERN (below) is built from.
_ATTR_VALUE_UNQUOTED_TERMINATORS = "\t\n\f >&\"'<=`\r\0"

# str.translate table mapping ASCII "A".."Z" onto "a".."z"; every other
# character is left untouched by translate().
_ASCII_LOWER_TABLE = str.maketrans({chr(cp): chr(cp + 32) for cp in range(ord("A"), ord("Z") + 1)})

_RCDATA_ELEMENTS = {"textarea", "title"}
_RAWTEXT_SWITCH_TAGS = {"iframe", "noembed", "noframes", "script", "style", "textarea", "title", "xmp"}

# Each *_VALUE_* pattern matches a single character from its stop set.
_ATTR_VALUE_DOUBLE_PATTERN = re.compile(r'["&\0]')
_ATTR_VALUE_SINGLE_PATTERN = re.compile(r"['&\0]")
_ATTR_VALUE_UNQUOTED_PATTERN = re.compile(f"[{re.escape(_ATTR_VALUE_UNQUOTED_TERMINATORS)}]")

# "Run" patterns match one or more characters that are NOT in the listed set,
# so a whole run can be consumed with a single regex call.
_TAG_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />\0\r]+")
_ATTR_NAME_RUN_PATTERN = re.compile(r"[^\t\n\f />=\0\"'<\r]+")
_COMMENT_RUN_PATTERN = re.compile(r"[^-\0]+")
_WHITESPACE_PATTERN = re.compile(r"[ \t\n\f]+")

# XML coercion: collect the last two code points (U+xFFFE and U+xFFFF) of each
# of the 17 Unicode planes, then build one character class that also covers
# form feed and the U+FDD0..U+FDEF range.
_xml_invalid_single_chars = []
for _plane in range(17):
    _base = _plane << 16
    _xml_invalid_single_chars.extend((chr(_base + 0xFFFE), chr(_base + 0xFFFF)))

_XML_COERCION_PATTERN = re.compile(r"[\f\uFDD0-\uFDEF" + "".join(_xml_invalid_single_chars) + "]")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _xml_coercion_callback(match):
    """Return the replacement text for one matched character.

    A form feed is replaced by a single space; any other matched character
    is replaced by U+FFFD.
    """
    return " " if match.group(0) == "\f" else "\ufffd"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _coerce_text_for_xml(text):
    """Return *text* with XML coercion applied.

    Pure-ASCII input only needs form feeds turned into spaces. Other input
    is run through _XML_COERCION_PATTERN, and is returned unchanged when the
    pattern finds nothing to replace.
    """
    if text.isascii():
        # Form feed is the only ASCII character the coercion touches.
        return text.replace("\f", " ") if "\f" in text else text

    if _XML_COERCION_PATTERN.search(text) is None:
        return text
    return _XML_COERCION_PATTERN.sub(_xml_coercion_callback, text)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _coerce_comment_for_xml(text):
    """Return *text* with every double hyphen broken up for XML output.

    Each "--" is rewritten as "- -". The returned string never contains
    "--", whatever the length of the hyphen run in the input.
    """
    # str.replace scans left to right without overlapping, so a single pass
    # turns "---" into "- --", which still holds a double hyphen. Repeat
    # until none are left; every pass strictly reduces the number of
    # adjacent hyphen pairs, so the loop terminates.
    while "--" in text:
        text = text.replace("--", "- -")
    return text
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class TokenizerOpts:
    """Plain option holder for the tokenizer.

    ``exact_errors``, ``discard_bom`` and ``xml_coercion`` are coerced to
    ``bool``; ``initial_state`` and ``initial_rawtext_tag`` are stored
    exactly as passed in.
    """

    __slots__ = ("discard_bom", "exact_errors", "initial_rawtext_tag", "initial_state", "xml_coercion")

    def __init__(
        self,
        exact_errors=False,
        discard_bom=True,
        initial_state=None,
        initial_rawtext_tag=None,
        xml_coercion=False,
    ):
        # Boolean switches are normalised so later checks see real bools.
        self.exact_errors = bool(exact_errors)
        self.discard_bom = bool(discard_bom)
        self.xml_coercion = bool(xml_coercion)

        # Pass-through settings.
        self.initial_state = initial_state
        self.initial_rawtext_tag = initial_rawtext_tag
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class Tokenizer:
|
|
87
|
+
DATA = 0
|
|
88
|
+
TAG_OPEN = 1
|
|
89
|
+
END_TAG_OPEN = 2
|
|
90
|
+
TAG_NAME = 3
|
|
91
|
+
BEFORE_ATTRIBUTE_NAME = 4
|
|
92
|
+
ATTRIBUTE_NAME = 5
|
|
93
|
+
AFTER_ATTRIBUTE_NAME = 6
|
|
94
|
+
BEFORE_ATTRIBUTE_VALUE = 7
|
|
95
|
+
ATTRIBUTE_VALUE_DOUBLE = 8
|
|
96
|
+
ATTRIBUTE_VALUE_SINGLE = 9
|
|
97
|
+
ATTRIBUTE_VALUE_UNQUOTED = 10
|
|
98
|
+
AFTER_ATTRIBUTE_VALUE_QUOTED = 11
|
|
99
|
+
SELF_CLOSING_START_TAG = 12
|
|
100
|
+
MARKUP_DECLARATION_OPEN = 13
|
|
101
|
+
COMMENT_START = 14
|
|
102
|
+
COMMENT_START_DASH = 15
|
|
103
|
+
COMMENT = 16
|
|
104
|
+
COMMENT_END_DASH = 17
|
|
105
|
+
COMMENT_END = 18
|
|
106
|
+
COMMENT_END_BANG = 19
|
|
107
|
+
BOGUS_COMMENT = 20
|
|
108
|
+
DOCTYPE = 21
|
|
109
|
+
BEFORE_DOCTYPE_NAME = 22
|
|
110
|
+
DOCTYPE_NAME = 23
|
|
111
|
+
AFTER_DOCTYPE_NAME = 24
|
|
112
|
+
BOGUS_DOCTYPE = 25
|
|
113
|
+
AFTER_DOCTYPE_PUBLIC_KEYWORD = 26
|
|
114
|
+
AFTER_DOCTYPE_SYSTEM_KEYWORD = 27
|
|
115
|
+
BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 28
|
|
116
|
+
DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 29
|
|
117
|
+
DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 30
|
|
118
|
+
AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 31
|
|
119
|
+
BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 32
|
|
120
|
+
BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 33
|
|
121
|
+
DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 34
|
|
122
|
+
DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 35
|
|
123
|
+
AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 36
|
|
124
|
+
CDATA_SECTION = 37
|
|
125
|
+
CDATA_SECTION_BRACKET = 38
|
|
126
|
+
CDATA_SECTION_END = 39
|
|
127
|
+
RCDATA = 40
|
|
128
|
+
RCDATA_LESS_THAN_SIGN = 41
|
|
129
|
+
RCDATA_END_TAG_OPEN = 42
|
|
130
|
+
RCDATA_END_TAG_NAME = 43
|
|
131
|
+
RAWTEXT = 44
|
|
132
|
+
RAWTEXT_LESS_THAN_SIGN = 45
|
|
133
|
+
RAWTEXT_END_TAG_OPEN = 46
|
|
134
|
+
RAWTEXT_END_TAG_NAME = 47
|
|
135
|
+
PLAINTEXT = 48
|
|
136
|
+
SCRIPT_DATA_ESCAPED = 49
|
|
137
|
+
SCRIPT_DATA_ESCAPED_DASH = 50
|
|
138
|
+
SCRIPT_DATA_ESCAPED_DASH_DASH = 51
|
|
139
|
+
SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 52
|
|
140
|
+
SCRIPT_DATA_ESCAPED_END_TAG_OPEN = 53
|
|
141
|
+
SCRIPT_DATA_ESCAPED_END_TAG_NAME = 54
|
|
142
|
+
SCRIPT_DATA_DOUBLE_ESCAPE_START = 55
|
|
143
|
+
SCRIPT_DATA_DOUBLE_ESCAPED = 56
|
|
144
|
+
SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 57
|
|
145
|
+
SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 58
|
|
146
|
+
SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 59
|
|
147
|
+
SCRIPT_DATA_DOUBLE_ESCAPE_END = 60
|
|
148
|
+
|
|
149
|
+
__slots__ = (
|
|
150
|
+
"_comment_token",
|
|
151
|
+
"_newline_positions",
|
|
152
|
+
"_state_handlers",
|
|
153
|
+
"_tag_token",
|
|
154
|
+
"buffer",
|
|
155
|
+
"collect_errors",
|
|
156
|
+
"current_attr_name",
|
|
157
|
+
"current_attr_value",
|
|
158
|
+
"current_attr_value_has_amp",
|
|
159
|
+
"current_char",
|
|
160
|
+
"current_comment",
|
|
161
|
+
"current_doctype_force_quirks",
|
|
162
|
+
"current_doctype_name",
|
|
163
|
+
"current_doctype_public",
|
|
164
|
+
"current_doctype_system",
|
|
165
|
+
"current_tag_attrs",
|
|
166
|
+
"current_tag_kind",
|
|
167
|
+
"current_tag_name",
|
|
168
|
+
"current_tag_self_closing",
|
|
169
|
+
"errors",
|
|
170
|
+
"ignore_lf",
|
|
171
|
+
"last_start_tag_name",
|
|
172
|
+
"last_token_column",
|
|
173
|
+
"last_token_line",
|
|
174
|
+
"length",
|
|
175
|
+
"opts",
|
|
176
|
+
"original_tag_name",
|
|
177
|
+
"pos",
|
|
178
|
+
"rawtext_tag_name",
|
|
179
|
+
"reconsume",
|
|
180
|
+
"sink",
|
|
181
|
+
"state",
|
|
182
|
+
"temp_buffer",
|
|
183
|
+
"text_buffer",
|
|
184
|
+
"text_start_pos",
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
# _STATE_HANDLERS is defined at the end of the file
|
|
188
|
+
|
|
189
|
+
def __init__(self, sink, opts=None, collect_errors=False):
    """Create a tokenizer that reports to *sink*.

    *opts* falls back to a default ``TokenizerOpts()`` when it is falsy.
    *collect_errors* is stored as given. All scanning and token-building
    state starts out empty.
    """
    self.sink = sink
    self.opts = opts if opts else TokenizerOpts()
    self.collect_errors = collect_errors
    self.errors = []

    # Input buffer and cursor.
    self.state = self.DATA
    self.buffer = ""
    self.length = self.pos = 0
    self.current_char = ""
    self.reconsume = self.ignore_lf = False
    self.last_token_line = 1
    self.last_token_column = 0

    # Reusable buffers to avoid per-token allocations.
    self.text_buffer = []
    self.text_start_pos = 0
    self.temp_buffer = []

    # Tag currently being built.
    self.current_tag_kind = Tag.START
    self.current_tag_name = []
    self.current_tag_attrs = {}
    self.current_tag_self_closing = False
    self.current_attr_name = []
    self.current_attr_value = []
    self.current_attr_value_has_amp = False
    self.original_tag_name = []
    self.last_start_tag_name = None
    self.rawtext_tag_name = None

    # Comment / doctype currently being built.
    self.current_comment = []
    self.current_doctype_name = []
    self.current_doctype_public = None  # None = not set, [] = empty string
    self.current_doctype_system = None  # None = not set, [] = empty string
    self.current_doctype_force_quirks = False

    # Token objects created once up front.
    self._tag_token = Tag(Tag.START, "", {}, False)
    self._comment_token = CommentToken("")
|
|
226
|
+
|
|
227
|
+
def initialize(self, html):
    """Load *html* and reset the per-document tokenizer state.

    A leading U+FEFF is dropped when ``opts.discard_bom`` is set, and a
    falsy *html* is treated as the empty string. The start state is
    ``opts.initial_state`` when that is an int, otherwise DATA. When
    ``collect_errors`` is set, the offsets of all "\\n" characters are
    recorded in ``_newline_positions``; otherwise that attribute is None.
    """
    if html and html[0] == "\ufeff" and self.opts.discard_bom:
        html = html[1:]

    # Input buffer and cursor.
    self.buffer = html or ""
    self.length = len(self.buffer)
    self.pos = self.text_start_pos = 0
    self.current_char = ""
    self.reconsume = self.ignore_lf = False
    self.last_token_line = 1
    self.last_token_column = 0
    self.errors = []

    # The list buffers are emptied in place rather than replaced.
    for scratch in (
        self.text_buffer,
        self.current_tag_name,
        self.current_attr_name,
        self.current_attr_value,
        self.current_comment,
        self.current_doctype_name,
        self.temp_buffer,
    ):
        scratch.clear()

    # Tag / doctype build state.
    self.current_tag_attrs = {}
    self.current_attr_value_has_amp = False
    self.current_tag_self_closing = False
    self.current_tag_kind = Tag.START
    self.current_doctype_public = None
    self.current_doctype_system = None
    self.current_doctype_force_quirks = False
    self.rawtext_tag_name = self.opts.initial_rawtext_tag
    self.last_start_tag_name = None

    # Put the shared tag token back into its blank start-tag form.
    tag_token = self._tag_token
    tag_token.kind = Tag.START
    tag_token.name = ""
    tag_token.attrs = {}
    tag_token.self_closing = False

    requested_state = self.opts.initial_state
    self.state = requested_state if isinstance(requested_state, int) else self.DATA

    # Pre-compute newline positions for O(log n) line lookups.
    if not self.collect_errors:
        self._newline_positions = None
        return

    source = self.buffer
    newline_offsets = []
    found = source.find("\n")
    while found != -1:
        newline_offsets.append(found)
        found = source.find("\n", found + 1)
    self._newline_positions = newline_offsets
|
|
280
|
+
|
|
281
|
+
def _get_line_at_pos(self, pos):
    """Return the 1-indexed line number of buffer offset *pos*.

    The answer is one more than the number of recorded newline offsets
    that lie strictly before *pos*, found by binary search over
    ``self._newline_positions``.
    """
    newlines_before = bisect_right(self._newline_positions, pos - 1)
    return newlines_before + 1
|
|
285
|
+
|
|
286
|
+
def step(self):
    """Run the handler registered for the current state and return its result.

    The handler is looked up in ``_STATE_HANDLERS`` by ``self.state`` and
    called with this tokenizer; a true result means EOF was reached.
    """
    return self._STATE_HANDLERS[self.state](self)
|
|
290
|
+
|
|
291
|
+
def run(self, html):
    """Tokenize *html* from start to finish.

    Calls ``initialize(html)`` once, then keeps calling ``step()`` until it
    returns a true value.
    """
    self.initialize(html)
    finished = False
    while not finished:
        finished = self.step()
|
|
296
|
+
|
|
297
|
+
# ---------------------
|
|
298
|
+
# Helper methods
|
|
299
|
+
# ---------------------
|
|
300
|
+
|
|
301
|
+
def _peek_char(self, offset):
    """Return the character *offset* places past the cursor without consuming it.

    Returns None when that position is at or beyond the end of the buffer.
    """
    target = self.pos + offset
    return self.buffer[target] if target < self.length else None
|
|
307
|
+
|
|
308
|
+
def _append_text_chunk(self, chunk, *, ends_with_cr=False):
    """Append *chunk* to the pending text and record the CR flag.

    *ends_with_cr* (keyword-only) is stored in ``ignore_lf`` after the
    chunk has been handed to ``_append_text``.
    """
    self._append_text(chunk)
    self.ignore_lf = ends_with_cr
|
|
311
|
+
|
|
312
|
+
# ---------------------
|
|
313
|
+
# State handlers
|
|
314
|
+
# ---------------------
|
|
315
|
+
|
|
316
|
+
def _state_data(self):
    """DATA state: collect character data, then dispatch on "<".

    Everything up to the next "<" is appended to the pending text with CR
    and CRLF normalised to LF. A "<" followed by an ASCII letter, by "!--",
    or by "/" plus an ASCII letter enters the matching tag or comment state
    directly; any other "<" is handed to the TAG_OPEN handler.

    Returns True once the EOF token has been emitted, otherwise the result
    of the handler control is passed to.
    """
    text = self.buffer
    total = self.length
    cursor = self.pos
    while True:
        if self.reconsume:
            # Step back onto the character that was handed back to us.
            self.reconsume = False
            self.pos -= 1
            cursor = self.pos

        if cursor >= total:
            self.pos = total
            self.current_char = None
            self._flush_text()
            self._emit_token(EOFToken())
            return True

        # Jump straight to the next "<", or to the end of input.
        lt_index = text.find("<", cursor)
        stop = total if lt_index == -1 else lt_index

        if stop > cursor:
            run = text[cursor:stop]
            if "\r" in run:
                run = run.replace("\r\n", "\n").replace("\r", "\n")
            self._append_text(run)
            self.ignore_lf = run.endswith("\r")

            cursor = stop
            self.pos = cursor
            if cursor >= total:
                # Nothing but text was left; the next pass emits EOF.
                continue

        # text[cursor] is the "<" located by find() above.
        self.current_char = text[cursor]
        cursor += 1
        self.pos = cursor
        self.ignore_lf = False

        if cursor < total:
            following = text[cursor]
            if ("a" <= following <= "z") or ("A" <= following <= "Z"):
                self._flush_text()
                # Inlined _start_tag(Tag.START)
                self.current_tag_kind = Tag.START
                self.current_tag_name.clear()
                self.current_attr_name.clear()
                self.current_attr_value.clear()
                self.current_attr_value_has_amp = False
                self.current_tag_self_closing = False

                if "A" <= following <= "Z":
                    following = following.lower()
                self.current_tag_name.append(following)
                self.pos += 1
                self.state = self.TAG_NAME
                return self._state_tag_name()

            if following == "!":
                # "<!--" opens a comment without going through TAG_OPEN.
                if text.startswith("!--", cursor):
                    self._flush_text()
                    self.pos += 3  # Consume !--
                    self.current_comment.clear()
                    self.state = self.COMMENT_START
                    return self._state_comment_start()
            elif following == "/" and cursor + 1 < total:
                after_slash = text[cursor + 1]
                if ("a" <= after_slash <= "z") or ("A" <= after_slash <= "Z"):
                    self._flush_text()
                    # Inlined _start_tag(Tag.END)
                    self.current_tag_kind = Tag.END
                    self.current_tag_name.clear()
                    self.current_attr_name.clear()
                    self.current_attr_value.clear()
                    self.current_attr_value_has_amp = False
                    self.current_tag_self_closing = False

                    if "A" <= after_slash <= "Z":
                        after_slash = after_slash.lower()
                    self.current_tag_name.append(after_slash)
                    self.pos += 2  # Consume / and the first name character
                    self.state = self.TAG_NAME
                    return self._state_tag_name()

        # No shortcut applied: let TAG_OPEN examine what follows "<".
        self._flush_text()
        self.state = self.TAG_OPEN
        return self._state_tag_open()
|
|
417
|
+
|
|
418
|
+
def _state_tag_open(self):
    """TAG_OPEN state: decide what the character after "<" starts.

    "!" and "/" switch to the markup-declaration and end-tag-open states,
    "?" starts a bogus comment, and anything else is reported as an error
    and re-read as text after a literal "<". At end of input the "<" is
    emitted as text followed by the EOF token.

    Returns True after emitting EOF, otherwise False.
    """
    ch = self._get_char()
    if ch is None:
        self._emit_error("eof-before-tag-name")
        self._append_text("<")
        self._flush_text()
        self._emit_token(EOFToken())
        return True

    if ch == "!":
        self.state = self.MARKUP_DECLARATION_OPEN
    elif ch == "/":
        self.state = self.END_TAG_OPEN
    elif ch == "?":
        self._emit_error("unexpected-question-mark-instead-of-tag-name")
        self.current_comment.clear()
        self._reconsume_current()
        self.state = self.BOGUS_COMMENT
    else:
        # Not a tag after all: keep the "<" as text and re-read the character.
        self._emit_error("invalid-first-character-of-tag-name")
        self._append_text("<")
        self._reconsume_current()
        self.state = self.DATA
    return False
|
|
444
|
+
|
|
445
|
+
def _state_end_tag_open(self):
    """END_TAG_OPEN state: handle the character following "</".

    ">" is reported as an empty end tag and tokenizing continues in DATA.
    Any other character is reported as an error and re-read as the start of
    a bogus comment. At end of input "<" and "/" are emitted as text
    followed by the EOF token.

    Returns True after emitting EOF, otherwise False.
    """
    ch = self._get_char()
    if ch is None:
        self._emit_error("eof-before-tag-name")
        self._append_text("<")
        self._append_text("/")
        self._flush_text()
        self._emit_token(EOFToken())
        return True

    if ch == ">":
        self._emit_error("empty-end-tag")
        self.state = self.DATA
    else:
        self._emit_error("invalid-first-character-of-tag-name")
        self.current_comment.clear()
        self._reconsume_current()
        self.state = self.BOGUS_COMMENT
    return False
|
|
464
|
+
|
|
465
|
+
def _state_tag_name(self):
    """TAG_NAME state: read the remainder of the current tag's name.

    Runs of ordinary name characters are consumed with one regex match and
    ASCII-lowercased; the terminator that follows selects the next state.
    A NUL is reported and stored as U+FFFD.

    Returns True after emitting EOF, False once the tag has been emitted on
    ">", otherwise the result of the next state's handler.
    """
    push_name = self.current_tag_name.append
    text = self.buffer
    total = self.length

    while True:
        # Fast path: grab a whole run of name characters at once.
        # Note: reconsume and ignore_lf are never True when entering TAG_NAME
        start = self.pos
        if start < total and text[start] not in "\t\n\f />\0\r":
            run = _TAG_NAME_RUN_PATTERN.match(text, start)
            if run:
                part = run.group(0)
                if not part.islower():
                    part = part.translate(_ASCII_LOWER_TABLE)
                push_name(part)
                self.pos = run.end()

                if self.pos < total:
                    following = text[self.pos]
                    if following in (" ", "\t", "\n", "\f", "\r"):
                        self.pos += 1
                        if following == "\r":
                            self.ignore_lf = True
                        self.state = self.BEFORE_ATTRIBUTE_NAME
                        return self._state_before_attribute_name()
                    if following == ">":
                        self.pos += 1
                        # A false result from _emit_current_tag means we continue in DATA.
                        if not self._emit_current_tag():
                            self.state = self.DATA
                        return False
                    if following == "/":
                        self.pos += 1
                        self.state = self.SELF_CLOSING_START_TAG
                        return self._state_self_closing_start_tag()

        # Slow path: one character at a time through _get_char().
        ch = self._get_char()
        if ch is None:
            # EOF inside a tag name: the unfinished tag is dropped and only
            # the EOF token is emitted.
            self._emit_error("eof-in-tag")
            self._emit_token(EOFToken())
            return True
        if ch in ("\t", "\n", "\f", " "):
            self.state = self.BEFORE_ATTRIBUTE_NAME
            return self._state_before_attribute_name()
        if ch == "/":
            self.state = self.SELF_CLOSING_START_TAG
            return self._state_self_closing_start_tag()
        if ch == ">":
            # Here the result of _emit_current_tag is ignored and DATA is
            # always selected (the original notes no rawtext element can be
            # named on this path).
            self._emit_current_tag()
            self.state = self.DATA
            return False

        # Whatever is left is handled as NUL, as in the original.
        self._emit_error("unexpected-null-character")
        push_name("\ufffd")
|
|
528
|
+
|
|
529
|
+
def _state_before_attribute_name(self):
    """BEFORE_ATTRIBUTE_NAME state: skip whitespace, then start an attribute or end the tag.

    Returns True after emitting EOF, otherwise False. The follow-up state
    is stored in ``self.state`` for the main loop to dispatch.
    """
    text = self.buffer
    total = self.length

    while True:
        # Skip a whole run of whitespace with one regex call when possible.
        if not (self.reconsume or self.ignore_lf) and self.pos < total and text[self.pos] in " \t\n\f":
            blanks = _WHITESPACE_PATTERN.match(text, self.pos)
            if blanks:
                self.pos = blanks.end()

        # Inlined _get_char
        if self.reconsume:  # pragma: no cover
            self.reconsume = False
            ch = self.current_char
        elif self.pos >= total:
            ch = None
        else:
            ch = text[self.pos]
            self.pos += 1
        self.current_char = ch

        if ch in (" ", "\t", "\n", "\f", "\r"):
            # Every whitespace character clears ignore_lf; a CR also
            # swallows the LF directly after it.
            self.ignore_lf = False
            if ch == "\r" and self.pos < total and text[self.pos] == "\n":
                self.pos += 1
            continue

        if ch is None:
            self._emit_error("eof-in-tag")
            self._flush_text()
            self._emit_token(EOFToken())
            return True

        if ch == "/":
            self.state = self.SELF_CLOSING_START_TAG
            return False
        if ch == ">":
            self._finish_attribute()
            if not self._emit_current_tag():
                self.state = self.DATA
            return False

        # Anything else begins a new attribute; a leading "=" is reported
        # first and then kept as the first character of the name.
        if ch == "=":
            self._emit_error("unexpected-equals-sign-before-attribute-name")
        self.current_attr_name.clear()
        self.current_attr_value.clear()
        self.current_attr_value_has_amp = False
        if ch == "\0":
            self._emit_error("unexpected-null-character")
            ch = "\ufffd"
        elif "A" <= ch <= "Z":
            ch = ch.lower()

        self.current_attr_name.append(ch)
        self.state = self.ATTRIBUTE_NAME
        return False  # Let main loop dispatch to avoid recursion
|
|
607
|
+
|
|
608
|
+
def _state_attribute_name(self):
    """ATTRIBUTE_NAME state: read the remainder of the current attribute's name.

    Fast path: when neither ``reconsume`` nor ``ignore_lf`` is set, a whole
    run of ordinary name characters is matched by regex, ASCII-lowercased
    and appended, and the terminator after it selects the next state.

    Slow path: a single character is fetched through ``_get_char()``. NUL is
    reported and stored as U+FFFD; a quote or "<" is reported and kept;
    ASCII uppercase letters are lowercased so both paths build the same
    name.

    Returns True after emitting EOF, False when the follow-up state is left
    in ``self.state`` for the main loop, otherwise the result of the next
    state's handler.
    """
    replacement = "\ufffd"
    append_attr_char = self.current_attr_name.append
    buffer = self.buffer
    length = self.length

    while True:
        # Inline _consume_attribute_name_run
        if not self.reconsume and not self.ignore_lf:
            pos = self.pos
            if pos < length:
                # Optimization: Check for common terminators before regex
                match = None
                if buffer[pos] not in "\t\n\f />=\0\"'<\r":
                    match = _ATTR_NAME_RUN_PATTERN.match(buffer, pos)

                if match:
                    chunk = match.group(0)
                    if not chunk.islower():
                        chunk = chunk.translate(_ASCII_LOWER_TABLE)
                    append_attr_char(chunk)
                    self.pos = match.end()

                    if self.pos < length:
                        c = buffer[self.pos]
                        if c == "=":
                            self.pos += 1
                            self.state = self.BEFORE_ATTRIBUTE_VALUE
                            return self._state_before_attribute_value()
                        if c in (" ", "\t", "\n", "\f", "\r"):
                            self.pos += 1
                            if c == "\r":
                                self.ignore_lf = True
                            self._finish_attribute()
                            self.state = self.AFTER_ATTRIBUTE_NAME
                            return False  # Let main loop dispatch to avoid recursion
                        if c == ">":
                            self.pos += 1
                            self._finish_attribute()
                            if not self._emit_current_tag():
                                self.state = self.DATA
                            return False
                        if c == "/":
                            self.pos += 1
                            self._finish_attribute()
                            self.state = self.SELF_CLOSING_START_TAG
                            return self._state_self_closing_start_tag()

        c = self._get_char()
        if c is None:
            self._emit_error("eof-in-tag")
            self._flush_text()
            self._emit_token(EOFToken())
            return True
        if c in ("\t", "\n", "\f", " "):
            self._finish_attribute()
            self.state = self.AFTER_ATTRIBUTE_NAME
            return False  # Let main loop dispatch to avoid recursion
        if c == "/":
            self._finish_attribute()
            self.state = self.SELF_CLOSING_START_TAG
            return self._state_self_closing_start_tag()
        if c == "=":
            self.state = self.BEFORE_ATTRIBUTE_VALUE
            return self._state_before_attribute_value()
        if c == ">":
            self._finish_attribute()
            if not self._emit_current_tag():
                self.state = self.DATA
            return False
        if c == "\0":
            self._emit_error("unexpected-null-character")
            append_attr_char(replacement)
            continue
        if c in ('"', "'", "<"):
            self._emit_error("unexpected-character-in-attribute-name")
        elif "A" <= c <= "Z":
            # The fast path is skipped while reconsume/ignore_lf is set, so
            # an uppercase letter can reach this point; lowercase it here
            # exactly as the regex path does via _ASCII_LOWER_TABLE.
            c = chr(ord(c) + 32)
        append_attr_char(c)
|
|
685
|
+
|
|
686
|
+
def _state_after_attribute_name(self):
    """AFTER_ATTRIBUTE_NAME state: skip whitespace after a name, then dispatch.

    "=" moves on to the attribute value, "/" and ">" finish the attribute,
    and any other character finishes it and begins a new attribute name.

    Returns True after emitting EOF, otherwise False. The follow-up state
    is stored in ``self.state`` for the main loop to dispatch.
    """
    text = self.buffer
    total = self.length

    while True:
        # Skip a run of whitespace in one go when no flag is pending.
        if not (self.reconsume or self.ignore_lf) and self.pos < total:
            blanks = _WHITESPACE_PATTERN.match(text, self.pos)
            if blanks:
                self.pos = blanks.end()

        # Inlined _get_char
        if self.pos < total:
            ch = text[self.pos]
            self.pos += 1
        else:
            ch = None
        self.current_char = ch

        if ch in (" ", "\t", "\n", "\f", "\r"):
            # Only a CR leaves ignore_lf set; other whitespace clears it.
            self.ignore_lf = ch == "\r"
            continue

        self.ignore_lf = False

        if ch is None:
            self._emit_error("eof-in-tag")
            self._flush_text()
            self._emit_token(EOFToken())
            return True

        if ch == "=":
            self.state = self.BEFORE_ATTRIBUTE_VALUE
            return False

        # Every remaining case first closes the attribute just read.
        self._finish_attribute()
        if ch == "/":
            self.state = self.SELF_CLOSING_START_TAG
            return False
        if ch == ">":
            if not self._emit_current_tag():
                self.state = self.DATA
            return False

        # Start a fresh attribute whose first character is ch.
        self.current_attr_name.clear()
        self.current_attr_value.clear()
        self.current_attr_value_has_amp = False
        if ch == "\0":
            self._emit_error("unexpected-null-character")
            ch = "\ufffd"
        elif "A" <= ch <= "Z":
            ch = ch.lower()
        self.current_attr_name.append(ch)
        self.state = self.ATTRIBUTE_NAME
        return False  # Let main loop dispatch to avoid recursion
|
|
753
|
+
|
|
754
|
+
def _state_before_attribute_value(self):
    """BEFORE_ATTRIBUTE_VALUE state: skip whitespace and choose the value syntax.

    A double or single quote enters the matching quoted-value state, ">"
    is reported as a missing value and ends the tag, and any other
    character is handed back and read as an unquoted value.

    Returns True after emitting EOF, False after ending the tag on ">",
    otherwise the result of the value state's handler.
    """
    ch = self._get_char()
    while ch in ("\t", "\n", "\f", " "):
        ch = self._get_char()

    if ch is None:
        self._emit_error("eof-in-tag")
        self._flush_text()
        self._emit_token(EOFToken())
        return True

    if ch == '"':
        self.state = self.ATTRIBUTE_VALUE_DOUBLE
        return self._state_attribute_value_double()
    if ch == "'":
        self.state = self.ATTRIBUTE_VALUE_SINGLE
        return self._state_attribute_value_single()
    if ch == ">":
        self._emit_error("missing-attribute-value")
        self._finish_attribute()
        if not self._emit_current_tag():
            self.state = self.DATA
        return False

    # First character of an unquoted value: give it back so the unquoted
    # handler reads it itself.
    self._reconsume_current()
    self.state = self.ATTRIBUTE_VALUE_UNQUOTED
    return self._state_attribute_value_unquoted()
|
|
779
|
+
|
|
780
|
+
def _state_attribute_value_double(self):
    """ATTRIBUTE_VALUE_DOUBLE state: read a double-quoted attribute value.

    Plain text up to the closing quote, "&" or NUL (whichever comes first)
    is appended in one piece with CR and CRLF normalised to LF. "&" is
    appended and flagged in ``current_attr_value_has_amp``; NUL is reported
    and stored as U+FFFD.

    Returns True after emitting EOF, otherwise the result of the
    after-quoted-value handler once the closing quote is reached.
    """
    text = self.buffer
    total = self.length

    while True:
        start = self.pos
        if start < total:
            # Optimistically look for the closing quote first.
            quote_at = text.find('"', start)
            if quote_at == -1:
                quote_at = total

            piece = text[start:quote_at]
            if "&" in piece or "\0" in piece:
                # A special character sits before the quote: stop there instead.
                # The search always succeeds because "&" or NUL is known to be present.
                stop = _ATTR_VALUE_DOUBLE_PATTERN.search(text, start).start()
                piece = text[start:stop]
            else:
                stop = quote_at

            if stop > start:
                if "\r" in piece:
                    piece = piece.replace("\r\n", "\n").replace("\r", "\n")
                self.current_attr_value.append(piece)
                self.pos = stop

        # Inlined _get_char logic
        if self.pos >= total:
            self.current_char = None
            self._emit_error("eof-in-tag")
            self._emit_token(EOFToken())
            return True

        ch = text[self.pos]
        self.pos += 1
        self.current_char = ch

        if ch == '"':
            self.state = self.AFTER_ATTRIBUTE_VALUE_QUOTED
            return self._state_after_attribute_value_quoted()

        if ch == "&":
            self._append_attr_value_char("&")
            self.current_attr_value_has_amp = True
        else:
            # Only NUL can be left once the quote and "&" are ruled out.
            self._emit_error("unexpected-null-character")
            self._append_attr_value_char("\ufffd")
|
|
839
|
+
|
|
840
|
+
def _state_attribute_value_single(self):
    """Consume a single-quoted attribute value.

    Runs of ordinary characters are copied in bulk into
    ``current_attr_value``; the character that ends a run (closing ``'``,
    ``&`` or NUL) is then handled individually.

    Returns ``True`` after emitting ``EOFToken``; on the closing quote it
    returns whatever ``_state_after_attribute_value_quoted`` returns.
    """
    source = self.buffer
    limit = self.length
    terminators = _ATTR_VALUE_SINGLE_PATTERN

    while True:
        start = self.pos
        if start < limit:
            # Fast path: assume nothing special sits before the closing quote.
            quote_at = source.find("'", start)
            if quote_at == -1:
                quote_at = limit

            run = source[start:quote_at]
            if "&" in run or "\0" in run:
                # A terminator precedes the quote, so the search cannot fail.
                stop = terminators.search(source, start).start()
                run = source[start:stop]
            else:
                stop = quote_at

            if stop > start:
                if "\r" in run:
                    # CRLF and lone CR both become LF.
                    run = run.replace("\r\n", "\n").replace("\r", "\n")
                self.current_attr_value.append(run)
                self.pos = stop

        # Read the next character directly from the buffer.
        cursor = self.pos
        if cursor >= limit:
            self.current_char = None
            self._emit_error("eof-in-tag")
            self._emit_token(EOFToken())
            return True

        ch = source[cursor]
        self.pos = cursor + 1
        self.current_char = ch

        if ch == "'":
            self.state = self.AFTER_ATTRIBUTE_VALUE_QUOTED
            return self._state_after_attribute_value_quoted()
        if ch == "&":
            self._append_attr_value_char("&")
            self.current_attr_value_has_amp = True
        else:
            # Only NUL can stop the run besides the quote and "&".
            self._emit_error("unexpected-null-character")
            self._append_attr_value_char("\ufffd")
|
|
899
|
+
|
|
900
|
+
def _state_attribute_value_unquoted(self):
    """Consume an unquoted attribute value.

    Unless a character is pending re-consumption, the run up to the next
    terminator (or the end of the buffer when no terminator exists) is
    appended in one slice; the terminating character is then dispatched.

    Returns ``True`` after emitting ``EOFToken``, ``False`` when the value
    ends at whitespace or ``>``.
    """
    source = self.buffer
    limit = self.length
    terminators = _ATTR_VALUE_UNQUOTED_PATTERN

    while True:
        if not self.reconsume:
            start = self.pos
            if start < limit:
                hit = terminators.search(source, start)
                stop = limit if hit is None else hit.start()
                if stop > start:
                    self.current_attr_value.append(source[start:stop])
                    self.pos = stop

        ch = self._get_char()
        if ch is None:
            # EOF inside the tag: report it and emit EOF only.
            self._emit_error("eof-in-tag")
            self._emit_token(EOFToken())
            return True

        if ch in {"\t", "\n", "\f", " "}:
            self._finish_attribute()
            self.state = self.BEFORE_ATTRIBUTE_NAME
            return False

        if ch == ">":
            self._finish_attribute()
            if not self._emit_current_tag():
                self.state = self.DATA
            return False

        if ch == "&":
            self._append_attr_value_char("&")
            self.current_attr_value_has_amp = True
        elif ch == "\0":
            self._emit_error("unexpected-null-character")
            self._append_attr_value_char("\ufffd")
        else:
            # Quote-like characters are kept but flagged as a parse error.
            if ch in {'"', "'", "<", "=", "`"}:
                self._emit_error("unexpected-character-in-unquoted-attribute-value")
            self._append_attr_value_char(ch)
|
|
946
|
+
|
|
947
|
+
def _state_after_attribute_value_quoted(self):
    """Handle the character following a quoted attribute value.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._emit_error("eof-in-tag")
        self._flush_text()
        self._emit_token(EOFToken())
        return True

    if ch == "/" or ch == ">" or ch in {"\t", "\n", "\f", " "}:
        self._finish_attribute()
        if ch == "/":
            self.state = self.SELF_CLOSING_START_TAG
        elif ch == ">":
            # Fall back to DATA only when emitting the tag reported falsy.
            if not self._emit_current_tag():
                self.state = self.DATA
        else:
            self.state = self.BEFORE_ATTRIBUTE_NAME
        return False

    # Any other character: report it, then re-read it as an attribute name.
    self._emit_error("missing-whitespace-between-attributes")
    self._finish_attribute()
    self._reconsume_current()
    self.state = self.BEFORE_ATTRIBUTE_NAME
    return False
|
|
974
|
+
|
|
975
|
+
def _state_self_closing_start_tag(self):
    """Handle the character after ``/`` inside a tag.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    ch = self._get_char()

    if ch == ">":
        self.current_tag_self_closing = True
        # NOTE(review): the result of _emit_current_tag() is ignored here and
        # DATA is always selected, unlike the other ">" handlers - confirm.
        self._emit_current_tag()
        self.state = self.DATA
        return False

    if ch is None:
        self._emit_error("eof-in-tag")
        self._flush_text()
        self._emit_token(EOFToken())
        return True

    self._emit_error("unexpected-character-after-solidus-in-tag")
    self._reconsume_current()
    self.state = self.BEFORE_ATTRIBUTE_NAME
    return False
|
|
991
|
+
|
|
992
|
+
def _state_markup_declaration_open(self):
    """Dispatch on what follows ``<!``: DOCTYPE, ``[CDATA[`` or a bogus comment.

    Always returns ``False``.
    """
    if self._consume_case_insensitive("DOCTYPE"):
        # Start a fresh doctype token.
        self.current_doctype_name.clear()
        self.current_doctype_public = None
        self.current_doctype_system = None
        self.current_doctype_force_quirks = False
        self.state = self.DOCTYPE
        return False

    if not self._consume_if("[CDATA["):
        # Malformed markup: the bogus comment starts at the current position.
        self._emit_error("incorrectly-opened-comment")
        self.current_comment.clear()
        self.state = self.BOGUS_COMMENT
        return False

    # A real CDATA section needs the innermost open element to be outside
    # the None/"html" namespaces.
    open_elements = self.sink.open_elements
    if open_elements:
        innermost = open_elements[-1]
        if innermost and innermost.namespace not in {None, "html"}:
            self.state = self.CDATA_SECTION
            return False

    # Otherwise it becomes a bogus comment that keeps the "[CDATA[" prefix.
    self._emit_error("cdata-in-html-content")
    self.current_comment.clear()
    self.current_comment.extend("[CDATA[")
    self.state = self.BOGUS_COMMENT
    return False
|
|
1025
|
+
|
|
1026
|
+
def _state_comment_start(self):
    """Handle the first character of a comment's data.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._emit_error("eof-in-comment")
        self._emit_comment()
        self._emit_token(EOFToken())
        return True

    if ch == "-":
        self.state = self.COMMENT_START_DASH
        return False

    if ch == ">":
        self._emit_error("abrupt-closing-of-empty-comment")
        self._emit_comment()
        self.state = self.DATA
        return False

    if ch == "\0":
        # NUL is reported and stored as U+FFFD.
        self._emit_error("unexpected-null-character")
        ch = "\ufffd"
    self.current_comment.append(ch)
    self.state = self.COMMENT
    return False
|
|
1049
|
+
|
|
1050
|
+
def _state_comment_start_dash(self):
    """Handle the character after a comment's leading ``-``.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._emit_error("eof-in-comment")
        self._emit_comment()
        self._emit_token(EOFToken())
        return True

    if ch == "-":
        self.state = self.COMMENT_END
        return False

    if ch == ">":
        self._emit_error("abrupt-closing-of-empty-comment")
        self._emit_comment()
        self.state = self.DATA
        return False

    if ch == "\0":
        self._emit_error("unexpected-null-character")
        ch = "\ufffd"
    # The pending dash belongs to the comment data after all.
    self.current_comment.extend(("-", ch))
    self.state = self.COMMENT
    return False
|
|
1073
|
+
|
|
1074
|
+
def _state_comment(self):
    """Consume comment data until a ``-`` or the end of input.

    Returns ``True`` after emitting ``EOFToken``, ``False`` once a ``-``
    moves the tokenizer to the comment-end-dash state.
    """
    while True:
        # Let the bulk consumer take as many runs as it can.
        while self._consume_comment_run():
            pass

        ch = self._get_char()
        if ch is None:
            self._emit_error("eof-in-comment")
            self._emit_comment()
            self._emit_token(EOFToken())
            return True

        if ch == "-":
            self.state = self.COMMENT_END_DASH
            return False

        # Anything else reaching this point is treated as NUL.
        self._emit_error("unexpected-null-character")
        self.current_comment.append("\ufffd")
|
|
1091
|
+
|
|
1092
|
+
def _state_comment_end_dash(self):
    """Handle the character after a single ``-`` inside comment data.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._emit_error("eof-in-comment")
        self._emit_comment()
        self._emit_token(EOFToken())
        return True

    if ch == "-":
        self.state = self.COMMENT_END
        return False

    if ch == "\0":
        self._emit_error("unexpected-null-character")
        ch = "\ufffd"
    # The dash was ordinary data: keep it together with this character.
    self.current_comment.extend(("-", ch))
    self.state = self.COMMENT
    return False
|
|
1112
|
+
|
|
1113
|
+
def _state_comment_end(self):
    """Handle the character after ``--`` inside a comment.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._emit_error("eof-in-comment")
        self._emit_comment()
        self._emit_token(EOFToken())
        return True

    if ch == ">":
        self._emit_comment()
        self.state = self.DATA
        return False

    if ch == "!":
        self.state = self.COMMENT_END_BANG
        return False

    if ch == "-":
        # One more dash: keep one as data and stay in this state.
        self.current_comment.append("-")
        return False

    if ch == "\0":
        self._emit_error("unexpected-null-character")
        tail = "\ufffd"
    else:
        self._emit_error("incorrectly-closed-comment")
        tail = ch
    self.current_comment.extend(("--", tail))
    self.state = self.COMMENT
    return False
|
|
1140
|
+
|
|
1141
|
+
def _state_comment_end_bang(self):
    """Handle the character after ``--!`` inside a comment.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._emit_error("eof-in-comment")
        self._emit_comment()
        self._emit_token(EOFToken())
        return True

    if ch == ">":
        self._emit_error("incorrectly-closed-comment")
        self._emit_comment()
        self.state = self.DATA
        return False

    if ch == "-":
        # "--!" was data; the new dash may begin another terminator.
        self.current_comment.extend(("-", "-", "!"))
        self.state = self.COMMENT_END_DASH
        return False

    if ch == "\0":
        self._emit_error("unexpected-null-character")
        ch = "\ufffd"
    self.current_comment.extend(("-", "-", "!", ch))
    self.state = self.COMMENT
    return False
|
|
1174
|
+
|
|
1175
|
+
def _state_bogus_comment(self):
|
|
1176
|
+
replacement = "\ufffd"
|
|
1177
|
+
while True:
|
|
1178
|
+
c = self._get_char()
|
|
1179
|
+
if c is None:
|
|
1180
|
+
self._emit_comment()
|
|
1181
|
+
self._emit_token(EOFToken())
|
|
1182
|
+
return True
|
|
1183
|
+
if c == ">":
|
|
1184
|
+
self._emit_comment()
|
|
1185
|
+
self.state = self.DATA
|
|
1186
|
+
return False
|
|
1187
|
+
if c == "\0":
|
|
1188
|
+
self.current_comment.append(replacement)
|
|
1189
|
+
else:
|
|
1190
|
+
self.current_comment.append(c)
|
|
1191
|
+
|
|
1192
|
+
def _state_doctype(self):
    """Handle the character right after the ``DOCTYPE`` keyword.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._emit_error("eof-in-doctype")
        self.current_doctype_force_quirks = True
        self._emit_doctype()
        self._emit_token(EOFToken())
        return True

    if ch == ">":
        self._emit_error("expected-doctype-name-but-got-right-bracket")
        self.current_doctype_force_quirks = True
        self._emit_doctype()
        self.state = self.DATA
        return False

    if ch not in {"\t", "\n", "\f", " "}:
        # No separating whitespace: report it and re-read this character.
        self._emit_error("missing-whitespace-before-doctype-name")
        self._reconsume_current()
    self.state = self.BEFORE_DOCTYPE_NAME
    return False
|
|
1213
|
+
|
|
1214
|
+
def _state_before_doctype_name(self):
    """Read the first character of the doctype name.

    Whitespace returns ``False`` without changing state. Returns ``True``
    after emitting ``EOFToken``.
    """
    ch = self._get_char()
    if ch is None:
        self._emit_error("eof-in-doctype-name")
        self.current_doctype_force_quirks = True
        self._emit_doctype()
        self._emit_token(EOFToken())
        return True

    if ch in {"\t", "\n", "\f", " "}:
        return False

    if ch == ">":
        self._emit_error("expected-doctype-name-but-got-right-bracket")
        self.current_doctype_force_quirks = True
        self._emit_doctype()
        self.state = self.DATA
        return False

    if ch == "\0":
        self._emit_error("unexpected-null-character")
        ch = "\ufffd"
    elif "A" <= ch <= "Z":
        # ASCII upper-case letters are stored lower-cased.
        ch = ch.lower()
    self.current_doctype_name.append(ch)
    self.state = self.DOCTYPE_NAME
    return False
|
|
1240
|
+
|
|
1241
|
+
def _state_doctype_name(self):
    """Accumulate the doctype name until whitespace, ``>`` or end of input.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    while True:
        ch = self._get_char()
        if ch is None:
            self._emit_error("eof-in-doctype-name")
            self.current_doctype_force_quirks = True
            self._emit_doctype()
            self._emit_token(EOFToken())
            return True

        if ch in {"\t", "\n", "\f", " "}:
            self.state = self.AFTER_DOCTYPE_NAME
            return False

        if ch == ">":
            self._emit_doctype()
            self.state = self.DATA
            return False

        if ch == "\0":
            self._emit_error("unexpected-null-character")
            ch = "\ufffd"
        elif "A" <= ch <= "Z":
            # ASCII upper-case letters are stored lower-cased.
            ch = ch.lower()
        self.current_doctype_name.append(ch)
|
|
1265
|
+
|
|
1266
|
+
def _state_after_doctype_name(self):
|
|
1267
|
+
if self._consume_case_insensitive("PUBLIC"):
|
|
1268
|
+
self.state = self.AFTER_DOCTYPE_PUBLIC_KEYWORD
|
|
1269
|
+
return False
|
|
1270
|
+
if self._consume_case_insensitive("SYSTEM"):
|
|
1271
|
+
self.state = self.AFTER_DOCTYPE_SYSTEM_KEYWORD
|
|
1272
|
+
return False
|
|
1273
|
+
while True:
|
|
1274
|
+
c = self._get_char()
|
|
1275
|
+
if c is None:
|
|
1276
|
+
self._emit_error("eof-in-doctype")
|
|
1277
|
+
self.current_doctype_force_quirks = True
|
|
1278
|
+
self._emit_doctype()
|
|
1279
|
+
self._emit_token(EOFToken())
|
|
1280
|
+
return True
|
|
1281
|
+
if c in ("\t", "\n", "\f", " "):
|
|
1282
|
+
continue
|
|
1283
|
+
if c == ">":
|
|
1284
|
+
self._emit_doctype()
|
|
1285
|
+
self.state = self.DATA
|
|
1286
|
+
return False
|
|
1287
|
+
self._emit_error("missing-whitespace-after-doctype-name")
|
|
1288
|
+
self.current_doctype_force_quirks = True
|
|
1289
|
+
self._reconsume_current()
|
|
1290
|
+
self.state = self.BOGUS_DOCTYPE
|
|
1291
|
+
return False
|
|
1292
|
+
|
|
1293
|
+
def _state_after_doctype_public_keyword(self):
    """Handle the character right after the ``PUBLIC`` keyword.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._emit_error("missing-quote-before-doctype-public-identifier")
        self.current_doctype_force_quirks = True
        self._emit_doctype()
        self._emit_token(EOFToken())
        return True

    if ch in {"\t", "\n", "\f", " "}:
        self.state = self.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER
        return False

    if ch == '"' or ch == "'":
        # The identifier starts immediately, without separating whitespace.
        self._emit_error("missing-whitespace-before-doctype-public-identifier")
        self.current_doctype_public = []
        if ch == '"':
            self.state = self.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED
        else:
            self.state = self.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED
        return False

    if ch == ">":
        self._emit_error("missing-doctype-public-identifier")
        self.current_doctype_force_quirks = True
        self._emit_doctype()
        self.state = self.DATA
        return False

    self._emit_error("unexpected-character-after-doctype-public-keyword")
    self.current_doctype_force_quirks = True
    self._reconsume_current()
    self.state = self.BOGUS_DOCTYPE
    return False
|
|
1326
|
+
|
|
1327
|
+
def _state_after_doctype_system_keyword(self):
    """Handle the character right after the ``SYSTEM`` keyword.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._emit_error("missing-quote-before-doctype-system-identifier")
        self.current_doctype_force_quirks = True
        self._emit_doctype()
        self._emit_token(EOFToken())
        return True

    if ch in {"\t", "\n", "\f", " "}:
        self.state = self.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER
        return False

    if ch == '"' or ch == "'":
        # NOTE(review): this code names the *public identifier* although the
        # state follows the SYSTEM keyword - confirm it is intended.
        self._emit_error("missing-whitespace-after-doctype-public-identifier")
        self.current_doctype_system = []
        if ch == '"':
            self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
        else:
            self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
        return False

    if ch == ">":
        self._emit_error("missing-doctype-system-identifier")
        self.current_doctype_force_quirks = True
        self._emit_doctype()
        self.state = self.DATA
        return False

    self._emit_error("unexpected-character-after-doctype-system-keyword")
    self.current_doctype_force_quirks = True
    self._reconsume_current()
    self.state = self.BOGUS_DOCTYPE
    return False
|
|
1360
|
+
|
|
1361
|
+
def _state_before_doctype_public_identifier(self):
    """Skip whitespace, then expect the quote opening the public identifier.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    ch = self._get_char()
    while ch in {"\t", "\n", "\f", " "}:
        ch = self._get_char()

    if ch is None:
        self._emit_error("missing-doctype-public-identifier")
        self.current_doctype_force_quirks = True
        self._emit_doctype()
        self._emit_token(EOFToken())
        return True

    if ch == '"' or ch == "'":
        self.current_doctype_public = []
        if ch == '"':
            self.state = self.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED
        else:
            self.state = self.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED
        return False

    if ch == ">":
        self._emit_error("missing-doctype-public-identifier")
        self.current_doctype_force_quirks = True
        self._emit_doctype()
        self.state = self.DATA
        return False

    self._emit_error("missing-quote-before-doctype-public-identifier")
    self.current_doctype_force_quirks = True
    self._reconsume_current()
    self.state = self.BOGUS_DOCTYPE
    return False
|
|
1391
|
+
|
|
1392
|
+
def _state_doctype_public_identifier_double_quoted(self):
    """Accumulate a double-quoted doctype public identifier.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    while True:
        ch = self._get_char()
        if ch == '"':
            self.state = self.AFTER_DOCTYPE_PUBLIC_IDENTIFIER
            return False

        if ch is None:
            self._emit_error("eof-in-doctype-public-identifier")
            self.current_doctype_force_quirks = True
            self._emit_doctype()
            self._emit_token(EOFToken())
            return True

        if ch == ">":
            # The doctype closed before the identifier's closing quote.
            self._emit_error("abrupt-doctype-public-identifier")
            self.current_doctype_force_quirks = True
            self._emit_doctype()
            self.state = self.DATA
            return False

        if ch == "\0":
            self._emit_error("unexpected-null-character")
            ch = "\ufffd"
        self.current_doctype_public.append(ch)
|
|
1415
|
+
|
|
1416
|
+
def _state_doctype_public_identifier_single_quoted(self):
    """Accumulate a single-quoted doctype public identifier.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    while True:
        ch = self._get_char()
        if ch == "'":
            self.state = self.AFTER_DOCTYPE_PUBLIC_IDENTIFIER
            return False

        if ch is None:
            self._emit_error("eof-in-doctype-public-identifier")
            self.current_doctype_force_quirks = True
            self._emit_doctype()
            self._emit_token(EOFToken())
            return True

        if ch == ">":
            # The doctype closed before the identifier's closing quote.
            self._emit_error("abrupt-doctype-public-identifier")
            self.current_doctype_force_quirks = True
            self._emit_doctype()
            self.state = self.DATA
            return False

        if ch == "\0":
            self._emit_error("unexpected-null-character")
            ch = "\ufffd"
        self.current_doctype_public.append(ch)
|
|
1439
|
+
|
|
1440
|
+
def _state_after_doctype_public_identifier(self):
    """Handle the character after the public identifier's closing quote.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._emit_error("missing-whitespace-between-doctype-public-and-system-identifiers")
        self.current_doctype_force_quirks = True
        self._emit_doctype()
        self._emit_token(EOFToken())
        return True

    if ch in {"\t", "\n", "\f", " "}:
        self.state = self.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
        return False

    if ch == ">":
        self._emit_doctype()
        self.state = self.DATA
        return False

    if ch == '"' or ch == "'":
        # The system identifier starts without separating whitespace.
        self._emit_error("missing-whitespace-between-doctype-public-and-system-identifiers")
        self.current_doctype_system = []
        if ch == '"':
            self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
        else:
            self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
        return False

    self._emit_error("unexpected-character-after-doctype-public-identifier")
    self.current_doctype_force_quirks = True
    self._reconsume_current()
    self.state = self.BOGUS_DOCTYPE
    return False
|
|
1471
|
+
|
|
1472
|
+
def _state_between_doctype_public_and_system_identifiers(self):
    """Skip whitespace between the public and system identifiers.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    ch = self._get_char()
    while ch in {"\t", "\n", "\f", " "}:
        ch = self._get_char()

    if ch is None:
        self._emit_error("missing-quote-before-doctype-system-identifier")
        self.current_doctype_force_quirks = True
        self._emit_doctype()
        self._emit_token(EOFToken())
        return True

    if ch == ">":
        # A doctype with no system identifier is emitted as-is.
        self._emit_doctype()
        self.state = self.DATA
        return False

    if ch == '"' or ch == "'":
        self.current_doctype_system = []
        if ch == '"':
            self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
        else:
            self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
        return False

    self._emit_error("missing-quote-before-doctype-system-identifier")
    self.current_doctype_force_quirks = True
    self._reconsume_current()
    self.state = self.BOGUS_DOCTYPE
    return False
|
|
1500
|
+
|
|
1501
|
+
def _state_before_doctype_system_identifier(self):
    """Skip whitespace, then expect the quote opening the system identifier.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    ch = self._get_char()
    while ch in {"\t", "\n", "\f", " "}:
        ch = self._get_char()

    if ch is None:
        self._emit_error("missing-doctype-system-identifier")
        self.current_doctype_force_quirks = True
        self._emit_doctype()
        self._emit_token(EOFToken())
        return True

    if ch == '"' or ch == "'":
        self.current_doctype_system = []
        if ch == '"':
            self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED
        else:
            self.state = self.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED
        return False

    if ch == ">":
        self._emit_error("missing-doctype-system-identifier")
        self.current_doctype_force_quirks = True
        self._emit_doctype()
        self.state = self.DATA
        return False

    self._emit_error("missing-quote-before-doctype-system-identifier")
    self.current_doctype_force_quirks = True
    self._reconsume_current()
    self.state = self.BOGUS_DOCTYPE
    return False
|
|
1531
|
+
|
|
1532
|
+
def _state_doctype_system_identifier_double_quoted(self):
    """Accumulate a double-quoted doctype system identifier.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    while True:
        ch = self._get_char()
        if ch == '"':
            self.state = self.AFTER_DOCTYPE_SYSTEM_IDENTIFIER
            return False

        if ch is None:
            self._emit_error("eof-in-doctype-system-identifier")
            self.current_doctype_force_quirks = True
            self._emit_doctype()
            self._emit_token(EOFToken())
            return True

        if ch == ">":
            # The doctype closed before the identifier's closing quote.
            self._emit_error("abrupt-doctype-system-identifier")
            self.current_doctype_force_quirks = True
            self._emit_doctype()
            self.state = self.DATA
            return False

        if ch == "\0":
            self._emit_error("unexpected-null-character")
            ch = "\ufffd"
        self.current_doctype_system.append(ch)
|
|
1555
|
+
|
|
1556
|
+
def _state_doctype_system_identifier_single_quoted(self):
    """Accumulate a single-quoted doctype system identifier.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    while True:
        ch = self._get_char()
        if ch == "'":
            self.state = self.AFTER_DOCTYPE_SYSTEM_IDENTIFIER
            return False

        if ch is None:
            self._emit_error("eof-in-doctype-system-identifier")
            self.current_doctype_force_quirks = True
            self._emit_doctype()
            self._emit_token(EOFToken())
            return True

        if ch == ">":
            # The doctype closed before the identifier's closing quote.
            self._emit_error("abrupt-doctype-system-identifier")
            self.current_doctype_force_quirks = True
            self._emit_doctype()
            self.state = self.DATA
            return False

        if ch == "\0":
            self._emit_error("unexpected-null-character")
            ch = "\ufffd"
        self.current_doctype_system.append(ch)
|
|
1579
|
+
|
|
1580
|
+
def _state_after_doctype_system_identifier(self):
    """Skip whitespace after the system identifier and expect ``>``.

    Returns ``True`` after emitting ``EOFToken``, ``False`` otherwise.
    """
    ch = self._get_char()
    while ch in {"\t", "\n", "\f", " "}:
        ch = self._get_char()

    if ch is None:
        self._emit_error("eof-in-doctype")
        self.current_doctype_force_quirks = True
        self._emit_doctype()
        self._emit_token(EOFToken())
        return True

    if ch == ">":
        self._emit_doctype()
        self.state = self.DATA
        return False

    # Trailing junk is reported; force-quirks is left untouched here.
    self._emit_error("unexpected-character-after-doctype-system-identifier")
    self._reconsume_current()
    self.state = self.BOGUS_DOCTYPE
    return False
|
|
1599
|
+
|
|
1600
|
+
def _state_bogus_doctype(self):
|
|
1601
|
+
while True:
|
|
1602
|
+
c = self._get_char()
|
|
1603
|
+
if c is None:
|
|
1604
|
+
self._emit_doctype()
|
|
1605
|
+
self._emit_token(EOFToken())
|
|
1606
|
+
return True
|
|
1607
|
+
if c == ">":
|
|
1608
|
+
self._emit_doctype()
|
|
1609
|
+
self.state = self.DATA
|
|
1610
|
+
return False
|
|
1611
|
+
|
|
1612
|
+
# ---------------------
|
|
1613
|
+
# Low-level helpers
|
|
1614
|
+
# ---------------------
|
|
1615
|
+
|
|
1616
|
+
def _get_char(self):
|
|
1617
|
+
if self.reconsume:
|
|
1618
|
+
self.reconsume = False
|
|
1619
|
+
return self.current_char
|
|
1620
|
+
|
|
1621
|
+
buffer = self.buffer
|
|
1622
|
+
pos = self.pos
|
|
1623
|
+
length = self.length
|
|
1624
|
+
while True:
|
|
1625
|
+
if pos >= length:
|
|
1626
|
+
self.pos = pos
|
|
1627
|
+
self.current_char = None
|
|
1628
|
+
return None
|
|
1629
|
+
|
|
1630
|
+
c = buffer[pos]
|
|
1631
|
+
pos += 1
|
|
1632
|
+
|
|
1633
|
+
if c == "\r":
|
|
1634
|
+
self.ignore_lf = True
|
|
1635
|
+
self.current_char = "\n"
|
|
1636
|
+
self.pos = pos
|
|
1637
|
+
return "\n"
|
|
1638
|
+
|
|
1639
|
+
if c == "\n":
|
|
1640
|
+
if self.ignore_lf:
|
|
1641
|
+
self.ignore_lf = False
|
|
1642
|
+
continue
|
|
1643
|
+
# Line tracking now computed on-demand via _get_line_at_pos()
|
|
1644
|
+
|
|
1645
|
+
else:
|
|
1646
|
+
self.ignore_lf = False
|
|
1647
|
+
|
|
1648
|
+
self.current_char = c
|
|
1649
|
+
self.pos = pos
|
|
1650
|
+
return c
|
|
1651
|
+
|
|
1652
|
+
def _reconsume_current(self):
|
|
1653
|
+
self.reconsume = True
|
|
1654
|
+
|
|
1655
|
+
def _append_text(self, text):
    """Queue text for the pending character run, noting where the run began."""
    pending = self.text_buffer
    if not pending:
        # First piece of a new run: remember the input position at this moment.
        self.text_start_pos = self.pos
    pending.append(text)
|
|
1661
|
+
|
|
1662
|
+
def _flush_text(self):
# Send the buffered text to the sink as a single character run.
# Does nothing when text_buffer is empty. Otherwise the pieces are joined,
# the buffer is cleared, one "unexpected-null-character" error is emitted per
# NUL when the tokenizer is in the DATA state, and the result is passed to
# sink.process_characters().
# Character references are decoded only when the numeric state is below
# RAWTEXT and outside the CDATA_SECTION..CDATA_SECTION_END range; the checks
# below rely on the ordering of the state constants.
# NOTE(review): indentation is not recoverable from this rendering, so it is
# unclear whether the XML-coercion step belongs to the final else branch or
# runs for every state - confirm against the original file.
|
|
1663
|
+
if not self.text_buffer:
|
|
1664
|
+
return
|
|
1665
|
+
|
|
1666
|
+
# Optimization: Avoid join for single chunk
|
|
1667
|
+
# text_buffer is never populated with empty strings
|
|
1668
|
+
if len(self.text_buffer) == 1:
|
|
1669
|
+
data = self.text_buffer[0]
|
|
1670
|
+
else:
|
|
1671
|
+
data = "".join(self.text_buffer)
|
|
1672
|
+
|
|
1673
|
+
# Calculate raw text length before any processing for position tracking
|
|
1674
|
+
raw_len = len(data)
|
|
1675
|
+
|
|
1676
|
+
self.text_buffer.clear()
|
|
1677
|
+
if self.state == self.DATA and "\0" in data:
|
|
1678
|
+
count = data.count("\0")
|
|
1679
|
+
for _ in range(count):
|
|
1680
|
+
self._emit_error("unexpected-null-character")
|
|
1681
|
+
|
|
1682
|
+
# Per HTML5 spec:
|
|
1683
|
+
# - RCDATA state (title, textarea): decode character references
|
|
1684
|
+
# - RAWTEXT state (style, script, etc): do NOT decode
|
|
1685
|
+
# - PLAINTEXT state: do NOT decode
|
|
1686
|
+
# - CDATA sections: do NOT decode
|
|
1687
|
+
if self.state >= self.PLAINTEXT or self.CDATA_SECTION <= self.state <= self.CDATA_SECTION_END:
|
|
1688
|
+
pass
|
|
1689
|
+
elif self.state >= self.RAWTEXT:
|
|
1690
|
+
pass
|
|
1691
|
+
else:
|
|
1692
|
+
if "&" in data:
|
|
1693
|
+
data = decode_entities_in_text(data)
|
|
1694
|
+
# Apply XML coercion if enabled
|
|
1695
|
+
if self.opts.xml_coercion:
|
|
1696
|
+
data = _coerce_text_for_xml(data)
|
|
1697
|
+
|
|
1698
|
+
# Record position at END of raw text (1-indexed column = raw_len)
|
|
1699
|
+
self._record_text_end_position(raw_len)
|
|
1700
|
+
self.sink.process_characters(data)
|
|
1701
|
+
# Note: process_characters never returns Plaintext or RawData
|
|
1702
|
+
# State switches happen via _emit_current_tag instead
|
|
1703
|
+
|
|
1704
|
+
def _append_attr_value_char(self, c):
    """Add one piece of text to the value of the attribute being built."""
    value_parts = self.current_attr_value
    value_parts.append(c)
|
|
1706
|
+
|
|
1707
|
+
def _finish_attribute(self):
    """Commit the attribute being built to the current tag and reset its buffers.

    Nothing happens when no attribute name has been collected. A name that is
    already present on the tag is reported as "duplicate-attribute" and its
    value is discarded, so the first occurrence is kept.
    """
    name_parts = self.current_attr_name
    if not name_parts:
        return
    name = "".join(name_parts)
    name_parts.clear()

    value_parts = self.current_attr_value
    tag_attrs = self.current_tag_attrs
    if name in tag_attrs:
        self._emit_error("duplicate-attribute")
    else:
        value = "".join(value_parts)
        if self.current_attr_value_has_amp:
            # Decoding is skipped entirely unless this flag was raised.
            value = decode_entities_in_text(value, in_attribute=True)
        tag_attrs[name] = value

    value_parts.clear()
    self.current_attr_value_has_amp = False
|
|
1735
|
+
|
|
1736
|
+
def _emit_current_tag(self):
    """Emit the tag under construction and choose the state that follows it.

    Returns True when the tag moved the tokenizer into RCDATA, RAWTEXT or
    PLAINTEXT (either directly or because the sink asked for plaintext), and
    False otherwise.
    """
    parts = self.current_tag_name
    # The name arrives in one or more pieces.
    name = parts[0] if len(parts) == 1 else "".join(parts)

    attrs = self.current_tag_attrs
    self.current_tag_attrs = {}

    # The same token object is refilled for every tag.
    token = self._tag_token
    token.kind = self.current_tag_kind
    token.name = name
    token.attrs = attrs
    token.self_closing = self.current_tag_self_closing

    entered_text_mode = False
    if self.current_tag_kind == Tag.START:
        self.last_start_tag_name = name
        if name in _RAWTEXT_SWITCH_TAGS or name == "plaintext":
            open_elements = self.sink.open_elements
            innermost = open_elements[-1] if open_elements else None
            namespace = innermost.namespace if innermost else None
            # The switch is made only when the innermost open element is in
            # the "html" namespace or no namespace could be determined.
            if namespace is None or namespace == "html":
                entered_text_mode = True
                if name in _RCDATA_ELEMENTS:
                    self.state = self.RCDATA
                    self.rawtext_tag_name = name
                elif name in _RAWTEXT_SWITCH_TAGS:
                    self.state = self.RAWTEXT
                    self.rawtext_tag_name = name
                else:
                    # Only "plaintext" can reach this branch.
                    self.state = self.PLAINTEXT

    self._record_token_position()
    if self.sink.process_token(token) == 1:  # TokenSinkResult.Plaintext
        self.state = self.PLAINTEXT
        entered_text_mode = True

    # Reset the per-tag scratch state for the next tag.
    self.current_tag_name.clear()
    self.current_attr_name.clear()
    self.current_attr_value.clear()
    self.current_tag_self_closing = False
    self.current_tag_kind = Tag.START
    return entered_text_mode
|
|
1789
|
+
|
|
1790
|
+
def _emit_comment(self):
    """Emit the buffered comment text through the reusable comment token."""
    parts = self.current_comment
    text = "".join(parts)
    parts.clear()
    if self.opts.xml_coercion:
        text = _coerce_comment_for_xml(text)
    token = self._comment_token
    token.data = text
    self._emit_token(token)
|
|
1798
|
+
|
|
1799
|
+
def _emit_doctype(self):
    """Build a doctype token from the collected pieces, reset them, and emit it."""
    name_parts = self.current_doctype_name
    public_parts = self.current_doctype_public
    system_parts = self.current_doctype_system
    # An identifier that is still None stays None; an empty list becomes "".
    doctype = Doctype(
        name="".join(name_parts) if name_parts else None,
        public_id=None if public_parts is None else "".join(public_parts),
        system_id=None if system_parts is None else "".join(system_parts),
        force_quirks=self.current_doctype_force_quirks,
    )
    self.current_doctype_name.clear()
    self.current_doctype_public = None
    self.current_doctype_system = None
    self.current_doctype_force_quirks = False
    self._emit_token(DoctypeToken(doctype))
|
|
1815
|
+
|
|
1816
|
+
def _emit_token(self, token):
    """Record the current position for the token, then hand it to the sink."""
    self._record_token_position()
    # The sink's return value is ignored here; `_emit_current_tag` is the
    # method that reacts to it.
    self.sink.process_token(token)
|
|
1821
|
+
|
|
1822
|
+
def _record_token_position(self):
    """Store the line and column of the current input position.

    Does nothing unless error collection is enabled. The column is the
    number of characters between the last newline before `self.pos` and
    `self.pos` itself, or `self.pos` when no newline precedes it.
    """
    if not self.collect_errors:
        return
    offset = self.pos
    newline_at = self.buffer.rfind("\n", 0, offset)
    # rfind returns -1 without a newline, which reduces this to `offset`.
    column = offset - newline_at - 1
    self.last_token_line = self._get_line_at_pos(offset)
    self.last_token_column = column
|
|
1838
|
+
|
|
1839
|
+
def _record_text_end_position(self, raw_len):
    """Store the line and column where a flushed text run ends.

    Does nothing unless error collection is enabled. The end offset is
    `text_start_pos + raw_len`; the column is measured from the last newline
    before that offset, or from the start of the buffer when there is none.
    """
    if not self.collect_errors:
        return
    stop = self.text_start_pos + raw_len
    newline_at = self.buffer.rfind("\n", 0, stop)
    # rfind returns -1 without a newline, which reduces this to `stop`.
    column = stop - newline_at - 1
    self.last_token_line = self._get_line_at_pos(stop)
    self.last_token_column = column
|
|
1856
|
+
|
|
1857
|
+
def _emit_error(self, code):
    """Append a ParseError for `code`, located at the character before `self.pos`.

    Does nothing unless error collection is enabled.
    """
    if not self.collect_errors:
        return
    source = self.buffer
    index = max(0, self.pos - 1)
    newline_at = source.rfind("\n", 0, index + 1)
    # rfind returns -1 without a newline, which makes the column `index + 1`.
    column = index - newline_at

    message = generate_error_message(code)
    line = self._get_line_at_pos(self.pos)
    error = ParseError(code, line=line, column=column, message=message, source_html=source)
    self.errors.append(error)
|
|
1871
|
+
|
|
1872
|
+
def _consume_if(self, literal):
    """Advance past `literal` when the input continues with exactly that text.

    Returns True and moves `self.pos` on a match; returns False and leaves
    the position untouched otherwise.
    """
    start = self.pos
    stop = start + len(literal)
    if stop > self.length or self.buffer[start:stop] != literal:
        return False
    self.pos = stop
    return True
|
|
1881
|
+
|
|
1882
|
+
def _consume_case_insensitive(self, literal):
    """Advance past `literal` when the input matches it ASCII case-insensitively.

    Only the letters A-Z are folded, using the module's `_ASCII_LOWER_TABLE`.
    `str.lower()` is deliberately avoided: it is Unicode-aware, so a
    character such as U+212A KELVIN SIGN would otherwise be accepted where
    an ASCII "k" is expected.

    Returns True and moves `self.pos` on a match; returns False and leaves
    the position untouched otherwise.
    """
    start = self.pos
    stop = start + len(literal)
    if stop > self.length:
        return False
    candidate = self.buffer[start:stop]
    if candidate.translate(_ASCII_LOWER_TABLE) != literal.translate(_ASCII_LOWER_TABLE):
        return False
    self.pos = stop
    return True
|
|
1891
|
+
|
|
1892
|
+
def _consume_comment_run(self):
    """Bulk-append a run of comment text matched by `_COMMENT_RUN_PATTERN`.

    Carriage returns inside the run are normalised to line feeds. Returns
    True when a run was appended to `current_comment`, and False when the
    input is exhausted or nothing matches at the current position.
    """
    # Note: Comments are never reconsumed
    pos = self.pos
    length = self.length
    if pos >= length:
        return False

    buffer = self.buffer
    # A line feed directly after a previously consumed carriage return is
    # the second half of one line break and must not be reported again.
    if self.ignore_lf and buffer[pos] == "\n":
        self.ignore_lf = False
        pos += 1
        self.pos = pos
        if pos >= length:
            return False

    match = _COMMENT_RUN_PATTERN.match(buffer, pos)
    if not match:
        return False

    chunk = match.group(0)
    # Must be tested on the raw chunk: after normalisation no carriage
    # return is left, so the check would always be False.
    ends_with_cr = chunk.endswith("\r")
    if "\r" in chunk:
        chunk = chunk.replace("\r\n", "\n").replace("\r", "\n")
    self.ignore_lf = ends_with_cr
    self.current_comment.append(chunk)
    self.pos = match.end()
    return True
|
|
1918
|
+
|
|
1919
|
+
def _state_cdata_section(self):
    """Buffer CDATA text until a "]" or the end of input.

    Returns False after a "]" moves the tokenizer to CDATA_SECTION_BRACKET,
    and True when the input ends ("eof-in-cdata", flush, EOF token).
    """
    ch = self._get_char()
    while ch is not None and ch != "]":
        self._append_text(ch)
        ch = self._get_char()
    if ch == "]":
        self.state = self.CDATA_SECTION_BRACKET
        return False
    self._emit_error("eof-in-cdata")
    self._flush_text()
    self._emit_token(EOFToken())
    return True
|
|
1932
|
+
|
|
1933
|
+
def _state_cdata_section_bracket(self):
    """Handle the character after a single "]" inside a CDATA section.

    A second "]" moves to CDATA_SECTION_END. Anything else means the first
    bracket was ordinary text: it is buffered, and the character is either
    reconsumed in CDATA_SECTION or, at end of input, reported as
    "eof-in-cdata" followed by a flush and an EOF token (returns True).
    """
    ch = self._get_char()
    if ch == "]":
        self.state = self.CDATA_SECTION_END
        return False

    # The bracket seen earlier turned out to be plain text.
    self._append_text("]")
    if ch is not None:
        self._reconsume_current()
        self.state = self.CDATA_SECTION
        return False

    self._emit_error("eof-in-cdata")
    self._flush_text()
    self._emit_token(EOFToken())
    return True
|
|
1949
|
+
|
|
1950
|
+
def _state_cdata_section_end(self):
    """Handle the character after "]]" inside a CDATA section.

    ">" closes the section: buffered text is flushed and the state becomes
    DATA. Otherwise one "]" is buffered as text; a further "]" keeps the
    tokenizer in this state, end of input buffers the second "]" too and
    finishes with "eof-in-cdata" (returns True), and any other character
    buffers the second "]" and is reconsumed in CDATA_SECTION.
    """
    ch = self._get_char()
    if ch == ">":
        self._flush_text()
        self.state = self.DATA
        return False

    # Not a terminator: the older of the two pending brackets is text.
    self._append_text("]")
    if ch == "]":
        # Two brackets are still pending, so "]]>" remains possible.
        return False

    self._append_text("]")
    if ch is None:
        self._emit_error("eof-in-cdata")
        self._flush_text()
        self._emit_token(EOFToken())
        return True

    self._reconsume_current()
    self.state = self.CDATA_SECTION
    return False
|
|
1975
|
+
|
|
1976
|
+
def _state_rcdata(self):
    """Consume RCDATA content up to the next "<" or the end of input.

    Plain runs are buffered in bulk, "&" is buffered as-is, and NUL is
    reported and replaced with U+FFFD. Returns False after a "<" moves the
    tokenizer to RCDATA_LESS_THAN_SIGN, and True once the input is exhausted
    (buffered text is flushed and an EOF token emitted).
    """
    data = self.buffer
    end = self.length
    cursor = self.pos
    while True:
        if self.reconsume:
            self.reconsume = False
            if self.current_char is None:
                self._flush_text()
                self._emit_token(EOFToken())
                return True
            # Step back so the reconsumed character is scanned again.
            self.pos -= 1
            cursor = self.pos

        lt_at = data.find("<", cursor)
        amp_at = data.find("&", cursor)
        nul_at = data.find("\0", cursor)

        # Nearest special character, or the end of input when there is none.
        stop = lt_at if lt_at != -1 else end
        for found in (amp_at, nul_at):
            if found != -1 and found < stop:
                stop = found

        if stop > cursor:
            run = data[cursor:stop]
            self._append_text_chunk(run, ends_with_cr=run.endswith("\r"))
            cursor = stop
            self.pos = cursor

        if cursor >= end:
            self._flush_text()
            self._emit_token(EOFToken())
            return True

        # The cursor now sits on one of the three special characters.
        if nul_at == cursor:
            self.ignore_lf = False
            self._emit_error("unexpected-null-character")
            self._append_text("\ufffd")
        elif amp_at == cursor:
            # Buffered verbatim here; no decoding happens in this method.
            self._append_text("&")
        else:
            # Only "<" is left: it may open the matching end tag.
            self.pos = cursor + 1
            self.state = self.RCDATA_LESS_THAN_SIGN
            return False
        cursor += 1
        self.pos = cursor
|
|
2036
|
+
|
|
2037
|
+
def _state_rcdata_less_than_sign(self):
    """Decide whether a "<" inside RCDATA opens an end tag or is plain text.

    On "/" the tokenizer moves to RCDATA_END_TAG_OPEN. Any other character
    (or end of input) makes the "<" ordinary text and is reconsumed in
    RCDATA. Always returns False.
    """
    if self._get_char() == "/":
        # A fresh end-tag candidate starts here. Both name buffers are reset:
        # `original_tag_name` is not cleared on every exit path of the
        # end-tag-name states, and stale letters would otherwise be emitted
        # as text when this candidate turns out not to match.
        self.current_tag_name.clear()
        self.original_tag_name.clear()
        self.state = self.RCDATA_END_TAG_OPEN
        return False
    self._append_text("<")
    self._reconsume_current()
    self.state = self.RCDATA
    return False
|
|
2047
|
+
|
|
2048
|
+
def _state_rcdata_end_tag_open(self):
    """Handle the character after "</" inside RCDATA.

    An ASCII letter starts an end-tag name (stored lowercased and in its
    original case). Anything else turns "</" into text and is reconsumed in
    RCDATA. Always returns False.
    """
    ch = self._get_char()
    is_letter = ch is not None and ("a" <= ch <= "z" or "A" <= ch <= "Z")
    if not is_letter:
        self.text_buffer.extend(("<", "/"))
        self._reconsume_current()
        self.state = self.RCDATA
        return False
    self.current_tag_name.append(ch.lower())
    self.original_tag_name.append(ch)
    self.state = self.RCDATA_END_TAG_NAME
    return False
|
|
2059
|
+
|
|
2060
|
+
def _state_rcdata_end_tag_name(self):
    """Collect an end-tag name inside RCDATA and act on what follows it.

    When the name equals `rawtext_tag_name`:
      * ">" flushes pending text, emits the end tag and returns to DATA;
      * whitespace continues in BEFORE_ATTRIBUTE_NAME as an end tag;
      * "/" flushes pending text and continues in SELF_CLOSING_START_TAG.
    In every other case "</" plus the name in its original case is buffered
    as text; the terminating character is then reconsumed in RCDATA, or, at
    end of input, the text is flushed and an EOF token emitted.

    Returns True only when the input is exhausted.
    """
    ch = self._get_char()
    while ch is not None and ("A" <= ch <= "Z" or "a" <= ch <= "z"):
        self.current_tag_name.append(ch.lower())
        self.original_tag_name.append(ch)
        ch = self._get_char()

    tag_name = "".join(self.current_tag_name)
    if tag_name == self.rawtext_tag_name:
        if ch == ">":
            # End tags carry no attributes. Use an empty dict, the container
            # type `_emit_current_tag` attaches to tag tokens, not a list.
            tag = Tag(Tag.END, tag_name, {}, False)
            self._flush_text()
            self._emit_token(tag)
            self.state = self.DATA
            self.rawtext_tag_name = None
            self.original_tag_name.clear()
            return False
        if ch in (" ", "\t", "\n", "\r", "\f"):
            # Whitespace after tag name - switch to BEFORE_ATTRIBUTE_NAME
            self.current_tag_kind = Tag.END
            self.current_tag_attrs = {}
            self.state = self.BEFORE_ATTRIBUTE_NAME
            return False
        if ch == "/":
            self._flush_text()
            self.current_tag_kind = Tag.END
            self.current_tag_attrs = {}
            self.state = self.SELF_CLOSING_START_TAG
            return False

    # Not a usable end tag: give the consumed characters back as text,
    # preserving the case the author typed.
    self.text_buffer.extend(("<", "/"))
    for piece in self.original_tag_name:
        self._append_text(piece)
    self.current_tag_name.clear()
    self.original_tag_name.clear()
    if ch is None:
        self._flush_text()
        self._emit_token(EOFToken())
        return True
    self._reconsume_current()
    self.state = self.RCDATA
    return False
|
|
2112
|
+
|
|
2113
|
+
def _state_rawtext(self):
    """Consume RAWTEXT content up to the next "<" or the end of input.

    Plain runs are buffered in bulk and NUL is reported and replaced with
    U+FFFD. Inside a script element, "<!--" is buffered as text and moves
    the tokenizer to SCRIPT_DATA_ESCAPED; any other "<" moves it to
    RAWTEXT_LESS_THAN_SIGN. Returns True only when the input is exhausted.
    """
    data = self.buffer
    end = self.length
    cursor = self.pos
    while True:
        if self.reconsume:
            self.reconsume = False
            if self.current_char is None:
                self._flush_text()
                self._emit_token(EOFToken())
                return True
            # Step back so the reconsumed character is scanned again.
            self.pos -= 1
            cursor = self.pos

        lt_at = data.find("<", cursor)
        nul_at = data.find("\0", cursor)
        boundary = end if lt_at == -1 else lt_at

        if nul_at != -1 and nul_at < boundary:
            # A NUL comes first: buffer what precedes it, then replace it.
            if nul_at > cursor:
                run = data[cursor:nul_at]
                self._append_text_chunk(run, ends_with_cr=run.endswith("\r"))
            else:
                self.ignore_lf = False
            self._emit_error("unexpected-null-character")
            self._append_text("\ufffd")
            cursor = nul_at + 1
            self.pos = cursor
            continue

        if lt_at == -1:
            # No "<" left: the remainder is text, followed by EOF.
            if cursor < end:
                run = data[cursor:end]
                self._append_text_chunk(run, ends_with_cr=run.endswith("\r"))
                self.pos = end
            self._flush_text()
            self._emit_token(EOFToken())
            return True

        if lt_at > cursor:
            run = data[cursor:lt_at]
            self._append_text_chunk(run, ends_with_cr=run.endswith("\r"))
        cursor = lt_at + 1
        self.pos = cursor

        # Handle script escaped transition before treating '<' as markup boundary
        if self.rawtext_tag_name == "script":
            lookahead = (self._peek_char(0), self._peek_char(1), self._peek_char(2))
            if lookahead == ("!", "-", "-"):
                self.text_buffer.extend(["<", "!", "-", "-"])
                # Consume the three characters that were just peeked.
                self._get_char()
                self._get_char()
                self._get_char()
                self.state = self.SCRIPT_DATA_ESCAPED
                return False
        self.state = self.RAWTEXT_LESS_THAN_SIGN
        return False
|
|
2169
|
+
|
|
2170
|
+
def _state_rawtext_less_than_sign(self):
    """Decide whether a "<" inside RAWTEXT opens an end tag or is plain text.

    On "/" the tokenizer moves to RAWTEXT_END_TAG_OPEN. Any other character
    (or end of input) makes the "<" ordinary text and is reconsumed in
    RAWTEXT. Always returns False.
    """
    if self._get_char() == "/":
        # A fresh end-tag candidate starts here. Both name buffers are reset:
        # `original_tag_name` is not cleared on every exit path of the
        # end-tag-name states, and stale letters would otherwise be emitted
        # as text when this candidate turns out not to match.
        self.current_tag_name.clear()
        self.original_tag_name.clear()
        self.state = self.RAWTEXT_END_TAG_OPEN
        return False
    self._append_text("<")
    self._reconsume_current()
    self.state = self.RAWTEXT
    return False
|
|
2180
|
+
|
|
2181
|
+
def _state_rawtext_end_tag_open(self):
    """Handle the character after "</" inside RAWTEXT.

    An ASCII letter starts an end-tag name (stored lowercased and in its
    original case). Anything else turns "</" into text and is reconsumed in
    RAWTEXT. Always returns False.
    """
    ch = self._get_char()
    is_letter = ch is not None and ("a" <= ch <= "z" or "A" <= ch <= "Z")
    if not is_letter:
        self.text_buffer.extend(("<", "/"))
        self._reconsume_current()
        self.state = self.RAWTEXT
        return False
    self.current_tag_name.append(ch.lower())
    self.original_tag_name.append(ch)
    self.state = self.RAWTEXT_END_TAG_NAME
    return False
|
|
2192
|
+
|
|
2193
|
+
def _state_rawtext_end_tag_name(self):
    """Collect an end-tag name inside RAWTEXT and act on what follows it.

    When the name equals `rawtext_tag_name`:
      * ">" flushes pending text, emits the end tag and returns to DATA;
      * whitespace continues in BEFORE_ATTRIBUTE_NAME as an end tag;
      * "/" flushes pending text and continues in SELF_CLOSING_START_TAG.
    In every other case "</" plus the name in its original case is buffered
    as text; the terminating character is then reconsumed in RAWTEXT, or,
    at end of input, the text is flushed and an EOF token emitted.

    Returns True only when the input is exhausted.
    """
    ch = self._get_char()
    while ch is not None and ("A" <= ch <= "Z" or "a" <= ch <= "z"):
        self.current_tag_name.append(ch.lower())
        self.original_tag_name.append(ch)
        ch = self._get_char()

    tag_name = "".join(self.current_tag_name)
    if tag_name == self.rawtext_tag_name:
        if ch == ">":
            # End tags carry no attributes. Use an empty dict, the container
            # type `_emit_current_tag` attaches to tag tokens, not a list.
            tag = Tag(Tag.END, tag_name, {}, False)
            self._flush_text()
            self._emit_token(tag)
            self.state = self.DATA
            self.rawtext_tag_name = None
            self.original_tag_name.clear()
            return False
        if ch in (" ", "\t", "\n", "\r", "\f"):
            # Whitespace after tag name - switch to BEFORE_ATTRIBUTE_NAME
            self.current_tag_kind = Tag.END
            self.current_tag_attrs = {}
            self.state = self.BEFORE_ATTRIBUTE_NAME
            return False
        if ch == "/":
            self._flush_text()
            self.current_tag_kind = Tag.END
            self.current_tag_attrs = {}
            self.state = self.SELF_CLOSING_START_TAG
            return False

    # Not a usable end tag: give the consumed characters back as text,
    # preserving the case the author typed.
    self.text_buffer.extend(("<", "/"))
    for piece in self.original_tag_name:
        self._append_text(piece)
    self.current_tag_name.clear()
    self.original_tag_name.clear()
    if ch is None:
        self._flush_text()
        self._emit_token(EOFToken())
        return True
    self._reconsume_current()
    self.state = self.RAWTEXT
    return False
|
|
2245
|
+
|
|
2246
|
+
def _state_plaintext(self):
    """Emit all remaining input as text, followed by EOF. Always returns True.

    NUL characters in the remainder are replaced with U+FFFD and reported
    with a single "unexpected-null-character" error.
    """
    start = self.pos
    if start < self.length:
        rest = self.buffer[start:]
        if "\0" in rest:
            rest = rest.replace("\0", "\ufffd")
            self._emit_error("unexpected-null-character")
        self._append_text(rest)
        self.pos = self.length
    self._flush_text()
    self._emit_token(EOFToken())
    return True
|
|
2259
|
+
|
|
2260
|
+
def _state_script_data_escaped(self):
    """Process one character of escaped script data.

    Returns True at end of input (after flushing text and emitting EOF) and
    False otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._flush_text()
        self._emit_token(EOFToken())
        return True

    if ch == "<":
        # Nothing is buffered here; the next state decides about the "<".
        self.state = self.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN
    elif ch == "-":
        self._append_text("-")
        self.state = self.SCRIPT_DATA_ESCAPED_DASH
    elif ch == "\0":
        self._emit_error("unexpected-null-character")
        self._append_text("\ufffd")
    else:
        self._append_text(ch)
    return False
|
|
2279
|
+
|
|
2280
|
+
def _state_script_data_escaped_dash(self):
    """Process the character after a single "-" in escaped script data.

    Returns True at end of input (after flushing text and emitting EOF) and
    False otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._flush_text()
        self._emit_token(EOFToken())
        return True
    if ch == "<":
        self.state = self.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN
        return False
    if ch == "-":
        self._append_text("-")
        self.state = self.SCRIPT_DATA_ESCAPED_DASH_DASH
        return False

    # Everything else ends the dash run and returns to the escaped state.
    if ch == "\0":
        self._emit_error("unexpected-null-character")
        ch = "\ufffd"
    self._append_text(ch)
    self.state = self.SCRIPT_DATA_ESCAPED
    return False
|
|
2301
|
+
|
|
2302
|
+
def _state_script_data_escaped_dash_dash(self):
    """Process the character after "--" in escaped script data.

    * "-" is buffered and the state is kept.
    * "<" switches to SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN without buffering.
    * ">" is buffered and ends the escaped section (state RAWTEXT).
    * NUL is reported, replaced with U+FFFD, and returns to
      SCRIPT_DATA_ESCAPED; any other character is buffered and does the same.

    Returns True at end of input (after flushing text and emitting EOF) and
    False otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._flush_text()
        self._emit_token(EOFToken())
        return True
    if ch == "-":
        self._append_text("-")
        return False
    if ch == "<":
        # Do not buffer the "<" here: every path through the less-than-sign
        # state (and the end-tag states behind it) emits it, so appending it
        # now would duplicate the character. The escaped and escaped-dash
        # states hand over the same way.
        self.state = self.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN
        return False
    if ch == ">":
        self._append_text(">")
        self.state = self.RAWTEXT
        return False
    if ch == "\0":
        self._emit_error("unexpected-null-character")
        ch = "\ufffd"
    self._append_text(ch)
    self.state = self.SCRIPT_DATA_ESCAPED
    return False
|
|
2327
|
+
|
|
2328
|
+
def _state_script_data_escaped_less_than_sign(self):
    """Handle the character after "<" in escaped script data.

    "/" starts an end-tag candidate. Otherwise the "<" is buffered as text
    and the character is reconsumed: in SCRIPT_DATA_DOUBLE_ESCAPE_START when
    it is an ASCII letter, in SCRIPT_DATA_ESCAPED when it is not. Always
    returns False.
    """
    ch = self._get_char()
    if ch == "/":
        self.temp_buffer.clear()
        self.state = self.SCRIPT_DATA_ESCAPED_END_TAG_OPEN
        return False

    self._append_text("<")
    self._reconsume_current()
    if ch is not None and ("a" <= ch <= "z" or "A" <= ch <= "Z"):
        # Possibly a nested "<script": start collecting its name afresh.
        self.temp_buffer.clear()
        self.state = self.SCRIPT_DATA_DOUBLE_ESCAPE_START
    else:
        self.state = self.SCRIPT_DATA_ESCAPED
    return False
|
|
2345
|
+
|
|
2346
|
+
def _state_script_data_escaped_end_tag_open(self):
    """Handle the character after "</" in escaped script data.

    An ASCII letter resets both tag-name buffers and is reconsumed in
    SCRIPT_DATA_ESCAPED_END_TAG_NAME. Anything else turns "</" into text and
    is reconsumed in SCRIPT_DATA_ESCAPED. Always returns False.
    """
    ch = self._get_char()
    starts_name = ch is not None and ("a" <= ch <= "z" or "A" <= ch <= "Z")
    if starts_name:
        self.current_tag_name.clear()
        self.original_tag_name.clear()
        next_state = self.SCRIPT_DATA_ESCAPED_END_TAG_NAME
    else:
        self.text_buffer.extend(("<", "/"))
        next_state = self.SCRIPT_DATA_ESCAPED
    self._reconsume_current()
    self.state = next_state
    return False
|
|
2358
|
+
|
|
2359
|
+
def _state_script_data_escaped_end_tag_name(self):
    """Process one character of an end-tag name in escaped script data.

    ASCII letters are collected (lowercased, original case, and in
    `temp_buffer`). For any other character, when the collected name equals
    `rawtext_tag_name`:
      * whitespace continues in BEFORE_ATTRIBUTE_NAME as an end tag;
      * "/" flushes pending text and continues in SELF_CLOSING_START_TAG;
      * ">" flushes pending text, emits the end tag and returns to DATA.
    Otherwise "</" plus the collected letters is buffered as text and the
    character is reconsumed in SCRIPT_DATA_ESCAPED. Always returns False.
    """
    c = self._get_char()
    if c is not None and ("A" <= c <= "Z" or "a" <= c <= "z"):
        self.current_tag_name.append(c.lower())
        self.original_tag_name.append(c)
        self.temp_buffer.append(c)
        return False

    tag_name = "".join(self.current_tag_name)
    if tag_name == self.rawtext_tag_name:
        if c in (" ", "\t", "\n", "\r", "\f"):
            self.current_tag_kind = Tag.END
            self.current_tag_attrs = {}
            self.state = self.BEFORE_ATTRIBUTE_NAME
            return False
        if c == "/":
            self._flush_text()
            self.current_tag_kind = Tag.END
            self.current_tag_attrs = {}
            self.state = self.SELF_CLOSING_START_TAG
            return False
        if c == ">":
            self._flush_text()
            # End tags carry no attributes. Use an empty dict, the container
            # type `_emit_current_tag` attaches to tag tokens, not a list.
            tag = Tag(Tag.END, tag_name, {}, False)
            self._emit_token(tag)
            self.state = self.DATA
            self.rawtext_tag_name = None
            self.current_tag_name.clear()
            self.original_tag_name.clear()
            return False

    # Not an appropriate end tag: hand the consumed characters back as text.
    self.text_buffer.extend(("<", "/"))
    for piece in self.temp_buffer:
        self._append_text(piece)
    self._reconsume_current()
    self.state = self.SCRIPT_DATA_ESCAPED
    return False
|
|
2399
|
+
|
|
2400
|
+
def _state_script_data_double_escape_start(self):
    """Process one character of a possible nested "<script" name.

    Letters are collected in `temp_buffer` and buffered as text. Whitespace,
    "/" or ">" ends the name: the character is buffered and the state
    becomes SCRIPT_DATA_DOUBLE_ESCAPED when the collected name is "script"
    (ignoring case), otherwise SCRIPT_DATA_ESCAPED. Any other character is
    reconsumed in SCRIPT_DATA_ESCAPED. Always returns False.
    """
    ch = self._get_char()
    if ch in (" ", "\t", "\n", "\r", "\f", "/", ">"):
        collected = "".join(self.temp_buffer).lower()
        self.state = (
            self.SCRIPT_DATA_DOUBLE_ESCAPED if collected == "script" else self.SCRIPT_DATA_ESCAPED
        )
        self._append_text(ch)
    elif ch is not None and ("a" <= ch <= "z" or "A" <= ch <= "Z"):
        self.temp_buffer.append(ch)
        self._append_text(ch)
    else:
        self._reconsume_current()
        self.state = self.SCRIPT_DATA_ESCAPED
    return False
|
|
2418
|
+
|
|
2419
|
+
def _state_script_data_double_escaped(self):
    """Process one character of double-escaped script data.

    Returns True at end of input (after flushing text and emitting EOF) and
    False otherwise.
    """
    ch = self._get_char()
    if ch is None:
        self._flush_text()
        self._emit_token(EOFToken())
        return True

    if ch == "\0":
        self._emit_error("unexpected-null-character")
        self._append_text("\ufffd")
        return False

    # Every remaining character is buffered; "-" and "<" also change state.
    self._append_text(ch)
    if ch == "-":
        self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED_DASH
    elif ch == "<":
        self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
    return False
|
|
2439
|
+
|
|
2440
|
+
def _state_script_data_double_escaped_dash(self):
    """Consume the character following a single ``-`` in double-escaped script text.

    At end of input the pending text is flushed, an ``EOFToken`` is emitted
    and True is returned. Otherwise the character is appended to the text
    output (NUL is reported and replaced by U+FFFD) and False is returned
    after selecting the next state: a second ``-`` moves to the dash-dash
    state, ``<`` to the less-than-sign state, everything else back to the
    plain double-escaped state.
    """
    ch = self._get_char()

    if ch is None:
        # NOTE(review): the HTML standard also reports
        # "eof-in-script-html-comment-like-text" here; no error is emitted
        # by this handler - confirm whether that is intentional.
        self._flush_text()
        self._emit_token(EOFToken())
        return True

    if ch == "\0":
        self._emit_error("unexpected-null-character")
        self._append_text("\ufffd")
        self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
        return False

    self._append_text(ch)
    if ch == "-":
        following = self.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH
    elif ch == "<":
        following = self.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
    else:
        following = self.SCRIPT_DATA_DOUBLE_ESCAPED
    self.state = following
    return False
|
|
2462
|
+
|
|
2463
|
+
def _state_script_data_double_escaped_dash_dash(self):
    """Consume the character following ``--`` in double-escaped script text.

    At end of input the pending text is flushed, an ``EOFToken`` is emitted
    and True is returned. Otherwise the character is appended to the text
    output (NUL is reported and replaced by U+FFFD) and False is returned:

    * ``-`` keeps the current state (a longer run of dashes),
    * ``<`` moves to the double-escaped less-than-sign state,
    * ``>`` completes ``-->`` and switches to ``RAWTEXT``,
    * anything else returns to the plain double-escaped state.
    """
    ch = self._get_char()

    if ch is None:
        # NOTE(review): the HTML standard also reports
        # "eof-in-script-html-comment-like-text" here; no error is emitted
        # by this handler - confirm whether that is intentional.
        self._flush_text()
        self._emit_token(EOFToken())
        return True

    if ch == "\0":
        self._emit_error("unexpected-null-character")
        self._append_text("\ufffd")
        self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
        return False

    self._append_text(ch)
    if ch == "-":
        # Still in a run of dashes; the state does not change.
        return False

    if ch == "<":
        self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN
    elif ch == ">":
        # NOTE(review): the HTML standard names the "script data" state
        # here; presumably plain script content is handled by the RAWTEXT
        # handler in this tokenizer - confirm.
        self.state = self.RAWTEXT
    else:
        self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
    return False
|
|
2490
|
+
|
|
2491
|
+
def _state_script_data_double_escaped_less_than_sign(self):
    """Consume the character following ``<`` in double-escaped script text.

    Only ``/`` is significant here: it resets ``temp_buffer``, is appended
    to the text output and switches to the double-escape-end state, where
    the name after ``</`` is collected. Every other input (including end of
    input) is reconsumed in the plain double-escaped state.

    Always returns False.
    """
    c = self._get_char()
    if c == "/":
        self.temp_buffer.clear()
        self._append_text("/")
        self.state = self.SCRIPT_DATA_DOUBLE_ESCAPE_END
        return False
    # Per the HTML standard a start-tag-like sequence ("<b>", "<script>")
    # has no effect while double-escaped. Routing letters through the
    # double-escape-start handler would drop the tokenizer back to the
    # single-escaped state for any name other than "script", so a later
    # "</script>" would wrongly be treated as the real end tag.
    self._reconsume_current()
    self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
    return False
|
|
2506
|
+
|
|
2507
|
+
def _state_script_data_double_escape_end(self):
    """Handle one character of the name after ``</`` in double-escaped script text.

    ASCII letters are collected in ``temp_buffer`` and echoed to the text
    output. When a delimiter (whitespace, ``/`` or ``>``) ends the name, a
    collected name of ``script`` (case-insensitive) leaves the double-escaped
    level and returns to the escaped state; any other name stays
    double-escaped. Any other input (including end of input) is reconsumed
    in the double-escaped state.

    Always returns False.
    """
    ch = self._get_char()

    if ch in (" ", "\t", "\n", "\r", "\f", "/", ">"):
        # Letters were stored as typed, so lowercase before comparing.
        closes_inner_script = "".join(self.temp_buffer).lower() == "script"
        if closes_inner_script:
            self.state = self.SCRIPT_DATA_ESCAPED
        else:
            self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
        self._append_text(ch)
    elif ch is None or not ("A" <= ch <= "Z" or "a" <= ch <= "z"):
        # Neither a delimiter nor a letter: this was not a tag name.
        self._reconsume_current()
        self.state = self.SCRIPT_DATA_DOUBLE_ESCAPED
    else:
        self.temp_buffer.append(ch)
        self._append_text(ch)

    return False
|
|
2526
|
+
|
|
2527
|
+
|
|
2528
|
+
# Table of per-state handler functions, attached to the class after its
# definition.
# NOTE(review): presumably indexed by the integer value of ``self.state``, so
# the order below must match the state constants - confirm against their
# definitions before reordering or inserting entries.
Tokenizer._STATE_HANDLERS = [
    getattr(Tokenizer, "_state_" + suffix)
    for suffix in (
        # Data and tags
        "data",
        "tag_open",
        "end_tag_open",
        "tag_name",
        # Attributes
        "before_attribute_name",
        "attribute_name",
        "after_attribute_name",
        "before_attribute_value",
        "attribute_value_double",
        "attribute_value_single",
        "attribute_value_unquoted",
        "after_attribute_value_quoted",
        "self_closing_start_tag",
        # Markup declarations and comments
        "markup_declaration_open",
        "comment_start",
        "comment_start_dash",
        "comment",
        "comment_end_dash",
        "comment_end",
        "comment_end_bang",
        "bogus_comment",
        # DOCTYPE
        "doctype",
        "before_doctype_name",
        "doctype_name",
        "after_doctype_name",
        "bogus_doctype",
        "after_doctype_public_keyword",
        "after_doctype_system_keyword",
        "before_doctype_public_identifier",
        "doctype_public_identifier_double_quoted",
        "doctype_public_identifier_single_quoted",
        "after_doctype_public_identifier",
        "between_doctype_public_and_system_identifiers",
        "before_doctype_system_identifier",
        "doctype_system_identifier_double_quoted",
        "doctype_system_identifier_single_quoted",
        "after_doctype_system_identifier",
        # CDATA
        "cdata_section",
        "cdata_section_bracket",
        "cdata_section_end",
        # RCDATA / RAWTEXT / PLAINTEXT
        "rcdata",
        "rcdata_less_than_sign",
        "rcdata_end_tag_open",
        "rcdata_end_tag_name",
        "rawtext",
        "rawtext_less_than_sign",
        "rawtext_end_tag_open",
        "rawtext_end_tag_name",
        "plaintext",
        # Script data (escaped and double-escaped)
        "script_data_escaped",
        "script_data_escaped_dash",
        "script_data_escaped_dash_dash",
        "script_data_escaped_less_than_sign",
        "script_data_escaped_end_tag_open",
        "script_data_escaped_end_tag_name",
        "script_data_double_escape_start",
        "script_data_double_escaped",
        "script_data_double_escaped_dash",
        "script_data_double_escaped_dash_dash",
        "script_data_double_escaped_less_than_sign",
        "script_data_double_escape_end",
    )
]
|