justhtml-0.12.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of justhtml might be problematic.
- justhtml/__init__.py +17 -0
- justhtml/__main__.py +144 -0
- justhtml/constants.py +445 -0
- justhtml/context.py +12 -0
- justhtml/encoding.py +405 -0
- justhtml/entities.py +344 -0
- justhtml/errors.py +140 -0
- justhtml/node.py +632 -0
- justhtml/parser.py +131 -0
- justhtml/py.typed +0 -0
- justhtml/selector.py +965 -0
- justhtml/serialize.py +258 -0
- justhtml/stream.py +107 -0
- justhtml/tokenizer.py +2647 -0
- justhtml/tokens.py +223 -0
- justhtml/treebuilder.py +1279 -0
- justhtml/treebuilder_modes.py +2016 -0
- justhtml/treebuilder_utils.py +93 -0
- justhtml-0.12.0.dist-info/METADATA +164 -0
- justhtml-0.12.0.dist-info/RECORD +23 -0
- justhtml-0.12.0.dist-info/WHEEL +4 -0
- justhtml-0.12.0.dist-info/entry_points.txt +2 -0
- justhtml-0.12.0.dist-info/licenses/LICENSE +21 -0
justhtml/encoding.py
ADDED
@@ -0,0 +1,405 @@

"""HTML encoding sniffing and decoding.

Implements the HTML encoding sniffing behavior needed for the html5lib-tests
encoding fixtures.

Inputs are bytes and an optional transport-supplied encoding label.
Outputs are a decoded Unicode string and the chosen encoding name.
"""

from __future__ import annotations

_ASCII_WHITESPACE: set[int] = {0x09, 0x0A, 0x0C, 0x0D, 0x20}


def _ascii_lower(b: int) -> int:
    # b is an int 0..255
    if 0x41 <= b <= 0x5A:
        return b | 0x20
    return b


def _is_ascii_alpha(b: int) -> bool:
    b = _ascii_lower(b)
    return 0x61 <= b <= 0x7A


def _skip_ascii_whitespace(data: bytes, i: int) -> int:
    n = len(data)
    while i < n and data[i] in _ASCII_WHITESPACE:
        i += 1
    return i


def _strip_ascii_whitespace(value: bytes | None) -> bytes | None:
    if value is None:
        return None
    start = 0
    end = len(value)
    while start < end and value[start] in _ASCII_WHITESPACE:
        start += 1
    while end > start and value[end - 1] in _ASCII_WHITESPACE:
        end -= 1
    return value[start:end]


def normalize_encoding_label(label: str | bytes | None) -> str | None:
    if not label:
        return None

    if isinstance(label, bytes):
        label = label.decode("ascii", "ignore")

    s = str(label).strip()
    if not s:
        return None

    s = s.lower()

    # Security: never allow utf-7.
    if s in {"utf-7", "utf7", "x-utf-7"}:
        return "windows-1252"

    if s in {"utf-8", "utf8"}:
        return "utf-8"

    # HTML treats latin-1 labels as windows-1252.
    if s in {
        "iso-8859-1",
        "iso8859-1",
        "latin1",
        "latin-1",
        "l1",
        "cp819",
        "ibm819",
    }:
        return "windows-1252"

    if s in {"windows-1252", "windows1252", "cp1252", "x-cp1252"}:
        return "windows-1252"

    if s in {"iso-8859-2", "iso8859-2", "latin2", "latin-2"}:
        return "iso-8859-2"

    if s in {"euc-jp", "eucjp"}:
        return "euc-jp"

    if s in {"utf-16", "utf16"}:
        return "utf-16"
    if s in {"utf-16le", "utf16le"}:
        return "utf-16le"
    if s in {"utf-16be", "utf16be"}:
        return "utf-16be"

    return None


def _normalize_meta_declared_encoding(label: bytes | None) -> str | None:
    enc = normalize_encoding_label(label)
    if enc is None:
        return None

    # Per HTML meta charset handling: ignore UTF-16/UTF-32 declarations and
    # treat them as UTF-8.
    if enc in {"utf-16", "utf-16le", "utf-16be", "utf-32", "utf-32le", "utf-32be"}:
        return "utf-8"

    return enc


def _sniff_bom(data: bytes) -> tuple[str | None, int]:
    if len(data) >= 3 and data[0:3] == b"\xef\xbb\xbf":
        return "utf-8", 3
    if len(data) >= 2 and data[0:2] == b"\xff\xfe":
        return "utf-16le", 2
    if len(data) >= 2 and data[0:2] == b"\xfe\xff":
        return "utf-16be", 2
    return None, 0


def _extract_charset_from_content(content_bytes: bytes) -> bytes | None:
    if not content_bytes:
        return None

    # Normalize whitespace to spaces for robust matching.
    b = bytearray()
    for ch in content_bytes:
        if ch in _ASCII_WHITESPACE:
            b.append(0x20)
        else:
            b.append(_ascii_lower(ch))
    s = bytes(b)

    idx = s.find(b"charset")
    if idx == -1:
        return None

    i = idx + len(b"charset")
    n = len(s)
    while i < n and s[i] in _ASCII_WHITESPACE:
        i += 1
    if i >= n or s[i] != 0x3D:  # '='
        return None
    i += 1
    while i < n and s[i] in _ASCII_WHITESPACE:
        i += 1
    if i >= n:
        return None

    quote: int | None = None
    if s[i] in (0x22, 0x27):  # '"' or "'"
        quote = s[i]
        i += 1

    start = i
    while i < n:
        ch = s[i]
        if quote is not None:
            if ch == quote:
                break
        else:
            if ch in _ASCII_WHITESPACE or ch == 0x3B:  # ';'
                break
        i += 1

    if quote is not None and (i >= n or s[i] != quote):
        return None

    return s[start:i]


def _prescan_for_meta_charset(data: bytes) -> str | None:
    # Scan up to 1024 bytes worth of non-comment input, but allow skipping
    # arbitrarily large comments (bounded by a hard cap).
    max_non_comment = 1024
    max_total_scan = 65536

    n = len(data)
    i = 0
    non_comment = 0

    while i < n and i < max_total_scan and non_comment < max_non_comment:
        if data[i] != 0x3C:  # '<'
            i += 1
            non_comment += 1
            continue

        # Comment
        if i + 3 < n and data[i + 1 : i + 4] == b"!--":
            end = data.find(b"-->", i + 4)
            if end == -1:
                return None
            i = end + 3
            continue

        # Tag open
        j = i + 1
        if j < n and data[j] == 0x2F:  # '/'
            # Skip end tag.
            k = i
            quote: int | None = None
            while k < n and k < max_total_scan and non_comment < max_non_comment:
                ch = data[k]
                if quote is None:
                    if ch in (0x22, 0x27):
                        quote = ch
                    elif ch == 0x3E:  # '>'
                        k += 1
                        non_comment += 1
                        break
                else:
                    if ch == quote:
                        quote = None
                k += 1
                non_comment += 1
            i = k
            continue

        if j >= n or not _is_ascii_alpha(data[j]):
            i += 1
            non_comment += 1
            continue

        name_start = j
        while j < n and _is_ascii_alpha(data[j]):
            j += 1

        tag_name = data[name_start:j]
        if tag_name.lower() != b"meta":
            # Skip the rest of this tag so we don't accidentally interpret '<'
            # inside an attribute value as a new tag.
            k = i
            quote = None
            while k < n and k < max_total_scan and non_comment < max_non_comment:
                ch = data[k]
                if quote is None:
                    if ch in (0x22, 0x27):
                        quote = ch
                    elif ch == 0x3E:  # '>'
                        k += 1
                        non_comment += 1
                        break
                else:
                    if ch == quote:
                        quote = None
                k += 1
                non_comment += 1
            i = k
            continue

        # Parse attributes until '>'
        charset: bytes | None = None
        http_equiv: bytes | None = None
        content: bytes | None = None

        k = j
        saw_gt = False
        start_i = i
        while k < n and k < max_total_scan:
            ch = data[k]
            if ch == 0x3E:  # '>'
                saw_gt = True
                k += 1
                break

            if ch == 0x3C:  # '<' - restart scanning from here
                break

            if ch in _ASCII_WHITESPACE or ch == 0x2F:  # '/'
                k += 1
                continue

            # Attribute name
            attr_start = k
            while k < n:
                ch = data[k]
                if ch in _ASCII_WHITESPACE or ch in {0x3D, 0x3E, 0x2F, 0x3C}:
                    break
                k += 1
            attr_name = data[attr_start:k].lower()
            k = _skip_ascii_whitespace(data, k)

            value: bytes | None = None
            if k < n and data[k] == 0x3D:  # '='
                k += 1
                k = _skip_ascii_whitespace(data, k)
                if k >= n:
                    break

                quote = None
                if data[k] in (0x22, 0x27):
                    quote = data[k]
                    k += 1
                    val_start = k
                    end_quote = data.find(bytes((quote,)), k)
                    if end_quote == -1:
                        # Unclosed quote: ignore this meta.
                        i += 1
                        non_comment += 1
                        charset = None
                        http_equiv = None
                        content = None
                        saw_gt = False
                        break
                    value = data[val_start:end_quote]
                    k = end_quote + 1
                else:
                    val_start = k
                    while k < n:
                        ch = data[k]
                        if ch in _ASCII_WHITESPACE or ch in {0x3E, 0x3C}:
                            break
                        k += 1
                    value = data[val_start:k]

            if attr_name == b"charset":
                charset = _strip_ascii_whitespace(value)
            elif attr_name == b"http-equiv":
                http_equiv = value
            elif attr_name == b"content":
                content = value

        if saw_gt:
            if charset:
                enc = _normalize_meta_declared_encoding(charset)
                if enc:
                    return enc

            if http_equiv and http_equiv.lower() == b"content-type" and content:
                extracted = _extract_charset_from_content(content)
                if extracted:
                    enc = _normalize_meta_declared_encoding(extracted)
                    if enc:
                        return enc

            # Continue scanning after this tag.
            i = k
            consumed = i - start_i
            non_comment += consumed
        else:
            # Continue scanning after this tag attempt
            i += 1
            non_comment += 1

    return None


def sniff_html_encoding(data: bytes, transport_encoding: str | None = None) -> tuple[str, int]:
    # Transport overrides everything.
    transport = normalize_encoding_label(transport_encoding)
    if transport:
        return transport, 0

    bom_enc, bom_len = _sniff_bom(data)
    if bom_enc:
        return bom_enc, bom_len

    meta_enc = _prescan_for_meta_charset(data)
    if meta_enc:
        return meta_enc, 0

    return "windows-1252", 0


def decode_html(data: bytes, transport_encoding: str | None = None) -> tuple[str, str]:
    """Decode an HTML byte stream using HTML encoding sniffing.

    Returns (text, encoding_name).
    """
    enc, bom_len = sniff_html_encoding(data, transport_encoding=transport_encoding)

    # Allowlist supported decoders.
    if enc not in {
        "utf-8",
        "windows-1252",
        "iso-8859-2",
        "euc-jp",
        "utf-16",
        "utf-16le",
        "utf-16be",
    }:  # pragma: no cover
        enc = "windows-1252"
        bom_len = 0

    payload = data[bom_len:] if bom_len else data

    if enc == "windows-1252":
        return payload.decode("cp1252"), "windows-1252"

    if enc == "iso-8859-2":
        return payload.decode("iso-8859-2", "replace"), "iso-8859-2"

    if enc == "euc-jp":
        return payload.decode("euc_jp", "replace"), "euc-jp"

    if enc == "utf-16le":
        return payload.decode("utf-16le", "replace"), "utf-16le"

    if enc == "utf-16be":
        return payload.decode("utf-16be", "replace"), "utf-16be"

    if enc == "utf-16":
        return payload.decode("utf-16", "replace"), "utf-16"

    # Default utf-8
    return payload.decode("utf-8", "replace"), "utf-8"
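For orientation, here is a minimal usage sketch (illustrative only, not part of the wheel, and assuming the package is installed as justhtml) showing the precedence the module implements: a transport-supplied label wins, then a byte-order mark, then the <meta> prescan, then the windows-1252 default.

# Sketch exercising sniff_html_encoding / decode_html from the file above.
from justhtml.encoding import decode_html, sniff_html_encoding

# A UTF-8 BOM wins over a conflicting <meta> declaration.
bom_doc = b"\xef\xbb\xbf<meta charset='windows-1252'><p>caf\xc3\xa9</p>"
print(sniff_html_encoding(bom_doc))  # ('utf-8', 3)

# With no transport label or BOM, the <meta> prescan decides.
meta_doc = b"<meta http-equiv='Content-Type' content='text/html; charset=utf-8'>"
print(decode_html(meta_doc)[1])  # 'utf-8'

# A transport-supplied label overrides everything.
print(decode_html(meta_doc, transport_encoding="windows-1252")[1])  # 'windows-1252'

# No label, BOM, or <meta>: fall back to windows-1252.
print(decode_html(b"<p>plain</p>")[1])  # 'windows-1252'

One detail worth noting in decode_html: the windows-1252 branch decodes with the strict cp1252 codec while every other branch passes "replace", so bytes that cp1252 leaves undefined (0x81, 0x8D, 0x8F, 0x90, 0x9D) would raise UnicodeDecodeError rather than being replaced.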