justhtml 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/encoding.py ADDED
@@ -0,0 +1,405 @@
1
+ """HTML encoding sniffing and decoding.
2
+
3
+ Implements the HTML encoding sniffing behavior needed for the html5lib-tests
4
+ encoding fixtures.
5
+
6
+ Inputs are bytes and an optional transport-supplied encoding label.
7
+ Outputs are a decoded Unicode string and the chosen encoding name.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
# Code points the HTML spec defines as "ASCII whitespace": TAB, LF, FF, CR, SPACE.
# Deliberately excludes VT (0x0B), which Python's default str/bytes .strip() includes.
_ASCII_WHITESPACE: set[int] = {0x09, 0x0A, 0x0C, 0x0D, 0x20}
13
+
14
+
15
+ def _ascii_lower(b: int) -> int:
16
+ # b is an int 0..255
17
+ if 0x41 <= b <= 0x5A:
18
+ return b | 0x20
19
+ return b
20
+
21
+
22
def _is_ascii_alpha(b: int) -> bool:
    """True if the byte value is an ASCII letter (A-Z or a-z)."""
    # Equivalent to lowercasing first: the uppercase range maps onto the
    # lowercase range under `| 0x20`, so test both ranges directly.
    return 0x41 <= b <= 0x5A or 0x61 <= b <= 0x7A
25
+
26
+
27
def _skip_ascii_whitespace(data: bytes, i: int) -> int:
    """Return the index of the first byte at or after *i* that is not ASCII whitespace."""
    limit = len(data)
    # b"\t\n\x0c\r " is exactly the HTML ASCII-whitespace set (no VT).
    while i < limit and data[i] in b"\t\n\x0c\r ":
        i += 1
    return i
32
+
33
+
34
def _strip_ascii_whitespace(value: bytes | None) -> bytes | None:
    """Trim leading and trailing ASCII whitespace; None passes through unchanged."""
    if value is None:
        return None
    # bytes.strip with an explicit byte set removes exactly those bytes from
    # both ends — the HTML set, which excludes VT (0x0B).
    return value.strip(b"\t\n\x0c\r ")
44
+
45
+
46
+ def normalize_encoding_label(label: str | bytes | None) -> str | None:
47
+ if not label:
48
+ return None
49
+
50
+ if isinstance(label, bytes):
51
+ label = label.decode("ascii", "ignore")
52
+
53
+ s = str(label).strip()
54
+ if not s:
55
+ return None
56
+
57
+ s = s.lower()
58
+
59
+ # Security: never allow utf-7.
60
+ if s in {"utf-7", "utf7", "x-utf-7"}:
61
+ return "windows-1252"
62
+
63
+ if s in {"utf-8", "utf8"}:
64
+ return "utf-8"
65
+
66
+ # HTML treats latin-1 labels as windows-1252.
67
+ if s in {
68
+ "iso-8859-1",
69
+ "iso8859-1",
70
+ "latin1",
71
+ "latin-1",
72
+ "l1",
73
+ "cp819",
74
+ "ibm819",
75
+ }:
76
+ return "windows-1252"
77
+
78
+ if s in {"windows-1252", "windows1252", "cp1252", "x-cp1252"}:
79
+ return "windows-1252"
80
+
81
+ if s in {"iso-8859-2", "iso8859-2", "latin2", "latin-2"}:
82
+ return "iso-8859-2"
83
+
84
+ if s in {"euc-jp", "eucjp"}:
85
+ return "euc-jp"
86
+
87
+ if s in {"utf-16", "utf16"}:
88
+ return "utf-16"
89
+ if s in {"utf-16le", "utf16le"}:
90
+ return "utf-16le"
91
+ if s in {"utf-16be", "utf16be"}:
92
+ return "utf-16be"
93
+
94
+ return None
95
+
96
+
97
def _normalize_meta_declared_encoding(label: bytes | None) -> str | None:
    """Normalize an encoding label declared inside the document via <meta>.

    Returns None for unknown labels. A declared UTF-16/UTF-32 family label
    cannot be truthful (the prescan read the document as ASCII-compatible
    bytes), so per HTML meta-charset handling it is coerced to UTF-8.
    """
    enc = normalize_encoding_label(label)
    wide = {"utf-16", "utf-16le", "utf-16be", "utf-32", "utf-32le", "utf-32be"}
    # None is not in `wide`, so unknown labels still fall through as None.
    return "utf-8" if enc in wide else enc
108
+
109
+
110
+ def _sniff_bom(data: bytes) -> tuple[str | None, int]:
111
+ if len(data) >= 3 and data[0:3] == b"\xef\xbb\xbf":
112
+ return "utf-8", 3
113
+ if len(data) >= 2 and data[0:2] == b"\xff\xfe":
114
+ return "utf-16le", 2
115
+ if len(data) >= 2 and data[0:2] == b"\xfe\xff":
116
+ return "utf-16be", 2
117
+ return None, 0
118
+
119
+
120
+ def _extract_charset_from_content(content_bytes: bytes) -> bytes | None:
121
+ if not content_bytes:
122
+ return None
123
+
124
+ # Normalize whitespace to spaces for robust matching.
125
+ b = bytearray()
126
+ for ch in content_bytes:
127
+ if ch in _ASCII_WHITESPACE:
128
+ b.append(0x20)
129
+ else:
130
+ b.append(_ascii_lower(ch))
131
+ s = bytes(b)
132
+
133
+ idx = s.find(b"charset")
134
+ if idx == -1:
135
+ return None
136
+
137
+ i = idx + len(b"charset")
138
+ n = len(s)
139
+ while i < n and s[i] in _ASCII_WHITESPACE:
140
+ i += 1
141
+ if i >= n or s[i] != 0x3D: # '='
142
+ return None
143
+ i += 1
144
+ while i < n and s[i] in _ASCII_WHITESPACE:
145
+ i += 1
146
+ if i >= n:
147
+ return None
148
+
149
+ quote: int | None = None
150
+ if s[i] in (0x22, 0x27): # '"' or "'"
151
+ quote = s[i]
152
+ i += 1
153
+
154
+ start = i
155
+ while i < n:
156
+ ch = s[i]
157
+ if quote is not None:
158
+ if ch == quote:
159
+ break
160
+ else:
161
+ if ch in _ASCII_WHITESPACE or ch == 0x3B: # ';'
162
+ break
163
+ i += 1
164
+
165
+ if quote is not None and (i >= n or s[i] != quote):
166
+ return None
167
+
168
+ return s[start:i]
169
+
170
+
171
def _prescan_for_meta_charset(data: bytes) -> str | None:
    """Prescan the byte stream for a <meta>-declared character encoding.

    Implements a bounded version of the HTML "prescan a byte stream to
    determine its encoding" algorithm: walk the input byte-by-byte, skip
    comments and non-meta tags, and for each <meta> tag collect the
    `charset`, `http-equiv`, and `content` attributes. Returns the first
    valid declared encoding (normalized), or None if none is found within
    the scan budget.

    The budget counts up to 1024 non-comment bytes but tolerates large
    comments, with a 64 KiB hard cap on total bytes examined.
    """
    # Scan up to 1024 bytes worth of non-comment input, but allow skipping
    # arbitrarily large comments (bounded by a hard cap).
    max_non_comment = 1024
    max_total_scan = 65536

    n = len(data)
    i = 0
    non_comment = 0  # bytes consumed that were not inside an HTML comment

    while i < n and i < max_total_scan and non_comment < max_non_comment:
        if data[i] != 0x3C:  # '<'
            # Plain text byte: consume and charge it to the non-comment budget.
            i += 1
            non_comment += 1
            continue

        # Comment: "<!--". Jump past "-->" without charging the budget.
        if i + 3 < n and data[i + 1 : i + 4] == b"!--":
            end = data.find(b"-->", i + 4)
            if end == -1:
                # Unterminated comment: nothing after it can be trusted.
                return None
            i = end + 3
            continue

        # Tag open
        j = i + 1
        if j < n and data[j] == 0x2F:  # '/'
            # Skip end tag: scan to the first '>' not inside a quoted
            # attribute value, charging each byte to the budget.
            k = i
            quote: int | None = None
            while k < n and k < max_total_scan and non_comment < max_non_comment:
                ch = data[k]
                if quote is None:
                    if ch in (0x22, 0x27):
                        quote = ch
                    elif ch == 0x3E:  # '>'
                        k += 1
                        non_comment += 1
                        break
                else:
                    if ch == quote:
                        quote = None
                k += 1
                non_comment += 1
            i = k
            continue

        if j >= n or not _is_ascii_alpha(data[j]):
            # '<' not followed by a letter: treat it as text, not a tag.
            i += 1
            non_comment += 1
            continue

        # Read the tag name (ASCII letters only).
        name_start = j
        while j < n and _is_ascii_alpha(data[j]):
            j += 1

        tag_name = data[name_start:j]
        if tag_name.lower() != b"meta":
            # Skip the rest of this tag so we don't accidentally interpret '<'
            # inside an attribute value as a new tag.
            k = i
            quote = None
            while k < n and k < max_total_scan and non_comment < max_non_comment:
                ch = data[k]
                if quote is None:
                    if ch in (0x22, 0x27):
                        quote = ch
                    elif ch == 0x3E:  # '>'
                        k += 1
                        non_comment += 1
                        break
                else:
                    if ch == quote:
                        quote = None
                k += 1
                non_comment += 1
            i = k
            continue

        # This is a <meta ...> tag: parse attributes until '>'.
        charset: bytes | None = None
        http_equiv: bytes | None = None
        content: bytes | None = None

        k = j
        saw_gt = False  # True once the tag closed with a real '>'
        start_i = i     # remember tag start so we can charge its full length
        while k < n and k < max_total_scan:
            ch = data[k]
            if ch == 0x3E:  # '>'
                saw_gt = True
                k += 1
                break

            if ch == 0x3C:  # '<' - restart scanning from here
                break

            if ch in _ASCII_WHITESPACE or ch == 0x2F:  # '/'
                k += 1
                continue

            # Attribute name: runs until whitespace or one of = > / <.
            attr_start = k
            while k < n:
                ch = data[k]
                if ch in _ASCII_WHITESPACE or ch in {0x3D, 0x3E, 0x2F, 0x3C}:
                    break
                k += 1
            attr_name = data[attr_start:k].lower()
            k = _skip_ascii_whitespace(data, k)

            value: bytes | None = None
            if k < n and data[k] == 0x3D:  # '=' — attribute has a value
                k += 1
                k = _skip_ascii_whitespace(data, k)
                if k >= n:
                    break

                quote = None
                if data[k] in (0x22, 0x27):
                    # Quoted value: runs to the matching quote.
                    quote = data[k]
                    k += 1
                    val_start = k
                    end_quote = data.find(bytes((quote,)), k)
                    if end_quote == -1:
                        # Unclosed quote: ignore this meta.
                        # (Discard collected attributes and resume the outer
                        # scan; the outer `else` below advances i once more.)
                        i += 1
                        non_comment += 1
                        charset = None
                        http_equiv = None
                        content = None
                        saw_gt = False
                        break
                    value = data[val_start:end_quote]
                    k = end_quote + 1
                else:
                    # Unquoted value: runs to whitespace, '>', or '<'.
                    val_start = k
                    while k < n:
                        ch = data[k]
                        if ch in _ASCII_WHITESPACE or ch in {0x3E, 0x3C}:
                            break
                        k += 1
                    value = data[val_start:k]

            # Only the last occurrence of each attribute of interest wins.
            if attr_name == b"charset":
                charset = _strip_ascii_whitespace(value)
            elif attr_name == b"http-equiv":
                http_equiv = value
            elif attr_name == b"content":
                content = value

        if saw_gt:
            # <meta charset="..."> takes priority.
            if charset:
                enc = _normalize_meta_declared_encoding(charset)
                if enc:
                    return enc

            # Fall back to <meta http-equiv="content-type" content="...;charset=...">.
            if http_equiv and http_equiv.lower() == b"content-type" and content:
                extracted = _extract_charset_from_content(content)
                if extracted:
                    enc = _normalize_meta_declared_encoding(extracted)
                    if enc:
                        return enc

            # Continue scanning after this tag.
            i = k
            consumed = i - start_i
            non_comment += consumed
        else:
            # Continue scanning after this tag attempt
            i += 1
            non_comment += 1

    return None
345
+
346
+
347
def sniff_html_encoding(data: bytes, transport_encoding: str | None = None) -> tuple[str, int]:
    """Choose a character encoding for an HTML byte stream.

    Precedence order: transport-layer label, then byte-order mark, then
    <meta> prescan, then the windows-1252 default. Returns
    (encoding_name, bom_length) where bom_length is the number of leading
    bytes the caller should skip before decoding.
    """
    # Transport overrides everything.
    declared = normalize_encoding_label(transport_encoding)
    if declared:
        return declared, 0

    bom_encoding, bom_size = _sniff_bom(data)
    if bom_encoding:
        return bom_encoding, bom_size

    prescanned = _prescan_for_meta_charset(data)
    if prescanned:
        return prescanned, 0

    # HTML's default fallback encoding.
    return "windows-1252", 0
362
+
363
+
364
# Bytes left undefined by Python's cp1252 codec. The WHATWG windows-1252
# encoding maps each of them to the C1 control character of the same value.
_CP1252_UNDEFINED = frozenset((0x81, 0x8D, 0x8F, 0x90, 0x9D))


def _decode_windows_1252(payload: bytes) -> str:
    """Decode bytes as WHATWG windows-1252; never raises.

    Python's cp1252 codec raises UnicodeDecodeError on 0x81/0x8D/0x8F/0x90/
    0x9D; the WHATWG Encoding Standard maps those to U+0081/U+008D/U+008F/
    U+0090/U+009D. Fast path: strict decode; slow path only when one of the
    five gap bytes is present.
    """
    try:
        return payload.decode("cp1252")
    except UnicodeDecodeError:
        pass
    pieces: list[str] = []
    run_start = 0
    for idx, byte in enumerate(payload):
        if byte in _CP1252_UNDEFINED:
            pieces.append(payload[run_start:idx].decode("cp1252"))
            pieces.append(chr(byte))  # gap byte -> same-valued C1 control
            run_start = idx + 1
    pieces.append(payload[run_start:].decode("cp1252"))
    return "".join(pieces)


def decode_html(data: bytes, transport_encoding: str | None = None) -> tuple[str, str]:
    """Decode an HTML byte stream using HTML encoding sniffing.

    Args:
        data: Raw HTML bytes.
        transport_encoding: Optional transport-supplied encoding label
            (e.g. from a Content-Type header); it takes precedence over
            in-stream detection.

    Returns (text, encoding_name).

    Bug fix: windows-1252 payloads containing 0x81/0x8D/0x8F/0x90/0x9D no
    longer raise UnicodeDecodeError — they decode to the matching C1
    controls per the WHATWG Encoding Standard.
    """
    enc, bom_len = sniff_html_encoding(data, transport_encoding=transport_encoding)

    # Allowlist supported decoders; map each sniffed name to its Python codec.
    codec_by_name = {
        "utf-8": "utf-8",
        "iso-8859-2": "iso-8859-2",
        "euc-jp": "euc_jp",
        "utf-16": "utf-16",
        "utf-16le": "utf-16le",
        "utf-16be": "utf-16be",
    }
    if enc != "windows-1252" and enc not in codec_by_name:  # pragma: no cover
        enc = "windows-1252"
        bom_len = 0

    # Strip the BOM (if any) before decoding.
    payload = data[bom_len:] if bom_len else data

    if enc == "windows-1252":
        return _decode_windows_1252(payload), "windows-1252"

    return payload.decode(codec_by_name[enc], "replace"), enc