justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- justhtml/__init__.py +28 -0
- justhtml/__main__.py +161 -13
- justhtml/constants.py +17 -1
- justhtml/context.py +7 -1
- justhtml/encoding.py +405 -0
- justhtml/entities.py +57 -17
- justhtml/errors.py +20 -4
- justhtml/linkify.py +438 -0
- justhtml/node.py +738 -41
- justhtml/parser.py +188 -21
- justhtml/py.typed +0 -0
- justhtml/sanitize.py +1141 -0
- justhtml/selector.py +240 -104
- justhtml/serialize.py +418 -57
- justhtml/stream.py +34 -10
- justhtml/tokenizer.py +433 -289
- justhtml/tokens.py +91 -23
- justhtml/transforms.py +690 -0
- justhtml/treebuilder.py +196 -111
- justhtml/treebuilder_modes.py +191 -117
- justhtml/treebuilder_utils.py +11 -4
- justhtml-0.33.0.dist-info/METADATA +196 -0
- justhtml-0.33.0.dist-info/RECORD +26 -0
- justhtml-0.33.0.dist-info/entry_points.txt +2 -0
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.6.0.dist-info/METADATA +0 -126
- justhtml-0.6.0.dist-info/RECORD +0 -20
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/WHEEL +0 -0
justhtml/__init__.py
CHANGED
|
@@ -1,14 +1,42 @@
|
|
|
1
1
|
from .parser import JustHTML, StrictModeError
|
|
2
|
+
from .sanitize import (
|
|
3
|
+
CSS_PRESET_TEXT,
|
|
4
|
+
DEFAULT_DOCUMENT_POLICY,
|
|
5
|
+
DEFAULT_POLICY,
|
|
6
|
+
SanitizationPolicy,
|
|
7
|
+
UnsafeHtmlError,
|
|
8
|
+
UrlPolicy,
|
|
9
|
+
UrlProxy,
|
|
10
|
+
UrlRule,
|
|
11
|
+
)
|
|
2
12
|
from .selector import SelectorError, matches, query
|
|
3
13
|
from .serialize import to_html, to_test_format
|
|
4
14
|
from .stream import stream
|
|
5
15
|
from .tokens import ParseError
|
|
16
|
+
from .transforms import CollapseWhitespace, Drop, Edit, Empty, Linkify, PruneEmpty, Sanitize, SetAttrs, Unwrap
|
|
6
17
|
|
|
7
18
|
__all__ = [
|
|
19
|
+
"CSS_PRESET_TEXT",
|
|
20
|
+
"DEFAULT_DOCUMENT_POLICY",
|
|
21
|
+
"DEFAULT_POLICY",
|
|
22
|
+
"CollapseWhitespace",
|
|
23
|
+
"Drop",
|
|
24
|
+
"Edit",
|
|
25
|
+
"Empty",
|
|
8
26
|
"JustHTML",
|
|
27
|
+
"Linkify",
|
|
9
28
|
"ParseError",
|
|
29
|
+
"PruneEmpty",
|
|
30
|
+
"SanitizationPolicy",
|
|
31
|
+
"Sanitize",
|
|
10
32
|
"SelectorError",
|
|
33
|
+
"SetAttrs",
|
|
11
34
|
"StrictModeError",
|
|
35
|
+
"UnsafeHtmlError",
|
|
36
|
+
"Unwrap",
|
|
37
|
+
"UrlPolicy",
|
|
38
|
+
"UrlProxy",
|
|
39
|
+
"UrlRule",
|
|
12
40
|
"matches",
|
|
13
41
|
"query",
|
|
14
42
|
"stream",
|
justhtml/__main__.py
CHANGED
|
@@ -1,28 +1,176 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
"""Command-line interface for JustHTML."""
|
|
3
3
|
|
|
4
|
-
|
|
4
|
+
from __future__ import annotations
|
|
5
5
|
|
|
6
|
+
import argparse
|
|
7
|
+
import io
|
|
6
8
|
import sys
|
|
9
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import TextIO, cast
|
|
7
12
|
|
|
8
13
|
from . import JustHTML
|
|
14
|
+
from .context import FragmentContext
|
|
15
|
+
from .selector import SelectorError
|
|
9
16
|
|
|
10
17
|
|
|
11
|
-
def
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
18
|
+
def _get_version() -> str:
|
|
19
|
+
try:
|
|
20
|
+
return version("justhtml")
|
|
21
|
+
except PackageNotFoundError: # pragma: no cover
|
|
22
|
+
return "dev"
|
|
16
23
|
|
|
17
|
-
|
|
24
|
+
|
|
25
|
+
def _parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the justhtml command-line arguments in ``argv``.

    Returns the populated namespace. When no input path was given, the help
    text is printed to stderr and the process exits with status 1.
    """
    usage_examples = (
        "Examples:\n"
        " justhtml page.html\n"
        " curl -s https://example.com | justhtml -\n"
        " justhtml page.html --selector 'main p' --format text\n"
        " justhtml page.html --selector 'a' --format html\n"
        " justhtml page.html --selector 'article' --format markdown\n"
        "\n"
        "If you don't have the 'justhtml' command available, use:\n"
        " python -m justhtml ...\n"
    )
    cli = argparse.ArgumentParser(
        prog="justhtml",
        description="Parse HTML5 and output text, pretty-printed HTML, or Markdown.",
        epilog=usage_examples,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Input / output selection.
    cli.add_argument("path", nargs="?", help="HTML file to parse, or '-' to read from stdin")
    cli.add_argument("--output", help="File to write output to")
    cli.add_argument("--selector", help="CSS selector for choosing nodes (defaults to the document root)")
    cli.add_argument(
        "--format",
        choices=["html", "text", "markdown"],
        default="html",
        help="Output format (default: html)",
    )

    # Plain on/off switches, declared in the order they appear in --help.
    for flag, description in (
        ("--unsafe", "Disable sanitization (trusted input only)"),
        ("--first", "Only output the first matching node"),
        ("--fragment", "Parse input as an HTML fragment (context: <div>)"),
    ):
        cli.add_argument(flag, action="store_true", help=description)

    # Text-format tuning; --strip and --no-strip share one destination.
    cli.add_argument(
        "--separator",
        default=" ",
        help="Text-only: join string between text nodes (default: a single space)",
    )
    strip_switch = cli.add_mutually_exclusive_group()
    strip_switch.add_argument(
        "--strip",
        action="store_true",
        default=True,
        help="Text-only: strip each text node and drop empty segments (default)",
    )
    strip_switch.add_argument(
        "--no-strip",
        action="store_false",
        dest="strip",
        help="Text-only: preserve text node whitespace",
    )

    cli.add_argument("--version", action="version", version=f"justhtml {_get_version()}")

    namespace = cli.parse_args(argv)
    if namespace.path:
        return namespace

    # A missing (or empty) path is a usage error: show help and bail out.
    cli.print_help(sys.stderr)
    raise SystemExit(1)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _read_html(path: str) -> str | bytes:
|
|
18
112
|
if path == "-":
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
113
|
+
stdin = sys.stdin
|
|
114
|
+
if isinstance(stdin, io.TextIOWrapper):
|
|
115
|
+
data: bytes = stdin.buffer.read()
|
|
116
|
+
return data
|
|
117
|
+
return cast("str", stdin.read())
|
|
118
|
+
|
|
119
|
+
return Path(path).read_bytes()
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def main() -> None:
    """Entry point for the justhtml command-line interface.

    Parses ``sys.argv``, reads the input, builds a ``JustHTML`` document
    (with a ``div`` fragment context when ``--fragment`` is set), selects
    nodes and writes them in the requested format to ``--output`` or stdout.

    Raises:
        SystemExit: with status 2 when the selector raises ``SelectorError``
            (the message is printed to stderr), and with status 1 when the
            selection is empty.
    """
    args = _parse_args(sys.argv[1:])
    html = _read_html(args.path)
    fragment_context = FragmentContext("div") if args.fragment else None
    doc = JustHTML(html, fragment_context=fragment_context)

    try:
        # Without a selector the whole document root is the single result.
        nodes = doc.query(args.selector) if args.selector else [doc.root]
    except SelectorError as e:
        print(str(e), file=sys.stderr)
        raise SystemExit(2) from e

    if not nodes:
        raise SystemExit(1)

    if args.first:
        nodes = [nodes[0]]

    def write_output(out: TextIO) -> None:
        """Serialize the selected nodes to ``out`` in the chosen format.

        HTML and text results are joined with a single newline, Markdown
        results with a blank line; every format ends with one newline.
        """
        # --unsafe turns the safe=... flag passed to the node serializers off.
        safe = not args.unsafe
        if args.format == "html":
            outputs = [node.to_html(safe=safe) for node in nodes]
            out.write("\n".join(outputs))
            out.write("\n")
            return

        if args.format == "text":
            # Keep these branches explicit so coverage will highlight untested CLI options.
            if args.separator == " ":
                if args.strip:
                    outputs = [node.to_text(strip=True, safe=safe) for node in nodes]
                else:
                    outputs = [node.to_text(strip=False, safe=safe) for node in nodes]
            else:
                if args.strip:
                    outputs = [node.to_text(separator=args.separator, strip=True, safe=safe) for node in nodes]
                else:
                    outputs = [node.to_text(separator=args.separator, strip=False, safe=safe) for node in nodes]
            out.write("\n".join(outputs))
            out.write("\n")
            return

        # Remaining choice of --format is "markdown".
        outputs = [node.to_markdown(safe=safe) for node in nodes]
        out.write("\n\n".join(outputs))
        out.write("\n")

    if args.output:
        # Output files are always written as UTF-8 text.
        with Path(args.output).open(mode="w", encoding="utf-8") as outfile:
            write_output(outfile)
        return

    write_output(sys.stdout)
|
|
26
174
|
|
|
27
175
|
|
|
28
176
|
if __name__ == "__main__":
|
justhtml/constants.py
CHANGED
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
"""HTML5 spec constants for tree building."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Final
|
|
6
|
+
|
|
3
7
|
# HTML5 spec: Foreign attribute adjustments for SVG/MathML
|
|
4
8
|
# Maps lowercase attribute names to (prefix, local_name, namespace_url)
|
|
5
9
|
FOREIGN_ATTRIBUTE_ADJUSTMENTS = {
|
|
@@ -180,6 +184,18 @@ HTML4_PUBLIC_PREFIXES = (
|
|
|
180
184
|
|
|
181
185
|
HEADING_ELEMENTS = {"h1", "h2", "h3", "h4", "h5", "h6"}
|
|
182
186
|
|
|
187
|
+
# Elements where pretty-printing and whitespace-collapsing transforms should
|
|
188
|
+
# preserve text node whitespace.
|
|
189
|
+
WHITESPACE_PRESERVING_ELEMENTS: Final[frozenset[str]] = frozenset(
|
|
190
|
+
{
|
|
191
|
+
"code",
|
|
192
|
+
"pre",
|
|
193
|
+
"script",
|
|
194
|
+
"style",
|
|
195
|
+
"textarea",
|
|
196
|
+
}
|
|
197
|
+
)
|
|
198
|
+
|
|
183
199
|
FORMATTING_ELEMENTS = {
|
|
184
200
|
"a",
|
|
185
201
|
"b",
|
|
@@ -284,7 +300,7 @@ SPECIAL_ELEMENTS = {
|
|
|
284
300
|
"wbr",
|
|
285
301
|
}
|
|
286
302
|
|
|
287
|
-
FORMAT_MARKER = object()
|
|
303
|
+
FORMAT_MARKER: Final[object] = object()
|
|
288
304
|
|
|
289
305
|
DEFAULT_SCOPE_TERMINATORS = {
|
|
290
306
|
"applet",
|
justhtml/context.py
CHANGED
|
@@ -1,6 +1,12 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
1
4
|
class FragmentContext:
    """Context element description used when parsing an HTML fragment.

    Holds the context element's tag name and an optional namespace.
    """

    __slots__ = ("namespace", "tag_name")

    tag_name: str
    namespace: str | None

    def __init__(self, tag_name: str, namespace: str | None = None) -> None:
        """Store the context ``tag_name`` and its ``namespace`` (``None`` by default)."""
        self.namespace = namespace
        self.tag_name = tag_name
|
justhtml/encoding.py
ADDED
|
@@ -0,0 +1,405 @@
|
|
|
1
|
+
"""HTML encoding sniffing and decoding.
|
|
2
|
+
|
|
3
|
+
Implements the HTML encoding sniffing behavior needed for the html5lib-tests
|
|
4
|
+
encoding fixtures.
|
|
5
|
+
|
|
6
|
+
Inputs are bytes and an optional transport-supplied encoding label.
|
|
7
|
+
Outputs are a decoded Unicode string and the chosen encoding name.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
_ASCII_WHITESPACE: set[int] = {0x09, 0x0A, 0x0C, 0x0D, 0x20}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _ascii_lower(b: int) -> int:
|
|
16
|
+
# b is an int 0..255
|
|
17
|
+
if 0x41 <= b <= 0x5A:
|
|
18
|
+
return b | 0x20
|
|
19
|
+
return b
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _is_ascii_alpha(b: int) -> bool:
    """Return True when byte value ``b`` is an ASCII letter of either case."""
    folded = _ascii_lower(b)
    # After case folding only the 'a'..'z' range needs checking.
    return folded >= 0x61 and folded <= 0x7A
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _skip_ascii_whitespace(data: bytes, i: int) -> int:
    """Advance past ASCII whitespace in ``data`` starting at index ``i``.

    Returns the index of the first byte at or after ``i`` that is not in
    ``_ASCII_WHITESPACE``. If every remaining byte is whitespace the result
    is ``len(data)``; an ``i`` already at or past the end is returned as is.
    """
    size = len(data)
    for pos in range(i, size):
        if data[pos] not in _ASCII_WHITESPACE:
            return pos
    return max(i, size)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _strip_ascii_whitespace(value: bytes | None) -> bytes | None:
|
|
35
|
+
if value is None:
|
|
36
|
+
return None
|
|
37
|
+
start = 0
|
|
38
|
+
end = len(value)
|
|
39
|
+
while start < end and value[start] in _ASCII_WHITESPACE:
|
|
40
|
+
start += 1
|
|
41
|
+
while end > start and value[end - 1] in _ASCII_WHITESPACE:
|
|
42
|
+
end -= 1
|
|
43
|
+
return value[start:end]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# Lowercased encoding label -> canonical encoding name understood by this module.
_ENCODING_LABEL_ALIASES: dict[str, str] = {
    # Security: never allow utf-7.
    "utf-7": "windows-1252",
    "utf7": "windows-1252",
    "x-utf-7": "windows-1252",
    "utf-8": "utf-8",
    "utf8": "utf-8",
    # HTML treats latin-1 labels as windows-1252.
    "iso-8859-1": "windows-1252",
    "iso8859-1": "windows-1252",
    "latin1": "windows-1252",
    "latin-1": "windows-1252",
    "l1": "windows-1252",
    "cp819": "windows-1252",
    "ibm819": "windows-1252",
    "windows-1252": "windows-1252",
    "windows1252": "windows-1252",
    "cp1252": "windows-1252",
    "x-cp1252": "windows-1252",
    "iso-8859-2": "iso-8859-2",
    "iso8859-2": "iso-8859-2",
    "latin2": "iso-8859-2",
    "latin-2": "iso-8859-2",
    "euc-jp": "euc-jp",
    "eucjp": "euc-jp",
    "utf-16": "utf-16",
    "utf16": "utf-16",
    "utf-16le": "utf-16le",
    "utf16le": "utf-16le",
    "utf-16be": "utf-16be",
    "utf16be": "utf-16be",
}


def normalize_encoding_label(label: str | bytes | None) -> str | None:
    """Map an encoding label to the canonical name used by this module.

    ``label`` may be text or bytes (non-ASCII bytes are dropped before
    matching). Matching ignores surrounding whitespace and letter case.
    Returns ``None`` for empty or unrecognized labels; utf-7 labels are
    deliberately mapped to ``"windows-1252"``.
    """
    if not label:
        return None

    text = label.decode("ascii", "ignore") if isinstance(label, bytes) else str(label)
    # A blank label becomes "" here, which is not a key, so it yields None too.
    return _ENCODING_LABEL_ALIASES.get(text.strip().lower())
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _normalize_meta_declared_encoding(label: bytes | None) -> str | None:
    """Normalize an encoding label that was declared in a ``<meta>`` element.

    Same result as ``normalize_encoding_label`` except that UTF-16/UTF-32
    declarations are reported as ``"utf-8"``. Returns ``None`` when the label
    is missing or unrecognized.
    """
    declared = normalize_encoding_label(label)

    # Per HTML meta charset handling: ignore UTF-16/UTF-32 declarations and
    # treat them as UTF-8.
    wide_encodings = {"utf-16", "utf-16le", "utf-16be", "utf-32", "utf-32le", "utf-32be"}
    return "utf-8" if declared in wide_encodings else declared
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _sniff_bom(data: bytes) -> tuple[str | None, int]:
|
|
111
|
+
if len(data) >= 3 and data[0:3] == b"\xef\xbb\xbf":
|
|
112
|
+
return "utf-8", 3
|
|
113
|
+
if len(data) >= 2 and data[0:2] == b"\xff\xfe":
|
|
114
|
+
return "utf-16le", 2
|
|
115
|
+
if len(data) >= 2 and data[0:2] == b"\xfe\xff":
|
|
116
|
+
return "utf-16be", 2
|
|
117
|
+
return None, 0
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _extract_charset_from_content(content_bytes: bytes) -> bytes | None:
|
|
121
|
+
if not content_bytes:
|
|
122
|
+
return None
|
|
123
|
+
|
|
124
|
+
# Normalize whitespace to spaces for robust matching.
|
|
125
|
+
b = bytearray()
|
|
126
|
+
for ch in content_bytes:
|
|
127
|
+
if ch in _ASCII_WHITESPACE:
|
|
128
|
+
b.append(0x20)
|
|
129
|
+
else:
|
|
130
|
+
b.append(_ascii_lower(ch))
|
|
131
|
+
s = bytes(b)
|
|
132
|
+
|
|
133
|
+
idx = s.find(b"charset")
|
|
134
|
+
if idx == -1:
|
|
135
|
+
return None
|
|
136
|
+
|
|
137
|
+
i = idx + len(b"charset")
|
|
138
|
+
n = len(s)
|
|
139
|
+
while i < n and s[i] in _ASCII_WHITESPACE:
|
|
140
|
+
i += 1
|
|
141
|
+
if i >= n or s[i] != 0x3D: # '='
|
|
142
|
+
return None
|
|
143
|
+
i += 1
|
|
144
|
+
while i < n and s[i] in _ASCII_WHITESPACE:
|
|
145
|
+
i += 1
|
|
146
|
+
if i >= n:
|
|
147
|
+
return None
|
|
148
|
+
|
|
149
|
+
quote: int | None = None
|
|
150
|
+
if s[i] in (0x22, 0x27): # '"' or "'"
|
|
151
|
+
quote = s[i]
|
|
152
|
+
i += 1
|
|
153
|
+
|
|
154
|
+
start = i
|
|
155
|
+
while i < n:
|
|
156
|
+
ch = s[i]
|
|
157
|
+
if quote is not None:
|
|
158
|
+
if ch == quote:
|
|
159
|
+
break
|
|
160
|
+
else:
|
|
161
|
+
if ch in _ASCII_WHITESPACE or ch == 0x3B: # ';'
|
|
162
|
+
break
|
|
163
|
+
i += 1
|
|
164
|
+
|
|
165
|
+
if quote is not None and (i >= n or s[i] != quote):
|
|
166
|
+
return None
|
|
167
|
+
|
|
168
|
+
return s[start:i]
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _prescan_for_meta_charset(data: bytes) -> str | None:
    """Prescan the start of ``data`` for a ``<meta>`` encoding declaration.

    Walks the byte stream, skipping comments, end tags and non-``meta`` start
    tags, and inspects the ``charset``, ``http-equiv`` and ``content``
    attributes of each ``<meta ...>`` tag. A ``charset`` attribute is tried
    first, then ``http-equiv="content-type"`` together with ``content``.

    Args:
        data: Raw document bytes (any BOM is not handled here).

    Returns:
        The normalized encoding name of the first usable declaration, or
        ``None`` when none is found within the scan budget or a comment is
        never closed.
    """
    # Scan up to 1024 bytes worth of non-comment input, but allow skipping
    # arbitrarily large comments (bounded by a hard cap).
    max_non_comment = 1024
    max_total_scan = 65536

    n = len(data)
    i = 0
    non_comment = 0

    while i < n and i < max_total_scan and non_comment < max_non_comment:
        if data[i] != 0x3C:  # '<'
            i += 1
            non_comment += 1
            continue

        # Comment
        if i + 3 < n and data[i + 1 : i + 4] == b"!--":
            # The closing "-->" may reuse the two dashes of "<!--" (so "<!-->"
            # is a complete comment); start the search at the first dash.
            end = data.find(b"-->", i + 2)
            if end == -1:
                return None
            i = end + 3
            continue

        # Tag open
        j = i + 1
        if j < n and data[j] == 0x2F:  # '/'
            # Skip end tag.
            k = i
            quote: int | None = None
            while k < n and k < max_total_scan and non_comment < max_non_comment:
                ch = data[k]
                if quote is None:
                    if ch in (0x22, 0x27):
                        quote = ch
                    elif ch == 0x3E:  # '>'
                        k += 1
                        non_comment += 1
                        break
                else:
                    if ch == quote:
                        quote = None
                k += 1
                non_comment += 1
            i = k
            continue

        if j >= n or not _is_ascii_alpha(data[j]):
            # '<' not followed by a letter: treat it as ordinary input.
            i += 1
            non_comment += 1
            continue

        name_start = j
        while j < n and _is_ascii_alpha(data[j]):
            j += 1

        tag_name = data[name_start:j]
        if tag_name.lower() != b"meta":
            # Skip the rest of this tag so we don't accidentally interpret '<'
            # inside an attribute value as a new tag.
            k = i
            quote = None
            while k < n and k < max_total_scan and non_comment < max_non_comment:
                ch = data[k]
                if quote is None:
                    if ch in (0x22, 0x27):
                        quote = ch
                    elif ch == 0x3E:  # '>'
                        k += 1
                        non_comment += 1
                        break
                else:
                    if ch == quote:
                        quote = None
                k += 1
                non_comment += 1
            i = k
            continue

        # Parse attributes until '>'
        charset: bytes | None = None
        http_equiv: bytes | None = None
        content: bytes | None = None

        k = j
        saw_gt = False
        start_i = i
        while k < n and k < max_total_scan:
            ch = data[k]
            if ch == 0x3E:  # '>'
                saw_gt = True
                k += 1
                break

            if ch == 0x3C:  # '<' - restart scanning from here
                break

            if ch in _ASCII_WHITESPACE or ch == 0x2F:  # '/'
                k += 1
                continue

            # Attribute name
            attr_start = k
            while k < n:
                ch = data[k]
                if ch in _ASCII_WHITESPACE or ch in {0x3D, 0x3E, 0x2F, 0x3C}:
                    break
                k += 1
            attr_name = data[attr_start:k].lower()
            k = _skip_ascii_whitespace(data, k)

            value: bytes | None = None
            if k < n and data[k] == 0x3D:  # '='
                k += 1
                k = _skip_ascii_whitespace(data, k)
                if k >= n:
                    break

                quote = None
                if data[k] in (0x22, 0x27):
                    quote = data[k]
                    k += 1
                    val_start = k
                    end_quote = data.find(bytes((quote,)), k)
                    if end_quote == -1:
                        # Unclosed quote: ignore this meta.
                        i += 1
                        non_comment += 1
                        charset = None
                        http_equiv = None
                        content = None
                        saw_gt = False
                        break
                    value = data[val_start:end_quote]
                    k = end_quote + 1
                else:
                    val_start = k
                    while k < n:
                        ch = data[k]
                        if ch in _ASCII_WHITESPACE or ch in {0x3E, 0x3C}:
                            break
                        k += 1
                    value = data[val_start:k]

            # Only these three attributes matter; a repeated attribute
            # overwrites the earlier value.
            if attr_name == b"charset":
                charset = _strip_ascii_whitespace(value)
            elif attr_name == b"http-equiv":
                http_equiv = value
            elif attr_name == b"content":
                content = value

        if saw_gt:
            if charset:
                enc = _normalize_meta_declared_encoding(charset)
                if enc:
                    return enc

            if http_equiv and http_equiv.lower() == b"content-type" and content:
                extracted = _extract_charset_from_content(content)
                if extracted:
                    enc = _normalize_meta_declared_encoding(extracted)
                    if enc:
                        return enc

            # Continue scanning after this tag.
            i = k
            consumed = i - start_i
            non_comment += consumed
        else:
            # Continue scanning after this tag attempt
            i += 1
            non_comment += 1

    return None
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def sniff_html_encoding(data: bytes, transport_encoding: str | None = None) -> tuple[str, int]:
    """Choose the encoding for an HTML byte stream.

    Sources are consulted in order: a recognized ``transport_encoding``
    label, a byte order mark, a ``<meta>`` declaration found by the prescan,
    and finally the ``"windows-1252"`` default.

    Returns:
        ``(encoding_name, bom_length)``; ``bom_length`` is non-zero only when
        the encoding was taken from a BOM.
    """
    # Transport overrides everything.
    from_transport = normalize_encoding_label(transport_encoding)
    if from_transport:
        return from_transport, 0

    from_bom, bom_size = _sniff_bom(data)
    if from_bom:
        return from_bom, bom_size

    return (_prescan_for_meta_charset(data) or "windows-1252"), 0
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def decode_html(data: bytes, transport_encoding: str | None = None) -> tuple[str, str]:
    """Decode an HTML byte stream using HTML encoding sniffing.

    The encoding is chosen by ``sniff_html_encoding``; a detected BOM is
    removed before decoding. Decoding never raises: malformed input is
    replaced with U+FFFD, and the five bytes Python's cp1252 codec leaves
    undefined are mapped to the matching C1 control characters.

    Args:
        data: Raw document bytes.
        transport_encoding: Optional transport-supplied encoding label.

    Returns (text, encoding_name).
    """
    enc, bom_len = sniff_html_encoding(data, transport_encoding=transport_encoding)

    # Allowlist supported decoders.
    if enc not in {
        "utf-8",
        "windows-1252",
        "iso-8859-2",
        "euc-jp",
        "utf-16",
        "utf-16le",
        "utf-16be",
    }:  # pragma: no cover
        enc = "windows-1252"
        bom_len = 0

    payload = data[bom_len:] if bom_len else data

    if enc == "windows-1252":
        try:
            # Fast path: the vast majority of inputs decode cleanly.
            return payload.decode("cp1252"), "windows-1252"
        except UnicodeDecodeError:
            # Python's cp1252 leaves 0x81, 0x8D, 0x8F, 0x90 and 0x9D undefined
            # and would raise here. The web's windows-1252 maps them to the
            # C1 controls with the same code point, so smuggle them through
            # as lone surrogates (U+DC00 + byte) and map those back.
            c1_fixups = {0xDC00 + b: b for b in (0x81, 0x8D, 0x8F, 0x90, 0x9D)}
            text = payload.decode("cp1252", "surrogateescape").translate(c1_fixups)
            return text, "windows-1252"

    if enc == "iso-8859-2":
        return payload.decode("iso-8859-2", "replace"), "iso-8859-2"

    if enc == "euc-jp":
        return payload.decode("euc_jp", "replace"), "euc-jp"

    if enc == "utf-16le":
        return payload.decode("utf-16le", "replace"), "utf-16le"

    if enc == "utf-16be":
        return payload.decode("utf-16be", "replace"), "utf-16be"

    if enc == "utf-16":
        return payload.decode("utf-16", "replace"), "utf-16"

    # Default utf-8
    return payload.decode("utf-8", "replace"), "utf-8"
|