justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic.
- justhtml/__init__.py +44 -2
- justhtml/__main__.py +45 -9
- justhtml/constants.py +12 -0
- justhtml/errors.py +8 -3
- justhtml/linkify.py +438 -0
- justhtml/node.py +54 -35
- justhtml/parser.py +105 -38
- justhtml/sanitize.py +511 -282
- justhtml/selector.py +3 -1
- justhtml/serialize.py +398 -72
- justhtml/tokenizer.py +121 -21
- justhtml/tokens.py +21 -3
- justhtml/transforms.py +2568 -0
- justhtml/treebuilder.py +247 -190
- justhtml/treebuilder_modes.py +108 -102
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/METADATA +28 -7
- justhtml-0.38.0.dist-info/RECORD +26 -0
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +1 -1
- justhtml-0.24.0.dist-info/RECORD +0 -24
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0
justhtml/__init__.py
CHANGED
@@ -1,22 +1,64 @@
 from .parser import JustHTML, StrictModeError
-from .sanitize import
+from .sanitize import (
+    CSS_PRESET_TEXT,
+    DEFAULT_DOCUMENT_POLICY,
+    DEFAULT_POLICY,
+    SanitizationPolicy,
+    UnsafeHtmlError,
+    UrlPolicy,
+    UrlProxy,
+    UrlRule,
+)
 from .selector import SelectorError, matches, query
 from .serialize import to_html, to_test_format
 from .stream import stream
 from .tokens import ParseError
+from .transforms import (
+    CollapseWhitespace,
+    Decide,
+    Drop,
+    Edit,
+    EditAttrs,
+    EditDocument,
+    Empty,
+    Linkify,
+    PruneEmpty,
+    RewriteAttrs,
+    Sanitize,
+    SetAttrs,
+    Stage,
+    Unwrap,
+)
 
 __all__ = [
+    "CSS_PRESET_TEXT",
     "DEFAULT_DOCUMENT_POLICY",
     "DEFAULT_POLICY",
+    "CollapseWhitespace",
+    "Decide",
+    "Drop",
+    "Edit",
+    "EditAttrs",
+    "EditDocument",
+    "Empty",
     "JustHTML",
+    "Linkify",
     "ParseError",
+    "PruneEmpty",
+    "RewriteAttrs",
     "SanitizationPolicy",
+    "Sanitize",
     "SelectorError",
+    "SetAttrs",
+    "Stage",
     "StrictModeError",
+    "UnsafeHtmlError",
+    "Unwrap",
+    "UrlPolicy",
+    "UrlProxy",
     "UrlRule",
     "matches",
     "query",
-    "sanitize",
     "stream",
     "to_html",
     "to_test_format",
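Taken together, this hunk removes the module-level `sanitize` function from the public API and replaces it with policy types from `justhtml.sanitize` plus transform classes from the new `justhtml.transforms`. A minimal import sketch of the 0.38.0 surface (names only, taken from `__all__` above; the transform semantics live in `transforms.py`, whose diff is not shown in this section):

# Hypothetical migration sketch: the removed module-level `sanitize` function
# is gone; sanitization is now reached via policy objects and transforms.
from justhtml import (
    DEFAULT_POLICY,      # still exported
    SanitizationPolicy,  # still exported
    Linkify,             # new in 0.38.0 (from justhtml.transforms)
    Sanitize,            # new in 0.38.0 (from justhtml.transforms)
    UnsafeHtmlError,     # new in 0.38.0 (from justhtml.sanitize)
)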
justhtml/__main__.py
CHANGED
@@ -32,7 +32,7 @@ def _parse_args(argv: list[str]) -> argparse.Namespace:
         " curl -s https://example.com | justhtml -\n"
         " justhtml page.html --selector 'main p' --format text\n"
         " justhtml page.html --selector 'a' --format html\n"
-        " justhtml page.html --selector 'article' --format markdown\n"
+        " justhtml page.html --selector 'article' --allow-tags article --format markdown\n"
         "\n"
         "If you don't have the 'justhtml' command available, use:\n"
         " python -m justhtml ...\n"
@@ -62,6 +62,14 @@ def _parse_args(argv: list[str]) -> argparse.Namespace:
         action="store_true",
         help="Disable sanitization (trusted input only)",
     )
+
+    parser.add_argument(
+        "--allow-tags",
+        help=(
+            "Safe mode: allow these additional tags during sanitization (comma-separated). "
+            "Example: --allow-tags article,section"
+        ),
+    )
     parser.add_argument(
         "--first",
         action="store_true",
@@ -123,7 +131,36 @@ def main() -> None:
     args = _parse_args(sys.argv[1:])
     html = _read_html(args.path)
     fragment_context = FragmentContext("div") if args.fragment else None
-
+    safe = not args.unsafe
+
+    policy = None
+    if safe and args.allow_tags:
+        from .sanitize import DEFAULT_DOCUMENT_POLICY, DEFAULT_POLICY, SanitizationPolicy  # noqa: PLC0415
+
+        extra_tags: set[str] = set()
+        for part in str(args.allow_tags).replace(" ", ",").split(","):
+            tag = part.strip().lower()
+            if tag:
+                extra_tags.add(tag)
+
+        base = DEFAULT_POLICY if fragment_context is not None else DEFAULT_DOCUMENT_POLICY
+        allowed = set(base.allowed_tags)
+        allowed.update(extra_tags)
+        policy = SanitizationPolicy(
+            allowed_tags=allowed,
+            allowed_attributes=base.allowed_attributes,
+            url_policy=base.url_policy,
+            drop_comments=base.drop_comments,
+            drop_doctype=base.drop_doctype,
+            drop_foreign_namespaces=base.drop_foreign_namespaces,
+            drop_content_tags=base.drop_content_tags,
+            allowed_css_properties=base.allowed_css_properties,
+            force_link_rel=base.force_link_rel,
+            unsafe_handling=base.unsafe_handling,
+            disallowed_tag_handling=base.disallowed_tag_handling,
+        )
+
+    doc = JustHTML(html, fragment_context=fragment_context, safe=safe, policy=policy)
 
     try:
         nodes = doc.query(args.selector) if args.selector else [doc.root]
@@ -138,9 +175,8 @@ def main() -> None:
         nodes = [nodes[0]]
 
     def write_output(out: TextIO) -> None:
-        safe = not args.unsafe
         if args.format == "html":
-            outputs = [node.to_html(
+            outputs = [node.to_html() for node in nodes]
             out.write("\n".join(outputs))
             out.write("\n")
             return
@@ -149,19 +185,19 @@ def main() -> None:
         # Keep these branches explicit so coverage will highlight untested CLI options.
         if args.separator == " ":
             if args.strip:
-                outputs = [node.to_text(strip=True
+                outputs = [node.to_text(strip=True) for node in nodes]
             else:
-                outputs = [node.to_text(strip=False
+                outputs = [node.to_text(strip=False) for node in nodes]
         else:
             if args.strip:
-                outputs = [node.to_text(separator=args.separator, strip=True
+                outputs = [node.to_text(separator=args.separator, strip=True) for node in nodes]
             else:
-                outputs = [node.to_text(separator=args.separator, strip=False
+                outputs = [node.to_text(separator=args.separator, strip=False) for node in nodes]
         out.write("\n".join(outputs))
         out.write("\n")
         return
 
-        outputs = [node.to_markdown(
+        outputs = [node.to_markdown() for node in nodes]
         out.write("\n\n".join(outputs))
         out.write("\n")
 
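The new `--allow-tags` branch in `main()` is, in effect, a policy-widening recipe that library users can replicate. A sketch of the programmatic equivalent, assuming only the `SanitizationPolicy` fields and the `JustHTML(..., safe=..., policy=...)` call shown in the hunks above (`article`/`section` stand in for user-chosen tags):

# Widen the default document policy with extra tags, then parse safely.
from justhtml import DEFAULT_DOCUMENT_POLICY, JustHTML, SanitizationPolicy

base = DEFAULT_DOCUMENT_POLICY
policy = SanitizationPolicy(
    allowed_tags=set(base.allowed_tags) | {"article", "section"},
    allowed_attributes=base.allowed_attributes,
    url_policy=base.url_policy,
    drop_comments=base.drop_comments,
    drop_doctype=base.drop_doctype,
    drop_foreign_namespaces=base.drop_foreign_namespaces,
    drop_content_tags=base.drop_content_tags,
    allowed_css_properties=base.allowed_css_properties,
    force_link_rel=base.force_link_rel,
    unsafe_handling=base.unsafe_handling,
    disallowed_tag_handling=base.disallowed_tag_handling,
)
doc = JustHTML("<article><p>hi</p></article>", safe=True, policy=policy)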
justhtml/constants.py
CHANGED
@@ -184,6 +184,18 @@ HTML4_PUBLIC_PREFIXES = (
 
 HEADING_ELEMENTS = {"h1", "h2", "h3", "h4", "h5", "h6"}
 
+# Elements where pretty-printing and whitespace-collapsing transforms should
+# preserve text node whitespace.
+WHITESPACE_PRESERVING_ELEMENTS: Final[frozenset[str]] = frozenset(
+    {
+        "code",
+        "pre",
+        "script",
+        "style",
+        "textarea",
+    }
+)
+
 FORMATTING_ELEMENTS = {
     "a",
     "b",
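The new constant gives transforms a single source of truth for tags whose text must keep its whitespace. An illustrative (hypothetical) consumer, checking membership the way a whitespace-collapsing pass presumably would; the real consumers are the serializer and the `CollapseWhitespace` transform, whose diffs are elsewhere in this release:

from justhtml.constants import WHITESPACE_PRESERVING_ELEMENTS

def may_collapse(tag_name: str) -> bool:
    # Text inside <pre>, <code>, <script>, <style>, <textarea> keeps its
    # whitespace; everything else is fair game for collapsing.
    return tag_name.lower() not in WHITESPACE_PRESERVING_ELEMENTS

assert may_collapse("p") and not may_collapse("pre")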
justhtml/errors.py
CHANGED
@@ -1,7 +1,8 @@
-"""Centralized error message definitions and helpers for
+"""Centralized error message definitions and helpers for JustHTML errors.
 
-This module provides human-readable error messages for
-emitted by
+This module provides human-readable error messages for parse error codes
+emitted by the tokenizer and tree builder during HTML parsing, plus selected
+security findings emitted by the sanitizer.
 """
 
 from __future__ import annotations
@@ -143,6 +144,10 @@ def generate_error_message(code: str, tag_name: str | None = None) -> str:
         "unexpected-start-tag-in-select": f"Unexpected <{tag_name}> start tag in <select>",
         "unexpected-end-tag-in-select": f"Unexpected </{tag_name}> end tag in <select>",
         "unexpected-select-in-select": "Unexpected nested <select> in <select>",
+        # ================================================================
+        # SECURITY ERRORS
+        # ================================================================
+        "unsafe-html": "Unsafe HTML detected by sanitization policy",
     }
 
     # Return message or fall back to the code itself if not found
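Based on the mapping and the fallback comment above, the lookup behaves like this (a sketch against the `generate_error_message` signature this hunk patches):

from justhtml.errors import generate_error_message

print(generate_error_message("unsafe-html"))
# Unsafe HTML detected by sanitization policy
print(generate_error_message("no-such-code"))
# no-such-code  (unknown codes fall back to the code itself)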
justhtml/linkify.py
ADDED
@@ -0,0 +1,438 @@
+"""Text linkification scanner.
+
+This module finds URL/email-like substrings in plain text.
+
+It is intentionally HTML-agnostic: in JustHTML it is applied to DOM text nodes,
+not to raw HTML strings.
+
+The behavior is driven by vendored compliance fixtures from the upstream
+`linkify-it` project (MIT licensed). See `tests/linkify-it/README.md`.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import Final
+
+
+@dataclass(frozen=True, slots=True)
+class LinkMatch:
+    start: int
+    end: int
+    text: str
+    href: str
+    kind: str  # "url" | "email"
+
+
+DEFAULT_TLDS: Final[frozenset[str]] = frozenset(
+    {
+        # Keep this aligned with linkify-it's default list.
+        # See: https://github.com/markdown-it/linkify-it/blob/master/index.mjs
+        "biz",
+        "com",
+        "edu",
+        "gov",
+        "net",
+        "org",
+        "pro",
+        "web",
+        "xxx",
+        "aero",
+        "asia",
+        "coop",
+        "info",
+        "museum",
+        "name",
+        "shop",
+        "рф",
+    }
+)
+
+
+# A pragmatic Unicode-aware domain label pattern.
+#
+# Use `\w` for Unicode letters/digits (and underscore), and reject underscores
+# during validation. This is intentionally stricter than allowing all non-ASCII
+# codepoints, and matches the fixture behavior around delimiter punctuation.
+_LABEL_RE: Final[str] = (
+    r"[0-9A-Za-z\w\u2600-\u27bf]"
+    r"(?:[0-9A-Za-z\w\u2600-\u27bf-]{0,61}[0-9A-Za-z\w\u2600-\u27bf])?"
+)
+
+# A fast-ish candidate matcher. We do real validation after we find a candidate.
+_CANDIDATE_PATTERN: Final[str] = "".join(
+    [
+        r"(?i)([^0-9A-Za-z_])",  # left boundary (avoid matching after underscore)
+        r"(",  # candidate group
+        r"(?:https?|ftp)://[^\s<>\uFF5C]+",  # absolute URL
+        r"|mailto:[^\s<>\uFF5C]+",  # mailto
+        r"|//[^\s<>\uFF5C]+",  # protocol-relative
+        r"|(?:www\.)[^\s<>\uFF5C]+",  # www.
+        rf"|[0-9A-Za-z.!#$%&'*+/=?^_`{{|}}~\-\"]+@(?:{_LABEL_RE}\.)+{_LABEL_RE}",  # email
+        r"|(?:\d{1,3}\.){3}\d{1,3}(?:/[^\s<>\uFF5C]*)?",  # IPv4
+        rf"|(?:{_LABEL_RE}\.)+{_LABEL_RE}(?:/[^\s<>\uFF5C]*)?",  # fuzzy domain/path
+        r")",
+    ]
+)
+
+_CANDIDATE_RE: Final[re.Pattern[str]] = re.compile(_CANDIDATE_PATTERN, re.UNICODE)
+
+_TRAILING_PUNCT: Final[str] = ".,;:!?"
+
+# RE pattern for 2-character TLDs, copied from linkify-it (MIT licensed).
+_CC_TLD_RE: Final[re.Pattern[str]] = re.compile(
+    r"^(?:a[cdefgilmnoqrstuwxz]|b[abdefghijmnorstvwyz]|c[acdfghiklmnoruvwxyz]|d[ejkmoz]|e[cegrstu]|f[ijkmor]|g[abdefghilmnpqrstuwy]|h[kmnrtu]|i[delmnoqrst]|j[emop]|k[eghimnprwyz]|l[abcikrstuvy]|m[acdeghklmnopqrstuvwxyz]|n[acefgilopruz]|om|p[aefghklmnrstwy]|qa|r[eosuw]|s[abcdeghijklmnortuvxyz]|t[cdfghjklmnortvwz]|u[agksyz]|v[aceginu]|w[fs]|y[et]|z[amw])$",
+    re.IGNORECASE,
+)
+
+
+def _is_valid_tld(tld: str, *, extra_tlds: frozenset[str]) -> bool:
+    t = (tld or "").lower()
+    if not t:
+        return False
+    # Only valid 2-letter ccTLDs (avoid false positives like `.js`).
+    if len(t) == 2 and _CC_TLD_RE.match(t) is not None:
+        return True
+    # Any punycode root.
+    if t.startswith("xn--"):
+        return True
+    return t in DEFAULT_TLDS or t in extra_tlds
+
+
+def _split_domain_for_tld(host: str) -> tuple[str, str] | None:
+    # Return (domain_without_tld, tld).
+    h = (host or "").strip().strip(".")
+    if not h:
+        return None
+    if h.lower() == "localhost":
+        return ("localhost", "")
+    if "." not in h:
+        return None
+    base, tld = h.rsplit(".", 1)
+    return (base, tld)
+
+
+@dataclass(frozen=True, slots=True)
+class LinkifyConfig:
+    fuzzy_ip: bool = False
+    extra_tlds: frozenset[str] = frozenset()
+
+    @staticmethod
+    def with_extra_tlds(extra_tlds: list[str] | tuple[str, ...] | set[str] | frozenset[str]) -> LinkifyConfig:
+        return LinkifyConfig(extra_tlds=frozenset(str(t).lower() for t in extra_tlds))
+
+
+def _is_valid_ipv4(host: str) -> bool:
+    parts = host.split(".")
+    if len(parts) != 4:
+        return False
+    for p in parts:
+        if not p or len(p) > 3:
+            return False
+        if not p.isdigit():
+            return False
+        v = int(p)
+        if v < 0 or v > 255:
+            return False
+    return True
+
+
+def _punycode_host(host: str) -> str:
+    # Safety default: normalize Unicode domains to punycode for href.
+    try:
+        return host.encode("idna").decode("ascii")
+    except UnicodeError:
+        return host
+
+
+def _split_host_and_rest(raw: str) -> tuple[str, str]:
+    # raw is after an optional scheme prefix (or for fuzzy domains, the whole).
+    # Extract host[:port] and the rest (path/query/fragment).
+    for i, ch in enumerate(raw):
+        if ch in "/?#":
+            return raw[:i], raw[i:]
+    return raw, ""
+
+
+def _strip_wrapping(raw: str) -> tuple[str, int, int]:
+    # Trim common wrappers like <...> or quotes, but report how many chars were removed
+    # from start/end so we can compute accurate offsets.
+    start_trim = 0
+    end_trim = 0
+
+    if raw and raw[0] in "<\"'([{" and raw[-1] in ">\"')]}":
+        # Angle brackets are common for autolinks.
+        # Quotes/brackets: we strip them only if they wrap the candidate.
+        raw = raw[1:-1]
+        start_trim = 1
+        end_trim = 1
+
+    return raw, start_trim, end_trim
+
+
+def _trim_trailing(candidate: str) -> str:
+    # Remove trailing punctuation and unbalanced closing brackets.
+    if not candidate:
+        return candidate
+
+    # First strip sentence punctuation.
+    while candidate and candidate[-1] in _TRAILING_PUNCT:
+        candidate = candidate[:-1]
+
+    # Then strip quoting terminators when unbalanced (treat quotes as wrappers).
+    while candidate and candidate[-1] in "\"'":
+        q = candidate[-1]
+        if candidate.count(q) % 2 == 1:
+            candidate = candidate[:-1]
+            continue
+        break
+
+    # Then strip unmatched closing brackets.
+    # We treat ) ] } > as potentially closable.
+    pairs = {")": "(", "]": "[", "}": "{", ">": "<"}
+    while candidate and candidate[-1] in pairs:
+        close = candidate[-1]
+        open_ch = pairs[close]
+        if candidate.count(close) > candidate.count(open_ch):
+            candidate = candidate[:-1]
+            continue
+        break
+
+    return candidate
+
+
+def _href_for(text: str) -> tuple[str, str]:
+    lower = text.lower()
+
+    if lower.startswith("mailto:"):
+        return text, "email"
+
+    if "@" in text and not lower.startswith(("http://", "https://", "ftp://", "//", "www.")):
+        return f"mailto:{text}", "email"
+
+    if lower.startswith(("http://", "https://", "ftp://", "//")):
+        return text, "url"
+
+    # www. and fuzzy domains default to http://
+    return f"http://{text}", "url"
+
+
+def _punycode_href(href: str) -> str:
+    # Convert the host portion to punycode (IDNA), keeping the rest intact.
+    lower = href.lower()
+    prefix = ""
+    rest = href
+
+    if lower.startswith("mailto:"):
+        return href
+
+    if lower.startswith("http://"):
+        prefix = href[:7]
+        rest = href[7:]
+    elif lower.startswith("https://"):
+        prefix = href[:8]
+        rest = href[8:]
+    elif lower.startswith("ftp://"):
+        prefix = href[:6]
+        rest = href[6:]
+    elif lower.startswith("//"):
+        prefix = href[:2]
+        rest = href[2:]
+    else:
+        # Shouldn't happen; fuzzy hrefs are normalized before calling.
+        prefix = ""
+        rest = href
+
+    hostport, tail = _split_host_and_rest(rest)
+
+    # Handle userinfo (user:pass@host)
+    userinfo = ""
+    hostport2 = hostport
+    if "@" in hostport:
+        userinfo, hostport2 = hostport.rsplit("@", 1)
+
+    host = hostport2
+    port = ""
+    if hostport2.startswith("["):
+        # IPv6-ish, don't punycode.
+        return href
+    if ":" in hostport2:
+        host, port = hostport2.split(":", 1)
+
+    host_pc = _punycode_host(host)
+    rebuilt = host_pc
+    if port:
+        rebuilt = f"{rebuilt}:{port}"
+    if userinfo:
+        rebuilt = f"{userinfo}@{rebuilt}"
+
+    return f"{prefix}{rebuilt}{tail}"
+
+
+def find_links(text: str) -> list[LinkMatch]:
+    return find_links_with_config(text, LinkifyConfig())
+
+
+def find_links_with_config(text: str, config: LinkifyConfig) -> list[LinkMatch]:
+    if not text:
+        return []
+
+    # Mirror linkify-it behavior: always scan with a leading boundary character.
+    scan_text = "\n" + text
+
+    out: list[LinkMatch] = []
+
+    for m in _CANDIDATE_RE.finditer(scan_text):
+        raw = m.group(2)
+
+        # Compute absolute offsets (exclude the boundary prefix char).
+        start = m.start(2) - 1
+        end = m.end(2) - 1
+
+        stripped, s_trim, e_trim = _strip_wrapping(raw)
+        start += s_trim
+        end -= e_trim
+
+        cand = _trim_trailing(stripped)
+        if not cand:
+            continue
+
+        # Markdown-style termination: `(...URL...)[...]` should stop at the `)`.
+        lower = cand.lower()
+        if lower.startswith(("http://", "https://", "ftp://")) and ")[" in cand:
+            cand = cand.split(")[", 1)[0]
+            cand = _trim_trailing(cand)
+            if not cand:
+                continue
+
+        # Treat leading quotes as wrappers/delimiters, not part of the URL/email.
+        if cand and cand[0] in "\"'" and 0 <= start < len(text) and text[start] == cand[0]:
+            cand = cand[1:]
+            start += 1
+            if not cand:
+                continue
+
+        # Adjust end after trimming.
+        end = start + len(cand)
+
+        lower = cand.lower()
+
+        # If this looks like a fuzzy domain that starts immediately after ://,
+        # treat it as part of a broken/disabled schema (e.g. _http://example.com, hppt://example.com).
+        if not lower.startswith(("http://", "https://", "ftp://", "mailto:", "//", "www.")) and "@" not in cand:
+            if start >= 3 and text[start - 3 : start] == "://":
+                continue
+            if start > 0 and text[start - 1] in "/:@":
+                continue
+
+        # Validate fuzzy IP option.
+        if (
+            cand
+            and cand[0].isdigit()
+            and "." in cand
+            and not lower.startswith(("http://", "https://", "ftp://", "//"))
+        ):
+            host, _ = _split_host_and_rest(cand)
+            if host.replace(".", "").isdigit() and _is_valid_ipv4(host):
+                if not config.fuzzy_ip:
+                    continue
+
+        # Validate // URLs: allow localhost or dotted domains, but not single-level.
+        if lower.startswith("//"):
+            # Protect against matching the // inside :// or ///.
+            if start > 0 and text[start - 1] in ":/":
+                continue
+            after = cand[2:]
+            hostport, _ = _split_host_and_rest(after)
+            if not hostport:
+                continue
+            if hostport.startswith("["):
+                continue
+            host_only = hostport
+            if "@" in host_only:
+                host_only = host_only.rsplit("@", 1)[1]
+            if ":" in host_only:
+                host_only = host_only.split(":", 1)[0]
+            if host_only.lower() != "localhost" and "." not in host_only:
+                continue
+
+            if "_" in host_only:
+                continue
+
+        # Validate fuzzy domains and emails with TLD allowlist.
+        is_scheme = lower.startswith(("http://", "https://", "ftp://", "mailto:"))
+        is_www = lower.startswith("www.")
+        is_proto_rel = lower.startswith("//")
+
+        if not is_scheme and not is_proto_rel and not is_www and "@" not in cand:
+            host, _ = _split_host_and_rest(cand)
+            if "_" in host:
+                continue
+
+            # IPv4 candidates don't use the TLD allowlist.
+            if "." in host and host.replace(".", "").isdigit() and _is_valid_ipv4(host):
+                pass
+            else:
+                parts = _split_domain_for_tld(host)
+                if parts is None:
+                    continue
+                _base, tld = parts
+                if not _is_valid_tld(tld, extra_tlds=config.extra_tlds):
+                    continue
+
+        if (
+            "@" in cand
+            and not lower.startswith(("http://", "https://", "ftp://", "//"))
+            and not lower.startswith("mailto:")
+        ):
+            # Fuzzy email requires a valid TLD.
+            local, domain = cand.rsplit("@", 1)
+            _ = local
+            host, _tail = _split_host_and_rest(domain)
+            if "_" in host:
+                continue
+            parts = _split_domain_for_tld(host)
+            if parts is None:
+                continue
+            _base, tld = parts
+            if not _is_valid_tld(tld, extra_tlds=config.extra_tlds):
+                continue
+
+        # Validate basic URL host/port if scheme-based.
+        if lower.startswith(("http://", "https://", "ftp://")):
+            after = cand.split("://", 1)[1]
+            hostport, _ = _split_host_and_rest(after)
+            if not hostport:
+                continue
+            if "@" in hostport:
+                hostport = hostport.rsplit("@", 1)[1]
+            host = hostport
+            if ":" in hostport and not hostport.startswith("["):
+                host, port = hostport.split(":", 1)
+                if port and (not port.isdigit() or int(port) > 65535):
+                    continue
+            if not host or host.startswith(("-", ".")) or host.endswith(("-", ".")) or ".." in host:
+                continue
+            if "_" in host:
+                continue
+            if "." in host and host.replace(".", "").isdigit() and not _is_valid_ipv4(host):
+                continue
+
+        href, kind = _href_for(cand)
+        href = _punycode_href(href)
+
+        out.append(LinkMatch(start=start, end=end, text=cand, href=href, kind=kind))
+
+    # Avoid overlapping matches by keeping first-longest.
+    if not out:
+        return out
+    out.sort(key=lambda x: (x.start, -(x.end - x.start)))
+    filtered: list[LinkMatch] = []
+    last_end = -1
+    for lm in out:
+        if lm.start < last_end:
+            continue
+        filtered.append(lm)
+        last_end = lm.end
+    return filtered
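As defined above, the scanner returns offset-accurate LinkMatch records and normalizes hrefs (mailto: for bare emails, http:// for www/fuzzy domains, punycoded hosts). A quick usage sketch against this code:

from justhtml.linkify import LinkifyConfig, find_links, find_links_with_config

for lm in find_links("Docs at www.example.com, or mail admin@example.org."):
    print(lm.kind, lm.text, "->", lm.href)
# url www.example.com -> http://www.example.com
# email admin@example.org -> mailto:admin@example.org

# `.dev` is not in DEFAULT_TLDS, so it must be opted into:
cfg = LinkifyConfig.with_extra_tlds({"dev"})
print([lm.text for lm in find_links_with_config("see example.dev", cfg)])
# ['example.dev']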