justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl
- justhtml/__init__.py +28 -0
- justhtml/__main__.py +161 -13
- justhtml/constants.py +17 -1
- justhtml/context.py +7 -1
- justhtml/encoding.py +405 -0
- justhtml/entities.py +57 -17
- justhtml/errors.py +20 -4
- justhtml/linkify.py +438 -0
- justhtml/node.py +738 -41
- justhtml/parser.py +188 -21
- justhtml/py.typed +0 -0
- justhtml/sanitize.py +1141 -0
- justhtml/selector.py +240 -104
- justhtml/serialize.py +418 -57
- justhtml/stream.py +34 -10
- justhtml/tokenizer.py +433 -289
- justhtml/tokens.py +91 -23
- justhtml/transforms.py +690 -0
- justhtml/treebuilder.py +196 -111
- justhtml/treebuilder_modes.py +191 -117
- justhtml/treebuilder_utils.py +11 -4
- justhtml-0.33.0.dist-info/METADATA +196 -0
- justhtml-0.33.0.dist-info/RECORD +26 -0
- justhtml-0.33.0.dist-info/entry_points.txt +2 -0
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.6.0.dist-info/METADATA +0 -126
- justhtml-0.6.0.dist-info/RECORD +0 -20
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/WHEEL +0 -0
justhtml/entities.py
CHANGED

@@ -4,27 +4,33 @@ Implements HTML5 character reference (entity) decoding per WHATWG spec §13.2.5.
 Supports both named entities (&amp;, &nbsp;) and numeric references (&#60;, &#x3C;).
 """
 
+from __future__ import annotations
+
 import html.entities
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
 
 # Use Python's complete HTML5 entity list (2231 entities)
 # Keys include the trailing semicolon (e.g., "amp;", "lang;")
 # We'll strip semicolons when looking up to match both forms
-_HTML5_ENTITIES = html.entities.html5
+_HTML5_ENTITIES: dict[str, str] = html.entities.html5
 
 # Build a normalized lookup without semicolons for easier access
-NAMED_ENTITIES = {}
-for
+NAMED_ENTITIES: dict[str, str] = {}
+for _key, _value in _HTML5_ENTITIES.items():
     # Remove trailing semicolon for lookup
-    if
-        NAMED_ENTITIES[
+    if _key.endswith(";"):
+        NAMED_ENTITIES[_key[:-1]] = _value
     else:
-        NAMED_ENTITIES[
+        NAMED_ENTITIES[_key] = _value
 
 # Legacy named character references that can be used without semicolons
 # Per HTML5 spec, these are primarily ISO-8859-1 (Latin-1) entities from HTML4
 # Modern entities like "prod", "notin" etc. require semicolons
 # Note: Some have both uppercase and lowercase versions (e.g., COPY/copy, GT/gt)
-LEGACY_ENTITIES = {
+LEGACY_ENTITIES: set[str] = {
     "gt",
     "lt",
     "amp",

@@ -134,7 +140,7 @@ LEGACY_ENTITIES = {
 }
 
 # HTML5 numeric character reference replacements (§13.2.5.73)
-NUMERIC_REPLACEMENTS = {
+NUMERIC_REPLACEMENTS: dict[int, str] = {
     0x00: "\ufffd",  # NULL
     0x80: "\u20ac",  # EURO SIGN
     0x82: "\u201a",  # SINGLE LOW-9 QUOTATION MARK

@@ -166,7 +172,23 @@ NUMERIC_REPLACEMENTS = {
 }
 
 
-def decode_numeric_entity(text, is_hex=False):
+def _is_control_character(codepoint: int) -> bool:
+    # C0 controls and C1 controls
+    return (0x00 <= codepoint <= 0x1F) or (0x7F <= codepoint <= 0x9F)
+
+
+def _is_noncharacter(codepoint: int) -> bool:
+    if 0xFDD0 <= codepoint <= 0xFDEF:
+        return True
+    last = codepoint & 0xFFFF
+    return last == 0xFFFE or last == 0xFFFF
+
+
+def decode_numeric_entity(
+    text: str,
+    is_hex: bool = False,
+    report_error: Callable[[str], None] | None = None,
+) -> str:
     """Decode a numeric character reference like &#60; or &#x3C;.
 
     Args:

@@ -179,20 +201,30 @@ def decode_numeric_entity(text, is_hex=False):
     base = 16 if is_hex else 10
     codepoint = int(text, base)
 
-    # Apply HTML5 replacements for certain ranges
-    if codepoint in NUMERIC_REPLACEMENTS:
-        return NUMERIC_REPLACEMENTS[codepoint]
-
     # Invalid ranges per HTML5 spec
     if codepoint > 0x10FFFF:
         return "\ufffd"  # REPLACEMENT CHARACTER
     if 0xD800 <= codepoint <= 0xDFFF:  # Surrogate range
         return "\ufffd"
 
+    if report_error is not None:
+        if _is_control_character(codepoint):
+            report_error("control-character-reference")
+        if _is_noncharacter(codepoint):
+            report_error("noncharacter-character-reference")
+
+    # Apply HTML5 replacements for certain ranges
+    if codepoint in NUMERIC_REPLACEMENTS:
+        return NUMERIC_REPLACEMENTS[codepoint]
+
     return chr(codepoint)
 
 
-def decode_entities_in_text(text, in_attribute=False):
+def decode_entities_in_text(
+    text: str,
+    in_attribute: bool = False,
+    report_error: Callable[[str], None] | None = None,
+) -> str:
     """Decode all HTML entities in text.
 
     This is a simple implementation that handles:

@@ -207,7 +239,7 @@ def decode_entities_in_text(text, in_attribute=False):
     Returns:
         Text with entities decoded
     """
-    result = []
+    result: list[str] = []
     i = 0
     length = len(text)
     while i < length:

@@ -245,7 +277,9 @@ def decode_entities_in_text(text, in_attribute=False):
             digit_text = text[digit_start:j]
 
             if digit_text:
-                result.append(decode_numeric_entity(digit_text, is_hex=is_hex))
+                if report_error is not None and not has_semicolon:
+                    report_error("missing-semicolon-after-character-reference")
+                result.append(decode_numeric_entity(digit_text, is_hex=is_hex, report_error=report_error))
                 i = j + 1 if has_semicolon else j
                 continue
 

@@ -274,7 +308,7 @@ def decode_entities_in_text(text, in_attribute=False):
                 continue
             # If semicolon present but no exact match, allow legacy prefix match in text
             if has_semicolon and not in_attribute:
-                best_match = None
+                best_match: str | None = None
                 best_match_len = 0
                 for k in range(len(entity_name), 0, -1):
                     prefix = entity_name[:k]

@@ -283,6 +317,8 @@ def decode_entities_in_text(text, in_attribute=False):
                         best_match_len = k
                         break
                 if best_match:
+                    if report_error is not None:
+                        report_error("missing-semicolon-after-character-reference")
                     result.append(best_match)
                     i = i + 1 + best_match_len
                     continue

@@ -300,6 +336,8 @@ def decode_entities_in_text(text, in_attribute=False):
                     continue
 
             # Decode legacy entity
+            if report_error is not None and not has_semicolon:
+                report_error("missing-semicolon-after-character-reference")
             result.append(NAMED_ENTITIES[entity_name])
             i = j
             continue

@@ -327,6 +365,8 @@ def decode_entities_in_text(text, in_attribute=False):
                 i += 1
                 continue
 
+            if report_error is not None:
+                report_error("missing-semicolon-after-character-reference")
             result.append(best_match)
             i = i + 1 + best_match_len
             continue
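Note: a minimal sketch of how the new report_error hook behaves, assuming the functions are imported directly from justhtml.entities (names as in the diff above):

    from justhtml.entities import decode_entities_in_text, decode_numeric_entity

    codes: list[str] = []

    # "0" is the digit payload of "&#0;". NUL is mapped to U+FFFD via
    # NUMERIC_REPLACEMENTS, and the callback receives the control-character code.
    char = decode_numeric_entity("0", report_error=codes.append)
    print(repr(char))  # '\ufffd'
    print(codes)       # ['control-character-reference']

    # Whole-text decoding threads the same callback through. A legacy entity
    # without a semicolon should decode and report a missing-semicolon code.
    text = decode_entities_in_text("fish &amp chips", report_error=codes.append)
    print(text)  # fish & chips
    print("missing-semicolon-after-character-reference" in codes)  # True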
justhtml/errors.py
CHANGED

@@ -1,11 +1,14 @@
-"""Centralized error message definitions and helpers for
+"""Centralized error message definitions and helpers for JustHTML errors.
 
-This module provides human-readable error messages for
-emitted by
+This module provides human-readable error messages for parse error codes
+emitted by the tokenizer and tree builder during HTML parsing, plus selected
+security findings emitted by the sanitizer.
 """
 
+from __future__ import annotations
 
-def generate_error_message(code, tag_name=None):
+
+def generate_error_message(code: str, tag_name: str | None = None) -> str:
     """Generate human-readable error message from error code.
 
     Args:

@@ -73,6 +76,8 @@ def generate_error_message(code, tag_name=None):
         "illegal-codepoint-for-numeric-entity": "Invalid codepoint in numeric character reference",
         "missing-semicolon-after-character-reference": "Missing semicolon after character reference",
         "named-entity-without-semicolon": "Named entity used without semicolon",
+        "noncharacter-character-reference": "Noncharacter in character reference",
+        "noncharacter-in-input-stream": "Noncharacter in input stream",
         # ================================================================
         # TREE BUILDER ERRORS
         # ================================================================

@@ -105,8 +110,11 @@ def generate_error_message(code, tag_name=None):
         # Foster parenting / table errors
         "foster-parenting-character": "Text content in table requires foster parenting",
         "foster-parenting-start-tag": "Start tag in table requires foster parenting",
+        "unexpected-character-implies-table-voodoo": "Unexpected character in table triggers foster parenting",
         "unexpected-start-tag-implies-table-voodoo": f"<{tag_name}> start tag in table triggers foster parenting",
         "unexpected-end-tag-implies-table-voodoo": f"</{tag_name}> end tag in table triggers foster parenting",
+        "unexpected-implied-end-tag-in-table-view": "Unexpected implied end tag while closing table",
+        "eof-in-table": "Unexpected end of file in table",
         "unexpected-cell-in-table-body": "Unexpected table cell outside of table row",
         "unexpected-form-in-table": "Form element not allowed in table context",
         "unexpected-hidden-input-in-table": "Hidden input in table triggers foster parenting",

@@ -132,6 +140,14 @@ def generate_error_message(code, tag_name=None):
         "adoption-agency-1.3": "Misnested tags require adoption agency algorithm",
         "non-void-html-element-start-tag-with-trailing-solidus": f"<{tag_name}/> self-closing syntax on non-void element",
         "image-start-tag": f"Deprecated <{tag_name}> tag (use <img> instead)",
+        # Select insertion mode (context-specific taxonomy)
+        "unexpected-start-tag-in-select": f"Unexpected <{tag_name}> start tag in <select>",
+        "unexpected-end-tag-in-select": f"Unexpected </{tag_name}> end tag in <select>",
+        "unexpected-select-in-select": "Unexpected nested <select> in <select>",
+        # ================================================================
+        # SECURITY ERRORS
+        # ================================================================
+        "unsafe-html": "Unsafe HTML detected by sanitization policy",
     }
 
     # Return message or fall back to the code itself if not found
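Note: the taxonomy above is keyed by short codes; a quick sketch of the lookup, assuming generate_error_message is imported from justhtml.errors:

    from justhtml.errors import generate_error_message

    # Fixed-message codes format directly.
    print(generate_error_message("unsafe-html"))
    # Unsafe HTML detected by sanitization policy

    # Tag-aware codes interpolate tag_name through the f-strings in the table.
    print(generate_error_message("unexpected-start-tag-in-select", tag_name="input"))
    # Unexpected <input> start tag in <select>

    # Unknown codes fall back to the code itself.
    print(generate_error_message("not-a-real-code"))
    # not-a-real-code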
justhtml/linkify.py
ADDED

"""Text linkification scanner.

This module finds URL/email-like substrings in plain text.

It is intentionally HTML-agnostic: in JustHTML it is applied to DOM text nodes,
not to raw HTML strings.

The behavior is driven by vendored compliance fixtures from the upstream
`linkify-it` project (MIT licensed). See `tests/linkify-it/README.md`.
"""

from __future__ import annotations

import re
from dataclasses import dataclass
from typing import Final


@dataclass(frozen=True, slots=True)
class LinkMatch:
    start: int
    end: int
    text: str
    href: str
    kind: str  # "url" | "email"


DEFAULT_TLDS: Final[frozenset[str]] = frozenset(
    {
        # Keep this aligned with linkify-it's default list.
        # See: https://github.com/markdown-it/linkify-it/blob/master/index.mjs
        "biz",
        "com",
        "edu",
        "gov",
        "net",
        "org",
        "pro",
        "web",
        "xxx",
        "aero",
        "asia",
        "coop",
        "info",
        "museum",
        "name",
        "shop",
        "рф",
    }
)


# A pragmatic Unicode-aware domain label pattern.
#
# Use `\w` for Unicode letters/digits (and underscore), and reject underscores
# during validation. This is intentionally stricter than allowing all non-ASCII
# codepoints, and matches the fixture behavior around delimiter punctuation.
_LABEL_RE: Final[str] = (
    r"[0-9A-Za-z\w\u2600-\u27bf]"
    r"(?:[0-9A-Za-z\w\u2600-\u27bf-]{0,61}[0-9A-Za-z\w\u2600-\u27bf])?"
)

# A fast-ish candidate matcher. We do real validation after we find a candidate.
_CANDIDATE_PATTERN: Final[str] = "".join(
    [
        r"(?i)([^0-9A-Za-z_])",  # left boundary (avoid matching after underscore)
        r"(",  # candidate group
        r"(?:https?|ftp)://[^\s<>\uFF5C]+",  # absolute URL
        r"|mailto:[^\s<>\uFF5C]+",  # mailto
        r"|//[^\s<>\uFF5C]+",  # protocol-relative
        r"|(?:www\.)[^\s<>\uFF5C]+",  # www.
        rf"|[0-9A-Za-z.!#$%&'*+/=?^_`{{|}}~\-\"]+@(?:{_LABEL_RE}\.)+{_LABEL_RE}",  # email
        r"|(?:\d{1,3}\.){3}\d{1,3}(?:/[^\s<>\uFF5C]*)?",  # IPv4
        rf"|(?:{_LABEL_RE}\.)+{_LABEL_RE}(?:/[^\s<>\uFF5C]*)?",  # fuzzy domain/path
        r")",
    ]
)

_CANDIDATE_RE: Final[re.Pattern[str]] = re.compile(_CANDIDATE_PATTERN, re.UNICODE)

_TRAILING_PUNCT: Final[str] = ".,;:!?"

# RE pattern for 2-character TLDs, copied from linkify-it (MIT licensed).
_CC_TLD_RE: Final[re.Pattern[str]] = re.compile(
    r"^(?:a[cdefgilmnoqrstuwxz]|b[abdefghijmnorstvwyz]|c[acdfghiklmnoruvwxyz]|d[ejkmoz]|e[cegrstu]|f[ijkmor]|g[abdefghilmnpqrstuwy]|h[kmnrtu]|i[delmnoqrst]|j[emop]|k[eghimnprwyz]|l[abcikrstuvy]|m[acdeghklmnopqrstuvwxyz]|n[acefgilopruz]|om|p[aefghklmnrstwy]|qa|r[eosuw]|s[abcdeghijklmnortuvxyz]|t[cdfghjklmnortvwz]|u[agksyz]|v[aceginu]|w[fs]|y[et]|z[amw])$",
    re.IGNORECASE,
)


def _is_valid_tld(tld: str, *, extra_tlds: frozenset[str]) -> bool:
    t = (tld or "").lower()
    if not t:
        return False
    # Only valid 2-letter ccTLDs (avoid false positives like `.js`).
    if len(t) == 2 and _CC_TLD_RE.match(t) is not None:
        return True
    # Any punycode root.
    if t.startswith("xn--"):
        return True
    return t in DEFAULT_TLDS or t in extra_tlds


def _split_domain_for_tld(host: str) -> tuple[str, str] | None:
    # Return (domain_without_tld, tld).
    h = (host or "").strip().strip(".")
    if not h:
        return None
    if h.lower() == "localhost":
        return ("localhost", "")
    if "." not in h:
        return None
    base, tld = h.rsplit(".", 1)
    return (base, tld)


@dataclass(frozen=True, slots=True)
class LinkifyConfig:
    fuzzy_ip: bool = False
    extra_tlds: frozenset[str] = frozenset()

    @staticmethod
    def with_extra_tlds(extra_tlds: list[str] | tuple[str, ...] | set[str] | frozenset[str]) -> LinkifyConfig:
        return LinkifyConfig(extra_tlds=frozenset(str(t).lower() for t in extra_tlds))


def _is_valid_ipv4(host: str) -> bool:
    parts = host.split(".")
    if len(parts) != 4:
        return False
    for p in parts:
        if not p or len(p) > 3:
            return False
        if not p.isdigit():
            return False
        v = int(p)
        if v < 0 or v > 255:
            return False
    return True


def _punycode_host(host: str) -> str:
    # Safety default: normalize Unicode domains to punycode for href.
    try:
        return host.encode("idna").decode("ascii")
    except UnicodeError:
        return host


def _split_host_and_rest(raw: str) -> tuple[str, str]:
    # raw is after an optional scheme prefix (or for fuzzy domains, the whole).
    # Extract host[:port] and the rest (path/query/fragment).
    for i, ch in enumerate(raw):
        if ch in "/?#":
            return raw[:i], raw[i:]
    return raw, ""


def _strip_wrapping(raw: str) -> tuple[str, int, int]:
    # Trim common wrappers like <...> or quotes, but report how many chars were removed
    # from start/end so we can compute accurate offsets.
    start_trim = 0
    end_trim = 0

    if raw and raw[0] in "<\"'([{" and raw[-1] in ">\"')]}":
        # Angle brackets are common for autolinks.
        # Quotes/brackets: we strip them only if they wrap the candidate.
        raw = raw[1:-1]
        start_trim = 1
        end_trim = 1

    return raw, start_trim, end_trim


def _trim_trailing(candidate: str) -> str:
    # Remove trailing punctuation and unbalanced closing brackets.
    if not candidate:
        return candidate

    # First strip sentence punctuation.
    while candidate and candidate[-1] in _TRAILING_PUNCT:
        candidate = candidate[:-1]

    # Then strip quoting terminators when unbalanced (treat quotes as wrappers).
    while candidate and candidate[-1] in "\"'":
        q = candidate[-1]
        if candidate.count(q) % 2 == 1:
            candidate = candidate[:-1]
            continue
        break

    # Then strip unmatched closing brackets.
    # We treat ) ] } > as potentially closable.
    pairs = {")": "(", "]": "[", "}": "{", ">": "<"}
    while candidate and candidate[-1] in pairs:
        close = candidate[-1]
        open_ch = pairs[close]
        if candidate.count(close) > candidate.count(open_ch):
            candidate = candidate[:-1]
            continue
        break

    return candidate


def _href_for(text: str) -> tuple[str, str]:
    lower = text.lower()

    if lower.startswith("mailto:"):
        return text, "email"

    if "@" in text and not lower.startswith(("http://", "https://", "ftp://", "//", "www.")):
        return f"mailto:{text}", "email"

    if lower.startswith(("http://", "https://", "ftp://", "//")):
        return text, "url"

    # www. and fuzzy domains default to http://
    return f"http://{text}", "url"


def _punycode_href(href: str) -> str:
    # Convert the host portion to punycode (IDNA), keeping the rest intact.
    lower = href.lower()
    prefix = ""
    rest = href

    if lower.startswith("mailto:"):
        return href

    if lower.startswith("http://"):
        prefix = href[:7]
        rest = href[7:]
    elif lower.startswith("https://"):
        prefix = href[:8]
        rest = href[8:]
    elif lower.startswith("ftp://"):
        prefix = href[:6]
        rest = href[6:]
    elif lower.startswith("//"):
        prefix = href[:2]
        rest = href[2:]
    else:
        # Shouldn't happen; fuzzy hrefs are normalized before calling.
        prefix = ""
        rest = href

    hostport, tail = _split_host_and_rest(rest)

    # Handle userinfo (user:pass@host)
    userinfo = ""
    hostport2 = hostport
    if "@" in hostport:
        userinfo, hostport2 = hostport.rsplit("@", 1)

    host = hostport2
    port = ""
    if hostport2.startswith("["):
        # IPv6-ish, don't punycode.
        return href
    if ":" in hostport2:
        host, port = hostport2.split(":", 1)

    host_pc = _punycode_host(host)
    rebuilt = host_pc
    if port:
        rebuilt = f"{rebuilt}:{port}"
    if userinfo:
        rebuilt = f"{userinfo}@{rebuilt}"

    return f"{prefix}{rebuilt}{tail}"


def find_links(text: str) -> list[LinkMatch]:
    return find_links_with_config(text, LinkifyConfig())


def find_links_with_config(text: str, config: LinkifyConfig) -> list[LinkMatch]:
    if not text:
        return []

    # Mirror linkify-it behavior: always scan with a leading boundary character.
    scan_text = "\n" + text

    out: list[LinkMatch] = []

    for m in _CANDIDATE_RE.finditer(scan_text):
        raw = m.group(2)

        # Compute absolute offsets (exclude the boundary prefix char).
        start = m.start(2) - 1
        end = m.end(2) - 1

        stripped, s_trim, e_trim = _strip_wrapping(raw)
        start += s_trim
        end -= e_trim

        cand = _trim_trailing(stripped)
        if not cand:
            continue

        # Markdown-style termination: `(...URL...)[...]` should stop at the `)`.
        lower = cand.lower()
        if lower.startswith(("http://", "https://", "ftp://")) and ")[" in cand:
            cand = cand.split(")[", 1)[0]
            cand = _trim_trailing(cand)
            if not cand:
                continue

        # Treat leading quotes as wrappers/delimiters, not part of the URL/email.
        if cand and cand[0] in "\"'" and 0 <= start < len(text) and text[start] == cand[0]:
            cand = cand[1:]
            start += 1
            if not cand:
                continue

        # Adjust end after trimming.
        end = start + len(cand)

        lower = cand.lower()

        # If this looks like a fuzzy domain that starts immediately after ://,
        # treat it as part of a broken/disabled schema (e.g. _http://example.com, hppt://example.com).
        if not lower.startswith(("http://", "https://", "ftp://", "mailto:", "//", "www.")) and "@" not in cand:
            if start >= 3 and text[start - 3 : start] == "://":
                continue
            if start > 0 and text[start - 1] in "/:@":
                continue

        # Validate fuzzy IP option.
        if (
            cand
            and cand[0].isdigit()
            and "." in cand
            and not lower.startswith(("http://", "https://", "ftp://", "//"))
        ):
            host, _ = _split_host_and_rest(cand)
            if host.replace(".", "").isdigit() and _is_valid_ipv4(host):
                if not config.fuzzy_ip:
                    continue

        # Validate // URLs: allow localhost or dotted domains, but not single-level.
        if lower.startswith("//"):
            # Protect against matching the // inside :// or ///.
            if start > 0 and text[start - 1] in ":/":
                continue
            after = cand[2:]
            hostport, _ = _split_host_and_rest(after)
            if not hostport:
                continue
            if hostport.startswith("["):
                continue
            host_only = hostport
            if "@" in host_only:
                host_only = host_only.rsplit("@", 1)[1]
            if ":" in host_only:
                host_only = host_only.split(":", 1)[0]
            if host_only.lower() != "localhost" and "." not in host_only:
                continue

            if "_" in host_only:
                continue

        # Validate fuzzy domains and emails with TLD allowlist.
        is_scheme = lower.startswith(("http://", "https://", "ftp://", "mailto:"))
        is_www = lower.startswith("www.")
        is_proto_rel = lower.startswith("//")

        if not is_scheme and not is_proto_rel and not is_www and "@" not in cand:
            host, _ = _split_host_and_rest(cand)
            if "_" in host:
                continue

            # IPv4 candidates don't use the TLD allowlist.
            if "." in host and host.replace(".", "").isdigit() and _is_valid_ipv4(host):
                pass
            else:
                parts = _split_domain_for_tld(host)
                if parts is None:
                    continue
                _base, tld = parts
                if not _is_valid_tld(tld, extra_tlds=config.extra_tlds):
                    continue

        if (
            "@" in cand
            and not lower.startswith(("http://", "https://", "ftp://", "//"))
            and not lower.startswith("mailto:")
        ):
            # Fuzzy email requires a valid TLD.
            local, domain = cand.rsplit("@", 1)
            _ = local
            host, _tail = _split_host_and_rest(domain)
            if "_" in host:
                continue
            parts = _split_domain_for_tld(host)
            if parts is None:
                continue
            _base, tld = parts
            if not _is_valid_tld(tld, extra_tlds=config.extra_tlds):
                continue

        # Validate basic URL host/port if scheme-based.
        if lower.startswith(("http://", "https://", "ftp://")):
            after = cand.split("://", 1)[1]
            hostport, _ = _split_host_and_rest(after)
            if not hostport:
                continue
            if "@" in hostport:
                hostport = hostport.rsplit("@", 1)[1]
            host = hostport
            if ":" in hostport and not hostport.startswith("["):
                host, port = hostport.split(":", 1)
                if port and (not port.isdigit() or int(port) > 65535):
                    continue
            if not host or host.startswith(("-", ".")) or host.endswith(("-", ".")) or ".." in host:
                continue
            if "_" in host:
                continue
            if "." in host and host.replace(".", "").isdigit() and not _is_valid_ipv4(host):
                continue

        href, kind = _href_for(cand)
        href = _punycode_href(href)

        out.append(LinkMatch(start=start, end=end, text=cand, href=href, kind=kind))

    # Avoid overlapping matches by keeping first-longest.
    if not out:
        return out
    out.sort(key=lambda x: (x.start, -(x.end - x.start)))
    filtered: list[LinkMatch] = []
    last_end = -1
    for lm in out:
        if lm.start < last_end:
            continue
        filtered.append(lm)
        last_end = lm.end
    return filtered
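Note: a short usage sketch for the scanner above, assuming the module is importable as justhtml.linkify:

    from justhtml.linkify import LinkifyConfig, find_links, find_links_with_config

    text = "Docs: www.example.com/start, questions to admin@example.com."
    for lm in find_links(text):
        # start/end index into the original text; trailing punctuation is trimmed.
        print(lm.kind, text[lm.start : lm.end], "->", lm.href)
    # url www.example.com/start -> http://www.example.com/start
    # email admin@example.com -> mailto:admin@example.com

    # Roots outside the vendored TLD allowlist are skipped unless added explicitly.
    cfg = LinkifyConfig.with_extra_tlds({"dev"})
    print(len(find_links("see example.dev")))                   # 0
    print(len(find_links_with_config("see example.dev", cfg)))  # 1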