justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl

justhtml/entities.py CHANGED
@@ -4,27 +4,33 @@ Implements HTML5 character reference (entity) decoding per WHATWG spec §13.2.5.
 Supports both named entities (&amp;, &nbsp;) and numeric references (&#60;, &#x3C;).
 """

+from __future__ import annotations
+
 import html.entities
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Callable

 # Use Python's complete HTML5 entity list (2231 entities)
 # Keys include the trailing semicolon (e.g., "amp;", "lang;")
 # We'll strip semicolons when looking up to match both forms
-_HTML5_ENTITIES = html.entities.html5
+_HTML5_ENTITIES: dict[str, str] = html.entities.html5

 # Build a normalized lookup without semicolons for easier access
-NAMED_ENTITIES = {}
-for key, value in _HTML5_ENTITIES.items():
+NAMED_ENTITIES: dict[str, str] = {}
+for _key, _value in _HTML5_ENTITIES.items():
     # Remove trailing semicolon for lookup
-    if key.endswith(";"):
-        NAMED_ENTITIES[key[:-1]] = value
+    if _key.endswith(";"):
+        NAMED_ENTITIES[_key[:-1]] = _value
     else:
-        NAMED_ENTITIES[key] = value
+        NAMED_ENTITIES[_key] = _value

 # Legacy named character references that can be used without semicolons
 # Per HTML5 spec, these are primarily ISO-8859-1 (Latin-1) entities from HTML4
 # Modern entities like "prod", "notin" etc. require semicolons
 # Note: Some have both uppercase and lowercase versions (e.g., COPY/copy, GT/gt)
-LEGACY_ENTITIES = {
+LEGACY_ENTITIES: set[str] = {
     "gt",
     "lt",
     "amp",
@@ -134,7 +140,7 @@ LEGACY_ENTITIES = {
 }

 # HTML5 numeric character reference replacements (§13.2.5.73)
-NUMERIC_REPLACEMENTS = {
+NUMERIC_REPLACEMENTS: dict[int, str] = {
     0x00: "\ufffd",  # NULL
     0x80: "\u20ac",  # EURO SIGN
     0x82: "\u201a",  # SINGLE LOW-9 QUOTATION MARK
@@ -166,7 +172,23 @@ NUMERIC_REPLACEMENTS = {
 }


-def decode_numeric_entity(text, is_hex=False):
+def _is_control_character(codepoint: int) -> bool:
+    # C0 controls and C1 controls
+    return (0x00 <= codepoint <= 0x1F) or (0x7F <= codepoint <= 0x9F)
+
+
+def _is_noncharacter(codepoint: int) -> bool:
+    if 0xFDD0 <= codepoint <= 0xFDEF:
+        return True
+    last = codepoint & 0xFFFF
+    return last == 0xFFFE or last == 0xFFFF
+
+
+def decode_numeric_entity(
+    text: str,
+    is_hex: bool = False,
+    report_error: Callable[[str], None] | None = None,
+) -> str:
     """Decode a numeric character reference like &#60; or &#x3C;.

     Args:
@@ -179,20 +201,30 @@ def decode_numeric_entity(text, is_hex=False):
     base = 16 if is_hex else 10
     codepoint = int(text, base)

-    # Apply HTML5 replacements for certain ranges
-    if codepoint in NUMERIC_REPLACEMENTS:
-        return NUMERIC_REPLACEMENTS[codepoint]
-
     # Invalid ranges per HTML5 spec
     if codepoint > 0x10FFFF:
         return "\ufffd"  # REPLACEMENT CHARACTER
     if 0xD800 <= codepoint <= 0xDFFF:  # Surrogate range
         return "\ufffd"

+    if report_error is not None:
+        if _is_control_character(codepoint):
+            report_error("control-character-reference")
+        if _is_noncharacter(codepoint):
+            report_error("noncharacter-character-reference")
+
+    # Apply HTML5 replacements for certain ranges
+    if codepoint in NUMERIC_REPLACEMENTS:
+        return NUMERIC_REPLACEMENTS[codepoint]
+
     return chr(codepoint)


-def decode_entities_in_text(text, in_attribute=False):
+def decode_entities_in_text(
+    text: str,
+    in_attribute: bool = False,
+    report_error: Callable[[str], None] | None = None,
+) -> str:
     """Decode all HTML entities in text.

     This is a simple implementation that handles:
@@ -207,7 +239,7 @@ def decode_entities_in_text(text, in_attribute=False):
     Returns:
         Text with entities decoded
     """
-    result = []
+    result: list[str] = []
     i = 0
     length = len(text)
     while i < length:
@@ -245,7 +277,9 @@ def decode_entities_in_text(text, in_attribute=False):
             digit_text = text[digit_start:j]

             if digit_text:
-                result.append(decode_numeric_entity(digit_text, is_hex=is_hex))
+                if report_error is not None and not has_semicolon:
+                    report_error("missing-semicolon-after-character-reference")
+                result.append(decode_numeric_entity(digit_text, is_hex=is_hex, report_error=report_error))
             i = j + 1 if has_semicolon else j
             continue

@@ -274,7 +308,7 @@ def decode_entities_in_text(text, in_attribute=False):
                 continue
             # If semicolon present but no exact match, allow legacy prefix match in text
             if has_semicolon and not in_attribute:
-                best_match = None
+                best_match: str | None = None
                 best_match_len = 0
                 for k in range(len(entity_name), 0, -1):
                     prefix = entity_name[:k]
@@ -283,6 +317,8 @@ def decode_entities_in_text(text, in_attribute=False):
                         best_match_len = k
                         break
                 if best_match:
+                    if report_error is not None:
+                        report_error("missing-semicolon-after-character-reference")
                     result.append(best_match)
                     i = i + 1 + best_match_len
                     continue
@@ -300,6 +336,8 @@ def decode_entities_in_text(text, in_attribute=False):
                 continue

             # Decode legacy entity
+            if report_error is not None and not has_semicolon:
+                report_error("missing-semicolon-after-character-reference")
             result.append(NAMED_ENTITIES[entity_name])
             i = j
             continue
@@ -327,6 +365,8 @@ def decode_entities_in_text(text, in_attribute=False):
                 i += 1
                 continue

+            if report_error is not None:
+                report_error("missing-semicolon-after-character-reference")
             result.append(best_match)
             i = i + 1 + best_match_len
             continue
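
The hunks above thread an optional `report_error` callback through entity decoding. Below is a minimal usage sketch, not part of the package's documented API: it assumes both functions are importable from `justhtml.entities` as the file path suggests, and the expected output follows the branches added in this diff.

```python
from justhtml.entities import decode_entities_in_text, decode_numeric_entity

errors: list[str] = []

# "amp" is in LEGACY_ENTITIES, so it decodes even without the trailing
# semicolon, but the new callback should now be told about it.
decoded = decode_entities_in_text("fish &amp chips", report_error=errors.append)
print(decoded)  # expected: fish & chips
print(errors)   # expected: ['missing-semicolon-after-character-reference']

# 0x81 falls in the C1 control range and is not in NUMERIC_REPLACEMENTS,
# so decoding should report the error and still return the codepoint.
errors.clear()
decode_numeric_entity("81", is_hex=True, report_error=errors.append)
print(errors)   # expected: ['control-character-reference']
```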
justhtml/errors.py CHANGED
@@ -1,11 +1,14 @@
-"""Centralized error message definitions and helpers for HTML parsing errors.
+"""Centralized error message definitions and helpers for JustHTML errors.

-This module provides human-readable error messages for all parse error codes
-emitted by both the tokenizer and tree builder during HTML parsing.
+This module provides human-readable error messages for parse error codes
+emitted by the tokenizer and tree builder during HTML parsing, plus selected
+security findings emitted by the sanitizer.
 """

+from __future__ import annotations

-def generate_error_message(code, tag_name=None):
+
+def generate_error_message(code: str, tag_name: str | None = None) -> str:
     """Generate human-readable error message from error code.

     Args:
@@ -73,6 +76,8 @@ def generate_error_message(code, tag_name=None):
         "illegal-codepoint-for-numeric-entity": "Invalid codepoint in numeric character reference",
         "missing-semicolon-after-character-reference": "Missing semicolon after character reference",
         "named-entity-without-semicolon": "Named entity used without semicolon",
+        "noncharacter-character-reference": "Noncharacter in character reference",
+        "noncharacter-in-input-stream": "Noncharacter in input stream",
         # ================================================================
         # TREE BUILDER ERRORS
         # ================================================================
@@ -105,8 +110,11 @@ def generate_error_message(code, tag_name=None):
         # Foster parenting / table errors
         "foster-parenting-character": "Text content in table requires foster parenting",
         "foster-parenting-start-tag": "Start tag in table requires foster parenting",
+        "unexpected-character-implies-table-voodoo": "Unexpected character in table triggers foster parenting",
         "unexpected-start-tag-implies-table-voodoo": f"<{tag_name}> start tag in table triggers foster parenting",
         "unexpected-end-tag-implies-table-voodoo": f"</{tag_name}> end tag in table triggers foster parenting",
+        "unexpected-implied-end-tag-in-table-view": "Unexpected implied end tag while closing table",
+        "eof-in-table": "Unexpected end of file in table",
         "unexpected-cell-in-table-body": "Unexpected table cell outside of table row",
         "unexpected-form-in-table": "Form element not allowed in table context",
         "unexpected-hidden-input-in-table": "Hidden input in table triggers foster parenting",
@@ -132,6 +140,14 @@ def generate_error_message(code, tag_name=None):
         "adoption-agency-1.3": "Misnested tags require adoption agency algorithm",
         "non-void-html-element-start-tag-with-trailing-solidus": f"<{tag_name}/> self-closing syntax on non-void element",
         "image-start-tag": f"Deprecated <{tag_name}> tag (use <img> instead)",
+        # Select insertion mode (context-specific taxonomy)
+        "unexpected-start-tag-in-select": f"Unexpected <{tag_name}> start tag in <select>",
+        "unexpected-end-tag-in-select": f"Unexpected </{tag_name}> end tag in <select>",
+        "unexpected-select-in-select": "Unexpected nested <select> in <select>",
+        # ================================================================
+        # SECURITY ERRORS
+        # ================================================================
+        "unsafe-html": "Unsafe HTML detected by sanitization policy",
     }

     # Return message or fall back to the code itself if not found
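
A quick sketch of how the new codes resolve through `generate_error_message`, assuming the import path matches the file path; the fallback case follows the comment above.

```python
from justhtml.errors import generate_error_message

# Plain lookup of a newly added tokenizer-level code.
print(generate_error_message("noncharacter-character-reference"))
# Noncharacter in character reference

# Codes whose messages interpolate a tag name take it as the second argument.
print(generate_error_message("unexpected-start-tag-in-select", tag_name="div"))
# Unexpected <div> start tag in <select>

# Unknown codes fall back to the code itself.
print(generate_error_message("some-hypothetical-code"))
# some-hypothetical-code
```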
justhtml/linkify.py ADDED
@@ -0,0 +1,438 @@
+"""Text linkification scanner.
+
+This module finds URL/email-like substrings in plain text.
+
+It is intentionally HTML-agnostic: in JustHTML it is applied to DOM text nodes,
+not to raw HTML strings.
+
+The behavior is driven by vendored compliance fixtures from the upstream
+`linkify-it` project (MIT licensed). See `tests/linkify-it/README.md`.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import Final
+
+
+@dataclass(frozen=True, slots=True)
+class LinkMatch:
+    start: int
+    end: int
+    text: str
+    href: str
+    kind: str  # "url" | "email"
+
+
+DEFAULT_TLDS: Final[frozenset[str]] = frozenset(
+    {
+        # Keep this aligned with linkify-it's default list.
+        # See: https://github.com/markdown-it/linkify-it/blob/master/index.mjs
+        "biz",
+        "com",
+        "edu",
+        "gov",
+        "net",
+        "org",
+        "pro",
+        "web",
+        "xxx",
+        "aero",
+        "asia",
+        "coop",
+        "info",
+        "museum",
+        "name",
+        "shop",
+        "рф",
+    }
+)
+
+
+# A pragmatic Unicode-aware domain label pattern.
+#
+# Use `\w` for Unicode letters/digits (and underscore), and reject underscores
+# during validation. This is intentionally stricter than allowing all non-ASCII
+# codepoints, and matches the fixture behavior around delimiter punctuation.
+_LABEL_RE: Final[str] = (
+    r"[0-9A-Za-z\w\u2600-\u27bf]"
+    r"(?:[0-9A-Za-z\w\u2600-\u27bf-]{0,61}[0-9A-Za-z\w\u2600-\u27bf])?"
+)
+
+# A fast-ish candidate matcher. We do real validation after we find a candidate.
+_CANDIDATE_PATTERN: Final[str] = "".join(
+    [
+        r"(?i)([^0-9A-Za-z_])",  # left boundary (avoid matching after underscore)
+        r"(",  # candidate group
+        r"(?:https?|ftp)://[^\s<>\uFF5C]+",  # absolute URL
+        r"|mailto:[^\s<>\uFF5C]+",  # mailto
+        r"|//[^\s<>\uFF5C]+",  # protocol-relative
+        r"|(?:www\.)[^\s<>\uFF5C]+",  # www.
+        rf"|[0-9A-Za-z.!#$%&'*+/=?^_`{{|}}~\-\"]+@(?:{_LABEL_RE}\.)+{_LABEL_RE}",  # email
+        r"|(?:\d{1,3}\.){3}\d{1,3}(?:/[^\s<>\uFF5C]*)?",  # IPv4
+        rf"|(?:{_LABEL_RE}\.)+{_LABEL_RE}(?:/[^\s<>\uFF5C]*)?",  # fuzzy domain/path
+        r")",
+    ]
+)
+
+_CANDIDATE_RE: Final[re.Pattern[str]] = re.compile(_CANDIDATE_PATTERN, re.UNICODE)
+
+_TRAILING_PUNCT: Final[str] = ".,;:!?"
+
+# RE pattern for 2-character TLDs, copied from linkify-it (MIT licensed).
+_CC_TLD_RE: Final[re.Pattern[str]] = re.compile(
+    r"^(?:a[cdefgilmnoqrstuwxz]|b[abdefghijmnorstvwyz]|c[acdfghiklmnoruvwxyz]|d[ejkmoz]|e[cegrstu]|f[ijkmor]|g[abdefghilmnpqrstuwy]|h[kmnrtu]|i[delmnoqrst]|j[emop]|k[eghimnprwyz]|l[abcikrstuvy]|m[acdeghklmnopqrstuvwxyz]|n[acefgilopruz]|om|p[aefghklmnrstwy]|qa|r[eosuw]|s[abcdeghijklmnortuvxyz]|t[cdfghjklmnortvwz]|u[agksyz]|v[aceginu]|w[fs]|y[et]|z[amw])$",
+    re.IGNORECASE,
+)
+
+
+def _is_valid_tld(tld: str, *, extra_tlds: frozenset[str]) -> bool:
+    t = (tld or "").lower()
+    if not t:
+        return False
+    # Only valid 2-letter ccTLDs (avoid false positives like `.js`).
+    if len(t) == 2 and _CC_TLD_RE.match(t) is not None:
+        return True
+    # Any punycode root.
+    if t.startswith("xn--"):
+        return True
+    return t in DEFAULT_TLDS or t in extra_tlds
+
+
+def _split_domain_for_tld(host: str) -> tuple[str, str] | None:
+    # Return (domain_without_tld, tld).
+    h = (host or "").strip().strip(".")
+    if not h:
+        return None
+    if h.lower() == "localhost":
+        return ("localhost", "")
+    if "." not in h:
+        return None
+    base, tld = h.rsplit(".", 1)
+    return (base, tld)
+
+
+@dataclass(frozen=True, slots=True)
+class LinkifyConfig:
+    fuzzy_ip: bool = False
+    extra_tlds: frozenset[str] = frozenset()
+
+    @staticmethod
+    def with_extra_tlds(extra_tlds: list[str] | tuple[str, ...] | set[str] | frozenset[str]) -> LinkifyConfig:
+        return LinkifyConfig(extra_tlds=frozenset(str(t).lower() for t in extra_tlds))
+
+
+def _is_valid_ipv4(host: str) -> bool:
+    parts = host.split(".")
+    if len(parts) != 4:
+        return False
+    for p in parts:
+        if not p or len(p) > 3:
+            return False
+        if not p.isdigit():
+            return False
+        v = int(p)
+        if v < 0 or v > 255:
+            return False
+    return True
+
+
+def _punycode_host(host: str) -> str:
+    # Safety default: normalize Unicode domains to punycode for href.
+    try:
+        return host.encode("idna").decode("ascii")
+    except UnicodeError:
+        return host
+
+
+def _split_host_and_rest(raw: str) -> tuple[str, str]:
+    # raw is after an optional scheme prefix (or for fuzzy domains, the whole).
+    # Extract host[:port] and the rest (path/query/fragment).
+    for i, ch in enumerate(raw):
+        if ch in "/?#":
+            return raw[:i], raw[i:]
+    return raw, ""
+
+
+def _strip_wrapping(raw: str) -> tuple[str, int, int]:
+    # Trim common wrappers like <...> or quotes, but report how many chars were removed
+    # from start/end so we can compute accurate offsets.
+    start_trim = 0
+    end_trim = 0
+
+    if raw and raw[0] in "<\"'([{" and raw[-1] in ">\"')]}":
+        # Angle brackets are common for autolinks.
+        # Quotes/brackets: we strip them only if they wrap the candidate.
+        raw = raw[1:-1]
+        start_trim = 1
+        end_trim = 1
+
+    return raw, start_trim, end_trim
+
+
+def _trim_trailing(candidate: str) -> str:
+    # Remove trailing punctuation and unbalanced closing brackets.
+    if not candidate:
+        return candidate
+
+    # First strip sentence punctuation.
+    while candidate and candidate[-1] in _TRAILING_PUNCT:
+        candidate = candidate[:-1]
+
+    # Then strip quoting terminators when unbalanced (treat quotes as wrappers).
+    while candidate and candidate[-1] in "\"'":
+        q = candidate[-1]
+        if candidate.count(q) % 2 == 1:
+            candidate = candidate[:-1]
+            continue
+        break
+
+    # Then strip unmatched closing brackets.
+    # We treat ) ] } > as potentially closable.
+    pairs = {")": "(", "]": "[", "}": "{", ">": "<"}
+    while candidate and candidate[-1] in pairs:
+        close = candidate[-1]
+        open_ch = pairs[close]
+        if candidate.count(close) > candidate.count(open_ch):
+            candidate = candidate[:-1]
+            continue
+        break
+
+    return candidate
+
+
+def _href_for(text: str) -> tuple[str, str]:
+    lower = text.lower()
+
+    if lower.startswith("mailto:"):
+        return text, "email"
+
+    if "@" in text and not lower.startswith(("http://", "https://", "ftp://", "//", "www.")):
+        return f"mailto:{text}", "email"
+
+    if lower.startswith(("http://", "https://", "ftp://", "//")):
+        return text, "url"
+
+    # www. and fuzzy domains default to http://
+    return f"http://{text}", "url"
+
+
+def _punycode_href(href: str) -> str:
+    # Convert the host portion to punycode (IDNA), keeping the rest intact.
+    lower = href.lower()
+    prefix = ""
+    rest = href
+
+    if lower.startswith("mailto:"):
+        return href
+
+    if lower.startswith("http://"):
+        prefix = href[:7]
+        rest = href[7:]
+    elif lower.startswith("https://"):
+        prefix = href[:8]
+        rest = href[8:]
+    elif lower.startswith("ftp://"):
+        prefix = href[:6]
+        rest = href[6:]
+    elif lower.startswith("//"):
+        prefix = href[:2]
+        rest = href[2:]
+    else:
+        # Shouldn't happen; fuzzy hrefs are normalized before calling.
+        prefix = ""
+        rest = href
+
+    hostport, tail = _split_host_and_rest(rest)
+
+    # Handle userinfo (user:pass@host)
+    userinfo = ""
+    hostport2 = hostport
+    if "@" in hostport:
+        userinfo, hostport2 = hostport.rsplit("@", 1)
+
+    host = hostport2
+    port = ""
+    if hostport2.startswith("["):
+        # IPv6-ish, don't punycode.
+        return href
+    if ":" in hostport2:
+        host, port = hostport2.split(":", 1)
+
+    host_pc = _punycode_host(host)
+    rebuilt = host_pc
+    if port:
+        rebuilt = f"{rebuilt}:{port}"
+    if userinfo:
+        rebuilt = f"{userinfo}@{rebuilt}"
+
+    return f"{prefix}{rebuilt}{tail}"
+
+
+def find_links(text: str) -> list[LinkMatch]:
+    return find_links_with_config(text, LinkifyConfig())
+
+
+def find_links_with_config(text: str, config: LinkifyConfig) -> list[LinkMatch]:
+    if not text:
+        return []
+
+    # Mirror linkify-it behavior: always scan with a leading boundary character.
+    scan_text = "\n" + text
+
+    out: list[LinkMatch] = []
+
+    for m in _CANDIDATE_RE.finditer(scan_text):
+        raw = m.group(2)
+
+        # Compute absolute offsets (exclude the boundary prefix char).
+        start = m.start(2) - 1
+        end = m.end(2) - 1
+
+        stripped, s_trim, e_trim = _strip_wrapping(raw)
+        start += s_trim
+        end -= e_trim
+
+        cand = _trim_trailing(stripped)
+        if not cand:
+            continue
+
+        # Markdown-style termination: `(...URL...)[...]` should stop at the `)`.
+        lower = cand.lower()
+        if lower.startswith(("http://", "https://", "ftp://")) and ")[" in cand:
+            cand = cand.split(")[", 1)[0]
+            cand = _trim_trailing(cand)
+            if not cand:
+                continue
+
+        # Treat leading quotes as wrappers/delimiters, not part of the URL/email.
+        if cand and cand[0] in "\"'" and 0 <= start < len(text) and text[start] == cand[0]:
+            cand = cand[1:]
+            start += 1
+            if not cand:
+                continue
+
+        # Adjust end after trimming.
+        end = start + len(cand)
+
+        lower = cand.lower()
+
+        # If this looks like a fuzzy domain that starts immediately after ://,
+        # treat it as part of a broken/disabled schema (e.g. _http://example.com, hppt://example.com).
+        if not lower.startswith(("http://", "https://", "ftp://", "mailto:", "//", "www.")) and "@" not in cand:
+            if start >= 3 and text[start - 3 : start] == "://":
+                continue
+            if start > 0 and text[start - 1] in "/:@":
+                continue
+
+        # Validate fuzzy IP option.
+        if (
+            cand
+            and cand[0].isdigit()
+            and "." in cand
+            and not lower.startswith(("http://", "https://", "ftp://", "//"))
+        ):
+            host, _ = _split_host_and_rest(cand)
+            if host.replace(".", "").isdigit() and _is_valid_ipv4(host):
+                if not config.fuzzy_ip:
+                    continue
+
+        # Validate // URLs: allow localhost or dotted domains, but not single-level.
+        if lower.startswith("//"):
+            # Protect against matching the // inside :// or ///.
+            if start > 0 and text[start - 1] in ":/":
+                continue
+            after = cand[2:]
+            hostport, _ = _split_host_and_rest(after)
+            if not hostport:
+                continue
+            if hostport.startswith("["):
+                continue
+            host_only = hostport
+            if "@" in host_only:
+                host_only = host_only.rsplit("@", 1)[1]
+            if ":" in host_only:
+                host_only = host_only.split(":", 1)[0]
+            if host_only.lower() != "localhost" and "." not in host_only:
+                continue
+
+            if "_" in host_only:
+                continue
+
+        # Validate fuzzy domains and emails with TLD allowlist.
+        is_scheme = lower.startswith(("http://", "https://", "ftp://", "mailto:"))
+        is_www = lower.startswith("www.")
+        is_proto_rel = lower.startswith("//")
+
+        if not is_scheme and not is_proto_rel and not is_www and "@" not in cand:
+            host, _ = _split_host_and_rest(cand)
+            if "_" in host:
+                continue
+
+            # IPv4 candidates don't use the TLD allowlist.
+            if "." in host and host.replace(".", "").isdigit() and _is_valid_ipv4(host):
+                pass
+            else:
+                parts = _split_domain_for_tld(host)
+                if parts is None:
+                    continue
+                _base, tld = parts
+                if not _is_valid_tld(tld, extra_tlds=config.extra_tlds):
+                    continue
+
+        if (
+            "@" in cand
+            and not lower.startswith(("http://", "https://", "ftp://", "//"))
+            and not lower.startswith("mailto:")
+        ):
+            # Fuzzy email requires a valid TLD.
+            local, domain = cand.rsplit("@", 1)
+            _ = local
+            host, _tail = _split_host_and_rest(domain)
+            if "_" in host:
+                continue
+            parts = _split_domain_for_tld(host)
+            if parts is None:
+                continue
+            _base, tld = parts
+            if not _is_valid_tld(tld, extra_tlds=config.extra_tlds):
+                continue
+
+        # Validate basic URL host/port if scheme-based.
+        if lower.startswith(("http://", "https://", "ftp://")):
+            after = cand.split("://", 1)[1]
+            hostport, _ = _split_host_and_rest(after)
+            if not hostport:
+                continue
+            if "@" in hostport:
+                hostport = hostport.rsplit("@", 1)[1]
+            host = hostport
+            if ":" in hostport and not hostport.startswith("["):
+                host, port = hostport.split(":", 1)
+                if port and (not port.isdigit() or int(port) > 65535):
+                    continue
+            if not host or host.startswith(("-", ".")) or host.endswith(("-", ".")) or ".." in host:
+                continue
+            if "_" in host:
+                continue
+            if "." in host and host.replace(".", "").isdigit() and not _is_valid_ipv4(host):
+                continue
+
+        href, kind = _href_for(cand)
+        href = _punycode_href(href)
+
+        out.append(LinkMatch(start=start, end=end, text=cand, href=href, kind=kind))
+
+    # Avoid overlapping matches by keeping first-longest.
+    if not out:
+        return out
+    out.sort(key=lambda x: (x.start, -(x.end - x.start)))
+    filtered: list[LinkMatch] = []
+    last_end = -1
+    for lm in out:
+        if lm.start < last_end:
+            continue
+        filtered.append(lm)
+        last_end = lm.end
+    return filtered
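
A usage sketch for the new module, inferred from the code above (the import path is assumed from the file path). `find_links` scans with the defaults, while `LinkifyConfig.with_extra_tlds` widens the TLD allowlist that gates fuzzy, scheme-less matches; scheme-prefixed URLs skip that allowlist entirely.

```python
from justhtml.linkify import LinkifyConfig, find_links, find_links_with_config

text = "Docs at www.example.com/guide, or mail admin@example.org."
for lm in find_links(text):
    print(lm.kind, lm.text, "->", lm.href)
# expected:
#   url www.example.com/guide -> http://www.example.com/guide
#   email admin@example.org -> mailto:admin@example.org

# ".dev" is not in DEFAULT_TLDS, so a fuzzy match like "example.dev"
# is rejected by default and accepted only with an extended config.
config = LinkifyConfig.with_extra_tlds({"dev"})
print(len(find_links("see example.dev")))                      # expected: 0
print(len(find_links_with_config("see example.dev", config)))  # expected: 1
```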