tldextract 5.2.0__py3-none-any.whl → 5.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tldextract/_version.py +2 -2
- tldextract/cli.py +9 -1
- tldextract/tldextract.py +170 -36
- {tldextract-5.2.0.dist-info → tldextract-5.3.0.dist-info}/METADATA +3 -3
- {tldextract-5.2.0.dist-info → tldextract-5.3.0.dist-info}/RECORD +9 -9
- {tldextract-5.2.0.dist-info → tldextract-5.3.0.dist-info}/WHEEL +1 -1
- {tldextract-5.2.0.dist-info → tldextract-5.3.0.dist-info}/entry_points.txt +0 -0
- {tldextract-5.2.0.dist-info → tldextract-5.3.0.dist-info}/licenses/LICENSE +0 -0
- {tldextract-5.2.0.dist-info → tldextract-5.3.0.dist-info}/top_level.txt +0 -0
tldextract/_version.py
CHANGED
tldextract/cli.py
CHANGED
@@ -98,7 +98,15 @@ def main() -> None:
|
|
98
98
|
for i in args.input:
|
99
99
|
ext = tld_extract(i)
|
100
100
|
if args.json:
|
101
|
-
properties = (
|
101
|
+
properties = (
|
102
|
+
"fqdn",
|
103
|
+
"ipv4",
|
104
|
+
"ipv6",
|
105
|
+
"registered_domain",
|
106
|
+
"reverse_domain_name",
|
107
|
+
"top_domain_under_public_suffix",
|
108
|
+
"top_domain_under_registry_suffix",
|
109
|
+
)
|
102
110
|
print(
|
103
111
|
json.dumps(
|
104
112
|
{
|
tldextract/tldextract.py
CHANGED
@@ -28,7 +28,7 @@ subdomain or a valid suffix.
|
|
28
28
|
To rejoin the original hostname, if it was indeed a valid, registered hostname:
|
29
29
|
|
30
30
|
>>> ext = tldextract.extract("http://forums.bbc.co.uk")
|
31
|
-
>>> ext.
|
31
|
+
>>> ext.top_domain_under_public_suffix
|
32
32
|
'bbc.co.uk'
|
33
33
|
>>> ext.fqdn
|
34
34
|
'forums.bbc.co.uk'
|
@@ -38,8 +38,9 @@ from __future__ import annotations
|
|
38
38
|
|
39
39
|
import os
|
40
40
|
import urllib.parse
|
41
|
+
import warnings
|
41
42
|
from collections.abc import Collection, Sequence
|
42
|
-
from dataclasses import dataclass
|
43
|
+
from dataclasses import dataclass, field
|
43
44
|
from functools import wraps
|
44
45
|
|
45
46
|
import idna
|
@@ -96,18 +97,16 @@ class ExtractResult:
|
|
96
97
|
`False`.
|
97
98
|
"""
|
98
99
|
|
99
|
-
|
100
|
-
|
101
|
-
"""The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.
|
100
|
+
registry_suffix: str = field(repr=False)
|
101
|
+
"""The registry suffix of the input URL, if it contained one, or else the empty string.
|
102
102
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
return ""
|
103
|
+
This field is a domain under which people can register subdomains through a
|
104
|
+
registrar.
|
105
|
+
|
106
|
+
This field is unaffected by the `include_psl_private_domains` setting. If
|
107
|
+
`include_psl_private_domains` was set to `False`, this field is always the
|
108
|
+
same as `suffix`.
|
109
|
+
"""
|
111
110
|
|
112
111
|
@property
|
113
112
|
def fqdn(self) -> str:
|
@@ -168,6 +167,56 @@ class ExtractResult:
|
|
168
167
|
return debracketed
|
169
168
|
return ""
|
170
169
|
|
170
|
+
@property
|
171
|
+
def registered_domain(self) -> str:
|
172
|
+
"""The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.
|
173
|
+
|
174
|
+
>>> extract("http://forums.bbc.co.uk").registered_domain
|
175
|
+
'bbc.co.uk'
|
176
|
+
>>> extract("http://localhost:8080").registered_domain
|
177
|
+
''
|
178
|
+
|
179
|
+
.. deprecated:: 6.0.0
|
180
|
+
This property is deprecated and will be removed in the next major
|
181
|
+
version. Use `top_domain_under_public_suffix` instead, which has the
|
182
|
+
same behavior but a more accurate name.
|
183
|
+
|
184
|
+
This is an alias for the `top_domain_under_public_suffix` property.
|
185
|
+
`registered_domain` is so called because it is roughly the domain the
|
186
|
+
owner paid to register with a registrar or, in the case of a private
|
187
|
+
domain, "registered" with the domain owner. If the input was not
|
188
|
+
something one could register, this property returns the empty string.
|
189
|
+
|
190
|
+
To distinguish the case of private domains, consider Blogspot, which is
|
191
|
+
in the PSL's private domains. If `include_psl_private_domains` was set
|
192
|
+
to `False`, the `registered_domain` property of a Blogspot URL
|
193
|
+
represents the domain the owner of Blogspot registered with a
|
194
|
+
registrar, i.e. Google registered "blogspot.com". If
|
195
|
+
`include_psl_private_domains=True`, the `registered_domain` property
|
196
|
+
represents the "blogspot.com" _subdomain_ the owner of a blog
|
197
|
+
"registered" with Blogspot.
|
198
|
+
|
199
|
+
>>> extract(
|
200
|
+
... "http://waiterrant.blogspot.com", include_psl_private_domains=False
|
201
|
+
... ).registered_domain
|
202
|
+
'blogspot.com'
|
203
|
+
>>> extract(
|
204
|
+
... "http://waiterrant.blogspot.com", include_psl_private_domains=True
|
205
|
+
... ).registered_domain
|
206
|
+
'waiterrant.blogspot.com'
|
207
|
+
|
208
|
+
To always get the same joined string, regardless of the
|
209
|
+
`include_psl_private_domains` setting, consider the
|
210
|
+
`top_domain_under_registry_suffix` property.
|
211
|
+
"""
|
212
|
+
warnings.warn(
|
213
|
+
"The 'registered_domain' property is deprecated and will be removed in the next major version. "
|
214
|
+
"Use 'top_domain_under_public_suffix' instead, which has the same behavior but a more accurate name.",
|
215
|
+
DeprecationWarning,
|
216
|
+
stacklevel=2,
|
217
|
+
)
|
218
|
+
return self.top_domain_under_public_suffix
|
219
|
+
|
171
220
|
@property
|
172
221
|
def reverse_domain_name(self) -> str:
|
173
222
|
"""The domain name in Reverse Domain Name Notation.
|
@@ -193,6 +242,48 @@ class ExtractResult:
|
|
193
242
|
stack.extend(reversed(self.subdomain.split(".")))
|
194
243
|
return ".".join(stack)
|
195
244
|
|
245
|
+
@property
|
246
|
+
def top_domain_under_registry_suffix(self) -> str:
|
247
|
+
"""The rightmost domain label and `registry_suffix` joined with a dot, if such a domain is available and `registry_suffix` is set, or else the empty string.
|
248
|
+
|
249
|
+
The rightmost domain label might be in the `domain` field, or, if the
|
250
|
+
input URL's suffix is a PSL private domain, in the public suffix
|
251
|
+
`suffix` field.
|
252
|
+
|
253
|
+
If the input was not in the PSL's private domains, this property is
|
254
|
+
equivalent to `top_domain_under_public_suffix`.
|
255
|
+
|
256
|
+
>>> extract(
|
257
|
+
... "http://waiterrant.blogspot.com", include_psl_private_domains=False
|
258
|
+
... ).top_domain_under_registry_suffix
|
259
|
+
'blogspot.com'
|
260
|
+
>>> extract(
|
261
|
+
... "http://waiterrant.blogspot.com", include_psl_private_domains=True
|
262
|
+
... ).top_domain_under_registry_suffix
|
263
|
+
'blogspot.com'
|
264
|
+
>>> extract("http://localhost:8080").top_domain_under_registry_suffix
|
265
|
+
''
|
266
|
+
"""
|
267
|
+
top_domain_under_public_suffix = self.top_domain_under_public_suffix
|
268
|
+
if not top_domain_under_public_suffix or not self.is_private:
|
269
|
+
return top_domain_under_public_suffix
|
270
|
+
|
271
|
+
num_labels = self.registry_suffix.count(".") + 2
|
272
|
+
return ".".join(top_domain_under_public_suffix.split(".")[-num_labels:])
|
273
|
+
|
274
|
+
@property
|
275
|
+
def top_domain_under_public_suffix(self) -> str:
|
276
|
+
"""The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.
|
277
|
+
|
278
|
+
>>> extract("http://forums.bbc.co.uk").top_domain_under_public_suffix
|
279
|
+
'bbc.co.uk'
|
280
|
+
>>> extract("http://localhost:8080").top_domain_under_public_suffix
|
281
|
+
''
|
282
|
+
"""
|
283
|
+
if self.suffix and self.domain:
|
284
|
+
return f"{self.domain}.{self.suffix}"
|
285
|
+
return ""
|
286
|
+
|
196
287
|
|
197
288
|
class TLDExtract:
|
198
289
|
"""A callable for extracting subdomain, domain, and suffix components from a URL."""
|
@@ -357,24 +448,58 @@ class TLDExtract:
|
|
357
448
|
and netloc_with_ascii_dots[-1] == "]"
|
358
449
|
and looks_like_ipv6(netloc_with_ascii_dots[1:-1])
|
359
450
|
):
|
360
|
-
return ExtractResult(
|
451
|
+
return ExtractResult(
|
452
|
+
"", netloc_with_ascii_dots, "", is_private=False, registry_suffix=""
|
453
|
+
)
|
361
454
|
|
362
455
|
labels = netloc_with_ascii_dots.split(".")
|
363
456
|
|
364
|
-
|
365
|
-
|
366
|
-
)
|
457
|
+
maybe_indexes = self._get_tld_extractor(session).suffix_index(
|
458
|
+
labels, include_psl_private_domains=include_psl_private_domains
|
459
|
+
)
|
367
460
|
|
368
461
|
num_ipv4_labels = 4
|
369
|
-
if
|
370
|
-
|
462
|
+
if (
|
463
|
+
not maybe_indexes
|
464
|
+
and len(labels) == num_ipv4_labels
|
465
|
+
and looks_like_ip(netloc_with_ascii_dots)
|
371
466
|
):
|
372
|
-
return ExtractResult(
|
467
|
+
return ExtractResult(
|
468
|
+
"", netloc_with_ascii_dots, "", is_private=False, registry_suffix=""
|
469
|
+
)
|
470
|
+
elif not maybe_indexes:
|
471
|
+
return ExtractResult(
|
472
|
+
subdomain=".".join(labels[:-1]),
|
473
|
+
domain=labels[-1],
|
474
|
+
suffix="",
|
475
|
+
is_private=False,
|
476
|
+
registry_suffix="",
|
477
|
+
)
|
373
478
|
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
479
|
+
(
|
480
|
+
(public_suffix_index, public_suffix_node),
|
481
|
+
(registry_suffix_index, registry_suffix_node),
|
482
|
+
) = maybe_indexes
|
483
|
+
|
484
|
+
subdomain = (
|
485
|
+
".".join(labels[: public_suffix_index - 1])
|
486
|
+
if public_suffix_index >= 2
|
487
|
+
else ""
|
488
|
+
)
|
489
|
+
domain = labels[public_suffix_index - 1] if public_suffix_index > 0 else ""
|
490
|
+
public_suffix = ".".join(labels[public_suffix_index:])
|
491
|
+
registry_suffix = (
|
492
|
+
".".join(labels[registry_suffix_index:])
|
493
|
+
if public_suffix_node.is_private
|
494
|
+
else public_suffix
|
495
|
+
)
|
496
|
+
return ExtractResult(
|
497
|
+
subdomain=subdomain,
|
498
|
+
domain=domain,
|
499
|
+
suffix=public_suffix,
|
500
|
+
is_private=public_suffix_node.is_private,
|
501
|
+
registry_suffix=registry_suffix,
|
502
|
+
)
|
378
503
|
|
379
504
|
def update(
|
380
505
|
self, fetch_now: bool = False, session: requests.Session | None = None
|
@@ -531,40 +656,49 @@ class _PublicSuffixListTLDExtractor:
|
|
531
656
|
|
532
657
|
def suffix_index(
|
533
658
|
self, spl: list[str], include_psl_private_domains: bool | None = None
|
534
|
-
) -> tuple[int,
|
535
|
-
"""Return the index of the first suffix label, and
|
659
|
+
) -> tuple[tuple[int, Trie], tuple[int, Trie]] | None:
|
660
|
+
"""Return the index of the first public suffix label, the index of the first registry suffix label, and their corresponding trie nodes.
|
536
661
|
|
537
|
-
Returns
|
662
|
+
Returns `None` if no suffix is found.
|
538
663
|
"""
|
539
664
|
if include_psl_private_domains is None:
|
540
665
|
include_psl_private_domains = self.include_psl_private_domains
|
541
666
|
|
542
|
-
node = (
|
667
|
+
node = reg_node = (
|
543
668
|
self.tlds_incl_private_trie
|
544
669
|
if include_psl_private_domains
|
545
670
|
else self.tlds_excl_private_trie
|
546
671
|
)
|
547
|
-
|
548
|
-
j = i
|
672
|
+
suffix_idx = reg_idx = label_idx = len(spl)
|
549
673
|
for label in reversed(spl):
|
550
674
|
decoded_label = _decode_punycode(label)
|
551
675
|
if decoded_label in node.matches:
|
552
|
-
|
676
|
+
label_idx -= 1
|
553
677
|
node = node.matches[decoded_label]
|
554
678
|
if node.end:
|
555
|
-
|
679
|
+
suffix_idx = label_idx
|
680
|
+
if not node.is_private:
|
681
|
+
reg_node = node
|
682
|
+
reg_idx = label_idx
|
556
683
|
continue
|
557
684
|
|
558
685
|
is_wildcard = "*" in node.matches
|
559
686
|
if is_wildcard:
|
560
687
|
is_wildcard_exception = "!" + decoded_label in node.matches
|
561
|
-
|
562
|
-
|
563
|
-
|
688
|
+
return (
|
689
|
+
label_idx if is_wildcard_exception else label_idx - 1,
|
690
|
+
node.matches["*"],
|
691
|
+
), (
|
692
|
+
reg_idx,
|
693
|
+
reg_node,
|
694
|
+
)
|
564
695
|
|
565
696
|
break
|
566
697
|
|
567
|
-
|
698
|
+
if suffix_idx == len(spl):
|
699
|
+
return None
|
700
|
+
|
701
|
+
return ((suffix_idx, node), (reg_idx, reg_node))
|
568
702
|
|
569
703
|
|
570
704
|
def _decode_punycode(label: str) -> str:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: tldextract
|
3
|
-
Version: 5.
|
3
|
+
Version: 5.3.0
|
4
4
|
Summary: Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well.
|
5
5
|
Author-email: John Kurkowski <john.kurkowski@gmail.com>
|
6
6
|
License: BSD-3-Clause
|
@@ -90,7 +90,7 @@ To rejoin the original hostname, if it was indeed a valid, registered hostname:
|
|
90
90
|
|
91
91
|
```python
|
92
92
|
>>> ext = tldextract.extract('http://forums.bbc.co.uk')
|
93
|
-
>>> ext.
|
93
|
+
>>> ext.top_domain_under_public_suffix
|
94
94
|
'bbc.co.uk'
|
95
95
|
>>> ext.fqdn
|
96
96
|
'forums.bbc.co.uk'
|
@@ -287,7 +287,7 @@ For example:
|
|
287
287
|
extractor = TLDExtract()
|
288
288
|
split_url = urllib.parse.urlsplit("https://foo.bar.com:8080")
|
289
289
|
split_suffix = extractor.extract_urllib(split_url)
|
290
|
-
url_to_crawl = f"{split_url.scheme}://{split_suffix.
|
290
|
+
url_to_crawl = f"{split_url.scheme}://{split_suffix.top_domain_under_public_suffix}:{split_url.port}"
|
291
291
|
```
|
292
292
|
|
293
293
|
`tldextract`'s lenient string parsing stance lowers the learning curve of using
|
@@ -1,16 +1,16 @@
|
|
1
1
|
tldextract/.tld_set_snapshot,sha256=tpMVwIXVOXJyS48t8RH_wymwyE_gpH1iyMkWVcx3Sjg,318581
|
2
2
|
tldextract/__init__.py,sha256=1n2QxAmFCFp3X1A5O46wJOTZqWM2ukshNkkG-TrOaLQ,274
|
3
3
|
tldextract/__main__.py,sha256=oiZ5EW_lxRLH6Khk6MdzXf7a1Ld5-A3k4wOFRmNNk2o,89
|
4
|
-
tldextract/_version.py,sha256=
|
4
|
+
tldextract/_version.py,sha256=eUUqvIdRVH9jjg_LcI8eEHFrv_K9YLfGNamSZ0trHII,511
|
5
5
|
tldextract/cache.py,sha256=nrT9VuLmrjHHFxj-Cai97IyUXXenCX6KbHi07mPkzMc,8289
|
6
|
-
tldextract/cli.py,sha256=
|
6
|
+
tldextract/cli.py,sha256=ZYXwybL76KucaVH4GCz5Uiy4PmL6oXARiKgLWv64c0I,3230
|
7
7
|
tldextract/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
8
|
tldextract/remote.py,sha256=rovbxR73G8To-QRrA_cgSfk0S3k0_B2SyYefXiLHrQg,1940
|
9
9
|
tldextract/suffix_list.py,sha256=ePH6iOIUBe0JE_mc07a34Nd7tFyfmHgP_mJkFhxzr7c,3947
|
10
|
-
tldextract/tldextract.py,sha256=
|
11
|
-
tldextract-5.
|
12
|
-
tldextract-5.
|
13
|
-
tldextract-5.
|
14
|
-
tldextract-5.
|
15
|
-
tldextract-5.
|
16
|
-
tldextract-5.
|
10
|
+
tldextract/tldextract.py,sha256=j93PohoNaNpH1tJ3zZ2z3f4JMt_JOmQ5RYUgrUcNvP0,27008
|
11
|
+
tldextract-5.3.0.dist-info/licenses/LICENSE,sha256=ZUrmz9cSprvhQmqmUdHIWbD51Cytv6PDTMlJLruTLuI,1527
|
12
|
+
tldextract-5.3.0.dist-info/METADATA,sha256=V9aCLPpJ5uHtPugYuncUOrvNoHwuXN0YZ040Wtm8RWM,11735
|
13
|
+
tldextract-5.3.0.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
|
14
|
+
tldextract-5.3.0.dist-info/entry_points.txt,sha256=EStkXC80BetCMp1UDhU3kWuXBo3qDpgKltZTJ1x4x1U,51
|
15
|
+
tldextract-5.3.0.dist-info/top_level.txt,sha256=DWZIjV49WP30tyC1KOEP7t-EaS4IRCXQzc0KXAOn_bk,11
|
16
|
+
tldextract-5.3.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|