tldextract 5.1.2__py3-none-any.whl → 5.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tldextract/.tld_set_snapshot +5247 -3232
- tldextract/__init__.py +4 -2
- tldextract/_version.py +9 -4
- tldextract/cache.py +5 -16
- tldextract/remote.py +1 -1
- tldextract/suffix_list.py +3 -1
- tldextract/tldextract.py +99 -42
- {tldextract-5.1.2.dist-info → tldextract-5.2.0.dist-info}/METADATA +58 -44
- tldextract-5.2.0.dist-info/RECORD +16 -0
- {tldextract-5.1.2.dist-info → tldextract-5.2.0.dist-info}/WHEEL +1 -1
- {tldextract-5.1.2.dist-info → tldextract-5.2.0.dist-info/licenses}/LICENSE +1 -1
- tldextract-5.1.2.dist-info/RECORD +0 -16
- {tldextract-5.1.2.dist-info → tldextract-5.2.0.dist-info}/entry_points.txt +0 -0
- {tldextract-5.1.2.dist-info → tldextract-5.2.0.dist-info}/top_level.txt +0 -0
tldextract/__init__.py
CHANGED
@@ -1,12 +1,14 @@
 """Export tldextract's public interface."""
 
 from . import _version
-from .tldextract import TLDExtract, extract
+from .tldextract import ExtractResult, TLDExtract, extract, update
 
 __version__: str = _version.version
 
 __all__ = [
+    "__version__",
     "extract",
+    "ExtractResult",
     "TLDExtract",
-    "__version__",
+    "update",
 ]
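The upshot of the widened export list: `ExtractResult` and `update` are now importable straight from the package root. A minimal usage sketch (not shown in the diff itself; `update()` is called with its defaults, which invalidate the cached suffix-list state so it is re-fetched on next use):

```python
from tldextract import ExtractResult, TLDExtract, extract, update

result: ExtractResult = extract("http://forums.bbc.co.uk")
print(result.registered_domain)  # bbc.co.uk

update()  # invalidate/refresh the cached Public Suffix List data
```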
tldextract/_version.py
CHANGED
@@ -1,8 +1,13 @@
-# file generated by setuptools_scm
+# file generated by setuptools-scm
 # don't change, don't track in version control
+
+__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
+
 TYPE_CHECKING = False
 if TYPE_CHECKING:
-    from typing import Tuple, Union
+    from typing import Tuple
+    from typing import Union
+
     VERSION_TUPLE = Tuple[Union[int, str], ...]
 else:
     VERSION_TUPLE = object
@@ -12,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '5.1.2'
-__version_tuple__ = version_tuple = (5, 1, 2)
+__version__ = version = '5.2.0'
+__version_tuple__ = version_tuple = (5, 2, 0)
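Nothing functional changed here beyond the regenerated setuptools-scm template and the version bump, but this is the file that makes the release introspectable. For instance (assuming a 5.2.0 install):

```python
import tldextract

print(tldextract.__version__)             # "5.2.0", re-exported by __init__.py above
print(tldextract._version.version_tuple)  # (5, 2, 0)
```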
tldextract/cache.py
CHANGED
@@ -24,18 +24,6 @@ _DID_LOG_UNABLE_TO_CACHE = False
 
 T = TypeVar("T")
 
-if sys.version_info >= (3, 9):
-
-    def md5(*args: bytes) -> hashlib._Hash:
-        """Use argument only available in newer Python.
-
-        In this file, MD5 is only used for cache location, not security.
-        """
-        return hashlib.md5(*args, usedforsecurity=False)
-
-else:
-    md5 = hashlib.md5
-
 
 def get_pkg_unique_identifier() -> str:
     """Generate an identifier unique to the python version, tldextract version, and python instance.
@@ -51,7 +39,9 @@ def get_pkg_unique_identifier() -> str:
     tldextract_version = "tldextract-" + version
     python_env_name = os.path.basename(sys.prefix)
     # just to handle the edge case of two identically named python environments
-    python_binary_path_short_hash = md5(sys.prefix.encode("utf-8")).hexdigest()[:6]
+    python_binary_path_short_hash = hashlib.md5(
+        sys.prefix.encode("utf-8"), usedforsecurity=False
+    ).hexdigest()[:6]
     python_version = ".".join([str(v) for v in sys.version_info[:-1]])
     identifier_parts = [
         python_version,
@@ -113,8 +103,7 @@ class DiskCache:
             with open(cache_filepath) as cache_file:
                 return json.load(cache_file)
         except (OSError, ValueError) as exc:
-
-            raise KeyError("namespace: " + namespace + " key: " + repr(key)) from None
+            raise KeyError("namespace: " + namespace + " key: " + repr(key)) from exc
 
     def set(  # noqa: A003
         self, namespace: str, key: str | dict[str, Hashable], value: object
@@ -238,7 +227,7 @@ def _fetch_url(session: requests.Session, url: str, timeout: int | None) -> str:
 
 def _make_cache_key(inputs: str | dict[str, Hashable]) -> str:
     key = repr(inputs)
-    return md5(key.encode("utf8")).hexdigest()
+    return hashlib.md5(key.encode("utf8"), usedforsecurity=False).hexdigest()
 
 
 def _make_dir(filename: str) -> None:
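Both hashing call sites collapse the old Python 3.8 shim into direct `hashlib.md5(..., usedforsecurity=False)` calls, possible because the `usedforsecurity` flag exists on Python >= 3.9 and this release drops 3.8 (see the METADATA diff below). A standalone sketch of the same non-cryptographic cache-key hashing:

```python
import hashlib

def make_cache_key(inputs: str) -> str:
    # usedforsecurity=False flags this MD5 as a stable identifier, not a
    # security primitive, mirroring the calls in cache.py above.
    return hashlib.md5(repr(inputs).encode("utf8"), usedforsecurity=False).hexdigest()

print(make_cache_key("https://publicsuffix.org/list/public_suffix_list.dat"))
```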
tldextract/remote.py
CHANGED
@@ -46,7 +46,7 @@ def _schemeless_url(url: str) -> str:
         return url[2:]
     if (
         double_slashes_start < 2
-        or not url[double_slashes_start - 1] == ":"
+        or url[double_slashes_start - 1] != ":"
         or set(url[: double_slashes_start - 1]) - scheme_chars_set
     ):
         return url
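The one-line change rewrites `not x == y` as the idiomatic `x != y`; behavior is identical. A self-contained sketch of the surrounding check, reconstructed from the context lines (`scheme_chars_set` and the leading `url.find("//")` handling are assumptions about the unshown parts of `remote.py`):

```python
scheme_chars_set = set("abcdefghijklmnopqrstuvwxyz0123456789+-.")  # assumed contents

def schemeless_url(url: str) -> str:
    double_slashes_start = url.find("//")
    if double_slashes_start == 0:
        return url[2:]
    if (
        double_slashes_start < 2
        or url[double_slashes_start - 1] != ":"
        or set(url[: double_slashes_start - 1]) - scheme_chars_set
    ):
        return url
    return url[double_slashes_start + 2 :]

print(schemeless_url("http://example.com/"))      # example.com/
print(schemeless_url("mailto:user@example.com"))  # unchanged: no "//"
```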
tldextract/suffix_list.py
CHANGED
@@ -47,7 +47,9 @@ def find_first_response(
                 session=session, url=url, timeout=cache_fetch_timeout
             )
         except requests.exceptions.RequestException:
-            LOG.exception("Exception reading Public Suffix List url %s", url)
+            LOG.warning(
+                "Exception reading Public Suffix List url %s", url, exc_info=True
+            )
     finally:
         # Ensure the session is always closed if it's constructed in the method
         if session_created:
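The effect is to demote a failed suffix-list fetch from ERROR to WARNING severity while `exc_info=True` keeps the traceback (assuming the truncated minus line was `LOG.exception`, which logs at ERROR). A runnable sketch of the two styles:

```python
import logging

logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger("tldextract")

try:
    raise OSError("connection refused")  # stand-in for a failed HTTP fetch
except OSError:
    # Old style: ERROR record with traceback.
    LOG.exception("Exception reading Public Suffix List url %s", "https://example.invalid")
    # New style: WARNING record, traceback preserved.
    LOG.warning(
        "Exception reading Public Suffix List url %s", "https://example.invalid", exc_info=True
    )
```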
tldextract/tldextract.py
CHANGED
@@ -4,30 +4,30 @@ It does this via the Public Suffix List (PSL).
 
 >>> import tldextract
 
->>> tldextract.extract('http://forums.news.cnn.com/')
+>>> tldextract.extract("http://forums.news.cnn.com/")
 ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False)
 
->>> tldextract.extract('http://forums.bbc.co.uk/') # United Kingdom
+>>> tldextract.extract("http://forums.bbc.co.uk/") # United Kingdom
 ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False)
 
->>> tldextract.extract('http://www.worldbank.org.kg/') # Kyrgyzstan
+>>> tldextract.extract("http://www.worldbank.org.kg/") # Kyrgyzstan
 ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg', is_private=False)
 
 Note subdomain and suffix are _optional_. Not all URL-like inputs have a
 subdomain or a valid suffix.
 
->>> tldextract.extract('google.com')
+>>> tldextract.extract("google.com")
 ExtractResult(subdomain='', domain='google', suffix='com', is_private=False)
 
->>> tldextract.extract('google.notavalidsuffix')
+>>> tldextract.extract("google.notavalidsuffix")
 ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='', is_private=False)
 
->>> tldextract.extract('http://127.0.0.1:8080/deployed/')
+>>> tldextract.extract("http://127.0.0.1:8080/deployed/")
 ExtractResult(subdomain='', domain='127.0.0.1', suffix='', is_private=False)
 
 To rejoin the original hostname, if it was indeed a valid, registered hostname:
 
->>> ext = tldextract.extract('http://forums.bbc.co.uk')
+>>> ext = tldextract.extract("http://forums.bbc.co.uk")
 >>> ext.registered_domain
 'bbc.co.uk'
 >>> ext.fqdn
@@ -36,7 +36,6 @@ To rejoin the original hostname, if it was indeed a valid, registered hostname:
 
 from __future__ import annotations
 
-import logging
 import os
 import urllib.parse
 from collections.abc import Collection, Sequence
@@ -50,9 +49,6 @@ from .cache import DiskCache, get_cache_dir
 from .remote import lenient_netloc, looks_like_ip, looks_like_ipv6
 from .suffix_list import get_suffix_lists
 
-LOG = logging.getLogger("tldextract")
-
-
 CACHE_TIMEOUT = os.environ.get("TLDEXTRACT_CACHE_TIMEOUT")
 
 PUBLIC_SUFFIX_LIST_URLS = (
@@ -65,21 +61,48 @@ PUBLIC_SUFFIX_LIST_URLS = (
 class ExtractResult:
     """A URL's extracted subdomain, domain, and suffix.
 
-
+    These first 3 fields are what most users of this library will care about.
+    They are the split, non-overlapping hostname components of the input URL.
+    They can be used to rebuild the original URL's hostname.
+
+    Beyond the first 3 fields, the class contains metadata fields, like a flag
+    that indicates if the input URL's suffix is from a private domain.
     """
 
     subdomain: str
+    """All subdomains beneath the domain of the input URL, if it contained any such subdomains, or else the empty string."""
+
     domain: str
+    """The topmost domain of the input URL, if it contained a domain name, or else everything hostname-like in the input.
+
+    If the input URL didn't contain a real domain name, the `suffix` field will
+    be empty, and this field will catch values like an IP address, or
+    private network hostnames like "localhost".
+    """
+
     suffix: str
+    """The public suffix of the input URL, if it contained one, or else the empty string.
+
+    If `include_psl_private_domains` was set to `False`, this field is the same
+    as `registry_suffix`, i.e. a domain under which people can register
+    subdomains through a registrar. If `include_psl_private_domains` was set to
+    `True`, this field may be a PSL private domain, like "blogspot.com".
+    """
+
     is_private: bool
+    """Whether the input URL belongs in the Public Suffix List's private domains.
+
+    If `include_psl_private_domains` was set to `False`, this field is always
+    `False`.
+    """
 
     @property
     def registered_domain(self) -> str:
-        """
+        """The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.
 
-        >>> extract('http://forums.bbc.co.uk').registered_domain
+        >>> extract("http://forums.bbc.co.uk").registered_domain
         'bbc.co.uk'
-        >>> extract('http://localhost:8080').registered_domain
+        >>> extract("http://localhost:8080").registered_domain
         ''
         """
         if self.suffix and self.domain:
@@ -88,11 +111,11 @@ class ExtractResult:
 
     @property
     def fqdn(self) -> str:
-        """
+        """The Fully Qualified Domain Name (FQDN), if there is a proper `domain` and `suffix`, or else the empty string.
 
-        >>> extract('http://forums.bbc.co.uk/path/to/file').fqdn
+        >>> extract("http://forums.bbc.co.uk/path/to/file").fqdn
         'forums.bbc.co.uk'
-        >>> extract('http://localhost:8080').fqdn
+        >>> extract("http://localhost:8080").fqdn
         ''
         """
         if self.suffix and (self.domain or self.is_private):
@@ -101,13 +124,13 @@ class ExtractResult:
 
     @property
     def ipv4(self) -> str:
-        """
+        """The IPv4 address, if that is what the input domain/URL was, or else the empty string.
 
-        >>> extract('http://127.0.0.1/path/to/file').ipv4
+        >>> extract("http://127.0.0.1/path/to/file").ipv4
         '127.0.0.1'
-        >>> extract('http://127.0.0.1.1/path/to/file').ipv4
+        >>> extract("http://127.0.0.1.1/path/to/file").ipv4
         ''
-        >>> extract('http://256.1.1.1').ipv4
+        >>> extract("http://256.1.1.1").ipv4
         ''
         """
         if (
@@ -120,13 +143,17 @@ class ExtractResult:
 
     @property
     def ipv6(self) -> str:
-        """
+        """The IPv6 address, if that is what the input domain/URL was, or else the empty string.
 
-        >>> extract('http://[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1]/path/to/file').ipv6
+        >>> extract(
+        ...     "http://[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1]/path/to/file"
+        ... ).ipv6
         'aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1'
-        >>> extract('http://[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1.1]/path/to/file').ipv6
+        >>> extract(
+        ...     "http://[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1.1]/path/to/file"
+        ... ).ipv6
         ''
-        >>> extract('http://[aBcD:ef01:2345:6789:aBcD:ef01:256.0.0.1]').ipv6
+        >>> extract("http://[aBcD:ef01:2345:6789:aBcD:ef01:256.0.0.1]").ipv6
         ''
         """
         min_num_ipv6_chars = 4
@@ -141,6 +168,31 @@ class ExtractResult:
             return debracketed
         return ""
 
+    @property
+    def reverse_domain_name(self) -> str:
+        """The domain name in Reverse Domain Name Notation.
+
+        Joins extracted components of the input URL in reverse domain name
+        notation. The suffix is used as the leftmost component, followed by the
+        domain, then followed by the subdomain with its parts reversed.
+
+        Reverse Domain Name Notation is typically used to organize namespaces
+        for packages and plugins. Technically, a full reversal would reverse
+        the parts of the suffix, e.g. "co.uk" would become "uk.co", but this is
+        not done in practice when Reverse Domain Name Notation is called for.
+        So this property leaves the `suffix` part in its original order.
+
+        >>> extract("login.example.com").reverse_domain_name
+        'com.example.login'
+
+        >>> extract("login.example.co.uk").reverse_domain_name
+        'co.uk.example.login'
+        """
+        stack = [self.suffix, self.domain]
+        if self.subdomain:
+            stack.extend(reversed(self.subdomain.split(".")))
+        return ".".join(stack)
+
 
 class TLDExtract:
     """A callable for extracting, subdomain, domain, and suffix components from a URL."""
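A quick usage sketch of the new `reverse_domain_name` property, alongside an equivalent hand-rolled join (the equivalence assumes a non-empty subdomain, the case the property itself guards):

```python
import tldextract

ext = tldextract.extract("login.example.co.uk")
print(ext.reverse_domain_name)  # co.uk.example.login

# Same result by hand: suffix first, then domain, then reversed subdomain parts.
parts = [ext.suffix, ext.domain, *reversed(ext.subdomain.split("."))]
print(".".join(parts))          # co.uk.example.login
```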
@@ -161,13 +213,14 @@ class TLDExtract:
         the `cache_dir` will live in the tldextract directory. You can disable
         the caching functionality of this module by setting `cache_dir` to `None`.
 
-        If the cached version does not exist […]
-        […] `suffix_list_urls` in order, […]
-        […]
-        […]
-        […]
-        […]
-        […]
+        If the cached version does not exist, such as on the first run, HTTP
+        request the URLs in `suffix_list_urls` in order, and use the first
+        successful response for public suffix definitions. Subsequent, untried
+        URLs are ignored. The default URLs are the latest version of the
+        Mozilla Public Suffix List and its mirror, but any similar document URL
+        could be specified. Local files can be specified by using the `file://`
+        protocol (see `urllib2` documentation). To disable HTTP requests, set
+        this to an empty sequence.
 
         If there is no cached version loaded and no data is found from the `suffix_list_urls`,
         the module will fall back to the included TLD set snapshot. If you do not want
@@ -179,7 +232,9 @@ class TLDExtract:
         suffix, so these domains are excluded by default. If you'd like them
         included instead, set `include_psl_private_domains` to True.
 
-        You can […]
+        You can specify additional suffixes in the `extra_suffixes` argument.
+        These will be merged into whatever public suffix definitions are
+        already in use by `tldextract`, above.
 
         cache_fetch_timeout is passed unmodified to the underlying request object
         per the requests documentation here:
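Pulling the restored docstring together, a hedged sketch of a constructor call exercising the documented parameters (the paths and the extra suffix are illustrative only):

```python
import tldextract

extractor = tldextract.TLDExtract(
    cache_dir="/tmp/tldextract-cache",    # or None to disable caching
    include_psl_private_domains=True,     # opt in to PSL private domains
    extra_suffixes=["internal.example"],  # hypothetical additional suffix
)
print(extractor("service.team.internal.example"))
```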
@@ -237,9 +292,9 @@
         I.e. its effective TLD, gTLD, ccTLD, etc. components.
 
         >>> extractor = TLDExtract()
-        >>> extractor.extract_str('http://forums.news.cnn.com/')
+        >>> extractor.extract_str("http://forums.news.cnn.com/")
         ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False)
-        >>> extractor.extract_str('http://forums.bbc.co.uk/')
+        >>> extractor.extract_str("http://forums.bbc.co.uk/")
         ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False)
 
         Allows configuring the HTTP request via the optional `session`
@@ -272,9 +327,11 @@
         name has already been parsed.
 
         >>> extractor = TLDExtract()
-        >>> extractor.extract_urllib(urllib.parse.urlsplit('http://forums.news.cnn.com/'))
+        >>> extractor.extract_urllib(
+        ...     urllib.parse.urlsplit("http://forums.news.cnn.com/")
+        ... )
         ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False)
-        >>> extractor.extract_urllib(urllib.parse.urlsplit('http://forums.bbc.co.uk/'))
+        >>> extractor.extract_urllib(urllib.parse.urlsplit("http://forums.bbc.co.uk/"))
         ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False)
         """
         return self._extract_netloc(
@@ -298,9 +355,9 @@
             len(netloc_with_ascii_dots) >= min_num_ipv6_chars
             and netloc_with_ascii_dots[0] == "["
             and netloc_with_ascii_dots[-1] == "]"
+            and looks_like_ipv6(netloc_with_ascii_dots[1:-1])
         ):
-            if looks_like_ipv6(netloc_with_ascii_dots[1:-1]):
-                return ExtractResult("", netloc_with_ascii_dots, "", is_private=False)
+            return ExtractResult("", netloc_with_ascii_dots, "", is_private=False)
 
         labels = netloc_with_ascii_dots.split(".")
 
@@ -330,9 +387,9 @@
 
     @property
     def tlds(self, session: requests.Session | None = None) -> list[str]:
-        """
+        """The list of TLDs used by default.
 
-        This will vary based on `include_psl_private_domains` and `extra_suffixes
+        This will vary based on `include_psl_private_domains` and `extra_suffixes`.
         """
         return list(self._get_tld_extractor(session=session).tlds())
{tldextract-5.1.2.dist-info → tldextract-5.2.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: tldextract
-Version: 5.1.2
+Version: 5.2.0
 Summary: Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well.
 Author-email: John Kurkowski <john.kurkowski@gmail.com>
 License: BSD-3-Clause
@@ -10,33 +10,34 @@ Classifier: Development Status :: 5 - Production/Stable
 Classifier: Topic :: Utilities
 Classifier: License :: OSI Approved :: BSD License
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Requires-Python: >=3.8
+Classifier: Programming Language :: Python :: 3.13
+Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: idna
-Requires-Dist: requests >=2.1.0
-Requires-Dist: requests-file >=1.4
-Requires-Dist: filelock >=3.0.8
+Requires-Dist: requests>=2.1.0
+Requires-Dist: requests-file>=1.4
+Requires-Dist: filelock>=3.0.8
 Provides-Extra: release
-Requires-Dist: build ; extra == 'release'
-Requires-Dist: twine ; extra == 'release'
+Requires-Dist: build; extra == "release"
+Requires-Dist: twine; extra == "release"
 Provides-Extra: testing
-Requires-Dist: black ; extra == 'testing'
-Requires-Dist: mypy ; extra == 'testing'
-Requires-Dist: pytest ; extra == 'testing'
-Requires-Dist: pytest-gitignore ; extra == 'testing'
-Requires-Dist: pytest-mock ; extra == 'testing'
-Requires-Dist: responses ; extra == 'testing'
-Requires-Dist: ruff ; extra == 'testing'
-Requires-Dist: syrupy ; extra == 'testing'
-Requires-Dist: tox ; extra == 'testing'
-Requires-Dist: types-filelock ; extra == 'testing'
-Requires-Dist: types-requests ; extra == 'testing'
+Requires-Dist: mypy; extra == "testing"
+Requires-Dist: pytest; extra == "testing"
+Requires-Dist: pytest-gitignore; extra == "testing"
+Requires-Dist: pytest-mock; extra == "testing"
+Requires-Dist: responses; extra == "testing"
+Requires-Dist: ruff; extra == "testing"
+Requires-Dist: syrupy; extra == "testing"
+Requires-Dist: tox; extra == "testing"
+Requires-Dist: tox-uv; extra == "testing"
+Requires-Dist: types-filelock; extra == "testing"
+Requires-Dist: types-requests; extra == "testing"
+Dynamic: license-file
 
 # tldextract [![PyPI version](https://badge.fury.io/py/tldextract.svg)](https://badge.fury.io/py/tldextract) [![Build Status](https://github.com/john-kurkowski/tldextract/actions/workflows/ci.yml/badge.svg)](https://github.com/john-kurkowski/tldextract/actions/workflows/ci.yml)
 
@@ -95,8 +96,17 @@ To rejoin the original hostname, if it was indeed a valid, registered hostname:
 'forums.bbc.co.uk'
 ```
 
+In addition to the Python interface, there is a command-line interface. Split
+the URL components by space:
+
+```zsh
+$ tldextract 'http://forums.bbc.co.uk'
+forums bbc co.uk
+```
+
 By default, this package supports the public ICANN TLDs and their exceptions.
-You can optionally support the Public Suffix List's private domains as well.
+You can optionally support the Public Suffix List's [private
+domains](#public-vs-private-domains) as well.
 
 This package started by implementing the chosen answer from [this StackOverflow question on
 getting the "domain name" from a URL](http://stackoverflow.com/questions/569137/how-to-get-domain-name-from-url/569219#569219).
@@ -118,19 +128,12 @@ Or the latest dev version:
 pip install -e 'git://github.com/john-kurkowski/tldextract.git#egg=tldextract'
 ```
 
-Command-line usage, splits the URL components by space:
-
-```zsh
-tldextract http://forums.bbc.co.uk
-# forums bbc co.uk
-```
-
 ## Note about caching
 
 Beware when first calling `tldextract`, it updates its TLD list with a live HTTP
 request. This updated TLD set is usually cached indefinitely in `$HOME/.cache/python-tldextract`.
-To control the cache's location, set TLDEXTRACT_CACHE environment variable or set the
-cache_dir path […]
+To control the cache's location, set the `TLDEXTRACT_CACHE` environment variable or set the
+`cache_dir` path when constructing a `TLDExtract`.
 
 (Arguably runtime bootstrapping like that shouldn't be the default behavior,
 like for production systems. But I want you to have the latest TLDs, especially
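A short sketch of the two cache controls the note above describes (the path is illustrative; the environment variable can equally be set outside Python):

```python
import os
import tldextract

os.environ["TLDEXTRACT_CACHE"] = "/var/cache/tldextract"            # option 1: env var
extract = tldextract.TLDExtract(cache_dir="/var/cache/tldextract")  # option 2: constructor
```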
@@ -188,15 +191,17 @@ ExtractResult(subdomain='waiterrant', domain='blogspot', suffix='com', is_privat
|
|
188
191
|
```
|
189
192
|
|
190
193
|
The following overrides this.
|
194
|
+
|
191
195
|
```python
|
192
196
|
>>> extract = tldextract.TLDExtract()
|
193
197
|
>>> extract('waiterrant.blogspot.com', include_psl_private_domains=True)
|
194
198
|
ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com', is_private=True)
|
195
199
|
```
|
196
200
|
|
197
|
-
|
201
|
+
To change the default for all extract calls:
|
202
|
+
|
198
203
|
```python
|
199
|
-
>>> extract = tldextract.TLDExtract(
|
204
|
+
>>> extract = tldextract.TLDExtract(include_psl_private_domains=True)
|
200
205
|
>>> extract('waiterrant.blogspot.com')
|
201
206
|
ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com', is_private=True)
|
202
207
|
```
|
@@ -219,10 +224,12 @@ extract = tldextract.TLDExtract(
|
|
219
224
|
fallback_to_snapshot=False)
|
220
225
|
```
|
221
226
|
|
222
|
-
|
223
|
-
|
227
|
+
If the cached version of public suffix definitions doesn't exist, such as on
|
228
|
+
the first run, the above snippet will request the URLs you specified in order,
|
229
|
+
and use the first successful response.
|
224
230
|
|
225
|
-
If you want to use input data from your local filesystem,
|
231
|
+
If you want to use input data from your local filesystem, use the `file://`
|
232
|
+
protocol with an absolute path:
|
226
233
|
|
227
234
|
```python
|
228
235
|
extract = tldextract.TLDExtract(
|
@@ -231,17 +238,24 @@ extract = tldextract.TLDExtract(
|
|
231
238
|
fallback_to_snapshot=False)
|
232
239
|
```
|
233
240
|
|
234
|
-
|
235
|
-
`os.path` is your friend.
|
236
|
-
|
237
|
-
The command line update command can be used with a URL or local file you specify:
|
241
|
+
This also works via command line update:
|
238
242
|
|
239
243
|
```zsh
|
240
244
|
tldextract --update --suffix_list_url "http://foo.bar.baz"
|
241
245
|
```
|
242
246
|
|
243
|
-
|
244
|
-
list on first use, or if you are behind a complex
|
247
|
+
Using your own URLs could be useful in production when you don't want the delay
|
248
|
+
with updating the suffix list on first use, or if you are behind a complex
|
249
|
+
firewall.
|
250
|
+
|
251
|
+
You can also specify additional suffixes in the `extra_suffixes` param. These
|
252
|
+
will be merged into whatever public suffix definitions are already in use by
|
253
|
+
`tldextract`.
|
254
|
+
|
255
|
+
```python
|
256
|
+
extract = tldextract.TLDExtract(
|
257
|
+
extra_suffixes=["foo", "bar", "baz"])
|
258
|
+
```
|
245
259
|
|
246
260
|
## FAQ
|
247
261
|
|
@@ -250,9 +264,9 @@ list on first use, or if you are behind a complex firewall that prevents a simpl
|
|
250
264
|
This project doesn't contain an actual list of public suffixes. That comes from
|
251
265
|
[the Public Suffix List (PSL)](https://publicsuffix.org/). Submit amendments there.
|
252
266
|
|
253
|
-
|
267
|
+
In the meantime, you can tell tldextract about your exception by either
|
254
268
|
forking the PSL and using your fork in the `suffix_list_urls` param, or adding
|
255
|
-
your suffix piecemeal with the `extra_suffixes` param.
|
269
|
+
your suffix piecemeal with the `extra_suffixes` param.
|
256
270
|
|
257
271
|
### I see my suffix in [the Public Suffix List (PSL)](https://publicsuffix.org/), but this library doesn't extract it.
|
258
272
|
|
@@ -309,5 +323,5 @@ tox -e py311
|
|
309
323
|
Automatically format all code:
|
310
324
|
|
311
325
|
```zsh
|
312
|
-
|
326
|
+
ruff format .
|
313
327
|
```
|
tldextract-5.2.0.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+tldextract/.tld_set_snapshot,sha256=tpMVwIXVOXJyS48t8RH_wymwyE_gpH1iyMkWVcx3Sjg,318581
+tldextract/__init__.py,sha256=1n2QxAmFCFp3X1A5O46wJOTZqWM2ukshNkkG-TrOaLQ,274
+tldextract/__main__.py,sha256=oiZ5EW_lxRLH6Khk6MdzXf7a1Ld5-A3k4wOFRmNNk2o,89
+tldextract/_version.py,sha256=1-tO6tx4p9okXz3ScGW6YFdQDbS8ruoK2_y0riYBx7M,511
+tldextract/cache.py,sha256=nrT9VuLmrjHHFxj-Cai97IyUXXenCX6KbHi07mPkzMc,8289
+tldextract/cli.py,sha256=nCzBAFrgAopTK1t5eBRQgeveSgWheUx4LAlAHE_8mzQ,3010
+tldextract/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tldextract/remote.py,sha256=rovbxR73G8To-QRrA_cgSfk0S3k0_B2SyYefXiLHrQg,1940
+tldextract/suffix_list.py,sha256=ePH6iOIUBe0JE_mc07a34Nd7tFyfmHgP_mJkFhxzr7c,3947
+tldextract/tldextract.py,sha256=tM2Lrj0yclAulBueRxAK40bzhMl86Ftz4FHaFGyec7k,21454
+tldextract-5.2.0.dist-info/licenses/LICENSE,sha256=ZUrmz9cSprvhQmqmUdHIWbD51Cytv6PDTMlJLruTLuI,1527
+tldextract-5.2.0.dist-info/METADATA,sha256=LNhqdHD4eMtqA1DaxoVw0YpFq-XqW4_dtqc53bUAriM,11709
+tldextract-5.2.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+tldextract-5.2.0.dist-info/entry_points.txt,sha256=EStkXC80BetCMp1UDhU3kWuXBo3qDpgKltZTJ1x4x1U,51
+tldextract-5.2.0.dist-info/top_level.txt,sha256=DWZIjV49WP30tyC1KOEP7t-EaS4IRCXQzc0KXAOn_bk,11
+tldextract-5.2.0.dist-info/RECORD,,
tldextract-5.1.2.dist-info/RECORD
REMOVED
@@ -1,16 +0,0 @@
-tldextract/.tld_set_snapshot,sha256=TVya0bCcmRKl_16oPKPIlNmWS09rXrjOKGgYjhvAGLE,238022
-tldextract/__init__.py,sha256=rZg3DKzS9CTARuF4Tuq50ViILwUktDED89Av8nStNuM,216
-tldextract/__main__.py,sha256=oiZ5EW_lxRLH6Khk6MdzXf7a1Ld5-A3k4wOFRmNNk2o,89
-tldextract/_version.py,sha256=iJQJoAO8HGnLsPBpH1rkF4KPbrYxIqs4qAXfUgzhRqQ,411
-tldextract/cache.py,sha256=vsr4ERgNxmBO_mYwXLCMbRRKq1s-IDZZLXoaGIYXmBM,8601
-tldextract/cli.py,sha256=nCzBAFrgAopTK1t5eBRQgeveSgWheUx4LAlAHE_8mzQ,3010
-tldextract/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tldextract/remote.py,sha256=sklRFbATwPs_S33-KeIu9ixuSWP5w7QXO8jnhi_lgJs,1944
-tldextract/suffix_list.py,sha256=TcUpMTZwsicZn6_eHKqA4bjurQrKYde14P-4HT4s4yE,3896
-tldextract/tldextract.py,sha256=oUYLJcgWmeika0teDq2nNI5UCSbAR0c3eosYslVJPUY,18731
-tldextract-5.1.2.dist-info/LICENSE,sha256=dKIruBYZ9wJFoTWv8hvg2bhDv9TXDQ82u-0EERuGJYg,1527
-tldextract-5.1.2.dist-info/METADATA,sha256=dkiY2wl_8M2guJ0MGhGi0YQ9OgZI4vGpJ0I9LMLSGyQ,11464
-tldextract-5.1.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-tldextract-5.1.2.dist-info/entry_points.txt,sha256=EStkXC80BetCMp1UDhU3kWuXBo3qDpgKltZTJ1x4x1U,51
-tldextract-5.1.2.dist-info/top_level.txt,sha256=DWZIjV49WP30tyC1KOEP7t-EaS4IRCXQzc0KXAOn_bk,11
-tldextract-5.1.2.dist-info/RECORD,,
{tldextract-5.1.2.dist-info → tldextract-5.2.0.dist-info}/entry_points.txt
File without changes

{tldextract-5.1.2.dist-info → tldextract-5.2.0.dist-info}/top_level.txt
File without changes