tldextract 4.0.0__py3-none-any.whl → 5.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tldextract/_version.py +2 -2
- tldextract/cache.py +13 -21
- tldextract/cli.py +2 -2
- tldextract/suffix_list.py +1 -1
- tldextract/tldextract.py +21 -38
- {tldextract-4.0.0.dist-info → tldextract-5.0.0.dist-info}/METADATA +9 -27
- tldextract-5.0.0.dist-info/RECORD +16 -0
- tldextract-4.0.0.dist-info/RECORD +0 -16
- {tldextract-4.0.0.dist-info → tldextract-5.0.0.dist-info}/LICENSE +0 -0
- {tldextract-4.0.0.dist-info → tldextract-5.0.0.dist-info}/WHEEL +0 -0
- {tldextract-4.0.0.dist-info → tldextract-5.0.0.dist-info}/entry_points.txt +0 -0
- {tldextract-4.0.0.dist-info → tldextract-5.0.0.dist-info}/top_level.txt +0 -0
tldextract/_version.py
CHANGED
tldextract/cache.py
CHANGED
@@ -21,7 +21,7 @@ LOG = logging.getLogger(__name__)
|
|
21
21
|
|
22
22
|
_DID_LOG_UNABLE_TO_CACHE = False
|
23
23
|
|
24
|
-
T = TypeVar("T")
|
24
|
+
T = TypeVar("T")
|
25
25
|
|
26
26
|
|
27
27
|
def get_pkg_unique_identifier() -> str:
|
@@ -32,7 +32,6 @@ def get_pkg_unique_identifier() -> str:
|
|
32
32
|
a new version of tldextract
|
33
33
|
"""
|
34
34
|
try:
|
35
|
-
# pylint: disable=import-outside-toplevel
|
36
35
|
from tldextract._version import version
|
37
36
|
except ImportError:
|
38
37
|
version = "dev"
|
@@ -83,6 +82,7 @@ class DiskCache:
|
|
83
82
|
"""Disk _cache that only works for jsonable values."""
|
84
83
|
|
85
84
|
def __init__(self, cache_dir: str | None, lock_timeout: int = 20):
|
85
|
+
"""Construct a disk cache in the given directory."""
|
86
86
|
self.enabled = bool(cache_dir)
|
87
87
|
self.cache_dir = os.path.expanduser(str(cache_dir) or "")
|
88
88
|
self.lock_timeout = lock_timeout
|
@@ -99,14 +99,13 @@ class DiskCache:
|
|
99
99
|
if not os.path.isfile(cache_filepath):
|
100
100
|
raise KeyError("namespace: " + namespace + " key: " + repr(key))
|
101
101
|
try:
|
102
|
-
# pylint: disable-next=unspecified-encoding
|
103
102
|
with open(cache_filepath) as cache_file:
|
104
103
|
return json.load(cache_file)
|
105
104
|
except (OSError, ValueError) as exc:
|
106
105
|
LOG.error("error reading TLD cache file %s: %s", cache_filepath, exc)
|
107
106
|
raise KeyError("namespace: " + namespace + " key: " + repr(key)) from None
|
108
107
|
|
109
|
-
def set(
|
108
|
+
def set( # noqa: A003
|
110
109
|
self, namespace: str, key: str | dict[str, Hashable], value: object
|
111
110
|
) -> None:
|
112
111
|
"""Set a value in the disk cache."""
|
@@ -117,19 +116,16 @@ class DiskCache:
|
|
117
116
|
|
118
117
|
try:
|
119
118
|
_make_dir(cache_filepath)
|
120
|
-
# pylint: disable-next=unspecified-encoding
|
121
119
|
with open(cache_filepath, "w") as cache_file:
|
122
120
|
json.dump(value, cache_file)
|
123
121
|
except OSError as ioe:
|
124
|
-
global _DID_LOG_UNABLE_TO_CACHE
|
122
|
+
global _DID_LOG_UNABLE_TO_CACHE
|
125
123
|
if not _DID_LOG_UNABLE_TO_CACHE:
|
126
124
|
LOG.warning(
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
"set `cache_dir=None` to silence this warning. %s"
|
132
|
-
),
|
125
|
+
"unable to cache %s.%s in %s. This could refresh the "
|
126
|
+
"Public Suffix List over HTTP every app startup. "
|
127
|
+
"Construct your `TLDExtract` with a writable `cache_dir` or "
|
128
|
+
"set `cache_dir=None` to silence this warning. %s",
|
133
129
|
namespace,
|
134
130
|
key,
|
135
131
|
cache_filepath,
|
@@ -181,15 +177,13 @@ class DiskCache:
|
|
181
177
|
try:
|
182
178
|
_make_dir(cache_filepath)
|
183
179
|
except OSError as ioe:
|
184
|
-
global _DID_LOG_UNABLE_TO_CACHE
|
180
|
+
global _DID_LOG_UNABLE_TO_CACHE
|
185
181
|
if not _DID_LOG_UNABLE_TO_CACHE:
|
186
182
|
LOG.warning(
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
"set `cache_dir=None` to silence this warning. %s"
|
192
|
-
),
|
183
|
+
"unable to cache %s.%s in %s. This could refresh the "
|
184
|
+
"Public Suffix List over HTTP every app startup. "
|
185
|
+
"Construct your `TLDExtract` with a writable `cache_dir` or "
|
186
|
+
"set `cache_dir=None` to silence this warning. %s",
|
193
187
|
namespace,
|
194
188
|
key_args,
|
195
189
|
cache_filepath,
|
@@ -199,8 +193,6 @@ class DiskCache:
|
|
199
193
|
|
200
194
|
return func(**kwargs)
|
201
195
|
|
202
|
-
# Disable lint of 3rd party (see also https://github.com/tox-dev/py-filelock/issues/102)
|
203
|
-
# pylint: disable-next=abstract-class-instantiated
|
204
196
|
with FileLock(lock_path, timeout=self.lock_timeout):
|
205
197
|
try:
|
206
198
|
result = cast(T, self.get(namespace=namespace, key=key_args))
|
tldextract/cli.py
CHANGED
tldextract/suffix_list.py
CHANGED
@@ -19,7 +19,7 @@ PUBLIC_SUFFIX_RE = re.compile(r"^(?P<suffix>[.*!]*\w[\S]*)", re.UNICODE | re.MUL
|
|
19
19
|
PUBLIC_PRIVATE_SUFFIX_SEPARATOR = "// ===BEGIN PRIVATE DOMAINS==="
|
20
20
|
|
21
21
|
|
22
|
-
class SuffixListNotFound(LookupError):
|
22
|
+
class SuffixListNotFound(LookupError): # noqa: N818
|
23
23
|
"""A recoverable error while looking up a suffix list.
|
24
24
|
|
25
25
|
Recoverable because you can specify backups, or use this library's bundled
|
tldextract/tldextract.py
CHANGED
@@ -13,18 +13,6 @@ It does this via the Public Suffix List (PSL).
|
|
13
13
|
>>> tldextract.extract('http://www.worldbank.org.kg/') # Kyrgyzstan
|
14
14
|
ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg', is_private=False)
|
15
15
|
|
16
|
-
`ExtractResult` is a namedtuple, so it's simple to access the parts you want.
|
17
|
-
|
18
|
-
>>> ext = tldextract.extract('http://forums.bbc.co.uk')
|
19
|
-
>>> (ext.subdomain, ext.domain, ext.suffix)
|
20
|
-
('forums', 'bbc', 'co.uk')
|
21
|
-
>>> # rejoin subdomain and domain
|
22
|
-
>>> '.'.join(ext[:2])
|
23
|
-
'forums.bbc'
|
24
|
-
>>> # a common alias
|
25
|
-
>>> ext.registered_domain
|
26
|
-
'bbc.co.uk'
|
27
|
-
|
28
16
|
Note subdomain and suffix are _optional_. Not all URL-like inputs have a
|
29
17
|
subdomain or a valid suffix.
|
30
18
|
|
@@ -37,16 +25,13 @@ subdomain or a valid suffix.
|
|
37
25
|
>>> tldextract.extract('http://127.0.0.1:8080/deployed/')
|
38
26
|
ExtractResult(subdomain='', domain='127.0.0.1', suffix='', is_private=False)
|
39
27
|
|
40
|
-
|
41
|
-
or suffix were found:
|
28
|
+
To rejoin the original hostname, if it was indeed a valid, registered hostname:
|
42
29
|
|
43
|
-
>>> ext = tldextract.extract('http://127.0.0.1:8080/deployed/')
|
44
|
-
>>>
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
>>> '.'.join(part for part in ext[:3] if part)
|
49
|
-
'127.0.0.1'
|
30
|
+
>>> ext = tldextract.extract('http://forums.bbc.co.uk')
|
31
|
+
>>> ext.registered_domain
|
32
|
+
'bbc.co.uk'
|
33
|
+
>>> ext.fqdn
|
34
|
+
'forums.bbc.co.uk'
|
50
35
|
"""
|
51
36
|
|
52
37
|
from __future__ import annotations
|
@@ -55,10 +40,8 @@ import logging
|
|
55
40
|
import os
|
56
41
|
import urllib.parse
|
57
42
|
from collections.abc import Collection, Sequence
|
43
|
+
from dataclasses import dataclass
|
58
44
|
from functools import wraps
|
59
|
-
from typing import (
|
60
|
-
NamedTuple,
|
61
|
-
)
|
62
45
|
|
63
46
|
import idna
|
64
47
|
|
@@ -77,14 +60,17 @@ PUBLIC_SUFFIX_LIST_URLS = (
|
|
77
60
|
)
|
78
61
|
|
79
62
|
|
80
|
-
|
81
|
-
|
82
|
-
|
63
|
+
@dataclass(order=True)
|
64
|
+
class ExtractResult:
|
65
|
+
"""A URL's extracted subdomain, domain, and suffix.
|
66
|
+
|
67
|
+
Also contains metadata, like a flag that indicates if the URL has a private suffix.
|
68
|
+
"""
|
83
69
|
|
84
70
|
subdomain: str
|
85
71
|
domain: str
|
86
72
|
suffix: str
|
87
|
-
is_private: bool
|
73
|
+
is_private: bool
|
88
74
|
|
89
75
|
@property
|
90
76
|
def registered_domain(self) -> str:
|
@@ -111,9 +97,7 @@ class ExtractResult(NamedTuple):
|
|
111
97
|
''
|
112
98
|
"""
|
113
99
|
if self.suffix and (self.domain or self.is_private):
|
114
|
-
|
115
|
-
# pylint: disable-next=not-an-iterable,unsubscriptable-object
|
116
|
-
return ".".join(i for i in self[:3] if i)
|
100
|
+
return ".".join(i for i in (self.subdomain, self.domain, self.suffix) if i)
|
117
101
|
return ""
|
118
102
|
|
119
103
|
@property
|
@@ -164,8 +148,8 @@ class ExtractResult(NamedTuple):
|
|
164
148
|
class TLDExtract:
|
165
149
|
"""A callable for extracting, subdomain, domain, and suffix components from a URL."""
|
166
150
|
|
167
|
-
# TODO:
|
168
|
-
def __init__(
|
151
|
+
# TODO: too-many-arguments
|
152
|
+
def __init__(
|
169
153
|
self,
|
170
154
|
cache_dir: str | None = get_cache_dir(),
|
171
155
|
suffix_list_urls: Sequence[str] = PUBLIC_SUFFIX_LIST_URLS,
|
@@ -294,7 +278,7 @@ class TLDExtract:
|
|
294
278
|
and netloc_with_ascii_dots[-1] == "]"
|
295
279
|
):
|
296
280
|
if looks_like_ipv6(netloc_with_ascii_dots[1:-1]):
|
297
|
-
return ExtractResult("", netloc_with_ascii_dots, "")
|
281
|
+
return ExtractResult("", netloc_with_ascii_dots, "", is_private=False)
|
298
282
|
|
299
283
|
labels = netloc_with_ascii_dots.split(".")
|
300
284
|
|
@@ -371,6 +355,7 @@ class Trie:
|
|
371
355
|
def __init__(
|
372
356
|
self, matches: dict | None = None, end: bool = False, is_private: bool = False
|
373
357
|
) -> None:
|
358
|
+
"""TODO."""
|
374
359
|
self.matches = matches if matches else {}
|
375
360
|
self.end = end
|
376
361
|
self.is_private = is_private
|
@@ -411,16 +396,14 @@ class Trie:
|
|
411
396
|
|
412
397
|
|
413
398
|
@wraps(TLD_EXTRACTOR.__call__)
|
414
|
-
def extract( #
|
399
|
+
def extract( # noqa: D103
|
415
400
|
url: str, include_psl_private_domains: bool | None = False
|
416
401
|
) -> ExtractResult:
|
417
402
|
return TLD_EXTRACTOR(url, include_psl_private_domains=include_psl_private_domains)
|
418
403
|
|
419
404
|
|
420
405
|
@wraps(TLD_EXTRACTOR.update)
|
421
|
-
def update( # type: ignore[no-untyped-def]
|
422
|
-
*args, **kwargs
|
423
|
-
): # pylint: disable=missing-function-docstring
|
406
|
+
def update(*args, **kwargs): # type: ignore[no-untyped-def] # noqa: D103
|
424
407
|
return TLD_EXTRACTOR.update(*args, **kwargs)
|
425
408
|
|
426
409
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: tldextract
|
3
|
-
Version: 4.0.0
|
3
|
+
Version: 5.0.0
|
4
4
|
Summary: Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well.
|
5
5
|
Author-email: John Kurkowski <john.kurkowski@gmail.com>
|
6
6
|
License: BSD-3-Clause
|
@@ -10,12 +10,11 @@ Classifier: Development Status :: 5 - Production/Stable
|
|
10
10
|
Classifier: Topic :: Utilities
|
11
11
|
Classifier: License :: OSI Approved :: BSD License
|
12
12
|
Classifier: Programming Language :: Python :: 3
|
13
|
-
Classifier: Programming Language :: Python :: 3.7
|
14
13
|
Classifier: Programming Language :: Python :: 3.8
|
15
14
|
Classifier: Programming Language :: Python :: 3.9
|
16
15
|
Classifier: Programming Language :: Python :: 3.10
|
17
16
|
Classifier: Programming Language :: Python :: 3.11
|
18
|
-
Requires-Python: >=3.7
|
17
|
+
Requires-Python: >=3.8
|
19
18
|
Description-Content-Type: text/markdown
|
20
19
|
License-File: LICENSE
|
21
20
|
Requires-Dist: idna
|
@@ -56,20 +55,6 @@ ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False
|
|
56
55
|
ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg', is_private=False)
|
57
56
|
```
|
58
57
|
|
59
|
-
`ExtractResult` is a namedtuple, so it's simple to access the parts you want.
|
60
|
-
|
61
|
-
```python
|
62
|
-
>>> ext = tldextract.extract('http://forums.bbc.co.uk')
|
63
|
-
>>> (ext.subdomain, ext.domain, ext.suffix)
|
64
|
-
('forums', 'bbc', 'co.uk')
|
65
|
-
>>> # rejoin subdomain and domain
|
66
|
-
>>> '.'.join(ext[:2])
|
67
|
-
'forums.bbc'
|
68
|
-
>>> # a common alias
|
69
|
-
>>> ext.registered_domain
|
70
|
-
'bbc.co.uk'
|
71
|
-
```
|
72
|
-
|
73
58
|
Note subdomain and suffix are _optional_. Not all URL-like inputs have a
|
74
59
|
subdomain or a valid suffix.
|
75
60
|
|
@@ -84,17 +69,14 @@ ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='', is_privat
|
|
84
69
|
ExtractResult(subdomain='', domain='127.0.0.1', suffix='', is_private=False)
|
85
70
|
```
|
86
71
|
|
87
|
-
|
88
|
-
or suffix were found:
|
72
|
+
To rejoin the original hostname, if it was indeed a valid, registered hostname:
|
89
73
|
|
90
74
|
```python
|
91
|
-
>>> ext = tldextract.extract('http://127.0.0.1:8080/deployed/')
|
92
|
-
>>>
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
>>> '.'.join(part for part in ext[:3] if part)
|
97
|
-
'127.0.0.1'
|
75
|
+
>>> ext = tldextract.extract('http://forums.bbc.co.uk')
|
76
|
+
>>> ext.registered_domain
|
77
|
+
'bbc.co.uk'
|
78
|
+
>>> ext.fqdn
|
79
|
+
'forums.bbc.co.uk'
|
98
80
|
```
|
99
81
|
|
100
82
|
By default, this package supports the public ICANN TLDs and their exceptions.
|
@@ -303,7 +285,7 @@ Run all tests against a specific Python environment configuration:
|
|
303
285
|
|
304
286
|
```zsh
|
305
287
|
tox -l
|
306
|
-
tox -e py37
|
288
|
+
tox -e py311
|
307
289
|
```
|
308
290
|
|
309
291
|
### Code Style
|
@@ -0,0 +1,16 @@
|
|
1
|
+
tldextract/.tld_set_snapshot,sha256=TVya0bCcmRKl_16oPKPIlNmWS09rXrjOKGgYjhvAGLE,238022
|
2
|
+
tldextract/__init__.py,sha256=rZg3DKzS9CTARuF4Tuq50ViILwUktDED89Av8nStNuM,216
|
3
|
+
tldextract/__main__.py,sha256=FxfCNOozXSaJP2GTjgWLAn03oNMd_EUUOWkfT1_YRgM,90
|
4
|
+
tldextract/_version.py,sha256=U7HnWMtKn0QTFHRJAzsVjr4cELMq3Toi6P5afKP6ah0,411
|
5
|
+
tldextract/cache.py,sha256=kcSovX7j1V43s3gOuav8nlFl5Dgkl0O576H3_Tiqkc0,8323
|
6
|
+
tldextract/cli.py,sha256=F5w9Haz7rWdrgIgRwZJn04t7qRBQAHUKzQnYXwDUfLs,2465
|
7
|
+
tldextract/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
|
+
tldextract/remote.py,sha256=dpLz-s-1AP4Ai4XPVQe-uT2Nmev8CZEMKURdqGw5XiA,2550
|
9
|
+
tldextract/suffix_list.py,sha256=3N8jlmFY-EbQ-kxT2iTryFpCCeGqBUm9NiUXKiAbaOY,3443
|
10
|
+
tldextract/tldextract.py,sha256=2AxRAWtT70jNSPcvBdk7FlrksUwbLDOXM9W8eaB13Bg,17585
|
11
|
+
tldextract-5.0.0.dist-info/LICENSE,sha256=oqlDTqZaKpeJ6jYsQYqTkmV8gGGg-o7cO_OnH79KjsE,1522
|
12
|
+
tldextract-5.0.0.dist-info/METADATA,sha256=pwH-aKifyF_J6gDZ-o6nPDHBSQtll7zuK1v_ceH4YCQ,10739
|
13
|
+
tldextract-5.0.0.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
|
14
|
+
tldextract-5.0.0.dist-info/entry_points.txt,sha256=EStkXC80BetCMp1UDhU3kWuXBo3qDpgKltZTJ1x4x1U,51
|
15
|
+
tldextract-5.0.0.dist-info/top_level.txt,sha256=DWZIjV49WP30tyC1KOEP7t-EaS4IRCXQzc0KXAOn_bk,11
|
16
|
+
tldextract-5.0.0.dist-info/RECORD,,
|
@@ -1,16 +0,0 @@
|
|
1
|
-
tldextract/.tld_set_snapshot,sha256=TVya0bCcmRKl_16oPKPIlNmWS09rXrjOKGgYjhvAGLE,238022
|
2
|
-
tldextract/__init__.py,sha256=rZg3DKzS9CTARuF4Tuq50ViILwUktDED89Av8nStNuM,216
|
3
|
-
tldextract/__main__.py,sha256=FxfCNOozXSaJP2GTjgWLAn03oNMd_EUUOWkfT1_YRgM,90
|
4
|
-
tldextract/_version.py,sha256=TgVqVkMXXQVomuTpZfj8uxnyooVWsiw-3pM8cC2qwwE,411
|
5
|
-
tldextract/cache.py,sha256=_hUjP-cw4BpR2TG-_XAD6YL1pQMNe64b4O0nVWMVLAY,8790
|
6
|
-
tldextract/cli.py,sha256=5BMCp-DjY3_-KlZ1wnpycHrUHzaqLL4r4TiHl2-xiCU,2478
|
7
|
-
tldextract/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
|
-
tldextract/remote.py,sha256=dpLz-s-1AP4Ai4XPVQe-uT2Nmev8CZEMKURdqGw5XiA,2550
|
9
|
-
tldextract/suffix_list.py,sha256=W797R-PLIw_8yBPnopoMTimPiv1P1csh9Wcs-_E4Ous,3429
|
10
|
-
tldextract/tldextract.py,sha256=kHtO_xVbSgOEsLU526gxHa5Mz5Pjxe6fP-xvJmzWosM,18272
|
11
|
-
tldextract-4.0.0.dist-info/LICENSE,sha256=oqlDTqZaKpeJ6jYsQYqTkmV8gGGg-o7cO_OnH79KjsE,1522
|
12
|
-
tldextract-4.0.0.dist-info/METADATA,sha256=MYr1oUzvOfV47wNqn276x-7NxY0gTfm-9S__ltSLbMI,11260
|
13
|
-
tldextract-4.0.0.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
|
14
|
-
tldextract-4.0.0.dist-info/entry_points.txt,sha256=EStkXC80BetCMp1UDhU3kWuXBo3qDpgKltZTJ1x4x1U,51
|
15
|
-
tldextract-4.0.0.dist-info/top_level.txt,sha256=DWZIjV49WP30tyC1KOEP7t-EaS4IRCXQzc0KXAOn_bk,11
|
16
|
-
tldextract-4.0.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|