tldextract 5.0.1__tar.gz → 5.1.1__tar.gz
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- tldextract-5.1.1/.github/workflows/ci.yml +40 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/CHANGELOG.md +17 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/PKG-INFO +17 -6
- {tldextract-5.0.1 → tldextract-5.1.1}/README.md +3 -4
- {tldextract-5.0.1 → tldextract-5.1.1}/pyproject.toml +18 -2
- {tldextract-5.0.1 → tldextract-5.1.1}/tests/cli_test.py +23 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/tests/custom_suffix_test.py +7 -3
- {tldextract-5.0.1 → tldextract-5.1.1}/tests/main_test.py +29 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/tests/test_cache.py +5 -6
- {tldextract-5.0.1 → tldextract-5.1.1}/tests/test_parallel.py +25 -8
- {tldextract-5.0.1 → tldextract-5.1.1}/tldextract/_version.py +2 -2
- {tldextract-5.0.1 → tldextract-5.1.1}/tldextract/cache.py +8 -8
- {tldextract-5.0.1 → tldextract-5.1.1}/tldextract/cli.py +21 -2
- {tldextract-5.0.1 → tldextract-5.1.1}/tldextract/suffix_list.py +17 -2
- {tldextract-5.0.1 → tldextract-5.1.1}/tldextract/tldextract.py +51 -16
- {tldextract-5.0.1 → tldextract-5.1.1}/tldextract.egg-info/PKG-INFO +17 -6
- {tldextract-5.0.1 → tldextract-5.1.1}/tldextract.egg-info/SOURCES.txt +1 -1
- tldextract-5.1.1/tldextract.egg-info/requires.txt +16 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/tox.ini +5 -18
- tldextract-5.0.1/.travis.yml +0 -21
- tldextract-5.0.1/tldextract.egg-info/requires.txt +0 -4
- {tldextract-5.0.1 → tldextract-5.1.1}/.github/FUNDING.yml +0 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/.gitignore +0 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/LICENSE +0 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/setup.cfg +0 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/tests/__init__.py +0 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/tests/conftest.py +0 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/tests/fixtures/fake_suffix_list_fixture.dat +0 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/tests/integration_test.py +0 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/tests/test_trie.py +0 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/tldextract/.tld_set_snapshot +0 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/tldextract/__init__.py +0 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/tldextract/__main__.py +0 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/tldextract/py.typed +0 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/tldextract/remote.py +0 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/tldextract.egg-info/dependency_links.txt +0 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/tldextract.egg-info/entry_points.txt +0 -0
- {tldextract-5.0.1 → tldextract-5.1.1}/tldextract.egg-info/top_level.txt +0 -0
tldextract-5.1.1/.github/workflows/ci.yml
@@ -0,0 +1,40 @@
+name: build
+on: [push, pull_request]
+jobs:
+  test:
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [macos-latest, windows-latest, ubuntu-latest]
+        language:
+          [
+            {python-version: "3.8", toxenv: "py38"},
+            {python-version: "3.9", toxenv: "py39"},
+            {python-version: "3.10", toxenv: "py310"},
+            {python-version: "3.11", toxenv: "py311"},
+            {python-version: "3.12", toxenv: "py312"},
+            {python-version: "pypy3.8", toxenv: "pypy38"},
+          ]
+        include:
+          - os: ubuntu-latest
+            language: {python-version: "3.8", toxenv: "codestyle"}
+          - os: ubuntu-latest
+            language: {python-version: "3.8", toxenv: "lint"}
+          - os: ubuntu-latest
+            language: {python-version: "3.8", toxenv: "typecheck"}
+    runs-on: ${{ matrix.os }}
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.language.python-version }}
+      - name: Install Python requirements
+        run: |
+          pip install --upgrade pip
+          pip install --upgrade --editable '.[testing]'
+      - name: Test
+        run: tox
+        env:
+          TOXENV: ${{ matrix.language.toxenv }}

{tldextract-5.0.1 → tldextract-5.1.1}/CHANGELOG.md
@@ -3,6 +3,23 @@
 After upgrading, update your cache file by deleting it or via `tldextract
 --update`.
 
+## 5.1.1 (2023-11-16)
+
+* Bugfixes
+  * Fix path join on Windows ([#314](https://github.com/john-kurkowski/tldextract/issues/314))
+  * Support Python 3.12
+
+## 5.1.0 (2023-11-05)
+
+* Features
+  * Allow passing in `requests.Session` ([#311](https://github.com/john-kurkowski/tldextract/issues/311))
+  * Add "-j, --json" option to support output in json format ([#313](https://github.com/john-kurkowski/tldextract/issues/313))
+* Docs
+  * Improve clarity of absolute path ([#312](https://github.com/john-kurkowski/tldextract/issues/312))
+* Misc.
+  * Extract all testing deps from tox.ini to pyproject.toml extras ([#310](https://github.com/john-kurkowski/tldextract/issues/310))
+  * Work around responses type union error, in tests
+
 ## 5.0.1 (2023-10-17)
 
 * Bugfixes

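For orientation, here is a minimal sketch of the two 5.1.0 features called out above, the optional `requests.Session` parameter and JSON output. It is illustrative only and assumes tldextract 5.1.x and requests are installed:

```python
# Illustrative sketch, not part of the diff. Assumes tldextract >= 5.1.0.
import requests

import tldextract

# A preconfigured session (proxies, headers, retries, ...) is used for the
# remote Public Suffix List fetch when the cache is cold.
with requests.Session() as session:
    extractor = tldextract.TLDExtract()
    result = extractor("http://forums.bbc.co.uk/", session=session)
    print(result.registered_domain)  # bbc.co.uk
```

The matching CLI addition is the `-j`/`--json` flag (e.g. `tldextract --json www.bbc.co.uk`), shown in the `tldextract/cli.py` and `tests/cli_test.py` hunks below.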
{tldextract-5.0.1 → tldextract-5.1.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tldextract
-Version: 5.0.1
+Version: 5.1.1
 Summary: Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well.
 Author-email: John Kurkowski <john.kurkowski@gmail.com>
 License: BSD-3-Clause
@@ -14,6 +14,7 @@ Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
@@ -21,8 +22,19 @@ Requires-Dist: idna
 Requires-Dist: requests>=2.1.0
 Requires-Dist: requests-file>=1.4
 Requires-Dist: filelock>=3.0.8
-
-
+Provides-Extra: testing
+Requires-Dist: black; extra == "testing"
+Requires-Dist: mypy; extra == "testing"
+Requires-Dist: pytest; extra == "testing"
+Requires-Dist: pytest-gitignore; extra == "testing"
+Requires-Dist: pytest-mock; extra == "testing"
+Requires-Dist: responses; extra == "testing"
+Requires-Dist: ruff; extra == "testing"
+Requires-Dist: tox; extra == "testing"
+Requires-Dist: types-filelock; extra == "testing"
+Requires-Dist: types-requests; extra == "testing"
+
+# tldextract [](https://badge.fury.io/py/tldextract) [](https://github.com/john-kurkowski/tldextract/actions/workflows/ci.yml)
 
 `tldextract` accurately separates a URL's subdomain, domain, and public suffix,
 using [the Public Suffix List (PSL)](https://publicsuffix.org).
@@ -210,7 +222,7 @@ If you want to use input data from your local filesystem, just use the `file://`
 
 ```python
 extract = tldextract.TLDExtract(
-    suffix_list_urls=["file://absolute/path/to/your/local/suffix/list/file"],
+    suffix_list_urls=["file://" + "/absolute/path/to/your/local/suffix/list/file"],
     cache_dir='/path/to/your/cache/',
     fallback_to_snapshot=False)
 ```
@@ -271,7 +283,7 @@ receiving exceptions or error metadata on results.
 
 1. `git clone` this repository.
 2. Change into the new directory.
-3. `pip install
+3. `pip install --upgrade --editable '.[testing]'`
 
 ### Running the test suite
 
@@ -293,6 +305,5 @@ tox -e py311
 Automatically format all code:
 
 ```zsh
-pip install black
 black .
 ```

{tldextract-5.0.1 → tldextract-5.1.1}/README.md
@@ -1,4 +1,4 @@
-# tldextract [](https://badge.fury.io/py/tldextract) [](https://badge.fury.io/py/tldextract) [](https://github.com/john-kurkowski/tldextract/actions/workflows/ci.yml)
 
 `tldextract` accurately separates a URL's subdomain, domain, and public suffix,
 using [the Public Suffix List (PSL)](https://publicsuffix.org).
@@ -186,7 +186,7 @@ If you want to use input data from your local filesystem, just use the `file://`
 
 ```python
 extract = tldextract.TLDExtract(
-    suffix_list_urls=["file://absolute/path/to/your/local/suffix/list/file"],
+    suffix_list_urls=["file://" + "/absolute/path/to/your/local/suffix/list/file"],
     cache_dir='/path/to/your/cache/',
     fallback_to_snapshot=False)
 ```
@@ -247,7 +247,7 @@ receiving exceptions or error metadata on results.
 
 1. `git clone` this repository.
 2. Change into the new directory.
-3. `pip install
+3. `pip install --upgrade --editable '.[testing]'`
 
 ### Running the test suite
 
@@ -269,6 +269,5 @@ tox -e py311
 Automatically format all code:
 
 ```zsh
-pip install black
 black .
 ```

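The corrected README example above concatenates `"file://"` with an absolute path. As a hedged alternative sketch, the same URL can be built with `pathlib`, which is how the updated `tests/custom_suffix_test.py` below does it; the paths here are placeholders:

```python
# Sketch only; the paths are placeholders.
from pathlib import Path

import tldextract

# as_uri() yields "file:///absolute/path/..." with the required third slash.
suffix_list_url = Path("/absolute/path/to/your/local/suffix/list/file").as_uri()

extract = tldextract.TLDExtract(
    suffix_list_urls=[suffix_list_url],
    cache_dir="/path/to/your/cache/",
    fallback_to_snapshot=False,
)
```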
{tldextract-5.0.1 → tldextract-5.1.1}/pyproject.toml
@@ -27,16 +27,32 @@ classifiers = [
   "Programming Language :: Python :: 3.9",
   "Programming Language :: Python :: 3.10",
   "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
 ]
 requires-python = ">=3.8"
+dynamic = ["version"]
+readme = "README.md"
+
 dependencies = [
   "idna",
   "requests>=2.1.0",
   "requests-file>=1.4",
   "filelock>=3.0.8",
 ]
-
-
+
+[project.optional-dependencies]
+testing = [
+  "black",
+  "mypy",
+  "pytest",
+  "pytest-gitignore",
+  "pytest-mock",
+  "responses",
+  "ruff",
+  "tox",
+  "types-filelock",
+  "types-requests",
+]
 
 [project.urls]
 Homepage = "https://github.com/john-kurkowski/tldextract"

{tldextract-5.0.1 → tldextract-5.1.1}/tests/cli_test.py
@@ -1,5 +1,6 @@
 """tldextract integration tests."""
 
+import json
 import sys
 
 import pytest
@@ -63,3 +64,25 @@ def test_cli_namedargs(
     stdout, stderr = capsys.readouterr()
     assert not stderr
     assert stdout == " example com\n bbc co.uk\nforums bbc co.uk\n"
+
+
+def test_cli_json_output(
+    capsys: pytest.CaptureFixture[str], monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Test CLI with --json option."""
+    monkeypatch.setattr(sys, "argv", ["tldextract", "--json", "www.bbc.co.uk"])
+
+    main()
+
+    stdout, stderr = capsys.readouterr()
+    assert not stderr
+    assert json.loads(stdout) == {
+        "subdomain": "www",
+        "domain": "bbc",
+        "suffix": "co.uk",
+        "fqdn": "www.bbc.co.uk",
+        "ipv4": "",
+        "ipv6": "",
+        "is_private": False,
+        "registered_domain": "bbc.co.uk",
+    }

{tldextract-5.0.1 → tldextract-5.1.1}/tests/custom_suffix_test.py
@@ -2,13 +2,17 @@
 
 import os
 import tempfile
+from pathlib import Path
 
 import tldextract
 from tldextract.tldextract import ExtractResult
 
-FAKE_SUFFIX_LIST_URL =
-os.path.dirname(os.path.abspath(__file__)),
-
+FAKE_SUFFIX_LIST_URL = Path(
+    os.path.dirname(os.path.abspath(__file__)),
+    "fixtures",
+    "fake_suffix_list_fixture.dat",
+).as_uri()
+
 EXTRA_SUFFIXES = ["foo1", "bar1", "baz1"]
 
 extract_using_fake_suffix_list = tldextract.TLDExtract(

{tldextract-5.0.1 → tldextract-5.1.1}/tests/main_test.py
@@ -8,6 +8,7 @@ import tempfile
 from collections.abc import Sequence
 from pathlib import Path
 from typing import Any
+from unittest.mock import Mock
 
 import pytest
 import pytest_mock
@@ -449,6 +450,34 @@ def test_cache_timeouts(tmp_path: Path) -> None:
         tldextract.suffix_list.find_first_response(cache, [server], 5)
 
 
+@responses.activate
+def test_find_first_response_without_session(tmp_path: Path) -> None:
+    """Test it is able to find first response without session passed in."""
+    server = "http://some-server.com"
+    response_text = "server response"
+    responses.add(responses.GET, server, status=200, body=response_text)
+    cache = DiskCache(str(tmp_path))
+
+    result = tldextract.suffix_list.find_first_response(cache, [server], 5)
+    assert result == response_text
+
+
+def test_find_first_response_with_session(tmp_path: Path) -> None:
+    """Test it is able to find first response with passed in session."""
+    server = "http://some-server.com"
+    response_text = "server response"
+    cache = DiskCache(str(tmp_path))
+    mock_session = Mock()
+    mock_session.get.return_value.text = response_text
+
+    result = tldextract.suffix_list.find_first_response(
+        cache, [server], 5, mock_session
+    )
+    assert result == response_text
+    mock_session.get.assert_called_once_with(server, timeout=5)
+    mock_session.close.assert_not_called()
+
+
 def test_include_psl_private_domain_attr() -> None:
     """Test private domains, which default to not being treated differently."""
     extract_private = tldextract.TLDExtract(include_psl_private_domains=True)

{tldextract-5.0.1 → tldextract-5.1.1}/tests/test_cache.py
@@ -1,7 +1,6 @@
 """Test the caching functionality."""
 from __future__ import annotations
 
-import os.path
 import sys
 import types
 from collections.abc import Hashable
@@ -56,14 +55,14 @@ def test_get_cache_dir(monkeypatch: pytest.MonkeyPatch) -> None:
     monkeypatch.delenv("HOME", raising=False)
     monkeypatch.delenv("XDG_CACHE_HOME", raising=False)
     monkeypatch.delenv("TLDEXTRACT_CACHE", raising=False)
-    assert get_cache_dir().endswith("tldextract
+    assert get_cache_dir().endswith(str(Path("tldextract", ".suffix_cache")))
 
     # with home set, but not anything else specified, use XDG_CACHE_HOME default
     monkeypatch.setenv("HOME", "/home/john")
     monkeypatch.delenv("XDG_CACHE_HOME", raising=False)
     monkeypatch.delenv("TLDEXTRACT_CACHE", raising=False)
-    assert get_cache_dir() ==
-        "/home/john", ".cache/python-tldextract", pkg_identifier
+    assert get_cache_dir() == str(
+        Path("/home/john", ".cache/python-tldextract", pkg_identifier)
     )
 
     # if XDG_CACHE_HOME is set, use it
@@ -71,8 +70,8 @@ def test_get_cache_dir(monkeypatch: pytest.MonkeyPatch) -> None:
     monkeypatch.setenv("XDG_CACHE_HOME", "/my/alt/cache")
     monkeypatch.delenv("TLDEXTRACT_CACHE", raising=False)
 
-    assert get_cache_dir() ==
-        "/my/alt/cache/python-tldextract", pkg_identifier
+    assert get_cache_dir() == str(
+        Path("/my/alt/cache/python-tldextract", pkg_identifier)
     )
 
     # if TLDEXTRACT_CACHE is set, use it

{tldextract-5.0.1 → tldextract-5.1.1}/tests/test_parallel.py
@@ -1,6 +1,8 @@
 """Test ability to run in parallel with shared cache."""
+
+from __future__ import annotations
+
 import os
-import os.path
 from multiprocessing import Pool
 from pathlib import Path
 
@@ -19,14 +21,15 @@ def test_multiprocessing_makes_one_request(tmp_path: Path) -> None:
     assert sum(http_request_counts) == 1
 
 
-@responses.activate
 def _run_extractor(cache_dir: Path) -> int:
     """Run the extractor."""
-    responses.
-
+    with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps:
+        rsps.add(responses.GET, PUBLIC_SUFFIX_LIST_URLS[0], status=208, body="uk.co")
+        extract = TLDExtract(cache_dir=str(cache_dir))
 
-
-
+        extract("bar.uk.com", include_psl_private_domains=True)
+        num_calls = len(rsps.calls)
+        return num_calls
 
 
 @responses.activate
@@ -41,9 +44,23 @@ def test_cache_cleared_by_other_process(
     extract("google.com")
     orig_unlink = os.unlink
 
-    def
+    def is_relative_to(path: Path, other_path: str | Path) -> bool:
+        """Return True if path is relative to other_path or False.
+
+        Taken from the Python 3.9 standard library.
+        Reference: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.is_relative_to
+        """
+        try:
+            path.relative_to(other_path)
+            return True
+        except ValueError:
+            return False
+
+    def evil_unlink(filename: str | Path) -> None:
         """Simulate someone deletes the file right before we try to."""
-        if filename.startswith(cache_dir)
+        if (isinstance(filename, str) and filename.startswith(cache_dir)) or (
+            isinstance(filename, Path) and is_relative_to(filename, cache_dir)
+        ):
             orig_unlink(filename)
         orig_unlink(filename)
 

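The `is_relative_to` helper added above backports `pathlib.PurePath.is_relative_to` (new in Python 3.9) for the Python 3.8 builds in the test matrix. A standalone sketch of the same idea:

```python
# Standalone sketch of the backport above. Path.relative_to() raises ValueError
# when the path lies outside the base, so the helper turns that into a boolean.
from __future__ import annotations

from pathlib import Path


def is_relative_to(path: Path, other_path: str | Path) -> bool:
    try:
        path.relative_to(other_path)
        return True
    except ValueError:
        return False


print(is_relative_to(Path("/tmp/cache/file.json"), "/tmp/cache"))  # True
print(is_relative_to(Path("/etc/passwd"), "/tmp/cache"))  # False
```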
{tldextract-5.0.1 → tldextract-5.1.1}/tldextract/cache.py
@@ -6,9 +6,9 @@ import hashlib
 import json
 import logging
 import os
-import os.path
 import sys
 from collections.abc import Callable, Hashable, Iterable
+from pathlib import Path
 from typing import (
     TypeVar,
     cast,
@@ -79,15 +79,15 @@ def get_cache_dir() -> str:
     if xdg_cache_home is None:
         user_home = os.getenv("HOME", None)
         if user_home:
-            xdg_cache_home =
+            xdg_cache_home = str(Path(user_home, ".cache"))
 
     if xdg_cache_home is not None:
-        return
-            xdg_cache_home, "python-tldextract", get_pkg_unique_identifier()
+        return str(
+            Path(xdg_cache_home, "python-tldextract", get_pkg_unique_identifier())
        )
 
     # fallback to trying to use package directory itself
-    return
+    return str(Path(os.path.dirname(__file__), ".suffix_cache"))
 
 
 class DiskCache:
@@ -153,7 +153,7 @@ class DiskCache:
                     self.file_ext + ".lock"
                 ):
                     try:
-                        os.unlink(
+                        os.unlink(str(Path(root, filename)))
                     except FileNotFoundError:
                         pass
                     except OSError as exc:
@@ -165,10 +165,10 @@ class DiskCache:
     def _key_to_cachefile_path(
         self, namespace: str, key: str | dict[str, Hashable]
     ) -> str:
-        namespace_path =
+        namespace_path = str(Path(self.cache_dir, namespace))
        hashed_key = _make_cache_key(key)
 
-        cache_path =
+        cache_path = str(Path(namespace_path, hashed_key + self.file_ext))
 
         return cache_path
 

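The cache.py hunks above replace string-based path joins with `pathlib.Path(...)` wrapped in `str()`, part of the Windows path-join fix noted in the 5.1.1 changelog (#314). A small illustration of the construct, with hypothetical path components:

```python
# Illustration only; the components are hypothetical.
from pathlib import Path

cache_dir = "/home/john/.cache/python-tldextract"
namespace = "publicsuffix.org-tlds"
hashed_key = "deadbeef"
file_ext = ".tldextract.json"

# Path() joins components with the platform's native separator; str() converts
# back for code that expects plain string paths.
cache_path = str(Path(cache_dir, namespace, hashed_key + file_ext))
print(cache_path)
```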
{tldextract-5.0.1 → tldextract-5.1.1}/tldextract/cli.py
@@ -1,7 +1,8 @@
 """tldextract CLI."""
 
-
 import argparse
+import dataclasses
+import json
 import logging
 import os.path
 import pathlib
@@ -22,6 +23,13 @@ def main() -> None:
     parser.add_argument(
         "--version", action="version", version="%(prog)s " + __version__
     )
+    parser.add_argument(
+        "-j",
+        "--json",
+        default=False,
+        action="store_true",
+        help="output in json format",
+    )
     parser.add_argument(
         "input", metavar="fqdn|url", type=str, nargs="*", help="fqdn or url"
     )
@@ -89,4 +97,15 @@ def main() -> None:
 
     for i in args.input:
         ext = tld_extract(i)
-
+        if args.json:
+            properties = ("fqdn", "ipv4", "ipv6", "registered_domain")
+            print(
+                json.dumps(
+                    {
+                        **dataclasses.asdict(ext),
+                        **{prop: getattr(ext, prop) for prop in properties},
+                    }
+                )
+            )
+        else:
+            print(f"{ext.subdomain} {ext.domain} {ext.suffix}")

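The `--json` branch above serializes the `ExtractResult` dataclass plus a few computed properties. A rough sketch of equivalent output built through the public API:

```python
# Rough sketch of what the new --json branch prints; not part of the diff.
import dataclasses
import json

import tldextract

ext = tldextract.extract("http://www.bbc.co.uk")
properties = ("fqdn", "ipv4", "ipv6", "registered_domain")
print(
    json.dumps(
        {
            **dataclasses.asdict(ext),  # subdomain, domain, suffix, is_private
            **{prop: getattr(ext, prop) for prop in properties},
        }
    )
)
```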
{tldextract-5.0.1 → tldextract-5.1.1}/tldextract/suffix_list.py
@@ -31,11 +31,16 @@ def find_first_response(
     cache: DiskCache,
     urls: Sequence[str],
     cache_fetch_timeout: float | int | None = None,
+    session: requests.Session | None = None,
 ) -> str:
     """Decode the first successfully fetched URL, from UTF-8 encoding to Python unicode."""
-
+    session_created = False
+    if session is None:
+        session = requests.Session()
         session.mount("file://", FileAdapter())
+        session_created = True
 
+    try:
         for url in urls:
             try:
                 return cache.cached_fetch_url(
@@ -43,6 +48,11 @@
                 )
             except requests.exceptions.RequestException:
                 LOG.exception("Exception reading Public Suffix List url %s", url)
+    finally:
+        # Ensure the session is always closed if it's constructed in the method
+        if session_created:
+            session.close()
+
     raise SuffixListNotFound(
         "No remote Public Suffix List found. Consider using a mirror, or avoid this"
         " fetch by constructing your TLDExtract with `suffix_list_urls=()`."
@@ -65,6 +75,7 @@ def get_suffix_lists(
     urls: Sequence[str],
     cache_fetch_timeout: float | int | None,
     fallback_to_snapshot: bool,
+    session: requests.Session | None = None,
 ) -> tuple[list[str], list[str]]:
     """Fetch, parse, and cache the suffix lists."""
     return cache.run_and_cache(
@@ -75,6 +86,7 @@
             "urls": urls,
             "cache_fetch_timeout": cache_fetch_timeout,
             "fallback_to_snapshot": fallback_to_snapshot,
+            "session": session,
         },
         hashed_argnames=["urls", "fallback_to_snapshot"],
     )
@@ -85,10 +97,13 @@ def _get_suffix_lists(
     urls: Sequence[str],
     cache_fetch_timeout: float | int | None,
     fallback_to_snapshot: bool,
+    session: requests.Session | None = None,
 ) -> tuple[list[str], list[str]]:
     """Fetch, parse, and cache the suffix lists."""
     try:
-        text = find_first_response(
+        text = find_first_response(
+            cache, urls, cache_fetch_timeout=cache_fetch_timeout, session=session
+        )
     except SuffixListNotFound as exc:
         if fallback_to_snapshot:
             maybe_pkg_data = pkgutil.get_data("tldextract", ".tld_set_snapshot")

{tldextract-5.0.1 → tldextract-5.1.1}/tldextract/tldextract.py
@@ -44,6 +44,7 @@ from dataclasses import dataclass
 from functools import wraps
 
 import idna
+import requests
 
 from .cache import DiskCache, get_cache_dir
 from .remote import lenient_netloc, looks_like_ip, looks_like_ipv6
@@ -221,13 +222,19 @@ class TLDExtract:
         self._cache = DiskCache(cache_dir)
 
     def __call__(
-        self,
+        self,
+        url: str,
+        include_psl_private_domains: bool | None = None,
+        session: requests.Session | None = None,
     ) -> ExtractResult:
         """Alias for `extract_str`."""
-        return self.extract_str(url, include_psl_private_domains)
+        return self.extract_str(url, include_psl_private_domains, session=session)
 
     def extract_str(
-        self,
+        self,
+        url: str,
+        include_psl_private_domains: bool | None = None,
+        session: requests.Session | None = None,
     ) -> ExtractResult:
         """Take a string URL and splits it into its subdomain, domain, and suffix components.
 
@@ -238,13 +245,27 @@
         ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False)
         >>> extractor.extract_str('http://forums.bbc.co.uk/')
         ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False)
+
+        Allows configuring the HTTP request via the optional `session`
+        parameter. For example, if you need to use a HTTP proxy. See also
+        `requests.Session`.
+
+        >>> import requests
+        >>> session = requests.Session()
+        >>> # customize your session here
+        >>> with session:
+        ...     extractor.extract_str("http://forums.news.cnn.com/", session=session)
+        ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False)
         """
-        return self._extract_netloc(
+        return self._extract_netloc(
+            lenient_netloc(url), include_psl_private_domains, session=session
+        )
 
     def extract_urllib(
         self,
         url: urllib.parse.ParseResult | urllib.parse.SplitResult,
         include_psl_private_domains: bool | None = None,
+        session: requests.Session | None = None,
     ) -> ExtractResult:
         """Take the output of urllib.parse URL parsing methods and further splits the parsed URL.
 
@@ -260,10 +281,15 @@
         >>> extractor.extract_urllib(urllib.parse.urlsplit('http://forums.bbc.co.uk/'))
         ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False)
         """
-        return self._extract_netloc(
+        return self._extract_netloc(
+            url.netloc, include_psl_private_domains, session=session
+        )
 
     def _extract_netloc(
-        self,
+        self,
+        netloc: str,
+        include_psl_private_domains: bool | None,
+        session: requests.Session | None = None,
     ) -> ExtractResult:
         netloc_with_ascii_dots = (
             netloc.replace("\u3002", "\u002e")
@@ -282,9 +308,9 @@
 
         labels = netloc_with_ascii_dots.split(".")
 
-        suffix_index, is_private = self._get_tld_extractor(
-
-        )
+        suffix_index, is_private = self._get_tld_extractor(
+            session=session
+        ).suffix_index(labels, include_psl_private_domains=include_psl_private_domains)
 
         num_ipv4_labels = 4
         if suffix_index == len(labels) == num_ipv4_labels and looks_like_ip(
@@ -297,23 +323,27 @@
         domain = labels[suffix_index - 1] if suffix_index else ""
         return ExtractResult(subdomain, domain, suffix, is_private)
 
-    def update(
+    def update(
+        self, fetch_now: bool = False, session: requests.Session | None = None
+    ) -> None:
         """Force fetch the latest suffix list definitions."""
         self._extractor = None
         self._cache.clear()
         if fetch_now:
-            self._get_tld_extractor()
+            self._get_tld_extractor(session=session)
 
     @property
-    def tlds(self) -> list[str]:
+    def tlds(self, session: requests.Session | None = None) -> list[str]:
         """
         Returns the list of tld's used by default.
 
         This will vary based on `include_psl_private_domains` and `extra_suffixes`
         """
-        return list(self._get_tld_extractor().tlds())
+        return list(self._get_tld_extractor(session=session).tlds())
 
-    def _get_tld_extractor(
+    def _get_tld_extractor(
+        self, session: requests.Session | None = None
+    ) -> _PublicSuffixListTLDExtractor:
         """Get or compute this object's TLDExtractor.
 
         Looks up the TLDExtractor in roughly the following order, based on the
@@ -332,6 +362,7 @@ class TLDExtract:
             urls=self.suffix_list_urls,
             cache_fetch_timeout=self.cache_fetch_timeout,
             fallback_to_snapshot=self.fallback_to_snapshot,
+            session=session,
         )
 
         if not any([public_tlds, private_tlds, self.extra_suffixes]):
@@ -400,9 +431,13 @@ class Trie:
 
 @wraps(TLD_EXTRACTOR.__call__)
 def extract(  # noqa: D103
-    url: str,
+    url: str,
+    include_psl_private_domains: bool | None = False,
+    session: requests.Session | None = None,
 ) -> ExtractResult:
-    return TLD_EXTRACTOR(
+    return TLD_EXTRACTOR(
+        url, include_psl_private_domains=include_psl_private_domains, session=session
+    )
 
 
 @wraps(TLD_EXTRACTOR.update)

{tldextract-5.0.1 → tldextract-5.1.1}/tldextract.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tldextract
-Version: 5.0.1
+Version: 5.1.1
 Summary: Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well.
 Author-email: John Kurkowski <john.kurkowski@gmail.com>
 License: BSD-3-Clause
@@ -14,6 +14,7 @@ Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
@@ -21,8 +22,19 @@ Requires-Dist: idna
 Requires-Dist: requests>=2.1.0
 Requires-Dist: requests-file>=1.4
 Requires-Dist: filelock>=3.0.8
-
-
+Provides-Extra: testing
+Requires-Dist: black; extra == "testing"
+Requires-Dist: mypy; extra == "testing"
+Requires-Dist: pytest; extra == "testing"
+Requires-Dist: pytest-gitignore; extra == "testing"
+Requires-Dist: pytest-mock; extra == "testing"
+Requires-Dist: responses; extra == "testing"
+Requires-Dist: ruff; extra == "testing"
+Requires-Dist: tox; extra == "testing"
+Requires-Dist: types-filelock; extra == "testing"
+Requires-Dist: types-requests; extra == "testing"
+
+# tldextract [](https://badge.fury.io/py/tldextract) [](https://github.com/john-kurkowski/tldextract/actions/workflows/ci.yml)
 
 `tldextract` accurately separates a URL's subdomain, domain, and public suffix,
 using [the Public Suffix List (PSL)](https://publicsuffix.org).
@@ -210,7 +222,7 @@ If you want to use input data from your local filesystem, just use the `file://`
 
 ```python
 extract = tldextract.TLDExtract(
-    suffix_list_urls=["file://absolute/path/to/your/local/suffix/list/file"],
+    suffix_list_urls=["file://" + "/absolute/path/to/your/local/suffix/list/file"],
     cache_dir='/path/to/your/cache/',
     fallback_to_snapshot=False)
 ```
@@ -271,7 +283,7 @@ receiving exceptions or error metadata on results.
 
 1. `git clone` this repository.
 2. Change into the new directory.
-3. `pip install
+3. `pip install --upgrade --editable '.[testing]'`
 
 ### Running the test suite
 
@@ -293,6 +305,5 @@ tox -e py311
 Automatically format all code:
 
 ```zsh
-pip install black
 black .
 ```

{tldextract-5.0.1 → tldextract-5.1.1}/tox.ini
@@ -1,35 +1,22 @@
 [tox]
-envlist = py{38,39,310,311,
+envlist = py{38,39,310,311,312,py38},codestyle,lint,typecheck
 
 [testenv]
-deps =
-    pytest
-    pytest-gitignore
-    pytest-mock
-    responses
 commands = pytest {posargs}
+extras = testing
 
 [testenv:codestyle]
 basepython = python3.8
-deps =
-    black
 commands =
     black --check {posargs:.}
+extras = testing
 
 [testenv:lint]
 basepython = python3.8
-deps =
-    ruff
 commands = ruff check {posargs:.}
+extras = testing
 
 [testenv:typecheck]
 basepython = python3.8
-deps =
-    mypy
-    pytest
-    pytest-gitignore
-    pytest-mock
-    responses
-    types-filelock
-    types-requests
 commands = mypy --show-error-codes tldextract tests
+extras = testing

tldextract-5.0.1/.travis.yml DELETED
@@ -1,21 +0,0 @@
-dist: focal
-language: python
-matrix:
-  include:
-    - python: "3.8"
-      env: TOXENV=py38
-    - python: "3.9"
-      env: TOXENV=py39
-    - python: "3.10"
-      env: TOXENV=py310
-    - python: "3.11"
-      env: TOXENV=py311
-    - python: pypy3.8-7.3.9
-      dist: xenial
-      env: TOXENV=pypy3
-    - env: TOXENV=codestyle
-    - env: TOXENV=lint
-    - env: TOXENV=typecheck
-      python: "3.10"
-install: pip install tox
-script: tox
|