tldextract 5.2.0__tar.gz → 5.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tldextract-5.2.0 → tldextract-5.3.0}/CHANGELOG.md +17 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/PKG-INFO +3 -3
- {tldextract-5.2.0 → tldextract-5.3.0}/README.md +2 -2
- {tldextract-5.2.0 → tldextract-5.3.0}/pyproject.toml +3 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tests/cli_test.py +6 -2
- {tldextract-5.2.0 → tldextract-5.3.0}/tests/custom_suffix_test.py +12 -5
- {tldextract-5.2.0 → tldextract-5.3.0}/tests/main_test.py +98 -10
- {tldextract-5.2.0 → tldextract-5.3.0}/tldextract/_version.py +2 -2
- {tldextract-5.2.0 → tldextract-5.3.0}/tldextract/cli.py +9 -1
- {tldextract-5.2.0 → tldextract-5.3.0}/tldextract/tldextract.py +170 -36
- {tldextract-5.2.0 → tldextract-5.3.0}/tldextract.egg-info/PKG-INFO +3 -3
- {tldextract-5.2.0 → tldextract-5.3.0}/.github/FUNDING.yml +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/.github/workflows/ci.yml +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/.gitignore +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/LICENSE +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/scripts/release.py +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/setup.cfg +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tests/__init__.py +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tests/__snapshots__/test_release.ambr +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tests/conftest.py +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tests/fixtures/fake_suffix_list_fixture.dat +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tests/test_cache.py +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tests/test_parallel.py +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tests/test_release.py +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tests/test_trie.py +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tldextract/.tld_set_snapshot +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tldextract/__init__.py +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tldextract/__main__.py +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tldextract/cache.py +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tldextract/py.typed +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tldextract/remote.py +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tldextract/suffix_list.py +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tldextract.egg-info/SOURCES.txt +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tldextract.egg-info/dependency_links.txt +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tldextract.egg-info/entry_points.txt +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tldextract.egg-info/requires.txt +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tldextract.egg-info/top_level.txt +0 -0
- {tldextract-5.2.0 → tldextract-5.3.0}/tox.ini +0 -0
@@ -3,6 +3,23 @@
|
|
3
3
|
After upgrading, update your cache file by deleting it or via `tldextract
|
4
4
|
--update`.
|
5
5
|
|
6
|
+
## 5.3.0 (2025-04-21)
|
7
|
+
|
8
|
+
* Features
|
9
|
+
* Add result field `registry_suffix` ([#344](https://github.com/john-kurkowski/tldextract/issues/344))
|
10
|
+
* To complement the existing public suffix field `suffix`
|
11
|
+
* Add result property `top_domain_under_public_suffix` ([#344](https://github.com/john-kurkowski/tldextract/issues/344))
|
12
|
+
* Add result property `top_domain_under_registry_suffix` ([#344](https://github.com/john-kurkowski/tldextract/issues/344))
|
13
|
+
* Deprecate `registered_domain` property
|
14
|
+
* Use `top_domain_under_public_suffix` instead, which has the same behavior
|
15
|
+
but a more accurate name
|
16
|
+
* Bugfixes
|
17
|
+
* Fix missing `reverse_domain_name` property in CLI `--json` output ([`a545c67`](https://github.com/john-kurkowski/tldextract/commit/a545c67d87223616fc13e90692886b3ca9af18bb))
|
18
|
+
* Misc.
|
19
|
+
* Expand internal `suffix_index` return type to be richer than bools, and
|
20
|
+
include the registry suffix during trie traversal
|
21
|
+
([#344](https://github.com/john-kurkowski/tldextract/issues/344))
|
22
|
+
|
6
23
|
## 5.2.0 (2025-04-07)
|
7
24
|
|
8
25
|
* Features
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: tldextract
|
3
|
-
Version: 5.2.0
|
3
|
+
Version: 5.3.0
|
4
4
|
Summary: Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well.
|
5
5
|
Author-email: John Kurkowski <john.kurkowski@gmail.com>
|
6
6
|
License: BSD-3-Clause
|
@@ -90,7 +90,7 @@ To rejoin the original hostname, if it was indeed a valid, registered hostname:
|
|
90
90
|
|
91
91
|
```python
|
92
92
|
>>> ext = tldextract.extract('http://forums.bbc.co.uk')
|
93
|
-
>>> ext.registered_domain
|
93
|
+
>>> ext.top_domain_under_public_suffix
|
94
94
|
'bbc.co.uk'
|
95
95
|
>>> ext.fqdn
|
96
96
|
'forums.bbc.co.uk'
|
@@ -287,7 +287,7 @@ For example:
|
|
287
287
|
extractor = TLDExtract()
|
288
288
|
split_url = urllib.parse.urlsplit("https://foo.bar.com:8080")
|
289
289
|
split_suffix = extractor.extract_urllib(split_url)
|
290
|
-
url_to_crawl = f"{split_url.scheme}://{split_suffix.registered_domain}:{split_url.port}"
|
290
|
+
url_to_crawl = f"{split_url.scheme}://{split_suffix.top_domain_under_public_suffix}:{split_url.port}"
|
291
291
|
```
|
292
292
|
|
293
293
|
`tldextract`'s lenient string parsing stance lowers the learning curve of using
|
@@ -49,7 +49,7 @@ To rejoin the original hostname, if it was indeed a valid, registered hostname:
|
|
49
49
|
|
50
50
|
```python
|
51
51
|
>>> ext = tldextract.extract('http://forums.bbc.co.uk')
|
52
|
-
>>> ext.registered_domain
|
52
|
+
>>> ext.top_domain_under_public_suffix
|
53
53
|
'bbc.co.uk'
|
54
54
|
>>> ext.fqdn
|
55
55
|
'forums.bbc.co.uk'
|
@@ -246,7 +246,7 @@ For example:
|
|
246
246
|
extractor = TLDExtract()
|
247
247
|
split_url = urllib.parse.urlsplit("https://foo.bar.com:8080")
|
248
248
|
split_suffix = extractor.extract_urllib(split_url)
|
249
|
-
url_to_crawl = f"{split_url.scheme}://{split_suffix.registered_domain}:{split_url.port}"
|
249
|
+
url_to_crawl = f"{split_url.scheme}://{split_suffix.top_domain_under_public_suffix}:{split_url.port}"
|
250
250
|
```
|
251
251
|
|
252
252
|
`tldextract`'s lenient string parsing stance lowers the learning curve of using
|
@@ -77,12 +77,16 @@ def test_cli_json_output(
|
|
77
77
|
stdout, stderr = capsys.readouterr()
|
78
78
|
assert not stderr
|
79
79
|
assert json.loads(stdout) == {
|
80
|
-
"subdomain": "www",
|
81
80
|
"domain": "bbc",
|
82
|
-
"suffix": "co.uk",
|
83
81
|
"fqdn": "www.bbc.co.uk",
|
84
82
|
"ipv4": "",
|
85
83
|
"ipv6": "",
|
86
84
|
"is_private": False,
|
87
85
|
"registered_domain": "bbc.co.uk",
|
86
|
+
"registry_suffix": "co.uk",
|
87
|
+
"reverse_domain_name": "co.uk.bbc.www",
|
88
|
+
"subdomain": "www",
|
89
|
+
"suffix": "co.uk",
|
90
|
+
"top_domain_under_public_suffix": "bbc.co.uk",
|
91
|
+
"top_domain_under_registry_suffix": "bbc.co.uk",
|
88
92
|
}
|
@@ -32,12 +32,19 @@ def test_private_extraction() -> None:
|
|
32
32
|
"""Test this library's uncached, offline, private domain extraction."""
|
33
33
|
tld = tldextract.TLDExtract(cache_dir=tempfile.mkdtemp(), suffix_list_urls=[])
|
34
34
|
|
35
|
-
assert tld("foo.blogspot.com") == ExtractResult(
|
35
|
+
assert tld("foo.blogspot.com") == ExtractResult(
|
36
|
+
subdomain="foo",
|
37
|
+
domain="blogspot",
|
38
|
+
suffix="com",
|
39
|
+
is_private=False,
|
40
|
+
registry_suffix="com",
|
41
|
+
)
|
36
42
|
assert tld("foo.blogspot.com", include_psl_private_domains=True) == ExtractResult(
|
37
|
-
"",
|
38
|
-
"foo",
|
39
|
-
"blogspot.com",
|
40
|
-
True,
|
43
|
+
subdomain="",
|
44
|
+
domain="foo",
|
45
|
+
suffix="blogspot.com",
|
46
|
+
is_private=True,
|
47
|
+
registry_suffix="com",
|
41
48
|
)
|
42
49
|
|
43
50
|
|
@@ -374,6 +374,42 @@ def test_dns_root_label() -> None:
|
|
374
374
|
)
|
375
375
|
|
376
376
|
|
377
|
+
def test_top_domain_under_public_suffix() -> None:
|
378
|
+
"""Test property `top_domain_under_public_suffix`."""
|
379
|
+
assert (
|
380
|
+
tldextract.extract(
|
381
|
+
"http://www.example.auth.us-east-1.amazoncognito.com",
|
382
|
+
include_psl_private_domains=False,
|
383
|
+
).top_domain_under_public_suffix
|
384
|
+
== "amazoncognito.com"
|
385
|
+
)
|
386
|
+
assert (
|
387
|
+
tldextract.extract(
|
388
|
+
"http://www.example.auth.us-east-1.amazoncognito.com",
|
389
|
+
include_psl_private_domains=True,
|
390
|
+
).top_domain_under_public_suffix
|
391
|
+
== "example.auth.us-east-1.amazoncognito.com"
|
392
|
+
)
|
393
|
+
|
394
|
+
|
395
|
+
def test_top_domain_under_registry_suffix() -> None:
|
396
|
+
"""Test property `top_domain_under_registry_suffix`."""
|
397
|
+
assert (
|
398
|
+
tldextract.extract(
|
399
|
+
"http://www.example.auth.us-east-1.amazoncognito.com",
|
400
|
+
include_psl_private_domains=False,
|
401
|
+
).top_domain_under_registry_suffix
|
402
|
+
== "amazoncognito.com"
|
403
|
+
)
|
404
|
+
assert (
|
405
|
+
tldextract.extract(
|
406
|
+
"http://www.example.auth.us-east-1.amazoncognito.com",
|
407
|
+
include_psl_private_domains=True,
|
408
|
+
).top_domain_under_registry_suffix
|
409
|
+
== "amazoncognito.com"
|
410
|
+
)
|
411
|
+
|
412
|
+
|
377
413
|
def test_ipv4() -> None:
|
378
414
|
"""Test IPv4 addresses."""
|
379
415
|
assert_extract(
|
@@ -526,12 +562,22 @@ def test_include_psl_private_domain_attr() -> None:
|
|
526
562
|
extract_public1 = tldextract.TLDExtract()
|
527
563
|
extract_public2 = tldextract.TLDExtract(include_psl_private_domains=False)
|
528
564
|
assert extract_private("foo.uk.com") == ExtractResult(
|
529
|
-
subdomain="",
|
565
|
+
subdomain="",
|
566
|
+
domain="foo",
|
567
|
+
suffix="uk.com",
|
568
|
+
is_private=True,
|
569
|
+
registry_suffix="com",
|
530
570
|
)
|
531
571
|
assert (
|
532
572
|
extract_public1("foo.uk.com")
|
533
573
|
== extract_public2("foo.uk.com")
|
534
|
-
== ExtractResult(
|
574
|
+
== ExtractResult(
|
575
|
+
subdomain="foo",
|
576
|
+
domain="uk",
|
577
|
+
suffix="com",
|
578
|
+
is_private=False,
|
579
|
+
registry_suffix="com",
|
580
|
+
)
|
535
581
|
)
|
536
582
|
|
537
583
|
|
@@ -554,11 +600,21 @@ def test_global_extract() -> None:
|
|
554
600
|
"""
|
555
601
|
assert tldextract.extract(
|
556
602
|
"blogspot.com", include_psl_private_domains=True
|
557
|
-
) == ExtractResult(
|
603
|
+
) == ExtractResult(
|
604
|
+
subdomain="",
|
605
|
+
domain="",
|
606
|
+
suffix="blogspot.com",
|
607
|
+
is_private=True,
|
608
|
+
registry_suffix="com",
|
609
|
+
)
|
558
610
|
assert tldextract.extract(
|
559
611
|
"foo.blogspot.com", include_psl_private_domains=True
|
560
612
|
) == ExtractResult(
|
561
|
-
subdomain="",
|
613
|
+
subdomain="",
|
614
|
+
domain="foo",
|
615
|
+
suffix="blogspot.com",
|
616
|
+
is_private=True,
|
617
|
+
registry_suffix="com",
|
562
618
|
)
|
563
619
|
|
564
620
|
|
@@ -574,15 +630,26 @@ def test_private_domains_depth() -> None:
|
|
574
630
|
domain="amazonaws",
|
575
631
|
suffix="com",
|
576
632
|
is_private=False,
|
633
|
+
registry_suffix="com",
|
577
634
|
)
|
578
635
|
assert tldextract.extract(
|
579
636
|
"ap-south-1.amazonaws.com", include_psl_private_domains=True
|
580
637
|
) == ExtractResult(
|
581
|
-
subdomain="ap-south-1",
|
638
|
+
subdomain="ap-south-1",
|
639
|
+
domain="amazonaws",
|
640
|
+
suffix="com",
|
641
|
+
is_private=False,
|
642
|
+
registry_suffix="com",
|
582
643
|
)
|
583
644
|
assert tldextract.extract(
|
584
645
|
"amazonaws.com", include_psl_private_domains=True
|
585
|
-
) == ExtractResult(
|
646
|
+
) == ExtractResult(
|
647
|
+
subdomain="",
|
648
|
+
domain="amazonaws",
|
649
|
+
suffix="com",
|
650
|
+
is_private=False,
|
651
|
+
registry_suffix="com",
|
652
|
+
)
|
586
653
|
assert tldextract.extract(
|
587
654
|
"the-quick-brown-fox.cn-north-1.amazonaws.com.cn",
|
588
655
|
include_psl_private_domains=True,
|
@@ -591,16 +658,25 @@ def test_private_domains_depth() -> None:
|
|
591
658
|
domain="amazonaws",
|
592
659
|
suffix="com.cn",
|
593
660
|
is_private=False,
|
661
|
+
registry_suffix="com.cn",
|
594
662
|
)
|
595
663
|
assert tldextract.extract(
|
596
664
|
"cn-north-1.amazonaws.com.cn", include_psl_private_domains=True
|
597
665
|
) == ExtractResult(
|
598
|
-
subdomain="cn-north-1",
|
666
|
+
subdomain="cn-north-1",
|
667
|
+
domain="amazonaws",
|
668
|
+
suffix="com.cn",
|
669
|
+
is_private=False,
|
670
|
+
registry_suffix="com.cn",
|
599
671
|
)
|
600
672
|
assert tldextract.extract(
|
601
673
|
"amazonaws.com.cn", include_psl_private_domains=True
|
602
674
|
) == ExtractResult(
|
603
|
-
subdomain="",
|
675
|
+
subdomain="",
|
676
|
+
domain="amazonaws",
|
677
|
+
suffix="com.cn",
|
678
|
+
is_private=False,
|
679
|
+
registry_suffix="com.cn",
|
604
680
|
)
|
605
681
|
assert tldextract.extract(
|
606
682
|
"another.icann.compute.amazonaws.com", include_psl_private_domains=True
|
@@ -609,6 +685,7 @@ def test_private_domains_depth() -> None:
|
|
609
685
|
domain="another",
|
610
686
|
suffix="icann.compute.amazonaws.com",
|
611
687
|
is_private=True,
|
688
|
+
registry_suffix="com",
|
612
689
|
)
|
613
690
|
assert tldextract.extract(
|
614
691
|
"another.s3.dualstack.us-east-1.amazonaws.com", include_psl_private_domains=True
|
@@ -617,12 +694,17 @@ def test_private_domains_depth() -> None:
|
|
617
694
|
domain="another",
|
618
695
|
suffix="s3.dualstack.us-east-1.amazonaws.com",
|
619
696
|
is_private=True,
|
697
|
+
registry_suffix="com",
|
620
698
|
)
|
621
699
|
|
622
700
|
assert tldextract.extract(
|
623
701
|
"s3.ap-south-1.amazonaws.com", include_psl_private_domains=True
|
624
702
|
) == ExtractResult(
|
625
|
-
subdomain="",
|
703
|
+
subdomain="",
|
704
|
+
domain="",
|
705
|
+
suffix="s3.ap-south-1.amazonaws.com",
|
706
|
+
is_private=True,
|
707
|
+
registry_suffix="com",
|
626
708
|
)
|
627
709
|
assert tldextract.extract(
|
628
710
|
"s3.cn-north-1.amazonaws.com.cn", include_psl_private_domains=True
|
@@ -631,11 +713,16 @@ def test_private_domains_depth() -> None:
|
|
631
713
|
domain="",
|
632
714
|
suffix="s3.cn-north-1.amazonaws.com.cn",
|
633
715
|
is_private=True,
|
716
|
+
registry_suffix="com.cn",
|
634
717
|
)
|
635
718
|
assert tldextract.extract(
|
636
719
|
"icann.compute.amazonaws.com", include_psl_private_domains=True
|
637
720
|
) == ExtractResult(
|
638
|
-
subdomain="",
|
721
|
+
subdomain="",
|
722
|
+
domain="",
|
723
|
+
suffix="icann.compute.amazonaws.com",
|
724
|
+
is_private=True,
|
725
|
+
registry_suffix="com",
|
639
726
|
)
|
640
727
|
|
641
728
|
# Entire URL is private suffix which ends with another private suffix
|
@@ -647,4 +734,5 @@ def test_private_domains_depth() -> None:
|
|
647
734
|
domain="",
|
648
735
|
suffix="s3.dualstack.us-east-1.amazonaws.com",
|
649
736
|
is_private=True,
|
737
|
+
registry_suffix="com",
|
650
738
|
)
|
@@ -98,7 +98,15 @@ def main() -> None:
|
|
98
98
|
for i in args.input:
|
99
99
|
ext = tld_extract(i)
|
100
100
|
if args.json:
|
101
|
-
properties = (
|
101
|
+
properties = (
|
102
|
+
"fqdn",
|
103
|
+
"ipv4",
|
104
|
+
"ipv6",
|
105
|
+
"registered_domain",
|
106
|
+
"reverse_domain_name",
|
107
|
+
"top_domain_under_public_suffix",
|
108
|
+
"top_domain_under_registry_suffix",
|
109
|
+
)
|
102
110
|
print(
|
103
111
|
json.dumps(
|
104
112
|
{
|
@@ -28,7 +28,7 @@ subdomain or a valid suffix.
|
|
28
28
|
To rejoin the original hostname, if it was indeed a valid, registered hostname:
|
29
29
|
|
30
30
|
>>> ext = tldextract.extract("http://forums.bbc.co.uk")
|
31
|
-
>>> ext.registered_domain
|
31
|
+
>>> ext.top_domain_under_public_suffix
|
32
32
|
'bbc.co.uk'
|
33
33
|
>>> ext.fqdn
|
34
34
|
'forums.bbc.co.uk'
|
@@ -38,8 +38,9 @@ from __future__ import annotations
|
|
38
38
|
|
39
39
|
import os
|
40
40
|
import urllib.parse
|
41
|
+
import warnings
|
41
42
|
from collections.abc import Collection, Sequence
|
42
|
-
from dataclasses import dataclass
|
43
|
+
from dataclasses import dataclass, field
|
43
44
|
from functools import wraps
|
44
45
|
|
45
46
|
import idna
|
@@ -96,18 +97,16 @@ class ExtractResult:
|
|
96
97
|
`False`.
|
97
98
|
"""
|
98
99
|
|
99
|
-
|
100
|
-
|
101
|
-
"""The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.
|
100
|
+
registry_suffix: str = field(repr=False)
|
101
|
+
"""The registry suffix of the input URL, if it contained one, or else the empty string.
|
102
102
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
return ""
|
103
|
+
This field is a domain under which people can register subdomains through a
|
104
|
+
registrar.
|
105
|
+
|
106
|
+
This field is unaffected by the `include_psl_private_domains` setting. If
|
107
|
+
`include_psl_private_domains` was set to `False`, this field is always the
|
108
|
+
same as `suffix`.
|
109
|
+
"""
|
111
110
|
|
112
111
|
@property
|
113
112
|
def fqdn(self) -> str:
|
@@ -168,6 +167,56 @@ class ExtractResult:
|
|
168
167
|
return debracketed
|
169
168
|
return ""
|
170
169
|
|
170
|
+
@property
|
171
|
+
def registered_domain(self) -> str:
|
172
|
+
"""The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.
|
173
|
+
|
174
|
+
>>> extract("http://forums.bbc.co.uk").registered_domain
|
175
|
+
'bbc.co.uk'
|
176
|
+
>>> extract("http://localhost:8080").registered_domain
|
177
|
+
''
|
178
|
+
|
179
|
+
.. deprecated:: 6.0.0
|
180
|
+
This property is deprecated and will be removed in the next major
|
181
|
+
version. Use `top_domain_under_public_suffix` instead, which has the
|
182
|
+
same behavior but a more accurate name.
|
183
|
+
|
184
|
+
This is an alias for the `top_domain_under_public_suffix` property.
|
185
|
+
`registered_domain` is so called because it is roughly the domain the
|
186
|
+
owner paid to register with a registrar or, in the case of a private
|
187
|
+
domain, "registered" with the domain owner. If the input was not
|
188
|
+
something one could register, this property returns the empty string.
|
189
|
+
|
190
|
+
To distinguish the case of private domains, consider Blogspot, which is
|
191
|
+
in the PSL's private domains. If `include_psl_private_domains` was set
|
192
|
+
to `False`, the `registered_domain` property of a Blogspot URL
|
193
|
+
represents the domain the owner of Blogspot registered with a
|
194
|
+
registrar, i.e. Google registered "blogspot.com". If
|
195
|
+
`include_psl_private_domains=True`, the `registered_domain` property
|
196
|
+
represents the "blogspot.com" _subdomain_ the owner of a blog
|
197
|
+
"registered" with Blogspot.
|
198
|
+
|
199
|
+
>>> extract(
|
200
|
+
... "http://waiterrant.blogspot.com", include_psl_private_domains=False
|
201
|
+
... ).registered_domain
|
202
|
+
'blogspot.com'
|
203
|
+
>>> extract(
|
204
|
+
... "http://waiterrant.blogspot.com", include_psl_private_domains=True
|
205
|
+
... ).registered_domain
|
206
|
+
'waiterrant.blogspot.com'
|
207
|
+
|
208
|
+
To always get the same joined string, regardless of the
|
209
|
+
`include_psl_private_domains` setting, consider the
|
210
|
+
`top_domain_under_registry_suffix` property.
|
211
|
+
"""
|
212
|
+
warnings.warn(
|
213
|
+
"The 'registered_domain' property is deprecated and will be removed in the next major version. "
|
214
|
+
"Use 'top_domain_under_public_suffix' instead, which has the same behavior but a more accurate name.",
|
215
|
+
DeprecationWarning,
|
216
|
+
stacklevel=2,
|
217
|
+
)
|
218
|
+
return self.top_domain_under_public_suffix
|
219
|
+
|
171
220
|
@property
|
172
221
|
def reverse_domain_name(self) -> str:
|
173
222
|
"""The domain name in Reverse Domain Name Notation.
|
@@ -193,6 +242,48 @@ class ExtractResult:
|
|
193
242
|
stack.extend(reversed(self.subdomain.split(".")))
|
194
243
|
return ".".join(stack)
|
195
244
|
|
245
|
+
@property
|
246
|
+
def top_domain_under_registry_suffix(self) -> str:
|
247
|
+
"""The rightmost domain label and `registry_suffix` joined with a dot, if such a domain is available and `registry_suffix` is set, or else the empty string.
|
248
|
+
|
249
|
+
The rightmost domain label might be in the `domain` field, or, if the
|
250
|
+
input URL's suffix is a PSL private domain, in the public suffix
|
251
|
+
`suffix` field.
|
252
|
+
|
253
|
+
If the input was not in the PSL's private domains, this property is
|
254
|
+
equivalent to `top_domain_under_public_suffix`.
|
255
|
+
|
256
|
+
>>> extract(
|
257
|
+
... "http://waiterrant.blogspot.com", include_psl_private_domains=False
|
258
|
+
... ).top_domain_under_registry_suffix
|
259
|
+
'blogspot.com'
|
260
|
+
>>> extract(
|
261
|
+
... "http://waiterrant.blogspot.com", include_psl_private_domains=True
|
262
|
+
... ).top_domain_under_registry_suffix
|
263
|
+
'blogspot.com'
|
264
|
+
>>> extract("http://localhost:8080").top_domain_under_registry_suffix
|
265
|
+
''
|
266
|
+
"""
|
267
|
+
top_domain_under_public_suffix = self.top_domain_under_public_suffix
|
268
|
+
if not top_domain_under_public_suffix or not self.is_private:
|
269
|
+
return top_domain_under_public_suffix
|
270
|
+
|
271
|
+
num_labels = self.registry_suffix.count(".") + 2
|
272
|
+
return ".".join(top_domain_under_public_suffix.split(".")[-num_labels:])
|
273
|
+
|
274
|
+
@property
|
275
|
+
def top_domain_under_public_suffix(self) -> str:
|
276
|
+
"""The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.
|
277
|
+
|
278
|
+
>>> extract("http://forums.bbc.co.uk").top_domain_under_public_suffix
|
279
|
+
'bbc.co.uk'
|
280
|
+
>>> extract("http://localhost:8080").top_domain_under_public_suffix
|
281
|
+
''
|
282
|
+
"""
|
283
|
+
if self.suffix and self.domain:
|
284
|
+
return f"{self.domain}.{self.suffix}"
|
285
|
+
return ""
|
286
|
+
|
196
287
|
|
197
288
|
class TLDExtract:
|
198
289
|
"""A callable for extracting subdomain, domain, and suffix components from a URL."""
|
@@ -357,24 +448,58 @@ class TLDExtract:
|
|
357
448
|
and netloc_with_ascii_dots[-1] == "]"
|
358
449
|
and looks_like_ipv6(netloc_with_ascii_dots[1:-1])
|
359
450
|
):
|
360
|
-
return ExtractResult(
|
451
|
+
return ExtractResult(
|
452
|
+
"", netloc_with_ascii_dots, "", is_private=False, registry_suffix=""
|
453
|
+
)
|
361
454
|
|
362
455
|
labels = netloc_with_ascii_dots.split(".")
|
363
456
|
|
364
|
-
|
365
|
-
|
366
|
-
)
|
457
|
+
maybe_indexes = self._get_tld_extractor(session).suffix_index(
|
458
|
+
labels, include_psl_private_domains=include_psl_private_domains
|
459
|
+
)
|
367
460
|
|
368
461
|
num_ipv4_labels = 4
|
369
|
-
if
|
370
|
-
|
462
|
+
if (
|
463
|
+
not maybe_indexes
|
464
|
+
and len(labels) == num_ipv4_labels
|
465
|
+
and looks_like_ip(netloc_with_ascii_dots)
|
371
466
|
):
|
372
|
-
return ExtractResult(
|
467
|
+
return ExtractResult(
|
468
|
+
"", netloc_with_ascii_dots, "", is_private=False, registry_suffix=""
|
469
|
+
)
|
470
|
+
elif not maybe_indexes:
|
471
|
+
return ExtractResult(
|
472
|
+
subdomain=".".join(labels[:-1]),
|
473
|
+
domain=labels[-1],
|
474
|
+
suffix="",
|
475
|
+
is_private=False,
|
476
|
+
registry_suffix="",
|
477
|
+
)
|
373
478
|
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
479
|
+
(
|
480
|
+
(public_suffix_index, public_suffix_node),
|
481
|
+
(registry_suffix_index, registry_suffix_node),
|
482
|
+
) = maybe_indexes
|
483
|
+
|
484
|
+
subdomain = (
|
485
|
+
".".join(labels[: public_suffix_index - 1])
|
486
|
+
if public_suffix_index >= 2
|
487
|
+
else ""
|
488
|
+
)
|
489
|
+
domain = labels[public_suffix_index - 1] if public_suffix_index > 0 else ""
|
490
|
+
public_suffix = ".".join(labels[public_suffix_index:])
|
491
|
+
registry_suffix = (
|
492
|
+
".".join(labels[registry_suffix_index:])
|
493
|
+
if public_suffix_node.is_private
|
494
|
+
else public_suffix
|
495
|
+
)
|
496
|
+
return ExtractResult(
|
497
|
+
subdomain=subdomain,
|
498
|
+
domain=domain,
|
499
|
+
suffix=public_suffix,
|
500
|
+
is_private=public_suffix_node.is_private,
|
501
|
+
registry_suffix=registry_suffix,
|
502
|
+
)
|
378
503
|
|
379
504
|
def update(
|
380
505
|
self, fetch_now: bool = False, session: requests.Session | None = None
|
@@ -531,40 +656,49 @@ class _PublicSuffixListTLDExtractor:
|
|
531
656
|
|
532
657
|
def suffix_index(
|
533
658
|
self, spl: list[str], include_psl_private_domains: bool | None = None
|
534
|
-
) -> tuple[int,
|
535
|
-
"""Return the index of the first suffix label, and
|
659
|
+
) -> tuple[tuple[int, Trie], tuple[int, Trie]] | None:
|
660
|
+
"""Return the index of the first public suffix label, the index of the first registry suffix label, and their corresponding trie nodes.
|
536
661
|
|
537
|
-
Returns
|
662
|
+
Returns `None` if no suffix is found.
|
538
663
|
"""
|
539
664
|
if include_psl_private_domains is None:
|
540
665
|
include_psl_private_domains = self.include_psl_private_domains
|
541
666
|
|
542
|
-
node = (
|
667
|
+
node = reg_node = (
|
543
668
|
self.tlds_incl_private_trie
|
544
669
|
if include_psl_private_domains
|
545
670
|
else self.tlds_excl_private_trie
|
546
671
|
)
|
547
|
-
|
548
|
-
j = i
|
672
|
+
suffix_idx = reg_idx = label_idx = len(spl)
|
549
673
|
for label in reversed(spl):
|
550
674
|
decoded_label = _decode_punycode(label)
|
551
675
|
if decoded_label in node.matches:
|
552
|
-
|
676
|
+
label_idx -= 1
|
553
677
|
node = node.matches[decoded_label]
|
554
678
|
if node.end:
|
555
|
-
|
679
|
+
suffix_idx = label_idx
|
680
|
+
if not node.is_private:
|
681
|
+
reg_node = node
|
682
|
+
reg_idx = label_idx
|
556
683
|
continue
|
557
684
|
|
558
685
|
is_wildcard = "*" in node.matches
|
559
686
|
if is_wildcard:
|
560
687
|
is_wildcard_exception = "!" + decoded_label in node.matches
|
561
|
-
|
562
|
-
|
563
|
-
|
688
|
+
return (
|
689
|
+
label_idx if is_wildcard_exception else label_idx - 1,
|
690
|
+
node.matches["*"],
|
691
|
+
), (
|
692
|
+
reg_idx,
|
693
|
+
reg_node,
|
694
|
+
)
|
564
695
|
|
565
696
|
break
|
566
697
|
|
567
|
-
|
698
|
+
if suffix_idx == len(spl):
|
699
|
+
return None
|
700
|
+
|
701
|
+
return ((suffix_idx, node), (reg_idx, reg_node))
|
568
702
|
|
569
703
|
|
570
704
|
def _decode_punycode(label: str) -> str:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: tldextract
|
3
|
-
Version: 5.2.0
|
3
|
+
Version: 5.3.0
|
4
4
|
Summary: Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well.
|
5
5
|
Author-email: John Kurkowski <john.kurkowski@gmail.com>
|
6
6
|
License: BSD-3-Clause
|
@@ -90,7 +90,7 @@ To rejoin the original hostname, if it was indeed a valid, registered hostname:
|
|
90
90
|
|
91
91
|
```python
|
92
92
|
>>> ext = tldextract.extract('http://forums.bbc.co.uk')
|
93
|
-
>>> ext.registered_domain
|
93
|
+
>>> ext.top_domain_under_public_suffix
|
94
94
|
'bbc.co.uk'
|
95
95
|
>>> ext.fqdn
|
96
96
|
'forums.bbc.co.uk'
|
@@ -287,7 +287,7 @@ For example:
|
|
287
287
|
extractor = TLDExtract()
|
288
288
|
split_url = urllib.parse.urlsplit("https://foo.bar.com:8080")
|
289
289
|
split_suffix = extractor.extract_urllib(split_url)
|
290
|
-
url_to_crawl = f"{split_url.scheme}://{split_suffix.registered_domain}:{split_url.port}"
|
290
|
+
url_to_crawl = f"{split_url.scheme}://{split_suffix.top_domain_under_public_suffix}:{split_url.port}"
|
291
291
|
```
|
292
292
|
|
293
293
|
`tldextract`'s lenient string parsing stance lowers the learning curve of using
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|