tldextract 5.2.0__py3-none-any.whl → 5.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tldextract/_version.py CHANGED
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '5.2.0'
21
- __version_tuple__ = version_tuple = (5, 2, 0)
20
+ __version__ = version = '5.3.0'
21
+ __version_tuple__ = version_tuple = (5, 3, 0)
tldextract/cli.py CHANGED
@@ -98,7 +98,15 @@ def main() -> None:
98
98
  for i in args.input:
99
99
  ext = tld_extract(i)
100
100
  if args.json:
101
- properties = ("fqdn", "ipv4", "ipv6", "registered_domain")
101
+ properties = (
102
+ "fqdn",
103
+ "ipv4",
104
+ "ipv6",
105
+ "registered_domain",
106
+ "reverse_domain_name",
107
+ "top_domain_under_public_suffix",
108
+ "top_domain_under_registry_suffix",
109
+ )
102
110
  print(
103
111
  json.dumps(
104
112
  {
tldextract/tldextract.py CHANGED
@@ -28,7 +28,7 @@ subdomain or a valid suffix.
28
28
  To rejoin the original hostname, if it was indeed a valid, registered hostname:
29
29
 
30
30
  >>> ext = tldextract.extract("http://forums.bbc.co.uk")
31
- >>> ext.registered_domain
31
+ >>> ext.top_domain_under_public_suffix
32
32
  'bbc.co.uk'
33
33
  >>> ext.fqdn
34
34
  'forums.bbc.co.uk'
@@ -38,8 +38,9 @@ from __future__ import annotations
38
38
 
39
39
  import os
40
40
  import urllib.parse
41
+ import warnings
41
42
  from collections.abc import Collection, Sequence
42
- from dataclasses import dataclass
43
+ from dataclasses import dataclass, field
43
44
  from functools import wraps
44
45
 
45
46
  import idna
@@ -96,18 +97,16 @@ class ExtractResult:
96
97
  `False`.
97
98
  """
98
99
 
99
- @property
100
- def registered_domain(self) -> str:
101
- """The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.
100
+ registry_suffix: str = field(repr=False)
101
+ """The registry suffix of the input URL, if it contained one, or else the empty string.
102
102
 
103
- >>> extract("http://forums.bbc.co.uk").registered_domain
104
- 'bbc.co.uk'
105
- >>> extract("http://localhost:8080").registered_domain
106
- ''
107
- """
108
- if self.suffix and self.domain:
109
- return f"{self.domain}.{self.suffix}"
110
- return ""
103
+ This field is a domain under which people can register subdomains through a
104
+ registrar.
105
+
106
+ This field is unaffected by the `include_psl_private_domains` setting. If
107
+ `include_psl_private_domains` was set to `False`, this field is always the
108
+ same as `suffix`.
109
+ """
111
110
 
112
111
  @property
113
112
  def fqdn(self) -> str:
@@ -168,6 +167,56 @@ class ExtractResult:
168
167
  return debracketed
169
168
  return ""
170
169
 
170
+ @property
171
+ def registered_domain(self) -> str:
172
+ """The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.
173
+
174
+ >>> extract("http://forums.bbc.co.uk").registered_domain
175
+ 'bbc.co.uk'
176
+ >>> extract("http://localhost:8080").registered_domain
177
+ ''
178
+
179
+ .. deprecated:: 6.0.0
180
+ This property is deprecated and will be removed in the next major
181
+ version. Use `top_domain_under_public_suffix` instead, which has the
182
+ same behavior but a more accurate name.
183
+
184
+ This is an alias for the `top_domain_under_public_suffix` property.
185
+ `registered_domain` is so called because it is roughly the domain the
186
+ owner paid to register with a registrar or, in the case of a private
187
+ domain, "registered" with the domain owner. If the input was not
188
+ something one could register, this property returns the empty string.
189
+
190
+ To distinguish the case of private domains, consider Blogspot, which is
191
+ in the PSL's private domains. If `include_psl_private_domains` was set
192
+ to `False`, the `registered_domain` property of a Blogspot URL
193
+ represents the domain the owner of Blogspot registered with a
194
+ registrar, i.e. Google registered "blogspot.com". If
195
+ `include_psl_private_domains=True`, the `registered_domain` property
196
+ represents the "blogspot.com" _subdomain_ the owner of a blog
197
+ "registered" with Blogspot.
198
+
199
+ >>> extract(
200
+ ... "http://waiterrant.blogspot.com", include_psl_private_domains=False
201
+ ... ).registered_domain
202
+ 'blogspot.com'
203
+ >>> extract(
204
+ ... "http://waiterrant.blogspot.com", include_psl_private_domains=True
205
+ ... ).registered_domain
206
+ 'waiterrant.blogspot.com'
207
+
208
+ To always get the same joined string, regardless of the
209
+ `include_psl_private_domains` setting, consider the
210
+ `top_domain_under_registry_suffix` property.
211
+ """
212
+ warnings.warn(
213
+ "The 'registered_domain' property is deprecated and will be removed in the next major version. "
214
+ "Use 'top_domain_under_public_suffix' instead, which has the same behavior but a more accurate name.",
215
+ DeprecationWarning,
216
+ stacklevel=2,
217
+ )
218
+ return self.top_domain_under_public_suffix
219
+
171
220
  @property
172
221
  def reverse_domain_name(self) -> str:
173
222
  """The domain name in Reverse Domain Name Notation.
@@ -193,6 +242,48 @@ class ExtractResult:
193
242
  stack.extend(reversed(self.subdomain.split(".")))
194
243
  return ".".join(stack)
195
244
 
245
+ @property
246
+ def top_domain_under_registry_suffix(self) -> str:
247
+ """The rightmost domain label and `registry_suffix` joined with a dot, if such a domain is available and `registry_suffix` is set, or else the empty string.
248
+
249
+ The rightmost domain label might be in the `domain` field, or, if the
250
+ input URL's suffix is a PSL private domain, in the public suffix
251
+ `suffix` field.
252
+
253
+ If the input was not in the PSL's private domains, this property is
254
+ equivalent to `top_domain_under_public_suffix`.
255
+
256
+ >>> extract(
257
+ ... "http://waiterrant.blogspot.com", include_psl_private_domains=False
258
+ ... ).top_domain_under_registry_suffix
259
+ 'blogspot.com'
260
+ >>> extract(
261
+ ... "http://waiterrant.blogspot.com", include_psl_private_domains=True
262
+ ... ).top_domain_under_registry_suffix
263
+ 'blogspot.com'
264
+ >>> extract("http://localhost:8080").top_domain_under_registry_suffix
265
+ ''
266
+ """
267
+ top_domain_under_public_suffix = self.top_domain_under_public_suffix
268
+ if not top_domain_under_public_suffix or not self.is_private:
269
+ return top_domain_under_public_suffix
270
+
271
+ num_labels = self.registry_suffix.count(".") + 2
272
+ return ".".join(top_domain_under_public_suffix.split(".")[-num_labels:])
273
+
274
+ @property
275
+ def top_domain_under_public_suffix(self) -> str:
276
+ """The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.
277
+
278
+ >>> extract("http://forums.bbc.co.uk").top_domain_under_public_suffix
279
+ 'bbc.co.uk'
280
+ >>> extract("http://localhost:8080").top_domain_under_public_suffix
281
+ ''
282
+ """
283
+ if self.suffix and self.domain:
284
+ return f"{self.domain}.{self.suffix}"
285
+ return ""
286
+
196
287
 
197
288
  class TLDExtract:
198
289
  """A callable for extracting subdomain, domain, and suffix components from a URL."""
@@ -357,24 +448,58 @@ class TLDExtract:
357
448
  and netloc_with_ascii_dots[-1] == "]"
358
449
  and looks_like_ipv6(netloc_with_ascii_dots[1:-1])
359
450
  ):
360
- return ExtractResult("", netloc_with_ascii_dots, "", is_private=False)
451
+ return ExtractResult(
452
+ "", netloc_with_ascii_dots, "", is_private=False, registry_suffix=""
453
+ )
361
454
 
362
455
  labels = netloc_with_ascii_dots.split(".")
363
456
 
364
- suffix_index, is_private = self._get_tld_extractor(
365
- session=session
366
- ).suffix_index(labels, include_psl_private_domains=include_psl_private_domains)
457
+ maybe_indexes = self._get_tld_extractor(session).suffix_index(
458
+ labels, include_psl_private_domains=include_psl_private_domains
459
+ )
367
460
 
368
461
  num_ipv4_labels = 4
369
- if suffix_index == len(labels) == num_ipv4_labels and looks_like_ip(
370
- netloc_with_ascii_dots
462
+ if (
463
+ not maybe_indexes
464
+ and len(labels) == num_ipv4_labels
465
+ and looks_like_ip(netloc_with_ascii_dots)
371
466
  ):
372
- return ExtractResult("", netloc_with_ascii_dots, "", is_private)
467
+ return ExtractResult(
468
+ "", netloc_with_ascii_dots, "", is_private=False, registry_suffix=""
469
+ )
470
+ elif not maybe_indexes:
471
+ return ExtractResult(
472
+ subdomain=".".join(labels[:-1]),
473
+ domain=labels[-1],
474
+ suffix="",
475
+ is_private=False,
476
+ registry_suffix="",
477
+ )
373
478
 
374
- suffix = ".".join(labels[suffix_index:]) if suffix_index != len(labels) else ""
375
- subdomain = ".".join(labels[: suffix_index - 1]) if suffix_index >= 2 else ""
376
- domain = labels[suffix_index - 1] if suffix_index else ""
377
- return ExtractResult(subdomain, domain, suffix, is_private)
479
+ (
480
+ (public_suffix_index, public_suffix_node),
481
+ (registry_suffix_index, registry_suffix_node),
482
+ ) = maybe_indexes
483
+
484
+ subdomain = (
485
+ ".".join(labels[: public_suffix_index - 1])
486
+ if public_suffix_index >= 2
487
+ else ""
488
+ )
489
+ domain = labels[public_suffix_index - 1] if public_suffix_index > 0 else ""
490
+ public_suffix = ".".join(labels[public_suffix_index:])
491
+ registry_suffix = (
492
+ ".".join(labels[registry_suffix_index:])
493
+ if public_suffix_node.is_private
494
+ else public_suffix
495
+ )
496
+ return ExtractResult(
497
+ subdomain=subdomain,
498
+ domain=domain,
499
+ suffix=public_suffix,
500
+ is_private=public_suffix_node.is_private,
501
+ registry_suffix=registry_suffix,
502
+ )
378
503
 
379
504
  def update(
380
505
  self, fetch_now: bool = False, session: requests.Session | None = None
@@ -531,40 +656,49 @@ class _PublicSuffixListTLDExtractor:
531
656
 
532
657
  def suffix_index(
533
658
  self, spl: list[str], include_psl_private_domains: bool | None = None
534
- ) -> tuple[int, bool]:
535
- """Return the index of the first suffix label, and whether it is private.
659
+ ) -> tuple[tuple[int, Trie], tuple[int, Trie]] | None:
660
+ """Return the index of the first public suffix label, the index of the first registry suffix label, and their corresponding trie nodes.
536
661
 
537
- Returns len(spl) if no suffix is found.
662
+ Returns `None` if no suffix is found.
538
663
  """
539
664
  if include_psl_private_domains is None:
540
665
  include_psl_private_domains = self.include_psl_private_domains
541
666
 
542
- node = (
667
+ node = reg_node = (
543
668
  self.tlds_incl_private_trie
544
669
  if include_psl_private_domains
545
670
  else self.tlds_excl_private_trie
546
671
  )
547
- i = len(spl)
548
- j = i
672
+ suffix_idx = reg_idx = label_idx = len(spl)
549
673
  for label in reversed(spl):
550
674
  decoded_label = _decode_punycode(label)
551
675
  if decoded_label in node.matches:
552
- j -= 1
676
+ label_idx -= 1
553
677
  node = node.matches[decoded_label]
554
678
  if node.end:
555
- i = j
679
+ suffix_idx = label_idx
680
+ if not node.is_private:
681
+ reg_node = node
682
+ reg_idx = label_idx
556
683
  continue
557
684
 
558
685
  is_wildcard = "*" in node.matches
559
686
  if is_wildcard:
560
687
  is_wildcard_exception = "!" + decoded_label in node.matches
561
- if is_wildcard_exception:
562
- return j, node.matches["*"].is_private
563
- return j - 1, node.matches["*"].is_private
688
+ return (
689
+ label_idx if is_wildcard_exception else label_idx - 1,
690
+ node.matches["*"],
691
+ ), (
692
+ reg_idx,
693
+ reg_node,
694
+ )
564
695
 
565
696
  break
566
697
 
567
- return i, node.is_private
698
+ if suffix_idx == len(spl):
699
+ return None
700
+
701
+ return ((suffix_idx, node), (reg_idx, reg_node))
568
702
 
569
703
 
570
704
  def _decode_punycode(label: str) -> str:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tldextract
3
- Version: 5.2.0
3
+ Version: 5.3.0
4
4
  Summary: Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well.
5
5
  Author-email: John Kurkowski <john.kurkowski@gmail.com>
6
6
  License: BSD-3-Clause
@@ -90,7 +90,7 @@ To rejoin the original hostname, if it was indeed a valid, registered hostname:
90
90
 
91
91
  ```python
92
92
  >>> ext = tldextract.extract('http://forums.bbc.co.uk')
93
- >>> ext.registered_domain
93
+ >>> ext.top_domain_under_public_suffix
94
94
  'bbc.co.uk'
95
95
  >>> ext.fqdn
96
96
  'forums.bbc.co.uk'
@@ -287,7 +287,7 @@ For example:
287
287
  extractor = TLDExtract()
288
288
  split_url = urllib.parse.urlsplit("https://foo.bar.com:8080")
289
289
  split_suffix = extractor.extract_urllib(split_url)
290
- url_to_crawl = f"{split_url.scheme}://{split_suffix.registered_domain}:{split_url.port}"
290
+ url_to_crawl = f"{split_url.scheme}://{split_suffix.top_domain_under_public_suffix}:{split_url.port}"
291
291
  ```
292
292
 
293
293
  `tldextract`'s lenient string parsing stance lowers the learning curve of using
@@ -1,16 +1,16 @@
1
1
  tldextract/.tld_set_snapshot,sha256=tpMVwIXVOXJyS48t8RH_wymwyE_gpH1iyMkWVcx3Sjg,318581
2
2
  tldextract/__init__.py,sha256=1n2QxAmFCFp3X1A5O46wJOTZqWM2ukshNkkG-TrOaLQ,274
3
3
  tldextract/__main__.py,sha256=oiZ5EW_lxRLH6Khk6MdzXf7a1Ld5-A3k4wOFRmNNk2o,89
4
- tldextract/_version.py,sha256=1-tO6tx4p9okXz3ScGW6YFdQDbS8ruoK2_y0riYBx7M,511
4
+ tldextract/_version.py,sha256=eUUqvIdRVH9jjg_LcI8eEHFrv_K9YLfGNamSZ0trHII,511
5
5
  tldextract/cache.py,sha256=nrT9VuLmrjHHFxj-Cai97IyUXXenCX6KbHi07mPkzMc,8289
6
- tldextract/cli.py,sha256=nCzBAFrgAopTK1t5eBRQgeveSgWheUx4LAlAHE_8mzQ,3010
6
+ tldextract/cli.py,sha256=ZYXwybL76KucaVH4GCz5Uiy4PmL6oXARiKgLWv64c0I,3230
7
7
  tldextract/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  tldextract/remote.py,sha256=rovbxR73G8To-QRrA_cgSfk0S3k0_B2SyYefXiLHrQg,1940
9
9
  tldextract/suffix_list.py,sha256=ePH6iOIUBe0JE_mc07a34Nd7tFyfmHgP_mJkFhxzr7c,3947
10
- tldextract/tldextract.py,sha256=tM2Lrj0yclAulBueRxAK40bzhMl86Ftz4FHaFGyec7k,21454
11
- tldextract-5.2.0.dist-info/licenses/LICENSE,sha256=ZUrmz9cSprvhQmqmUdHIWbD51Cytv6PDTMlJLruTLuI,1527
12
- tldextract-5.2.0.dist-info/METADATA,sha256=LNhqdHD4eMtqA1DaxoVw0YpFq-XqW4_dtqc53bUAriM,11709
13
- tldextract-5.2.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
14
- tldextract-5.2.0.dist-info/entry_points.txt,sha256=EStkXC80BetCMp1UDhU3kWuXBo3qDpgKltZTJ1x4x1U,51
15
- tldextract-5.2.0.dist-info/top_level.txt,sha256=DWZIjV49WP30tyC1KOEP7t-EaS4IRCXQzc0KXAOn_bk,11
16
- tldextract-5.2.0.dist-info/RECORD,,
10
+ tldextract/tldextract.py,sha256=j93PohoNaNpH1tJ3zZ2z3f4JMt_JOmQ5RYUgrUcNvP0,27008
11
+ tldextract-5.3.0.dist-info/licenses/LICENSE,sha256=ZUrmz9cSprvhQmqmUdHIWbD51Cytv6PDTMlJLruTLuI,1527
12
+ tldextract-5.3.0.dist-info/METADATA,sha256=V9aCLPpJ5uHtPugYuncUOrvNoHwuXN0YZ040Wtm8RWM,11735
13
+ tldextract-5.3.0.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
14
+ tldextract-5.3.0.dist-info/entry_points.txt,sha256=EStkXC80BetCMp1UDhU3kWuXBo3qDpgKltZTJ1x4x1U,51
15
+ tldextract-5.3.0.dist-info/top_level.txt,sha256=DWZIjV49WP30tyC1KOEP7t-EaS4IRCXQzc0KXAOn_bk,11
16
+ tldextract-5.3.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (78.1.0)
2
+ Generator: setuptools (79.0.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5