PyPI - tldextract - Versions diffs - 5.2.0__py3-none-any.whl → 5.3.0__py3-none-any.whl - Mend

tldextract 5.2.0__py3-none-any.whl → 5.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

tldextract/_version.py CHANGED Viewed

@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '5.2.0'
-__version_tuple__ = version_tuple = (5, 2, 0)
+__version__ = version = '5.3.0'
+__version_tuple__ = version_tuple = (5, 3, 0)

tldextract/cli.py CHANGED Viewed

@@ -98,7 +98,15 @@ def main() -> None:
     for i in args.input:
         ext = tld_extract(i)
         if args.json:
-            properties = ("fqdn", "ipv4", "ipv6", "registered_domain")
+            properties = (
+                "fqdn",
+                "ipv4",
+                "ipv6",
+                "registered_domain",
+                "reverse_domain_name",
+                "top_domain_under_public_suffix",
+                "top_domain_under_registry_suffix",
+            )
             print(
                 json.dumps(
                     {

tldextract/tldextract.py CHANGED Viewed

@@ -28,7 +28,7 @@ subdomain or a valid suffix.
 To rejoin the original hostname, if it was indeed a valid, registered hostname:
     >>> ext = tldextract.extract("http://forums.bbc.co.uk")
-    >>> ext.registered_domain
+    >>> ext.top_domain_under_public_suffix
     'bbc.co.uk'
     >>> ext.fqdn
     'forums.bbc.co.uk'
@@ -38,8 +38,9 @@ from __future__ import annotations
 import os
 import urllib.parse
+import warnings
 from collections.abc import Collection, Sequence
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from functools import wraps
 import idna
@@ -96,18 +97,16 @@ class ExtractResult:
     `False`.
     """
-    @property
-    def registered_domain(self) -> str:
-        """The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.
+    registry_suffix: str = field(repr=False)
+    """The registry suffix of the input URL, if it contained one, or else the empty string.
-        >>> extract("http://forums.bbc.co.uk").registered_domain
-        'bbc.co.uk'
-        >>> extract("http://localhost:8080").registered_domain
-        ''
-        """
-        if self.suffix and self.domain:
-            return f"{self.domain}.{self.suffix}"
-        return ""
+    This field is a domain under which people can register subdomains through a
+    registrar.
+    This field is unaffected by the `include_psl_private_domains` setting. If
+    `include_psl_private_domains` was set to `False`, this field is always the
+    same as `suffix`.
+    """
     @property
     def fqdn(self) -> str:
@@ -168,6 +167,56 @@ class ExtractResult:
                 return debracketed
         return ""
+    @property
+    def registered_domain(self) -> str:
+        """The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.
+        >>> extract("http://forums.bbc.co.uk").registered_domain
+        'bbc.co.uk'
+        >>> extract("http://localhost:8080").registered_domain
+        ''
+        .. deprecated:: 6.0.0
+           This property is deprecated and will be removed in the next major
+           version. Use `top_domain_under_public_suffix` instead, which has the
+           same behavior but a more accurate name.
+        This is an alias for the `top_domain_under_public_suffix` property.
+        `registered_domain` is so called because it is roughly the domain the
+        owner paid to register with a registrar or, in the case of a private
+        domain, "registered" with the domain owner. If the input was not
+        something one could register, this property returns the empty string.
+        To distinguish the case of private domains, consider Blogspot, which is
+        in the PSL's private domains. If `include_psl_private_domains` was set
+        to `False`, the `registered_domain` property of a Blogspot URL
+        represents the domain the owner of Blogspot registered with a
+        registrar, i.e. Google registered "blogspot.com". If
+        `include_psl_private_domains=True`, the `registered_domain` property
+        represents the "blogspot.com" _subdomain_ the owner of a blog
+        "registered" with Blogspot.
+        >>> extract(
+        ...     "http://waiterrant.blogspot.com", include_psl_private_domains=False
+        ... ).registered_domain
+        'blogspot.com'
+        >>> extract(
+        ...     "http://waiterrant.blogspot.com", include_psl_private_domains=True
+        ... ).registered_domain
+        'waiterrant.blogspot.com'
+        To always get the same joined string, regardless of the
+        `include_psl_private_domains` setting, consider the
+        `top_domain_under_registry_suffix` property.
+        """
+        warnings.warn(
+            "The 'registered_domain' property is deprecated and will be removed in the next major version. "
+            "Use 'top_domain_under_public_suffix' instead, which has the same behavior but a more accurate name.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return self.top_domain_under_public_suffix
     @property
     def reverse_domain_name(self) -> str:
         """The domain name in Reverse Domain Name Notation.
@@ -193,6 +242,48 @@ class ExtractResult:
             stack.extend(reversed(self.subdomain.split(".")))
         return ".".join(stack)
+    @property
+    def top_domain_under_registry_suffix(self) -> str:
+        """The rightmost domain label and `registry_suffix` joined with a dot, if such a domain is available and `registry_suffix` is set, or else the empty string.
+        The rightmost domain label might be in the `domain` field, or, if the
+        input URL's suffix is a PSL private domain, in the public suffix
+        `suffix` field.
+        If the input was not in the PSL's private domains, this property is
+        equivalent to `top_domain_under_public_suffix`.
+        >>> extract(
+        ...     "http://waiterrant.blogspot.com", include_psl_private_domains=False
+        ... ).top_domain_under_registry_suffix
+        'blogspot.com'
+        >>> extract(
+        ...     "http://waiterrant.blogspot.com", include_psl_private_domains=True
+        ... ).top_domain_under_registry_suffix
+        'blogspot.com'
+        >>> extract("http://localhost:8080").top_domain_under_registry_suffix
+        ''
+        """
+        top_domain_under_public_suffix = self.top_domain_under_public_suffix
+        if not top_domain_under_public_suffix or not self.is_private:
+            return top_domain_under_public_suffix
+        num_labels = self.registry_suffix.count(".") + 2
+        return ".".join(top_domain_under_public_suffix.split(".")[-num_labels:])
+    @property
+    def top_domain_under_public_suffix(self) -> str:
+        """The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string.
+        >>> extract("http://forums.bbc.co.uk").top_domain_under_public_suffix
+        'bbc.co.uk'
+        >>> extract("http://localhost:8080").top_domain_under_public_suffix
+        ''
+        """
+        if self.suffix and self.domain:
+            return f"{self.domain}.{self.suffix}"
+        return ""
 class TLDExtract:
     """A callable for extracting, subdomain, domain, and suffix components from a URL."""
@@ -357,24 +448,58 @@ class TLDExtract:
             and netloc_with_ascii_dots[-1] == "]"
             and looks_like_ipv6(netloc_with_ascii_dots[1:-1])
         ):
-            return ExtractResult("", netloc_with_ascii_dots, "", is_private=False)
+            return ExtractResult(
+                "", netloc_with_ascii_dots, "", is_private=False, registry_suffix=""
+            )
         labels = netloc_with_ascii_dots.split(".")
-        suffix_index, is_private = self._get_tld_extractor(
-            session=session
-        ).suffix_index(labels, include_psl_private_domains=include_psl_private_domains)
+        maybe_indexes = self._get_tld_extractor(session).suffix_index(
+            labels, include_psl_private_domains=include_psl_private_domains
+        )
         num_ipv4_labels = 4
-        if suffix_index == len(labels) == num_ipv4_labels and looks_like_ip(
-            netloc_with_ascii_dots
+        if (
+            not maybe_indexes
+            and len(labels) == num_ipv4_labels
+            and looks_like_ip(netloc_with_ascii_dots)
         ):
-            return ExtractResult("", netloc_with_ascii_dots, "", is_private)
+            return ExtractResult(
+                "", netloc_with_ascii_dots, "", is_private=False, registry_suffix=""
+            )
+        elif not maybe_indexes:
+            return ExtractResult(
+                subdomain=".".join(labels[:-1]),
+                domain=labels[-1],
+                suffix="",
+                is_private=False,
+                registry_suffix="",
+            )
-        suffix = ".".join(labels[suffix_index:]) if suffix_index != len(labels) else ""
-        subdomain = ".".join(labels[: suffix_index - 1]) if suffix_index >= 2 else ""
-        domain = labels[suffix_index - 1] if suffix_index else ""
-        return ExtractResult(subdomain, domain, suffix, is_private)
+        (
+            (public_suffix_index, public_suffix_node),
+            (registry_suffix_index, registry_suffix_node),
+        ) = maybe_indexes
+        subdomain = (
+            ".".join(labels[: public_suffix_index - 1])
+            if public_suffix_index >= 2
+            else ""
+        )
+        domain = labels[public_suffix_index - 1] if public_suffix_index > 0 else ""
+        public_suffix = ".".join(labels[public_suffix_index:])
+        registry_suffix = (
+            ".".join(labels[registry_suffix_index:])
+            if public_suffix_node.is_private
+            else public_suffix
+        )
+        return ExtractResult(
+            subdomain=subdomain,
+            domain=domain,
+            suffix=public_suffix,
+            is_private=public_suffix_node.is_private,
+            registry_suffix=registry_suffix,
+        )
     def update(
         self, fetch_now: bool = False, session: requests.Session | None = None
@@ -531,40 +656,49 @@ class _PublicSuffixListTLDExtractor:
     def suffix_index(
         self, spl: list[str], include_psl_private_domains: bool | None = None
-    ) -> tuple[int, bool]:
-        """Return the index of the first suffix label, and whether it is private.
+    ) -> tuple[tuple[int, Trie], tuple[int, Trie]] | None:
+        """Return the index of the first public suffix label, the index of the first registry suffix label, and their corresponding trie nodes.
-        Returns len(spl) if no suffix is found.
+        Returns `None` if no suffix is found.
         """
         if include_psl_private_domains is None:
             include_psl_private_domains = self.include_psl_private_domains
-        node = (
+        node = reg_node = (
             self.tlds_incl_private_trie
             if include_psl_private_domains
             else self.tlds_excl_private_trie
         )
-        i = len(spl)
-        j = i
+        suffix_idx = reg_idx = label_idx = len(spl)
         for label in reversed(spl):
             decoded_label = _decode_punycode(label)
             if decoded_label in node.matches:
-                j -= 1
+                label_idx -= 1
                 node = node.matches[decoded_label]
                 if node.end:
-                    i = j
+                    suffix_idx = label_idx
+                    if not node.is_private:
+                        reg_node = node
+                        reg_idx = label_idx
                 continue
             is_wildcard = "*" in node.matches
             if is_wildcard:
                 is_wildcard_exception = "!" + decoded_label in node.matches
-                if is_wildcard_exception:
-                    return j, node.matches["*"].is_private
-                return j - 1, node.matches["*"].is_private
+                return (
+                    label_idx if is_wildcard_exception else label_idx - 1,
+                    node.matches["*"],
+                ), (
+                    reg_idx,
+                    reg_node,
+                )
             break
-        return i, node.is_private
+        if suffix_idx == len(spl):
+            return None
+        return ((suffix_idx, node), (reg_idx, reg_node))
 def _decode_punycode(label: str) -> str:

{tldextract-5.2.0.dist-info → tldextract-5.3.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tldextract
-Version: 5.2.0
+Version: 5.3.0
 Summary: Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well.
 Author-email: John Kurkowski <john.kurkowski@gmail.com>
 License: BSD-3-Clause
@@ -90,7 +90,7 @@ To rejoin the original hostname, if it was indeed a valid, registered hostname:
 ```python
 >>> ext = tldextract.extract('http://forums.bbc.co.uk')
->>> ext.registered_domain
+>>> ext.top_domain_under_public_suffix
 'bbc.co.uk'
 >>> ext.fqdn
 'forums.bbc.co.uk'
@@ -287,7 +287,7 @@ For example:
 extractor = TLDExtract()
 split_url = urllib.parse.urlsplit("https://foo.bar.com:8080")
 split_suffix = extractor.extract_urllib(split_url)
-url_to_crawl = f"{split_url.scheme}://{split_suffix.registered_domain}:{split_url.port}"
+url_to_crawl = f"{split_url.scheme}://{split_suffix.top_domain_under_public_suffix}:{split_url.port}"
 ```
 `tldextract`'s lenient string parsing stance lowers the learning curve of using

{tldextract-5.2.0.dist-info → tldextract-5.3.0.dist-info}/RECORD RENAMED Viewed

@@ -1,16 +1,16 @@
 tldextract/.tld_set_snapshot,sha256=tpMVwIXVOXJyS48t8RH_wymwyE_gpH1iyMkWVcx3Sjg,318581
 tldextract/__init__.py,sha256=1n2QxAmFCFp3X1A5O46wJOTZqWM2ukshNkkG-TrOaLQ,274
 tldextract/__main__.py,sha256=oiZ5EW_lxRLH6Khk6MdzXf7a1Ld5-A3k4wOFRmNNk2o,89
-tldextract/_version.py,sha256=1-tO6tx4p9okXz3ScGW6YFdQDbS8ruoK2_y0riYBx7M,511
+tldextract/_version.py,sha256=eUUqvIdRVH9jjg_LcI8eEHFrv_K9YLfGNamSZ0trHII,511
 tldextract/cache.py,sha256=nrT9VuLmrjHHFxj-Cai97IyUXXenCX6KbHi07mPkzMc,8289
-tldextract/cli.py,sha256=nCzBAFrgAopTK1t5eBRQgeveSgWheUx4LAlAHE_8mzQ,3010
+tldextract/cli.py,sha256=ZYXwybL76KucaVH4GCz5Uiy4PmL6oXARiKgLWv64c0I,3230
 tldextract/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tldextract/remote.py,sha256=rovbxR73G8To-QRrA_cgSfk0S3k0_B2SyYefXiLHrQg,1940
 tldextract/suffix_list.py,sha256=ePH6iOIUBe0JE_mc07a34Nd7tFyfmHgP_mJkFhxzr7c,3947
-tldextract/tldextract.py,sha256=tM2Lrj0yclAulBueRxAK40bzhMl86Ftz4FHaFGyec7k,21454
-tldextract-5.2.0.dist-info/licenses/LICENSE,sha256=ZUrmz9cSprvhQmqmUdHIWbD51Cytv6PDTMlJLruTLuI,1527
-tldextract-5.2.0.dist-info/METADATA,sha256=LNhqdHD4eMtqA1DaxoVw0YpFq-XqW4_dtqc53bUAriM,11709
-tldextract-5.2.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-tldextract-5.2.0.dist-info/entry_points.txt,sha256=EStkXC80BetCMp1UDhU3kWuXBo3qDpgKltZTJ1x4x1U,51
-tldextract-5.2.0.dist-info/top_level.txt,sha256=DWZIjV49WP30tyC1KOEP7t-EaS4IRCXQzc0KXAOn_bk,11
-tldextract-5.2.0.dist-info/RECORD,,
+tldextract/tldextract.py,sha256=j93PohoNaNpH1tJ3zZ2z3f4JMt_JOmQ5RYUgrUcNvP0,27008
+tldextract-5.3.0.dist-info/licenses/LICENSE,sha256=ZUrmz9cSprvhQmqmUdHIWbD51Cytv6PDTMlJLruTLuI,1527
+tldextract-5.3.0.dist-info/METADATA,sha256=V9aCLPpJ5uHtPugYuncUOrvNoHwuXN0YZ040Wtm8RWM,11735
+tldextract-5.3.0.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
+tldextract-5.3.0.dist-info/entry_points.txt,sha256=EStkXC80BetCMp1UDhU3kWuXBo3qDpgKltZTJ1x4x1U,51
+tldextract-5.3.0.dist-info/top_level.txt,sha256=DWZIjV49WP30tyC1KOEP7t-EaS4IRCXQzc0KXAOn_bk,11
+tldextract-5.3.0.dist-info/RECORD,,

{tldextract-5.2.0.dist-info → tldextract-5.3.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (78.1.0)
+Generator: setuptools (79.0.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{tldextract-5.2.0.dist-info → tldextract-5.3.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{tldextract-5.2.0.dist-info → tldextract-5.3.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{tldextract-5.2.0.dist-info → tldextract-5.3.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

tldextract 5.2.0__py3-none-any.whl → 5.3.0__py3-none-any.whl

tldextract 5.2.0__py3-none-any.whl → 5.3.0__py3-none-any.whl