ultimate-sitemap-parser 1.0.0rc1__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ultimate-sitemap-parser might be problematic.

Files changed (21):
  1. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/PKG-INFO +6 -5
  2. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/pyproject.toml +27 -15
  3. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/fetch_parse.py +20 -0
  4. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/objects/page.py +29 -3
  5. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/web_client/abstract_client.py +35 -0
  6. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/web_client/requests_client.py +17 -2
  7. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/LICENSE +0 -0
  8. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/NOTICE +0 -0
  9. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/README.rst +0 -0
  10. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/__init__.py +0 -0
  11. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/cli/__init__.py +0 -0
  12. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/cli/_ls.py +0 -0
  13. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/cli/_util.py +0 -0
  14. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/cli/cli.py +0 -0
  15. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/exceptions.py +0 -0
  16. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/helpers.py +0 -0
  17. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/log.py +0 -0
  18. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/objects/__init__.py +0 -0
  19. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/objects/sitemap.py +0 -0
  20. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/tree.py +0 -0
  21. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/web_client/__init__.py +0 -0
PKG-INFO

@@ -1,15 +1,14 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.3
 Name: ultimate-sitemap-parser
-Version: 1.0.0rc1
+Version: 1.1.0
 Summary: A performant library for parsing and crawling sitemaps
-Home-page: https://ultimate-sitemap-parser.readthedocs.io/
 License: GPL-3.0-or-later
 Keywords: sitemap,crawler,indexing,xml,rss,atom,google news
 Author: Linas Valiukas
 Author-email: linas@media.mit.edu
 Maintainer: Freddy Heppell
 Maintainer-email: f.heppell@sheffield.ac.uk
-Requires-Python: >=3.8,<4.0
+Requires-Python: >=3.8
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Information Technology
@@ -22,12 +21,14 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
 Classifier: Topic :: Text Processing :: Indexing
 Classifier: Topic :: Text Processing :: Markup :: XML
 Requires-Dist: python-dateutil (>=2.7,<3.0.0)
-Requires-Dist: requests (>=2.2.1)
+Requires-Dist: requests (>=2.2.1,<3.0.0)
 Project-URL: Documentation, https://ultimate-sitemap-parser.readthedocs.io/
+Project-URL: Homepage, https://ultimate-sitemap-parser.readthedocs.io/
 Project-URL: Repository, https://github.com/GateNLP/ultimate-sitemap-parser
 Description-Content-Type: text/x-rst
 
pyproject.toml

@@ -1,20 +1,36 @@
-[tool.poetry]
+[project]
 name = "ultimate-sitemap-parser"
-version = "1.0.0rc1"
+version = "1.1.0"
 description = "A performant library for parsing and crawling sitemaps"
 authors = [
-    "Linas Valiukas <linas@media.mit.edu>",
-    "Hal Roberts <hroberts@cyber.law.harvard.edu>",
-    "Freddy Heppell <f.heppell@sheffield.ac.uk>"
+    { name = "Linas Valiukas", email = "linas@media.mit.edu"},
+    { name = "Hal Roberts", email = "hroberts@cyber.law.harvard.edu"},
+    { name = "Freddy Heppell", email = "f.heppell@sheffield.ac.uk"},
 ]
 maintainers = [
-    "Freddy Heppell <f.heppell@sheffield.ac.uk>"
+    { name = "Freddy Heppell", email = "f.heppell@sheffield.ac.uk"},
+]
+license = "GPL-3.0-or-later"
+readme = "README.rst"
+keywords = ["sitemap", "crawler", "indexing", "xml", "rss", "atom", "google news"]
+dynamic = ["classifiers"]
+
+requires-python = ">=3.8"
+dependencies = [
+    "python-dateutil (>=2.7,<3.0.0)",
+    "requests (>=2.2.1,<3.0.0)"
 ]
+
+[project.urls]
 homepage = "https://ultimate-sitemap-parser.readthedocs.io/"
 documentation = "https://ultimate-sitemap-parser.readthedocs.io/"
 repository = "https://github.com/GateNLP/ultimate-sitemap-parser"
-license = "GPL-3.0-or-later"
-readme = "README.rst"
+
+[project.scripts]
+usp = 'usp.cli:main'
+
+[tool.poetry]
+requires-poetry = ">=2.0"
 classifiers=[
     'Development Status :: 5 - Production/Stable',
     'Intended Audience :: Developers',
@@ -26,24 +42,20 @@ classifiers=[
     'Topic :: Text Processing :: Indexing',
     'Topic :: Text Processing :: Markup :: XML',
 ]
-keywords = ["sitemap", "crawler", "indexing", "xml", "rss", "atom", "google news"]
 packages = [
     { include = "usp" }
 ]
 
-[tool.poetry.scripts]
-usp = 'usp.cli:main'
-
 [tool.poetry.dependencies]
-python = "^3.8"
-python-dateutil = ">=2.7,<3.0.0"
-requests = ">=2.2.1"
+# Specify upper bound for locking
+python = ">=3.8,<4.0"
 
 [tool.poetry.group.dev.dependencies]
 requests-mock = ">=1.6.0,<2.0"
 pytest = "^8.3.0"
 ruff = "^0.6.1"
 vcrpy = "6.0.1"
+pytest-mock = "^3.14.0"
 
 [tool.poetry.group.perf]
 optional = true
usp/fetch_parse.py

@@ -643,6 +643,7 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
             "news_keywords",
             "news_stock_tickers",
             "images",
+            "alternates",
         ]
 
         def __init__(self):
@@ -659,6 +660,7 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
             self.news_keywords = None
             self.news_stock_tickers = None
             self.images = []
+            self.alternates = []
 
         def __hash__(self):
             return hash(
@@ -763,6 +765,10 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
                 for image in self.images
             ]
 
+            alternates = None
+            if len(self.alternates) > 0:
+                alternates = self.alternates
+
             return SitemapPage(
                 url=url,
                 last_modified=last_modified,
@@ -770,6 +776,7 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
                 priority=priority,
                 news_story=sitemap_news_story,
                 images=sitemap_images,
+                alternates=alternates,
             )
 
     __slots__ = ["_current_page", "_pages", "_page_urls", "_current_image"]
@@ -801,6 +808,19 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
                     "Page is expected to be set before <image:image>."
                 )
            self._current_image = self.Image()
+        elif name == "link":
+            if not self._current_page:
+                raise SitemapXMLParsingException(
+                    "Page is expected to be set before <link>."
+                )
+            if "rel" not in attrs or attrs["rel"] != "alternate":
+                log.warning(f"<link> element is missing rel attribute: {attrs}.")
+            elif "hreflang" not in attrs or "href" not in attrs:
+                log.warning(
+                    f"<link> element is missing hreflang or href attributes: {attrs}."
+                )
+            else:
+                self._current_page.alternates.append((attrs["hreflang"], attrs["href"]))
 
     def __require_last_char_data_to_be_set(self, name: str) -> None:
         if not self._last_char_data:
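
The new <link> handling above collects xhtml:link alternate elements into (hreflang, href) tuples on the current page. A minimal sketch of the markup it now recognises and the tuples it should yield (URLs and language codes are made-up placeholders):

    # One <url> entry of the kind the updated parser now handles.
    SITEMAP_ENTRY = """
    <url>
      <loc>https://www.example.com/page</loc>
      <xhtml:link rel="alternate" hreflang="fr" href="https://www.example.com/fr/page"/>
      <xhtml:link rel="alternate" hreflang="de" href="https://www.example.com/de/page"/>
    </url>
    """

    # Each qualifying <xhtml:link> becomes a (hreflang, href) tuple on the page:
    expected_alternates = [
        ("fr", "https://www.example.com/fr/page"),
        ("de", "https://www.example.com/de/page"),
    ]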
usp/objects/page.py

@@ -3,7 +3,7 @@
 import datetime
 from decimal import Decimal
 from enum import Enum, unique
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 SITEMAP_PAGE_DEFAULT_PRIORITY = Decimal("0.5")
 """Default sitemap page priority, as per the spec."""
@@ -331,6 +331,7 @@ class SitemapPage:
         "__change_frequency",
         "__news_story",
         "__images",
+        "__alternates",
     ]
 
     def __init__(
@@ -341,6 +342,7 @@
         change_frequency: Optional[SitemapPageChangeFrequency] = None,
         news_story: Optional[SitemapNewsStory] = None,
         images: Optional[List[SitemapImage]] = None,
+        alternates: Optional[List[Tuple[str, str]]] = None,
     ):
         """
         Initialize a new sitemap-derived page.
@@ -357,6 +359,7 @@
         self.__change_frequency = change_frequency
         self.__news_story = news_story
         self.__images = images
+        self.__alternates = alternates
 
     def __eq__(self, other) -> bool:
         if not isinstance(other, SitemapPage):
@@ -380,6 +383,9 @@
         if self.images != other.images:
             return False
 
+        if self.alternates != other.alternates:
+            return False
+
         return True
 
     def __hash__(self):
@@ -442,10 +448,30 @@
 
     @property
     def news_story(self) -> Optional[SitemapNewsStory]:
-        """Get the Google News story attached to the URL."""
+        """Get the Google News story attached to the URL.
+
+        See :ref:`google-news-ext` reference
+        """
         return self.__news_story
 
     @property
     def images(self) -> Optional[List[SitemapImage]]:
-        """Get the images attached to the URL."""
+        """Get the images attached to the URL.
+
+        See :ref:`google-image-ext` reference
+        """
         return self.__images
+
+    @property
+    def alternates(self) -> Optional[List[Tuple[str, str]]]:
+        """Get the alternate URLs for the URL.
+
+        A tuple of (language code, URL) for each ``<xhtml:link>`` element with ``rel="alternate"`` attribute.
+
+        See :ref:`sitemap-extra-localisation` reference
+
+        Example::
+
+            [('fr', 'https://www.example.com/fr/page'), ('de', 'https://www.example.com/de/page')]
+        """
+        return self.__alternates
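
A short usage sketch for the new alternates property, assuming the library's usual sitemap_tree_for_homepage entry point (the homepage URL is a placeholder):

    from usp.tree import sitemap_tree_for_homepage

    tree = sitemap_tree_for_homepage("https://www.example.com/")
    for page in tree.all_pages():
        # alternates is None when the entry had no <xhtml:link rel="alternate"> elements.
        if page.alternates:
            for hreflang, href in page.alternates:
                print(f"{page.url} -> {href} ({hreflang})")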
usp/web_client/abstract_client.py

@@ -1,7 +1,9 @@
 """Abstract web client class."""
 
 import abc
+import random
 from http import HTTPStatus
+import time
 from typing import Optional
 
 RETRYABLE_HTTP_STATUS_CODES = {
@@ -187,3 +189,36 @@ class LocalWebClient(AbstractWebClient):
 
     def get(self, url: str) -> AbstractWebClientResponse:
         raise NoWebClientException
+
+
+class RequestWaiter:
+    """
+    Manages waiting between requests.
+    """
+
+    def __init__(self, wait: Optional[float] = None, random_wait: bool = True):
+        """
+        :param wait: time to wait between requests, in seconds.
+        :param random_wait: if true, wait time is multiplied by a random number between 0.5 and 1.5.
+        """
+        self.wait_s = wait or 0
+        self.random_wait = random_wait
+        self.is_first = True
+
+    def wait(self) -> None:
+        """Perform a wait if needed. Should be called before each request.
+
+        Will skip wait if this is the first request.
+        """
+        if self.wait_s == 0:
+            return
+
+        if self.is_first:
+            self.is_first = False
+            return
+
+        wait_f = 1.0
+        if self.random_wait:
+            wait_f = random.uniform(0.5, 1.5)
+
+        time.sleep(self.wait_s * wait_f)
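
The RequestWaiter added here centralises politeness delays for web clients: the first call to wait() returns immediately, and later calls sleep for roughly the configured interval, optionally jittered. An illustrative sketch (the sitemap URLs are placeholders):

    from usp.web_client.abstract_client import RequestWaiter

    waiter = RequestWaiter(wait=1.0, random_wait=True)

    for url in ("https://example.com/sitemap1.xml", "https://example.com/sitemap2.xml"):
        # No delay before the first request; roughly 0.5-1.5 s before each later one.
        waiter.wait()
        # ... fetch `url` with whatever HTTP client the caller uses ...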
usp/web_client/requests_client.py

@@ -1,6 +1,7 @@
 """Implementation of :mod:`usp.web_client.abstract_client` with Requests."""
 
 from http import HTTPStatus
+import logging
 from typing import Optional, Dict, Tuple, Union
 
 import requests
@@ -9,6 +10,7 @@ from .abstract_client import (
     AbstractWebClient,
     AbstractWebClientResponse,
     AbstractWebClientSuccessResponse,
+    RequestWaiter,
     WebClientErrorResponse,
     RETRYABLE_HTTP_STATUS_CODES,
 )
@@ -78,16 +80,27 @@ class RequestsWebClient(AbstractWebClient):
     Some webservers might be generating huge sitemaps on the fly, so this is why it's rather big.
     """
 
-    __slots__ = ["__max_response_data_length", "__timeout", "__proxies", "__verify"]
+    __slots__ = [
+        "__max_response_data_length",
+        "__timeout",
+        "__proxies",
+        "__verify",
+        "__waiter",
+    ]
 
-    def __init__(self, verify=True):
+    def __init__(
+        self, verify=True, wait: Optional[float] = None, random_wait: bool = False
+    ):
         """
         :param verify: whether certificates should be verified for HTTPS requests.
+        :param wait: time to wait between requests, in seconds.
+        :param random_wait: if true, wait time is multiplied by a random number between 0.5 and 1.5.
         """
         self.__max_response_data_length = None
         self.__timeout = self.__HTTP_REQUEST_TIMEOUT
         self.__proxies = {}
         self.__verify = verify
+        self.__waiter = RequestWaiter(wait, random_wait)
 
     def set_timeout(self, timeout: Union[int, Tuple[int, int], None]) -> None:
         """Set HTTP request timeout.
@@ -114,6 +127,7 @@
         self.__max_response_data_length = max_response_data_length
 
     def get(self, url: str) -> AbstractWebClientResponse:
+        self.__waiter.wait()
         try:
             response = requests.get(
                 url,
@@ -139,6 +153,7 @@
             )
         else:
             message = f"{response.status_code} {response.reason}"
+            logging.info(f"Response content: {response.text}")
 
         if response.status_code in RETRYABLE_HTTP_STATUS_CODES:
             return RequestsWebClientErrorResponse(
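
With the constructor changes above, the bundled Requests client can be asked to pause between fetches. A hedged sketch, assuming sitemap_tree_for_homepage accepts a custom web_client argument (the URL is a placeholder):

    from usp.tree import sitemap_tree_for_homepage
    from usp.web_client.requests_client import RequestsWebClient

    # Wait about one second between requests, scaled by a random factor in [0.5, 1.5].
    client = RequestsWebClient(wait=1.0, random_wait=True)
    tree = sitemap_tree_for_homepage("https://www.example.com/", web_client=client)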