ultimate-sitemap-parser 1.0.0rc1__tar.gz → 1.1.1__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/PKG-INFO +6 -5
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/pyproject.toml +42 -22
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/cli/_ls.py +1 -1
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/fetch_parse.py +34 -14
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/helpers.py +8 -7
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/objects/page.py +29 -3
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/objects/sitemap.py +4 -9
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/tree.py +5 -4
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/web_client/abstract_client.py +35 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/web_client/requests_client.py +23 -5
- ultimate_sitemap_parser-1.0.0rc1/usp/log.py +0 -77
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/LICENSE +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/NOTICE +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/README.rst +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/__init__.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/cli/__init__.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/cli/_util.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/cli/cli.py +1 -1
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/exceptions.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/objects/__init__.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/web_client/__init__.py +0 -0

{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/PKG-INFO RENAMED

@@ -1,15 +1,14 @@
-Metadata-Version: 2.
+Metadata-Version: 2.3
 Name: ultimate-sitemap-parser
-Version: 1.0.0rc1
+Version: 1.1.1
 Summary: A performant library for parsing and crawling sitemaps
-Home-page: https://ultimate-sitemap-parser.readthedocs.io/
 License: GPL-3.0-or-later
 Keywords: sitemap,crawler,indexing,xml,rss,atom,google news
 Author: Linas Valiukas
 Author-email: linas@media.mit.edu
 Maintainer: Freddy Heppell
 Maintainer-email: f.heppell@sheffield.ac.uk
-Requires-Python: >=3.8
+Requires-Python: >=3.8
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Information Technology
@@ -22,12 +21,14 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
 Classifier: Topic :: Text Processing :: Indexing
 Classifier: Topic :: Text Processing :: Markup :: XML
 Requires-Dist: python-dateutil (>=2.7,<3.0.0)
-Requires-Dist: requests (>=2.2.1)
+Requires-Dist: requests (>=2.2.1,<3.0.0)
 Project-URL: Documentation, https://ultimate-sitemap-parser.readthedocs.io/
+Project-URL: Homepage, https://ultimate-sitemap-parser.readthedocs.io/
 Project-URL: Repository, https://github.com/GateNLP/ultimate-sitemap-parser
 Description-Content-Type: text/x-rst
 

{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/pyproject.toml RENAMED

@@ -1,20 +1,36 @@
-[
+[project]
 name = "ultimate-sitemap-parser"
-version = "1.0.0rc1"
+version = "1.1.1"
 description = "A performant library for parsing and crawling sitemaps"
 authors = [
-    "Linas Valiukas
-    "Hal Roberts
-    "Freddy Heppell
+    { name = "Linas Valiukas", email = "linas@media.mit.edu"},
+    { name = "Hal Roberts", email = "hroberts@cyber.law.harvard.edu"},
+    { name = "Freddy Heppell", email = "f.heppell@sheffield.ac.uk"},
 ]
 maintainers = [
-    "Freddy Heppell
+    { name = "Freddy Heppell", email = "f.heppell@sheffield.ac.uk"},
 ]
+license = "GPL-3.0-or-later"
+readme = "README.rst"
+keywords = ["sitemap", "crawler", "indexing", "xml", "rss", "atom", "google news"]
+dynamic = ["classifiers"]
+
+requires-python = ">=3.8"
+dependencies = [
+    "python-dateutil (>=2.7,<3.0.0)",
+    "requests (>=2.2.1,<3.0.0)"
+]
+
+[project.urls]
 homepage = "https://ultimate-sitemap-parser.readthedocs.io/"
 documentation = "https://ultimate-sitemap-parser.readthedocs.io/"
 repository = "https://github.com/GateNLP/ultimate-sitemap-parser"
-
-
+
+[project.scripts]
+usp = 'usp.cli:main'
+
+[tool.poetry]
+requires-poetry = ">=2.0"
 classifiers=[
     'Development Status :: 5 - Production/Stable',
     'Intended Audience :: Developers',
@@ -26,24 +42,20 @@ classifiers=[
     'Topic :: Text Processing :: Indexing',
     'Topic :: Text Processing :: Markup :: XML',
 ]
-keywords = ["sitemap", "crawler", "indexing", "xml", "rss", "atom", "google news"]
 packages = [
     { include = "usp" }
 ]
 
-[tool.poetry.scripts]
-usp = 'usp.cli:main'
-
 [tool.poetry.dependencies]
-
-python
-requests = ">=2.2.1"
+# Specify upper bound for locking
+python = ">=3.8,<4.0"
 
 [tool.poetry.group.dev.dependencies]
 requests-mock = ">=1.6.0,<2.0"
 pytest = "^8.3.0"
-ruff = "^0.
+ruff = "^0.9.3"
 vcrpy = "6.0.1"
+pytest-mock = "^3.14.0"
 
 [tool.poetry.group.perf]
 optional = true
@@ -71,12 +83,20 @@ extend-exclude = ["docs/*"]
 
 [tool.ruff.lint]
 select = [
-    "E4",
-    "E7",
-    "E9",
-    "F",
-    "UP",
-    "PT"
+    "E4", # pycodestyle Import
+    "E7", # pycodestyle Statement
+    "E9", # pycodestyle Runtime
+    "F", # pyflakes
+    "UP", # pyupgrde
+    "PT", # flake8-pytest-style
+    "I", # isort
+    "T20", # flake8-print
+    "LOG", # flake8-logging
+]
+
+[tool.ruff.lint.per-file-ignores]
+"**/tests/*" = [
+    "T20", # Allow print in tests
 ]
 
 [tool.pytest.ini_options]

{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/fetch_parse.py RENAMED

@@ -8,49 +8,49 @@
 """
 
 import abc
+import logging
 import re
 import xml.parsers.expat
 from collections import OrderedDict
 from decimal import Decimal, InvalidOperation
-from typing import
-
+from typing import Dict, Optional, Union
 
 from .exceptions import SitemapException, SitemapXMLParsingException
 from .helpers import (
-    html_unescape_strip,
-    parse_iso8601_date,
     get_url_retry_on_client_errors,
-
+    html_unescape_strip,
     is_http_url,
+    parse_iso8601_date,
     parse_rfc2822_date,
+    ungzipped_response_content,
 )
-from .log import create_logger
 from .objects.page import (
+    SITEMAP_PAGE_DEFAULT_PRIORITY,
     SitemapImage,
-    SitemapPage,
     SitemapNewsStory,
+    SitemapPage,
     SitemapPageChangeFrequency,
-    SITEMAP_PAGE_DEFAULT_PRIORITY,
 )
 from .objects.sitemap import (
     AbstractSitemap,
-    InvalidSitemap,
     IndexRobotsTxtSitemap,
     IndexXMLSitemap,
-
-    PagesTextSitemap,
-    PagesRSSSitemap,
+    InvalidSitemap,
     PagesAtomSitemap,
+    PagesRSSSitemap,
+    PagesTextSitemap,
+    PagesXMLSitemap,
 )
 from .web_client.abstract_client import (
     AbstractWebClient,
     AbstractWebClientSuccessResponse,
+    LocalWebClient,
+    NoWebClientException,
     WebClientErrorResponse,
 )
-from .web_client.abstract_client import LocalWebClient, NoWebClientException
 from .web_client.requests_client import RequestsWebClient
 
-log =
+log = logging.getLogger(__name__)
 
 
 class SitemapFetcher:
@@ -643,6 +643,7 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
             "news_keywords",
             "news_stock_tickers",
             "images",
+            "alternates",
         ]
 
         def __init__(self):
@@ -659,6 +660,7 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
             self.news_keywords = None
             self.news_stock_tickers = None
             self.images = []
+            self.alternates = []
 
         def __hash__(self):
             return hash(
@@ -763,6 +765,10 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
                 for image in self.images
             ]
 
+            alternates = None
+            if len(self.alternates) > 0:
+                alternates = self.alternates
+
             return SitemapPage(
                 url=url,
                 last_modified=last_modified,
@@ -770,6 +776,7 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
                 priority=priority,
                 news_story=sitemap_news_story,
                 images=sitemap_images,
+                alternates=alternates,
             )
 
     __slots__ = ["_current_page", "_pages", "_page_urls", "_current_image"]
@@ -801,6 +808,19 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
                     "Page is expected to be set before <image:image>."
                 )
            self._current_image = self.Image()
+        elif name == "link":
+            if not self._current_page:
+                raise SitemapXMLParsingException(
+                    "Page is expected to be set before <link>."
+                )
+            if "rel" not in attrs or attrs["rel"] != "alternate":
+                log.warning(f"<link> element is missing rel attribute: {attrs}.")
+            elif "hreflang" not in attrs or "href" not in attrs:
+                log.warning(
+                    f"<link> element is missing hreflang or href attributes: {attrs}."
+                )
+            else:
+                self._current_page.alternates.append((attrs["hreflang"], attrs["href"]))
 
         def __require_last_char_data_to_be_set(self, name: str) -> None:
             if not self._last_char_data:
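
The new `elif name == "link":` branch is what adds hreflang alternate support: each `<xhtml:link rel="alternate" hreflang="..." href="..."/>` child of a `<url>` element contributes one (hreflang, href) pair to the current page, and malformed links are logged and skipped rather than aborting the parse. A minimal standalone sketch of that decision logic (the function and driver below are illustrative only, not part of the library):

    import logging
    from typing import Dict, List, Tuple

    log = logging.getLogger(__name__)


    def collect_alternate(attrs: Dict[str, str], alternates: List[Tuple[str, str]]) -> None:
        # Mirrors the new <link> handling: keep valid rel="alternate" links,
        # warn about (rather than fail on) malformed ones.
        if attrs.get("rel") != "alternate":
            log.warning(f"<link> element is missing rel attribute: {attrs}.")
        elif "hreflang" not in attrs or "href" not in attrs:
            log.warning(f"<link> element is missing hreflang or href attributes: {attrs}.")
        else:
            alternates.append((attrs["hreflang"], attrs["href"]))


    alternates: List[Tuple[str, str]] = []
    collect_alternate(
        {"rel": "alternate", "hreflang": "fr", "href": "https://www.example.com/fr/page"},
        alternates,
    )
    print(alternates)  # [('fr', 'https://www.example.com/fr/page')]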

{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/helpers.py RENAMED

@@ -3,24 +3,25 @@
 import datetime
 import gzip as gzip_lib
 import html
+import logging
 import re
 import sys
 import time
 from typing import Optional
-from urllib.parse import
-
+from urllib.parse import unquote_plus, urlparse, urlunparse
+
 from dateutil.parser import isoparse as dateutil_isoparse
+from dateutil.parser import parse as dateutil_parse
 
-from .exceptions import
-from .log import create_logger
+from .exceptions import GunzipException, SitemapException, StripURLToHomepageException
 from .web_client.abstract_client import (
     AbstractWebClient,
+    AbstractWebClientResponse,
     AbstractWebClientSuccessResponse,
     WebClientErrorResponse,
-    AbstractWebClientResponse,
 )
 
-log =
+log = logging.getLogger(__name__)
 
 __URL_REGEX = re.compile(r"^https?://[^\s/$.?#].[^\s]*$", re.IGNORECASE)
 """Regular expression to match HTTP(s) URLs."""
@@ -247,7 +248,7 @@ def ungzipped_response_content(
        data = gunzip(data)
    except GunzipException as ex:
        # In case of an error, just assume that it's one of the non-gzipped sitemaps with ".gz" extension
-       log.
+       log.warning(
            f"Unable to gunzip response {response}, maybe it's a non-gzipped sitemap: {ex}"
        )
 
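
helpers.py now imports both `isoparse` and `parse` from `dateutil.parser`. `isoparse` accepts only ISO 8601 strings, the format sitemap `<lastmod>` values are supposed to use; the diff only shows the import, but presumably the lenient `parse` remains the fallback for nonconforming dates. A small standalone illustration of the difference between the two dateutil functions:

    from dateutil.parser import isoparse, parse

    # Strict ISO 8601 parsing, e.g. a typical <lastmod> value:
    print(isoparse("2024-06-01T12:30:00+00:00"))   # 2024-06-01 12:30:00+00:00

    # The lenient parser also copes with non-ISO spellings:
    print(parse("01 Jun 2024 12:30 +0000"))        # 2024-06-01 12:30:00+00:00

    # ...but isoparse rejects them outright:
    try:
        isoparse("01 Jun 2024 12:30 +0000")
    except ValueError as exc:
        print(f"isoparse refused it: {exc}")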

{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/objects/page.py RENAMED

@@ -3,7 +3,7 @@
 import datetime
 from decimal import Decimal
 from enum import Enum, unique
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 SITEMAP_PAGE_DEFAULT_PRIORITY = Decimal("0.5")
 """Default sitemap page priority, as per the spec."""
@@ -331,6 +331,7 @@ class SitemapPage:
         "__change_frequency",
         "__news_story",
         "__images",
+        "__alternates",
     ]
 
     def __init__(
@@ -341,6 +342,7 @@ class SitemapPage:
         change_frequency: Optional[SitemapPageChangeFrequency] = None,
         news_story: Optional[SitemapNewsStory] = None,
         images: Optional[List[SitemapImage]] = None,
+        alternates: Optional[List[Tuple[str, str]]] = None,
     ):
         """
         Initialize a new sitemap-derived page.
@@ -357,6 +359,7 @@ class SitemapPage:
         self.__change_frequency = change_frequency
         self.__news_story = news_story
         self.__images = images
+        self.__alternates = alternates
 
     def __eq__(self, other) -> bool:
         if not isinstance(other, SitemapPage):
@@ -380,6 +383,9 @@ class SitemapPage:
         if self.images != other.images:
             return False
 
+        if self.alternates != other.alternates:
+            return False
+
         return True
 
     def __hash__(self):
@@ -442,10 +448,30 @@ class SitemapPage:
 
     @property
     def news_story(self) -> Optional[SitemapNewsStory]:
-        """Get the Google News story attached to the URL.
+        """Get the Google News story attached to the URL.
+
+        See :ref:`google-news-ext` reference
+        """
         return self.__news_story
 
     @property
     def images(self) -> Optional[List[SitemapImage]]:
-        """Get the images attached to the URL.
+        """Get the images attached to the URL.
+
+        See :ref:`google-image-ext` reference
+        """
         return self.__images
+
+    @property
+    def alternates(self) -> Optional[List[Tuple[str, str]]]:
+        """Get the alternate URLs for the URL.
+
+        A tuple of (language code, URL) for each ``<xhtml:link>`` element with ``rel="alternate"`` attribute.
+
+        See :ref:`sitemap-extra-localisation` reference
+
+        Example::
+
+            [('fr', 'https://www.example.com/fr/page'), ('de', 'https://www.example.com/de/page')]
+        """
+        return self.__alternates
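
`SitemapPage` gains an `alternates` slot, constructor argument, equality check, and read-only property, so alternate-language URLs survive into the public object model. A short sketch of how the new field behaves, based only on the constructor and property shown above:

    from usp.objects.page import SitemapPage

    page = SitemapPage(
        url="https://www.example.com/page",
        alternates=[
            ("fr", "https://www.example.com/fr/page"),
            ("de", "https://www.example.com/de/page"),
        ],
    )

    for hreflang, href in page.alternates:
        print(hreflang, href)

    # alternates take part in equality: an otherwise identical page
    # without them compares unequal.
    assert page != SitemapPage(url="https://www.example.com/page")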

{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/objects/sitemap.py RENAMED

@@ -9,11 +9,11 @@
 """
 
 import abc
-from functools import lru_cache
 import os
 import pickle
 import tempfile
-from
+from functools import lru_cache
+from typing import Iterator, List, Tuple
 
 from .page import SitemapPage
 
@@ -72,7 +72,7 @@ class AbstractSitemap(metaclass=abc.ABCMeta):
         return hash((self.url,))
 
     def __repr__(self):
-        return f"{self.__class__.__name__}(
+        return f"{self.__class__.__name__}(url={self.url})"
 
     @property
     def url(self) -> str:
@@ -167,12 +167,7 @@ class InvalidSitemap(AbstractSitemap):
         return True
 
     def __repr__(self):
-        return (
-            f"{self.__class__.__name__}("
-            f"url={self.url}, "
-            f"reason={self.reason}"
-            ")"
-        )
+        return f"{self.__class__.__name__}(url={self.url}, reason={self.reason})"
 
     def to_dict(self, with_pages=True) -> dict:
         return {

{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/tree.py RENAMED

@@ -1,19 +1,20 @@
 """Helpers to generate a sitemap tree."""
 
+import logging
 from typing import Optional
+
 from .exceptions import SitemapException
 from .fetch_parse import SitemapFetcher, SitemapStrParser
 from .helpers import is_http_url, strip_url_to_homepage
-from .log import create_logger
 from .objects.sitemap import (
     AbstractSitemap,
-    InvalidSitemap,
-    IndexWebsiteSitemap,
     IndexRobotsTxtSitemap,
+    IndexWebsiteSitemap,
+    InvalidSitemap,
 )
 from .web_client.abstract_client import AbstractWebClient
 
-log =
+log = logging.getLogger(__name__)
 
 _UNPUBLISHED_SITEMAP_PATHS = {
     "sitemap.xml",
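
tree.py keeps the same public entry point; only the logging setup and import ordering change here. For context, typical usage of the crawler, through which the new per-page fields become visible (a sketch: `sitemap_tree_for_homepage` is the library's documented entry point, and the attribute names follow the diffs above):

    from usp.tree import sitemap_tree_for_homepage

    # Discovers robots.txt and known sitemap locations, then walks the tree.
    tree = sitemap_tree_for_homepage("https://www.example.com/")

    for page in tree.all_pages():
        print(page.url, page.alternates)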

{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/web_client/abstract_client.py RENAMED

@@ -1,6 +1,8 @@
 """Abstract web client class."""
 
 import abc
+import random
+import time
 from http import HTTPStatus
 from typing import Optional
 
@@ -187,3 +189,36 @@ class LocalWebClient(AbstractWebClient):
 
     def get(self, url: str) -> AbstractWebClientResponse:
         raise NoWebClientException
+
+
+class RequestWaiter:
+    """
+    Manages waiting between requests.
+    """
+
+    def __init__(self, wait: Optional[float] = None, random_wait: bool = True):
+        """
+        :param wait: time to wait between requests, in seconds.
+        :param random_wait: if true, wait time is multiplied by a random number between 0.5 and 1.5.
+        """
+        self.wait_s = wait or 0
+        self.random_wait = random_wait
+        self.is_first = True
+
+    def wait(self) -> None:
+        """Perform a wait if needed. Should be called before each request.
+
+        Will skip wait if this is the first request.
+        """
+        if self.wait_s == 0:
+            return
+
+        if self.is_first:
+            self.is_first = False
+            return
+
+        wait_f = 1.0
+        if self.random_wait:
+            wait_f = random.uniform(0.5, 1.5)
+
+        time.sleep(self.wait_s * wait_f)
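
`RequestWaiter` centralises optional rate limiting for web clients: it does nothing when no wait is configured, skips the delay before the very first request, and otherwise sleeps for the configured interval, optionally jittered by a factor drawn from 0.5 to 1.5. A small usage sketch built directly on the class shown above (the surrounding loop and URLs are illustrative only):

    import requests

    from usp.web_client.abstract_client import RequestWaiter

    waiter = RequestWaiter(wait=2.0, random_wait=True)

    for url in ["https://www.example.com/sitemap1.xml",
                "https://www.example.com/sitemap2.xml"]:
        waiter.wait()  # no-op before the first request, roughly 1-3 s afterwards
        response = requests.get(url, timeout=30)
        print(url, response.status_code)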

{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/web_client/requests_client.py RENAMED

@@ -1,18 +1,23 @@
 """Implementation of :mod:`usp.web_client.abstract_client` with Requests."""
 
+import logging
 from http import HTTPStatus
-from typing import
+from typing import Dict, Optional, Tuple, Union
 
 import requests
 
+from usp import __version__
+
 from .abstract_client import (
+    RETRYABLE_HTTP_STATUS_CODES,
     AbstractWebClient,
     AbstractWebClientResponse,
     AbstractWebClientSuccessResponse,
+    RequestWaiter,
     WebClientErrorResponse,
-    RETRYABLE_HTTP_STATUS_CODES,
 )
-
+
+log = logging.getLogger(__name__)
 
 
 class RequestsWebClientSuccessResponse(AbstractWebClientSuccessResponse):
@@ -78,16 +83,27 @@ class RequestsWebClient(AbstractWebClient):
     Some webservers might be generating huge sitemaps on the fly, so this is why it's rather big.
     """
 
-    __slots__ = [
+    __slots__ = [
+        "__max_response_data_length",
+        "__timeout",
+        "__proxies",
+        "__verify",
+        "__waiter",
+    ]
 
-    def __init__(
+    def __init__(
+        self, verify=True, wait: Optional[float] = None, random_wait: bool = False
+    ):
         """
         :param verify: whether certificates should be verified for HTTPS requests.
+        :param wait: time to wait between requests, in seconds.
+        :param random_wait: if true, wait time is multiplied by a random number between 0.5 and 1.5.
         """
         self.__max_response_data_length = None
         self.__timeout = self.__HTTP_REQUEST_TIMEOUT
         self.__proxies = {}
         self.__verify = verify
+        self.__waiter = RequestWaiter(wait, random_wait)
 
     def set_timeout(self, timeout: Union[int, Tuple[int, int], None]) -> None:
         """Set HTTP request timeout.
@@ -114,6 +130,7 @@ class RequestsWebClient(AbstractWebClient):
         self.__max_response_data_length = max_response_data_length
 
     def get(self, url: str) -> AbstractWebClientResponse:
+        self.__waiter.wait()
         try:
             response = requests.get(
                 url,
@@ -139,6 +156,7 @@ class RequestsWebClient(AbstractWebClient):
             )
         else:
             message = f"{response.status_code} {response.reason}"
+            log.info(f"Response content: {response.text}")
 
             if response.status_code in RETRYABLE_HTTP_STATUS_CODES:
                 return RequestsWebClientErrorResponse(
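
The bundled Requests client exposes the waiter through two new constructor arguments, so polite crawling no longer needs a custom client subclass. A hedged example of wiring it into a crawl (assuming `sitemap_tree_for_homepage` accepts a `web_client` argument, as in earlier releases):

    from usp.tree import sitemap_tree_for_homepage
    from usp.web_client.requests_client import RequestsWebClient

    # Wait roughly 0.5-1.5 seconds (1 s with +/- 50 % jitter) between HTTP requests.
    client = RequestsWebClient(wait=1.0, random_wait=True)
    client.set_timeout(60)

    tree = sitemap_tree_for_homepage("https://www.example.com/", web_client=client)
    print(sum(1 for _ in tree.all_pages()), "pages found")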

ultimate_sitemap_parser-1.0.0rc1/usp/log.py DELETED

@@ -1,77 +0,0 @@
-"""Logging utilities."""
-
-import logging
-
-
-class Logger:
-    """
-    Logging helper class.
-    """
-
-    __LEVELS = {
-        "CRITICAL": logging.CRITICAL,
-        "ERROR": logging.ERROR,
-        "WARNING": logging.WARNING,
-        "INFO": logging.INFO,
-        "DEBUG": logging.DEBUG,
-    }
-    """Valid logging levels and their "logging" counterparts."""
-
-    __DEFAULT_LEVEL = "INFO"
-    """Default logging level."""
-
-    __slots__ = [
-        # "logging" object
-        "__l",
-    ]
-
-    def __init__(self, name: str):
-        """
-        Initialize logger object for a given name.
-
-        :param name: Module name that the logger should be initialized for.
-        """
-
-        self.__l = logging.getLogger(name)
-
-    def error(self, message: str) -> None:
-        """
-        Log error message.
-
-        :param message: Message to log.
-        """
-        self.__l.error(message)
-
-    def warning(self, message: str) -> None:
-        """
-        Log warning message.
-
-        :param message: Message to log.
-        """
-        self.__l.warning(message)
-
-    def info(self, message: str) -> None:
-        """
-        Log informational message.
-
-        :param message: Message to log.
-        """
-        self.__l.info(message)
-
-    def debug(self, message: str) -> None:
-        """
-        Log debugging message.
-
-        :param message: Message to log.
-        """
-        self.__l.debug(message)
-
-
-def create_logger(name: str) -> Logger:
-    """
-    Create and return Logger object.
-
-    :param name: Module name that the logger should be initialized for.
-    :return: Logger object.
-    """
-    return Logger(name=name)
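
With usp/log.py gone, every module now logs through the standard `logging` library (the `log = logging.getLogger(__name__)` lines above), so applications control verbosity with ordinary logging configuration rather than a USP-specific wrapper. For example, to surface the library's warnings about malformed `<link>` elements or failed gunzip attempts:

    import logging

    # Route all loggers, including the usp.* hierarchy, to stderr at INFO level...
    logging.basicConfig(level=logging.INFO)

    # ...or adjust just the library, e.g. silence it below WARNING:
    logging.getLogger("usp").setLevel(logging.WARNING)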

The remaining files (LICENSE, NOTICE, README.rst, usp/__init__.py, usp/cli/__init__.py, usp/cli/_util.py, usp/exceptions.py, usp/objects/__init__.py, usp/web_client/__init__.py) were renamed without content changes.