ultimate-sitemap-parser 1.1.0__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ultimate-sitemap-parser might be problematic.
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/PKG-INFO +1 -1
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/pyproject.toml +16 -8
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/cli/_ls.py +1 -1
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/fetch_parse.py +14 -14
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/helpers.py +8 -7
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/objects/sitemap.py +4 -9
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/tree.py +10 -5
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/web_client/requests_client.py +16 -7
- ultimate_sitemap_parser-1.1.0/usp/log.py +0 -77
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/LICENSE +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/NOTICE +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/README.rst +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/__init__.py +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/cli/__init__.py +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/cli/_util.py +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/cli/cli.py +1 -1
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/exceptions.py +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/objects/__init__.py +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/objects/page.py +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/web_client/__init__.py +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/web_client/abstract_client.py +1 -1
{ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [project]
 name = "ultimate-sitemap-parser"
-version = "1.1.0"
+version = "1.2.0"
 description = "A performant library for parsing and crawling sitemaps"
 authors = [
     { name = "Linas Valiukas", email = "linas@media.mit.edu"},
@@ -53,7 +53,7 @@ python = ">=3.8,<4.0"
 [tool.poetry.group.dev.dependencies]
 requests-mock = ">=1.6.0,<2.0"
 pytest = "^8.3.0"
-ruff = "^0.…
+ruff = "^0.9.3"
 vcrpy = "6.0.1"
 pytest-mock = "^3.14.0"
 
@@ -83,12 +83,20 @@ extend-exclude = ["docs/*"]
 
 [tool.ruff.lint]
 select = [
-    "E4",
-    "E7",
-    "E9",
-    "F",
-    "UP",
-    "PT"
+    "E4",  # pycodestyle Import
+    "E7",  # pycodestyle Statement
+    "E9",  # pycodestyle Runtime
+    "F",   # pyflakes
+    "UP",  # pyupgrade
+    "PT",  # flake8-pytest-style
+    "I",   # isort
+    "T20", # flake8-print
+    "LOG", # flake8-logging
+]
+
+[tool.ruff.lint.per-file-ignores]
+"**/tests/*" = [
+    "T20", # Allow print in tests
 ]
 
 [tool.pytest.ini_options]
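The expanded Ruff `select` list adds import sorting (`I`), print detection (`T20`), and logging checks (`LOG`), which accounts for the reordered imports and logging changes throughout the rest of this diff. A hypothetical snippet showing code the new rules would flag:

import logging

log = logging.getLogger(__name__)


def fetch(url: str) -> None:
    print(f"fetching {url}")  # flagged by flake8-print (T201)
    log.log(logging.WARN, "slow response")  # flagged by flake8-logging: WARN is an undocumented alias of WARNING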
{ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/fetch_parse.py
RENAMED
@@ -8,49 +8,49 @@
 """
 
 import abc
+import logging
 import re
 import xml.parsers.expat
 from collections import OrderedDict
 from decimal import Decimal, InvalidOperation
-from typing import …
-
+from typing import Dict, Optional, Union
 
 from .exceptions import SitemapException, SitemapXMLParsingException
 from .helpers import (
-    html_unescape_strip,
-    parse_iso8601_date,
     get_url_retry_on_client_errors,
-    ungzipped_response_content,
+    html_unescape_strip,
     is_http_url,
+    parse_iso8601_date,
     parse_rfc2822_date,
+    ungzipped_response_content,
 )
-from .log import create_logger
 from .objects.page import (
+    SITEMAP_PAGE_DEFAULT_PRIORITY,
     SitemapImage,
-    SitemapPage,
     SitemapNewsStory,
+    SitemapPage,
     SitemapPageChangeFrequency,
-    SITEMAP_PAGE_DEFAULT_PRIORITY,
 )
 from .objects.sitemap import (
     AbstractSitemap,
-    InvalidSitemap,
     IndexRobotsTxtSitemap,
     IndexXMLSitemap,
-    PagesXMLSitemap,
-    PagesTextSitemap,
-    PagesRSSSitemap,
+    InvalidSitemap,
     PagesAtomSitemap,
+    PagesRSSSitemap,
+    PagesTextSitemap,
+    PagesXMLSitemap,
 )
 from .web_client.abstract_client import (
     AbstractWebClient,
     AbstractWebClientSuccessResponse,
+    LocalWebClient,
+    NoWebClientException,
     WebClientErrorResponse,
 )
-from .web_client.abstract_client import LocalWebClient, NoWebClientException
 from .web_client.requests_client import RequestsWebClient
 
-log = create_logger(__name__)
+log = logging.getLogger(__name__)
 
 
 class SitemapFetcher:
{ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/helpers.py
RENAMED
@@ -3,24 +3,25 @@
 import datetime
 import gzip as gzip_lib
 import html
+import logging
 import re
 import sys
 import time
 from typing import Optional
-from urllib.parse import …
-
+from urllib.parse import unquote_plus, urlparse, urlunparse
+
 from dateutil.parser import isoparse as dateutil_isoparse
+from dateutil.parser import parse as dateutil_parse
 
-from .exceptions import …
-from .log import create_logger
+from .exceptions import GunzipException, SitemapException, StripURLToHomepageException
 from .web_client.abstract_client import (
     AbstractWebClient,
+    AbstractWebClientResponse,
     AbstractWebClientSuccessResponse,
     WebClientErrorResponse,
-    AbstractWebClientResponse,
 )
 
-log = create_logger(__name__)
+log = logging.getLogger(__name__)
 
 __URL_REGEX = re.compile(r"^https?://[^\s/$.?#].[^\s]*$", re.IGNORECASE)
 """Regular expression to match HTTP(s) URLs."""
@@ -247,7 +248,7 @@ def ungzipped_response_content(
         data = gunzip(data)
     except GunzipException as ex:
         # In case of an error, just assume that it's one of the non-gzipped sitemaps with ".gz" extension
-        log.…
+        log.warning(
             f"Unable to gunzip response {response}, maybe it's a non-gzipped sitemap: {ex}"
         )
 
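The fallback above reflects a quirk of real-world sitemaps: some servers publish plain XML under a `.gz` URL, so a failed gunzip is treated as "already uncompressed" rather than as a fatal error. A standalone sketch of the same idea using only the standard library (`gunzip_or_passthrough` is an illustrative name, not the library's API):

import gzip


def gunzip_or_passthrough(data: bytes) -> bytes:
    """Return gunzipped bytes, or the input unchanged if it is not gzip data."""
    try:
        return gzip.decompress(data)
    except OSError:  # gzip.BadGzipFile is a subclass of OSError
        return data


assert gunzip_or_passthrough(gzip.compress(b"<urlset/>")) == b"<urlset/>"
assert gunzip_or_passthrough(b"<urlset/>") == b"<urlset/>"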
{ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/objects/sitemap.py
RENAMED
@@ -9,11 +9,11 @@
 """
 
 import abc
-from functools import lru_cache
 import os
 import pickle
 import tempfile
-from …
+from functools import lru_cache
+from typing import Iterator, List, Tuple
 
 from .page import SitemapPage
 
@@ -72,7 +72,7 @@ class AbstractSitemap(metaclass=abc.ABCMeta):
         return hash((self.url,))
 
     def __repr__(self):
-        return f"{self.__class__.__name__}(…
+        return f"{self.__class__.__name__}(url={self.url})"
 
     @property
     def url(self) -> str:
@@ -167,12 +167,7 @@ class InvalidSitemap(AbstractSitemap):
         return True
 
     def __repr__(self):
-        return (
-            f"{self.__class__.__name__}("
-            f"url={self.url}, "
-            f"reason={self.reason}"
-            ")"
-        )
+        return f"{self.__class__.__name__}(url={self.url}, reason={self.reason})"
 
     def to_dict(self, with_pages=True) -> dict:
         return {
{ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/tree.py
RENAMED
@@ -1,19 +1,20 @@
 """Helpers to generate a sitemap tree."""
 
+import logging
 from typing import Optional
+
 from .exceptions import SitemapException
 from .fetch_parse import SitemapFetcher, SitemapStrParser
 from .helpers import is_http_url, strip_url_to_homepage
-from .log import create_logger
 from .objects.sitemap import (
     AbstractSitemap,
-    InvalidSitemap,
-    IndexWebsiteSitemap,
     IndexRobotsTxtSitemap,
+    IndexWebsiteSitemap,
+    InvalidSitemap,
 )
 from .web_client.abstract_client import AbstractWebClient
 
-log = create_logger(__name__)
+log = logging.getLogger(__name__)
 
 _UNPUBLISHED_SITEMAP_PATHS = {
     "sitemap.xml",
@@ -39,6 +40,7 @@ def sitemap_tree_for_homepage(
     web_client: Optional[AbstractWebClient] = None,
     use_robots: bool = True,
     use_known_paths: bool = True,
+    extra_known_paths: Optional[set] = None,
 ) -> AbstractSitemap:
     """
     Using a homepage URL, fetch the tree of sitemaps and pages listed in them.
@@ -48,12 +50,15 @@
         If ``None``, a :class:`~.RequestsWebClient` will be used.
     :param use_robots: Whether to discover sitemaps through robots.txt.
     :param use_known_paths: Whether to discover sitemaps through common known paths.
+    :param extra_known_paths: Extra paths to check for sitemaps.
     :return: Root sitemap object of the fetched sitemap tree.
     """
 
     if not is_http_url(homepage_url):
         raise SitemapException(f"URL {homepage_url} is not a HTTP(s) URL.")
 
+    extra_known_paths = extra_known_paths or set()
+
     stripped_homepage_url = strip_url_to_homepage(url=homepage_url)
     if homepage_url != stripped_homepage_url:
         log.warning(
@@ -81,7 +86,7 @@ def sitemap_tree_for_homepage(
             sitemap_urls_found_in_robots_txt.add(sub_sitemap.url)
 
     if use_known_paths:
-        for unpublished_sitemap_path in _UNPUBLISHED_SITEMAP_PATHS:
+        for unpublished_sitemap_path in _UNPUBLISHED_SITEMAP_PATHS | extra_known_paths:
            unpublished_sitemap_url = homepage_url + unpublished_sitemap_path

            # Don't refetch URLs already found in robots.txt
{ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/web_client/requests_client.py
RENAMED
@@ -1,20 +1,23 @@
 """Implementation of :mod:`usp.web_client.abstract_client` with Requests."""
 
-from http import HTTPStatus
 import logging
-from …
+from http import HTTPStatus
+from typing import Dict, Optional, Tuple, Union
 
 import requests
 
+from usp import __version__
+
 from .abstract_client import (
+    RETRYABLE_HTTP_STATUS_CODES,
     AbstractWebClient,
     AbstractWebClientResponse,
     AbstractWebClientSuccessResponse,
     RequestWaiter,
     WebClientErrorResponse,
-    RETRYABLE_HTTP_STATUS_CODES,
 )
-
+
+log = logging.getLogger(__name__)
 
 
 class RequestsWebClientSuccessResponse(AbstractWebClientSuccessResponse):
@@ -89,18 +92,24 @@ class RequestsWebClient(AbstractWebClient):
     ]
 
     def __init__(
-        self, …
+        self,
+        verify=True,
+        wait: Optional[float] = None,
+        random_wait: bool = False,
+        session: Optional[requests.Session] = None,
     ):
         """
         :param verify: whether certificates should be verified for HTTPS requests.
         :param wait: time to wait between requests, in seconds.
         :param random_wait: if true, wait time is multiplied by a random number between 0.5 and 1.5.
+        :param session: a custom session object to use, or None to create a new one.
         """
         self.__max_response_data_length = None
         self.__timeout = self.__HTTP_REQUEST_TIMEOUT
         self.__proxies = {}
         self.__verify = verify
         self.__waiter = RequestWaiter(wait, random_wait)
+        self.__session = session or requests.Session()
 
     def set_timeout(self, timeout: Union[int, Tuple[int, int], None]) -> None:
         """Set HTTP request timeout.
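The new `session` parameter lets callers supply a preconfigured `requests.Session`, e.g. to share a connection pool or set custom headers. A minimal sketch (the User-Agent value is illustrative):

import requests

from usp.tree import sitemap_tree_for_homepage
from usp.web_client.requests_client import RequestsWebClient

session = requests.Session()
session.headers["User-Agent"] = "my-crawler/1.0"  # illustrative value

client = RequestsWebClient(session=session)
tree = sitemap_tree_for_homepage("https://example.com/", web_client=client)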
@@ -129,7 +138,7 @@ class RequestsWebClient(AbstractWebClient):
     def get(self, url: str) -> AbstractWebClientResponse:
         self.__waiter.wait()
         try:
-            response = …
+            response = self.__session.get(
                 url,
                 timeout=self.__timeout,
                 stream=True,
@@ -153,7 +162,7 @@ class RequestsWebClient(AbstractWebClient):
             )
         else:
             message = f"{response.status_code} {response.reason}"
-
+            log.info(f"Response content: {response.text}")
 
             if response.status_code in RETRYABLE_HTTP_STATUS_CODES:
                 return RequestsWebClientErrorResponse(
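Because the client now logs through a standard module-level logger, applications can tune this output per module. For example, the INFO-level response-content messages above could be silenced like this (a sketch; the logger name follows from `logging.getLogger(__name__)` in this module):

import logging

# Module loggers follow the package hierarchy, so the requests
# client's logger is named after its module path.
logging.getLogger("usp.web_client.requests_client").setLevel(logging.WARNING)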
ultimate_sitemap_parser-1.1.0/usp/log.py
DELETED
@@ -1,77 +0,0 @@
-"""Logging utilities."""
-
-import logging
-
-
-class Logger:
-    """
-    Logging helper class.
-    """
-
-    __LEVELS = {
-        "CRITICAL": logging.CRITICAL,
-        "ERROR": logging.ERROR,
-        "WARNING": logging.WARNING,
-        "INFO": logging.INFO,
-        "DEBUG": logging.DEBUG,
-    }
-    """Valid logging levels and their "logging" counterparts."""
-
-    __DEFAULT_LEVEL = "INFO"
-    """Default logging level."""
-
-    __slots__ = [
-        # "logging" object
-        "__l",
-    ]
-
-    def __init__(self, name: str):
-        """
-        Initialize logger object for a given name.
-
-        :param name: Module name that the logger should be initialized for.
-        """
-
-        self.__l = logging.getLogger(name)
-
-    def error(self, message: str) -> None:
-        """
-        Log error message.
-
-        :param message: Message to log.
-        """
-        self.__l.error(message)
-
-    def warning(self, message: str) -> None:
-        """
-        Log warning message.
-
-        :param message: Message to log.
-        """
-        self.__l.warning(message)
-
-    def info(self, message: str) -> None:
-        """
-        Log informational message.
-
-        :param message: Message to log.
-        """
-        self.__l.info(message)
-
-    def debug(self, message: str) -> None:
-        """
-        Log debugging message.
-
-        :param message: Message to log.
-        """
-        self.__l.debug(message)
-
-
-def create_logger(name: str) -> Logger:
-    """
-    Create and return Logger object.
-
-    :param name: Module name that the logger should be initialized for.
-    :return: Logger object.
-    """
-    return Logger(name=name)
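With the custom `Logger` wrapper removed, all usp modules log via `logging.getLogger(__name__)`, so output is controlled entirely by the host application's logging configuration. A minimal sketch for enabling debug output from the library:

import logging

# Standard library configuration now applies to all usp loggers,
# which live under the "usp" namespace.
logging.basicConfig(level=logging.WARNING)
logging.getLogger("usp").setLevel(logging.DEBUG)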