ultimate-sitemap-parser 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of ultimate-sitemap-parser has been flagged as potentially problematic.

ultimate_sitemap_parser-1.1.1.dist-info/METADATA → ultimate_sitemap_parser-1.2.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ultimate-sitemap-parser
-Version: 1.1.1
+Version: 1.2.0
 Summary: A performant library for parsing and crawling sitemaps
 License: GPL-3.0-or-later
 Keywords: sitemap,crawler,indexing,xml,rss,atom,google news

ultimate_sitemap_parser-1.1.1.dist-info/RECORD → ultimate_sitemap_parser-1.2.0.dist-info/RECORD CHANGED
@@ -9,13 +9,13 @@ usp/helpers.py,sha256=S9d8fEhHzZqVCx3SkcWVTgW1JYKujH-tM86urjORNWA,8482
 usp/objects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 usp/objects/page.py,sha256=vz2QXC9Z3E65Cxf68tKfQkubIc_OB0m6pNYH146Qx_8,14253
 usp/objects/sitemap.py,sha256=yt5qe6fyKfmvJmV60mB8kc7yooGcpYhuIcNlmUqFGFA,11486
-usp/tree.py,sha256=AmK0TptwNAexwSBAjrziYvx9cueQDMt5w9_1m8d4edI,4055
+usp/tree.py,sha256=pwSTp1Zok4evzrNFavP-hh5i9xGGzObj_sKUqjk72UU,4237
 usp/web_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 usp/web_client/abstract_client.py,sha256=7MpIfqQpi1_yojEmuReT8iy9kFUWCD3i2LMpHmBOwV0,6291
-usp/web_client/requests_client.py,sha256=xxkBUHvakBN-Guw_DqGElZJVS42xgUwWHxM7jA_QEPI,5593
-ultimate_sitemap_parser-1.1.1.dist-info/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
-ultimate_sitemap_parser-1.1.1.dist-info/METADATA,sha256=9sK5LCSHHPuSvdDjakIqOm-Gv3_Lgm1tsZdDDFs8vSE,4447
-ultimate_sitemap_parser-1.1.1.dist-info/NOTICE,sha256=3ANZA5R9rYnCOnUoroGfFUOZ__ww_yG01NUAx0X6J7E,632
-ultimate_sitemap_parser-1.1.1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
-ultimate_sitemap_parser-1.1.1.dist-info/entry_points.txt,sha256=v60w5WzqYlPOucntZUy0ydzlYwuAPSwoQY0KdT5ragQ,36
-ultimate_sitemap_parser-1.1.1.dist-info/RECORD,,
+usp/web_client/requests_client.py,sha256=1nyXXBxiapDNN5jNpCAXRL5rgjptK4oKvaJhV5nhLsA,5816
+ultimate_sitemap_parser-1.2.0.dist-info/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
+ultimate_sitemap_parser-1.2.0.dist-info/METADATA,sha256=46wVZspA5eUgbXefu2Fu7xtE03TbFsgjEwLL5BT-mj0,4447
+ultimate_sitemap_parser-1.2.0.dist-info/NOTICE,sha256=3ANZA5R9rYnCOnUoroGfFUOZ__ww_yG01NUAx0X6J7E,632
+ultimate_sitemap_parser-1.2.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+ultimate_sitemap_parser-1.2.0.dist-info/entry_points.txt,sha256=v60w5WzqYlPOucntZUy0ydzlYwuAPSwoQY0KdT5ragQ,36
+ultimate_sitemap_parser-1.2.0.dist-info/RECORD,,
usp/tree.py CHANGED
@@ -40,6 +40,7 @@ def sitemap_tree_for_homepage(
     web_client: Optional[AbstractWebClient] = None,
     use_robots: bool = True,
     use_known_paths: bool = True,
+    extra_known_paths: Optional[set] = None,
 ) -> AbstractSitemap:
     """
     Using a homepage URL, fetch the tree of sitemaps and pages listed in them.
@@ -49,12 +50,15 @@ def sitemap_tree_for_homepage(
         If ``None``, a :class:`~.RequestsWebClient` will be used.
     :param use_robots: Whether to discover sitemaps through robots.txt.
     :param use_known_paths: Whether to discover sitemaps through common known paths.
+    :param extra_known_paths: Extra paths to check for sitemaps.
     :return: Root sitemap object of the fetched sitemap tree.
     """
 
     if not is_http_url(homepage_url):
         raise SitemapException(f"URL {homepage_url} is not a HTTP(s) URL.")
 
+    extra_known_paths = extra_known_paths or set()
+
     stripped_homepage_url = strip_url_to_homepage(url=homepage_url)
     if homepage_url != stripped_homepage_url:
         log.warning(
@@ -82,7 +86,7 @@ def sitemap_tree_for_homepage(
             sitemap_urls_found_in_robots_txt.add(sub_sitemap.url)
 
     if use_known_paths:
-        for unpublished_sitemap_path in _UNPUBLISHED_SITEMAP_PATHS:
+        for unpublished_sitemap_path in _UNPUBLISHED_SITEMAP_PATHS | extra_known_paths:
             unpublished_sitemap_url = homepage_url + unpublished_sitemap_path
 
             # Don't refetch URLs already found in robots.txt
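
The ``extra_known_paths`` argument added above lets callers supply site-specific sitemap paths that are probed alongside the library's built-in known paths when ``use_known_paths`` is enabled. A minimal usage sketch: the path ``sitemap-news.xml`` is only an illustrative value, and per the loop above each path is concatenated directly onto the homepage URL.

    from usp.tree import sitemap_tree_for_homepage

    # Probe a site-specific sitemap location in addition to the defaults.
    # "sitemap-news.xml" is a hypothetical example path.
    tree = sitemap_tree_for_homepage(
        "https://example.com/",
        extra_known_paths={"sitemap-news.xml"},
    )

    # Iterate every page discovered anywhere in the sitemap tree.
    for page in tree.all_pages():
        print(page.url)
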
usp/web_client/requests_client.py CHANGED
@@ -92,18 +92,24 @@ class RequestsWebClient(AbstractWebClient):
     ]
 
     def __init__(
-        self, verify=True, wait: Optional[float] = None, random_wait: bool = False
+        self,
+        verify=True,
+        wait: Optional[float] = None,
+        random_wait: bool = False,
+        session: Optional[requests.Session] = None,
     ):
         """
         :param verify: whether certificates should be verified for HTTPS requests.
         :param wait: time to wait between requests, in seconds.
         :param random_wait: if true, wait time is multiplied by a random number between 0.5 and 1.5.
+        :param session: a custom session object to use, or None to create a new one.
         """
         self.__max_response_data_length = None
         self.__timeout = self.__HTTP_REQUEST_TIMEOUT
         self.__proxies = {}
         self.__verify = verify
         self.__waiter = RequestWaiter(wait, random_wait)
+        self.__session = session or requests.Session()
 
     def set_timeout(self, timeout: Union[int, Tuple[int, int], None]) -> None:
         """Set HTTP request timeout.
@@ -132,7 +138,7 @@ class RequestsWebClient(AbstractWebClient):
     def get(self, url: str) -> AbstractWebClientResponse:
         self.__waiter.wait()
         try:
-            response = requests.get(
+            response = self.__session.get(
                 url,
                 timeout=self.__timeout,
                 stream=True,
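
With the new ``session`` parameter, ``RequestsWebClient`` routes every ``get()`` call through a caller-supplied ``requests.Session`` instead of module-level ``requests.get``, so custom headers, cookies, retry adapters, and pooled connections are reused across all sitemap fetches. A sketch of how the pieces fit together; the User-Agent string is only an example value.

    import requests

    from usp.tree import sitemap_tree_for_homepage
    from usp.web_client.requests_client import RequestsWebClient

    # Preconfigure a session that all sitemap requests will share.
    session = requests.Session()
    session.headers.update({"User-Agent": "my-crawler/1.0"})  # example value

    # Pass the session to the web client, and the web client to the crawler.
    client = RequestsWebClient(wait=1.0, random_wait=True, session=session)
    tree = sitemap_tree_for_homepage("https://example.com/", web_client=client)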