ultimate-sitemap-parser 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of ultimate-sitemap-parser has been flagged as potentially problematic.

ultimate_sitemap_parser-1.1.1.dist-info/METADATA → ultimate_sitemap_parser-1.2.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ultimate-sitemap-parser
-Version: 1.1.1
+Version: 1.2.0
 Summary: A performant library for parsing and crawling sitemaps
 License: GPL-3.0-or-later
 Keywords: sitemap,crawler,indexing,xml,rss,atom,google news

ultimate_sitemap_parser-1.1.1.dist-info/RECORD → ultimate_sitemap_parser-1.2.0.dist-info/RECORD CHANGED
@@ -9,13 +9,13 @@ usp/helpers.py,sha256=S9d8fEhHzZqVCx3SkcWVTgW1JYKujH-tM86urjORNWA,8482
 usp/objects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 usp/objects/page.py,sha256=vz2QXC9Z3E65Cxf68tKfQkubIc_OB0m6pNYH146Qx_8,14253
 usp/objects/sitemap.py,sha256=yt5qe6fyKfmvJmV60mB8kc7yooGcpYhuIcNlmUqFGFA,11486
-usp/tree.py,sha256=AmK0TptwNAexwSBAjrziYvx9cueQDMt5w9_1m8d4edI,4055
+usp/tree.py,sha256=pwSTp1Zok4evzrNFavP-hh5i9xGGzObj_sKUqjk72UU,4237
 usp/web_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 usp/web_client/abstract_client.py,sha256=7MpIfqQpi1_yojEmuReT8iy9kFUWCD3i2LMpHmBOwV0,6291
-usp/web_client/requests_client.py,sha256=xxkBUHvakBN-Guw_DqGElZJVS42xgUwWHxM7jA_QEPI,5593
-ultimate_sitemap_parser-1.1.1.dist-info/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
-ultimate_sitemap_parser-1.1.1.dist-info/METADATA,sha256=9sK5LCSHHPuSvdDjakIqOm-Gv3_Lgm1tsZdDDFs8vSE,4447
-ultimate_sitemap_parser-1.1.1.dist-info/NOTICE,sha256=3ANZA5R9rYnCOnUoroGfFUOZ__ww_yG01NUAx0X6J7E,632
-ultimate_sitemap_parser-1.1.1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
-ultimate_sitemap_parser-1.1.1.dist-info/entry_points.txt,sha256=v60w5WzqYlPOucntZUy0ydzlYwuAPSwoQY0KdT5ragQ,36
-ultimate_sitemap_parser-1.1.1.dist-info/RECORD,,
+usp/web_client/requests_client.py,sha256=1nyXXBxiapDNN5jNpCAXRL5rgjptK4oKvaJhV5nhLsA,5816
+ultimate_sitemap_parser-1.2.0.dist-info/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
+ultimate_sitemap_parser-1.2.0.dist-info/METADATA,sha256=46wVZspA5eUgbXefu2Fu7xtE03TbFsgjEwLL5BT-mj0,4447
+ultimate_sitemap_parser-1.2.0.dist-info/NOTICE,sha256=3ANZA5R9rYnCOnUoroGfFUOZ__ww_yG01NUAx0X6J7E,632
+ultimate_sitemap_parser-1.2.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+ultimate_sitemap_parser-1.2.0.dist-info/entry_points.txt,sha256=v60w5WzqYlPOucntZUy0ydzlYwuAPSwoQY0KdT5ragQ,36
+ultimate_sitemap_parser-1.2.0.dist-info/RECORD,,
usp/tree.py CHANGED
@@ -40,6 +40,7 @@ def sitemap_tree_for_homepage(
     web_client: Optional[AbstractWebClient] = None,
     use_robots: bool = True,
     use_known_paths: bool = True,
+    extra_known_paths: Optional[set] = None,
 ) -> AbstractSitemap:
     """
     Using a homepage URL, fetch the tree of sitemaps and pages listed in them.
@@ -49,12 +50,15 @@ def sitemap_tree_for_homepage(
         If ``None``, a :class:`~.RequestsWebClient` will be used.
     :param use_robots: Whether to discover sitemaps through robots.txt.
     :param use_known_paths: Whether to discover sitemaps through common known paths.
+    :param extra_known_paths: Extra paths to check for sitemaps.
     :return: Root sitemap object of the fetched sitemap tree.
     """
 
     if not is_http_url(homepage_url):
         raise SitemapException(f"URL {homepage_url} is not a HTTP(s) URL.")
 
+    extra_known_paths = extra_known_paths or set()
+
     stripped_homepage_url = strip_url_to_homepage(url=homepage_url)
     if homepage_url != stripped_homepage_url:
         log.warning(
@@ -82,7 +86,7 @@ def sitemap_tree_for_homepage(
             sitemap_urls_found_in_robots_txt.add(sub_sitemap.url)
 
     if use_known_paths:
-        for unpublished_sitemap_path in _UNPUBLISHED_SITEMAP_PATHS:
+        for unpublished_sitemap_path in _UNPUBLISHED_SITEMAP_PATHS | extra_known_paths:
             unpublished_sitemap_url = homepage_url + unpublished_sitemap_path
 
             # Don't refetch URLs already found in robots.txt
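
The ``extra_known_paths`` argument added above lets callers supply site-specific sitemap paths that are probed alongside the library's built-in known paths when ``use_known_paths`` is enabled. A minimal usage sketch: the path ``sitemap-news.xml`` is only an illustrative value, and per the loop above each path is concatenated directly onto the homepage URL.

    from usp.tree import sitemap_tree_for_homepage

    # Probe a site-specific sitemap location in addition to the defaults.
    # "sitemap-news.xml" is a hypothetical example path.
    tree = sitemap_tree_for_homepage(
        "https://example.com/",
        extra_known_paths={"sitemap-news.xml"},
    )

    # Iterate every page discovered anywhere in the sitemap tree.
    for page in tree.all_pages():
        print(page.url)
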
usp/web_client/requests_client.py CHANGED
@@ -92,18 +92,24 @@ class RequestsWebClient(AbstractWebClient):
     ]
 
     def __init__(
-        self, verify=True, wait: Optional[float] = None, random_wait: bool = False
+        self,
+        verify=True,
+        wait: Optional[float] = None,
+        random_wait: bool = False,
+        session: Optional[requests.Session] = None,
     ):
         """
         :param verify: whether certificates should be verified for HTTPS requests.
         :param wait: time to wait between requests, in seconds.
         :param random_wait: if true, wait time is multiplied by a random number between 0.5 and 1.5.
+        :param session: a custom session object to use, or None to create a new one.
         """
         self.__max_response_data_length = None
         self.__timeout = self.__HTTP_REQUEST_TIMEOUT
         self.__proxies = {}
         self.__verify = verify
         self.__waiter = RequestWaiter(wait, random_wait)
+        self.__session = session or requests.Session()
 
     def set_timeout(self, timeout: Union[int, Tuple[int, int], None]) -> None:
         """Set HTTP request timeout.
@@ -132,7 +138,7 @@ class RequestsWebClient(AbstractWebClient):
     def get(self, url: str) -> AbstractWebClientResponse:
         self.__waiter.wait()
         try:
-            response = requests.get(
+            response = self.__session.get(
                 url,
                 timeout=self.__timeout,
                 stream=True,
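
With the new ``session`` parameter, ``RequestsWebClient`` routes every ``get()`` call through a caller-supplied ``requests.Session`` instead of module-level ``requests.get``, so custom headers, cookies, retry adapters, and pooled connections are reused across all sitemap fetches. A sketch of how the pieces fit together; the User-Agent string is only an example value.

    import requests

    from usp.tree import sitemap_tree_for_homepage
    from usp.web_client.requests_client import RequestsWebClient

    # Preconfigure a session that all sitemap requests will share.
    session = requests.Session()
    session.headers.update({"User-Agent": "my-crawler/1.0"})  # example value

    # Pass the session to the web client, and the web client to the crawler.
    client = RequestsWebClient(wait=1.0, random_wait=True, session=session)
    tree = sitemap_tree_for_homepage("https://example.com/", web_client=client)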