ultimate-sitemap-parser 1.4.0.tar.gz → 1.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (20)
  1. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/PKG-INFO +1 -1
  2. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/pyproject.toml +5 -2
  3. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/fetch_parse.py +102 -18
  4. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/helpers.py +13 -1
  5. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/objects/sitemap.py +4 -1
  6. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/tree.py +14 -1
  7. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/web_client/requests_client.py +2 -2
  8. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/LICENSE +0 -0
  9. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/NOTICE +0 -0
  10. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/README.rst +0 -0
  11. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/__init__.py +0 -0
  12. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/cli/__init__.py +0 -0
  13. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/cli/_ls.py +0 -0
  14. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/cli/_util.py +0 -0
  15. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/cli/cli.py +0 -0
  16. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/exceptions.py +0 -0
  17. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/objects/__init__.py +0 -0
  18. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/objects/page.py +0 -0
  19. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/web_client/__init__.py +0 -0
  20. {ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/web_client/abstract_client.py +0 -0
{ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ultimate-sitemap-parser
-Version: 1.4.0
+Version: 1.6.0
 Summary: A performant library for parsing and crawling sitemaps
 License: GPL-3.0-or-later
 Keywords: sitemap,crawler,indexing,xml,rss,atom,google news
{ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ultimate-sitemap-parser"
-version = "1.4.0"
+version = "1.6.0"
 description = "A performant library for parsing and crawling sitemaps"
 authors = [
     { name = "Linas Valiukas", email = "linas@media.mit.edu"},
@@ -103,4 +103,7 @@ select = [
 junit_suite_name = "ultimate-sitemap-parser"
 junit_duration_report = "call"
 log_cli = true
-log_cli_level = "DEBUG"
+log_cli_level = "DEBUG"
+filterwarnings = [
+    "error::pytest.PytestUnraisableExceptionWarning"
+]
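
The new filterwarnings entry promotes pytest's PytestUnraisableExceptionWarning to a test failure. CPython cannot propagate an exception raised inside __del__: it is reported through sys.unraisablehook, and pytest surfaces that report as this warning. Escalating it to an error makes the suite fail on destructor bugs such as the os.unlink() one fixed in usp/objects/sitemap.py below. A minimal sketch of the behaviour this setting now catches (a hypothetical test, not part of the package):

    import gc

    class Leaky:
        def __del__(self):
            raise RuntimeError("boom")  # cannot propagate; reported as unraisable

    def test_destructor_error_fails_the_run():
        Leaky()       # instance becomes garbage immediately
        gc.collect()  # __del__ runs; under this config pytest treats the
                      # resulting unraisable-exception warning as an error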
{ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/fetch_parse.py
@@ -17,6 +17,8 @@ from typing import Dict, Optional, Set
 
 from .exceptions import SitemapException, SitemapXMLParsingException
 from .helpers import (
+    RecurseCallbackType,
+    RecurseListCallbackType,
     get_url_retry_on_client_errors,
     html_unescape_strip,
     is_http_url,
@@ -77,6 +79,8 @@ class SitemapFetcher:
         "_web_client",
         "_parent_urls",
         "_quiet_404",
+        "_recurse_callback",
+        "_recurse_list_callback",
     ]
 
     def __init__(
@@ -86,6 +90,8 @@ class SitemapFetcher:
         web_client: Optional[AbstractWebClient] = None,
         parent_urls: Optional[Set[str]] = None,
         quiet_404: bool = False,
+        recurse_callback: Optional[RecurseCallbackType] = None,
+        recurse_list_callback: Optional[RecurseListCallbackType] = None,
     ):
         """
 
@@ -94,6 +100,8 @@ class SitemapFetcher:
         :param web_client: Web client to use. If ``None``, a :class:`~.RequestsWebClient` will be used.
         :param parent_urls: Set of parent URLs that led to this sitemap.
         :param quiet_404: Whether 404 errors are expected and should be logged at a reduced level, useful for speculative fetching of known URLs.
+        :param recurse_callback: Optional callback to filter out a sub-sitemap. See :data:`~.RecurseCallbackType`.
+        :param recurse_list_callback: Optional callback to filter the list of sub-sitemaps. See :data:`~.RecurseListCallbackType`.
 
         :raises SitemapException: If the maximum recursion depth is exceeded.
         :raises SitemapException: If the URL is in the parent URLs set.
@@ -128,6 +136,9 @@ class SitemapFetcher:
         self._parent_urls = parent_urls or set()
         self._quiet_404 = quiet_404
 
+        self._recurse_callback = recurse_callback
+        self._recurse_list_callback = recurse_list_callback
+
     def _fetch(self) -> AbstractWebClientResponse:
         log.info(f"Fetching level {self._recursion_level} sitemap from {self._url}...")
         response = get_url_retry_on_client_errors(
@@ -173,6 +184,8 @@ class SitemapFetcher:
                 recursion_level=self._recursion_level,
                 web_client=self._web_client,
                 parent_urls=self._parent_urls,
+                recurse_callback=self._recurse_callback,
+                recurse_list_callback=self._recurse_list_callback,
             )
 
         else:
@@ -184,6 +197,8 @@ class SitemapFetcher:
                 recursion_level=self._recursion_level,
                 web_client=self._web_client,
                 parent_urls=self._parent_urls,
+                recurse_callback=self._recurse_callback,
+                recurse_list_callback=self._recurse_list_callback,
             )
         else:
             parser = PlainTextSitemapParser(
@@ -234,6 +249,8 @@ class AbstractSitemapParser(metaclass=abc.ABCMeta):
         "_web_client",
         "_recursion_level",
         "_parent_urls",
+        "_recurse_callback",
+        "_recurse_list_callback",
     ]
 
     def __init__(
@@ -243,6 +260,8 @@ class AbstractSitemapParser(metaclass=abc.ABCMeta):
         recursion_level: int,
         web_client: AbstractWebClient,
         parent_urls: Set[str],
+        recurse_callback: Optional[RecurseCallbackType] = None,
+        recurse_list_callback: Optional[RecurseListCallbackType] = None,
     ):
         self._url = url
         self._content = content
@@ -250,6 +269,16 @@ class AbstractSitemapParser(metaclass=abc.ABCMeta):
         self._web_client = web_client
         self._parent_urls = parent_urls
 
+        if recurse_callback is None:  # Always allow child recursion
+            self._recurse_callback = lambda url, level, parent_urls: True
+        else:
+            self._recurse_callback = recurse_callback
+
+        if recurse_list_callback is None:  # Always allow child recursion
+            self._recurse_list_callback = lambda urls, level, parent_urls: urls
+        else:
+            self._recurse_list_callback = recurse_list_callback
+
     @abc.abstractmethod
     def sitemap(self) -> AbstractSitemap:
         """
@@ -270,6 +299,8 @@ class IndexRobotsTxtSitemapParser(AbstractSitemapParser):
         recursion_level: int,
         web_client: AbstractWebClient,
         parent_urls: Set[str],
+        recurse_callback: Optional[RecurseCallbackType] = None,
+        recurse_list_callback: Optional[RecurseListCallbackType] = None,
     ):
         super().__init__(
             url=url,
@@ -277,6 +308,8 @@ class IndexRobotsTxtSitemapParser(AbstractSitemapParser):
             recursion_level=recursion_level,
             web_client=web_client,
             parent_urls=parent_urls,
+            recurse_callback=recurse_callback,
+            recurse_list_callback=recurse_list_callback,
         )
 
         if not self._url.endswith("/robots.txt"):
@@ -304,16 +337,27 @@ class IndexRobotsTxtSitemapParser(AbstractSitemapParser):
         )
 
         sub_sitemaps = []
+        parent_urls = self._parent_urls | {self._url}
 
-        for sitemap_url in sitemap_urls.keys():
+        filtered_sitemap_urls = self._recurse_list_callback(
+            list(sitemap_urls.keys()), self._recursion_level, parent_urls
+        )
+        for sitemap_url in filtered_sitemap_urls:
             try:
-                fetcher = SitemapFetcher(
-                    url=sitemap_url,
-                    recursion_level=self._recursion_level + 1,
-                    web_client=self._web_client,
-                    parent_urls=self._parent_urls | {self._url},
-                )
-                fetched_sitemap = fetcher.sitemap()
+                if self._recurse_callback(
+                    sitemap_url, self._recursion_level, parent_urls
+                ):
+                    fetcher = SitemapFetcher(
+                        url=sitemap_url,
+                        recursion_level=self._recursion_level + 1,
+                        web_client=self._web_client,
+                        parent_urls=parent_urls,
+                        recurse_callback=self._recurse_callback,
+                        recurse_list_callback=self._recurse_list_callback,
+                    )
+                    fetched_sitemap = fetcher.sitemap()
+                else:
+                    continue
             except NoWebClientException:
                 fetched_sitemap = InvalidSitemap(
                     url=sitemap_url, reason="Un-fetched child sitemap"
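
The loop above fixes the order of operations for the two new hooks: recurse_list_callback is applied once to the full list of discovered sub-sitemap URLs, then recurse_callback vets each surviving URL just before a child SitemapFetcher is built, and both hooks are forwarded to the child so the same policy applies at every level of the tree. A sketch of callbacks matching these call signatures (the filtering rules themselves are made-up examples, not part of the package):

    from typing import List, Set

    def skip_video_sitemaps(url: str, level: int, parent_urls: Set[str]) -> bool:
        # Per-URL hook: returning False skips fetching this sub-sitemap.
        return "video" not in url

    def cap_fan_out(urls: List[str], level: int, parent_urls: Set[str]) -> List[str]:
        # Per-index hook: only the URLs returned here are considered at all.
        return urls[:10]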
{ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/fetch_parse.py (continued)
@@ -376,6 +420,8 @@ class XMLSitemapParser(AbstractSitemapParser):
         recursion_level: int,
         web_client: AbstractWebClient,
         parent_urls: Set[str],
+        recurse_callback: Optional[RecurseCallbackType] = None,
+        recurse_list_callback: Optional[RecurseListCallbackType] = None,
     ):
         super().__init__(
             url=url,
@@ -383,6 +429,8 @@ class XMLSitemapParser(AbstractSitemapParser):
             recursion_level=recursion_level,
             web_client=web_client,
             parent_urls=parent_urls,
+            recurse_callback=recurse_callback,
+            recurse_list_callback=recurse_list_callback,
         )
 
         # Will be initialized when the type of sitemap is known
@@ -491,6 +539,8 @@ class XMLSitemapParser(AbstractSitemapParser):
                 web_client=self._web_client,
                 recursion_level=self._recursion_level,
                 parent_urls=self._parent_urls,
+                recurse_callback=self._recurse_callback,
+                recurse_list_callback=self._recurse_list_callback,
             )
 
         elif name == "rss":
@@ -536,13 +586,30 @@ class AbstractXMLSitemapParser(metaclass=abc.ABCMeta):
         # Last encountered character data
         "_last_char_data",
         "_last_handler_call_was_xml_char_data",
+        "_recurse_callback",
+        "_recurse_list_callback",
     ]
 
-    def __init__(self, url: str):
+    def __init__(
+        self,
+        url: str,
+        recurse_callback: Optional[RecurseCallbackType] = None,
+        recurse_list_callback: Optional[RecurseListCallbackType] = None,
+    ):
         self._url = url
         self._last_char_data = ""
         self._last_handler_call_was_xml_char_data = False
 
+        if recurse_callback is None:  # Always allow child recursion
+            self._recurse_callback = lambda url, level, parent_urls: True
+        else:
+            self._recurse_callback = recurse_callback
+
+        if recurse_list_callback is None:  # Always allow child recursion
+            self._recurse_list_callback = lambda urls, level, parent_urls: urls
+        else:
+            self._recurse_list_callback = recurse_list_callback
+
     def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None:
         """Concrete parser handler when the start of an element is encountered.
 
@@ -613,8 +680,14 @@ class IndexXMLSitemapParser(AbstractXMLSitemapParser):
         web_client: AbstractWebClient,
         recursion_level: int,
         parent_urls: Set[str],
+        recurse_callback: Optional[RecurseCallbackType] = None,
+        recurse_list_callback: Optional[RecurseListCallbackType] = None,
     ):
-        super().__init__(url=url)
+        super().__init__(
+            url=url,
+            recurse_callback=recurse_callback,
+            recurse_list_callback=recurse_list_callback,
+        )
 
         self._web_client = web_client
         self._recursion_level = recursion_level
@@ -638,16 +711,27 @@ class IndexXMLSitemapParser(AbstractXMLSitemapParser):
     def sitemap(self) -> AbstractSitemap:
         sub_sitemaps = []
 
-        for sub_sitemap_url in self._sub_sitemap_urls:
+        parent_urls = self._parent_urls | {self._url}
+        filtered_sitemap_urls = self._recurse_list_callback(
+            list(self._sub_sitemap_urls), self._recursion_level, parent_urls
+        )
+        for sub_sitemap_url in filtered_sitemap_urls:
             # URL might be invalid, or recursion limit might have been reached
             try:
-                fetcher = SitemapFetcher(
-                    url=sub_sitemap_url,
-                    recursion_level=self._recursion_level + 1,
-                    web_client=self._web_client,
-                    parent_urls=self._parent_urls | {self._url},
-                )
-                fetched_sitemap = fetcher.sitemap()
+                if self._recurse_callback(
+                    sub_sitemap_url, self._recursion_level, parent_urls
+                ):
+                    fetcher = SitemapFetcher(
+                        url=sub_sitemap_url,
+                        recursion_level=self._recursion_level + 1,
+                        web_client=self._web_client,
+                        parent_urls=parent_urls,
+                        recurse_callback=self._recurse_callback,
+                        recurse_list_callback=self._recurse_list_callback,
+                    )
+                    fetched_sitemap = fetcher.sitemap()
+                else:
+                    continue
            except NoWebClientException:
                 fetched_sitemap = InvalidSitemap(
                     url=sub_sitemap_url, reason="Un-fetched child sitemap"
{ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/helpers.py
@@ -8,7 +8,7 @@ import re
 import sys
 import time
 from http import HTTPStatus
-from typing import Optional
+from typing import Callable, List, Optional, Set
 from urllib.parse import unquote_plus, urlparse, urlunparse
 
 from dateutil.parser import isoparse as dateutil_isoparse
@@ -29,6 +29,18 @@ __URL_REGEX = re.compile(r"^https?://[^\s/$.?#].[^\s]*$", re.IGNORECASE)
 
 HAS_DATETIME_NEW_ISOPARSER = sys.version_info >= (3, 11)
 
+# TODO: Convert to TypeAlias when Python3.9 support is dropped.
+RecurseCallbackType = Callable[[str, int, Set[str]], bool]
+"""Type for the callback function used to decide whether to recurse into a sitemap.
+
+A function that takes the sub-sitemap URL, the current recursion level, and the set of parent URLs as arguments, and returns a boolean indicating whether to recurse into the sub-sitemap.
+"""
+RecurseListCallbackType = Callable[[List[str], int, Set[str]], List[str]]
+"""Type for the callback function used to filter the list of sitemaps to recurse into.
+
+A function that takes the list of sub-sitemap URLs, the current recursion level, and the set of parent URLs as arguments, and returns a list of sub-sitemap URLs to recurse into.
+"""
+
 
 def is_http_url(url: str) -> bool:
     """
{ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/objects/sitemap.py
@@ -230,7 +230,10 @@ class AbstractPagesSitemap(AbstractSitemap, metaclass=abc.ABCMeta):
             pickle.dump(pages, tmp, protocol=pickle.HIGHEST_PROTOCOL)
 
     def __del__(self):
-        os.unlink(self.__pages_temp_file_path)
+        try:
+            os.unlink(self.__pages_temp_file_path)
+        except FileNotFoundError as e:
+            log.warning("Unable to remove temp file", exc_info=e)
 
     def __eq__(self, other) -> bool:
         if not isinstance(other, AbstractPagesSitemap):
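
This guard makes the destructor safe when the temp file backing the pickled page list has already disappeared, for example after an earlier cleanup; previously the FileNotFoundError raised inside __del__ surfaced as an unraisable exception during garbage collection, which is exactly the condition the new pytest filterwarnings entry turns into a test failure. A minimal sketch of the same defensive-destructor pattern (a standalone illustration, not the package's class):

    import logging
    import os
    import tempfile

    log = logging.getLogger(__name__)

    class DiskBacked:
        def __init__(self) -> None:
            fd, self._path = tempfile.mkstemp()
            os.close(fd)

        def __del__(self):
            try:
                os.unlink(self._path)
            except FileNotFoundError as e:
                # File may already be gone; never let __del__ raise.
                log.warning("Unable to remove temp file", exc_info=e)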
{ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/tree.py
@@ -5,7 +5,12 @@ from typing import Optional
 
 from .exceptions import SitemapException
 from .fetch_parse import SitemapFetcher, SitemapStrParser
-from .helpers import is_http_url, strip_url_to_homepage
+from .helpers import (
+    RecurseCallbackType,
+    RecurseListCallbackType,
+    is_http_url,
+    strip_url_to_homepage,
+)
 from .objects.sitemap import (
     AbstractSitemap,
     IndexRobotsTxtSitemap,
@@ -41,6 +46,8 @@ def sitemap_tree_for_homepage(
     use_robots: bool = True,
     use_known_paths: bool = True,
     extra_known_paths: Optional[set] = None,
+    recurse_callback: Optional[RecurseCallbackType] = None,
+    recurse_list_callback: Optional[RecurseListCallbackType] = None,
 ) -> AbstractSitemap:
     """
     Using a homepage URL, fetch the tree of sitemaps and pages listed in them.
@@ -51,6 +58,8 @@ def sitemap_tree_for_homepage(
     :param use_robots: Whether to discover sitemaps through robots.txt.
     :param use_known_paths: Whether to discover sitemaps through common known paths.
     :param extra_known_paths: Extra paths to check for sitemaps.
+    :param recurse_callback: Optional callback function to determine if a sub-sitemap should be recursed into. See :data:`~.RecurseCallbackType`.
+    :param recurse_list_callback: Optional callback function to filter the list of sub-sitemaps to recurse into. See :data:`~.RecurseListCallbackType`.
     :return: Root sitemap object of the fetched sitemap tree.
     """
 
@@ -79,6 +88,8 @@ def sitemap_tree_for_homepage(
         web_client=web_client,
         recursion_level=0,
         parent_urls=set(),
+        recurse_callback=recurse_callback,
+        recurse_list_callback=recurse_list_callback,
     )
     robots_txt_sitemap = robots_txt_fetcher.sitemap()
     if not isinstance(robots_txt_sitemap, InvalidSitemap):
@@ -100,6 +111,8 @@ def sitemap_tree_for_homepage(
         recursion_level=0,
         parent_urls=sitemap_urls_found_in_robots_txt,
         quiet_404=True,
+        recurse_callback=recurse_callback,
+        recurse_list_callback=recurse_list_callback,
    )
     unpublished_sitemap = unpublished_sitemap_fetcher.sitemap()
 
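With these changes, both discovery phases (the robots.txt fetch and the speculative known-path fetches) pass the same callbacks down, so one policy governs the whole crawl. A usage sketch (the URL and the filtering rules are illustrative):

    from usp.tree import sitemap_tree_for_homepage

    tree = sitemap_tree_for_homepage(
        "https://www.example.com/",
        # Skip any sub-sitemap whose URL mentions "news"...
        recurse_callback=lambda url, level, parents: "news" not in url,
        # ...and never follow more than 50 children of a single index.
        recurse_list_callback=lambda urls, level, parents: urls[:50],
    )
    for page in tree.all_pages():
        print(page.url)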
{ultimate_sitemap_parser-1.4.0 → ultimate_sitemap_parser-1.6.0}/usp/web_client/requests_client.py
@@ -79,7 +79,7 @@ class RequestsWebClient(AbstractWebClient):
 
     __USER_AGENT = f"ultimate_sitemap_parser/{__version__}"
 
-    __HTTP_REQUEST_TIMEOUT = 60
+    __HTTP_REQUEST_TIMEOUT = (9.05, 60)
     """
     HTTP request timeout.
 
@@ -114,7 +114,7 @@ class RequestsWebClient(AbstractWebClient):
         self.__waiter = RequestWaiter(wait, random_wait)
         self.__session = session or requests.Session()
 
-    def set_timeout(self, timeout: Union[int, Tuple[int, int], None]) -> None:
+    def set_timeout(self, timeout: Optional[Union[float, Tuple[float, float]]]) -> None:
         """Set HTTP request timeout.
 
         See also: `Requests timeout docs <https://requests.readthedocs.io/en/latest/user/advanced/#timeouts>`__
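
In requests, a tuple timeout means (connect timeout, read timeout), so the new default allows 9.05 seconds to establish a connection and 60 seconds between bytes of the response; the .05 follows the requests documentation's advice to set connect timeouts slightly larger than a multiple of 3 seconds, the default TCP packet retransmission window. The widened set_timeout() signature accepts the same forms requests itself does; a sketch (the values are arbitrary examples):

    from usp.web_client.requests_client import RequestsWebClient

    client = RequestsWebClient()
    client.set_timeout((3.05, 27.0))  # separate connect/read budgets
    client.set_timeout(10.5)          # one float covers both phases
    client.set_timeout(None)          # disable the timeout entirely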