ultimate-sitemap-parser 1.2.0-py3-none-any.whl → 1.3.1-py3-none-any.whl

This diff shows the content of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of ultimate-sitemap-parser might be problematic (see the registry listing for details).

ultimate_sitemap_parser-1.3.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ultimate-sitemap-parser
-Version: 1.2.0
+Version: 1.3.1
 Summary: A performant library for parsing and crawling sitemaps
 License: GPL-3.0-or-later
 Keywords: sitemap,crawler,indexing,xml,rss,atom,google news
@@ -8,7 +8,7 @@ Author: Linas Valiukas
 Author-email: linas@media.mit.edu
 Maintainer: Freddy Heppell
 Maintainer-email: f.heppell@sheffield.ac.uk
-Requires-Python: >=3.8
+Requires-Python: >=3.9
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Information Technology
@@ -16,7 +16,6 @@ Classifier: License :: OSI Approved :: GNU General Public License v3 or later (G
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
ultimate_sitemap_parser-1.3.1.dist-info/RECORD ADDED
@@ -0,0 +1,21 @@
+usp/__init__.py,sha256=_jshbOBBUHRZ5ko4SdI7GRFiF9xKGJVCEPgL9lZJ81o,124
+usp/cli/__init__.py,sha256=mGrjSftUYfM2SGp9yEN2dTJndl5thOdv77-EAe6ocWo,37
+usp/cli/_ls.py,sha256=V0pMsDiQK_9RZ5MyUS2toW8b6e2FJ4spb3Grw6PayAI,3419
+usp/cli/_util.py,sha256=OrT9en350tATnaUrUn0peXr7aFPyYaaHGbEXGY6O4wI,2015
+usp/cli/cli.py,sha256=2byuqhBUhb7c1qUpBfTTufG-jvtiEWWq97GvCgv-s44,777
+usp/exceptions.py,sha256=9KTgnocYYZCfyaCf9BrBN7Ok4cwn7_DlrNFbhUfFsGM,634
+usp/fetch_parse.py,sha256=69U1uAKawUym41N4nwJXLW9tQ0WXO4Pi63hnljYCXPM,43524
+usp/helpers.py,sha256=FeIZcEuEM3Uz8tHeNucgoB3_27Ax6qCatfalPIHHGUY,8862
+usp/objects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+usp/objects/page.py,sha256=vz2QXC9Z3E65Cxf68tKfQkubIc_OB0m6pNYH146Qx_8,14253
+usp/objects/sitemap.py,sha256=_t0ej2UmNsIb0NkxYkwYGxBqX_LHEJfNc-cRulQXyIk,11495
+usp/tree.py,sha256=MdnVxfIIMqWrudsYxFI8yQTXnlmNLFEcQEOkXbnuBr4,4395
+usp/web_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+usp/web_client/abstract_client.py,sha256=EWY4lPYJqpV7ge0DTZESTAOofAjNMIJnDm_2PPeZ9z4,7007
+usp/web_client/requests_client.py,sha256=sFYtJ8Q5z27WlTG1PgBzcvbS75pJ0pYUastEFmxa95U,5888
+ultimate_sitemap_parser-1.3.1.dist-info/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
+ultimate_sitemap_parser-1.3.1.dist-info/METADATA,sha256=GUU8qLo24ZGBtAd4CYaHxY927eFFGvKlVPhc6jfg5so,4397
+ultimate_sitemap_parser-1.3.1.dist-info/NOTICE,sha256=3ANZA5R9rYnCOnUoroGfFUOZ__ww_yG01NUAx0X6J7E,632
+ultimate_sitemap_parser-1.3.1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+ultimate_sitemap_parser-1.3.1.dist-info/entry_points.txt,sha256=v60w5WzqYlPOucntZUy0ydzlYwuAPSwoQY0KdT5ragQ,36
+ultimate_sitemap_parser-1.3.1.dist-info/RECORD,,
usp/cli/_ls.py CHANGED
@@ -2,7 +2,7 @@ import argparse
 import sys
 from typing import Iterator
 
-from usp.cli._util import format_help, tabs
+from usp.cli._util import CountAction, format_help, setup_logging, tabs
 from usp.objects.sitemap import AbstractSitemap
 from usp.tree import sitemap_tree_for_homepage
 
@@ -26,7 +26,7 @@ def register(subparsers):
         choices=LS_FORMATS,
         default="tabtree",
         help=format_help(LS_FORMATS, "set output format"),
-        metavar="",
+        metavar="FORMAT",
     )
     ls_parser.add_argument(
         "-r",
@@ -46,6 +46,21 @@ def register(subparsers):
         action="store_true",
         help="strip the supplied URL from each page and sitemap URL",
     )
+    ls_parser.add_argument(
+        "-v",
+        "--verbose",
+        action=CountAction,
+        help="increase output verbosity (-v=INFO, -vv=DEBUG)",
+        dest="verbosity",
+        default=0,
+        max_count=2,
+    )
+    ls_parser.add_argument(
+        "-l",
+        "--log-file",
+        type=str,
+        help="write log to this file and suppress console output",
+    )
     ls_parser.set_defaults(no_robots=False, no_known=False, strip_url=False)
 
     ls_parser.set_defaults(func=ls)
@@ -85,6 +100,8 @@ def _output_pages(sitemap: AbstractSitemap, strip_prefix: str = ""):
 
 
 def ls(args):
+    setup_logging(args.verbosity, args.log_file)
+
     tree = sitemap_tree_for_homepage(
         args.url,
         use_robots=not args.no_robots,
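
The new -v/--verbose and -l/--log-file options feed straight into setup_logging() (added to usp/cli/_util.py below) at the top of the ls() handler, so logging is configured before the crawl starts. A minimal sketch of exercising them through the refactored entry point in usp/cli/cli.py; the "ls" subcommand name, the log file name and the URL are illustrative assumptions:

from usp.cli.cli import main

# Two -v flags are counted by CountAction and map to DEBUG; --log-file routes records to a file.
main(["ls", "-vv", "--log-file", "crawl.log", "https://example.com/"])
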
usp/cli/_util.py CHANGED
@@ -1,4 +1,6 @@
-from typing import Dict
+import logging
+from argparse import Action
+from typing import Dict, Optional
 
 
 def format_help(choices: Dict[str, str], opt_help: str) -> str:
@@ -19,3 +21,57 @@ def format_help(choices: Dict[str, str], opt_help: str) -> str:
 def tabs(n: int):
     """Generate n tabs."""
     return "\t" * n
+
+
+_log_levels = {
+    0: logging.WARNING,
+    1: logging.INFO,
+    2: logging.DEBUG,
+}
+
+
+class CountAction(Action):
+    """Modified version of argparse._CountAction to output better help."""
+
+    def __init__(
+        self,
+        option_strings,
+        dest,
+        default=None,
+        required=False,
+        help=None,
+        max_count=None,
+    ):
+        super().__init__(
+            option_strings=option_strings,
+            dest=dest,
+            nargs=0,
+            default=default,
+            required=required,
+            help=help,
+        )
+        self.max_count = max_count
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        count = getattr(namespace, self.dest, None)
+        if count is None:
+            count = 0
+        if self.max_count:
+            count = min(count, self.max_count)
+        setattr(namespace, self.dest, count + 1)
+
+    def format_usage(self):
+        option_str = self.option_strings[0]
+        if self.max_count is None:
+            return option_str
+        letter = self.option_strings[0][1]
+        usages = [f"-{letter * i}" for i in range(1, self.max_count + 1)]
+        return "/".join(usages)
+
+
+def setup_logging(verbosity: int, log_path: Optional[str]) -> None:
+    log_level = _log_levels.get(verbosity, logging.DEBUG)
+    if log_path is not None:
+        logging.basicConfig(level=log_level, filename=log_path)
+    else:
+        logging.basicConfig(level=log_level)
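
setup_logging() maps the counted verbosity onto stdlib logging levels (0 -> WARNING, 1 -> INFO, 2 or more -> DEBUG) and, when a path is given, sends records to that file via logging.basicConfig(). A minimal sketch of calling it directly; the file name is illustrative:

import logging

from usp.cli._util import setup_logging

setup_logging(2, None)           # console output at DEBUG
# setup_logging(1, "crawl.log")  # INFO-level records written to crawl.log instead
logging.getLogger("usp").debug("visible at verbosity 2")
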
usp/cli/cli.py CHANGED
@@ -1,10 +1,11 @@
 from argparse import ArgumentParser
+from typing import Optional
 
 from usp import __version__
 from usp.cli import _ls as ls_cmd
 
 
-def main():
+def parse_args(arg_list: Optional[list[str]]):
     parser = ArgumentParser(prog="usp", description="Ultimate Sitemap Parser")
     parser.add_argument(
         "-v", "--version", action="version", version=f"%(prog)s v{__version__}"
@@ -13,7 +14,12 @@ def main():
     subparsers = parser.add_subparsers(required=False, title="commands", metavar="")
     ls_cmd.register(subparsers)
 
-    args = parser.parse_args()
+    args = parser.parse_args(arg_list)
+    return args, parser
+
+
+def main(arg_list: Optional[list[str]] = None):
+    args, parser = parse_args(arg_list)
 
     if "func" in args:
         args.func(args)
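
Splitting parse_args() out of main(), and letting main() accept an explicit argument list, makes the CLI parseable and testable without touching sys.argv. A minimal sketch of a test along these lines; the "ls" subcommand name and the URL are assumptions for illustration:

from usp.cli.cli import parse_args

args, parser = parse_args(["ls", "-vv", "https://example.com/"])
assert args.verbosity == 2  # two -v occurrences counted by CountAction
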
usp/fetch_parse.py CHANGED
@@ -13,7 +13,7 @@ import re
 import xml.parsers.expat
 from collections import OrderedDict
 from decimal import Decimal, InvalidOperation
-from typing import Dict, Optional, Union
+from typing import Dict, Optional, Set
 
 from .exceptions import SitemapException, SitemapXMLParsingException
 from .helpers import (
@@ -43,8 +43,10 @@ from .objects.sitemap import (
 )
 from .web_client.abstract_client import (
     AbstractWebClient,
+    AbstractWebClientResponse,
     AbstractWebClientSuccessResponse,
     LocalWebClient,
+    LocalWebClientSuccessResponse,
     NoWebClientException,
     WebClientErrorResponse,
 )
@@ -64,12 +66,17 @@ class SitemapFetcher:
     Spec says it might be up to 50 MB but let's go for the full 100 MB here."""
 
     __MAX_RECURSION_LEVEL = 11
-    """Max. recursion level in iterating over sub-sitemaps."""
+    """Max. depth level in iterating over sub-sitemaps.
+
+    Recursive sitemaps (i.e. child sitemaps pointing to their parent) are stopped immediately.
+    """
 
     __slots__ = [
         "_url",
         "_recursion_level",
         "_web_client",
+        "_parent_urls",
+        "_quiet_404",
     ]
 
     def __init__(
@@ -77,14 +84,19 @@
         url: str,
         recursion_level: int,
         web_client: Optional[AbstractWebClient] = None,
+        parent_urls: Optional[Set[str]] = None,
+        quiet_404: bool = False,
     ):
         """
 
         :param url: URL of the sitemap to fetch and parse.
         :param recursion_level: current recursion level of parser
         :param web_client: Web client to use. If ``None``, a :class:`~.RequestsWebClient` will be used.
+        :param parent_urls: Set of parent URLs that led to this sitemap.
+        :param quiet_404: Whether 404 errors are expected and should be logged at a reduced level, useful for speculative fetching of known URLs.
 
         :raises SitemapException: If the maximum recursion depth is exceeded.
+        :raises SitemapException: If the URL is in the parent URLs set.
         :raises SitemapException: If the URL is not an HTTP(S) URL
         """
         if recursion_level > self.__MAX_RECURSION_LEVEL:
@@ -92,9 +104,19 @@
                 f"Recursion level exceeded {self.__MAX_RECURSION_LEVEL} for URL {url}."
             )
 
+        log.debug(f"Parent URLs is {parent_urls}")
+
         if not is_http_url(url):
             raise SitemapException(f"URL {url} is not a HTTP(s) URL.")
 
+        parent_urls = parent_urls or set()
+
+        if url in parent_urls:
+            # Likely a sitemap index points to itself/a higher level index
+            raise SitemapException(
+                f"Recursion detected in URL {url} with parent URLs {parent_urls}."
+            )
+
         if not web_client:
             web_client = RequestsWebClient()
@@ -103,19 +125,15 @@
         self._url = url
         self._web_client = web_client
         self._recursion_level = recursion_level
+        self._parent_urls = parent_urls or set()
+        self._quiet_404 = quiet_404
 
-    def _fetch(self) -> Union[str, WebClientErrorResponse]:
+    def _fetch(self) -> AbstractWebClientResponse:
         log.info(f"Fetching level {self._recursion_level} sitemap from {self._url}...")
         response = get_url_retry_on_client_errors(
-            url=self._url, web_client=self._web_client
+            url=self._url, web_client=self._web_client, quiet_404=self._quiet_404
         )
-
-        if isinstance(response, WebClientErrorResponse):
-            return response
-
-        assert isinstance(response, AbstractWebClientSuccessResponse)
-
-        return ungzipped_response_content(url=self._url, response=response)
+        return response
 
     def sitemap(self) -> AbstractSitemap:
         """
@@ -124,13 +142,27 @@
         :return: the parsed sitemap. Will be a child of :class:`~.AbstractSitemap`.
         If an HTTP error is encountered, or the sitemap cannot be parsed, will be :class:`~.InvalidSitemap`.
         """
-        response_content = self._fetch()
+        response = self._fetch()
 
-        if isinstance(response_content, WebClientErrorResponse):
+        if isinstance(response, WebClientErrorResponse):
             return InvalidSitemap(
                 url=self._url,
-                reason=f"Unable to fetch sitemap from {self._url}: {response_content.message()}",
+                reason=f"Unable to fetch sitemap from {self._url}: {response.message()}",
             )
+        assert isinstance(response, AbstractWebClientSuccessResponse)
+
+        response_url = response.url()
+        log.debug(f"Response URL is {response_url}")
+        if response_url in self._parent_urls:
+            # Likely a sitemap has redirected to a parent URL
+            return InvalidSitemap(
+                url=self._url,
+                reason=f"Recursion detected when {self._url} redirected to {response_url} with parent URLs {self._parent_urls}.",
+            )
+
+        self._url = response_url
+
+        response_content = ungzipped_response_content(url=self._url, response=response)
 
         # MIME types returned in Content-Type are unpredictable, so peek into the content instead
         if response_content[:20].strip().startswith("<"):
@@ -140,6 +172,7 @@
                 content=response_content,
                 recursion_level=self._recursion_level,
                 web_client=self._web_client,
+                parent_urls=self._parent_urls,
             )
 
         else:
@@ -150,6 +183,7 @@
                     content=response_content,
                     recursion_level=self._recursion_level,
                     web_client=self._web_client,
+                    parent_urls=self._parent_urls,
                 )
             else:
                 parser = PlainTextSitemapParser(
@@ -157,6 +191,7 @@
                     content=response_content,
                     recursion_level=self._recursion_level,
                     web_client=self._web_client,
+                    parent_urls=self._parent_urls,
                 )
 
         log.info(f"Parsing sitemap from URL {self._url}...")
@@ -186,8 +221,8 @@ class SitemapStrParser(SitemapFetcher):
         )
         self._static_content = static_content
 
-    def _fetch(self) -> Union[str, WebClientErrorResponse]:
-        return self._static_content
+    def _fetch(self) -> AbstractWebClientResponse:
+        return LocalWebClientSuccessResponse(url=self._url, data=self._static_content)
 
 
 class AbstractSitemapParser(metaclass=abc.ABCMeta):
@@ -198,6 +233,7 @@ class AbstractSitemapParser(metaclass=abc.ABCMeta):
         "_content",
         "_web_client",
         "_recursion_level",
+        "_parent_urls",
     ]
 
     def __init__(
@@ -206,11 +242,13 @@ class AbstractSitemapParser(metaclass=abc.ABCMeta):
         content: str,
         recursion_level: int,
         web_client: AbstractWebClient,
+        parent_urls: Set[str],
     ):
         self._url = url
         self._content = content
         self._recursion_level = recursion_level
         self._web_client = web_client
+        self._parent_urls = parent_urls
 
     @abc.abstractmethod
     def sitemap(self) -> AbstractSitemap:
@@ -231,12 +269,14 @@ class IndexRobotsTxtSitemapParser(AbstractSitemapParser):
         content: str,
         recursion_level: int,
         web_client: AbstractWebClient,
+        parent_urls: Set[str],
     ):
         super().__init__(
             url=url,
             content=content,
             recursion_level=recursion_level,
             web_client=web_client,
+            parent_urls=parent_urls,
         )
 
         if not self._url.endswith("/robots.txt"):
@@ -271,6 +311,7 @@
                     url=sitemap_url,
                     recursion_level=self._recursion_level + 1,
                     web_client=self._web_client,
+                    parent_urls=self._parent_urls | {self._url},
                 )
                 fetched_sitemap = fetcher.sitemap()
             except NoWebClientException:
@@ -333,12 +374,14 @@ class XMLSitemapParser(AbstractSitemapParser):
         content: str,
         recursion_level: int,
         web_client: AbstractWebClient,
+        parent_urls: Set[str],
    ):
         super().__init__(
             url=url,
             content=content,
             recursion_level=recursion_level,
             web_client=web_client,
+            parent_urls=parent_urls,
         )
 
         # Will be initialized when the type of sitemap is known
@@ -432,6 +475,7 @@
                 url=self._url,
                 web_client=self._web_client,
                 recursion_level=self._recursion_level,
+                parent_urls=self._parent_urls,
             )
 
         elif name == "rss":
@@ -545,14 +589,22 @@ class IndexXMLSitemapParser(AbstractXMLSitemapParser):
         "_recursion_level",
         # List of sub-sitemap URLs found in this index sitemap
         "_sub_sitemap_urls",
+        "_parent_urls",
     ]
 
-    def __init__(self, url: str, web_client: AbstractWebClient, recursion_level: int):
+    def __init__(
+        self,
+        url: str,
+        web_client: AbstractWebClient,
+        recursion_level: int,
+        parent_urls: Set[str],
+    ):
         super().__init__(url=url)
 
         self._web_client = web_client
         self._recursion_level = recursion_level
         self._sub_sitemap_urls = []
+        self._parent_urls = parent_urls
 
     def xml_element_end(self, name: str) -> None:
         if name == "sitemap:loc":
@@ -578,6 +630,7 @@
                     url=sub_sitemap_url,
                     recursion_level=self._recursion_level + 1,
                     web_client=self._web_client,
+                    parent_urls=self._parent_urls | {self._url},
                 )
                 fetched_sitemap = fetcher.sitemap()
             except NoWebClientException:
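
The common thread in these hunks is loop protection: every SitemapFetcher now carries the set of URLs that led to it (parent_urls), refuses to fetch a URL already in that set, and repeats the check against the response's final URL after redirects. quiet_404 is threaded through so speculative fetches of well-known sitemap paths do not warn on expected 404s. A minimal sketch of a top-level fetch under these assumptions (URL illustrative; web_client defaults to RequestsWebClient):

from usp.fetch_parse import SitemapFetcher

fetcher = SitemapFetcher(
    url="https://example.com/sitemap_index.xml",
    recursion_level=0,
    parent_urls=set(),  # the root fetch has no ancestors
)
tree = fetcher.sitemap()  # sub-fetchers receive parent_urls | {this URL}
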
usp/helpers.py CHANGED
@@ -7,6 +7,7 @@ import logging
 import re
 import sys
 import time
+from http import HTTPStatus
 from typing import Optional
 from urllib.parse import unquote_plus, urlparse, urlunparse
 
@@ -130,11 +131,15 @@ def parse_rfc2822_date(date_string: str) -> Optional[datetime.datetime]:
     return None
 
 
+_404_log_message = f"{HTTPStatus.NOT_FOUND} {HTTPStatus.NOT_FOUND.phrase}"
+
+
 def get_url_retry_on_client_errors(
     url: str,
     web_client: AbstractWebClient,
     retry_count: int = 5,
     sleep_between_retries: int = 1,
+    quiet_404: bool = False,
 ) -> AbstractWebClientResponse:
     """
     Fetch URL, retry on retryable errors.
@@ -143,6 +148,8 @@ def get_url_retry_on_client_errors(
     :param web_client: Web client object to use for fetching.
     :param retry_count: How many times to retry fetching the same URL.
     :param sleep_between_retries: How long to sleep between retries, in seconds.
+    :param quiet_404: Whether to log 404 errors at a lower level.
+
     :return: Web client response object.
     """
     assert retry_count > 0, "Retry count must be positive."
@@ -153,7 +160,11 @@
         response = web_client.get(url)
 
         if isinstance(response, WebClientErrorResponse):
-            log.warning(f"Request for URL {url} failed: {response.message()}")
+            if quiet_404 and response.message() == _404_log_message:
+                log_level = logging.INFO
+            else:
+                log_level = logging.WARNING
+            log.log(log_level, f"Request for URL {url} failed: {response.message()}")
 
             if response.retryable():
                 log.info(f"Retrying URL {url} in {sleep_between_retries} seconds...")
usp/objects/sitemap.py CHANGED
@@ -9,17 +9,19 @@
 """
 
 import abc
+import logging
 import os
 import pickle
 import tempfile
-from functools import lru_cache
+from functools import cache
 from typing import Iterator, List, Tuple
 
 from .page import SitemapPage
 
+log = logging.getLogger(__name__)
 
-# TODO: change to functools.cache when dropping py3.8
-@lru_cache(maxsize=None)
+
+@cache
 def _all_slots(target_cls):
     mro = target_cls.__mro__
 
@@ -153,6 +155,7 @@ class InvalidSitemap(AbstractSitemap):
         """
         super().__init__(url=url)
         self.__reason = reason
+        log.info(f"Invalid sitemap: {url}, reason: {reason}")
 
     def __eq__(self, other) -> bool:
         if not isinstance(other, InvalidSitemap):
@@ -222,8 +225,8 @@ class AbstractPagesSitemap(AbstractSitemap, metaclass=abc.ABCMeta):
         self._dump_pages(pages)
 
     def _dump_pages(self, pages: List[SitemapPage]):
-        temp_file, self.__pages_temp_file_path = tempfile.mkstemp()
-        with open(self.__pages_temp_file_path, "wb") as tmp:
+        fd, self.__pages_temp_file_path = tempfile.mkstemp()
+        with os.fdopen(fd, "wb") as tmp:
             pickle.dump(pages, tmp, protocol=pickle.HIGHEST_PROTOCOL)
 
     def __del__(self):
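
Besides switching to functools.cache (possible now that Python 3.8 support is dropped) and logging whenever an InvalidSitemap is constructed, the _dump_pages() change closes a file-descriptor leak: tempfile.mkstemp() returns an already-open OS-level descriptor, and wrapping it with os.fdopen() reuses that handle instead of opening the path a second time. The same pattern in isolation:

import os
import pickle
import tempfile

fd, path = tempfile.mkstemp()
with os.fdopen(fd, "wb") as tmp:  # closes the descriptor returned by mkstemp()
    pickle.dump(["example"], tmp, protocol=pickle.HIGHEST_PROTOCOL)
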
usp/tree.py CHANGED
@@ -75,7 +75,10 @@ def sitemap_tree_for_homepage(
     sitemap_urls_found_in_robots_txt = set()
     if use_robots:
         robots_txt_fetcher = SitemapFetcher(
-            url=robots_txt_url, web_client=web_client, recursion_level=0
+            url=robots_txt_url,
+            web_client=web_client,
+            recursion_level=0,
+            parent_urls=set(),
         )
         robots_txt_sitemap = robots_txt_fetcher.sitemap()
         if not isinstance(robots_txt_sitemap, InvalidSitemap):
@@ -95,6 +98,8 @@
             url=unpublished_sitemap_url,
             web_client=web_client,
             recursion_level=0,
+            parent_urls=sitemap_urls_found_in_robots_txt,
+            quiet_404=True,
         )
         unpublished_sitemap = unpublished_sitemap_fetcher.sitemap()
 
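
Nothing changes at the public API surface: sitemap_tree_for_homepage() is called as before, and the recursion guard plus the quieter 404 handling apply internally when the robots.txt sitemaps and the known sitemap locations are fetched. Minimal usage sketch (URL illustrative):

from usp.tree import sitemap_tree_for_homepage

tree = sitemap_tree_for_homepage("https://example.com/")
for page in tree.all_pages():
    print(page.url)
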
usp/web_client/abstract_client.py CHANGED
@@ -101,6 +101,15 @@ class AbstractWebClientSuccessResponse(
         """
         raise NotImplementedError("Abstract method.")
 
+    @abc.abstractmethod
+    def url(self) -> str:
+        """
+        Return the actual URL fetched, after any redirects.
+
+        :return: URL fetched.
+        """
+        raise NotImplementedError("Abstract method.")
+
 
 class WebClientErrorResponse(AbstractWebClientResponse, metaclass=abc.ABCMeta):
     """
@@ -191,6 +200,27 @@ class LocalWebClient(AbstractWebClient):
         raise NoWebClientException
 
 
+class LocalWebClientSuccessResponse(AbstractWebClientSuccessResponse):
+    def __init__(self, url: str, data: str):
+        self._url = url
+        self._data = data
+
+    def status_code(self) -> int:
+        return 200
+
+    def status_message(self) -> str:
+        return "OK"
+
+    def header(self, case_insensitive_name: str) -> Optional[str]:
+        return None
+
+    def raw_data(self) -> bytes:
+        return self._data.encode("utf-8")
+
+    def url(self) -> str:
+        return self._url
+
+
 class RequestWaiter:
     """
     Manages waiting between requests.
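
Because url() is now declared abstract on AbstractWebClientSuccessResponse, any custom web client written against this interface must report the final (post-redirect) URL. A hypothetical minimal subclass, mirroring the methods LocalWebClientSuccessResponse implements above:

from typing import Optional

from usp.web_client.abstract_client import AbstractWebClientSuccessResponse


class InMemorySuccessResponse(AbstractWebClientSuccessResponse):
    """Illustrative success response backed by an in-memory byte string."""

    def __init__(self, url: str, body: bytes):
        self._url = url
        self._body = body

    def status_code(self) -> int:
        return 200

    def status_message(self) -> str:
        return "OK"

    def header(self, case_insensitive_name: str) -> Optional[str]:
        return None

    def raw_data(self) -> bytes:
        return self._body

    def url(self) -> str:  # new requirement in 1.3.x
        return self._url
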
usp/web_client/requests_client.py CHANGED
@@ -62,6 +62,9 @@ class RequestsWebClientSuccessResponse(AbstractWebClientSuccessResponse):
 
         return data
 
+    def url(self) -> str:
+        return self.__requests_response.url
+
 
 class RequestsWebClientErrorResponse(WebClientErrorResponse):
     """
@@ -162,7 +165,7 @@ class RequestsWebClient(AbstractWebClient):
             )
         else:
             message = f"{response.status_code} {response.reason}"
-            log.info(f"Response content: {response.text}")
+            log.debug(f"Response content: {response.text}")
 
             if response.status_code in RETRYABLE_HTTP_STATUS_CODES:
                 return RequestsWebClientErrorResponse(
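
The requests-backed success response now exposes the URL that was actually fetched, which is what SitemapFetcher.sitemap() compares against its parent URLs after redirects. Sketch (URL illustrative):

from usp.web_client.abstract_client import AbstractWebClientSuccessResponse
from usp.web_client.requests_client import RequestsWebClient

client = RequestsWebClient()
response = client.get("https://example.com/sitemap.xml")
if isinstance(response, AbstractWebClientSuccessResponse):
    print(response.url())  # final URL after any redirects
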
ultimate_sitemap_parser-1.2.0.dist-info/RECORD DELETED
@@ -1,21 +0,0 @@
-usp/__init__.py,sha256=_jshbOBBUHRZ5ko4SdI7GRFiF9xKGJVCEPgL9lZJ81o,124
-usp/cli/__init__.py,sha256=mGrjSftUYfM2SGp9yEN2dTJndl5thOdv77-EAe6ocWo,37
-usp/cli/_ls.py,sha256=BjF5bGuhe_E_Ak-yyY0cDM83LFstl5tA3XNIrGZJujs,2954
-usp/cli/_util.py,sha256=UL5WiRZlpiDOI_QvSU1PdjcS6iCmfcLQlO1Mm1wjSAw,505
-usp/cli/cli.py,sha256=ySNyYHoCQ440KfxmpTkzLXgqtbnt5ru-TgPs2Zw2-LI,592
-usp/exceptions.py,sha256=9KTgnocYYZCfyaCf9BrBN7Ok4cwn7_DlrNFbhUfFsGM,634
-usp/fetch_parse.py,sha256=VJrJSAG1X8oQyW2p9wSepuGWfHlMDNoJG8jn3an2XUY,41396
-usp/helpers.py,sha256=S9d8fEhHzZqVCx3SkcWVTgW1JYKujH-tM86urjORNWA,8482
-usp/objects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-usp/objects/page.py,sha256=vz2QXC9Z3E65Cxf68tKfQkubIc_OB0m6pNYH146Qx_8,14253
-usp/objects/sitemap.py,sha256=yt5qe6fyKfmvJmV60mB8kc7yooGcpYhuIcNlmUqFGFA,11486
-usp/tree.py,sha256=pwSTp1Zok4evzrNFavP-hh5i9xGGzObj_sKUqjk72UU,4237
-usp/web_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-usp/web_client/abstract_client.py,sha256=7MpIfqQpi1_yojEmuReT8iy9kFUWCD3i2LMpHmBOwV0,6291
-usp/web_client/requests_client.py,sha256=1nyXXBxiapDNN5jNpCAXRL5rgjptK4oKvaJhV5nhLsA,5816
-ultimate_sitemap_parser-1.2.0.dist-info/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
-ultimate_sitemap_parser-1.2.0.dist-info/METADATA,sha256=46wVZspA5eUgbXefu2Fu7xtE03TbFsgjEwLL5BT-mj0,4447
-ultimate_sitemap_parser-1.2.0.dist-info/NOTICE,sha256=3ANZA5R9rYnCOnUoroGfFUOZ__ww_yG01NUAx0X6J7E,632
-ultimate_sitemap_parser-1.2.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
-ultimate_sitemap_parser-1.2.0.dist-info/entry_points.txt,sha256=v60w5WzqYlPOucntZUy0ydzlYwuAPSwoQY0KdT5ragQ,36
-ultimate_sitemap_parser-1.2.0.dist-info/RECORD,,