ultimate-sitemap-parser 1.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ultimate-sitemap-parser might be problematic. Click here for more details.
- ultimate_sitemap_parser-1.0.0rc1.dist-info/LICENSE +674 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/METADATA +109 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/NOTICE +12 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/RECORD +22 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/WHEEL +4 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/entry_points.txt +3 -0
- usp/__init__.py +5 -0
- usp/cli/__init__.py +1 -0
- usp/cli/_ls.py +105 -0
- usp/cli/_util.py +21 -0
- usp/cli/cli.py +27 -0
- usp/exceptions.py +35 -0
- usp/fetch_parse.py +1182 -0
- usp/helpers.py +293 -0
- usp/log.py +77 -0
- usp/objects/__init__.py +0 -0
- usp/objects/page.py +451 -0
- usp/objects/sitemap.py +436 -0
- usp/tree.py +114 -0
- usp/web_client/__init__.py +0 -0
- usp/web_client/abstract_client.py +189 -0
- usp/web_client/requests_client.py +150 -0
usp/objects/sitemap.py
ADDED
|
@@ -0,0 +1,436 @@
|
|
|
1
|
+
"""Objects that represent one of the found sitemaps.
|
|
2
|
+
|
|
3
|
+
.. seealso::
|
|
4
|
+
|
|
5
|
+
:doc:`Reference of classes used for each format </reference/formats>`
|
|
6
|
+
|
|
7
|
+
.. inheritance-diagram:: AbstractSitemap InvalidSitemap AbstractIndexSitemap IndexWebsiteSitemap IndexXMLSitemap IndexRobotsTxtSitemap AbstractPagesSitemap PagesXMLSitemap PagesTextSitemap PagesRSSSitemap PagesAtomSitemap
|
|
8
|
+
:parts: 1
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import abc
|
|
12
|
+
from functools import lru_cache
|
|
13
|
+
import os
|
|
14
|
+
import pickle
|
|
15
|
+
import tempfile
|
|
16
|
+
from typing import List, Iterator, Tuple
|
|
17
|
+
|
|
18
|
+
from .page import SitemapPage
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# TODO: change to functools.cache when dropping py3.8
|
|
22
|
+
@lru_cache(maxsize=None)
|
|
23
|
+
def _all_slots(target_cls):
|
|
24
|
+
mro = target_cls.__mro__
|
|
25
|
+
|
|
26
|
+
# If a child class doesn't declare slots, getattr reports its parents' slots
|
|
27
|
+
# So we need to track the highest class that declared each slot
|
|
28
|
+
last_slot = {}
|
|
29
|
+
|
|
30
|
+
for cls in mro:
|
|
31
|
+
attrs = getattr(cls, "__slots__", tuple())
|
|
32
|
+
for attr in attrs:
|
|
33
|
+
last_slot[attr] = cls
|
|
34
|
+
|
|
35
|
+
slots = set()
|
|
36
|
+
for attr, cls in last_slot.items():
|
|
37
|
+
# Attrs belonging to parent classes may be mangled
|
|
38
|
+
if cls is not target_cls and attr.startswith("__"):
|
|
39
|
+
attr = "_" + cls.__name__ + attr
|
|
40
|
+
slots.add(attr)
|
|
41
|
+
|
|
42
|
+
return slots
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class AbstractSitemap(metaclass=abc.ABCMeta):
    """
    Abstract sitemap.

    Base class of every sitemap variant: stores the sitemap URL and defines
    the uniform traversal interface (:meth:`all_pages` / :meth:`all_sitemaps`).
    """

    __slots__ = [
        "__url",
    ]

    def __init__(self, url: str):
        """
        Initialize a new sitemap.

        :param url: Sitemap URL.
        """
        self.__url = url

    def __eq__(self, other) -> bool:
        if not isinstance(other, AbstractSitemap):
            raise NotImplementedError
        return self.url == other.url

    def __hash__(self):
        return hash((self.url,))

    def __repr__(self):
        return f"{self.__class__.__name__}(url={self.url})"

    @property
    def url(self) -> str:
        """
        Return sitemap URL.

        :return: Sitemap URL.
        """
        return self.__url

    def to_dict(self, with_pages=True) -> dict:
        """
        Return a dictionary representation of the sitemap, including its child sitemaps and optionally pages.

        :param with_pages: Include pages in the representation of this sitemap or descendants.
        :return: Dictionary representation of the sitemap.
        """
        return {"url": self.url}

    @property
    @abc.abstractmethod
    def pages(self) -> List[SitemapPage]:
        """
        Return a list of pages found in a sitemap (if any).

        Implementations that cannot contain pages return an empty list so the
        tree can be traversed through a consistent interface.

        :return: the list of pages, or an empty list.
        """
        raise NotImplementedError("Abstract method")

    # TODO: return custom iterator with set length here?
    def all_pages(self) -> Iterator[SitemapPage]:
        """
        Return iterator which yields all pages of this sitemap and linked sitemaps (if any).

        :return: Iterator which yields all pages of this sitemap and linked sitemaps (if any).
        """
        for page in self.pages:
            yield page

    @property
    @abc.abstractmethod
    def sub_sitemaps(self) -> List["AbstractSitemap"]:
        """
        Return a list of sub-sitemaps of this sitemap (if any).

        Implementations that cannot contain sub-sitemaps return an empty list
        so the tree can be traversed through a consistent interface.

        :return: the list of sub-sitemaps, or an empty list.
        """
        raise NotImplementedError("Abstract method")

    def all_sitemaps(self) -> Iterator["AbstractSitemap"]:
        """
        Return iterator which yields all sub-sitemaps descended from this sitemap.

        :return: Iterator which yields all sub-sitemaps descended from this sitemap.
        """
        for child in self.sub_sitemaps:
            yield child
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class InvalidSitemap(AbstractSitemap):
    """Invalid sitemap, e.g. the one that can't be parsed."""

    __slots__ = [
        "__reason",
    ]

    def __init__(self, url: str, reason: str):
        """
        Initialize a new invalid sitemap.

        :param url: Sitemap URL.
        :param reason: Reason why the sitemap is deemed invalid.
        """
        super().__init__(url=url)
        self.__reason = reason

    # NOTE: declaring __eq__ without __hash__ makes instances unhashable
    # (Python discards the inherited __hash__), same as the other subclasses.
    def __eq__(self, other) -> bool:
        if not isinstance(other, InvalidSitemap):
            raise NotImplementedError
        return self.url == other.url and self.reason == other.reason

    def __repr__(self):
        return f"{self.__class__.__name__}(url={self.url}, reason={self.reason})"

    def to_dict(self, with_pages=True) -> dict:
        result = dict(super().to_dict(with_pages=with_pages))
        result["reason"] = self.reason
        return result

    @property
    def reason(self) -> str:
        """
        Return reason why the sitemap is deemed invalid.

        :return: Reason why the sitemap is deemed invalid.
        """
        return self.__reason

    @property
    def pages(self) -> List[SitemapPage]:
        """
        Return an empty list of pages, as invalid sitemaps have no pages.

        :return: Empty list of pages.
        """
        return []

    @property
    def sub_sitemaps(self) -> List["AbstractSitemap"]:
        """
        Return an empty list of sub-sitemaps, as invalid sitemaps have no sub-sitemaps.

        :return: Empty list of sub-sitemaps.
        """
        return []
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
class AbstractPagesSitemap(AbstractSitemap, metaclass=abc.ABCMeta):
    """Abstract sitemap that contains URLs to pages.

    To keep memory usage low, pages are pickled to a temporary file on disk
    and loaded back on demand via the :attr:`pages` property; the swap file is
    removed when the object is garbage-collected.
    """

    __slots__ = [
        "__pages_temp_file_path",
    ]

    def __init__(self, url: str, pages: List[SitemapPage]):
        """
        Initialize new pages sitemap.

        :param url: Sitemap URL.
        :param pages: List of pages found in a sitemap.
        """
        super().__init__(url=url)

        self._dump_pages(pages)

    def _dump_pages(self, pages: List[SitemapPage]):
        """Pickle ``pages`` to a fresh temporary file and remember its path."""
        temp_fd, self.__pages_temp_file_path = tempfile.mkstemp()
        # Wrap the OS-level descriptor returned by mkstemp() instead of
        # re-opening the path: the previous implementation never closed this
        # descriptor, leaking one fd per parsed sitemap.
        with os.fdopen(temp_fd, "wb") as tmp:
            pickle.dump(pages, tmp, protocol=pickle.HIGHEST_PROTOCOL)

    def __del__(self):
        # Best-effort removal of the swap file. AttributeError covers objects
        # whose __init__ failed before the path was assigned; OSError covers a
        # file that is already gone (e.g. during interpreter shutdown).
        try:
            os.unlink(self.__pages_temp_file_path)
        except (AttributeError, OSError):
            pass

    def __eq__(self, other) -> bool:
        if not isinstance(other, AbstractPagesSitemap):
            raise NotImplementedError

        if self.url != other.url:
            return False

        # Note: compares the full (unpickled) page lists — O(n) disk read.
        if self.pages != other.pages:
            return False

        return True

    def __repr__(self):
        return f"{self.__class__.__name__}(url={self.url}, pages={self.pages})"

    def __getstate__(self) -> Tuple[None, dict]:
        """Support pickling by inlining the swap file's contents into the state."""
        # Load slots of this class and its parents (mangling if appropriate)
        obj_slots = {slot: getattr(self, slot) for slot in _all_slots(self.__class__)}
        # Replace temp file path with actual content
        del obj_slots["_AbstractPagesSitemap__pages_temp_file_path"]
        obj_slots["_pages_value"] = self.pages
        return None, obj_slots

    def __setstate__(self, state: tuple):
        """Restore slot values and re-create the on-disk pages swap file."""
        _, attrs = state
        # We can't restore contents without this key
        if "_pages_value" not in attrs:
            raise ValueError("State does not contain pages value")
        pages_val = attrs.pop("_pages_value")
        for slot, val in attrs.items():
            setattr(self, slot, val)
        self._dump_pages(pages_val)

    def to_dict(self, with_pages=True) -> dict:
        obj = {
            **super().to_dict(with_pages=with_pages),
        }

        if with_pages:
            obj["pages"] = [page.to_dict() for page in self.pages]

        return obj

    @property
    def pages(self) -> List[SitemapPage]:
        """
        Load pages from disk swap file and return them.

        :return: List of pages found in the sitemap.
        """
        with open(self.__pages_temp_file_path, "rb") as tmp:
            pages = pickle.load(tmp)
        return pages

    @property
    def sub_sitemaps(self) -> List["AbstractSitemap"]:
        """
        Return an empty list of sub-sitemaps, as pages sitemaps have no sub-sitemaps.

        :return: Empty list of sub-sitemaps.
        """
        return []
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
class PagesXMLSitemap(AbstractPagesSitemap):
    """
    XML sitemap that contains URLs to pages.
    """

    # No extra state; empty __slots__ keeps instances from growing a __dict__
    # and defeating the parent classes' slots-based memory savings.
    __slots__ = []
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
class PagesTextSitemap(AbstractPagesSitemap):
    """
    Plain text sitemap that contains URLs to pages.
    """

    # No extra state; empty __slots__ keeps instances from growing a __dict__
    # and defeating the parent classes' slots-based memory savings.
    __slots__ = []
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
class PagesRSSSitemap(AbstractPagesSitemap):
    """
    RSS 2.0 sitemap that contains URLs to pages.
    """

    # No extra state; empty __slots__ keeps instances from growing a __dict__
    # and defeating the parent classes' slots-based memory savings.
    __slots__ = []
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
class PagesAtomSitemap(AbstractPagesSitemap):
    """
    Atom 0.3 / 1.0 sitemap that contains URLs to pages.
    """

    # No extra state; empty __slots__ keeps instances from growing a __dict__
    # and defeating the parent classes' slots-based memory savings.
    __slots__ = []
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
class AbstractIndexSitemap(AbstractSitemap):
    """
    Abstract sitemap with URLs to other sitemaps.
    """

    __slots__ = [
        "__sub_sitemaps",
    ]

    def __init__(self, url: str, sub_sitemaps: List[AbstractSitemap]):
        """
        Initialize index sitemap.

        :param url: Sitemap URL.
        :param sub_sitemaps: Sub-sitemaps that are linked to from this sitemap.
        """
        super().__init__(url=url)
        self.__sub_sitemaps = sub_sitemaps

    def __eq__(self, other) -> bool:
        if not isinstance(other, AbstractIndexSitemap):
            raise NotImplementedError
        return self.url == other.url and self.sub_sitemaps == other.sub_sitemaps

    def __repr__(self):
        return (
            f"{self.__class__.__name__}("
            f"url={self.url}, "
            f"sub_sitemaps={self.sub_sitemaps}"
            ")"
        )

    def to_dict(self, with_pages=True) -> dict:
        result = dict(super().to_dict(with_pages=with_pages))
        result["sub_sitemaps"] = [
            child.to_dict(with_pages=with_pages) for child in self.sub_sitemaps
        ]
        return result

    @property
    def sub_sitemaps(self) -> List["AbstractSitemap"]:
        """Return the sub-sitemaps directly linked from this sitemap."""
        return self.__sub_sitemaps

    @property
    def pages(self) -> List[SitemapPage]:
        """
        Return an empty list of pages, as index sitemaps have no pages.

        :return: Empty list of pages.
        """
        return []

    def all_pages(self) -> Iterator[SitemapPage]:
        """
        Return iterator which yields all pages of this sitemap and linked sitemaps (if any).

        :return: Iterator which yields all pages of this sitemap and linked sitemaps (if any).
        """
        for child in self.sub_sitemaps:
            yield from child.all_pages()

    def all_sitemaps(self) -> Iterator["AbstractSitemap"]:
        """
        Return iterator which yields all sub-sitemaps of this sitemap.

        :return: Iterator which yields all sub-sitemaps of this sitemap.
        """
        # Depth-first: each direct child, then everything beneath it.
        for child in self.sub_sitemaps:
            yield child
            yield from child.all_sitemaps()
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
class IndexWebsiteSitemap(AbstractIndexSitemap):
    """
    Website's root sitemaps, including robots.txt and extra ones.
    """

    # No extra state; empty __slots__ keeps instances from growing a __dict__
    # and defeating the parent classes' slots-based memory savings.
    __slots__ = []
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
class IndexXMLSitemap(AbstractIndexSitemap):
    """
    XML sitemap with URLs to other sitemaps.
    """

    # No extra state; empty __slots__ keeps instances from growing a __dict__
    # and defeating the parent classes' slots-based memory savings.
    __slots__ = []
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
class IndexRobotsTxtSitemap(AbstractIndexSitemap):
    """
    robots.txt sitemap with URLs to other sitemaps.
    """

    # No extra state; empty __slots__ keeps instances from growing a __dict__
    # and defeating the parent classes' slots-based memory savings.
    __slots__ = []
|
usp/tree.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""Helpers to generate a sitemap tree."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
from .exceptions import SitemapException
|
|
5
|
+
from .fetch_parse import SitemapFetcher, SitemapStrParser
|
|
6
|
+
from .helpers import is_http_url, strip_url_to_homepage
|
|
7
|
+
from .log import create_logger
|
|
8
|
+
from .objects.sitemap import (
|
|
9
|
+
AbstractSitemap,
|
|
10
|
+
InvalidSitemap,
|
|
11
|
+
IndexWebsiteSitemap,
|
|
12
|
+
IndexRobotsTxtSitemap,
|
|
13
|
+
)
|
|
14
|
+
from .web_client.abstract_client import AbstractWebClient
|
|
15
|
+
|
|
16
|
+
log = create_logger(__name__)
|
|
17
|
+
|
|
18
|
+
# Candidate locations probed by sitemap_tree_for_homepage() when
# use_known_paths=True; each entry is appended directly to the homepage URL
# (which is normalized to end with "/"), hence no leading slashes here.
_UNPUBLISHED_SITEMAP_PATHS = {
    "sitemap.xml",
    "sitemap.xml.gz",
    "sitemap_index.xml",
    "sitemap-index.xml",
    "sitemap_index.xml.gz",
    "sitemap-index.xml.gz",
    ".sitemap.xml",
    "sitemap",
    "admin/config/search/xmlsitemap",
    "sitemap/sitemap-index.xml",
    "sitemap_news.xml",
    "sitemap-news.xml",
    "sitemap_news.xml.gz",
    "sitemap-news.xml.gz",
}
"""Paths which are not exposed in robots.txt but might still contain a sitemap."""
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def sitemap_tree_for_homepage(
    homepage_url: str,
    web_client: Optional[AbstractWebClient] = None,
    use_robots: bool = True,
    use_known_paths: bool = True,
) -> AbstractSitemap:
    """
    Using a homepage URL, fetch the tree of sitemaps and pages listed in them.

    :param homepage_url: Homepage URL of a website to fetch the sitemap tree for, e.g. "http://www.example.com/".
    :param web_client: Custom web client implementation to use when fetching sitemaps.
        If ``None``, a :class:`~.RequestsWebClient` will be used.
    :param use_robots: Whether to discover sitemaps through robots.txt.
    :param use_known_paths: Whether to discover sitemaps through common known paths.
    :return: Root sitemap object of the fetched sitemap tree.
    """

    if not is_http_url(homepage_url):
        raise SitemapException(f"URL {homepage_url} is not a HTTP(s) URL.")

    # Normalize the URL down to the site root, warning if that changed it.
    stripped_homepage_url = strip_url_to_homepage(url=homepage_url)
    if homepage_url != stripped_homepage_url:
        log.warning(
            f"Assuming that the homepage of {homepage_url} is {stripped_homepage_url}"
        )
        homepage_url = stripped_homepage_url

    if not homepage_url.endswith("/"):
        homepage_url += "/"
    robots_txt_url = homepage_url + "robots.txt"

    discovered_sitemaps = []

    # URLs already reachable via robots.txt, to avoid refetching them below.
    robots_txt_sitemap_urls = set()
    if use_robots:
        robots_txt_sitemap = SitemapFetcher(
            url=robots_txt_url, web_client=web_client, recursion_level=0
        ).sitemap()
        if not isinstance(robots_txt_sitemap, InvalidSitemap):
            discovered_sitemaps.append(robots_txt_sitemap)

        if isinstance(robots_txt_sitemap, IndexRobotsTxtSitemap):
            for child_sitemap in robots_txt_sitemap.all_sitemaps():
                robots_txt_sitemap_urls.add(child_sitemap.url)

    if use_known_paths:
        for known_path in _UNPUBLISHED_SITEMAP_PATHS:
            candidate_url = homepage_url + known_path

            # Don't refetch URLs already found in robots.txt
            if candidate_url in robots_txt_sitemap_urls:
                continue

            candidate_sitemap = SitemapFetcher(
                url=candidate_url,
                web_client=web_client,
                recursion_level=0,
            ).sitemap()

            # Skip the ones that weren't found
            if not isinstance(candidate_sitemap, InvalidSitemap):
                discovered_sitemaps.append(candidate_sitemap)

    return IndexWebsiteSitemap(url=homepage_url, sub_sitemaps=discovered_sitemaps)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def sitemap_from_str(content: str) -> AbstractSitemap:
    """Parse sitemap from a string.

    Will return the parsed sitemaps, and any sub-sitemaps will be returned as :class:`~.InvalidSitemap`.

    :param content: Sitemap string to parse
    :return: Parsed sitemap
    """
    parser = SitemapStrParser(static_content=content)
    return parser.sitemap()
|
|
File without changes
|