ultimate-sitemap-parser 1.0.0rc1__py3-none-any.whl

usp/objects/sitemap.py ADDED
@@ -0,0 +1,436 @@
+ """Objects that represent one of the found sitemaps.
+
+ .. seealso::
+
+     :doc:`Reference of classes used for each format </reference/formats>`
+
+ .. inheritance-diagram:: AbstractSitemap InvalidSitemap AbstractIndexSitemap IndexWebsiteSitemap IndexXMLSitemap IndexRobotsTxtSitemap AbstractPagesSitemap PagesXMLSitemap PagesTextSitemap PagesRSSSitemap PagesAtomSitemap
+     :parts: 1
+ """
+
+ import abc
+ from functools import lru_cache
+ import os
+ import pickle
+ import tempfile
+ from typing import List, Iterator, Tuple
+
+ from .page import SitemapPage
+
+
+ # TODO: change to functools.cache when dropping py3.8
+ @lru_cache(maxsize=None)
+ def _all_slots(target_cls):
+     mro = target_cls.__mro__
+
+     # If a child class doesn't declare slots, getattr reports its parents' slots
+     # So we need to track the highest class that declared each slot
+     last_slot = {}
+
+     for cls in mro:
+         attrs = getattr(cls, "__slots__", tuple())
+         for attr in attrs:
+             last_slot[attr] = cls
+
+     slots = set()
+     for attr, cls in last_slot.items():
+         # Attrs belonging to parent classes may be mangled
+         if cls is not target_cls and attr.startswith("__"):
+             attr = "_" + cls.__name__ + attr
+         slots.add(attr)
+
+     return slots
+
+
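The helper above walks the MRO and re-applies Python's private-name mangling, so that slots declared with a double-underscore prefix on a parent class resolve to the attribute names actually stored on the instance. A minimal sketch of what it computes, using a hypothetical hierarchy that is not part of the package:

    class Base:
        __slots__ = ["__secret"]

        def __init__(self):
            self.__secret = 1  # stored under the mangled slot name "_Base__secret"

    class Child(Base):
        __slots__ = ["plain"]

    # _all_slots(Child) returns {"_Base__secret", "plain"}: "__secret" was
    # declared on Base rather than the target class, so it gets mangled;
    # "plain" is left as-is.
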
+ class AbstractSitemap(metaclass=abc.ABCMeta):
+     """
+     Abstract sitemap.
+     """
+
+     __slots__ = [
+         "__url",
+     ]
+
+     def __init__(self, url: str):
+         """
+         Initialize a new sitemap.
+
+         :param url: Sitemap URL.
+         """
+         self.__url = url
+
+     def __eq__(self, other) -> bool:
+         if not isinstance(other, AbstractSitemap):
+             raise NotImplementedError
+
+         if self.url != other.url:
+             return False
+
+         return True
+
+     def __hash__(self):
+         return hash((self.url,))
+
+     def __repr__(self):
+         return f"{self.__class__.__name__}(" f"url={self.url}" ")"
+
+     @property
+     def url(self) -> str:
+         """
+         Return sitemap URL.
+
+         :return: Sitemap URL.
+         """
+         return self.__url
+
+     def to_dict(self, with_pages=True) -> dict:
+         """
+         Return a dictionary representation of the sitemap, including its child sitemaps and, optionally, pages.
+
+         :param with_pages: Include pages in the representation of this sitemap or its descendants.
+         :return: Dictionary representation of the sitemap.
+         """
+
+         return {
+             "url": self.url,
+         }
+
+     @property
+     @abc.abstractmethod
+     def pages(self) -> List[SitemapPage]:
+         """
+         Return a list of pages found in a sitemap (if any).
+
+         Should return an empty list if this sitemap cannot have pages, to allow traversal with a consistent interface.
+
+         :return: The list of pages, or an empty list.
+         """
+         raise NotImplementedError("Abstract method")
+
+     # TODO: return custom iterator with set length here?
+     def all_pages(self) -> Iterator[SitemapPage]:
+         """
+         Return iterator which yields all pages of this sitemap and linked sitemaps (if any).
+
+         :return: Iterator which yields all pages of this sitemap and linked sitemaps (if any).
+         """
+         yield from self.pages
+
+     @property
+     @abc.abstractmethod
+     def sub_sitemaps(self) -> List["AbstractSitemap"]:
+         """
+         Return a list of sub-sitemaps of this sitemap (if any).
+
+         Should return an empty list if this sitemap cannot have sub-sitemaps, to allow traversal with a consistent interface.
+
+         :return: The list of sub-sitemaps, or an empty list.
+         """
+         raise NotImplementedError("Abstract method")
+
+     def all_sitemaps(self) -> Iterator["AbstractSitemap"]:
+         """
+         Return iterator which yields all sub-sitemaps descended from this sitemap.
+
+         :return: Iterator which yields all sub-sitemaps descended from this sitemap.
+         """
+         yield from self.sub_sitemaps
+
+
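The two abstract properties are the whole traversal contract: every node answers both pages and sub_sitemaps, returning an empty list rather than raising when a kind of child does not apply, which is what lets all_pages() and all_sitemaps() recurse blindly. A minimal, hypothetical subclass for illustration (not part of the package):

    class EmptySitemap(AbstractSitemap):
        """Hypothetical leaf node with neither pages nor children."""

        @property
        def pages(self):
            return []  # no pages of its own

        @property
        def sub_sitemaps(self):
            return []  # and no child sitemaps

    # list(EmptySitemap("https://example.com/x.xml").all_pages()) == []
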
+ class InvalidSitemap(AbstractSitemap):
+     """Invalid sitemap, e.g. one that can't be parsed."""
+
+     __slots__ = [
+         "__reason",
+     ]
+
+     def __init__(self, url: str, reason: str):
+         """
+         Initialize a new invalid sitemap.
+
+         :param url: Sitemap URL.
+         :param reason: Reason why the sitemap is deemed invalid.
+         """
+         super().__init__(url=url)
+         self.__reason = reason
+
+     def __eq__(self, other) -> bool:
+         if not isinstance(other, InvalidSitemap):
+             raise NotImplementedError
+
+         if self.url != other.url:
+             return False
+
+         if self.reason != other.reason:
+             return False
+
+         return True
+
+     def __repr__(self):
+         return (
+             f"{self.__class__.__name__}("
+             f"url={self.url}, "
+             f"reason={self.reason}"
+             ")"
+         )
+
+     def to_dict(self, with_pages=True) -> dict:
+         return {
+             **super().to_dict(with_pages=with_pages),
+             "reason": self.reason,
+         }
+
+     @property
+     def reason(self) -> str:
+         """
+         Return reason why the sitemap is deemed invalid.
+
+         :return: Reason why the sitemap is deemed invalid.
+         """
+         return self.__reason
+
+     @property
+     def pages(self) -> List[SitemapPage]:
+         """
+         Return an empty list of pages, as invalid sitemaps have no pages.
+
+         :return: Empty list of pages.
+         """
+         return []
+
+     @property
+     def sub_sitemaps(self) -> List["AbstractSitemap"]:
+         """
+         Return an empty list of sub-sitemaps, as invalid sitemaps have no sub-sitemaps.
+
+         :return: Empty list of sub-sitemaps.
+         """
+         return []
+
+
+ class AbstractPagesSitemap(AbstractSitemap, metaclass=abc.ABCMeta):
+     """Abstract sitemap that contains URLs to pages."""
+
+     __slots__ = [
+         "__pages_temp_file_path",
+     ]
+
+     def __init__(self, url: str, pages: List[SitemapPage]):
+         """
+         Initialize new pages sitemap.
+
+         :param url: Sitemap URL.
+         :param pages: List of pages found in a sitemap.
+         """
+         super().__init__(url=url)
+
+         self._dump_pages(pages)
+
+     def _dump_pages(self, pages: List[SitemapPage]):
+         temp_file, self.__pages_temp_file_path = tempfile.mkstemp()
+         with open(self.__pages_temp_file_path, "wb") as tmp:
+             pickle.dump(pages, tmp, protocol=pickle.HIGHEST_PROTOCOL)
+
+     def __del__(self):
+         os.unlink(self.__pages_temp_file_path)
+
+     def __eq__(self, other) -> bool:
+         if not isinstance(other, AbstractPagesSitemap):
+             raise NotImplementedError
+
+         if self.url != other.url:
+             return False
+
+         if self.pages != other.pages:
+             return False
+
+         return True
+
+     def __repr__(self):
+         return f"{self.__class__.__name__}(url={self.url}, pages={self.pages})"
+
+     def __getstate__(self) -> Tuple[None, dict]:
+         # Load slots of this class and its parents (mangling if appropriate)
+         obj_slots = {slot: getattr(self, slot) for slot in _all_slots(self.__class__)}
+         # Replace temp file path with actual content
+         del obj_slots["_AbstractPagesSitemap__pages_temp_file_path"]
+         obj_slots["_pages_value"] = self.pages
+         return None, obj_slots
+
+     def __setstate__(self, state: tuple):
+         _, attrs = state
+         # We can't restore contents without this key
+         if "_pages_value" not in attrs:
+             raise ValueError("State does not contain pages value")
+         pages_val = attrs.pop("_pages_value")
+         for slot, val in attrs.items():
+             setattr(self, slot, val)
+         self._dump_pages(pages_val)
+
+     def to_dict(self, with_pages=True) -> dict:
+         obj = {
+             **super().to_dict(with_pages=with_pages),
+         }
+
+         if with_pages:
+             obj["pages"] = [page.to_dict() for page in self.pages]
+
+         return obj
+
+     @property
+     def pages(self) -> List[SitemapPage]:
+         """
+         Load pages from disk swap file and return them.
+
+         :return: List of pages found in the sitemap.
+         """
+         with open(self.__pages_temp_file_path, "rb") as tmp:
+             pages = pickle.load(tmp)
+         return pages
+
+     @property
+     def sub_sitemaps(self) -> List["AbstractSitemap"]:
+         """
+         Return an empty list of sub-sitemaps, as pages sitemaps have no sub-sitemaps.
+
+         :return: Empty list of sub-sitemaps.
+         """
+         return []
+
+
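Because pages are spilled to a temporary file, __getstate__ and __setstate__ above swap the file path out of the pickled state for the pages themselves, then re-dump them to a fresh temp file on restore. A sketch of the round trip using the PagesXMLSitemap subclass defined just below, assuming SitemapPage can be constructed from a bare URL (its real signature may take more fields):

    import pickle

    original = PagesXMLSitemap(
        url="https://example.com/sitemap.xml",
        pages=[SitemapPage(url="https://example.com/page.html")],
    )
    restored = pickle.loads(pickle.dumps(original))

    # The temp file path itself was never pickled; the restored copy
    # wrote its pages to a new temp file of its own.
    assert restored.pages == original.pages
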
+ # TODO: declare empty __slots__
+ class PagesXMLSitemap(AbstractPagesSitemap):
+     """
+     XML sitemap that contains URLs to pages.
+     """
+
+     pass
+
+
+ class PagesTextSitemap(AbstractPagesSitemap):
+     """
+     Plain text sitemap that contains URLs to pages.
+     """
+
+     pass
+
+
+ class PagesRSSSitemap(AbstractPagesSitemap):
+     """
+     RSS 2.0 sitemap that contains URLs to pages.
+     """
+
+     pass
+
+
+ class PagesAtomSitemap(AbstractPagesSitemap):
+     """
+     Atom 0.3 / 1.0 sitemap that contains URLs to pages.
+     """
+
+     pass
+
+
+ class AbstractIndexSitemap(AbstractSitemap):
+     """
+     Abstract sitemap with URLs to other sitemaps.
+     """
+
+     __slots__ = [
+         "__sub_sitemaps",
+     ]
+
+     def __init__(self, url: str, sub_sitemaps: List[AbstractSitemap]):
+         """
+         Initialize index sitemap.
+
+         :param url: Sitemap URL.
+         :param sub_sitemaps: Sub-sitemaps that are linked to from this sitemap.
+         """
+         super().__init__(url=url)
+         self.__sub_sitemaps = sub_sitemaps
+
+     def __eq__(self, other) -> bool:
+         if not isinstance(other, AbstractIndexSitemap):
+             raise NotImplementedError
+
+         if self.url != other.url:
+             return False
+
+         if self.sub_sitemaps != other.sub_sitemaps:
+             return False
+
+         return True
+
+     def __repr__(self):
+         return (
+             f"{self.__class__.__name__}("
+             f"url={self.url}, "
+             f"sub_sitemaps={self.sub_sitemaps}"
+             ")"
+         )
+
+     def to_dict(self, with_pages=True) -> dict:
+         return {
+             **super().to_dict(with_pages=with_pages),
+             "sub_sitemaps": [
+                 sub_sitemap.to_dict(with_pages=with_pages)
+                 for sub_sitemap in self.sub_sitemaps
+             ],
+         }
+
+     @property
+     def sub_sitemaps(self) -> List["AbstractSitemap"]:
+         return self.__sub_sitemaps
+
+     @property
+     def pages(self) -> List[SitemapPage]:
+         """
+         Return an empty list of pages, as index sitemaps have no pages.
+
+         :return: Empty list of pages.
+         """
+         return []
+
+     def all_pages(self) -> Iterator[SitemapPage]:
+         """
+         Return iterator which yields all pages of this sitemap and linked sitemaps (if any).
+
+         :return: Iterator which yields all pages of this sitemap and linked sitemaps (if any).
+         """
+         for sub_sitemap in self.sub_sitemaps:
+             yield from sub_sitemap.all_pages()
+
+     def all_sitemaps(self) -> Iterator["AbstractSitemap"]:
+         """
+         Return iterator which yields all sub-sitemaps of this sitemap.
+
+         :return: Iterator which yields all sub-sitemaps of this sitemap.
+         """
+         for sub_sitemap in self.sub_sitemaps:
+             yield sub_sitemap
+             yield from sub_sitemap.all_sitemaps()
+
+
+ class IndexWebsiteSitemap(AbstractIndexSitemap):
+     """
+     Website's root sitemap, aggregating the sitemaps found via robots.txt and extra known paths.
+     """
+
+     pass
+
+
+ class IndexXMLSitemap(AbstractIndexSitemap):
+     """
+     XML sitemap with URLs to other sitemaps.
+     """
+
+     pass
+
+
+ class IndexRobotsTxtSitemap(AbstractIndexSitemap):
+     """
+     robots.txt sitemap with URLs to other sitemaps.
+     """
+
+     pass
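Taken together, these classes form a tree that is walked through one interface. A hand-built sketch (in practice the objects come from the fetcher; assumes SitemapPage accepts a bare URL):

    tree = IndexWebsiteSitemap(
        url="https://example.com/",
        sub_sitemaps=[
            PagesXMLSitemap(
                url="https://example.com/sitemap.xml",
                pages=[SitemapPage(url="https://example.com/news/1.html")],
            ),
        ],
    )

    # all_sitemaps() recurses through index nodes; all_pages() flattens
    # every page beneath this node.
    print([s.url for s in tree.all_sitemaps()])  # the XML sitemap's URL
    print([p.url for p in tree.all_pages()])     # the single page's URL
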
usp/tree.py ADDED
@@ -0,0 +1,114 @@
+ """Helpers to generate a sitemap tree."""
+
+ from typing import Optional
+ from .exceptions import SitemapException
+ from .fetch_parse import SitemapFetcher, SitemapStrParser
+ from .helpers import is_http_url, strip_url_to_homepage
+ from .log import create_logger
+ from .objects.sitemap import (
+     AbstractSitemap,
+     InvalidSitemap,
+     IndexWebsiteSitemap,
+     IndexRobotsTxtSitemap,
+ )
+ from .web_client.abstract_client import AbstractWebClient
+
+ log = create_logger(__name__)
+
+ _UNPUBLISHED_SITEMAP_PATHS = {
+     "sitemap.xml",
+     "sitemap.xml.gz",
+     "sitemap_index.xml",
+     "sitemap-index.xml",
+     "sitemap_index.xml.gz",
+     "sitemap-index.xml.gz",
+     ".sitemap.xml",
+     "sitemap",
+     "admin/config/search/xmlsitemap",
+     "sitemap/sitemap-index.xml",
+     "sitemap_news.xml",
+     "sitemap-news.xml",
+     "sitemap_news.xml.gz",
+     "sitemap-news.xml.gz",
+ }
+ """Paths which are not exposed in robots.txt but might still contain a sitemap."""
+
+
+ def sitemap_tree_for_homepage(
+     homepage_url: str,
+     web_client: Optional[AbstractWebClient] = None,
+     use_robots: bool = True,
+     use_known_paths: bool = True,
+ ) -> AbstractSitemap:
+     """
+     Using a homepage URL, fetch the tree of sitemaps and pages listed in them.
+
+     :param homepage_url: Homepage URL of a website to fetch the sitemap tree for, e.g. "http://www.example.com/".
+     :param web_client: Custom web client implementation to use when fetching sitemaps.
+         If ``None``, a :class:`~.RequestsWebClient` will be used.
+     :param use_robots: Whether to discover sitemaps through robots.txt.
+     :param use_known_paths: Whether to discover sitemaps through common known paths.
+     :return: Root sitemap object of the fetched sitemap tree.
+     """
+
+     if not is_http_url(homepage_url):
+         raise SitemapException(f"URL {homepage_url} is not a HTTP(s) URL.")
+
+     stripped_homepage_url = strip_url_to_homepage(url=homepage_url)
+     if homepage_url != stripped_homepage_url:
+         log.warning(
+             f"Assuming that the homepage of {homepage_url} is {stripped_homepage_url}"
+         )
+         homepage_url = stripped_homepage_url
+
+     if not homepage_url.endswith("/"):
+         homepage_url += "/"
+     robots_txt_url = homepage_url + "robots.txt"
+
+     sitemaps = []
+
+     sitemap_urls_found_in_robots_txt = set()
+     if use_robots:
+         robots_txt_fetcher = SitemapFetcher(
+             url=robots_txt_url, web_client=web_client, recursion_level=0
+         )
+         robots_txt_sitemap = robots_txt_fetcher.sitemap()
+         if not isinstance(robots_txt_sitemap, InvalidSitemap):
+             sitemaps.append(robots_txt_sitemap)
+
+         if isinstance(robots_txt_sitemap, IndexRobotsTxtSitemap):
+             for sub_sitemap in robots_txt_sitemap.all_sitemaps():
+                 sitemap_urls_found_in_robots_txt.add(sub_sitemap.url)
+
+     if use_known_paths:
+         for unpublished_sitemap_path in _UNPUBLISHED_SITEMAP_PATHS:
+             unpublished_sitemap_url = homepage_url + unpublished_sitemap_path
+
+             # Don't refetch URLs already found in robots.txt
+             if unpublished_sitemap_url not in sitemap_urls_found_in_robots_txt:
+                 unpublished_sitemap_fetcher = SitemapFetcher(
+                     url=unpublished_sitemap_url,
+                     web_client=web_client,
+                     recursion_level=0,
+                 )
+                 unpublished_sitemap = unpublished_sitemap_fetcher.sitemap()
+
+                 # Skip the ones that weren't found
+                 if not isinstance(unpublished_sitemap, InvalidSitemap):
+                     sitemaps.append(unpublished_sitemap)
+
+     index_sitemap = IndexWebsiteSitemap(url=homepage_url, sub_sitemaps=sitemaps)
+
+     return index_sitemap
+
+
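A typical call, sketched against the public API above (network access assumed; the default requests-based client is used when web_client is left as None):

    from usp.tree import sitemap_tree_for_homepage

    tree = sitemap_tree_for_homepage("https://www.example.com/")
    for page in tree.all_pages():
        print(page.url)

    # Discovery sources can be narrowed, e.g. to robots.txt only:
    tree = sitemap_tree_for_homepage("https://www.example.com/", use_known_paths=False)
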
+ def sitemap_from_str(content: str) -> AbstractSitemap:
+     """Parse a sitemap from a string.
+
+     Returns the parsed sitemap; since nothing is fetched, any sub-sitemaps it references are returned as :class:`~.InvalidSitemap`.
+
+     :param content: Sitemap string to parse
+     :return: Parsed sitemap
+     """
+     fetcher = SitemapStrParser(static_content=content)
+     return fetcher.sitemap()
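And a sketch of parsing from a string, with no network involved:

    from usp.tree import sitemap_from_str

    sitemap = sitemap_from_str(
        '<?xml version="1.0" encoding="UTF-8"?>'
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
        "<url><loc>https://example.com/page.html</loc></url>"
        "</urlset>"
    )
    print([p.url for p in sitemap.all_pages()])  # e.g. ['https://example.com/page.html']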