ultimate-sitemap-parser 1.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ultimate-sitemap-parser might be problematic. Click here for more details.

usp/fetch_parse.py ADDED
@@ -0,0 +1,1182 @@
1
+ """Sitemap fetchers and parsers.
2
+
3
+ .. seealso::
4
+
5
+ :doc:`Reference of classes used for each format </reference/formats>`
6
+
7
+ :doc:`Overview of parse process </guides/fetch-parse>`
8
+ """
9
+
10
+ import abc
11
+ import re
12
+ import xml.parsers.expat
13
+ from collections import OrderedDict
14
+ from decimal import Decimal, InvalidOperation
15
+ from typing import Optional, Dict, Union
16
+
17
+
18
+ from .exceptions import SitemapException, SitemapXMLParsingException
19
+ from .helpers import (
20
+ html_unescape_strip,
21
+ parse_iso8601_date,
22
+ get_url_retry_on_client_errors,
23
+ ungzipped_response_content,
24
+ is_http_url,
25
+ parse_rfc2822_date,
26
+ )
27
+ from .log import create_logger
28
+ from .objects.page import (
29
+ SitemapImage,
30
+ SitemapPage,
31
+ SitemapNewsStory,
32
+ SitemapPageChangeFrequency,
33
+ SITEMAP_PAGE_DEFAULT_PRIORITY,
34
+ )
35
+ from .objects.sitemap import (
36
+ AbstractSitemap,
37
+ InvalidSitemap,
38
+ IndexRobotsTxtSitemap,
39
+ IndexXMLSitemap,
40
+ PagesXMLSitemap,
41
+ PagesTextSitemap,
42
+ PagesRSSSitemap,
43
+ PagesAtomSitemap,
44
+ )
45
+ from .web_client.abstract_client import (
46
+ AbstractWebClient,
47
+ AbstractWebClientSuccessResponse,
48
+ WebClientErrorResponse,
49
+ )
50
+ from .web_client.abstract_client import LocalWebClient, NoWebClientException
51
+ from .web_client.requests_client import RequestsWebClient
52
+
53
+ log = create_logger(__name__)
54
+
55
+
56
class SitemapFetcher:
    """
    Fetches and parses the sitemap at a given URL, and any declared sub-sitemaps.
    """

    __MAX_SITEMAP_SIZE = 100 * 1024 * 1024
    """Max. uncompressed sitemap size.

    Spec says it might be up to 50 MB but let's go for the full 100 MB here."""

    __MAX_RECURSION_LEVEL = 11
    """Max. recursion level in iterating over sub-sitemaps."""

    __slots__ = [
        "_url",
        "_recursion_level",
        "_web_client",
    ]

    def __init__(
        self,
        url: str,
        recursion_level: int,
        web_client: Optional[AbstractWebClient] = None,
    ):
        """

        :param url: URL of the sitemap to fetch and parse.
        :param recursion_level: current recursion level of parser
        :param web_client: Web client to use. If ``None``, a :class:`~.RequestsWebClient` will be used.

        :raises SitemapException: If the maximum recursion depth is exceeded.
        :raises SitemapException: If the URL is not an HTTP(S) URL
        """
        if recursion_level > self.__MAX_RECURSION_LEVEL:
            raise SitemapException(
                f"Recursion level exceeded {self.__MAX_RECURSION_LEVEL} for URL {url}."
            )

        if not is_http_url(url):
            raise SitemapException(f"URL {url} is not a HTTP(s) URL.")

        if not web_client:
            web_client = RequestsWebClient()

        # Cap the amount of data any client will read for a single sitemap
        web_client.set_max_response_data_length(self.__MAX_SITEMAP_SIZE)

        self._url = url
        self._web_client = web_client
        self._recursion_level = recursion_level

    def _fetch(self) -> Union[str, WebClientErrorResponse]:
        """Fetch the sitemap body, transparently gunzipping it if needed.

        :return: the (uncompressed) response body, or the error response on failure.
        """
        log.info(f"Fetching level {self._recursion_level} sitemap from {self._url}...")
        response = get_url_retry_on_client_errors(
            url=self._url, web_client=self._web_client
        )

        if isinstance(response, WebClientErrorResponse):
            return response

        assert isinstance(response, AbstractWebClientSuccessResponse)

        return ungzipped_response_content(url=self._url, response=response)

    def sitemap(self) -> AbstractSitemap:
        """
        Fetch and parse the sitemap.

        :return: the parsed sitemap. Will be a child of :class:`~.AbstractSitemap`.
            If an HTTP error is encountered, or the sitemap cannot be parsed, will be :class:`~.InvalidSitemap`.
        """
        response_content = self._fetch()

        if isinstance(response_content, WebClientErrorResponse):
            return InvalidSitemap(
                url=self._url,
                reason=f"Unable to fetch sitemap from {self._url}: {response_content.message()}",
            )

        # MIME types returned in Content-Type are unpredictable, so peek into the content instead.
        # FIX: also strip a possible U+FEFF byte-order mark first — str.strip() does not remove it,
        # and a BOM-prefixed XML sitemap would otherwise be misrouted to the plain-text parser.
        if response_content[:20].lstrip("\ufeff").strip().startswith("<"):
            # XML sitemap (the specific kind is to be determined later)
            parser = XMLSitemapParser(
                url=self._url,
                content=response_content,
                recursion_level=self._recursion_level,
                web_client=self._web_client,
            )

        else:
            # Assume that it's some sort of a text file (robots.txt or plain text sitemap)
            if self._url.endswith("/robots.txt"):
                parser = IndexRobotsTxtSitemapParser(
                    url=self._url,
                    content=response_content,
                    recursion_level=self._recursion_level,
                    web_client=self._web_client,
                )
            else:
                parser = PlainTextSitemapParser(
                    url=self._url,
                    content=response_content,
                    recursion_level=self._recursion_level,
                    web_client=self._web_client,
                )

        log.info(f"Parsing sitemap from URL {self._url}...")
        sitemap = parser.sitemap()

        return sitemap
166
+
167
+
168
class SitemapStrParser(SitemapFetcher):
    """Custom fetcher to parse a string instead of download from a URL.

    This is a little bit hacky, but it allows us to support local content parsing without
    having to change too much.
    """

    __slots__ = ["_static_content"]

    def __init__(self, static_content: str):
        """Init a new string parser.

        :param static_content: String containing sitemap text to parse
        """
        # A dummy URL and a no-op web client satisfy the parent's invariants
        # without ever touching the network.
        super().__init__(
            url="http://usp-local-dummy.local/",
            recursion_level=0,
            web_client=LocalWebClient(),
        )
        self._static_content = static_content

    def _fetch(self) -> Union[str, WebClientErrorResponse]:
        # Replay the supplied string instead of performing an HTTP fetch.
        return self._static_content
191
+
192
+
193
class AbstractSitemapParser(metaclass=abc.ABCMeta):
    """Abstract robots.txt / XML / plain text sitemap parser."""

    __slots__ = [
        "_url",
        "_content",
        "_web_client",
        "_recursion_level",
    ]

    def __init__(
        self,
        url: str,
        content: str,
        recursion_level: int,
        web_client: AbstractWebClient,
    ):
        """Store the raw sitemap body plus the fetch context needed for sub-parsing.

        :param url: URL the content was fetched from.
        :param content: raw, uncompressed sitemap body.
        :param recursion_level: current recursion depth of the parse.
        :param web_client: web client to reuse when fetching sub-sitemaps.
        """
        self._url = url
        self._recursion_level = recursion_level
        self._web_client = web_client
        self._content = content

    @abc.abstractmethod
    def sitemap(self) -> AbstractSitemap:
        """
        Create the parsed sitemap instance and perform any sub-parsing needed.

        :return: an instance of the appropriate sitemap class
        """
        raise NotImplementedError("Abstract method.")
223
+
224
+
225
class IndexRobotsTxtSitemapParser(AbstractSitemapParser):
    """robots.txt index sitemap parser."""

    def __init__(
        self,
        url: str,
        content: str,
        recursion_level: int,
        web_client: AbstractWebClient,
    ):
        """Validate that the URL is a robots.txt URL and store the fetch context.

        :raises SitemapException: if the URL does not end in ``/robots.txt``.
        """
        super().__init__(
            url=url,
            content=content,
            recursion_level=recursion_level,
            web_client=web_client,
        )

        if not self._url.endswith("/robots.txt"):
            raise SitemapException(
                f"URL does not look like robots.txt URL: {self._url}"
            )

    def sitemap(self) -> AbstractSitemap:
        """Collect every "Sitemap:" directive and recursively fetch each one.

        :return: an :class:`~.IndexRobotsTxtSitemap` holding all sub-sitemaps.
        """
        # OrderedDict keys double as an insertion-ordered, deduplicated set
        sitemap_urls = OrderedDict()

        for raw_line in self._content.splitlines():
            stripped_line = raw_line.strip()
            # robots.txt is supposed to be case sensitive but who cares in these Node.js times?
            directive_match = re.search(
                r"^site-?map:\s*(.+?)$", stripped_line, flags=re.IGNORECASE
            )
            if not directive_match:
                continue

            sitemap_url = directive_match.group(1)
            if is_http_url(sitemap_url):
                sitemap_urls[sitemap_url] = True
            else:
                log.warning(
                    f"Sitemap URL {sitemap_url} doesn't look like an URL, skipping"
                )

        sub_sitemaps = []

        for sitemap_url in sitemap_urls:
            try:
                fetched_sitemap = SitemapFetcher(
                    url=sitemap_url,
                    recursion_level=self._recursion_level + 1,
                    web_client=self._web_client,
                ).sitemap()
            except NoWebClientException:
                fetched_sitemap = InvalidSitemap(
                    url=sitemap_url, reason="Un-fetched child sitemap"
                )
            except Exception as ex:
                # Keep going: one broken child must not invalidate the whole index
                fetched_sitemap = InvalidSitemap(
                    url=sitemap_url,
                    reason=f"Unable to add sub-sitemap from URL {sitemap_url}: {str(ex)}",
                )
            sub_sitemaps.append(fetched_sitemap)

        return IndexRobotsTxtSitemap(url=self._url, sub_sitemaps=sub_sitemaps)
290
+
291
+
292
class PlainTextSitemapParser(AbstractSitemapParser):
    """Plain text sitemap parser."""

    def sitemap(self) -> AbstractSitemap:
        """Parse one URL per line, skipping blanks and non-HTTP(S) entries.

        :return: a :class:`~.PagesTextSitemap` with one page per unique URL.
        """
        # OrderedDict keys act as an insertion-ordered, deduplicated set
        story_urls = OrderedDict()

        for raw_line in self._content.splitlines():
            candidate_url = raw_line.strip()
            if not candidate_url:
                continue

            if is_http_url(candidate_url):
                story_urls[candidate_url] = True
            else:
                log.warning(f"Story URL {candidate_url} doesn't look like an URL, skipping")

        pages = [SitemapPage(url=page_url) for page_url in story_urls]

        return PagesTextSitemap(url=self._url, pages=pages)
315
+
316
+
317
class XMLSitemapParser(AbstractSitemapParser):
    """Initial XML sitemap parser.

    Instantiates an Expat parser and registers handler methods, which determine the specific format
    and instantiates a concrete parser (inheriting from :class:`AbstractXMLSitemapParser`) to extract data.
    """

    # Separator expat places between a namespace URI and the local element name
    # when ParserCreate() is given a namespace_separator
    __XML_NAMESPACE_SEPARATOR = " "

    __slots__ = [
        "_concrete_parser",
    ]

    def __init__(
        self,
        url: str,
        content: str,
        recursion_level: int,
        web_client: AbstractWebClient,
    ):
        """
        :param url: URL of the XML sitemap being parsed.
        :param content: raw XML content.
        :param recursion_level: current recursion depth of the parse.
        :param web_client: web client used by index sitemaps to fetch sub-sitemaps.
        """
        super().__init__(
            url=url,
            content=content,
            recursion_level=recursion_level,
            web_client=web_client,
        )

        # Will be initialized when the type of sitemap is known
        # (set by _xml_element_start() when the root element is seen)
        self._concrete_parser = None

    def sitemap(self) -> AbstractSitemap:
        """Run the expat parse and delegate sitemap construction to the concrete parser.

        :return: the parsed sitemap, or :class:`~.InvalidSitemap` if no root element was recognised.
        """
        parser = xml.parsers.expat.ParserCreate(
            namespace_separator=self.__XML_NAMESPACE_SEPARATOR
        )
        # Route all expat events through this object; the handlers forward to
        # the concrete parser once the root element has identified the format.
        parser.StartElementHandler = self._xml_element_start
        parser.EndElementHandler = self._xml_element_end
        parser.CharacterDataHandler = self._xml_char_data

        try:
            is_final = True
            parser.Parse(self._content, is_final)
        except Exception as ex:
            # Some sitemap XML files might end abruptly because webservers might be timing out on returning huge XML
            # files so don't return InvalidSitemap() but try to get as much pages as possible
            log.error(f"Parsing sitemap from URL {self._url} failed: {ex}")

        if not self._concrete_parser:
            return InvalidSitemap(
                url=self._url,
                reason=f"No parsers support sitemap from {self._url}",
            )

        return self._concrete_parser.sitemap()

    @classmethod
    def __normalize_xml_element_name(cls, name: str):
        """
        Replace the namespace URL in the argument element name with internal namespace.

        * Elements from http://www.sitemaps.org/schemas/sitemap/0.9 namespace will be prefixed with "sitemap:",
          e.g. "<loc>" will become "<sitemap:loc>"

        * Elements from http://www.google.com/schemas/sitemap-news/0.9 namespace will be prefixed with "news:",
          e.g. "<publication>" will become "<news:publication>"

        For non-sitemap namespaces, return the element name with the namespace stripped.

        :param name: Namespace URL plus XML element name, e.g. "http://www.sitemaps.org/schemas/sitemap/0.9 loc"
        :return: Internal namespace name plus element name, e.g. "sitemap loc"
        """

        name_parts = name.split(cls.__XML_NAMESPACE_SEPARATOR)

        if len(name_parts) == 1:
            # No namespace at all (element declared without an xmlns)
            namespace_url = ""
            name = name_parts[0]

        elif len(name_parts) == 2:
            namespace_url = name_parts[0]
            name = name_parts[1]

        else:
            raise SitemapXMLParsingException(
                f"Unable to determine namespace for element '{name}'"
            )

        # Match by URL substring rather than exact URL: sitemaps in the wild
        # use http/https and trailing-slash variants of the same namespaces
        if "/sitemap/" in namespace_url:
            name = f"sitemap:{name}"
        elif "/sitemap-news/" in namespace_url:
            name = f"news:{name}"
        elif "/sitemap-image/" in namespace_url:
            name = f"image:{name}"
        elif "/sitemap-video/" in namespace_url:
            name = f"video:{name}"
        else:
            # We don't care about the rest of the namespaces, so just keep the plain element name
            pass

        return name

    def _xml_element_start(self, name: str, attrs: Dict[str, str]) -> None:
        # expat StartElementHandler: either forward to the concrete parser or,
        # on the root element, decide which concrete parser to instantiate.
        name = self.__normalize_xml_element_name(name)

        if self._concrete_parser:
            self._concrete_parser.xml_element_start(name=name, attrs=attrs)

        else:
            # Root element -- initialize concrete parser
            if name == "sitemap:urlset":
                self._concrete_parser = PagesXMLSitemapParser(
                    url=self._url,
                )

            elif name == "sitemap:sitemapindex":
                # Only the index parser needs the web client / recursion level,
                # because it is the only one that fetches further sitemaps
                self._concrete_parser = IndexXMLSitemapParser(
                    url=self._url,
                    web_client=self._web_client,
                    recursion_level=self._recursion_level,
                )

            elif name == "rss":
                self._concrete_parser = PagesRSSSitemapParser(
                    url=self._url,
                )

            elif name == "feed":
                self._concrete_parser = PagesAtomSitemapParser(
                    url=self._url,
                )

            else:
                raise SitemapXMLParsingException(f"Unsupported root element '{name}'.")

    def _xml_element_end(self, name: str) -> None:
        # expat EndElementHandler: the concrete parser must exist because a
        # start handler always fires before the matching end handler.
        name = self.__normalize_xml_element_name(name)

        if not self._concrete_parser:
            raise SitemapXMLParsingException(
                "Concrete sitemap parser should be set by now."
            )

        self._concrete_parser.xml_element_end(name=name)

    def _xml_char_data(self, data: str) -> None:
        # expat CharacterDataHandler: forwarded verbatim; the concrete parser
        # is responsible for accumulating split text nodes.
        if not self._concrete_parser:
            raise SitemapXMLParsingException(
                "Concrete sitemap parser should be set by now."
            )

        self._concrete_parser.xml_char_data(data=data)
467
+
468
+
469
class AbstractXMLSitemapParser(metaclass=abc.ABCMeta):
    """
    Abstract XML sitemap parser.
    """

    __slots__ = [
        # URL of the sitemap that is being parsed
        "_url",
        # Last encountered character data
        "_last_char_data",
        "_last_handler_call_was_xml_char_data",
    ]

    def __init__(self, url: str):
        """
        :param url: URL of the sitemap being parsed.
        """
        self._url = url
        self._last_char_data = ""
        self._last_handler_call_was_xml_char_data = False

    def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None:
        """Concrete parser handler when the start of an element is encountered.

        See :external+python:meth:`xmlparser.StartElementHandler <xml.parsers.expat.xmlparser.StartElementHandler>`

        :param name: element name, potentially prefixed with namespace
        :param attrs: element attributes
        """
        # A new element means the next character data is a fresh text node
        self._last_handler_call_was_xml_char_data = False

    def xml_element_end(self, name: str) -> None:
        """Concrete parser handler when the end of an element is encountered.

        See :external+python:meth:`xmlparser.EndElementHandler <xml.parsers.expat.xmlparser.EndElementHandler>`

        :param name: element name, potentially prefixed with namespace
        """
        # End of any element always resets last encountered character data
        self._last_char_data = ""
        self._last_handler_call_was_xml_char_data = False

    def xml_char_data(self, data: str) -> None:
        """
        Concrete parser handler for character data.

        Multiple concurrent calls are concatenated until an XML element start or end is reached,
        as it may be called multiple times for a single string.
        E.g. ``ABC &amp; DEF``.

        See :external+python:meth:`xmlparser.CharacterDataHandler <xml.parsers.expat.xmlparser.CharacterDataHandler>`

        :param data: string data
        """
        # Append when the previous event was also character data (expat splits
        # text nodes around entities); otherwise start a new buffer.
        prefix = (
            self._last_char_data if self._last_handler_call_was_xml_char_data else ""
        )
        self._last_char_data = prefix + data
        self._last_handler_call_was_xml_char_data = True

    @abc.abstractmethod
    def sitemap(self) -> AbstractSitemap:
        """
        Create the parsed sitemap instance and perform any sub-parsing needed.

        :return: an instance of the appropriate sitemap class
        """
        raise NotImplementedError("Abstract method.")
536
+
537
+
538
class IndexXMLSitemapParser(AbstractXMLSitemapParser):
    """
    Index XML sitemap parser.
    """

    __slots__ = [
        "_web_client",
        "_recursion_level",
        # List of sub-sitemap URLs found in this index sitemap
        "_sub_sitemap_urls",
    ]

    def __init__(self, url: str, web_client: AbstractWebClient, recursion_level: int):
        """
        :param url: URL of the index sitemap being parsed.
        :param web_client: web client to reuse for fetching sub-sitemaps.
        :param recursion_level: current recursion depth of the parse.
        """
        super().__init__(url=url)

        self._web_client = web_client
        self._recursion_level = recursion_level
        self._sub_sitemap_urls = []

    def xml_element_end(self, name: str) -> None:
        # Each </sitemap:loc> closes one sub-sitemap URL entry
        if name == "sitemap:loc":
            child_url = html_unescape_strip(self._last_char_data)
            if is_http_url(child_url):
                # Deduplicate while keeping discovery order
                if child_url not in self._sub_sitemap_urls:
                    self._sub_sitemap_urls.append(child_url)
            else:
                log.warning(
                    f"Sub-sitemap URL does not look like one: {child_url}"
                )

        super().xml_element_end(name=name)

    def sitemap(self) -> AbstractSitemap:
        """Fetch every discovered sub-sitemap and wrap them in an index sitemap.

        :return: an :class:`~.IndexXMLSitemap` holding all sub-sitemaps.
        """
        sub_sitemaps = []

        for child_url in self._sub_sitemap_urls:
            # URL might be invalid, or recursion limit might have been reached
            try:
                child_sitemap = SitemapFetcher(
                    url=child_url,
                    recursion_level=self._recursion_level + 1,
                    web_client=self._web_client,
                ).sitemap()
            except NoWebClientException:
                child_sitemap = InvalidSitemap(
                    url=child_url, reason="Un-fetched child sitemap"
                )
            except Exception as ex:
                child_sitemap = InvalidSitemap(
                    url=child_url,
                    reason=f"Unable to add sub-sitemap from URL {child_url}: {str(ex)}",
                )

            sub_sitemaps.append(child_sitemap)

        return IndexXMLSitemap(url=self._url, sub_sitemaps=sub_sitemaps)
598
+
599
+
600
# Bounds for the <priority> element; values outside this range are replaced
# with SITEMAP_PAGE_DEFAULT_PRIORITY by PagesXMLSitemapParser.Page.page()
MIN_VALID_PRIORITY = Decimal("0.0")
MAX_VALID_PRIORITY = Decimal("1.0")
602
+
603
+
604
class PagesXMLSitemapParser(AbstractXMLSitemapParser):
    """
    Pages XML sitemap parser.

    Handles ``<urlset>`` sitemaps, including Google News (``news:``) and
    image (``image:``) extension elements.
    """

    class Image:
        """Data class for holding image data while parsing."""

        __slots__ = ["loc", "caption", "geo_location", "title", "license"]

        def __init__(self):
            self.loc = None
            self.caption = None
            self.geo_location = None
            self.title = None
            self.license = None

        def __hash__(self):
            return hash(
                (
                    # Hash only the URL to be able to find unique ones
                    self.loc,
                )
            )

    class Page:
        """Simple data class for holding various properties for a single <url> entry while parsing."""

        __slots__ = [
            "url",
            "last_modified",
            "change_frequency",
            "priority",
            "news_title",
            "news_publish_date",
            "news_publication_name",
            "news_publication_language",
            "news_access",
            "news_genres",
            "news_keywords",
            "news_stock_tickers",
            "images",
        ]

        def __init__(self):
            self.url = None
            self.last_modified = None
            self.change_frequency = None
            self.priority = None
            self.news_title = None
            self.news_publish_date = None
            self.news_publication_name = None
            self.news_publication_language = None
            self.news_access = None
            self.news_genres = None
            self.news_keywords = None
            self.news_stock_tickers = None
            self.images = []

        def __hash__(self):
            return hash(
                (
                    # Hash only the URL to be able to find unique ones
                    self.url,
                )
            )

        def page(self) -> Optional[SitemapPage]:
            """Return constructed sitemap page if one has been completed, otherwise None.

            Normalizes all collected raw strings: unescapes HTML entities,
            parses dates, validates change frequency and priority, and splits
            the comma-separated news list fields.

            :return: a :class:`SitemapPage`, or ``None`` when the required URL is missing.
            """

            # Required
            url = html_unescape_strip(self.url)
            if not url:
                log.error("URL is unset")
                return None

            last_modified = html_unescape_strip(self.last_modified)
            if last_modified:
                last_modified = parse_iso8601_date(last_modified)

            change_frequency = html_unescape_strip(self.change_frequency)
            if change_frequency:
                change_frequency = change_frequency.lower()
                if SitemapPageChangeFrequency.has_value(change_frequency):
                    change_frequency = SitemapPageChangeFrequency(change_frequency)
                else:
                    # FIX: removed a no-op .format() call that was left on this literal
                    log.warning("Invalid change frequency, defaulting to 'always'.")
                    change_frequency = SitemapPageChangeFrequency.ALWAYS
                assert isinstance(change_frequency, SitemapPageChangeFrequency)

            priority = html_unescape_strip(self.priority)
            if priority:
                try:
                    priority = Decimal(priority)

                    # Spec requires priority to be within [0.0, 1.0]
                    if priority < MIN_VALID_PRIORITY or priority > MAX_VALID_PRIORITY:
                        log.warning(f"Priority is not within 0 and 1: {priority}")
                        priority = SITEMAP_PAGE_DEFAULT_PRIORITY
                except InvalidOperation:
                    log.warning(f"Invalid priority: {priority}")
                    priority = SITEMAP_PAGE_DEFAULT_PRIORITY
            else:
                priority = SITEMAP_PAGE_DEFAULT_PRIORITY

            news_title = html_unescape_strip(self.news_title)

            news_publish_date = html_unescape_strip(self.news_publish_date)
            if news_publish_date:
                news_publish_date = parse_iso8601_date(date_string=news_publish_date)

            news_publication_name = html_unescape_strip(self.news_publication_name)
            news_publication_language = html_unescape_strip(
                self.news_publication_language
            )
            news_access = html_unescape_strip(self.news_access)

            # The three list-valued news fields are comma-separated strings
            news_genres = html_unescape_strip(self.news_genres)
            if news_genres:
                news_genres = [x.strip() for x in news_genres.split(",")]
            else:
                news_genres = []

            news_keywords = html_unescape_strip(self.news_keywords)
            if news_keywords:
                news_keywords = [x.strip() for x in news_keywords.split(",")]
            else:
                news_keywords = []

            news_stock_tickers = html_unescape_strip(self.news_stock_tickers)
            if news_stock_tickers:
                news_stock_tickers = [x.strip() for x in news_stock_tickers.split(",")]
            else:
                news_stock_tickers = []

            # A news story is only attached when both required fields are present
            sitemap_news_story = None
            if news_title and news_publish_date:
                sitemap_news_story = SitemapNewsStory(
                    title=news_title,
                    publish_date=news_publish_date,
                    publication_name=news_publication_name,
                    publication_language=news_publication_language,
                    access=news_access,
                    genres=news_genres,
                    keywords=news_keywords,
                    stock_tickers=news_stock_tickers,
                )

            sitemap_images = None
            if len(self.images) > 0:
                sitemap_images = [
                    SitemapImage(
                        loc=image.loc,
                        caption=image.caption,
                        geo_location=image.geo_location,
                        title=image.title,
                        license_=image.license,
                    )
                    for image in self.images
                ]

            return SitemapPage(
                url=url,
                last_modified=last_modified,
                change_frequency=change_frequency,
                priority=priority,
                news_story=sitemap_news_story,
                images=sitemap_images,
            )

    __slots__ = ["_current_page", "_pages", "_page_urls", "_current_image"]

    def __init__(self, url: str):
        """
        :param url: URL of the pages sitemap being parsed.
        """
        super().__init__(url=url)

        self._current_page = None
        self._pages = []
        self._page_urls = set()
        self._current_image = None

    def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None:
        super().xml_element_start(name=name, attrs=attrs)

        if name == "sitemap:url":
            if self._current_page:
                raise SitemapXMLParsingException(
                    "Page is expected to be unset by <url>."
                )
            self._current_page = self.Page()
        elif name == "image:image":
            if self._current_image:
                raise SitemapXMLParsingException(
                    "Image is expected to be unset by <image:image>."
                )
            if not self._current_page:
                raise SitemapXMLParsingException(
                    "Page is expected to be set before <image:image>."
                )
            self._current_image = self.Image()

    def __require_last_char_data_to_be_set(self, name: str) -> None:
        # Guard used for elements whose character data is mandatory
        if not self._last_char_data:
            raise SitemapXMLParsingException(
                f"Character data is expected to be set at the end of <{name}>."
            )

    def xml_element_end(self, name: str) -> None:
        if not self._current_page and name != "sitemap:urlset":
            raise SitemapXMLParsingException(
                f"Page is expected to be set at the end of <{name}>."
            )

        if name == "sitemap:url":
            # Deduplicate pages by URL while preserving order
            if self._current_page.url not in self._page_urls:
                self._pages.append(self._current_page)
                self._page_urls.add(self._current_page.url)
            self._current_page = None
        elif name == "image:image":
            self._current_page.images.append(self._current_image)
            self._current_image = None
        else:
            if name == "sitemap:loc":
                # Every entry must have <loc>
                self.__require_last_char_data_to_be_set(name=name)
                self._current_page.url = self._last_char_data

            elif name == "sitemap:lastmod":
                # Element might be present but character data might be empty
                self._current_page.last_modified = self._last_char_data

            elif name == "sitemap:changefreq":
                # Element might be present but character data might be empty
                self._current_page.change_frequency = self._last_char_data

            elif name == "sitemap:priority":
                # Element might be present but character data might be empty
                self._current_page.priority = self._last_char_data

            elif name == "news:name":  # news/publication/name
                # Element might be present but character data might be empty
                self._current_page.news_publication_name = self._last_char_data

            elif name == "news:language":  # news/publication/language
                # Element might be present but character data might be empty
                self._current_page.news_publication_language = self._last_char_data

            elif name == "news:publication_date":
                # Element might be present but character data might be empty
                self._current_page.news_publish_date = self._last_char_data

            elif name == "news:title":
                # Every Google News sitemap entry must have <title>
                self.__require_last_char_data_to_be_set(name=name)
                self._current_page.news_title = self._last_char_data

            elif name == "news:access":
                # Element might be present but character data might be empty
                self._current_page.news_access = self._last_char_data

            elif name == "news:genres":
                # FIX: this branch was missing, so Page.news_genres was never
                # populated and parsed pages always ended up with empty genres
                self._current_page.news_genres = self._last_char_data

            elif name == "news:keywords":
                # Element might be present but character data might be empty
                self._current_page.news_keywords = self._last_char_data

            elif name == "news:stock_tickers":
                # Element might be present but character data might be empty
                self._current_page.news_stock_tickers = self._last_char_data

            elif name == "image:loc":
                # Every image entry must have <loc>
                self.__require_last_char_data_to_be_set(name=name)
                self._current_image.loc = self._last_char_data

            elif name == "image:caption":
                self._current_image.caption = self._last_char_data

            elif name == "image:geo_location":
                self._current_image.geo_location = self._last_char_data

            elif name == "image:title":
                self._current_image.title = self._last_char_data

            elif name == "image:license":
                self._current_image.license = self._last_char_data

        super().xml_element_end(name=name)

    def sitemap(self) -> AbstractSitemap:
        """Build the final sitemap from all accumulated page rows.

        :return: a :class:`~.PagesXMLSitemap`; rows missing a URL are dropped.
        """
        pages = []

        for page_row in self._pages:
            page = page_row.page()
            if page:
                pages.append(page)

        return PagesXMLSitemap(url=self._url, pages=pages)
902
+
903
+
904
class PagesRSSSitemapParser(AbstractXMLSitemapParser):
    """
    Pages RSS 2.0 sitemap parser.

    https://validator.w3.org/feed/docs/rss2.html
    """

    class Page:
        """
        Data class for holding various properties for a single RSS <item> while parsing.
        """

        __slots__ = [
            "link",
            "title",
            "description",
            "publication_date",
        ]

        def __init__(self):
            self.link = None
            self.title = None
            self.description = None
            self.publication_date = None

        def __hash__(self):
            # An item's identity is its link alone
            return hash((self.link,))

        def page(self) -> Optional[SitemapPage]:
            """Return constructed sitemap page if one has been completed, otherwise None."""

            # <link> is mandatory for an item to become a page
            link = html_unescape_strip(self.link)
            if not link:
                log.error("Link is unset")
                return None

            title = html_unescape_strip(self.title)
            description = html_unescape_strip(self.description)
            if not (title or description):
                log.error("Both title and description are unset")
                return None

            publication_date = html_unescape_strip(self.publication_date)
            if publication_date:
                publication_date = parse_rfc2822_date(publication_date)

            story = SitemapNewsStory(
                title=title or description,
                publish_date=publication_date,
            )
            return SitemapPage(url=link, news_story=story)

    __slots__ = ["_current_page", "_pages", "_page_links"]

    def __init__(self, url: str):
        """
        :param url: URL of the RSS feed being parsed.
        """
        super().__init__(url=url)

        self._current_page = None
        self._pages = []
        self._page_links = set()

    def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None:
        super().xml_element_start(name=name, attrs=attrs)

        if name == "item":
            if self._current_page:
                raise SitemapXMLParsingException(
                    "Page is expected to be unset by <item>."
                )
            self._current_page = self.Page()

    def __require_last_char_data_to_be_set(self, name: str) -> None:
        # Guard for elements whose character data must not be empty
        if not self._last_char_data:
            raise SitemapXMLParsingException(
                f"Character data is expected to be set at the end of <{name}>."
            )

    def xml_element_end(self, name: str) -> None:
        # Elements outside <item> carry no per-page data and are ignored
        page = self._current_page
        if page:
            if name == "item":
                # Deduplicate items by link while preserving order
                if page.link not in self._page_links:
                    self._pages.append(page)
                    self._page_links.add(page.link)
                self._current_page = None

            elif name == "link":
                # Every entry must have <link>
                self.__require_last_char_data_to_be_set(name=name)
                page.link = self._last_char_data

            elif name == "title":
                # Title (if set) can't be empty
                self.__require_last_char_data_to_be_set(name=name)
                page.title = self._last_char_data

            elif name == "description":
                # Description (if set) can't be empty
                self.__require_last_char_data_to_be_set(name=name)
                page.description = self._last_char_data

            elif name == "pubDate":
                # Element might be present but character data might be empty
                page.publication_date = self._last_char_data

        super().xml_element_end(name=name)

    def sitemap(self) -> AbstractSitemap:
        """Build the final sitemap from all accumulated item rows.

        :return: a :class:`~.PagesRSSSitemap`; rows without a usable link/title are dropped.
        """
        pages = [
            page
            for page in (page_row.page() for page_row in self._pages)
            if page
        ]

        return PagesRSSSitemap(url=self._url, pages=pages)
1031
+
1032
+
1033
class PagesAtomSitemapParser(AbstractXMLSitemapParser):
    """
    Pages Atom 0.3 / 1.0 sitemap parser.

    References:

    - https://github.com/simplepie/simplepie-ng/wiki/Spec:-Atom-0.3
    - https://www.ietf.org/rfc/rfc4287.txt
    - http://rakaz.nl/2005/07/moving-from-atom-03-to-10.html
    """

    # FIXME merge with RSS parser class as there are too many similarities

    class Page:
        """Data class for holding various properties for a single Atom <entry> while parsing."""

        __slots__ = [
            "link",
            "title",
            "description",
            "publication_date",
        ]

        def __init__(self):
            self.link = None
            self.title = None
            self.description = None
            self.publication_date = None

        def __hash__(self):
            # An entry's identity is determined by its URL alone.
            return hash((self.link,))

        def page(self) -> Optional[SitemapPage]:
            """Return constructed sitemap page if one has been completed, otherwise None."""

            # An entry without a link is unusable.
            link = html_unescape_strip(self.link)
            if not link:
                log.error("Link is unset")
                return None

            # At least one of title / description must be present.
            title = html_unescape_strip(self.title)
            description = html_unescape_strip(self.description)
            if not (title or description):
                log.error("Both title and description are unset")
                return None

            publication_date = html_unescape_strip(self.publication_date)
            if publication_date:
                publication_date = parse_iso8601_date(publication_date)

            return SitemapPage(
                url=link,
                news_story=SitemapNewsStory(
                    title=title or description,
                    publish_date=publication_date,
                ),
            )

    __slots__ = [
        "_current_page",
        "_pages",
        "_page_links",
        "_last_link_rel_self_href",
    ]

    def __init__(self, url: str):
        """Initialize the Atom pages parser for the sitemap found at ``url``."""
        super().__init__(url=url)

        # <entry> currently being parsed, or None when outside an <entry>
        self._current_page = None
        # Parsed Page rows, kept in document order
        self._pages = []
        # Entry links seen so far, used to de-duplicate entries
        self._page_links = set()
        # href of the most recently accepted <link> within the current <entry>
        self._last_link_rel_self_href = None

    def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None:
        """Handle an opening XML element; begins a fresh Page row on <entry>."""
        super().xml_element_start(name=name, attrs=attrs)

        if name == "entry":
            if self._current_page:
                # A second <entry> opening before the previous one closed is malformed.
                raise SitemapXMLParsingException(
                    "Page is expected to be unset by <entry>."
                )
            self._current_page = self.Page()

        elif name == "link" and self._current_page:
            # Prefer links whose rel (defaulting to "self" when absent) is
            # "self"; otherwise only remember the first link encountered.
            rel_is_self = attrs.get("rel", "self").lower() == "self"
            if rel_is_self or self._last_link_rel_self_href is None:
                self._last_link_rel_self_href = attrs.get("href", None)

    def __require_last_char_data_to_be_set(self, name: str) -> None:
        """Raise if no character data was collected inside element ``name``."""
        if self._last_char_data:
            return

        raise SitemapXMLParsingException(
            f"Character data is expected to be set at the end of <{name}>."
        )

    def xml_element_end(self, name: str) -> None:
        """Handle a closing XML element, filling in the current <entry>'s fields."""
        page = self._current_page

        # Only elements inside an open <entry> are of interest.
        if page:
            if name == "entry":
                if self._last_link_rel_self_href:
                    page.link = self._last_link_rel_self_href
                    self._last_link_rel_self_href = None

                # Keep only the first entry seen for any given link.
                if page.link not in self._page_links:
                    self._pages.append(page)
                    self._page_links.add(page.link)

                self._current_page = None

            elif name == "title":
                # Title (if set) can't be empty
                self.__require_last_char_data_to_be_set(name=name)
                page.title = self._last_char_data

            elif name in ("tagline", "summary"):
                # Description (if set) can't be empty; <tagline> is the
                # Atom 0.3 spelling, <summary> the Atom 1.0 one
                self.__require_last_char_data_to_be_set(name=name)
                page.description = self._last_char_data

            elif name in ("issued", "published"):
                # Element might be present but character data might be empty
                page.publication_date = self._last_char_data

            elif name == "updated":
                # Fall back to <updated> only when neither <issued> nor
                # <published> provided a date earlier
                if not page.publication_date:
                    page.publication_date = self._last_char_data

        super().xml_element_end(name=name)

    def sitemap(self) -> AbstractSitemap:
        """Assemble and return the parsed Atom sitemap object."""
        # Page rows that fail validation yield None and are dropped here.
        pages = [page for page in (row.page() for row in self._pages) if page]

        return PagesAtomSitemap(url=self._url, pages=pages)