ultimate-sitemap-parser 1.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ultimate-sitemap-parser might be problematic.
- ultimate_sitemap_parser-1.0.0rc1.dist-info/LICENSE +674 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/METADATA +109 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/NOTICE +12 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/RECORD +22 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/WHEEL +4 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/entry_points.txt +3 -0
- usp/__init__.py +5 -0
- usp/cli/__init__.py +1 -0
- usp/cli/_ls.py +105 -0
- usp/cli/_util.py +21 -0
- usp/cli/cli.py +27 -0
- usp/exceptions.py +35 -0
- usp/fetch_parse.py +1182 -0
- usp/helpers.py +293 -0
- usp/log.py +77 -0
- usp/objects/__init__.py +0 -0
- usp/objects/page.py +451 -0
- usp/objects/sitemap.py +436 -0
- usp/tree.py +114 -0
- usp/web_client/__init__.py +0 -0
- usp/web_client/abstract_client.py +189 -0
- usp/web_client/requests_client.py +150 -0
usp/fetch_parse.py
ADDED
@@ -0,0 +1,1182 @@

"""Sitemap fetchers and parsers.

.. seealso::

    :doc:`Reference of classes used for each format </reference/formats>`

    :doc:`Overview of parse process </guides/fetch-parse>`
"""

import abc
import re
import xml.parsers.expat
from collections import OrderedDict
from decimal import Decimal, InvalidOperation
from typing import Optional, Dict, Union

from .exceptions import SitemapException, SitemapXMLParsingException
from .helpers import (
    html_unescape_strip,
    parse_iso8601_date,
    get_url_retry_on_client_errors,
    ungzipped_response_content,
    is_http_url,
    parse_rfc2822_date,
)
from .log import create_logger
from .objects.page import (
    SitemapImage,
    SitemapPage,
    SitemapNewsStory,
    SitemapPageChangeFrequency,
    SITEMAP_PAGE_DEFAULT_PRIORITY,
)
from .objects.sitemap import (
    AbstractSitemap,
    InvalidSitemap,
    IndexRobotsTxtSitemap,
    IndexXMLSitemap,
    PagesXMLSitemap,
    PagesTextSitemap,
    PagesRSSSitemap,
    PagesAtomSitemap,
)
from .web_client.abstract_client import (
    AbstractWebClient,
    AbstractWebClientSuccessResponse,
    WebClientErrorResponse,
)
from .web_client.abstract_client import LocalWebClient, NoWebClientException
from .web_client.requests_client import RequestsWebClient

log = create_logger(__name__)
class SitemapFetcher:
    """
    Fetches and parses the sitemap at a given URL, and any declared sub-sitemaps.
    """

    __MAX_SITEMAP_SIZE = 100 * 1024 * 1024
    """Max. uncompressed sitemap size.

    Spec says it might be up to 50 MB but let's go for the full 100 MB here."""

    __MAX_RECURSION_LEVEL = 11
    """Max. recursion level in iterating over sub-sitemaps."""

    __slots__ = [
        "_url",
        "_recursion_level",
        "_web_client",
    ]

    def __init__(
        self,
        url: str,
        recursion_level: int,
        web_client: Optional[AbstractWebClient] = None,
    ):
        """
        :param url: URL of the sitemap to fetch and parse.
        :param recursion_level: Current recursion level of the parser.
        :param web_client: Web client to use. If ``None``, a :class:`~.RequestsWebClient` will be used.

        :raises SitemapException: If the maximum recursion depth is exceeded.
        :raises SitemapException: If the URL is not an HTTP(S) URL.
        """
        if recursion_level > self.__MAX_RECURSION_LEVEL:
            raise SitemapException(
                f"Recursion level exceeded {self.__MAX_RECURSION_LEVEL} for URL {url}."
            )

        if not is_http_url(url):
            raise SitemapException(f"URL {url} is not a HTTP(s) URL.")

        if not web_client:
            web_client = RequestsWebClient()

        web_client.set_max_response_data_length(self.__MAX_SITEMAP_SIZE)

        self._url = url
        self._web_client = web_client
        self._recursion_level = recursion_level

    def _fetch(self) -> Union[str, WebClientErrorResponse]:
        log.info(f"Fetching level {self._recursion_level} sitemap from {self._url}...")
        response = get_url_retry_on_client_errors(
            url=self._url, web_client=self._web_client
        )

        if isinstance(response, WebClientErrorResponse):
            return response

        assert isinstance(response, AbstractWebClientSuccessResponse)

        return ungzipped_response_content(url=self._url, response=response)

    def sitemap(self) -> AbstractSitemap:
        """
        Fetch and parse the sitemap.

        :return: the parsed sitemap. Will be a child of :class:`~.AbstractSitemap`.
            If an HTTP error is encountered, or the sitemap cannot be parsed, will be :class:`~.InvalidSitemap`.
        """
        response_content = self._fetch()

        if isinstance(response_content, WebClientErrorResponse):
            return InvalidSitemap(
                url=self._url,
                reason=f"Unable to fetch sitemap from {self._url}: {response_content.message()}",
            )

        # MIME types returned in Content-Type are unpredictable, so peek into the content instead
        if response_content[:20].strip().startswith("<"):
            # XML sitemap (the specific kind is to be determined later)
            parser = XMLSitemapParser(
                url=self._url,
                content=response_content,
                recursion_level=self._recursion_level,
                web_client=self._web_client,
            )

        else:
            # Assume that it's some sort of a text file (robots.txt or plain text sitemap)
            if self._url.endswith("/robots.txt"):
                parser = IndexRobotsTxtSitemapParser(
                    url=self._url,
                    content=response_content,
                    recursion_level=self._recursion_level,
                    web_client=self._web_client,
                )
            else:
                parser = PlainTextSitemapParser(
                    url=self._url,
                    content=response_content,
                    recursion_level=self._recursion_level,
                    web_client=self._web_client,
                )

        log.info(f"Parsing sitemap from URL {self._url}...")
        sitemap = parser.sitemap()

        return sitemap
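# --- Editor's note: illustrative usage sketch, not part of the packaged file. ---
# SitemapFetcher is the recursion entry point: it fetches one URL, sniffs the
# content, and hands off to a format-specific parser. The URL below is a
# placeholder; production callers would typically go through the package's
# higher-level usp.tree module rather than instantiating the fetcher directly.

from usp.fetch_parse import SitemapFetcher

fetcher = SitemapFetcher(url="https://example.com/sitemap.xml", recursion_level=0)
tree = fetcher.sitemap()  # a child of AbstractSitemap; InvalidSitemap on fetch/parse errors
print(type(tree).__name__)

# --- End of editor's sketch. ---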
class SitemapStrParser(SitemapFetcher):
    """Custom fetcher to parse a string instead of download from a URL.

    This is a little bit hacky, but it allows us to support local content parsing without
    having to change too much.
    """

    __slots__ = ["_static_content"]

    def __init__(self, static_content: str):
        """Init a new string parser

        :param static_content: String containing sitemap text to parse
        """
        super().__init__(
            url="http://usp-local-dummy.local/",
            recursion_level=0,
            web_client=LocalWebClient(),
        )
        self._static_content = static_content

    def _fetch(self) -> Union[str, WebClientErrorResponse]:
        return self._static_content
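# --- Editor's note: illustrative usage sketch, not part of the packaged file. ---
# SitemapStrParser reuses the fetcher machinery for in-memory content: _fetch()
# simply returns the supplied string. Judging from the "except NoWebClientException"
# handlers elsewhere in this file, any *recursive* fetch under LocalWebClient
# surfaces as an "Un-fetched child sitemap" InvalidSitemap stub (an assumption
# about LocalWebClient's behaviour, not verified here).

from usp.fetch_parse import SitemapStrParser

xml_content = (
    '<?xml version="1.0" encoding="UTF-8"?>'
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
    "<url><loc>https://example.com/page.html</loc></url>"
    "</urlset>"
)
sitemap = SitemapStrParser(static_content=xml_content).sitemap()
# -> a PagesXMLSitemap containing one SitemapPage

# --- End of editor's sketch. ---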
class AbstractSitemapParser(metaclass=abc.ABCMeta):
    """Abstract robots.txt / XML / plain text sitemap parser."""

    __slots__ = [
        "_url",
        "_content",
        "_web_client",
        "_recursion_level",
    ]

    def __init__(
        self,
        url: str,
        content: str,
        recursion_level: int,
        web_client: AbstractWebClient,
    ):
        self._url = url
        self._content = content
        self._recursion_level = recursion_level
        self._web_client = web_client

    @abc.abstractmethod
    def sitemap(self) -> AbstractSitemap:
        """
        Create the parsed sitemap instance and perform any sub-parsing needed.

        :return: an instance of the appropriate sitemap class
        """
        raise NotImplementedError("Abstract method.")
class IndexRobotsTxtSitemapParser(AbstractSitemapParser):
    """robots.txt index sitemap parser."""

    def __init__(
        self,
        url: str,
        content: str,
        recursion_level: int,
        web_client: AbstractWebClient,
    ):
        super().__init__(
            url=url,
            content=content,
            recursion_level=recursion_level,
            web_client=web_client,
        )

        if not self._url.endswith("/robots.txt"):
            raise SitemapException(
                f"URL does not look like robots.txt URL: {self._url}"
            )

    def sitemap(self) -> AbstractSitemap:
        # Serves as an ordered set because we want to deduplicate URLs but also retain the order
        sitemap_urls = OrderedDict()

        for robots_txt_line in self._content.splitlines():
            robots_txt_line = robots_txt_line.strip()
            # robots.txt is supposed to be case sensitive but who cares in these Node.js times?
            sitemap_match = re.search(
                r"^site-?map:\s*(.+?)$", robots_txt_line, flags=re.IGNORECASE
            )
            if sitemap_match:
                sitemap_url = sitemap_match.group(1)
                if is_http_url(sitemap_url):
                    sitemap_urls[sitemap_url] = True
                else:
                    log.warning(
                        f"Sitemap URL {sitemap_url} doesn't look like an URL, skipping"
                    )

        sub_sitemaps = []

        for sitemap_url in sitemap_urls.keys():
            try:
                fetcher = SitemapFetcher(
                    url=sitemap_url,
                    recursion_level=self._recursion_level + 1,
                    web_client=self._web_client,
                )
                fetched_sitemap = fetcher.sitemap()
            except NoWebClientException:
                fetched_sitemap = InvalidSitemap(
                    url=sitemap_url, reason="Un-fetched child sitemap"
                )
            except Exception as ex:
                fetched_sitemap = InvalidSitemap(
                    url=sitemap_url,
                    reason=f"Unable to add sub-sitemap from URL {sitemap_url}: {str(ex)}",
                )
            sub_sitemaps.append(fetched_sitemap)

        index_sitemap = IndexRobotsTxtSitemap(url=self._url, sub_sitemaps=sub_sitemaps)

        return index_sitemap
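# --- Editor's note: illustrative input, not part of the packaged file. ---
# The regex above (r"^site-?map:\s*(.+?)$", case-insensitive) accepts both the
# canonical "Sitemap:" directive and the common "Site-map:" variant. Given a
# hypothetical robots.txt such as:
#
#     User-agent: *
#     Disallow: /admin/
#     Sitemap: https://example.com/sitemap_index.xml
#     site-map: https://example.com/news-sitemap.xml
#
# both URLs are collected (deduplicated, in order), and each is then fetched
# as a sub-sitemap one recursion level deeper.
# --- End of editor's note. ---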
class PlainTextSitemapParser(AbstractSitemapParser):
    """Plain text sitemap parser."""

    def sitemap(self) -> AbstractSitemap:
        story_urls = OrderedDict()

        for story_url in self._content.splitlines():
            story_url = story_url.strip()
            if not story_url:
                continue
            if is_http_url(story_url):
                story_urls[story_url] = True
            else:
                log.warning(f"Story URL {story_url} doesn't look like an URL, skipping")

        pages = []
        for page_url in story_urls.keys():
            page = SitemapPage(url=page_url)
            pages.append(page)

        text_sitemap = PagesTextSitemap(url=self._url, pages=pages)

        return text_sitemap
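# --- Editor's note: illustrative input, not part of the packaged file. ---
# A plain text sitemap is just one URL per line
# (https://www.sitemaps.org/protocol.html#otherformats). For input like:
#
#     https://example.com/a.html
#     https://example.com/b.html
#     https://example.com/a.html
#
# the parser above yields a PagesTextSitemap with two SitemapPage objects:
# blank lines are skipped, non-URL lines are logged and dropped, and
# duplicates are removed while preserving order.
# --- End of editor's note. ---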
class XMLSitemapParser(AbstractSitemapParser):
    """Initial XML sitemap parser.

    Instantiates an Expat parser and registers handler methods, which determine the specific format
    and instantiate a concrete parser (inheriting from :class:`AbstractXMLSitemapParser`) to extract data.
    """

    __XML_NAMESPACE_SEPARATOR = " "

    __slots__ = [
        "_concrete_parser",
    ]

    def __init__(
        self,
        url: str,
        content: str,
        recursion_level: int,
        web_client: AbstractWebClient,
    ):
        super().__init__(
            url=url,
            content=content,
            recursion_level=recursion_level,
            web_client=web_client,
        )

        # Will be initialized when the type of sitemap is known
        self._concrete_parser = None

    def sitemap(self) -> AbstractSitemap:
        parser = xml.parsers.expat.ParserCreate(
            namespace_separator=self.__XML_NAMESPACE_SEPARATOR
        )
        parser.StartElementHandler = self._xml_element_start
        parser.EndElementHandler = self._xml_element_end
        parser.CharacterDataHandler = self._xml_char_data

        try:
            is_final = True
            parser.Parse(self._content, is_final)
        except Exception as ex:
            # Some sitemap XML files might end abruptly because webservers might be timing out on
            # returning huge XML files, so don't return InvalidSitemap() but try to get as many
            # pages as possible
            log.error(f"Parsing sitemap from URL {self._url} failed: {ex}")

        if not self._concrete_parser:
            return InvalidSitemap(
                url=self._url,
                reason=f"No parsers support sitemap from {self._url}",
            )

        return self._concrete_parser.sitemap()

    @classmethod
    def __normalize_xml_element_name(cls, name: str):
        """
        Replace the namespace URL in the argument element name with an internal namespace prefix.

        * Elements from the http://www.sitemaps.org/schemas/sitemap/0.9 namespace will be prefixed
          with "sitemap:", e.g. "<loc>" will become "<sitemap:loc>"

        * Elements from the http://www.google.com/schemas/sitemap-news/0.9 namespace will be prefixed
          with "news:", e.g. "<publication>" will become "<news:publication>"

        For non-sitemap namespaces, return the element name with the namespace stripped.

        :param name: Namespace URL plus XML element name, e.g. "http://www.sitemaps.org/schemas/sitemap/0.9 loc"
        :return: Internal namespace prefix plus element name, e.g. "sitemap:loc"
        """

        name_parts = name.split(cls.__XML_NAMESPACE_SEPARATOR)

        if len(name_parts) == 1:
            namespace_url = ""
            name = name_parts[0]

        elif len(name_parts) == 2:
            namespace_url = name_parts[0]
            name = name_parts[1]

        else:
            raise SitemapXMLParsingException(
                f"Unable to determine namespace for element '{name}'"
            )

        if "/sitemap/" in namespace_url:
            name = f"sitemap:{name}"
        elif "/sitemap-news/" in namespace_url:
            name = f"news:{name}"
        elif "/sitemap-image/" in namespace_url:
            name = f"image:{name}"
        elif "/sitemap-video/" in namespace_url:
            name = f"video:{name}"
        else:
            # We don't care about the rest of the namespaces, so just keep the plain element name
            pass

        return name

    def _xml_element_start(self, name: str, attrs: Dict[str, str]) -> None:
        name = self.__normalize_xml_element_name(name)

        if self._concrete_parser:
            self._concrete_parser.xml_element_start(name=name, attrs=attrs)

        else:
            # Root element -- initialize concrete parser
            if name == "sitemap:urlset":
                self._concrete_parser = PagesXMLSitemapParser(
                    url=self._url,
                )

            elif name == "sitemap:sitemapindex":
                self._concrete_parser = IndexXMLSitemapParser(
                    url=self._url,
                    web_client=self._web_client,
                    recursion_level=self._recursion_level,
                )

            elif name == "rss":
                self._concrete_parser = PagesRSSSitemapParser(
                    url=self._url,
                )

            elif name == "feed":
                self._concrete_parser = PagesAtomSitemapParser(
                    url=self._url,
                )

            else:
                raise SitemapXMLParsingException(f"Unsupported root element '{name}'.")

    def _xml_element_end(self, name: str) -> None:
        name = self.__normalize_xml_element_name(name)

        if not self._concrete_parser:
            raise SitemapXMLParsingException(
                "Concrete sitemap parser should be set by now."
            )

        self._concrete_parser.xml_element_end(name=name)

    def _xml_char_data(self, data: str) -> None:
        if not self._concrete_parser:
            raise SitemapXMLParsingException(
                "Concrete sitemap parser should be set by now."
            )

        self._concrete_parser.xml_char_data(data=data)
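# --- Editor's note: illustrative sketch, not part of the packaged file. ---
# With namespace_separator=" ", Expat reports each element as
# "<namespace URL> <local name>"; __normalize_xml_element_name() then maps the
# known sitemap namespaces to the internal "sitemap:"/"news:"/"image:"/"video:"
# prefixes matched by the handlers above. A standalone demonstration:

import xml.parsers.expat

demo = xml.parsers.expat.ParserCreate(namespace_separator=" ")
demo.StartElementHandler = lambda name, attrs: print(name)
demo.Parse(
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
    "<url><loc>https://example.com/</loc></url></urlset>",
    True,
)
# Prints:
#   http://www.sitemaps.org/schemas/sitemap/0.9 urlset
#   http://www.sitemaps.org/schemas/sitemap/0.9 url
#   http://www.sitemaps.org/schemas/sitemap/0.9 loc

# --- End of editor's sketch. ---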
class AbstractXMLSitemapParser(metaclass=abc.ABCMeta):
    """
    Abstract XML sitemap parser.
    """

    __slots__ = [
        # URL of the sitemap that is being parsed
        "_url",
        # Last encountered character data
        "_last_char_data",
        "_last_handler_call_was_xml_char_data",
    ]

    def __init__(self, url: str):
        self._url = url
        self._last_char_data = ""
        self._last_handler_call_was_xml_char_data = False

    def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None:
        """Concrete parser handler for when the start of an element is encountered.

        See :external+python:meth:`xmlparser.StartElementHandler <xml.parsers.expat.xmlparser.StartElementHandler>`

        :param name: element name, potentially prefixed with namespace
        :param attrs: element attributes
        """
        self._last_handler_call_was_xml_char_data = False

    def xml_element_end(self, name: str) -> None:
        """Concrete parser handler for when the end of an element is encountered.

        See :external+python:meth:`xmlparser.EndElementHandler <xml.parsers.expat.xmlparser.EndElementHandler>`

        :param name: element name, potentially prefixed with namespace
        """
        # End of any element always resets last encountered character data
        self._last_char_data = ""
        self._last_handler_call_was_xml_char_data = False

    def xml_char_data(self, data: str) -> None:
        """
        Concrete parser handler for character data.

        Consecutive calls are concatenated until an XML element start or end is reached,
        as the handler may be called multiple times for a single string,
        e.g. for ``ABC &amp; DEF``.

        See :external+python:meth:`xmlparser.CharacterDataHandler <xml.parsers.expat.xmlparser.CharacterDataHandler>`

        :param data: string data
        """
        if self._last_handler_call_was_xml_char_data:
            self._last_char_data += data
        else:
            self._last_char_data = data

        self._last_handler_call_was_xml_char_data = True

    @abc.abstractmethod
    def sitemap(self) -> AbstractSitemap:
        """
        Create the parsed sitemap instance and perform any sub-parsing needed.

        :return: an instance of the appropriate sitemap class
        """
        raise NotImplementedError("Abstract method.")
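# --- Editor's note: illustrative sketch, not part of the packaged file. ---
# The _last_handler_call_was_xml_char_data flag exists because Expat may split
# a single text node across several CharacterDataHandler calls, e.g. around
# entity references:

import xml.parsers.expat

chunks = []
p = xml.parsers.expat.ParserCreate()
p.CharacterDataHandler = chunks.append
p.Parse("<loc>ABC &amp; DEF</loc>", True)
print(chunks)  # typically ['ABC ', '&', ' DEF'] -- three calls for one text node

# --- End of editor's sketch. ---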
class IndexXMLSitemapParser(AbstractXMLSitemapParser):
    """
    Index XML sitemap parser.
    """

    __slots__ = [
        "_web_client",
        "_recursion_level",
        # List of sub-sitemap URLs found in this index sitemap
        "_sub_sitemap_urls",
    ]

    def __init__(self, url: str, web_client: AbstractWebClient, recursion_level: int):
        super().__init__(url=url)

        self._web_client = web_client
        self._recursion_level = recursion_level
        self._sub_sitemap_urls = []

    def xml_element_end(self, name: str) -> None:
        if name == "sitemap:loc":
            sub_sitemap_url = html_unescape_strip(self._last_char_data)
            if not is_http_url(sub_sitemap_url):
                log.warning(
                    f"Sub-sitemap URL does not look like one: {sub_sitemap_url}"
                )
            else:
                if sub_sitemap_url not in self._sub_sitemap_urls:
                    self._sub_sitemap_urls.append(sub_sitemap_url)

        super().xml_element_end(name=name)

    def sitemap(self) -> AbstractSitemap:
        sub_sitemaps = []

        for sub_sitemap_url in self._sub_sitemap_urls:
            # URL might be invalid, or the recursion limit might have been reached
            try:
                fetcher = SitemapFetcher(
                    url=sub_sitemap_url,
                    recursion_level=self._recursion_level + 1,
                    web_client=self._web_client,
                )
                fetched_sitemap = fetcher.sitemap()
            except NoWebClientException:
                fetched_sitemap = InvalidSitemap(
                    url=sub_sitemap_url, reason="Un-fetched child sitemap"
                )
            except Exception as ex:
                fetched_sitemap = InvalidSitemap(
                    url=sub_sitemap_url,
                    reason=f"Unable to add sub-sitemap from URL {sub_sitemap_url}: {str(ex)}",
                )

            sub_sitemaps.append(fetched_sitemap)

        index_sitemap = IndexXMLSitemap(url=self._url, sub_sitemaps=sub_sitemaps)

        return index_sitemap
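# --- Editor's note: illustrative input, not part of the packaged file. ---
# IndexXMLSitemapParser consumes the <sitemapindex> format from
# https://www.sitemaps.org/protocol.html#index. Only <loc> values are read
# here (via xml_element_end); each one is fetched recursively:
#
#     <?xml version="1.0" encoding="UTF-8"?>
#     <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
#       <sitemap>
#         <loc>https://example.com/sitemap-pages.xml</loc>
#         <lastmod>2024-01-01</lastmod>
#       </sitemap>
#     </sitemapindex>
# --- End of editor's note. ---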
MIN_VALID_PRIORITY = Decimal("0.0")
MAX_VALID_PRIORITY = Decimal("1.0")
class PagesXMLSitemapParser(AbstractXMLSitemapParser):
    """
    Pages XML sitemap parser.
    """

    class Image:
        """Data class for holding image data while parsing."""

        __slots__ = ["loc", "caption", "geo_location", "title", "license"]

        def __init__(self):
            self.loc = None
            self.caption = None
            self.geo_location = None
            self.title = None
            self.license = None

        def __hash__(self):
            return hash(
                (
                    # Hash only the URL to be able to find unique ones
                    self.loc,
                )
            )

    class Page:
        """Simple data class for holding various properties for a single <url> entry while parsing."""

        __slots__ = [
            "url",
            "last_modified",
            "change_frequency",
            "priority",
            "news_title",
            "news_publish_date",
            "news_publication_name",
            "news_publication_language",
            "news_access",
            "news_genres",
            "news_keywords",
            "news_stock_tickers",
            "images",
        ]

        def __init__(self):
            self.url = None
            self.last_modified = None
            self.change_frequency = None
            self.priority = None
            self.news_title = None
            self.news_publish_date = None
            self.news_publication_name = None
            self.news_publication_language = None
            self.news_access = None
            self.news_genres = None
            self.news_keywords = None
            self.news_stock_tickers = None
            self.images = []

        def __hash__(self):
            return hash(
                (
                    # Hash only the URL to be able to find unique ones
                    self.url,
                )
            )

        def page(self) -> Optional[SitemapPage]:
            """Return constructed sitemap page if one has been completed, otherwise None."""

            # Required
            url = html_unescape_strip(self.url)
            if not url:
                log.error("URL is unset")
                return None

            last_modified = html_unescape_strip(self.last_modified)
            if last_modified:
                last_modified = parse_iso8601_date(last_modified)

            change_frequency = html_unescape_strip(self.change_frequency)
            if change_frequency:
                change_frequency = change_frequency.lower()
                if SitemapPageChangeFrequency.has_value(change_frequency):
                    change_frequency = SitemapPageChangeFrequency(change_frequency)
                else:
                    log.warning("Invalid change frequency, defaulting to 'always'.")
                    change_frequency = SitemapPageChangeFrequency.ALWAYS
                assert isinstance(change_frequency, SitemapPageChangeFrequency)

            priority = html_unescape_strip(self.priority)
            if priority:
                try:
                    priority = Decimal(priority)

                    if priority < MIN_VALID_PRIORITY or priority > MAX_VALID_PRIORITY:
                        log.warning(f"Priority is not within 0 and 1: {priority}")
                        priority = SITEMAP_PAGE_DEFAULT_PRIORITY
                except InvalidOperation:
                    log.warning(f"Invalid priority: {priority}")
                    priority = SITEMAP_PAGE_DEFAULT_PRIORITY
            else:
                priority = SITEMAP_PAGE_DEFAULT_PRIORITY

            news_title = html_unescape_strip(self.news_title)

            news_publish_date = html_unescape_strip(self.news_publish_date)
            if news_publish_date:
                news_publish_date = parse_iso8601_date(date_string=news_publish_date)

            news_publication_name = html_unescape_strip(self.news_publication_name)
            news_publication_language = html_unescape_strip(
                self.news_publication_language
            )
            news_access = html_unescape_strip(self.news_access)

            news_genres = html_unescape_strip(self.news_genres)
            if news_genres:
                news_genres = [x.strip() for x in news_genres.split(",")]
            else:
                news_genres = []

            news_keywords = html_unescape_strip(self.news_keywords)
            if news_keywords:
                news_keywords = [x.strip() for x in news_keywords.split(",")]
            else:
                news_keywords = []

            news_stock_tickers = html_unescape_strip(self.news_stock_tickers)
            if news_stock_tickers:
                news_stock_tickers = [x.strip() for x in news_stock_tickers.split(",")]
            else:
                news_stock_tickers = []

            sitemap_news_story = None
            if news_title and news_publish_date:
                sitemap_news_story = SitemapNewsStory(
                    title=news_title,
                    publish_date=news_publish_date,
                    publication_name=news_publication_name,
                    publication_language=news_publication_language,
                    access=news_access,
                    genres=news_genres,
                    keywords=news_keywords,
                    stock_tickers=news_stock_tickers,
                )

            sitemap_images = None
            if len(self.images) > 0:
                sitemap_images = [
                    SitemapImage(
                        loc=image.loc,
                        caption=image.caption,
                        geo_location=image.geo_location,
                        title=image.title,
                        license_=image.license,
                    )
                    for image in self.images
                ]

            return SitemapPage(
                url=url,
                last_modified=last_modified,
                change_frequency=change_frequency,
                priority=priority,
                news_story=sitemap_news_story,
                images=sitemap_images,
            )

    __slots__ = ["_current_page", "_pages", "_page_urls", "_current_image"]

    def __init__(self, url: str):
        super().__init__(url=url)

        self._current_page = None
        self._pages = []
        self._page_urls = set()
        self._current_image = None

    def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None:
        super().xml_element_start(name=name, attrs=attrs)

        if name == "sitemap:url":
            if self._current_page:
                raise SitemapXMLParsingException(
                    "Page is expected to be unset by <url>."
                )
            self._current_page = self.Page()
        elif name == "image:image":
            if self._current_image:
                raise SitemapXMLParsingException(
                    "Image is expected to be unset by <image:image>."
                )
            if not self._current_page:
                raise SitemapXMLParsingException(
                    "Page is expected to be set before <image:image>."
                )
            self._current_image = self.Image()

    def __require_last_char_data_to_be_set(self, name: str) -> None:
        if not self._last_char_data:
            raise SitemapXMLParsingException(
                f"Character data is expected to be set at the end of <{name}>."
            )

    def xml_element_end(self, name: str) -> None:
        if not self._current_page and name != "sitemap:urlset":
            raise SitemapXMLParsingException(
                f"Page is expected to be set at the end of <{name}>."
            )

        if name == "sitemap:url":
            if self._current_page.url not in self._page_urls:
                self._pages.append(self._current_page)
                self._page_urls.add(self._current_page.url)
            self._current_page = None
        elif name == "image:image":
            self._current_page.images.append(self._current_image)
            self._current_image = None
        else:
            if name == "sitemap:loc":
                # Every entry must have <loc>
                self.__require_last_char_data_to_be_set(name=name)
                self._current_page.url = self._last_char_data

            elif name == "sitemap:lastmod":
                # Element might be present but character data might be empty
                self._current_page.last_modified = self._last_char_data

            elif name == "sitemap:changefreq":
                # Element might be present but character data might be empty
                self._current_page.change_frequency = self._last_char_data

            elif name == "sitemap:priority":
                # Element might be present but character data might be empty
                self._current_page.priority = self._last_char_data

            elif name == "news:name":  # news/publication/name
                # Element might be present but character data might be empty
                self._current_page.news_publication_name = self._last_char_data

            elif name == "news:language":  # news/publication/language
                # Element might be present but character data might be empty
                self._current_page.news_publication_language = self._last_char_data

            elif name == "news:publication_date":
                # Element might be present but character data might be empty
                self._current_page.news_publish_date = self._last_char_data

            elif name == "news:title":
                # Every Google News sitemap entry must have <title>
                self.__require_last_char_data_to_be_set(name=name)
                self._current_page.news_title = self._last_char_data

            elif name == "news:access":
                # Element might be present but character data might be empty
                self._current_page.news_access = self._last_char_data

            elif name == "news:keywords":
                # Element might be present but character data might be empty
                self._current_page.news_keywords = self._last_char_data

            elif name == "news:stock_tickers":
                # Element might be present but character data might be empty
                self._current_page.news_stock_tickers = self._last_char_data

            elif name == "image:loc":
                # Every image entry must have <loc>
                self.__require_last_char_data_to_be_set(name=name)
                self._current_image.loc = self._last_char_data

            elif name == "image:caption":
                self._current_image.caption = self._last_char_data

            elif name == "image:geo_location":
                self._current_image.geo_location = self._last_char_data

            elif name == "image:title":
                self._current_image.title = self._last_char_data

            elif name == "image:license":
                self._current_image.license = self._last_char_data

        super().xml_element_end(name=name)

    def sitemap(self) -> AbstractSitemap:
        pages = []

        for page_row in self._pages:
            page = page_row.page()
            if page:
                pages.append(page)

        pages_sitemap = PagesXMLSitemap(url=self._url, pages=pages)

        return pages_sitemap
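# --- Editor's note: illustrative input, not part of the packaged file. ---
# A <url> entry exercising the branches handled above: <loc> is mandatory,
# everything else is optional, and invalid <changefreq>/<priority> values
# fall back to defaults in Page.page():
#
#     <url>
#       <loc>https://example.com/story.html</loc>
#       <lastmod>2024-01-01T00:00:00+00:00</lastmod>
#       <changefreq>daily</changefreq>
#       <priority>0.8</priority>
#       <news:news>
#         <news:publication>
#           <news:name>Example Times</news:name>
#           <news:language>en</news:language>
#         </news:publication>
#         <news:publication_date>2024-01-01</news:publication_date>
#         <news:title>Example headline</news:title>
#       </news:news>
#       <image:image>
#         <image:loc>https://example.com/story.jpg</image:loc>
#       </image:image>
#     </url>
# --- End of editor's note. ---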
class PagesRSSSitemapParser(AbstractXMLSitemapParser):
    """
    Pages RSS 2.0 sitemap parser.

    https://validator.w3.org/feed/docs/rss2.html
    """

    class Page:
        """
        Data class for holding various properties for a single RSS <item> while parsing.
        """

        __slots__ = [
            "link",
            "title",
            "description",
            "publication_date",
        ]

        def __init__(self):
            self.link = None
            self.title = None
            self.description = None
            self.publication_date = None

        def __hash__(self):
            return hash(
                (
                    # Hash only the URL
                    self.link,
                )
            )

        def page(self) -> Optional[SitemapPage]:
            """Return constructed sitemap page if one has been completed, otherwise None."""

            # Required
            link = html_unescape_strip(self.link)
            if not link:
                log.error("Link is unset")
                return None

            title = html_unescape_strip(self.title)
            description = html_unescape_strip(self.description)
            if not (title or description):
                log.error("Both title and description are unset")
                return None

            publication_date = html_unescape_strip(self.publication_date)
            if publication_date:
                publication_date = parse_rfc2822_date(publication_date)

            return SitemapPage(
                url=link,
                news_story=SitemapNewsStory(
                    title=title or description,
                    publish_date=publication_date,
                ),
            )

    __slots__ = ["_current_page", "_pages", "_page_links"]

    def __init__(self, url: str):
        super().__init__(url=url)

        self._current_page = None
        self._pages = []
        self._page_links = set()

    def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None:
        super().xml_element_start(name=name, attrs=attrs)

        if name == "item":
            if self._current_page:
                raise SitemapXMLParsingException(
                    "Page is expected to be unset by <item>."
                )
            self._current_page = self.Page()

    def __require_last_char_data_to_be_set(self, name: str) -> None:
        if not self._last_char_data:
            raise SitemapXMLParsingException(
                f"Character data is expected to be set at the end of <{name}>."
            )

    def xml_element_end(self, name: str) -> None:
        # If within <item> already
        if self._current_page:
            if name == "item":
                if self._current_page.link not in self._page_links:
                    self._pages.append(self._current_page)
                    self._page_links.add(self._current_page.link)
                self._current_page = None

            else:
                if name == "link":
                    # Every entry must have <link>
                    self.__require_last_char_data_to_be_set(name=name)
                    self._current_page.link = self._last_char_data

                elif name == "title":
                    # Title (if set) can't be empty
                    self.__require_last_char_data_to_be_set(name=name)
                    self._current_page.title = self._last_char_data

                elif name == "description":
                    # Description (if set) can't be empty
                    self.__require_last_char_data_to_be_set(name=name)
                    self._current_page.description = self._last_char_data

                elif name == "pubDate":
                    # Element might be present but character data might be empty
                    self._current_page.publication_date = self._last_char_data

        super().xml_element_end(name=name)

    def sitemap(self) -> AbstractSitemap:
        pages = []

        for page_row in self._pages:
            page = page_row.page()
            if page:
                pages.append(page)

        pages_sitemap = PagesRSSSitemap(url=self._url, pages=pages)

        return pages_sitemap
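# --- Editor's note: illustrative input, not part of the packaged file. ---
# A minimal RSS 2.0 feed as consumed above: <link> is required per item, at
# least one of <title>/<description> must be present, and <pubDate> (RFC 2822)
# becomes the news story's publish date:
#
#     <rss version="2.0">
#       <channel>
#         <item>
#           <title>Example headline</title>
#           <link>https://example.com/story.html</link>
#           <pubDate>Mon, 01 Jan 2024 00:00:00 +0000</pubDate>
#         </item>
#       </channel>
#     </rss>
# --- End of editor's note. ---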
class PagesAtomSitemapParser(AbstractXMLSitemapParser):
    """
    Pages Atom 0.3 / 1.0 sitemap parser.

    References:

    - https://github.com/simplepie/simplepie-ng/wiki/Spec:-Atom-0.3
    - https://www.ietf.org/rfc/rfc4287.txt
    - http://rakaz.nl/2005/07/moving-from-atom-03-to-10.html
    """

    # FIXME: merge with the RSS parser class as there are too many similarities

    class Page:
        """Data class for holding various properties for a single Atom <entry> while parsing."""

        __slots__ = [
            "link",
            "title",
            "description",
            "publication_date",
        ]

        def __init__(self):
            self.link = None
            self.title = None
            self.description = None
            self.publication_date = None

        def __hash__(self):
            return hash(
                (
                    # Hash only the URL
                    self.link,
                )
            )

        def page(self) -> Optional[SitemapPage]:
            """Return constructed sitemap page if one has been completed, otherwise None."""

            # Required
            link = html_unescape_strip(self.link)
            if not link:
                log.error("Link is unset")
                return None

            title = html_unescape_strip(self.title)
            description = html_unescape_strip(self.description)
            if not (title or description):
                log.error("Both title and description are unset")
                return None

            publication_date = html_unescape_strip(self.publication_date)
            if publication_date:
                publication_date = parse_iso8601_date(publication_date)

            return SitemapPage(
                url=link,
                news_story=SitemapNewsStory(
                    title=title or description,
                    publish_date=publication_date,
                ),
            )

    __slots__ = [
        "_current_page",
        "_pages",
        "_page_links",
        "_last_link_rel_self_href",
    ]

    def __init__(self, url: str):
        super().__init__(url=url)

        self._current_page = None
        self._pages = []
        self._page_links = set()
        self._last_link_rel_self_href = None

    def xml_element_start(self, name: str, attrs: Dict[str, str]) -> None:
        super().xml_element_start(name=name, attrs=attrs)

        if name == "entry":
            if self._current_page:
                raise SitemapXMLParsingException(
                    "Page is expected to be unset by <entry>."
                )
            self._current_page = self.Page()

        elif name == "link":
            if self._current_page:
                if (
                    attrs.get("rel", "self").lower() == "self"
                    or self._last_link_rel_self_href is None
                ):
                    self._last_link_rel_self_href = attrs.get("href", None)

    def __require_last_char_data_to_be_set(self, name: str) -> None:
        if not self._last_char_data:
            raise SitemapXMLParsingException(
                f"Character data is expected to be set at the end of <{name}>."
            )

    def xml_element_end(self, name: str) -> None:
        # If within <entry> already
        if self._current_page:
            if name == "entry":
                if self._last_link_rel_self_href:
                    self._current_page.link = self._last_link_rel_self_href
                    self._last_link_rel_self_href = None

                if self._current_page.link not in self._page_links:
                    self._pages.append(self._current_page)
                    self._page_links.add(self._current_page.link)

                self._current_page = None

            else:
                if name == "title":
                    # Title (if set) can't be empty
                    self.__require_last_char_data_to_be_set(name=name)
                    self._current_page.title = self._last_char_data

                elif name == "tagline" or name == "summary":
                    # Description (if set) can't be empty
                    self.__require_last_char_data_to_be_set(name=name)
                    self._current_page.description = self._last_char_data

                elif name == "issued" or name == "published":
                    # Element might be present but character data might be empty
                    self._current_page.publication_date = self._last_char_data

                elif name == "updated":
                    # No 'issued' or 'published' were set before
                    if not self._current_page.publication_date:
                        self._current_page.publication_date = self._last_char_data

        super().xml_element_end(name=name)

    def sitemap(self) -> AbstractSitemap:
        pages = []

        for page_row in self._pages:
            page = page_row.page()
            if page:
                pages.append(page)

        pages_sitemap = PagesAtomSitemap(url=self._url, pages=pages)

        return pages_sitemap
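# --- Editor's note: illustrative input, not part of the packaged file. ---
# A minimal Atom 1.0 entry as consumed above. The entry URL is taken from a
# <link> element (rel="self" or a rel-less <link> wins; failing that, the
# first <link> seen is kept), and <updated> is used only when neither
# <issued> nor <published> was set:
#
#     <feed xmlns="http://www.w3.org/2005/Atom">
#       <entry>
#         <title>Example headline</title>
#         <link href="https://example.com/story.html"/>
#         <published>2024-01-01T00:00:00Z</published>
#       </entry>
#     </feed>
# --- End of editor's note. ---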