ultimate-sitemap-parser 1.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ultimate-sitemap-parser might be problematic. Click here for more details.
- ultimate_sitemap_parser-1.0.0rc1.dist-info/LICENSE +674 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/METADATA +109 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/NOTICE +12 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/RECORD +22 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/WHEEL +4 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/entry_points.txt +3 -0
- usp/__init__.py +5 -0
- usp/cli/__init__.py +1 -0
- usp/cli/_ls.py +105 -0
- usp/cli/_util.py +21 -0
- usp/cli/cli.py +27 -0
- usp/exceptions.py +35 -0
- usp/fetch_parse.py +1182 -0
- usp/helpers.py +293 -0
- usp/log.py +77 -0
- usp/objects/__init__.py +0 -0
- usp/objects/page.py +451 -0
- usp/objects/sitemap.py +436 -0
- usp/tree.py +114 -0
- usp/web_client/__init__.py +0 -0
- usp/web_client/abstract_client.py +189 -0
- usp/web_client/requests_client.py +150 -0
usp/objects/page.py
ADDED
|
@@ -0,0 +1,451 @@
|
|
|
1
|
+
"""Objects that represent a page found in one of the sitemaps."""
|
|
2
|
+
|
|
3
|
+
import datetime
|
|
4
|
+
from decimal import Decimal
|
|
5
|
+
from enum import Enum, unique
|
|
6
|
+
from typing import List, Optional
|
|
7
|
+
|
|
8
|
+
# Used as the default value of the ``priority`` argument of ``SitemapPage``.
SITEMAP_PAGE_DEFAULT_PRIORITY = Decimal("0.5")
|
|
9
|
+
"""Default sitemap page priority, as per the spec."""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class SitemapNewsStory:
    """
    Single story derived from Google News XML sitemap.

    Two stories compare equal when all of their fields are equal. Stories are
    hashable; the hash is computed from the current field values, so the lists
    exposed by ``genres``, ``keywords`` and ``stock_tickers`` should not be
    mutated while the story is stored in a set or used as a dictionary key.
    """

    __slots__ = [
        "__title",
        "__publish_date",
        "__publication_name",
        "__publication_language",
        "__access",
        "__genres",
        "__keywords",
        "__stock_tickers",
    ]

    def __init__(
        self,
        title: str,
        publish_date: datetime.datetime,
        publication_name: Optional[str] = None,
        publication_language: Optional[str] = None,
        access: Optional[str] = None,
        genres: Optional[List[str]] = None,
        keywords: Optional[List[str]] = None,
        stock_tickers: Optional[List[str]] = None,
    ):
        """
        Initialize a new Google News story.

        :param title: Story title.
        :param publish_date: Story publication date.
        :param publication_name: Name of the news publication in which the article appears.
        :param publication_language: Primary language of the news publication in which the article appears.
        :param access: Accessibility of the article.
        :param genres: List of properties characterizing the content of the article.
        :param keywords: List of keywords describing the topic of the article.
        :param stock_tickers: List of up to 5 stock tickers that are the main subject of the article.
        """

        # Spec defines that some of the properties below are "required" but in practice not every website provides the
        # required properties. So, we require only "title" and "publish_date" to be set.

        self.__title = title
        self.__publish_date = publish_date
        self.__publication_name = publication_name
        self.__publication_language = publication_language
        self.__access = access
        # Missing (or empty) list arguments are normalised to a fresh empty list.
        self.__genres = genres if genres else []
        self.__keywords = keywords if keywords else []
        self.__stock_tickers = stock_tickers if stock_tickers else []

    def __eq__(self, other) -> bool:
        """
        Check equality field by field.

        :param other: Object to compare with.
        :return: True if every field matches; ``NotImplemented`` if ``other`` is not a news story, which lets Python
            fall back to its default comparison (so ``story == None`` is simply False).
        """
        if not isinstance(other, SitemapNewsStory):
            return NotImplemented

        return (
            self.title == other.title
            and self.publish_date == other.publish_date
            and self.publication_name == other.publication_name
            and self.publication_language == other.publication_language
            and self.access == other.access
            and self.genres == other.genres
            and self.keywords == other.keywords
            and self.stock_tickers == other.stock_tickers
        )

    def to_dict(self) -> dict:
        """
        Convert to a dictionary representation.

        :return: the news story data as a dictionary
        """
        return {
            "title": self.title,
            "publish_date": self.publish_date,
            "publication_name": self.publication_name,
            "publication_language": self.publication_language,
            "access": self.access,
            "genres": self.genres,
            "keywords": self.keywords,
            "stock_tickers": self.stock_tickers,
        }

    def __hash__(self) -> int:
        """
        Hash all fields, consistently with ``__eq__``.

        :return: Hash of the story's field values.
        """
        return hash(
            (
                self.title,
                self.publish_date,
                self.publication_name,
                self.publication_language,
                self.access,
                # Lists are unhashable, so hash immutable snapshots of them instead.
                tuple(self.genres),
                tuple(self.keywords),
                tuple(self.stock_tickers),
            )
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}("
            f"title={self.title}, "
            f"publish_date={self.publish_date}, "
            f"publication_name={self.publication_name}, "
            f"publication_language={self.publication_language}, "
            f"access={self.access}, "
            f"genres={self.genres}, "
            f"keywords={self.keywords}, "
            f"stock_tickers={self.stock_tickers}"
            ")"
        )

    @property
    def title(self) -> str:
        """Get the story title."""
        return self.__title

    @property
    def publish_date(self) -> datetime.datetime:
        """Get the story publication date."""
        return self.__publish_date

    @property
    def publication_name(self) -> Optional[str]:
        """Get the name of the news publication in which the article appears."""
        return self.__publication_name

    @property
    def publication_language(self) -> Optional[str]:
        """Get the primary language of the news publication in which the article appears.

        It should be an ISO 639 Language Code (either 2 or 3 letters).
        """
        return self.__publication_language

    @property
    def access(self) -> Optional[str]:
        """Get the accessibility of the article.

        :return: Accessibility of the article.
        """
        return self.__access

    @property
    def genres(self) -> List[str]:
        """Get list of genres characterizing the content of the article.

        Genres will be one of "PressRelease", "Satire", "Blog", "OpEd", "Opinion", "UserGenerated"
        """
        return self.__genres

    @property
    def keywords(self) -> List[str]:
        """Get list of keywords describing the topic of the article."""
        return self.__keywords

    @property
    def stock_tickers(self) -> List[str]:
        """Get stock tickers that are the main subject of the article.

        Each ticker must be prefixed by the name of its stock exchange, and must match its entry in Google Finance.
        For example, "NASDAQ:AMAT" (but not "NASD:AMAT"), or "BOM:500325" (but not "BOM:RIL").

        Up to 5 tickers can be provided.
        """
        return self.__stock_tickers
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
class SitemapImage:
    """
    Single image derived from Google Image XML sitemap.

    All properties except ``loc`` are now deprecated in the XML specification, see
    https://developers.google.com/search/blog/2022/05/spring-cleaning-sitemap-extensions

    They will continue to be supported here.
    """

    __slots__ = ["__loc", "__caption", "__geo_location", "__title", "__license"]

    def __init__(
        self,
        loc: str,
        caption: Optional[str] = None,
        geo_location: Optional[str] = None,
        title: Optional[str] = None,
        license_: Optional[str] = None,
    ):
        """Initialise a Google Image.

        :param loc: the URL of the image
        :param caption: the caption of the image, optional
        :param geo_location: the geographic location of the image, for example "Limerick, Ireland", optional
        :param title: the title of the image, optional
        :param license_: a URL to the license of the image, optional
        """
        self.__loc = loc
        self.__caption = caption
        self.__geo_location = geo_location
        self.__title = title
        self.__license = license_

    def __eq__(self, other) -> bool:
        """Check equality field by field.

        :param other: object to compare with
        :return: True if every field matches; ``NotImplemented`` if ``other`` is not an image, which lets Python
            fall back to its default comparison (so ``image == None`` is simply False)
        """
        if not isinstance(other, SitemapImage):
            return NotImplemented

        return (
            self.loc == other.loc
            and self.caption == other.caption
            and self.geo_location == other.geo_location
            and self.title == other.title
            and self.license == other.license
        )

    def to_dict(self) -> dict:
        """Convert to a dictionary representation.

        :return: the image data as a dictionary
        """
        return {
            "loc": self.loc,
            "caption": self.caption,
            "geo_location": self.geo_location,
            "title": self.title,
            "license": self.license,
        }

    def __hash__(self) -> int:
        """Hash all fields, consistently with ``__eq__``."""
        return hash(
            (self.loc, self.caption, self.geo_location, self.title, self.license)
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}("
            f"loc={self.loc}, "
            f"caption={self.caption}, "
            f"geo_location={self.geo_location}, "
            f"title={self.title}, "
            f"license={self.license}"
            ")"
        )

    @property
    def loc(self) -> str:
        """Get the URL of the image."""
        return self.__loc

    @property
    def caption(self) -> Optional[str]:
        """Get the caption of the image."""
        return self.__caption

    @property
    def geo_location(self) -> Optional[str]:
        """Get the geographic location of the image."""
        return self.__geo_location

    @property
    def title(self) -> Optional[str]:
        """Get the title of the image."""
        return self.__title

    @property
    def license(self) -> Optional[str]:
        """Get a URL to the license of the image."""
        return self.__license
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
@unique
class SitemapPageChangeFrequency(Enum):
    """Enumeration of the change-frequency values a sitemap URL can carry."""

    ALWAYS = "always"
    HOURLY = "hourly"
    DAILY = "daily"
    WEEKLY = "weekly"
    MONTHLY = "monthly"
    YEARLY = "yearly"
    NEVER = "never"

    @classmethod
    def has_value(cls, value: str) -> bool:
        """Report whether ``value`` equals the value of any member of this enum.

        :param value: Candidate value to look up.
        :return: True if some member's value equals ``value``, False otherwise.
        """
        for member in cls:
            if value == member.value:
                return True
        return False
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
class SitemapPage:
    """
    Single sitemap-derived page.

    Equality compares every field, while the hash is derived from the URL only, so pages sharing a URL land in the
    same hash bucket regardless of their other attributes.
    """

    __slots__ = [
        "__url",
        "__priority",
        "__last_modified",
        "__change_frequency",
        "__news_story",
        "__images",
    ]

    def __init__(
        self,
        url: str,
        priority: Decimal = SITEMAP_PAGE_DEFAULT_PRIORITY,
        last_modified: Optional[datetime.datetime] = None,
        change_frequency: Optional[SitemapPageChangeFrequency] = None,
        news_story: Optional[SitemapNewsStory] = None,
        images: Optional[List[SitemapImage]] = None,
    ):
        """
        Initialize a new sitemap-derived page.

        :param url: Page URL.
        :param priority: Priority of this URL relative to other URLs on your site.
        :param last_modified: Date of last modification of the URL.
        :param change_frequency: Change frequency of a sitemap URL.
        :param news_story: Google News story attached to the URL.
        :param images: Google Images attached to the URL.
        """
        self.__url = url
        self.__priority = priority
        self.__last_modified = last_modified
        self.__change_frequency = change_frequency
        self.__news_story = news_story
        self.__images = images

    def __eq__(self, other) -> bool:
        """
        Check equality field by field.

        :param other: Object to compare with.
        :return: True if every field matches; ``NotImplemented`` if ``other`` is not a page, which lets Python fall
            back to its default comparison (so ``page == None`` is simply False).
        """
        if not isinstance(other, SitemapPage):
            return NotImplemented

        if self.url != other.url:
            return False

        if self.priority != other.priority:
            return False

        if self.last_modified != other.last_modified:
            return False

        if self.change_frequency != other.change_frequency:
            return False

        if self.news_story != other.news_story:
            return False

        if self.images != other.images:
            return False

        return True

    def __hash__(self) -> int:
        return hash(
            (
                # Hash only the URL to be able to find unique pages later on
                self.url,
            )
        )

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}("
            f"url={self.url}, "
            f"priority={self.priority}, "
            f"last_modified={self.last_modified}, "
            f"change_frequency={self.change_frequency}, "
            f"news_story={self.news_story}, "
            f"images={self.images}"
            ")"
        )

    def to_dict(self) -> dict:
        """
        Convert this page to a dictionary.

        The change frequency is emitted as its enum value, and the news story and images are converted with their own
        ``to_dict()`` methods. Each of these is ``None`` when unset; ``images`` is also ``None`` for an empty list.

        :return: the page data as a dictionary
        """

        return {
            "url": self.url,
            "priority": self.priority,
            "last_modified": self.last_modified,
            "change_frequency": self.change_frequency.value
            if self.change_frequency
            else None,
            "news_story": self.news_story.to_dict() if self.news_story else None,
            "images": [image.to_dict() for image in self.images]
            if self.images
            else None,
        }

    @property
    def url(self) -> str:
        """Get the page URL."""
        return self.__url

    @property
    def priority(self) -> Decimal:
        """Get the priority of this URL relative to other URLs on the site."""
        return self.__priority

    @property
    def last_modified(self) -> Optional[datetime.datetime]:
        """Get the date of last modification of the URL."""
        return self.__last_modified

    @property
    def change_frequency(self) -> Optional[SitemapPageChangeFrequency]:
        """Get the change frequency of a sitemap URL."""
        return self.__change_frequency

    @property
    def news_story(self) -> Optional[SitemapNewsStory]:
        """Get the Google News story attached to the URL."""
        return self.__news_story

    @property
    def images(self) -> Optional[List[SitemapImage]]:
        """Get the images attached to the URL."""
        return self.__images
|