ultimate-sitemap-parser 1.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ultimate-sitemap-parser might be problematic. Click here for more details.

usp/objects/page.py ADDED
@@ -0,0 +1,451 @@
1
+ """Objects that represent a page found in one of the sitemaps."""
2
+
3
+ import datetime
4
+ from decimal import Decimal
5
+ from enum import Enum, unique
6
+ from typing import List, Optional
7
+
8
#: Default sitemap page priority, as per the spec.
SITEMAP_PAGE_DEFAULT_PRIORITY = Decimal("0.5")
10
+
11
+
12
class SitemapNewsStory:
    """
    Single story derived from Google News XML sitemap.

    Two stories compare equal when all of their properties are equal, and the hash is computed from the same
    properties, so stories can be stored in sets and used as dictionary keys.
    """

    __slots__ = [
        "__title",
        "__publish_date",
        "__publication_name",
        "__publication_language",
        "__access",
        "__genres",
        "__keywords",
        "__stock_tickers",
    ]

    def __init__(
        self,
        title: str,
        publish_date: datetime.datetime,
        publication_name: Optional[str] = None,
        publication_language: Optional[str] = None,
        access: Optional[str] = None,
        genres: Optional[List[str]] = None,
        keywords: Optional[List[str]] = None,
        stock_tickers: Optional[List[str]] = None,
    ):
        """
        Initialize a new Google News story.

        :param title: Story title.
        :param publish_date: Story publication date.
        :param publication_name: Name of the news publication in which the article appears in.
        :param publication_language: Primary language of the news publication in which the article appears in.
        :param access: Accessibility of the article.
        :param genres: List of properties characterizing the content of the article; ``None`` or an empty list is
            stored as an empty list.
        :param keywords: List of keywords describing the topic of the article; ``None`` or an empty list is stored
            as an empty list.
        :param stock_tickers: List of up to 5 stock tickers that are the main subject of the article; ``None`` or an
            empty list is stored as an empty list.
        """

        # Spec defines that some of the properties below are "required" but in practice not every website provides the
        # required properties. So, we require only "title" and "publish_date" to be set.

        self.__title = title
        self.__publish_date = publish_date
        self.__publication_name = publication_name
        self.__publication_language = publication_language
        self.__access = access
        self.__genres = genres if genres else []
        self.__keywords = keywords if keywords else []
        self.__stock_tickers = stock_tickers if stock_tickers else []

    def __eq__(self, other) -> bool:
        """
        Check equality against another news story.

        :param other: Object to compare with.
        :return: ``True`` if every property matches; ``NotImplemented`` for objects of a different type so that
            Python can fall back to its default comparison (which yields ``False``) instead of raising.
        """
        if not isinstance(other, SitemapNewsStory):
            return NotImplemented

        return (
            self.title == other.title
            and self.publish_date == other.publish_date
            and self.publication_name == other.publication_name
            and self.publication_language == other.publication_language
            and self.access == other.access
            and self.genres == other.genres
            and self.keywords == other.keywords
            and self.stock_tickers == other.stock_tickers
        )

    def to_dict(self) -> dict:
        """
        Convert to a dictionary representation.

        :return: the news story data as a dictionary
        """
        return {
            "title": self.title,
            "publish_date": self.publish_date,
            "publication_name": self.publication_name,
            "publication_language": self.publication_language,
            "access": self.access,
            "genres": self.genres,
            "keywords": self.keywords,
            "stock_tickers": self.stock_tickers,
        }

    def __hash__(self) -> int:
        """
        Hash the story using the same properties that ``__eq__`` compares.

        :return: Hash value of the story.
        """
        return hash(
            (
                self.title,
                self.publish_date,
                self.publication_name,
                self.publication_language,
                self.access,
                # Lists are unhashable, so hash immutable snapshots of them instead.
                tuple(self.genres),
                tuple(self.keywords),
                tuple(self.stock_tickers),
            )
        )

    def __repr__(self) -> str:
        """Return a human-readable representation listing every property."""
        return (
            f"{self.__class__.__name__}("
            f"title={self.title}, "
            f"publish_date={self.publish_date}, "
            f"publication_name={self.publication_name}, "
            f"publication_language={self.publication_language}, "
            f"access={self.access}, "
            f"genres={self.genres}, "
            f"keywords={self.keywords}, "
            f"stock_tickers={self.stock_tickers}"
            ")"
        )

    @property
    def title(self) -> str:
        """Get the story title."""
        return self.__title

    @property
    def publish_date(self) -> datetime.datetime:
        """Get the story publication date."""
        return self.__publish_date

    @property
    def publication_name(self) -> Optional[str]:
        """Get the name of the news publication in which the article appears."""
        return self.__publication_name

    @property
    def publication_language(self) -> Optional[str]:
        """Get the primary language of the news publication in which the article appears.

        It should be an ISO 639 Language Code (either 2 or 3 letters).
        """
        return self.__publication_language

    @property
    def access(self) -> Optional[str]:
        """Get the accessibility of the article.

        :return: Accessibility of the article.
        """
        return self.__access

    @property
    def genres(self) -> List[str]:
        """Get list of genres characterizing the content of the article.

        Each genre will be one of "PressRelease", "Satire", "Blog", "OpEd", "Opinion", "UserGenerated"
        """
        return self.__genres

    @property
    def keywords(self) -> List[str]:
        """Get list of keywords describing the topic of the article."""
        return self.__keywords

    @property
    def stock_tickers(self) -> List[str]:
        """Get stock tickers that are the main subject of the article.

        Each ticker must be prefixed by the name of its stock exchange, and must match its entry in Google Finance.
        For example, "NASDAQ:AMAT" (but not "NASD:AMAT"), or "BOM:500325" (but not "BOM:RIL").

        Up to 5 tickers can be provided.
        """
        return self.__stock_tickers
194
+
195
+
196
class SitemapImage:
    """
    Single image derived from Google Image XML sitemap.

    All properties except ``loc`` are now deprecated in the XML specification, see
    https://developers.google.com/search/blog/2022/05/spring-cleaning-sitemap-extensions

    They will continue to be supported here.
    """

    __slots__ = ["__loc", "__caption", "__geo_location", "__title", "__license"]

    def __init__(
        self,
        loc: str,
        caption: Optional[str] = None,
        geo_location: Optional[str] = None,
        title: Optional[str] = None,
        license_: Optional[str] = None,
    ):
        """Initialise a Google Image.

        :param loc: the URL of the image
        :param caption: the caption of the image, optional
        :param geo_location: the geographic location of the image, for example "Limerick, Ireland", optional
        :param title: the title of the image, optional
        :param license_: a URL to the license of the image, optional
        """
        self.__loc = loc
        self.__caption = caption
        self.__geo_location = geo_location
        self.__title = title
        self.__license = license_

    def __eq__(self, other) -> bool:
        """Check equality against another image.

        :param other: object to compare with
        :return: ``True`` if every property matches; ``NotImplemented`` for objects of a different type so that
            Python can fall back to its default comparison (which yields ``False``) instead of raising
        """
        if not isinstance(other, SitemapImage):
            return NotImplemented

        return (
            self.loc == other.loc
            and self.caption == other.caption
            and self.geo_location == other.geo_location
            and self.title == other.title
            and self.license == other.license
        )

    def to_dict(self) -> dict:
        """Convert to a dictionary representation.

        :return: the image data as a dictionary
        """
        return {
            "loc": self.loc,
            "caption": self.caption,
            "geo_location": self.geo_location,
            "title": self.title,
            "license": self.license,
        }

    def __hash__(self) -> int:
        """Hash the image using the same properties that ``__eq__`` compares."""
        return hash(
            (self.loc, self.caption, self.geo_location, self.title, self.license)
        )

    def __repr__(self) -> str:
        """Return a human-readable representation listing every property."""
        return (
            f"{self.__class__.__name__}("
            f"loc={self.loc}, "
            f"caption={self.caption}, "
            f"geo_location={self.geo_location}, "
            f"title={self.title}, "
            f"license={self.license}"
            ")"
        )

    @property
    def loc(self) -> str:
        """Get the URL of the image."""
        return self.__loc

    @property
    def caption(self) -> Optional[str]:
        """Get the caption of the image."""
        return self.__caption

    @property
    def geo_location(self) -> Optional[str]:
        """Get the geographic location of the image."""
        return self.__geo_location

    @property
    def title(self) -> Optional[str]:
        """Get the title of the image."""
        return self.__title

    @property
    def license(self) -> Optional[str]:
        """Get a URL to the license of the image."""
        return self.__license
304
+
305
+
306
@unique
class SitemapPageChangeFrequency(Enum):
    """Enumeration of the change-frequency values a sitemap URL may carry."""

    ALWAYS = "always"
    HOURLY = "hourly"
    DAILY = "daily"
    WEEKLY = "weekly"
    MONTHLY = "monthly"
    YEARLY = "yearly"
    NEVER = "never"

    @classmethod
    def has_value(cls, value: str) -> bool:
        """Report whether ``value`` equals the value of some member of this enum.

        :param value: Candidate value, compared with ``==`` against each member's value.
        :return: ``True`` if a member with that value exists, ``False`` otherwise.
        """
        for member in cls:
            if value == member.value:
                return True
        return False
322
+
323
+
324
class SitemapPage:
    """
    Single sitemap-derived page.

    Equality compares every property, while the hash is derived from the URL only, so pages that share a URL land in
    the same hash bucket.
    """

    __slots__ = [
        "__url",
        "__priority",
        "__last_modified",
        "__change_frequency",
        "__news_story",
        "__images",
    ]

    def __init__(
        self,
        url: str,
        priority: Decimal = SITEMAP_PAGE_DEFAULT_PRIORITY,
        last_modified: Optional[datetime.datetime] = None,
        change_frequency: Optional[SitemapPageChangeFrequency] = None,
        news_story: Optional[SitemapNewsStory] = None,
        images: Optional[List[SitemapImage]] = None,
    ):
        """
        Initialize a new sitemap-derived page.

        :param url: Page URL.
        :param priority: Priority of this URL relative to other URLs on your site.
        :param last_modified: Date of last modification of the URL.
        :param change_frequency: Change frequency of a sitemap URL.
        :param news_story: Google News story attached to the URL.
        :param images: Google Images attached to the URL.
        """
        self.__url = url
        self.__priority = priority
        self.__last_modified = last_modified
        self.__change_frequency = change_frequency
        self.__news_story = news_story
        self.__images = images

    def __eq__(self, other) -> bool:
        """
        Check equality against another page.

        :param other: Object to compare with.
        :return: ``True`` if every property matches; ``NotImplemented`` for objects of a different type so that
            Python can fall back to its default comparison (which yields ``False``) instead of raising.
        """
        if not isinstance(other, SitemapPage):
            return NotImplemented

        if self.url != other.url:
            return False

        if self.priority != other.priority:
            return False

        if self.last_modified != other.last_modified:
            return False

        if self.change_frequency != other.change_frequency:
            return False

        if self.news_story != other.news_story:
            return False

        if self.images != other.images:
            return False

        return True

    def __hash__(self) -> int:
        """Hash the page by its URL alone."""
        return hash(
            (
                # Hash only the URL to be able to find unique pages later on
                self.url,
            )
        )

    def __repr__(self) -> str:
        """Return a human-readable representation listing every property."""
        return (
            f"{self.__class__.__name__}("
            f"url={self.url}, "
            f"priority={self.priority}, "
            f"last_modified={self.last_modified}, "
            f"change_frequency={self.change_frequency}, "
            f"news_story={self.news_story}, "
            f"images={self.images}"
            ")"
        )

    def to_dict(self) -> dict:
        """
        Convert this page to a dictionary.

        The change frequency is emitted as its string value, and the news story and images are converted with their
        own ``to_dict()``; a missing change frequency, a missing news story, and missing or empty images are emitted
        as ``None``.

        :return: the page data as a dictionary
        """

        return {
            "url": self.url,
            "priority": self.priority,
            "last_modified": self.last_modified,
            "change_frequency": self.change_frequency.value
            if self.change_frequency
            else None,
            "news_story": self.news_story.to_dict() if self.news_story else None,
            "images": [image.to_dict() for image in self.images]
            if self.images
            else None,
        }

    @property
    def url(self) -> str:
        """Get the page URL."""
        return self.__url

    @property
    def priority(self) -> Decimal:
        """Get the priority of this URL relative to other URLs on the site."""
        return self.__priority

    @property
    def last_modified(self) -> Optional[datetime.datetime]:
        """Get the date of last modification of the URL."""
        return self.__last_modified

    @property
    def change_frequency(self) -> Optional[SitemapPageChangeFrequency]:
        """Get the change frequency of a sitemap URL."""
        return self.__change_frequency

    @property
    def news_story(self) -> Optional[SitemapNewsStory]:
        """Get the Google News story attached to the URL."""
        return self.__news_story

    @property
    def images(self) -> Optional[List[SitemapImage]]:
        """Get the images attached to the URL."""
        return self.__images