ultimate-sitemap-parser 1.0.0rc1__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ultimate-sitemap-parser might be problematic.

Files changed (21):
  1. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/PKG-INFO +6 -5
  2. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/pyproject.toml +27 -15
  3. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/fetch_parse.py +20 -0
  4. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/objects/page.py +29 -3
  5. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/web_client/abstract_client.py +35 -0
  6. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/web_client/requests_client.py +17 -2
  7. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/LICENSE +0 -0
  8. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/NOTICE +0 -0
  9. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/README.rst +0 -0
  10. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/__init__.py +0 -0
  11. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/cli/__init__.py +0 -0
  12. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/cli/_ls.py +0 -0
  13. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/cli/_util.py +0 -0
  14. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/cli/cli.py +0 -0
  15. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/exceptions.py +0 -0
  16. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/helpers.py +0 -0
  17. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/log.py +0 -0
  18. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/objects/__init__.py +0 -0
  19. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/objects/sitemap.py +0 -0
  20. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/tree.py +0 -0
  21. {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/web_client/__init__.py +0 -0
PKG-INFO

@@ -1,15 +1,14 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.3
 Name: ultimate-sitemap-parser
-Version: 1.0.0rc1
+Version: 1.1.0
 Summary: A performant library for parsing and crawling sitemaps
-Home-page: https://ultimate-sitemap-parser.readthedocs.io/
 License: GPL-3.0-or-later
 Keywords: sitemap,crawler,indexing,xml,rss,atom,google news
 Author: Linas Valiukas
 Author-email: linas@media.mit.edu
 Maintainer: Freddy Heppell
 Maintainer-email: f.heppell@sheffield.ac.uk
-Requires-Python: >=3.8,<4.0
+Requires-Python: >=3.8
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Information Technology
@@ -22,12 +21,14 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
 Classifier: Topic :: Text Processing :: Indexing
 Classifier: Topic :: Text Processing :: Markup :: XML
 Requires-Dist: python-dateutil (>=2.7,<3.0.0)
-Requires-Dist: requests (>=2.2.1)
+Requires-Dist: requests (>=2.2.1,<3.0.0)
 Project-URL: Documentation, https://ultimate-sitemap-parser.readthedocs.io/
+Project-URL: Homepage, https://ultimate-sitemap-parser.readthedocs.io/
 Project-URL: Repository, https://github.com/GateNLP/ultimate-sitemap-parser
 Description-Content-Type: text/x-rst
 
pyproject.toml

@@ -1,20 +1,36 @@
-[tool.poetry]
+[project]
 name = "ultimate-sitemap-parser"
-version = "1.0.0rc1"
+version = "1.1.0"
 description = "A performant library for parsing and crawling sitemaps"
 authors = [
-    "Linas Valiukas <linas@media.mit.edu>",
-    "Hal Roberts <hroberts@cyber.law.harvard.edu>",
-    "Freddy Heppell <f.heppell@sheffield.ac.uk>"
+    { name = "Linas Valiukas", email = "linas@media.mit.edu"},
+    { name = "Hal Roberts", email = "hroberts@cyber.law.harvard.edu"},
+    { name = "Freddy Heppell", email = "f.heppell@sheffield.ac.uk"},
 ]
 maintainers = [
-    "Freddy Heppell <f.heppell@sheffield.ac.uk>"
+    { name = "Freddy Heppell", email = "f.heppell@sheffield.ac.uk"},
+]
+license = "GPL-3.0-or-later"
+readme = "README.rst"
+keywords = ["sitemap", "crawler", "indexing", "xml", "rss", "atom", "google news"]
+dynamic = ["classifiers"]
+
+requires-python = ">=3.8"
+dependencies = [
+    "python-dateutil (>=2.7,<3.0.0)",
+    "requests (>=2.2.1,<3.0.0)"
 ]
+
+[project.urls]
 homepage = "https://ultimate-sitemap-parser.readthedocs.io/"
 documentation = "https://ultimate-sitemap-parser.readthedocs.io/"
 repository = "https://github.com/GateNLP/ultimate-sitemap-parser"
-license = "GPL-3.0-or-later"
-readme = "README.rst"
+
+[project.scripts]
+usp = 'usp.cli:main'
+
+[tool.poetry]
+requires-poetry = ">=2.0"
 classifiers=[
     'Development Status :: 5 - Production/Stable',
     'Intended Audience :: Developers',
@@ -26,24 +42,20 @@ classifiers=[
     'Topic :: Text Processing :: Indexing',
     'Topic :: Text Processing :: Markup :: XML',
 ]
-keywords = ["sitemap", "crawler", "indexing", "xml", "rss", "atom", "google news"]
 packages = [
     { include = "usp" }
 ]
 
-[tool.poetry.scripts]
-usp = 'usp.cli:main'
-
 [tool.poetry.dependencies]
-python = "^3.8"
-python-dateutil = ">=2.7,<3.0.0"
-requests = ">=2.2.1"
+# Specify upper bound for locking
+python = ">=3.8,<4.0"
 
 [tool.poetry.group.dev.dependencies]
 requests-mock = ">=1.6.0,<2.0"
 pytest = "^8.3.0"
 ruff = "^0.6.1"
 vcrpy = "6.0.1"
+pytest-mock = "^3.14.0"
 
 [tool.poetry.group.perf]
 optional = true
usp/fetch_parse.py

@@ -643,6 +643,7 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
             "news_keywords",
             "news_stock_tickers",
             "images",
+            "alternates",
         ]
 
         def __init__(self):
@@ -659,6 +660,7 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
             self.news_keywords = None
             self.news_stock_tickers = None
             self.images = []
+            self.alternates = []
 
         def __hash__(self):
             return hash(
@@ -763,6 +765,10 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
                 for image in self.images
             ]
 
+            alternates = None
+            if len(self.alternates) > 0:
+                alternates = self.alternates
+
             return SitemapPage(
                 url=url,
                 last_modified=last_modified,
@@ -770,6 +776,7 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
                 priority=priority,
                 news_story=sitemap_news_story,
                 images=sitemap_images,
+                alternates=alternates,
             )
 
     __slots__ = ["_current_page", "_pages", "_page_urls", "_current_image"]
@@ -801,6 +808,19 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
                     "Page is expected to be set before <image:image>."
                 )
            self._current_image = self.Image()
+        elif name == "link":
+            if not self._current_page:
+                raise SitemapXMLParsingException(
+                    "Page is expected to be set before <link>."
+                )
+            if "rel" not in attrs or attrs["rel"] != "alternate":
+                log.warning(f"<link> element is missing rel attribute: {attrs}.")
+            elif "hreflang" not in attrs or "href" not in attrs:
+                log.warning(
+                    f"<link> element is missing hreflang or href attributes: {attrs}."
+                )
+            else:
+                self._current_page.alternates.append((attrs["hreflang"], attrs["href"]))
 
     def __require_last_char_data_to_be_set(self, name: str) -> None:
         if not self._last_char_data:
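
The new <link> handling above collects xhtml:link alternate elements into (hreflang, href) tuples on the current page. A minimal sketch of the markup it now recognises and the tuples it should yield (URLs and language codes are made-up placeholders):

    # One <url> entry of the kind the updated parser now handles.
    SITEMAP_ENTRY = """
    <url>
      <loc>https://www.example.com/page</loc>
      <xhtml:link rel="alternate" hreflang="fr" href="https://www.example.com/fr/page"/>
      <xhtml:link rel="alternate" hreflang="de" href="https://www.example.com/de/page"/>
    </url>
    """

    # Each qualifying <xhtml:link> becomes a (hreflang, href) tuple on the page:
    expected_alternates = [
        ("fr", "https://www.example.com/fr/page"),
        ("de", "https://www.example.com/de/page"),
    ]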
usp/objects/page.py

@@ -3,7 +3,7 @@
 import datetime
 from decimal import Decimal
 from enum import Enum, unique
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 SITEMAP_PAGE_DEFAULT_PRIORITY = Decimal("0.5")
 """Default sitemap page priority, as per the spec."""
@@ -331,6 +331,7 @@ class SitemapPage:
         "__change_frequency",
         "__news_story",
         "__images",
+        "__alternates",
     ]
 
     def __init__(
@@ -341,6 +342,7 @@
         change_frequency: Optional[SitemapPageChangeFrequency] = None,
         news_story: Optional[SitemapNewsStory] = None,
         images: Optional[List[SitemapImage]] = None,
+        alternates: Optional[List[Tuple[str, str]]] = None,
     ):
         """
         Initialize a new sitemap-derived page.
@@ -357,6 +359,7 @@
         self.__change_frequency = change_frequency
         self.__news_story = news_story
         self.__images = images
+        self.__alternates = alternates
 
     def __eq__(self, other) -> bool:
         if not isinstance(other, SitemapPage):
@@ -380,6 +383,9 @@
         if self.images != other.images:
             return False
 
+        if self.alternates != other.alternates:
+            return False
+
         return True
 
     def __hash__(self):
@@ -442,10 +448,30 @@
 
     @property
     def news_story(self) -> Optional[SitemapNewsStory]:
-        """Get the Google News story attached to the URL."""
+        """Get the Google News story attached to the URL.
+
+        See :ref:`google-news-ext` reference
+        """
         return self.__news_story
 
     @property
     def images(self) -> Optional[List[SitemapImage]]:
-        """Get the images attached to the URL."""
+        """Get the images attached to the URL.
+
+        See :ref:`google-image-ext` reference
+        """
         return self.__images
+
+    @property
+    def alternates(self) -> Optional[List[Tuple[str, str]]]:
+        """Get the alternate URLs for the URL.
+
+        A tuple of (language code, URL) for each ``<xhtml:link>`` element with ``rel="alternate"`` attribute.
+
+        See :ref:`sitemap-extra-localisation` reference
+
+        Example::
+
+            [('fr', 'https://www.example.com/fr/page'), ('de', 'https://www.example.com/de/page')]
+        """
+        return self.__alternates
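
A short usage sketch for the new alternates property, assuming the library's usual sitemap_tree_for_homepage entry point (the homepage URL is a placeholder):

    from usp.tree import sitemap_tree_for_homepage

    tree = sitemap_tree_for_homepage("https://www.example.com/")
    for page in tree.all_pages():
        # alternates is None when the entry had no <xhtml:link rel="alternate"> elements.
        if page.alternates:
            for hreflang, href in page.alternates:
                print(f"{page.url} -> {href} ({hreflang})")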
usp/web_client/abstract_client.py

@@ -1,7 +1,9 @@
 """Abstract web client class."""
 
 import abc
+import random
 from http import HTTPStatus
+import time
 from typing import Optional
 
 RETRYABLE_HTTP_STATUS_CODES = {
@@ -187,3 +189,36 @@ class LocalWebClient(AbstractWebClient):
 
     def get(self, url: str) -> AbstractWebClientResponse:
         raise NoWebClientException
+
+
+class RequestWaiter:
+    """
+    Manages waiting between requests.
+    """
+
+    def __init__(self, wait: Optional[float] = None, random_wait: bool = True):
+        """
+        :param wait: time to wait between requests, in seconds.
+        :param random_wait: if true, wait time is multiplied by a random number between 0.5 and 1.5.
+        """
+        self.wait_s = wait or 0
+        self.random_wait = random_wait
+        self.is_first = True
+
+    def wait(self) -> None:
+        """Perform a wait if needed. Should be called before each request.
+
+        Will skip wait if this is the first request.
+        """
+        if self.wait_s == 0:
+            return
+
+        if self.is_first:
+            self.is_first = False
+            return
+
+        wait_f = 1.0
+        if self.random_wait:
+            wait_f = random.uniform(0.5, 1.5)
+
+        time.sleep(self.wait_s * wait_f)
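
The RequestWaiter added here centralises politeness delays for web clients: the first call to wait() returns immediately, and later calls sleep for roughly the configured interval, optionally jittered. An illustrative sketch (the sitemap URLs are placeholders):

    from usp.web_client.abstract_client import RequestWaiter

    waiter = RequestWaiter(wait=1.0, random_wait=True)

    for url in ("https://example.com/sitemap1.xml", "https://example.com/sitemap2.xml"):
        # No delay before the first request; roughly 0.5-1.5 s before each later one.
        waiter.wait()
        # ... fetch `url` with whatever HTTP client the caller uses ...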
usp/web_client/requests_client.py

@@ -1,6 +1,7 @@
 """Implementation of :mod:`usp.web_client.abstract_client` with Requests."""
 
 from http import HTTPStatus
+import logging
 from typing import Optional, Dict, Tuple, Union
 
 import requests
@@ -9,6 +10,7 @@ from .abstract_client import (
     AbstractWebClient,
     AbstractWebClientResponse,
     AbstractWebClientSuccessResponse,
+    RequestWaiter,
     WebClientErrorResponse,
     RETRYABLE_HTTP_STATUS_CODES,
 )
@@ -78,16 +80,27 @@ class RequestsWebClient(AbstractWebClient):
     Some webservers might be generating huge sitemaps on the fly, so this is why it's rather big.
     """
 
-    __slots__ = ["__max_response_data_length", "__timeout", "__proxies", "__verify"]
+    __slots__ = [
+        "__max_response_data_length",
+        "__timeout",
+        "__proxies",
+        "__verify",
+        "__waiter",
+    ]
 
-    def __init__(self, verify=True):
+    def __init__(
+        self, verify=True, wait: Optional[float] = None, random_wait: bool = False
+    ):
         """
         :param verify: whether certificates should be verified for HTTPS requests.
+        :param wait: time to wait between requests, in seconds.
+        :param random_wait: if true, wait time is multiplied by a random number between 0.5 and 1.5.
         """
         self.__max_response_data_length = None
         self.__timeout = self.__HTTP_REQUEST_TIMEOUT
         self.__proxies = {}
         self.__verify = verify
+        self.__waiter = RequestWaiter(wait, random_wait)
 
     def set_timeout(self, timeout: Union[int, Tuple[int, int], None]) -> None:
         """Set HTTP request timeout.
@@ -114,6 +127,7 @@
         self.__max_response_data_length = max_response_data_length
 
     def get(self, url: str) -> AbstractWebClientResponse:
+        self.__waiter.wait()
         try:
             response = requests.get(
                 url,
@@ -139,6 +153,7 @@
             )
         else:
             message = f"{response.status_code} {response.reason}"
+            logging.info(f"Response content: {response.text}")
 
         if response.status_code in RETRYABLE_HTTP_STATUS_CODES:
             return RequestsWebClientErrorResponse(
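
With the constructor changes above, the bundled Requests client can be asked to pause between fetches. A hedged sketch, assuming sitemap_tree_for_homepage accepts a custom web_client argument (the URL is a placeholder):

    from usp.tree import sitemap_tree_for_homepage
    from usp.web_client.requests_client import RequestsWebClient

    # Wait about one second between requests, scaled by a random factor in [0.5, 1.5].
    client = RequestsWebClient(wait=1.0, random_wait=True)
    tree = sitemap_tree_for_homepage("https://www.example.com/", web_client=client)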