ultimate-sitemap-parser 1.0.0rc1.tar.gz → 1.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ultimate-sitemap-parser has been flagged as possibly problematic.
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/PKG-INFO +6 -5
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/pyproject.toml +27 -15
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/fetch_parse.py +20 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/objects/page.py +29 -3
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/web_client/abstract_client.py +35 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/web_client/requests_client.py +17 -2
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/LICENSE +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/NOTICE +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/README.rst +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/__init__.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/cli/__init__.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/cli/_ls.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/cli/_util.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/cli/cli.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/exceptions.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/helpers.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/log.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/objects/__init__.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/objects/sitemap.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/tree.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/web_client/__init__.py +0 -0
{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/PKG-INFO
RENAMED

@@ -1,15 +1,14 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.3
 Name: ultimate-sitemap-parser
-Version: 1.0.0rc1
+Version: 1.1.0
 Summary: A performant library for parsing and crawling sitemaps
-Home-page: https://ultimate-sitemap-parser.readthedocs.io/
 License: GPL-3.0-or-later
 Keywords: sitemap,crawler,indexing,xml,rss,atom,google news
 Author: Linas Valiukas
 Author-email: linas@media.mit.edu
 Maintainer: Freddy Heppell
 Maintainer-email: f.heppell@sheffield.ac.uk
-Requires-Python: >=3.8,<4.0
+Requires-Python: >=3.8
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Information Technology
@@ -22,12 +21,14 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
 Classifier: Topic :: Text Processing :: Indexing
 Classifier: Topic :: Text Processing :: Markup :: XML
 Requires-Dist: python-dateutil (>=2.7,<3.0.0)
-Requires-Dist: requests (>=2.2.1)
+Requires-Dist: requests (>=2.2.1,<3.0.0)
 Project-URL: Documentation, https://ultimate-sitemap-parser.readthedocs.io/
+Project-URL: Homepage, https://ultimate-sitemap-parser.readthedocs.io/
 Project-URL: Repository, https://github.com/GateNLP/ultimate-sitemap-parser
 Description-Content-Type: text/x-rst
 
{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/pyproject.toml
RENAMED

@@ -1,20 +1,36 @@
-[tool.poetry]
+[project]
 name = "ultimate-sitemap-parser"
-version = "1.0.0rc1"
+version = "1.1.0"
 description = "A performant library for parsing and crawling sitemaps"
 authors = [
-    "Linas Valiukas <linas@media.mit.edu>",
-    "Hal Roberts <hroberts@cyber.law.harvard.edu>",
-    "Freddy Heppell <f.heppell@sheffield.ac.uk>",
+    { name = "Linas Valiukas", email = "linas@media.mit.edu"},
+    { name = "Hal Roberts", email = "hroberts@cyber.law.harvard.edu"},
+    { name = "Freddy Heppell", email = "f.heppell@sheffield.ac.uk"},
 ]
 maintainers = [
-    "Freddy Heppell <f.heppell@sheffield.ac.uk>",
+    { name = "Freddy Heppell", email = "f.heppell@sheffield.ac.uk"},
+]
+license = "GPL-3.0-or-later"
+readme = "README.rst"
+keywords = ["sitemap", "crawler", "indexing", "xml", "rss", "atom", "google news"]
+dynamic = ["classifiers"]
+
+requires-python = ">=3.8"
+dependencies = [
+    "python-dateutil (>=2.7,<3.0.0)",
+    "requests (>=2.2.1,<3.0.0)"
 ]
+
+[project.urls]
 homepage = "https://ultimate-sitemap-parser.readthedocs.io/"
 documentation = "https://ultimate-sitemap-parser.readthedocs.io/"
 repository = "https://github.com/GateNLP/ultimate-sitemap-parser"
-license = "GPL-3.0-or-later"
-readme = "README.rst"
+
+[project.scripts]
+usp = 'usp.cli:main'
+
+[tool.poetry]
+requires-poetry = ">=2.0"
 classifiers=[
     'Development Status :: 5 - Production/Stable',
     'Intended Audience :: Developers',
@@ -26,24 +42,20 @@ classifiers=[
     'Topic :: Text Processing :: Indexing',
     'Topic :: Text Processing :: Markup :: XML',
 ]
-keywords = ["sitemap", "crawler", "indexing", "xml", "rss", "atom", "google news"]
 packages = [
     { include = "usp" }
 ]
 
-[tool.poetry.scripts]
-usp = 'usp.cli:main'
-
 [tool.poetry.dependencies]
-python = ">=3.8,<4.0"
-python-dateutil = ">=2.7,<3.0.0"
-requests = ">=2.2.1"
+# Specify upper bound for locking
+python = ">=3.8,<4.0"
 
 [tool.poetry.group.dev.dependencies]
 requests-mock = ">=1.6.0,<2.0"
 pytest = "^8.3.0"
 ruff = "^0.6.1"
 vcrpy = "6.0.1"
+pytest-mock = "^3.14.0"
 
 [tool.poetry.group.perf]
 optional = true
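The packaging change above migrates the project metadata from Poetry's own [tool.poetry] table to the standard PEP 621 [project] table, built with Poetry 2 (hence requires-poetry = ">=2.0" and the jump from Metadata-Version 2.1 to 2.3), and adds an upper bound to the requests dependency. A quick way to verify the published metadata once 1.1.0 is installed is the standard library's importlib.metadata; the snippet below is an illustrative check, not part of the package:

    from importlib.metadata import metadata, requires

    meta = metadata("ultimate-sitemap-parser")
    print(meta["Version"])           # expected: 1.1.0
    print(meta["Metadata-Version"])  # expected: 2.3
    # The requests pin should now read 'requests (>=2.2.1,<3.0.0)':
    print(requires("ultimate-sitemap-parser"))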
{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/fetch_parse.py
RENAMED

@@ -643,6 +643,7 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
             "news_keywords",
             "news_stock_tickers",
             "images",
+            "alternates",
         ]
 
         def __init__(self):
@@ -659,6 +660,7 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
            self.news_keywords = None
            self.news_stock_tickers = None
            self.images = []
+           self.alternates = []
 
        def __hash__(self):
            return hash(
@@ -763,6 +765,10 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
                for image in self.images
            ]
 
+           alternates = None
+           if len(self.alternates) > 0:
+               alternates = self.alternates
+
            return SitemapPage(
                url=url,
                last_modified=last_modified,
@@ -770,6 +776,7 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
                priority=priority,
                news_story=sitemap_news_story,
                images=sitemap_images,
+               alternates=alternates,
            )
 
    __slots__ = ["_current_page", "_pages", "_page_urls", "_current_image"]
@@ -801,6 +808,19 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
                    "Page is expected to be set before <image:image>."
                )
            self._current_image = self.Image()
+       elif name == "link":
+           if not self._current_page:
+               raise SitemapXMLParsingException(
+                   "Page is expected to be set before <link>."
+               )
+           if "rel" not in attrs or attrs["rel"] != "alternate":
+               log.warning(f"<link> element is missing rel attribute: {attrs}.")
+           elif "hreflang" not in attrs or "href" not in attrs:
+               log.warning(
+                   f"<link> element is missing hreflang or href attributes: {attrs}."
+               )
+           else:
+               self._current_page.alternates.append((attrs["hreflang"], attrs["href"]))
 
    def __require_last_char_data_to_be_set(self, name: str) -> None:
        if not self._last_char_data:
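The new "link" branch above parses hreflang alternates (the sitemap localisation extension): for each <xhtml:link rel="alternate" hreflang="..." href="..."/> element inside a <url> entry, it appends an (hreflang, href) tuple to the page, warning and skipping elements that lack the required attributes. A minimal crawl sketch using usp's existing top-level API (the example.com URLs are placeholders):

    from usp.tree import sitemap_tree_for_homepage

    tree = sitemap_tree_for_homepage("https://www.example.com/")
    for page in tree.all_pages():
        if page.alternates:  # None unless the page declared any alternates
            for hreflang, href in page.alternates:
                print(page.url, hreflang, href)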
{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/objects/page.py
RENAMED

@@ -3,7 +3,7 @@
 import datetime
 from decimal import Decimal
 from enum import Enum, unique
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 SITEMAP_PAGE_DEFAULT_PRIORITY = Decimal("0.5")
 """Default sitemap page priority, as per the spec."""
@@ -331,6 +331,7 @@ class SitemapPage:
        "__change_frequency",
        "__news_story",
        "__images",
+       "__alternates",
    ]
 
    def __init__(
@@ -341,6 +342,7 @@ class SitemapPage:
        change_frequency: Optional[SitemapPageChangeFrequency] = None,
        news_story: Optional[SitemapNewsStory] = None,
        images: Optional[List[SitemapImage]] = None,
+       alternates: Optional[List[Tuple[str, str]]] = None,
    ):
        """
        Initialize a new sitemap-derived page.
@@ -357,6 +359,7 @@ class SitemapPage:
        self.__change_frequency = change_frequency
        self.__news_story = news_story
        self.__images = images
+       self.__alternates = alternates
 
    def __eq__(self, other) -> bool:
        if not isinstance(other, SitemapPage):
@@ -380,6 +383,9 @@ class SitemapPage:
        if self.images != other.images:
            return False
 
+       if self.alternates != other.alternates:
+           return False
+
        return True
 
    def __hash__(self):
@@ -442,10 +448,30 @@ class SitemapPage:
 
    @property
    def news_story(self) -> Optional[SitemapNewsStory]:
-       """Get the Google News story attached to the URL."""
+       """Get the Google News story attached to the URL.
+
+       See :ref:`google-news-ext` reference
+       """
        return self.__news_story
 
    @property
    def images(self) -> Optional[List[SitemapImage]]:
-       """Get the images attached to the URL."""
+       """Get the images attached to the URL.
+
+       See :ref:`google-image-ext` reference
+       """
        return self.__images
+
+   @property
+   def alternates(self) -> Optional[List[Tuple[str, str]]]:
+       """Get the alternate URLs for the URL.
+
+       A tuple of (language code, URL) for each ``<xhtml:link>`` element with ``rel="alternate"`` attribute.
+
+       See :ref:`sitemap-extra-localisation` reference
+
+       Example::
+
+           [('fr', 'https://www.example.com/fr/page'), ('de', 'https://www.example.com/de/page')]
+       """
+       return self.__alternates
{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/web_client/abstract_client.py
RENAMED

@@ -1,7 +1,9 @@
 """Abstract web client class."""
 
 import abc
+import random
 from http import HTTPStatus
+import time
 from typing import Optional
 
 RETRYABLE_HTTP_STATUS_CODES = {
@@ -187,3 +189,36 @@ class LocalWebClient(AbstractWebClient):
 
    def get(self, url: str) -> AbstractWebClientResponse:
        raise NoWebClientException
+
+
+class RequestWaiter:
+    """
+    Manages waiting between requests.
+    """
+
+    def __init__(self, wait: Optional[float] = None, random_wait: bool = True):
+        """
+        :param wait: time to wait between requests, in seconds.
+        :param random_wait: if true, wait time is multiplied by a random number between 0.5 and 1.5.
+        """
+        self.wait_s = wait or 0
+        self.random_wait = random_wait
+        self.is_first = True
+
+    def wait(self) -> None:
+        """Perform a wait if needed. Should be called before each request.
+
+        Will skip wait if this is the first request.
+        """
+        if self.wait_s == 0:
+            return
+
+        if self.is_first:
+            self.is_first = False
+            return
+
+        wait_f = 1.0
+        if self.random_wait:
+            wait_f = random.uniform(0.5, 1.5)
+
+        time.sleep(self.wait_s * wait_f)
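RequestWaiter is self-contained, so its behaviour can be read straight off the diff: the first call returns immediately, wait=None or 0 disables sleeping entirely, and random_wait scales each sleep by a uniform factor in [0.5, 1.5]. Note that RequestWaiter itself defaults random_wait to True, whereas RequestsWebClient (below) passes False unless asked. A usage sketch:

    from usp.web_client.abstract_client import RequestWaiter

    waiter = RequestWaiter(wait=2.0, random_wait=True)
    waiter.wait()  # first request: returns immediately (is_first short-circuits)
    waiter.wait()  # sleeps 2.0 * uniform(0.5, 1.5) seconds, i.e. 1-3 s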
{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.0}/usp/web_client/requests_client.py
RENAMED

@@ -1,6 +1,7 @@
 """Implementation of :mod:`usp.web_client.abstract_client` with Requests."""
 
 from http import HTTPStatus
+import logging
 from typing import Optional, Dict, Tuple, Union
 
 import requests
@@ -9,6 +10,7 @@ from .abstract_client import (
    AbstractWebClient,
    AbstractWebClientResponse,
    AbstractWebClientSuccessResponse,
+   RequestWaiter,
    WebClientErrorResponse,
    RETRYABLE_HTTP_STATUS_CODES,
 )
@@ -78,16 +80,27 @@ class RequestsWebClient(AbstractWebClient):
    Some webservers might be generating huge sitemaps on the fly, so this is why it's rather big.
    """
 
-   __slots__ = ["__max_response_data_length", "__timeout", "__proxies", "__verify"]
+   __slots__ = [
+       "__max_response_data_length",
+       "__timeout",
+       "__proxies",
+       "__verify",
+       "__waiter",
+   ]
 
-   def __init__(self, verify=True):
+   def __init__(
+       self, verify=True, wait: Optional[float] = None, random_wait: bool = False
+   ):
        """
        :param verify: whether certificates should be verified for HTTPS requests.
+       :param wait: time to wait between requests, in seconds.
+       :param random_wait: if true, wait time is multiplied by a random number between 0.5 and 1.5.
        """
        self.__max_response_data_length = None
        self.__timeout = self.__HTTP_REQUEST_TIMEOUT
        self.__proxies = {}
        self.__verify = verify
+       self.__waiter = RequestWaiter(wait, random_wait)
 
    def set_timeout(self, timeout: Union[int, Tuple[int, int], None]) -> None:
        """Set HTTP request timeout.
@@ -114,6 +127,7 @@ class RequestsWebClient(AbstractWebClient):
        self.__max_response_data_length = max_response_data_length
 
    def get(self, url: str) -> AbstractWebClientResponse:
+       self.__waiter.wait()
        try:
            response = requests.get(
                url,
@@ -139,6 +153,7 @@ class RequestsWebClient(AbstractWebClient):
            )
        else:
            message = f"{response.status_code} {response.reason}"
+           logging.info(f"Response content: {response.text}")
 
            if response.status_code in RETRYABLE_HTTP_STATUS_CODES:
                return RequestsWebClientErrorResponse(
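Putting the pieces together: RequestsWebClient now sleeps between consecutive get() calls via the shared RequestWaiter and logs error response bodies at INFO level. A sketch of a rate-limited crawl; passing the client through sitemap_tree_for_homepage's web_client parameter is usp's existing mechanism for swapping clients:

    from usp.tree import sitemap_tree_for_homepage
    from usp.web_client.requests_client import RequestsWebClient

    # Roughly 0.5-1.5 s between requests (wait=1.0 scaled by the random factor).
    client = RequestsWebClient(wait=1.0, random_wait=True)
    tree = sitemap_tree_for_homepage("https://www.example.com/", web_client=client)
    print(sum(1 for _ in tree.all_pages()))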