ultimate-sitemap-parser 1.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ultimate-sitemap-parser might be problematic.
- ultimate_sitemap_parser-1.0.0rc1.dist-info/LICENSE +674 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/METADATA +109 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/NOTICE +12 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/RECORD +22 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/WHEEL +4 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/entry_points.txt +3 -0
- usp/__init__.py +5 -0
- usp/cli/__init__.py +1 -0
- usp/cli/_ls.py +105 -0
- usp/cli/_util.py +21 -0
- usp/cli/cli.py +27 -0
- usp/exceptions.py +35 -0
- usp/fetch_parse.py +1182 -0
- usp/helpers.py +293 -0
- usp/log.py +77 -0
- usp/objects/__init__.py +0 -0
- usp/objects/page.py +451 -0
- usp/objects/sitemap.py +436 -0
- usp/tree.py +114 -0
- usp/web_client/__init__.py +0 -0
- usp/web_client/abstract_client.py +189 -0
- usp/web_client/requests_client.py +150 -0
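usp/tree.py provides the library's entry point. Assuming the public API documented for ultimate-sitemap-parser (sitemap_tree_for_homepage and SitemapPage.url; not shown in this diff), typical use of the wheel looks like the following sketch, with example.com as a placeholder:

    from usp.tree import sitemap_tree_for_homepage

    # Discover and parse all sitemaps reachable from a homepage.
    tree = sitemap_tree_for_homepage("https://www.example.com/")

    # Iterate over every page found across all nested sitemaps.
    for page in tree.all_pages():
        print(page.url)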
usp/helpers.py
ADDED
@@ -0,0 +1,293 @@
+"""Helper utilities."""
+
+import datetime
+import gzip as gzip_lib
+import html
+import re
+import sys
+import time
+from typing import Optional
+from urllib.parse import urlparse, unquote_plus, urlunparse
+from dateutil.parser import parse as dateutil_parse
+from dateutil.parser import isoparse as dateutil_isoparse
+
+from .exceptions import SitemapException, GunzipException, StripURLToHomepageException
+from .log import create_logger
+from .web_client.abstract_client import (
+    AbstractWebClient,
+    AbstractWebClientSuccessResponse,
+    WebClientErrorResponse,
+    AbstractWebClientResponse,
+)
+
+log = create_logger(__name__)
+
+__URL_REGEX = re.compile(r"^https?://[^\s/$.?#].[^\s]*$", re.IGNORECASE)
+"""Regular expression to match HTTP(s) URLs."""
+
+HAS_DATETIME_NEW_ISOPARSER = sys.version_info >= (3, 11)
+
+
+def is_http_url(url: str) -> bool:
+    """
+    Returns true if URL is of the "http" ("https") scheme.
+
+    :param url: URL to test.
+    :return: True if argument URL is of the "http" ("https") scheme.
+    """
+    if url is None:
+        log.debug("URL is None")
+        return False
+    if len(url) == 0:
+        log.debug("URL is empty")
+        return False
+
+    log.debug(f"Testing if URL '{url}' is HTTP(s) URL")
+
+    if not re.search(__URL_REGEX, url):
+        log.debug(f"URL '{url}' does not match URL's regexp")
+        return False
+
+    try:
+        # Try parsing the URL
+        uri = urlparse(url)
+        _ = urlunparse(uri)
+
+    except Exception as ex:
+        log.debug(f"Cannot parse URL {url}: {ex}")
+        return False
+
+    if not uri.scheme:
+        log.debug(f"Scheme is undefined for URL {url}.")
+        return False
+    if uri.scheme.lower() not in ["http", "https"]:
+        log.debug(f"Scheme is not HTTP(s) for URL {url}.")
+        return False
+    if not uri.hostname:
+        log.debug(f"Host is undefined for URL {url}.")
+        return False
+
+    return True
+
+
+def html_unescape_strip(string: Optional[str]) -> Optional[str]:
+    """
+    Decode HTML entities, strip string, set to None if it's empty; ignore None as input.
+
+    :param string: String to decode HTML entities in.
+    :return: Stripped string with HTML entities decoded; None if parameter string was empty or None.
+    """
+    if string:
+        string = html.unescape(string)
+        string = string.strip()
+        if not string:
+            string = None
+    return string
+
+
+def parse_iso8601_date(date_string: str) -> Optional[datetime.datetime]:
+    """
+    Parse ISO 8601 date (e.g. from sitemap's <publication_date>) into datetime.datetime object.
+
+    :param date_string: ISO 8601 date, e.g. "2018-01-12T21:57:27Z" or "1997-07-16T19:20:30+01:00".
+    :return: datetime.datetime object of a parsed date.
+    """
+    # FIXME parse known date formats faster
+
+    if not date_string:
+        raise SitemapException("Date string is unset.")
+
+    try:
+        if HAS_DATETIME_NEW_ISOPARSER:
+            # From Python 3.11, fromisoformat is able to parse nearly any valid ISO 8601 string
+            return datetime.datetime.fromisoformat(date_string)
+        # Try the more efficient ISO 8601 parser
+        return dateutil_isoparse(date_string)
+    except ValueError:
+        pass
+
+    # Try the less efficient general parser
+    try:
+        return dateutil_parse(date_string)
+    except ValueError:
+        return None
+
+
+def parse_rfc2822_date(date_string: str) -> Optional[datetime.datetime]:
+    """
+    Parse RFC 2822 date (e.g. from Atom's <issued>) into datetime.datetime object.
+
+    :param date_string: RFC 2822 date, e.g. "Tue, 10 Aug 2010 20:43:53 -0000".
+    :return: datetime.datetime object of a parsed date.
+    """
+    if not date_string:
+        raise SitemapException("Date string is unset.")
+
+    try:
+        return dateutil_parse(date_string)
+    except ValueError:
+        return None
+
+
+def get_url_retry_on_client_errors(
+    url: str,
+    web_client: AbstractWebClient,
+    retry_count: int = 5,
+    sleep_between_retries: int = 1,
+) -> AbstractWebClientResponse:
+    """
+    Fetch URL, retry on retryable errors.
+
+    :param url: URL to fetch.
+    :param web_client: Web client object to use for fetching.
+    :param retry_count: How many times to retry fetching the same URL.
+    :param sleep_between_retries: How long to sleep between retries, in seconds.
+    :return: Web client response object.
+    """
+    assert retry_count > 0, "Retry count must be positive."
+
+    response = None
+    for retry in range(0, retry_count):
+        log.info(f"Fetching URL {url}...")
+        response = web_client.get(url)
+
+        if isinstance(response, WebClientErrorResponse):
+            log.warning(f"Request for URL {url} failed: {response.message()}")
+
+            if response.retryable():
+                log.info(f"Retrying URL {url} in {sleep_between_retries} seconds...")
+                time.sleep(sleep_between_retries)
+
+            else:
+                log.info(f"Not retrying for URL {url}")
+                return response
+
+        else:
+            return response
+
+    log.info(f"Giving up on URL {url}")
+    return response
+
+
+def __response_is_gzipped_data(
+    url: str, response: AbstractWebClientSuccessResponse
+) -> bool:
+    """
+    Return True if Response looks like it's gzipped.
+
+    :param url: URL the response was fetched from.
+    :param response: Response object.
+    :return: True if response looks like it might contain gzipped data.
+    """
+    uri = urlparse(url)
+    url_path = unquote_plus(uri.path)
+    content_type = response.header("content-type") or ""
+    content_encoding = response.header("content-encoding") or ""
+
+    if (
+        url_path.lower().endswith(".gz")
+        or "gzip" in content_type.lower()
+        or "gzip" in content_encoding.lower()
+    ):
+        return True
+
+    else:
+        return False
+
+
+def gunzip(data: bytes) -> bytes:
+    """
+    Gunzip data.
+
+    :raises GunzipException: If the data cannot be decompressed.
+    :param data: Gzipped data.
+    :return: Gunzipped data.
+    """
+
+    if data is None:
+        raise GunzipException("Data is None.")
+
+    if not isinstance(data, bytes):
+        raise GunzipException(f"Data is not bytes: {str(data)}")
+
+    if len(data) == 0:
+        raise GunzipException(
+            "Data is empty (no way an empty string is a valid Gzip archive)."
+        )
+
+    try:
+        gunzipped_data = gzip_lib.decompress(data)
+    except Exception as ex:
+        raise GunzipException(f"Unable to gunzip data: {str(ex)}")
+
+    if gunzipped_data is None:
+        raise GunzipException("Gunzipped data is None.")
+
+    if not isinstance(gunzipped_data, bytes):
+        raise GunzipException("Gunzipped data is not bytes.")
+
+    return gunzipped_data
+
+
+def ungzipped_response_content(
+    url: str, response: AbstractWebClientSuccessResponse
+) -> str:
+    """
+    Return HTTP response's decoded content, gunzip it if necessary.
+
+    :param url: URL the response was fetched from.
+    :param response: Response object.
+    :return: Decoded and (if necessary) gunzipped response string.
+    """
+
+    data = response.raw_data()
+
+    if __response_is_gzipped_data(url=url, response=response):
+        try:
+            data = gunzip(data)
+        except GunzipException as ex:
+            # In case of an error, just assume that it's one of the non-gzipped sitemaps with ".gz" extension
+            log.error(
+                f"Unable to gunzip response {response}, maybe it's a non-gzipped sitemap: {ex}"
+            )
+
+    # FIXME other encodings
+    data = data.decode("utf-8-sig", errors="replace")
+
+    assert isinstance(data, str)
+
+    return data
+
+
+def strip_url_to_homepage(url: str) -> str:
+    """
+    Strip URL to its homepage.
+
+    :raises StripURLToHomepageException: If URL is empty or cannot be parsed.
+
+    :param url: URL to strip, e.g. "http://www.example.com/page.html".
+    :return: Stripped homepage URL, e.g. "http://www.example.com/"
+    """
+    if not url:
+        raise StripURLToHomepageException("URL is empty.")
+
+    try:
+        uri = urlparse(url)
+        assert uri.scheme, "Scheme must be set."
+        assert uri.scheme.lower() in [
+            "http",
+            "https",
+        ], "Scheme must be http:// or https://"
+        uri = (
+            uri.scheme,
+            uri.netloc,
+            "/",  # path
+            "",  # params
+            "",  # query
+            "",  # fragment
+        )
+        url = urlunparse(uri)
+    except Exception as ex:
+        raise StripURLToHomepageException(f"Unable to parse URL {url}: {ex}")
+
+    return url
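For reference, a minimal sketch of how a few of these helpers fit together; the URL, title, and date strings are illustrative values, not part of the release:

    from usp.helpers import (
        html_unescape_strip,
        is_http_url,
        parse_iso8601_date,
        strip_url_to_homepage,
    )

    # Hypothetical sitemap URL, used only for illustration.
    url = "https://www.example.com/news/sitemap.xml"

    if is_http_url(url):
        homepage = strip_url_to_homepage(url)  # "https://www.example.com/"

    # Decode HTML entities and trim whitespace: returns "Fish & Chips".
    title = html_unescape_strip("  Fish &amp; Chips  ")

    # Returns a timezone-aware datetime.datetime.
    last_modified = parse_iso8601_date("2018-01-12T21:57:27Z")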
usp/log.py
ADDED
@@ -0,0 +1,77 @@
+"""Logging utilities."""
+
+import logging
+
+
+class Logger:
+    """
+    Logging helper class.
+    """
+
+    __LEVELS = {
+        "CRITICAL": logging.CRITICAL,
+        "ERROR": logging.ERROR,
+        "WARNING": logging.WARNING,
+        "INFO": logging.INFO,
+        "DEBUG": logging.DEBUG,
+    }
+    """Valid logging levels and their "logging" counterparts."""
+
+    __DEFAULT_LEVEL = "INFO"
+    """Default logging level."""
+
+    __slots__ = [
+        # "logging" object
+        "__l",
+    ]
+
+    def __init__(self, name: str):
+        """
+        Initialize logger object for a given name.
+
+        :param name: Module name that the logger should be initialized for.
+        """
+
+        self.__l = logging.getLogger(name)
+
+    def error(self, message: str) -> None:
+        """
+        Log error message.
+
+        :param message: Message to log.
+        """
+        self.__l.error(message)
+
+    def warning(self, message: str) -> None:
+        """
+        Log warning message.
+
+        :param message: Message to log.
+        """
+        self.__l.warning(message)
+
+    def info(self, message: str) -> None:
+        """
+        Log informational message.
+
+        :param message: Message to log.
+        """
+        self.__l.info(message)
+
+    def debug(self, message: str) -> None:
+        """
+        Log debugging message.
+
+        :param message: Message to log.
+        """
+        self.__l.debug(message)
+
+
+def create_logger(name: str) -> Logger:
+    """
+    Create and return Logger object.
+
+    :param name: Module name that the logger should be initialized for.
+    :return: Logger object.
+    """
+    return Logger(name=name)
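The Logger class is a thin facade over the standard library: create_logger(name) wraps logging.getLogger(name) and forwards each call, without installing handlers or setting levels itself. A minimal usage sketch, with handler setup left to the host application as the code implies; the message text is illustrative:

    import logging

    from usp.log import create_logger

    # The wrapper configures nothing, so set up handlers/levels explicitly.
    logging.basicConfig(level=logging.DEBUG)

    log = create_logger(__name__)
    log.info("Fetching sitemap...")  # delegates to logging.getLogger(__name__).info()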
usp/objects/__init__.py
ADDED
File without changes