ultimate-sitemap-parser 1.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ultimate-sitemap-parser might be problematic. Click here for more details.
- ultimate_sitemap_parser-1.0.0rc1.dist-info/LICENSE +674 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/METADATA +109 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/NOTICE +12 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/RECORD +22 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/WHEEL +4 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/entry_points.txt +3 -0
- usp/__init__.py +5 -0
- usp/cli/__init__.py +1 -0
- usp/cli/_ls.py +105 -0
- usp/cli/_util.py +21 -0
- usp/cli/cli.py +27 -0
- usp/exceptions.py +35 -0
- usp/fetch_parse.py +1182 -0
- usp/helpers.py +293 -0
- usp/log.py +77 -0
- usp/objects/__init__.py +0 -0
- usp/objects/page.py +451 -0
- usp/objects/sitemap.py +436 -0
- usp/tree.py +114 -0
- usp/web_client/__init__.py +0 -0
- usp/web_client/abstract_client.py +189 -0
- usp/web_client/requests_client.py +150 -0
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""Abstract web client class."""
|
|
2
|
+
|
|
3
|
+
import abc
|
|
4
|
+
from http import HTTPStatus
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
RETRYABLE_HTTP_STATUS_CODES = {
    # Official codes for which a retry is plausibly useful:
    # * 400: some servers return "Bad Request" initially but start working on retry
    # * 408: the request timed out, so trying again is reasonable
    # * 429: rate limited -- makes sense to wait a bit and retry
    # * 500 / 502 / 503 / 504: transient server or upstream gateway trouble
    int(code)
    for code in (
        HTTPStatus.BAD_REQUEST,
        HTTPStatus.REQUEST_TIMEOUT,
        HTTPStatus.TOO_MANY_REQUESTS,
        HTTPStatus.INTERNAL_SERVER_ERROR,
        HTTPStatus.BAD_GATEWAY,
        HTTPStatus.SERVICE_UNAVAILABLE,
        HTTPStatus.GATEWAY_TIMEOUT,
    )
} | {
    # Unofficial codes that http.HTTPStatus does not define:
    499,  # (nginx) Client Closed Request
    509,  # (Apache Web Server/cPanel) Bandwidth Limit Exceeded
    520,  # (Cloudflare) Unknown Error
    521,  # (Cloudflare) Web Server Is Down
    522,  # (Cloudflare) Connection Timed Out
    523,  # (Cloudflare) Origin Is Unreachable
    524,  # (Cloudflare) A Timeout Occurred
    525,  # (Cloudflare) SSL Handshake Failed
    526,  # (Cloudflare) Invalid SSL Certificate
    527,  # (Cloudflare) Railgun Error
    530,  # (Cloudflare) Origin DNS Error
    598,  # Network read timeout error
}
"""HTTP status codes on which a request should be retried."""
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class AbstractWebClientResponse(metaclass=abc.ABCMeta):
    """
    Abstract response returned by a web client.

    Serves as the common base for both successful and error responses.
    """
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class AbstractWebClientSuccessResponse(
    AbstractWebClientResponse, metaclass=abc.ABCMeta
):
    """
    Successful (2xx) response interface.

    Concrete web clients implement these accessors on top of whatever HTTP
    library they wrap.
    """

    @abc.abstractmethod
    def status_code(self) -> int:
        """
        Return the HTTP status code of the response.

        :return: HTTP status code, e.g. 200.
        """
        raise NotImplementedError("Abstract method.")

    @abc.abstractmethod
    def status_message(self) -> str:
        """
        Return the HTTP status (reason) message of the response.

        :return: HTTP status message, e.g. "OK".
        """
        raise NotImplementedError("Abstract method.")

    @abc.abstractmethod
    def header(self, case_insensitive_name: str) -> Optional[str]:
        """
        Return the value of an HTTP header looked up case-insensitively.

        :param case_insensitive_name: HTTP header's name, e.g. "Content-Type".
        :return: The header's value, or None if the header was not set.
        """
        raise NotImplementedError("Abstract method.")

    @abc.abstractmethod
    def raw_data(self) -> bytes:
        """
        Return the encoded raw body of the response.

        :return: Encoded raw data of the response.
        """
        raise NotImplementedError("Abstract method.")
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class WebClientErrorResponse(AbstractWebClientResponse, metaclass=abc.ABCMeta):
    """
    Error response.

    Wraps a human-readable failure message plus a flag telling the fetcher
    whether retrying the request might succeed.
    """

    __slots__ = [
        "_message",
        "_retryable",
    ]

    def __init__(self, message: str, retryable: bool):
        """
        Initialize the error response.

        :param message: Message describing what went wrong.
        :param retryable: True if the request should be retried.
        """
        super().__init__()
        self._message = message
        self._retryable = retryable

    def message(self) -> str:
        """
        Return the message describing what went wrong.

        :return: Message describing what went wrong.
        """
        return self._message

    def retryable(self) -> bool:
        """
        Return whether the request should be retried.

        :return: True if the request should be retried.
        """
        return self._retryable
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class AbstractWebClient(metaclass=abc.ABCMeta):
    """
    Abstract web client used by the sitemap fetcher to retrieve URLs.
    """

    @abc.abstractmethod
    def set_max_response_data_length(
        self, max_response_data_length: Optional[int]
    ) -> None:
        """
        Set the maximum number of bytes that the web client will fetch.

        :param max_response_data_length: Maximum number of bytes that the web client will fetch, or None to fetch all.
        """
        raise NotImplementedError("Abstract method.")

    @abc.abstractmethod
    def get(self, url: str) -> AbstractWebClientResponse:
        """
        Fetch a URL and return a response.

        Implementations must not raise on connection errors (including
        timeouts); such failures are to be reported via the returned
        response object instead.

        :param url: URL to fetch.
        :return: Response object.
        """
        raise NotImplementedError("Abstract method.")
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class NoWebClientException(Exception):
    """Error indicating this web client cannot fetch pages."""
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
class LocalWebClient(AbstractWebClient):
    """Dummy web client which is a valid implementation but errors if called.

    Used for local parsing
    """

    def set_max_response_data_length(
        self, max_response_data_length: Optional[int]
    ) -> None:
        # No-op: this client never fetches anything, so there is nothing to limit.
        pass

    def get(self, url: str) -> AbstractWebClientResponse:
        # Local parsing must never reach the network; any fetch attempt is an error.
        raise NoWebClientException
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""Implementation of :mod:`usp.web_client.abstract_client` with Requests."""
|
|
2
|
+
|
|
3
|
+
from http import HTTPStatus
|
|
4
|
+
from typing import Optional, Dict, Tuple, Union
|
|
5
|
+
|
|
6
|
+
import requests
|
|
7
|
+
|
|
8
|
+
from .abstract_client import (
|
|
9
|
+
AbstractWebClient,
|
|
10
|
+
AbstractWebClientResponse,
|
|
11
|
+
AbstractWebClientSuccessResponse,
|
|
12
|
+
WebClientErrorResponse,
|
|
13
|
+
RETRYABLE_HTTP_STATUS_CODES,
|
|
14
|
+
)
|
|
15
|
+
from usp import __version__
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class RequestsWebClientSuccessResponse(AbstractWebClientSuccessResponse):
    """
    requests-based successful response.
    """

    __slots__ = [
        "__requests_response",
        "__max_response_data_length",
    ]

    def __init__(
        self,
        requests_response: requests.Response,
        max_response_data_length: Optional[int] = None,
    ):
        """
        :param requests_response: Response data
        :param max_response_data_length: Maximum data length, or ``None`` to not restrict.
        """
        self.__requests_response = requests_response
        self.__max_response_data_length = max_response_data_length

    def status_code(self) -> int:
        # Coerce to a plain int in case the library hands back an enum-like value.
        return int(self.__requests_response.status_code)

    def status_message(self) -> str:
        # Prefer the reason phrase the server sent; fall back to the standard
        # phrase for the status code if the server omitted it.
        reason = self.__requests_response.reason
        return reason if reason else HTTPStatus(self.status_code()).phrase

    def header(self, case_insensitive_name: str) -> Optional[str]:
        # requests' header mapping is already case-insensitive; lowercasing
        # keeps the lookup's intent explicit regardless.
        return self.__requests_response.headers.get(case_insensitive_name.lower(), None)

    def raw_data(self) -> bytes:
        # NOTE(review): this truncates the body only after download -- accessing
        # ``.content`` on a streamed response reads the full payload first, so
        # the limit bounds memory retained, not bytes transferred; confirm
        # whether that is the intended contract.
        limit = self.__max_response_data_length
        content = self.__requests_response.content
        return content[:limit] if limit else content
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class RequestsWebClientErrorResponse(WebClientErrorResponse):
    """
    Error response from the Requests parser.
    """
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class RequestsWebClient(AbstractWebClient):
    """requests-based web client to be used by the sitemap fetcher."""

    # User-Agent header sent with every request.
    __USER_AGENT = f"ultimate_sitemap_parser/{__version__}"

    __HTTP_REQUEST_TIMEOUT = 60
    """
    HTTP request timeout.

    Some webservers might be generating huge sitemaps on the fly, so this is why it's rather big.
    """

    __slots__ = ["__max_response_data_length", "__timeout", "__proxies", "__verify"]

    def __init__(self, verify: bool = True):
        """
        :param verify: whether certificates should be verified for HTTPS requests.
        """
        self.__max_response_data_length = None
        self.__timeout = self.__HTTP_REQUEST_TIMEOUT
        self.__proxies = {}
        self.__verify = verify

    def set_timeout(self, timeout: Union[int, Tuple[int, int], None]) -> None:
        """Set HTTP request timeout.

        See also: `Requests timeout docs <https://requests.readthedocs.io/en/latest/user/advanced/#timeouts>`__

        :param timeout: An integer to use as both the connect and read timeouts,
            or a tuple to specify them individually, or None for no timeout
        """
        # Used mostly for testing
        self.__timeout = timeout

    def set_proxies(self, proxies: Dict[str, str]) -> None:
        """
        Set a proxy for the request.

        :param proxies: Proxy definition where the keys are schemes ("http" or "https") and values are the proxy address.
            Example: ``{'http': 'http://user:pass@10.10.1.10:3128/'}, or an empty dict to disable proxy.``
        """
        # Used mostly for testing
        self.__proxies = proxies

    def set_max_response_data_length(
        self, max_response_data_length: Optional[int]
    ) -> None:
        """
        Set the maximum number of bytes that the web client will fetch.

        :param max_response_data_length: Maximum number of bytes, or None to fetch all.
        """
        # Annotated as Optional[int] to match the AbstractWebClient interface
        # (and the None default assigned in __init__).
        self.__max_response_data_length = max_response_data_length

    def get(self, url: str) -> AbstractWebClientResponse:
        """
        Fetch a URL and return a response.

        Connection errors (including timeouts) are reported via the returned
        error response object rather than raised.

        :param url: URL to fetch.
        :return: Response object.
        """
        try:
            response = requests.get(
                url,
                timeout=self.__timeout,
                stream=True,
                headers={"User-Agent": self.__USER_AGENT},
                proxies=self.__proxies,
                verify=self.__verify,
            )
        except requests.exceptions.Timeout as ex:
            # Retryable timeouts
            return RequestsWebClientErrorResponse(message=str(ex), retryable=True)

        except requests.exceptions.RequestException as ex:
            # Other errors, e.g. redirect loops
            return RequestsWebClientErrorResponse(message=str(ex), retryable=False)

        else:
            if 200 <= response.status_code < 300:
                return RequestsWebClientSuccessResponse(
                    requests_response=response,
                    max_response_data_length=self.__max_response_data_length,
                )
            else:
                message = f"{response.status_code} {response.reason}"

                # Status codes known to be transient are flagged retryable so
                # the fetcher can try again.
                if response.status_code in RETRYABLE_HTTP_STATUS_CODES:
                    return RequestsWebClientErrorResponse(
                        message=message, retryable=True
                    )
                else:
                    return RequestsWebClientErrorResponse(
                        message=message, retryable=False
                    )
|