ultimate-sitemap-parser 1.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ultimate-sitemap-parser might be problematic. Click here for more details.

@@ -0,0 +1,189 @@
1
+ """Abstract web client class."""
2
+
3
+ import abc
4
+ from http import HTTPStatus
5
+ from typing import Optional
6
+
7
RETRYABLE_HTTP_STATUS_CODES = {
    # ---- Standard status codes ----
    # 400 Bad Request: some servers answer this at first, then start working on a retry
    HTTPStatus.BAD_REQUEST.value,
    # 408 Request Timeout: the request timed out, so it can simply be tried again
    HTTPStatus.REQUEST_TIMEOUT.value,
    # 429 Too Many Requests: rate limited, so waiting a bit and retrying makes sense
    HTTPStatus.TOO_MANY_REQUESTS.value,
    # 500 Internal Server Error: the server might be fine on a subsequent attempt
    HTTPStatus.INTERNAL_SERVER_ERROR.value,
    # 502 Bad Gateway: upstream might reappear on a retry
    HTTPStatus.BAD_GATEWAY.value,
    # 503 Service Unavailable: service might become available again on a retry
    HTTPStatus.SERVICE_UNAVAILABLE.value,
    # 504 Gateway Timeout: upstream might reappear on a retry
    HTTPStatus.GATEWAY_TIMEOUT.value,
    # ---- Unofficial status codes ----
    499,  # nginx: Client Closed Request
    509,  # Apache Web Server/cPanel: Bandwidth Limit Exceeded
    598,  # Network read timeout error
    # ---- Unofficial Cloudflare status codes ----
    520,  # Unknown Error
    521,  # Web Server Is Down
    522,  # Connection Timed Out
    523,  # Origin Is Unreachable
    524,  # A Timeout Occurred
    525,  # SSL Handshake Failed
    526,  # Invalid SSL Certificate
    527,  # Railgun Error
    530,  # Origin DNS Error
}
"""HTTP status codes on which a request should be retried."""
48
+
49
+
50
class AbstractWebClientResponse(metaclass=abc.ABCMeta):
    """
    Abstract response.

    Marker base class: it declares no members of its own and only gives
    response types a common ancestor.
    """
56
+
57
+
58
class AbstractWebClientSuccessResponse(AbstractWebClientResponse, metaclass=abc.ABCMeta):
    """
    Successful response.

    Declares the accessors a concrete successful response has to provide:
    status code, status message, header lookup and the raw body.
    """

    @abc.abstractmethod
    def status_code(self) -> int:
        """
        Get the HTTP status code of this response.

        :return: HTTP status code of the response, e.g. 200.
        """
        raise NotImplementedError("Abstract method.")

    @abc.abstractmethod
    def status_message(self) -> str:
        """
        Get the HTTP status message of this response.

        :return: HTTP status message of the response, e.g. "OK".
        """
        raise NotImplementedError("Abstract method.")

    @abc.abstractmethod
    def header(self, case_insensitive_name: str) -> Optional[str]:
        """
        Look up an HTTP header by its case-insensitive name.

        :param case_insensitive_name: HTTP header's name, e.g. "Content-Type".
        :return: HTTP header's value, or None if such a header wasn't set.
        """
        raise NotImplementedError("Abstract method.")

    @abc.abstractmethod
    def raw_data(self) -> bytes:
        """
        Get the encoded raw data of this response.

        :return: Encoded raw data of the response.
        """
        raise NotImplementedError("Abstract method.")
101
+
102
+
103
class WebClientErrorResponse(AbstractWebClientResponse, metaclass=abc.ABCMeta):
    """
    Error response.

    Carries a description of the failure together with a flag telling
    whether the request should be attempted again.
    """

    __slots__ = ["_message", "_retryable"]

    def __init__(self, message: str, retryable: bool):
        """
        Constructor.

        :param message: Message describing what went wrong.
        :param retryable: True if the request should be retried.
        """
        super().__init__()
        self._message, self._retryable = message, retryable

    def message(self) -> str:
        """
        Get the message describing what went wrong.

        :return: Message describing what went wrong.
        """
        return self._message

    def retryable(self) -> bool:
        """
        Tell whether the request should be retried.

        :return: True if request should be retried.
        """
        return self._retryable
139
+
140
+
141
class AbstractWebClient(metaclass=abc.ABCMeta):
    """
    Abstract web client to be used by the sitemap fetcher.

    Concrete clients have to implement both methods declared here.
    """

    @abc.abstractmethod
    def set_max_response_data_length(self, max_response_data_length: Optional[int]) -> None:
        """
        Configure the maximum number of bytes that the web client will fetch.

        :param max_response_data_length: Maximum number of bytes that the web client will fetch, or None to fetch all.
        """
        raise NotImplementedError("Abstract method.")

    @abc.abstractmethod
    def get(self, url: str) -> AbstractWebClientResponse:
        """
        Fetch a URL and return a response.

        Implementations shouldn't throw exceptions on connection errors (including timeouts); such errors
        should be reported via the returned Response object instead.

        :param url: URL to fetch.
        :return: Response object.
        """
        raise NotImplementedError("Abstract method.")
169
+
170
+
171
class NoWebClientException(Exception):
    """Raised to signal that this web client is unable to fetch pages."""
175
+
176
+
177
class LocalWebClient(AbstractWebClient):
    """Dummy web client which is a valid implementation but errors if called.

    Used for local parsing
    """

    def set_max_response_data_length(
        self, max_response_data_length: Optional[int]
    ) -> None:
        """
        Accept the limit and ignore it.

        :param max_response_data_length: Ignored; this client never fetches anything.
        """
        # Deliberate no-op: get() never returns data, so there is nothing to limit.
        pass

    def get(self, url: str) -> AbstractWebClientResponse:
        """
        Refuse to fetch the URL.

        :param url: URL that was requested; only used in the error message.
        :raises NoWebClientException: Always.
        """
        # Include the URL so the failure shows which fetch was attempted.
        raise NoWebClientException(
            f"LocalWebClient cannot fetch pages (requested URL: {url!r})."
        )
@@ -0,0 +1,150 @@
1
+ """Implementation of :mod:`usp.web_client.abstract_client` with Requests."""
2
+
3
+ from http import HTTPStatus
4
+ from typing import Optional, Dict, Tuple, Union
5
+
6
+ import requests
7
+
8
+ from .abstract_client import (
9
+ AbstractWebClient,
10
+ AbstractWebClientResponse,
11
+ AbstractWebClientSuccessResponse,
12
+ WebClientErrorResponse,
13
+ RETRYABLE_HTTP_STATUS_CODES,
14
+ )
15
+ from usp import __version__
16
+
17
+
18
class RequestsWebClientSuccessResponse(AbstractWebClientSuccessResponse):
    """
    requests-based successful response.

    Wraps a ``requests.Response`` and optionally truncates the body it hands out.
    """

    __slots__ = [
        "__requests_response",
        "__max_response_data_length",
    ]

    def __init__(
        self,
        requests_response: requests.Response,
        max_response_data_length: Optional[int] = None,
    ):
        """
        :param requests_response: Response data
        :param max_response_data_length: Maximum data length, or ``None`` to not restrict.
        """
        self.__requests_response = requests_response
        self.__max_response_data_length = max_response_data_length

    def status_code(self) -> int:
        """
        Return HTTP status code of the response.

        :return: HTTP status code of the response, e.g. 200.
        """
        return int(self.__requests_response.status_code)

    def status_message(self) -> str:
        """
        Return HTTP status message of the response.

        Uses the reason phrase sent by the server; if there is none, falls back to the standard phrase for the
        status code, and to ``"Unknown"`` if the status code is not a standard one.

        :return: HTTP status message of the response, e.g. "OK".
        """
        message = self.__requests_response.reason
        if not message:
            try:
                message = HTTPStatus(self.status_code()).phrase
            except ValueError:
                # HTTPStatus() rejects codes it does not know (e.g. a non-standard 2xx);
                # a missing reason phrase must not turn into an exception here.
                message = "Unknown"
        return message

    def header(self, case_insensitive_name: str) -> Optional[str]:
        """
        Return HTTP header value for a given case-insensitive name, or None if such header wasn't set.

        :param case_insensitive_name: HTTP header's name, e.g. "Content-Type".
        :return: HTTP header's value, or None if it was unset.
        """
        return self.__requests_response.headers.get(case_insensitive_name.lower(), None)

    def raw_data(self) -> bytes:
        """
        Return encoded raw data of the response.

        The limit is applied by slicing the response's ``content``, i.e. it truncates what is returned.

        :return: Response body, cut to the maximum data length if one was set.
        """
        content = self.__requests_response.content
        limit = self.__max_response_data_length

        # Compare against None explicitly: a limit of 0 is a real limit, not "unrestricted".
        if limit is None:
            return content

        return content[:limit]
59
+
60
+
61
class RequestsWebClientErrorResponse(WebClientErrorResponse):
    """
    Error response from the Requests parser.

    Adds nothing to its base class; it only provides a distinct type.
    """
67
+
68
+
69
class RequestsWebClient(AbstractWebClient):
    """requests-based web client to be used by the sitemap fetcher."""

    __USER_AGENT = f"ultimate_sitemap_parser/{__version__}"

    __HTTP_REQUEST_TIMEOUT = 60
    """
    HTTP request timeout.

    Some webservers might be generating huge sitemaps on the fly, so this is why it's rather big.
    """

    __slots__ = ["__max_response_data_length", "__timeout", "__proxies", "__verify"]

    def __init__(self, verify: bool = True):
        """
        :param verify: whether certificates should be verified for HTTPS requests.
        """
        self.__max_response_data_length = None
        self.__timeout = self.__HTTP_REQUEST_TIMEOUT
        self.__proxies = {}
        self.__verify = verify

    def set_timeout(self, timeout: Union[int, Tuple[int, int], None]) -> None:
        """Set HTTP request timeout.

        See also: `Requests timeout docs <https://requests.readthedocs.io/en/latest/user/advanced/#timeouts>`__

        :param timeout: An integer to use as both the connect and read timeouts,
            or a tuple to specify them individually, or None for no timeout
        """
        # Used mostly for testing
        self.__timeout = timeout

    def set_proxies(self, proxies: Dict[str, str]) -> None:
        """
        Set a proxy for the request.

        :param proxies: Proxy definition where the keys are schemes ("http" or "https") and values are the proxy address.
            Example: ``{'http': 'http://user:pass@10.10.1.10:3128/'}``, or an empty dict to disable proxy.
        """
        # Used mostly for testing
        self.__proxies = proxies

    def set_max_response_data_length(
        self, max_response_data_length: Optional[int]
    ) -> None:
        """
        Set the maximum length of the data handed out by successful responses.

        :param max_response_data_length: Maximum data length, or None to not restrict.
        """
        self.__max_response_data_length = max_response_data_length

    def get(self, url: str) -> AbstractWebClientResponse:
        """
        Fetch a URL and return a response.

        Exceptions raised by ``requests.get()`` are converted into error responses; timeouts are reported as
        retryable, other request errors as not retryable. Non-2xx statuses also become error responses, retryable
        if the status code is in ``RETRYABLE_HTTP_STATUS_CODES``.

        :param url: URL to fetch.
        :return: Success response for 2xx statuses, error response otherwise.
        """
        try:
            response = requests.get(
                url,
                timeout=self.__timeout,
                stream=True,
                headers={"User-Agent": self.__USER_AGENT},
                proxies=self.__proxies,
                verify=self.__verify,
            )
        except requests.exceptions.Timeout as ex:
            # Retryable timeouts
            return RequestsWebClientErrorResponse(message=str(ex), retryable=True)
        except requests.exceptions.RequestException as ex:
            # Other errors, e.g. redirect loops
            return RequestsWebClientErrorResponse(message=str(ex), retryable=False)

        if 200 <= response.status_code < 300:
            # The body is read later, through the success response's raw_data().
            return RequestsWebClientSuccessResponse(
                requests_response=response,
                max_response_data_length=self.__max_response_data_length,
            )

        message = f"{response.status_code} {response.reason}"
        retryable = response.status_code in RETRYABLE_HTTP_STATUS_CODES

        # The request was made with stream=True and the body of an error response is
        # never read, so close the response explicitly to release its connection.
        response.close()

        return RequestsWebClientErrorResponse(message=message, retryable=retryable)