careful 0.2.1__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- careful/httpx/__init__.py +129 -41
- careful/httpx/dev_cache.py +58 -36
- careful/httpx/retries.py +23 -9
- careful/httpx/robots.py +72 -0
- careful/httpx/throttle.py +20 -9
- {careful-0.2.1.dist-info → careful-0.3.2.dist-info}/METADATA +25 -17
- careful-0.3.2.dist-info/RECORD +12 -0
- {careful-0.2.1.dist-info → careful-0.3.2.dist-info}/WHEEL +1 -1
- careful-0.2.1.dist-info/RECORD +0 -11
- {careful-0.2.1.dist-info → careful-0.3.2.dist-info}/licenses/LICENSE +0 -0
careful/httpx/__init__.py
CHANGED
|
@@ -1,17 +1,30 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import logging
|
|
3
|
+
import pathlib
|
|
4
|
+
from urllib.parse import urlparse
|
|
1
5
|
from .retries import make_retry_client, retry_default_rule
|
|
2
6
|
from .throttle import make_throttled_client
|
|
7
|
+
from .robots import (
|
|
8
|
+
make_robots_txt_client,
|
|
9
|
+
RobotExclusionError,
|
|
10
|
+
RobotsRejectFunc,
|
|
11
|
+
raise_robots_txt,
|
|
12
|
+
)
|
|
3
13
|
from .dev_cache import (
|
|
4
14
|
make_dev_caching_client,
|
|
5
15
|
MemoryCache,
|
|
6
16
|
FileCache,
|
|
7
17
|
SqliteCache,
|
|
8
|
-
|
|
18
|
+
CacheStorage,
|
|
19
|
+
CacheResponse,
|
|
9
20
|
_cache_200s,
|
|
10
21
|
_default_keyfunc,
|
|
11
22
|
)
|
|
12
23
|
from ._types import ResponsePredicate, CacheKeyfunc
|
|
13
24
|
from httpx import Client
|
|
14
25
|
|
|
26
|
+
log = logging.getLogger("careful")
|
|
27
|
+
|
|
15
28
|
|
|
16
29
|
def make_careful_client(
|
|
17
30
|
*,
|
|
@@ -20,11 +33,14 @@ def make_careful_client(
|
|
|
20
33
|
retry_wait_seconds: float = 10,
|
|
21
34
|
should_retry: ResponsePredicate = retry_default_rule,
|
|
22
35
|
requests_per_minute: int = 0,
|
|
23
|
-
cache_storage:
|
|
36
|
+
cache_storage: CacheStorage | None = None,
|
|
24
37
|
cache_write_only: bool = False,
|
|
25
38
|
should_cache: ResponsePredicate = _cache_200s,
|
|
26
39
|
cache_keyfunc: CacheKeyfunc = _default_keyfunc,
|
|
27
|
-
|
|
40
|
+
check_robots_txt: bool = False,
|
|
41
|
+
robots_txt_user_agent: str | None = None,
|
|
42
|
+
robots_txt_on_reject: RobotsRejectFunc = raise_robots_txt,
|
|
43
|
+
) -> Client:
|
|
28
44
|
"""
|
|
29
45
|
This function patches an `httpx.Client` so that all requests made with the client support
|
|
30
46
|
[retries](#retries), [throttling](#throttling), and [development caching](#development-caching).
|
|
@@ -51,43 +67,6 @@ def make_careful_client(
|
|
|
51
67
|
|
|
52
68
|
cache_keyfunc: Function that takes request details and returns a unique cache key.
|
|
53
69
|
|
|
54
|
-
## Retries
|
|
55
|
-
|
|
56
|
-
If `retry_attempts` is set, responses will be passed to `should_retry`.
|
|
57
|
-
Responses that are rejected (return `True`) will be retried after a wait based on
|
|
58
|
-
`retry_wait_seconds`.
|
|
59
|
-
Each retry will wait twice as long as the one before.
|
|
60
|
-
|
|
61
|
-
## Throttling
|
|
62
|
-
|
|
63
|
-
If `requests_per_minute` is set, standard (non-retry) requests will automatically
|
|
64
|
-
sleep for a short period to target the given rate.
|
|
65
|
-
|
|
66
|
-
For example, at 30rpm, the sleep time on a fast request will be close to 2 seconds.
|
|
67
|
-
|
|
68
|
-
## Development Caching
|
|
69
|
-
|
|
70
|
-
Why **development caching?**
|
|
71
|
-
|
|
72
|
-
This feature is named as a reminder that **this is not true HTTP caching**, which
|
|
73
|
-
should take various headers into account. Look at libraries like [hishel](https://hishel.com) if that's what you are after.
|
|
74
|
-
|
|
75
|
-
The purpose of this feature is to allow you to cache all of your HTTP requests during development.
|
|
76
|
-
Often when writing a scraper or crawler, you wind up hitting the site you are working on more often than you'd like-- each time you iterate on your code you're likely making redundant requests to pages that haven't changed.
|
|
77
|
-
|
|
78
|
-
By caching all successful requests (configurable with the `should_cache` parameter),
|
|
79
|
-
you can easily re-run scrapers without making redundant HTTP requests.
|
|
80
|
-
This means faster development time & happier upstream servers.
|
|
81
|
-
|
|
82
|
-
To enable development caching, assign a [`MemoryCache`][careful.httpx.MemoryCache],
|
|
83
|
-
[`FileCache`][careful.httpx.FileCache], or [`SqliteCache`][careful.httpx.SqliteCache] to
|
|
84
|
-
the `cache_storage` property of a `scrapelib.Scraper`.
|
|
85
|
-
|
|
86
|
-
---
|
|
87
|
-
|
|
88
|
-
When multiple features are applied, the order of wrapping ensures that:
|
|
89
|
-
- the cache is checked first, and bypasses throttling if hit
|
|
90
|
-
- retries use their own delays, but not throttled separately
|
|
91
70
|
"""
|
|
92
71
|
if client is None:
|
|
93
72
|
client = Client()
|
|
@@ -104,7 +83,7 @@ def make_careful_client(
|
|
|
104
83
|
client = make_throttled_client(
|
|
105
84
|
client=client, requests_per_minute=requests_per_minute
|
|
106
85
|
)
|
|
107
|
-
# caching on top layer, so cache will be checked
|
|
86
|
+
# caching on top layer, so cache will be checked before throttling/etc.
|
|
108
87
|
if cache_storage:
|
|
109
88
|
client = make_dev_caching_client(
|
|
110
89
|
client=client,
|
|
@@ -113,15 +92,124 @@ def make_careful_client(
|
|
|
113
92
|
should_cache=should_cache,
|
|
114
93
|
write_only=cache_write_only,
|
|
115
94
|
)
|
|
95
|
+
# robots.txt before cache
|
|
96
|
+
if check_robots_txt:
|
|
97
|
+
client = make_robots_txt_client(
|
|
98
|
+
client=client,
|
|
99
|
+
as_user_agent=robots_txt_user_agent,
|
|
100
|
+
on_rejection=robots_txt_on_reject,
|
|
101
|
+
)
|
|
116
102
|
|
|
117
103
|
return client
|
|
118
104
|
|
|
119
105
|
|
|
106
|
+
def _int_env(var_name: str, default: int) -> int:
    """
    Read an integer setting from the environment.

    Parameters:
        var_name: Name of the environment variable to read.
        default: Value used (after `int()` conversion) when the variable is unset.

    Returns:
        The integer value of the variable, or `int(default)` if it is unset.

    Raises:
        ValueError: If the variable is set but is not a valid integer. The
            message names the variable so misconfiguration is easy to find.
    """
    raw = os.environ.get(var_name)
    if raw is None:
        return int(default)
    try:
        return int(raw)
    except ValueError:
        # re-raise with the variable name; the bare int() message omits it
        raise ValueError(
            f"environment variable {var_name} must be an integer, got {raw!r}"
        ) from None
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _float_env(var_name: str, default: float) -> float:
    """
    Read a floating point setting from the environment.

    Parameters:
        var_name: Name of the environment variable to read.
        default: Value used (after `float()` conversion) when the variable is unset.

    Returns:
        The float value of the variable, or `float(default)` if it is unset.

    Raises:
        ValueError: If the variable is set but is not a valid number. The
            message names the variable so misconfiguration is easy to find.
    """
    raw = os.environ.get(var_name)
    if raw is None:
        return float(default)
    try:
        return float(raw)
    except ValueError:
        # re-raise with the variable name; the bare float() message omits it
        raise ValueError(
            f"environment variable {var_name} must be a number, got {raw!r}"
        ) from None
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _bool_env(var_name: str, default: bool) -> bool:
    """
    Read a boolean flag from the environment.

    An unset variable yields the truthiness of `default`. A variable that is
    set is true when its value is non-empty, so "0" and "false" count as true
    and only the empty string counts as false.
    """
    raw = os.environ.get(var_name)
    if raw is None:
        return bool(default)
    return raw != ""
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _cache_env(var_name: str, default: CacheStorage | None) -> CacheStorage | None:
    """
    Build a cache backend from a URL-style environment variable.

    Recognized values:

        memory://                  -> MemoryCache()
        file:///relative/dir       -> FileCache(<cwd>/relative/dir)
        file:////absolute/dir      -> FileCache(/absolute/dir)
        sqlite:///relative.db      -> SqliteCache(<cwd>/relative.db)
        sqlite:////absolute.db     -> SqliteCache(/absolute.db)

    The two-slash form (`file://relative/dir`) is accepted as well and is
    treated as a path relative to the current working directory.

    Parameters:
        var_name: Name of the environment variable to read.
        default: Returned when the variable is unset, empty, or uses an
            unrecognized scheme (a warning is logged in the last case).
    """
    cache_str = os.environ.get(var_name)
    if not cache_str:
        return default

    parsed = urlparse(cache_str)

    if parsed.scheme == "memory":
        log.info("cache from env %s => MemoryCache", var_name)
        return MemoryCache()
    if parsed.scheme not in ("file", "sqlite"):
        log.warning("invalid cache %s=%s", var_name, cache_str)
        return default

    if parsed.netloc:
        # scheme://first/rest: urlparse puts "first" into netloc, so it has to
        # be re-joined with the path or the leading segment is silently lost
        true_path = parsed.netloc + parsed.path
    elif parsed.path.startswith("/"):
        # scheme:///rest: drop the slash that only terminates the empty netloc
        true_path = parsed.path[1:]
    else:
        true_path = parsed.path

    # a remaining leading slash (scheme:////abs) marks an absolute path
    if true_path.startswith("/"):
        path = pathlib.Path(true_path)
    else:
        path = pathlib.Path.cwd() / true_path

    if parsed.scheme == "file":
        log.info("cache from env %s => FileCache(%s)", var_name, path)
        return FileCache(path)
    log.info("cache from env %s => SqliteCache(%s)", var_name, path)
    return SqliteCache(path)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def make_careful_client_from_env(
    *,
    client: Client | None = None,
    retry_attempts: int = 0,
    retry_wait_seconds: float = 10,
    should_retry: ResponsePredicate = retry_default_rule,
    requests_per_minute: int = 0,
    cache_storage: CacheStorage | None = None,
    cache_write_only: bool = False,
    should_cache: ResponsePredicate = _cache_200s,
    cache_keyfunc: CacheKeyfunc = _default_keyfunc,
    check_robots_txt: bool = False,
    robots_txt_user_agent: str | None = None,
    robots_txt_on_reject: RobotsRejectFunc = raise_robots_txt,
) -> Client:
    """
    Make a careful client, letting environment variables override arguments.

    Each environment variable below, when set, takes precedence over the
    parameter of the corresponding name. Parameters act as the fallback.

    Numeric:
        - CAREFUL_RETRY_ATTEMPTS
        - CAREFUL_RETRY_WAIT_SECONDS
        - CAREFUL_REQUESTS_PER_MINUTE

    Booleans (any non-empty value is true):
        - CAREFUL_CACHE_WRITE_ONLY
        - CAREFUL_CHECK_ROBOTS_TXT

    String:
        - CAREFUL_ROBOTS_TXT_USER_AGENT

    Cache:
        - CAREFUL_CACHE, which can be:
            memory://
            sqlite:///path/to/db.sqlite3
            file:///path/to/directory

          Paths written with three slashes are relative to the current working
          directory; use four slashes (`sqlite:////var/cache/db.sqlite3`) for
          an absolute path.

    Function parameters (`should_retry`, `should_cache`, `cache_keyfunc`,
    `robots_txt_on_reject`) do not have environment variables.
    """
    return make_careful_client(
        client=client,
        retry_attempts=_int_env("CAREFUL_RETRY_ATTEMPTS", retry_attempts),
        retry_wait_seconds=_float_env("CAREFUL_RETRY_WAIT_SECONDS", retry_wait_seconds),
        should_retry=should_retry,
        requests_per_minute=_int_env(
            "CAREFUL_REQUESTS_PER_MINUTE", requests_per_minute
        ),
        cache_storage=_cache_env("CAREFUL_CACHE", cache_storage),
        cache_write_only=_bool_env("CAREFUL_CACHE_WRITE_ONLY", cache_write_only),
        should_cache=should_cache,
        cache_keyfunc=cache_keyfunc,
        check_robots_txt=_bool_env("CAREFUL_CHECK_ROBOTS_TXT", check_robots_txt),
        robots_txt_user_agent=os.environ.get(
            "CAREFUL_ROBOTS_TXT_USER_AGENT", robots_txt_user_agent
        ),
        robots_txt_on_reject=robots_txt_on_reject,
    )
|
|
202
|
+
|
|
203
|
+
|
|
120
204
|
# Public API of careful.httpx.
__all__ = [
    "make_careful_client",
    # env-driven variant of make_careful_client; public, so exported too
    "make_careful_client_from_env",
    "make_retry_client",
    "make_throttled_client",
    "make_dev_caching_client",
    "make_robots_txt_client",
    "MemoryCache",
    "FileCache",
    "SqliteCache",
    "CacheResponse",
    "RobotExclusionError",
]
|
careful/httpx/dev_cache.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import abc
|
|
2
1
|
import types
|
|
3
2
|
import functools
|
|
3
|
+
import pathlib
|
|
4
4
|
import logging
|
|
5
5
|
import re
|
|
6
6
|
import os
|
|
@@ -8,10 +8,27 @@ import glob
|
|
|
8
8
|
import hashlib
|
|
9
9
|
import sqlite3
|
|
10
10
|
import json
|
|
11
|
+
from typing import cast, Protocol, Callable
|
|
12
|
+
from ._types import ResponsePredicate, CacheKeyfunc
|
|
11
13
|
|
|
12
14
|
from httpx import Client, Response, Request
|
|
13
15
|
|
|
14
|
-
log = logging.getLogger("
|
|
16
|
+
log = logging.getLogger("careful")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class CacheStorage(Protocol):
    """Structural interface that every cache backend provides."""

    def get(self, key: str) -> None | Response:
        """Look up the response stored under `key`; a miss is reported as None."""
        ...

    def set(self, key: str, response: Response) -> None:
        """Store `response` under `key`."""
        ...
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DevCacheClient(Protocol):
    """
    Typing-only view of a client that has been patched for development caching.

    The private attributes below are the ones the caching wrapper attaches to
    and reads from the client.
    """

    # NOTE(review): not referenced by the caching code visible here; looks
    # carried over from the retry client protocol -- confirm before relying on it
    _retry_attempts: int
    # backend that cached responses are read from / written to
    _cache_storage: CacheStorage
    # write-only flag as passed to make_dev_caching_client
    _write_only: bool
    # predicate deciding whether a response gets stored
    _should_cache: ResponsePredicate
    # builds the cache key from the request details
    _cache_keyfunc: CacheKeyfunc
    # the client's request method as it was before the caching wrapper
    _no_cache_request: Callable
    # the (wrapped) request method
    request: Callable
|
|
15
32
|
|
|
16
33
|
|
|
17
34
|
def _default_keyfunc(
|
|
@@ -28,7 +45,7 @@ def _default_keyfunc(
|
|
|
28
45
|
if method.lower() != "get":
|
|
29
46
|
return None
|
|
30
47
|
|
|
31
|
-
return Request(url=url, method=method, params=params).url
|
|
48
|
+
return str(Request(url=url, method=method, params=params).url)
|
|
32
49
|
|
|
33
50
|
|
|
34
51
|
def _cache_200s(response: Response) -> bool:
|
|
@@ -40,7 +57,11 @@ def _cache_200s(response: Response) -> bool:
|
|
|
40
57
|
return response.status_code == 200
|
|
41
58
|
|
|
42
59
|
|
|
43
|
-
|
|
60
|
+
class CacheResponse(Response):
    """A `Response` that also records whether it was served from the cache."""

    # True when the response came from cache storage, False when it was fetched
    fromcache: bool
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _cached_request(client: DevCacheClient, *args, **kwargs) -> CacheResponse:
|
|
44
65
|
method, url = args
|
|
45
66
|
request_key = client._cache_keyfunc(method, url, kwargs["params"])
|
|
46
67
|
|
|
@@ -50,30 +71,28 @@ def _cached_request(client: Client, *args, **kwargs):
|
|
|
50
71
|
cached_resp = client._cache_storage.get(request_key)
|
|
51
72
|
|
|
52
73
|
if cached_resp:
|
|
53
|
-
# resp = cast(CacheResponse, resp_maybe)
|
|
54
74
|
log.info("using cached response request_key=%s", request_key)
|
|
55
|
-
|
|
56
|
-
|
|
75
|
+
new_resp = cast(CacheResponse, cached_resp)
|
|
76
|
+
new_resp.fromcache = True
|
|
57
77
|
else:
|
|
58
|
-
|
|
78
|
+
new_resp = cast(CacheResponse, client._no_cache_request(*args, **kwargs))
|
|
79
|
+
new_resp.fromcache = False
|
|
59
80
|
# save to cache if request and response meet criteria
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
client._cache_storage.set(request_key, resp)
|
|
81
|
+
if request_key and client._should_cache(new_resp):
|
|
82
|
+
client._cache_storage.set(request_key, new_resp)
|
|
63
83
|
log.info("caching response request_key=%s", request_key)
|
|
64
|
-
resp.fromcache = False
|
|
65
84
|
|
|
66
|
-
return
|
|
85
|
+
return new_resp
|
|
67
86
|
|
|
68
87
|
|
|
69
88
|
def make_dev_caching_client(
|
|
70
89
|
*,
|
|
90
|
+
cache_storage: CacheStorage,
|
|
71
91
|
client: Client | None = None,
|
|
72
|
-
cache_storage=None,
|
|
73
92
|
cache_keyfunc=_default_keyfunc,
|
|
74
93
|
should_cache=_cache_200s,
|
|
75
94
|
write_only=False,
|
|
76
|
-
):
|
|
95
|
+
) -> Client:
|
|
77
96
|
"""
|
|
78
97
|
Returns an enhanced `httpx.Client` where requests are saved to a
|
|
79
98
|
specified cache.
|
|
@@ -98,29 +117,21 @@ def make_dev_caching_client(
|
|
|
98
117
|
if client is None:
|
|
99
118
|
client = Client()
|
|
100
119
|
|
|
101
|
-
|
|
102
|
-
client._cache_keyfunc = cache_keyfunc
|
|
103
|
-
client._should_cache = should_cache
|
|
104
|
-
client._write_only = write_only
|
|
120
|
+
tclient = cast(DevCacheClient, client)
|
|
105
121
|
|
|
106
|
-
|
|
107
|
-
|
|
122
|
+
tclient._cache_storage = cache_storage
|
|
123
|
+
tclient._cache_keyfunc = cache_keyfunc
|
|
124
|
+
tclient._should_cache = should_cache
|
|
125
|
+
tclient._write_only = write_only
|
|
126
|
+
|
|
127
|
+
tclient._no_cache_request = client.request
|
|
128
|
+
tclient.request = types.MethodType(
|
|
108
129
|
functools.wraps(client.request)(_cached_request), client
|
|
109
130
|
)
|
|
110
131
|
return client
|
|
111
132
|
|
|
112
133
|
|
|
113
|
-
class
|
|
114
|
-
@abc.abstractmethod
|
|
115
|
-
def get(self, key: str) -> None | Response:
|
|
116
|
-
raise NotImplementedError()
|
|
117
|
-
|
|
118
|
-
@abc.abstractmethod
|
|
119
|
-
def set(self, key: str, response: Response) -> None:
|
|
120
|
-
raise NotImplementedError()
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
class MemoryCache(CacheStorageBase):
|
|
134
|
+
class MemoryCache(CacheStorage):
|
|
124
135
|
"""
|
|
125
136
|
In memory cache for request responses.
|
|
126
137
|
|
|
@@ -144,7 +155,7 @@ class MemoryCache(CacheStorageBase):
|
|
|
144
155
|
self.cache[key] = response
|
|
145
156
|
|
|
146
157
|
|
|
147
|
-
class FileCache(
|
|
158
|
+
class FileCache(CacheStorage):
|
|
148
159
|
"""
|
|
149
160
|
File-based cache for request responses.
|
|
150
161
|
|
|
@@ -228,6 +239,7 @@ class FileCache(CacheStorageBase):
|
|
|
228
239
|
content=resp_content,
|
|
229
240
|
default_encoding=resp_headers.pop("encoding"),
|
|
230
241
|
headers=resp_headers,
|
|
242
|
+
request=Request("GET", key), # not perfect, but it'll do
|
|
231
243
|
)
|
|
232
244
|
return resp
|
|
233
245
|
except IOError:
|
|
@@ -244,6 +256,8 @@ class FileCache(CacheStorageBase):
|
|
|
244
256
|
encoding_str = "encoding: {0}\n".format(response.encoding)
|
|
245
257
|
f.write(encoding_str.encode("utf8"))
|
|
246
258
|
for h, v in response.headers.items():
|
|
259
|
+
if h.lower() in ("content-encoding", "content-length"):
|
|
260
|
+
continue
|
|
247
261
|
# header: value\n
|
|
248
262
|
f.write(h.encode("utf8"))
|
|
249
263
|
f.write(b": ")
|
|
@@ -260,7 +274,7 @@ class FileCache(CacheStorageBase):
|
|
|
260
274
|
os.remove(fname)
|
|
261
275
|
|
|
262
276
|
|
|
263
|
-
class SqliteCache(
|
|
277
|
+
class SqliteCache(CacheStorage):
|
|
264
278
|
"""
|
|
265
279
|
sqlite cache for request responses.
|
|
266
280
|
|
|
@@ -276,12 +290,17 @@ class SqliteCache(CacheStorageBase):
|
|
|
276
290
|
|
|
277
291
|
_columns = ["key", "status", "modified", "encoding", "data", "headers"]
|
|
278
292
|
|
|
279
|
-
def __init__(
|
|
293
|
+
def __init__(
|
|
294
|
+
self, cache_path: str | pathlib.Path, check_last_modified: bool = False
|
|
295
|
+
):
|
|
280
296
|
self.cache_path = cache_path
|
|
281
297
|
self.check_last_modified = check_last_modified
|
|
282
|
-
self._conn = sqlite3.connect(cache_path)
|
|
298
|
+
self._conn = sqlite3.connect(str(cache_path))
|
|
283
299
|
self._conn.text_factory = str
|
|
284
300
|
self._build_table()
|
|
301
|
+
# self._conn.execute("PRAGMA journal_mode=WAL;")
|
|
302
|
+
# self._conn.execute("PRAGMA synchronous=1;")
|
|
303
|
+
# self._conn.isolation_level = None
|
|
285
304
|
|
|
286
305
|
def _build_table(self) -> None:
|
|
287
306
|
"""Create table for storing request information and response."""
|
|
@@ -294,6 +313,8 @@ class SqliteCache(CacheStorageBase):
|
|
|
294
313
|
def set(self, key: str, response: Response) -> None:
|
|
295
314
|
"""Set cache entry for key with contents of response."""
|
|
296
315
|
mod = response.headers.pop("last-modified", None)
|
|
316
|
+
response.headers.pop("content-encoding", None)
|
|
317
|
+
response.headers.pop("content-length", None)
|
|
297
318
|
status = int(response.status_code)
|
|
298
319
|
rec = (
|
|
299
320
|
key,
|
|
@@ -330,6 +351,7 @@ class SqliteCache(CacheStorageBase):
|
|
|
330
351
|
content=rec["data"],
|
|
331
352
|
default_encoding=rec["encoding"],
|
|
332
353
|
headers=json.loads(rec["headers"]),
|
|
354
|
+
request=Request("GET", key), # not perfect, but it'll do
|
|
333
355
|
)
|
|
334
356
|
return resp
|
|
335
357
|
|
careful/httpx/retries.py
CHANGED
|
@@ -2,25 +2,37 @@ import time
|
|
|
2
2
|
import types
|
|
3
3
|
import functools
|
|
4
4
|
import logging
|
|
5
|
+
from typing import Protocol, Callable, cast
|
|
5
6
|
from httpx import Client, Response, HTTPError
|
|
7
|
+
from ._types import ResponsePredicate
|
|
6
8
|
|
|
7
|
-
log = logging.getLogger("
|
|
9
|
+
log = logging.getLogger("careful")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class RetryClient(Protocol):
    """Typing-only view of a client that has been patched for retries."""

    # number of retry attempts (make_retry_client clamps this to >= 0)
    _retry_attempts: int
    # wait in seconds, as passed to make_retry_client
    _retry_wait_seconds: float
    # predicate that returns True for responses that should be retried
    _should_retry: ResponsePredicate
    # the client's request method as it was before the retry wrapper
    _no_retry_request: Callable
    # the (wrapped) request method
    request: Callable
|
|
8
18
|
|
|
9
19
|
|
|
10
20
|
def retry_default_rule(response: Response) -> bool:
    """Default retry rule: retry any status of 400 or above, except 404."""
    status = response.status_code
    if status == 404:
        return False
    return status >= 400
|
|
13
23
|
|
|
14
24
|
|
|
15
25
|
def retry_only_500s(response: Response) -> bool:
    """Retry rule: retry only when the status code is 500 or above."""
    status = response.status_code
    return 500 <= status
|
|
17
28
|
|
|
18
29
|
|
|
19
30
|
def retry_all_400s_500s(response: Response) -> bool:
    """Retry rule: retry whenever the status code is 400 or above (404 included)."""
    status = response.status_code
    return 400 <= status
|
|
21
33
|
|
|
22
34
|
|
|
23
|
-
def _retry_request(client:
|
|
35
|
+
def _retry_request(client: RetryClient, *args, **kwargs) -> Response:
|
|
24
36
|
# the retry loop
|
|
25
37
|
tries = 0
|
|
26
38
|
exception_raised = None
|
|
@@ -75,15 +87,17 @@ def make_retry_client(
|
|
|
75
87
|
attempts: int = 1,
|
|
76
88
|
wait_seconds: float = 10,
|
|
77
89
|
should_retry=retry_default_rule,
|
|
78
|
-
):
|
|
90
|
+
) -> Client:
|
|
79
91
|
if client is None:
|
|
80
92
|
client = Client()
|
|
81
|
-
client._retry_attempts = max(0, attempts)
|
|
82
|
-
client._retry_wait_seconds = wait_seconds
|
|
83
|
-
client._should_retry = should_retry
|
|
84
93
|
|
|
85
|
-
|
|
86
|
-
|
|
94
|
+
tclient = cast(RetryClient, client)
|
|
95
|
+
tclient._retry_attempts = max(0, attempts)
|
|
96
|
+
tclient._retry_wait_seconds = wait_seconds
|
|
97
|
+
tclient._should_retry = should_retry
|
|
98
|
+
|
|
99
|
+
tclient._no_retry_request = client.request
|
|
100
|
+
tclient.request = types.MethodType(
|
|
87
101
|
functools.wraps(client.request)(_retry_request), client
|
|
88
102
|
)
|
|
89
103
|
|
careful/httpx/robots.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import types
|
|
2
|
+
import functools
|
|
3
|
+
import warnings
|
|
4
|
+
from urllib.robotparser import RobotFileParser
|
|
5
|
+
from typing import Protocol, cast, Callable
|
|
6
|
+
from httpx import Client, Response, URL
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class RobotExclusionError(Exception):
    """Raised by `raise_robots_txt` when a URL is excluded by robots.txt."""
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def raise_robots_txt(url, robots):
    """
    Rejection handler that aborts the request.

    Raises `RobotExclusionError` naming the rejected `url` and the robots.txt
    location (`robots.url`) that excluded it.
    """
    message = f"{url} excluded by {robots.url}"
    raise RobotExclusionError(message)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def warn_robots_txt(url, robots):
    """
    Rejection handler that only warns.

    Emits a warning naming the rejected `url` and the robots.txt location
    (`robots.url`) that excluded it, then returns normally.
    """
    message = f"{url} excluded by {robots.url}"
    warnings.warn(message)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class RobotsClient(Protocol):
    """Typing-only view of a client that has been patched for robots.txt checks."""

    # parsed robots.txt files, one per site already looked up
    _robots_for_domain: dict[str, RobotFileParser]
    # user agent string the robots.txt rules are evaluated for
    _robots_ua: str
    # callback invoked with (url, parser) when a URL is disallowed
    _rejected_action: Callable[[str, RobotFileParser], None]
    # the client's request method as it was before the robots.txt wrapper
    _no_check_request: Callable
    # the (wrapped) request method
    request: Callable
    # the client's headers; source of the default user agent
    headers: dict
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _robot_check_request(client: RobotsClient, *args, **kwargs) -> Response:
    """
    Replacement for `client.request` that consults robots.txt first.

    The robots.txt of the target site is fetched on the first request to that
    site and the parsed result is kept on the client for later requests. If the
    rules disallow the URL for the configured user agent, the client's rejection
    callback is invoked; unless that callback raises, the request then proceeds.

    Parameters:
        client: The patched client.
        *args: Exactly `(method, url)`, as passed to `client.request`.
        **kwargs: Forwarded unchanged to the underlying request.
    """
    method, url = args
    uurl = URL(url)
    # the robots.txt API works on plain strings; `url` may be an httpx.URL
    url_str = str(url)

    # robots.txt applies per scheme + host + port, so key the cache on all of
    # them; netloc (bytes) is the host plus any non-default port
    origin = f"{uurl.scheme}://{uurl.netloc.decode('ascii')}"

    parser = client._robots_for_domain.get(origin)
    if parser is None:
        robots_url = f"{origin}/robots.txt"
        # params is passed explicitly because the caching layer underneath
        # looks it up in kwargs unconditionally
        robots_resp = client._no_check_request("GET", robots_url, params=None)
        # pass url for output, but don't let the parser do its own read
        parser = RobotFileParser(robots_url)
        parser.parse(robots_resp.text.splitlines())
        client._robots_for_domain[origin] = parser

    if not parser.can_fetch(client._robots_ua, url_str):
        client._rejected_action(url_str, parser)

    # if action doesn't raise an exception, the request goes through
    return client._no_check_request(*args, **kwargs)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Signature of an `on_rejection` callback: it receives the disallowed URL and
# the RobotFileParser whose rules disallowed it.
RobotsRejectFunc = Callable[[str, RobotFileParser], None]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def make_robots_txt_client(
    *,
    client: Client | None = None,
    as_user_agent: str | None = None,
    on_rejection: RobotsRejectFunc = raise_robots_txt,
) -> Client:
    """
    Patch an `httpx.Client` so every request is checked against robots.txt.

    Parameters:
        client: Client to patch in place; a new `Client()` is made if omitted.
        as_user_agent: User agent to evaluate robots.txt rules for. When not
            given, the client's own `user-agent` header is used.
        on_rejection: Called with the URL and the parsed robots.txt when a
            request is disallowed.

    Returns:
        The same client object, with its `request` method wrapped.
    """
    base = Client() if client is None else client
    wrapped = cast(RobotsClient, base)

    wrapped._robots_for_domain = {}
    wrapped._robots_ua = as_user_agent or wrapped.headers["user-agent"]
    wrapped._rejected_action = on_rejection

    # keep the unwrapped request reachable, then install the checking wrapper
    wrapped._no_check_request = base.request
    checked = functools.wraps(base.request)(_robot_check_request)
    wrapped.request = types.MethodType(checked, base)
    return base
|
careful/httpx/throttle.py
CHANGED
|
@@ -2,12 +2,21 @@ import time
|
|
|
2
2
|
import types
|
|
3
3
|
import functools
|
|
4
4
|
import logging
|
|
5
|
-
from
|
|
5
|
+
from typing import Protocol, cast, Callable
|
|
6
|
+
from httpx import Client, Response
|
|
6
7
|
|
|
7
|
-
log = logging.getLogger("
|
|
8
|
+
log = logging.getLogger("careful")
|
|
8
9
|
|
|
9
10
|
|
|
10
|
-
|
|
11
|
+
class ThrottledClient(Protocol):
    """Typing-only view of a client that has been patched for throttling."""

    # time.time() reading associated with the previous request (starts at 0.0)
    _last_request: float
    # configured request rate
    _requests_per_minute: float
    # seconds between requests, i.e. 60 / requests_per_minute
    _request_frequency: float
    # the client's request method as it was before the throttling wrapper
    _no_throttle_request: Callable
    # the (wrapped) request method
    request: Callable
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _throttle_request(client: ThrottledClient, *args, **kwargs) -> Response:
|
|
11
20
|
now = time.time()
|
|
12
21
|
diff = client._request_frequency - (now - client._last_request)
|
|
13
22
|
if diff > 0:
|
|
@@ -23,19 +32,21 @@ def make_throttled_client(
|
|
|
23
32
|
*,
|
|
24
33
|
client: Client | None = None,
|
|
25
34
|
requests_per_minute: float = 0,
|
|
26
|
-
):
|
|
35
|
+
) -> Client:
|
|
27
36
|
if requests_per_minute <= 0:
|
|
28
37
|
raise ValueError("requests per minute must be a positive number")
|
|
29
38
|
|
|
30
39
|
if client is None:
|
|
31
40
|
client = Client()
|
|
32
41
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
42
|
+
tclient = cast(ThrottledClient, client)
|
|
43
|
+
|
|
44
|
+
tclient._last_request = 0.0
|
|
45
|
+
tclient._requests_per_minute = requests_per_minute
|
|
46
|
+
tclient._request_frequency = 60.0 / requests_per_minute
|
|
36
47
|
|
|
37
|
-
|
|
38
|
-
|
|
48
|
+
tclient._no_throttle_request = client.request
|
|
49
|
+
tclient.request = types.MethodType(
|
|
39
50
|
functools.wraps(client.request)(_throttle_request), client
|
|
40
51
|
)
|
|
41
52
|
return client
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: careful
|
|
3
|
-
Version: 0.2
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 0.3.2
|
|
4
|
+
Summary: a small library for writing resilient, well-behaved HTTP code
|
|
5
5
|
Project-URL: Repository, https://codeberg.org/jpt/careful
|
|
6
6
|
Author-email: jpt <dev@jpt.sh>
|
|
7
7
|
License: BSD-2-Clause
|
|
@@ -15,6 +15,7 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.11
|
|
16
16
|
Classifier: Programming Language :: Python :: 3.12
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
18
19
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
20
|
Requires-Python: >=3.10
|
|
20
21
|
Requires-Dist: httpx>=0.28.1
|
|
@@ -22,43 +23,50 @@ Description-Content-Type: text/markdown
|
|
|
22
23
|
|
|
23
24
|
# careful
|
|
24
25
|
|
|
25
|
-
<img src="https://
|
|
26
|
+
<img src="https://jpt.sh/projects/careful/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
|
|
27
|
+
|
|
28
|
+
**careful** is a Python library for writing resilient, well-behaved HTTP clients.
|
|
26
29
|
|
|
27
|
-
**careful** is a Python library for making requests to unreliable websites with `httpx`.
|
|
28
|
-
|
|
29
30
|
**Code**: <https://codeberg.org/jpt/careful>
|
|
30
31
|
|
|
31
|
-
**Docs**: <https://
|
|
32
|
+
**Docs**: <https://jpt.sh/projects/careful/>
|
|
32
33
|
|
|
34
|
+

|
|
33
35
|
[](https://ci.codeberg.org/repos/15185)
|
|
34
36
|
|
|
35
|
-
|
|
36
|
-
[
|
|
37
|
-
useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
|
|
37
|
+
Call one function to enchant an
|
|
38
|
+
**[httpx.Client](https://www.python-httpx.org)**, making your HTTP connections more resilient and better mannered.
|
|
38
39
|
|
|
39
|
-
- **
|
|
40
|
-
- **
|
|
41
|
-
- **
|
|
40
|
+
- Configure **throttling** to avoid accidental Denial-of-Service / risking getting banned.
|
|
41
|
+
- **Retries** help overcome intermittent failures on flaky sites or long crawls.
|
|
42
|
+
- **Development caching** persists between runs during development, reducing redundant requests made while iterating on your crawlers & scrapers.
|
|
42
43
|
|
|
43
|
-
###
|
|
44
|
+
### Example
|
|
44
45
|
|
|
45
46
|
```python
|
|
46
47
|
from httpx import Client
|
|
47
48
|
from careful.httpx import make_careful_client, MemoryCache
|
|
48
49
|
|
|
50
|
+
# the only function you need to call is make_careful_client
|
|
51
|
+
# this wraps your existing `httpx.Client` with your preferred
|
|
52
|
+
# careful behaviors
|
|
53
|
+
|
|
49
54
|
client = make_careful_client(
|
|
50
|
-
|
|
51
|
-
|
|
55
|
+
client=Client(headers={'user-agent': 'spiderman/1.0'}),
|
|
56
|
+
|
|
52
57
|
# retries are configurable w/ exponential back off
|
|
53
58
|
retry_attempts=2,
|
|
54
59
|
retry_wait_seconds=5,
|
|
60
|
+
|
|
55
61
|
# can cache to process memory, filesystem, or SQLite
|
|
56
62
|
cache_storage=MemoryCache(),
|
|
57
|
-
|
|
63
|
+
|
|
64
|
+
# easy-to-configure throttling
|
|
58
65
|
requests_per_minute=60,
|
|
59
66
|
)
|
|
60
67
|
|
|
61
|
-
#
|
|
68
|
+
# methods on client are called as they always are
|
|
69
|
+
# configured behaviors occur without further code changes
|
|
62
70
|
client.get("https://example.com")
|
|
63
71
|
```
|
|
64
72
|
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
careful/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
careful/httpx/__init__.py,sha256=Ul_K5XSMMW8yo3M-lq8nDbvtDEQS0N-vmurIGnzE8dY,7381
|
|
3
|
+
careful/httpx/_types.py,sha256=jefYDxSbLRUatU8QKeyxStc9UC3AJwAba2SfhNkM0RY,151
|
|
4
|
+
careful/httpx/dev_cache.py,sha256=cc4_rLKFc6Xggpx5MKc8DBurn5KXTDLHO19U_SipXiY,11873
|
|
5
|
+
careful/httpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
careful/httpx/retries.py,sha256=3kjuHKYnK1N4Rtum5gUyY_XO4o4cL4jc59d17Y6UwrI,2949
|
|
7
|
+
careful/httpx/robots.py,sha256=jfqQdplTap_RCENu6MHEIabFVznFLruMvSIaG_u0v_8,2168
|
|
8
|
+
careful/httpx/throttle.py,sha256=b1fbmUskcm343D1bbPbY-ITLdL1zVm1dXtjt9LT1bEA,1412
|
|
9
|
+
careful-0.3.2.dist-info/METADATA,sha256=fN6sNY5n4PFna9Eq9uTfxDbvzCpct-qXcLOkzrdItGE,2692
|
|
10
|
+
careful-0.3.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
11
|
+
careful-0.3.2.dist-info/licenses/LICENSE,sha256=oHe4LmCuo6CZne42DRXfiR3uqqIfGsk4dAvDKucAi4M,1315
|
|
12
|
+
careful-0.3.2.dist-info/RECORD,,
|
careful-0.2.1.dist-info/RECORD
DELETED
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
careful/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
careful/httpx/__init__.py,sha256=u-n0uKIWAd3NXsZUd1UA4wzJJTEhRR74diHzDV2EpEU,4885
|
|
3
|
-
careful/httpx/_types.py,sha256=jefYDxSbLRUatU8QKeyxStc9UC3AJwAba2SfhNkM0RY,151
|
|
4
|
-
careful/httpx/dev_cache.py,sha256=HNtEXncPpqsjIEoz5UhRf4YO2iVwz5uowKc4_B74fZg,11024
|
|
5
|
-
careful/httpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
-
careful/httpx/retries.py,sha256=Kszm0wDITyPZ3qx5TsDL__HjCYVJyAZ2WehrlpXV5Cc,2500
|
|
7
|
-
careful/httpx/throttle.py,sha256=ZpuFABYHGQ4D0zks922SCXp7WZG_-Ysafz-Npa2QVwQ,1096
|
|
8
|
-
careful-0.2.1.dist-info/METADATA,sha256=ZAKwiwqykmep0LiYCzFLWJfTgharbvhW3FCJ3p0b_-8,2498
|
|
9
|
-
careful-0.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
10
|
-
careful-0.2.1.dist-info/licenses/LICENSE,sha256=oHe4LmCuo6CZne42DRXfiR3uqqIfGsk4dAvDKucAi4M,1315
|
|
11
|
-
careful-0.2.1.dist-info/RECORD,,
|
|
File without changes
|