careful 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- careful/httpx/__init__.py +112 -41
- careful/httpx/dev_cache.py +44 -34
- careful/httpx/retries.py +23 -9
- careful/httpx/robots.py +72 -0
- careful/httpx/throttle.py +20 -9
- {careful-0.2.1.dist-info → careful-0.3.1.dist-info}/METADATA +25 -17
- careful-0.3.1.dist-info/RECORD +12 -0
- {careful-0.2.1.dist-info → careful-0.3.1.dist-info}/WHEEL +1 -1
- careful-0.2.1.dist-info/RECORD +0 -11
- {careful-0.2.1.dist-info → careful-0.3.1.dist-info}/licenses/LICENSE +0 -0
careful/httpx/__init__.py
CHANGED
|
@@ -1,11 +1,20 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from urllib.parse import urlparse
|
|
1
3
|
from .retries import make_retry_client, retry_default_rule
|
|
2
4
|
from .throttle import make_throttled_client
|
|
5
|
+
from .robots import (
|
|
6
|
+
make_robots_txt_client,
|
|
7
|
+
RobotExclusionError,
|
|
8
|
+
RobotsRejectFunc,
|
|
9
|
+
raise_robots_txt,
|
|
10
|
+
)
|
|
3
11
|
from .dev_cache import (
|
|
4
12
|
make_dev_caching_client,
|
|
5
13
|
MemoryCache,
|
|
6
14
|
FileCache,
|
|
7
15
|
SqliteCache,
|
|
8
|
-
|
|
16
|
+
CacheStorage,
|
|
17
|
+
CacheResponse,
|
|
9
18
|
_cache_200s,
|
|
10
19
|
_default_keyfunc,
|
|
11
20
|
)
|
|
@@ -20,11 +29,14 @@ def make_careful_client(
|
|
|
20
29
|
retry_wait_seconds: float = 10,
|
|
21
30
|
should_retry: ResponsePredicate = retry_default_rule,
|
|
22
31
|
requests_per_minute: int = 0,
|
|
23
|
-
cache_storage:
|
|
32
|
+
cache_storage: CacheStorage | None = None,
|
|
24
33
|
cache_write_only: bool = False,
|
|
25
34
|
should_cache: ResponsePredicate = _cache_200s,
|
|
26
35
|
cache_keyfunc: CacheKeyfunc = _default_keyfunc,
|
|
27
|
-
|
|
36
|
+
check_robots_txt: bool = False,
|
|
37
|
+
robots_txt_user_agent: str | None = None,
|
|
38
|
+
robots_txt_on_reject: RobotsRejectFunc = raise_robots_txt,
|
|
39
|
+
) -> Client:
|
|
28
40
|
"""
|
|
29
41
|
This function patches an `httpx.Client` so that all requests made with the client support
|
|
30
42
|
[retries](#retries), [throttling](#throttling), and [development caching](#development-caching).
|
|
@@ -51,43 +63,6 @@ def make_careful_client(
|
|
|
51
63
|
|
|
52
64
|
cache_keyfunc: Function that takes request details and returns a unique cache key.
|
|
53
65
|
|
|
54
|
-
## Retries
|
|
55
|
-
|
|
56
|
-
If `retry_attempts` is set, responses will be passed to `should_retry`.
|
|
57
|
-
Responses that are rejected (return `True`) will be retried after a wait based on
|
|
58
|
-
`retry_wait_seconds`.
|
|
59
|
-
Each retry will wait twice as long as the one before.
|
|
60
|
-
|
|
61
|
-
## Throttling
|
|
62
|
-
|
|
63
|
-
If `requests_per_minute` is set, standard (non-retry) requests will automatically
|
|
64
|
-
sleep for a short period to target the given rate.
|
|
65
|
-
|
|
66
|
-
For example, at 30rpm, the sleep time on a fast request will be close to 2 seconds.
|
|
67
|
-
|
|
68
|
-
## Development Caching
|
|
69
|
-
|
|
70
|
-
Why **development caching?**
|
|
71
|
-
|
|
72
|
-
This feature is named as a reminder that **this is not true HTTP caching**, which
|
|
73
|
-
should take various headers into account. Look at libraries like [hishel](https://hishel.com) if that's what you are after.
|
|
74
|
-
|
|
75
|
-
The purpose of this feature is to allow you to cache all of your HTTP requests during development.
|
|
76
|
-
Often when writing a scraper or crawler, you wind up hitting the site you are working on more often than you'd like-- each time you iterate on your code you're likely making redundant requests to pages that haven't changed.
|
|
77
|
-
|
|
78
|
-
By caching all successful requests (configurable with the `should_cache` parameter),
|
|
79
|
-
you can easily re-run scrapers without making redundant HTTP requests.
|
|
80
|
-
This means faster development time & happier upstream servers.
|
|
81
|
-
|
|
82
|
-
To enable development caching, assign a [`MemoryCache`][careful.httpx.MemoryCache],
|
|
83
|
-
[`FileCache`][careful.httpx.FileCache], or [`SqliteCache`][careful.httpx.SqliteCache] to
|
|
84
|
-
the `cache_storage` property of a `scrapelib.Scraper`.
|
|
85
|
-
|
|
86
|
-
---
|
|
87
|
-
|
|
88
|
-
When multiple features are applied, the order of wrapping ensures that:
|
|
89
|
-
- the cache is checked first, and bypasses throttling if hit
|
|
90
|
-
- retries use their own delays, but not throttled separately
|
|
91
66
|
"""
|
|
92
67
|
if client is None:
|
|
93
68
|
client = Client()
|
|
@@ -104,7 +79,7 @@ def make_careful_client(
|
|
|
104
79
|
client = make_throttled_client(
|
|
105
80
|
client=client, requests_per_minute=requests_per_minute
|
|
106
81
|
)
|
|
107
|
-
# caching on top layer, so cache will be checked
|
|
82
|
+
# caching on top layer, so cache will be checked before throttling/etc.
|
|
108
83
|
if cache_storage:
|
|
109
84
|
client = make_dev_caching_client(
|
|
110
85
|
client=client,
|
|
@@ -113,15 +88,111 @@ def make_careful_client(
|
|
|
113
88
|
should_cache=should_cache,
|
|
114
89
|
write_only=cache_write_only,
|
|
115
90
|
)
|
|
91
|
+
# robots.txt before cache
|
|
92
|
+
if check_robots_txt:
|
|
93
|
+
client = make_robots_txt_client(
|
|
94
|
+
client=client,
|
|
95
|
+
as_user_agent=robots_txt_user_agent,
|
|
96
|
+
on_rejection=robots_txt_on_reject,
|
|
97
|
+
)
|
|
116
98
|
|
|
117
99
|
return client
|
|
118
100
|
|
|
119
101
|
|
|
102
|
+
def _int_env(var_name: str, default: int) -> int:
|
|
103
|
+
return int(os.environ.get(var_name, default))
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _float_env(var_name: str, default: float) -> float:
|
|
107
|
+
return float(os.environ.get(var_name, default))
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _bool_env(var_name: str, default: bool) -> bool:
|
|
111
|
+
"""helper function for bool env vars"""
|
|
112
|
+
return bool(os.environ.get(var_name, "T" if default else ""))
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _cache_env(var_name: str, default: CacheStorage | None) -> CacheStorage | None:
    """
    Build a cache backend from a URL-style environment variable.

    Recognized forms (the location is everything between `://` and any
    `?` / `#`):

    - `memory://`
    - `file://relative/dir` or `file:///absolute/dir`
    - `sqlite://relative/db.sqlite3` or `sqlite:///absolute/db.sqlite3`

    Returns `default` when the variable is unset or empty, and also (after
    emitting a warning) when the scheme is not one of the above.
    """
    import warnings

    cache_str = os.environ.get(var_name)
    if not cache_str:
        return default

    parsed = urlparse(cache_str)
    # urlparse puts the first segment of "scheme://a/b" into netloc, so the
    # location must be rebuilt from netloc + path or "a" is silently lost.
    location = parsed.netloc + parsed.path

    if parsed.scheme == "memory":
        return MemoryCache()
    if parsed.scheme == "file":
        return FileCache(location)
    if parsed.scheme == "sqlite":
        return SqliteCache(location)

    # a typo in the scheme should be visible, and should not discard a cache
    # the caller passed explicitly
    warnings.warn(
        f"{var_name}={cache_str!r} has an unrecognized cache scheme; ignoring it",
        stacklevel=2,
    )
    return default
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def make_careful_client_from_env(
    *,
    client: Client | None = None,
    retry_attempts: int = 0,
    retry_wait_seconds: float = 10,
    should_retry: ResponsePredicate = retry_default_rule,
    requests_per_minute: int = 0,
    cache_storage: CacheStorage | None = None,
    cache_write_only: bool = False,
    should_cache: ResponsePredicate = _cache_200s,
    cache_keyfunc: CacheKeyfunc = _default_keyfunc,
    check_robots_txt: bool = False,
    robots_txt_user_agent: str | None = None,
    robots_txt_on_reject: RobotsRejectFunc = raise_robots_txt,
) -> Client:
    """
    Make a careful client, letting environment variables override arguments.

    Each variable below, when set, takes precedence over the matching
    parameter; the parameter is used when the variable is not set.

    Numeric:
        - CAREFUL_RETRY_ATTEMPTS (int)
        - CAREFUL_RETRY_WAIT_SECONDS (float)
        - CAREFUL_REQUESTS_PER_MINUTE (int)
    Booleans (any non-empty value is true):
        - CAREFUL_CACHE_WRITE_ONLY
        - CAREFUL_CHECK_ROBOTS_TXT
    Strings:
        - CAREFUL_ROBOTS_TXT_USER_AGENT
    Cache:
        - CAREFUL_CACHE, which can be:
            memory://
            sqlite:///absolute/path/to/db.sqlite3
            file:///absolute/path/to/directory

    `client` and the function parameters (`should_retry`, `should_cache`,
    `cache_keyfunc`, `robots_txt_on_reject`) do not have environment
    variables and are passed through unchanged.
    """
    return make_careful_client(
        client=client,
        retry_attempts=_int_env("CAREFUL_RETRY_ATTEMPTS", retry_attempts),
        retry_wait_seconds=_float_env("CAREFUL_RETRY_WAIT_SECONDS", retry_wait_seconds),
        should_retry=should_retry,
        requests_per_minute=_int_env(
            "CAREFUL_REQUESTS_PER_MINUTE", requests_per_minute
        ),
        cache_storage=_cache_env("CAREFUL_CACHE", cache_storage),
        cache_write_only=_bool_env("CAREFUL_CACHE_WRITE_ONLY", cache_write_only),
        should_cache=should_cache,
        cache_keyfunc=cache_keyfunc,
        check_robots_txt=_bool_env("CAREFUL_CHECK_ROBOTS_TXT", check_robots_txt),
        robots_txt_user_agent=os.environ.get(
            "CAREFUL_ROBOTS_TXT_USER_AGENT", robots_txt_user_agent
        ),
        robots_txt_on_reject=robots_txt_on_reject,
    )
|
|
185
|
+
|
|
186
|
+
|
|
120
187
|
# Names exported by `from careful.httpx import *`: the client factories, the
# cache backends and protocol, and the robots.txt helpers imported above.
__all__ = [
    "make_careful_client",
    "make_careful_client_from_env",
    "make_retry_client",
    "make_throttled_client",
    "make_dev_caching_client",
    "make_robots_txt_client",
    "MemoryCache",
    "FileCache",
    "SqliteCache",
    "CacheStorage",
    "CacheResponse",
    "RobotExclusionError",
    "raise_robots_txt",
]
|
careful/httpx/dev_cache.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import abc
|
|
2
1
|
import types
|
|
3
2
|
import functools
|
|
4
3
|
import logging
|
|
@@ -8,10 +7,27 @@ import glob
|
|
|
8
7
|
import hashlib
|
|
9
8
|
import sqlite3
|
|
10
9
|
import json
|
|
10
|
+
from typing import cast, Protocol, Callable
|
|
11
|
+
from ._types import ResponsePredicate, CacheKeyfunc
|
|
11
12
|
|
|
12
13
|
from httpx import Client, Response, Request
|
|
13
14
|
|
|
14
|
-
log = logging.getLogger("
|
|
15
|
+
log = logging.getLogger("careful")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class CacheStorage(Protocol):
    """
    Structural interface for a cache backend: a string-keyed store of
    `httpx.Response` objects with a `get` and a `set` method.
    """

    def get(self, key: str) -> None | Response:
        """Return the response stored under `key`; the signature allows `None`."""

    def set(self, key: str, response: Response) -> None:
        """Store `response` under `key`."""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class DevCacheClient(Protocol):
    """
    Typing-only view of an `httpx.Client` after `make_dev_caching_client`
    has attached its private caching attributes to it.
    """

    # NOTE(review): not assigned by make_dev_caching_client in the code
    # visible here; possibly a leftover from RetryClient. Confirm before
    # relying on it.
    _retry_attempts: int

    # cache configuration passed to make_dev_caching_client
    _cache_storage: CacheStorage
    _write_only: bool
    _should_cache: ResponsePredicate
    _cache_keyfunc: CacheKeyfunc

    # the client's request method as it was before wrapping, and the
    # caching wrapper that replaces it
    _no_cache_request: Callable
    request: Callable
|
|
15
31
|
|
|
16
32
|
|
|
17
33
|
def _default_keyfunc(
|
|
@@ -28,7 +44,7 @@ def _default_keyfunc(
|
|
|
28
44
|
if method.lower() != "get":
|
|
29
45
|
return None
|
|
30
46
|
|
|
31
|
-
return Request(url=url, method=method, params=params).url
|
|
47
|
+
return str(Request(url=url, method=method, params=params).url)
|
|
32
48
|
|
|
33
49
|
|
|
34
50
|
def _cache_200s(response: Response) -> bool:
|
|
@@ -40,7 +56,11 @@ def _cache_200s(response: Response) -> bool:
|
|
|
40
56
|
return response.status_code == 200
|
|
41
57
|
|
|
42
58
|
|
|
43
|
-
|
|
59
|
+
class CacheResponse(Response):
    """
    `httpx.Response` subtype used for typing responses returned by the
    caching wrapper, which carry an extra `fromcache` attribute.
    """

    # set by _cached_request: True for a cache hit, False for a fresh request
    fromcache: bool
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _cached_request(client: DevCacheClient, *args, **kwargs) -> CacheResponse:
|
|
44
64
|
method, url = args
|
|
45
65
|
request_key = client._cache_keyfunc(method, url, kwargs["params"])
|
|
46
66
|
|
|
@@ -50,30 +70,28 @@ def _cached_request(client: Client, *args, **kwargs):
|
|
|
50
70
|
cached_resp = client._cache_storage.get(request_key)
|
|
51
71
|
|
|
52
72
|
if cached_resp:
|
|
53
|
-
# resp = cast(CacheResponse, resp_maybe)
|
|
54
73
|
log.info("using cached response request_key=%s", request_key)
|
|
55
|
-
|
|
56
|
-
|
|
74
|
+
new_resp = cast(CacheResponse, cached_resp)
|
|
75
|
+
new_resp.fromcache = True
|
|
57
76
|
else:
|
|
58
|
-
|
|
77
|
+
new_resp = cast(CacheResponse, client._no_cache_request(*args, **kwargs))
|
|
78
|
+
new_resp.fromcache = False
|
|
59
79
|
# save to cache if request and response meet criteria
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
client._cache_storage.set(request_key, resp)
|
|
80
|
+
if request_key and client._should_cache(new_resp):
|
|
81
|
+
client._cache_storage.set(request_key, new_resp)
|
|
63
82
|
log.info("caching response request_key=%s", request_key)
|
|
64
|
-
resp.fromcache = False
|
|
65
83
|
|
|
66
|
-
return
|
|
84
|
+
return new_resp
|
|
67
85
|
|
|
68
86
|
|
|
69
87
|
def make_dev_caching_client(
|
|
70
88
|
*,
|
|
89
|
+
cache_storage: CacheStorage,
|
|
71
90
|
client: Client | None = None,
|
|
72
|
-
cache_storage=None,
|
|
73
91
|
cache_keyfunc=_default_keyfunc,
|
|
74
92
|
should_cache=_cache_200s,
|
|
75
93
|
write_only=False,
|
|
76
|
-
):
|
|
94
|
+
) -> Client:
|
|
77
95
|
"""
|
|
78
96
|
Returns an enhanced `httpx.Client` where requests are saved to a
|
|
79
97
|
specified cache.
|
|
@@ -98,29 +116,21 @@ def make_dev_caching_client(
|
|
|
98
116
|
if client is None:
|
|
99
117
|
client = Client()
|
|
100
118
|
|
|
101
|
-
|
|
102
|
-
client._cache_keyfunc = cache_keyfunc
|
|
103
|
-
client._should_cache = should_cache
|
|
104
|
-
client._write_only = write_only
|
|
119
|
+
tclient = cast(DevCacheClient, client)
|
|
105
120
|
|
|
106
|
-
|
|
107
|
-
|
|
121
|
+
tclient._cache_storage = cache_storage
|
|
122
|
+
tclient._cache_keyfunc = cache_keyfunc
|
|
123
|
+
tclient._should_cache = should_cache
|
|
124
|
+
tclient._write_only = write_only
|
|
125
|
+
|
|
126
|
+
tclient._no_cache_request = client.request
|
|
127
|
+
tclient.request = types.MethodType(
|
|
108
128
|
functools.wraps(client.request)(_cached_request), client
|
|
109
129
|
)
|
|
110
130
|
return client
|
|
111
131
|
|
|
112
132
|
|
|
113
|
-
class
|
|
114
|
-
@abc.abstractmethod
|
|
115
|
-
def get(self, key: str) -> None | Response:
|
|
116
|
-
raise NotImplementedError()
|
|
117
|
-
|
|
118
|
-
@abc.abstractmethod
|
|
119
|
-
def set(self, key: str, response: Response) -> None:
|
|
120
|
-
raise NotImplementedError()
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
class MemoryCache(CacheStorageBase):
|
|
133
|
+
class MemoryCache(CacheStorage):
|
|
124
134
|
"""
|
|
125
135
|
In memory cache for request responses.
|
|
126
136
|
|
|
@@ -144,7 +154,7 @@ class MemoryCache(CacheStorageBase):
|
|
|
144
154
|
self.cache[key] = response
|
|
145
155
|
|
|
146
156
|
|
|
147
|
-
class FileCache(
|
|
157
|
+
class FileCache(CacheStorage):
|
|
148
158
|
"""
|
|
149
159
|
File-based cache for request responses.
|
|
150
160
|
|
|
@@ -260,7 +270,7 @@ class FileCache(CacheStorageBase):
|
|
|
260
270
|
os.remove(fname)
|
|
261
271
|
|
|
262
272
|
|
|
263
|
-
class SqliteCache(
|
|
273
|
+
class SqliteCache(CacheStorage):
|
|
264
274
|
"""
|
|
265
275
|
sqlite cache for request responses.
|
|
266
276
|
|
careful/httpx/retries.py
CHANGED
|
@@ -2,25 +2,37 @@ import time
|
|
|
2
2
|
import types
|
|
3
3
|
import functools
|
|
4
4
|
import logging
|
|
5
|
+
from typing import Protocol, Callable, cast
|
|
5
6
|
from httpx import Client, Response, HTTPError
|
|
7
|
+
from ._types import ResponsePredicate
|
|
6
8
|
|
|
7
|
-
log = logging.getLogger("
|
|
9
|
+
log = logging.getLogger("careful")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class RetryClient(Protocol):
    """
    Typing-only view of an `httpx.Client` after `make_retry_client` has
    attached its private retry attributes to it.
    """

    # retry configuration; make_retry_client clamps attempts to >= 0
    _retry_attempts: int
    _retry_wait_seconds: float
    _should_retry: ResponsePredicate

    # the client's request method as it was before wrapping, and the
    # retrying wrapper that replaces it
    _no_retry_request: Callable
    request: Callable
|
|
8
18
|
|
|
9
19
|
|
|
10
20
|
def retry_default_rule(response: Response) -> bool:
    """Retry any 4xx/5xx status except 404 (the default retry rule)."""
    status = response.status_code
    if status == 404:
        return False
    return status >= 400
|
|
13
23
|
|
|
14
24
|
|
|
15
25
|
def retry_only_500s(response: Response) -> bool:
    """Retry only when the status code is 500 or above."""
    status = response.status_code
    return not status < 500
|
|
17
28
|
|
|
18
29
|
|
|
19
30
|
def retry_all_400s_500s(response: Response) -> bool:
    """Retry whenever the status code is 400 or above (404 included)."""
    status = response.status_code
    return not status < 400
|
|
21
33
|
|
|
22
34
|
|
|
23
|
-
def _retry_request(client:
|
|
35
|
+
def _retry_request(client: RetryClient, *args, **kwargs) -> Response:
|
|
24
36
|
# the retry loop
|
|
25
37
|
tries = 0
|
|
26
38
|
exception_raised = None
|
|
@@ -75,15 +87,17 @@ def make_retry_client(
|
|
|
75
87
|
attempts: int = 1,
|
|
76
88
|
wait_seconds: float = 10,
|
|
77
89
|
should_retry=retry_default_rule,
|
|
78
|
-
):
|
|
90
|
+
) -> Client:
|
|
79
91
|
if client is None:
|
|
80
92
|
client = Client()
|
|
81
|
-
client._retry_attempts = max(0, attempts)
|
|
82
|
-
client._retry_wait_seconds = wait_seconds
|
|
83
|
-
client._should_retry = should_retry
|
|
84
93
|
|
|
85
|
-
|
|
86
|
-
|
|
94
|
+
tclient = cast(RetryClient, client)
|
|
95
|
+
tclient._retry_attempts = max(0, attempts)
|
|
96
|
+
tclient._retry_wait_seconds = wait_seconds
|
|
97
|
+
tclient._should_retry = should_retry
|
|
98
|
+
|
|
99
|
+
tclient._no_retry_request = client.request
|
|
100
|
+
tclient.request = types.MethodType(
|
|
87
101
|
functools.wraps(client.request)(_retry_request), client
|
|
88
102
|
)
|
|
89
103
|
|
careful/httpx/robots.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import types
|
|
2
|
+
import functools
|
|
3
|
+
import warnings
|
|
4
|
+
from urllib.robotparser import RobotFileParser
|
|
5
|
+
from typing import Protocol, cast, Callable
|
|
6
|
+
from httpx import Client, Response, URL
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class RobotExclusionError(Exception):
    """Raised by `raise_robots_txt` when robots.txt excludes a URL."""
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def raise_robots_txt(url: str, robots: RobotFileParser) -> None:
    """
    Rejection handler that aborts the request.

    Always raises `RobotExclusionError`, naming the rejected `url` and the
    robots.txt URL (`robots.url`) that excluded it.
    """
    message = f"{url} excluded by {robots.url}"
    raise RobotExclusionError(message)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def warn_robots_txt(url: str, robots: RobotFileParser) -> None:
    """
    Rejection handler that only warns.

    Emits a warning naming the rejected `url` and the robots.txt URL
    (`robots.url`) that excluded it, then returns so the caller can continue.
    """
    message = f"{url} excluded by {robots.url}"
    warnings.warn(message)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class RobotsClient(Protocol):
    """
    Typing-only view of an `httpx.Client` after `make_robots_txt_client`
    has attached its private robots.txt attributes to it.
    """

    # the client's own headers; read for the default user agent
    headers: dict

    # parsed robots.txt files fetched so far, filled in lazily
    _robots_for_domain: dict[str, RobotFileParser]
    # user agent string checked against the robots.txt rules
    _robots_ua: str
    # called with (url, parser) when a URL is excluded
    _rejected_action: Callable[[str, RobotFileParser], None]

    # the client's request method as it was before wrapping, and the
    # checking wrapper that replaces it
    _no_check_request: Callable
    request: Callable
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _robot_check_request(client: RobotsClient, *args, **kwargs) -> Response:
    """
    Replacement for `client.request` that consults robots.txt first.

    The robots.txt for the target's origin (scheme + host + port) is fetched
    once through the unwrapped request method, parsed, and memoized in
    `client._robots_for_domain`. If the parsed rules disallow the URL for
    `client._robots_ua`, `client._rejected_action` is called with the URL
    and the parser; unless that callback raises, the request then proceeds.

    Expects exactly two positional arguments, `method` and `url`.
    NOTE(review): a relative `url` (client configured with `base_url`) has
    no scheme/host here; confirm whether that case needs support.
    """
    _method, url = args
    uurl = URL(url)
    # robots.txt is per-origin: keep the port in the robots.txt URL and key
    # the memo on scheme://host[:port], so different ports or schemes on one
    # host neither share rules nor fetch robots.txt from the wrong server.
    origin = f"{uurl.scheme}://{uurl.netloc.decode('ascii')}"
    # RobotFileParser and the rejection callback work with plain strings,
    # but callers may pass an httpx.URL; a str argument is used untouched.
    url_str = url if isinstance(url, str) else str(uurl)

    robots = client._robots_for_domain.get(origin)
    if robots is None:
        robots_url = f"{origin}/robots.txt"
        robots_resp = client._no_check_request("GET", robots_url)
        # pass url for output, but don't do read
        robots = RobotFileParser(robots_url)
        robots.parse(robots_resp.text.splitlines())
        client._robots_for_domain[origin] = robots

    if not robots.can_fetch(client._robots_ua, url_str):
        client._rejected_action(url_str, robots)
    # if action doesn't raise an exception, the request goes through
    return client._no_check_request(*args, **kwargs)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Signature of a robots.txt rejection handler: called with the rejected URL
# and the parser for the robots.txt that excluded it; returns nothing.
RobotsRejectFunc = Callable[[str, RobotFileParser], None]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def make_robots_txt_client(
    *,
    client: Client | None = None,
    as_user_agent: str | None = None,
    on_rejection: RobotsRejectFunc = raise_robots_txt,
) -> Client:
    """
    Patch an `httpx.Client` so each request is checked against robots.txt.

    Parameters:
        client: client to patch in place; a new `Client()` is created when
            omitted.
        as_user_agent: user agent to match against robots.txt rules; when
            empty or omitted, the client's own `user-agent` header is used.
        on_rejection: called with `(url, parser)` for an excluded URL; the
            request still goes through unless this raises.

    Returns the same client object, with its `request` method replaced.
    """
    base = Client() if client is None else client
    wrapped = cast(RobotsClient, base)

    wrapped._robots_for_domain = {}
    wrapped._robots_ua = (
        as_user_agent if as_user_agent else wrapped.headers["user-agent"]
    )
    wrapped._rejected_action = on_rejection

    # keep a handle on the current request method before shadowing it
    wrapped._no_check_request = base.request
    checked = functools.wraps(base.request)(_robot_check_request)
    wrapped.request = types.MethodType(checked, base)
    return base
|
careful/httpx/throttle.py
CHANGED
|
@@ -2,12 +2,21 @@ import time
|
|
|
2
2
|
import types
|
|
3
3
|
import functools
|
|
4
4
|
import logging
|
|
5
|
-
from
|
|
5
|
+
from typing import Protocol, cast, Callable
|
|
6
|
+
from httpx import Client, Response
|
|
6
7
|
|
|
7
|
-
log = logging.getLogger("
|
|
8
|
+
log = logging.getLogger("careful")
|
|
8
9
|
|
|
9
10
|
|
|
10
|
-
|
|
11
|
+
class ThrottledClient(Protocol):
    """
    Typing-only view of an `httpx.Client` after `make_throttled_client`
    has attached its private throttling attributes to it.
    """

    # configured rate, and the interval derived from it
    # (make_throttled_client sets the latter to 60.0 / requests_per_minute)
    _requests_per_minute: float
    _request_frequency: float
    # compared against time.time() by _throttle_request; starts at 0.0
    _last_request: float

    # the client's request method as it was before wrapping, and the
    # throttling wrapper that replaces it
    _no_throttle_request: Callable
    request: Callable
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _throttle_request(client: ThrottledClient, *args, **kwargs) -> Response:
|
|
11
20
|
now = time.time()
|
|
12
21
|
diff = client._request_frequency - (now - client._last_request)
|
|
13
22
|
if diff > 0:
|
|
@@ -23,19 +32,21 @@ def make_throttled_client(
|
|
|
23
32
|
*,
|
|
24
33
|
client: Client | None = None,
|
|
25
34
|
requests_per_minute: float = 0,
|
|
26
|
-
):
|
|
35
|
+
) -> Client:
|
|
27
36
|
if requests_per_minute <= 0:
|
|
28
37
|
raise ValueError("requests per minute must be a positive number")
|
|
29
38
|
|
|
30
39
|
if client is None:
|
|
31
40
|
client = Client()
|
|
32
41
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
42
|
+
tclient = cast(ThrottledClient, client)
|
|
43
|
+
|
|
44
|
+
tclient._last_request = 0.0
|
|
45
|
+
tclient._requests_per_minute = requests_per_minute
|
|
46
|
+
tclient._request_frequency = 60.0 / requests_per_minute
|
|
36
47
|
|
|
37
|
-
|
|
38
|
-
|
|
48
|
+
tclient._no_throttle_request = client.request
|
|
49
|
+
tclient.request = types.MethodType(
|
|
39
50
|
functools.wraps(client.request)(_throttle_request), client
|
|
40
51
|
)
|
|
41
52
|
return client
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: careful
|
|
3
|
-
Version: 0.
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 0.3.1
|
|
4
|
+
Summary: a small library for writing resilient, well-behaved HTTP code
|
|
5
5
|
Project-URL: Repository, https://codeberg.org/jpt/careful
|
|
6
6
|
Author-email: jpt <dev@jpt.sh>
|
|
7
7
|
License: BSD-2-Clause
|
|
@@ -15,6 +15,7 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.11
|
|
16
16
|
Classifier: Programming Language :: Python :: 3.12
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
18
19
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
20
|
Requires-Python: >=3.10
|
|
20
21
|
Requires-Dist: httpx>=0.28.1
|
|
@@ -22,43 +23,50 @@ Description-Content-Type: text/markdown
|
|
|
22
23
|
|
|
23
24
|
# careful
|
|
24
25
|
|
|
25
|
-
<img src="https://
|
|
26
|
+
<img src="https://jpt.sh/projects/careful/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
|
|
27
|
+
|
|
28
|
+
**careful** is a Python library for writing resilient, well-behaved HTTP clients.
|
|
26
29
|
|
|
27
|
-
**careful** is a Python library for making requests to unreliable websites with `httpx`.
|
|
28
|
-
|
|
29
30
|
**Code**: <https://codeberg.org/jpt/careful>
|
|
30
31
|
|
|
31
|
-
**Docs**: <https://
|
|
32
|
+
**Docs**: <https://jpt.sh/projects/careful/>
|
|
32
33
|
|
|
34
|
+

|
|
33
35
|
[](https://ci.codeberg.org/repos/15185)
|
|
34
36
|
|
|
35
|
-
|
|
36
|
-
[
|
|
37
|
-
useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
|
|
37
|
+
Call one function to enchant an
|
|
38
|
+
**[httpx.Client](https://www.python-httpx.org)**, making your HTTP connections more resilient and better mannered.
|
|
38
39
|
|
|
39
|
-
- **
|
|
40
|
-
- **
|
|
41
|
-
- **
|
|
40
|
+
- Configure **throttling** to avoid accidental Denial-of-Service / risking getting banned.
|
|
41
|
+
- **Retries** help overcome intermittent failures on flaky sites or long crawls.
|
|
42
|
+
- **Development caching** persists between runs during development, reducing redundant requests made while iterating on your crawlers & scrapers.
|
|
42
43
|
|
|
43
|
-
###
|
|
44
|
+
### Example
|
|
44
45
|
|
|
45
46
|
```python
|
|
46
47
|
from httpx import Client
|
|
47
48
|
from careful.httpx import make_careful_client, MemoryCache
|
|
48
49
|
|
|
50
|
+
# the only function you need to call is make_careful_client
|
|
51
|
+
# this wraps your existing `httpx.Client` with your preferred
|
|
52
|
+
# careful behaviors
|
|
53
|
+
|
|
49
54
|
client = make_careful_client(
|
|
50
|
-
|
|
51
|
-
|
|
55
|
+
client=Client(headers={'user-agent': 'spiderman/1.0'}),
|
|
56
|
+
|
|
52
57
|
# retries are configurable w/ exponential back off
|
|
53
58
|
retry_attempts=2,
|
|
54
59
|
retry_wait_seconds=5,
|
|
60
|
+
|
|
55
61
|
# can cache to process memory, filesystem, or SQLite
|
|
56
62
|
cache_storage=MemoryCache(),
|
|
57
|
-
|
|
63
|
+
|
|
64
|
+
# easy-to-configure throttling
|
|
58
65
|
requests_per_minute=60,
|
|
59
66
|
)
|
|
60
67
|
|
|
61
|
-
#
|
|
68
|
+
# methods on client are called as they always are
|
|
69
|
+
# configured behaviors occur without further code changes
|
|
62
70
|
client.get("https://example.com")
|
|
63
71
|
```
|
|
64
72
|
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
careful/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
careful/httpx/__init__.py,sha256=sIKMgGf-Ea5VEsEFqxXFbAiaF1tYOYm9CIOvSCPooqw,6674
|
|
3
|
+
careful/httpx/_types.py,sha256=jefYDxSbLRUatU8QKeyxStc9UC3AJwAba2SfhNkM0RY,151
|
|
4
|
+
careful/httpx/dev_cache.py,sha256=sfHX2TkKZQRTrOXSGSMgIWE0HbX3Fvuz88o8_jK9P8g,11316
|
|
5
|
+
careful/httpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
careful/httpx/retries.py,sha256=3kjuHKYnK1N4Rtum5gUyY_XO4o4cL4jc59d17Y6UwrI,2949
|
|
7
|
+
careful/httpx/robots.py,sha256=jfqQdplTap_RCENu6MHEIabFVznFLruMvSIaG_u0v_8,2168
|
|
8
|
+
careful/httpx/throttle.py,sha256=b1fbmUskcm343D1bbPbY-ITLdL1zVm1dXtjt9LT1bEA,1412
|
|
9
|
+
careful-0.3.1.dist-info/METADATA,sha256=fkI2V9YX5JKhPTYZga5q_3cyfatB4B_PMdTci3sRMmc,2692
|
|
10
|
+
careful-0.3.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
11
|
+
careful-0.3.1.dist-info/licenses/LICENSE,sha256=oHe4LmCuo6CZne42DRXfiR3uqqIfGsk4dAvDKucAi4M,1315
|
|
12
|
+
careful-0.3.1.dist-info/RECORD,,
|
careful-0.2.1.dist-info/RECORD
DELETED
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
careful/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
careful/httpx/__init__.py,sha256=u-n0uKIWAd3NXsZUd1UA4wzJJTEhRR74diHzDV2EpEU,4885
|
|
3
|
-
careful/httpx/_types.py,sha256=jefYDxSbLRUatU8QKeyxStc9UC3AJwAba2SfhNkM0RY,151
|
|
4
|
-
careful/httpx/dev_cache.py,sha256=HNtEXncPpqsjIEoz5UhRf4YO2iVwz5uowKc4_B74fZg,11024
|
|
5
|
-
careful/httpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
-
careful/httpx/retries.py,sha256=Kszm0wDITyPZ3qx5TsDL__HjCYVJyAZ2WehrlpXV5Cc,2500
|
|
7
|
-
careful/httpx/throttle.py,sha256=ZpuFABYHGQ4D0zks922SCXp7WZG_-Ysafz-Npa2QVwQ,1096
|
|
8
|
-
careful-0.2.1.dist-info/METADATA,sha256=ZAKwiwqykmep0LiYCzFLWJfTgharbvhW3FCJ3p0b_-8,2498
|
|
9
|
-
careful-0.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
10
|
-
careful-0.2.1.dist-info/licenses/LICENSE,sha256=oHe4LmCuo6CZne42DRXfiR3uqqIfGsk4dAvDKucAi4M,1315
|
|
11
|
-
careful-0.2.1.dist-info/RECORD,,
|
|
File without changes
|