careful-0.1.0-py3-none-any.whl → careful-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- careful/httpx/__init__.py +81 -13
- careful/httpx/_types.py +6 -0
- careful/httpx/dev_cache.py +67 -21
- careful/httpx/retries.py +20 -17
- careful/httpx/throttle.py +2 -2
- careful-0.2.0.dist-info/METADATA +71 -0
- careful-0.2.0.dist-info/RECORD +11 -0
- careful-0.1.0.dist-info/METADATA +0 -48
- careful-0.1.0.dist-info/RECORD +0 -10
- {careful-0.1.0.dist-info → careful-0.2.0.dist-info}/WHEEL +0 -0
- {careful-0.1.0.dist-info → careful-0.2.0.dist-info}/licenses/LICENSE +0 -0
careful/httpx/__init__.py
CHANGED
```diff
@@ -1,41 +1,109 @@
-from .retries import make_retry_client, _default_accept_response
+from .retries import make_retry_client, retry_default_rule
 from .throttle import make_throttled_client
 from .dev_cache import (
     make_dev_caching_client,
     MemoryCache,
     FileCache,
-
+    SqliteCache,
+    CacheStorageBase,
     _cache_200s,
     _default_keyfunc,
 )
+from ._types import ResponsePredicate, CacheKeyfunc
 from httpx import Client
 
 
 def make_careful_client(
-    client: Client,
     *,
+    client: Client | None = None,
     retry_attempts: int = 0,
     retry_wait_seconds: float = 10,
-
-    accept_response=_default_accept_response,
+    should_retry: ResponsePredicate = retry_default_rule,
     requests_per_minute: int = 0,
-    cache_storage=None,
-    cache_write_only=False,
-    should_cache=_cache_200s,
-    cache_keyfunc=_default_keyfunc,
+    cache_storage: CacheStorageBase = None,
+    cache_write_only: bool = False,
+    should_cache: ResponsePredicate = _cache_200s,
+    cache_keyfunc: CacheKeyfunc = _default_keyfunc,
 ):
+    """
+    This function patches an `httpx.Client` so that all requests made with the client support
+    [retries](#retries), [throttling](#throttling), and [development caching](#development-caching).
+
+
+    Parameters:
+        client: A pre-configured `httpx.Client`. If omitted, a default client will be created.
+
+        retry_attempts: Maximum number of retries. If non-zero, will retry up to this many times
+            with increasing wait times, starting with `retry_wait_seconds`.
+
+        retry_wait_seconds: Number of seconds to sleep between the first attempt and the first retry.
+            Subsequent waits increase exponentially (2x, 4x, 8x, etc.)
+
+        should_retry: Predicate function that takes an `httpx.Response` and returns `True` if it should be retried.
+
+        requests_per_minute: Maximum number of requests per minute. (e.g. 30 will throttle to ~2s between requests)
+
+        cache_storage: An object that implements the [cache storage interface](#cache-storage).
+
+        cache_write_only: Update the cache, but never read from it.
+
+        should_cache: Predicate function that takes an `httpx.Response` and returns `True` if it should be cached.
+
+        cache_keyfunc: Function that takes request details and returns a unique cache key.
+
+    ## Retries
+
+    If `retry_attempts` is set, responses will be passed to `should_retry`.
+    Responses that are rejected (return `True`) will be retried after a wait based on
+    `retry_wait_seconds`.
+    Each retry will wait twice as long as the one before.
+
+    ## Throttling
+
+    If `requests_per_minute` is set, standard (non-retry) requests will automatically
+    sleep for a short period to target the given rate.
+
+    For example, at 30rpm, the sleep time on a fast request will be close to 2 seconds.
+
+    ## Development Caching
+
+    Why **development caching**?
+
+    This feature is named as a reminder that **this is not true HTTP caching**, which
+    should take various headers into account. Look at libraries like [hishel](https://hishel.com) if that's what you are after.
+
+    The purpose of this feature is to allow you to cache all of your HTTP requests during development.
+    Often when writing a scraper or crawler, you wind up hitting the site you are working on more often than you'd like; each time you iterate on your code you're likely making redundant requests to pages that haven't changed.
+
+    By caching all successful requests (configurable with the `should_cache` parameter),
+    you can easily re-run scrapers without making redundant HTTP requests.
+    This means faster development time & happier upstream servers.
+
+    To enable development caching, pass a [`MemoryCache`][careful.httpx.MemoryCache],
+    [`FileCache`][careful.httpx.FileCache], or [`SqliteCache`][careful.httpx.SqliteCache] as
+    the `cache_storage` argument of `make_careful_client`.
+
+    ---
+
+    When multiple features are applied, the order of wrapping ensures that:
+    - the cache is checked first, and bypasses throttling if hit
+    - retries use their own delays, and are not throttled separately
+    """
+    if client is None:
+        client = Client()
     # order matters, retry on inside b/c it is last-chance scenario
     if retry_attempts:
         client = make_retry_client(
             client=client,
             attempts=retry_attempts,
             wait_seconds=retry_wait_seconds,
-
-            accept_response=accept_response,
+            should_retry=should_retry,
         )
     # throttling around retries
     if requests_per_minute:
-        client = make_throttled_client(
+        client = make_throttled_client(
+            client=client, requests_per_minute=requests_per_minute
+        )
     # caching on top layer, so cache will be checked first
     if cache_storage:
         client = make_dev_caching_client(
@@ -55,5 +123,5 @@ __all__ = [
     "make_dev_caching_client",
     "MemoryCache",
     "FileCache",
-    "
+    "SqliteCache",
 ]
```
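Taken together, the new keyword-only signature reads naturally at the call site. A short usage sketch (the header, URL, and parameter values are illustrative; `FileCache` and the `fromcache` attribute come from `dev_cache.py` below):

```python
from httpx import Client

from careful.httpx import FileCache, make_careful_client

client = make_careful_client(
    client=Client(headers={"user-agent": "my-scraper/0.1"}),
    retry_attempts=2,        # retry waits double each time: 10s, then 20s
    retry_wait_seconds=10,
    requests_per_minute=30,  # ~2s between requests
    cache_storage=FileCache("_httpcache/"),
)

# first run hits the network and fills the cache; re-runs read from it
resp = client.get("https://example.com")
print(resp.status_code, getattr(resp, "fromcache", False))
```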
careful/httpx/_types.py
ADDED
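The module body is not shown in this diff; judging only from how `ResponsePredicate` and `CacheKeyfunc` are used in `__init__.py` and `dev_cache.py`, it presumably holds callable aliases along these lines (a hypothetical reconstruction, not the published source):

```python
# hypothetical sketch of careful/httpx/_types.py; the real file is not shown in the diff
from typing import Callable

from httpx import Response

# shape shared by the should_retry and should_cache predicates
ResponsePredicate = Callable[[Response], bool]
# shape of cache_keyfunc: (method, url, params) -> key, falsy to skip caching
CacheKeyfunc = Callable[[str, str, dict], str | None]
```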
careful/httpx/dev_cache.py
CHANGED
```diff
@@ -1,3 +1,4 @@
+import abc
 import types
 import functools
 import logging
@@ -40,13 +41,6 @@ def _cache_200s(response: Response) -> bool:
 
 
 def _cached_request(client: Client, *args, **kwargs):
-    # short circuit if cache isn't configured
-    if not client._cache_storage:
-        log.debug("bypassing cache, no storage configured")
-        resp = client._wrapped_request(*args, **kwargs)
-        resp.fromcache = False
-        return resp
-
     method, url = args
     request_key = client._cache_keyfunc(method, url, kwargs["params"])
 
@@ -61,7 +55,7 @@ def _cached_request(client: Client, *args, **kwargs):
         cached_resp.fromcache = True
         resp = cached_resp
     else:
-        resp = client._wrapped_request(*args, **kwargs)
+        resp = client._no_cache_request(*args, **kwargs)
     # save to cache if request and response meet criteria
     log.debug("XX %s %s", request_key, client._should_cache(resp))
     if request_key and client._should_cache(resp):
@@ -80,6 +74,27 @@ def make_dev_caching_client(
     should_cache=_cache_200s,
     write_only=False,
 ):
+    """
+    Returns an enhanced `httpx.Client` where requests are saved to a
+    specified cache.
+
+    This is called a "dev_cache" because it is not intended to be a true
+    HTTP cache respecting cache headers/etc. If you are looking for that
+    behavior, there are httpx libraries built for that explicit purpose.
+
+    Instead, the purpose of this cache is to make it possible to test scrapers
+    locally without making hundreds of redundant requests.
+
+    The strategy is configurable via `cache_keyfunc` and `should_cache`.
+
+    The default strategy is simple:
+    cache all GET requests that result in 200s, with no expiry.
+
+    This works well for the case where you have hundreds of pages to scrape
+    and want to make scraper adjustments without repeatedly hitting the server.
+
+    It should *NOT* be used in production without adjusting these rules.
+    """
     if client is None:
         client = Client()
 
@@ -88,23 +103,34 @@ def make_dev_caching_client(
     client._should_cache = should_cache
     client._write_only = write_only
 
-    client._wrapped_request = client.request
+    client._no_cache_request = client.request
     client.request = types.MethodType(
         functools.wraps(client.request)(_cached_request), client
     )
     return client
 
 
-class CacheStorageBase:
+class CacheStorageBase(abc.ABC):
+    @abc.abstractmethod
     def get(self, key: str) -> None | Response:
         raise NotImplementedError()
 
+    @abc.abstractmethod
     def set(self, key: str, response: Response) -> None:
         raise NotImplementedError()
 
 
 class MemoryCache(CacheStorageBase):
-    """
+    """
+    In-memory cache for request responses.
+
+    Example:
+
+        make_careful_client(
+            cache_storage=MemoryCache(),
+        )
+
+    """
 
     def __init__(self) -> None:
         self.cache: dict[str, Response] = {}
@@ -122,11 +148,21 @@ class FileCache(CacheStorageBase):
     """
     File-based cache for request responses.
 
-    :
-
-
+    Parameters:
+        cache_dir: directory for storing responses
+
+    Example:
+
+        make_careful_client(
+            cache_storage=FileCache("_httpcache/"),
+        )
+
     """
 
+    # TODO: restore?
+    # check_last_modified: set to True to compare last-modified
+    # timestamp in cached response with value from HEAD request
+
     # file name escaping inspired by httplib2
     _prefix = re.compile(r"^\w+://")
     _illegal = re.compile(r"[?/:|]+")
@@ -188,7 +224,7 @@ class FileCache(CacheStorageBase):
         # status & encoding will be in headers, but are faked
         # need to split spaces out of status to get code (e.g. '200 OK')
         resp = Response(
-            status_code
+            status_code=int(resp_headers.pop("status").split(" ")[0]),
             content=resp_content,
             default_encoding=resp_headers.pop("encoding"),
             headers=resp_headers,
@@ -224,13 +260,18 @@ class FileCache(CacheStorageBase):
             os.remove(fname)
 
 
-class SQLiteCache(CacheStorageBase):
-    """
+class SqliteCache(CacheStorageBase):
+    """
+    SQLite cache for request responses.
+
+    Parameters:
+        cache_path: path for SQLite database file
 
-    :
-    :param check_last_modified: set to True to compare last-modified
-        timestamp in cached response with value from HEAD request
+    Example:
 
+        make_careful_client(
+            cache_storage=SqliteCache("_cache.db"),
+        )
     """
 
     _columns = ["key", "status", "modified", "encoding", "data", "headers"]
@@ -284,7 +325,12 @@ class SQLiteCache(CacheStorageBase):
         # if rec["modified"] != new_lm:
         #     return None
 
-        resp = Response(
+        resp = Response(
+            rec["status"],
+            content=rec["data"],
+            default_encoding=rec["encoding"],
+            headers=json.loads(rec["headers"]),
+        )
         return resp
 
     def clear(self) -> None:
```
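With `CacheStorageBase` now an `abc.ABC`, custom backends must implement both `get` and `set` before they can be instantiated. A hypothetical backend sketch; the TTL behavior is purely illustrative, not a feature of the library:

```python
import time

from httpx import Response

from careful.httpx import CacheStorageBase


class ExpiringMemoryCache(CacheStorageBase):
    """Like MemoryCache, but entries older than ttl_seconds count as misses."""

    def __init__(self, ttl_seconds: float = 3600) -> None:
        self.ttl_seconds = ttl_seconds
        self.cache: dict[str, tuple[float, Response]] = {}

    def get(self, key: str) -> None | Response:
        entry = self.cache.get(key)
        if entry is None:
            return None
        stored_at, response = entry
        if time.time() - stored_at > self.ttl_seconds:
            del self.cache[key]  # expired: drop the entry and report a miss
            return None
        return response

    def set(self, key: str, response: Response) -> None:
        self.cache[key] = (time.time(), response)
```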
careful/httpx/retries.py
CHANGED
```diff
@@ -2,13 +2,22 @@ import time
 import types
 import functools
 import logging
-from httpx import Client, Response
+from httpx import Client, Response, HTTPError
 
 log = logging.getLogger("httpx")
 
 
-def
-
+def retry_default_rule(response: Response) -> bool:
+    # default behavior is to retry 400s and 500s but not 404s
+    return response.status_code >= 400 and response.status_code != 404
+
+
+def retry_only_500s(response: Response) -> bool:
+    return response.status_code >= 500
+
+
+def retry_all_400s_500s(response: Response) -> bool:
+    return response.status_code >= 400
 
 
 def _retry_request(client: Client, *args, **kwargs):
@@ -20,24 +29,21 @@ def _retry_request(client: Client, *args, **kwargs):
         exception_raised = None
 
         try:
-
+            tries += 1
+            resp = client._no_retry_request(*args, **kwargs)
 
             # break from loop on an accepted response
-            if client.
-                resp.status_code == 404 and not client._retry_on_404
-            ):
+            if not client._should_retry(resp):
                 break
 
-        except
-            # TODO: exclude certain kinds of exceptions (SSL?) from retry
+        except HTTPError as e:
             exception_raised = e
 
             if exception_response := getattr(e, "response", None):
-                if client.
+                if not client._should_retry(exception_response):
                     break
 
         # if we're going to retry, sleep first
-        tries += 1
         if tries <= client._retry_attempts:
             # twice as long each time
             wait = client._retry_wait_seconds * (2 ** (tries - 1))
@@ -68,20 +74,17 @@ def make_retry_client(
     client: Client | None = None,
     attempts: int = 1,
     wait_seconds: float = 10,
-
-    accept_response=_default_accept_response,
+    should_retry=retry_default_rule,
 ):
     if client is None:
         client = Client()
     client._retry_attempts = max(0, attempts)
     client._retry_wait_seconds = wait_seconds
-    client.
-    client._accept_response = accept_response
+    client._should_retry = should_retry
 
-    client.
+    client._no_retry_request = client.request
     client.request = types.MethodType(
         functools.wraps(client.request)(_retry_request), client
     )
 
     return client
-
```
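Any `(Response) -> bool` callable works in place of the three shipped rules. A sketch of a custom predicate passed to `make_retry_client` (the 429-specific rule is illustrative, not part of the library):

```python
from httpx import Client, Response

from careful.httpx import make_retry_client


def retry_429s_and_500s(response: Response) -> bool:
    # treat rate limits and server errors as retryable, everything else as final
    return response.status_code == 429 or response.status_code >= 500


client = make_retry_client(
    client=Client(),
    attempts=3,        # sleeps 5s, 10s, then 20s before the three retries
    wait_seconds=5,
    should_retry=retry_429s_and_500s,
)
```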
careful/httpx/throttle.py
CHANGED
```diff
@@ -16,7 +16,7 @@ def _throttle_request(client: Client, *args, **kwargs):
         client._last_request = time.time()
     else:
         client._last_request = now
-    return client.
+    return client._no_throttle_request(*args, **kwargs)
 
 
 def make_throttled_client(
@@ -34,7 +34,7 @@ def make_throttled_client(
     client._requests_per_minute = requests_per_minute
     client._request_frequency = 60.0 / requests_per_minute
 
-    client.
+    client._no_throttle_request = client.request
     client.request = types.MethodType(
         functools.wraps(client.request)(_throttle_request), client
     )
```
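The pacing falls out of `_request_frequency = 60.0 / requests_per_minute`: the wrapper tracks `_last_request` and sleeps to target that spacing between sends. A minimal standalone sketch, assuming `make_throttled_client` is used directly rather than via `make_careful_client`:

```python
from httpx import Client

from careful.httpx import make_throttled_client

# 30 requests/minute -> 60.0 / 30 == 2.0 seconds of target spacing
client = make_throttled_client(client=Client(), requests_per_minute=30)

for url in ["https://example.com/a", "https://example.com/b"]:
    client.get(url)  # fast responses are padded with a sleep to hold the rate
```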
careful-0.2.0.dist-info/METADATA
ADDED
````diff
@@ -0,0 +1,71 @@
+Metadata-Version: 2.4
+Name: careful
+Version: 0.2.0
+Summary: careful extensions to httpx: throttle, retry, cache
+Project-URL: Repository, https://codeberg.org/jpt/careful
+Author-email: jpt <dev@jpt.sh>
+License: BSD-2-Clause
+License-File: LICENSE
+Classifier: Development Status :: 6 - Mature
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.10
+Requires-Dist: httpx>=0.28.1
+Requires-Dist: mkdocs-material>=9.6.18
+Requires-Dist: mkdocstrings-python>=1.18.2
+Requires-Dist: mkdocstrings>=0.30.0
+Requires-Dist: pytest-httpbin>=2.1.0
+Requires-Dist: pytest>=8.4.2
+Description-Content-Type: text/markdown
+
+# careful
+
+<img src="/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
+
+**careful** is a library for making requests to unreliable websites with httpx.
+
+**Code**: <https://codeberg.org/jpt/careful>
+
+**Docs**: <https://careful.jpt.sh>
+
+It offers enhancements to
+[`httpx.Client`](https://www.python-httpx.org)
+useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+
+- **configurable retry support.** retry on timeouts or other errors, with exponential back-off.
+- **simple request throttling.** set a maximum number of requests per minute.
+- **development cache.** configurable caching aimed at reducing redundant requests made while authoring/testing web scrapers.
+
+### example
+
+```python
+from httpx import Client
+from careful.httpx import make_careful_client, MemoryCache
+
+client = make_careful_client(
+    # can configure httpx.Client however you usually would
+    client=Client(headers={'user-agent': 'careful/1.0'}),
+    # retries are configurable w/ exponential back off
+    retry_attempts=2,
+    retry_wait_seconds=5,
+    # can cache to process memory, filesystem, or SQLite
+    cache_storage=MemoryCache(),
+    # requests will automatically be throttled to aim at this rate
+    requests_per_minute=60,
+)
+
+# all normal methods on httpx.Client make use of configured enhancements
+client.get("https://example.com")
+```
+
+---
+
+Logo licensed from [Adrien Coquet via Noun Project](https://thenounproject.com/icon/carefully-3681327/)
````
careful-0.2.0.dist-info/RECORD
ADDED
```diff
@@ -0,0 +1,11 @@
+careful/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+careful/httpx/__init__.py,sha256=u-n0uKIWAd3NXsZUd1UA4wzJJTEhRR74diHzDV2EpEU,4885
+careful/httpx/_types.py,sha256=NwyQ-ItodN9HnO7d7b0M1M4M9y90TjRkhQFqNuypKRI,149
+careful/httpx/dev_cache.py,sha256=KR35u0CvutqTOWQ8pO-hzwbPy0lDBhShJfhCAbOvqv0,11032
+careful/httpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+careful/httpx/retries.py,sha256=Kszm0wDITyPZ3qx5TsDL__HjCYVJyAZ2WehrlpXV5Cc,2500
+careful/httpx/throttle.py,sha256=ZpuFABYHGQ4D0zks922SCXp7WZG_-Ysafz-Npa2QVwQ,1096
+careful-0.2.0.dist-info/METADATA,sha256=A82D5ltN7bDh1dXkOqdBLcW8fxxxqsonFgf9hZQlors,2541
+careful-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+careful-0.2.0.dist-info/licenses/LICENSE,sha256=oHe4LmCuo6CZne42DRXfiR3uqqIfGsk4dAvDKucAi4M,1315
+careful-0.2.0.dist-info/RECORD,,
```
careful-0.1.0.dist-info/METADATA
DELETED
```diff
@@ -1,48 +0,0 @@
-Metadata-Version: 2.4
-Name: careful
-Version: 0.1.0
-Summary: Add your description here
-Author-email: jpt <dev@jpt.sh>
-License-File: LICENSE
-Requires-Python: >=3.13
-Requires-Dist: httpx>=0.28.1
-Requires-Dist: pytest-httpbin>=2.1.0
-Requires-Dist: pytest>=8.4.2
-Description-Content-Type: text/markdown
-
-**careful_httpx** is a library for making requests to less-than-reliable websites.
-
-It is based on [scrapelib](https://pypi.org/scrapelib/), which has powered Open States & many other Python scrapers for over 15 years.
-
-Code: <https://codeberg.org/jpt/careful_httpx>
-
-Documentation: TODO
-
-## Features
-
-Enhances [`httpx.Client`](https://www.python-httpx.org) with features useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
-
-- retries
-- throttling
-- dev-cache for iterating on scrapers
-
-### example
-
-TODO
-
-### features this has that scrapelib doesn't
-
-- httpx support
-- composable interface, can augment Client with just the enhancements you want
-
-TODO: don't allow instantiating bad patch classes, and check for incompatible configs
-
-### features scrapelib had that this doesn't
-
-Open to considering if there is interest, but didn't seem necessary.
-
-- HTTP(S) and FTP requests via an identical API
-- allow setting custom ciphers
-- have urlretrieve
-- support FTP
-- set custom user-agent/mess w/ headers
```
careful-0.1.0.dist-info/RECORD
DELETED
```diff
@@ -1,10 +0,0 @@
-careful/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-careful/httpx/__init__.py,sha256=gDSnAnqxFt9mLi2laArt7BUn_wPU5ub0k9zeqsexYJY,1605
-careful/httpx/dev_cache.py,sha256=_jwpnf1fzBzR23Of2HdsyiN_MPHvRG0gtM49Y4qRtQg,10031
-careful/httpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-careful/httpx/retries.py,sha256=mMIZf-EP9bhzEZnEmwtjWZ2qdl6ZCJ7vq3hZltT6Zms,2458
-careful/httpx/throttle.py,sha256=wCJWHERr5manyKq07ZdonmxbK0oh0PYJgO6a94IzN0s,1088
-careful-0.1.0.dist-info/METADATA,sha256=wBFvqh5xyMfNRVB8Jg9Aa3s5_Te2NxcBy5BE-3NMYeY,1373
-careful-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-careful-0.1.0.dist-info/licenses/LICENSE,sha256=oHe4LmCuo6CZne42DRXfiR3uqqIfGsk4dAvDKucAi4M,1315
-careful-0.1.0.dist-info/RECORD,,
```
{careful-0.1.0.dist-info → careful-0.2.0.dist-info}/WHEEL
File without changes

{careful-0.1.0.dist-info → careful-0.2.0.dist-info}/licenses/LICENSE
File without changes