careful 0.2.0__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- careful-0.3.1/.spellignore +6 -0
- careful-0.3.1/.woodpecker.yml +31 -0
- {careful-0.2.0 → careful-0.3.1}/Justfile +4 -3
- careful-0.3.1/PKG-INFO +76 -0
- careful-0.3.1/README.md +53 -0
- {careful-0.2.0 → careful-0.3.1}/docs/changelog.md +10 -0
- careful-0.3.1/docs/design.md +224 -0
- careful-0.3.1/docs/index.md +56 -0
- careful-0.3.1/docs/reference.md +139 -0
- {careful-0.2.0 → careful-0.3.1}/mkdocs.yml +3 -4
- {careful-0.2.0 → careful-0.3.1}/pyproject.toml +15 -7
- careful-0.3.1/src/careful/httpx/__init__.py +198 -0
- {careful-0.2.0 → careful-0.3.1}/src/careful/httpx/_types.py +1 -1
- {careful-0.2.0 → careful-0.3.1}/src/careful/httpx/dev_cache.py +45 -35
- {careful-0.2.0 → careful-0.3.1}/src/careful/httpx/retries.py +23 -9
- careful-0.3.1/src/careful/httpx/robots.py +72 -0
- {careful-0.2.0 → careful-0.3.1}/src/careful/httpx/throttle.py +20 -9
- {careful-0.2.0 → careful-0.3.1}/tests/fakeresponse.py +0 -2
- {careful-0.2.0 → careful-0.3.1}/tests/test_cache.py +13 -6
- {careful-0.2.0 → careful-0.3.1}/tests/test_careful.py +2 -2
- {careful-0.2.0 → careful-0.3.1}/tests/test_retries.py +3 -1
- careful-0.3.1/tests/test_robots_txt.py +28 -0
- {careful-0.2.0 → careful-0.3.1}/tests/test_throttle.py +1 -3
- careful-0.3.1/trifold.toml +8 -0
- careful-0.2.0/PKG-INFO +0 -71
- careful-0.2.0/README.md +0 -44
- careful-0.2.0/docs/index.md +0 -45
- careful-0.2.0/docs/reference.md +0 -34
- careful-0.2.0/src/careful/httpx/__init__.py +0 -127
- {careful-0.2.0 → careful-0.3.1}/.gitignore +0 -0
- {careful-0.2.0 → careful-0.3.1}/.pre-commit-config.yaml +0 -0
- {careful-0.2.0 → careful-0.3.1}/LICENSE +0 -0
- {careful-0.2.0 → careful-0.3.1}/docs/carefully-3681327.svg +0 -0
- {careful-0.2.0 → careful-0.3.1}/src/careful/__init__.py +0 -0
- {careful-0.2.0 → careful-0.3.1}/src/careful/httpx/py.typed +0 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
when:
|
|
2
|
+
- event: push
|
|
3
|
+
- event: pull_request
|
|
4
|
+
|
|
5
|
+
matrix:
|
|
6
|
+
PYTHON_VERSION:
|
|
7
|
+
- "3.10"
|
|
8
|
+
- "3.11"
|
|
9
|
+
- "3.12"
|
|
10
|
+
- "3.13"
|
|
11
|
+
|
|
12
|
+
steps:
|
|
13
|
+
lint:
|
|
14
|
+
image: python:3.13
|
|
15
|
+
commands:
|
|
16
|
+
- curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
17
|
+
- "export PATH=/root/.local/bin:$PATH"
|
|
18
|
+
- uv run ruff check .
|
|
19
|
+
- uv run ruff format --check .
|
|
20
|
+
- uvx ty check .
|
|
21
|
+
when:
|
|
22
|
+
- evaluate: 'PYTHON_VERSION == "3.13"'
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
test:
|
|
26
|
+
image: "python:${PYTHON_VERSION}"
|
|
27
|
+
commands:
|
|
28
|
+
- curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
29
|
+
- "export PATH=/root/.local/bin:$PATH"
|
|
30
|
+
- uv sync --group httpbin
|
|
31
|
+
- uv run --python ${PYTHON_VERSION} pytest
|
careful-0.3.1/PKG-INFO
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: careful
|
|
3
|
+
Version: 0.3.1
|
|
4
|
+
Summary: a small library for writing resilient, well-behaved HTTP code
|
|
5
|
+
Project-URL: Repository, https://codeberg.org/jpt/careful
|
|
6
|
+
Author-email: jpt <dev@jpt.sh>
|
|
7
|
+
License: BSD-2-Clause
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Classifier: Development Status :: 6 - Mature
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
12
|
+
Classifier: Natural Language :: English
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Requires-Dist: httpx>=0.28.1
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# careful
|
|
25
|
+
|
|
26
|
+
<img src="https://jpt.sh/projects/careful/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
|
|
27
|
+
|
|
28
|
+
**careful** is a Python library for writing resilient, well-behaved HTTP clients.
|
|
29
|
+
|
|
30
|
+
**Code**: <https://codeberg.org/jpt/careful>
|
|
31
|
+
|
|
32
|
+
**Docs**: <https://jpt.sh/projects/careful/>
|
|
33
|
+
|
|
34
|
+

|
|
35
|
+
[](https://ci.codeberg.org/repos/15185)
|
|
36
|
+
|
|
37
|
+
Call one function to enchant an
|
|
38
|
+
**[httpx.Client](https://www.python-httpx.org)**, making your HTTP connections more resilient and better mannered.
|
|
39
|
+
|
|
40
|
+
- Configure **throttling** to avoid accidental Denial-of-Service / risking getting banned.
|
|
41
|
+
- **Retries** help overcome intermittent failures on flaky sites or long crawls.
|
|
42
|
+
- **Development caching** Cache persists between runs during development, reduces redundant requests made while iterating on your crawlers & scrapers.
|
|
43
|
+
|
|
44
|
+
### Example
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from httpx import Client
|
|
48
|
+
from careful.httpx import make_careful_client
|
|
49
|
+
|
|
50
|
+
# the only function you need to call is make_careful_client
|
|
51
|
+
# this wraps your existing `httpx.Client` with your preferred
|
|
52
|
+
# careful behaviors
|
|
53
|
+
|
|
54
|
+
client = make_careful_client(
|
|
55
|
+
client=Client(headers={'user-agent': 'spiderman/1.0'}),
|
|
56
|
+
|
|
57
|
+
# retries are configurable w/ exponential back off
|
|
58
|
+
retry_attempts=2,
|
|
59
|
+
retry_wait_seconds=5,
|
|
60
|
+
|
|
61
|
+
# can cache to process memory, filesystem, or SQLite
|
|
62
|
+
cache_storage=MemoryCache(),
|
|
63
|
+
|
|
64
|
+
# easy-to-configure throttling
|
|
65
|
+
requests_per_minute=60,
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
# methods on client are called as they always are
|
|
69
|
+
# configured behaviors occur without further code changes
|
|
70
|
+
client.get("https://example.com")
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
Logo licensed from [Adrien Coquet via Noun Project](https://thenounproject.com/icon/carefully-3681327/)
|
careful-0.3.1/README.md
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# careful
|
|
2
|
+
|
|
3
|
+
<img src="https://jpt.sh/projects/careful/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
|
|
4
|
+
|
|
5
|
+
**careful** is a Python library for writing resilient, well-behaved HTTP clients.
|
|
6
|
+
|
|
7
|
+
**Code**: <https://codeberg.org/jpt/careful>
|
|
8
|
+
|
|
9
|
+
**Docs**: <https://jpt.sh/projects/careful/>
|
|
10
|
+
|
|
11
|
+

|
|
12
|
+
[](https://ci.codeberg.org/repos/15185)
|
|
13
|
+
|
|
14
|
+
Call one function to enchant an
|
|
15
|
+
**[httpx.Client](https://www.python-httpx.org)**, making your HTTP connections more resilient and better mannered.
|
|
16
|
+
|
|
17
|
+
- Configure **throttling** to avoid accidental Denial-of-Service / risking getting banned.
|
|
18
|
+
- **Retries** help overcome intermittent failures on flaky sites or long crawls.
|
|
19
|
+
- **Development caching** Cache persists between runs during development, reduces redundant requests made while iterating on your crawlers & scrapers.
|
|
20
|
+
|
|
21
|
+
### Example
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
from httpx import Client
|
|
25
|
+
from careful.httpx import make_careful_client
|
|
26
|
+
|
|
27
|
+
# the only function you need to call is make_careful_client
|
|
28
|
+
# this wraps your existing `httpx.Client` with your preferred
|
|
29
|
+
# careful behaviors
|
|
30
|
+
|
|
31
|
+
client = make_careful_client(
|
|
32
|
+
client=Client(headers={'user-agent': 'spiderman/1.0'}),
|
|
33
|
+
|
|
34
|
+
# retries are configurable w/ exponential back off
|
|
35
|
+
retry_attempts=2,
|
|
36
|
+
retry_wait_seconds=5,
|
|
37
|
+
|
|
38
|
+
# can cache to process memory, filesystem, or SQLite
|
|
39
|
+
cache_storage=MemoryCache(),
|
|
40
|
+
|
|
41
|
+
# easy-to-configure throttling
|
|
42
|
+
requests_per_minute=60,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
# methods on client are called as they always are
|
|
46
|
+
# configured behaviors occur without further code changes
|
|
47
|
+
client.get("https://example.com")
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
Logo licensed from [Adrien Coquet via Noun Project](https://thenounproject.com/icon/carefully-3681327/)
|
|
@@ -1,5 +1,15 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.3.1
|
|
4
|
+
|
|
5
|
+
- Add `make_careful_client_from_env`
|
|
6
|
+
|
|
7
|
+
## 0.3.0 - 8 September 2025
|
|
8
|
+
|
|
9
|
+
- Increased consistency across interfaces with parameter names, etc.
|
|
10
|
+
- Added support for robots.txt protocol.
|
|
11
|
+
- Documentation overhaul.
|
|
12
|
+
|
|
3
13
|
## 0.2.0 - 6 September 2025
|
|
4
14
|
|
|
5
15
|
- Initial release, mostly a port of `scrapelib` functionality.
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
# Design Decisions
|
|
2
|
+
|
|
3
|
+
!!! note
|
|
4
|
+
|
|
5
|
+
This section is not necessary for users of the library to understand.
|
|
6
|
+
|
|
7
|
+
If you are looking to contribute, or curious about why things work the way they do, read on.
|
|
8
|
+
|
|
9
|
+
## Design Goals
|
|
10
|
+
|
|
11
|
+
In order of importance:
|
|
12
|
+
|
|
13
|
+
1. Work as a drop-in replacement for `httpx.Client`, so existing code can gain the benefits of the library without being changed.
|
|
14
|
+
2. Preserve the 100% typed interface of `httpx`.
|
|
15
|
+
3. Make writing & testing new augmentations easier.
|
|
16
|
+
4. Organize code in a way that'll make augmenting future libraries (or `httpx.AsyncClient`) easier. (Based on limitations in porting `scrapelib` that made a partial rewrite easier.)
|
|
17
|
+
|
|
18
|
+
## How it works
|
|
19
|
+
|
|
20
|
+
Each "suite" of behavior: throttling, retries, caching consists of a function that
|
|
21
|
+
[monkey patches](https://en.wikipedia.org/wiki/Monkey_patch) `httpx.Client` to add this behavior
|
|
22
|
+
to the all-important `request` method. (The other common methods like `get` and `post` call this method.)
|
|
23
|
+
|
|
24
|
+
If you'd like to follow along, `throttle.py` is the simplest of them, at around 50 lines long.
|
|
25
|
+
|
|
26
|
+
You'll see two functions (ignore the class for now):
|
|
27
|
+
|
|
28
|
+
- `_throttle_request` - this acts as a sort of decorator for `httpx.Client.request`
|
|
29
|
+
- `make_throttled_client` - a psuedo-constructor for our monkey patched client.
|
|
30
|
+
|
|
31
|
+
Each feature is implemented in a similar way.
|
|
32
|
+
|
|
33
|
+
The recommended [make_careful_client][careful.httpx.make_careful_client] entrypoint is
|
|
34
|
+
just a convinient combination of these `make_ZZZ_ciient` functions.
|
|
35
|
+
|
|
36
|
+
If you don't need to be convinced that monkey patching was a reasonable choice here, you can skip to [Our patch pattern](#our-patch-pattern)
|
|
37
|
+
|
|
38
|
+
## Why not inheritance?
|
|
39
|
+
|
|
40
|
+
For 15 years `scrapelib` used a class hierarchy to a similar end, it'd certainly work here too.
|
|
41
|
+
|
|
42
|
+
The equivalent to a careful client in `scrapelib` is a `scrapelib.Scraper`.
|
|
43
|
+
It has a long inheritance hierarchy:
|
|
44
|
+
|
|
45
|
+
`scrapelib.Scraper` -> `CachingSession` -> `ThrottledSession` -> `RetrySession` -> `requests.Session`
|
|
46
|
+
|
|
47
|
+
This hierarchy means that there is no such thing as a `CachingSession` that doesn't use throttling,
|
|
48
|
+
and adding new behavior means considering exactly where it works best in the chain and then setting that in stone.
|
|
49
|
+
|
|
50
|
+
There's arguably no benefit derived from having things set up this way.
|
|
51
|
+
It is just too annoying to mix & match behaviors, or add new ones; a single class would have been easier to maintain over the years.
|
|
52
|
+
|
|
53
|
+
## Why not mixins?
|
|
54
|
+
|
|
55
|
+
We don't want to just give up and go with a single monolithic class. (See design goals 3 and 4.)
|
|
56
|
+
|
|
57
|
+
It is worth revisiting why those classes weren't [mixins](https://en.wikipedia.org/wiki/Mixin).
|
|
58
|
+
|
|
59
|
+
It seems like we could have `ThrottledMixin`, `RetryMixin`, `DevCacheMixin`, etc.
|
|
60
|
+
|
|
61
|
+
Then to use retry & cache together someone would:
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
import httpx
|
|
65
|
+
from careful.hypothetical import ThrottledMixin, RetryMixin, DevCacheMixin
|
|
66
|
+
|
|
67
|
+
class CustomClient(RetryMixin, DevCacheMixin, Client):
|
|
68
|
+
pass
|
|
69
|
+
|
|
70
|
+
client = CustomClient()
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Honestly, not a great start: having to declare an empty class, and to carefully think about method resolution order rules in ordering them.
|
|
74
|
+
|
|
75
|
+
But there's a bigger problem lurking here... configuration.
|
|
76
|
+
|
|
77
|
+
Assume each mixin is configured through its constructor:
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
|
|
81
|
+
class RetryMixin:
|
|
82
|
+
def __init__(self, num_retries=2, retry_delay_seconds=10, **kwargs):
|
|
83
|
+
...
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class DevCacheMixin:
|
|
87
|
+
def __init__(self, cache_backend=..., should_cache=..., **kwargs):
|
|
88
|
+
...
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
To make this work properly, our custom class needs a constructor too.
|
|
92
|
+
It'd wind up looking like:
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
def __init__(self, num_retries, retry_delay_seconds, cache_backend, should_cache, ...)
|
|
96
|
+
# Initialize mixins explicitly
|
|
97
|
+
RetryMixin.__init__(self, num_retries=num_retries,
|
|
98
|
+
retry_delay_seconds=retry_delay_seconds)
|
|
99
|
+
|
|
100
|
+
DevCacheMixin.__init__(self, cache_backend=cache_backend,
|
|
101
|
+
should_cache=should_cache)
|
|
102
|
+
|
|
103
|
+
Client.__init__(self, **kwargs)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
This makes working with the mixins frustrating, since any new combination requires modifying a repetitive constructor.
|
|
107
|
+
|
|
108
|
+
## Preserving type signatures
|
|
109
|
+
|
|
110
|
+
One of the most annoying parts of maintaining `scrapelib` has been keeping its function signatures in sync with small `requests` changes.
|
|
111
|
+
|
|
112
|
+
Design goal #1 is that someone's existing usage of `httpx.Client` is unimpeded.
|
|
113
|
+
The most important method, and where we need to hook in our overrides, is `Client.request`.
|
|
114
|
+
The method takes a whopping 13 parameters, and of course `httpx` is also fully type-annotaed.
|
|
115
|
+
|
|
116
|
+
To replace `request` we have three options:
|
|
117
|
+
|
|
118
|
+
1. Give our new class a `request` method which takes `*args` and `**kwargs` and passes them up the chain.
|
|
119
|
+
2. Give each class a `request` method that takes the exact same 13 parameters, and be careful to keep them in sync.
|
|
120
|
+
3. Use `functools.wraps` to replace the function but leave existing annotations & docstrings in place.
|
|
121
|
+
|
|
122
|
+
\#1 reduces type safety and leads to a worse DX overall since language servers can no longer offer suggestions. **This won't work for us.**
|
|
123
|
+
|
|
124
|
+
\#2 is the approach that `scrapelib` took. It was annoying and conflicts with goals 3 and 4.
|
|
125
|
+
|
|
126
|
+
\#3 is the approach taken by `careful`, our actual monkey patch. Each `make_ZZZ_client` winds up with code resembling:
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
tclient._no_throttle_request = tclient.request
|
|
130
|
+
tclient.request = types.MethodType(
|
|
131
|
+
functools.wraps(client.request)(_throttle_request), client
|
|
132
|
+
)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
The first line tucks away the pre-patch request method for use within the decorated function. It uses a unique name since it'll be sharing a namespace with other patches.
|
|
136
|
+
|
|
137
|
+
The second line does two neat things:
|
|
138
|
+
|
|
139
|
+
- `_throttle_request` is given the signature of `client.request` via `functools.wraps`
|
|
140
|
+
- `types.MethodType` rebinds the member function (so `self` is correctly handled as the first parameter)
|
|
141
|
+
|
|
142
|
+
## Our patch pattern
|
|
143
|
+
|
|
144
|
+
Augmenting `Client` is done in two steps:
|
|
145
|
+
|
|
146
|
+
- a request function that acts as a decorator for `Client.request`; this is where the actual logic for the augmentation lives
|
|
147
|
+
- a patch function that:
|
|
148
|
+
- writes any private state needed for the new behavior to a `Client` instance
|
|
149
|
+
- replaces `Client.request` with our patched request
|
|
150
|
+
|
|
151
|
+
All of the files in `careful.httpx` follow this structure.
|
|
152
|
+
|
|
153
|
+
### Protocol-typing the internal interface
|
|
154
|
+
|
|
155
|
+
After considering some of the issues above, I was considering that I'd probably have to have type ignore statements everywhere to get the monkey patching to work with a type checker.
|
|
156
|
+
|
|
157
|
+
While this might have been an option, after all the end user experience is the priority, it'd be nice to keep the benefits of type checking for myself and other authors of extensions.
|
|
158
|
+
|
|
159
|
+
It turns out, as long as we consider the patches fully internal to the `Client`, there is a way to make this work.
|
|
160
|
+
|
|
161
|
+
The key challenge presented is that we add new variables to the `Client` during augmentation:
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
# this is that internal state we store on a client
|
|
165
|
+
tclient._last_request = 0.0
|
|
166
|
+
tclient._requests_per_minute = requests_per_minute
|
|
167
|
+
tclient._request_frequency = 60.0 / requests_per_minute
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
These set off type checker alarms, and if we simply ignore them then when they are used in the request wrapper
|
|
171
|
+
they'll set off alarms again!
|
|
172
|
+
|
|
173
|
+
It'd be nice to at least have a consistency check between these, so the wrapped `request` doesn't accidentally use the wrong name, I typed `_requests_per_second` at least once while writing.
|
|
174
|
+
|
|
175
|
+
The answer here is a `Protocol` and a `cast`.
|
|
176
|
+
|
|
177
|
+
Each augmentation now comes with a `typing.Protocol`:
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
class Throttled(Protocol):
|
|
181
|
+
_last_request: float
|
|
182
|
+
_requests_per_minute: float
|
|
183
|
+
_request_frequency: float
|
|
184
|
+
_no_throttle_request: Callable
|
|
185
|
+
request: Callable
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
This defines all of the hidden state for the augmentation, as well as a placeholder for our overridden `request`.
|
|
189
|
+
|
|
190
|
+
Then, our decorator looks like this:
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
def _throttle_request(client: Throttled, *args, **kwargs) -> Response:
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
Which satisfies the type checker when it comes to internal use of those new attributes.
|
|
197
|
+
|
|
198
|
+
The final change comes in where we initialize the attributes in the wrapper functions:
|
|
199
|
+
|
|
200
|
+
```python
|
|
201
|
+
# a cast is made to the new type, allowing assignment
|
|
202
|
+
tclient = cast(Throttled, client)
|
|
203
|
+
|
|
204
|
+
tclient._last_request = 0.0
|
|
205
|
+
tclient._requests_per_minute = requests_per_minute
|
|
206
|
+
tclient._request_frequency = 60.0 / requests_per_minute
|
|
207
|
+
|
|
208
|
+
tclient._no_throttle_request = client.request
|
|
209
|
+
tclient.request = types.MethodType(
|
|
210
|
+
functools.wraps(client.request)(_throttle_request), client
|
|
211
|
+
)
|
|
212
|
+
# the original client can be returned, of type `Client`
|
|
213
|
+
return client
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
### Closing thoughts
|
|
218
|
+
|
|
219
|
+
With this approach, users do not know at any point they have a `ThrottledClient` or a `CachedClient`, etc.
|
|
220
|
+
Not having the final type change is not ideal, but the compromise made for today.
|
|
221
|
+
|
|
222
|
+
It would be nice to be able to expose an extra method or two, but this approach leans on only having private attributes & therefore being able to safely treat an augmented client as a `Client`.
|
|
223
|
+
|
|
224
|
+
There's almost certainly room for improvement, but I'm fairly happy with the trade-offs for now.
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# careful
|
|
2
|
+
|
|
3
|
+
<img src="carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
|
|
4
|
+
|
|
5
|
+
**careful** is a Python library for writing resilient, well-behaved HTTP clients.
|
|
6
|
+
|
|
7
|
+
**Code**: <https://codeberg.org/jpt/careful>
|
|
8
|
+
|
|
9
|
+
**Docs**: <https://jpt.sh/projects/careful/>
|
|
10
|
+
|
|
11
|
+

|
|
12
|
+
[](https://ci.codeberg.org/repos/15185)
|
|
13
|
+
|
|
14
|
+
Call one function to enchant an
|
|
15
|
+
**[httpx.Client](https://www.python-httpx.org)**, making your HTTP connections more resilient and better mannered.
|
|
16
|
+
|
|
17
|
+
- Configure **throttling** to avoid accidental Denial-of-Service / risking getting banned.
|
|
18
|
+
- **Retries** help overcome intermittent failures on flaky sites or long crawls.
|
|
19
|
+
- **Development caching** Cache persists between runs during development, reduces redundant requests made while iterating on your crawlers & scrapers.
|
|
20
|
+
|
|
21
|
+
Additionally the library is fully typed, thoroughly tested, and [based on code that powered production web scrapers for 15+ years](changelog#scrapelib).
|
|
22
|
+
|
|
23
|
+
### Example
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
from httpx import Client
|
|
27
|
+
from careful.httpx import make_careful_client
|
|
28
|
+
|
|
29
|
+
# the only function you need to call is make_careful_client
|
|
30
|
+
# this wraps your existing `httpx.Client` with your preferred
|
|
31
|
+
# careful behaviors
|
|
32
|
+
|
|
33
|
+
client = make_careful_client(
|
|
34
|
+
client=Client(headers={'user-agent': 'spiderman/1.0'}),
|
|
35
|
+
|
|
36
|
+
# retries are configurable w/ exponential back off
|
|
37
|
+
retry_attempts=2,
|
|
38
|
+
retry_wait_seconds=5,
|
|
39
|
+
|
|
40
|
+
# can cache to process memory, filesystem, or SQLite
|
|
41
|
+
cache_storage=MemoryCache(),
|
|
42
|
+
|
|
43
|
+
# easy-to-configure throttling
|
|
44
|
+
requests_per_minute=60,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
# methods on client are called as they always are
|
|
48
|
+
# configured behaviors occur without further code changes
|
|
49
|
+
client.get("https://example.com")
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
Logo licensed from [Adrien Coquet via Noun Project](https://thenounproject.com/icon/carefully-3681327/)
|
|
56
|
+
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# Usage
|
|
2
|
+
|
|
3
|
+
Most users will only need to call `careful.httpx.make_careful_client`.
|
|
4
|
+
|
|
5
|
+
::: careful.httpx.make_careful_client
|
|
6
|
+
options:
|
|
7
|
+
annotations_path: brief
|
|
8
|
+
show_signature: false
|
|
9
|
+
show_root_heading: true
|
|
10
|
+
|
|
11
|
+
## Throttling
|
|
12
|
+
|
|
13
|
+
If `requests_per_minute` is set, standard (non-retry) requests will automatically
|
|
14
|
+
sleep for a short period to target the given rate.
|
|
15
|
+
|
|
16
|
+
For example, at 30rpm, the sleep time on a fast request will be close to 2 seconds.
|
|
17
|
+
|
|
18
|
+
```python
|
|
19
|
+
client = make_careful_client(requests_per_minute=20)
|
|
20
|
+
|
|
21
|
+
for page in range(10):
|
|
22
|
+
# will sleep ~3 seconds each time
|
|
23
|
+
client.get(f"https://example.com?page={page}")
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Retries
|
|
27
|
+
|
|
28
|
+
If `retry_attempts` is set, responses will be passed to `should_retry`.
|
|
29
|
+
Responses that are rejected (return `True`) will be retried after a wait based on
|
|
30
|
+
`retry_wait_seconds`.
|
|
31
|
+
Each retry will wait twice as long as the one before.
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
client = make_careful_client(retry_attempts=2, retry_wait_seconds=30)
|
|
35
|
+
|
|
36
|
+
# will try, wait 30s, try again, wait 60s, try again, then give up & return the 500
|
|
37
|
+
client.get("https://httpbin.org/status/500")
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
There are a few simple pre-written `should_retry` predicates you can use:
|
|
41
|
+
|
|
42
|
+
::: careful.httpx.retries.retry_default_rule
|
|
43
|
+
options:
|
|
44
|
+
heading_level: 3
|
|
45
|
+
show_root_heading: true
|
|
46
|
+
::: careful.httpx.retries.retry_only_500s
|
|
47
|
+
options:
|
|
48
|
+
heading_level: 3
|
|
49
|
+
show_root_heading: true
|
|
50
|
+
::: careful.httpx.retries.retry_all_400s_500s
|
|
51
|
+
options:
|
|
52
|
+
heading_level: 3
|
|
53
|
+
show_root_heading: true
|
|
54
|
+
|
|
55
|
+
## Development Caching
|
|
56
|
+
|
|
57
|
+
!!! warning "Why _development_ caching?"
|
|
58
|
+
|
|
59
|
+
This feature is named as a reminder that **this is not true HTTP caching**, which
|
|
60
|
+
should take various headers into account. Look at libraries like [hishel](https://hishel.com) if that's what you are after.
|
|
61
|
+
|
|
62
|
+
The purpose of this feature is to allow you to cache all of your HTTP requests during development.
|
|
63
|
+
Often when writing a scraper or crawler, you wind up hitting the site you are working on more often than you'd like-- each time you iterate on your code you're likely making redundant requests to pages that haven't changed.
|
|
64
|
+
|
|
65
|
+
By caching all successful requests (configurable with the `should_cache` parameter),
|
|
66
|
+
you can easily re-run scrapers without making redundant HTTP requests.
|
|
67
|
+
This means much faster development & happier upstream servers.
|
|
68
|
+
|
|
69
|
+
To enable development caching, assign a [`MemoryCache`][careful.httpx.MemoryCache],
|
|
70
|
+
[`FileCache`][careful.httpx.FileCache], or [`SqliteCache`][careful.httpx.SqliteCache] to
|
|
71
|
+
the `cache_storage` property of a `scrapelib.Scraper`.
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
client = make_careful_client(
|
|
75
|
+
    cache_storage=FileCache("_cache")
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
# only one HTTP request is made
|
|
79
|
+
client.get("https://example.com")
|
|
80
|
+
client.get("https://example.com")
|
|
81
|
+
client.get("https://example.com")
|
|
82
|
+
client.get("https://example.com")
|
|
83
|
+
# on subsequent runs, zero will be made until _cache is cleared
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## robots.txt
|
|
87
|
+
|
|
88
|
+
If `check_robots_txt` is set to `True`, then each request will be checked
|
|
89
|
+
against that domain's `robots.txt`.
|
|
90
|
+
(Each domain's `robots.txt` will be fetched once and cached.)
|
|
91
|
+
|
|
92
|
+
Evaluating `robots.txt` requires a *user agent*. `careful` will use the user agent
|
|
93
|
+
set on the `client` at the time of creation by default, but sometimes it is preferable
|
|
94
|
+
to use a custom user agent for these checks. To do so, set the `robots_txt_user_agent` parameter.
|
|
95
|
+
|
|
96
|
+
The default behavior when checking is enabled is that any attempt to fetch a denied page will
|
|
97
|
+
raise a `RobotExclusionError`. This behavior can be overridden by passing a function to `robots_txt_on_reject` that takes two parameters: `(url: str, rfp: RobotFileParser)`.
|
|
98
|
+
|
|
99
|
+
::: careful.httpx.RobotExclusionError
|
|
100
|
+
options:
|
|
101
|
+
heading_level: 3
|
|
102
|
+
members: False
|
|
103
|
+
show_root_heading: true
|
|
104
|
+
|
|
105
|
+
::: careful.httpx.robots.warn_robots_txt
|
|
106
|
+
options:
|
|
107
|
+
heading_level: 3
|
|
108
|
+
members: False
|
|
109
|
+
show_root_heading: true
|
|
110
|
+
|
|
111
|
+
### Note about multiple enhancements
|
|
112
|
+
|
|
113
|
+
When multiple features are applied, the order of wrapping ensures that:
|
|
114
|
+
|
|
115
|
+
- the cache is checked first, and bypasses throttling if hit
|
|
116
|
+
- retries use their own delays, but are not throttled separately
|
|
117
|
+
|
|
118
|
+
## Cache Storage Options
|
|
119
|
+
|
|
120
|
+
These options are available for `cache_storage`:
|
|
121
|
+
|
|
122
|
+
::: careful.httpx.MemoryCache
|
|
123
|
+
options:
|
|
124
|
+
heading_level: 3
|
|
125
|
+
members: False
|
|
126
|
+
show_root_heading: true
|
|
127
|
+
|
|
128
|
+
::: careful.httpx.FileCache
|
|
129
|
+
options:
|
|
130
|
+
heading_level: 3
|
|
131
|
+
members: False
|
|
132
|
+
show_root_heading: true
|
|
133
|
+
|
|
134
|
+
::: careful.httpx.SqliteCache
|
|
135
|
+
options:
|
|
136
|
+
heading_level: 3
|
|
137
|
+
members: False
|
|
138
|
+
show_root_heading: true
|
|
139
|
+
|
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
site_name: careful
|
|
2
|
-
site_url: https://
|
|
2
|
+
site_url: https://jpt.sh/projects/careful/
|
|
3
3
|
site_author: James Turk
|
|
4
4
|
site_description: A library for making requests to unreliable sites with httpx.
|
|
5
5
|
copyright: Copyright © 2025 James Turk
|
|
6
6
|
repo_url: https://codeberg.org/jpt/careful
|
|
7
7
|
repo_name: careful
|
|
8
|
-
#edit_uri: edit/main/docs/
|
|
9
8
|
|
|
10
9
|
theme:
|
|
11
10
|
name: material
|
|
@@ -29,8 +28,7 @@ theme:
|
|
|
29
28
|
- navigation.sections
|
|
30
29
|
- navigation.top
|
|
31
30
|
- content.tabs.link
|
|
32
|
-
|
|
33
|
-
# repo:
|
|
31
|
+
|
|
34
32
|
markdown_extensions:
|
|
35
33
|
- admonition
|
|
36
34
|
- def_list
|
|
@@ -51,4 +49,5 @@ watch:
|
|
|
51
49
|
nav:
|
|
52
50
|
- 'index.md'
|
|
53
51
|
- 'reference.md'
|
|
52
|
+
- 'design.md'
|
|
54
53
|
- 'changelog.md'
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "careful"
|
|
3
|
-
version = "0.
|
|
4
|
-
description = "
|
|
3
|
+
version = "0.3.1"
|
|
4
|
+
description = "a small library for writing resilient, well-behaved HTTP code"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [
|
|
7
7
|
{ name = "jpt", email = "dev@jpt.sh" }
|
|
@@ -18,15 +18,11 @@ classifiers = [
|
|
|
18
18
|
"Programming Language :: Python :: 3.11",
|
|
19
19
|
"Programming Language :: Python :: 3.12",
|
|
20
20
|
"Programming Language :: Python :: 3.13",
|
|
21
|
+
"Programming Language :: Python :: 3.14",
|
|
21
22
|
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
22
23
|
]
|
|
23
24
|
dependencies = [
|
|
24
25
|
"httpx>=0.28.1",
|
|
25
|
-
"mkdocs-material>=9.6.18",
|
|
26
|
-
"mkdocstrings>=0.30.0",
|
|
27
|
-
"mkdocstrings-python>=1.18.2",
|
|
28
|
-
"pytest>=8.4.2",
|
|
29
|
-
"pytest-httpbin>=2.1.0",
|
|
30
26
|
]
|
|
31
27
|
[project.urls]
|
|
32
28
|
Repository = "https://codeberg.org/jpt/careful"
|
|
@@ -35,3 +31,15 @@ Repository = "https://codeberg.org/jpt/careful"
|
|
|
35
31
|
[build-system]
|
|
36
32
|
requires = ["hatchling"]
|
|
37
33
|
build-backend = "hatchling.build"
|
|
34
|
+
|
|
35
|
+
[dependency-groups]
|
|
36
|
+
dev = [
|
|
37
|
+
"ruff>=0.12.12",
|
|
38
|
+
"mkdocs-material>=9.6.18",
|
|
39
|
+
"mkdocstrings>=0.30.0",
|
|
40
|
+
"mkdocstrings-python>=1.18.2",
|
|
41
|
+
"pytest>=8.4.2",
|
|
42
|
+
]
|
|
43
|
+
httpbin = [
|
|
44
|
+
"pytest-httpbin>=2.1.0",
|
|
45
|
+
]
|