careful 0.1.0.tar.gz → 0.2.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {careful-0.1.0 → careful-0.2.1}/.gitignore +2 -0
- careful-0.2.1/.woodpecker.yml +32 -0
- careful-0.2.1/Justfile +16 -0
- careful-0.2.1/PKG-INFO +68 -0
- careful-0.2.1/README.md +46 -0
- careful-0.2.1/docs/carefully-3681327.svg +9 -0
- careful-0.2.1/docs/changelog.md +29 -0
- careful-0.2.1/docs/index.md +47 -0
- careful-0.2.1/docs/reference.md +32 -0
- careful-0.2.1/mkdocs.yml +52 -0
- careful-0.2.1/pyproject.toml +44 -0
- careful-0.2.1/src/careful/httpx/__init__.py +127 -0
- careful-0.2.1/src/careful/httpx/_types.py +6 -0
- {careful-0.1.0 → careful-0.2.1}/src/careful/httpx/dev_cache.py +67 -21
- {careful-0.1.0 → careful-0.2.1}/src/careful/httpx/retries.py +20 -17
- {careful-0.1.0 → careful-0.2.1}/src/careful/httpx/throttle.py +2 -2
- {careful-0.1.0 → careful-0.2.1}/tests/fakeresponse.py +0 -2
- {careful-0.1.0 → careful-0.2.1}/tests/test_cache.py +2 -2
- careful-0.2.1/tests/test_careful.py +47 -0
- {careful-0.1.0 → careful-0.2.1}/tests/test_retries.py +10 -7
- {careful-0.1.0 → careful-0.2.1}/tests/test_throttle.py +4 -5
- careful-0.1.0/PKG-INFO +0 -48
- careful-0.1.0/README.md +0 -36
- careful-0.1.0/pyproject.toml +0 -19
- careful-0.1.0/src/careful/httpx/__init__.py +0 -59
- careful-0.1.0/tests/test_careful.py +0 -4
- {careful-0.1.0 → careful-0.2.1}/.pre-commit-config.yaml +0 -0
- {careful-0.1.0 → careful-0.2.1}/LICENSE +0 -0
- {careful-0.1.0 → careful-0.2.1}/src/careful/__init__.py +0 -0
- {careful-0.1.0 → careful-0.2.1}/src/careful/httpx/py.typed +0 -0
careful-0.2.1/.woodpecker.yml
ADDED
@@ -0,0 +1,32 @@
+when:
+  - event: push
+  - event: pull_request
+
+matrix:
+  PYTHON_VERSION:
+    - "3.10"
+    - "3.11"
+    - "3.12"
+    - "3.13"
+
+steps:
+  lint:
+    image: python:3.11
+    commands:
+      - curl -LsSf https://astral.sh/uv/install.sh | sh
+      - "export PATH=/root/.local/bin:$PATH"
+      - uv run ruff check .
+      - uv run ruff format --check .
+    when:
+      - matrix:
+          PYTHON_VERSION: 3.13
+
+
+  test:
+    image: "python:${PYTHON_VERSION}"
+    commands:
+      - curl -LsSf https://astral.sh/uv/install.sh | sh
+      - "export PATH=/root/.local/bin:$PATH"
+      - uv run --python ${PYTHON_VERSION} pytest
+    depends_on:
+      - lint
careful-0.2.1/Justfile
ADDED
careful-0.2.1/PKG-INFO
ADDED
@@ -0,0 +1,68 @@
+Metadata-Version: 2.4
+Name: careful
+Version: 0.2.1
+Summary: careful extensions to httpx: throttle, retry, cache
+Project-URL: Repository, https://codeberg.org/jpt/careful
+Author-email: jpt <dev@jpt.sh>
+License: BSD-2-Clause
+License-File: LICENSE
+Classifier: Development Status :: 6 - Mature
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.10
+Requires-Dist: httpx>=0.28.1
+Description-Content-Type: text/markdown
+
+# careful
+
+<img src="https://careful.jpt.sh/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
+
+**careful** is a Python library for making requests to unreliable websites with `httpx`.
+
+**Code**: <https://codeberg.org/jpt/careful>
+
+**Docs**: <https://careful.jpt.sh>
+
+[](https://ci.codeberg.org/repos/15185)
+
+It offers enhancements to
+[`httpx.Client`](https://www.python-httpx.org)
+useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+
+- **configurable retry support.** retry on timeouts or other errors, with exponential back-off.
+- **simple request throttling.** set a maximum number of requests per minute.
+- **development cache.** configurable caching aimed at reducing redundant requests made while authoring/testing web scrapers.
+
+### example
+
+```python
+from httpx import Client
+from careful.httpx import make_careful_client, MemoryCache
+
+client = make_careful_client(
+    # can configure httpx.Client however you usually would
+    client=Client(headers={'user-agent': 'careful/1.0'}),
+    # retries are configurable w/ exponential back-off
+    retry_attempts=2,
+    retry_wait_seconds=5,
+    # can cache to process memory, filesystem, or SQLite
+    cache_storage=MemoryCache(),
+    # requests will automatically be throttled to aim at this rate
+    requests_per_minute=60,
+)
+
+# all normal methods on httpx.Client make use of the configured enhancements
+client.get("https://example.com")
+```
+
+
+---
+
+Logo licensed from [Adrien Coquet via Noun Project](https://thenounproject.com/icon/carefully-3681327/)
careful-0.2.1/README.md
ADDED
@@ -0,0 +1,46 @@
+# careful
+
+<img src="https://careful.jpt.sh/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
+
+**careful** is a Python library for making requests to unreliable websites with `httpx`.
+
+**Code**: <https://codeberg.org/jpt/careful>
+
+**Docs**: <https://careful.jpt.sh>
+
+[](https://ci.codeberg.org/repos/15185)
+
+It offers enhancements to
+[`httpx.Client`](https://www.python-httpx.org)
+useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+
+- **configurable retry support.** retry on timeouts or other errors, with exponential back-off.
+- **simple request throttling.** set a maximum number of requests per minute.
+- **development cache.** configurable caching aimed at reducing redundant requests made while authoring/testing web scrapers.
+
+### example
+
+```python
+from httpx import Client
+from careful.httpx import make_careful_client, MemoryCache
+
+client = make_careful_client(
+    # can configure httpx.Client however you usually would
+    client=Client(headers={'user-agent': 'careful/1.0'}),
+    # retries are configurable w/ exponential back-off
+    retry_attempts=2,
+    retry_wait_seconds=5,
+    # can cache to process memory, filesystem, or SQLite
+    cache_storage=MemoryCache(),
+    # requests will automatically be throttled to aim at this rate
+    requests_per_minute=60,
+)
+
+# all normal methods on httpx.Client make use of the configured enhancements
+client.get("https://example.com")
+```
+
+
+---
+
+Logo licensed from [Adrien Coquet via Noun Project](https://thenounproject.com/icon/carefully-3681327/)
careful-0.2.1/docs/carefully-3681327.svg
ADDED
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg width="1200pt" height="1200pt" version="1.1" viewBox="0 0 1200 1200" xmlns="http://www.w3.org/2000/svg">
+<path d="m600 195.6c19.199 0 34.801-15.602 34.801-34.801v-96c0-19.199-15.602-34.801-34.801-34.801s-34.801 15.602-34.801 34.801v97.199c0 18 15.602 33.602 34.801 33.602z" fill="#ff814a"/>
+<path d="m734.4 225.6c4.8008 2.3984 9.6016 3.6016 14.398 3.6016 13.199 0 25.199-7.1992 31.199-20.398l40.801-87.602c8.3984-16.801 0-37.199-16.801-45.602-16.801-8.3984-37.199 0-45.602 16.801l-39.594 87.598c-8.4023 16.801-1.1992 37.199 15.598 45.602z" fill="#ff814a"/>
+<path d="m420 208.8c6 12 18 20.398 31.199 20.398 4.8008 0 9.6016-1.1992 14.398-3.6016 16.801-8.3984 25.199-28.801 16.801-45.602l-40.801-87.602c-8.3984-16.801-28.801-25.199-45.602-16.801-16.801 8.3984-25.199 28.801-16.801 45.602z" fill="#ff814a"/>
+<path d="m632.4 746.4c0 17.895-14.504 32.402-32.398 32.402s-32.398-14.508-32.398-32.402c0-17.891 14.504-32.398 32.398-32.398s32.398 14.508 32.398 32.398z" fill="#ff814a"/>
+<path d="m598.8 691.2s1.1992 0 0 0c13.199 0 22.801-9.6016 24-21.602l6-133.2v-2.3984c-1.1992-15.602-14.398-27.602-30-27.602-15.602 1.1992-27.602 14.398-27.602 30l6 133.2c1.1992 12.004 9.6016 21.602 21.602 21.602z" fill="#ff814a"/>
+<path d="m871.2 333.6c-4.8008-24-25.199-42-50.398-42h-441.6c-24 0-45.602 18-50.398 42l-150 806.4c-2.3984 15.602 9.6016 30 25.199 30h76.801c12 0 22.801-8.3984 25.199-20.398l32.398-171.6h526.8l32.398 171.6c2.3984 12 13.199 20.398 25.199 20.398h73.203c15.602 0 27.602-14.398 25.199-30zm-87.598 494.4h-367.2c-33.602 0-54-36-37.199-64.801l183.6-315.6c16.801-28.801 57.602-28.801 74.398 0l183.6 315.6c15.598 28.801-4.8008 64.801-37.199 64.801z" fill="#ff814a"/>
+</svg>
careful-0.2.1/docs/changelog.md
ADDED
@@ -0,0 +1,29 @@
+# Changelog
+
+## 0.2.0 - 6 September 2025
+
+- Initial release, mostly a port of `scrapelib` functionality.
+
+## scrapelib
+
+The original version of this library is a port of `scrapelib` (2.4.1).
+
+Changes from this version were to:
+
+- use `httpx` instead of `requests`
+- drop quite a few unnecessary features that were mainly in `scrapelib` for backwards-compatibility reasons.
+- use a composable interface instead of the inheritance-based one from `scrapelib`, aiming at making future enhancements/porting easier.
+
+This library is a partial rewrite of [scrapelib](https://pypi.org/project/scrapelib/).
+Thanks to all of [scrapelib's original contributors](https://github.com/jamesturk/scrapelib/graphs/contributors) and users.
+
+`scrapelib` originally wrapped `urllib2`, eventually migrating to `requests`.
+
+There are a few things that scrapelib did that this doesn't:
+
+- support FTP requests via HTTP-like API
+- extend the client with a `urlretrieve` function
+- provide helpers for working with headers, timeouts, and custom ciphers
+
+The first two are possible but didn't seem necessary at the moment.
+The last was very `requests`-specific, and so hasn't been replicated here.
careful-0.2.1/docs/index.md
ADDED
@@ -0,0 +1,47 @@
+# careful
+
+<img src="/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
+
+**careful** is a Python library for making requests to unreliable websites with `httpx`.
+
+**Code**: <https://codeberg.org/jpt/careful>
+
+**Docs**: <https://careful.jpt.sh>
+
+[](https://ci.codeberg.org/repos/15185)
+
+It offers enhancements to
+[`httpx.Client`](https://www.python-httpx.org)
+useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+
+- **configurable retry support.** retry on timeouts or other errors, with exponential back-off.
+- **simple request throttling.** set a maximum number of requests per minute.
+- **development cache.** configurable caching aimed at reducing redundant requests made while authoring/testing web scrapers.
+
+### example
+
+```python
+from httpx import Client
+from careful.httpx import make_careful_client, MemoryCache
+
+client = make_careful_client(
+    # can configure httpx.Client however you usually would
+    client=Client(headers={'user-agent': 'careful/1.0'}),
+    # retries are configurable w/ exponential back-off
+    retry_attempts=2,
+    retry_wait_seconds=5,
+    # can cache to process memory, filesystem, or SQLite
+    cache_storage=MemoryCache(),
+    # requests will automatically be throttled to aim at this rate
+    requests_per_minute=60,
+)
+
+# all normal methods on httpx.Client make use of the configured enhancements
+client.get("https://example.com")
+```
+
+
+---
+
+Logo licensed from [Adrien Coquet via Noun Project](https://thenounproject.com/icon/carefully-3681327/)
+
careful-0.2.1/docs/reference.md
ADDED
@@ -0,0 +1,32 @@
+# Usage
+
+Most users will only need to call `careful.httpx.make_careful_client`.
+
+::: careful.httpx.make_careful_client
+    options:
+      annotations_path: brief
+      show_signature: false
+      show_root_heading: true
+
+
+## cache storage
+
+
+::: careful.httpx.MemoryCache
+    options:
+      heading_level: 3
+      members: False
+      show_root_heading: true
+
+::: careful.httpx.FileCache
+    options:
+      heading_level: 3
+      members: False
+      show_root_heading: true
+
+::: careful.httpx.SqliteCache
+    options:
+      heading_level: 3
+      members: False
+      show_root_heading: true
+
careful-0.2.1/mkdocs.yml
ADDED
@@ -0,0 +1,52 @@
+site_name: careful
+site_url: https://careful.jpt.sh/
+site_author: James Turk
+site_description: A library for making requests to unreliable sites with httpx.
+copyright: Copyright © 2025 James Turk
+repo_url: https://codeberg.org/jpt/careful
+repo_name: careful
+
+theme:
+  name: material
+  logo: carefully-3681327.svg
+  palette:
+    - scheme: default
+      primary: teal
+      accent: teal
+      toggle:
+        icon: material/toggle-switch-off-outline
+        name: Switch to dark mode
+    - scheme: slate
+      primary: teal
+      accent: teal
+      toggle:
+        icon: material/toggle-switch
+        name: Switch to light mode
+
+  features:
+    #- navigation.tabs
+    - navigation.sections
+    - navigation.top
+    - content.tabs.link
+
+markdown_extensions:
+  - admonition
+  - def_list
+  - pymdownx.highlight:
+      anchor_linenums: true
+      line_spans: __span
+      pygments_lang_class: true
+  - pymdownx.inlinehilite
+  - pymdownx.snippets
+  - pymdownx.superfences
+  - toc:
+      permalink: true
+plugins:
+  - search
+  - mkdocstrings:
+watch:
+  - src
+nav:
+  - 'index.md'
+  - 'reference.md'
+  - 'changelog.md'
careful-0.2.1/pyproject.toml
ADDED
@@ -0,0 +1,44 @@
+[project]
+name = "careful"
+version = "0.2.1"
+description = "careful extensions to httpx: throttle, retry, cache"
+readme = "README.md"
+authors = [
+    { name = "jpt", email = "dev@jpt.sh" }
+]
+license = {text = "BSD-2-Clause"}
+requires-python = ">=3.10"
+classifiers = [
+    "Development Status :: 6 - Mature",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: BSD License",
+    "Natural Language :: English",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+dependencies = [
+    "httpx>=0.28.1",
+]
+[project.urls]
+Repository = "https://codeberg.org/jpt/careful"


+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"

+[dependency-groups]
+dev = [
+    "ruff>=0.12.12",
+    "mkdocs-material>=9.6.18",
+    "mkdocstrings>=0.30.0",
+    "mkdocstrings-python>=1.18.2",
+    "pytest>=8.4.2",
+]
+httpbin = [
+    "pytest-httpbin>=2.1.0",
+]
careful-0.2.1/src/careful/httpx/__init__.py
ADDED
@@ -0,0 +1,127 @@
+from .retries import make_retry_client, retry_default_rule
+from .throttle import make_throttled_client
+from .dev_cache import (
+    make_dev_caching_client,
+    MemoryCache,
+    FileCache,
+    SqliteCache,
+    CacheStorageBase,
+    _cache_200s,
+    _default_keyfunc,
+)
+from ._types import ResponsePredicate, CacheKeyfunc
+from httpx import Client
+
+
+def make_careful_client(
+    *,
+    client: Client | None = None,
+    retry_attempts: int = 0,
+    retry_wait_seconds: float = 10,
+    should_retry: ResponsePredicate = retry_default_rule,
+    requests_per_minute: int = 0,
+    cache_storage: CacheStorageBase | None = None,
+    cache_write_only: bool = False,
+    should_cache: ResponsePredicate = _cache_200s,
+    cache_keyfunc: CacheKeyfunc = _default_keyfunc,
+):
+    """
+    This function patches an `httpx.Client` so that all requests made with the client support
+    [retries](#retries), [throttling](#throttling), and [development caching](#development-caching).
+
+
+    Parameters:
+        client: A pre-configured `httpx.Client`. If omitted, a default client will be created.
+
+        retry_attempts: Maximum number of retries. If non-zero, will retry up to this many times
+            with increasing wait times, starting with `retry_wait_seconds`.
+
+        retry_wait_seconds: Number of seconds to sleep between the first attempt and first retry.
+            Subsequent attempts will increase exponentially (2x, 4x, 8x, etc.)
+
+        should_retry: Predicate function that takes an `httpx.Response` and returns `True` if it should be retried.
+
+        requests_per_minute: Maximum number of requests per minute. (e.g. 30 will throttle to ~2s between requests)
+
+        cache_storage: An object that implements the [cache storage interface](#cache-storage).
+
+        cache_write_only: Update the cache, but never read from it.
+
+        should_cache: Predicate function that takes an `httpx.Response` and returns `True` if it should be cached.
+
+        cache_keyfunc: Function that takes request details and returns a unique cache key.
+
+    ## Retries
+
+    If `retry_attempts` is set, responses will be passed to `should_retry`.
+    Responses that are rejected (return `True`) will be retried after a wait based on
+    `retry_wait_seconds`.
+    Each retry will wait twice as long as the one before.
+
+    ## Throttling
+
+    If `requests_per_minute` is set, standard (non-retry) requests will automatically
+    sleep for a short period to target the given rate.
+
+    For example, at 30rpm, the sleep time on a fast request will be close to 2 seconds.
+
+    ## Development Caching
+
+    Why **development caching?**
+
+    This feature is named as a reminder that **this is not true HTTP caching**, which
+    should take various headers into account. Look at libraries like [hishel](https://hishel.com) if that's what you are after.
+
+    The purpose of this feature is to allow you to cache all of your HTTP requests during development.
+    Often when writing a scraper or crawler, you wind up hitting the site you are working on more often than you'd like: each time you iterate on your code you're likely making redundant requests to pages that haven't changed.
+
+    By caching all successful requests (configurable with the `should_cache` parameter),
+    you can easily re-run scrapers without making redundant HTTP requests.
+    This means faster development time & happier upstream servers.
+
+    To enable development caching, pass a [`MemoryCache`][careful.httpx.MemoryCache],
+    [`FileCache`][careful.httpx.FileCache], or [`SqliteCache`][careful.httpx.SqliteCache] as
+    the `cache_storage` parameter of `make_careful_client`.
+
+    ---
+
+    When multiple features are applied, the order of wrapping ensures that:
+    - the cache is checked first, and bypasses throttling if hit
+    - retries use their own delays, but are not throttled separately
+    """
+    if client is None:
+        client = Client()
+    # order matters, retry on inside b/c it is last-chance scenario
+    if retry_attempts:
+        client = make_retry_client(
+            client=client,
+            attempts=retry_attempts,
+            wait_seconds=retry_wait_seconds,
+            should_retry=should_retry,
+        )
+    # throttling around retries
+    if requests_per_minute:
+        client = make_throttled_client(
+            client=client, requests_per_minute=requests_per_minute
+        )
+    # caching on top layer, so cache will be checked first
+    if cache_storage:
+        client = make_dev_caching_client(
+            client=client,
+            cache_storage=cache_storage,
+            cache_keyfunc=cache_keyfunc,
+            should_cache=should_cache,
+            write_only=cache_write_only,
+        )
+
+    return client
+
+
+__all__ = [
+    "make_retry_client",
+    "make_throttled_client",
+    "make_dev_caching_client",
+    "MemoryCache",
+    "FileCache",
+    "SqliteCache",
+]
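Editor's note: `make_careful_client` above applies the three wrappers in a fixed order (retries innermost, throttling around retries, caching outermost). Since the individual wrappers are also exported, the same layering can be reproduced by hand. A minimal sketch using only names and parameters that appear in the diff above (behavior assumed to match the 0.2.1 source):

```python
from httpx import Client
from careful.httpx import (
    make_retry_client,
    make_throttled_client,
    make_dev_caching_client,
    MemoryCache,
)

client = Client(headers={"user-agent": "careful/1.0"})
# innermost: retries are the last-chance scenario
client = make_retry_client(client=client, attempts=2, wait_seconds=5)
# throttling wraps retries, so retry waits are not throttled separately
client = make_throttled_client(client=client, requests_per_minute=60)
# caching is the outermost layer, so the cache is checked first
client = make_dev_caching_client(client=client, cache_storage=MemoryCache())

client.get("https://example.com")
```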
{careful-0.1.0 → careful-0.2.1}/src/careful/httpx/dev_cache.py
CHANGED
@@ -1,3 +1,4 @@
+import abc
 import types
 import functools
 import logging
@@ -40,13 +41,6 @@ def _cache_200s(response: Response) -> bool:
 
 
 def _cached_request(client: Client, *args, **kwargs):
-    # short circuit if cache isn't configured
-    if not client._cache_storage:
-        log.debug("bypassing cache, no storage configured")
-        resp = client._wrapped_request(*args, **kwargs)
-        resp.fromcache = False
-        return resp
-
     method, url = args
     request_key = client._cache_keyfunc(method, url, kwargs["params"])
 
@@ -61,7 +55,7 @@ def _cached_request(client: Client, *args, **kwargs):
         cached_resp.fromcache = True
         resp = cached_resp
     else:
-        resp = client._wrapped_request(*args, **kwargs)
+        resp = client._no_cache_request(*args, **kwargs)
     # save to cache if request and response meet criteria
     log.debug("XX %s %s", request_key, client._should_cache(resp))
     if request_key and client._should_cache(resp):
@@ -80,6 +74,27 @@ def make_dev_caching_client(
     should_cache=_cache_200s,
     write_only=False,
 ):
+    """
+    Returns an enhanced `httpx.Client` where requests are saved to a
+    specified cache.
+
+    This is denoted as a "dev_cache" because it is not intended to be a true
+    HTTP cache, respecting cache headers/etc. If you are looking for that
+    behavior, there are httpx libraries for that explicit purpose.
+
+    Instead, the purpose of this cache is to make it possible to test scrapers
+    locally without making hundreds of redundant requests.
+
+    The strategy is configurable via `cache_keyfunc` and `should_cache`.
+
+    The default strategy is simple:
+    cache all GET requests that result in 200s, with no expiry.
+
+    This works well for the case where you have hundreds of pages to scrape
+    and want to make scraper adjustments without repeatedly making hits.
+
+    It should *NOT* be used in production without adjusting these rules.
+    """
     if client is None:
         client = Client()
 
@@ -88,23 +103,34 @@ def make_dev_caching_client(
     client._should_cache = should_cache
     client._write_only = write_only
 
-    client._wrapped_request = client.request
+    client._no_cache_request = client.request
     client.request = types.MethodType(
         functools.wraps(client.request)(_cached_request), client
     )
     return client
 
 
-class CacheStorageBase:
+class CacheStorageBase(abc.ABC):
+    @abc.abstractmethod
     def get(self, key: str) -> None | Response:
         raise NotImplementedError()
 
+    @abc.abstractmethod
     def set(self, key: str, response: Response) -> None:
        raise NotImplementedError()
 
 
 class MemoryCache(CacheStorageBase):
-    """
+    """
+    In memory cache for request responses.
+
+    Example:
+
+        make_careful_client(
+            cache_storage=MemoryCache(),
+        )
+
+    """
 
     def __init__(self) -> None:
         self.cache: dict[str, Response] = {}
@@ -122,11 +148,21 @@ class FileCache(CacheStorageBase):
     """
     File-based cache for request responses.
 
-    :param cache_dir: directory for storing responses
-
-
+    Parameters:
+        cache_dir: directory for storing responses
+
+    Example:
+
+        make_careful_client(
+            cache_storage=FileCache("_httpcache/"),
+        )
+
     """
 
+    # TODO: restore?
+    # check_last_modified: set to True to compare last-modified
+    # timestamp in cached response with value from HEAD request
+
     # file name escaping inspired by httplib2
     _prefix = re.compile(r"^\w+://")
     _illegal = re.compile(r"[?/:|]+")
@@ -188,7 +224,7 @@ class FileCache(CacheStorageBase):
         # status & encoding will be in headers, but are faked
         # need to split spaces out of status to get code (e.g. '200 OK')
         resp = Response(
-            status_code
+            status_code=int(resp_headers.pop("status").split(" ")[0]),
             content=resp_content,
             default_encoding=resp_headers.pop("encoding"),
             headers=resp_headers,
@@ -224,13 +260,18 @@ class FileCache(CacheStorageBase):
             os.remove(fname)
 
 
-class SQLiteCache(CacheStorageBase):
-    """
+class SqliteCache(CacheStorageBase):
+    """
+    sqlite cache for request responses.
+
+    Parameters:
+        cache_path: path for SQLite database file
 
-    :param cache_path: path for SQLite database file
-    :param check_last_modified: set to True to compare last-modified
-        timestamp in cached response with value from HEAD request
+    Example:
 
+        make_careful_client(
+            cache_storage=SqliteCache("_cache.db"),
+        )
     """
 
     _columns = ["key", "status", "modified", "encoding", "data", "headers"]
@@ -284,7 +325,12 @@ class SQLiteCache(CacheStorageBase):
         # if rec["modified"] != new_lm:
         #     return None
 
-        resp = Response(
+        resp = Response(
+            rec["status"],
+            content=rec["data"],
+            default_encoding=rec["encoding"],
+            headers=json.loads(rec["headers"]),
+        )
         return resp
 
     def clear(self) -> None:
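Editor's note: `CacheStorageBase` is now an ABC requiring only `get` and `set`, so custom backends stay small. A sketch of a hypothetical size-capped in-memory store (`BoundedMemoryCache` and `max_items` are illustrative names, not part of the package; `CacheStorageBase` is assumed importable from `careful.httpx`, where the diff above imports it):

```python
from collections import OrderedDict

from httpx import Response
from careful.httpx import CacheStorageBase  # imported into careful.httpx per the diff above


class BoundedMemoryCache(CacheStorageBase):
    """Hypothetical backend: keep at most max_items responses, evicting the oldest."""

    def __init__(self, max_items: int = 1000) -> None:
        self.max_items = max_items
        self._store: OrderedDict[str, Response] = OrderedDict()

    def get(self, key: str) -> None | Response:
        return self._store.get(key)

    def set(self, key: str, response: Response) -> None:
        self._store[key] = response
        if len(self._store) > self.max_items:
            self._store.popitem(last=False)  # drop the oldest entry
```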
{careful-0.1.0 → careful-0.2.1}/src/careful/httpx/retries.py
CHANGED
@@ -2,13 +2,22 @@ import time
 import types
 import functools
 import logging
-from httpx import Client, Response
+from httpx import Client, Response, HTTPError
 
 log = logging.getLogger("httpx")
 
 
-def _default_accept_response(response: Response) -> bool:
-
+def retry_default_rule(response: Response) -> bool:
+    # default behavior is to retry 400s and 500s but not 404s
+    return response.status_code >= 400 and response.status_code != 404
+
+
+def retry_only_500s(response: Response) -> bool:
+    return response.status_code >= 500
+
+
+def retry_all_400s_500s(response: Response) -> bool:
+    return response.status_code >= 400
 
 
 def _retry_request(client: Client, *args, **kwargs):
@@ -20,24 +29,21 @@ def _retry_request(client: Client, *args, **kwargs):
     exception_raised = None
 
     try:
-        resp = client._wrapped_request(*args, **kwargs)
+        tries += 1
+        resp = client._no_retry_request(*args, **kwargs)
 
         # break from loop on an accepted response
-        if client._accept_response(resp) or (
-            resp.status_code == 404 and not client._retry_on_404
-        ):
+        if not client._should_retry(resp):
             break
 
-    except Exception as e:
-        # TODO: exclude certain kinds of exceptions (SSL?) from retry
+    except HTTPError as e:
         exception_raised = e
 
         if exception_response := getattr(e, "response", None):
-            if client._accept_response(exception_response):
+            if not client._should_retry(exception_response):
                 break
 
     # if we're going to retry, sleep first
-    tries += 1
     if tries <= client._retry_attempts:
         # twice as long each time
         wait = client._retry_wait_seconds * (2 ** (tries - 1))
@@ -68,20 +74,17 @@ def make_retry_client(
     client: Client | None = None,
     attempts: int = 1,
     wait_seconds: float = 10,
-    retry_on_404: bool = False,
-    accept_response=_default_accept_response,
+    should_retry=retry_default_rule,
 ):
     if client is None:
         client = Client()
     client._retry_attempts = max(0, attempts)
     client._retry_wait_seconds = wait_seconds
-    client._retry_on_404 = retry_on_404
-    client._accept_response = accept_response
+    client._should_retry = should_retry
 
-    client._wrapped_request = client.request
+    client._no_retry_request = client.request
     client.request = types.MethodType(
         functools.wraps(client.request)(_retry_request), client
     )
 
     return client
-
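Editor's note: the retry predicates above are plain functions of an `httpx.Response`, so callers can supply their own, and the wait between attempts doubles each time per `wait = client._retry_wait_seconds * (2 ** (tries - 1))`. A sketch of a hypothetical predicate (`retry_429_and_500s` is an illustrative name, not part of the package) plus the implied wait schedule:

```python
from httpx import Response
from careful.httpx import make_retry_client


def retry_429_and_500s(response: Response) -> bool:
    # hypothetical rule: retry rate limits and server errors, nothing else
    return response.status_code == 429 or response.status_code >= 500


client = make_retry_client(attempts=3, wait_seconds=10, should_retry=retry_429_and_500s)

# waits before retries double each time: 10s, 20s, 40s
assert [10 * 2 ** (tries - 1) for tries in (1, 2, 3)] == [10, 20, 40]
```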
{careful-0.1.0 → careful-0.2.1}/src/careful/httpx/throttle.py
CHANGED
@@ -16,7 +16,7 @@ def _throttle_request(client: Client, *args, **kwargs):
         client._last_request = time.time()
     else:
         client._last_request = now
-    return client._wrapped_request(*args, **kwargs)
+    return client._no_throttle_request(*args, **kwargs)
 
 
 def make_throttled_client(
@@ -34,7 +34,7 @@ def make_throttled_client(
     client._requests_per_minute = requests_per_minute
     client._request_frequency = 60.0 / requests_per_minute
 
-    client._wrapped_request = client.request
+    client._no_throttle_request = client.request
     client.request = types.MethodType(
         functools.wraps(client.request)(_throttle_request), client
     )
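Editor's note: the throttle's pacing follows directly from `_request_frequency = 60.0 / requests_per_minute`: each request is spaced by roughly that many seconds. A quick check of the arithmetic:

```python
# seconds between requests at a few example rates (pure arithmetic, no I/O)
for rpm in (30, 60, 120):
    print(f"{rpm} rpm -> {60.0 / rpm} s between requests")
# 30 rpm -> 2.0 s, 60 rpm -> 1.0 s, 120 rpm -> 0.5 s
```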
{careful-0.1.0 → careful-0.2.1}/tests/test_cache.py
CHANGED
@@ -1,6 +1,6 @@
 from pytest_httpbin.serve import Server  # type: ignore
 from httpx import Response
-from careful.httpx import make_dev_caching_client, MemoryCache, FileCache, SQLiteCache
+from careful.httpx import make_dev_caching_client, MemoryCache, FileCache, SqliteCache
 
 
 def test_dev_caching(httpbin: Server) -> None:
@@ -59,7 +59,7 @@ def test_file_cache() -> None:
 
 
 def test_sqlite_cache() -> None:
-    sc = SQLiteCache("cache.db")
+    sc = SqliteCache("cache.db")
     sc.clear()
     _test_cache_storage(sc)
     sc.clear()
careful-0.2.1/tests/test_careful.py
ADDED
@@ -0,0 +1,47 @@
+from careful.httpx import make_careful_client, MemoryCache
+from unittest import mock
+from fakeresponse import FakeResponse
+
+
+def test_full_careful_client():
+    client = make_careful_client(
+        retry_attempts=3,
+        retry_wait_seconds=0.00001,
+        cache_storage=MemoryCache(),
+        requests_per_minute=60,
+    )
+
+    # On the first call return a 500, then a 200, then a 404
+    mock_send = mock.Mock(
+        side_effect=[
+            FakeResponse("http://dummy/", 500, "failure!"),
+            FakeResponse("http://dummy/", 200, "success!"),
+            FakeResponse("http://dummy/2", 404, "success!"),
+        ]
+    )
+
+    mock_sleep = mock.Mock()
+
+    # check that sleep is called
+    with mock.patch("time.sleep", mock_sleep):
+        with mock.patch.object(client, "send", mock_send):
+            resp = client.get("http://dummy/")
+
+            # demonstrates a retry
+            assert mock_send.call_count == 2
+            assert resp.status_code == 200
+            # sleep called by retry, not by throttle yet
+            assert mock_sleep.call_count == 1
+
+            # demonstrates a cache (no new call)
+            resp = client.get("http://dummy/")
+            assert mock_send.call_count == 2
+            assert resp.status_code == 200
+            assert mock_sleep.call_count == 1
+
+            # a new, throttled call (no retry)
+            resp = client.get("http://dummy/2")
+            assert mock_send.call_count == 3
+            assert resp.status_code == 404
+            # call was throttled
+            assert mock_sleep.call_count == 2
{careful-0.1.0 → careful-0.2.1}/tests/test_retries.py
CHANGED
@@ -1,4 +1,5 @@
 from careful.httpx import make_retry_client
+from careful.httpx.retries import retry_all_400s_500s
 from unittest import mock
 from fakeresponse import FakeResponse
 
@@ -14,7 +15,7 @@ def test_retry() -> None:
         ]
     )
 
-    with mock.patch.object(client, "_wrapped_request", mock_request):
+    with mock.patch.object(client, "send", mock_request):
         resp = client.get("http://dummy/")
         assert mock_request.call_count == 2
 
@@ -23,14 +24,16 @@ def test_retry() -> None:
         return_value=FakeResponse("http://dummy/", 500, "failure!")
     )
 
-    with mock.patch.object(client, "_wrapped_request", mock_request):
+    with mock.patch.object(client, "send", mock_request):
         resp = client.get("http://dummy/")
         assert resp.status_code == 500
         assert mock_request.call_count == 4  # try four times
 
 
 def test_retry_404() -> None:
-    client = make_retry_client(attempts=3, wait_seconds=0.001, retry_on_404=True)
+    client = make_retry_client(
+        attempts=3, wait_seconds=0.001, should_retry=retry_all_400s_500s
+    )
 
     # On the first call return a 404, then a 200
     mock_request = mock.Mock(
@@ -40,7 +43,7 @@ def test_retry_404() -> None:
         ]
     )
 
-    with mock.patch.object(client, "_wrapped_request", mock_request):
+    with mock.patch.object(client, "send", mock_request):
         resp = client.get("http://dummy/")  # type: ignore
         assert mock_request.call_count == 2
         assert resp.status_code == 200
@@ -51,7 +54,7 @@ def test_retry_404() -> None:
     )
 
     # four tries
-    with mock.patch.object(client, "_wrapped_request", mock_request):
+    with mock.patch.object(client, "send", mock_request):
         resp = client.get("http://dummy/")
         assert resp.status_code == 404
         assert mock_request.call_count == 4
@@ -59,7 +62,7 @@ def test_retry_404() -> None:
 
 
 def test_no_retry_404() -> None:
-    client = make_retry_client(attempts=3, wait_seconds=0.001, retry_on_404=False)
+    client = make_retry_client(attempts=3, wait_seconds=0.001)
 
     # On the first call return a 404, then a 200
     mock_request = mock.Mock(
@@ -69,7 +72,7 @@ def test_no_retry_404() -> None:
         ]
     )
 
-    with mock.patch.object(client, "_wrapped_request", mock_request):
+    with mock.patch.object(client, "send", mock_request):
         resp = client.get("http://dummy/")  # type: ignore
         assert mock_request.call_count == 1
         assert resp.status_code == 404
{careful-0.1.0 → careful-0.2.1}/tests/test_throttle.py
CHANGED
@@ -1,11 +1,12 @@
 from careful.httpx import make_throttled_client
 from unittest import mock
 from typing import Any
+from httpx import Request
 from fakeresponse import FakeResponse
 
 
-def request_200(url, *args: Any, **kwargs: Any) -> FakeResponse:
-    return FakeResponse(url, 200, b"ok")
+def request_200(request: Request, *args: Any, **kwargs: Any) -> FakeResponse:
+    return FakeResponse(request.url, 200, b"ok")
 
 
 mock_200 = mock.Mock(wraps=request_200)
@@ -18,12 +19,10 @@ def test_request_throttling() -> None:
 
     # check that sleep is called on call 2 & 3
     with mock.patch("time.sleep", mock_sleep):
-        with mock.patch.object(client, "_wrapped_request", mock_200):
+        with mock.patch.object(client, "send", mock_200):
             client.get("http://dummy/")
             client.get("http://dummy/")
             client.get("http://dummy/")
             assert mock_sleep.call_count == 2
             # should have slept for ~2 seconds to aim at 30 per min
             assert 1.8 <= mock_sleep.call_args[0][0] <= 2.2
-
-
careful-0.1.0/PKG-INFO
DELETED
@@ -1,48 +0,0 @@
-Metadata-Version: 2.4
-Name: careful
-Version: 0.1.0
-Summary: Add your description here
-Author-email: jpt <dev@jpt.sh>
-License-File: LICENSE
-Requires-Python: >=3.13
-Requires-Dist: httpx>=0.28.1
-Requires-Dist: pytest-httpbin>=2.1.0
-Requires-Dist: pytest>=8.4.2
-Description-Content-Type: text/markdown
-
-**careful_httpx** is a library for making requests to less-than-reliable websites.
-
-It is based on [scrapelib](https://pypi.org/scrapelib/), which has powered Open States & many other Python scrapers for over 15 years.
-
-Code: <https://codeberg.org/jpt/careful_httpx>
-
-Documentation: TODO
-
-## Features
-
-Enhances [`httpx.Client`](https://www.python-httpx.org) with features useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
-
-- retries
-- throttling
-- dev-cache for iterating on scrapers
-
-### example
-
-TODO
-
-### features this has that scrapelib doesn't
-
-- httpx support
-- composable interface, can augment Client with just the enhancements you want
-
-TODO: don't allow instantiating bad patch classes, and check for incompatible configs
-
-### features scrapelib had that this doesn't
-
-Open to considering if there is interest, but didn't seem necessary.
-
-- HTTP(S) and FTP requests via an identical API
-- allow setting custom ciphers
-- have urlretrieve
-- support FTP
-- set custom user-agent/mess w/ headers
careful-0.1.0/README.md
DELETED
@@ -1,36 +0,0 @@
-**careful_httpx** is a library for making requests to less-than-reliable websites.
-
-It is based on [scrapelib](https://pypi.org/scrapelib/), which has powered Open States & many other Python scrapers for over 15 years.
-
-Code: <https://codeberg.org/jpt/careful_httpx>
-
-Documentation: TODO
-
-## Features
-
-Enhances [`httpx.Client`](https://www.python-httpx.org) with features useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
-
-- retries
-- throttling
-- dev-cache for iterating on scrapers
-
-### example
-
-TODO
-
-### features this has that scrapelib doesn't
-
-- httpx support
-- composable interface, can augment Client with just the enhancements you want
-
-TODO: don't allow instantiating bad patch classes, and check for incompatible configs
-
-### features scrapelib had that this doesn't
-
-Open to considering if there is interest, but didn't seem necessary.
-
-- HTTP(S) and FTP requests via an identical API
-- allow setting custom ciphers
-- have urlretrieve
-- support FTP
-- set custom user-agent/mess w/ headers
careful-0.1.0/pyproject.toml
DELETED
@@ -1,19 +0,0 @@
-[project]
-name = "careful"
-version = "0.1.0"
-description = "Add your description here"
-readme = "README.md"
-authors = [
-    { name = "jpt", email = "dev@jpt.sh" }
-]
-requires-python = ">=3.13"
-dependencies = [
-    "httpx>=0.28.1",
-    "pytest>=8.4.2",
-    "pytest-httpbin>=2.1.0",
-]
-
-
-[build-system]
-requires = ["hatchling"]
-build-backend = "hatchling.build"
careful-0.1.0/src/careful/httpx/__init__.py
DELETED
@@ -1,59 +0,0 @@
-from .retries import make_retry_client, _default_accept_response
-from .throttle import make_throttled_client
-from .dev_cache import (
-    make_dev_caching_client,
-    MemoryCache,
-    FileCache,
-    SQLiteCache,
-    _cache_200s,
-    _default_keyfunc,
-)
-from httpx import Client
-
-
-def make_careful_client(
-    client: Client,
-    *,
-    retry_attempts: int = 0,
-    retry_wait_seconds: float = 10,
-    retry_on_404: bool = False,
-    accept_response=_default_accept_response,
-    requests_per_minute: int = 0,
-    cache_storage=None,
-    cache_write_only=False,
-    should_cache=_cache_200s,
-    cache_keyfunc=_default_keyfunc,
-):
-    # order matters, retry on inside b/c it is last-chance scenario
-    if retry_attempts:
-        client = make_retry_client(
-            client=client,
-            attempts=retry_attempts,
-            wait_seconds=retry_wait_seconds,
-            retry_on_404=retry_on_404,
-            accept_response=accept_response,
-        )
-    # throttling around retries
-    if requests_per_minute:
-        client = make_throttled_client(client, requests_per_minute=requests_per_minute)
-    # caching on top layer, so cache will be checked first
-    if cache_storage:
-        client = make_dev_caching_client(
-            client=client,
-            cache_storage=cache_storage,
-            cache_keyfunc=cache_keyfunc,
-            should_cache=should_cache,
-            write_only=cache_write_only,
-        )
-
-    return client
-
-
-__all__ = [
-    "make_retry_client",
-    "make_throttled_client",
-    "make_dev_caching_client",
-    "MemoryCache",
-    "FileCache",
-    "SQLiteCache",
-]
{careful-0.1.0 → careful-0.2.1}/.pre-commit-config.yaml
File without changes
{careful-0.1.0 → careful-0.2.1}/LICENSE
File without changes
{careful-0.1.0 → careful-0.2.1}/src/careful/__init__.py
File without changes
{careful-0.1.0 → careful-0.2.1}/src/careful/httpx/py.typed
File without changes