careful-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- careful-0.1.0/.gitignore +14 -0
- careful-0.1.0/.pre-commit-config.yaml +25 -0
- careful-0.1.0/LICENSE +24 -0
- careful-0.1.0/PKG-INFO +48 -0
- careful-0.1.0/README.md +36 -0
- careful-0.1.0/pyproject.toml +19 -0
- careful-0.1.0/src/careful/__init__.py +0 -0
- careful-0.1.0/src/careful/httpx/__init__.py +59 -0
- careful-0.1.0/src/careful/httpx/dev_cache.py +296 -0
- careful-0.1.0/src/careful/httpx/py.typed +0 -0
- careful-0.1.0/src/careful/httpx/retries.py +87 -0
- careful-0.1.0/src/careful/httpx/throttle.py +41 -0
- careful-0.1.0/tests/fakeresponse.py +17 -0
- careful-0.1.0/tests/test_cache.py +65 -0
- careful-0.1.0/tests/test_careful.py +4 -0
- careful-0.1.0/tests/test_retries.py +85 -0
- careful-0.1.0/tests/test_throttle.py +29 -0
careful-0.1.0/.pre-commit-config.yaml
ADDED
@@ -0,0 +1,25 @@
+# updated 2025-04-16
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.11.5
+    hooks:
+      - id: ruff
+      - id: ruff-format
+
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0  # Use the ref you want to point at
+    hooks:
+      - id: trailing-whitespace
+      - id: check-added-large-files
+        args: ['--maxkb=1024']
+      - id: check-case-conflict
+      - id: check-executables-have-shebangs
+      - id: check-json
+      - id: check-merge-conflict
+      - id: check-symlinks
+      - id: check-toml
+      - id: check-yaml
+      - id: debug-statements
+      - id: forbid-submodules
+      - id: mixed-line-ending
+      #- id: no-commit-to-branch
careful-0.1.0/LICENSE
ADDED
@@ -0,0 +1,24 @@
+Copyright (c) 2025, James Turk
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+  this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
careful-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,48 @@
+Metadata-Version: 2.4
+Name: careful
+Version: 0.1.0
+Summary: Add your description here
+Author-email: jpt <dev@jpt.sh>
+License-File: LICENSE
+Requires-Python: >=3.13
+Requires-Dist: httpx>=0.28.1
+Requires-Dist: pytest-httpbin>=2.1.0
+Requires-Dist: pytest>=8.4.2
+Description-Content-Type: text/markdown
+
+**careful_httpx** is a library for making requests to less-than-reliable websites.
+
+It is based on [scrapelib](https://pypi.org/project/scrapelib/), which has powered Open States & many other Python scrapers for over 15 years.
+
+Code: <https://codeberg.org/jpt/careful_httpx>
+
+Documentation: TODO
+
+## Features
+
+Enhances [`httpx.Client`](https://www.python-httpx.org) with features useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+
+- retries
+- throttling
+- dev-cache for iterating on scrapers
+
+### example
+
+TODO
+
+### features this has that scrapelib doesn't
+
+- httpx support
+- composable interface, can augment Client with just the enhancements you want
+
+TODO: don't allow instantiating bad patch classes, and check for incompatible configs
+
+### features scrapelib had that this doesn't
+
+Open to considering if there is interest, but didn't seem necessary.
+
+- HTTP(S) and FTP requests via an identical API
+- allow setting custom ciphers
+- have urlretrieve
+- support FTP
+- set custom user-agent/mess w/ headers
careful-0.1.0/README.md
ADDED
@@ -0,0 +1,36 @@
+**careful_httpx** is a library for making requests to less-than-reliable websites.
+
+It is based on [scrapelib](https://pypi.org/project/scrapelib/), which has powered Open States & many other Python scrapers for over 15 years.
+
+Code: <https://codeberg.org/jpt/careful_httpx>
+
+Documentation: TODO
+
+## Features
+
+Enhances [`httpx.Client`](https://www.python-httpx.org) with features useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+
+- retries
+- throttling
+- dev-cache for iterating on scrapers
+
+### example
+
+TODO
+
+### features this has that scrapelib doesn't
+
+- httpx support
+- composable interface, can augment Client with just the enhancements you want
+
+TODO: don't allow instantiating bad patch classes, and check for incompatible configs
+
+### features scrapelib had that this doesn't
+
+Open to considering if there is interest, but didn't seem necessary.
+
+- HTTP(S) and FTP requests via an identical API
+- allow setting custom ciphers
+- have urlretrieve
+- support FTP
+- set custom user-agent/mess w/ headers
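The `### example` section in the README above is still TODO. As a stand-in, here is a minimal usage sketch assuming only the names exported from `careful.httpx` below; the URL and tuning values are illustrative, not from the package:

# hypothetical usage sketch: wrap a plain httpx.Client with all three layers
import httpx
from careful.httpx import make_careful_client, MemoryCache

client = make_careful_client(
    httpx.Client(),
    retry_attempts=2,             # up to 2 extra tries per request
    retry_wait_seconds=5,         # first retry sleeps 5s, doubling each time
    requests_per_minute=30,       # i.e. at most one request every 2 seconds
    cache_storage=MemoryCache(),  # dev cache: 200s are replayed from memory
)

resp = client.get("https://example.com/")
print(resp.status_code, resp.fromcache)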
careful-0.1.0/pyproject.toml
ADDED
@@ -0,0 +1,19 @@
+[project]
+name = "careful"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+authors = [
+    { name = "jpt", email = "dev@jpt.sh" }
+]
+requires-python = ">=3.13"
+dependencies = [
+    "httpx>=0.28.1",
+    "pytest>=8.4.2",
+    "pytest-httpbin>=2.1.0",
+]
+
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
careful-0.1.0/src/careful/__init__.py
File without changes
careful-0.1.0/src/careful/httpx/__init__.py
ADDED
@@ -0,0 +1,59 @@
+from .retries import make_retry_client, _default_accept_response
+from .throttle import make_throttled_client
+from .dev_cache import (
+    make_dev_caching_client,
+    MemoryCache,
+    FileCache,
+    SQLiteCache,
+    _cache_200s,
+    _default_keyfunc,
+)
+from httpx import Client
+
+
+def make_careful_client(
+    client: Client,
+    *,
+    retry_attempts: int = 0,
+    retry_wait_seconds: float = 10,
+    retry_on_404: bool = False,
+    accept_response=_default_accept_response,
+    requests_per_minute: int = 0,
+    cache_storage=None,
+    cache_write_only=False,
+    should_cache=_cache_200s,
+    cache_keyfunc=_default_keyfunc,
+):
+    # order matters, retry on inside b/c it is last-chance scenario
+    if retry_attempts:
+        client = make_retry_client(
+            client=client,
+            attempts=retry_attempts,
+            wait_seconds=retry_wait_seconds,
+            retry_on_404=retry_on_404,
+            accept_response=accept_response,
+        )
+    # throttling around retries
+    if requests_per_minute:
+        client = make_throttled_client(client=client, requests_per_minute=requests_per_minute)  # keyword-only args
+    # caching on top layer, so cache will be checked first
+    if cache_storage:
+        client = make_dev_caching_client(
+            client=client,
+            cache_storage=cache_storage,
+            cache_keyfunc=cache_keyfunc,
+            should_cache=should_cache,
+            write_only=cache_write_only,
+        )
+
+    return client
+
+
+__all__ = [
+    "make_retry_client",
+    "make_throttled_client",
+    "make_dev_caching_client",
+    "MemoryCache",
+    "FileCache",
+    "SQLiteCache",
+]
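Because each make_* helper patches client.request in place, the factory above is equivalent to applying the wrappers by hand, innermost first. A sketch of manual composition with just two of the layers (values illustrative):

import httpx
from careful.httpx import make_retry_client, make_throttled_client

client = httpx.Client()
# innermost layer: retries are the last-chance handler around the raw request
client = make_retry_client(client=client, attempts=3, wait_seconds=1)
# outer layer: throttling spaces out each top-level request
client = make_throttled_client(client=client, requests_per_minute=60)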
careful-0.1.0/src/careful/httpx/dev_cache.py
ADDED
@@ -0,0 +1,296 @@
+import types
+import functools
+import logging
+import re
+import os
+import glob
+import hashlib
+import sqlite3
+import json
+
+from httpx import Client, Response, Request
+
+log = logging.getLogger("httpx")
+
+
+def _default_keyfunc(
+    method: str,
+    url: str,
+    params: dict | None = None,
+) -> str | None:
+    """
+    Return a cache key from a given set of request parameters.
+
+    Default behavior is to return a complete URL for all GET
+    requests, and None otherwise.
+    """
+    if method.lower() != "get":
+        return None
+
+    # str() so the return type matches the annotation (httpx.URL otherwise)
+    return str(Request(url=url, method=method, params=params).url)
+
+
+def _cache_200s(response: Response) -> bool:
+    """
+    Check if a given Response object should be cached.
+
+    Default behavior is to only cache responses with a 200 status code.
+    """
+    return response.status_code == 200
+
+
+def _cached_request(client: Client, *args, **kwargs):
+    # short circuit if cache isn't configured
+    if not client._cache_storage:
+        log.debug("bypassing cache, no storage configured")
+        resp = client._wrapped_request(*args, **kwargs)
+        resp.fromcache = False
+        return resp
+
+    method, url = args
+    # use .get() so direct client.request() calls without params don't KeyError
+    request_key = client._cache_keyfunc(method, url, kwargs.get("params"))
+
+    # check cache for response
+    cached_resp = None
+    if request_key and not client._write_only:
+        cached_resp = client._cache_storage.get(request_key)
+
+    if cached_resp:
+        # resp = cast(CacheResponse, resp_maybe)
+        log.info("using cached response request_key=%s", request_key)
+        cached_resp.fromcache = True
+        resp = cached_resp
+    else:
+        resp = client._wrapped_request(*args, **kwargs)
+        # save to cache if request and response meet criteria
+        log.debug("cache miss request_key=%s should_cache=%s", request_key, client._should_cache(resp))
+        if request_key and client._should_cache(resp):
+            client._cache_storage.set(request_key, resp)
+            log.info("caching response request_key=%s", request_key)
+        resp.fromcache = False
+
+    return resp
+
+
+def make_dev_caching_client(
+    *,
+    client: Client | None = None,
+    cache_storage=None,
+    cache_keyfunc=_default_keyfunc,
+    should_cache=_cache_200s,
+    write_only=False,
+):
+    if client is None:
+        client = Client()
+
+    client._cache_storage = cache_storage
+    client._cache_keyfunc = cache_keyfunc
+    client._should_cache = should_cache
+    client._write_only = write_only
+
+    client._wrapped_request = client.request
+    client.request = types.MethodType(
+        functools.wraps(client.request)(_cached_request), client
+    )
+    return client
+
+
+class CacheStorageBase:
+    def get(self, key: str) -> None | Response:
+        raise NotImplementedError()
+
+    def set(self, key: str, response: Response) -> None:
+        raise NotImplementedError()
+
+
+class MemoryCache(CacheStorageBase):
+    """In memory cache for request responses."""
+
+    def __init__(self) -> None:
+        self.cache: dict[str, Response] = {}
+
+    def get(self, key: str) -> None | Response:
+        """Get cache entry for key, or return None."""
+        return self.cache.get(key, None)
+
+    def set(self, key: str, response: Response) -> None:
+        """Set cache entry for key with contents of response."""
+        self.cache[key] = response
+
+
+class FileCache(CacheStorageBase):
+    """
+    File-based cache for request responses.
+
+    :param cache_dir: directory for storing responses
+    :param check_last_modified: set to True to compare last-modified
+        timestamp in cached response with value from HEAD request
+    """
+
+    # file name escaping inspired by httplib2
+    _prefix = re.compile(r"^\w+://")
+    _illegal = re.compile(r"[?/:|]+")
+    _header_re = re.compile(r"([-\w]+): (.*)")
+    _maxlen = 200
+
+    def _clean_key(self, key: str) -> str:
+        # strip scheme
+        md5 = hashlib.md5(key.encode("utf8")).hexdigest()
+        key = self._prefix.sub("", key)
+        key = self._illegal.sub(",", key)
+        return ",".join((key[: self._maxlen], md5))
+
+    def __init__(self, cache_dir: str, check_last_modified: bool = False):
+        # normalize path
+        self.cache_dir = os.path.join(os.getcwd(), cache_dir)
+        self.check_last_modified = check_last_modified
+        # create directory
+        if not os.path.isdir(self.cache_dir):
+            os.makedirs(self.cache_dir)
+
+    def get(self, orig_key: str) -> None | Response:
+        """Get cache entry for key, or return None."""
+        key = self._clean_key(orig_key)
+        path = os.path.join(self.cache_dir, key)
+        resp_headers = {}
+
+        try:
+            with open(path, "rb") as f:
+                # read lines one at a time
+                while True:
+                    line = f.readline().decode("utf8").strip("\r\n")
+                    # set headers
+
+                    # if self.check_last_modified and re.search(
+                    #     "last-modified", line, flags=re.I
+                    # ):
+                    #     # line contains last modified header
+                    #     head_resp = requests.head(orig_key)
+
+                    #     try:
+                    #         new_lm = head_resp.headers["last-modified"]
+                    #         old_lm = line[line.find(":") + 1 :].strip()
+                    #         if old_lm != new_lm:
+                    #             # last modified timestamps don't match, need to download again
+                    #             return None
+                    #     except KeyError:
+                    #         # no last modified header present, so redownload
+                    #         return None
+
+                    header = self._header_re.match(line)
+                    if header:
+                        resp_headers[header.group(1)] = header.group(2)
+                    else:
+                        break
+                # everything left is the real content
+                resp_content = f.read()
+
+                # status & encoding will be in headers, but are faked
+                # need to split spaces out of status to get code (e.g. '200 OK')
+                resp = Response(
+                    status_code=int(resp_headers.pop("status").split(" ")[0]),
+                    content=resp_content,
+                    default_encoding=resp_headers.pop("encoding"),
+                    headers=resp_headers,
+                )
+                return resp
+        except IOError:
+            return None
+
+    def set(self, key: str, response: Response) -> None:
+        """Set cache entry for key with contents of response."""
+        key = self._clean_key(key)
+        path = os.path.join(self.cache_dir, key)
+
+        with open(path, "wb") as f:
+            status_str = "status: {0}\n".format(response.status_code)
+            f.write(status_str.encode("utf8"))
+            encoding_str = "encoding: {0}\n".format(response.encoding)
+            f.write(encoding_str.encode("utf8"))
+            for h, v in response.headers.items():
+                # header: value\n
+                f.write(h.encode("utf8"))
+                f.write(b": ")
+                f.write(v.encode("utf8"))
+                f.write(b"\n")
+            # one blank line
+            f.write(b"\n")
+            f.write(response.content)
+
+    def clear(self) -> None:
+        # only delete things that end w/ a md5, less dangerous this way
+        cache_glob = "*," + ("[0-9a-f]" * 32)
+        for fname in glob.glob(os.path.join(self.cache_dir, cache_glob)):
+            os.remove(fname)
+
+
+class SQLiteCache(CacheStorageBase):
+    """SQLite cache for request responses.
+
+    :param cache_path: path for SQLite database file
+    :param check_last_modified: set to True to compare last-modified
+        timestamp in cached response with value from HEAD request
+    """
+
+    _columns = ["key", "status", "modified", "encoding", "data", "headers"]
+
+    def __init__(self, cache_path: str, check_last_modified: bool = False):
+        self.cache_path = cache_path
+        self.check_last_modified = check_last_modified
+        self._conn = sqlite3.connect(cache_path)
+        self._conn.text_factory = str
+        self._build_table()
+
+    def _build_table(self) -> None:
+        """Create table for storing request information and response."""
+        self._conn.execute(
+            """CREATE TABLE IF NOT EXISTS cache
+               (key text UNIQUE, status integer, modified text,
+                encoding text, data blob, headers blob)"""
+        )
+
+    def set(self, key: str, response: Response) -> None:
+        """Set cache entry for key with contents of response."""
+        mod = response.headers.pop("last-modified", None)
+        status = int(response.status_code)
+        rec = (
+            key,
+            status,
+            mod,
+            response.encoding,
+            response.content,
+            json.dumps(dict(response.headers)),
+        )
+        with self._conn:
+            self._conn.execute("DELETE FROM cache WHERE key=?", (key,))
+            self._conn.execute("INSERT INTO cache VALUES (?,?,?,?,?,?)", rec)
+
+    def get(self, key: str) -> None | Response:
+        """Get cache entry for key, or return None."""
+        query = self._conn.execute("SELECT * FROM cache WHERE key=?", (key,))
+        rec = query.fetchone()
+        if rec is None:
+            return None
+        rec = dict(zip(self._columns, rec))
+
+        # TODO evaluate/remove?
+        # if self.check_last_modified:
+        #     if rec["modified"] is None:
+        #         return None  # no last modified header present, so redownload
+
+        #     head_resp = requests.head(key)
+        #     new_lm = head_resp.headers.get("last-modified", None)
+        #     if rec["modified"] != new_lm:
+        #         return None
+
+        resp = Response(
+            rec["status"],
+            content=rec["data"],
+            default_encoding=rec["encoding"],
+            headers=json.loads(rec["headers"]),
+        )
+        return resp
+
+    def clear(self) -> None:
+        """Remove all records from cache."""
+        with self._conn:
+            self._conn.execute("DELETE FROM cache")
+
+    def __del__(self) -> None:
+        self._conn.close()
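should_cache and cache_keyfunc are the two policy hooks of the dev cache. A sketch of swapping both (the "_ts" parameter name and "dev_cache" directory are hypothetical):

from httpx import Request, Response
from careful.httpx import make_dev_caching_client, FileCache

def cache_200s_and_404s(response: Response) -> bool:
    # also remember 404s, so a long scrape doesn't re-request known-missing pages
    return response.status_code in (200, 404)

def ignore_cachebuster(method: str, url: str, params: dict | None = None) -> str | None:
    # like _default_keyfunc, but drops a volatile "_ts" query parameter
    # so it doesn't fragment the cache
    if method.lower() != "get":
        return None
    params = {k: v for k, v in (params or {}).items() if k != "_ts"}
    return str(Request(method=method, url=url, params=params).url)

client = make_dev_caching_client(
    cache_storage=FileCache("dev_cache"),
    should_cache=cache_200s_and_404s,
    cache_keyfunc=ignore_cachebuster,
)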
careful-0.1.0/src/careful/httpx/py.typed
File without changes
careful-0.1.0/src/careful/httpx/retries.py
ADDED
@@ -0,0 +1,87 @@
+import time
+import types
+import functools
+import logging
+from httpx import Client, Response
+
+log = logging.getLogger("httpx")
+
+
+def _default_accept_response(response: Response) -> bool:
+    return response.status_code < 400
+
+
+def _retry_request(client: Client, *args, **kwargs):
+    # the retry loop
+    tries = 0
+    exception_raised = None
+
+    while tries <= client._retry_attempts:
+        exception_raised = None
+
+        try:
+            resp = client._wrapped_request(*args, **kwargs)
+
+            # break from loop on an accepted response
+            if client._accept_response(resp) or (
+                resp.status_code == 404 and not client._retry_on_404
+            ):
+                break
+
+        except Exception as e:
+            # TODO: exclude certain kinds of exceptions (SSL?) from retry
+            exception_raised = e
+
+            if exception_response := getattr(e, "response", None):
+                if client._accept_response(exception_response):
+                    break
+
+        # if we're going to retry, sleep first
+        tries += 1
+        if tries <= client._retry_attempts:
+            # twice as long each time
+            wait = client._retry_wait_seconds * (2 ** (tries - 1))
+            if exception_raised:
+                log.info(
+                    "exception %s, sleeping for %s seconds before retry #%s",
+                    exception_raised,
+                    wait,
+                    tries,
+                )
+            else:
+                log.info(
+                    "response %s, sleeping for %s seconds before retry #%s",
+                    resp,
+                    wait,
+                    tries,
+                )
+            time.sleep(wait)
+
+    # out of the loop, either an exception was raised or we had a success
+    if exception_raised:
+        raise exception_raised
+    return resp
+
+
+def make_retry_client(
+    *,
+    client: Client | None = None,
+    attempts: int = 1,
+    wait_seconds: float = 10,
+    retry_on_404: bool = False,
+    accept_response=_default_accept_response,
+):
+    if client is None:
+        client = Client()
+    client._retry_attempts = max(0, attempts)
+    client._retry_wait_seconds = wait_seconds
+    client._retry_on_404 = retry_on_404
+    client._accept_response = accept_response
+
+    client._wrapped_request = client.request
+    client.request = types.MethodType(
+        functools.wraps(client.request)(_retry_request), client
+    )
+
+    return client
+
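The backoff in _retry_request doubles on each attempt: sleep = wait_seconds * 2 ** (tries - 1). A response is retried whenever accept_response rejects it; a sketch of a custom predicate that retries only server errors (the name and values are illustrative):

from httpx import Response
from careful.httpx import make_retry_client

def accept_below_500(response: Response) -> bool:
    # 4xx responses are returned immediately; only 5xx are retried
    return response.status_code < 500

client = make_retry_client(attempts=3, wait_seconds=10, accept_response=accept_below_500)
# a persistent 5xx backs off exponentially: 10s, then 20s, then 40s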
careful-0.1.0/src/careful/httpx/throttle.py
ADDED
@@ -0,0 +1,41 @@
+import time
+import types
+import functools
+import logging
+from httpx import Client
+
+log = logging.getLogger("httpx")
+
+
+def _throttle_request(client: Client, *args, **kwargs):
+    now = time.time()
+    diff = client._request_frequency - (now - client._last_request)
+    if diff > 0:
+        log.debug("throttled, sleeping for %fs", diff)
+        time.sleep(diff)
+        client._last_request = time.time()
+    else:
+        client._last_request = now
+    return client._wrapped_request(*args, **kwargs)
+
+
+def make_throttled_client(
+    *,
+    client: Client | None = None,
+    requests_per_minute: float = 0,
+):
+    if requests_per_minute <= 0:
+        raise ValueError("requests per minute must be a positive number")
+
+    if client is None:
+        client = Client()
+
+    client._last_request = 0.0
+    client._requests_per_minute = requests_per_minute
+    client._request_frequency = 60.0 / requests_per_minute
+
+    client._wrapped_request = client.request
+    client.request = types.MethodType(
+        functools.wraps(client.request)(_throttle_request), client
+    )
+    return client
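The throttle converts requests_per_minute into a minimum spacing of 60 / rpm seconds and sleeps away whatever remains of that window before each call. A small sketch:

from careful.httpx import make_throttled_client

client = make_throttled_client(requests_per_minute=30)  # 60 / 30 = 2.0s spacing
# the first request goes out immediately; each subsequent back-to-back
# request sleeps the remainder of its 2-second window before being sent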
careful-0.1.0/tests/fakeresponse.py
ADDED
@@ -0,0 +1,17 @@
+
+class FakeResponse:
+    def __init__(
+        self,
+        url: str,
+        code: int,
+        content: str | bytes,
+        encoding: str = "utf-8",
+        headers: dict | None = None,
+    ):
+        self.url = url
+        self.status_code = code
+        self.content = content
+        self.text = str(content)
+        self.encoding = encoding
+        self.headers = headers or {}
+
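FakeResponse only carries the attributes the wrappers actually read (status_code, content, headers, ...), which is what lets it stand in for httpx.Response in the retry and throttle tests. For illustration (importing a private name purely to demonstrate):

from careful.httpx import _default_accept_response
from fakeresponse import FakeResponse

ok = FakeResponse("http://dummy/", 200, "ok")
bad = FakeResponse("http://dummy/", 500, "boom")
assert _default_accept_response(ok)       # 200 < 400 -> accepted as-is
assert not _default_accept_response(bad)  # 500 -> would trigger a retry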
careful-0.1.0/tests/test_cache.py
ADDED
@@ -0,0 +1,65 @@
+from pytest_httpbin.serve import Server  # type: ignore
+from httpx import Response
+from careful.httpx import make_dev_caching_client, MemoryCache, FileCache, SQLiteCache
+
+
+def test_dev_caching(httpbin: Server) -> None:
+    client = make_dev_caching_client(cache_storage=MemoryCache(), write_only=False)
+
+    resp = client.get(httpbin.url + "/status/200")
+    assert not resp.fromcache
+    resp = client.get(httpbin.url + "/status/200")
+    assert resp.fromcache
+
+
+def test_dev_caching_params(httpbin: Server) -> None:
+    client = make_dev_caching_client(cache_storage=MemoryCache(), write_only=False)
+
+    resp = client.get(httpbin.url + "/status/200?a=1&b=2")
+    assert not resp.fromcache
+    resp = client.get(httpbin.url + "/status/200?a=1&b=2")
+    assert resp.fromcache
+    resp = client.get(httpbin.url + "/status/200?a=1&b=3")
+    assert not resp.fromcache
+
+
+# test storages #####
+
+
+def _test_cache_storage(storage_obj) -> None:
+    # unknown key returns None
+    assert storage_obj.get("one") is None
+
+    _content_as_bytes = b"here's unicode: \xe2\x98\x83"
+    _content_as_unicode = "here's unicode: \u2603"
+
+    # set 'one'
+    resp = Response(200)
+    resp.headers["x-num"] = "one"
+    resp._content = _content_as_bytes
+    storage_obj.set("one", resp)
+    cached_resp = storage_obj.get("one")
+    assert cached_resp is not None
+    if cached_resp is not None:
+        assert cached_resp.headers["x-num"] == "one"
+        assert cached_resp.status_code == 200
+        cached_resp.encoding = "utf8"
+        assert cached_resp.text == _content_as_unicode
+
+
+def test_memory_cache() -> None:
+    _test_cache_storage(MemoryCache())
+
+
+def test_file_cache() -> None:
+    fc = FileCache("cache")
+    fc.clear()
+    _test_cache_storage(fc)
+    fc.clear()
+
+
+def test_sqlite_cache() -> None:
+    sc = SQLiteCache("cache.db")
+    sc.clear()
+    _test_cache_storage(sc)
+    sc.clear()
careful-0.1.0/tests/test_retries.py
ADDED
@@ -0,0 +1,85 @@
+from careful.httpx import make_retry_client
+from unittest import mock
+from fakeresponse import FakeResponse
+
+
+def test_retry() -> None:
+    client = make_retry_client(attempts=3, wait_seconds=0.001)
+
+    # On the first call return a 500, then a 200
+    mock_request = mock.Mock(
+        side_effect=[
+            FakeResponse("http://dummy/", 500, "failure!"),
+            FakeResponse("http://dummy/", 200, "success!"),
+        ]
+    )
+
+    with mock.patch.object(client, "_wrapped_request", mock_request):
+        resp = client.get("http://dummy/")
+        assert mock_request.call_count == 2
+
+    # 500 always
+    mock_request = mock.Mock(
+        return_value=FakeResponse("http://dummy/", 500, "failure!")
+    )
+
+    with mock.patch.object(client, "_wrapped_request", mock_request):
+        resp = client.get("http://dummy/")
+        assert resp.status_code == 500
+        assert mock_request.call_count == 4  # try four times
+
+
+def test_retry_404() -> None:
+    client = make_retry_client(attempts=3, wait_seconds=0.001, retry_on_404=True)
+
+    # On the first call return a 404, then a 200
+    mock_request = mock.Mock(
+        side_effect=[
+            FakeResponse("http://dummy/", 404, "failure!"),
+            FakeResponse("http://dummy/", 200, "success!"),
+        ]
+    )
+
+    with mock.patch.object(client, "_wrapped_request", mock_request):
+        resp = client.get("http://dummy/")  # type: ignore
+        assert mock_request.call_count == 2
+        assert resp.status_code == 200
+
+    # 404 always
+    mock_request = mock.Mock(
+        return_value=FakeResponse("http://dummy/", 404, "failure!")
+    )
+
+    # four tries
+    with mock.patch.object(client, "_wrapped_request", mock_request):
+        resp = client.get("http://dummy/")
+        assert resp.status_code == 404
+        assert mock_request.call_count == 4
+
+
+def test_no_retry_404() -> None:
+    client = make_retry_client(attempts=3, wait_seconds=0.001, retry_on_404=False)
+
+    # On the first call return a 404, then a 200
+    mock_request = mock.Mock(
+        side_effect=[
+            FakeResponse("http://dummy/", 404, "failure!"),
+            FakeResponse("http://dummy/", 200, "success!"),
+        ]
+    )
+
+    with mock.patch.object(client, "_wrapped_request", mock_request):
+        resp = client.get("http://dummy/")  # type: ignore
+        assert mock_request.call_count == 1
+        assert resp.status_code == 404
+
+
+# def test_retry_ssl() -> None:
+#     s = make_retry_client(retry_attempts=5, retry_wait_seconds=0.001, raise_errors=False)
+
+#     # ensure SSLError is considered fatal even w/ retries
+#     with mock.patch.object(requests.Session, "request", mock_sslerror):
+#         with pytest.raises(requests.exceptions.SSLError):
+#             s.get("http://dummy/", retry_on_404=True)  # type: ignore
+#     assert mock_sslerror.call_count == 1
careful-0.1.0/tests/test_throttle.py
ADDED
@@ -0,0 +1,29 @@
+from careful.httpx import make_throttled_client
+from unittest import mock
+from typing import Any
+from fakeresponse import FakeResponse
+
+
+def request_200(method: str, url: str, *args: Any, **kwargs: Any) -> FakeResponse:
+    return FakeResponse(url, 200, b"ok")
+
+
+mock_200 = mock.Mock(wraps=request_200)
+
+
+def test_request_throttling() -> None:
+    client = make_throttled_client(requests_per_minute=30)
+
+    mock_sleep = mock.Mock()
+
+    # check that sleep is called on call 2 & 3
+    with mock.patch("time.sleep", mock_sleep):
+        with mock.patch.object(client, "_wrapped_request", mock_200):
+            client.get("http://dummy/")
+            client.get("http://dummy/")
+            client.get("http://dummy/")
+            assert mock_sleep.call_count == 2
+            # should have slept for ~2 seconds to aim at 30 per min
+            assert 1.8 <= mock_sleep.call_args[0][0] <= 2.2