careful-0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
careful-0.1.0/.gitignore ADDED
@@ -0,0 +1,14 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+ uv.lock
+
+ # Virtual environments
+ .venv
+
+ # cache
+ cache.db
careful-0.1.0/.pre-commit-config.yaml ADDED
@@ -0,0 +1,25 @@
+ # updated 2025-04-16
+ repos:
+   - repo: https://github.com/astral-sh/ruff-pre-commit
+     rev: v0.11.5
+     hooks:
+       - id: ruff
+       - id: ruff-format
+
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v5.0.0  # Use the ref you want to point at
+     hooks:
+       - id: trailing-whitespace
+       - id: check-added-large-files
+         args: ['--maxkb=1024']
+       - id: check-case-conflict
+       - id: check-executables-have-shebangs
+       - id: check-json
+       - id: check-merge-conflict
+       - id: check-symlinks
+       - id: check-toml
+       - id: check-yaml
+       - id: debug-statements
+       - id: forbid-submodules
+       - id: mixed-line-ending
+       # - id: no-commit-to-branch
careful-0.1.0/LICENSE ADDED
@@ -0,0 +1,24 @@
+ Copyright (c) 2025, James Turk
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
careful-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,48 @@
+ Metadata-Version: 2.4
+ Name: careful
+ Version: 0.1.0
+ Summary: Add your description here
+ Author-email: jpt <dev@jpt.sh>
+ License-File: LICENSE
+ Requires-Python: >=3.13
+ Requires-Dist: httpx>=0.28.1
+ Requires-Dist: pytest-httpbin>=2.1.0
+ Requires-Dist: pytest>=8.4.2
+ Description-Content-Type: text/markdown
+
+ **careful_httpx** is a library for making requests to less-than-reliable websites.
+
+ It is based on [scrapelib](https://pypi.org/project/scrapelib/), which has powered Open States & many other Python scrapers for over 15 years.
+
+ Code: <https://codeberg.org/jpt/careful_httpx>
+
+ Documentation: TODO
+
+ ## Features
+
+ Enhances [`httpx.Client`](https://www.python-httpx.org) with features useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+
+ - retries
+ - throttling
+ - dev-cache for iterating on scrapers
+
+ ### example
+
+ TODO
+
+ ### features this has that scrapelib doesn't
+
+ - httpx support
+ - composable interface, can augment Client with just the enhancements you want
+
+ TODO: don't allow instantiating bad patch classes, and check for incompatible configs
+
+ ### features scrapelib had that this doesn't
+
+ Open to considering these if there is interest, but they didn't seem necessary.
+
+ - HTTP(S) and FTP requests via an identical API
+ - allow setting custom ciphers
+ - have urlretrieve
+ - support FTP
+ - set custom user-agent/mess w/ headers
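The "composable interface" bullet above is the core design claim: each enhancement is an independent wrapper around an `httpx.Client`, so you can stack only the ones you need. A minimal sketch using the factory functions from the source further down (the dummy values are illustrative, not recommendations):

```python
# Sketch: augment a plain httpx.Client with just retries and throttling.
from httpx import Client

from careful.httpx import make_retry_client, make_throttled_client

client = Client()
client = make_retry_client(client=client, attempts=2, wait_seconds=1)
client = make_throttled_client(client=client, requests_per_minute=60)
# requests now go through the throttle first, with retries underneath
```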
careful-0.1.0/README.md ADDED
@@ -0,0 +1,36 @@
+ **careful_httpx** is a library for making requests to less-than-reliable websites.
+
+ It is based on [scrapelib](https://pypi.org/project/scrapelib/), which has powered Open States & many other Python scrapers for over 15 years.
+
+ Code: <https://codeberg.org/jpt/careful_httpx>
+
+ Documentation: TODO
+
+ ## Features
+
+ Enhances [`httpx.Client`](https://www.python-httpx.org) with features useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+
+ - retries
+ - throttling
+ - dev-cache for iterating on scrapers
+
+ ### example
+
+ TODO
+
+ ### features this has that scrapelib doesn't
+
+ - httpx support
+ - composable interface, can augment Client with just the enhancements you want
+
+ TODO: don't allow instantiating bad patch classes, and check for incompatible configs
+
+ ### features scrapelib had that this doesn't
+
+ Open to considering these if there is interest, but they didn't seem necessary.
+
+ - HTTP(S) and FTP requests via an identical API
+ - allow setting custom ciphers
+ - have urlretrieve
+ - support FTP
+ - set custom user-agent/mess w/ headers
careful-0.1.0/pyproject.toml ADDED
@@ -0,0 +1,19 @@
+ [project]
+ name = "careful"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ authors = [
+     { name = "jpt", email = "dev@jpt.sh" }
+ ]
+ requires-python = ">=3.13"
+ dependencies = [
+     "httpx>=0.28.1",
+     "pytest>=8.4.2",
+     "pytest-httpbin>=2.1.0",
+ ]
+
+
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
File without changes
@@ -0,0 +1,62 @@
+ from .retries import make_retry_client, _default_accept_response
+ from .throttle import make_throttled_client
+ from .dev_cache import (
+     make_dev_caching_client,
+     MemoryCache,
+     FileCache,
+     SQLiteCache,
+     _cache_200s,
+     _default_keyfunc,
+ )
+ from httpx import Client
+
+
+ def make_careful_client(
+     client: Client | None = None,
+     *,
+     retry_attempts: int = 0,
+     retry_wait_seconds: float = 10,
+     retry_on_404: bool = False,
+     accept_response=_default_accept_response,
+     requests_per_minute: int = 0,
+     cache_storage=None,
+     cache_write_only=False,
+     should_cache=_cache_200s,
+     cache_keyfunc=_default_keyfunc,
+ ):
+     if client is None:
+         client = Client()
+     # order matters, retry on inside b/c it is last-chance scenario
+     if retry_attempts:
+         client = make_retry_client(
+             client=client,
+             attempts=retry_attempts,
+             wait_seconds=retry_wait_seconds,
+             retry_on_404=retry_on_404,
+             accept_response=accept_response,
+         )
+     # throttling around retries
+     if requests_per_minute:
+         client = make_throttled_client(client=client, requests_per_minute=requests_per_minute)
+     # caching on top layer, so cache will be checked first
+     if cache_storage:
+         client = make_dev_caching_client(
+             client=client,
+             cache_storage=cache_storage,
+             cache_keyfunc=cache_keyfunc,
+             should_cache=should_cache,
+             write_only=cache_write_only,
+         )
+
+     return client
+
+
+ __all__ = [
+     "make_careful_client",
+     "make_retry_client",
+     "make_throttled_client",
+     "make_dev_caching_client",
+     "MemoryCache",
+     "FileCache",
+     "SQLiteCache",
+ ]
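`make_careful_client` applies the three wrappers in a fixed order: retries innermost, throttling around them, and the dev cache outermost so cached hits skip both. A usage sketch; the URL and tuning values are illustrative, not recommendations:

```python
# Sketch: one client with retries, throttling, and an in-memory dev cache.
from careful.httpx import MemoryCache, make_careful_client

client = make_careful_client(
    retry_attempts=3,             # up to 3 retries, wait doubling each time
    retry_wait_seconds=5,
    requests_per_minute=30,       # ~one request every two seconds
    cache_storage=MemoryCache(),  # consulted before throttling or retrying
)
resp = client.get("https://example.com/")  # illustrative target
```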
@@ -0,0 +1,295 @@
+ import types
+ import functools
+ import logging
+ import re
+ import os
+ import glob
+ import hashlib
+ import sqlite3
+ import json
+
+ from httpx import Client, Response, Request
+
+ log = logging.getLogger("httpx")
+
+
+ def _default_keyfunc(
+     method: str,
+     url: str,
+     params: dict | None = None,
+ ) -> str | None:
+     """
+     Return a cache key from a given set of request parameters.
+
+     Default behavior is to return a complete URL for all GET
+     requests, and None otherwise.
+     """
+     if method.lower() != "get":
+         return None
+
+     return str(Request(url=url, method=method, params=params).url)
+
+
+ def _cache_200s(response: Response) -> bool:
+     """
+     Check if a given Response object should be cached.
+
+     Default behavior is to only cache responses with a 200 status code.
+     """
+     return response.status_code == 200
+
+
+ def _cached_request(client: Client, *args, **kwargs):
+     # short circuit if cache isn't configured
+     if not client._cache_storage:
+         log.debug("bypassing cache, no storage configured")
+         resp = client._wrapped_request(*args, **kwargs)
+         resp.fromcache = False
+         return resp
+
+     method, url = args
+     request_key = client._cache_keyfunc(method, url, kwargs.get("params"))
+
+     # check cache for response
+     cached_resp = None
+     if request_key and not client._write_only:
+         cached_resp = client._cache_storage.get(request_key)
+
+     if cached_resp:
+         log.info("using cached response request_key=%s", request_key)
+         cached_resp.fromcache = True
+         resp = cached_resp
+     else:
+         resp = client._wrapped_request(*args, **kwargs)
+         # save to cache if request and response meet criteria
+         if request_key and client._should_cache(resp):
+             client._cache_storage.set(request_key, resp)
+             log.info("caching response request_key=%s", request_key)
+         resp.fromcache = False
+
+     return resp
+
+
+ def make_dev_caching_client(
+     *,
+     client: Client | None = None,
+     cache_storage=None,
+     cache_keyfunc=_default_keyfunc,
+     should_cache=_cache_200s,
+     write_only=False,
+ ):
+     if client is None:
+         client = Client()
+
+     client._cache_storage = cache_storage
+     client._cache_keyfunc = cache_keyfunc
+     client._should_cache = should_cache
+     client._write_only = write_only
+
+     client._wrapped_request = client.request
+     client.request = types.MethodType(
+         functools.wraps(client.request)(_cached_request), client
+     )
+     return client
+
+
+ class CacheStorageBase:
+     def get(self, key: str) -> None | Response:
+         raise NotImplementedError()
+
+     def set(self, key: str, response: Response) -> None:
+         raise NotImplementedError()
+
+
+ class MemoryCache(CacheStorageBase):
+     """In memory cache for request responses."""
+
+     def __init__(self) -> None:
+         self.cache: dict[str, Response] = {}
+
+     def get(self, key: str) -> None | Response:
+         """Get cache entry for key, or return None."""
+         return self.cache.get(key, None)
+
+     def set(self, key: str, response: Response) -> None:
+         """Set cache entry for key with contents of response."""
+         self.cache[key] = response
+
+
+ class FileCache(CacheStorageBase):
+     """
+     File-based cache for request responses.
+
+     :param cache_dir: directory for storing responses
+     :param check_last_modified: set to True to compare last-modified
+         timestamp in cached response with value from HEAD request
+     """
+
+     # file name escaping inspired by httplib2
+     _prefix = re.compile(r"^\w+://")
+     _illegal = re.compile(r"[?/:|]+")
+     _header_re = re.compile(r"([-\w]+): (.*)")
+     _maxlen = 200
+
+     def _clean_key(self, key: str) -> str:
+         # hash the full key, then strip the scheme & escape illegal characters
+         md5 = hashlib.md5(key.encode("utf8")).hexdigest()
+         key = self._prefix.sub("", key)
+         key = self._illegal.sub(",", key)
+         return ",".join((key[: self._maxlen], md5))
+
+     def __init__(self, cache_dir: str, check_last_modified: bool = False):
+         # normalize path
+         self.cache_dir = os.path.join(os.getcwd(), cache_dir)
+         self.check_last_modified = check_last_modified
+         # create directory
+         if not os.path.isdir(self.cache_dir):
+             os.makedirs(self.cache_dir)
+
+     def get(self, orig_key: str) -> None | Response:
+         """Get cache entry for key, or return None."""
+         key = self._clean_key(orig_key)
+         path = os.path.join(self.cache_dir, key)
+         resp_headers = {}
+
+         try:
+             with open(path, "rb") as f:
+                 # read header lines one at a time
+                 while True:
+                     line = f.readline().decode("utf8").strip("\r\n")
+
+                     # if self.check_last_modified and re.search(
+                     #     "last-modified", line, flags=re.I
+                     # ):
+                     #     # line contains last modified header
+                     #     head_resp = requests.head(orig_key)
+                     #     try:
+                     #         new_lm = head_resp.headers["last-modified"]
+                     #         old_lm = line[line.find(":") + 1 :].strip()
+                     #         if old_lm != new_lm:
+                     #             # timestamps don't match, download again
+                     #             return None
+                     #     except KeyError:
+                     #         # no last modified header, so redownload
+                     #         return None
+
+                     header = self._header_re.match(line)
+                     if header:
+                         resp_headers[header.group(1)] = header.group(2)
+                     else:
+                         break
+                 # everything left is the real content
+                 resp_content = f.read()
+
+             # status & encoding will be in headers, but are faked
+             # need to split spaces out of status to get code (e.g. '200 OK')
+             resp = Response(
+                 status_code=int(resp_headers.pop("status").split(" ")[0]),
+                 content=resp_content,
+                 default_encoding=resp_headers.pop("encoding"),
+                 headers=resp_headers,
+             )
+             return resp
+         except IOError:
+             return None
+
+     def set(self, key: str, response: Response) -> None:
+         """Set cache entry for key with contents of response."""
+         key = self._clean_key(key)
+         path = os.path.join(self.cache_dir, key)
+
+         with open(path, "wb") as f:
+             status_str = "status: {0}\n".format(response.status_code)
+             f.write(status_str.encode("utf8"))
+             encoding_str = "encoding: {0}\n".format(response.encoding)
+             f.write(encoding_str.encode("utf8"))
+             for h, v in response.headers.items():
+                 # header: value\n
+                 f.write(h.encode("utf8"))
+                 f.write(b": ")
+                 f.write(v.encode("utf8"))
+                 f.write(b"\n")
+             # one blank line separates headers from the body
+             f.write(b"\n")
+             f.write(response.content)
+
+     def clear(self) -> None:
+         # only delete things that end w/ a md5, less dangerous this way
+         cache_glob = "*," + ("[0-9a-f]" * 32)
+         for fname in glob.glob(os.path.join(self.cache_dir, cache_glob)):
+             os.remove(fname)
+
+
+ class SQLiteCache(CacheStorageBase):
+     """SQLite cache for request responses.
+
+     :param cache_path: path for SQLite database file
+     :param check_last_modified: set to True to compare last-modified
+         timestamp in cached response with value from HEAD request
+     """
+
+     _columns = ["key", "status", "modified", "encoding", "data", "headers"]
+
+     def __init__(self, cache_path: str, check_last_modified: bool = False):
+         self.cache_path = cache_path
+         self.check_last_modified = check_last_modified
+         self._conn = sqlite3.connect(cache_path)
+         self._conn.text_factory = str
+         self._build_table()
+
+     def _build_table(self) -> None:
+         """Create table for storing request information and response."""
+         self._conn.execute(
+             """CREATE TABLE IF NOT EXISTS cache
+                (key text UNIQUE, status integer, modified text,
+                 encoding text, data blob, headers blob)"""
+         )
+
+     def set(self, key: str, response: Response) -> None:
+         """Set cache entry for key with contents of response."""
+         mod = response.headers.pop("last-modified", None)
+         status = int(response.status_code)
+         rec = (
+             key,
+             status,
+             mod,
+             response.encoding,
+             response.content,
+             json.dumps(dict(response.headers)),
+         )
+         with self._conn:
+             self._conn.execute("DELETE FROM cache WHERE key=?", (key,))
+             self._conn.execute("INSERT INTO cache VALUES (?,?,?,?,?,?)", rec)
+
+     def get(self, key: str) -> None | Response:
+         """Get cache entry for key, or return None."""
+         query = self._conn.execute("SELECT * FROM cache WHERE key=?", (key,))
+         rec = query.fetchone()
+         if rec is None:
+             return None
+         rec = dict(zip(self._columns, rec))
+
+         # TODO evaluate/remove?
+         # if self.check_last_modified:
+         #     if rec["modified"] is None:
+         #         return None  # no last modified header, so redownload
+         #     head_resp = requests.head(key)
+         #     new_lm = head_resp.headers.get("last-modified", None)
+         #     if rec["modified"] != new_lm:
+         #         return None
+
+         resp = Response(
+             rec["status"],
+             content=rec["data"],
+             default_encoding=rec["encoding"],
+             headers=json.loads(rec["headers"]),
+         )
+         return resp
+
+     def clear(self) -> None:
+         """Remove all records from cache."""
+         with self._conn:
+             self._conn.execute("DELETE FROM cache")
+
+     def __del__(self) -> None:
+         self._conn.close()
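To make the cache layer concrete, here is a sketch of the dev cache in isolation, mirroring the tests further down; the target URL is illustrative, and `cache.db` matches the path already listed in the project's `.gitignore`:

```python
# Sketch: repeated GETs are served from the SQLite-backed dev cache.
from careful.httpx import SQLiteCache, make_dev_caching_client

client = make_dev_caching_client(cache_storage=SQLiteCache("cache.db"))

first = client.get("https://example.com/")   # network hit; stored if 200
second = client.get("https://example.com/")  # answered from the cache
assert second.fromcache and not first.fromcache
```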
File without changes
@@ -0,0 +1,87 @@
+ import time
+ import types
+ import functools
+ import logging
+ from httpx import Client, Response
+
+ log = logging.getLogger("httpx")
+
+
+ def _default_accept_response(response: Response) -> bool:
+     return response.status_code < 400
+
+
+ def _retry_request(client: Client, *args, **kwargs):
+     # the retry loop
+     tries = 0
+     exception_raised = None
+
+     while tries <= client._retry_attempts:
+         exception_raised = None
+
+         try:
+             resp = client._wrapped_request(*args, **kwargs)
+
+             # break from loop on an accepted response
+             if client._accept_response(resp) or (
+                 resp.status_code == 404 and not client._retry_on_404
+             ):
+                 break
+
+         except Exception as e:
+             # TODO: exclude certain kinds of exceptions (SSL?) from retry
+             exception_raised = e
+
+             if exception_response := getattr(e, "response", None):
+                 if client._accept_response(exception_response):
+                     break
+
+         # if we're going to retry, sleep first
+         tries += 1
+         if tries <= client._retry_attempts:
+             # twice as long each time
+             wait = client._retry_wait_seconds * (2 ** (tries - 1))
+             if exception_raised:
+                 log.info(
+                     "exception %s, sleeping for %s seconds before retry #%s",
+                     exception_raised,
+                     wait,
+                     tries,
+                 )
+             else:
+                 log.info(
+                     "response %s, sleeping for %s seconds before retry #%s",
+                     resp,
+                     wait,
+                     tries,
+                 )
+             time.sleep(wait)
+
+     # out of the loop, either an exception was raised or we had a success
+     if exception_raised:
+         raise exception_raised
+     return resp
+
+
+ def make_retry_client(
+     *,
+     client: Client | None = None,
+     attempts: int = 1,
+     wait_seconds: float = 10,
+     retry_on_404: bool = False,
+     accept_response=_default_accept_response,
+ ):
+     if client is None:
+         client = Client()
+     client._retry_attempts = max(0, attempts)
+     client._retry_wait_seconds = wait_seconds
+     client._retry_on_404 = retry_on_404
+     client._accept_response = accept_response
+
+     client._wrapped_request = client.request
+     client.request = types.MethodType(
+         functools.wraps(client.request)(_retry_request), client
+     )
+
+     return client
+
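The `accept_response` hook decides when a response ends the retry loop. A sketch of a custom predicate; treating 429 as retryable is an illustrative policy choice, not something the package prescribes:

```python
# Sketch: also retry on 429; waits double each attempt (2s, 4s, 8s, ...).
from httpx import Response

from careful.httpx import make_retry_client


def accept_unless_flaky(resp: Response) -> bool:
    # reject server errors and rate-limit responses so they get retried
    return resp.status_code < 500 and resp.status_code != 429


client = make_retry_client(
    attempts=3, wait_seconds=2, accept_response=accept_unless_flaky
)
```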
@@ -0,0 +1,41 @@
+ import time
+ import types
+ import functools
+ import logging
+ from httpx import Client
+
+ log = logging.getLogger("httpx")
+
+
+ def _throttle_request(client: Client, *args, **kwargs):
+     now = time.time()
+     diff = client._request_frequency - (now - client._last_request)
+     if diff > 0:
+         log.debug("throttled, sleeping for %fs", diff)
+         time.sleep(diff)
+         client._last_request = time.time()
+     else:
+         client._last_request = now
+     return client._wrapped_request(*args, **kwargs)
+
+
+ def make_throttled_client(
+     *,
+     client: Client | None = None,
+     requests_per_minute: float = 0,
+ ):
+     if requests_per_minute <= 0:
+         raise ValueError("requests per minute must be a positive number")
+
+     if client is None:
+         client = Client()
+
+     client._last_request = 0.0
+     client._requests_per_minute = requests_per_minute
+     client._request_frequency = 60.0 / requests_per_minute
+
+     client._wrapped_request = client.request
+     client.request = types.MethodType(
+         functools.wraps(client.request)(_throttle_request), client
+     )
+     return client
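Since `_request_frequency` is `60 / requests_per_minute`, the throttle sleeps only as long as needed to keep that average spacing between requests. A sketch with illustrative numbers:

```python
# Sketch: 12 requests/minute means at most one request every 5 seconds.
from careful.httpx import make_throttled_client

client = make_throttled_client(requests_per_minute=12)
for n in range(3):
    # the second and third calls each sleep ~5s before being sent
    client.get(f"https://example.com/{n}")  # illustrative URL
```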
@@ -0,0 +1,17 @@
+
+ class FakeResponse:
+     def __init__(
+         self,
+         url: str,
+         code: int,
+         content: str | bytes,
+         encoding: str = "utf-8",
+         headers: dict | None = None,
+     ):
+         self.url = url
+         self.status_code = code
+         self.content = content
+         self.text = str(content)
+         self.encoding = encoding
+         self.headers = headers or {}
+
@@ -0,0 +1,65 @@
+ from pytest_httpbin.serve import Server  # type: ignore
+ from httpx import Response
+ from careful.httpx import make_dev_caching_client, MemoryCache, FileCache, SQLiteCache
+
+
+ def test_dev_caching(httpbin: Server) -> None:
+     client = make_dev_caching_client(cache_storage=MemoryCache(), write_only=False)
+
+     resp = client.get(httpbin.url + "/status/200")
+     assert not resp.fromcache
+     resp = client.get(httpbin.url + "/status/200")
+     assert resp.fromcache
+
+
+ def test_dev_caching_params(httpbin: Server) -> None:
+     client = make_dev_caching_client(cache_storage=MemoryCache(), write_only=False)
+
+     resp = client.get(httpbin.url + "/status/200?a=1&b=2")
+     assert not resp.fromcache
+     resp = client.get(httpbin.url + "/status/200?a=1&b=2")
+     assert resp.fromcache
+     resp = client.get(httpbin.url + "/status/200?a=1&b=3")
+     assert not resp.fromcache
+
+
+ # test storages #####
+
+
+ def _test_cache_storage(storage_obj) -> None:
+     # unknown key returns None
+     assert storage_obj.get("one") is None
+
+     _content_as_bytes = b"here's unicode: \xe2\x98\x83"
+     _content_as_unicode = "here's unicode: \u2603"
+
+     # set 'one'
+     resp = Response(200)
+     resp.headers["x-num"] = "one"
+     resp._content = _content_as_bytes
+     storage_obj.set("one", resp)
+     cached_resp = storage_obj.get("one")
+     assert cached_resp is not None
+     if cached_resp is not None:
+         assert cached_resp.headers["x-num"] == "one"
+         assert cached_resp.status_code == 200
+         cached_resp.encoding = "utf8"
+         assert cached_resp.text == _content_as_unicode
+
+
+ def test_memory_cache() -> None:
+     _test_cache_storage(MemoryCache())
+
+
+ def test_file_cache() -> None:
+     fc = FileCache("cache")
+     fc.clear()
+     _test_cache_storage(fc)
+     fc.clear()
+
+
+ def test_sqlite_cache() -> None:
+     sc = SQLiteCache("cache.db")
+     sc.clear()
+     _test_cache_storage(sc)
+     sc.clear()
@@ -0,0 +1,6 @@
+ from careful.httpx import make_careful_client
+
+
+ def test_full_careful_client():
+     client = make_careful_client()
+     assert client is not None
@@ -0,0 +1,84 @@
+ from careful.httpx import make_retry_client
+ from unittest import mock
+ from fakeresponse import FakeResponse
+
+
+ def test_retry() -> None:
+     client = make_retry_client(attempts=3, wait_seconds=0.001)
+
+     # On the first call return a 500, then a 200
+     mock_request = mock.Mock(
+         side_effect=[
+             FakeResponse("http://dummy/", 500, "failure!"),
+             FakeResponse("http://dummy/", 200, "success!"),
+         ]
+     )
+
+     with mock.patch.object(client, "_wrapped_request", mock_request):
+         resp = client.get("http://dummy/")
+         assert mock_request.call_count == 2
+
+     # 500 always
+     mock_request = mock.Mock(
+         return_value=FakeResponse("http://dummy/", 500, "failure!")
+     )
+
+     with mock.patch.object(client, "_wrapped_request", mock_request):
+         resp = client.get("http://dummy/")
+         assert resp.status_code == 500
+         assert mock_request.call_count == 4  # try four times
+
+
+ def test_retry_404() -> None:
+     client = make_retry_client(attempts=3, wait_seconds=0.001, retry_on_404=True)
+
+     # On the first call return a 404, then a 200
+     mock_request = mock.Mock(
+         side_effect=[
+             FakeResponse("http://dummy/", 404, "failure!"),
+             FakeResponse("http://dummy/", 200, "success!"),
+         ]
+     )
+
+     with mock.patch.object(client, "_wrapped_request", mock_request):
+         resp = client.get("http://dummy/")  # type: ignore
+         assert mock_request.call_count == 2
+         assert resp.status_code == 200
+
+     # 404 always
+     mock_request = mock.Mock(
+         return_value=FakeResponse("http://dummy/", 404, "failure!")
+     )
+
+     # four tries
+     with mock.patch.object(client, "_wrapped_request", mock_request):
+         resp = client.get("http://dummy/")
+         assert resp.status_code == 404
+         assert mock_request.call_count == 4
+
+
+ def test_no_retry_404() -> None:
+     client = make_retry_client(attempts=3, wait_seconds=0.001, retry_on_404=False)
+
+     # On the first call return a 404, then a 200
+     mock_request = mock.Mock(
+         side_effect=[
+             FakeResponse("http://dummy/", 404, "failure!"),
+             FakeResponse("http://dummy/", 200, "success!"),
+         ]
+     )
+
+     with mock.patch.object(client, "_wrapped_request", mock_request):
+         resp = client.get("http://dummy/")  # type: ignore
+         assert mock_request.call_count == 1
+         assert resp.status_code == 404
+
+
+ # def test_retry_ssl() -> None:
+ #     s = make_retry_client(retry_attempts=5, retry_wait_seconds=0.001, raise_errors=False)
+
+ #     # ensure SSLError is considered fatal even w/ retries
+ #     with mock.patch.object(requests.Session, "request", mock_sslerror):
+ #         with pytest.raises(requests.exceptions.SSLError):
+ #             s.get("http://dummy/", retry_on_404=True)  # type: ignore
+ #     assert mock_sslerror.call_count == 1
@@ -0,0 +1,29 @@
+ from careful.httpx import make_throttled_client
+ from unittest import mock
+ from typing import Any
+ from fakeresponse import FakeResponse
+
+
+ def request_200(method: str, url: str, *args: Any, **kwargs: Any) -> FakeResponse:
+     return FakeResponse(url, 200, b"ok")
+
+
+ mock_200 = mock.Mock(wraps=request_200)
+
+
+ def test_request_throttling() -> None:
+     client = make_throttled_client(requests_per_minute=30)
+
+     mock_sleep = mock.Mock()
+
+     # check that sleep is called on call 2 & 3
+     with mock.patch("time.sleep", mock_sleep):
+         with mock.patch.object(client, "_wrapped_request", mock_200):
+             client.get("http://dummy/")
+             client.get("http://dummy/")
+             client.get("http://dummy/")
+             assert mock_sleep.call_count == 2
+             # should have slept for ~2 seconds to aim at 30 per min
+             assert 1.8 <= mock_sleep.call_args[0][0] <= 2.2
+
+