careful-0.1.0-py3-none-any.whl

careful/__init__.py ADDED
File without changes
careful/httpx/__init__.py ADDED
@@ -0,0 +1,59 @@
+ from .retries import make_retry_client, _default_accept_response
+ from .throttle import make_throttled_client
+ from .dev_cache import (
+     make_dev_caching_client,
+     MemoryCache,
+     FileCache,
+     SQLiteCache,
+     _cache_200s,
+     _default_keyfunc,
+ )
+ from httpx import Client
+
+
+ def make_careful_client(
+     client: Client,
+     *,
+     retry_attempts: int = 0,
+     retry_wait_seconds: float = 10,
+     retry_on_404: bool = False,
+     accept_response=_default_accept_response,
+     requests_per_minute: int = 0,
+     cache_storage=None,
+     cache_write_only=False,
+     should_cache=_cache_200s,
+     cache_keyfunc=_default_keyfunc,
+ ):
+     # order matters: retries go innermost, since a retry is the
+     # last-chance scenario
+     if retry_attempts:
+         client = make_retry_client(
+             client=client,
+             attempts=retry_attempts,
+             wait_seconds=retry_wait_seconds,
+             retry_on_404=retry_on_404,
+             accept_response=accept_response,
+         )
+     # throttling wraps around retries
+     if requests_per_minute:
+         client = make_throttled_client(
+             client=client, requests_per_minute=requests_per_minute
+         )
+     # caching is the top layer, so the cache is checked first
+     if cache_storage:
+         client = make_dev_caching_client(
+             client=client,
+             cache_storage=cache_storage,
+             cache_keyfunc=cache_keyfunc,
+             should_cache=should_cache,
+             write_only=cache_write_only,
+         )
+
+     return client
+
+
+ __all__ = [
+     "make_retry_client",
+     "make_throttled_client",
+     "make_dev_caching_client",
+     "MemoryCache",
+     "FileCache",
+     "SQLiteCache",
+ ]
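+
+
+ # layering sketch (illustrative): the same stack make_careful_client builds,
+ # assembled by hand from the individual wrappers, in the order applied above:
+ #
+ #     client = make_retry_client(client=Client(), attempts=3)
+ #     client = make_throttled_client(client=client, requests_per_minute=30)
+ #     client = make_dev_caching_client(client=client, cache_storage=MemoryCache())
+ #
+ # a cache hit on the outermost client never reaches the throttle or retry layers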
careful/httpx/dev_cache.py ADDED
@@ -0,0 +1,296 @@
+ import types
+ import functools
+ import logging
+ import re
+ import os
+ import glob
+ import hashlib
+ import sqlite3
+ import json
+
+ from httpx import Client, Response, Request
+
+ log = logging.getLogger("httpx")
+
+
+ def _default_keyfunc(
+     method: str,
+     url: str,
+     params: dict | None = None,
+ ) -> str | None:
+     """
+     Return a cache key from a given set of request parameters.
+
+     Default behavior is to return a complete URL for all GET
+     requests, and None otherwise.
+     """
+     if method.lower() != "get":
+         return None
+
+     # str() so the key matches the declared return type and can be
+     # encoded and stored by the cache backends below
+     return str(Request(url=url, method=method, params=params).url)
+
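+ # for example (illustrative): _default_keyfunc("GET", "https://example.com/a",
+ # {"page": "2"}) returns "https://example.com/a?page=2"; any POST returns None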
+
+ def _cache_200s(response: Response) -> bool:
+     """
+     Check if a given Response object should be cached.
+
+     Default behavior is to only cache responses with a 200 status code.
+     """
+     return response.status_code == 200
+
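+ # any callable with this signature can be passed as should_cache; for
+ # example, a sketch of a stricter policy that also skips empty bodies:
+ #
+ #     def cache_200s_with_body(response: Response) -> bool:
+ #         return response.status_code == 200 and bool(response.content)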
+
+ def _cached_request(client: Client, *args, **kwargs):
+     # short circuit if the cache isn't configured
+     if not client._cache_storage:
+         log.debug("bypassing cache, no storage configured")
+         resp = client._wrapped_request(*args, **kwargs)
+         resp.fromcache = False
+         return resp
+
+     method, url = args
+     # params may be absent when request() is called without them
+     request_key = client._cache_keyfunc(method, url, kwargs.get("params"))
+
+     # check cache for response
+     cached_resp = None
+     if request_key and not client._write_only:
+         cached_resp = client._cache_storage.get(request_key)
+
+     if cached_resp:
+         log.info("using cached response request_key=%s", request_key)
+         cached_resp.fromcache = True
+         resp = cached_resp
+     else:
+         resp = client._wrapped_request(*args, **kwargs)
+         # save to cache if request and response meet criteria
+         if request_key and client._should_cache(resp):
+             client._cache_storage.set(request_key, resp)
+             log.info("caching response request_key=%s", request_key)
+         resp.fromcache = False
+
+     return resp
+
+
+ def make_dev_caching_client(
+     *,
+     client: Client | None = None,
+     cache_storage=None,
+     cache_keyfunc=_default_keyfunc,
+     should_cache=_cache_200s,
+     write_only=False,
+ ):
+     if client is None:
+         client = Client()
+
+     client._cache_storage = cache_storage
+     client._cache_keyfunc = cache_keyfunc
+     client._should_cache = should_cache
+     client._write_only = write_only
+
+     client._wrapped_request = client.request
+     client.request = types.MethodType(
+         functools.wraps(client.request)(_cached_request), client
+     )
+     return client
+
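+ # note: all three wrappers in this package share this pattern -- stash the
+ # original request method as _wrapped_request, then bind the replacement
+ # onto the instance, so the enhancements compose by nesting clients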
+
+ class CacheStorageBase:
+     def get(self, key: str) -> None | Response:
+         raise NotImplementedError()
+
+     def set(self, key: str, response: Response) -> None:
+         raise NotImplementedError()
+
+
+ class MemoryCache(CacheStorageBase):
+     """In-memory cache for request responses."""
+
+     def __init__(self) -> None:
+         self.cache: dict[str, Response] = {}
+
+     def get(self, key: str) -> None | Response:
+         """Get cache entry for key, or return None."""
+         return self.cache.get(key, None)
+
+     def set(self, key: str, response: Response) -> None:
+         """Set cache entry for key with contents of response."""
+         self.cache[key] = response
+
+
+ class FileCache(CacheStorageBase):
+     """
+     File-based cache for request responses.
+
+     :param cache_dir: directory for storing responses
+     :param check_last_modified: set to True to compare last-modified
+         timestamp in cached response with value from HEAD request
+     """
+
+     # file name escaping inspired by httplib2
+     _prefix = re.compile(r"^\w+://")
+     _illegal = re.compile(r"[?/:|]+")
+     _header_re = re.compile(r"([-\w]+): (.*)")
+     _maxlen = 200
+
+     def _clean_key(self, key: str) -> str:
+         # hash the full key, then strip the scheme and replace illegal
+         # filename characters in the readable portion
+         md5 = hashlib.md5(key.encode("utf8")).hexdigest()
+         key = self._prefix.sub("", key)
+         key = self._illegal.sub(",", key)
+         return ",".join((key[: self._maxlen], md5))
+
+     def __init__(self, cache_dir: str, check_last_modified: bool = False):
+         # normalize path
+         self.cache_dir = os.path.join(os.getcwd(), cache_dir)
+         self.check_last_modified = check_last_modified
+         # create directory
+         if not os.path.isdir(self.cache_dir):
+             os.makedirs(self.cache_dir)
+
+     def get(self, orig_key: str) -> None | Response:
+         """Get cache entry for key, or return None."""
+         key = self._clean_key(orig_key)
+         path = os.path.join(self.cache_dir, key)
+         resp_headers = {}
+
+         try:
+             with open(path, "rb") as f:
+                 # read header lines one at a time
+                 while True:
+                     line = f.readline().decode("utf8").strip("\r\n")
+
+                     # if self.check_last_modified and re.search(
+                     #     "last-modified", line, flags=re.I
+                     # ):
+                     #     # line contains last modified header
+                     #     head_resp = requests.head(orig_key)
+                     #     try:
+                     #         new_lm = head_resp.headers["last-modified"]
+                     #         old_lm = line[line.find(":") + 1 :].strip()
+                     #         if old_lm != new_lm:
+                     #             # last modified timestamps don't match,
+                     #             # need to download again
+                     #             return None
+                     #     except KeyError:
+                     #         # no last modified header present, so redownload
+                     #         return None
+
+                     header = self._header_re.match(line)
+                     if header:
+                         resp_headers[header.group(1)] = header.group(2)
+                     else:
+                         break
+                 # everything left is the real content
+                 resp_content = f.read()
+
+             # status & encoding are stored as fake headers; split on spaces
+             # to get the code in case status was written as e.g. '200 OK'
+             resp = Response(
+                 status_code=int(resp_headers.pop("status").split(" ")[0]),
+                 content=resp_content,
+                 default_encoding=resp_headers.pop("encoding"),
+                 headers=resp_headers,
+             )
+             return resp
+         except IOError:
+             return None
+
+     def set(self, key: str, response: Response) -> None:
+         """Set cache entry for key with contents of response."""
+         key = self._clean_key(key)
+         path = os.path.join(self.cache_dir, key)
+
+         with open(path, "wb") as f:
+             status_str = "status: {0}\n".format(response.status_code)
+             f.write(status_str.encode("utf8"))
+             encoding_str = "encoding: {0}\n".format(response.encoding)
+             f.write(encoding_str.encode("utf8"))
+             for h, v in response.headers.items():
+                 # header: value\n
+                 f.write(h.encode("utf8"))
+                 f.write(b": ")
+                 f.write(v.encode("utf8"))
+                 f.write(b"\n")
+             # one blank line separates headers from body
+             f.write(b"\n")
+             f.write(response.content)
+
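+     # on-disk layout produced above and parsed back by ``get`` (illustrative):
+     #
+     #     status: 200
+     #     encoding: utf-8
+     #     content-type: text/html
+     #     <blank line, then raw body bytes>
+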
+     def clear(self) -> None:
+         # only delete files that end with an md5 hash; less dangerous this way
+         cache_glob = "*," + ("[0-9a-f]" * 32)
+         for fname in glob.glob(os.path.join(self.cache_dir, cache_glob)):
+             os.remove(fname)
+
+
+ class SQLiteCache(CacheStorageBase):
+     """SQLite cache for request responses.
+
+     :param cache_path: path for SQLite database file
+     :param check_last_modified: set to True to compare last-modified
+         timestamp in cached response with value from HEAD request
+     """
+
+     _columns = ["key", "status", "modified", "encoding", "data", "headers"]
+
+     def __init__(self, cache_path: str, check_last_modified: bool = False):
+         self.cache_path = cache_path
+         self.check_last_modified = check_last_modified
+         self._conn = sqlite3.connect(cache_path)
+         self._conn.text_factory = str
+         self._build_table()
+
+     def _build_table(self) -> None:
+         """Create table for storing request information and response."""
+         self._conn.execute(
+             """CREATE TABLE IF NOT EXISTS cache
+                (key text UNIQUE, status integer, modified text,
+                 encoding text, data blob, headers blob)"""
+         )
+
+     def set(self, key: str, response: Response) -> None:
+         """Set cache entry for key with contents of response."""
+         # last-modified goes in its own column; remaining headers are JSON
+         mod = response.headers.pop("last-modified", None)
+         status = int(response.status_code)
+         rec = (
+             key,
+             status,
+             mod,
+             response.encoding,
+             response.content,
+             json.dumps(dict(response.headers)),
+         )
+         with self._conn:
+             self._conn.execute("DELETE FROM cache WHERE key=?", (key,))
+             self._conn.execute("INSERT INTO cache VALUES (?,?,?,?,?,?)", rec)
+
+     def get(self, key: str) -> None | Response:
+         """Get cache entry for key, or return None."""
+         query = self._conn.execute("SELECT * FROM cache WHERE key=?", (key,))
+         rec = query.fetchone()
+         if rec is None:
+             return None
+         rec = dict(zip(self._columns, rec))
+
+         # TODO evaluate/remove?
+         # if self.check_last_modified:
+         #     if rec["modified"] is None:
+         #         return None  # no last modified header present, so redownload
+         #
+         #     head_resp = requests.head(key)
+         #     new_lm = head_resp.headers.get("last-modified", None)
+         #     if rec["modified"] != new_lm:
+         #         return None
+
+         resp = Response(
+             rec["status"],
+             content=rec["data"],
+             default_encoding=rec["encoding"],
+             headers=json.loads(rec["headers"]),
+         )
+         return resp
+
+     def clear(self) -> None:
+         """Remove all records from cache."""
+         with self._conn:
+             self._conn.execute("DELETE FROM cache")
+
+     def __del__(self) -> None:
+         self._conn.close()
careful/httpx/py.typed ADDED
File without changes
careful/httpx/retries.py ADDED
@@ -0,0 +1,87 @@
+ import time
+ import types
+ import functools
+ import logging
+ from httpx import Client, Response
+
+ log = logging.getLogger("httpx")
+
+
+ def _default_accept_response(response: Response) -> bool:
+     return response.status_code < 400
+
+
+ def _retry_request(client: Client, *args, **kwargs):
+     # the retry loop
+     tries = 0
+     exception_raised = None
+
+     while tries <= client._retry_attempts:
+         exception_raised = None
+
+         try:
+             resp = client._wrapped_request(*args, **kwargs)
+
+             # break from loop on an accepted response, or on a 404
+             # (not retried unless retry_on_404 is set)
+             if client._accept_response(resp) or (
+                 resp.status_code == 404 and not client._retry_on_404
+             ):
+                 break
+
+         except Exception as e:
+             # TODO: exclude certain kinds of exceptions (SSL?) from retry
+             exception_raised = e
+
+             if exception_response := getattr(e, "response", None):
+                 if client._accept_response(exception_response):
+                     break
+
+         # if we're going to retry, sleep first
+         tries += 1
+         if tries <= client._retry_attempts:
+             # wait twice as long each time
+             wait = client._retry_wait_seconds * (2 ** (tries - 1))
+             if exception_raised:
+                 log.info(
+                     "exception %s, sleeping for %s seconds before retry #%s",
+                     exception_raised,
+                     wait,
+                     tries,
+                 )
+             else:
+                 log.info(
+                     "response %s, sleeping for %s seconds before retry #%s",
+                     resp,
+                     wait,
+                     tries,
+                 )
+             time.sleep(wait)
+
+     # out of the loop: either an exception was raised or we have a response
+     if exception_raised:
+         raise exception_raised
+     return resp
+
+
+ def make_retry_client(
+     *,
+     client: Client | None = None,
+     attempts: int = 1,
+     wait_seconds: float = 10,
+     retry_on_404: bool = False,
+     accept_response=_default_accept_response,
+ ):
+     if client is None:
+         client = Client()
+     client._retry_attempts = max(0, attempts)
+     client._retry_wait_seconds = wait_seconds
+     client._retry_on_404 = retry_on_404
+     client._accept_response = accept_response
+
+     client._wrapped_request = client.request
+     client.request = types.MethodType(
+         functools.wraps(client.request)(_retry_request), client
+     )
+
+     return client
+
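+ # backoff sketch: attempts=3, wait_seconds=10 means up to 4 requests in
+ # total, sleeping 10s, 20s, then 40s between them (wait_seconds * 2 ** (tries - 1))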
careful/httpx/throttle.py ADDED
@@ -0,0 +1,41 @@
+ import time
+ import types
+ import functools
+ import logging
+ from httpx import Client
+
+ log = logging.getLogger("httpx")
+
+
+ def _throttle_request(client: Client, *args, **kwargs):
+     now = time.time()
+     diff = client._request_frequency - (now - client._last_request)
+     if diff > 0:
+         log.debug("throttled, sleeping for %fs", diff)
+         time.sleep(diff)
+         client._last_request = time.time()
+     else:
+         client._last_request = now
+     return client._wrapped_request(*args, **kwargs)
+
+
+ def make_throttled_client(
+     *,
+     client: Client | None = None,
+     requests_per_minute: float = 0,
+ ):
+     if requests_per_minute <= 0:
+         raise ValueError("requests per minute must be a positive number")
+
+     if client is None:
+         client = Client()
+
+     client._last_request = 0.0
+     client._requests_per_minute = requests_per_minute
+     # minimum number of seconds between requests
+     client._request_frequency = 60.0 / requests_per_minute
+
+     client._wrapped_request = client.request
+     client.request = types.MethodType(
+         functools.wraps(client.request)(_throttle_request), client
+     )
+     return client
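+ # e.g. requests_per_minute=30 gives _request_frequency = 2.0, i.e. at most
+ # one request every two seconds through this client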
careful-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,48 @@
+ Metadata-Version: 2.4
+ Name: careful
+ Version: 0.1.0
+ Summary: Add your description here
+ Author-email: jpt <dev@jpt.sh>
+ License-File: LICENSE
+ Requires-Python: >=3.13
+ Requires-Dist: httpx>=0.28.1
+ Requires-Dist: pytest-httpbin>=2.1.0
+ Requires-Dist: pytest>=8.4.2
+ Description-Content-Type: text/markdown
+
+ **careful_httpx** is a library for making requests to less-than-reliable websites.
+
+ It is based on [scrapelib](https://pypi.org/project/scrapelib/), which has powered Open States & many other Python scrapers for over 15 years.
+
+ Code: <https://codeberg.org/jpt/careful_httpx>
+
+ Documentation: TODO
+
+ ## Features
+
+ Enhances [`httpx.Client`](https://www.python-httpx.org) with features useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+
+ - retries
+ - throttling
+ - dev-cache for iterating on scrapers
+
+ ### example
+
+ TODO
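+
+ In the meantime, a minimal sketch of intended usage (illustrative, based only on the API above):
+
+ ```python
+ from httpx import Client
+ from careful.httpx import make_careful_client, MemoryCache
+
+ client = make_careful_client(
+     Client(),
+     retry_attempts=3,             # retry failures with exponential backoff
+     requests_per_minute=30,       # at most one request every 2 seconds
+     cache_storage=MemoryCache(),  # replay responses while iterating
+ )
+ resp = client.get("https://example.com")
+ ```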
+
+ ### features this has that scrapelib doesn't
+
+ - httpx support
+ - composable interface: augment a Client with just the enhancements you want
+
+ TODO: don't allow instantiating bad patch classes, and check for incompatible configs
+
+ ### features scrapelib had that this doesn't
+
+ Open to adding these if there is interest, but they didn't seem necessary:
+
+ - HTTP(S) and FTP requests via an identical API
+ - allow setting custom ciphers
+ - have urlretrieve
+ - support FTP
+ - set custom user-agent/mess w/ headers
careful-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
+ careful/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ careful/httpx/__init__.py,sha256=gDSnAnqxFt9mLi2laArt7BUn_wPU5ub0k9zeqsexYJY,1605
+ careful/httpx/dev_cache.py,sha256=_jwpnf1fzBzR23Of2HdsyiN_MPHvRG0gtM49Y4qRtQg,10031
+ careful/httpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ careful/httpx/retries.py,sha256=mMIZf-EP9bhzEZnEmwtjWZ2qdl6ZCJ7vq3hZltT6Zms,2458
+ careful/httpx/throttle.py,sha256=wCJWHERr5manyKq07ZdonmxbK0oh0PYJgO6a94IzN0s,1088
+ careful-0.1.0.dist-info/METADATA,sha256=wBFvqh5xyMfNRVB8Jg9Aa3s5_Te2NxcBy5BE-3NMYeY,1373
+ careful-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ careful-0.1.0.dist-info/licenses/LICENSE,sha256=oHe4LmCuo6CZne42DRXfiR3uqqIfGsk4dAvDKucAi4M,1315
+ careful-0.1.0.dist-info/RECORD,,
careful-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.27.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any
careful-0.1.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,24 @@
+ Copyright (c) 2025, James Turk
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.