careful 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
careful/httpx/__init__.py CHANGED
@@ -1,41 +1,109 @@
- from .retries import make_retry_client, _default_accept_response
+ from .retries import make_retry_client, retry_default_rule
  from .throttle import make_throttled_client
  from .dev_cache import (
      make_dev_caching_client,
      MemoryCache,
      FileCache,
-     SQLiteCache,
+     SqliteCache,
+     CacheStorageBase,
      _cache_200s,
      _default_keyfunc,
  )
+ from ._types import ResponsePredicate, CacheKeyfunc
  from httpx import Client
  
  
  def make_careful_client(
-     client: Client,
      *,
+     client: Client | None = None,
      retry_attempts: int = 0,
      retry_wait_seconds: float = 10,
-     retry_on_404: bool = False,
-     accept_response=_default_accept_response,
+     should_retry: ResponsePredicate = retry_default_rule,
      requests_per_minute: int = 0,
-     cache_storage=None,
-     cache_write_only=False,
-     should_cache=_cache_200s,
-     cache_keyfunc=_default_keyfunc,
+     cache_storage: CacheStorageBase = None,
+     cache_write_only: bool = False,
+     should_cache: ResponsePredicate = _cache_200s,
+     cache_keyfunc: CacheKeyfunc = _default_keyfunc,
  ):
+     """
+     This function patches an `httpx.Client` so that all requests made with the client support
+     [retries](#retries), [throttling](#throttling), and [development caching](#development-caching).
+
+
+     Parameters:
+         client: A pre-configured `httpx.Client`. If omitted a default client will be created.
+
+         retry_attempts: Maximum number of retries. If non-zero will retry up to this many times
+             with increasing wait times, starting with `retry_wait_seconds`.
+
+         retry_wait_seconds: Number of seconds to sleep between first attempt and first retry.
+             Subsequent attempts will increase exponentially (2x, 4x, 8x, etc.)
+
+         should_retry: Predicate function that takes a `httpx.Response` and returns `True` if it should be retried.
+
+         requests_per_minute: Maximum number of requests per minute. (e.g. 30 will throttle to ~2s between requests)
+
+         cache_storage: An object that implements the [cache storage interface](#cache-storage).
+
+         cache_write_only: Update cache, but never read from it.
+
+         should_cache: Predicate function that takes a `httpx.Response` and returns `True` if it should be cached.
+
+         cache_keyfunc: Function that takes request details and returns a unique cache key.
+
+     ## Retries
+
+     If `retry_attempts` is set, responses will be passed to `should_retry`.
+     Responses that are rejected (return `True`) will be retried after a wait based on
+     `retry_wait_seconds`.
+     Each retry will wait twice as long as the one before.
+
+     ## Throttling
+
+     If `requests_per_minute` is set, standard (non-retry) requests will automatically
+     sleep for a short period to target the given rate.
+
+     For example, at 30rpm, the sleep time on a fast request will be close to 2 seconds.
+
+     ## Development Caching
+
+     Why **development caching?**
+
+     This feature is named as a reminder that **this is not true HTTP caching**, which
+     should take various headers into account. Look at libraries like [hishel](https://hishel.com) if that's what you are after.
+
+     The purpose of this feature is to allow you to cache all of your HTTP requests during development.
+     Often when writing a scraper or crawler, you wind up hitting the site you are working on more often than you'd like-- each time you iterate on your code you're likely making redundant requests to pages that haven't changed.
+
+     By caching all successful requests (configurable with the `should_cache` parameter),
+     you can easily re-run scrapers without making redundant HTTP requests.
+     This means faster development time & happier upstream servers.
+
+     To enable development caching, assign a [`MemoryCache`][careful.httpx.MemoryCache],
+     [`FileCache`][careful.httpx.FileCache], or [`SqliteCache`][careful.httpx.SqliteCache] to
+     the `cache_storage` property of a `scrapelib.Scraper`.
+
+     ---
+
+     When multiple features are applied, the order of wrapping ensures that:
+     - the cache is checked first, and bypasses throttling if hit
+     - retries use their own delays, but not throttled separately
+     """
+     if client is None:
+         client = Client()
      # order matters, retry on inside b/c it is last-chance scenario
      if retry_attempts:
          client = make_retry_client(
              client=client,
              attempts=retry_attempts,
              wait_seconds=retry_wait_seconds,
-             retry_on_404=retry_on_404,
-             accept_response=accept_response,
+             should_retry=should_retry,
          )
      # throttling around retries
      if requests_per_minute:
-         client = make_throttled_client(client, requests_per_minute=requests_per_minute)
+         client = make_throttled_client(
+             client=client, requests_per_minute=requests_per_minute
+         )
      # caching on top layer, so cache will be checked first
      if cache_storage:
          client = make_dev_caching_client(
@@ -55,5 +123,5 @@ __all__ = [
      "make_dev_caching_client",
      "MemoryCache",
      "FileCache",
-     "SQLiteCache",
+     "SqliteCache",
  ]
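For reference, a minimal sketch of how a caller might use the renamed 0.2.0 parameters shown above. The parameter values and the `_cache.db` path are illustrative; `SqliteCache` is exported from `careful.httpx` per the `__all__` change, and `retry_only_500s` comes from the `careful/httpx/retries.py` changes later in this diff.

```python
from careful.httpx import make_careful_client, SqliteCache
from careful.httpx.retries import retry_only_500s

# should_retry replaces the 0.1.0 retry_on_404/accept_response pair;
# SqliteCache replaces the old SQLiteCache name.
client = make_careful_client(
    retry_attempts=3,
    retry_wait_seconds=5,          # waits ~5s, 10s, 20s before each retry
    should_retry=retry_only_500s,  # only retry server errors
    requests_per_minute=30,        # ~2 seconds between requests
    cache_storage=SqliteCache("_cache.db"),
)
resp = client.get("https://example.com")
```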
careful/httpx/_types.py ADDED
@@ -0,0 +1,6 @@
+ from httpx import Response
+ from typing import Callable
+
+ ResponsePredicate = Callable[[Response], bool]
+
+ CacheKeyfunc = Callable[[str,str,dict], str]
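These aliases describe the callables the rest of the package expects. A hedged sketch of custom implementations (the function names here are made up; the `(method, url, params)` argument order follows the `_cache_keyfunc` call in `dev_cache.py` below):

```python
from httpx import Response

def retry_on_rate_limits(response: Response) -> bool:
    # a ResponsePredicate: True means "retry this response"
    return response.status_code in (429, 503)

def key_ignoring_params(method: str, url: str, params: dict) -> str:
    # a CacheKeyfunc: collapse requests that differ only by params
    return f"{method} {url}"
```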
careful/httpx/dev_cache.py CHANGED
@@ -1,3 +1,4 @@
+ import abc
  import types
  import functools
  import logging
@@ -40,13 +41,6 @@ def _cache_200s(response: Response) -> bool:
  
  
  def _cached_request(client: Client, *args, **kwargs):
-     # short circuit if cache isn't configured
-     if not client._cache_storage:
-         log.debug("bypassing cache, no storage configured")
-         resp = client._wrapped_request(*args, **kwargs)
-         resp.fromcache = False
-         return resp
-
      method, url = args
      request_key = client._cache_keyfunc(method, url, kwargs["params"])
  
@@ -61,7 +55,7 @@ def _cached_request(client: Client, *args, **kwargs):
          cached_resp.fromcache = True
          resp = cached_resp
      else:
-         resp = client._wrapped_request(*args, **kwargs)
+         resp = client._no_cache_request(*args, **kwargs)
      # save to cache if request and response meet criteria
      log.debug("XX %s %s", request_key, client._should_cache(resp))
      if request_key and client._should_cache(resp):
@@ -80,6 +74,27 @@ def make_dev_caching_client(
      should_cache=_cache_200s,
      write_only=False,
  ):
+     """
+     Returns an enhanced `httpx.Client` where requests are saved to a
+     specified cache.
+
+     This is denoted as a "dev_cache" because it is not intended to be a true
+     HTTP cache, respecting cache headers/etc. If you are looking for that
+     behavior, there are httpx libraries for that explicit purpose.
+
+     Instead, the purpose of this cache is to make it possible to test scrapers
+     locally without making hundreds of redundant requests.
+
+     The strategy is configurable via `cache_keyfunc` and `should_cache`.
+
+     The default strategy is simple:
+     cache all GET requests that result in 200s, with no expiry.
+
+     This works well for the case where you have hundreds of pages to scrape
+     and want to make scraper adjustments without repeatedly making hits.
+
+     It should *NOT* be used in production without adjusting these rules.
+     """
      if client is None:
          client = Client()
  
@@ -88,23 +103,34 @@
      client._should_cache = should_cache
      client._write_only = write_only
  
-     client._wrapped_request = client.request
+     client._no_cache_request = client.request
      client.request = types.MethodType(
          functools.wraps(client.request)(_cached_request), client
      )
      return client
  
  
- class CacheStorageBase:
+ class CacheStorageBase(abc.ABC):
+     @abc.abstractmethod
      def get(self, key: str) -> None | Response:
          raise NotImplementedError()
  
+     @abc.abstractmethod
      def set(self, key: str, response: Response) -> None:
          raise NotImplementedError()
  
  
  class MemoryCache(CacheStorageBase):
-     """In memory cache for request responses."""
+     """
+     In memory cache for request responses.
+
+     Example:
+
+         make_careful_client(
+             cache_storage=MemoryCache(),
+         )
+
+     """
  
      def __init__(self) -> None:
          self.cache: dict[str, Response] = {}
@@ -122,11 +148,21 @@ class FileCache(CacheStorageBase):
      """
      File-based cache for request responses.
  
-     :param cache_dir: directory for storing responses
-     :param check_last_modified: set to True to compare last-modified
-         timestamp in cached response with value from HEAD request
+     Parameters:
+         cache_dir: directory for storing responses
+
+     Example:
+
+         make_careful_client(
+             cache_storage=FileCache("_httpcache/"),
+         )
+
      """
  
+     # TODO: restore?
+     # check_last_modified: set to True to compare last-modified
+     # timestamp in cached response with value from HEAD request
+
      # file name escaping inspired by httplib2
      _prefix = re.compile(r"^\w+://")
      _illegal = re.compile(r"[?/:|]+")
@@ -188,7 +224,7 @@ class FileCache(CacheStorageBase):
          # status & encoding will be in headers, but are faked
          # need to split spaces out of status to get code (e.g. '200 OK')
          resp = Response(
-             status_code = int(resp_headers.pop("status").split(" ")[0]),
+             status_code=int(resp_headers.pop("status").split(" ")[0]),
              content=resp_content,
              default_encoding=resp_headers.pop("encoding"),
              headers=resp_headers,
@@ -224,13 +260,18 @@
          os.remove(fname)
  
  
- class SQLiteCache(CacheStorageBase):
-     """SQLite cache for request responses.
+ class SqliteCache(CacheStorageBase):
+     """
+     sqlite cache for request responses.
+
+     Parameters:
+         cache_path: path for SQLite database file
  
-     :param cache_path: path for SQLite database file
-     :param check_last_modified: set to True to compare last-modified
-         timestamp in cached response with value from HEAD request
+     Example:
  
+         make_careful_client(
+             cache_storage=SQLiteCache("_cache.db"),
+         )
      """
  
      _columns = ["key", "status", "modified", "encoding", "data", "headers"]
@@ -284,7 +325,12 @@ class SQLiteCache(CacheStorageBase):
          # if rec["modified"] != new_lm:
          # return None
  
-         resp = Response(rec["status"], content=rec["data"], default_encoding=rec["encoding"], headers=json.loads(rec["headers"]))
+         resp = Response(
+             rec["status"],
+             content=rec["data"],
+             default_encoding=rec["encoding"],
+             headers=json.loads(rec["headers"]),
+         )
          return resp
  
      def clear(self) -> None:
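Since `CacheStorageBase` is now an `abc.ABC`, a third-party backend only needs to provide `get` and `set`. A minimal sketch under that assumption (the `NullCache` class is hypothetical, not part of the package):

```python
from httpx import Response
from careful.httpx.dev_cache import CacheStorageBase

class NullCache(CacheStorageBase):
    """A do-nothing backend: every lookup misses, writes are discarded."""

    def get(self, key: str) -> None | Response:
        return None

    def set(self, key: str, response: Response) -> None:
        pass
```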
careful/httpx/retries.py CHANGED
@@ -2,13 +2,22 @@ import time
  import types
  import functools
  import logging
- from httpx import Client, Response
+ from httpx import Client, Response, HTTPError
  
  log = logging.getLogger("httpx")
  
  
- def _default_accept_response(response: Response) -> bool:
-     return response.status_code < 400
+ def retry_default_rule(response: Response) -> bool:
+     # default behavior is to retry 400s and 500s but not 404s
+     return response.status_code >= 400 and response.status_code != 404
+
+
+ def retry_only_500s(response: Response) -> bool:
+     return response.status_code >= 500
+
+
+ def retry_all_400s_500s(response: Response) -> bool:
+     return response.status_code >= 400
  
  
  def _retry_request(client: Client, *args, **kwargs):
@@ -20,24 +29,21 @@ def _retry_request(client: Client, *args, **kwargs):
          exception_raised = None
  
          try:
-             resp = client._wrapped_request(*args, **kwargs)
+             tries += 1
+             resp = client._no_retry_request(*args, **kwargs)
  
              # break from loop on an accepted response
-             if client._accept_response(resp) or (
-                 resp.status_code == 404 and not client._retry_on_404
-             ):
+             if not client._should_retry(resp):
                  break
  
-         except Exception as e:
-             # TODO: exclude certain kinds of exceptions (SSL?) from retry
+         except HTTPError as e:
              exception_raised = e
  
              if exception_response := getattr(e, "response", None):
-                 if client._accept_response(exception_response):
+                 if not client._should_retry(exception_response):
                      break
  
          # if we're going to retry, sleep first
-         tries += 1
          if tries <= client._retry_attempts:
              # twice as long each time
              wait = client._retry_wait_seconds * (2 ** (tries - 1))
@@ -68,20 +74,17 @@
      client: Client | None = None,
      attempts: int = 1,
      wait_seconds: float = 10,
-     retry_on_404: bool = False,
-     accept_response=_default_accept_response,
+     should_retry=retry_default_rule,
  ):
      if client is None:
          client = Client()
      client._retry_attempts = max(0, attempts)
      client._retry_wait_seconds = wait_seconds
-     client._retry_on_404 = retry_on_404
-     client._accept_response = accept_response
+     client._should_retry = should_retry
  
-     client._wrapped_request = client.request
+     client._no_retry_request = client.request
      client.request = types.MethodType(
          functools.wraps(client.request)(_retry_request), client
      )
  
      return client
-
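The retry loop now counts the attempt before calling `_no_retry_request`, so the delay computed as `wait = retry_wait_seconds * (2 ** (tries - 1))` doubles on each retry. A quick illustration with the default `retry_wait_seconds=10`:

```python
retry_wait_seconds = 10
for tries in (1, 2, 3):
    print(tries, retry_wait_seconds * (2 ** (tries - 1)))
# 1 10   sleep before the first retry
# 2 20   before the second retry
# 3 40   before the third retry
```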
careful/httpx/throttle.py CHANGED
@@ -16,7 +16,7 @@ def _throttle_request(client: Client, *args, **kwargs):
              client._last_request = time.time()
          else:
              client._last_request = now
-     return client._wrapped_request(*args, **kwargs)
+     return client._no_throttle_request(*args, **kwargs)
  
  
  def make_throttled_client(
@@ -34,7 +34,7 @@ def make_throttled_client(
      client._requests_per_minute = requests_per_minute
      client._request_frequency = 60.0 / requests_per_minute
  
-     client._wrapped_request = client.request
+     client._no_throttle_request = client.request
      client.request = types.MethodType(
          functools.wraps(client.request)(_throttle_request), client
      )
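`make_throttled_client` converts `requests_per_minute` into a spacing of `60.0 / requests_per_minute` seconds, which `_throttle_request` uses as the minimum gap between requests. A small illustration of that arithmetic:

```python
for rpm in (30, 60, 120):
    print(rpm, 60.0 / rpm)  # minimum seconds between requests
# 30 2.0
# 60 1.0
# 120 0.5
```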
careful-0.2.0.dist-info/METADATA ADDED
@@ -0,0 +1,71 @@
+ Metadata-Version: 2.4
+ Name: careful
+ Version: 0.2.0
+ Summary: careful extensions to httpx: throttle, retry, cache
+ Project-URL: Repository, https://codeberg.org/jpt/careful
+ Author-email: jpt <dev@jpt.sh>
+ License: BSD-2-Clause
+ License-File: LICENSE
+ Classifier: Development Status :: 6 - Mature
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: BSD License
+ Classifier: Natural Language :: English
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Requires-Python: >=3.10
+ Requires-Dist: httpx>=0.28.1
+ Requires-Dist: mkdocs-material>=9.6.18
+ Requires-Dist: mkdocstrings-python>=1.18.2
+ Requires-Dist: mkdocstrings>=0.30.0
+ Requires-Dist: pytest-httpbin>=2.1.0
+ Requires-Dist: pytest>=8.4.2
+ Description-Content-Type: text/markdown
+
+ # careful
+
+ <img src="/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
+
+ **careful** is a library for making requests to unreliable websites with httpx.
+
+ **Code**: <https://codeberg.org/jpt/careful>
+
+ **Docs**: <https://careful.jpt.sh>
+
+ It offers enhancements to
+ [`httpx.Client`](https://www.python-httpx.org)
+ useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+
+ - **configurable retry support.** retry on timeouts or other errors, with exponential back-off.
+ - **simple request throttling.** set a maximum number of requests per minute.
+ - **development cache.** configurable caching aimed at reducing redundant requests made while authoring/testing web scrapers.
+
+ ### example
+
+ ```python
+ from httpx import Client
+ from careful.httpx import make_careful_client
+
+ client = make_careful_client(
+     # can configure httpx.Client however you usually would
+     client=Client(headers={'user-agent': 'careful/1.0'}),
+     # retries are configurable w/ exponential back off
+     retry_attempts=2,
+     retry_wait_seconds=5,
+     # can cache to process memory, filesystem, or SQLite
+     cache_storage=MemoryCache(),
+     # requests will automatically be throttled to aim at this rate
+     requests_per_minute=60,
+ )
+
+ # all normal methods on httpx.Client make use of configured enhancements
+ client.get("https://example.com")
+ ```
+
+
+ ---
+
+ Logo licensed from [Adrien Coquet via Noun Project](https://thenounproject.com/icon/carefully-3681327/)
careful-0.2.0.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+ careful/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ careful/httpx/__init__.py,sha256=u-n0uKIWAd3NXsZUd1UA4wzJJTEhRR74diHzDV2EpEU,4885
+ careful/httpx/_types.py,sha256=NwyQ-ItodN9HnO7d7b0M1M4M9y90TjRkhQFqNuypKRI,149
+ careful/httpx/dev_cache.py,sha256=KR35u0CvutqTOWQ8pO-hzwbPy0lDBhShJfhCAbOvqv0,11032
+ careful/httpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ careful/httpx/retries.py,sha256=Kszm0wDITyPZ3qx5TsDL__HjCYVJyAZ2WehrlpXV5Cc,2500
+ careful/httpx/throttle.py,sha256=ZpuFABYHGQ4D0zks922SCXp7WZG_-Ysafz-Npa2QVwQ,1096
+ careful-0.2.0.dist-info/METADATA,sha256=A82D5ltN7bDh1dXkOqdBLcW8fxxxqsonFgf9hZQlors,2541
+ careful-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ careful-0.2.0.dist-info/licenses/LICENSE,sha256=oHe4LmCuo6CZne42DRXfiR3uqqIfGsk4dAvDKucAi4M,1315
+ careful-0.2.0.dist-info/RECORD,,
careful-0.1.0.dist-info/METADATA REMOVED
@@ -1,48 +0,0 @@
- Metadata-Version: 2.4
- Name: careful
- Version: 0.1.0
- Summary: Add your description here
- Author-email: jpt <dev@jpt.sh>
- License-File: LICENSE
- Requires-Python: >=3.13
- Requires-Dist: httpx>=0.28.1
- Requires-Dist: pytest-httpbin>=2.1.0
- Requires-Dist: pytest>=8.4.2
- Description-Content-Type: text/markdown
-
- **careful_httpx** is a library for making requests to less-than-reliable websites.
-
- It is based on [scrapelib](https://pypi.org/scrapelib/), which has powered Open States & many other Python scrapers for over 15 years.
-
- Code: <https://codeberg.org/jpt/careful_httpx>
-
- Documentation: TODO
-
- ## Features
-
- Enhances [`httpx.Client`](https://www.python-httpx.org) with features useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
-
- retries
- throttling
- dev-cache for iterating on scrapers
-
- ### example
-
- TODO
-
- ### features this has that scrapelib doesn't
-
- httpx support
- composable interface, can augment Client with just the enhancements you want
-
- TODO: don't allow instantiating bad patch classes, and check for incompatible configs
-
- ### features scrapelib had that this doesn't
-
- Open to considering if there is interest, but didn't seem necessary.
-
- HTTP(S) and FTP requests via an identical API
- allow setting custom ciphers
- have urlretrieve
- support FTP
- set custom user-agent/mess w/ headers
careful-0.1.0.dist-info/RECORD REMOVED
@@ -1,10 +0,0 @@
- careful/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- careful/httpx/__init__.py,sha256=gDSnAnqxFt9mLi2laArt7BUn_wPU5ub0k9zeqsexYJY,1605
- careful/httpx/dev_cache.py,sha256=_jwpnf1fzBzR23Of2HdsyiN_MPHvRG0gtM49Y4qRtQg,10031
- careful/httpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- careful/httpx/retries.py,sha256=mMIZf-EP9bhzEZnEmwtjWZ2qdl6ZCJ7vq3hZltT6Zms,2458
- careful/httpx/throttle.py,sha256=wCJWHERr5manyKq07ZdonmxbK0oh0PYJgO6a94IzN0s,1088
- careful-0.1.0.dist-info/METADATA,sha256=wBFvqh5xyMfNRVB8Jg9Aa3s5_Te2NxcBy5BE-3NMYeY,1373
- careful-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- careful-0.1.0.dist-info/licenses/LICENSE,sha256=oHe4LmCuo6CZne42DRXfiR3uqqIfGsk4dAvDKucAi4M,1315
- careful-0.1.0.dist-info/RECORD,,