careful-0.2.1-py3-none-any.whl → careful-0.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
careful/httpx/__init__.py CHANGED
@@ -1,17 +1,30 @@
+import os
+import logging
+import pathlib
+from urllib.parse import urlparse
 from .retries import make_retry_client, retry_default_rule
 from .throttle import make_throttled_client
+from .robots import (
+    make_robots_txt_client,
+    RobotExclusionError,
+    RobotsRejectFunc,
+    raise_robots_txt,
+)
 from .dev_cache import (
     make_dev_caching_client,
     MemoryCache,
     FileCache,
     SqliteCache,
-    CacheStorageBase,
+    CacheStorage,
+    CacheResponse,
     _cache_200s,
     _default_keyfunc,
 )
 from ._types import ResponsePredicate, CacheKeyfunc
 from httpx import Client

+log = logging.getLogger("careful")
+

 def make_careful_client(
     *,
@@ -20,11 +33,14 @@ def make_careful_client(
     retry_wait_seconds: float = 10,
     should_retry: ResponsePredicate = retry_default_rule,
     requests_per_minute: int = 0,
-    cache_storage: CacheStorageBase = None,
+    cache_storage: CacheStorage | None = None,
     cache_write_only: bool = False,
     should_cache: ResponsePredicate = _cache_200s,
     cache_keyfunc: CacheKeyfunc = _default_keyfunc,
-):
+    check_robots_txt: bool = False,
+    robots_txt_user_agent: str | None = None,
+    robots_txt_on_reject: RobotsRejectFunc = raise_robots_txt,
+) -> Client:
     """
     This function patches an `httpx.Client` so that all requests made with the client support
     [retries](#retries), [throttling](#throttling), and [development caching](#development-caching).
@@ -51,43 +67,6 @@ def make_careful_client(

     cache_keyfunc: Function that takes request details and returns a unique cache key.

-    ## Retries
-
-    If `retry_attempts` is set, responses will be passed to `should_retry`.
-    Responses that are rejected (return `True`) will be retried after a wait based on
-    `retry_wait_seconds`.
-    Each retry will wait twice as long as the one before.
-
-    ## Throttling
-
-    If `requests_per_minute` is set, standard (non-retry) requests will automatically
-    sleep for a short period to target the given rate.
-
-    For example, at 30rpm, the sleep time on a fast request will be close to 2 seconds.
-
-    ## Development Caching
-
-    Why **development caching?**
-
-    This feature is named as a reminder that **this is not true HTTP caching**, which
-    should take various headers into account. Look at libraries like [hishel](https://hishel.com) if that's what you are after.
-
-    The purpose of this feature is to allow you to cache all of your HTTP requests during development.
-    Often when writing a scraper or crawler, you wind up hitting the site you are working on more often than you'd like-- each time you iterate on your code you're likely making redundant requests to pages that haven't changed.
-
-    By caching all successful requests (configurable with the `should_cache` parameter),
-    you can easily re-run scrapers without making redundant HTTP requests.
-    This means faster development time & happier upstream servers.
-
-    To enable development caching, assign a [`MemoryCache`][careful.httpx.MemoryCache],
-    [`FileCache`][careful.httpx.FileCache], or [`SqliteCache`][careful.httpx.SqliteCache] to
-    the `cache_storage` property of a `scrapelib.Scraper`.
-
-    ---
-
-    When multiple features are applied, the order of wrapping ensures that:
-    - the cache is checked first, and bypasses throttling if hit
-    - retries use their own delays, but not throttled separately
     """
     if client is None:
         client = Client()
@@ -104,7 +83,7 @@ def make_careful_client(
        client = make_throttled_client(
            client=client, requests_per_minute=requests_per_minute
        )
-    # caching on top layer, so cache will be checked first
+    # caching on top layer, so cache will be checked before throttling/etc.
    if cache_storage:
        client = make_dev_caching_client(
            client=client,
@@ -113,15 +92,124 @@ def make_careful_client(
            should_cache=should_cache,
            write_only=cache_write_only,
        )
+    # robots.txt before cache
+    if check_robots_txt:
+        client = make_robots_txt_client(
+            client=client,
+            as_user_agent=robots_txt_user_agent,
+            on_rejection=robots_txt_on_reject,
+        )

    return client


+def _int_env(var_name: str, default: int) -> int:
+    return int(os.environ.get(var_name, default))
+
+
+def _float_env(var_name: str, default: float) -> float:
+    return float(os.environ.get(var_name, default))
+
+
+def _bool_env(var_name: str, default: bool) -> bool:
+    """helper function for bool env vars"""
+    return bool(os.environ.get(var_name, "T" if default else ""))
+
+
+def _cache_env(var_name: str, default: CacheStorage | None) -> CacheStorage | None:
+    """
+    helper function that reads cache as a protocol string
+    """
+    cache_str = os.environ.get(var_name)
+    if not cache_str:
+        return default
+    parsed = urlparse(cache_str)
+    # urlparse always starts with a / (var needs :/// to skip netloc -> into path)
+    true_path = parsed.path[1:] if parsed.path.startswith("/") else parsed.path
+    # if it starts with a //// then it is an absolute path
+    if true_path.startswith("/"):
+        path = pathlib.Path(true_path)
+    else:
+        path = pathlib.Path.cwd() / true_path
+    if parsed.scheme == "memory":
+        log.info("cache from env %s => MemoryCache", var_name)
+        return MemoryCache()
+    elif parsed.scheme == "file":
+        log.info("cache from env %s => FileCache(%s)", var_name, path)
+        return FileCache(path)
+    elif parsed.scheme == "sqlite":
+        log.info("cache from env %s => SqliteCache(%s)", var_name, path)
+        return SqliteCache(path)
+    else:
+        log.warning("invalid cache %s=%s", var_name, cache_str)
+        return default
+
+
+def make_careful_client_from_env(
+    *,
+    client: Client | None = None,
+    retry_attempts: int = 0,
+    retry_wait_seconds: float = 10,
+    should_retry: ResponsePredicate = retry_default_rule,
+    requests_per_minute: int = 0,
+    cache_storage: CacheStorage | None = None,
+    cache_write_only: bool = False,
+    should_cache: ResponsePredicate = _cache_200s,
+    cache_keyfunc: CacheKeyfunc = _default_keyfunc,
+    check_robots_txt: bool = False,
+    robots_txt_user_agent: str | None = None,
+    robots_txt_on_reject: RobotsRejectFunc = raise_robots_txt,
+) -> Client:
+    """
+    Make a careful client from environment variables.
+
+    Any set environment variables will override parameters if set.
+
+    Numeric:
+    - CAREFUL_RETRY_ATTEMPTS
+    - CAREFUL_RETRY_WAIT_SECONDS
+    - CAREFUL_REQUESTS_PER_MINUTE
+    - CAREFUL_CHECK_ROBOTS_TXT
+    Booleans (any non-empty value is true):
+    - CAREFUL_CACHE_WRITE_ONLY
+    - CAREFUL_ROBOTS_TXT_USER_AGENT
+    Cache:
+    - CAREFUL_CACHE, which can be:
+        memory://
+        cache://path/to/db.sqlite3
+        file://path/to/directory
+
+    Function parameters do not have environment variables.
+    """
+    return make_careful_client(
+        client=client,
+        retry_attempts=_int_env("CAREFUL_RETRY_ATTEMPTS", retry_attempts),
+        retry_wait_seconds=_float_env("CAREFUL_RETRY_WAIT_SECONDS", retry_wait_seconds),
+        should_retry=should_retry,
+        requests_per_minute=_int_env(
+            "CAREFUL_REQUESTS_PER_MINUTE", requests_per_minute
+        ),
+        cache_storage=_cache_env("CAREFUL_CACHE", cache_storage),
+        cache_write_only=_bool_env("CAREFUL_CACHE_WRITE_ONLY", cache_write_only),
+        should_cache=should_cache,
+        cache_keyfunc=cache_keyfunc,
+        check_robots_txt=_bool_env("CAREFUL_CHECK_ROBOTS_TXT", check_robots_txt),
+        robots_txt_user_agent=os.environ.get(
+            "CAREFUL_ROBOTS_TXT_USER_AGENT", robots_txt_user_agent
+        ),
+        robots_txt_on_reject=robots_txt_on_reject,
+    )
+
+
 __all__ = [
+    "make_careful_client",
     "make_retry_client",
     "make_throttled_client",
     "make_dev_caching_client",
+    "make_robots_txt_client",
     "MemoryCache",
     "FileCache",
     "SqliteCache",
+    "CacheResponse",
+    "RobotExclusionError",
 ]
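
A minimal sketch of the new env-driven constructor in use. `make_careful_client_from_env` reads the `CAREFUL_*` variables documented in the docstring above, falling back to its keyword arguments when a variable is unset; the URL, user-agent, and values here are illustrative, and note that per `_cache_env` the schemes actually recognized for `CAREFUL_CACHE` are `memory://`, `file://`, and `sqlite://`:

```python
import os
from httpx import Client
from careful.httpx import make_careful_client_from_env

# deployment-side configuration; unset variables fall back to the
# keyword defaults passed below
os.environ["CAREFUL_RETRY_ATTEMPTS"] = "3"
os.environ["CAREFUL_REQUESTS_PER_MINUTE"] = "30"
# relative to cwd; a fourth slash (sqlite:////...) makes the path absolute
os.environ["CAREFUL_CACHE"] = "sqlite:///dev_cache.sqlite3"

client = make_careful_client_from_env(
    client=Client(headers={"user-agent": "example-bot/1.0"}),
    retry_attempts=1,       # overridden by CAREFUL_RETRY_ATTEMPTS=3
    check_robots_txt=True,  # boolean vars treat any non-empty value as true
)
client.get("https://example.com")
```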
careful/httpx/dev_cache.py CHANGED
@@ -1,6 +1,6 @@
-import abc
 import types
 import functools
+import pathlib
 import logging
 import re
 import os
@@ -8,10 +8,27 @@ import glob
 import hashlib
 import sqlite3
 import json
+from typing import cast, Protocol, Callable
+from ._types import ResponsePredicate, CacheKeyfunc

 from httpx import Client, Response, Request

-log = logging.getLogger("httpx")
+log = logging.getLogger("careful")
+
+
+class CacheStorage(Protocol):
+    def get(self, key: str) -> None | Response: ...
+    def set(self, key: str, response: Response) -> None: ...
+
+
+class DevCacheClient(Protocol):
+    _retry_attempts: int
+    _cache_storage: CacheStorage
+    _write_only: bool
+    _should_cache: ResponsePredicate
+    _cache_keyfunc: CacheKeyfunc
+    _no_cache_request: Callable
+    request: Callable


 def _default_keyfunc(
@@ -28,7 +45,7 @@ def _default_keyfunc(
     if method.lower() != "get":
         return None

-    return Request(url=url, method=method, params=params).url
+    return str(Request(url=url, method=method, params=params).url)


 def _cache_200s(response: Response) -> bool:
@@ -40,7 +57,11 @@ def _cache_200s(response: Response) -> bool:
     return response.status_code == 200


-def _cached_request(client: Client, *args, **kwargs):
+class CacheResponse(Response):
+    fromcache: bool
+
+
+def _cached_request(client: DevCacheClient, *args, **kwargs) -> CacheResponse:
     method, url = args
     request_key = client._cache_keyfunc(method, url, kwargs["params"])

@@ -50,30 +71,28 @@ def _cached_request(client: Client, *args, **kwargs):
     cached_resp = client._cache_storage.get(request_key)

     if cached_resp:
-        # resp = cast(CacheResponse, resp_maybe)
         log.info("using cached response request_key=%s", request_key)
-        cached_resp.fromcache = True
-        resp = cached_resp
+        new_resp = cast(CacheResponse, cached_resp)
+        new_resp.fromcache = True
     else:
-        resp = client._no_cache_request(*args, **kwargs)
+        new_resp = cast(CacheResponse, client._no_cache_request(*args, **kwargs))
+        new_resp.fromcache = False
         # save to cache if request and response meet criteria
-        log.debug("XX %s %s", request_key, client._should_cache(resp))
-        if request_key and client._should_cache(resp):
-            client._cache_storage.set(request_key, resp)
+        if request_key and client._should_cache(new_resp):
+            client._cache_storage.set(request_key, new_resp)
             log.info("caching response request_key=%s", request_key)
-    resp.fromcache = False

-    return resp
+    return new_resp


 def make_dev_caching_client(
     *,
+    cache_storage: CacheStorage,
     client: Client | None = None,
-    cache_storage=None,
     cache_keyfunc=_default_keyfunc,
     should_cache=_cache_200s,
     write_only=False,
-):
+) -> Client:
     """
     Returns an enhanced `httpx.Client` where requests are saved to a
     specified cache.
@@ -98,29 +117,21 @@ def make_dev_caching_client(
     if client is None:
         client = Client()

-    client._cache_storage = cache_storage
-    client._cache_keyfunc = cache_keyfunc
-    client._should_cache = should_cache
-    client._write_only = write_only
+    tclient = cast(DevCacheClient, client)

-    client._no_cache_request = client.request
-    client.request = types.MethodType(
+    tclient._cache_storage = cache_storage
+    tclient._cache_keyfunc = cache_keyfunc
+    tclient._should_cache = should_cache
+    tclient._write_only = write_only
+
+    tclient._no_cache_request = client.request
+    tclient.request = types.MethodType(
         functools.wraps(client.request)(_cached_request), client
     )
     return client


-class CacheStorageBase(abc.ABC):
-    @abc.abstractmethod
-    def get(self, key: str) -> None | Response:
-        raise NotImplementedError()
-
-    @abc.abstractmethod
-    def set(self, key: str, response: Response) -> None:
-        raise NotImplementedError()
-
-
-class MemoryCache(CacheStorageBase):
+class MemoryCache(CacheStorage):
     """
     In memory cache for request responses.

@@ -144,7 +155,7 @@ class MemoryCache(CacheStorageBase):
         self.cache[key] = response


-class FileCache(CacheStorageBase):
+class FileCache(CacheStorage):
     """
     File-based cache for request responses.

@@ -228,6 +239,7 @@ class FileCache(CacheStorageBase):
                 content=resp_content,
                 default_encoding=resp_headers.pop("encoding"),
                 headers=resp_headers,
+                request=Request("GET", key),  # not perfect, but it'll do
             )
             return resp
         except IOError:
@@ -244,6 +256,8 @@ class FileCache(CacheStorageBase):
             encoding_str = "encoding: {0}\n".format(response.encoding)
             f.write(encoding_str.encode("utf8"))
             for h, v in response.headers.items():
+                if h.lower() in ("content-encoding", "content-length"):
+                    continue
                 # header: value\n
                 f.write(h.encode("utf8"))
                 f.write(b": ")
@@ -260,7 +274,7 @@ class FileCache(CacheStorageBase):
            os.remove(fname)


-class SqliteCache(CacheStorageBase):
+class SqliteCache(CacheStorage):
     """
     sqlite cache for request responses.

@@ -276,12 +290,17 @@ class SqliteCache(CacheStorageBase):

     _columns = ["key", "status", "modified", "encoding", "data", "headers"]

-    def __init__(self, cache_path: str, check_last_modified: bool = False):
+    def __init__(
+        self, cache_path: str | pathlib.Path, check_last_modified: bool = False
+    ):
         self.cache_path = cache_path
         self.check_last_modified = check_last_modified
-        self._conn = sqlite3.connect(cache_path)
+        self._conn = sqlite3.connect(str(cache_path))
         self._conn.text_factory = str
         self._build_table()
+        # self._conn.execute("PRAGMA journal_mode=WAL;")
+        # self._conn.execute("PRAGMA synchronous=1;")
+        # self._conn.isolation_level = None

     def _build_table(self) -> None:
         """Create table for storing request information and response."""
@@ -294,6 +313,8 @@ class SqliteCache(CacheStorageBase):
     def set(self, key: str, response: Response) -> None:
         """Set cache entry for key with contents of response."""
         mod = response.headers.pop("last-modified", None)
+        response.headers.pop("content-encoding", None)
+        response.headers.pop("content-length", None)
         status = int(response.status_code)
         rec = (
             key,
@@ -330,6 +351,7 @@ class SqliteCache(CacheStorageBase):
                 content=rec["data"],
                 default_encoding=rec["encoding"],
                 headers=json.loads(rec["headers"]),
+                request=Request("GET", key),  # not perfect, but it'll do
             )
             return resp

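
Two consequences of the `dev_cache` changes are worth spelling out. First, `CacheStorage` is now a `Protocol` rather than an ABC, so any object with matching `get`/`set` methods satisfies it structurally, no subclassing required. Second, responses from a dev-caching client are `CacheResponse` objects carrying a `fromcache` flag. A sketch of both, where `NullCache` is a hypothetical storage, not part of the package:

```python
from httpx import Response
from careful.httpx import make_dev_caching_client, MemoryCache


class NullCache:
    """Hypothetical storage satisfying CacheStorage without inheriting from it."""

    def get(self, key: str) -> None | Response:
        return None  # never a hit

    def set(self, key: str, response: Response) -> None:
        pass  # discard writes


# cache_storage is now a required keyword argument
client = make_dev_caching_client(cache_storage=MemoryCache())

first = client.get("https://example.com")
print(first.fromcache)   # False: fetched over the network, then stored
second = client.get("https://example.com")
print(second.fromcache)  # True: served from MemoryCache

# SqliteCache also accepts pathlib.Path now, e.g.
# SqliteCache(pathlib.Path.cwd() / "dev_cache.sqlite3")
```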
careful/httpx/retries.py CHANGED
@@ -2,25 +2,37 @@ import time
 import types
 import functools
 import logging
+from typing import Protocol, Callable, cast
 from httpx import Client, Response, HTTPError
+from ._types import ResponsePredicate

-log = logging.getLogger("httpx")
+log = logging.getLogger("careful")
+
+
+class RetryClient(Protocol):
+    _retry_attempts: int
+    _retry_wait_seconds: float
+    _should_retry: ResponsePredicate
+    _no_retry_request: Callable
+    request: Callable


 def retry_default_rule(response: Response) -> bool:
-    # default behavior is to retry 400s and 500s but not 404s
+    """default behavior is to retry 400s and 500s but not 404s"""
     return response.status_code >= 400 and response.status_code != 404


 def retry_only_500s(response: Response) -> bool:
+    """retry all status codes that are 500 or above"""
     return response.status_code >= 500


 def retry_all_400s_500s(response: Response) -> bool:
+    """retry all status codes that are 400 or above"""
     return response.status_code >= 400


-def _retry_request(client: Client, *args, **kwargs):
+def _retry_request(client: RetryClient, *args, **kwargs) -> Response:
     # the retry loop
     tries = 0
     exception_raised = None
@@ -75,15 +87,17 @@ def make_retry_client(
     attempts: int = 1,
     wait_seconds: float = 10,
     should_retry=retry_default_rule,
-):
+) -> Client:
     if client is None:
         client = Client()
-    client._retry_attempts = max(0, attempts)
-    client._retry_wait_seconds = wait_seconds
-    client._should_retry = should_retry

-    client._no_retry_request = client.request
-    client.request = types.MethodType(
+    tclient = cast(RetryClient, client)
+    tclient._retry_attempts = max(0, attempts)
+    tclient._retry_wait_seconds = wait_seconds
+    tclient._should_retry = should_retry
+
+    tclient._no_retry_request = client.request
+    tclient.request = types.MethodType(
         functools.wraps(client.request)(_retry_request), client
     )

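
The bundled predicates (`retry_default_rule`, `retry_only_500s`, `retry_all_400s_500s`) are plain `ResponsePredicate` callables, so a custom rule is just another function. A sketch with a hypothetical rule:

```python
from httpx import Response
from careful.httpx import make_retry_client


def retry_rate_limits_and_server_errors(response: Response) -> bool:
    # hypothetical rule: retry 429s and any 5xx
    return response.status_code == 429 or response.status_code >= 500


client = make_retry_client(
    attempts=3,
    wait_seconds=5,  # base wait; each retry waits twice as long as the one before
    should_retry=retry_rate_limits_and_server_errors,
)
client.get("https://example.com")
```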
careful/httpx/robots.py ADDED
@@ -0,0 +1,72 @@
+import types
+import functools
+import warnings
+from urllib.robotparser import RobotFileParser
+from typing import Protocol, cast, Callable
+from httpx import Client, Response, URL
+
+
+class RobotExclusionError(Exception):
+    pass
+
+
+def raise_robots_txt(url, robots):
+    raise RobotExclusionError(f"{url} excluded by {robots.url}")
+
+
+def warn_robots_txt(url, robots):
+    warnings.warn(f"{url} excluded by {robots.url}")
+
+
+class RobotsClient(Protocol):
+    _robots_for_domain: dict[str, RobotFileParser]
+    _robots_ua: str
+    _rejected_action: Callable[[str, RobotFileParser], None]
+    _no_check_request: Callable
+    request: Callable
+    headers: dict
+
+
+def _robot_check_request(client: RobotsClient, *args, **kwargs) -> Response:
+    method, url = args
+    uurl = URL(url)
+    domain = uurl.host
+    if domain not in client._robots_for_domain:
+        robots_url = f"{uurl.scheme}://{domain}/robots.txt"
+        robots_resp = client._no_check_request("GET", robots_url)
+        # pass url for output, but don't do read
+        parser = RobotFileParser(robots_url)
+        parser.parse(robots_resp.text.splitlines())
+        client._robots_for_domain[domain] = parser
+    if not client._robots_for_domain[domain].can_fetch(client._robots_ua, url):
+        client._rejected_action(url, client._robots_for_domain[domain])
+    # if action doesn't raise an exception, the request goes through
+    return client._no_check_request(*args, **kwargs)
+
+
+RobotsRejectFunc = Callable[[str, RobotFileParser], None]
+
+
+def make_robots_txt_client(
+    *,
+    client: Client | None = None,
+    as_user_agent: str | None = None,
+    on_rejection: RobotsRejectFunc = raise_robots_txt,
+) -> Client:
+    if client is None:
+        client = Client()
+
+    tclient = cast(RobotsClient, client)
+
+    tclient._robots_for_domain = {}
+    if as_user_agent:
+        tclient._robots_ua = as_user_agent
+    else:
+        tclient._robots_ua = tclient.headers["user-agent"]
+    tclient._rejected_action = on_rejection
+
+    tclient._no_check_request = client.request
+    tclient.request = types.MethodType(
+        functools.wraps(client.request)(_robot_check_request), client
+    )
+    return client
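
The new module defines both rejection behaviors: `raise_robots_txt` (the default, raising `RobotExclusionError`) and `warn_robots_txt` (warn, then make the request anyway). A minimal sketch; the URLs and user-agent are illustrative:

```python
from careful.httpx import make_robots_txt_client, RobotExclusionError
from careful.httpx.robots import warn_robots_txt

# default: a disallowed URL raises RobotExclusionError
client = make_robots_txt_client(as_user_agent="example-bot/1.0")
try:
    client.get("https://example.com/private/")
except RobotExclusionError as exc:
    print(exc)

# lenient: emit a warning, then perform the request anyway
lenient = make_robots_txt_client(
    as_user_agent="example-bot/1.0",
    on_rejection=warn_robots_txt,
)
lenient.get("https://example.com/private/")
```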
careful/httpx/throttle.py CHANGED
@@ -2,12 +2,21 @@ import time
 import types
 import functools
 import logging
-from httpx import Client
+from typing import Protocol, cast, Callable
+from httpx import Client, Response

-log = logging.getLogger("httpx")
+log = logging.getLogger("careful")


-def _throttle_request(client: Client, *args, **kwargs):
+class ThrottledClient(Protocol):
+    _last_request: float
+    _requests_per_minute: float
+    _request_frequency: float
+    _no_throttle_request: Callable
+    request: Callable
+
+
+def _throttle_request(client: ThrottledClient, *args, **kwargs) -> Response:
     now = time.time()
     diff = client._request_frequency - (now - client._last_request)
     if diff > 0:
@@ -23,19 +32,21 @@ def make_throttled_client(
     *,
     client: Client | None = None,
     requests_per_minute: float = 0,
-):
+) -> Client:
     if requests_per_minute <= 0:
         raise ValueError("requests per minute must be a positive number")

     if client is None:
         client = Client()

-    client._last_request = 0.0
-    client._requests_per_minute = requests_per_minute
-    client._request_frequency = 60.0 / requests_per_minute
+    tclient = cast(ThrottledClient, client)
+
+    tclient._last_request = 0.0
+    tclient._requests_per_minute = requests_per_minute
+    tclient._request_frequency = 60.0 / requests_per_minute

-    client._no_throttle_request = client.request
-    client.request = types.MethodType(
+    tclient._no_throttle_request = client.request
+    tclient.request = types.MethodType(
         functools.wraps(client.request)(_throttle_request), client
     )
     return client
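
The throttle math is unchanged: `_request_frequency = 60.0 / requests_per_minute` is the minimum spacing in seconds, and `_throttle_request` sleeps off whatever remains of that window before each request. A sketch at 30 requests per minute:

```python
from careful.httpx import make_throttled_client

# 60.0 / 30 = 2.0 seconds minimum spacing between requests
client = make_throttled_client(requests_per_minute=30)

client.get("https://example.com/a")
# if the first request finished 0.5s ago, this one sleeps ~1.5s first
client.get("https://example.com/b")
```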
careful-0.2.1.dist-info/METADATA → careful-0.3.2.dist-info/METADATA CHANGED
@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: careful
-Version: 0.2.1
-Summary: careful extensions to httpx: throttle, retry, cache
+Version: 0.3.2
+Summary: a small library for writing resilient, well-behaved HTTP code
 Project-URL: Repository, https://codeberg.org/jpt/careful
 Author-email: jpt <dev@jpt.sh>
 License: BSD-2-Clause
@@ -15,6 +15,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: httpx>=0.28.1
@@ -22,43 +23,50 @@ Description-Content-Type: text/markdown

 # careful

-<img src="https://careful.jpt.sh/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
+<img src="https://jpt.sh/projects/careful/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
+
+**careful** is a Python library for writing resilient, well-behaved HTTP clients.

-**careful** is a Python library for making requests to unreliable websites with `httpx`.
-
 **Code**: <https://codeberg.org/jpt/careful>

-**Docs**: <https://careful.jpt.sh>
+**Docs**: <https://jpt.sh/projects/careful/>

+![PyPI - Version](https://img.shields.io/pypi/v/careful)
 [![status-badge](https://ci.codeberg.org/api/badges/15185/status.svg)](https://ci.codeberg.org/repos/15185)

-It offers enhancements to
-[`httpx.Client`](https://www.python-httpx.org)
-useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+Call one function to enchant an
+**[httpx.Client](https://www.python-httpx.org)**, making your HTTP connections more resilient and better mannered.

-- **configurable retry support.** retry on timeouts or other errors, with exponential back-off.
-- **simple request throttling.** set a maximum number of requests per minute.
-- **development cache.** configurable caching aimed at reducing redundant requests made while authoring/testing web scrapers.
+- Configure **throttling** to avoid accidental Denial-of-Service / risking getting banned.
+- **Retries** help overcome intermittent failures on flaky sites or long crawls.
+- **Development caching** Cache persists between runs during development, reduces redundant requests made while iterating on your crawlers & scrapers.

-### example
+### Example

 ```python
 from httpx import Client
 from careful.httpx import make_careful_client

+# the only function you need to call is make_careful_client
+# this wraps your existing `httpx.Client` with your preferred
+# careful behaviors
+
 client = make_careful_client(
-    # can configure httpx.Client however you usually would
-    client=Client(headers={'user-agent': 'careful/1.0'}),
+    client=Client(headers={'user-agent': 'spiderman/1.0'}),
+
     # retries are configurable w/ exponential back off
     retry_attempts=2,
     retry_wait_seconds=5,
+
     # can cache to process memory, filesystem, or SQLite
     cache_storage=MemoryCache(),
-    # requests will automatically be throttled to aim at this rate
+
+    # easy-to-configure throttling
     requests_per_minute=60,
 )

-# all normal methods on httpx.Client make use of configured enhancements
+# methods on client are called as they always are
+# configured behaviors occur without further code changes
 client.get("https://example.com")
 ```

careful-0.3.2.dist-info/RECORD ADDED
@@ -0,0 +1,12 @@
+careful/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+careful/httpx/__init__.py,sha256=Ul_K5XSMMW8yo3M-lq8nDbvtDEQS0N-vmurIGnzE8dY,7381
+careful/httpx/_types.py,sha256=jefYDxSbLRUatU8QKeyxStc9UC3AJwAba2SfhNkM0RY,151
+careful/httpx/dev_cache.py,sha256=cc4_rLKFc6Xggpx5MKc8DBurn5KXTDLHO19U_SipXiY,11873
+careful/httpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+careful/httpx/retries.py,sha256=3kjuHKYnK1N4Rtum5gUyY_XO4o4cL4jc59d17Y6UwrI,2949
+careful/httpx/robots.py,sha256=jfqQdplTap_RCENu6MHEIabFVznFLruMvSIaG_u0v_8,2168
+careful/httpx/throttle.py,sha256=b1fbmUskcm343D1bbPbY-ITLdL1zVm1dXtjt9LT1bEA,1412
+careful-0.3.2.dist-info/METADATA,sha256=fN6sNY5n4PFna9Eq9uTfxDbvzCpct-qXcLOkzrdItGE,2692
+careful-0.3.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+careful-0.3.2.dist-info/licenses/LICENSE,sha256=oHe4LmCuo6CZne42DRXfiR3uqqIfGsk4dAvDKucAi4M,1315
+careful-0.3.2.dist-info/RECORD,,
careful-0.2.1.dist-info/WHEEL → careful-0.3.2.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: hatchling 1.27.0
+Generator: hatchling 1.28.0
 Root-Is-Purelib: true
 Tag: py3-none-any
@@ -1,11 +0,0 @@
1
- careful/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- careful/httpx/__init__.py,sha256=u-n0uKIWAd3NXsZUd1UA4wzJJTEhRR74diHzDV2EpEU,4885
3
- careful/httpx/_types.py,sha256=jefYDxSbLRUatU8QKeyxStc9UC3AJwAba2SfhNkM0RY,151
4
- careful/httpx/dev_cache.py,sha256=HNtEXncPpqsjIEoz5UhRf4YO2iVwz5uowKc4_B74fZg,11024
5
- careful/httpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- careful/httpx/retries.py,sha256=Kszm0wDITyPZ3qx5TsDL__HjCYVJyAZ2WehrlpXV5Cc,2500
7
- careful/httpx/throttle.py,sha256=ZpuFABYHGQ4D0zks922SCXp7WZG_-Ysafz-Npa2QVwQ,1096
8
- careful-0.2.1.dist-info/METADATA,sha256=ZAKwiwqykmep0LiYCzFLWJfTgharbvhW3FCJ3p0b_-8,2498
9
- careful-0.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
10
- careful-0.2.1.dist-info/licenses/LICENSE,sha256=oHe4LmCuo6CZne42DRXfiR3uqqIfGsk4dAvDKucAi4M,1315
11
- careful-0.2.1.dist-info/RECORD,,