careful 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
careful/httpx/__init__.py CHANGED
@@ -1,11 +1,20 @@
1
+ import os
2
+ from urllib.parse import urlparse
1
3
  from .retries import make_retry_client, retry_default_rule
2
4
  from .throttle import make_throttled_client
5
+ from .robots import (
6
+ make_robots_txt_client,
7
+ RobotExclusionError,
8
+ RobotsRejectFunc,
9
+ raise_robots_txt,
10
+ )
3
11
  from .dev_cache import (
4
12
  make_dev_caching_client,
5
13
  MemoryCache,
6
14
  FileCache,
7
15
  SqliteCache,
8
- CacheStorageBase,
16
+ CacheStorage,
17
+ CacheResponse,
9
18
  _cache_200s,
10
19
  _default_keyfunc,
11
20
  )
@@ -20,11 +29,14 @@ def make_careful_client(
20
29
  retry_wait_seconds: float = 10,
21
30
  should_retry: ResponsePredicate = retry_default_rule,
22
31
  requests_per_minute: int = 0,
23
- cache_storage: CacheStorageBase = None,
32
+ cache_storage: CacheStorage | None = None,
24
33
  cache_write_only: bool = False,
25
34
  should_cache: ResponsePredicate = _cache_200s,
26
35
  cache_keyfunc: CacheKeyfunc = _default_keyfunc,
27
- ):
36
+ check_robots_txt: bool = False,
37
+ robots_txt_user_agent: str | None = None,
38
+ robots_txt_on_reject: RobotsRejectFunc = raise_robots_txt,
39
+ ) -> Client:
28
40
  """
29
41
  This function patches an `httpx.Client` so that all requests made with the client support
30
42
  [retries](#retries), [throttling](#throttling), and [development caching](#development-caching).
@@ -51,43 +63,6 @@ def make_careful_client(
51
63
 
52
64
  cache_keyfunc: Function that takes request details and returns a unique cache key.
53
65
 
54
- ## Retries
55
-
56
- If `retry_attempts` is set, responses will be passed to `should_retry`.
57
- Responses that are rejected (return `True`) will be retried after a wait based on
58
- `retry_wait_seconds`.
59
- Each retry will wait twice as long as the one before.
60
-
61
- ## Throttling
62
-
63
- If `requests_per_minute` is set, standard (non-retry) requests will automatically
64
- sleep for a short period to target the given rate.
65
-
66
- For example, at 30rpm, the sleep time on a fast request will be close to 2 seconds.
67
-
68
- ## Development Caching
69
-
70
- Why **development caching?**
71
-
72
- This feature is named as a reminder that **this is not true HTTP caching**, which
73
- should take various headers into account. Look at libraries like [hishel](https://hishel.com) if that's what you are after.
74
-
75
- The purpose of this feature is to allow you to cache all of your HTTP requests during development.
76
- Often when writing a scraper or crawler, you wind up hitting the site you are working on more often than you'd like-- each time you iterate on your code you're likely making redundant requests to pages that haven't changed.
77
-
78
- By caching all successful requests (configurable with the `should_cache` parameter),
79
- you can easily re-run scrapers without making redundant HTTP requests.
80
- This means faster development time & happier upstream servers.
81
-
82
- To enable development caching, assign a [`MemoryCache`][careful.httpx.MemoryCache],
83
- [`FileCache`][careful.httpx.FileCache], or [`SqliteCache`][careful.httpx.SqliteCache] to
84
- the `cache_storage` property of a `scrapelib.Scraper`.
85
-
86
- ---
87
-
88
- When multiple features are applied, the order of wrapping ensures that:
89
- - the cache is checked first, and bypasses throttling if hit
90
- - retries use their own delays, but not throttled separately
91
66
  """
92
67
  if client is None:
93
68
  client = Client()
@@ -104,7 +79,7 @@ def make_careful_client(
104
79
  client = make_throttled_client(
105
80
  client=client, requests_per_minute=requests_per_minute
106
81
  )
107
- # caching on top layer, so cache will be checked first
82
+ # caching on top layer, so cache will be checked before throttling/etc.
108
83
  if cache_storage:
109
84
  client = make_dev_caching_client(
110
85
  client=client,
@@ -113,15 +88,111 @@ def make_careful_client(
113
88
  should_cache=should_cache,
114
89
  write_only=cache_write_only,
115
90
  )
91
+ # robots.txt before cache
92
+ if check_robots_txt:
93
+ client = make_robots_txt_client(
94
+ client=client,
95
+ as_user_agent=robots_txt_user_agent,
96
+ on_rejection=robots_txt_on_reject,
97
+ )
116
98
 
117
99
  return client
118
100
 
119
101
 
102
+ def _int_env(var_name: str, default: int) -> int:
103
+ return int(os.environ.get(var_name, default))
104
+
105
+
106
+ def _float_env(var_name: str, default: float) -> float:
107
+ return float(os.environ.get(var_name, default))
108
+
109
+
110
+ def _bool_env(var_name: str, default: bool) -> bool:
111
+ """helper function for bool env vars"""
112
+ return bool(os.environ.get(var_name, "T" if default else ""))
113
+
114
+
115
+ def _cache_env(var_name: str, default: CacheStorage | None) -> CacheStorage | None:
116
+ """
117
+ helper function that reads cache as a protocol string
118
+ """
119
+ cache_str = os.environ.get(var_name)
120
+ if not cache_str:
121
+ return default
122
+ parsed = urlparse(cache_str)
123
+ if parsed.scheme == "memory":
124
+ return MemoryCache()
125
+ elif parsed.scheme == "file":
126
+ return FileCache(parsed.path)
127
+ elif parsed.scheme == "sqlite":
128
+ return SqliteCache(parsed.path)
129
+
130
+
131
+ def make_careful_client_from_env(
132
+ *,
133
+ client: Client | None = None,
134
+ retry_attempts: int = 0,
135
+ retry_wait_seconds: float = 10,
136
+ should_retry: ResponsePredicate = retry_default_rule,
137
+ requests_per_minute: int = 0,
138
+ cache_storage: CacheStorage | None = None,
139
+ cache_write_only: bool = False,
140
+ should_cache: ResponsePredicate = _cache_200s,
141
+ cache_keyfunc: CacheKeyfunc = _default_keyfunc,
142
+ check_robots_txt: bool = False,
143
+ robots_txt_user_agent: str | None = None,
144
+ robots_txt_on_reject: RobotsRejectFunc = raise_robots_txt,
145
+ ) -> Client:
146
+ """
147
+ Make a careful client from environment variables.
148
+
149
+ Any set environment variables will override parameters if set.
150
+
151
+ Numeric:
152
+ - CAREFUL_RETRY_ATTEMPTS
153
+ - CAREFUL_RETRY_WAIT_SECONDS
154
+ - CAREFUL_REQUESTS_PER_MINUTE
155
+ Booleans (any non-empty value is true):
156
+ - CAREFUL_CACHE_WRITE_ONLY
157
+ - CAREFUL_CHECK_ROBOTS_TXT
158
+ Strings:
+ - CAREFUL_ROBOTS_TXT_USER_AGENT
159
+ Cache:
160
+ - CAREFUL_CACHE, which can be:
161
+ memory://
162
+ cache://path/to/db.sqlite3
163
+ file://path/to/directory
164
+
165
+ Parameters that take functions (should_retry, should_cache, cache_keyfunc, robots_txt_on_reject) have no environment variables and must be passed directly.
166
+ """
167
+ return make_careful_client(
168
+ client=client,
169
+ retry_attempts=_int_env("CAREFUL_RETRY_ATTEMPTS", retry_attempts),
170
+ retry_wait_seconds=_float_env("CAREFUL_RETRY_WAIT_SECONDS", retry_wait_seconds),
171
+ should_retry=should_retry,
172
+ requests_per_minute=_int_env(
173
+ "CAREFUL_REQUESTS_PER_MINUTE", requests_per_minute
174
+ ),
175
+ cache_storage=_cache_env("CAREFUL_CACHE", cache_storage),
176
+ cache_write_only=_bool_env("CAREFUL_CACHE_WRITE_ONLY", cache_write_only),
177
+ should_cache=should_cache,
178
+ cache_keyfunc=cache_keyfunc,
179
+ check_robots_txt=_bool_env("CAREFUL_CHECK_ROBOTS_TXT", check_robots_txt),
180
+ robots_txt_user_agent=os.environ.get(
181
+ "CAREFUL_ROBOTS_TXT_USER_AGENT", robots_txt_user_agent
182
+ ),
183
+ robots_txt_on_reject=robots_txt_on_reject,
184
+ )
185
+
186
+
120
187
  __all__ = [
188
+ "make_careful_client",
121
189
  "make_retry_client",
122
190
  "make_throttled_client",
123
191
  "make_dev_caching_client",
192
+ "make_robots_txt_client",
124
193
  "MemoryCache",
125
194
  "FileCache",
126
195
  "SqliteCache",
196
+ "CacheResponse",
197
+ "RobotExclusionError",
127
198
  ]
careful/httpx/_types.py CHANGED
@@ -3,4 +3,4 @@ from typing import Callable
3
3
 
4
4
  ResponsePredicate = Callable[[Response], bool]
5
5
 
6
- CacheKeyfunc = Callable[[str,str,dict], str]
6
+ CacheKeyfunc = Callable[[str, str, dict], str]
@@ -1,4 +1,3 @@
1
- import abc
2
1
  import types
3
2
  import functools
4
3
  import logging
@@ -8,10 +7,27 @@ import glob
8
7
  import hashlib
9
8
  import sqlite3
10
9
  import json
10
+ from typing import cast, Protocol, Callable
11
+ from ._types import ResponsePredicate, CacheKeyfunc
11
12
 
12
13
  from httpx import Client, Response, Request
13
14
 
14
- log = logging.getLogger("httpx")
15
+ log = logging.getLogger("careful")
16
+
17
+
18
+ class CacheStorage(Protocol):
19
+ def get(self, key: str) -> None | Response: ...
20
+ def set(self, key: str, response: Response) -> None: ...
21
+
22
+
23
+ class DevCacheClient(Protocol):
24
+ _retry_attempts: int
25
+ _cache_storage: CacheStorage
26
+ _write_only: bool
27
+ _should_cache: ResponsePredicate
28
+ _cache_keyfunc: CacheKeyfunc
29
+ _no_cache_request: Callable
30
+ request: Callable
15
31
 
16
32
 
17
33
  def _default_keyfunc(
@@ -28,7 +44,7 @@ def _default_keyfunc(
28
44
  if method.lower() != "get":
29
45
  return None
30
46
 
31
- return Request(url=url, method=method, params=params).url
47
+ return str(Request(url=url, method=method, params=params).url)
32
48
 
33
49
 
34
50
  def _cache_200s(response: Response) -> bool:
@@ -40,7 +56,11 @@ def _cache_200s(response: Response) -> bool:
40
56
  return response.status_code == 200
41
57
 
42
58
 
43
- def _cached_request(client: Client, *args, **kwargs):
59
+ class CacheResponse(Response):
60
+ fromcache: bool
61
+
62
+
63
+ def _cached_request(client: DevCacheClient, *args, **kwargs) -> CacheResponse:
44
64
  method, url = args
45
65
  request_key = client._cache_keyfunc(method, url, kwargs["params"])
46
66
 
@@ -50,30 +70,28 @@ def _cached_request(client: Client, *args, **kwargs):
50
70
  cached_resp = client._cache_storage.get(request_key)
51
71
 
52
72
  if cached_resp:
53
- # resp = cast(CacheResponse, resp_maybe)
54
73
  log.info("using cached response request_key=%s", request_key)
55
- cached_resp.fromcache = True
56
- resp = cached_resp
74
+ new_resp = cast(CacheResponse, cached_resp)
75
+ new_resp.fromcache = True
57
76
  else:
58
- resp = client._no_cache_request(*args, **kwargs)
77
+ new_resp = cast(CacheResponse, client._no_cache_request(*args, **kwargs))
78
+ new_resp.fromcache = False
59
79
  # save to cache if request and response meet criteria
60
- log.debug("XX %s %s", request_key, client._should_cache(resp))
61
- if request_key and client._should_cache(resp):
62
- client._cache_storage.set(request_key, resp)
80
+ if request_key and client._should_cache(new_resp):
81
+ client._cache_storage.set(request_key, new_resp)
63
82
  log.info("caching response request_key=%s", request_key)
64
- resp.fromcache = False
65
83
 
66
- return resp
84
+ return new_resp
67
85
 
68
86
 
69
87
  def make_dev_caching_client(
70
88
  *,
89
+ cache_storage: CacheStorage,
71
90
  client: Client | None = None,
72
- cache_storage=None,
73
91
  cache_keyfunc=_default_keyfunc,
74
92
  should_cache=_cache_200s,
75
93
  write_only=False,
76
- ):
94
+ ) -> Client:
77
95
  """
78
96
  Returns an enhanced `httpx.Client` where requests are saved to a
79
97
  specified cache.
@@ -98,29 +116,21 @@ def make_dev_caching_client(
98
116
  if client is None:
99
117
  client = Client()
100
118
 
101
- client._cache_storage = cache_storage
102
- client._cache_keyfunc = cache_keyfunc
103
- client._should_cache = should_cache
104
- client._write_only = write_only
119
+ tclient = cast(DevCacheClient, client)
120
+
121
+ tclient._cache_storage = cache_storage
122
+ tclient._cache_keyfunc = cache_keyfunc
123
+ tclient._should_cache = should_cache
124
+ tclient._write_only = write_only
105
125
 
106
- client._no_cache_request = client.request
107
- client.request = types.MethodType(
126
+ tclient._no_cache_request = client.request
127
+ tclient.request = types.MethodType(
108
128
  functools.wraps(client.request)(_cached_request), client
109
129
  )
110
130
  return client
111
131
 
112
132
 
113
- class CacheStorageBase(abc.ABC):
114
- @abc.abstractmethod
115
- def get(self, key: str) -> None | Response:
116
- raise NotImplementedError()
117
-
118
- @abc.abstractmethod
119
- def set(self, key: str, response: Response) -> None:
120
- raise NotImplementedError()
121
-
122
-
123
- class MemoryCache(CacheStorageBase):
133
+ class MemoryCache(CacheStorage):
124
134
  """
125
135
  In memory cache for request responses.
126
136
 
@@ -144,13 +154,13 @@ class MemoryCache(CacheStorageBase):
144
154
  self.cache[key] = response
145
155
 
146
156
 
147
- class FileCache(CacheStorageBase):
157
+ class FileCache(CacheStorage):
148
158
  """
149
159
  File-based cache for request responses.
150
160
 
151
161
  Parameters:
152
162
  cache_dir: directory for storing responses
153
-
163
+
154
164
  Example:
155
165
 
156
166
  make_careful_client(
@@ -260,7 +270,7 @@ class FileCache(CacheStorageBase):
260
270
  os.remove(fname)
261
271
 
262
272
 
263
- class SqliteCache(CacheStorageBase):
273
+ class SqliteCache(CacheStorage):
264
274
  """
265
275
  sqlite cache for request responses.
266
276
 
careful/httpx/retries.py CHANGED
@@ -2,25 +2,37 @@ import time
2
2
  import types
3
3
  import functools
4
4
  import logging
5
+ from typing import Protocol, Callable, cast
5
6
  from httpx import Client, Response, HTTPError
7
+ from ._types import ResponsePredicate
6
8
 
7
- log = logging.getLogger("httpx")
9
+ log = logging.getLogger("careful")
10
+
11
+
12
+ class RetryClient(Protocol):
13
+ _retry_attempts: int
14
+ _retry_wait_seconds: float
15
+ _should_retry: ResponsePredicate
16
+ _no_retry_request: Callable
17
+ request: Callable
8
18
 
9
19
 
10
20
  def retry_default_rule(response: Response) -> bool:
11
- # default behavior is to retry 400s and 500s but not 404s
21
+ """default behavior is to retry 400s and 500s but not 404s"""
12
22
  return response.status_code >= 400 and response.status_code != 404
13
23
 
14
24
 
15
25
  def retry_only_500s(response: Response) -> bool:
26
+ """retry all status codes that are 500 or above"""
16
27
  return response.status_code >= 500
17
28
 
18
29
 
19
30
  def retry_all_400s_500s(response: Response) -> bool:
31
+ """retry all status codes that are 400 or above"""
20
32
  return response.status_code >= 400
21
33
 
22
34
 
23
- def _retry_request(client: Client, *args, **kwargs):
35
+ def _retry_request(client: RetryClient, *args, **kwargs) -> Response:
24
36
  # the retry loop
25
37
  tries = 0
26
38
  exception_raised = None
@@ -75,15 +87,17 @@ def make_retry_client(
75
87
  attempts: int = 1,
76
88
  wait_seconds: float = 10,
77
89
  should_retry=retry_default_rule,
78
- ):
90
+ ) -> Client:
79
91
  if client is None:
80
92
  client = Client()
81
- client._retry_attempts = max(0, attempts)
82
- client._retry_wait_seconds = wait_seconds
83
- client._should_retry = should_retry
84
93
 
85
- client._no_retry_request = client.request
86
- client.request = types.MethodType(
94
+ tclient = cast(RetryClient, client)
95
+ tclient._retry_attempts = max(0, attempts)
96
+ tclient._retry_wait_seconds = wait_seconds
97
+ tclient._should_retry = should_retry
98
+
99
+ tclient._no_retry_request = client.request
100
+ tclient.request = types.MethodType(
87
101
  functools.wraps(client.request)(_retry_request), client
88
102
  )
89
103
 
@@ -0,0 +1,72 @@
1
+ import types
2
+ import functools
3
+ import warnings
4
+ from urllib.robotparser import RobotFileParser
5
+ from typing import Protocol, cast, Callable
6
+ from httpx import Client, Response, URL
7
+
8
+
9
+ class RobotExclusionError(Exception):
10
+ pass
11
+
12
+
13
+ def raise_robots_txt(url, robots):
14
+ raise RobotExclusionError(f"{url} excluded by {robots.url}")
15
+
16
+
17
+ def warn_robots_txt(url, robots):
18
+ warnings.warn(f"{url} excluded by {robots.url}")
19
+
20
+
21
+ class RobotsClient(Protocol):
22
+ _robots_for_domain: dict[str, RobotFileParser]
23
+ _robots_ua: str
24
+ _rejected_action: Callable[[str, RobotFileParser], None]
25
+ _no_check_request: Callable
26
+ request: Callable
27
+ headers: dict
28
+
29
+
30
+ def _robot_check_request(client: RobotsClient, *args, **kwargs) -> Response:
31
+ method, url = args
32
+ uurl = URL(url)
33
+ domain = uurl.host
34
+ if domain not in client._robots_for_domain:
35
+ robots_url = f"{uurl.scheme}://{domain}/robots.txt"
36
+ robots_resp = client._no_check_request("GET", robots_url)
37
+ # pass url for output, but don't do read
38
+ parser = RobotFileParser(robots_url)
39
+ parser.parse(robots_resp.text.splitlines())
40
+ client._robots_for_domain[domain] = parser
41
+ if not client._robots_for_domain[domain].can_fetch(client._robots_ua, url):
42
+ client._rejected_action(url, client._robots_for_domain[domain])
43
+ # if action doesn't raise an exception, the request goes through
44
+ return client._no_check_request(*args, **kwargs)
45
+
46
+
47
+ RobotsRejectFunc = Callable[[str, RobotFileParser], None]
48
+
49
+
50
+ def make_robots_txt_client(
51
+ *,
52
+ client: Client | None = None,
53
+ as_user_agent: str | None = None,
54
+ on_rejection: RobotsRejectFunc = raise_robots_txt,
55
+ ) -> Client:
56
+ if client is None:
57
+ client = Client()
58
+
59
+ tclient = cast(RobotsClient, client)
60
+
61
+ tclient._robots_for_domain = {}
62
+ if as_user_agent:
63
+ tclient._robots_ua = as_user_agent
64
+ else:
65
+ tclient._robots_ua = tclient.headers["user-agent"]
66
+ tclient._rejected_action = on_rejection
67
+
68
+ tclient._no_check_request = client.request
69
+ tclient.request = types.MethodType(
70
+ functools.wraps(client.request)(_robot_check_request), client
71
+ )
72
+ return client
careful/httpx/throttle.py CHANGED
@@ -2,12 +2,21 @@ import time
2
2
  import types
3
3
  import functools
4
4
  import logging
5
- from httpx import Client
5
+ from typing import Protocol, cast, Callable
6
+ from httpx import Client, Response
6
7
 
7
- log = logging.getLogger("httpx")
8
+ log = logging.getLogger("careful")
8
9
 
9
10
 
10
- def _throttle_request(client: Client, *args, **kwargs):
11
+ class ThrottledClient(Protocol):
12
+ _last_request: float
13
+ _requests_per_minute: float
14
+ _request_frequency: float
15
+ _no_throttle_request: Callable
16
+ request: Callable
17
+
18
+
19
+ def _throttle_request(client: ThrottledClient, *args, **kwargs) -> Response:
11
20
  now = time.time()
12
21
  diff = client._request_frequency - (now - client._last_request)
13
22
  if diff > 0:
@@ -23,19 +32,21 @@ def make_throttled_client(
23
32
  *,
24
33
  client: Client | None = None,
25
34
  requests_per_minute: float = 0,
26
- ):
35
+ ) -> Client:
27
36
  if requests_per_minute <= 0:
28
37
  raise ValueError("requests per minute must be a positive number")
29
38
 
30
39
  if client is None:
31
40
  client = Client()
32
41
 
33
- client._last_request = 0.0
34
- client._requests_per_minute = requests_per_minute
35
- client._request_frequency = 60.0 / requests_per_minute
42
+ tclient = cast(ThrottledClient, client)
43
+
44
+ tclient._last_request = 0.0
45
+ tclient._requests_per_minute = requests_per_minute
46
+ tclient._request_frequency = 60.0 / requests_per_minute
36
47
 
37
- client._no_throttle_request = client.request
38
- client.request = types.MethodType(
48
+ tclient._no_throttle_request = client.request
49
+ tclient.request = types.MethodType(
39
50
  functools.wraps(client.request)(_throttle_request), client
40
51
  )
41
52
  return client
@@ -0,0 +1,76 @@
1
+ Metadata-Version: 2.4
2
+ Name: careful
3
+ Version: 0.3.1
4
+ Summary: a small library for writing resilient, well-behaved HTTP code
5
+ Project-URL: Repository, https://codeberg.org/jpt/careful
6
+ Author-email: jpt <dev@jpt.sh>
7
+ License: BSD-2-Clause
8
+ License-File: LICENSE
9
+ Classifier: Development Status :: 6 - Mature
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: BSD License
12
+ Classifier: Natural Language :: English
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Programming Language :: Python :: 3.14
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.10
21
+ Requires-Dist: httpx>=0.28.1
22
+ Description-Content-Type: text/markdown
23
+
24
+ # careful
25
+
26
+ <img src="https://jpt.sh/projects/careful/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
27
+
28
+ **careful** is a Python library for writing resilient, well-behaved HTTP clients.
29
+
30
+ **Code**: <https://codeberg.org/jpt/careful>
31
+
32
+ **Docs**: <https://jpt.sh/projects/careful/>
33
+
34
+ ![PyPI - Version](https://img.shields.io/pypi/v/careful)
35
+ [![status-badge](https://ci.codeberg.org/api/badges/15185/status.svg)](https://ci.codeberg.org/repos/15185)
36
+
37
+ Call one function to enchant an
38
+ **[httpx.Client](https://www.python-httpx.org)**, making your HTTP connections more resilient and better mannered.
39
+
40
+ - Configure **throttling** to avoid accidental Denial-of-Service / risking getting banned.
41
+ - **Retries** help overcome intermittent failures on flaky sites or long crawls.
42
+ - **Development caching** persists between runs during development, reducing redundant requests made while iterating on your crawlers & scrapers.
43
+
44
+ ### Example
45
+
46
+ ```python
47
+ from httpx import Client
48
+ from careful.httpx import make_careful_client, MemoryCache
49
+
50
+ # the only function you need to call is make_careful_client
51
+ # this wraps your existing `httpx.Client` with your preferred
52
+ # careful behaviors
53
+
54
+ client = make_careful_client(
55
+ client=Client(headers={'user-agent': 'spiderman/1.0'}),
56
+
57
+ # retries are configurable w/ exponential back off
58
+ retry_attempts=2,
59
+ retry_wait_seconds=5,
60
+
61
+ # can cache to process memory, filesystem, or SQLite
62
+ cache_storage=MemoryCache(),
63
+
64
+ # easy-to-configure throttling
65
+ requests_per_minute=60,
66
+ )
67
+
68
+ # methods on client are called as they always are
69
+ # configured behaviors occur without further code changes
70
+ client.get("https://example.com")
71
+ ```
72
+
73
+
74
+ ---
75
+
76
+ Logo licensed from [Adrien Coquet via Noun Project](https://thenounproject.com/icon/carefully-3681327/)
@@ -0,0 +1,12 @@
1
+ careful/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ careful/httpx/__init__.py,sha256=sIKMgGf-Ea5VEsEFqxXFbAiaF1tYOYm9CIOvSCPooqw,6674
3
+ careful/httpx/_types.py,sha256=jefYDxSbLRUatU8QKeyxStc9UC3AJwAba2SfhNkM0RY,151
4
+ careful/httpx/dev_cache.py,sha256=sfHX2TkKZQRTrOXSGSMgIWE0HbX3Fvuz88o8_jK9P8g,11316
5
+ careful/httpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ careful/httpx/retries.py,sha256=3kjuHKYnK1N4Rtum5gUyY_XO4o4cL4jc59d17Y6UwrI,2949
7
+ careful/httpx/robots.py,sha256=jfqQdplTap_RCENu6MHEIabFVznFLruMvSIaG_u0v_8,2168
8
+ careful/httpx/throttle.py,sha256=b1fbmUskcm343D1bbPbY-ITLdL1zVm1dXtjt9LT1bEA,1412
9
+ careful-0.3.1.dist-info/METADATA,sha256=fkI2V9YX5JKhPTYZga5q_3cyfatB4B_PMdTci3sRMmc,2692
10
+ careful-0.3.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
11
+ careful-0.3.1.dist-info/licenses/LICENSE,sha256=oHe4LmCuo6CZne42DRXfiR3uqqIfGsk4dAvDKucAi4M,1315
12
+ careful-0.3.1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.27.0
2
+ Generator: hatchling 1.28.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,71 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: careful
3
- Version: 0.2.0
4
- Summary: careful extensions to httpx: throttle, retry, cache
5
- Project-URL: Repository, https://codeberg.org/jpt/careful
6
- Author-email: jpt <dev@jpt.sh>
7
- License: BSD-2-Clause
8
- License-File: LICENSE
9
- Classifier: Development Status :: 6 - Mature
10
- Classifier: Intended Audience :: Developers
11
- Classifier: License :: OSI Approved :: BSD License
12
- Classifier: Natural Language :: English
13
- Classifier: Operating System :: OS Independent
14
- Classifier: Programming Language :: Python :: 3.10
15
- Classifier: Programming Language :: Python :: 3.11
16
- Classifier: Programming Language :: Python :: 3.12
17
- Classifier: Programming Language :: Python :: 3.13
18
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
- Requires-Python: >=3.10
20
- Requires-Dist: httpx>=0.28.1
21
- Requires-Dist: mkdocs-material>=9.6.18
22
- Requires-Dist: mkdocstrings-python>=1.18.2
23
- Requires-Dist: mkdocstrings>=0.30.0
24
- Requires-Dist: pytest-httpbin>=2.1.0
25
- Requires-Dist: pytest>=8.4.2
26
- Description-Content-Type: text/markdown
27
-
28
- # careful
29
-
30
- <img src="/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
31
-
32
- **careful** is a library for making requests to unreliable websites with httpx.
33
-
34
- **Code**: <https://codeberg.org/jpt/careful>
35
-
36
- **Docs**: <https://careful.jpt.sh>
37
-
38
- It offers enhancements to
39
- [`httpx.Client`](https://www.python-httpx.org)
40
- useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
41
-
42
- - **configurable retry support.** retry on timeouts or other errors, with exponential back-off.
43
- - **simple request throttling.** set a maximum number of requests per minute.
44
- - **development cache.** configurable caching aimed at reducing redundant requests made while authoring/testing web scrapers.
45
-
46
- ### example
47
-
48
- ```python
49
- from httpx import Client
50
- from careful.httpx import make_careful_client
51
-
52
- client = make_careful_client(
53
- # can configure httpx.Client however you usually would
54
- client=Client(headers={'user-agent': 'careful/1.0'}),
55
- # retries are configurable w/ exponential back off
56
- retry_attempts=2,
57
- retry_wait_seconds=5,
58
- # can cache to process memory, filesystem, or SQLite
59
- cache_storage=MemoryCache(),
60
- # requests will automatically be throttled to aim at this rate
61
- requests_per_minute=60,
62
- )
63
-
64
- # all normal methods on httpx.Client make use of configured enhancements
65
- client.get("https://example.com")
66
- ```
67
-
68
-
69
- ---
70
-
71
- Logo licensed from [Adrien Coquet via Noun Project](https://thenounproject.com/icon/carefully-3681327/)
@@ -1,11 +0,0 @@
1
- careful/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- careful/httpx/__init__.py,sha256=u-n0uKIWAd3NXsZUd1UA4wzJJTEhRR74diHzDV2EpEU,4885
3
- careful/httpx/_types.py,sha256=NwyQ-ItodN9HnO7d7b0M1M4M9y90TjRkhQFqNuypKRI,149
4
- careful/httpx/dev_cache.py,sha256=KR35u0CvutqTOWQ8pO-hzwbPy0lDBhShJfhCAbOvqv0,11032
5
- careful/httpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- careful/httpx/retries.py,sha256=Kszm0wDITyPZ3qx5TsDL__HjCYVJyAZ2WehrlpXV5Cc,2500
7
- careful/httpx/throttle.py,sha256=ZpuFABYHGQ4D0zks922SCXp7WZG_-Ysafz-Npa2QVwQ,1096
8
- careful-0.2.0.dist-info/METADATA,sha256=A82D5ltN7bDh1dXkOqdBLcW8fxxxqsonFgf9hZQlors,2541
9
- careful-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
10
- careful-0.2.0.dist-info/licenses/LICENSE,sha256=oHe4LmCuo6CZne42DRXfiR3uqqIfGsk4dAvDKucAi4M,1315
11
- careful-0.2.0.dist-info/RECORD,,