careful 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
careful/httpx/__init__.py CHANGED
@@ -1,41 +1,109 @@
- from .retries import make_retry_client, _default_accept_response
+ from .retries import make_retry_client, retry_default_rule
  from .throttle import make_throttled_client
  from .dev_cache import (
      make_dev_caching_client,
      MemoryCache,
      FileCache,
-     SQLiteCache,
+     SqliteCache,
+     CacheStorageBase,
      _cache_200s,
      _default_keyfunc,
  )
+ from ._types import ResponsePredicate, CacheKeyfunc
  from httpx import Client
  
  
  def make_careful_client(
-     client: Client,
      *,
+     client: Client | None = None,
      retry_attempts: int = 0,
      retry_wait_seconds: float = 10,
-     retry_on_404: bool = False,
-     accept_response=_default_accept_response,
+     should_retry: ResponsePredicate = retry_default_rule,
      requests_per_minute: int = 0,
-     cache_storage=None,
-     cache_write_only=False,
-     should_cache=_cache_200s,
-     cache_keyfunc=_default_keyfunc,
+     cache_storage: CacheStorageBase = None,
+     cache_write_only: bool = False,
+     should_cache: ResponsePredicate = _cache_200s,
+     cache_keyfunc: CacheKeyfunc = _default_keyfunc,
  ):
+     """
+     This function patches an `httpx.Client` so that all requests made with the client support
+     [retries](#retries), [throttling](#throttling), and [development caching](#development-caching).
+
+
+     Parameters:
+         client: A pre-configured `httpx.Client`. If omitted a default client will be created.
+
+         retry_attempts: Maximum number of retries. If non-zero will retry up to this many times
+             with increasing wait times, starting with `retry_wait_seconds`.
+
+         retry_wait_seconds: Number of seconds to sleep between first attempt and first retry.
+             Subsequent attempts will increase exponentially (2x, 4x, 8x, etc.)
+
+         should_retry: Predicate function that takes a `httpx.Response` and returns `True` if it should be retried.
+
+         requests_per_minute: Maximum number of requests per minute. (e.g. 30 will throttle to ~2s between requests)
+
+         cache_storage: An object that implements the [cache storage interface](#cache-storage).
+
+         cache_write_only: Update cache, but never read from it.
+
+         should_cache: Predicate function that takes a `httpx.Response` and returns `True` if it should be cached.
+
+         cache_keyfunc: Function that takes request details and returns a unique cache key.
+
+     ## Retries
+
+     If `retry_attempts` is set, responses will be passed to `should_retry`.
+     Responses that are rejected (return `True`) will be retried after a wait based on
+     `retry_wait_seconds`.
+     Each retry will wait twice as long as the one before.
+
+     ## Throttling
+
+     If `requests_per_minute` is set, standard (non-retry) requests will automatically
+     sleep for a short period to target the given rate.
+
+     For example, at 30rpm, the sleep time on a fast request will be close to 2 seconds.
+
+     ## Development Caching
+
+     Why **development caching?**
+
+     This feature is named as a reminder that **this is not true HTTP caching**, which
+     should take various headers into account. Look at libraries like [hishel](https://hishel.com) if that's what you are after.
+
+     The purpose of this feature is to allow you to cache all of your HTTP requests during development.
+     Often when writing a scraper or crawler, you wind up hitting the site you are working on more often than you'd like-- each time you iterate on your code you're likely making redundant requests to pages that haven't changed.
+
+     By caching all successful requests (configurable with the `should_cache` parameter),
+     you can easily re-run scrapers without making redundant HTTP requests.
+     This means faster development time & happier upstream servers.
+
+     To enable development caching, assign a [`MemoryCache`][careful.httpx.MemoryCache],
+     [`FileCache`][careful.httpx.FileCache], or [`SqliteCache`][careful.httpx.SqliteCache] to
+     the `cache_storage` property of a `scrapelib.Scraper`.
+
+     ---
+
+     When multiple features are applied, the order of wrapping ensures that:
+     - the cache is checked first, and bypasses throttling if hit
+     - retries use their own delays, but not throttled separately
+     """
+     if client is None:
+         client = Client()
      # order matters, retry on inside b/c it is last-chance scenario
      if retry_attempts:
          client = make_retry_client(
              client=client,
              attempts=retry_attempts,
              wait_seconds=retry_wait_seconds,
-             retry_on_404=retry_on_404,
-             accept_response=accept_response,
+             should_retry=should_retry,
          )
      # throttling around retries
      if requests_per_minute:
-         client = make_throttled_client(client, requests_per_minute=requests_per_minute)
+         client = make_throttled_client(
+             client=client, requests_per_minute=requests_per_minute
+         )
      # caching on top layer, so cache will be checked first
      if cache_storage:
          client = make_dev_caching_client(
@@ -55,5 +123,5 @@ __all__ = [
      "make_dev_caching_client",
      "MemoryCache",
      "FileCache",
-     "SQLiteCache",
+     "SqliteCache",
  ]
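For reference, a minimal sketch of how a caller might use the renamed 0.2.0 parameters shown above. The parameter values and the `_cache.db` path are illustrative; `SqliteCache` is exported from `careful.httpx` per the `__all__` change, and `retry_only_500s` comes from the `careful/httpx/retries.py` changes later in this diff.

```python
from careful.httpx import make_careful_client, SqliteCache
from careful.httpx.retries import retry_only_500s

# should_retry replaces the 0.1.0 retry_on_404/accept_response pair;
# SqliteCache replaces the old SQLiteCache name.
client = make_careful_client(
    retry_attempts=3,
    retry_wait_seconds=5,          # waits ~5s, 10s, 20s before each retry
    should_retry=retry_only_500s,  # only retry server errors
    requests_per_minute=30,        # ~2 seconds between requests
    cache_storage=SqliteCache("_cache.db"),
)
resp = client.get("https://example.com")
```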
careful/httpx/_types.py ADDED
@@ -0,0 +1,6 @@
+ from httpx import Response
+ from typing import Callable
+
+ ResponsePredicate = Callable[[Response], bool]
+
+ CacheKeyfunc = Callable[[str,str,dict], str]
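These aliases describe the callables the rest of the package expects. A hedged sketch of custom implementations (the function names here are made up; the `(method, url, params)` argument order follows the `_cache_keyfunc` call in `dev_cache.py` below):

```python
from httpx import Response

def retry_on_rate_limits(response: Response) -> bool:
    # a ResponsePredicate: True means "retry this response"
    return response.status_code in (429, 503)

def key_ignoring_params(method: str, url: str, params: dict) -> str:
    # a CacheKeyfunc: collapse requests that differ only by params
    return f"{method} {url}"
```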
careful/httpx/dev_cache.py CHANGED
@@ -1,3 +1,4 @@
+ import abc
  import types
  import functools
  import logging
@@ -40,13 +41,6 @@ def _cache_200s(response: Response) -> bool:
  
  
  def _cached_request(client: Client, *args, **kwargs):
-     # short circuit if cache isn't configured
-     if not client._cache_storage:
-         log.debug("bypassing cache, no storage configured")
-         resp = client._wrapped_request(*args, **kwargs)
-         resp.fromcache = False
-         return resp
-
      method, url = args
      request_key = client._cache_keyfunc(method, url, kwargs["params"])
  
@@ -61,7 +55,7 @@ def _cached_request(client: Client, *args, **kwargs):
          cached_resp.fromcache = True
          resp = cached_resp
      else:
-         resp = client._wrapped_request(*args, **kwargs)
+         resp = client._no_cache_request(*args, **kwargs)
      # save to cache if request and response meet criteria
      log.debug("XX %s %s", request_key, client._should_cache(resp))
      if request_key and client._should_cache(resp):
@@ -80,6 +74,27 @@ def make_dev_caching_client(
      should_cache=_cache_200s,
      write_only=False,
  ):
+     """
+     Returns an enhanced `httpx.Client` where requests are saved to a
+     specified cache.
+
+     This is denoted as a "dev_cache" because it is not intended to be a true
+     HTTP cache, respecting cache headers/etc. If you are looking for that
+     behavior, there are httpx libraries for that explicit purpose.
+
+     Instead, the purpose of this cache is to make it possible to test scrapers
+     locally without making hundreds of redundant requests.
+
+     The strategy is configurable via `cache_keyfunc` and `should_cache`.
+
+     The default strategy is simple:
+     cache all GET requests that result in 200s, with no expiry.
+
+     This works well for the case where you have hundreds of pages to scrape
+     and want to make scraper adjustments without repeatedly making hits.
+
+     It should *NOT* be used in production without adjusting these rules.
+     """
      if client is None:
          client = Client()
  
@@ -88,23 +103,34 @@
      client._should_cache = should_cache
      client._write_only = write_only
  
-     client._wrapped_request = client.request
+     client._no_cache_request = client.request
      client.request = types.MethodType(
          functools.wraps(client.request)(_cached_request), client
      )
      return client
  
  
- class CacheStorageBase:
+ class CacheStorageBase(abc.ABC):
+     @abc.abstractmethod
      def get(self, key: str) -> None | Response:
          raise NotImplementedError()
  
+     @abc.abstractmethod
      def set(self, key: str, response: Response) -> None:
          raise NotImplementedError()
  
  
  class MemoryCache(CacheStorageBase):
-     """In memory cache for request responses."""
+     """
+     In memory cache for request responses.
+
+     Example:
+
+         make_careful_client(
+             cache_storage=MemoryCache(),
+         )
+
+     """
  
      def __init__(self) -> None:
          self.cache: dict[str, Response] = {}
@@ -122,11 +148,21 @@ class FileCache(CacheStorageBase):
      """
      File-based cache for request responses.
  
-     :param cache_dir: directory for storing responses
-     :param check_last_modified: set to True to compare last-modified
-         timestamp in cached response with value from HEAD request
+     Parameters:
+         cache_dir: directory for storing responses
+
+     Example:
+
+         make_careful_client(
+             cache_storage=FileCache("_httpcache/"),
+         )
+
      """
  
+     # TODO: restore?
+     # check_last_modified: set to True to compare last-modified
+     # timestamp in cached response with value from HEAD request
+
      # file name escaping inspired by httplib2
      _prefix = re.compile(r"^\w+://")
      _illegal = re.compile(r"[?/:|]+")
@@ -188,7 +224,7 @@ class FileCache(CacheStorageBase):
          # status & encoding will be in headers, but are faked
          # need to split spaces out of status to get code (e.g. '200 OK')
          resp = Response(
-             status_code = int(resp_headers.pop("status").split(" ")[0]),
+             status_code=int(resp_headers.pop("status").split(" ")[0]),
              content=resp_content,
              default_encoding=resp_headers.pop("encoding"),
              headers=resp_headers,
@@ -224,13 +260,18 @@
          os.remove(fname)
  
  
- class SQLiteCache(CacheStorageBase):
-     """SQLite cache for request responses.
+ class SqliteCache(CacheStorageBase):
+     """
+     sqlite cache for request responses.
+
+     Parameters:
+         cache_path: path for SQLite database file
  
-     :param cache_path: path for SQLite database file
-     :param check_last_modified: set to True to compare last-modified
-         timestamp in cached response with value from HEAD request
+     Example:
  
+         make_careful_client(
+             cache_storage=SQLiteCache("_cache.db"),
+         )
      """
  
      _columns = ["key", "status", "modified", "encoding", "data", "headers"]
@@ -284,7 +325,12 @@ class SQLiteCache(CacheStorageBase):
          # if rec["modified"] != new_lm:
          # return None
  
-         resp = Response(rec["status"], content=rec["data"], default_encoding=rec["encoding"], headers=json.loads(rec["headers"]))
+         resp = Response(
+             rec["status"],
+             content=rec["data"],
+             default_encoding=rec["encoding"],
+             headers=json.loads(rec["headers"]),
+         )
          return resp
  
      def clear(self) -> None:
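Since `CacheStorageBase` is now an `abc.ABC`, a third-party backend only needs to provide `get` and `set`. A minimal sketch under that assumption (the `NullCache` class is hypothetical, not part of the package):

```python
from httpx import Response
from careful.httpx.dev_cache import CacheStorageBase

class NullCache(CacheStorageBase):
    """A do-nothing backend: every lookup misses, writes are discarded."""

    def get(self, key: str) -> None | Response:
        return None

    def set(self, key: str, response: Response) -> None:
        pass
```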
careful/httpx/retries.py CHANGED
@@ -2,13 +2,22 @@ import time
  import types
  import functools
  import logging
- from httpx import Client, Response
+ from httpx import Client, Response, HTTPError
  
  log = logging.getLogger("httpx")
  
  
- def _default_accept_response(response: Response) -> bool:
-     return response.status_code < 400
+ def retry_default_rule(response: Response) -> bool:
+     # default behavior is to retry 400s and 500s but not 404s
+     return response.status_code >= 400 and response.status_code != 404
+
+
+ def retry_only_500s(response: Response) -> bool:
+     return response.status_code >= 500
+
+
+ def retry_all_400s_500s(response: Response) -> bool:
+     return response.status_code >= 400
  
  
  def _retry_request(client: Client, *args, **kwargs):
@@ -20,24 +29,21 @@ def _retry_request(client: Client, *args, **kwargs):
          exception_raised = None
  
          try:
-             resp = client._wrapped_request(*args, **kwargs)
+             tries += 1
+             resp = client._no_retry_request(*args, **kwargs)
  
              # break from loop on an accepted response
-             if client._accept_response(resp) or (
-                 resp.status_code == 404 and not client._retry_on_404
-             ):
+             if not client._should_retry(resp):
                  break
  
-         except Exception as e:
-             # TODO: exclude certain kinds of exceptions (SSL?) from retry
+         except HTTPError as e:
              exception_raised = e
  
              if exception_response := getattr(e, "response", None):
-                 if client._accept_response(exception_response):
+                 if not client._should_retry(exception_response):
                      break
  
          # if we're going to retry, sleep first
-         tries += 1
          if tries <= client._retry_attempts:
              # twice as long each time
              wait = client._retry_wait_seconds * (2 ** (tries - 1))
@@ -68,20 +74,17 @@
      client: Client | None = None,
      attempts: int = 1,
      wait_seconds: float = 10,
-     retry_on_404: bool = False,
-     accept_response=_default_accept_response,
+     should_retry=retry_default_rule,
  ):
      if client is None:
          client = Client()
      client._retry_attempts = max(0, attempts)
      client._retry_wait_seconds = wait_seconds
-     client._retry_on_404 = retry_on_404
-     client._accept_response = accept_response
+     client._should_retry = should_retry
  
-     client._wrapped_request = client.request
+     client._no_retry_request = client.request
      client.request = types.MethodType(
          functools.wraps(client.request)(_retry_request), client
      )
  
      return client
-
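The retry loop now counts the attempt before calling `_no_retry_request`, so the delay computed as `wait = retry_wait_seconds * (2 ** (tries - 1))` doubles on each retry. A quick illustration with the default `retry_wait_seconds=10`:

```python
retry_wait_seconds = 10
for tries in (1, 2, 3):
    print(tries, retry_wait_seconds * (2 ** (tries - 1)))
# 1 10   sleep before the first retry
# 2 20   before the second retry
# 3 40   before the third retry
```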
careful/httpx/throttle.py CHANGED
@@ -16,7 +16,7 @@ def _throttle_request(client: Client, *args, **kwargs):
              client._last_request = time.time()
          else:
              client._last_request = now
-     return client._wrapped_request(*args, **kwargs)
+     return client._no_throttle_request(*args, **kwargs)
  
  
  def make_throttled_client(
@@ -34,7 +34,7 @@ def make_throttled_client(
      client._requests_per_minute = requests_per_minute
      client._request_frequency = 60.0 / requests_per_minute
  
-     client._wrapped_request = client.request
+     client._no_throttle_request = client.request
      client.request = types.MethodType(
          functools.wraps(client.request)(_throttle_request), client
      )
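`make_throttled_client` converts `requests_per_minute` into a spacing of `60.0 / requests_per_minute` seconds, which `_throttle_request` uses as the minimum gap between requests. A small illustration of that arithmetic:

```python
for rpm in (30, 60, 120):
    print(rpm, 60.0 / rpm)  # minimum seconds between requests
# 30 2.0
# 60 1.0
# 120 0.5
```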
careful-0.2.0.dist-info/METADATA ADDED
@@ -0,0 +1,71 @@
+ Metadata-Version: 2.4
+ Name: careful
+ Version: 0.2.0
+ Summary: careful extensions to httpx: throttle, retry, cache
+ Project-URL: Repository, https://codeberg.org/jpt/careful
+ Author-email: jpt <dev@jpt.sh>
+ License: BSD-2-Clause
+ License-File: LICENSE
+ Classifier: Development Status :: 6 - Mature
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: BSD License
+ Classifier: Natural Language :: English
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Requires-Python: >=3.10
+ Requires-Dist: httpx>=0.28.1
+ Requires-Dist: mkdocs-material>=9.6.18
+ Requires-Dist: mkdocstrings-python>=1.18.2
+ Requires-Dist: mkdocstrings>=0.30.0
+ Requires-Dist: pytest-httpbin>=2.1.0
+ Requires-Dist: pytest>=8.4.2
+ Description-Content-Type: text/markdown
+
+ # careful
+
+ <img src="/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
+
+ **careful** is a library for making requests to unreliable websites with httpx.
+
+ **Code**: <https://codeberg.org/jpt/careful>
+
+ **Docs**: <https://careful.jpt.sh>
+
+ It offers enhancements to
+ [`httpx.Client`](https://www.python-httpx.org)
+ useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+
+ - **configurable retry support.** retry on timeouts or other errors, with exponential back-off.
+ - **simple request throttling.** set a maximum number of requests per minute.
+ - **development cache.** configurable caching aimed at reducing redundant requests made while authoring/testing web scrapers.
+
+ ### example
+
+ ```python
+ from httpx import Client
+ from careful.httpx import make_careful_client
+
+ client = make_careful_client(
+     # can configure httpx.Client however you usually would
+     client=Client(headers={'user-agent': 'careful/1.0'}),
+     # retries are configurable w/ exponential back off
+     retry_attempts=2,
+     retry_wait_seconds=5,
+     # can cache to process memory, filesystem, or SQLite
+     cache_storage=MemoryCache(),
+     # requests will automatically be throttled to aim at this rate
+     requests_per_minute=60,
+ )
+
+ # all normal methods on httpx.Client make use of configured enhancements
+ client.get("https://example.com")
+ ```
+
+
+ ---
+
+ Logo licensed from [Adrien Coquet via Noun Project](https://thenounproject.com/icon/carefully-3681327/)
careful-0.2.0.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+ careful/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ careful/httpx/__init__.py,sha256=u-n0uKIWAd3NXsZUd1UA4wzJJTEhRR74diHzDV2EpEU,4885
+ careful/httpx/_types.py,sha256=NwyQ-ItodN9HnO7d7b0M1M4M9y90TjRkhQFqNuypKRI,149
+ careful/httpx/dev_cache.py,sha256=KR35u0CvutqTOWQ8pO-hzwbPy0lDBhShJfhCAbOvqv0,11032
+ careful/httpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ careful/httpx/retries.py,sha256=Kszm0wDITyPZ3qx5TsDL__HjCYVJyAZ2WehrlpXV5Cc,2500
+ careful/httpx/throttle.py,sha256=ZpuFABYHGQ4D0zks922SCXp7WZG_-Ysafz-Npa2QVwQ,1096
+ careful-0.2.0.dist-info/METADATA,sha256=A82D5ltN7bDh1dXkOqdBLcW8fxxxqsonFgf9hZQlors,2541
+ careful-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ careful-0.2.0.dist-info/licenses/LICENSE,sha256=oHe4LmCuo6CZne42DRXfiR3uqqIfGsk4dAvDKucAi4M,1315
+ careful-0.2.0.dist-info/RECORD,,
careful-0.1.0.dist-info/METADATA REMOVED
@@ -1,48 +0,0 @@
- Metadata-Version: 2.4
- Name: careful
- Version: 0.1.0
- Summary: Add your description here
- Author-email: jpt <dev@jpt.sh>
- License-File: LICENSE
- Requires-Python: >=3.13
- Requires-Dist: httpx>=0.28.1
- Requires-Dist: pytest-httpbin>=2.1.0
- Requires-Dist: pytest>=8.4.2
- Description-Content-Type: text/markdown
-
- **careful_httpx** is a library for making requests to less-than-reliable websites.
-
- It is based on [scrapelib](https://pypi.org/scrapelib/), which has powered Open States & many other Python scrapers for over 15 years.
-
- Code: <https://codeberg.org/jpt/careful_httpx>
-
- Documentation: TODO
-
- ## Features
-
- Enhances [`httpx.Client`](https://www.python-httpx.org) with features useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
-
- retries
- throttling
- dev-cache for iterating on scrapers
-
- ### example
-
- TODO
-
- ### features this has that scrapelib doesn't
-
- httpx support
- composable interface, can augment Client with just the enhancements you want
-
- TODO: don't allow instantiating bad patch classes, and check for incompatible configs
-
- ### features scrapelib had that this doesn't
-
- Open to considering if there is interest, but didn't seem necessary.
-
- HTTP(S) and FTP requests via an identical API
- allow setting custom ciphers
- have urlretrieve
- support FTP
- set custom user-agent/mess w/ headers
careful-0.1.0.dist-info/RECORD REMOVED
@@ -1,10 +0,0 @@
- careful/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- careful/httpx/__init__.py,sha256=gDSnAnqxFt9mLi2laArt7BUn_wPU5ub0k9zeqsexYJY,1605
- careful/httpx/dev_cache.py,sha256=_jwpnf1fzBzR23Of2HdsyiN_MPHvRG0gtM49Y4qRtQg,10031
- careful/httpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- careful/httpx/retries.py,sha256=mMIZf-EP9bhzEZnEmwtjWZ2qdl6ZCJ7vq3hZltT6Zms,2458
- careful/httpx/throttle.py,sha256=wCJWHERr5manyKq07ZdonmxbK0oh0PYJgO6a94IzN0s,1088
- careful-0.1.0.dist-info/METADATA,sha256=wBFvqh5xyMfNRVB8Jg9Aa3s5_Te2NxcBy5BE-3NMYeY,1373
- careful-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- careful-0.1.0.dist-info/licenses/LICENSE,sha256=oHe4LmCuo6CZne42DRXfiR3uqqIfGsk4dAvDKucAi4M,1315
- careful-0.1.0.dist-info/RECORD,,