careful 0.1.0.tar.gz → 0.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,8 @@ wheels/
  *.egg-info
  uv.lock

+ site/
+
  # Virtual environments
  .venv

careful-0.2.0/Justfile ADDED
@@ -0,0 +1,16 @@
+ test:
+ uv run pytest
+
+ lint:
+ uv run ruff check
+
+ preview:
+ uv run mkdocs serve
+
+ publish:
+ uv build
+ uv publish
+
+ deploy:
+ uv run mkdocs build
+ netlify deploy --prod -s careful-docs -d site
careful-0.2.0/PKG-INFO ADDED
@@ -0,0 +1,71 @@
+ Metadata-Version: 2.4
+ Name: careful
+ Version: 0.2.0
+ Summary: careful extensions to httpx: throttle, retry, cache
+ Project-URL: Repository, https://codeberg.org/jpt/careful
+ Author-email: jpt <dev@jpt.sh>
+ License: BSD-2-Clause
+ License-File: LICENSE
+ Classifier: Development Status :: 6 - Mature
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: BSD License
+ Classifier: Natural Language :: English
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Requires-Python: >=3.10
+ Requires-Dist: httpx>=0.28.1
+ Requires-Dist: mkdocs-material>=9.6.18
+ Requires-Dist: mkdocstrings-python>=1.18.2
+ Requires-Dist: mkdocstrings>=0.30.0
+ Requires-Dist: pytest-httpbin>=2.1.0
+ Requires-Dist: pytest>=8.4.2
+ Description-Content-Type: text/markdown
+
+ # careful
+
+ <img src="/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
+
+ **careful** is a library for making requests to unreliable websites with httpx.
+
+ **Code**: <https://codeberg.org/jpt/careful>
+
+ **Docs**: <https://careful.jpt.sh>
+
+ It offers enhancements to
+ [`httpx.Client`](https://www.python-httpx.org)
+ useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+
+ - **configurable retry support.** retry on timeouts or other errors, with exponential back-off.
+ - **simple request throttling.** set a maximum number of requests per minute.
+ - **development cache.** configurable caching aimed at reducing redundant requests made while authoring/testing web scrapers.
+
+ ### example
+
+ ```python
+ from httpx import Client
+ from careful.httpx import make_careful_client, MemoryCache
+
+ client = make_careful_client(
+ # can configure httpx.Client however you usually would
+ client=Client(headers={'user-agent': 'careful/1.0'}),
+ # retries are configurable w/ exponential back off
+ retry_attempts=2,
+ retry_wait_seconds=5,
+ # can cache to process memory, filesystem, or SQLite
+ cache_storage=MemoryCache(),
+ # requests will automatically be throttled to aim at this rate
+ requests_per_minute=60,
+ )
+
+ # all normal methods on httpx.Client make use of configured enhancements
+ client.get("https://example.com")
+ ```
+
+
+ ---
+
+ Logo licensed from [Adrien Coquet via Noun Project](https://thenounproject.com/icon/carefully-3681327/)
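
The example above uses an in-memory cache; the same factory accepts the file- and SQLite-backed storages mentioned in the feature list. A minimal sketch, assuming the `FileCache` and `SqliteCache` classes exported by `careful.httpx` later in this diff:

```python
from careful.httpx import make_careful_client, FileCache, SqliteCache

# persist responses across runs as individual files on disk...
client = make_careful_client(cache_storage=FileCache("_httpcache/"))

# ...or in a single SQLite database file
client = make_careful_client(cache_storage=SqliteCache("_cache.db"))

client.get("https://example.com")
```
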
@@ -0,0 +1,44 @@
+ # careful
+
+ <img src="/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
+
+ **careful** is a library for making requests to unreliable websites with httpx.
+
+ **Code**: <https://codeberg.org/jpt/careful>
+
+ **Docs**: <https://careful.jpt.sh>
+
+ It offers enhancements to
+ [`httpx.Client`](https://www.python-httpx.org)
+ useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+
+ - **configurable retry support.** retry on timeouts or other errors, with exponential back-off.
+ - **simple request throttling.** set a maximum number of requests per minute.
+ - **development cache.** configurable caching aimed at reducing redundant requests made while authoring/testing web scrapers.
+
+ ### example
+
+ ```python
+ from httpx import Client
+ from careful.httpx import make_careful_client, MemoryCache
+
+ client = make_careful_client(
+ # can configure httpx.Client however you usually would
+ client=Client(headers={'user-agent': 'careful/1.0'}),
+ # retries are configurable w/ exponential back off
+ retry_attempts=2,
+ retry_wait_seconds=5,
+ # can cache to process memory, filesystem, or SQLite
+ cache_storage=MemoryCache(),
+ # requests will automatically be throttled to aim at this rate
+ requests_per_minute=60,
+ )
+
+ # all normal methods on httpx.Client make use of configured enhancements
+ client.get("https://example.com")
+ ```
+
+
+ ---
+
+ Logo licensed from [Adrien Coquet via Noun Project](https://thenounproject.com/icon/carefully-3681327/)
@@ -0,0 +1,9 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <svg width="1200pt" height="1200pt" version="1.1" viewBox="0 0 1200 1200" xmlns="http://www.w3.org/2000/svg">
+ <path d="m600 195.6c19.199 0 34.801-15.602 34.801-34.801v-96c0-19.199-15.602-34.801-34.801-34.801s-34.801 15.602-34.801 34.801v97.199c0 18 15.602 33.602 34.801 33.602z" fill="#ff814a"/>
+ <path d="m734.4 225.6c4.8008 2.3984 9.6016 3.6016 14.398 3.6016 13.199 0 25.199-7.1992 31.199-20.398l40.801-87.602c8.3984-16.801 0-37.199-16.801-45.602-16.801-8.3984-37.199 0-45.602 16.801l-39.594 87.598c-8.4023 16.801-1.1992 37.199 15.598 45.602z" fill="#ff814a"/>
+ <path d="m420 208.8c6 12 18 20.398 31.199 20.398 4.8008 0 9.6016-1.1992 14.398-3.6016 16.801-8.3984 25.199-28.801 16.801-45.602l-40.801-87.602c-8.3984-16.801-28.801-25.199-45.602-16.801-16.801 8.3984-25.199 28.801-16.801 45.602z" fill="#ff814a"/>
+ <path d="m632.4 746.4c0 17.895-14.504 32.402-32.398 32.402s-32.398-14.508-32.398-32.402c0-17.891 14.504-32.398 32.398-32.398s32.398 14.508 32.398 32.398z" fill="#ff814a"/>
+ <path d="m598.8 691.2s1.1992 0 0 0c13.199 0 22.801-9.6016 24-21.602l6-133.2v-2.3984c-1.1992-15.602-14.398-27.602-30-27.602-15.602 1.1992-27.602 14.398-27.602 30l6 133.2c1.1992 12.004 9.6016 21.602 21.602 21.602z" fill="#ff814a"/>
+ <path d="m871.2 333.6c-4.8008-24-25.199-42-50.398-42h-441.6c-24 0-45.602 18-50.398 42l-150 806.4c-2.3984 15.602 9.6016 30 25.199 30h76.801c12 0 22.801-8.3984 25.199-20.398l32.398-171.6h526.8l32.398 171.6c2.3984 12 13.199 20.398 25.199 20.398h73.203c15.602 0 27.602-14.398 25.199-30zm-87.598 494.4h-367.2c-33.602 0-54-36-37.199-64.801l183.6-315.6c16.801-28.801 57.602-28.801 74.398 0l183.6 315.6c15.598 28.801-4.8008 64.801-37.199 64.801z" fill="#ff814a"/>
+ </svg>
@@ -0,0 +1,29 @@
+ # Changelog
+
+ ## 0.2.0 - 6 September 2025
+
+ - Initial release, mostly a port of `scrapelib` functionality.
+
+ ## scrapelib
+
+ The original version of this library is a port of `scrapelib` (2.4.1).
+
+ Changes from this version were to:
+
+ - use `httpx` instead of `requests`
+ - drop quite a few unnecessary features that were mainly in `scrapelib` for backwards-compatibility reasons.
+ - use a composable interface instead of the inheritance-based one from `scrapelib`, aiming at making future enhancements/porting easier.
+
+ This library is a partial rewrite of [scrapelib](https://pypi.org/project/scrapelib/).
+ Thanks to all of [scrapelib's original contributors](https://github.com/jamesturk/scrapelib/graphs/contributors) and users.
+
+ `scrapelib` originally wrapped `urllib2`, eventually migrating to `requests`.
+
+ There are a few things that scrapelib did that this doesn't:
+
+ - support FTP requests via HTTP-like API
+ - extend the client with a `urlretrieve` function
+ - provide helpers for working with headers, timeouts, and custom ciphers
+
+ The first two are possible but didn't seem necessary at the moment.
+ The third was very `requests`-specific, and so hasn't been replicated here.
@@ -0,0 +1,45 @@
+ # careful
+
+ <img src="/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
+
+ **careful** is a library for making requests to unreliable websites with httpx.
+
+ **Code**: <https://codeberg.org/jpt/careful>
+
+ **Docs**: <https://careful.jpt.sh>
+
+ It offers enhancements to
+ [`httpx.Client`](https://www.python-httpx.org)
+ useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+
+ - **configurable retry support.** retry on timeouts or other errors, with exponential back-off.
+ - **simple request throttling.** set a maximum number of requests per minute.
+ - **development cache.** configurable caching aimed at reducing redundant requests made while authoring/testing web scrapers.
+
+ ### example
+
+ ```python
+ from httpx import Client
+ from careful.httpx import make_careful_client, MemoryCache
+
+ client = make_careful_client(
+ # can configure httpx.Client however you usually would
+ client=Client(headers={'user-agent': 'careful/1.0'}),
+ # retries are configurable w/ exponential back off
+ retry_attempts=2,
+ retry_wait_seconds=5,
+ # can cache to process memory, filesystem, or SQLite
+ cache_storage=MemoryCache(),
+ # requests will automatically be throttled to aim at this rate
+ requests_per_minute=60,
+ )
+
+ # all normal methods on httpx.Client make use of configured enhancements
+ client.get("https://example.com")
+ ```
+
+
+ ---
+
+ Logo licensed from [Adrien Coquet via Noun Project](https://thenounproject.com/icon/carefully-3681327/)
+
@@ -0,0 +1,34 @@
+ # Usage
+
+ Most users will only need to call `careful.httpx.make_careful_client`.
+
+ ::: careful.httpx.make_careful_client
+ options:
+ annotations_path: brief
+ show_signature: false
+ show_root_heading: true
+
+
+ ## cache storage
+
+
+ ::: careful.httpx.MemoryCache
+ options:
+ heading_level: 3
+ members: False
+ show_root_heading: true
+
+ ::: careful.httpx.FileCache
+ options:
+ heading_level: 3
+ members: False
+ show_root_heading: true
+
+ ::: careful.httpx.SqliteCache
+ options:
+ heading_level: 3
+ members: False
+ show_root_heading: true
+
+
+ ## Individual Wrappers
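
The "Individual Wrappers" heading above refers to the factories that `make_careful_client` composes; a minimal sketch of applying only the ones you want, assuming the `make_retry_client` and `make_throttled_client` signatures shown later in this diff:

```python
from httpx import Client
from careful.httpx import make_retry_client, make_throttled_client

# retry up to 2 times, waiting 5s then 10s, with no caching at all
client = make_retry_client(client=Client(), attempts=2, wait_seconds=5)

# then cap the combined client at roughly 30 requests per minute
client = make_throttled_client(client=client, requests_per_minute=30)

client.get("https://example.com")
```
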
@@ -0,0 +1,54 @@
+ site_name: careful
+ site_url: https://careful.jpt.sh/
+ site_author: James Turk
+ site_description: A library for making requests to unreliable sites with httpx.
+ copyright: Copyright &copy; 2025 James Turk
+ repo_url: https://codeberg.org/jpt/careful
+ repo_name: careful
+ #edit_uri: edit/main/docs/
+
+ theme:
+ name: material
+ logo: carefully-3681327.svg
+ palette:
+ - scheme: default
+ primary: teal
+ accent: teal
+ toggle:
+ icon: material/toggle-switch-off-outline
+ name: Switch to dark mode
+ - scheme: slate
+ primary: teal
+ accent: teal
+ toggle:
+ icon: material/toggle-switch
+ name: Switch to light mode
+
+ features:
+ #- navigation.tabs
+ - navigation.sections
+ - navigation.top
+ - content.tabs.link
+ # icon:
+ # repo:
+ markdown_extensions:
+ - admonition
+ - def_list
+ - pymdownx.highlight:
+ anchor_linenums: true
+ line_spans: __span
+ pygments_lang_class: true
+ - pymdownx.inlinehilite
+ - pymdownx.snippets
+ - pymdownx.superfences
+ - toc:
+ permalink: true
+ plugins:
+ - search
+ - mkdocstrings:
+ watch:
+ - src
+ nav:
+ - 'index.md'
+ - 'reference.md'
+ - 'changelog.md'
@@ -0,0 +1,37 @@
+ [project]
+ name = "careful"
+ version = "0.2.0"
+ description = "careful extensions to httpx: throttle, retry, cache"
+ readme = "README.md"
+ authors = [
+ { name = "jpt", email = "dev@jpt.sh" }
+ ]
+ license = {text = "BSD-2-Clause"}
+ requires-python = ">=3.10"
+ classifiers = [
+ "Development Status :: 6 - Mature",
+ "Intended Audience :: Developers",
+ "License :: OSI Approved :: BSD License",
+ "Natural Language :: English",
+ "Operating System :: OS Independent",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13",
+ "Topic :: Software Development :: Libraries :: Python Modules",
+ ]
+ dependencies = [
+ "httpx>=0.28.1",
+ "mkdocs-material>=9.6.18",
+ "mkdocstrings>=0.30.0",
+ "mkdocstrings-python>=1.18.2",
+ "pytest>=8.4.2",
+ "pytest-httpbin>=2.1.0",
+ ]
+ [project.urls]
+ Repository = "https://codeberg.org/jpt/careful"
+
+
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
@@ -0,0 +1,127 @@
+ from .retries import make_retry_client, retry_default_rule
+ from .throttle import make_throttled_client
+ from .dev_cache import (
+ make_dev_caching_client,
+ MemoryCache,
+ FileCache,
+ SqliteCache,
+ CacheStorageBase,
+ _cache_200s,
+ _default_keyfunc,
+ )
+ from ._types import ResponsePredicate, CacheKeyfunc
+ from httpx import Client
+
+
+ def make_careful_client(
+ *,
+ client: Client | None = None,
+ retry_attempts: int = 0,
+ retry_wait_seconds: float = 10,
+ should_retry: ResponsePredicate = retry_default_rule,
+ requests_per_minute: int = 0,
+ cache_storage: CacheStorageBase | None = None,
+ cache_write_only: bool = False,
+ should_cache: ResponsePredicate = _cache_200s,
+ cache_keyfunc: CacheKeyfunc = _default_keyfunc,
+ ):
+ """
+ This function patches an `httpx.Client` so that all requests made with the client support
+ [retries](#retries), [throttling](#throttling), and [development caching](#development-caching).
+
+
+ Parameters:
+ client: A pre-configured `httpx.Client`. If omitted, a default client will be created.
+
+ retry_attempts: Maximum number of retries. If non-zero, will retry up to this many times
+ with increasing wait times, starting with `retry_wait_seconds`.
+
+ retry_wait_seconds: Number of seconds to sleep between first attempt and first retry.
+ Subsequent attempts will increase exponentially (2x, 4x, 8x, etc.)
+
+ should_retry: Predicate function that takes an `httpx.Response` and returns `True` if it should be retried.
+
+ requests_per_minute: Maximum number of requests per minute. (e.g. 30 will throttle to ~2s between requests)
+
+ cache_storage: An object that implements the [cache storage interface](#cache-storage).
+
+ cache_write_only: Update the cache, but never read from it.
+
+ should_cache: Predicate function that takes an `httpx.Response` and returns `True` if it should be cached.
+
+ cache_keyfunc: Function that takes request details and returns a unique cache key.
+
+ ## Retries
+
+ If `retry_attempts` is set, responses will be passed to `should_retry`.
+ Responses that are rejected (return `True`) will be retried after a wait based on
+ `retry_wait_seconds`.
+ Each retry will wait twice as long as the one before.
+
+ ## Throttling
+
+ If `requests_per_minute` is set, standard (non-retry) requests will automatically
+ sleep for a short period to target the given rate.
+
+ For example, at 30rpm, the sleep time on a fast request will be close to 2 seconds.
+
+ ## Development Caching
+
+ Why **development caching?**
+
+ This feature is named as a reminder that **this is not true HTTP caching**, which
+ should take various headers into account. Look at libraries like [hishel](https://hishel.com) if that's what you are after.
+
+ The purpose of this feature is to allow you to cache all of your HTTP requests during development.
+ Often when writing a scraper or crawler, you wind up hitting the site you are working on more often than you'd like; each time you iterate on your code you're likely making redundant requests to pages that haven't changed.
+
+ By caching all successful requests (configurable with the `should_cache` parameter),
+ you can easily re-run scrapers without making redundant HTTP requests.
+ This means faster development time & happier upstream servers.
+
+ To enable development caching, pass a [`MemoryCache`][careful.httpx.MemoryCache],
+ [`FileCache`][careful.httpx.FileCache], or [`SqliteCache`][careful.httpx.SqliteCache] as
+ the `cache_storage` parameter of `make_careful_client`.
+
+ ---
+
+ When multiple features are applied, the order of wrapping ensures that:
+ - the cache is checked first, and bypasses throttling if hit
+ - retries use their own delays, but are not throttled separately
+ """
+ if client is None:
+ client = Client()
+ # order matters, retry on inside b/c it is last-chance scenario
+ if retry_attempts:
+ client = make_retry_client(
+ client=client,
+ attempts=retry_attempts,
+ wait_seconds=retry_wait_seconds,
+ should_retry=should_retry,
+ )
+ # throttling around retries
+ if requests_per_minute:
+ client = make_throttled_client(
+ client=client, requests_per_minute=requests_per_minute
+ )
+ # caching on top layer, so cache will be checked first
+ if cache_storage:
+ client = make_dev_caching_client(
+ client=client,
+ cache_storage=cache_storage,
+ cache_keyfunc=cache_keyfunc,
+ should_cache=should_cache,
+ write_only=cache_write_only,
+ )
+
+ return client
+
+
+ __all__ = [
+ "make_retry_client",
+ "make_throttled_client",
+ "make_dev_caching_client",
+ "MemoryCache",
+ "FileCache",
+ "SqliteCache",
+ ]
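
The docstring above also describes `cache_write_only`; a small sketch of one plausible use, recording responses without ever serving from the cache (parameter names as defined above):

```python
from careful.httpx import make_careful_client, FileCache

# populate the on-disk cache during a first full run;
# flip cache_write_only back to False later to replay from it
client = make_careful_client(
    cache_storage=FileCache("_httpcache/"),
    cache_write_only=True,
)
client.get("https://example.com")
```
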
@@ -0,0 +1,6 @@
+ from httpx import Response
+ from typing import Callable
+
+ ResponsePredicate = Callable[[Response], bool]
+
+ CacheKeyfunc = Callable[[str, str, dict], str]
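
A short sketch of what these two aliases describe in practice (the function names here are illustrative, not part of the package):

```python
from httpx import Response

# a ResponsePredicate: decide from the response alone (used for should_retry / should_cache)
def is_server_error(response: Response) -> bool:
    return response.status_code >= 500

# a CacheKeyfunc: build a cache key from method, url, and params
def key_with_params(method: str, url: str, params: dict) -> str:
    return f"{method}:{url}:{sorted(params.items())}"
```
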
@@ -1,3 +1,4 @@
+ import abc
  import types
  import functools
  import logging
@@ -40,13 +41,6 @@ def _cache_200s(response: Response) -> bool:


  def _cached_request(client: Client, *args, **kwargs):
- # short circuit if cache isn't configured
- if not client._cache_storage:
- log.debug("bypassing cache, no storage configured")
- resp = client._wrapped_request(*args, **kwargs)
- resp.fromcache = False
- return resp
-
  method, url = args
  request_key = client._cache_keyfunc(method, url, kwargs["params"])

@@ -61,7 +55,7 @@ def _cached_request(client: Client, *args, **kwargs):
  cached_resp.fromcache = True
  resp = cached_resp
  else:
- resp = client._wrapped_request(*args, **kwargs)
+ resp = client._no_cache_request(*args, **kwargs)
  # save to cache if request and response meet criteria
  log.debug("XX %s %s", request_key, client._should_cache(resp))
  if request_key and client._should_cache(resp):
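
The hunk above gates cache writes with `client._should_cache(resp)`; a minimal sketch of supplying a custom predicate instead of the default 200s-only rule (the predicate name is illustrative):

```python
from httpx import Response
from careful.httpx import make_careful_client, MemoryCache

# cache anything below 500 rather than only 200s
def cache_non_5xx(response: Response) -> bool:
    return response.status_code < 500

client = make_careful_client(
    cache_storage=MemoryCache(),
    should_cache=cache_non_5xx,
)
```
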
@@ -80,6 +74,27 @@ def make_dev_caching_client(
  should_cache=_cache_200s,
  write_only=False,
  ):
+ """
+ Returns an enhanced `httpx.Client` where requests are saved to a
+ specified cache.
+
+ This is denoted as a "dev_cache" because it is not intended to be a true
+ HTTP cache, respecting cache headers/etc. If you are looking for that
+ behavior, there are httpx libraries for that explicit purpose.
+
+ Instead, the purpose of this cache is to make it possible to test scrapers
+ locally without making hundreds of redundant requests.
+
+ The strategy is configurable via `cache_keyfunc` and `should_cache`.
+
+ The default strategy is simple:
+ cache all GET requests that result in 200s, with no expiry.
+
+ This works well for the case where you have hundreds of pages to scrape
+ and want to make scraper adjustments without repeatedly re-fetching them.
+
+ It should *NOT* be used in production without adjusting these rules.
+ """
  if client is None:
  client = Client()

@@ -88,23 +103,34 @@
  client._should_cache = should_cache
  client._write_only = write_only

- client._wrapped_request = client.request
+ client._no_cache_request = client.request
  client.request = types.MethodType(
  functools.wraps(client.request)(_cached_request), client
  )
  return client


- class CacheStorageBase:
+ class CacheStorageBase(abc.ABC):
+ @abc.abstractmethod
  def get(self, key: str) -> None | Response:
  raise NotImplementedError()

+ @abc.abstractmethod
  def set(self, key: str, response: Response) -> None:
  raise NotImplementedError()


  class MemoryCache(CacheStorageBase):
- """In memory cache for request responses."""
+ """
+ In memory cache for request responses.
+
+ Example:
+
+ make_careful_client(
+ cache_storage=MemoryCache(),
+ )
+
+ """

  def __init__(self) -> None:
  self.cache: dict[str, Response] = {}
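
`CacheStorageBase` above defines the two methods a storage backend must implement; a minimal sketch of a custom backend, assuming `CacheStorageBase` is importable from `careful.httpx` as the `__init__` imports earlier in this diff suggest (the class itself is illustrative):

```python
from httpx import Response
from careful.httpx import CacheStorageBase

class PrefixedMemoryCache(CacheStorageBase):
    """Toy backend: an in-process dict with namespaced keys."""

    def __init__(self, prefix: str = "dev") -> None:
        self._store: dict[str, Response] = {}
        self._prefix = prefix

    def get(self, key: str) -> None | Response:
        return self._store.get(f"{self._prefix}:{key}")

    def set(self, key: str, response: Response) -> None:
        self._store[f"{self._prefix}:{key}"] = response
```
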
@@ -122,11 +148,21 @@ class FileCache(CacheStorageBase):
  """
  File-based cache for request responses.

- :param cache_dir: directory for storing responses
- :param check_last_modified: set to True to compare last-modified
- timestamp in cached response with value from HEAD request
+ Parameters:
+ cache_dir: directory for storing responses
+
+ Example:
+
+ make_careful_client(
+ cache_storage=FileCache("_httpcache/"),
+ )
+
  """

+ # TODO: restore?
+ # check_last_modified: set to True to compare last-modified
+ # timestamp in cached response with value from HEAD request
+
  # file name escaping inspired by httplib2
  _prefix = re.compile(r"^\w+://")
  _illegal = re.compile(r"[?/:|]+")
@@ -188,7 +224,7 @@
  # status & encoding will be in headers, but are faked
  # need to split spaces out of status to get code (e.g. '200 OK')
  resp = Response(
- status_code = int(resp_headers.pop("status").split(" ")[0]),
+ status_code=int(resp_headers.pop("status").split(" ")[0]),
  content=resp_content,
  default_encoding=resp_headers.pop("encoding"),
  headers=resp_headers,
@@ -224,13 +260,18 @@
  os.remove(fname)


- class SQLiteCache(CacheStorageBase):
- """SQLite cache for request responses.
+ class SqliteCache(CacheStorageBase):
+ """
+ SQLite cache for request responses.
+
+ Parameters:
+ cache_path: path for SQLite database file

- :param cache_path: path for SQLite database file
- :param check_last_modified: set to True to compare last-modified
- timestamp in cached response with value from HEAD request
+ Example:

+ make_careful_client(
+ cache_storage=SqliteCache("_cache.db"),
+ )
  """

  _columns = ["key", "status", "modified", "encoding", "data", "headers"]
@@ -284,7 +325,12 @@
  # if rec["modified"] != new_lm:
  # return None

- resp = Response(rec["status"], content=rec["data"], default_encoding=rec["encoding"], headers=json.loads(rec["headers"]))
+ resp = Response(
+ rec["status"],
+ content=rec["data"],
+ default_encoding=rec["encoding"],
+ headers=json.loads(rec["headers"]),
+ )
  return resp

  def clear(self) -> None:
@@ -2,13 +2,22 @@ import time
  import types
  import functools
  import logging
- from httpx import Client, Response
+ from httpx import Client, Response, HTTPError

  log = logging.getLogger("httpx")


- def _default_accept_response(response: Response) -> bool:
- return response.status_code < 400
+ def retry_default_rule(response: Response) -> bool:
+ # default behavior is to retry 400s and 500s but not 404s
+ return response.status_code >= 400 and response.status_code != 404
+
+
+ def retry_only_500s(response: Response) -> bool:
+ return response.status_code >= 500
+
+
+ def retry_all_400s_500s(response: Response) -> bool:
+ return response.status_code >= 400


  def _retry_request(client: Client, *args, **kwargs):
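
The three predicates above are interchangeable via `should_retry`; a small sketch of picking the strictest one (module path as used in the tests later in this diff):

```python
from careful.httpx import make_careful_client
from careful.httpx.retries import retry_only_500s

# retry server errors only; 4xx responses are returned immediately
client = make_careful_client(
    retry_attempts=3,
    retry_wait_seconds=2,
    should_retry=retry_only_500s,
)
```
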
@@ -20,24 +29,21 @@ def _retry_request(client: Client, *args, **kwargs):
  exception_raised = None

  try:
- resp = client._wrapped_request(*args, **kwargs)
+ tries += 1
+ resp = client._no_retry_request(*args, **kwargs)

  # break from loop on an accepted response
- if client._accept_response(resp) or (
- resp.status_code == 404 and not client._retry_on_404
- ):
+ if not client._should_retry(resp):
  break

- except Exception as e:
- # TODO: exclude certain kinds of exceptions (SSL?) from retry
+ except HTTPError as e:
  exception_raised = e

  if exception_response := getattr(e, "response", None):
- if client._accept_response(exception_response):
+ if not client._should_retry(exception_response):
  break

  # if we're going to retry, sleep first
- tries += 1
  if tries <= client._retry_attempts:
  # twice as long each time
  wait = client._retry_wait_seconds * (2 ** (tries - 1))
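
The wait doubles on each attempt; a quick worked sketch of the schedule that formula produces:

```python
# wait = retry_wait_seconds * 2 ** (tries - 1)
retry_wait_seconds = 10
for tries in (1, 2, 3):
    print(tries, retry_wait_seconds * (2 ** (tries - 1)))
# -> 10, 20, 40 seconds before retries 1, 2, and 3
```
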
@@ -68,20 +74,17 @@ def make_retry_client(
  client: Client | None = None,
  attempts: int = 1,
  wait_seconds: float = 10,
- retry_on_404: bool = False,
- accept_response=_default_accept_response,
+ should_retry=retry_default_rule,
  ):
  if client is None:
  client = Client()
  client._retry_attempts = max(0, attempts)
  client._retry_wait_seconds = wait_seconds
- client._retry_on_404 = retry_on_404
- client._accept_response = accept_response
+ client._should_retry = should_retry

- client._wrapped_request = client.request
+ client._no_retry_request = client.request
  client.request = types.MethodType(
  functools.wraps(client.request)(_retry_request), client
  )

  return client
-
@@ -16,7 +16,7 @@ def _throttle_request(client: Client, *args, **kwargs):
  client._last_request = time.time()
  else:
  client._last_request = now
- return client._wrapped_request(*args, **kwargs)
+ return client._no_throttle_request(*args, **kwargs)


  def make_throttled_client(
@@ -34,7 +34,7 @@ def make_throttled_client(
  client._requests_per_minute = requests_per_minute
  client._request_frequency = 60.0 / requests_per_minute

- client._wrapped_request = client.request
+ client._no_throttle_request = client.request
  client.request = types.MethodType(
  functools.wraps(client.request)(_throttle_request), client
  )
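
`_request_frequency` above is the target spacing between requests; the arithmetic in a quick sketch:

```python
# 60 / requests_per_minute -> seconds between the start of consecutive requests
for rpm in (30, 60, 120):
    print(rpm, "rpm ->", 60.0 / rpm, "seconds between requests")
# 30 rpm -> 2.0, 60 rpm -> 1.0, 120 rpm -> 0.5
```
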
@@ -1,6 +1,6 @@
  from pytest_httpbin.serve import Server # type: ignore
  from httpx import Response
- from careful.httpx import make_dev_caching_client, MemoryCache, FileCache, SQLiteCache
+ from careful.httpx import make_dev_caching_client, MemoryCache, FileCache, SqliteCache


  def test_dev_caching(httpbin: Server) -> None:
@@ -59,7 +59,7 @@ def test_file_cache() -> None:


  def test_sqlite_cache() -> None:
- sc = SQLiteCache("cache.db")
+ sc = SqliteCache("cache.db")
  sc.clear()
  _test_cache_storage(sc)
  sc.clear()
@@ -0,0 +1,47 @@
+ from careful.httpx import make_careful_client, MemoryCache
+ from unittest import mock
+ from fakeresponse import FakeResponse
+
+
+ def test_full_careful_client():
+ client = make_careful_client(
+ retry_attempts=3,
+ retry_wait_seconds=0.00001,
+ cache_storage=MemoryCache(),
+ requests_per_minute=60,
+ )
+
+ # On the first call return a 500, then a 200, then a 404
+ mock_send = mock.Mock(
+ side_effect=[
+ FakeResponse("http://dummy/", 500, "failure!"),
+ FakeResponse("http://dummy/", 200, "success!"),
+ FakeResponse("http://dummy/2", 404, "success!"),
+ ]
+ )
+
+ mock_sleep = mock.Mock()
+
+ # check that sleep is called
+ with mock.patch("time.sleep", mock_sleep):
+ with mock.patch.object(client, "send", mock_send):
+ resp = client.get("http://dummy/")
+
+ # demonstrates a retry
+ assert mock_send.call_count == 2
+ assert resp.status_code == 200
+ # sleep called by retry, not by throttle yet
+ assert mock_sleep.call_count == 1
+
+ # demonstrates a cache (no new call)
+ resp = client.get("http://dummy/")
+ assert mock_send.call_count == 2
+ assert resp.status_code == 200
+ assert mock_sleep.call_count == 1
+
+ # a new, throttled call (no retry)
+ resp = client.get("http://dummy/2")
+ assert mock_send.call_count == 3
+ assert resp.status_code == 404
+ # call was throttled
+ assert mock_sleep.call_count == 2
@@ -1,4 +1,5 @@
  from careful.httpx import make_retry_client
+ from careful.httpx.retries import retry_all_400s_500s
  from unittest import mock
  from fakeresponse import FakeResponse

@@ -14,7 +15,7 @@ def test_retry() -> None:
  ]
  )

- with mock.patch.object(client, "_wrapped_request", mock_request):
+ with mock.patch.object(client, "send", mock_request):
  resp = client.get("http://dummy/")
  assert mock_request.call_count == 2

@@ -23,14 +24,14 @@
  return_value=FakeResponse("http://dummy/", 500, "failure!")
  )

- with mock.patch.object(client, "_wrapped_request", mock_request):
+ with mock.patch.object(client, "send", mock_request):
  resp = client.get("http://dummy/")
  assert resp.status_code == 500
  assert mock_request.call_count == 4 # try four times


  def test_retry_404() -> None:
- client = make_retry_client(attempts=3, wait_seconds=0.001, retry_on_404=True)
+ client = make_retry_client(attempts=3, wait_seconds=0.001, should_retry=retry_all_400s_500s)
  # On the first call return a 404, then a 200
  mock_request = mock.Mock(
  side_effect=[
@@ -40,7 +41,7 @@ def test_retry_404() -> None:
  ]
  )

- with mock.patch.object(client, "_wrapped_request", mock_request):
+ with mock.patch.object(client, "send", mock_request):
  resp = client.get("http://dummy/") # type: ignore
  assert mock_request.call_count == 2
  assert resp.status_code == 200
@@ -51,7 +52,7 @@ def test_retry_404() -> None:
  )

  # four tries
- with mock.patch.object(client, "_wrapped_request", mock_request):
+ with mock.patch.object(client, "send", mock_request):
  resp = client.get("http://dummy/")
  assert resp.status_code == 404
  assert mock_request.call_count == 4
@@ -59,7 +60,7 @@ def test_retry_404() -> None:


  def test_no_retry_404() -> None:
- client = make_retry_client(attempts=3, wait_seconds=0.001, retry_on_404=False)
+ client = make_retry_client(attempts=3, wait_seconds=0.001)

  # On the first call return a 404, then a 200
  mock_request = mock.Mock(
@@ -69,7 +70,7 @@ def test_no_retry_404() -> None:
  ]
  )

- with mock.patch.object(client, "_wrapped_request", mock_request):
+ with mock.patch.object(client, "send", mock_request):
  resp = client.get("http://dummy/") # type: ignore
  assert mock_request.call_count == 1
  assert resp.status_code == 404
@@ -1,11 +1,12 @@
  from careful.httpx import make_throttled_client
  from unittest import mock
  from typing import Any
+ from httpx import Request
  from fakeresponse import FakeResponse


- def request_200(method: str, url: str, *args: Any, **kwargs: Any) -> FakeResponse:
- return FakeResponse(url, 200, b"ok")
+ def request_200(request: Request, *args: Any, **kwargs: Any) -> FakeResponse:
+ return FakeResponse(request.url, 200, b"ok")


  mock_200 = mock.Mock(wraps=request_200)
@@ -18,7 +19,7 @@ def test_request_throttling() -> None:

  # check that sleep is called on call 2 & 3
  with mock.patch("time.sleep", mock_sleep):
- with mock.patch.object(client, "_wrapped_request", mock_200):
+ with mock.patch.object(client, "send", mock_200):
  client.get("http://dummy/")
  client.get("http://dummy/")
  client.get("http://dummy/")
careful-0.1.0/PKG-INFO DELETED
@@ -1,48 +0,0 @@
- Metadata-Version: 2.4
- Name: careful
- Version: 0.1.0
- Summary: Add your description here
- Author-email: jpt <dev@jpt.sh>
- License-File: LICENSE
- Requires-Python: >=3.13
- Requires-Dist: httpx>=0.28.1
- Requires-Dist: pytest-httpbin>=2.1.0
- Requires-Dist: pytest>=8.4.2
- Description-Content-Type: text/markdown
-
- **careful_httpx** is a library for making requests to less-than-reliable websites.
-
- It is based on [scrapelib](https://pypi.org/scrapelib/), which has powered Open States & many other Python scrapers for over 15 years.
-
- Code: <https://codeberg.org/jpt/careful_httpx>
-
- Documentation: TODO
-
- ## Features
-
- Enhances [`httpx.Client`](https://www.python-httpx.org) with features useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
-
- - retries
- - throttling
- - dev-cache for iterating on scrapers
-
- ### example
-
- TODO
-
- ### features this has that scrapelib doesn't
-
- - httpx support
- - composable interface, can augment Client with just the enhancements you want
-
- TODO: don't allow instantiating bad patch classes, and check for incompatible configs
-
- ### features scrapelib had that this doesn't
-
- Open to considering if there is interest, but didn't seem necessary.
-
- - HTTP(S) and FTP requests via an identical API
- - allow setting custom ciphers
- - have urlretrieve
- - support FTP
- - set custom user-agent/mess w/ headers
careful-0.1.0/README.md DELETED
@@ -1,36 +0,0 @@
- **careful_httpx** is a library for making requests to less-than-reliable websites.
-
- It is based on [scrapelib](https://pypi.org/scrapelib/), which has powered Open States & many other Python scrapers for over 15 years.
-
- Code: <https://codeberg.org/jpt/careful_httpx>
-
- Documentation: TODO
-
- ## Features
-
- Enhances [`httpx.Client`](https://www.python-httpx.org) with features useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
-
- - retries
- - throttling
- - dev-cache for iterating on scrapers
-
- ### example
-
- TODO
-
- ### features this has that scrapelib doesn't
-
- - httpx support
- - composable interface, can augment Client with just the enhancements you want
-
- TODO: don't allow instantiating bad patch classes, and check for incompatible configs
-
- ### features scrapelib had that this doesn't
-
- Open to considering if there is interest, but didn't seem necessary.
-
- - HTTP(S) and FTP requests via an identical API
- - allow setting custom ciphers
- - have urlretrieve
- - support FTP
- - set custom user-agent/mess w/ headers
@@ -1,19 +0,0 @@
- [project]
- name = "careful"
- version = "0.1.0"
- description = "Add your description here"
- readme = "README.md"
- authors = [
- { name = "jpt", email = "dev@jpt.sh" }
- ]
- requires-python = ">=3.13"
- dependencies = [
- "httpx>=0.28.1",
- "pytest>=8.4.2",
- "pytest-httpbin>=2.1.0",
- ]
-
-
- [build-system]
- requires = ["hatchling"]
- build-backend = "hatchling.build"
@@ -1,59 +0,0 @@
- from .retries import make_retry_client, _default_accept_response
- from .throttle import make_throttled_client
- from .dev_cache import (
- make_dev_caching_client,
- MemoryCache,
- FileCache,
- SQLiteCache,
- _cache_200s,
- _default_keyfunc,
- )
- from httpx import Client
-
-
- def make_careful_client(
- client: Client,
- *,
- retry_attempts: int = 0,
- retry_wait_seconds: float = 10,
- retry_on_404: bool = False,
- accept_response=_default_accept_response,
- requests_per_minute: int = 0,
- cache_storage=None,
- cache_write_only=False,
- should_cache=_cache_200s,
- cache_keyfunc=_default_keyfunc,
- ):
- # order matters, retry on inside b/c it is last-chance scenario
- if retry_attempts:
- client = make_retry_client(
- client=client,
- attempts=retry_attempts,
- wait_seconds=retry_wait_seconds,
- retry_on_404=retry_on_404,
- accept_response=accept_response,
- )
- # throttling around retries
- if requests_per_minute:
- client = make_throttled_client(client, requests_per_minute=requests_per_minute)
- # caching on top layer, so cache will be checked first
- if cache_storage:
- client = make_dev_caching_client(
- client=client,
- cache_storage=cache_storage,
- cache_keyfunc=cache_keyfunc,
- should_cache=should_cache,
- write_only=cache_write_only,
- )
-
- return client
-
-
- __all__ = [
- "make_retry_client",
- "make_throttled_client",
- "make_dev_caching_client",
- "MemoryCache",
- "FileCache",
- "SQLiteCache",
- ]
@@ -1,4 +0,0 @@
- from careful.httpx import make_careful_client
-
- def test_full_careful_client():
- client = make_careful_client()
File without changes
File without changes
File without changes
File without changes