careful 0.1.0.tar.gz → 0.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,8 @@ wheels/
  *.egg-info
  uv.lock

+ site/
+
  # Virtual environments
  .venv

careful-0.2.0/Justfile ADDED
@@ -0,0 +1,16 @@
+ test:
+ uv run pytest
+
+ lint:
+ uv run ruff check
+
+ preview:
+ uv run mkdocs serve
+
+ publish:
+ uv build
+ uv publish
+
+ deploy:
+ uv run mkdocs build
+ netlify deploy --prod -s careful-docs -d site
careful-0.2.0/PKG-INFO ADDED
@@ -0,0 +1,71 @@
+ Metadata-Version: 2.4
+ Name: careful
+ Version: 0.2.0
+ Summary: careful extensions to httpx: throttle, retry, cache
+ Project-URL: Repository, https://codeberg.org/jpt/careful
+ Author-email: jpt <dev@jpt.sh>
+ License: BSD-2-Clause
+ License-File: LICENSE
+ Classifier: Development Status :: 6 - Mature
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: BSD License
+ Classifier: Natural Language :: English
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Requires-Python: >=3.10
+ Requires-Dist: httpx>=0.28.1
+ Requires-Dist: mkdocs-material>=9.6.18
+ Requires-Dist: mkdocstrings-python>=1.18.2
+ Requires-Dist: mkdocstrings>=0.30.0
+ Requires-Dist: pytest-httpbin>=2.1.0
+ Requires-Dist: pytest>=8.4.2
+ Description-Content-Type: text/markdown
+
+ # careful
+
+ <img src="/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
+
+ **careful** is a library for making requests to unreliable websites with httpx.
+
+ **Code**: <https://codeberg.org/jpt/careful>
+
+ **Docs**: <https://careful.jpt.sh>
+
+ It offers enhancements to
+ [`httpx.Client`](https://www.python-httpx.org)
+ useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+
+ - **configurable retry support.** retry on timeouts or other errors, with exponential back-off.
+ - **simple request throttling.** set a maximum number of requests per minute.
+ - **development cache.** configurable caching aimed at reducing redundant requests made while authoring/testing web scrapers.
+
+ ### example
+
+ ```python
+ from httpx import Client
+ from careful.httpx import make_careful_client, MemoryCache
+
+ client = make_careful_client(
+ # can configure httpx.Client however you usually would
+ client=Client(headers={'user-agent': 'careful/1.0'}),
+ # retries are configurable w/ exponential back off
+ retry_attempts=2,
+ retry_wait_seconds=5,
+ # can cache to process memory, filesystem, or SQLite
+ cache_storage=MemoryCache(),
+ # requests will automatically be throttled to aim at this rate
+ requests_per_minute=60,
+ )
+
+ # all normal methods on httpx.Client make use of configured enhancements
+ client.get("https://example.com")
+ ```
+
+
+ ---
+
+ Logo licensed from [Adrien Coquet via Noun Project](https://thenounproject.com/icon/carefully-3681327/)
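
The example above uses an in-memory cache; the same factory accepts the file- and SQLite-backed storages mentioned in the feature list. A minimal sketch, assuming the `FileCache` and `SqliteCache` classes exported by `careful.httpx` later in this diff:

```python
from careful.httpx import make_careful_client, FileCache, SqliteCache

# persist responses across runs as individual files on disk...
client = make_careful_client(cache_storage=FileCache("_httpcache/"))

# ...or in a single SQLite database file
client = make_careful_client(cache_storage=SqliteCache("_cache.db"))

client.get("https://example.com")
```
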
@@ -0,0 +1,44 @@
+ # careful
+
+ <img src="/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
+
+ **careful** is a library for making requests to unreliable websites with httpx.
+
+ **Code**: <https://codeberg.org/jpt/careful>
+
+ **Docs**: <https://careful.jpt.sh>
+
+ It offers enhancements to
+ [`httpx.Client`](https://www.python-httpx.org)
+ useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+
+ - **configurable retry support.** retry on timeouts or other errors, with exponential back-off.
+ - **simple request throttling.** set a maximum number of requests per minute.
+ - **development cache.** configurable caching aimed at reducing redundant requests made while authoring/testing web scrapers.
+
+ ### example
+
+ ```python
+ from httpx import Client
+ from careful.httpx import make_careful_client, MemoryCache
+
+ client = make_careful_client(
+ # can configure httpx.Client however you usually would
+ client=Client(headers={'user-agent': 'careful/1.0'}),
+ # retries are configurable w/ exponential back off
+ retry_attempts=2,
+ retry_wait_seconds=5,
+ # can cache to process memory, filesystem, or SQLite
+ cache_storage=MemoryCache(),
+ # requests will automatically be throttled to aim at this rate
+ requests_per_minute=60,
+ )
+
+ # all normal methods on httpx.Client make use of configured enhancements
+ client.get("https://example.com")
+ ```
+
+
+ ---
+
+ Logo licensed from [Adrien Coquet via Noun Project](https://thenounproject.com/icon/carefully-3681327/)
@@ -0,0 +1,9 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <svg width="1200pt" height="1200pt" version="1.1" viewBox="0 0 1200 1200" xmlns="http://www.w3.org/2000/svg">
+ <path d="m600 195.6c19.199 0 34.801-15.602 34.801-34.801v-96c0-19.199-15.602-34.801-34.801-34.801s-34.801 15.602-34.801 34.801v97.199c0 18 15.602 33.602 34.801 33.602z" fill="#ff814a"/>
+ <path d="m734.4 225.6c4.8008 2.3984 9.6016 3.6016 14.398 3.6016 13.199 0 25.199-7.1992 31.199-20.398l40.801-87.602c8.3984-16.801 0-37.199-16.801-45.602-16.801-8.3984-37.199 0-45.602 16.801l-39.594 87.598c-8.4023 16.801-1.1992 37.199 15.598 45.602z" fill="#ff814a"/>
+ <path d="m420 208.8c6 12 18 20.398 31.199 20.398 4.8008 0 9.6016-1.1992 14.398-3.6016 16.801-8.3984 25.199-28.801 16.801-45.602l-40.801-87.602c-8.3984-16.801-28.801-25.199-45.602-16.801-16.801 8.3984-25.199 28.801-16.801 45.602z" fill="#ff814a"/>
+ <path d="m632.4 746.4c0 17.895-14.504 32.402-32.398 32.402s-32.398-14.508-32.398-32.402c0-17.891 14.504-32.398 32.398-32.398s32.398 14.508 32.398 32.398z" fill="#ff814a"/>
+ <path d="m598.8 691.2s1.1992 0 0 0c13.199 0 22.801-9.6016 24-21.602l6-133.2v-2.3984c-1.1992-15.602-14.398-27.602-30-27.602-15.602 1.1992-27.602 14.398-27.602 30l6 133.2c1.1992 12.004 9.6016 21.602 21.602 21.602z" fill="#ff814a"/>
+ <path d="m871.2 333.6c-4.8008-24-25.199-42-50.398-42h-441.6c-24 0-45.602 18-50.398 42l-150 806.4c-2.3984 15.602 9.6016 30 25.199 30h76.801c12 0 22.801-8.3984 25.199-20.398l32.398-171.6h526.8l32.398 171.6c2.3984 12 13.199 20.398 25.199 20.398h73.203c15.602 0 27.602-14.398 25.199-30zm-87.598 494.4h-367.2c-33.602 0-54-36-37.199-64.801l183.6-315.6c16.801-28.801 57.602-28.801 74.398 0l183.6 315.6c15.598 28.801-4.8008 64.801-37.199 64.801z" fill="#ff814a"/>
+ </svg>
@@ -0,0 +1,29 @@
+ # Changelog
+
+ ## 0.2.0 - 6 September 2025
+
+ - Initial release, mostly a port of `scrapelib` functionality.
+
+ ## scrapelib
+
+ The original version of this library is a port of `scrapelib` (2.4.1).
+
+ Changes from this version were to:
+
+ - use `httpx` instead of `requests`
+ - drop quite a few unnecessary features that were mainly in `scrapelib` for backwards-compatibility reasons.
+ - use a composable interface instead of the inheritance-based one from `scrapelib`, aiming at making future enhancements/porting easier.
+
+ This library is a partial rewrite of [scrapelib](https://pypi.org/project/scrapelib/).
+ Thanks to all of [scrapelib's original contributors](https://github.com/jamesturk/scrapelib/graphs/contributors) and users.
+
+ `scrapelib` originally wrapped `urllib2`, eventually migrating to `requests`.
+
+ There are a few things that scrapelib did that this doesn't:
+
+ - support FTP requests via HTTP-like API
+ - extend the client with a `urlretrieve` function
+ - provide helpers for working with headers, timeouts, and custom ciphers
+
+ The first two are possible but didn't seem necessary at the moment.
+ The third was very `requests`-specific, and so hasn't been replicated here.
@@ -0,0 +1,45 @@
+ # careful
+
+ <img src="/carefully-3681327.svg" width=100 height=100 alt="logo of a warning sign">
+
+ **careful** is a library for making requests to unreliable websites with httpx.
+
+ **Code**: <https://codeberg.org/jpt/careful>
+
+ **Docs**: <https://careful.jpt.sh>
+
+ It offers enhancements to
+ [`httpx.Client`](https://www.python-httpx.org)
+ useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+
+ - **configurable retry support.** retry on timeouts or other errors, with exponential back-off.
+ - **simple request throttling.** set a maximum number of requests per minute.
+ - **development cache.** configurable caching aimed at reducing redundant requests made while authoring/testing web scrapers.
+
+ ### example
+
+ ```python
+ from httpx import Client
+ from careful.httpx import make_careful_client, MemoryCache
+
+ client = make_careful_client(
+ # can configure httpx.Client however you usually would
+ client=Client(headers={'user-agent': 'careful/1.0'}),
+ # retries are configurable w/ exponential back off
+ retry_attempts=2,
+ retry_wait_seconds=5,
+ # can cache to process memory, filesystem, or SQLite
+ cache_storage=MemoryCache(),
+ # requests will automatically be throttled to aim at this rate
+ requests_per_minute=60,
+ )
+
+ # all normal methods on httpx.Client make use of configured enhancements
+ client.get("https://example.com")
+ ```
+
+
+ ---
+
+ Logo licensed from [Adrien Coquet via Noun Project](https://thenounproject.com/icon/carefully-3681327/)
+
@@ -0,0 +1,34 @@
+ # Usage
+
+ Most users will only need to call `careful.httpx.make_careful_client`.
+
+ ::: careful.httpx.make_careful_client
+ options:
+ annotations_path: brief
+ show_signature: false
+ show_root_heading: true
+
+
+ ## cache storage
+
+
+ ::: careful.httpx.MemoryCache
+ options:
+ heading_level: 3
+ members: False
+ show_root_heading: true
+
+ ::: careful.httpx.FileCache
+ options:
+ heading_level: 3
+ members: False
+ show_root_heading: true
+
+ ::: careful.httpx.SqliteCache
+ options:
+ heading_level: 3
+ members: False
+ show_root_heading: true
+
+
+ ## Individual Wrappers
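
The "Individual Wrappers" heading above refers to the factories that `make_careful_client` composes; a minimal sketch of applying only the ones you want, assuming the `make_retry_client` and `make_throttled_client` signatures shown later in this diff:

```python
from httpx import Client
from careful.httpx import make_retry_client, make_throttled_client

# retry up to 2 times, waiting 5s then 10s, with no caching at all
client = make_retry_client(client=Client(), attempts=2, wait_seconds=5)

# then cap the combined client at roughly 30 requests per minute
client = make_throttled_client(client=client, requests_per_minute=30)

client.get("https://example.com")
```
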
@@ -0,0 +1,54 @@
+ site_name: careful
+ site_url: https://careful.jpt.sh/
+ site_author: James Turk
+ site_description: A library for making requests to unreliable sites with httpx.
+ copyright: Copyright &copy; 2025 James Turk
+ repo_url: https://codeberg.org/jpt/careful
+ repo_name: careful
+ #edit_uri: edit/main/docs/
+
+ theme:
+ name: material
+ logo: carefully-3681327.svg
+ palette:
+ - scheme: default
+ primary: teal
+ accent: teal
+ toggle:
+ icon: material/toggle-switch-off-outline
+ name: Switch to dark mode
+ - scheme: slate
+ primary: teal
+ accent: teal
+ toggle:
+ icon: material/toggle-switch
+ name: Switch to light mode
+
+ features:
+ #- navigation.tabs
+ - navigation.sections
+ - navigation.top
+ - content.tabs.link
+ # icon:
+ # repo:
+ markdown_extensions:
+ - admonition
+ - def_list
+ - pymdownx.highlight:
+ anchor_linenums: true
+ line_spans: __span
+ pygments_lang_class: true
+ - pymdownx.inlinehilite
+ - pymdownx.snippets
+ - pymdownx.superfences
+ - toc:
+ permalink: true
+ plugins:
+ - search
+ - mkdocstrings:
+ watch:
+ - src
+ nav:
+ - 'index.md'
+ - 'reference.md'
+ - 'changelog.md'
@@ -0,0 +1,37 @@
+ [project]
+ name = "careful"
+ version = "0.2.0"
+ description = "careful extensions to httpx: throttle, retry, cache"
+ readme = "README.md"
+ authors = [
+ { name = "jpt", email = "dev@jpt.sh" }
+ ]
+ license = {text = "BSD-2-Clause"}
+ requires-python = ">=3.10"
+ classifiers = [
+ "Development Status :: 6 - Mature",
+ "Intended Audience :: Developers",
+ "License :: OSI Approved :: BSD License",
+ "Natural Language :: English",
+ "Operating System :: OS Independent",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13",
+ "Topic :: Software Development :: Libraries :: Python Modules",
+ ]
+ dependencies = [
+ "httpx>=0.28.1",
+ "mkdocs-material>=9.6.18",
+ "mkdocstrings>=0.30.0",
+ "mkdocstrings-python>=1.18.2",
+ "pytest>=8.4.2",
+ "pytest-httpbin>=2.1.0",
+ ]
+ [project.urls]
+ Repository = "https://codeberg.org/jpt/careful"
+
+
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
@@ -0,0 +1,127 @@
+ from .retries import make_retry_client, retry_default_rule
+ from .throttle import make_throttled_client
+ from .dev_cache import (
+ make_dev_caching_client,
+ MemoryCache,
+ FileCache,
+ SqliteCache,
+ CacheStorageBase,
+ _cache_200s,
+ _default_keyfunc,
+ )
+ from ._types import ResponsePredicate, CacheKeyfunc
+ from httpx import Client
+
+
+ def make_careful_client(
+ *,
+ client: Client | None = None,
+ retry_attempts: int = 0,
+ retry_wait_seconds: float = 10,
+ should_retry: ResponsePredicate = retry_default_rule,
+ requests_per_minute: int = 0,
+ cache_storage: CacheStorageBase | None = None,
+ cache_write_only: bool = False,
+ should_cache: ResponsePredicate = _cache_200s,
+ cache_keyfunc: CacheKeyfunc = _default_keyfunc,
+ ):
+ """
+ This function patches an `httpx.Client` so that all requests made with the client support
+ [retries](#retries), [throttling](#throttling), and [development caching](#development-caching).
+
+
+ Parameters:
+ client: A pre-configured `httpx.Client`. If omitted, a default client will be created.
+
+ retry_attempts: Maximum number of retries. If non-zero, will retry up to this many times
+ with increasing wait times, starting with `retry_wait_seconds`.
+
+ retry_wait_seconds: Number of seconds to sleep between first attempt and first retry.
+ Subsequent attempts will increase exponentially (2x, 4x, 8x, etc.)
+
+ should_retry: Predicate function that takes an `httpx.Response` and returns `True` if it should be retried.
+
+ requests_per_minute: Maximum number of requests per minute. (e.g. 30 will throttle to ~2s between requests)
+
+ cache_storage: An object that implements the [cache storage interface](#cache-storage).
+
+ cache_write_only: Update the cache, but never read from it.
+
+ should_cache: Predicate function that takes an `httpx.Response` and returns `True` if it should be cached.
+
+ cache_keyfunc: Function that takes request details and returns a unique cache key.
+
+ ## Retries
+
+ If `retry_attempts` is set, responses will be passed to `should_retry`.
+ Responses that are rejected (return `True`) will be retried after a wait based on
+ `retry_wait_seconds`.
+ Each retry will wait twice as long as the one before.
+
+ ## Throttling
+
+ If `requests_per_minute` is set, standard (non-retry) requests will automatically
+ sleep for a short period to target the given rate.
+
+ For example, at 30rpm, the sleep time on a fast request will be close to 2 seconds.
+
+ ## Development Caching
+
+ Why **development caching?**
+
+ This feature is named as a reminder that **this is not true HTTP caching**, which
+ should take various headers into account. Look at libraries like [hishel](https://hishel.com) if that's what you are after.
+
+ The purpose of this feature is to allow you to cache all of your HTTP requests during development.
+ Often when writing a scraper or crawler, you wind up hitting the site you are working on more often than you'd like; each time you iterate on your code you're likely making redundant requests to pages that haven't changed.
+
+ By caching all successful requests (configurable with the `should_cache` parameter),
+ you can easily re-run scrapers without making redundant HTTP requests.
+ This means faster development time & happier upstream servers.
+
+ To enable development caching, pass a [`MemoryCache`][careful.httpx.MemoryCache],
+ [`FileCache`][careful.httpx.FileCache], or [`SqliteCache`][careful.httpx.SqliteCache] as
+ the `cache_storage` parameter of `make_careful_client`.
+
+ ---
+
+ When multiple features are applied, the order of wrapping ensures that:
+ - the cache is checked first, and bypasses throttling if hit
+ - retries use their own delays, but are not throttled separately
+ """
+ if client is None:
+ client = Client()
+ # order matters, retry on inside b/c it is last-chance scenario
+ if retry_attempts:
+ client = make_retry_client(
+ client=client,
+ attempts=retry_attempts,
+ wait_seconds=retry_wait_seconds,
+ should_retry=should_retry,
+ )
+ # throttling around retries
+ if requests_per_minute:
+ client = make_throttled_client(
+ client=client, requests_per_minute=requests_per_minute
+ )
+ # caching on top layer, so cache will be checked first
+ if cache_storage:
+ client = make_dev_caching_client(
+ client=client,
+ cache_storage=cache_storage,
+ cache_keyfunc=cache_keyfunc,
+ should_cache=should_cache,
+ write_only=cache_write_only,
+ )
+
+ return client
+
+
+ __all__ = [
+ "make_retry_client",
+ "make_throttled_client",
+ "make_dev_caching_client",
+ "MemoryCache",
+ "FileCache",
+ "SqliteCache",
+ ]
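
The docstring above also describes `cache_write_only`; a small sketch of one plausible use, recording responses without ever serving from the cache (parameter names as defined above):

```python
from careful.httpx import make_careful_client, FileCache

# populate the on-disk cache during a first full run;
# flip cache_write_only back to False later to replay from it
client = make_careful_client(
    cache_storage=FileCache("_httpcache/"),
    cache_write_only=True,
)
client.get("https://example.com")
```
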
@@ -0,0 +1,6 @@
+ from httpx import Response
+ from typing import Callable
+
+ ResponsePredicate = Callable[[Response], bool]
+
+ CacheKeyfunc = Callable[[str, str, dict], str]
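
A short sketch of what these two aliases describe in practice (the function names here are illustrative, not part of the package):

```python
from httpx import Response

# a ResponsePredicate: decide from the response alone (used for should_retry / should_cache)
def is_server_error(response: Response) -> bool:
    return response.status_code >= 500

# a CacheKeyfunc: build a cache key from method, url, and params
def key_with_params(method: str, url: str, params: dict) -> str:
    return f"{method}:{url}:{sorted(params.items())}"
```
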
@@ -1,3 +1,4 @@
+ import abc
  import types
  import functools
  import logging
@@ -40,13 +41,6 @@ def _cache_200s(response: Response) -> bool:


  def _cached_request(client: Client, *args, **kwargs):
- # short circuit if cache isn't configured
- if not client._cache_storage:
- log.debug("bypassing cache, no storage configured")
- resp = client._wrapped_request(*args, **kwargs)
- resp.fromcache = False
- return resp
-
  method, url = args
  request_key = client._cache_keyfunc(method, url, kwargs["params"])

@@ -61,7 +55,7 @@ def _cached_request(client: Client, *args, **kwargs):
  cached_resp.fromcache = True
  resp = cached_resp
  else:
- resp = client._wrapped_request(*args, **kwargs)
+ resp = client._no_cache_request(*args, **kwargs)
  # save to cache if request and response meet criteria
  log.debug("XX %s %s", request_key, client._should_cache(resp))
  if request_key and client._should_cache(resp):
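
The hunk above gates cache writes with `client._should_cache(resp)`; a minimal sketch of supplying a custom predicate instead of the default 200s-only rule (the predicate name is illustrative):

```python
from httpx import Response
from careful.httpx import make_careful_client, MemoryCache

# cache anything below 500 rather than only 200s
def cache_non_5xx(response: Response) -> bool:
    return response.status_code < 500

client = make_careful_client(
    cache_storage=MemoryCache(),
    should_cache=cache_non_5xx,
)
```
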
@@ -80,6 +74,27 @@ def make_dev_caching_client(
  should_cache=_cache_200s,
  write_only=False,
  ):
+ """
+ Returns an enhanced `httpx.Client` where requests are saved to a
+ specified cache.
+
+ This is denoted as a "dev_cache" because it is not intended to be a true
+ HTTP cache, respecting cache headers/etc. If you are looking for that
+ behavior, there are httpx libraries for that explicit purpose.
+
+ Instead, the purpose of this cache is to make it possible to test scrapers
+ locally without making hundreds of redundant requests.
+
+ The strategy is configurable via `cache_keyfunc` and `should_cache`.
+
+ The default strategy is simple:
+ cache all GET requests that result in 200s, with no expiry.
+
+ This works well for the case where you have hundreds of pages to scrape
+ and want to make scraper adjustments without repeatedly re-fetching them.
+
+ It should *NOT* be used in production without adjusting these rules.
+ """
  if client is None:
  client = Client()

@@ -88,23 +103,34 @@
  client._should_cache = should_cache
  client._write_only = write_only

- client._wrapped_request = client.request
+ client._no_cache_request = client.request
  client.request = types.MethodType(
  functools.wraps(client.request)(_cached_request), client
  )
  return client


- class CacheStorageBase:
+ class CacheStorageBase(abc.ABC):
+ @abc.abstractmethod
  def get(self, key: str) -> None | Response:
  raise NotImplementedError()

+ @abc.abstractmethod
  def set(self, key: str, response: Response) -> None:
  raise NotImplementedError()


  class MemoryCache(CacheStorageBase):
- """In memory cache for request responses."""
+ """
+ In memory cache for request responses.
+
+ Example:
+
+ make_careful_client(
+ cache_storage=MemoryCache(),
+ )
+
+ """

  def __init__(self) -> None:
  self.cache: dict[str, Response] = {}
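
`CacheStorageBase` above defines the two methods a storage backend must implement; a minimal sketch of a custom backend, assuming `CacheStorageBase` is importable from `careful.httpx` as the `__init__` imports earlier in this diff suggest (the class itself is illustrative):

```python
from httpx import Response
from careful.httpx import CacheStorageBase

class PrefixedMemoryCache(CacheStorageBase):
    """Toy backend: an in-process dict with namespaced keys."""

    def __init__(self, prefix: str = "dev") -> None:
        self._store: dict[str, Response] = {}
        self._prefix = prefix

    def get(self, key: str) -> None | Response:
        return self._store.get(f"{self._prefix}:{key}")

    def set(self, key: str, response: Response) -> None:
        self._store[f"{self._prefix}:{key}"] = response
```
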
@@ -122,11 +148,21 @@ class FileCache(CacheStorageBase):
  """
  File-based cache for request responses.

- :param cache_dir: directory for storing responses
- :param check_last_modified: set to True to compare last-modified
- timestamp in cached response with value from HEAD request
+ Parameters:
+ cache_dir: directory for storing responses
+
+ Example:
+
+ make_careful_client(
+ cache_storage=FileCache("_httpcache/"),
+ )
+
  """

+ # TODO: restore?
+ # check_last_modified: set to True to compare last-modified
+ # timestamp in cached response with value from HEAD request
+
  # file name escaping inspired by httplib2
  _prefix = re.compile(r"^\w+://")
  _illegal = re.compile(r"[?/:|]+")
@@ -188,7 +224,7 @@
  # status & encoding will be in headers, but are faked
  # need to split spaces out of status to get code (e.g. '200 OK')
  resp = Response(
- status_code = int(resp_headers.pop("status").split(" ")[0]),
+ status_code=int(resp_headers.pop("status").split(" ")[0]),
  content=resp_content,
  default_encoding=resp_headers.pop("encoding"),
  headers=resp_headers,
@@ -224,13 +260,18 @@
  os.remove(fname)


- class SQLiteCache(CacheStorageBase):
- """SQLite cache for request responses.
+ class SqliteCache(CacheStorageBase):
+ """
+ SQLite cache for request responses.
+
+ Parameters:
+ cache_path: path for SQLite database file

- :param cache_path: path for SQLite database file
- :param check_last_modified: set to True to compare last-modified
- timestamp in cached response with value from HEAD request
+ Example:

+ make_careful_client(
+ cache_storage=SqliteCache("_cache.db"),
+ )
  """

  _columns = ["key", "status", "modified", "encoding", "data", "headers"]
@@ -284,7 +325,12 @@
  # if rec["modified"] != new_lm:
  # return None

- resp = Response(rec["status"], content=rec["data"], default_encoding=rec["encoding"], headers=json.loads(rec["headers"]))
+ resp = Response(
+ rec["status"],
+ content=rec["data"],
+ default_encoding=rec["encoding"],
+ headers=json.loads(rec["headers"]),
+ )
  return resp

  def clear(self) -> None:
@@ -2,13 +2,22 @@ import time
  import types
  import functools
  import logging
- from httpx import Client, Response
+ from httpx import Client, Response, HTTPError

  log = logging.getLogger("httpx")


- def _default_accept_response(response: Response) -> bool:
- return response.status_code < 400
+ def retry_default_rule(response: Response) -> bool:
+ # default behavior is to retry 400s and 500s but not 404s
+ return response.status_code >= 400 and response.status_code != 404
+
+
+ def retry_only_500s(response: Response) -> bool:
+ return response.status_code >= 500
+
+
+ def retry_all_400s_500s(response: Response) -> bool:
+ return response.status_code >= 400


  def _retry_request(client: Client, *args, **kwargs):
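
The three predicates above are interchangeable via `should_retry`; a small sketch of picking the strictest one (module path as used in the tests later in this diff):

```python
from careful.httpx import make_careful_client
from careful.httpx.retries import retry_only_500s

# retry server errors only; 4xx responses are returned immediately
client = make_careful_client(
    retry_attempts=3,
    retry_wait_seconds=2,
    should_retry=retry_only_500s,
)
```
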
@@ -20,24 +29,21 @@ def _retry_request(client: Client, *args, **kwargs):
  exception_raised = None

  try:
- resp = client._wrapped_request(*args, **kwargs)
+ tries += 1
+ resp = client._no_retry_request(*args, **kwargs)

  # break from loop on an accepted response
- if client._accept_response(resp) or (
- resp.status_code == 404 and not client._retry_on_404
- ):
+ if not client._should_retry(resp):
  break

- except Exception as e:
- # TODO: exclude certain kinds of exceptions (SSL?) from retry
+ except HTTPError as e:
  exception_raised = e

  if exception_response := getattr(e, "response", None):
- if client._accept_response(exception_response):
+ if not client._should_retry(exception_response):
  break

  # if we're going to retry, sleep first
- tries += 1
  if tries <= client._retry_attempts:
  # twice as long each time
  wait = client._retry_wait_seconds * (2 ** (tries - 1))
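
The wait doubles on each attempt; a quick worked sketch of the schedule that formula produces:

```python
# wait = retry_wait_seconds * 2 ** (tries - 1)
retry_wait_seconds = 10
for tries in (1, 2, 3):
    print(tries, retry_wait_seconds * (2 ** (tries - 1)))
# -> 10, 20, 40 seconds before retries 1, 2, and 3
```
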
@@ -68,20 +74,17 @@ def make_retry_client(
  client: Client | None = None,
  attempts: int = 1,
  wait_seconds: float = 10,
- retry_on_404: bool = False,
- accept_response=_default_accept_response,
+ should_retry=retry_default_rule,
  ):
  if client is None:
  client = Client()
  client._retry_attempts = max(0, attempts)
  client._retry_wait_seconds = wait_seconds
- client._retry_on_404 = retry_on_404
- client._accept_response = accept_response
+ client._should_retry = should_retry

- client._wrapped_request = client.request
+ client._no_retry_request = client.request
  client.request = types.MethodType(
  functools.wraps(client.request)(_retry_request), client
  )

  return client
-
@@ -16,7 +16,7 @@ def _throttle_request(client: Client, *args, **kwargs):
  client._last_request = time.time()
  else:
  client._last_request = now
- return client._wrapped_request(*args, **kwargs)
+ return client._no_throttle_request(*args, **kwargs)


  def make_throttled_client(
@@ -34,7 +34,7 @@ def make_throttled_client(
  client._requests_per_minute = requests_per_minute
  client._request_frequency = 60.0 / requests_per_minute

- client._wrapped_request = client.request
+ client._no_throttle_request = client.request
  client.request = types.MethodType(
  functools.wraps(client.request)(_throttle_request), client
  )
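
`_request_frequency` above is the target spacing between requests; the arithmetic in a quick sketch:

```python
# 60 / requests_per_minute -> seconds between the start of consecutive requests
for rpm in (30, 60, 120):
    print(rpm, "rpm ->", 60.0 / rpm, "seconds between requests")
# 30 rpm -> 2.0, 60 rpm -> 1.0, 120 rpm -> 0.5
```
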
@@ -1,6 +1,6 @@
  from pytest_httpbin.serve import Server # type: ignore
  from httpx import Response
- from careful.httpx import make_dev_caching_client, MemoryCache, FileCache, SQLiteCache
+ from careful.httpx import make_dev_caching_client, MemoryCache, FileCache, SqliteCache


  def test_dev_caching(httpbin: Server) -> None:
@@ -59,7 +59,7 @@ def test_file_cache() -> None:


  def test_sqlite_cache() -> None:
- sc = SQLiteCache("cache.db")
+ sc = SqliteCache("cache.db")
  sc.clear()
  _test_cache_storage(sc)
  sc.clear()
@@ -0,0 +1,47 @@
+ from careful.httpx import make_careful_client, MemoryCache
+ from unittest import mock
+ from fakeresponse import FakeResponse
+
+
+ def test_full_careful_client():
+ client = make_careful_client(
+ retry_attempts=3,
+ retry_wait_seconds=0.00001,
+ cache_storage=MemoryCache(),
+ requests_per_minute=60,
+ )
+
+ # On the first call return a 500, then a 200, then a 404
+ mock_send = mock.Mock(
+ side_effect=[
+ FakeResponse("http://dummy/", 500, "failure!"),
+ FakeResponse("http://dummy/", 200, "success!"),
+ FakeResponse("http://dummy/2", 404, "success!"),
+ ]
+ )
+
+ mock_sleep = mock.Mock()
+
+ # check that sleep is called
+ with mock.patch("time.sleep", mock_sleep):
+ with mock.patch.object(client, "send", mock_send):
+ resp = client.get("http://dummy/")
+
+ # demonstrates a retry
+ assert mock_send.call_count == 2
+ assert resp.status_code == 200
+ # sleep called by retry, not by throttle yet
+ assert mock_sleep.call_count == 1
+
+ # demonstrates a cache (no new call)
+ resp = client.get("http://dummy/")
+ assert mock_send.call_count == 2
+ assert resp.status_code == 200
+ assert mock_sleep.call_count == 1
+
+ # a new, throttled call (no retry)
+ resp = client.get("http://dummy/2")
+ assert mock_send.call_count == 3
+ assert resp.status_code == 404
+ # call was throttled
+ assert mock_sleep.call_count == 2
@@ -1,4 +1,5 @@
  from careful.httpx import make_retry_client
+ from careful.httpx.retries import retry_all_400s_500s
  from unittest import mock
  from fakeresponse import FakeResponse

@@ -14,7 +15,7 @@ def test_retry() -> None:
  ]
  )

- with mock.patch.object(client, "_wrapped_request", mock_request):
+ with mock.patch.object(client, "send", mock_request):
  resp = client.get("http://dummy/")
  assert mock_request.call_count == 2

@@ -23,14 +24,14 @@
  return_value=FakeResponse("http://dummy/", 500, "failure!")
  )

- with mock.patch.object(client, "_wrapped_request", mock_request):
+ with mock.patch.object(client, "send", mock_request):
  resp = client.get("http://dummy/")
  assert resp.status_code == 500
  assert mock_request.call_count == 4 # try four times


  def test_retry_404() -> None:
- client = make_retry_client(attempts=3, wait_seconds=0.001, retry_on_404=True)
+ client = make_retry_client(attempts=3, wait_seconds=0.001, should_retry=retry_all_400s_500s)
  # On the first call return a 404, then a 200
  mock_request = mock.Mock(
  side_effect=[
@@ -40,7 +41,7 @@ def test_retry_404() -> None:
  ]
  )

- with mock.patch.object(client, "_wrapped_request", mock_request):
+ with mock.patch.object(client, "send", mock_request):
  resp = client.get("http://dummy/") # type: ignore
  assert mock_request.call_count == 2
  assert resp.status_code == 200
@@ -51,7 +52,7 @@ def test_retry_404() -> None:
  )

  # four tries
- with mock.patch.object(client, "_wrapped_request", mock_request):
+ with mock.patch.object(client, "send", mock_request):
  resp = client.get("http://dummy/")
  assert resp.status_code == 404
  assert mock_request.call_count == 4
@@ -59,7 +60,7 @@ def test_retry_404() -> None:


  def test_no_retry_404() -> None:
- client = make_retry_client(attempts=3, wait_seconds=0.001, retry_on_404=False)
+ client = make_retry_client(attempts=3, wait_seconds=0.001)

  # On the first call return a 404, then a 200
  mock_request = mock.Mock(
@@ -69,7 +70,7 @@ def test_no_retry_404() -> None:
  ]
  )

- with mock.patch.object(client, "_wrapped_request", mock_request):
+ with mock.patch.object(client, "send", mock_request):
  resp = client.get("http://dummy/") # type: ignore
  assert mock_request.call_count == 1
  assert resp.status_code == 404
@@ -1,11 +1,12 @@
  from careful.httpx import make_throttled_client
  from unittest import mock
  from typing import Any
+ from httpx import Request
  from fakeresponse import FakeResponse


- def request_200(method: str, url: str, *args: Any, **kwargs: Any) -> FakeResponse:
- return FakeResponse(url, 200, b"ok")
+ def request_200(request: Request, *args: Any, **kwargs: Any) -> FakeResponse:
+ return FakeResponse(request.url, 200, b"ok")


  mock_200 = mock.Mock(wraps=request_200)
@@ -18,7 +19,7 @@ def test_request_throttling() -> None:

  # check that sleep is called on call 2 & 3
  with mock.patch("time.sleep", mock_sleep):
- with mock.patch.object(client, "_wrapped_request", mock_200):
+ with mock.patch.object(client, "send", mock_200):
  client.get("http://dummy/")
  client.get("http://dummy/")
  client.get("http://dummy/")
careful-0.1.0/PKG-INFO DELETED
@@ -1,48 +0,0 @@
- Metadata-Version: 2.4
- Name: careful
- Version: 0.1.0
- Summary: Add your description here
- Author-email: jpt <dev@jpt.sh>
- License-File: LICENSE
- Requires-Python: >=3.13
- Requires-Dist: httpx>=0.28.1
- Requires-Dist: pytest-httpbin>=2.1.0
- Requires-Dist: pytest>=8.4.2
- Description-Content-Type: text/markdown
-
- **careful_httpx** is a library for making requests to less-than-reliable websites.
-
- It is based on [scrapelib](https://pypi.org/scrapelib/), which has powered Open States & many other Python scrapers for over 15 years.
-
- Code: <https://codeberg.org/jpt/careful_httpx>
-
- Documentation: TODO
-
- ## Features
-
- Enhances [`httpx.Client`](https://www.python-httpx.org) with features useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
-
- - retries
- - throttling
- - dev-cache for iterating on scrapers
-
- ### example
-
- TODO
-
- ### features this has that scrapelib doesn't
-
- - httpx support
- - composable interface, can augment Client with just the enhancements you want
-
- TODO: don't allow instantiating bad patch classes, and check for incompatible configs
-
- ### features scrapelib had that this doesn't
-
- Open to considering if there is interest, but didn't seem necessary.
-
- - HTTP(S) and FTP requests via an identical API
- - allow setting custom ciphers
- - have urlretrieve
- - support FTP
- - set custom user-agent/mess w/ headers
careful-0.1.0/README.md DELETED
@@ -1,36 +0,0 @@
- **careful_httpx** is a library for making requests to less-than-reliable websites.
-
- It is based on [scrapelib](https://pypi.org/scrapelib/), which has powered Open States & many other Python scrapers for over 15 years.
-
- Code: <https://codeberg.org/jpt/careful_httpx>
-
- Documentation: TODO
-
- ## Features
-
- Enhances [`httpx.Client`](https://www.python-httpx.org) with features useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
-
- - retries
- - throttling
- - dev-cache for iterating on scrapers
-
- ### example
-
- TODO
-
- ### features this has that scrapelib doesn't
-
- - httpx support
- - composable interface, can augment Client with just the enhancements you want
-
- TODO: don't allow instantiating bad patch classes, and check for incompatible configs
-
- ### features scrapelib had that this doesn't
-
- Open to considering if there is interest, but didn't seem necessary.
-
- - HTTP(S) and FTP requests via an identical API
- - allow setting custom ciphers
- - have urlretrieve
- - support FTP
- - set custom user-agent/mess w/ headers
@@ -1,19 +0,0 @@
- [project]
- name = "careful"
- version = "0.1.0"
- description = "Add your description here"
- readme = "README.md"
- authors = [
- { name = "jpt", email = "dev@jpt.sh" }
- ]
- requires-python = ">=3.13"
- dependencies = [
- "httpx>=0.28.1",
- "pytest>=8.4.2",
- "pytest-httpbin>=2.1.0",
- ]
-
-
- [build-system]
- requires = ["hatchling"]
- build-backend = "hatchling.build"
@@ -1,59 +0,0 @@
- from .retries import make_retry_client, _default_accept_response
- from .throttle import make_throttled_client
- from .dev_cache import (
- make_dev_caching_client,
- MemoryCache,
- FileCache,
- SQLiteCache,
- _cache_200s,
- _default_keyfunc,
- )
- from httpx import Client
-
-
- def make_careful_client(
- client: Client,
- *,
- retry_attempts: int = 0,
- retry_wait_seconds: float = 10,
- retry_on_404: bool = False,
- accept_response=_default_accept_response,
- requests_per_minute: int = 0,
- cache_storage=None,
- cache_write_only=False,
- should_cache=_cache_200s,
- cache_keyfunc=_default_keyfunc,
- ):
- # order matters, retry on inside b/c it is last-chance scenario
- if retry_attempts:
- client = make_retry_client(
- client=client,
- attempts=retry_attempts,
- wait_seconds=retry_wait_seconds,
- retry_on_404=retry_on_404,
- accept_response=accept_response,
- )
- # throttling around retries
- if requests_per_minute:
- client = make_throttled_client(client, requests_per_minute=requests_per_minute)
- # caching on top layer, so cache will be checked first
- if cache_storage:
- client = make_dev_caching_client(
- client=client,
- cache_storage=cache_storage,
- cache_keyfunc=cache_keyfunc,
- should_cache=should_cache,
- write_only=cache_write_only,
- )
-
- return client
-
-
- __all__ = [
- "make_retry_client",
- "make_throttled_client",
- "make_dev_caching_client",
- "MemoryCache",
- "FileCache",
- "SQLiteCache",
- ]
@@ -1,4 +0,0 @@
- from careful.httpx import make_careful_client
-
- def test_full_careful_client():
- client = make_careful_client()
File without changes
File without changes
File without changes
File without changes