careful-0.1.0-py3-none-any.whl

careful/__init__.py ADDED
File without changes
careful/httpx/__init__.py ADDED
@@ -0,0 +1,59 @@
+ from .retries import make_retry_client, _default_accept_response
+ from .throttle import make_throttled_client
+ from .dev_cache import (
+     make_dev_caching_client,
+     MemoryCache,
+     FileCache,
+     SQLiteCache,
+     _cache_200s,
+     _default_keyfunc,
+ )
+ from httpx import Client
+
+
+ def make_careful_client(
+     client: Client,
+     *,
+     retry_attempts: int = 0,
+     retry_wait_seconds: float = 10,
+     retry_on_404: bool = False,
+     accept_response=_default_accept_response,
+     requests_per_minute: int = 0,
+     cache_storage=None,
+     cache_write_only=False,
+     should_cache=_cache_200s,
+     cache_keyfunc=_default_keyfunc,
+ ):
+     # order matters: retries go innermost, since a retry is the
+     # last-chance scenario
+     if retry_attempts:
+         client = make_retry_client(
+             client=client,
+             attempts=retry_attempts,
+             wait_seconds=retry_wait_seconds,
+             retry_on_404=retry_on_404,
+             accept_response=accept_response,
+         )
+     # throttling wraps around retries
+     if requests_per_minute:
+         client = make_throttled_client(
+             client=client, requests_per_minute=requests_per_minute
+         )
+     # caching is the top layer, so the cache is checked first
+     if cache_storage:
+         client = make_dev_caching_client(
+             client=client,
+             cache_storage=cache_storage,
+             cache_keyfunc=cache_keyfunc,
+             should_cache=should_cache,
+             write_only=cache_write_only,
+         )
+
+     return client
+
+
+ __all__ = [
+     "make_retry_client",
+     "make_throttled_client",
+     "make_dev_caching_client",
+     "MemoryCache",
+     "FileCache",
+     "SQLiteCache",
+ ]
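+
+
+ # layering sketch (illustrative): the same stack make_careful_client builds,
+ # assembled by hand from the individual wrappers, in the order applied above:
+ #
+ #     client = make_retry_client(client=Client(), attempts=3)
+ #     client = make_throttled_client(client=client, requests_per_minute=30)
+ #     client = make_dev_caching_client(client=client, cache_storage=MemoryCache())
+ #
+ # a cache hit on the outermost client never reaches the throttle or retry layers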
careful/httpx/dev_cache.py ADDED
@@ -0,0 +1,296 @@
+ import types
+ import functools
+ import logging
+ import re
+ import os
+ import glob
+ import hashlib
+ import sqlite3
+ import json
+
+ from httpx import Client, Response, Request
+
+ log = logging.getLogger("httpx")
+
+
+ def _default_keyfunc(
+     method: str,
+     url: str,
+     params: dict | None = None,
+ ) -> str | None:
+     """
+     Return a cache key from a given set of request parameters.
+
+     Default behavior is to return a complete URL for all GET
+     requests, and None otherwise.
+     """
+     if method.lower() != "get":
+         return None
+
+     # str() so the key matches the declared return type and can be
+     # encoded and stored by the cache backends below
+     return str(Request(url=url, method=method, params=params).url)
+
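+ # for example (illustrative): _default_keyfunc("GET", "https://example.com/a",
+ # {"page": "2"}) returns "https://example.com/a?page=2"; any POST returns None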
+
+ def _cache_200s(response: Response) -> bool:
+     """
+     Check if a given Response object should be cached.
+
+     Default behavior is to only cache responses with a 200 status code.
+     """
+     return response.status_code == 200
+
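+ # any callable with this signature can be passed as should_cache; for
+ # example, a sketch of a stricter policy that also skips empty bodies:
+ #
+ #     def cache_200s_with_body(response: Response) -> bool:
+ #         return response.status_code == 200 and bool(response.content)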
+
+ def _cached_request(client: Client, *args, **kwargs):
+     # short circuit if the cache isn't configured
+     if not client._cache_storage:
+         log.debug("bypassing cache, no storage configured")
+         resp = client._wrapped_request(*args, **kwargs)
+         resp.fromcache = False
+         return resp
+
+     method, url = args
+     # params may be absent when request() is called without them
+     request_key = client._cache_keyfunc(method, url, kwargs.get("params"))
+
+     # check cache for response
+     cached_resp = None
+     if request_key and not client._write_only:
+         cached_resp = client._cache_storage.get(request_key)
+
+     if cached_resp:
+         log.info("using cached response request_key=%s", request_key)
+         cached_resp.fromcache = True
+         resp = cached_resp
+     else:
+         resp = client._wrapped_request(*args, **kwargs)
+         # save to cache if request and response meet criteria
+         if request_key and client._should_cache(resp):
+             client._cache_storage.set(request_key, resp)
+             log.info("caching response request_key=%s", request_key)
+         resp.fromcache = False
+
+     return resp
+
+
+ def make_dev_caching_client(
+     *,
+     client: Client | None = None,
+     cache_storage=None,
+     cache_keyfunc=_default_keyfunc,
+     should_cache=_cache_200s,
+     write_only=False,
+ ):
+     if client is None:
+         client = Client()
+
+     client._cache_storage = cache_storage
+     client._cache_keyfunc = cache_keyfunc
+     client._should_cache = should_cache
+     client._write_only = write_only
+
+     client._wrapped_request = client.request
+     client.request = types.MethodType(
+         functools.wraps(client.request)(_cached_request), client
+     )
+     return client
+
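+ # note: all three wrappers in this package share this pattern -- stash the
+ # original request method as _wrapped_request, then bind the replacement
+ # onto the instance, so the enhancements compose by nesting clients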
+
+ class CacheStorageBase:
+     def get(self, key: str) -> None | Response:
+         raise NotImplementedError()
+
+     def set(self, key: str, response: Response) -> None:
+         raise NotImplementedError()
+
+
+ class MemoryCache(CacheStorageBase):
+     """In-memory cache for request responses."""
+
+     def __init__(self) -> None:
+         self.cache: dict[str, Response] = {}
+
+     def get(self, key: str) -> None | Response:
+         """Get cache entry for key, or return None."""
+         return self.cache.get(key, None)
+
+     def set(self, key: str, response: Response) -> None:
+         """Set cache entry for key with contents of response."""
+         self.cache[key] = response
+
+
+ class FileCache(CacheStorageBase):
+     """
+     File-based cache for request responses.
+
+     :param cache_dir: directory for storing responses
+     :param check_last_modified: set to True to compare last-modified
+         timestamp in cached response with value from HEAD request
+     """
+
+     # file name escaping inspired by httplib2
+     _prefix = re.compile(r"^\w+://")
+     _illegal = re.compile(r"[?/:|]+")
+     _header_re = re.compile(r"([-\w]+): (.*)")
+     _maxlen = 200
+
+     def _clean_key(self, key: str) -> str:
+         # hash the full key, then strip the scheme and replace illegal
+         # filename characters in the readable portion
+         md5 = hashlib.md5(key.encode("utf8")).hexdigest()
+         key = self._prefix.sub("", key)
+         key = self._illegal.sub(",", key)
+         return ",".join((key[: self._maxlen], md5))
+
+     def __init__(self, cache_dir: str, check_last_modified: bool = False):
+         # normalize path
+         self.cache_dir = os.path.join(os.getcwd(), cache_dir)
+         self.check_last_modified = check_last_modified
+         # create directory
+         if not os.path.isdir(self.cache_dir):
+             os.makedirs(self.cache_dir)
+
+     def get(self, orig_key: str) -> None | Response:
+         """Get cache entry for key, or return None."""
+         key = self._clean_key(orig_key)
+         path = os.path.join(self.cache_dir, key)
+         resp_headers = {}
+
+         try:
+             with open(path, "rb") as f:
+                 # read header lines one at a time
+                 while True:
+                     line = f.readline().decode("utf8").strip("\r\n")
+
+                     # if self.check_last_modified and re.search(
+                     #     "last-modified", line, flags=re.I
+                     # ):
+                     #     # line contains last modified header
+                     #     head_resp = requests.head(orig_key)
+                     #     try:
+                     #         new_lm = head_resp.headers["last-modified"]
+                     #         old_lm = line[line.find(":") + 1 :].strip()
+                     #         if old_lm != new_lm:
+                     #             # last modified timestamps don't match,
+                     #             # need to download again
+                     #             return None
+                     #     except KeyError:
+                     #         # no last modified header present, so redownload
+                     #         return None
+
+                     header = self._header_re.match(line)
+                     if header:
+                         resp_headers[header.group(1)] = header.group(2)
+                     else:
+                         break
+                 # everything left is the real content
+                 resp_content = f.read()
+
+             # status & encoding are stored as fake headers; split on spaces
+             # to get the code in case status was written as e.g. '200 OK'
+             resp = Response(
+                 status_code=int(resp_headers.pop("status").split(" ")[0]),
+                 content=resp_content,
+                 default_encoding=resp_headers.pop("encoding"),
+                 headers=resp_headers,
+             )
+             return resp
+         except IOError:
+             return None
+
+     def set(self, key: str, response: Response) -> None:
+         """Set cache entry for key with contents of response."""
+         key = self._clean_key(key)
+         path = os.path.join(self.cache_dir, key)
+
+         with open(path, "wb") as f:
+             status_str = "status: {0}\n".format(response.status_code)
+             f.write(status_str.encode("utf8"))
+             encoding_str = "encoding: {0}\n".format(response.encoding)
+             f.write(encoding_str.encode("utf8"))
+             for h, v in response.headers.items():
+                 # header: value\n
+                 f.write(h.encode("utf8"))
+                 f.write(b": ")
+                 f.write(v.encode("utf8"))
+                 f.write(b"\n")
+             # one blank line separates headers from body
+             f.write(b"\n")
+             f.write(response.content)
+
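+     # on-disk layout produced above and parsed back by ``get`` (illustrative):
+     #
+     #     status: 200
+     #     encoding: utf-8
+     #     content-type: text/html
+     #     <blank line, then raw body bytes>
+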
+     def clear(self) -> None:
+         # only delete files that end with an md5 hash; less dangerous this way
+         cache_glob = "*," + ("[0-9a-f]" * 32)
+         for fname in glob.glob(os.path.join(self.cache_dir, cache_glob)):
+             os.remove(fname)
+
+
+ class SQLiteCache(CacheStorageBase):
+     """SQLite cache for request responses.
+
+     :param cache_path: path for SQLite database file
+     :param check_last_modified: set to True to compare last-modified
+         timestamp in cached response with value from HEAD request
+     """
+
+     _columns = ["key", "status", "modified", "encoding", "data", "headers"]
+
+     def __init__(self, cache_path: str, check_last_modified: bool = False):
+         self.cache_path = cache_path
+         self.check_last_modified = check_last_modified
+         self._conn = sqlite3.connect(cache_path)
+         self._conn.text_factory = str
+         self._build_table()
+
+     def _build_table(self) -> None:
+         """Create table for storing request information and response."""
+         self._conn.execute(
+             """CREATE TABLE IF NOT EXISTS cache
+                (key text UNIQUE, status integer, modified text,
+                 encoding text, data blob, headers blob)"""
+         )
+
+     def set(self, key: str, response: Response) -> None:
+         """Set cache entry for key with contents of response."""
+         # last-modified goes in its own column; remaining headers are JSON
+         mod = response.headers.pop("last-modified", None)
+         status = int(response.status_code)
+         rec = (
+             key,
+             status,
+             mod,
+             response.encoding,
+             response.content,
+             json.dumps(dict(response.headers)),
+         )
+         with self._conn:
+             self._conn.execute("DELETE FROM cache WHERE key=?", (key,))
+             self._conn.execute("INSERT INTO cache VALUES (?,?,?,?,?,?)", rec)
+
+     def get(self, key: str) -> None | Response:
+         """Get cache entry for key, or return None."""
+         query = self._conn.execute("SELECT * FROM cache WHERE key=?", (key,))
+         rec = query.fetchone()
+         if rec is None:
+             return None
+         rec = dict(zip(self._columns, rec))
+
+         # TODO evaluate/remove?
+         # if self.check_last_modified:
+         #     if rec["modified"] is None:
+         #         return None  # no last modified header present, so redownload
+         #
+         #     head_resp = requests.head(key)
+         #     new_lm = head_resp.headers.get("last-modified", None)
+         #     if rec["modified"] != new_lm:
+         #         return None
+
+         resp = Response(
+             rec["status"],
+             content=rec["data"],
+             default_encoding=rec["encoding"],
+             headers=json.loads(rec["headers"]),
+         )
+         return resp
+
+     def clear(self) -> None:
+         """Remove all records from cache."""
+         with self._conn:
+             self._conn.execute("DELETE FROM cache")
+
+     def __del__(self) -> None:
+         self._conn.close()
careful/httpx/py.typed ADDED
File without changes
careful/httpx/retries.py ADDED
@@ -0,0 +1,87 @@
+ import time
+ import types
+ import functools
+ import logging
+ from httpx import Client, Response
+
+ log = logging.getLogger("httpx")
+
+
+ def _default_accept_response(response: Response) -> bool:
+     return response.status_code < 400
+
+
+ def _retry_request(client: Client, *args, **kwargs):
+     # the retry loop
+     tries = 0
+     exception_raised = None
+
+     while tries <= client._retry_attempts:
+         exception_raised = None
+
+         try:
+             resp = client._wrapped_request(*args, **kwargs)
+
+             # break from loop on an accepted response, or on a 404
+             # (not retried unless retry_on_404 is set)
+             if client._accept_response(resp) or (
+                 resp.status_code == 404 and not client._retry_on_404
+             ):
+                 break
+
+         except Exception as e:
+             # TODO: exclude certain kinds of exceptions (SSL?) from retry
+             exception_raised = e
+
+             if exception_response := getattr(e, "response", None):
+                 if client._accept_response(exception_response):
+                     break
+
+         # if we're going to retry, sleep first
+         tries += 1
+         if tries <= client._retry_attempts:
+             # wait twice as long each time
+             wait = client._retry_wait_seconds * (2 ** (tries - 1))
+             if exception_raised:
+                 log.info(
+                     "exception %s, sleeping for %s seconds before retry #%s",
+                     exception_raised,
+                     wait,
+                     tries,
+                 )
+             else:
+                 log.info(
+                     "response %s, sleeping for %s seconds before retry #%s",
+                     resp,
+                     wait,
+                     tries,
+                 )
+             time.sleep(wait)
+
+     # out of the loop: either an exception was raised or we have a response
+     if exception_raised:
+         raise exception_raised
+     return resp
+
+
+ def make_retry_client(
+     *,
+     client: Client | None = None,
+     attempts: int = 1,
+     wait_seconds: float = 10,
+     retry_on_404: bool = False,
+     accept_response=_default_accept_response,
+ ):
+     if client is None:
+         client = Client()
+     client._retry_attempts = max(0, attempts)
+     client._retry_wait_seconds = wait_seconds
+     client._retry_on_404 = retry_on_404
+     client._accept_response = accept_response
+
+     client._wrapped_request = client.request
+     client.request = types.MethodType(
+         functools.wraps(client.request)(_retry_request), client
+     )
+
+     return client
+
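+ # backoff sketch: attempts=3, wait_seconds=10 means up to 4 requests in
+ # total, sleeping 10s, 20s, then 40s between them (wait_seconds * 2 ** (tries - 1))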
careful/httpx/throttle.py ADDED
@@ -0,0 +1,41 @@
+ import time
+ import types
+ import functools
+ import logging
+ from httpx import Client
+
+ log = logging.getLogger("httpx")
+
+
+ def _throttle_request(client: Client, *args, **kwargs):
+     now = time.time()
+     diff = client._request_frequency - (now - client._last_request)
+     if diff > 0:
+         log.debug("throttled, sleeping for %fs", diff)
+         time.sleep(diff)
+         client._last_request = time.time()
+     else:
+         client._last_request = now
+     return client._wrapped_request(*args, **kwargs)
+
+
+ def make_throttled_client(
+     *,
+     client: Client | None = None,
+     requests_per_minute: float = 0,
+ ):
+     if requests_per_minute <= 0:
+         raise ValueError("requests per minute must be a positive number")
+
+     if client is None:
+         client = Client()
+
+     client._last_request = 0.0
+     client._requests_per_minute = requests_per_minute
+     # minimum number of seconds between requests
+     client._request_frequency = 60.0 / requests_per_minute
+
+     client._wrapped_request = client.request
+     client.request = types.MethodType(
+         functools.wraps(client.request)(_throttle_request), client
+     )
+     return client
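+ # e.g. requests_per_minute=30 gives _request_frequency = 2.0, i.e. at most
+ # one request every two seconds through this client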
careful-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,48 @@
+ Metadata-Version: 2.4
+ Name: careful
+ Version: 0.1.0
+ Summary: Add your description here
+ Author-email: jpt <dev@jpt.sh>
+ License-File: LICENSE
+ Requires-Python: >=3.13
+ Requires-Dist: httpx>=0.28.1
+ Requires-Dist: pytest-httpbin>=2.1.0
+ Requires-Dist: pytest>=8.4.2
+ Description-Content-Type: text/markdown
+
+ **careful_httpx** is a library for making requests to less-than-reliable websites.
+
+ It is based on [scrapelib](https://pypi.org/project/scrapelib/), which has powered Open States & many other Python scrapers for over 15 years.
+
+ Code: <https://codeberg.org/jpt/careful_httpx>
+
+ Documentation: TODO
+
+ ## Features
+
+ Enhances [`httpx.Client`](https://www.python-httpx.org) with features useful for writing long-running scrapers & crawlers, particularly against sites that are slow or have intermittent errors.
+
+ - retries
+ - throttling
+ - dev-cache for iterating on scrapers
+
+ ### example
+
+ TODO
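+
+ In the meantime, a minimal sketch of intended usage (illustrative, based only on the API above):
+
+ ```python
+ from httpx import Client
+ from careful.httpx import make_careful_client, MemoryCache
+
+ client = make_careful_client(
+     Client(),
+     retry_attempts=3,             # retry failures with exponential backoff
+     requests_per_minute=30,       # at most one request every 2 seconds
+     cache_storage=MemoryCache(),  # replay responses while iterating
+ )
+ resp = client.get("https://example.com")
+ ```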
+
+ ### features this has that scrapelib doesn't
+
+ - httpx support
+ - composable interface: augment a Client with just the enhancements you want
+
+ TODO: don't allow instantiating bad patch classes, and check for incompatible configs
+
+ ### features scrapelib had that this doesn't
+
+ Open to adding these if there is interest, but they didn't seem necessary:
+
+ - HTTP(S) and FTP requests via an identical API
+ - allow setting custom ciphers
+ - have urlretrieve
+ - support FTP
+ - set custom user-agent/mess w/ headers
careful-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
+ careful/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ careful/httpx/__init__.py,sha256=gDSnAnqxFt9mLi2laArt7BUn_wPU5ub0k9zeqsexYJY,1605
+ careful/httpx/dev_cache.py,sha256=_jwpnf1fzBzR23Of2HdsyiN_MPHvRG0gtM49Y4qRtQg,10031
+ careful/httpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ careful/httpx/retries.py,sha256=mMIZf-EP9bhzEZnEmwtjWZ2qdl6ZCJ7vq3hZltT6Zms,2458
+ careful/httpx/throttle.py,sha256=wCJWHERr5manyKq07ZdonmxbK0oh0PYJgO6a94IzN0s,1088
+ careful-0.1.0.dist-info/METADATA,sha256=wBFvqh5xyMfNRVB8Jg9Aa3s5_Te2NxcBy5BE-3NMYeY,1373
+ careful-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ careful-0.1.0.dist-info/licenses/LICENSE,sha256=oHe4LmCuo6CZne42DRXfiR3uqqIfGsk4dAvDKucAi4M,1315
+ careful-0.1.0.dist-info/RECORD,,
careful-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.27.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any
careful-0.1.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,24 @@
+ Copyright (c) 2025, James Turk
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.