PyPI - withcache - Versions diffs - 0.4.0__tar.gz → 0.4.2__tar.gz - Mend

withcache 0.4.0 (tar.gz) → 0.4.2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

{withcache-0.4.0 → withcache-0.4.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: withcache
-Version: 0.4.0
+Version: 0.4.2
 Summary: Operator-curated, URL-keyed artifact cache for a small lab (CUDA/ROCm/DOCA/firmware)
 Project-URL: Homepage, https://github.com/safl/withcache
 Author-email: "Simon A. F. Lund" <safl@safl.dk>

{withcache-0.4.0 → withcache-0.4.2}/shim/build.zig.zon RENAMED Viewed

@@ -2,7 +2,7 @@
     .name = .withcache_shim,
     // Zig requires a literal here; keep it in lockstep with the project's
     // single source (src/withcache/__init__.py) via `make bump` / `make version-check`.
-    .version = "0.4.0",
+    .version = "0.4.2",
     .fingerprint = 0xd7d96c5ed212ccaa,
     .minimum_zig_version = "0.16.0",
     .paths = .{

{withcache-0.4.0 → withcache-0.4.2}/src/withcache/__init__.py RENAMED Viewed

@@ -12,6 +12,6 @@ All modules are stdlib-only and self-contained.
 from .client import blob_url, cache_base, is_cached, serve_url
-__version__ = "0.4.0"
+__version__ = "0.4.2"
 __all__ = ["__version__", "blob_url", "cache_base", "is_cached", "serve_url"]

{withcache-0.4.0 → withcache-0.4.2}/src/withcache/server.py RENAMED Viewed

@@ -43,6 +43,16 @@ from datetime import datetime, timezone
 CHUNK = 64 * 1024
 USER_AGENT = "withcache-cache/0.1"
+# Resume budget for a single store_from_origin call. A truncated
+# upstream stream re-fetches with ``Range: bytes=<got>-`` so the
+# next attempt picks up where the cut happened. Five tries cover
+# the realistic failure mode (e.g. ghcr.io serves blobs via Azure
+# Blob Storage SAS URLs with a ~10 minute expiry; a >2 GiB image
+# at modest bandwidth blows past one window and the connection is
+# cut server-side, but a fresh redirect through ghcr yields a new
+# SAS URL each retry). The cap is the give-up gate, not a normal
+# operating depth.
+RESUME_MAX_ATTEMPTS = 5
 STATIC_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "static")
 MIME_TYPES = {".css": "text/css; charset=utf-8", ".js": "application/javascript; charset=utf-8"}
 _DB_WRITE_LOCK = threading.Lock()
@@ -285,7 +295,14 @@ class Store:
         except FileNotFoundError:
             pass
-    def store_from_origin(self, url: str, progress=None, cancel=None, headers=None) -> sqlite3.Row:
+    def store_from_origin(
+        self,
+        url: str,
+        progress=None,
+        cancel=None,
+        headers=None,
+        max_resume_attempts: int = RESUME_MAX_ATTEMPTS,
+    ) -> sqlite3.Row:
         """Operator-triggered: pull the artifact from origin and store it.
         ``progress(done, total)`` is called as bytes arrive (total may be None);
@@ -294,41 +311,105 @@ class Store:
         ``headers`` adds request headers to the origin fetch (e.g. a registry
         bearer token bty pre-resolved for an oras blob). Raises :class:`CacheFull`
         if the cache is already at --max-bytes.
+        Resume-on-truncation: if the upstream stream ends before its
+        declared Content-Length, the partial bytes are kept and the
+        next attempt requests ``Range: bytes=<got>-`` so the fetch
+        picks up where the connection died. Up to
+        ``max_resume_attempts`` attempts are made before
+        :class:`TruncatedDownload` is raised; on giving up the
+        partial file is removed. A 200 response to a Range request
+        (the origin chose to ignore the header, common on naive
+        upstreams) is handled by restarting from byte 0 and counts
+        against the same attempt budget. Re-issuing the request also
+        re-resolves any 30x redirect chain, which matters for
+        ghcr.io: each ghcr request hands back a fresh Azure Blob
+        Storage SAS URL valid only for a short window, and the
+        prior cut almost certainly was that SAS expiring mid-stream.
         """
         if not self.has_capacity():
             raise CacheFull(f"cache full (>= {self.max_bytes} bytes); refusing to fetch {url}")
         normalized = self.normalize(url)
         key = self.key_of(normalized)
         tmp = os.path.join(self.tmp_dir, key + ".part")
-        req_headers = {"User-Agent": USER_AGENT}
+        base_headers = {"User-Agent": USER_AGENT}
         if headers:
-            req_headers.update(headers)
-        req = urllib.request.Request(url, headers=req_headers)
+            base_headers.update(headers)
         sha = hashlib.sha256()
         size = 0
+        total: int | None = None
+        content_type: str | None = None
         try:
-            with urllib.request.urlopen(req, timeout=120) as resp:
-                content_type = resp.headers.get_content_type()
-                cl = resp.headers.get("Content-Length")
-                total = int(cl) if cl and cl.isdigit() else None
-                if progress:
-                    progress(0, total)
-                with open(tmp, "wb") as f:
-                    while True:
-                        if cancel and cancel():
-                            raise DownloadCancelled()
-                        chunk = resp.read(CHUNK)
-                        if not chunk:
-                            break
-                        f.write(chunk)
-                        sha.update(chunk)
-                        size += len(chunk)
-                        if progress:
-                            progress(size, total)
+            for _ in range(max_resume_attempts):
+                req_headers = dict(base_headers)
+                if size > 0:
+                    # Resume from where the previous attempt cut.
+                    # A 206 response continues the stream; a 200
+                    # means the origin ignored Range (e.g. a dumb
+                    # static server) and we restart from 0.
+                    req_headers["Range"] = f"bytes={size}-"
+                req = urllib.request.Request(url, headers=req_headers)
+                with urllib.request.urlopen(req, timeout=120) as resp:
+                    status = getattr(resp, "status", None) or resp.getcode()
+                    if content_type is None:
+                        content_type = resp.headers.get_content_type()
+                    if size > 0 and status == 200:
+                        # Range ignored by origin: discard the partial
+                        # and start a fresh full-stream attempt.
+                        size = 0
+                        sha = hashlib.sha256()
+                        if os.path.exists(tmp):
+                            os.remove(tmp)
+                    if size > 0 and status == 206:
+                        # ``Content-Range: bytes <start>-<end>/<total>``;
+                        # use the total declared there as the contract,
+                        # not Content-Length (which on 206 is the size
+                        # of the partial response, not the whole blob).
+                        cr = resp.headers.get("Content-Range") or ""
+                        if "/" in cr:
+                            tail = cr.rsplit("/", 1)[1].strip()
+                            if tail.isdigit():
+                                total = int(tail)
+                    else:
+                        cl = resp.headers.get("Content-Length")
+                        if cl and cl.isdigit():
+                            total = int(cl)
+                    if progress:
+                        progress(size, total)
+                    mode = "ab" if size > 0 else "wb"
+                    with open(tmp, mode) as f:
+                        while True:
+                            if cancel and cancel():
+                                raise DownloadCancelled()
+                            chunk = resp.read(CHUNK)
+                            if not chunk:
+                                break
+                            f.write(chunk)
+                            sha.update(chunk)
+                            size += len(chunk)
+                            if progress:
+                                progress(size, total)
+                # urllib's read loop exits on clean EOF AND on transport-
+                # aborted close; HTTPResponse only raises IncompleteRead
+                # in some configurations. When the origin declared a
+                # total (either via Content-Length on a 200 or via
+                # Content-Range on a 206), treat that as the contract:
+                # try to resume from the cut, give up after the budget
+                # is exhausted. Without a declared total there is no
+                # truncation signal, so a single attempt is the whole
+                # story.
+                if total is None or size >= total:
+                    break
+            else:
+                # for/else: ran out of attempts before reaching total
+                raise TruncatedDownload(
+                    f"upstream truncated for {url}: declared {total} bytes, got {size}"
+                    f" after {max_resume_attempts} attempts"
+                )
             os.replace(tmp, self.blob_path(key))
         except BaseException:
             if os.path.exists(tmp):
-                os.remove(tmp)  # no half-written blob on cancel/error
+                os.remove(tmp)  # no half-written blob on cancel/error/give-up
             raise
         ts = now_iso()
         with _DB_WRITE_LOCK, self.conn() as c:
@@ -369,6 +450,14 @@ class CacheFull(Exception):
     """Raised when --max-bytes is reached; the fill is refused, not evicted."""
+class TruncatedDownload(Exception):
+    """Raised when the upstream stream ended before the declared
+    Content-Length. The temp file is removed and no blob row is
+    written, so the same URL re-enqueues cleanly on the next request
+    instead of permanently serving a malformed file.
+    """
 @dataclass
 class Job:
     id: int

{withcache-0.4.0 → withcache-0.4.2}/tests/test_withcache.py RENAMED Viewed

@@ -7,6 +7,7 @@ without an install.
 import http.server
 import os
 import shutil
+import socket
 import socketserver
 import sys
 import tempfile
@@ -167,6 +168,208 @@ class TestStoreFromOrigin(unittest.TestCase):
             store.store_from_origin(f"http://127.0.0.1:{self.port}/b.bin")
+class _TruncatingOrigin(http.server.BaseHTTPRequestHandler):
+    """Declare a full Content-Length, then send half the payload and
+    close the socket. Mirrors the real-world failure mode where the
+    upstream drops the connection mid-stream (lab-box fedora-44-desktop
+    flash that surfaced this bug)."""
+    PAYLOAD = b"abcdefghij" * 100  # 1000 bytes; will write half then close
+    def do_GET(self):
+        self.send_response(200)
+        self.send_header("Content-Type", "application/octet-stream")
+        self.send_header("Content-Length", str(len(self.PAYLOAD)))
+        self.end_headers()
+        half = len(self.PAYLOAD) // 2
+        self.wfile.write(self.PAYLOAD[:half])
+        # close the underlying socket so urllib observes EOF before
+        # Content-Length bytes arrive
+        self.wfile.flush()
+        try:
+            self.connection.shutdown(socket.SHUT_RDWR)
+        except OSError:
+            pass
+    def log_message(self, format, *args):
+        pass
+class TestTruncatedDownloadRejected(unittest.TestCase):
+    """Regression for the lab-spotted bug where a transport-aborted
+    upstream stream silently became a permanent cached blob: future
+    HEADs returned 200 with the partial bytes, every consumer got a
+    malformed file, and the only escape was hand-deleting the blob.
+    Content-Length mismatches now fail loudly and leave no entry."""
+    def setUp(self):
+        self.httpd = socketserver.TCPServer(("127.0.0.1", 0), _TruncatingOrigin)
+        self.port = self.httpd.server_address[1]
+        self.t = threading.Thread(target=self.httpd.serve_forever, daemon=True)
+        self.t.start()
+        self.store = server.Store(tempfile.mkdtemp(), keep_query=False)
+    def tearDown(self):
+        self.httpd.shutdown()
+        self.httpd.server_close()
+    def test_truncated_upstream_raises_and_leaves_no_blob(self):
+        url = f"http://127.0.0.1:{self.port}/truncated.bin"
+        # _TruncatingOrigin truncates EVERY response (including
+        # ranged retries) so capping max_resume_attempts at 1 keeps
+        # the test fast: the single attempt cuts at 500 bytes,
+        # exhausts the budget, and the TruncatedDownload fires.
+        with self.assertRaises(server.TruncatedDownload) as cm:
+            self.store.store_from_origin(url, max_resume_attempts=1)
+        # the message must name both totals so the operator can see
+        # how short the upstream came
+        msg = str(cm.exception)
+        self.assertIn("1000", msg)  # declared
+        self.assertIn("500", msg)  # got
+        # no row was written; no blob file lingers on disk
+        self.assertIsNone(self.store.get_blob(url))
+        blobs = list(self.store.blob_path("").rsplit("/", 1)[0:1])
+        if os.path.isdir(blobs[0]):
+            self.assertEqual(os.listdir(blobs[0]), [])
+    def test_repeat_request_after_truncation_can_retry_cleanly(self):
+        url = f"http://127.0.0.1:{self.port}/truncated.bin"
+        with self.assertRaises(server.TruncatedDownload):
+            self.store.store_from_origin(url, max_resume_attempts=1)
+        # second attempt against the same URL would have hit the
+        # poisoned cache before the fix; now it must repeat the
+        # failure mode (no sticky blob blocking the retry) so a
+        # later origin recovery can re-fill the entry cleanly.
+        with self.assertRaises(server.TruncatedDownload):
+            self.store.store_from_origin(url, max_resume_attempts=1)
+# --------------------------------------------------------------------------
+# Range-resume: a flaky upstream that cuts mid-stream MUST be retried with
+# ``Range: bytes=<got>-`` so the partial is filled rather than discarded.
+# This is the lab-spotted ghcr.io failure mode where Azure Blob Storage
+# SAS URLs expire mid-download for any blob bigger than a few minutes of
+# bandwidth: a single attempt always loses, but a retried Range request
+# starts a fresh SAS window and the second leg finishes the blob.
+# --------------------------------------------------------------------------
+class _ResumableTruncatingOrigin(http.server.BaseHTTPRequestHandler):
+    """Cut the FIRST GET in half; honor ``Range: bytes=<n>-`` on retries
+    by serving from offset n to end. Mirrors the ghcr -> Azure Blob
+    pattern: each connection has a hard wall-clock limit but the bytes
+    themselves are available on re-fetch.
+    Shared class-level counter so multiple instances (the threaded server
+    spawns one handler per request) all see the same call count and the
+    first GET truncates regardless of which thread services it.
+    """
+    PAYLOAD = b"abcdefghij" * 100  # 1000 bytes
+    _lock = threading.Lock()
+    _calls = 0
+    @classmethod
+    def reset(cls) -> None:
+        with cls._lock:
+            cls._calls = 0
+    def do_GET(self):
+        with self._lock:
+            self.__class__._calls += 1
+            call = self._calls
+        rng = self.headers.get("Range") or ""
+        start = 0
+        if rng.startswith("bytes="):
+            try:
+                start = int(rng[len("bytes=") :].split("-", 1)[0])
+            except ValueError:
+                start = 0
+        full = len(self.PAYLOAD)
+        if start > 0:
+            # ranged retry: serve the rest cleanly
+            body = self.PAYLOAD[start:]
+            self.send_response(206)
+            self.send_header("Content-Type", "application/octet-stream")
+            self.send_header("Content-Length", str(len(body)))
+            self.send_header(
+                "Content-Range",
+                f"bytes {start}-{full - 1}/{full}",
+            )
+            self.end_headers()
+            self.wfile.write(body)
+            return
+        # first attempt: declare full length but cut at half
+        self.send_response(200)
+        self.send_header("Content-Type", "application/octet-stream")
+        self.send_header("Content-Length", str(full))
+        self.end_headers()
+        if call == 1:
+            half = full // 2
+            self.wfile.write(self.PAYLOAD[:half])
+            self.wfile.flush()
+            try:
+                self.connection.shutdown(socket.SHUT_RDWR)
+            except OSError:
+                pass
+        else:
+            # any non-ranged retry serves the whole thing (covers the
+            # 200-on-Range fallback path: origin ignored Range, we
+            # restart from 0)
+            self.wfile.write(self.PAYLOAD)
+    def log_message(self, format, *args):
+        pass
+class TestRangeResumeOnTruncation(unittest.TestCase):
+    def setUp(self):
+        _ResumableTruncatingOrigin.reset()
+        self.httpd = socketserver.ThreadingTCPServer(("127.0.0.1", 0), _ResumableTruncatingOrigin)
+        self.port = self.httpd.server_address[1]
+        self.t = threading.Thread(target=self.httpd.serve_forever, daemon=True)
+        self.t.start()
+        self.store = server.Store(tempfile.mkdtemp(), keep_query=False)
+    def tearDown(self):
+        self.httpd.shutdown()
+        self.httpd.server_close()
+    def test_truncated_stream_resumes_via_range(self):
+        """First GET cuts at byte 500; second GET (with
+        ``Range: bytes=500-``) returns 206 and the remaining 500.
+        Result: a complete 1000-byte blob in the cache, sha256 matches
+        the upstream's full payload, no TruncatedDownload raised."""
+        import hashlib
+        url = f"http://127.0.0.1:{self.port}/resumable.bin"
+        row = self.store.store_from_origin(url)
+        self.assertEqual(row["size"], len(_ResumableTruncatingOrigin.PAYLOAD))
+        self.assertEqual(
+            row["sha256"],
+            hashlib.sha256(_ResumableTruncatingOrigin.PAYLOAD).hexdigest(),
+        )
+        with open(self.store.blob_path(row["key"]), "rb") as f:
+            self.assertEqual(f.read(), _ResumableTruncatingOrigin.PAYLOAD)
+    def test_progress_callback_reports_continuing_offset_on_resume(self):
+        """Progress reports must be monotonic across the resume: the
+        second leg's reads start at 500 (the partial-so-far) and walk
+        up to 1000, NOT restart at 0. An operator dashboard watching
+        ``progress`` for a stuck job needs to see the bytes climb."""
+        observed: list[tuple[int, int | None]] = []
+        url = f"http://127.0.0.1:{self.port}/resumable.bin"
+        self.store.store_from_origin(url, progress=lambda d, t: observed.append((d, t)))
+        # final report should be the full payload
+        self.assertEqual(observed[-1][0], len(_ResumableTruncatingOrigin.PAYLOAD))
+        # at no point did the byte counter regress
+        for prev, curr in zip(observed, observed[1:], strict=False):
+            self.assertGreaterEqual(curr[0], prev[0])
+        # the resume actually crossed the cut point: at least one
+        # progress call lands above the half-mark (otherwise we
+        # would have stalled at 500)
+        half = len(_ResumableTruncatingOrigin.PAYLOAD) // 2
+        self.assertTrue(any(d > half for d, _ in observed))
 # --------------------------------------------------------------------------
 # _shim: URL detection, rewrite, real-tool resolution, env, path-encoding
 # --------------------------------------------------------------------------