PyPI - withcache - Versions diffs - 0.4.1__tar.gz → 0.4.2__tar.gz - Mend

withcache 0.4.1.tar.gz → 0.4.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

{withcache-0.4.1 → withcache-0.4.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: withcache
-Version: 0.4.1
+Version: 0.4.2
 Summary: Operator-curated, URL-keyed artifact cache for a small lab (CUDA/ROCm/DOCA/firmware)
 Project-URL: Homepage, https://github.com/safl/withcache
 Author-email: "Simon A. F. Lund" <safl@safl.dk>

{withcache-0.4.1 → withcache-0.4.2}/shim/build.zig.zon RENAMED Viewed

@@ -2,7 +2,7 @@
     .name = .withcache_shim,
     // Zig requires a literal here; keep it in lockstep with the project's
     // single source (src/withcache/__init__.py) via `make bump` / `make version-check`.
-    .version = "0.4.1",
+    .version = "0.4.2",
     .fingerprint = 0xd7d96c5ed212ccaa,
     .minimum_zig_version = "0.16.0",
     .paths = .{

{withcache-0.4.1 → withcache-0.4.2}/src/withcache/__init__.py RENAMED Viewed

@@ -12,6 +12,6 @@ All modules are stdlib-only and self-contained.
 from .client import blob_url, cache_base, is_cached, serve_url
-__version__ = "0.4.1"
+__version__ = "0.4.2"
 __all__ = ["__version__", "blob_url", "cache_base", "is_cached", "serve_url"]

{withcache-0.4.1 → withcache-0.4.2}/src/withcache/server.py RENAMED Viewed

@@ -43,6 +43,16 @@ from datetime import datetime, timezone
 CHUNK = 64 * 1024
 USER_AGENT = "withcache-cache/0.1"
+# Resume budget for a single store_from_origin call. A truncated
+# upstream stream re-fetches with ``Range: bytes=<got>-`` so the
+# next attempt picks up where the cut happened. Five tries cover
+# the realistic failure mode (e.g. ghcr.io serves blobs via Azure
+# Blob Storage SAS URLs with a ~10 minute expiry; a >2 GiB image
+# at modest bandwidth blows past one window and the connection is
+# cut server-side, but a fresh redirect through ghcr yields a new
+# SAS URL each retry). The cap is the give-up gate, not a normal
+# operating depth.
+RESUME_MAX_ATTEMPTS = 5
 STATIC_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "static")
 MIME_TYPES = {".css": "text/css; charset=utf-8", ".js": "application/javascript; charset=utf-8"}
 _DB_WRITE_LOCK = threading.Lock()
@@ -285,7 +295,14 @@ class Store:
         except FileNotFoundError:
             pass
-    def store_from_origin(self, url: str, progress=None, cancel=None, headers=None) -> sqlite3.Row:
+    def store_from_origin(
+        self,
+        url: str,
+        progress=None,
+        cancel=None,
+        headers=None,
+        max_resume_attempts: int = RESUME_MAX_ATTEMPTS,
+    ) -> sqlite3.Row:
         """Operator-triggered: pull the artifact from origin and store it.
         ``progress(done, total)`` is called as bytes arrive (total may be None);
@@ -294,52 +311,105 @@ class Store:
         ``headers`` adds request headers to the origin fetch (e.g. a registry
         bearer token bty pre-resolved for an oras blob). Raises :class:`CacheFull`
         if the cache is already at --max-bytes.
+        Resume-on-truncation: if the upstream stream ends before its
+        declared Content-Length, the partial bytes are kept and the
+        next attempt requests ``Range: bytes=<got>-`` so the fetch
+        picks up where the connection died. Up to
+        ``max_resume_attempts`` attempts are made before
+        :class:`TruncatedDownload` is raised; on giving up the
+        partial file is removed. A 200 response to a Range request
+        (the origin chose to ignore the header, common on naive
+        upstreams) is handled by restarting from byte 0 and counts
+        against the same attempt budget. Re-issuing the request also
+        re-resolves any 30x redirect chain, which matters for
+        ghcr.io: each ghcr request hands back a fresh Azure Blob
+        Storage SAS URL valid only for a short window, and the
+        prior cut almost certainly was that SAS expiring mid-stream.
         """
         if not self.has_capacity():
             raise CacheFull(f"cache full (>= {self.max_bytes} bytes); refusing to fetch {url}")
         normalized = self.normalize(url)
         key = self.key_of(normalized)
         tmp = os.path.join(self.tmp_dir, key + ".part")
-        req_headers = {"User-Agent": USER_AGENT}
+        base_headers = {"User-Agent": USER_AGENT}
         if headers:
-            req_headers.update(headers)
-        req = urllib.request.Request(url, headers=req_headers)
+            base_headers.update(headers)
         sha = hashlib.sha256()
         size = 0
+        total: int | None = None
+        content_type: str | None = None
         try:
-            with urllib.request.urlopen(req, timeout=120) as resp:
-                content_type = resp.headers.get_content_type()
-                cl = resp.headers.get("Content-Length")
-                total = int(cl) if cl and cl.isdigit() else None
-                if progress:
-                    progress(0, total)
-                with open(tmp, "wb") as f:
-                    while True:
-                        if cancel and cancel():
-                            raise DownloadCancelled()
-                        chunk = resp.read(CHUNK)
-                        if not chunk:
-                            break
-                        f.write(chunk)
-                        sha.update(chunk)
-                        size += len(chunk)
-                        if progress:
-                            progress(size, total)
-            # urllib's read loop exits on clean EOF AND on transport-
-            # aborted close; HTTPResponse only raises IncompleteRead
-            # in some configurations. When the origin declared
-            # Content-Length, treat that as the contract and refuse
-            # to promote a short blob. A silent partial-promotion
-            # would serve malformed bytes to every future consumer
-            # with no way for them to invalidate the entry.
-            if total is not None and size != total:
+            for _ in range(max_resume_attempts):
+                req_headers = dict(base_headers)
+                if size > 0:
+                    # Resume from where the previous attempt cut.
+                    # A 206 response continues the stream; a 200
+                    # means the origin ignored Range (e.g. a dumb
+                    # static server) and we restart from 0.
+                    req_headers["Range"] = f"bytes={size}-"
+                req = urllib.request.Request(url, headers=req_headers)
+                with urllib.request.urlopen(req, timeout=120) as resp:
+                    status = getattr(resp, "status", None) or resp.getcode()
+                    if content_type is None:
+                        content_type = resp.headers.get_content_type()
+                    if size > 0 and status == 200:
+                        # Range ignored by origin: discard the partial
+                        # and start a fresh full-stream attempt.
+                        size = 0
+                        sha = hashlib.sha256()
+                        if os.path.exists(tmp):
+                            os.remove(tmp)
+                    if size > 0 and status == 206:
+                        # ``Content-Range: bytes <start>-<end>/<total>``;
+                        # use the total declared there as the contract,
+                        # not Content-Length (which on 206 is the size
+                        # of the partial response, not the whole blob).
+                        cr = resp.headers.get("Content-Range") or ""
+                        if "/" in cr:
+                            tail = cr.rsplit("/", 1)[1].strip()
+                            if tail.isdigit():
+                                total = int(tail)
+                    else:
+                        cl = resp.headers.get("Content-Length")
+                        if cl and cl.isdigit():
+                            total = int(cl)
+                    if progress:
+                        progress(size, total)
+                    mode = "ab" if size > 0 else "wb"
+                    with open(tmp, mode) as f:
+                        while True:
+                            if cancel and cancel():
+                                raise DownloadCancelled()
+                            chunk = resp.read(CHUNK)
+                            if not chunk:
+                                break
+                            f.write(chunk)
+                            sha.update(chunk)
+                            size += len(chunk)
+                            if progress:
+                                progress(size, total)
+                # urllib's read loop exits on clean EOF AND on transport-
+                # aborted close; HTTPResponse only raises IncompleteRead
+                # in some configurations. When the origin declared a
+                # total (either via Content-Length on a 200 or via
+                # Content-Range on a 206), treat that as the contract:
+                # try to resume from the cut, give up after the budget
+                # is exhausted. Without a declared total there is no
+                # truncation signal, so a single attempt is the whole
+                # story.
+                if total is None or size >= total:
+                    break
+            else:
+                # for/else: ran out of attempts before reaching total
                 raise TruncatedDownload(
                     f"upstream truncated for {url}: declared {total} bytes, got {size}"
+                    f" after {max_resume_attempts} attempts"
                 )
             os.replace(tmp, self.blob_path(key))
         except BaseException:
             if os.path.exists(tmp):
-                os.remove(tmp)  # no half-written blob on cancel/error
+                os.remove(tmp)  # no half-written blob on cancel/error/give-up
             raise
         ts = now_iso()
         with _DB_WRITE_LOCK, self.conn() as c:

{withcache-0.4.1 → withcache-0.4.2}/tests/test_withcache.py RENAMED Viewed

@@ -215,8 +215,12 @@ class TestTruncatedDownloadRejected(unittest.TestCase):
     def test_truncated_upstream_raises_and_leaves_no_blob(self):
         url = f"http://127.0.0.1:{self.port}/truncated.bin"
+        # _TruncatingOrigin truncates EVERY response (including
+        # ranged retries) so capping max_resume_attempts at 1 keeps
+        # the test fast: the single attempt cuts at 500 bytes,
+        # exhausts the budget, and the TruncatedDownload fires.
         with self.assertRaises(server.TruncatedDownload) as cm:
-            self.store.store_from_origin(url)
+            self.store.store_from_origin(url, max_resume_attempts=1)
         # the message must name both totals so the operator can see
         # how short the upstream came
         msg = str(cm.exception)
@@ -231,13 +235,139 @@ class TestTruncatedDownloadRejected(unittest.TestCase):
     def test_repeat_request_after_truncation_can_retry_cleanly(self):
         url = f"http://127.0.0.1:{self.port}/truncated.bin"
         with self.assertRaises(server.TruncatedDownload):
-            self.store.store_from_origin(url)
+            self.store.store_from_origin(url, max_resume_attempts=1)
         # second attempt against the same URL would have hit the
         # poisoned cache before the fix; now it must repeat the
         # failure mode (no sticky blob blocking the retry) so a
         # later origin recovery can re-fill the entry cleanly.
         with self.assertRaises(server.TruncatedDownload):
-            self.store.store_from_origin(url)
+            self.store.store_from_origin(url, max_resume_attempts=1)
+# --------------------------------------------------------------------------
+# Range-resume: a flaky upstream that cuts mid-stream MUST be retried with
+# ``Range: bytes=<got>-`` so the partial is filled rather than discarded.
+# This is the lab-spotted ghcr.io failure mode where Azure Blob Storage
+# SAS URLs expire mid-download for any blob bigger than a few minutes of
+# bandwidth: a single attempt always loses, but a retried Range request
+# starts a fresh SAS window and the second leg finishes the blob.
+# --------------------------------------------------------------------------
+class _ResumableTruncatingOrigin(http.server.BaseHTTPRequestHandler):
+    """Cut the FIRST GET in half; honor ``Range: bytes=<n>-`` on retries
+    by serving from offset n to end. Mirrors the ghcr -> Azure Blob
+    pattern: each connection has a hard wall-clock limit but the bytes
+    themselves are available on re-fetch.
+    Shared class-level counter so multiple instances (the threaded server
+    spawns one handler per request) all see the same call count and the
+    first GET truncates regardless of which thread services it.
+    """
+    PAYLOAD = b"abcdefghij" * 100  # 1000 bytes
+    _lock = threading.Lock()
+    _calls = 0
+    @classmethod
+    def reset(cls) -> None:
+        with cls._lock:
+            cls._calls = 0
+    def do_GET(self):
+        with self._lock:
+            self.__class__._calls += 1
+            call = self._calls
+        rng = self.headers.get("Range") or ""
+        start = 0
+        if rng.startswith("bytes="):
+            try:
+                start = int(rng[len("bytes=") :].split("-", 1)[0])
+            except ValueError:
+                start = 0
+        full = len(self.PAYLOAD)
+        if start > 0:
+            # ranged retry: serve the rest cleanly
+            body = self.PAYLOAD[start:]
+            self.send_response(206)
+            self.send_header("Content-Type", "application/octet-stream")
+            self.send_header("Content-Length", str(len(body)))
+            self.send_header(
+                "Content-Range",
+                f"bytes {start}-{full - 1}/{full}",
+            )
+            self.end_headers()
+            self.wfile.write(body)
+            return
+        # first attempt: declare full length but cut at half
+        self.send_response(200)
+        self.send_header("Content-Type", "application/octet-stream")
+        self.send_header("Content-Length", str(full))
+        self.end_headers()
+        if call == 1:
+            half = full // 2
+            self.wfile.write(self.PAYLOAD[:half])
+            self.wfile.flush()
+            try:
+                self.connection.shutdown(socket.SHUT_RDWR)
+            except OSError:
+                pass
+        else:
+            # any non-ranged retry serves the whole thing (covers the
+            # 200-on-Range fallback path: origin ignored Range, we
+            # restart from 0)
+            self.wfile.write(self.PAYLOAD)
+    def log_message(self, format, *args):
+        pass
+class TestRangeResumeOnTruncation(unittest.TestCase):
+    def setUp(self):
+        _ResumableTruncatingOrigin.reset()
+        self.httpd = socketserver.ThreadingTCPServer(("127.0.0.1", 0), _ResumableTruncatingOrigin)
+        self.port = self.httpd.server_address[1]
+        self.t = threading.Thread(target=self.httpd.serve_forever, daemon=True)
+        self.t.start()
+        self.store = server.Store(tempfile.mkdtemp(), keep_query=False)
+    def tearDown(self):
+        self.httpd.shutdown()
+        self.httpd.server_close()
+    def test_truncated_stream_resumes_via_range(self):
+        """First GET cuts at byte 500; second GET (with
+        ``Range: bytes=500-``) returns 206 and the remaining 500.
+        Result: a complete 1000-byte blob in the cache, sha256 matches
+        the upstream's full payload, no TruncatedDownload raised."""
+        import hashlib
+        url = f"http://127.0.0.1:{self.port}/resumable.bin"
+        row = self.store.store_from_origin(url)
+        self.assertEqual(row["size"], len(_ResumableTruncatingOrigin.PAYLOAD))
+        self.assertEqual(
+            row["sha256"],
+            hashlib.sha256(_ResumableTruncatingOrigin.PAYLOAD).hexdigest(),
+        )
+        with open(self.store.blob_path(row["key"]), "rb") as f:
+            self.assertEqual(f.read(), _ResumableTruncatingOrigin.PAYLOAD)
+    def test_progress_callback_reports_continuing_offset_on_resume(self):
+        """Progress reports must be monotonic across the resume: the
+        second leg's reads start at 500 (the partial-so-far) and walk
+        up to 1000, NOT restart at 0. An operator dashboard watching
+        ``progress`` for a stuck job needs to see the bytes climb."""
+        observed: list[tuple[int, int | None]] = []
+        url = f"http://127.0.0.1:{self.port}/resumable.bin"
+        self.store.store_from_origin(url, progress=lambda d, t: observed.append((d, t)))
+        # final report should be the full payload
+        self.assertEqual(observed[-1][0], len(_ResumableTruncatingOrigin.PAYLOAD))
+        # at no point did the byte counter regress
+        for prev, curr in zip(observed, observed[1:], strict=False):
+            self.assertGreaterEqual(curr[0], prev[0])
+        # the resume actually crossed the cut point: at least one
+        # progress call lands above the half-mark (otherwise we
+        # would have stalled at 500)
+        half = len(_ResumableTruncatingOrigin.PAYLOAD) // 2
+        self.assertTrue(any(d > half for d, _ in observed))
 # --------------------------------------------------------------------------