withcache 0.4.1__tar.gz → 0.4.3__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: withcache
3
- Version: 0.4.1
3
+ Version: 0.4.3
4
4
  Summary: Operator-curated, URL-keyed artifact cache for a small lab (CUDA/ROCm/DOCA/firmware)
5
5
  Project-URL: Homepage, https://github.com/safl/withcache
6
6
  Author-email: "Simon A. F. Lund" <safl@safl.dk>
@@ -2,7 +2,7 @@
2
2
  .name = .withcache_shim,
3
3
  // Zig requires a literal here; keep it in lockstep with the project's
4
4
  // single source (src/withcache/__init__.py) via `make bump` / `make version-check`.
5
- .version = "0.4.1",
5
+ .version = "0.4.3",
6
6
  .fingerprint = 0xd7d96c5ed212ccaa,
7
7
  .minimum_zig_version = "0.16.0",
8
8
  .paths = .{
@@ -12,6 +12,6 @@ All modules are stdlib-only and self-contained.
12
12
 
13
13
  from .client import blob_url, cache_base, is_cached, serve_url
14
14
 
15
- __version__ = "0.4.1"
15
+ __version__ = "0.4.3"
16
16
 
17
17
  __all__ = ["__version__", "blob_url", "cache_base", "is_cached", "serve_url"]
@@ -41,8 +41,20 @@ import urllib.request
41
41
  from dataclasses import dataclass, field
42
42
  from datetime import datetime, timezone
43
43
 
44
+ from . import __version__
45
+
44
46
  CHUNK = 64 * 1024
45
- USER_AGENT = "withcache-cache/0.1"
47
+ USER_AGENT = f"withcache-cache/{__version__}"
48
+ # Resume budget for a single store_from_origin call. A truncated
49
+ # upstream stream re-fetches with ``Range: bytes=<got>-`` so the
50
+ # next attempt picks up where the cut happened. Five tries cover
51
+ # the realistic failure mode (e.g. ghcr.io serves blobs via Azure
52
+ # Blob Storage SAS URLs with a ~10 minute expiry; a >2 GiB image
53
+ # at modest bandwidth blows past one window and the connection is
54
+ # cut server-side, but a fresh redirect through ghcr yields a new
55
+ # SAS URL each retry). The cap is the give-up gate, not a normal
56
+ # operating depth.
57
+ RESUME_MAX_ATTEMPTS = 5
46
58
  STATIC_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "static")
47
59
  MIME_TYPES = {".css": "text/css; charset=utf-8", ".js": "application/javascript; charset=utf-8"}
48
60
  _DB_WRITE_LOCK = threading.Lock()
@@ -285,7 +297,14 @@ class Store:
285
297
  except FileNotFoundError:
286
298
  pass
287
299
 
288
- def store_from_origin(self, url: str, progress=None, cancel=None, headers=None) -> sqlite3.Row:
300
+ def store_from_origin(
301
+ self,
302
+ url: str,
303
+ progress=None,
304
+ cancel=None,
305
+ headers=None,
306
+ max_resume_attempts: int = RESUME_MAX_ATTEMPTS,
307
+ ) -> sqlite3.Row:
289
308
  """Operator-triggered: pull the artifact from origin and store it.
290
309
 
291
310
  ``progress(done, total)`` is called as bytes arrive (total may be None);
@@ -294,52 +313,105 @@ class Store:
294
313
  ``headers`` adds request headers to the origin fetch (e.g. a registry
295
314
  bearer token bty pre-resolved for an oras blob). Raises :class:`CacheFull`
296
315
  if the cache is already at --max-bytes.
316
+
317
+ Resume-on-truncation: if the upstream stream ends before its
318
+ declared Content-Length, the partial bytes are kept and the
319
+ next attempt requests ``Range: bytes=<got>-`` so the fetch
320
+ picks up where the connection died. Up to
321
+ ``max_resume_attempts`` attempts are made before
322
+ :class:`TruncatedDownload` is raised; on giving up the
323
+ partial file is removed. A 200 response to a Range request
324
+ (the origin chose to ignore the header, common on naive
325
+ upstreams) is handled by restarting from byte 0 and counts
326
+ against the same attempt budget. Re-issuing the request also
327
+ re-resolves any 30x redirect chain, which matters for
328
+ ghcr.io: each ghcr request hands back a fresh Azure Blob
329
+ Storage SAS URL valid only for a short window, and the
330
+ prior cut almost certainly was that SAS expiring mid-stream.
297
331
  """
298
332
  if not self.has_capacity():
299
333
  raise CacheFull(f"cache full (>= {self.max_bytes} bytes); refusing to fetch {url}")
300
334
  normalized = self.normalize(url)
301
335
  key = self.key_of(normalized)
302
336
  tmp = os.path.join(self.tmp_dir, key + ".part")
303
- req_headers = {"User-Agent": USER_AGENT}
337
+ base_headers = {"User-Agent": USER_AGENT}
304
338
  if headers:
305
- req_headers.update(headers)
306
- req = urllib.request.Request(url, headers=req_headers)
339
+ base_headers.update(headers)
307
340
  sha = hashlib.sha256()
308
341
  size = 0
342
+ total: int | None = None
343
+ content_type: str | None = None
309
344
  try:
310
- with urllib.request.urlopen(req, timeout=120) as resp:
311
- content_type = resp.headers.get_content_type()
312
- cl = resp.headers.get("Content-Length")
313
- total = int(cl) if cl and cl.isdigit() else None
314
- if progress:
315
- progress(0, total)
316
- with open(tmp, "wb") as f:
317
- while True:
318
- if cancel and cancel():
319
- raise DownloadCancelled()
320
- chunk = resp.read(CHUNK)
321
- if not chunk:
322
- break
323
- f.write(chunk)
324
- sha.update(chunk)
325
- size += len(chunk)
326
- if progress:
327
- progress(size, total)
328
- # urllib's read loop exits on clean EOF AND on transport-
329
- # aborted close; HTTPResponse only raises IncompleteRead
330
- # in some configurations. When the origin declared
331
- # Content-Length, treat that as the contract and refuse
332
- # to promote a short blob. A silent partial-promotion
333
- # would serve malformed bytes to every future consumer
334
- # with no way for them to invalidate the entry.
335
- if total is not None and size != total:
345
+ for _ in range(max_resume_attempts):
346
+ req_headers = dict(base_headers)
347
+ if size > 0:
348
+ # Resume from where the previous attempt cut.
349
+ # A 206 response continues the stream; a 200
350
+ # means the origin ignored Range (e.g. a dumb
351
+ # static server) and we restart from 0.
352
+ req_headers["Range"] = f"bytes={size}-"
353
+ req = urllib.request.Request(url, headers=req_headers)
354
+ with urllib.request.urlopen(req, timeout=120) as resp:
355
+ status = getattr(resp, "status", None) or resp.getcode()
356
+ if content_type is None:
357
+ content_type = resp.headers.get_content_type()
358
+ if size > 0 and status == 200:
359
+ # Range ignored by origin: discard the partial
360
+ # and start a fresh full-stream attempt.
361
+ size = 0
362
+ sha = hashlib.sha256()
363
+ if os.path.exists(tmp):
364
+ os.remove(tmp)
365
+ if size > 0 and status == 206:
366
+ # ``Content-Range: bytes <start>-<end>/<total>``;
367
+ # use the total declared there as the contract,
368
+ # not Content-Length (which on 206 is the size
369
+ # of the partial response, not the whole blob).
370
+ cr = resp.headers.get("Content-Range") or ""
371
+ if "/" in cr:
372
+ tail = cr.rsplit("/", 1)[1].strip()
373
+ if tail.isdigit():
374
+ total = int(tail)
375
+ else:
376
+ cl = resp.headers.get("Content-Length")
377
+ if cl and cl.isdigit():
378
+ total = int(cl)
379
+ if progress:
380
+ progress(size, total)
381
+ mode = "ab" if size > 0 else "wb"
382
+ with open(tmp, mode) as f:
383
+ while True:
384
+ if cancel and cancel():
385
+ raise DownloadCancelled()
386
+ chunk = resp.read(CHUNK)
387
+ if not chunk:
388
+ break
389
+ f.write(chunk)
390
+ sha.update(chunk)
391
+ size += len(chunk)
392
+ if progress:
393
+ progress(size, total)
394
+ # urllib's read loop exits on clean EOF AND on transport-
395
+ # aborted close; HTTPResponse only raises IncompleteRead
396
+ # in some configurations. When the origin declared a
397
+ # total (either via Content-Length on a 200 or via
398
+ # Content-Range on a 206), treat that as the contract:
399
+ # try to resume from the cut, give up after the budget
400
+ # is exhausted. Without a declared total there is no
401
+ # truncation signal, so a single attempt is the whole
402
+ # story.
403
+ if total is None or size >= total:
404
+ break
405
+ else:
406
+ # for/else: ran out of attempts before reaching total
336
407
  raise TruncatedDownload(
337
408
  f"upstream truncated for {url}: declared {total} bytes, got {size}"
409
+ f" after {max_resume_attempts} attempts"
338
410
  )
339
411
  os.replace(tmp, self.blob_path(key))
340
412
  except BaseException:
341
413
  if os.path.exists(tmp):
342
- os.remove(tmp) # no half-written blob on cancel/error
414
+ os.remove(tmp) # no half-written blob on cancel/error/give-up
343
415
  raise
344
416
  ts = now_iso()
345
417
  with _DB_WRITE_LOCK, self.conn() as c:
@@ -496,7 +568,7 @@ def _set_progress(job: Job, done: int, total: int | None):
496
568
  # HTTP handler
497
569
  # --------------------------------------------------------------------------
498
570
  class Handler(http.server.BaseHTTPRequestHandler):
499
- server_version = "withcache/0.1"
571
+ server_version = f"withcache/{__version__}"
500
572
  protocol_version = "HTTP/1.1"
501
573
 
502
574
  @property
@@ -789,7 +861,10 @@ class Handler(http.server.BaseHTTPRequestHandler):
789
861
  return f"""{self._head("withcache — login")}
790
862
  <body><main class="container">
791
863
  <article style="max-width: 24rem; margin: 4rem auto;">
792
- <hgroup><h2>withcache</h2><p>operator login</p></hgroup>
864
+ <hgroup>
865
+ <h2>withcache <small class="mono">v{html.escape(__version__)}</small></h2>
866
+ <p>operator login</p>
867
+ </hgroup>
793
868
  {err}
794
869
  <form method="post" action="/ui/login">
795
870
  <input type="password" name="password" placeholder="Admin password" autofocus required>
@@ -809,7 +884,10 @@ class Handler(http.server.BaseHTTPRequestHandler):
809
884
  return f"""{self._head("withcache cache-host")}
810
885
  <body><main class="container">
811
886
  <nav>
812
- <ul><li><strong>withcache</strong> &nbsp;<small>cache-host</small></li></ul>
887
+ <ul><li>
888
+ <strong>withcache</strong> &nbsp;<small>cache-host</small>
889
+ &nbsp;<small class="mono">v{html.escape(__version__)}</small>
890
+ </li></ul>
813
891
  <ul>
814
892
  <li><progress id="spin" class="htmx-indicator"></progress></li>
815
893
  {logout}
@@ -215,8 +215,12 @@ class TestTruncatedDownloadRejected(unittest.TestCase):
215
215
 
216
216
  def test_truncated_upstream_raises_and_leaves_no_blob(self):
217
217
  url = f"http://127.0.0.1:{self.port}/truncated.bin"
218
+ # _TruncatingOrigin truncates EVERY response (including
219
+ # ranged retries) so capping max_resume_attempts at 1 keeps
220
+ # the test fast: the single attempt cuts at 500 bytes,
221
+ # exhausts the budget, and the TruncatedDownload fires.
218
222
  with self.assertRaises(server.TruncatedDownload) as cm:
219
- self.store.store_from_origin(url)
223
+ self.store.store_from_origin(url, max_resume_attempts=1)
220
224
  # the message must name both totals so the operator can see
221
225
  # how short the upstream came
222
226
  msg = str(cm.exception)
@@ -231,13 +235,139 @@ class TestTruncatedDownloadRejected(unittest.TestCase):
231
235
  def test_repeat_request_after_truncation_can_retry_cleanly(self):
232
236
  url = f"http://127.0.0.1:{self.port}/truncated.bin"
233
237
  with self.assertRaises(server.TruncatedDownload):
234
- self.store.store_from_origin(url)
238
+ self.store.store_from_origin(url, max_resume_attempts=1)
235
239
  # second attempt against the same URL would have hit the
236
240
  # poisoned cache before the fix; now it must repeat the
237
241
  # failure mode (no sticky blob blocking the retry) so a
238
242
  # later origin recovery can re-fill the entry cleanly.
239
243
  with self.assertRaises(server.TruncatedDownload):
240
- self.store.store_from_origin(url)
244
+ self.store.store_from_origin(url, max_resume_attempts=1)
245
+
246
+
247
+ # --------------------------------------------------------------------------
248
+ # Range-resume: a flaky upstream that cuts mid-stream MUST be retried with
249
+ # ``Range: bytes=<got>-`` so the partial is filled rather than discarded.
250
+ # This is the lab-spotted ghcr.io failure mode where Azure Blob Storage
251
+ # SAS URLs expire mid-download for any blob bigger than a few minutes of
252
+ # bandwidth: a single attempt always loses, but a retried Range request
253
+ # starts a fresh SAS window and the second leg finishes the blob.
254
+ # --------------------------------------------------------------------------
255
+ class _ResumableTruncatingOrigin(http.server.BaseHTTPRequestHandler):
256
+ """Cut the FIRST GET in half; honor ``Range: bytes=<n>-`` on retries
257
+ by serving from offset n to end. Mirrors the ghcr -> Azure Blob
258
+ pattern: each connection has a hard wall-clock limit but the bytes
259
+ themselves are available on re-fetch.
260
+
261
+ Shared class-level counter so multiple instances (the threaded server
262
+ spawns one handler per request) all see the same call count and the
263
+ first GET truncates regardless of which thread services it.
264
+ """
265
+
266
+ PAYLOAD = b"abcdefghij" * 100 # 1000 bytes
267
+ _lock = threading.Lock()
268
+ _calls = 0
269
+
270
+ @classmethod
271
+ def reset(cls) -> None:
272
+ with cls._lock:
273
+ cls._calls = 0
274
+
275
+ def do_GET(self):
276
+ with self._lock:
277
+ self.__class__._calls += 1
278
+ call = self._calls
279
+ rng = self.headers.get("Range") or ""
280
+ start = 0
281
+ if rng.startswith("bytes="):
282
+ try:
283
+ start = int(rng[len("bytes=") :].split("-", 1)[0])
284
+ except ValueError:
285
+ start = 0
286
+ full = len(self.PAYLOAD)
287
+ if start > 0:
288
+ # ranged retry: serve the rest cleanly
289
+ body = self.PAYLOAD[start:]
290
+ self.send_response(206)
291
+ self.send_header("Content-Type", "application/octet-stream")
292
+ self.send_header("Content-Length", str(len(body)))
293
+ self.send_header(
294
+ "Content-Range",
295
+ f"bytes {start}-{full - 1}/{full}",
296
+ )
297
+ self.end_headers()
298
+ self.wfile.write(body)
299
+ return
300
+ # first attempt: declare full length but cut at half
301
+ self.send_response(200)
302
+ self.send_header("Content-Type", "application/octet-stream")
303
+ self.send_header("Content-Length", str(full))
304
+ self.end_headers()
305
+ if call == 1:
306
+ half = full // 2
307
+ self.wfile.write(self.PAYLOAD[:half])
308
+ self.wfile.flush()
309
+ try:
310
+ self.connection.shutdown(socket.SHUT_RDWR)
311
+ except OSError:
312
+ pass
313
+ else:
314
+ # any non-ranged retry serves the whole thing (covers the
315
+ # 200-on-Range fallback path: origin ignored Range, we
316
+ # restart from 0)
317
+ self.wfile.write(self.PAYLOAD)
318
+
319
+ def log_message(self, format, *args):
320
+ pass
321
+
322
+
323
+ class TestRangeResumeOnTruncation(unittest.TestCase):
324
+ def setUp(self):
325
+ _ResumableTruncatingOrigin.reset()
326
+ self.httpd = socketserver.ThreadingTCPServer(("127.0.0.1", 0), _ResumableTruncatingOrigin)
327
+ self.port = self.httpd.server_address[1]
328
+ self.t = threading.Thread(target=self.httpd.serve_forever, daemon=True)
329
+ self.t.start()
330
+ self.store = server.Store(tempfile.mkdtemp(), keep_query=False)
331
+
332
+ def tearDown(self):
333
+ self.httpd.shutdown()
334
+ self.httpd.server_close()
335
+
336
+ def test_truncated_stream_resumes_via_range(self):
337
+ """First GET cuts at byte 500; second GET (with
338
+ ``Range: bytes=500-``) returns 206 and the remaining 500.
339
+ Result: a complete 1000-byte blob in the cache, sha256 matches
340
+ the upstream's full payload, no TruncatedDownload raised."""
341
+ import hashlib
342
+
343
+ url = f"http://127.0.0.1:{self.port}/resumable.bin"
344
+ row = self.store.store_from_origin(url)
345
+ self.assertEqual(row["size"], len(_ResumableTruncatingOrigin.PAYLOAD))
346
+ self.assertEqual(
347
+ row["sha256"],
348
+ hashlib.sha256(_ResumableTruncatingOrigin.PAYLOAD).hexdigest(),
349
+ )
350
+ with open(self.store.blob_path(row["key"]), "rb") as f:
351
+ self.assertEqual(f.read(), _ResumableTruncatingOrigin.PAYLOAD)
352
+
353
+ def test_progress_callback_reports_continuing_offset_on_resume(self):
354
+ """Progress reports must be monotonic across the resume: the
355
+ second leg's reads start at 500 (the partial-so-far) and walk
356
+ up to 1000, NOT restart at 0. An operator dashboard watching
357
+ ``progress`` for a stuck job needs to see the bytes climb."""
358
+ observed: list[tuple[int, int | None]] = []
359
+ url = f"http://127.0.0.1:{self.port}/resumable.bin"
360
+ self.store.store_from_origin(url, progress=lambda d, t: observed.append((d, t)))
361
+ # final report should be the full payload
362
+ self.assertEqual(observed[-1][0], len(_ResumableTruncatingOrigin.PAYLOAD))
363
+ # at no point did the byte counter regress
364
+ for prev, curr in zip(observed, observed[1:], strict=False):
365
+ self.assertGreaterEqual(curr[0], prev[0])
366
+ # the resume actually crossed the cut point: at least one
367
+ # progress call lands above the half-mark (otherwise we
368
+ # would have stalled at 500)
369
+ half = len(_ResumableTruncatingOrigin.PAYLOAD) // 2
370
+ self.assertTrue(any(d > half for d, _ in observed))
241
371
 
242
372
 
243
373
  # --------------------------------------------------------------------------
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes