withcache 0.4.1__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: withcache
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: Operator-curated, URL-keyed artifact cache for a small lab (CUDA/ROCm/DOCA/firmware)
5
5
  Project-URL: Homepage, https://github.com/safl/withcache
6
6
  Author-email: "Simon A. F. Lund" <safl@safl.dk>
@@ -2,7 +2,7 @@
2
2
  .name = .withcache_shim,
3
3
  // Zig requires a literal here; keep it in lockstep with the project's
4
4
  // single source (src/withcache/__init__.py) via `make bump` / `make version-check`.
5
- .version = "0.4.1",
5
+ .version = "0.4.2",
6
6
  .fingerprint = 0xd7d96c5ed212ccaa,
7
7
  .minimum_zig_version = "0.16.0",
8
8
  .paths = .{
@@ -12,6 +12,6 @@ All modules are stdlib-only and self-contained.
12
12
 
13
13
  from .client import blob_url, cache_base, is_cached, serve_url
14
14
 
15
- __version__ = "0.4.1"
15
+ __version__ = "0.4.2"
16
16
 
17
17
  __all__ = ["__version__", "blob_url", "cache_base", "is_cached", "serve_url"]
@@ -43,6 +43,16 @@ from datetime import datetime, timezone
43
43
 
44
44
  CHUNK = 64 * 1024
45
45
  USER_AGENT = "withcache-cache/0.1"
46
+ # Resume budget for a single store_from_origin call. A truncated
47
+ # upstream stream re-fetches with ``Range: bytes=<got>-`` so the
48
+ # next attempt picks up where the cut happened. Five tries cover
49
+ # the realistic failure mode (e.g. ghcr.io serves blobs via Azure
50
+ # Blob Storage SAS URLs with a ~10 minute expiry; a >2 GiB image
51
+ # at modest bandwidth blows past one window and the connection is
52
+ # cut server-side, but a fresh redirect through ghcr yields a new
53
+ # SAS URL each retry). The cap is the give-up gate, not a normal
54
+ # operating depth.
55
+ RESUME_MAX_ATTEMPTS = 5
46
56
  STATIC_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "static")
47
57
  MIME_TYPES = {".css": "text/css; charset=utf-8", ".js": "application/javascript; charset=utf-8"}
48
58
  _DB_WRITE_LOCK = threading.Lock()
@@ -285,7 +295,14 @@ class Store:
285
295
  except FileNotFoundError:
286
296
  pass
287
297
 
288
- def store_from_origin(self, url: str, progress=None, cancel=None, headers=None) -> sqlite3.Row:
298
+ def store_from_origin(
299
+ self,
300
+ url: str,
301
+ progress=None,
302
+ cancel=None,
303
+ headers=None,
304
+ max_resume_attempts: int = RESUME_MAX_ATTEMPTS,
305
+ ) -> sqlite3.Row:
289
306
  """Operator-triggered: pull the artifact from origin and store it.
290
307
 
291
308
  ``progress(done, total)`` is called as bytes arrive (total may be None);
@@ -294,52 +311,105 @@ class Store:
294
311
  ``headers`` adds request headers to the origin fetch (e.g. a registry
295
312
  bearer token bty pre-resolved for an oras blob). Raises :class:`CacheFull`
296
313
  if the cache is already at --max-bytes.
314
+
315
+ Resume-on-truncation: if the upstream stream ends before its
316
+ declared Content-Length, the partial bytes are kept and the
317
+ next attempt requests ``Range: bytes=<got>-`` so the fetch
318
+ picks up where the connection died. Up to
319
+ ``max_resume_attempts`` attempts are made before
320
+ :class:`TruncatedDownload` is raised; on giving up the
321
+ partial file is removed. A 200 response to a Range request
322
+ (the origin chose to ignore the header, common on naive
323
+ upstreams) is handled by restarting from byte 0 and counts
324
+ against the same attempt budget. Re-issuing the request also
325
+ re-resolves any 30x redirect chain, which matters for
326
+ ghcr.io: each ghcr request hands back a fresh Azure Blob
327
+ Storage SAS URL valid only for a short window, and the
328
+ prior cut almost certainly was that SAS expiring mid-stream.
297
329
  """
298
330
  if not self.has_capacity():
299
331
  raise CacheFull(f"cache full (>= {self.max_bytes} bytes); refusing to fetch {url}")
300
332
  normalized = self.normalize(url)
301
333
  key = self.key_of(normalized)
302
334
  tmp = os.path.join(self.tmp_dir, key + ".part")
303
- req_headers = {"User-Agent": USER_AGENT}
335
+ base_headers = {"User-Agent": USER_AGENT}
304
336
  if headers:
305
- req_headers.update(headers)
306
- req = urllib.request.Request(url, headers=req_headers)
337
+ base_headers.update(headers)
307
338
  sha = hashlib.sha256()
308
339
  size = 0
340
+ total: int | None = None
341
+ content_type: str | None = None
309
342
  try:
310
- with urllib.request.urlopen(req, timeout=120) as resp:
311
- content_type = resp.headers.get_content_type()
312
- cl = resp.headers.get("Content-Length")
313
- total = int(cl) if cl and cl.isdigit() else None
314
- if progress:
315
- progress(0, total)
316
- with open(tmp, "wb") as f:
317
- while True:
318
- if cancel and cancel():
319
- raise DownloadCancelled()
320
- chunk = resp.read(CHUNK)
321
- if not chunk:
322
- break
323
- f.write(chunk)
324
- sha.update(chunk)
325
- size += len(chunk)
326
- if progress:
327
- progress(size, total)
328
- # urllib's read loop exits on clean EOF AND on transport-
329
- # aborted close; HTTPResponse only raises IncompleteRead
330
- # in some configurations. When the origin declared
331
- # Content-Length, treat that as the contract and refuse
332
- # to promote a short blob. A silent partial-promotion
333
- # would serve malformed bytes to every future consumer
334
- # with no way for them to invalidate the entry.
335
- if total is not None and size != total:
343
+ for _ in range(max_resume_attempts):
344
+ req_headers = dict(base_headers)
345
+ if size > 0:
346
+ # Resume from where the previous attempt cut.
347
+ # A 206 response continues the stream; a 200
348
+ # means the origin ignored Range (e.g. a dumb
349
+ # static server) and we restart from 0.
350
+ req_headers["Range"] = f"bytes={size}-"
351
+ req = urllib.request.Request(url, headers=req_headers)
352
+ with urllib.request.urlopen(req, timeout=120) as resp:
353
+ status = getattr(resp, "status", None) or resp.getcode()
354
+ if content_type is None:
355
+ content_type = resp.headers.get_content_type()
356
+ if size > 0 and status == 200:
357
+ # Range ignored by origin: discard the partial
358
+ # and start a fresh full-stream attempt.
359
+ size = 0
360
+ sha = hashlib.sha256()
361
+ if os.path.exists(tmp):
362
+ os.remove(tmp)
363
+ if size > 0 and status == 206:
364
+ # ``Content-Range: bytes <start>-<end>/<total>``;
365
+ # use the total declared there as the contract,
366
+ # not Content-Length (which on 206 is the size
367
+ # of the partial response, not the whole blob).
368
+ cr = resp.headers.get("Content-Range") or ""
369
+ if "/" in cr:
370
+ tail = cr.rsplit("/", 1)[1].strip()
371
+ if tail.isdigit():
372
+ total = int(tail)
373
+ else:
374
+ cl = resp.headers.get("Content-Length")
375
+ if cl and cl.isdigit():
376
+ total = int(cl)
377
+ if progress:
378
+ progress(size, total)
379
+ mode = "ab" if size > 0 else "wb"
380
+ with open(tmp, mode) as f:
381
+ while True:
382
+ if cancel and cancel():
383
+ raise DownloadCancelled()
384
+ chunk = resp.read(CHUNK)
385
+ if not chunk:
386
+ break
387
+ f.write(chunk)
388
+ sha.update(chunk)
389
+ size += len(chunk)
390
+ if progress:
391
+ progress(size, total)
392
+ # urllib's read loop exits on clean EOF AND on transport-
393
+ # aborted close; HTTPResponse only raises IncompleteRead
394
+ # in some configurations. When the origin declared a
395
+ # total (either via Content-Length on a 200 or via
396
+ # Content-Range on a 206), treat that as the contract:
397
+ # try to resume from the cut, give up after the budget
398
+ # is exhausted. Without a declared total there is no
399
+ # truncation signal, so a single attempt is the whole
400
+ # story.
401
+ if total is None or size >= total:
402
+ break
403
+ else:
404
+ # for/else: ran out of attempts before reaching total
336
405
  raise TruncatedDownload(
337
406
  f"upstream truncated for {url}: declared {total} bytes, got {size}"
407
+ f" after {max_resume_attempts} attempts"
338
408
  )
339
409
  os.replace(tmp, self.blob_path(key))
340
410
  except BaseException:
341
411
  if os.path.exists(tmp):
342
- os.remove(tmp) # no half-written blob on cancel/error
412
+ os.remove(tmp) # no half-written blob on cancel/error/give-up
343
413
  raise
344
414
  ts = now_iso()
345
415
  with _DB_WRITE_LOCK, self.conn() as c:
@@ -215,8 +215,12 @@ class TestTruncatedDownloadRejected(unittest.TestCase):
215
215
 
216
216
  def test_truncated_upstream_raises_and_leaves_no_blob(self):
217
217
  url = f"http://127.0.0.1:{self.port}/truncated.bin"
218
+ # _TruncatingOrigin truncates EVERY response (including
219
+ # ranged retries) so capping max_resume_attempts at 1 keeps
220
+ # the test fast: the single attempt cuts at 500 bytes,
221
+ # exhausts the budget, and the TruncatedDownload fires.
218
222
  with self.assertRaises(server.TruncatedDownload) as cm:
219
- self.store.store_from_origin(url)
223
+ self.store.store_from_origin(url, max_resume_attempts=1)
220
224
  # the message must name both totals so the operator can see
221
225
  # how short the upstream came
222
226
  msg = str(cm.exception)
@@ -231,13 +235,139 @@ class TestTruncatedDownloadRejected(unittest.TestCase):
231
235
  def test_repeat_request_after_truncation_can_retry_cleanly(self):
232
236
  url = f"http://127.0.0.1:{self.port}/truncated.bin"
233
237
  with self.assertRaises(server.TruncatedDownload):
234
- self.store.store_from_origin(url)
238
+ self.store.store_from_origin(url, max_resume_attempts=1)
235
239
  # second attempt against the same URL would have hit the
236
240
  # poisoned cache before the fix; now it must repeat the
237
241
  # failure mode (no sticky blob blocking the retry) so a
238
242
  # later origin recovery can re-fill the entry cleanly.
239
243
  with self.assertRaises(server.TruncatedDownload):
240
- self.store.store_from_origin(url)
244
+ self.store.store_from_origin(url, max_resume_attempts=1)
245
+
246
+
247
+ # --------------------------------------------------------------------------
248
+ # Range-resume: a flaky upstream that cuts mid-stream MUST be retried with
249
+ # ``Range: bytes=<got>-`` so the partial is filled rather than discarded.
250
+ # This is the lab-spotted ghcr.io failure mode where Azure Blob Storage
251
+ # SAS URLs expire mid-download for any blob that takes longer to transfer
252
+ # than the SAS validity window: a single attempt always loses, but a retried Range request
253
+ # starts a fresh SAS window and the second leg finishes the blob.
254
+ # --------------------------------------------------------------------------
255
class _ResumableTruncatingOrigin(http.server.BaseHTTPRequestHandler):
    """Fake origin whose first response is cut short but which supports resume.

    Behaviour per GET:

    * A request carrying ``Range: bytes=<n>-`` with ``n > 0`` is answered
      with ``206`` and the payload from offset ``n`` to the end.
    * Any other request is answered with ``200`` and a ``Content-Length``
      equal to the full payload. If it is the first GET counted since
      :meth:`reset`, only the first half of the body is written and the
      socket is shut down; otherwise the whole payload is written.

    The request counter and its lock are class attributes because the
    threaded server creates one handler instance per request; all of them
    must agree on which request is "the first".
    """

    PAYLOAD = b"abcdefghij" * 100  # 1000 bytes
    _lock = threading.Lock()
    _calls = 0

    @classmethod
    def reset(cls) -> None:
        """Zero the shared request counter so the next GET counts as call 1."""
        cls._lock.acquire()
        try:
            cls._calls = 0
        finally:
            cls._lock.release()

    def _resume_offset(self) -> int:
        """Return the start offset from a ``bytes=<n>-`` Range header, else 0."""
        spec = self.headers.get("Range") or ""
        if not spec.startswith("bytes="):
            return 0
        first, _, _ = spec[len("bytes="):].partition("-")
        try:
            return int(first)
        except ValueError:
            # e.g. a suffix range ("bytes=-500"): treat as an un-ranged request
            return 0

    def _send_head(self, status, length, extra=()):
        """Emit the status line plus Content-Type/Content-Length and extras."""
        self.send_response(status)
        self.send_header("Content-Type", "application/octet-stream")
        self.send_header("Content-Length", str(length))
        for field, value in extra:
            self.send_header(field, value)
        self.end_headers()

    def do_GET(self):
        """Serve the payload: 206 for a ranged resume, 200 (maybe cut) otherwise."""
        handler = self.__class__
        with self._lock:
            handler._calls += 1
            seq = handler._calls

        offset = self._resume_offset()
        payload = self.PAYLOAD
        whole = len(payload)

        if offset > 0:
            # ranged retry: hand over everything from the offset onward
            remainder = payload[offset:]
            self._send_head(
                206,
                len(remainder),
                [("Content-Range", f"bytes {offset}-{whole - 1}/{whole}")],
            )
            self.wfile.write(remainder)
            return

        # un-ranged request: always advertise the full length
        self._send_head(200, whole)
        if seq != 1:
            # not the first GET: deliver the complete body (this is what a
            # client sees when it restarts from byte 0)
            self.wfile.write(payload)
            return

        # first GET since reset(): write half the body, then drop the socket
        self.wfile.write(payload[: whole // 2])
        self.wfile.flush()
        try:
            self.connection.shutdown(socket.SHUT_RDWR)
        except OSError:
            pass

    def log_message(self, format, *args):
        """Silence the default per-request logging."""
        return None
321
+
322
+
323
class TestRangeResumeOnTruncation(unittest.TestCase):
    """End-to-end checks that a truncated origin stream is resumed via Range.

    Each test runs ``Store.store_from_origin`` against a local
    ``_ResumableTruncatingOrigin`` server whose first response is cut
    at the half-way mark.
    """

    def setUp(self):
        # Back the store with a self-cleaning temp dir; a bare mkdtemp()
        # would leave one directory behind for every test run.
        self._tmp = tempfile.TemporaryDirectory(ignore_cleanup_errors=True)
        self.addCleanup(self._tmp.cleanup)
        _ResumableTruncatingOrigin.reset()
        self.httpd = socketserver.ThreadingTCPServer(("127.0.0.1", 0), _ResumableTruncatingOrigin)
        self.port = self.httpd.server_address[1]
        self.t = threading.Thread(target=self.httpd.serve_forever, daemon=True)
        self.t.start()
        self.store = server.Store(self._tmp.name, keep_query=False)

    def tearDown(self):
        self.httpd.shutdown()
        self.httpd.server_close()
        # serve_forever() returns once shutdown() completes; reap the thread
        # so it does not linger into the next test.
        self.t.join(timeout=5)

    def test_truncated_stream_resumes_via_range(self):
        """First GET cuts at byte 500; second GET (with
        ``Range: bytes=500-``) returns 206 and the remaining 500.
        Result: a complete 1000-byte blob in the cache, sha256 matches
        the upstream's full payload, no TruncatedDownload raised."""
        import hashlib

        payload = _ResumableTruncatingOrigin.PAYLOAD
        url = f"http://127.0.0.1:{self.port}/resumable.bin"
        row = self.store.store_from_origin(url)
        self.assertEqual(row["size"], len(payload))
        self.assertEqual(row["sha256"], hashlib.sha256(payload).hexdigest())
        with open(self.store.blob_path(row["key"]), "rb") as f:
            self.assertEqual(f.read(), payload)

    def test_progress_callback_reports_continuing_offset_on_resume(self):
        """Progress reports must be monotonic across the resume: the
        second leg's reads start at 500 (the partial-so-far) and walk
        up to 1000, NOT restart at 0. An operator dashboard watching
        ``progress`` for a stuck job needs to see the bytes climb."""
        observed: list[tuple[int, int | None]] = []
        payload_len = len(_ResumableTruncatingOrigin.PAYLOAD)
        url = f"http://127.0.0.1:{self.port}/resumable.bin"
        self.store.store_from_origin(url, progress=lambda d, t: observed.append((d, t)))
        # final report should be the full payload
        self.assertEqual(observed[-1][0], payload_len)
        # at no point did the byte counter regress
        for prev, curr in zip(observed, observed[1:], strict=False):
            self.assertGreaterEqual(curr[0], prev[0])
        # the resume actually crossed the cut point: at least one
        # progress call lands above the half-mark (otherwise we
        # would have stalled at 500)
        half = payload_len // 2
        self.assertTrue(any(d > half for d, _ in observed))
241
371
 
242
372
 
243
373
  # --------------------------------------------------------------------------
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes