withcache 0.4.0__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: withcache
3
- Version: 0.4.0
3
+ Version: 0.4.2
4
4
  Summary: Operator-curated, URL-keyed artifact cache for a small lab (CUDA/ROCm/DOCA/firmware)
5
5
  Project-URL: Homepage, https://github.com/safl/withcache
6
6
  Author-email: "Simon A. F. Lund" <safl@safl.dk>
@@ -2,7 +2,7 @@
2
2
  .name = .withcache_shim,
3
3
  // Zig requires a literal here; keep it in lockstep with the project's
4
4
  // single source (src/withcache/__init__.py) via `make bump` / `make version-check`.
5
- .version = "0.4.0",
5
+ .version = "0.4.2",
6
6
  .fingerprint = 0xd7d96c5ed212ccaa,
7
7
  .minimum_zig_version = "0.16.0",
8
8
  .paths = .{
@@ -12,6 +12,6 @@ All modules are stdlib-only and self-contained.
12
12
 
13
13
  from .client import blob_url, cache_base, is_cached, serve_url
14
14
 
15
- __version__ = "0.4.0"
15
+ __version__ = "0.4.2"
16
16
 
17
17
  __all__ = ["__version__", "blob_url", "cache_base", "is_cached", "serve_url"]
@@ -43,6 +43,16 @@ from datetime import datetime, timezone
43
43
 
44
44
  CHUNK = 64 * 1024
45
45
  USER_AGENT = "withcache-cache/0.1"
46
+ # Resume budget for a single store_from_origin call. A truncated
47
+ # upstream stream re-fetches with ``Range: bytes=<got>-`` so the
48
+ # next attempt picks up where the cut happened. Five tries cover
49
+ # the realistic failure mode (e.g. ghcr.io serves blobs via Azure
50
+ # Blob Storage SAS URLs with a ~10 minute expiry; a >2 GiB image
51
+ # at modest bandwidth blows past one window and the connection is
52
+ # cut server-side, but a fresh redirect through ghcr yields a new
53
+ # SAS URL each retry). The cap is the give-up gate, not a normal
54
+ # operating depth.
55
+ RESUME_MAX_ATTEMPTS = 5
46
56
  STATIC_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "static")
47
57
  MIME_TYPES = {".css": "text/css; charset=utf-8", ".js": "application/javascript; charset=utf-8"}
48
58
  _DB_WRITE_LOCK = threading.Lock()
@@ -285,7 +295,14 @@ class Store:
285
295
  except FileNotFoundError:
286
296
  pass
287
297
 
288
- def store_from_origin(self, url: str, progress=None, cancel=None, headers=None) -> sqlite3.Row:
298
+ def store_from_origin(
299
+ self,
300
+ url: str,
301
+ progress=None,
302
+ cancel=None,
303
+ headers=None,
304
+ max_resume_attempts: int = RESUME_MAX_ATTEMPTS,
305
+ ) -> sqlite3.Row:
289
306
  """Operator-triggered: pull the artifact from origin and store it.
290
307
 
291
308
  ``progress(done, total)`` is called as bytes arrive (total may be None);
@@ -294,41 +311,105 @@ class Store:
294
311
  ``headers`` adds request headers to the origin fetch (e.g. a registry
295
312
  bearer token bty pre-resolved for an oras blob). Raises :class:`CacheFull`
296
313
  if the cache is already at --max-bytes.
314
+
315
+ Resume-on-truncation: if the upstream stream ends before its
316
+ declared Content-Length, the partial bytes are kept and the
317
+ next attempt requests ``Range: bytes=<got>-`` so the fetch
318
+ picks up where the connection died. Up to
319
+ ``max_resume_attempts`` attempts are made before
320
+ :class:`TruncatedDownload` is raised; on giving up the
321
+ partial file is removed. A 200 response to a Range request
322
+ (the origin chose to ignore the header, common on naive
323
+ upstreams) is handled by restarting from byte 0 and counts
324
+ against the same attempt budget. Re-issuing the request also
325
+ re-resolves any 30x redirect chain, which matters for
326
+ ghcr.io: each ghcr request hands back a fresh Azure Blob
327
+ Storage SAS URL valid only for a short window, and the
328
+ prior cut almost certainly was that SAS expiring mid-stream.
297
329
  """
298
330
  if not self.has_capacity():
299
331
  raise CacheFull(f"cache full (>= {self.max_bytes} bytes); refusing to fetch {url}")
300
332
  normalized = self.normalize(url)
301
333
  key = self.key_of(normalized)
302
334
  tmp = os.path.join(self.tmp_dir, key + ".part")
303
- req_headers = {"User-Agent": USER_AGENT}
335
+ base_headers = {"User-Agent": USER_AGENT}
304
336
  if headers:
305
- req_headers.update(headers)
306
- req = urllib.request.Request(url, headers=req_headers)
337
+ base_headers.update(headers)
307
338
  sha = hashlib.sha256()
308
339
  size = 0
340
+ total: int | None = None
341
+ content_type: str | None = None
309
342
  try:
310
- with urllib.request.urlopen(req, timeout=120) as resp:
311
- content_type = resp.headers.get_content_type()
312
- cl = resp.headers.get("Content-Length")
313
- total = int(cl) if cl and cl.isdigit() else None
314
- if progress:
315
- progress(0, total)
316
- with open(tmp, "wb") as f:
317
- while True:
318
- if cancel and cancel():
319
- raise DownloadCancelled()
320
- chunk = resp.read(CHUNK)
321
- if not chunk:
322
- break
323
- f.write(chunk)
324
- sha.update(chunk)
325
- size += len(chunk)
326
- if progress:
327
- progress(size, total)
343
+ for _ in range(max_resume_attempts):
344
+ req_headers = dict(base_headers)
345
+ if size > 0:
346
+ # Resume from where the previous attempt cut.
347
+ # A 206 response continues the stream; a 200
348
+ # means the origin ignored Range (e.g. a dumb
349
+ # static server) and we restart from 0.
350
+ req_headers["Range"] = f"bytes={size}-"
351
+ req = urllib.request.Request(url, headers=req_headers)
352
+ with urllib.request.urlopen(req, timeout=120) as resp:
353
+ status = getattr(resp, "status", None) or resp.getcode()
354
+ if content_type is None:
355
+ content_type = resp.headers.get_content_type()
356
+ if size > 0 and status == 200:
357
+ # Range ignored by origin: discard the partial
358
+ # and start a fresh full-stream attempt.
359
+ size = 0
360
+ sha = hashlib.sha256()
361
+ if os.path.exists(tmp):
362
+ os.remove(tmp)
363
+ if size > 0 and status == 206:
364
+ # ``Content-Range: bytes <start>-<end>/<total>``;
365
+ # use the total declared there as the contract,
366
+ # not Content-Length (which on 206 is the size
367
+ # of the partial response, not the whole blob).
368
+ cr = resp.headers.get("Content-Range") or ""
369
+ if "/" in cr:
370
+ tail = cr.rsplit("/", 1)[1].strip()
371
+ if tail.isdigit():
372
+ total = int(tail)
373
+ else:
374
+ cl = resp.headers.get("Content-Length")
375
+ if cl and cl.isdigit():
376
+ total = int(cl)
377
+ if progress:
378
+ progress(size, total)
379
+ mode = "ab" if size > 0 else "wb"
380
+ with open(tmp, mode) as f:
381
+ while True:
382
+ if cancel and cancel():
383
+ raise DownloadCancelled()
384
+ chunk = resp.read(CHUNK)
385
+ if not chunk:
386
+ break
387
+ f.write(chunk)
388
+ sha.update(chunk)
389
+ size += len(chunk)
390
+ if progress:
391
+ progress(size, total)
392
+ # urllib's read loop exits on clean EOF AND on transport-
393
+ # aborted close; HTTPResponse only raises IncompleteRead
394
+ # in some configurations. When the origin declared a
395
+ # total (either via Content-Length on a 200 or via
396
+ # Content-Range on a 206), treat that as the contract:
397
+ # try to resume from the cut, give up after the budget
398
+ # is exhausted. Without a declared total there is no
399
+ # truncation signal, so a single attempt is the whole
400
+ # story.
401
+ if total is None or size >= total:
402
+ break
403
+ else:
404
+ # for/else: ran out of attempts before reaching total
405
+ raise TruncatedDownload(
406
+ f"upstream truncated for {url}: declared {total} bytes, got {size}"
407
+ f" after {max_resume_attempts} attempts"
408
+ )
328
409
  os.replace(tmp, self.blob_path(key))
329
410
  except BaseException:
330
411
  if os.path.exists(tmp):
331
- os.remove(tmp) # no half-written blob on cancel/error
412
+ os.remove(tmp) # no half-written blob on cancel/error/give-up
332
413
  raise
333
414
  ts = now_iso()
334
415
  with _DB_WRITE_LOCK, self.conn() as c:
@@ -369,6 +450,14 @@ class CacheFull(Exception):
369
450
  """Raised when --max-bytes is reached; the fill is refused, not evicted."""
370
451
 
371
452
 
453
+ class TruncatedDownload(Exception):
454
+ """Raised when the upstream stream ended before the declared
455
+ Content-Length. The temp file is removed and no blob row is
456
+ written, so the same URL re-enqueues cleanly on the next request
457
+ instead of permanently serving a malformed file.
458
+ """
459
+
460
+
372
461
  @dataclass
373
462
  class Job:
374
463
  id: int
@@ -7,6 +7,7 @@ without an install.
7
7
  import http.server
8
8
  import os
9
9
  import shutil
10
+ import socket
10
11
  import socketserver
11
12
  import sys
12
13
  import tempfile
@@ -167,6 +168,208 @@ class TestStoreFromOrigin(unittest.TestCase):
167
168
  store.store_from_origin(f"http://127.0.0.1:{self.port}/b.bin")
168
169
 
169
170
 
171
+ class _TruncatingOrigin(http.server.BaseHTTPRequestHandler):
172
+ """Declare a full Content-Length, then send half the payload and
173
+ close the socket. Mirrors the real-world failure mode where the
174
+ upstream drops the connection mid-stream (lab-box fedora-44-desktop
175
+ flash that surfaced this bug)."""
176
+
177
+ PAYLOAD = b"abcdefghij" * 100 # 1000 bytes; will write half then close
178
+
179
+ def do_GET(self):
180
+ self.send_response(200)
181
+ self.send_header("Content-Type", "application/octet-stream")
182
+ self.send_header("Content-Length", str(len(self.PAYLOAD)))
183
+ self.end_headers()
184
+ half = len(self.PAYLOAD) // 2
185
+ self.wfile.write(self.PAYLOAD[:half])
186
+ # close the underlying socket so urllib observes EOF before
187
+ # Content-Length bytes arrive
188
+ self.wfile.flush()
189
+ try:
190
+ self.connection.shutdown(socket.SHUT_RDWR)
191
+ except OSError:
192
+ pass
193
+
194
+ def log_message(self, format, *args):
195
+ pass
196
+
197
+
198
+ class TestTruncatedDownloadRejected(unittest.TestCase):
199
+ """Regression for the lab-spotted bug where a transport-aborted
200
+ upstream stream silently became a permanent cached blob: future
201
+ HEADs returned 200 with the partial bytes, every consumer got a
202
+ malformed file, and the only escape was hand-deleting the blob.
203
+ Content-Length mismatches now fail loudly and leave no entry."""
204
+
205
+ def setUp(self):
206
+ self.httpd = socketserver.TCPServer(("127.0.0.1", 0), _TruncatingOrigin)
207
+ self.port = self.httpd.server_address[1]
208
+ self.t = threading.Thread(target=self.httpd.serve_forever, daemon=True)
209
+ self.t.start()
210
+ self.store = server.Store(tempfile.mkdtemp(), keep_query=False)
211
+
212
+ def tearDown(self):
213
+ self.httpd.shutdown()
214
+ self.httpd.server_close()
215
+
216
+ def test_truncated_upstream_raises_and_leaves_no_blob(self):
217
+ url = f"http://127.0.0.1:{self.port}/truncated.bin"
218
+ # _TruncatingOrigin truncates EVERY response (including
219
+ # ranged retries) so capping max_resume_attempts at 1 keeps
220
+ # the test fast: the single attempt cuts at 500 bytes,
221
+ # exhausts the budget, and the TruncatedDownload fires.
222
+ with self.assertRaises(server.TruncatedDownload) as cm:
223
+ self.store.store_from_origin(url, max_resume_attempts=1)
224
+ # the message must name both totals so the operator can see
225
+ # how short the upstream came
226
+ msg = str(cm.exception)
227
+ self.assertIn("1000", msg) # declared
228
+ self.assertIn("500", msg) # got
229
+ # no row was written; no blob file lingers on disk
230
+ self.assertIsNone(self.store.get_blob(url))
231
+ blobs = list(self.store.blob_path("").rsplit("/", 1)[0:1])
232
+ if os.path.isdir(blobs[0]):
233
+ self.assertEqual(os.listdir(blobs[0]), [])
234
+
235
+ def test_repeat_request_after_truncation_can_retry_cleanly(self):
236
+ url = f"http://127.0.0.1:{self.port}/truncated.bin"
237
+ with self.assertRaises(server.TruncatedDownload):
238
+ self.store.store_from_origin(url, max_resume_attempts=1)
239
+ # second attempt against the same URL would have hit the
240
+ # poisoned cache before the fix; now it must repeat the
241
+ # failure mode (no sticky blob blocking the retry) so a
242
+ # later origin recovery can re-fill the entry cleanly.
243
+ with self.assertRaises(server.TruncatedDownload):
244
+ self.store.store_from_origin(url, max_resume_attempts=1)
245
+
246
+
247
+ # --------------------------------------------------------------------------
248
+ # Range-resume: a flaky upstream that cuts mid-stream MUST be retried with
249
+ # ``Range: bytes=<got>-`` so the partial is filled rather than discarded.
250
+ # This is the lab-spotted ghcr.io failure mode where Azure Blob Storage
251
+ # SAS URLs expire mid-download for any blob bigger than a few minutes of
252
+ # bandwidth: a single attempt always loses, but a retried Range request
253
+ # starts a fresh SAS window and the second leg finishes the blob.
254
+ # --------------------------------------------------------------------------
255
+ class _ResumableTruncatingOrigin(http.server.BaseHTTPRequestHandler):
256
+ """Cut the FIRST GET in half; honor ``Range: bytes=<n>-`` on retries
257
+ by serving from offset n to end. Mirrors the ghcr -> Azure Blob
258
+ pattern: each connection has a hard wall-clock limit but the bytes
259
+ themselves are available on re-fetch.
260
+
261
+ Shared class-level counter so multiple instances (the threaded server
262
+ spawns one handler per request) all see the same call count and the
263
+ first GET truncates regardless of which thread services it.
264
+ """
265
+
266
+ PAYLOAD = b"abcdefghij" * 100 # 1000 bytes
267
+ _lock = threading.Lock()
268
+ _calls = 0
269
+
270
+ @classmethod
271
+ def reset(cls) -> None:
272
+ with cls._lock:
273
+ cls._calls = 0
274
+
275
+ def do_GET(self):
276
+ with self._lock:
277
+ self.__class__._calls += 1
278
+ call = self._calls
279
+ rng = self.headers.get("Range") or ""
280
+ start = 0
281
+ if rng.startswith("bytes="):
282
+ try:
283
+ start = int(rng[len("bytes=") :].split("-", 1)[0])
284
+ except ValueError:
285
+ start = 0
286
+ full = len(self.PAYLOAD)
287
+ if start > 0:
288
+ # ranged retry: serve the rest cleanly
289
+ body = self.PAYLOAD[start:]
290
+ self.send_response(206)
291
+ self.send_header("Content-Type", "application/octet-stream")
292
+ self.send_header("Content-Length", str(len(body)))
293
+ self.send_header(
294
+ "Content-Range",
295
+ f"bytes {start}-{full - 1}/{full}",
296
+ )
297
+ self.end_headers()
298
+ self.wfile.write(body)
299
+ return
300
+ # first attempt: declare full length but cut at half
301
+ self.send_response(200)
302
+ self.send_header("Content-Type", "application/octet-stream")
303
+ self.send_header("Content-Length", str(full))
304
+ self.end_headers()
305
+ if call == 1:
306
+ half = full // 2
307
+ self.wfile.write(self.PAYLOAD[:half])
308
+ self.wfile.flush()
309
+ try:
310
+ self.connection.shutdown(socket.SHUT_RDWR)
311
+ except OSError:
312
+ pass
313
+ else:
314
+ # any non-ranged retry serves the whole thing (covers the
315
+ # 200-on-Range fallback path: origin ignored Range, we
316
+ # restart from 0)
317
+ self.wfile.write(self.PAYLOAD)
318
+
319
+ def log_message(self, format, *args):
320
+ pass
321
+
322
+
323
+ class TestRangeResumeOnTruncation(unittest.TestCase):
324
+ def setUp(self):
325
+ _ResumableTruncatingOrigin.reset()
326
+ self.httpd = socketserver.ThreadingTCPServer(("127.0.0.1", 0), _ResumableTruncatingOrigin)
327
+ self.port = self.httpd.server_address[1]
328
+ self.t = threading.Thread(target=self.httpd.serve_forever, daemon=True)
329
+ self.t.start()
330
+ self.store = server.Store(tempfile.mkdtemp(), keep_query=False)
331
+
332
+ def tearDown(self):
333
+ self.httpd.shutdown()
334
+ self.httpd.server_close()
335
+
336
+ def test_truncated_stream_resumes_via_range(self):
337
+ """First GET cuts at byte 500; second GET (with
338
+ ``Range: bytes=500-``) returns 206 and the remaining 500.
339
+ Result: a complete 1000-byte blob in the cache, sha256 matches
340
+ the upstream's full payload, no TruncatedDownload raised."""
341
+ import hashlib
342
+
343
+ url = f"http://127.0.0.1:{self.port}/resumable.bin"
344
+ row = self.store.store_from_origin(url)
345
+ self.assertEqual(row["size"], len(_ResumableTruncatingOrigin.PAYLOAD))
346
+ self.assertEqual(
347
+ row["sha256"],
348
+ hashlib.sha256(_ResumableTruncatingOrigin.PAYLOAD).hexdigest(),
349
+ )
350
+ with open(self.store.blob_path(row["key"]), "rb") as f:
351
+ self.assertEqual(f.read(), _ResumableTruncatingOrigin.PAYLOAD)
352
+
353
+ def test_progress_callback_reports_continuing_offset_on_resume(self):
354
+ """Progress reports must be monotonic across the resume: the
355
+ second leg's reads start at 500 (the partial-so-far) and walk
356
+ up to 1000, NOT restart at 0. An operator dashboard watching
357
+ ``progress`` for a stuck job needs to see the bytes climb."""
358
+ observed: list[tuple[int, int | None]] = []
359
+ url = f"http://127.0.0.1:{self.port}/resumable.bin"
360
+ self.store.store_from_origin(url, progress=lambda d, t: observed.append((d, t)))
361
+ # final report should be the full payload
362
+ self.assertEqual(observed[-1][0], len(_ResumableTruncatingOrigin.PAYLOAD))
363
+ # at no point did the byte counter regress
364
+ for prev, curr in zip(observed, observed[1:], strict=False):
365
+ self.assertGreaterEqual(curr[0], prev[0])
366
+ # the resume actually crossed the cut point: at least one
367
+ # progress call lands above the half-mark (otherwise we
368
+ # would have stalled at 500)
369
+ half = len(_ResumableTruncatingOrigin.PAYLOAD) // 2
370
+ self.assertTrue(any(d > half for d, _ in observed))
371
+
372
+
170
373
  # --------------------------------------------------------------------------
171
374
  # _shim: URL detection, rewrite, real-tool resolution, env, path-encoding
172
375
  # --------------------------------------------------------------------------
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes