withcache 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- withcache/__init__.py +11 -0
- withcache/_shim.py +134 -0
- withcache/curlwithcache.py +51 -0
- withcache/server.py +901 -0
- withcache/static/htmx.min.js +1 -0
- withcache/static/pico.min.css +4 -0
- withcache/wgetwithcache.py +51 -0
- withcache-0.2.0.data/scripts/curlwithcache +4 -0
- withcache-0.2.0.data/scripts/wgetwithcache +4 -0
- withcache-0.2.0.dist-info/METADATA +271 -0
- withcache-0.2.0.dist-info/RECORD +14 -0
- withcache-0.2.0.dist-info/WHEEL +4 -0
- withcache-0.2.0.dist-info/entry_points.txt +2 -0
- withcache-0.2.0.dist-info/licenses/LICENSE +28 -0
withcache/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""withcache — operator-curated, URL-keyed artifact cache for a small lab.
|
|
2
|
+
|
|
3
|
+
Two console entry points (see pyproject.toml):
|
|
4
|
+
withcache -> withcache.client:main (the cache-aware downloader)
|
|
5
|
+
withcache-server -> withcache.server:main (the cache-host)
|
|
6
|
+
|
|
7
|
+
Both modules are stdlib-only and self-contained, so either file can also be
|
|
8
|
+
copied and run on its own with a plain ``python3``.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
__version__ = "0.2.0"
|
withcache/_shim.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Shared core for the withcache download-tool shims (curlwithcache, wgetwithcache).
|
|
2
|
+
|
|
3
|
+
Every shim does the same three things — find the URL in the wrapped tool's
|
|
4
|
+
arguments, ask the cache-host whether it has that artifact, and on a hit
|
|
5
|
+
re-point just the URL at the cache before exec'ing the real tool — so that
|
|
6
|
+
logic lives here. A shim only supplies (a) the tool's name and (b) how to
|
|
7
|
+
probe the cache with that tool.
|
|
8
|
+
|
|
9
|
+
The cache fetch URL is path-encoded as ``<server>/b/<base64(origin)>/<basename>``
|
|
10
|
+
so that ANY downloader names the saved file after the real artifact (``-O`` /
|
|
11
|
+
bare ``wget`` derive the name from the URL's last path segment), with no query
|
|
12
|
+
string to pollute the name and no per-tool output-flag parsing.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import base64
|
|
16
|
+
import os
|
|
17
|
+
import re
|
|
18
|
+
import sys
|
|
19
|
+
import urllib.parse
|
|
20
|
+
|
|
21
|
+
# Upper bound, in seconds, for a cache probe: a slow or unreachable cache must
# never block the user.
PROBE_TIMEOUT = 5

# Only a token that *begins* with a scheme is treated as a URL argument. Header
# or data values such as "Referer: https://…" or "u=https://…" merely contain
# "://" and therefore do not match.
_SCHEME = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.\-]*://")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def cache_base(server: str) -> str:
    """Turn a cache-host spec into a base URL without a trailing slash.

    Accepts 'host', 'host:3000', or a full URL such as
    'http://withcache-server:3000'. Surrounding whitespace and trailing
    slashes are dropped; a spec with no '://' gets 'http://' prepended.
    """
    trimmed = server.strip().rstrip("/")
    return trimmed if "://" in trimmed else "http://" + trimmed
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def env_server(tool: str) -> str | None:
|
|
37
|
+
"""Per-tool override (e.g. CURLWITHCACHE_SERVER) wins, else WITHCACHE_SERVER."""
|
|
38
|
+
return os.environ.get(tool.upper() + "WITHCACHE_SERVER") or os.environ.get("WITHCACHE_SERVER")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def find_real(name: str) -> str | None:
|
|
42
|
+
"""The next executable ``name`` on PATH that isn't this shim. $REAL_<NAME>
|
|
43
|
+
(e.g. $REAL_CURL) overrides."""
|
|
44
|
+
override = os.environ.get("REAL_" + name.upper())
|
|
45
|
+
if override and os.path.isfile(override) and os.access(override, os.X_OK):
|
|
46
|
+
return override
|
|
47
|
+
me = os.path.realpath(sys.argv[0]) if sys.argv and sys.argv[0] else None
|
|
48
|
+
for d in os.environ.get("PATH", "").split(os.pathsep):
|
|
49
|
+
if not d:
|
|
50
|
+
continue
|
|
51
|
+
cand = os.path.join(d, name)
|
|
52
|
+
if os.path.isfile(cand) and os.access(cand, os.X_OK):
|
|
53
|
+
if me and os.path.realpath(cand) == me:
|
|
54
|
+
continue # that's us — keep looking for the real one
|
|
55
|
+
return cand
|
|
56
|
+
return None
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def find_url(argv: list):
    """Find the URL argument in the wrapped tool's argument list.

    Returns ``(index, origin_url, kind)`` for the first match, or None:

      'bare'  -> argv[index] is the URL itself (replace the whole token).
      'urleq' -> argv[index] is '--url=URL' (replace, keeping the prefix).

    No per-tool option parsing is done: apart from ``--url`` / ``--url=``, the
    first token that starts with a scheme is taken as the URL, even if it is
    really the value of some other option. After a literal ``--`` only the
    remaining tokens are considered.
    """
    total = len(argv)
    for pos, token in enumerate(argv):
        if token == "--":
            # Everything after "--" is an operand; take the first URL, if any.
            return next(
                ((k, argv[k], "bare") for k in range(pos + 1, total) if _SCHEME.match(argv[k])),
                None,
            )
        if token == "--url" and pos + 1 < total:
            return (pos + 1, argv[pos + 1], "bare")
        flag, eq, value = token.partition("=")
        if eq and flag == "--url":
            return (pos, value, "urleq")
        if _SCHEME.match(token):
            return (pos, token, "bare")
    return None
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _basename(origin: str) -> str:
    """Return the last path segment of ``origin``, or "download" if it is empty.

    Only the URL's path component is considered, so a query string or fragment
    never leaks into the name.
    """
    url_path = urllib.parse.urlsplit(origin).path
    leaf = os.path.basename(url_path)
    if not leaf:
        return "download"
    return leaf
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def blob_url(base: str, origin: str) -> str:
    """Build the cache fetch URL for ``origin``.

    The result is ``<base>/b/<token>/<basename>`` where ``token`` is the
    URL-safe base64 of the UTF-8 origin URL with '=' padding removed, and
    ``basename`` is the percent-quoted last path segment of the origin. Ending
    the path with the artifact's own name lets a downloader derive its output
    filename from the URL.
    """
    encoded = base64.urlsafe_b64encode(origin.encode("utf-8"))
    token = encoded.decode("ascii").rstrip("=")
    leaf = urllib.parse.quote(_basename(origin))
    return f"{base}/b/{token}/{leaf}"
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def rewrite(argv: list, idx: int, kind: str, new_url: str) -> list:
    """Return a copy of ``argv`` with the URL token at ``idx`` replaced.

    For kind 'urleq' the token becomes ``--url=<new_url>``; for any other kind
    the token is replaced by ``new_url`` alone. The input list is not modified.
    """
    patched = list(argv)
    if kind == "urleq":
        patched[idx] = "--url=" + new_url
    else:
        patched[idx] = new_url
    return patched
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def plan(tool: str, probe, argv: list):
    """Work out ``(real_tool_path, final_argv)`` without exec'ing anything.

    This is the testable core of run(). ``probe(real_tool, url)`` must return
    True (hit), False (miss) or None (unreachable). Only when a server is
    configured, a URL is found in ``argv``, and the probe reports exactly True
    is the URL token re-pointed at the cache; in every other case ``argv`` is
    returned as given. ``real_tool_path`` is None if no real tool is found.
    """
    real = find_real(tool)
    if real is None:
        return None, argv

    server = env_server(tool)
    if not server:
        return real, argv  # caching not configured

    found = find_url(argv)
    if found is None:
        return real, argv  # nothing to re-point

    idx, origin, kind = found
    url = blob_url(cache_base(server), origin)
    if probe(real, url) is not True:
        return real, argv  # miss or unreachable: run as written
    return real, rewrite(argv, idx, kind, url)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def run(tool: str, probe, argv=None):
    """Shim entry point: plan the invocation, then become the real tool.

    ``argv`` defaults to ``sys.argv[1:]``. ``probe(real_tool, url)`` returns
    True (hit), False (miss), or None (cache unreachable). On success this
    never returns, because the process is replaced via ``os.execv``. If no
    real tool is found, or the exec fails, a message is written to stderr and
    the process exits with status 127.
    """
    args = list(argv) if argv is not None else list(sys.argv[1:])
    real, final = plan(tool, probe, args)
    if real is None:
        message = f"{tool}withcache: no real {tool} found on PATH (set $REAL_{tool.upper()})\n"
        sys.stderr.write(message)
        sys.exit(127)
    try:
        # Become the tool so its exit code, signals and I/O pass straight through.
        os.execv(real, [real] + list(final))
    except OSError as e:
        sys.stderr.write(f"{tool}withcache: cannot exec {real}: {e}\n")
        sys.exit(127)
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""curlwithcache — a transparent caching shim for ``curl`` (part of withcache).
|
|
3
|
+
|
|
4
|
+
Think "ccache for HTTP artifacts, without a proxy". Drop it on $PATH ahead of
|
|
5
|
+
the real curl (typically as a ``curl`` symlink). If WITHCACHE_SERVER points at a
|
|
6
|
+
withcache cache-host and the artifact is cached there, the download is served
|
|
7
|
+
from the cache; otherwise — server unset, not cached, or unreachable — your curl
|
|
8
|
+
runs exactly as written. Existing scripts need no changes.
|
|
9
|
+
|
|
10
|
+
export WITHCACHE_SERVER=http://withcache-server:3000
|
|
11
|
+
curl -fsSL https://the/origin/cuda.tar.gz -o cuda.tar.gz # cache hit -> local
|
|
12
|
+
|
|
13
|
+
It wraps the system curl, so all curl flags keep working; on a miss it hands
|
|
14
|
+
your original arguments straight to the real curl. Set $REAL_CURL to pin the
|
|
15
|
+
wrapped binary; CURLWITHCACHE_SERVER overrides WITHCACHE_SERVER for curl only.
|
|
16
|
+
|
|
17
|
+
Stdlib only.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import os
|
|
21
|
+
import subprocess
|
|
22
|
+
import sys
|
|
23
|
+
|
|
24
|
+
try:
|
|
25
|
+
from withcache import _shim
|
|
26
|
+
except ImportError: # running the source file directly, uninstalled
|
|
27
|
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
|
|
28
|
+
from withcache import _shim
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def probe(real_curl: str, url: str):
    """Ask the cache about ``url`` using the same curl that will be exec'd.

    Runs a silent HEAD request (``-fsS -I``) bounded by ``_shim.PROBE_TIMEOUT``
    seconds, with all output discarded. Returns True when curl exits 0 (hit),
    False when it exits 22 (curl -f on HTTP >= 400, i.e. miss), and None for
    any other exit status or if curl could not be started (unreachable).
    """
    command = [
        real_curl,
        "-fsS",
        "-I",
        "-m",
        str(_shim.PROBE_TIMEOUT),
        "-o",
        os.devnull,
        url,
    ]
    try:
        finished = subprocess.run(
            command,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        return None
    if finished.returncode == 0:
        return True
    if finished.returncode == 22:
        return False
    return None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def main(argv=None):
    """Entry point for the curl shim.

    Hands the "curl" tool name, this module's ``probe`` and ``argv`` to
    ``_shim.run``; ``argv`` is passed through unchanged, including None.
    """
    _shim.run("curl", probe, argv)


if __name__ == "__main__":
    main()
|