withcache 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
withcache/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ """withcache — operator-curated, URL-keyed artifact cache for a small lab.
2
+
3
+ Two console entry points (see pyproject.toml):
4
+ withcache -> withcache.client:main (the cache-aware downloader)
5
+ withcache-server -> withcache.server:main (the cache-host)
6
+
7
+ Both modules are stdlib-only and self-contained, so either file can also be
8
+ copied and run on its own with a plain ``python3``.
9
+ """
10
+
11
+ __version__ = "0.2.0"
withcache/_shim.py ADDED
@@ -0,0 +1,134 @@
1
+ """Shared core for the withcache download-tool shims (curlwithcache, wgetwithcache).
2
+
3
+ Every shim does the same three things — find the URL in the wrapped tool's
4
+ arguments, ask the cache-host whether it has that artifact, and on a hit
5
+ re-point just the URL at the cache before exec'ing the real tool — so that
6
+ logic lives here. A shim only supplies (a) the tool's name and (b) how to
7
+ probe the cache with that tool.
8
+
9
+ The cache fetch URL is path-encoded as ``<server>/b/<base64(origin)>/<basename>``
10
+ so that ANY downloader names the saved file after the real artifact (``-O`` /
11
+ bare ``wget`` derive the name from the URL's last path segment), with no query
12
+ string to pollute the name and no per-tool output-flag parsing.
13
+ """
14
+
15
+ import base64
16
+ import os
17
+ import re
18
+ import sys
19
+ import urllib.parse
20
+
21
+ PROBE_TIMEOUT = 5 # seconds; a slow/unreachable cache must never block the user
22
+
23
+ # A real URL argument begins with a scheme; this excludes header/data values
24
+ # like "Referer: https://…" or "u=https://…" that merely contain "://".
25
+ _SCHEME = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.\-]*://")
26
+
27
+
28
def cache_base(server: str) -> str:
    """Normalize a cache-host spec into a base URL with no trailing slash.

    Accepts ``host``, ``host:3000`` or a full URL such as
    ``http://withcache-server:3000``. Surrounding whitespace and trailing
    slashes are dropped, and ``http://`` is prepended when the value contains
    no ``://`` separator.
    """
    trimmed = server.strip().rstrip("/")
    return trimmed if "://" in trimmed else "http://" + trimmed
34
+
35
+
36
+ def env_server(tool: str) -> str | None:
37
+ """Per-tool override (e.g. CURLWITHCACHE_SERVER) wins, else WITHCACHE_SERVER."""
38
+ return os.environ.get(tool.upper() + "WITHCACHE_SERVER") or os.environ.get("WITHCACHE_SERVER")
39
+
40
+
41
+ def find_real(name: str) -> str | None:
42
+ """The next executable ``name`` on PATH that isn't this shim. $REAL_<NAME>
43
+ (e.g. $REAL_CURL) overrides."""
44
+ override = os.environ.get("REAL_" + name.upper())
45
+ if override and os.path.isfile(override) and os.access(override, os.X_OK):
46
+ return override
47
+ me = os.path.realpath(sys.argv[0]) if sys.argv and sys.argv[0] else None
48
+ for d in os.environ.get("PATH", "").split(os.pathsep):
49
+ if not d:
50
+ continue
51
+ cand = os.path.join(d, name)
52
+ if os.path.isfile(cand) and os.access(cand, os.X_OK):
53
+ if me and os.path.realpath(cand) == me:
54
+ continue # that's us — keep looking for the real one
55
+ return cand
56
+ return None
57
+
58
+
59
def find_url(argv: list):
    """Find the first URL in the wrapped tool's arguments.

    Returns ``(index, origin_url, kind)`` or ``None`` when no URL is present.

    kind ``'bare'``  -> ``argv[index]`` is the URL itself (replace the token).
    kind ``'urleq'`` -> ``argv[index]`` is ``--url=URL`` (keep the prefix).
    """
    for pos, token in enumerate(argv):
        if token == "--":
            # End of options: only the operands that follow can be the URL.
            operands = enumerate(argv[pos + 1 :], start=pos + 1)
            return next(
                ((k, arg, "bare") for k, arg in operands if _SCHEME.match(arg)),
                None,
            )
        if token == "--url" and pos + 1 < len(argv):
            # The value of a separate ``--url`` is taken as-is, scheme or not.
            return (pos + 1, argv[pos + 1], "bare")
        if token.startswith("--url="):
            return (pos, token.removeprefix("--url="), "urleq")
        if _SCHEME.match(token):
            return (pos, token, "bare")
    return None
81
+
82
+
83
def _basename(origin: str) -> str:
    """Return the last path segment of ``origin`` as written in the URL.

    The query string and fragment are ignored. Falls back to ``"download"``
    when the path has no final segment (e.g. it is empty or ends in ``/``).
    """
    name = os.path.basename(urllib.parse.urlsplit(origin).path)
    return name or "download"


def blob_url(base: str, origin: str) -> str:
    """Build ``<base>/b/<urlsafe-base64(origin), unpadded>/<basename>``.

    The trailing segment exists only so that every downloader derives the
    output filename it would have derived from the origin URL. Tools such as
    ``curl -O`` use that segment verbatim, so it must match the origin's own
    last segment: existing percent-escapes and the characters RFC 3986 allows
    in a path segment (e.g. ``+`` in ``foo_1.0+dfsg.tar.gz``) are kept as they
    are, and only characters that are not valid in a path segment are escaped.
    """
    token = base64.urlsafe_b64encode(origin.encode("utf-8")).decode("ascii").rstrip("=")
    # "%" keeps already-encoded sequences intact (no "%20" -> "%2520"); the
    # rest are RFC 3986 sub-delims plus ":" and "@", all legal in a segment.
    segment = urllib.parse.quote(_basename(origin), safe="%!$&'()*+,;=:@")
    # A stray "%" that does not start a valid escape would make the URL
    # malformed, so encode just those.
    segment = re.sub(r"%(?![0-9A-Fa-f]{2})", "%25", segment)
    return f"{base}/b/{token}/{segment}"
93
+
94
+
95
def rewrite(argv: list, idx: int, kind: str, new_url: str) -> list:
    """Return a copy of ``argv`` with the URL token at ``idx`` re-pointed.

    For kind ``'urleq'`` the token becomes ``--url=<new_url>``; for any other
    kind the whole token is replaced by ``new_url``. The input list is left
    untouched.
    """
    if kind == "urleq":
        replacement = "--url=" + new_url
    else:
        replacement = new_url
    updated = [*argv]
    updated[idx] = replacement
    return updated
99
+
100
+
101
def plan(tool: str, probe, argv: list):
    """Work out ``(real_tool_path, final_argv)`` without exec'ing anything.

    This is the testable core of ``run()``. ``probe(real_tool, url)`` must
    return True (hit), False (miss) or None (unreachable). Only on a hit is
    the URL token re-pointed at the cache; in every other case argv comes back
    exactly as the user wrote it. ``real_tool_path`` is None when no real tool
    can be found.
    """
    real = find_real(tool)
    if real is None:
        return None, argv

    server = env_server(tool)
    if not server:
        return real, argv  # no cache configured: pass through untouched

    located = find_url(argv)
    if located is None:
        return real, argv  # nothing to re-point

    idx, origin, kind = located
    cached = blob_url(cache_base(server), origin)
    if probe(real, cached) is not True:
        return real, argv  # miss or unreachable: keep the original arguments
    return real, rewrite(argv, idx, kind, cached)
118
+
119
+
120
def run(tool: str, probe, argv=None):
    """Shim entry point: plan the invocation, then exec the real tool.

    ``probe(real_tool, url)`` returns True (hit), False (miss), or None (cache
    unreachable). ``argv`` defaults to ``sys.argv[1:]``. Exits with status 127
    when no real tool is found or when the exec itself fails; on success this
    call does not return because the process image is replaced.
    """
    args = list(sys.argv[1:]) if argv is None else list(argv)
    real, final = plan(tool, probe, args)

    if real is None:
        message = f"{tool}withcache: no real {tool} found on PATH (set $REAL_{tool.upper()})\n"
        sys.stderr.write(message)
        sys.exit(127)

    command = [real] + list(final)
    try:
        # Become the tool, so its exit code, signals and I/O are the user's.
        os.execv(real, command)
    except OSError as e:
        sys.stderr.write(f"{tool}withcache: cannot exec {real}: {e}\n")
        sys.exit(127)
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env python3
2
+ """curlwithcache — a transparent caching shim for ``curl`` (part of withcache).
3
+
4
+ Think "ccache for HTTP artifacts, without a proxy". Drop it on $PATH ahead of
5
+ the real curl (typically as a ``curl`` symlink). If WITHCACHE_SERVER points at a
6
+ withcache cache-host and the artifact is cached there, the download is served
7
+ from the cache; otherwise — server unset, not cached, or unreachable — your curl
8
+ runs exactly as written. Existing scripts need no changes.
9
+
10
+ export WITHCACHE_SERVER=http://withcache-server:3000
11
+ curl -fsSL https://the/origin/cuda.tar.gz -o cuda.tar.gz # cache hit -> local
12
+
13
+ It wraps the system curl, so all curl flags keep working; on a miss it hands
14
+ your original arguments straight to the real curl. Set $REAL_CURL to pin the
15
+ wrapped binary; CURLWITHCACHE_SERVER overrides WITHCACHE_SERVER for curl only.
16
+
17
+ Stdlib only.
18
+ """
19
+
20
+ import os
21
+ import subprocess
22
+ import sys
23
+
24
+ try:
25
+ from withcache import _shim
26
+ except ImportError: # running the source file directly, uninstalled
27
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
28
+ from withcache import _shim
29
+
30
+
31
def probe(real_curl: str, url: str):
    """Ask the cache about ``url`` using the same curl that will be exec'ed.

    Runs a silent HEAD request (``-fsS -I``) bounded by
    ``_shim.PROBE_TIMEOUT`` seconds, with all standard streams detached.

    Returns True when curl exits 0 (hit), False when it exits 22 (``curl -f``
    on HTTP >= 400, i.e. a miss), and None for any other exit status or when
    curl cannot be started (treated as unreachable).
    """
    command = [
        real_curl,
        "-fsS",
        "-I",
        "-m",
        str(_shim.PROBE_TIMEOUT),
        "-o",
        os.devnull,
        url,
    ]
    try:
        finished = subprocess.run(
            command,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        return None

    verdicts = {0: True, 22: False}
    return verdicts.get(finished.returncode)
44
+
45
+
46
def main(argv=None):
    """Console entry point for the curl shim.

    Delegates to the shared shim core with the tool name ``"curl"`` and this
    module's ``probe``; ``argv`` is passed through unchanged.
    """
    _shim.run("curl", probe, argv=argv)
48
+
49
+
50
+ if __name__ == "__main__":
51
+ main()