inferhost 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
inferhost/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
inferhost/__main__.py ADDED
@@ -0,0 +1,4 @@
1
+ from inferhost.cli import app
2
+
3
+ if __name__ == "__main__":
4
+ app()
inferhost/cli.py ADDED
@@ -0,0 +1,305 @@
1
+ """inferhost command-line interface."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Annotated, Optional
5
+
6
+ import typer
7
+ from rich.console import Console
8
+ from rich.panel import Panel
9
+ from rich.table import Table
10
+
11
+ from inferhost import __version__
12
+ from inferhost.core import binaries, configs, hf, paths, processes, probe, quant, registry
13
+ from inferhost.core.logs import log_path, tail
14
+ from inferhost.settings import settings
15
+
16
+ app = typer.Typer(
17
+ name="inferhost",
18
+ help="Run any Hugging Face model on your own GPU. No configs, no YAML.",
19
+ no_args_is_help=False,
20
+ add_completion=False,
21
+ )
22
+ gateway_app = typer.Typer(name="gateway", help="Manage the LiteLLM OpenAI-compatible gateway.")
23
+ app.add_typer(gateway_app, name="gateway")
24
+
25
+ console = Console()
26
+
27
+
28
+ # ---- helpers ----
29
+
30
def _resolve_model_filename(repo_id: str, prefer_quant: Optional[str]) -> hf.GgufFile:
    """Choose which GGUF file of ``repo_id`` should be used.

    When ``prefer_quant`` is given, the first listed file whose quant matches
    it case-insensitively is returned. Otherwise (or when no file matches) the
    choice is delegated to ``quant.pick_best`` with a budget taken from the
    probed primary-GPU VRAM; if that yields nothing, the first listed file is
    used.

    Raises:
        typer.BadParameter: ``hf.list_ggufs`` returned no files.
    """
    candidates = hf.list_ggufs(repo_id)
    if not candidates:
        raise typer.BadParameter(f"No .gguf files found in {repo_id}")

    if prefer_quant:
        wanted = prefer_quant.upper()
        exact = next(
            (c for c in candidates if c.quant and c.quant.upper() == wanted),
            None,
        )
        if exact is not None:
            return exact
        console.print(f"[yellow]Requested quant {prefer_quant!r} not found; auto-picking.[/yellow]")

    detected_vram = probe.probe().primary_vram_gib
    if detected_vram > 0:
        budget = detected_vram
    else:
        # CPU fallback budget; user can override later
        budget = 8.0
    best = quant.pick_best(candidates, budget)
    return best or candidates[0]
43
+
44
+
45
def _add_model_to_registry(repo_id: str, prefer_quant: Optional[str], ctx: Optional[int]) -> registry.Model:
    """Download a GGUF from ``repo_id`` and record it in the model registry.

    Steps, in order: ensure directories exist, pick a file, download it, load
    the registry, build the ``registry.Model`` entry, save the registry and
    regenerate configs via ``configs.write_all``.

    The registry name is the normalized repo id, suffixed with the lower-cased
    quant (underscores turned into dashes) when the chosen file has one.
    ``ctx`` falls back to ``settings().default_ctx`` when falsy.

    Returns:
        The model entry that was added.
    """
    paths.ensure_dirs()
    chosen = _resolve_model_filename(repo_id, prefer_quant)
    console.print(f"Selected: [bold]{chosen.filename}[/bold] ({chosen.quant or '?'}, {chosen.size_gib} GiB)")
    console.print(f"Downloading from {repo_id} ...")
    downloaded = hf.download_gguf(repo_id, chosen.filename)

    reg = registry.load()
    model_name = hf.normalize_name(repo_id)
    if chosen.quant:
        quant_suffix = chosen.quant.lower().replace("_", "-")
        model_name = f"{model_name}-{quant_suffix}"

    cfg = settings()
    entry = registry.Model(
        name=model_name,
        repo_id=repo_id,
        filename=chosen.filename,
        quant=chosen.quant,
        ctx=ctx or cfg.default_ctx,
        port=reg.next_port(cfg.swap_port),
        size_gib=chosen.size_gib,
        local_path=str(downloaded),
    )
    reg.add(entry)
    registry.save(reg)
    configs.write_all(reg)
    return entry
70
+
71
+
72
+ # ---- commands ----
73
+
74
@app.command()
def install(
    skip_binaries: Annotated[bool, typer.Option("--skip-binaries", help="Just create dirs.")] = False,
) -> None:
    """First-time setup: download llama.cpp + llama-swap binaries, create dirs."""
    # NOTE: the docstring above is the command's CLI help text; keep it stable.
    paths.ensure_dirs()
    for locate in (paths.data_dir, paths.config_dir):
        console.print(f"[green]Created[/green] {locate()}")

    if skip_binaries:
        return

    # Each step: announce, install, then report where the binary landed.
    fetch_plan = (
        ("Fetching llama-server (llama.cpp) ...", binaries.install_llama_server),
        ("Fetching llama-swap ...", binaries.install_llama_swap),
    )
    for banner, installer in fetch_plan:
        console.print(banner)
        result = installer()
        console.print(f" → {result.path} ({result.version})")
    console.print("[bold green]Install complete.[/bold green]")
91
+
92
+
93
@app.command()
def doctor() -> None:
    """Show environment summary: binaries, GPU, config paths."""
    # NOTE: the docstring above is the command's CLI help text; keep it stable.
    report = probe.probe()
    installed = binaries.installed_versions()

    summary = Table(title="inferhost doctor", show_header=False, expand=False)
    summary.add_column(style="bold")
    summary.add_column()

    # Host facts.
    summary.add_row("Version", __version__)
    summary.add_row("OS / arch", f"{report.os} / {report.arch}")
    summary.add_row("RAM", f"{report.ram_gib} GiB")
    if not report.gpus:
        summary.add_row("GPU", "none detected")
    else:
        for gpu in report.gpus:
            summary.add_row(
                f"GPU {gpu.index}",
                f"{gpu.name} — {gpu.vram_total_gib} GiB ({gpu.vram_free_gib} free)",
            )

    # Filesystem locations.
    summary.add_row("Data dir", str(paths.data_dir()))
    summary.add_row("Config dir", str(paths.config_dir()))

    # Managed binaries and the optional gateway.
    missing_hint = "[red]missing[/red] (run `inferhost install`)"
    for label in ("llama-server", "llama-swap"):
        summary.add_row(label, "installed" if installed[label] else missing_hint)
    gateway_state = "available" if processes.gateway_available() else "not installed (optional)"
    summary.add_row("litellm gateway", gateway_state)

    console.print(summary)
    for note in report.notes:
        console.print(f"[yellow]Note:[/yellow] {note}")
117
+
118
+
119
@app.command()
def serve(
    repo_id: str = typer.Argument(..., help="Hugging Face repo id, e.g. Qwen/Qwen2.5-7B-Instruct-GGUF"),
    quant_pref: Annotated[Optional[str], typer.Option("--quant", help="Preferred quant (e.g. Q4_K_M)")] = None,
    ctx: Annotated[Optional[int], typer.Option("--ctx", help="Context length")] = None,
) -> None:
    """Add a Hugging Face model and start serving it (one-command path)."""
    # NOTE: the docstring above is the command's CLI help text; keep it stable.
    model = _add_model_to_registry(repo_id, quant_pref, ctx)
    console.print(f"[green]Added[/green] {model.name}")

    st = processes.start_swap()
    console.print(f"[green]llama-swap[/green] {'running' if st.running else 'failed to start'} on port {st.port}")
    if not st.running:
        # Do not show a "Ready" panel for an endpoint nothing is serving, and
        # report the failure through the exit status so scripts can detect it.
        # The model itself stays registered.
        console.print("[red]llama-swap is not running.[/red] Check: inferhost logs swap")
        raise typer.Exit(1)

    base = f"http://localhost:{st.port}/v1"
    console.print(Panel.fit(
        f"OpenAI-compatible endpoint:\n [bold cyan]{base}[/bold cyan]\n\n"
        f"Try it:\n"
        f" curl -s {base}/chat/completions \\\n"
        f" -H 'Content-Type: application/json' \\\n"
        f" -d '{{\"model\":\"{model.name}\",\"messages\":[{{\"role\":\"user\",\"content\":\"hi\"}}]}}'",
        title="Ready",
    ))
139
+
140
+
141
@app.command()
def add(
    repo_id: str = typer.Argument(...),
    quant_pref: Annotated[Optional[str], typer.Option("--quant")] = None,
    ctx: Annotated[Optional[int], typer.Option("--ctx")] = None,
) -> None:
    """Register a model without starting llama-swap."""
    # NOTE: the docstring above is the command's CLI help text; keep it stable.
    # Same download-and-register path as `serve`, minus the daemon start.
    registered = _add_model_to_registry(repo_id, quant_pref, ctx)
    console.print(f"[green]Added[/green] {registered.name}")
150
+
151
+
152
@app.command()
def start(
    name: Annotated[Optional[str], typer.Argument()] = None,
) -> None:
    """Start llama-swap (which lazy-spawns model backends on first request)."""
    # NOTE: the docstring above is the command's CLI help text; keep it stable.
    # `name` is accepted but only triggers an explanatory note: the daemon is
    # started for every registered model regardless.
    if name is not None:
        lazy_note = (
            "[yellow]Note:[/yellow] llama-swap loads models lazily on first request; "
            "starting the daemon serves all registered models."
        )
        console.print(lazy_note)

    # Regenerate configs from the registry before launching.
    configs.write_all(registry.load())
    daemon = processes.start_swap()
    state = "running" if daemon.running else "stopped"
    console.print(f"llama-swap: {state} (pid {daemon.pid}, port {daemon.port})")
166
+
167
+
168
@app.command()
def stop(
    all_: Annotated[bool, typer.Option("--all", help="Also stop the LiteLLM gateway.")] = False,
) -> None:
    """Stop llama-swap (and optionally the gateway)."""
    # NOTE: the docstring above is the command's CLI help text; keep it stable.
    # llama-swap is always stopped; the gateway only when --all was passed.
    stoppers = [processes.stop_swap]
    if all_:
        stoppers.append(processes.stop_gateway)
    for halt in stoppers:
        halt()
    console.print("Stopped.")
177
+
178
+
179
@app.command()
def restart() -> None:
    """Restart llama-swap with the current config."""
    # NOTE: the docstring above is the command's CLI help text; keep it stable.
    processes.stop_swap()
    # Rewrite configs from the registry so the new process sees current state.
    configs.write_all(registry.load())
    daemon = processes.start_swap()
    outcome = "running" if daemon.running else "failed"
    console.print(f"llama-swap: {outcome} (port {daemon.port})")
187
+
188
+
189
@app.command()
def ls() -> None:
    """List registered models and daemon status."""
    # NOTE: the docstring above is the command's CLI help text; keep it stable.
    reg = registry.load()
    if not reg.models:
        console.print("No models registered. Try: [bold]inferhost serve <hf_repo_id>[/bold]")
        return

    daemon = processes.swap_status()

    listing = Table(title="Models", expand=False)
    listing.add_column("name", style="bold cyan")
    for heading in ("repo", "quant", "size", "ctx", "port"):
        listing.add_column(heading)
    for model in reg.models:
        listing.add_row(
            model.name,
            model.repo_id,
            model.quant or "-",
            f"{model.size_gib} GiB",
            str(model.ctx),
            str(model.port),
        )
    console.print(listing)

    state = "[green]running[/green]" if daemon.running else "[red]stopped[/red]"
    console.print(f"\nllama-swap: {state} endpoint: http://localhost:{daemon.port}/v1")
211
+
212
+
213
@app.command()
def rm(name: str) -> None:
    """Remove a model from the registry. Does not delete the GGUF file from HF cache."""
    # NOTE: the docstring above is the command's CLI help text; keep it stable.
    reg = registry.load()
    was_removed = reg.remove(name)
    if not was_removed:
        raise typer.BadParameter(f"No model named {name!r}")

    # Persist the smaller registry, then regenerate configs from it.
    registry.save(reg)
    configs.write_all(reg)
    console.print(f"Removed {name}")
222
+
223
+
224
@app.command()
def logs(
    name: Annotated[str, typer.Argument(help="Model name, or 'swap' / 'gateway'")] = "swap",
    follow: Annotated[bool, typer.Option("--follow", "-f")] = False,
    n: Annotated[int, typer.Option("--lines", "-n")] = 200,
) -> None:
    """Show logs."""
    # NOTE: the docstring above is the command's CLI help text; keep it stable.
    path = log_path(name)
    if not path.exists():
        console.print(f"[yellow]No log file at {path}[/yellow]")
        return

    if not follow:
        # One-shot mode: print the last `n` lines and finish.
        for entry in tail(path, n):
            console.print(entry, markup=False, highlight=False)
        return

    # Imported here under an alias because the `follow` parameter shadows the
    # helper's name inside this function.
    from inferhost.core.logs import follow as follow_log

    try:
        for entry in follow_log(path):
            console.print(entry, markup=False, highlight=False)
    except KeyboardInterrupt:
        # Ctrl-C ends following quietly.
        return
245
+
246
+
247
@app.command()
def status() -> None:
    """Show daemon status table."""
    # NOTE: the docstring above is the command's CLI help text; keep it stable.
    # Query both daemons up front, then render one row per daemon.
    daemons = (processes.swap_status(), processes.gateway_status())

    overview = Table(title="Daemon status", expand=False)
    overview.add_column("name", style="bold")
    for heading in ("status", "pid", "port"):
        overview.add_column(heading)

    for daemon in daemons:
        overview.add_row(
            daemon.name,
            "running" if daemon.running else "stopped",
            str(daemon.pid or "-"),
            str(daemon.port or "-"),
        )
    console.print(overview)
260
+
261
+
262
@gateway_app.command("start")
def gateway_start() -> None:
    """Start the LiteLLM unified gateway."""
    # NOTE: the docstring above is the command's CLI help text; keep it stable.
    if not processes.gateway_available():
        install_hint = (
            "[red]litellm not installed.[/red] Install with: "
            "pip install 'inferhost[gateway]'"
        )
        console.print(install_hint)
        raise typer.Exit(1)

    gateway = processes.start_gateway()
    outcome = "running" if gateway.running else "failed"
    console.print(f"litellm gateway: {outcome} on port {gateway.port}")
    # Only advertise the URL when something is actually listening.
    if gateway.running:
        console.print(f" → http://localhost:{gateway.port}/v1")
275
+
276
+
277
@gateway_app.command("stop")
def gateway_stop() -> None:
    """Stop the LiteLLM gateway."""
    # NOTE: the docstring above is the command's CLI help text; keep it stable.
    # The confirmation is printed unconditionally after the stop call returns.
    halt_gateway = processes.stop_gateway
    halt_gateway()
    console.print("Gateway stopped.")
282
+
283
+
284
@app.command()
def tui() -> None:
    """Launch the interactive dashboard."""
    # NOTE: the docstring above is the command's CLI help text; keep it stable.
    # Imported inside the command so the TUI module is only loaded when this
    # command actually runs.
    import inferhost.tui.app as dashboard

    dashboard.run_tui()
289
+
290
+
291
@app.callback(invoke_without_command=True)
def _root(
    ctx: typer.Context,
    version: Annotated[bool, typer.Option("--version", help="Show version and exit.")] = False,
) -> None:
    # Root callback: runs before any subcommand (and also when none is given).
    # --version wins over everything else; a bare invocation prints the help
    # text; otherwise control falls through to the selected subcommand.
    if version:
        output = f"inferhost {__version__}"
    elif ctx.invoked_subcommand is None:
        output = ctx.get_help()
    else:
        return
    console.print(output)
    raise typer.Exit()
302
+
303
+
304
+ if __name__ == "__main__":
305
+ app()
File without changes
@@ -0,0 +1,308 @@
1
+ """Download and manage prebuilt binaries: llama.cpp (llama-server) and llama-swap."""
2
+ from __future__ import annotations
3
+
4
+ import io
5
+ import os
6
+ import platform
7
+ import shutil
8
+ import stat
9
+ import tarfile
10
+ import zipfile
11
+ from dataclasses import dataclass
12
+ from pathlib import Path
13
+
14
+ import httpx
15
+
16
+ from inferhost.core import paths
17
+ from inferhost.core.probe import probe
18
+ from inferhost.settings import settings
19
+
20
+ LLAMACPP_REPO = "ggml-org/llama.cpp"
21
+ LLAMASWAP_REPO = "mostlygeek/llama-swap"
22
+
23
+ GH_API = "https://api.github.com"
24
+
25
+
26
@dataclass
class ReleaseAsset:
    """One downloadable file attached to a release."""

    # File name of the asset as listed in the release.
    name: str
    # URL the asset bytes are downloaded from.
    download_url: str
    # Size reported for the asset; the pickers in this module pass 0 when the
    # release metadata carries no size.
    size: int
31
+
32
+
33
@dataclass
class InstalledBinary:
    """Result of installing one managed binary."""

    # Location of the installed executable.
    path: Path
    # Release tag the binary came from; the installers in this module use
    # "unknown" when the release metadata has no tag name.
    version: str
37
+
38
+
39
def _release_json(repo: str, version: str) -> dict:
    """Fetch release metadata for ``repo`` from the GitHub API.

    ``version`` is either the literal ``"latest"`` or a tag name. When the
    ``GITHUB_TOKEN`` environment variable is set (and non-empty) it is sent as
    a bearer token.

    Raises:
        httpx.HTTPStatusError: the API answered with an error status.
    """
    if version == "latest":
        endpoint = f"{GH_API}/repos/{repo}/releases/latest"
    else:
        endpoint = f"{GH_API}/repos/{repo}/releases/tags/{version}"

    request_headers = {"Accept": "application/vnd.github+json"}
    github_token = os.environ.get("GITHUB_TOKEN")
    if github_token:
        request_headers["Authorization"] = f"Bearer {github_token}"

    response = httpx.get(endpoint, headers=request_headers, timeout=30, follow_redirects=True)
    response.raise_for_status()
    return response.json()
52
+
53
+
54
+ def _platform_keys() -> tuple[str, str, str]:
55
+ sysname = platform.system().lower()
56
+ machine = platform.machine().lower()
57
+ if sysname == "darwin":
58
+ os_key = "macos"
59
+ swap_os = "darwin"
60
+ elif sysname == "linux":
61
+ os_key = "linux"
62
+ swap_os = "linux"
63
+ else:
64
+ raise RuntimeError(f"Unsupported OS: {sysname}")
65
+ if machine in ("x86_64", "amd64"):
66
+ cpp_arch = "x64"
67
+ swap_arch = "amd64"
68
+ elif machine in ("arm64", "aarch64"):
69
+ cpp_arch = "arm64"
70
+ swap_arch = "arm64"
71
+ else:
72
+ raise RuntimeError(f"Unsupported arch: {machine}")
73
+ return os_key, cpp_arch, swap_arch # plus swap_os derivable from sysname
74
+
75
+
76
+ _BACKEND_TAGS = ("cuda", "cu12", "cu11", "vulkan", "rocm", "hip", "sycl", "openvino", "kompute")
77
+
78
+
79
+ def _asset_backend(name: str) -> str:
80
+ n = name.lower()
81
+ for tag in _BACKEND_TAGS:
82
+ if tag in n:
83
+ return "cuda" if tag in ("cuda", "cu12", "cu11") else tag
84
+ return "cpu"
85
+
86
+
87
+ def _pick_llamacpp_asset(
88
+ assets: list[dict], os_key: str, arch: str, want_gpu: bool, preferred_backend: str | None = None
89
+ ) -> ReleaseAsset:
90
+ candidates = []
91
+ for a in assets:
92
+ name = a.get("name", "")
93
+ lname = name.lower()
94
+ if not (lname.endswith(".zip") or lname.endswith(".tar.gz")):
95
+ continue
96
+ if os_key == "linux" and not ("linux" in lname or "ubuntu" in lname):
97
+ continue
98
+ if os_key == "macos" and "macos" not in lname:
99
+ continue
100
+ if arch not in lname:
101
+ continue
102
+ candidates.append(a)
103
+
104
+ if not candidates:
105
+ raise RuntimeError(
106
+ f"No llama.cpp asset found for os={os_key} arch={arch}. "
107
+ f"Available: {[a.get('name') for a in assets][:10]}"
108
+ )
109
+
110
+ # macOS releases are universally Metal-accelerated; pick the plain build.
111
+ if os_key == "macos":
112
+ for a in candidates:
113
+ if _asset_backend(a["name"]) == "cpu":
114
+ return ReleaseAsset(name=a["name"], download_url=a["browser_download_url"], size=a.get("size", 0))
115
+ a = candidates[0]
116
+ return ReleaseAsset(name=a["name"], download_url=a["browser_download_url"], size=a.get("size", 0))
117
+
118
+ if want_gpu:
119
+ ranked_backends = (
120
+ (preferred_backend,) if preferred_backend else ()
121
+ ) + ("cuda", "vulkan", "rocm", "sycl", "openvino", "cpu")
122
+ else:
123
+ ranked_backends = ("cpu", "vulkan", "openvino", "sycl", "rocm", "cuda")
124
+
125
+ by_backend: dict[str, dict] = {}
126
+ for a in candidates:
127
+ b = _asset_backend(a["name"])
128
+ if b not in by_backend or a.get("size", 0) < by_backend[b].get("size", 0):
129
+ by_backend[b] = a
130
+
131
+ for backend in ranked_backends:
132
+ if backend and backend in by_backend:
133
+ a = by_backend[backend]
134
+ return ReleaseAsset(name=a["name"], download_url=a["browser_download_url"], size=a.get("size", 0))
135
+
136
+ a = candidates[0]
137
+ return ReleaseAsset(name=a["name"], download_url=a["browser_download_url"], size=a.get("size", 0))
138
+
139
+
140
+ def _pick_llamaswap_asset(assets: list[dict], swap_os: str, swap_arch: str) -> ReleaseAsset:
141
+ for a in assets:
142
+ name = a.get("name", "").lower()
143
+ if not name.endswith(".tar.gz"):
144
+ continue
145
+ if swap_os in name and swap_arch in name:
146
+ return ReleaseAsset(name=a["name"], download_url=a["browser_download_url"], size=a.get("size", 0))
147
+ raise RuntimeError(
148
+ f"No llama-swap asset found for os={swap_os} arch={swap_arch}. "
149
+ f"Available: {[a.get('name') for a in assets][:10]}"
150
+ )
151
+
152
+
153
def _download(url: str) -> bytes:
    """Download ``url`` (following redirects) and return the whole body.

    The body is read fully into memory before returning.

    Raises:
        httpx.HTTPStatusError: the server answered with an error status.
    """
    with httpx.stream("GET", url, follow_redirects=True, timeout=120) as response:
        response.raise_for_status()
        payload = response.read()
    return payload
157
+
158
+
159
+ def _is_lib_or_binary(name: str) -> bool:
160
+ base = Path(name).name.lower()
161
+ if base.startswith("lib") and (".so" in base or ".dylib" in base):
162
+ return True
163
+ return False
164
+
165
+
166
+ def _extract_archive(
167
+ blob: bytes,
168
+ name: str,
169
+ dest_dir: Path,
170
+ want_basenames: tuple[str, ...],
171
+ take_libs: bool = False,
172
+ ) -> list[Path]:
173
+ """Extract archive. Pulls out files matching want_basenames; if take_libs, also pulls .so/.dylib."""
174
+ extracted: list[Path] = []
175
+ dest_dir.mkdir(parents=True, exist_ok=True)
176
+
177
+ def wants(member_name: str, is_file: bool) -> bool:
178
+ if not is_file:
179
+ return False
180
+ base = Path(member_name).name
181
+ stem = base.split(".")[0]
182
+ if base in want_basenames or stem in want_basenames:
183
+ return True
184
+ if take_libs and _is_lib_or_binary(member_name):
185
+ return True
186
+ return False
187
+
188
+ if name.lower().endswith(".zip"):
189
+ with zipfile.ZipFile(io.BytesIO(blob)) as z:
190
+ for info in z.infolist():
191
+ if not wants(info.filename, not info.is_dir()):
192
+ continue
193
+ target = dest_dir / Path(info.filename).name
194
+ with z.open(info) as src, target.open("wb") as dst:
195
+ shutil.copyfileobj(src, dst)
196
+ target.chmod(target.stat().st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
197
+ extracted.append(target)
198
+ elif name.lower().endswith(".tar.gz"):
199
+ with tarfile.open(fileobj=io.BytesIO(blob), mode="r:gz") as t:
200
+ for member in t.getmembers():
201
+ if not wants(member.name, member.isfile()):
202
+ continue
203
+ f = t.extractfile(member)
204
+ if f is None:
205
+ continue
206
+ target = dest_dir / Path(member.name).name
207
+ with target.open("wb") as dst:
208
+ shutil.copyfileobj(f, dst)
209
+ target.chmod(target.stat().st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
210
+ extracted.append(target)
211
+ else:
212
+ raise RuntimeError(f"Unsupported archive type: {name}")
213
+ return extracted
214
+
215
+
216
+ def _link_so_versions(directory: Path) -> None:
217
+ """For each libfoo.so.MAJOR.MINOR[.PATCH], create symlinks libfoo.so.MAJOR and libfoo.so."""
218
+ import re
219
+
220
+ so_pattern = re.compile(r"^(lib[\w\-]+\.so)\.([\d.]+)$")
221
+ dylib_pattern = re.compile(r"^(lib[\w\-]+)\.([\d.]+)\.dylib$")
222
+
223
+ for f in directory.iterdir():
224
+ if not f.is_file():
225
+ continue
226
+ m = so_pattern.match(f.name)
227
+ if m:
228
+ base = m.group(1) # "libfoo.so"
229
+ version = m.group(2) # e.g. "0.0.9244"
230
+ major = version.split(".")[0]
231
+ for link in (f"{base}.{major}", base):
232
+ link_path = directory / link
233
+ if link_path.exists() or link_path.is_symlink():
234
+ try:
235
+ link_path.unlink()
236
+ except OSError:
237
+ continue
238
+ try:
239
+ link_path.symlink_to(f.name)
240
+ except OSError:
241
+ pass
242
+ continue
243
+ m = dylib_pattern.match(f.name)
244
+ if m:
245
+ base = m.group(1) # "libfoo"
246
+ link_path = directory / f"{base}.dylib"
247
+ if link_path.exists() or link_path.is_symlink():
248
+ try:
249
+ link_path.unlink()
250
+ except OSError:
251
+ continue
252
+ try:
253
+ link_path.symlink_to(f.name)
254
+ except OSError:
255
+ pass
256
+
257
+
258
def install_llama_server(version: str | None = None) -> InstalledBinary:
    """Download a llama.cpp release and install ``llama-server`` plus its libraries.

    ``version`` defaults to ``settings().llamacpp_version``. The backend
    ranking can be influenced through the ``INFERHOST_LLAMACPP_BACKEND``
    environment variable, which is passed to the asset picker as the preferred
    backend. After extraction into ``paths.bin_dir()`` the versioned shared
    libraries get their unversioned symlinks.

    Raises:
        RuntimeError: ``paths.llama_server_path()`` does not exist after
            extraction.
    """
    paths.ensure_dirs()
    requested = version or settings().llamacpp_version
    release = _release_json(LLAMACPP_REPO, requested)
    os_key, arch, _ = _platform_keys()
    backend_override = os.environ.get("INFERHOST_LLAMACPP_BACKEND")
    asset = _pick_llamacpp_asset(
        release["assets"],
        os_key,
        arch,
        want_gpu=probe().has_gpu,
        preferred_backend=backend_override,
    )
    archive = _download(asset.download_url)
    # The returned list is not needed: success is judged by the expected path.
    _extract_archive(archive, asset.name, paths.bin_dir(), want_basenames=("llama-server",), take_libs=True)

    server_path = paths.llama_server_path()
    if not server_path.exists():
        raise RuntimeError(f"llama-server not found inside {asset.name}")
    _link_so_versions(paths.bin_dir())
    return InstalledBinary(path=server_path, version=release.get("tag_name", "unknown"))
278
+
279
+
280
def install_llama_swap(version: str | None = None) -> InstalledBinary:
    """Download a llama-swap release and install its binary.

    ``version`` defaults to ``settings().llamaswap_version``. The first file
    extracted from the archive is moved to ``paths.llama_swap_path()`` when it
    was written under a different name.

    Raises:
        RuntimeError: the archive contained no ``llama-swap`` file.
    """
    paths.ensure_dirs()
    requested = version or settings().llamaswap_version
    release = _release_json(LLAMASWAP_REPO, requested)

    # Anything that is not macOS is looked up as "linux"; unsupported
    # platforms are rejected by _platform_keys() right after.
    if platform.system().lower() == "darwin":
        swap_os = "darwin"
    else:
        swap_os = "linux"
    swap_arch = _platform_keys()[2]

    asset = _pick_llamaswap_asset(release["assets"], swap_os, swap_arch)
    archive = _download(asset.download_url)
    unpacked = _extract_archive(archive, asset.name, paths.bin_dir(), want_basenames=("llama-swap",))
    if not unpacked:
        raise RuntimeError(f"llama-swap binary not found inside {asset.name}")

    final_path = paths.llama_swap_path()
    first = unpacked[0]
    if first != final_path:
        shutil.move(str(first), str(final_path))
    return InstalledBinary(path=final_path, version=release.get("tag_name", "unknown"))
301
+
302
+
303
def installed_versions() -> dict[str, str | None]:
    """Report which managed binaries are present on disk.

    Returns:
        A dict with the keys ``"llama-server"`` and ``"llama-swap"``; the value
        is the string ``"installed"`` when the expected path exists, otherwise
        ``None``. No version number is looked up.
    """
    expected = {
        "llama-server": paths.llama_server_path(),
        "llama-swap": paths.llama_swap_path(),
    }
    return {label: ("installed" if location.exists() else None) for label, location in expected.items()}