PyPI - inferhost - Versions diffs - 0.1.0__py3-none-any.whl - Mend

inferhost 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

inferhost/__init__.py +1 -0
inferhost/__main__.py +4 -0
inferhost/cli.py +305 -0
inferhost/core/__init__.py +0 -0
inferhost/core/binaries.py +308 -0
inferhost/core/configs.py +78 -0
inferhost/core/hf.py +80 -0
inferhost/core/logs.py +53 -0
inferhost/core/paths.py +79 -0
inferhost/core/probe.py +113 -0
inferhost/core/processes.py +226 -0
inferhost/core/quant.py +67 -0
inferhost/core/registry.py +106 -0
inferhost/settings.py +57 -0
inferhost/tui/__init__.py +0 -0
inferhost/tui/app.py +28 -0
inferhost/tui/screens/__init__.py +0 -0
inferhost/tui/screens/add_model.py +121 -0
inferhost/tui/screens/dashboard.py +165 -0
inferhost/tui/styles.tcss +91 -0
inferhost-0.1.0.dist-info/METADATA +417 -0
inferhost-0.1.0.dist-info/RECORD +25 -0
inferhost-0.1.0.dist-info/WHEEL +4 -0
inferhost-0.1.0.dist-info/entry_points.txt +2 -0
inferhost-0.1.0.dist-info/licenses/LICENSE +201 -0

inferhost/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.1.0"

inferhost/__main__.py ADDED Viewed

@@ -0,0 +1,4 @@
+# Module entry point for `python -m inferhost`: hands control to the Typer CLI app.
+from inferhost.cli import app
+if __name__ == "__main__":
+    app()

inferhost/cli.py ADDED Viewed

@@ -0,0 +1,305 @@
+"""inferhost command-line interface."""
+from __future__ import annotations
+from typing import Annotated, Optional
+import typer
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+from inferhost import __version__
+from inferhost.core import binaries, configs, hf, paths, processes, probe, quant, registry
+from inferhost.core.logs import log_path, tail
+from inferhost.settings import settings
+# Root Typer application. no_args_is_help stays False because the `_root`
+# callback prints the help text itself when no sub-command is given.
+app = typer.Typer(
+    name="inferhost",
+    help="Run any Hugging Face model on your own GPU. No configs, no YAML.",
+    no_args_is_help=False,
+    add_completion=False,
+)
+# Sub-application mounted under the root app as `inferhost gateway ...`.
+gateway_app = typer.Typer(name="gateway", help="Manage the LiteLLM OpenAI-compatible gateway.")
+app.add_typer(gateway_app, name="gateway")
+# Shared Rich console used by every command in this module for output.
+console = Console()
+# ---- helpers ----
+def _resolve_model_filename(repo_id: str, prefer_quant: Optional[str]) -> hf.GgufFile:
+    """Choose which GGUF file of *repo_id* to use.
+    An exact (case-insensitive) match for *prefer_quant* wins. Otherwise the
+    choice is delegated to ``quant.pick_best`` with the probed primary VRAM as
+    budget, falling back to the first listed file.
+    Raises typer.BadParameter when the repo lists no GGUF files."""
+    candidates = hf.list_ggufs(repo_id)
+    if not candidates:
+        raise typer.BadParameter(f"No .gguf files found in {repo_id}")
+    if prefer_quant:
+        wanted = prefer_quant.upper()
+        exact = next((c for c in candidates if c.quant and c.quant.upper() == wanted), None)
+        if exact is not None:
+            return exact
+        console.print(f"[yellow]Requested quant {prefer_quant!r} not found; auto-picking.[/yellow]")
+    detected_vram = probe.probe().primary_vram_gib
+    # CPU fallback budget; user can override later
+    budget_gib = detected_vram if detected_vram > 0 else 8.0
+    return quant.pick_best(candidates, budget_gib) or candidates[0]
+def _add_model_to_registry(repo_id: str, prefer_quant: Optional[str], ctx: Optional[int]) -> registry.Model:
+    """Select, download and register a GGUF from *repo_id*.
+    The new entry is added to the registry, the registry is saved and the
+    configs are rewritten via ``configs.write_all``. Returns the new entry."""
+    paths.ensure_dirs()
+    chosen = _resolve_model_filename(repo_id, prefer_quant)
+    console.print(f"Selected: [bold]{chosen.filename}[/bold]  ({chosen.quant or '?'}, {chosen.size_gib} GiB)")
+    console.print(f"Downloading from {repo_id} ...")
+    gguf_path = hf.download_gguf(repo_id, chosen.filename)
+    reg = registry.load()
+    base_name = hf.normalize_name(repo_id)
+    # Append the quant (lower-cased, underscores turned into dashes) when it is known.
+    model_name = f"{base_name}-{chosen.quant.lower().replace('_', '-')}" if chosen.quant else base_name
+    cfg = settings()
+    entry = registry.Model(
+        name=model_name,
+        repo_id=repo_id,
+        filename=chosen.filename,
+        quant=chosen.quant,
+        ctx=ctx or cfg.default_ctx,
+        port=reg.next_port(cfg.swap_port),
+        size_gib=chosen.size_gib,
+        local_path=str(gguf_path),
+    )
+    reg.add(entry)
+    registry.save(reg)
+    configs.write_all(reg)
+    return entry
+# ---- commands ----
+@app.command()
+def install(
+    skip_binaries: Annotated[bool, typer.Option("--skip-binaries", help="Just create dirs.")] = False,
+) -> None:
+    """First-time setup: download llama.cpp + llama-swap binaries, create dirs."""
+    paths.ensure_dirs()
+    for location in (paths.data_dir, paths.config_dir):
+        console.print(f"[green]Created[/green] {location()}")
+    if skip_binaries:
+        return
+    # Each step: announce, install, then report where the binary landed.
+    steps = (
+        ("Fetching llama-server (llama.cpp) ...", binaries.install_llama_server),
+        ("Fetching llama-swap ...", binaries.install_llama_swap),
+    )
+    for banner, installer in steps:
+        console.print(banner)
+        installed = installer()
+        console.print(f"  → {installed.path} ({installed.version})")
+    console.print("[bold green]Install complete.[/bold green]")
+@app.command()
+def doctor() -> None:
+    """Show environment summary: binaries, GPU, config paths."""
+    report = probe.probe()
+    found = binaries.installed_versions()
+    summary = Table(title="inferhost doctor", show_header=False, expand=False)
+    summary.add_column(style="bold")
+    summary.add_column()
+    summary.add_row("Version", __version__)
+    summary.add_row("OS / arch", f"{report.os} / {report.arch}")
+    summary.add_row("RAM", f"{report.ram_gib} GiB")
+    if not report.gpus:
+        summary.add_row("GPU", "none detected")
+    else:
+        for gpu in report.gpus:
+            summary.add_row(f"GPU {gpu.index}", f"{gpu.name} — {gpu.vram_total_gib} GiB ({gpu.vram_free_gib} free)")
+    summary.add_row("Data dir", str(paths.data_dir()))
+    summary.add_row("Config dir", str(paths.config_dir()))
+    # Both managed binaries share the same installed/missing wording.
+    missing = "[red]missing[/red] (run `inferhost install`)"
+    for binary in ("llama-server", "llama-swap"):
+        summary.add_row(binary, "installed" if found[binary] else missing)
+    gateway_state = "available" if processes.gateway_available() else "not installed (optional)"
+    summary.add_row("litellm gateway", gateway_state)
+    console.print(summary)
+    for note in report.notes:
+        console.print(f"[yellow]Note:[/yellow] {note}")
+@app.command()
+def serve(
+    repo_id: str = typer.Argument(..., help="Hugging Face repo id, e.g. Qwen/Qwen2.5-7B-Instruct-GGUF"),
+    quant_pref: Annotated[Optional[str], typer.Option("--quant", help="Preferred quant (e.g. Q4_K_M)")] = None,
+    ctx: Annotated[Optional[int], typer.Option("--ctx", help="Context length")] = None,
+) -> None:
+    """Add a Hugging Face model and start serving it (one-command path)."""
+    # Resolve, download and register the model before touching the daemon.
+    model = _add_model_to_registry(repo_id, quant_pref, ctx)
+    console.print(f"[green]Added[/green] {model.name}")
+    st = processes.start_swap()
+    if not st.running:
+        # Never advertise an endpoint that is not up: report the failure and
+        # exit non-zero so scripts can detect it (the model stays registered).
+        console.print(f"[red]llama-swap[/red] failed to start on port {st.port}")
+        console.print("Check the daemon log with: [bold]inferhost logs swap[/bold]")
+        raise typer.Exit(1)
+    console.print(f"[green]llama-swap[/green] running on port {st.port}")
+    base = f"http://localhost:{st.port}/v1"
+    # Show the endpoint together with a ready-to-paste curl example.
+    console.print(Panel.fit(
+        f"OpenAI-compatible endpoint:\n  [bold cyan]{base}[/bold cyan]\n\n"
+        f"Try it:\n"
+        f"  curl -s {base}/chat/completions \\\n"
+        f"    -H 'Content-Type: application/json' \\\n"
+        f"    -d '{{\"model\":\"{model.name}\",\"messages\":[{{\"role\":\"user\",\"content\":\"hi\"}}]}}'",
+        title="Ready",
+    ))
+@app.command()
+def add(
+    repo_id: str = typer.Argument(..., help="Hugging Face repo id, e.g. Qwen/Qwen2.5-7B-Instruct-GGUF"),
+    quant_pref: Annotated[Optional[str], typer.Option("--quant", help="Preferred quant (e.g. Q4_K_M)")] = None,
+    ctx: Annotated[Optional[int], typer.Option("--ctx", help="Context length")] = None,
+) -> None:
+    """Register a model without starting llama-swap."""
+    # Same registration path as `serve` (identical parameters and help text),
+    # minus starting the daemon.
+    model = _add_model_to_registry(repo_id, quant_pref, ctx)
+    console.print(f"[green]Added[/green] {model.name}")
+@app.command()
+def start(
+    name: Annotated[Optional[str], typer.Argument()] = None,
+) -> None:
+    """Start llama-swap (which lazy-spawns model backends on first request)."""
+    # A model name is accepted but only triggers an explanatory note.
+    if name is not None:
+        console.print(
+            "[yellow]Note:[/yellow] llama-swap loads models lazily on first request; "
+            "starting the daemon serves all registered models."
+        )
+    # Rewrite the configs from the current registry before starting the daemon.
+    configs.write_all(registry.load())
+    daemon = processes.start_swap()
+    state = "running" if daemon.running else "stopped"
+    console.print(f"llama-swap: {state} (pid {daemon.pid}, port {daemon.port})")
+@app.command()
+def stop(
+    all_: Annotated[bool, typer.Option("--all", help="Also stop the LiteLLM gateway.")] = False,
+) -> None:
+    """Stop llama-swap (and optionally the gateway)."""
+    # llama-swap is always stopped; the gateway only when --all is given.
+    stoppers = [processes.stop_swap]
+    if all_:
+        stoppers.append(processes.stop_gateway)
+    for halt in stoppers:
+        halt()
+    console.print("Stopped.")
+@app.command()
+def restart() -> None:
+    """Restart llama-swap with the current config."""
+    processes.stop_swap()
+    # Rewrite the configs from the registry so the new process sees the current state.
+    configs.write_all(registry.load())
+    daemon = processes.start_swap()
+    outcome = "running" if daemon.running else "failed"
+    console.print(f"llama-swap: {outcome} (port {daemon.port})")
+@app.command()
+def ls() -> None:
+    """List registered models and daemon status."""
+    reg = registry.load()
+    if not reg.models:
+        console.print("No models registered. Try: [bold]inferhost serve <hf_repo_id>[/bold]")
+        return
+    daemon = processes.swap_status()
+    listing = Table(title="Models", expand=False)
+    # Only the name column is styled; the remaining columns are plain.
+    listing.add_column("name", style="bold cyan")
+    for heading in ("repo", "quant", "size", "ctx", "port"):
+        listing.add_column(heading)
+    for entry in reg.models:
+        listing.add_row(
+            entry.name,
+            entry.repo_id,
+            entry.quant or "-",
+            f"{entry.size_gib} GiB",
+            str(entry.ctx),
+            str(entry.port),
+        )
+    console.print(listing)
+    badge = "[green]running[/green]" if daemon.running else "[red]stopped[/red]"
+    console.print(f"\nllama-swap: {badge}  endpoint: http://localhost:{daemon.port}/v1")
+@app.command()
+def rm(name: str) -> None:
+    """Remove a model from the registry. Does not delete the GGUF file from HF cache."""
+    reg = registry.load()
+    was_removed = reg.remove(name)
+    if not was_removed:
+        raise typer.BadParameter(f"No model named {name!r}")
+    # Save the shrunken registry, then rewrite the configs from it.
+    registry.save(reg)
+    configs.write_all(reg)
+    console.print(f"Removed {name}")
+@app.command()
+def logs(
+    name: Annotated[str, typer.Argument(help="Model name, or 'swap' / 'gateway'")] = "swap",
+    follow: Annotated[bool, typer.Option("--follow", "-f")] = False,
+    n: Annotated[int, typer.Option("--lines", "-n")] = 200,
+) -> None:
+    """Show logs."""
+    log_file = log_path(name)
+    if not log_file.exists():
+        console.print(f"[yellow]No log file at {log_file}[/yellow]")
+        return
+    if not follow:
+        # One-shot mode: print the last n lines and finish.
+        for entry in tail(log_file, n):
+            console.print(entry, markup=False, highlight=False)
+        return
+    # Imported under an alias because the `follow` option shadows the function name.
+    from inferhost.core.logs import follow as follow_log
+    try:
+        for entry in follow_log(log_file):
+            console.print(entry, markup=False, highlight=False)
+    except KeyboardInterrupt:
+        # Ctrl-C ends follow mode quietly.
+        pass
+@app.command()
+def status() -> None:
+    """Show daemon status table."""
+    # Query llama-swap first, then the gateway; rows are rendered in that order.
+    daemons = (processes.swap_status(), processes.gateway_status())
+    overview = Table(title="Daemon status", expand=False)
+    overview.add_column("name", style="bold")
+    for heading in ("status", "pid", "port"):
+        overview.add_column(heading)
+    for daemon in daemons:
+        overview.add_row(
+            daemon.name,
+            "running" if daemon.running else "stopped",
+            str(daemon.pid or "-"),
+            str(daemon.port or "-"),
+        )
+    console.print(overview)
+@gateway_app.command("start")
+def gateway_start() -> None:
+    """Start the LiteLLM unified gateway."""
+    if not processes.gateway_available():
+        console.print("[red]litellm not installed.[/red] Install with: pip install 'inferhost[gateway]'")
+        raise typer.Exit(1)
+    gateway = processes.start_gateway()
+    outcome = "running" if gateway.running else "failed"
+    console.print(f"litellm gateway: {outcome} on port {gateway.port}")
+    # The endpoint is only printed once the gateway reports it is running.
+    if gateway.running:
+        console.print(f"  → http://localhost:{gateway.port}/v1")
+@gateway_app.command("stop")
+def gateway_stop() -> None:
+    """Stop the LiteLLM gateway."""
+    # Delegates to processes.stop_gateway(); the confirmation is printed unconditionally.
+    processes.stop_gateway()
+    console.print("Gateway stopped.")
+@app.command()
+def tui() -> None:
+    """Launch the interactive dashboard."""
+    # Imported inside the command, so the TUI module is only loaded when this command runs.
+    from inferhost.tui.app import run_tui
+    run_tui()
+@app.callback(invoke_without_command=True)
+def _root(
+    ctx: typer.Context,
+    version: Annotated[bool, typer.Option("--version", help="Show version and exit.")] = False,
+) -> None:
+    # --version takes precedence; with no sub-command the help text is shown.
+    # In both cases the program exits right after printing.
+    if version:
+        message = f"inferhost {__version__}"
+    elif ctx.invoked_subcommand is None:
+        message = ctx.get_help()
+    else:
+        # A sub-command was given: let Typer dispatch to it.
+        return
+    console.print(message)
+    raise typer.Exit()
+# Allow running this module directly as a script.
+if __name__ == "__main__":
+    app()

inferhost/core/__init__.py ADDED Viewed

File without changes

inferhost/core/binaries.py ADDED Viewed

@@ -0,0 +1,308 @@
+"""Download and manage prebuilt binaries: llama.cpp (llama-server) and llama-swap."""
+from __future__ import annotations
+import io
+import os
+import platform
+import shutil
+import stat
+import tarfile
+import zipfile
+from dataclasses import dataclass
+from pathlib import Path
+import httpx
+from inferhost.core import paths
+from inferhost.core.probe import probe
+from inferhost.settings import settings
+LLAMACPP_REPO = "ggml-org/llama.cpp"
+LLAMASWAP_REPO = "mostlygeek/llama-swap"
+GH_API = "https://api.github.com"
+@dataclass
+class ReleaseAsset:
+    """One downloadable file attached to a GitHub release."""
+    name: str  # asset file name as listed in the release JSON
+    download_url: str  # filled from the asset's "browser_download_url" field
+    size: int  # asset "size" from the release JSON; 0 when the field is absent
+@dataclass
+class InstalledBinary:
+    """Result returned by the install_* functions for one binary."""
+    path: Path  # location of the installed executable
+    version: str  # the release's "tag_name", or "unknown" when the release JSON has none
+def _release_json(repo: str, version: str) -> dict:
+    """Fetch the GitHub release metadata for *repo*.
+    *version* is either the literal ``"latest"`` or a release tag. When the
+    ``GITHUB_TOKEN`` environment variable is set it is sent as a bearer token.
+    HTTP error statuses raise via ``raise_for_status``."""
+    suffix = "latest" if version == "latest" else f"tags/{version}"
+    endpoint = f"{GH_API}/repos/{repo}/releases/{suffix}"
+    request_headers = {"Accept": "application/vnd.github+json"}
+    gh_token = os.environ.get("GITHUB_TOKEN")
+    if gh_token:
+        request_headers["Authorization"] = f"Bearer {gh_token}"
+    response = httpx.get(endpoint, headers=request_headers, timeout=30, follow_redirects=True)
+    response.raise_for_status()
+    return response.json()
+def _platform_keys() -> tuple[str, str, str]:
+    """Map the running platform onto the names used in release asset files.
+    Returns ``(os_key, cpp_arch, swap_arch)``: the llama.cpp OS key
+    ("macos" / "linux"), the llama.cpp arch key ("x64" / "arm64") and the
+    llama-swap arch key ("amd64" / "arm64"). The llama-swap OS key is not
+    part of the tuple; callers derive it from ``platform.system()``.
+    Raises RuntimeError for any other OS or machine type."""
+    sysname = platform.system().lower()
+    machine = platform.machine().lower()
+    os_keys = {"darwin": "macos", "linux": "linux"}
+    # The OS is validated before the architecture, so an unsupported OS is reported first.
+    if sysname not in os_keys:
+        raise RuntimeError(f"Unsupported OS: {sysname}")
+    if machine in ("x86_64", "amd64"):
+        cpp_arch, swap_arch = "x64", "amd64"
+    elif machine in ("arm64", "aarch64"):
+        cpp_arch, swap_arch = "arm64", "arm64"
+    else:
+        raise RuntimeError(f"Unsupported arch: {machine}")
+    return os_keys[sysname], cpp_arch, swap_arch
+# Substrings that mark an asset as a specific accelerator build, checked in this order.
+_BACKEND_TAGS = ("cuda", "cu12", "cu11", "vulkan", "rocm", "hip", "sycl", "openvino", "kompute")
+def _asset_backend(name: str) -> str:
+    """Classify an asset file name by the first backend tag it contains.
+    All CUDA spellings collapse to "cuda"; a name without any tag is "cpu"."""
+    lowered = name.lower()
+    hit = next((tag for tag in _BACKEND_TAGS if tag in lowered), None)
+    if hit is None:
+        return "cpu"
+    if hit in ("cuda", "cu12", "cu11"):
+        return "cuda"
+    return hit
+def _as_release_asset(raw: dict) -> ReleaseAsset:
+    """Convert one GitHub release-asset JSON object into a ReleaseAsset."""
+    return ReleaseAsset(name=raw["name"], download_url=raw["browser_download_url"], size=raw.get("size", 0))
+def _pick_llamacpp_asset(
+    assets: list[dict], os_key: str, arch: str, want_gpu: bool, preferred_backend: str | None = None
+) -> ReleaseAsset:
+    """Choose the llama.cpp release asset to download for this platform.
+    Only ``.zip`` / ``.tar.gz`` assets whose name mentions the OS and *arch*
+    are considered. On macOS the plain (untagged) build is returned. Elsewhere
+    the backends are tried in ranked order: *preferred_backend* first when
+    given (case-insensitive, "cu11"/"cu12" treated as "cuda", honoured whether
+    or not a GPU was detected), then a GPU-first or CPU-first ranking
+    depending on *want_gpu*. Within one backend the smallest asset wins.
+    Raises RuntimeError when no asset matches the OS and architecture."""
+    candidates = []
+    for a in assets:
+        lname = a.get("name", "").lower()
+        if not lname.endswith((".zip", ".tar.gz")):
+            continue
+        if os_key == "linux" and not ("linux" in lname or "ubuntu" in lname):
+            continue
+        if os_key == "macos" and "macos" not in lname:
+            continue
+        if arch not in lname:
+            continue
+        candidates.append(a)
+    if not candidates:
+        raise RuntimeError(
+            f"No llama.cpp asset found for os={os_key} arch={arch}. "
+            f"Available: {[a.get('name') for a in assets][:10]}"
+        )
+    # macOS releases are universally Metal-accelerated; pick the plain build.
+    if os_key == "macos":
+        plain = next((a for a in candidates if _asset_backend(a["name"]) == "cpu"), candidates[0])
+        return _as_release_asset(plain)
+    # Every backend _asset_backend can report appears in both rankings, so a
+    # "hip" or "kompute" build is no longer skipped by the ranked search.
+    if want_gpu:
+        ranked_backends = ("cuda", "vulkan", "rocm", "sycl", "openvino", "hip", "kompute", "cpu")
+    else:
+        ranked_backends = ("cpu", "vulkan", "openvino", "sycl", "rocm", "cuda", "hip", "kompute")
+    # An explicit preference is a user override: apply it even when no GPU was detected.
+    preferred = (preferred_backend or "").strip().lower()
+    if preferred in ("cu11", "cu12"):
+        preferred = "cuda"
+    if preferred:
+        ranked_backends = (preferred,) + ranked_backends
+    # Keep the smallest asset seen for each backend.
+    by_backend: dict[str, dict] = {}
+    for a in candidates:
+        b = _asset_backend(a["name"])
+        if b not in by_backend or a.get("size", 0) < by_backend[b].get("size", 0):
+            by_backend[b] = a
+    for backend in ranked_backends:
+        if backend in by_backend:
+            return _as_release_asset(by_backend[backend])
+    # Defensive fallback in case a backend name is ever missing from the rankings.
+    return _as_release_asset(candidates[0])
+def _pick_llamaswap_asset(assets: list[dict], swap_os: str, swap_arch: str) -> ReleaseAsset:
+    """Return the first ``.tar.gz`` asset whose name contains both *swap_os* and *swap_arch*.
+    Raises RuntimeError when no asset matches."""
+    for raw in assets:
+        lowered = raw.get("name", "").lower()
+        is_match = lowered.endswith(".tar.gz") and swap_os in lowered and swap_arch in lowered
+        if is_match:
+            return ReleaseAsset(
+                name=raw["name"],
+                download_url=raw["browser_download_url"],
+                size=raw.get("size", 0),
+            )
+    raise RuntimeError(
+        f"No llama-swap asset found for os={swap_os} arch={swap_arch}. "
+        f"Available: {[a.get('name') for a in assets][:10]}"
+    )
+def _download(url: str) -> bytes:
+    """Fetch *url* (following redirects) and return the whole response body.
+    HTTP error statuses raise via ``raise_for_status``."""
+    request = httpx.stream("GET", url, follow_redirects=True, timeout=120)
+    with request as response:
+        response.raise_for_status()
+        payload = response.read()
+    return payload
+def _is_lib_or_binary(name: str) -> bool:
+    """Tell whether *name*'s basename looks like a shared library.
+    True when the lower-cased basename starts with "lib" and contains ".so" or ".dylib"."""
+    basename = Path(name).name.lower()
+    return basename.startswith("lib") and any(marker in basename for marker in (".so", ".dylib"))
+def _write_executable(src, target: Path) -> None:
+    """Copy the stream *src* to *target* and add the executable bits.
+    The data is staged in a sibling ``.part`` file and moved into place with
+    os.replace, so an interrupted extraction never leaves a truncated file at
+    *target*, and an existing file or symlink there is swapped, not written through."""
+    staging = target.with_name(target.name + ".part")
+    try:
+        with staging.open("wb") as dst:
+            shutil.copyfileobj(src, dst)
+        staging.chmod(staging.stat().st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
+        os.replace(staging, target)
+    finally:
+        # No-op after a successful replace; removes the partial file on failure.
+        staging.unlink(missing_ok=True)
+def _extract_archive(
+    blob: bytes,
+    name: str,
+    dest_dir: Path,
+    want_basenames: tuple[str, ...],
+    take_libs: bool = False,
+) -> list[Path]:
+    """Extract selected files from an in-memory ``.zip`` or ``.tar.gz`` archive.
+    A member is taken when its basename, or the part of the basename before
+    the first dot, is in *want_basenames*; with *take_libs* shared libraries
+    (.so/.dylib) are taken too. Files are written flat into *dest_dir* under
+    their basename only, so archive paths cannot escape it, and are made
+    executable. Returns the written paths in archive order.
+    Raises RuntimeError when *name* has neither supported extension."""
+    extracted: list[Path] = []
+    dest_dir.mkdir(parents=True, exist_ok=True)
+    def wants(member_name: str, is_file: bool) -> bool:
+        if not is_file:
+            return False
+        base = Path(member_name).name
+        stem = base.split(".")[0]
+        if base in want_basenames or stem in want_basenames:
+            return True
+        return take_libs and _is_lib_or_binary(member_name)
+    lowered = name.lower()
+    if lowered.endswith(".zip"):
+        with zipfile.ZipFile(io.BytesIO(blob)) as z:
+            for info in z.infolist():
+                if not wants(info.filename, not info.is_dir()):
+                    continue
+                target = dest_dir / Path(info.filename).name
+                with z.open(info) as src:
+                    _write_executable(src, target)
+                extracted.append(target)
+    elif lowered.endswith(".tar.gz"):
+        with tarfile.open(fileobj=io.BytesIO(blob), mode="r:gz") as t:
+            for member in t.getmembers():
+                if not wants(member.name, member.isfile()):
+                    continue
+                f = t.extractfile(member)
+                if f is None:
+                    continue
+                target = dest_dir / Path(member.name).name
+                with f:
+                    _write_executable(f, target)
+                extracted.append(target)
+    else:
+        raise RuntimeError(f"Unsupported archive type: {name}")
+    return extracted
+def _relink(link_path: Path, target_name: str) -> None:
+    """Point *link_path* at *target_name* (a name in the same directory), best effort.
+    Linking is a convenience step: a link that cannot be removed or created is left as it is."""
+    if link_path.exists() or link_path.is_symlink():
+        try:
+            link_path.unlink()
+        except OSError:
+            return
+    try:
+        link_path.symlink_to(target_name)
+    except OSError:
+        pass
+def _link_so_versions(directory: Path) -> None:
+    """For each libfoo.so.MAJOR.MINOR[.PATCH], create symlinks libfoo.so.MAJOR and libfoo.so.
+    Versioned ``libfoo.X.Y.dylib`` files likewise get a ``libfoo.dylib`` link.
+    Only real files are used as link targets: symlinks (including ones made
+    by an earlier run) are skipped, and a file is never linked to its own name."""
+    import re
+    so_pattern = re.compile(r"^(lib[\w\-]+\.so)\.([\d.]+)$")
+    dylib_pattern = re.compile(r"^(lib[\w\-]+)\.([\d.]+)\.dylib$")
+    # Snapshot the listing first: links are created in this same directory while looping.
+    for f in list(directory.iterdir()):
+        if f.is_symlink() or not f.is_file():
+            continue
+        m = so_pattern.match(f.name)
+        if m:
+            base = m.group(1)  # "libfoo.so"
+            version = m.group(2)  # e.g. "0.0.9244"
+            major = version.split(".")[0]
+            for link in (f"{base}.{major}", base):
+                # "libfoo.so.0" is its own MAJOR name; relinking it would delete the real file.
+                if link != f.name:
+                    _relink(directory / link, f.name)
+            continue
+        m = dylib_pattern.match(f.name)
+        if m:
+            base = m.group(1)  # "libfoo"
+            _relink(directory / f"{base}.dylib", f.name)
+def install_llama_server(version: str | None = None) -> InstalledBinary:
+    """Download a llama.cpp release and install ``llama-server`` plus its shared libraries.
+    *version* is a release tag or "latest"; it defaults to the configured
+    ``llamacpp_version``. The ``INFERHOST_LLAMACPP_BACKEND`` environment
+    variable is passed on as the preferred backend for asset selection.
+    Raises RuntimeError when the chosen archive does not contain llama-server."""
+    paths.ensure_dirs()
+    version = version or settings().llamacpp_version
+    rel = _release_json(LLAMACPP_REPO, version)
+    os_key, arch, _ = _platform_keys()
+    preferred = os.environ.get("INFERHOST_LLAMACPP_BACKEND")
+    asset = _pick_llamacpp_asset(rel["assets"], os_key, arch, want_gpu=probe().has_gpu, preferred_backend=preferred)
+    blob = _download(asset.download_url)
+    extracted = _extract_archive(
+        blob,
+        asset.name,
+        paths.bin_dir(),
+        want_basenames=("llama-server",),
+        take_libs=True,
+    )
+    target = paths.llama_server_path()
+    # Require that this archive actually delivered the binary; checking only
+    # target.exists() would accept a stale llama-server from an earlier install.
+    delivered = any(p.name == target.name for p in extracted)
+    if not delivered or not target.exists():
+        raise RuntimeError(f"llama-server not found inside {asset.name}")
+    _link_so_versions(paths.bin_dir())
+    return InstalledBinary(path=target, version=rel.get("tag_name", "unknown"))
+def install_llama_swap(version: str | None = None) -> InstalledBinary:
+    """Download a llama-swap release and install its binary at ``paths.llama_swap_path()``.
+    *version* is a release tag or "latest"; it defaults to the configured ``llamaswap_version``.
+    Raises RuntimeError when nothing matching "llama-swap" is found in the archive."""
+    paths.ensure_dirs()
+    release = _release_json(LLAMASWAP_REPO, version or settings().llamaswap_version)
+    swap_os = "darwin" if platform.system().lower() == "darwin" else "linux"
+    swap_arch = _platform_keys()[2]
+    asset = _pick_llamaswap_asset(release["assets"], swap_os, swap_arch)
+    archive_bytes = _download(asset.download_url)
+    members = _extract_archive(
+        archive_bytes,
+        asset.name,
+        paths.bin_dir(),
+        want_basenames=("llama-swap",),
+    )
+    if not members:
+        raise RuntimeError(f"llama-swap binary not found inside {asset.name}")
+    target = paths.llama_swap_path()
+    first = members[0]
+    # Move the first extracted file to the expected location when it landed under another path.
+    if first != target:
+        shutil.move(str(first), str(target))
+    return InstalledBinary(path=target, version=release.get("tag_name", "unknown"))
+def installed_versions() -> dict[str, str | None]:
+    """Report which managed binaries are present on disk.
+    Each value is the literal "installed" when the binary's path exists and
+    None otherwise; no version number is read."""
+    locations = {
+        "llama-server": paths.llama_server_path(),
+        "llama-swap": paths.llama_swap_path(),
+    }
+    return {label: ("installed" if location.exists() else None) for label, location in locations.items()}