llmstack-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmstack/__init__.py +3 -0
- llmstack/__main__.py +5 -0
- llmstack/cli/__init__.py +0 -0
- llmstack/cli/app.py +87 -0
- llmstack/cli/commands/__init__.py +0 -0
- llmstack/cli/commands/doctor.py +72 -0
- llmstack/cli/commands/down.py +25 -0
- llmstack/cli/commands/init.py +66 -0
- llmstack/cli/commands/logs.py +25 -0
- llmstack/cli/commands/status.py +45 -0
- llmstack/cli/commands/up.py +30 -0
- llmstack/cli/console.py +13 -0
- llmstack/config/__init__.py +4 -0
- llmstack/config/loader.py +44 -0
- llmstack/config/presets/__init__.py +11 -0
- llmstack/config/presets/agent.py +13 -0
- llmstack/config/presets/chat.py +14 -0
- llmstack/config/presets/rag.py +10 -0
- llmstack/config/schema.py +76 -0
- llmstack/core/__init__.py +0 -0
- llmstack/core/hardware.py +131 -0
- llmstack/core/health.py +23 -0
- llmstack/core/resolver.py +49 -0
- llmstack/core/stack.py +207 -0
- llmstack/docker/__init__.py +0 -0
- llmstack/docker/manager.py +134 -0
- llmstack/gateway/Dockerfile +16 -0
- llmstack/gateway/__init__.py +0 -0
- llmstack/gateway/main.py +52 -0
- llmstack/gateway/middleware/__init__.py +0 -0
- llmstack/gateway/middleware/auth.py +32 -0
- llmstack/gateway/middleware/metrics.py +115 -0
- llmstack/gateway/proxy.py +58 -0
- llmstack/gateway/routes/__init__.py +0 -0
- llmstack/gateway/routes/chat.py +27 -0
- llmstack/gateway/routes/embeddings.py +17 -0
- llmstack/gateway/routes/health.py +55 -0
- llmstack/gateway/routes/models.py +16 -0
- llmstack/plugins/__init__.py +0 -0
- llmstack/plugins/loader.py +5 -0
- llmstack/plugins/spec.py +20 -0
- llmstack/services/__init__.py +0 -0
- llmstack/services/base.py +65 -0
- llmstack/services/cache/__init__.py +0 -0
- llmstack/services/cache/redis.py +33 -0
- llmstack/services/embeddings/__init__.py +0 -0
- llmstack/services/embeddings/tei.py +49 -0
- llmstack/services/gateway/__init__.py +0 -0
- llmstack/services/gateway/service.py +47 -0
- llmstack/services/inference/__init__.py +0 -0
- llmstack/services/inference/ollama.py +60 -0
- llmstack/services/inference/vllm.py +57 -0
- llmstack/services/observe/__init__.py +0 -0
- llmstack/services/observe/prometheus.py +168 -0
- llmstack/services/registry.py +53 -0
- llmstack/services/vectordb/__init__.py +0 -0
- llmstack/services/vectordb/qdrant.py +33 -0
- llmstack_cli-0.1.0.dist-info/METADATA +252 -0
- llmstack_cli-0.1.0.dist-info/RECORD +62 -0
- llmstack_cli-0.1.0.dist-info/WHEEL +4 -0
- llmstack_cli-0.1.0.dist-info/entry_points.txt +2 -0
- llmstack_cli-0.1.0.dist-info/licenses/LICENSE +201 -0
llmstack/core/hardware.py
ADDED
@@ -0,0 +1,131 @@
"""Detect GPU, CPU, and RAM available on the host machine."""

from __future__ import annotations

import platform
import shutil
import subprocess
from dataclasses import dataclass
from typing import Literal

import psutil


@dataclass(frozen=True)
class HardwareProfile:
    gpu_vendor: Literal["nvidia", "amd", "apple", "none"]
    gpu_name: str | None
    gpu_vram_mb: int
    cpu_cores: int
    ram_mb: int
    os: Literal["linux", "darwin", "windows"]
    docker_runtime: Literal["nvidia", "default"]


def _detect_nvidia() -> tuple[str | None, int]:
    """Return (gpu_name, vram_mb) via nvidia-smi, or (None, 0)."""
    if not shutil.which("nvidia-smi"):
        return None, 0
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=name,memory.total", "--format=csv,noheader,nounits"],
            text=True,
            timeout=5,
        ).strip()
        if not out:
            return None, 0
        # Take the first GPU
        line = out.splitlines()[0]
        name, vram = line.split(",", 1)
        return name.strip(), int(float(vram.strip()))
    except (subprocess.SubprocessError, ValueError):
        return None, 0


def _detect_apple() -> tuple[str | None, int]:
    """Return (chip_name, unified_memory_mb) on macOS."""
    if platform.system() != "Darwin":
        return None, 0
    try:
        out = subprocess.check_output(
            ["sysctl", "-n", "machdep.cpu.brand_string"], text=True, timeout=5
        ).strip()
        # On Apple Silicon, unified memory = total RAM
        ram_bytes = psutil.virtual_memory().total
        if "Apple" in out:
            return out, int(ram_bytes / 1024 / 1024)
        return None, 0
    except (subprocess.SubprocessError, ValueError):
        return None, 0


def _check_nvidia_docker() -> bool:
    """Check if nvidia-container-toolkit is available."""
    if not shutil.which("nvidia-smi"):
        return False
    try:
        subprocess.check_output(
            ["docker", "info", "--format", "{{.Runtimes}}"],
            text=True,
            timeout=10,
        )
        # If nvidia runtime exists, docker info will mention it
        out = subprocess.check_output(
            ["docker", "info"], text=True, timeout=10
        )
        return "nvidia" in out.lower()
    except (subprocess.SubprocessError, FileNotFoundError):
        return False


def detect_hardware() -> HardwareProfile:
    """Detect hardware capabilities of the host machine."""
    os_name: Literal["linux", "darwin", "windows"]
    sys = platform.system()
    if sys == "Linux":
        os_name = "linux"
    elif sys == "Darwin":
        os_name = "darwin"
    else:
        os_name = "windows"

    cpu_cores = psutil.cpu_count(logical=False) or psutil.cpu_count() or 1
    ram_mb = int(psutil.virtual_memory().total / 1024 / 1024)

    # Try NVIDIA first
    gpu_name, gpu_vram = _detect_nvidia()
    if gpu_name:
        has_nvidia_docker = _check_nvidia_docker()
        return HardwareProfile(
            gpu_vendor="nvidia",
            gpu_name=gpu_name,
            gpu_vram_mb=gpu_vram,
            cpu_cores=cpu_cores,
            ram_mb=ram_mb,
            os=os_name,
            docker_runtime="nvidia" if has_nvidia_docker else "default",
        )

    # Try Apple Silicon
    gpu_name, gpu_vram = _detect_apple()
    if gpu_name:
        return HardwareProfile(
            gpu_vendor="apple",
            gpu_name=gpu_name,
            gpu_vram_mb=gpu_vram,
            cpu_cores=cpu_cores,
            ram_mb=ram_mb,
            os=os_name,
            docker_runtime="default",
        )

    # No GPU detected
    return HardwareProfile(
        gpu_vendor="none",
        gpu_name=None,
        gpu_vram_mb=0,
        cpu_cores=cpu_cores,
        ram_mb=ram_mb,
        os=os_name,
        docker_runtime="default",
    )
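A minimal sketch of exercising the detector directly (assuming the wheel and its psutil dependency are installed; the values printed are machine-dependent):

from llmstack.core.hardware import detect_hardware

profile = detect_hardware()
# Frozen dataclass fields defined above: vendor, VRAM, cores, RAM, OS, Docker runtime.
print(profile.gpu_vendor, profile.gpu_name)
print(profile.gpu_vram_mb, profile.cpu_cores, profile.ram_mb)
print(profile.os, profile.docker_runtime)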
llmstack/core/health.py
ADDED
@@ -0,0 +1,23 @@
"""Health check polling for services."""

from __future__ import annotations

import asyncio

import httpx


async def wait_healthy(url: str, timeout: int = 120, interval: float = 2.0) -> bool:
    """Poll a health endpoint until it returns 200 or timeout expires."""
    elapsed = 0.0
    async with httpx.AsyncClient(timeout=5) as client:
        while elapsed < timeout:
            try:
                resp = await client.get(url)
                if resp.status_code == 200:
                    return True
            except (httpx.ConnectError, httpx.ReadTimeout, httpx.ConnectTimeout):
                pass
            await asyncio.sleep(interval)
            elapsed += interval
    return False
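A short usage sketch, assuming some service already exposes an HTTP health endpoint on localhost (the URL and timeout below are illustrative):

import asyncio

from llmstack.core.health import wait_healthy

# Poll until the endpoint returns 200 or 30 seconds elapse.
ok = asyncio.run(wait_healthy("http://localhost:6333/healthz", timeout=30))
print("healthy" if ok else "timed out")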
llmstack/core/resolver.py
ADDED
@@ -0,0 +1,49 @@
"""Resolve 'auto' config values into concrete choices based on hardware."""

from __future__ import annotations

from llmstack.config.schema import ModelSpec, EmbeddingSpec
from llmstack.core.hardware import HardwareProfile


def resolve_inference_backend(model: ModelSpec, hw: HardwareProfile) -> str:
    """Pick the best inference backend for the detected hardware."""
    if model.backend != "auto":
        return model.backend

    # vLLM requires NVIDIA GPU with sufficient VRAM
    if hw.gpu_vendor == "nvidia" and hw.gpu_vram_mb >= 16_000:
        return "vllm"

    # Everything else: Ollama (supports CPU, Apple Silicon, smaller GPUs)
    return "ollama"


def resolve_embedding_backend(spec: EmbeddingSpec, hw: HardwareProfile) -> str:
    """Pick the best embedding backend."""
    if spec.backend != "auto":
        return spec.backend

    # TEI (Text Embeddings Inference) works well with GPU
    if hw.gpu_vendor == "nvidia" and hw.gpu_vram_mb >= 4_000:
        return "tei"

    # Fallback: use Ollama for embeddings too (simpler, works everywhere)
    return "ollama"


def resolve_quantization(model: ModelSpec, hw: HardwareProfile) -> str | None:
    """Auto-pick quantization based on available memory."""
    if model.quantization is not None:
        return model.quantization

    # Only auto-quantize for very large models on limited hardware
    model_lower = model.name.lower()
    if "70b" in model_lower:
        if hw.gpu_vram_mb < 48_000:
            return "q4_k_m"
    if "13b" in model_lower or "14b" in model_lower:
        if hw.gpu_vram_mb < 16_000:
            return "q4_k_m"

    return None
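A hedged sketch of how the resolution rules above play out. ModelSpec from llmstack.config.schema is not shown in this diff, so a SimpleNamespace carrying the attributes the resolver actually reads (backend, name, quantization) stands in for it:

from types import SimpleNamespace

from llmstack.core.hardware import HardwareProfile
from llmstack.core.resolver import resolve_inference_backend, resolve_quantization

# Hypothetical hardware profile and model-spec stand-in, for illustration only.
hw = HardwareProfile(
    gpu_vendor="nvidia", gpu_name="RTX 4090", gpu_vram_mb=24_000,
    cpu_cores=16, ram_mb=64_000, os="linux", docker_runtime="nvidia",
)
model = SimpleNamespace(backend="auto", name="llama3:70b", quantization=None)

print(resolve_inference_backend(model, hw))  # "vllm"   (NVIDIA GPU with >= 16 GB VRAM)
print(resolve_quantization(model, hw))       # "q4_k_m" (70B model but < 48 GB VRAM)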
llmstack/core/stack.py
ADDED
@@ -0,0 +1,207 @@
"""Stack orchestrator — manages the full lifecycle of an llmstack deployment."""

from __future__ import annotations

import secrets

from rich.console import Console
from rich.table import Table

from llmstack.config.schema import StackConfig
from llmstack.core.hardware import detect_hardware
from llmstack.core.health import wait_healthy
from llmstack.core.resolver import resolve_inference_backend, resolve_embedding_backend
from llmstack.docker.manager import DockerManager
from llmstack.services.base import ServiceBase, ServiceStatus, ServiceState
from llmstack.services.inference.ollama import OllamaService
from llmstack.services.inference.vllm import VllmService
from llmstack.services.embeddings.tei import TEIService
from llmstack.services.vectordb.qdrant import QdrantService
from llmstack.services.cache.redis import RedisService
from llmstack.services.gateway.service import GatewayService
from llmstack.services.observe.prometheus import PrometheusService, GrafanaService

console = Console()


class Stack:
    """Orchestrates boot and teardown of all services."""

    def __init__(self, config: StackConfig):
        self.config = config
        self.hw = detect_hardware()
        self.docker = DockerManager(network_name=config.docker.network)
        self._services: list[ServiceBase] = []

    def _build_services(self) -> list[ServiceBase]:
        """Instantiate services in boot order:
        vectordb -> cache -> inference -> embeddings
        """
        services: list[ServiceBase] = []

        # 1. Vector DB
        services.append(QdrantService(self.config.services.vectors))

        # 2. Cache
        services.append(RedisService(self.config.services.cache))

        # 3. Inference
        inference_backend = resolve_inference_backend(self.config.models.chat, self.hw)
        if inference_backend == "vllm":
            services.append(VllmService(self.config.models.chat, self.hw))
        else:
            services.append(OllamaService(self.config.models.chat, self.hw))

        # 4. Embeddings
        embed_backend = resolve_embedding_backend(self.config.models.embeddings, self.hw)
        if embed_backend == "tei":
            services.append(TEIService(self.config.models.embeddings, self.hw))
        # If embed_backend == "ollama", we reuse the Ollama container (no extra service)

        # 5. Gateway
        inference_url = self._resolve_inference_url(services, backend=inference_backend)
        embeddings_url = self._resolve_embeddings_url(services, embed_backend)
        qdrant_url = f"http://llmstack-qdrant:{self.config.services.vectors.port}"
        redis_url = f"redis://llmstack-redis:{self.config.services.cache.port}"

        services.append(GatewayService(
            config=self.config.gateway,
            inference_url=inference_url,
            embeddings_url=embeddings_url,
            qdrant_url=qdrant_url,
            redis_url=redis_url,
        ))

        # 6. Observability (optional)
        if self.config.observe.metrics:
            services.append(PrometheusService(self.config.observe))
            services.append(GrafanaService(self.config.observe))

        return services

    def _resolve_inference_url(self, services: list[ServiceBase], backend: str) -> str:
        for svc in services:
            if svc.category == "inference":
                return svc.openai_base_url() or ""
        return ""

    def _resolve_embeddings_url(self, services: list[ServiceBase], backend: str) -> str:
        for svc in services:
            if svc.category == "embeddings":
                return svc.openai_base_url() or ""
        # Fallback to inference (Ollama can do embeddings)
        for svc in services:
            if svc.category == "inference":
                return svc.openai_base_url() or ""
        return ""

    async def up(self) -> None:
        """Boot all services in order with health checks."""
        self._services = self._build_services()
        self.docker.ensure_network()

        # Generate API key if needed
        if self.config.gateway.auth == "api_key" and not self.config.gateway.api_keys:
            key = f"sk-llmstack-{secrets.token_urlsafe(24)}"
            self.config.gateway.api_keys = [key]
            console.print(f"\n[bold green]Generated API key:[/] {key}\n")

        for svc in self._services:
            console.print(f" [cyan]Starting {svc.name}...[/]", end="")
            self.docker.run_service(svc)

            # Health check (skip Redis — no HTTP health endpoint)
            if svc.category != "cache":
                healthy = await wait_healthy(svc.health_url(), timeout=180)
                if not healthy:
                    console.print(" [red]FAILED[/]")
                    raise RuntimeError(f"Service {svc.name} failed to start")

            console.print(" [green]ready[/]")

            # Post-start hook (e.g., pull model)
            if svc.category == "inference":
                model_name = self.config.models.chat.name
                console.print(f" [cyan]Pulling model {model_name}...[/]", end="")
            await svc.post_start()
            if svc.category == "inference":
                console.print(" [green]done[/]")

        # Print summary
        self._print_summary()

    def down(self, remove_volumes: bool = False) -> list[str]:
        """Stop all services in reverse order."""
        return self.docker.stop_all(remove_volumes=remove_volumes)

    def status(self) -> list[ServiceStatus]:
        """Get status of all managed services."""
        containers = self.docker.list_services()
        result = []
        for info in containers:
            state = ServiceState.RUNNING if info["status"] == "running" else ServiceState.STOPPED
            ports = info.get("ports", {})
            port = None
            if ports:
                first = list(ports.values())[0]
                if first and isinstance(first, list) and first:
                    port = first[0].get("HostPort")

            result.append(ServiceStatus(
                name=info["name"],
                state=state,
                port=int(port) if port else None,
                container_id=info["container_id"],
            ))
        return result

    def _get_inference_url(self) -> str:
        """Get the OpenAI base URL for the inference service."""
        for svc in self._services:
            if svc.category == "inference":
                return svc.openai_base_url() or ""
        return ""

    def _get_embeddings_url(self) -> str:
        """Get the embeddings URL."""
        for svc in self._services:
            if svc.category == "embeddings":
                return svc.openai_base_url() or ""
        # Fallback: use Ollama for embeddings
        for svc in self._services:
            if svc.category == "inference" and isinstance(svc, OllamaService):
                return svc.openai_base_url() or ""
        return ""

    def _print_summary(self) -> None:
        """Print a summary table of running services."""
        table = Table(title="LLMStack Services", show_header=True)
        table.add_column("Service", style="cyan")
        table.add_column("Category")
        table.add_column("Status", style="green")
        table.add_column("URL")

        for svc in self._services:
            url = svc.health_url()
            # Clean up URL for display
            for suffix in ["/healthz", "/health", "/api/tags"]:
                url = url.replace(suffix, "")
            table.add_row(svc.name, svc.category, "running", url)

        console.print()
        console.print(table)

        # Print usage hint
        inference_svc = next((s for s in self._services if s.category == "inference"), None)
        if inference_svc:
            base = inference_svc.health_url()
            for suffix in ["/healthz", "/health", "/api/tags"]:
                base = base.replace(suffix, "")
            model = self.config.models.chat.name
            console.print("\n[bold]Try it:[/]")
            console.print(
                f" curl {base}/v1/chat/completions \\\n"
                f" -H 'Content-Type: application/json' \\\n"
                f" -d '{{\"model\":\"{model}\",\"messages\":[{{\"role\":\"user\",\"content\":\"Hello!\"}}]}}'"
            )
            console.print()
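A sketch of the orchestrator's lifecycle driven from Python rather than the CLI. The config loader's API is not part of this excerpt, so load_config below is a hypothetical stand-in for whatever llmstack.config.loader actually exposes:

import asyncio

from llmstack.core.stack import Stack
# Hypothetical helper; llmstack/config/loader.py is not shown in this diff.
from llmstack.config.loader import load_config

config = load_config("llmstack.yaml")   # hypothetical file name/format
stack = Stack(config)

asyncio.run(stack.up())                  # boot services in order, with health checks
for status in stack.status():            # inspect managed containers
    print(status.name, status.state, status.port)
stack.down(remove_volumes=False)         # stop everything, keep data volumes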
llmstack/docker/__init__.py
File without changes
llmstack/docker/manager.py
ADDED
@@ -0,0 +1,134 @@
"""Docker container lifecycle management."""

from __future__ import annotations

from typing import Iterator

import docker
from docker.errors import NotFound, APIError
from docker.models.containers import Container

from llmstack.services.base import ServiceBase


class DockerManager:
    """Wraps Docker SDK to manage llmstack containers."""

    LABEL_MANAGED = "llmstack.managed"
    LABEL_SERVICE = "llmstack.service"

    def __init__(self, network_name: str = "llmstack_net"):
        try:
            self.client = docker.from_env()
            self.client.ping()
        except docker.errors.DockerException as exc:
            raise SystemExit(
                "Cannot connect to Docker daemon. Is Docker running?\n"
                "Install: https://docs.docker.com/get-docker/"
            ) from exc
        self.network_name = network_name

    def ensure_network(self) -> None:
        """Create the bridge network if it doesn't exist."""
        try:
            self.client.networks.get(self.network_name)
        except NotFound:
            self.client.networks.create(self.network_name, driver="bridge")

    def run_service(self, service: ServiceBase) -> Container:
        """Start a container for a service. Removes any existing container with the same name."""
        spec = service.container_spec()
        name = spec.pop("name", f"llmstack-{service.name}")

        # Remove existing container if present
        try:
            existing = self.client.containers.get(name)
            existing.stop(timeout=10)
            existing.remove(force=True)
        except NotFound:
            pass

        labels = {
            self.LABEL_MANAGED: "true",
            self.LABEL_SERVICE: service.name,
        }

        container = self.client.containers.run(
            detach=True,
            name=name,
            network=self.network_name,
            labels=labels,
            **spec,
        )
        return container

    def stop_service(self, service_name: str) -> None:
        """Stop and remove a container by service name."""
        for container in self._managed_containers():
            if container.labels.get(self.LABEL_SERVICE) == service_name:
                container.stop(timeout=10)
                container.remove(force=True)
                return

    def stop_all(self, remove_volumes: bool = False) -> list[str]:
        """Stop and remove all llmstack containers. Returns names of stopped containers."""
        stopped = []
        for container in self._managed_containers():
            name = container.name
            container.stop(timeout=10)
            container.remove(force=True)
            stopped.append(name)

        if remove_volumes:
            for vol in self.client.volumes.list():
                if vol.name.startswith("llmstack_"):
                    try:
                        vol.remove(force=True)
                    except APIError:
                        pass

        # Remove network
        try:
            net = self.client.networks.get(self.network_name)
            net.remove()
        except (NotFound, APIError):
            pass

        return stopped

    def get_container(self, service_name: str) -> Container | None:
        """Find a running container by service name."""
        for container in self._managed_containers():
            if container.labels.get(self.LABEL_SERVICE) == service_name:
                return container
        return None

    def stream_logs(self, service_name: str, follow: bool = True, tail: int = 50) -> Iterator[str]:
        """Yield decoded log lines from a service container."""
        container = self.get_container(service_name)
        if container is None:
            raise ValueError(f"No running container for service '{service_name}'")

        for chunk in container.logs(stream=True, follow=follow, tail=tail):
            yield chunk.decode("utf-8", errors="replace")

    def list_services(self) -> list[dict]:
        """Return info about all managed containers."""
        result = []
        for container in self._managed_containers():
            container.reload()
            result.append({
                "name": container.labels.get(self.LABEL_SERVICE, "unknown"),
                "container_name": container.name,
                "container_id": container.short_id,
                "status": container.status,
                "ports": container.ports,
            })
        return result

    def _managed_containers(self) -> list[Container]:
        """List all containers with the llmstack.managed label."""
        return self.client.containers.list(
            all=True,
            filters={"label": f"{self.LABEL_MANAGED}=true"},
        )
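A small sketch of using the manager on its own (a Docker daemon must be running; the "gateway" service name passed to stream_logs is illustrative and must match an existing llmstack.service label):

from llmstack.docker.manager import DockerManager

manager = DockerManager(network_name="llmstack_net")
manager.ensure_network()

# List whatever llmstack-labelled containers exist, running or stopped.
for info in manager.list_services():
    print(info["name"], info["status"], info["ports"])

# Tail recent logs from one service; raises ValueError if no such container.
for line in manager.stream_logs("gateway", follow=False, tail=20):
    print(line, end="")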
llmstack/gateway/Dockerfile
ADDED
@@ -0,0 +1,16 @@
FROM python:3.11-slim

WORKDIR /app

RUN pip install --no-cache-dir \
    "fastapi>=0.115" \
    "uvicorn[standard]>=0.30" \
    "httpx>=0.27" \
    "starlette>=0.40"

COPY . /app/llmstack/gateway/
ENV PYTHONPATH=/app

EXPOSE 8000

CMD ["uvicorn", "llmstack.gateway.main:app", "--host", "0.0.0.0", "--port", "8000"]
llmstack/gateway/__init__.py
File without changes
llmstack/gateway/main.py
ADDED
@@ -0,0 +1,52 @@
"""LLMStack Gateway — OpenAI-compatible API gateway."""

from __future__ import annotations

import os

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from llmstack.gateway.routes.chat import router as chat_router
from llmstack.gateway.routes.embeddings import router as embeddings_router
from llmstack.gateway.routes.models import router as models_router
from llmstack.gateway.routes.health import router as health_router
from llmstack.gateway.middleware.auth import AuthMiddleware
from llmstack.gateway.middleware.metrics import MetricsMiddleware


def create_app() -> FastAPI:
    app = FastAPI(
        title="LLMStack Gateway",
        description="OpenAI-compatible API gateway for LLMStack",
        version="0.1.0",
    )

    # CORS
    cors_origins = os.getenv("LLMSTACK_CORS_ORIGINS", "*").split(",")
    app.add_middleware(
        CORSMiddleware,
        allow_origins=cors_origins,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Auth
    api_keys = os.getenv("LLMSTACK_API_KEYS", "")
    if api_keys:
        app.add_middleware(AuthMiddleware, api_keys=api_keys.split(","))

    # Metrics
    app.add_middleware(MetricsMiddleware)

    # Routes
    app.include_router(chat_router, prefix="/v1")
    app.include_router(embeddings_router, prefix="/v1")
    app.include_router(models_router, prefix="/v1")
    app.include_router(health_router)

    return app


app = create_app()
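A minimal sketch of running the gateway outside its container for local development; the port and environment variable values below are illustrative, not prescribed by the package:

import os

import uvicorn

from llmstack.gateway.main import create_app

# Optional: enable API-key auth the same way the container would, via env vars,
# before create_app() reads them.
os.environ["LLMSTACK_API_KEYS"] = "sk-test"

uvicorn.run(create_app(), host="127.0.0.1", port=8000)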
llmstack/gateway/middleware/__init__.py
File without changes
llmstack/gateway/middleware/auth.py
ADDED
@@ -0,0 +1,32 @@
"""API key authentication middleware."""

from __future__ import annotations

from starlette.middleware.base import BaseHTTPMiddleware
from starlette.requests import Request
from starlette.responses import JSONResponse


class AuthMiddleware(BaseHTTPMiddleware):
    def __init__(self, app, api_keys: list[str]):
        super().__init__(app)
        self.api_keys = set(k.strip() for k in api_keys if k.strip())

    async def dispatch(self, request: Request, call_next):
        # Skip auth for health checks and docs
        if request.url.path in ("/healthz", "/metrics", "/docs", "/openapi.json"):
            return await call_next(request)

        if not self.api_keys:
            return await call_next(request)

        auth = request.headers.get("Authorization", "")
        if auth.startswith("Bearer "):
            token = auth[7:]
            if token in self.api_keys:
                return await call_next(request)

        return JSONResponse(
            status_code=401,
            content={"error": {"message": "Invalid API key", "type": "auth_error"}},
        )
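And a client-side sketch of what the middleware expects: a Bearer token drawn from the configured key list. The host, port, key, and model name below are placeholders:

import httpx

resp = httpx.post(
    "http://localhost:8000/v1/chat/completions",
    headers={"Authorization": "Bearer sk-llmstack-REPLACE_ME"},  # placeholder key
    json={"model": "llama3", "messages": [{"role": "user", "content": "Hello!"}]},
    timeout=60,
)
print(resp.status_code)
print(resp.json())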