llmstack_cli-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. llmstack/__init__.py +3 -0
  2. llmstack/__main__.py +5 -0
  3. llmstack/cli/__init__.py +0 -0
  4. llmstack/cli/app.py +87 -0
  5. llmstack/cli/commands/__init__.py +0 -0
  6. llmstack/cli/commands/doctor.py +72 -0
  7. llmstack/cli/commands/down.py +25 -0
  8. llmstack/cli/commands/init.py +66 -0
  9. llmstack/cli/commands/logs.py +25 -0
  10. llmstack/cli/commands/status.py +45 -0
  11. llmstack/cli/commands/up.py +30 -0
  12. llmstack/cli/console.py +13 -0
  13. llmstack/config/__init__.py +4 -0
  14. llmstack/config/loader.py +44 -0
  15. llmstack/config/presets/__init__.py +11 -0
  16. llmstack/config/presets/agent.py +13 -0
  17. llmstack/config/presets/chat.py +14 -0
  18. llmstack/config/presets/rag.py +10 -0
  19. llmstack/config/schema.py +76 -0
  20. llmstack/core/__init__.py +0 -0
  21. llmstack/core/hardware.py +131 -0
  22. llmstack/core/health.py +23 -0
  23. llmstack/core/resolver.py +49 -0
  24. llmstack/core/stack.py +207 -0
  25. llmstack/docker/__init__.py +0 -0
  26. llmstack/docker/manager.py +134 -0
  27. llmstack/gateway/Dockerfile +16 -0
  28. llmstack/gateway/__init__.py +0 -0
  29. llmstack/gateway/main.py +52 -0
  30. llmstack/gateway/middleware/__init__.py +0 -0
  31. llmstack/gateway/middleware/auth.py +32 -0
  32. llmstack/gateway/middleware/metrics.py +115 -0
  33. llmstack/gateway/proxy.py +58 -0
  34. llmstack/gateway/routes/__init__.py +0 -0
  35. llmstack/gateway/routes/chat.py +27 -0
  36. llmstack/gateway/routes/embeddings.py +17 -0
  37. llmstack/gateway/routes/health.py +55 -0
  38. llmstack/gateway/routes/models.py +16 -0
  39. llmstack/plugins/__init__.py +0 -0
  40. llmstack/plugins/loader.py +5 -0
  41. llmstack/plugins/spec.py +20 -0
  42. llmstack/services/__init__.py +0 -0
  43. llmstack/services/base.py +65 -0
  44. llmstack/services/cache/__init__.py +0 -0
  45. llmstack/services/cache/redis.py +33 -0
  46. llmstack/services/embeddings/__init__.py +0 -0
  47. llmstack/services/embeddings/tei.py +49 -0
  48. llmstack/services/gateway/__init__.py +0 -0
  49. llmstack/services/gateway/service.py +47 -0
  50. llmstack/services/inference/__init__.py +0 -0
  51. llmstack/services/inference/ollama.py +60 -0
  52. llmstack/services/inference/vllm.py +57 -0
  53. llmstack/services/observe/__init__.py +0 -0
  54. llmstack/services/observe/prometheus.py +168 -0
  55. llmstack/services/registry.py +53 -0
  56. llmstack/services/vectordb/__init__.py +0 -0
  57. llmstack/services/vectordb/qdrant.py +33 -0
  58. llmstack_cli-0.1.0.dist-info/METADATA +252 -0
  59. llmstack_cli-0.1.0.dist-info/RECORD +62 -0
  60. llmstack_cli-0.1.0.dist-info/WHEEL +4 -0
  61. llmstack_cli-0.1.0.dist-info/entry_points.txt +2 -0
  62. llmstack_cli-0.1.0.dist-info/licenses/LICENSE +201 -0
llmstack/core/hardware.py ADDED
@@ -0,0 +1,131 @@
+ """Detect GPU, CPU, and RAM available on the host machine."""
+
+ from __future__ import annotations
+
+ import platform
+ import shutil
+ import subprocess
+ from dataclasses import dataclass
+ from typing import Literal
+
+ import psutil
+
+
+ @dataclass(frozen=True)
+ class HardwareProfile:
+     gpu_vendor: Literal["nvidia", "amd", "apple", "none"]
+     gpu_name: str | None
+     gpu_vram_mb: int
+     cpu_cores: int
+     ram_mb: int
+     os: Literal["linux", "darwin", "windows"]
+     docker_runtime: Literal["nvidia", "default"]
+
+
+ def _detect_nvidia() -> tuple[str | None, int]:
+     """Return (gpu_name, vram_mb) via nvidia-smi, or (None, 0)."""
+     if not shutil.which("nvidia-smi"):
+         return None, 0
+     try:
+         out = subprocess.check_output(
+             ["nvidia-smi", "--query-gpu=name,memory.total", "--format=csv,noheader,nounits"],
+             text=True,
+             timeout=5,
+         ).strip()
+         if not out:
+             return None, 0
+         # Take the first GPU
+         line = out.splitlines()[0]
+         name, vram = line.split(",", 1)
+         return name.strip(), int(float(vram.strip()))
+     except (subprocess.SubprocessError, ValueError):
+         return None, 0
+
+
+ def _detect_apple() -> tuple[str | None, int]:
+     """Return (chip_name, unified_memory_mb) on macOS."""
+     if platform.system() != "Darwin":
+         return None, 0
+     try:
+         out = subprocess.check_output(
+             ["sysctl", "-n", "machdep.cpu.brand_string"], text=True, timeout=5
+         ).strip()
+         # On Apple Silicon, unified memory = total RAM
+         ram_bytes = psutil.virtual_memory().total
+         if "Apple" in out:
+             return out, int(ram_bytes / 1024 / 1024)
+         return None, 0
+     except (subprocess.SubprocessError, ValueError):
+         return None, 0
+
+
+ def _check_nvidia_docker() -> bool:
+     """Check if nvidia-container-toolkit is available."""
+     if not shutil.which("nvidia-smi"):
+         return False
+     try:
+         subprocess.check_output(
+             ["docker", "info", "--format", "{{.Runtimes}}"],
+             text=True,
+             timeout=10,
+         )
+         # If nvidia runtime exists, docker info will mention it
+         out = subprocess.check_output(
+             ["docker", "info"], text=True, timeout=10
+         )
+         return "nvidia" in out.lower()
+     except (subprocess.SubprocessError, FileNotFoundError):
+         return False
+
+
+ def detect_hardware() -> HardwareProfile:
+     """Detect hardware capabilities of the host machine."""
+     os_name: Literal["linux", "darwin", "windows"]
+     sys = platform.system()
+     if sys == "Linux":
+         os_name = "linux"
+     elif sys == "Darwin":
+         os_name = "darwin"
+     else:
+         os_name = "windows"
+
+     cpu_cores = psutil.cpu_count(logical=False) or psutil.cpu_count() or 1
+     ram_mb = int(psutil.virtual_memory().total / 1024 / 1024)
+
+     # Try NVIDIA first
+     gpu_name, gpu_vram = _detect_nvidia()
+     if gpu_name:
+         has_nvidia_docker = _check_nvidia_docker()
+         return HardwareProfile(
+             gpu_vendor="nvidia",
+             gpu_name=gpu_name,
+             gpu_vram_mb=gpu_vram,
+             cpu_cores=cpu_cores,
+             ram_mb=ram_mb,
+             os=os_name,
+             docker_runtime="nvidia" if has_nvidia_docker else "default",
+         )
+
+     # Try Apple Silicon
+     gpu_name, gpu_vram = _detect_apple()
+     if gpu_name:
+         return HardwareProfile(
+             gpu_vendor="apple",
+             gpu_name=gpu_name,
+             gpu_vram_mb=gpu_vram,
+             cpu_cores=cpu_cores,
+             ram_mb=ram_mb,
+             os=os_name,
+             docker_runtime="default",
+         )
+
+     # No GPU detected
+     return HardwareProfile(
+         gpu_vendor="none",
+         gpu_name=None,
+         gpu_vram_mb=0,
+         cpu_cores=cpu_cores,
+         ram_mb=ram_mb,
+         os=os_name,
+         docker_runtime="default",
+     )
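For orientation, a minimal usage sketch (not part of the wheel): detect_hardware() is the module's single public entry point and returns a frozen HardwareProfile. The values in the comments are illustrative, not guaranteed output.

# Usage sketch (not part of the package). Output values are illustrative only.
from llmstack.core.hardware import detect_hardware

profile = detect_hardware()
print(profile.gpu_vendor, profile.gpu_vram_mb, profile.docker_runtime)
# e.g. on a Linux box with an RTX 4090 and the NVIDIA container toolkit installed:
#   nvidia 24564 nvidia
# and on a machine with no GPU at all:
#   none 0 default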
llmstack/core/health.py ADDED
@@ -0,0 +1,23 @@
+ """Health check polling for services."""
+
+ from __future__ import annotations
+
+ import asyncio
+
+ import httpx
+
+
+ async def wait_healthy(url: str, timeout: int = 120, interval: float = 2.0) -> bool:
+     """Poll a health endpoint until it returns 200 or timeout expires."""
+     elapsed = 0.0
+     async with httpx.AsyncClient(timeout=5) as client:
+         while elapsed < timeout:
+             try:
+                 resp = await client.get(url)
+                 if resp.status_code == 200:
+                     return True
+             except (httpx.ConnectError, httpx.ReadTimeout, httpx.ConnectTimeout):
+                 pass
+             await asyncio.sleep(interval)
+             elapsed += interval
+     return False
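A minimal sketch of driving wait_healthy from an event loop; the URL below is an assumption (Ollama's default port and its /api/tags endpoint, which the stack orchestrator later treats as a health URL), not something this file defines.

# Sketch only: poll an assumed local Ollama endpoint for up to 60 seconds.
import asyncio

from llmstack.core.health import wait_healthy

async def main() -> None:
    ok = await wait_healthy("http://localhost:11434/api/tags", timeout=60, interval=2.0)
    print("healthy" if ok else "timed out")

asyncio.run(main())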
llmstack/core/resolver.py ADDED
@@ -0,0 +1,49 @@
+ """Resolve 'auto' config values into concrete choices based on hardware."""
+
+ from __future__ import annotations
+
+ from llmstack.config.schema import ModelSpec, EmbeddingSpec
+ from llmstack.core.hardware import HardwareProfile
+
+
+ def resolve_inference_backend(model: ModelSpec, hw: HardwareProfile) -> str:
+     """Pick the best inference backend for the detected hardware."""
+     if model.backend != "auto":
+         return model.backend
+
+     # vLLM requires NVIDIA GPU with sufficient VRAM
+     if hw.gpu_vendor == "nvidia" and hw.gpu_vram_mb >= 16_000:
+         return "vllm"
+
+     # Everything else: Ollama (supports CPU, Apple Silicon, smaller GPUs)
+     return "ollama"
+
+
+ def resolve_embedding_backend(spec: EmbeddingSpec, hw: HardwareProfile) -> str:
+     """Pick the best embedding backend."""
+     if spec.backend != "auto":
+         return spec.backend
+
+     # TEI (Text Embeddings Inference) works well with GPU
+     if hw.gpu_vendor == "nvidia" and hw.gpu_vram_mb >= 4_000:
+         return "tei"
+
+     # Fallback: use Ollama for embeddings too (simpler, works everywhere)
+     return "ollama"
+
+
+ def resolve_quantization(model: ModelSpec, hw: HardwareProfile) -> str | None:
+     """Auto-pick quantization based on available memory."""
+     if model.quantization is not None:
+         return model.quantization
+
+     # Only auto-quantize for very large models on limited hardware
+     model_lower = model.name.lower()
+     if "70b" in model_lower:
+         if hw.gpu_vram_mb < 48_000:
+             return "q4_k_m"
+     if "13b" in model_lower or "14b" in model_lower:
+         if hw.gpu_vram_mb < 16_000:
+             return "q4_k_m"
+
+     return None
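To make the thresholds concrete, a hedged sketch of the resolution logic: ModelSpec's constructor is not shown in this diff, so a SimpleNamespace stands in for it, while HardwareProfile comes from hardware.py above.

# Sketch: 12 GB of VRAM is below the 16 GB vLLM cutoff, and a 70B model on
# less than 48 GB of VRAM gets q4_k_m quantization.
from types import SimpleNamespace

from llmstack.core.hardware import HardwareProfile
from llmstack.core.resolver import resolve_inference_backend, resolve_quantization

hw = HardwareProfile(
    gpu_vendor="nvidia", gpu_name="RTX 3060", gpu_vram_mb=12_000,
    cpu_cores=8, ram_mb=32_000, os="linux", docker_runtime="nvidia",
)
model = SimpleNamespace(name="llama3:70b", backend="auto", quantization=None)  # stand-in for ModelSpec

print(resolve_inference_backend(model, hw))  # -> "ollama"
print(resolve_quantization(model, hw))       # -> "q4_k_m"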
llmstack/core/stack.py ADDED
@@ -0,0 +1,207 @@
+ """Stack orchestrator — manages the full lifecycle of an llmstack deployment."""
+
+ from __future__ import annotations
+
+ import secrets
+
+ from rich.console import Console
+ from rich.table import Table
+
+ from llmstack.config.schema import StackConfig
+ from llmstack.core.hardware import detect_hardware
+ from llmstack.core.health import wait_healthy
+ from llmstack.core.resolver import resolve_inference_backend, resolve_embedding_backend
+ from llmstack.docker.manager import DockerManager
+ from llmstack.services.base import ServiceBase, ServiceStatus, ServiceState
+ from llmstack.services.inference.ollama import OllamaService
+ from llmstack.services.inference.vllm import VllmService
+ from llmstack.services.embeddings.tei import TEIService
+ from llmstack.services.vectordb.qdrant import QdrantService
+ from llmstack.services.cache.redis import RedisService
+ from llmstack.services.gateway.service import GatewayService
+ from llmstack.services.observe.prometheus import PrometheusService, GrafanaService
+
+ console = Console()
+
+
+ class Stack:
+     """Orchestrates boot and teardown of all services."""
+
+     def __init__(self, config: StackConfig):
+         self.config = config
+         self.hw = detect_hardware()
+         self.docker = DockerManager(network_name=config.docker.network)
+         self._services: list[ServiceBase] = []
+
+     def _build_services(self) -> list[ServiceBase]:
+         """Instantiate services in boot order:
+         vectordb -> cache -> inference -> embeddings
+         """
+         services: list[ServiceBase] = []
+
+         # 1. Vector DB
+         services.append(QdrantService(self.config.services.vectors))
+
+         # 2. Cache
+         services.append(RedisService(self.config.services.cache))
+
+         # 3. Inference
+         inference_backend = resolve_inference_backend(self.config.models.chat, self.hw)
+         if inference_backend == "vllm":
+             services.append(VllmService(self.config.models.chat, self.hw))
+         else:
+             services.append(OllamaService(self.config.models.chat, self.hw))
+
+         # 4. Embeddings
+         embed_backend = resolve_embedding_backend(self.config.models.embeddings, self.hw)
+         if embed_backend == "tei":
+             services.append(TEIService(self.config.models.embeddings, self.hw))
+         # If embed_backend == "ollama", we reuse the Ollama container (no extra service)
+
+         # 5. Gateway
+         inference_url = self._resolve_inference_url(services, backend=inference_backend)
+         embeddings_url = self._resolve_embeddings_url(services, embed_backend)
+         qdrant_url = f"http://llmstack-qdrant:{self.config.services.vectors.port}"
+         redis_url = f"redis://llmstack-redis:{self.config.services.cache.port}"
+
+         services.append(GatewayService(
+             config=self.config.gateway,
+             inference_url=inference_url,
+             embeddings_url=embeddings_url,
+             qdrant_url=qdrant_url,
+             redis_url=redis_url,
+         ))
+
+         # 6. Observability (optional)
+         if self.config.observe.metrics:
+             services.append(PrometheusService(self.config.observe))
+             services.append(GrafanaService(self.config.observe))
+
+         return services
+
+     def _resolve_inference_url(self, services: list[ServiceBase], backend: str) -> str:
+         for svc in services:
+             if svc.category == "inference":
+                 return svc.openai_base_url() or ""
+         return ""
+
+     def _resolve_embeddings_url(self, services: list[ServiceBase], backend: str) -> str:
+         for svc in services:
+             if svc.category == "embeddings":
+                 return svc.openai_base_url() or ""
+         # Fallback to inference (Ollama can do embeddings)
+         for svc in services:
+             if svc.category == "inference":
+                 return svc.openai_base_url() or ""
+         return ""
+
+     async def up(self) -> None:
+         """Boot all services in order with health checks."""
+         self._services = self._build_services()
+         self.docker.ensure_network()
+
+         # Generate API key if needed
+         if self.config.gateway.auth == "api_key" and not self.config.gateway.api_keys:
+             key = f"sk-llmstack-{secrets.token_urlsafe(24)}"
+             self.config.gateway.api_keys = [key]
+             console.print(f"\n[bold green]Generated API key:[/] {key}\n")
+
+         for svc in self._services:
+             console.print(f" [cyan]Starting {svc.name}...[/]", end="")
+             self.docker.run_service(svc)
+
+             # Health check (skip Redis — no HTTP health endpoint)
+             if svc.category != "cache":
+                 healthy = await wait_healthy(svc.health_url(), timeout=180)
+                 if not healthy:
+                     console.print(" [red]FAILED[/]")
+                     raise RuntimeError(f"Service {svc.name} failed to start")
+
+             console.print(" [green]ready[/]")
+
+             # Post-start hook (e.g., pull model)
+             if svc.category == "inference":
+                 model_name = self.config.models.chat.name
+                 console.print(f" [cyan]Pulling model {model_name}...[/]", end="")
+             await svc.post_start()
+             if svc.category == "inference":
+                 console.print(" [green]done[/]")
+
+         # Print summary
+         self._print_summary()
+
+     def down(self, remove_volumes: bool = False) -> list[str]:
+         """Stop all services in reverse order."""
+         return self.docker.stop_all(remove_volumes=remove_volumes)
+
+     def status(self) -> list[ServiceStatus]:
+         """Get status of all managed services."""
+         containers = self.docker.list_services()
+         result = []
+         for info in containers:
+             state = ServiceState.RUNNING if info["status"] == "running" else ServiceState.STOPPED
+             ports = info.get("ports", {})
+             port = None
+             if ports:
+                 first = list(ports.values())[0]
+                 if first and isinstance(first, list) and first:
+                     port = first[0].get("HostPort")
+
+             result.append(ServiceStatus(
+                 name=info["name"],
+                 state=state,
+                 port=int(port) if port else None,
+                 container_id=info["container_id"],
+             ))
+         return result
+
+     def _get_inference_url(self) -> str:
+         """Get the OpenAI base URL for the inference service."""
+         for svc in self._services:
+             if svc.category == "inference":
+                 return svc.openai_base_url() or ""
+         return ""
+
+     def _get_embeddings_url(self) -> str:
+         """Get the embeddings URL."""
+         for svc in self._services:
+             if svc.category == "embeddings":
+                 return svc.openai_base_url() or ""
+         # Fallback: use Ollama for embeddings
+         for svc in self._services:
+             if svc.category == "inference" and isinstance(svc, OllamaService):
+                 return svc.openai_base_url() or ""
+         return ""
+
+     def _print_summary(self) -> None:
+         """Print a summary table of running services."""
+         table = Table(title="LLMStack Services", show_header=True)
+         table.add_column("Service", style="cyan")
+         table.add_column("Category")
+         table.add_column("Status", style="green")
+         table.add_column("URL")
+
+         for svc in self._services:
+             url = svc.health_url()
+             # Clean up URL for display
+             for suffix in ["/healthz", "/health", "/api/tags"]:
+                 url = url.replace(suffix, "")
+             table.add_row(svc.name, svc.category, "running", url)
+
+         console.print()
+         console.print(table)
+
+         # Print usage hint
+         inference_svc = next((s for s in self._services if s.category == "inference"), None)
+         if inference_svc:
+             base = inference_svc.health_url()
+             for suffix in ["/healthz", "/health", "/api/tags"]:
+                 base = base.replace(suffix, "")
+             model = self.config.models.chat.name
+             console.print("\n[bold]Try it:[/]")
+             console.print(
+                 f" curl {base}/v1/chat/completions \\\n"
+                 f" -H 'Content-Type: application/json' \\\n"
+                 f" -d '{{\"model\":\"{model}\",\"messages\":[{{\"role\":\"user\",\"content\":\"Hello!\"}}]}}'"
+             )
+             console.print()
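For context, this is roughly how the CLI's up command would drive the orchestrator. The load_config helper name and the llmstack.yaml path are assumptions made for illustration; the loader's actual API lives in llmstack/config/loader.py, which is not shown in this section.

# Hedged sketch: boot the whole stack from a config file, then tear it down.
import asyncio

from llmstack.config.loader import load_config  # assumed helper name, not shown in this diff
from llmstack.core.stack import Stack

config = load_config("llmstack.yaml")  # hypothetical path
stack = Stack(config)
asyncio.run(stack.up())   # vectordb -> cache -> inference -> embeddings -> gateway -> observability
print(stack.status())
stack.down(remove_volumes=False)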
llmstack/docker/__init__.py (file without changes)
llmstack/docker/manager.py ADDED
@@ -0,0 +1,134 @@
+ """Docker container lifecycle management."""
+
+ from __future__ import annotations
+
+ from typing import Iterator
+
+ import docker
+ from docker.errors import NotFound, APIError
+ from docker.models.containers import Container
+
+ from llmstack.services.base import ServiceBase
+
+
+ class DockerManager:
+     """Wraps Docker SDK to manage llmstack containers."""
+
+     LABEL_MANAGED = "llmstack.managed"
+     LABEL_SERVICE = "llmstack.service"
+
+     def __init__(self, network_name: str = "llmstack_net"):
+         try:
+             self.client = docker.from_env()
+             self.client.ping()
+         except docker.errors.DockerException as exc:
+             raise SystemExit(
+                 "Cannot connect to Docker daemon. Is Docker running?\n"
+                 "Install: https://docs.docker.com/get-docker/"
+             ) from exc
+         self.network_name = network_name
+
+     def ensure_network(self) -> None:
+         """Create the bridge network if it doesn't exist."""
+         try:
+             self.client.networks.get(self.network_name)
+         except NotFound:
+             self.client.networks.create(self.network_name, driver="bridge")
+
+     def run_service(self, service: ServiceBase) -> Container:
+         """Start a container for a service. Removes any existing container with the same name."""
+         spec = service.container_spec()
+         name = spec.pop("name", f"llmstack-{service.name}")
+
+         # Remove existing container if present
+         try:
+             existing = self.client.containers.get(name)
+             existing.stop(timeout=10)
+             existing.remove(force=True)
+         except NotFound:
+             pass
+
+         labels = {
+             self.LABEL_MANAGED: "true",
+             self.LABEL_SERVICE: service.name,
+         }
+
+         container = self.client.containers.run(
+             detach=True,
+             name=name,
+             network=self.network_name,
+             labels=labels,
+             **spec,
+         )
+         return container
+
+     def stop_service(self, service_name: str) -> None:
+         """Stop and remove a container by service name."""
+         for container in self._managed_containers():
+             if container.labels.get(self.LABEL_SERVICE) == service_name:
+                 container.stop(timeout=10)
+                 container.remove(force=True)
+                 return
+
+     def stop_all(self, remove_volumes: bool = False) -> list[str]:
+         """Stop and remove all llmstack containers. Returns names of stopped containers."""
+         stopped = []
+         for container in self._managed_containers():
+             name = container.name
+             container.stop(timeout=10)
+             container.remove(force=True)
+             stopped.append(name)
+
+         if remove_volumes:
+             for vol in self.client.volumes.list():
+                 if vol.name.startswith("llmstack_"):
+                     try:
+                         vol.remove(force=True)
+                     except APIError:
+                         pass
+
+         # Remove network
+         try:
+             net = self.client.networks.get(self.network_name)
+             net.remove()
+         except (NotFound, APIError):
+             pass
+
+         return stopped
+
+     def get_container(self, service_name: str) -> Container | None:
+         """Find a running container by service name."""
+         for container in self._managed_containers():
+             if container.labels.get(self.LABEL_SERVICE) == service_name:
+                 return container
+         return None
+
+     def stream_logs(self, service_name: str, follow: bool = True, tail: int = 50) -> Iterator[str]:
+         """Yield decoded log lines from a service container."""
+         container = self.get_container(service_name)
+         if container is None:
+             raise ValueError(f"No running container for service '{service_name}'")
+
+         for chunk in container.logs(stream=True, follow=follow, tail=tail):
+             yield chunk.decode("utf-8", errors="replace")
+
+     def list_services(self) -> list[dict]:
+         """Return info about all managed containers."""
+         result = []
+         for container in self._managed_containers():
+             container.reload()
+             result.append({
+                 "name": container.labels.get(self.LABEL_SERVICE, "unknown"),
+                 "container_name": container.name,
+                 "container_id": container.short_id,
+                 "status": container.status,
+                 "ports": container.ports,
+             })
+         return result
+
+     def _managed_containers(self) -> list[Container]:
+         """List all containers with the llmstack.managed label."""
+         return self.client.containers.list(
+             all=True,
+             filters={"label": f"{self.LABEL_MANAGED}=true"},
+         )
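A standalone sketch of using DockerManager directly (not part of the CLI): it requires a reachable Docker daemon and only touches containers carrying the llmstack.managed label. The "gateway" service name in the log call is an assumption, since service names are defined in modules not shown here.

# Sketch: list llmstack-managed containers and tail one service's logs.
from llmstack.docker.manager import DockerManager

mgr = DockerManager()   # exits with a hint if the Docker daemon is unreachable
mgr.ensure_network()    # idempotent: creates the "llmstack_net" bridge network once
for info in mgr.list_services():
    print(info["name"], info["status"], info["container_id"])

if mgr.get_container("gateway") is not None:  # "gateway" is an assumed service name
    for line in mgr.stream_logs("gateway", follow=False, tail=10):
        print(line, end="")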
llmstack/gateway/Dockerfile ADDED
@@ -0,0 +1,16 @@
+ FROM python:3.11-slim
+
+ WORKDIR /app
+
+ RUN pip install --no-cache-dir \
+     "fastapi>=0.115" \
+     "uvicorn[standard]>=0.30" \
+     "httpx>=0.27" \
+     "starlette>=0.40"
+
+ COPY . /app/llmstack/gateway/
+ ENV PYTHONPATH=/app
+
+ EXPOSE 8000
+
+ CMD ["uvicorn", "llmstack.gateway.main:app", "--host", "0.0.0.0", "--port", "8000"]
llmstack/gateway/__init__.py (file without changes)
llmstack/gateway/main.py ADDED
@@ -0,0 +1,52 @@
+ """LLMStack Gateway — OpenAI-compatible API gateway."""
+
+ from __future__ import annotations
+
+ import os
+
+ from fastapi import FastAPI
+ from fastapi.middleware.cors import CORSMiddleware
+
+ from llmstack.gateway.routes.chat import router as chat_router
+ from llmstack.gateway.routes.embeddings import router as embeddings_router
+ from llmstack.gateway.routes.models import router as models_router
+ from llmstack.gateway.routes.health import router as health_router
+ from llmstack.gateway.middleware.auth import AuthMiddleware
+ from llmstack.gateway.middleware.metrics import MetricsMiddleware
+
+
+ def create_app() -> FastAPI:
+     app = FastAPI(
+         title="LLMStack Gateway",
+         description="OpenAI-compatible API gateway for LLMStack",
+         version="0.1.0",
+     )
+
+     # CORS
+     cors_origins = os.getenv("LLMSTACK_CORS_ORIGINS", "*").split(",")
+     app.add_middleware(
+         CORSMiddleware,
+         allow_origins=cors_origins,
+         allow_credentials=True,
+         allow_methods=["*"],
+         allow_headers=["*"],
+     )
+
+     # Auth
+     api_keys = os.getenv("LLMSTACK_API_KEYS", "")
+     if api_keys:
+         app.add_middleware(AuthMiddleware, api_keys=api_keys.split(","))
+
+     # Metrics
+     app.add_middleware(MetricsMiddleware)
+
+     # Routes
+     app.include_router(chat_router, prefix="/v1")
+     app.include_router(embeddings_router, prefix="/v1")
+     app.include_router(models_router, prefix="/v1")
+     app.include_router(health_router)
+
+     return app
+
+
+ app = create_app()
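From the client side, the gateway speaks the OpenAI wire format under /v1. A hedged sketch, assuming the container is reachable on localhost:8000 (the port exposed in the Dockerfile above) and using placeholder model and key values:

# Client sketch: the model name and API key below are placeholders.
import httpx

resp = httpx.post(
    "http://localhost:8000/v1/chat/completions",
    headers={"Authorization": "Bearer sk-llmstack-REPLACE_ME"},  # only needed when LLMSTACK_API_KEYS is set
    json={
        "model": "llama3",
        "messages": [{"role": "user", "content": "Hello!"}],
    },
    timeout=60,
)
print(resp.status_code, resp.json())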
llmstack/gateway/middleware/__init__.py (file without changes)
llmstack/gateway/middleware/auth.py ADDED
@@ -0,0 +1,32 @@
+ """API key authentication middleware."""
+
+ from __future__ import annotations
+
+ from starlette.middleware.base import BaseHTTPMiddleware
+ from starlette.requests import Request
+ from starlette.responses import JSONResponse
+
+
+ class AuthMiddleware(BaseHTTPMiddleware):
+     def __init__(self, app, api_keys: list[str]):
+         super().__init__(app)
+         self.api_keys = set(k.strip() for k in api_keys if k.strip())
+
+     async def dispatch(self, request: Request, call_next):
+         # Skip auth for health checks and docs
+         if request.url.path in ("/healthz", "/metrics", "/docs", "/openapi.json"):
+             return await call_next(request)
+
+         if not self.api_keys:
+             return await call_next(request)
+
+         auth = request.headers.get("Authorization", "")
+         if auth.startswith("Bearer "):
+             token = auth[7:]
+             if token in self.api_keys:
+                 return await call_next(request)
+
+         return JSONResponse(
+             status_code=401,
+             content={"error": {"message": "Invalid API key", "type": "auth_error"}},
+         )
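To see the middleware's accept/reject behaviour in isolation, a small test sketch (not part of the package): it mounts AuthMiddleware on a throwaway FastAPI app with one route and a made-up key.

# Sketch: requests without a valid Bearer token get a 401; valid ones pass through.
from fastapi import FastAPI
from fastapi.testclient import TestClient

from llmstack.gateway.middleware.auth import AuthMiddleware

app = FastAPI()
app.add_middleware(AuthMiddleware, api_keys=["sk-test"])  # "sk-test" is a made-up key


@app.get("/v1/models")
def list_models() -> dict:
    return {"object": "list", "data": []}


client = TestClient(app)
print(client.get("/v1/models").status_code)                                               # 401
print(client.get("/v1/models", headers={"Authorization": "Bearer sk-test"}).status_code)  # 200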