llmstack_cli-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. llmstack/__init__.py +3 -0
  2. llmstack/__main__.py +5 -0
  3. llmstack/cli/__init__.py +0 -0
  4. llmstack/cli/app.py +87 -0
  5. llmstack/cli/commands/__init__.py +0 -0
  6. llmstack/cli/commands/doctor.py +72 -0
  7. llmstack/cli/commands/down.py +25 -0
  8. llmstack/cli/commands/init.py +66 -0
  9. llmstack/cli/commands/logs.py +25 -0
  10. llmstack/cli/commands/status.py +45 -0
  11. llmstack/cli/commands/up.py +30 -0
  12. llmstack/cli/console.py +13 -0
  13. llmstack/config/__init__.py +4 -0
  14. llmstack/config/loader.py +44 -0
  15. llmstack/config/presets/__init__.py +11 -0
  16. llmstack/config/presets/agent.py +13 -0
  17. llmstack/config/presets/chat.py +14 -0
  18. llmstack/config/presets/rag.py +10 -0
  19. llmstack/config/schema.py +76 -0
  20. llmstack/core/__init__.py +0 -0
  21. llmstack/core/hardware.py +131 -0
  22. llmstack/core/health.py +23 -0
  23. llmstack/core/resolver.py +49 -0
  24. llmstack/core/stack.py +207 -0
  25. llmstack/docker/__init__.py +0 -0
  26. llmstack/docker/manager.py +134 -0
  27. llmstack/gateway/Dockerfile +16 -0
  28. llmstack/gateway/__init__.py +0 -0
  29. llmstack/gateway/main.py +52 -0
  30. llmstack/gateway/middleware/__init__.py +0 -0
  31. llmstack/gateway/middleware/auth.py +32 -0
  32. llmstack/gateway/middleware/metrics.py +115 -0
  33. llmstack/gateway/proxy.py +58 -0
  34. llmstack/gateway/routes/__init__.py +0 -0
  35. llmstack/gateway/routes/chat.py +27 -0
  36. llmstack/gateway/routes/embeddings.py +17 -0
  37. llmstack/gateway/routes/health.py +55 -0
  38. llmstack/gateway/routes/models.py +16 -0
  39. llmstack/plugins/__init__.py +0 -0
  40. llmstack/plugins/loader.py +5 -0
  41. llmstack/plugins/spec.py +20 -0
  42. llmstack/services/__init__.py +0 -0
  43. llmstack/services/base.py +65 -0
  44. llmstack/services/cache/__init__.py +0 -0
  45. llmstack/services/cache/redis.py +33 -0
  46. llmstack/services/embeddings/__init__.py +0 -0
  47. llmstack/services/embeddings/tei.py +49 -0
  48. llmstack/services/gateway/__init__.py +0 -0
  49. llmstack/services/gateway/service.py +47 -0
  50. llmstack/services/inference/__init__.py +0 -0
  51. llmstack/services/inference/ollama.py +60 -0
  52. llmstack/services/inference/vllm.py +57 -0
  53. llmstack/services/observe/__init__.py +0 -0
  54. llmstack/services/observe/prometheus.py +168 -0
  55. llmstack/services/registry.py +53 -0
  56. llmstack/services/vectordb/__init__.py +0 -0
  57. llmstack/services/vectordb/qdrant.py +33 -0
  58. llmstack_cli-0.1.0.dist-info/METADATA +252 -0
  59. llmstack_cli-0.1.0.dist-info/RECORD +62 -0
  60. llmstack_cli-0.1.0.dist-info/WHEEL +4 -0
  61. llmstack_cli-0.1.0.dist-info/entry_points.txt +2 -0
  62. llmstack_cli-0.1.0.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,115 @@
+ """Metrics middleware — tracks request counts, latencies, and tokens in Prometheus format."""
+
+ from __future__ import annotations
+
+ import time
+ from collections import defaultdict
+ from threading import Lock
+
+ from starlette.middleware.base import BaseHTTPMiddleware
+ from starlette.requests import Request
+
+ _lock = Lock()
+ _request_count: dict[str, int] = defaultdict(int)
+ _error_count: dict[str, int] = defaultdict(int)
+ _tokens_in: int = 0
+ _tokens_out: int = 0
+
+ # Histogram buckets for latency
+ _BUCKETS = [0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0]
+ _latency_buckets: dict[str, list[int]] = defaultdict(lambda: [0] * (len(_BUCKETS) + 1))
+ _latency_sum: dict[str, float] = defaultdict(float)
+ _latency_count: dict[str, int] = defaultdict(int)
+
+
+ class MetricsMiddleware(BaseHTTPMiddleware):
+     async def dispatch(self, request: Request, call_next):
+         path = request.url.path
+         if path in ("/metrics", "/healthz"):
+             return await call_next(request)
+
+         start = time.monotonic()
+         response = await call_next(request)
+         duration = time.monotonic() - start
+
+         with _lock:
+             _request_count[path] += 1
+             _latency_sum[path] += duration
+             _latency_count[path] += 1
+
+             # Bucket assignment: count the request in the first bucket it fits;
+             # anything slower than the largest bound only lands in the +Inf slot.
+             buckets = _latency_buckets[path]
+             for i, bound in enumerate(_BUCKETS):
+                 if duration <= bound:
+                     buckets[i] += 1
+                     break
+             else:
+                 buckets[-1] += 1  # +Inf overflow
+
+             if response.status_code >= 400:
+                 _error_count[path] += 1
+
+         return response
+
+
+ def record_tokens(input_tokens: int = 0, output_tokens: int = 0) -> None:
+     """Record token usage from a chat completion response."""
+     global _tokens_in, _tokens_out
+     with _lock:
+         _tokens_in += input_tokens
+         _tokens_out += output_tokens
+
+
+ def get_metrics() -> dict:
+     """Return metrics as JSON (for the /metrics JSON endpoint)."""
+     with _lock:
+         result = {}
+         for path in _request_count:
+             result[path] = {
+                 "requests": _request_count[path],
+                 "errors": _error_count.get(path, 0),
+                 "latency_avg_ms": round(
+                     (_latency_sum[path] / _latency_count[path]) * 1000, 1
+                 ) if _latency_count[path] else 0,
+             }
+         result["tokens"] = {"input": _tokens_in, "output": _tokens_out}
+         return result
+
+
+ def get_prometheus_metrics() -> str:
+     """Return metrics in Prometheus exposition format."""
+     lines: list[str] = []
+
+     with _lock:
+         # Request counter
+         lines.append("# HELP llmstack_requests_total Total HTTP requests")
+         lines.append("# TYPE llmstack_requests_total counter")
+         for path, count in _request_count.items():
+             lines.append(f'llmstack_requests_total{{path="{path}"}} {count}')
+
+         # Error counter
+         lines.append("# HELP llmstack_errors_total Total HTTP errors (4xx/5xx)")
+         lines.append("# TYPE llmstack_errors_total counter")
+         for path, count in _error_count.items():
+             lines.append(f'llmstack_errors_total{{path="{path}"}} {count}')
+
+         # Latency histogram (bucket counts are exposed cumulatively, per Prometheus convention)
+         lines.append("# HELP llmstack_request_duration_seconds Request latency histogram")
+         lines.append("# TYPE llmstack_request_duration_seconds histogram")
+         for path in _latency_count:
+             buckets = _latency_buckets[path]
+             cumulative = 0
+             for i, bound in enumerate(_BUCKETS):
+                 cumulative += buckets[i]
+                 lines.append(f'llmstack_request_duration_seconds_bucket{{path="{path}",le="{bound}"}} {cumulative}')
+             cumulative += buckets[-1]
+             lines.append(f'llmstack_request_duration_seconds_bucket{{path="{path}",le="+Inf"}} {cumulative}')
+             lines.append(f'llmstack_request_duration_seconds_sum{{path="{path}"}} {_latency_sum[path]:.4f}')
+             lines.append(f'llmstack_request_duration_seconds_count{{path="{path}"}} {_latency_count[path]}')
+
+         # Token counter
+         lines.append("# HELP llmstack_tokens_total Total tokens processed")
+         lines.append("# TYPE llmstack_tokens_total counter")
+         lines.append(f'llmstack_tokens_total{{type="input"}} {_tokens_in}')
+         lines.append(f'llmstack_tokens_total{{type="output"}} {_tokens_out}')
+
+     lines.append("")
+     return "\n".join(lines)
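For reference, the exposition text produced by get_prometheus_metrics() above looks roughly like the following; the path label and counts are illustrative, not taken from a real run:

    # HELP llmstack_requests_total Total HTTP requests
    # TYPE llmstack_requests_total counter
    llmstack_requests_total{path="/v1/chat/completions"} 42
    # HELP llmstack_request_duration_seconds Request latency histogram
    # TYPE llmstack_request_duration_seconds histogram
    llmstack_request_duration_seconds_bucket{path="/v1/chat/completions",le="0.05"} 3
    llmstack_request_duration_seconds_bucket{path="/v1/chat/completions",le="0.1"} 9
    llmstack_request_duration_seconds_bucket{path="/v1/chat/completions",le="+Inf"} 42
    llmstack_request_duration_seconds_sum{path="/v1/chat/completions"} 51.2340
    llmstack_request_duration_seconds_count{path="/v1/chat/completions"} 42
    llmstack_tokens_total{type="input"} 18234
    llmstack_tokens_total{type="output"} 9611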
@@ -0,0 +1,58 @@
+ """Proxy layer — forwards requests to inference and embedding backends."""
+
+ from __future__ import annotations
+
+ import os
+ from typing import AsyncIterator
+
+ import httpx
+
+ INFERENCE_URL = os.getenv("LLMSTACK_INFERENCE_URL", "http://llmstack-ollama:11434/v1")
+ EMBEDDINGS_URL = os.getenv("LLMSTACK_EMBEDDINGS_URL", "")
+
+ # Timeout for inference (can be long for large models)
+ REQUEST_TIMEOUT = int(os.getenv("LLMSTACK_REQUEST_TIMEOUT", "120"))
+
+
+ async def proxy_chat_completion(payload: dict, stream: bool = False) -> dict | AsyncIterator[bytes]:
+     """Forward a chat completion request to the inference backend."""
+     url = f"{INFERENCE_URL}/chat/completions"
+     timeout = httpx.Timeout(REQUEST_TIMEOUT, connect=10)
+
+     if stream:
+         return _stream_response(url, payload, timeout)
+     else:
+         async with httpx.AsyncClient(timeout=timeout) as client:
+             resp = await client.post(url, json=payload)
+             resp.raise_for_status()
+             return resp.json()
+
+
+ async def _stream_response(url: str, payload: dict, timeout: httpx.Timeout) -> AsyncIterator[bytes]:
+     """Stream SSE chunks from the inference backend."""
+     async with httpx.AsyncClient(timeout=timeout) as client:
+         async with client.stream("POST", url, json=payload) as resp:
+             resp.raise_for_status()
+             async for chunk in resp.aiter_bytes():
+                 yield chunk
+
+
+ async def proxy_embeddings(payload: dict) -> dict:
+     """Forward an embeddings request to the embedding backend."""
+     url = EMBEDDINGS_URL or INFERENCE_URL
+     if not url.endswith("/embeddings"):
+         url = f"{url}/embeddings"
+
+     async with httpx.AsyncClient(timeout=30) as client:
+         resp = await client.post(url, json=payload)
+         resp.raise_for_status()
+         return resp.json()
+
+
+ async def proxy_models() -> dict:
+     """List available models from the inference backend."""
+     url = f"{INFERENCE_URL}/models"
+     async with httpx.AsyncClient(timeout=10) as client:
+         resp = await client.get(url)
+         resp.raise_for_status()
+         return resp.json()
File without changes
@@ -0,0 +1,27 @@
+ """POST /v1/chat/completions — OpenAI-compatible chat endpoint."""
+
+ from __future__ import annotations
+
+ from fastapi import APIRouter, Request
+ from fastapi.responses import JSONResponse, StreamingResponse
+
+ from llmstack.gateway.proxy import proxy_chat_completion
+
+ router = APIRouter()
+
+
+ @router.post("/chat/completions")
+ async def chat_completions(request: Request):
+     payload = await request.json()
+     stream = payload.get("stream", False)
+
+     if stream:
+         chunks = await proxy_chat_completion(payload, stream=True)
+         return StreamingResponse(
+             chunks,
+             media_type="text/event-stream",
+             headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
+         )
+     else:
+         result = await proxy_chat_completion(payload, stream=False)
+         return JSONResponse(content=result)
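A minimal client sketch against this OpenAI-compatible route, assuming the gateway is published on localhost:8000 and no API key is required (if LLMSTACK_API_KEYS is set, the auth middleware, not shown in this hunk, will also expect a credential); the model name is illustrative:

    import httpx

    payload = {
        "model": "llama3.2",  # whichever model the inference backend serves
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "stream": False,
    }
    resp = httpx.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=120)
    resp.raise_for_status()
    # OpenAI-style response shape from the proxied backend
    print(resp.json()["choices"][0]["message"]["content"])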
@@ -0,0 +1,17 @@
+ """POST /v1/embeddings — OpenAI-compatible embeddings endpoint."""
+
+ from __future__ import annotations
+
+ from fastapi import APIRouter, Request
+ from fastapi.responses import JSONResponse
+
+ from llmstack.gateway.proxy import proxy_embeddings
+
+ router = APIRouter()
+
+
+ @router.post("/embeddings")
+ async def embeddings(request: Request):
+     payload = await request.json()
+     result = await proxy_embeddings(payload)
+     return JSONResponse(content=result)
@@ -0,0 +1,55 @@
+ """GET /healthz — gateway health check."""
+
+ from __future__ import annotations
+
+ import os
+
+ import httpx
+ from fastapi import APIRouter
+ from fastapi.responses import JSONResponse
+
+ router = APIRouter()
+
+ INFERENCE_URL = os.getenv("LLMSTACK_INFERENCE_URL", "")
+ QDRANT_URL = os.getenv("LLMSTACK_QDRANT_URL", "")
+ REDIS_URL = os.getenv("LLMSTACK_REDIS_URL", "")
+
+
+ async def _check_url(url: str) -> bool:
+     if not url:
+         return False
+     try:
+         async with httpx.AsyncClient(timeout=5) as client:
+             resp = await client.get(url)
+             return resp.status_code == 200
+     except Exception:
+         return False
+
+
+ @router.get("/healthz")
+ async def healthz():
+     checks = {}
+
+     if INFERENCE_URL:
+         checks["inference"] = await _check_url(INFERENCE_URL.replace("/v1", "/health"))
+
+     if QDRANT_URL:
+         checks["qdrant"] = await _check_url(f"{QDRANT_URL}/healthz")
+
+     all_ok = all(checks.values()) if checks else True
+     status_code = 200 if all_ok else 503
+
+     return JSONResponse(
+         content={"status": "ok" if all_ok else "degraded", "services": checks},
+         status_code=status_code,
+     )
+
+
+ @router.get("/metrics")
+ async def metrics():
+     from llmstack.gateway.middleware.metrics import get_prometheus_metrics
+     from fastapi.responses import PlainTextResponse
+     return PlainTextResponse(
+         content=get_prometheus_metrics(),
+         media_type="text/plain; version=0.0.4; charset=utf-8",
+     )
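For illustration, a degraded /healthz response from the handler above would look like this (service states are made up; the body shape follows the JSONResponse built in the code):

    HTTP 503
    {"status": "degraded", "services": {"inference": true, "qdrant": false}}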
@@ -0,0 +1,16 @@
+ """GET /v1/models — list available models."""
+
+ from __future__ import annotations
+
+ from fastapi import APIRouter
+ from fastapi.responses import JSONResponse
+
+ from llmstack.gateway.proxy import proxy_models
+
+ router = APIRouter()
+
+
+ @router.get("/models")
+ async def list_models():
+     result = await proxy_models()
+     return JSONResponse(content=result)
File without changes
@@ -0,0 +1,5 @@
+ """Plugin loader — re-exports the registry for convenience."""
+
+ from llmstack.services.registry import ServiceRegistry
+
+ __all__ = ["ServiceRegistry"]
@@ -0,0 +1,20 @@
+ """Plugin specification.
+
+ To create an llmstack plugin:
+
+ 1. Create a class that extends ServiceBase
+ 2. Set `name` and `category` class attributes
+ 3. Implement: container_spec(), health_url()
+ 4. Optionally implement: post_start(), openai_base_url()
+ 5. Register via entry_points in pyproject.toml:
+
+    [project.entry-points."llmstack.services"]
+    my_service = "my_package:MyService"
+
+ 6. Publish to PyPI. Users install with:
+    pip install llmstack-plugin-myservice
+ """
+
+ from llmstack.services.base import ServiceBase
+
+ __all__ = ["ServiceBase"]
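As a sketch of the steps listed in the spec.py docstring above, a minimal third-party plugin might look like the following; MyService, the image, and the port are hypothetical and not part of this package:

    from typing import Any

    from llmstack.services.base import ServiceBase


    class MyService(ServiceBase):
        name = "myservice"        # step 2: class attributes
        category = "vectordb"

        def container_spec(self) -> dict[str, Any]:
            # step 3: kwargs for the Docker run call; image, ports, environment at minimum
            return {
                "image": "example/myservice:latest",  # hypothetical image
                "name": "llmstack-myservice",
                "ports": {"9000/tcp": 9000},
                "environment": {},
            }

        def health_url(self) -> str:
            return "http://localhost:9000/health"

Registered under the [project.entry-points."llmstack.services"] group shown in the docstring, such a class can then be discovered alongside the built-in services.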
File without changes
@@ -0,0 +1,65 @@
+ """Base class for all managed services."""
+
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+ from enum import Enum
+ from typing import Any
+
+
+ class ServiceState(str, Enum):
+     STOPPED = "stopped"
+     STARTING = "starting"
+     RUNNING = "running"
+     UNHEALTHY = "unhealthy"
+     ERROR = "error"
+
+
+ @dataclass
+ class ServiceStatus:
+     name: str
+     state: ServiceState
+     port: int | None = None
+     container_id: str | None = None
+     message: str = ""
+
+
+ class ServiceBase(ABC):
+     """Every llmstack service (inference, vectordb, cache, etc.) implements this."""
+
+     name: str
+     category: str  # inference, vectordb, cache, embeddings, gateway, observe
+
+     @abstractmethod
+     def container_spec(self) -> dict[str, Any]:
+         """Return kwargs for docker.containers.run().
+
+         Must include at least: image, ports, environment.
+         May include: volumes, device_requests, healthcheck, command.
+         """
+
+     @abstractmethod
+     def health_url(self) -> str:
+         """HTTP URL to GET for health checks (from the host)."""
+
+     async def post_start(self) -> None:
+         """Hook called after the container is healthy.
+
+         Override for actions like pulling a model.
+         """
+
+     def openai_base_url(self) -> str | None:
+         """If this service exposes an OpenAI-compatible API, return its internal Docker URL."""
+         return None
+
+     def internal_url(self) -> str:
+         """Return the URL reachable from other containers on the Docker network."""
+         spec = self.container_spec()
+         ports = spec.get("ports", {})
+         # Other containers reach this service by its container name; the keys of the
+         # ports mapping are the container ports (e.g. "8000/tcp"), the values are host ports.
+         host = spec.get("name", self.name)
+         if isinstance(ports, dict) and ports:
+             container_port = str(next(iter(ports))).split("/")[0]
+             return f"http://{host}:{container_port}"
+         return f"http://{host}"
File without changes
@@ -0,0 +1,33 @@
+ """Redis cache service."""
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ from llmstack.config.schema import CacheConfig
+ from llmstack.services.base import ServiceBase
+
+
+ class RedisService(ServiceBase):
+     name = "redis"
+     category = "cache"
+
+     def __init__(self, config: CacheConfig):
+         self.config = config
+
+     def container_spec(self) -> dict[str, Any]:
+         return {
+             "image": "redis:7-alpine",
+             "name": "llmstack-redis",
+             "ports": {"6379/tcp": self.config.port},
+             "command": [
+                 "redis-server",
+                 "--maxmemory", self.config.max_memory,
+                 "--maxmemory-policy", "allkeys-lru",
+             ],
+             "environment": {},
+         }
+
+     def health_url(self) -> str:
+         # Redis has no HTTP health endpoint; health is checked via TCP on this port.
+         return f"http://localhost:{self.config.port}"
File without changes
@@ -0,0 +1,49 @@
+ """HuggingFace Text Embeddings Inference (TEI) service."""
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ from llmstack.config.schema import EmbeddingSpec
+ from llmstack.core.hardware import HardwareProfile
+ from llmstack.services.base import ServiceBase
+
+
+ class TEIService(ServiceBase):
+     name = "tei"
+     category = "embeddings"
+
+     def __init__(self, spec: EmbeddingSpec, hw: HardwareProfile):
+         self.spec = spec
+         self.hw = hw
+         self.host_port = 8002
+
+     def container_spec(self) -> dict[str, Any]:
+         cmd = ["--model-id", self.spec.name, "--port", "80"]
+
+         spec: dict[str, Any] = {
+             "image": "ghcr.io/huggingface/text-embeddings-inference:cpu-latest",
+             "name": "llmstack-tei",
+             "ports": {"80/tcp": self.host_port},
+             "command": cmd,
+             "volumes": {
+                 "llmstack_tei_cache": {"bind": "/data", "mode": "rw"},
+             },
+             "environment": {},
+         }
+
+         # Use GPU image if NVIDIA available
+         if self.hw.gpu_vendor == "nvidia":
+             import docker
+             spec["image"] = "ghcr.io/huggingface/text-embeddings-inference:latest"
+             spec["device_requests"] = [
+                 docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])
+             ]
+
+         return spec
+
+     def health_url(self) -> str:
+         return f"http://localhost:{self.host_port}/health"
+
+     def openai_base_url(self) -> str:
+         return "http://llmstack-tei:80/v1"
File without changes
@@ -0,0 +1,47 @@
+ """Gateway service — runs the FastAPI proxy as a Docker container."""
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ from llmstack.config.schema import GatewayConfig
+ from llmstack.services.base import ServiceBase
+
+
+ class GatewayService(ServiceBase):
+     name = "gateway"
+     category = "gateway"
+
+     def __init__(
+         self,
+         config: GatewayConfig,
+         inference_url: str,
+         embeddings_url: str,
+         qdrant_url: str = "",
+         redis_url: str = "",
+     ):
+         self.config = config
+         self.inference_url = inference_url
+         self.embeddings_url = embeddings_url
+         self.qdrant_url = qdrant_url
+         self.redis_url = redis_url
+
+     def container_spec(self) -> dict[str, Any]:
+         return {
+             "image": "ghcr.io/mara-werils/llmstack-gateway:latest",
+             "name": "llmstack-gateway",
+             "ports": {"8000/tcp": self.config.port},
+             "environment": {
+                 "LLMSTACK_INFERENCE_URL": self.inference_url,
+                 "LLMSTACK_EMBEDDINGS_URL": self.embeddings_url,
+                 "LLMSTACK_QDRANT_URL": self.qdrant_url,
+                 "LLMSTACK_REDIS_URL": self.redis_url,
+                 "LLMSTACK_API_KEYS": ",".join(self.config.api_keys),
+                 "LLMSTACK_CORS_ORIGINS": ",".join(self.config.cors),
+                 "LLMSTACK_REQUEST_TIMEOUT": str(self.config.request_timeout),
+                 "LLMSTACK_RATE_LIMIT": self.config.rate_limit,
+             },
+         }
+
+     def health_url(self) -> str:
+         return f"http://localhost:{self.config.port}/healthz"
File without changes
@@ -0,0 +1,60 @@
+ """Ollama inference service."""
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ import httpx
+
+ from llmstack.config.schema import ModelSpec
+ from llmstack.core.hardware import HardwareProfile
+ from llmstack.services.base import ServiceBase
+
+
+ class OllamaService(ServiceBase):
+     name = "ollama"
+     category = "inference"
+
+     def __init__(self, model: ModelSpec, hw: HardwareProfile):
+         self.model = model
+         self.hw = hw
+         self.host_port = 11434
+
+     def container_spec(self) -> dict[str, Any]:
+         spec: dict[str, Any] = {
+             "image": "ollama/ollama:latest",
+             "name": "llmstack-ollama",
+             "ports": {"11434/tcp": self.host_port},
+             "volumes": {
+                 "llmstack_ollama_data": {"bind": "/root/.ollama", "mode": "rw"},
+             },
+             "environment": {},
+         }
+
+         # GPU passthrough for NVIDIA
+         if self.hw.gpu_vendor == "nvidia":
+             import docker
+             spec["device_requests"] = [
+                 docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])
+             ]
+
+         return spec
+
+     def health_url(self) -> str:
+         return f"http://localhost:{self.host_port}"
+
+     async def post_start(self) -> None:
+         """Pull the model after Ollama is healthy."""
+         model_name = self.model.name
+         if self.model.quantization:
+             model_name = f"{self.model.name}:{self.model.quantization}"
+
+         async with httpx.AsyncClient(timeout=600) as client:
+             resp = await client.post(
+                 f"http://localhost:{self.host_port}/api/pull",
+                 json={"name": model_name, "stream": False},
+             )
+             resp.raise_for_status()
+
+     def openai_base_url(self) -> str:
+         return f"http://llmstack-ollama:{self.host_port}/v1"
@@ -0,0 +1,57 @@
+ """vLLM inference service."""
+
+ from __future__ import annotations
+
+ import os
+ from typing import Any
+
+ from llmstack.config.schema import ModelSpec
+ from llmstack.core.hardware import HardwareProfile
+ from llmstack.services.base import ServiceBase
+
+
+ class VllmService(ServiceBase):
+     name = "vllm"
+     category = "inference"
+
+     def __init__(self, model: ModelSpec, hw: HardwareProfile):
+         self.model = model
+         self.hw = hw
+         self.host_port = 8001
+
+     def container_spec(self) -> dict[str, Any]:
+         import docker
+
+         cmd = [
+             "--model", self.model.name,
+             "--host", "0.0.0.0",
+             "--port", "8000",
+             "--max-model-len", str(self.model.context_length),
+         ]
+
+         if self.model.quantization:
+             cmd.extend(["--quantization", self.model.quantization])
+
+         spec: dict[str, Any] = {
+             "image": "vllm/vllm-openai:latest",
+             "name": "llmstack-vllm",
+             "ports": {"8000/tcp": self.host_port},
+             "command": cmd,
+             "environment": {
+                 # Pass through the host's HF token (empty means anonymous) so gated models can be downloaded.
+                 "HUGGING_FACE_HUB_TOKEN": os.getenv("HUGGING_FACE_HUB_TOKEN", ""),
+             },
+             "volumes": {
+                 "llmstack_vllm_cache": {"bind": "/root/.cache/huggingface", "mode": "rw"},
+             },
+             "device_requests": [
+                 docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])
+             ],
+             "shm_size": "4g",
+         }
+
+         return spec
+
+     def health_url(self) -> str:
+         return f"http://localhost:{self.host_port}/health"
+
+     def openai_base_url(self) -> str:
+         return "http://llmstack-vllm:8000/v1"
File without changes