llmstack-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmstack/__init__.py +3 -0
- llmstack/__main__.py +5 -0
- llmstack/cli/__init__.py +0 -0
- llmstack/cli/app.py +87 -0
- llmstack/cli/commands/__init__.py +0 -0
- llmstack/cli/commands/doctor.py +72 -0
- llmstack/cli/commands/down.py +25 -0
- llmstack/cli/commands/init.py +66 -0
- llmstack/cli/commands/logs.py +25 -0
- llmstack/cli/commands/status.py +45 -0
- llmstack/cli/commands/up.py +30 -0
- llmstack/cli/console.py +13 -0
- llmstack/config/__init__.py +4 -0
- llmstack/config/loader.py +44 -0
- llmstack/config/presets/__init__.py +11 -0
- llmstack/config/presets/agent.py +13 -0
- llmstack/config/presets/chat.py +14 -0
- llmstack/config/presets/rag.py +10 -0
- llmstack/config/schema.py +76 -0
- llmstack/core/__init__.py +0 -0
- llmstack/core/hardware.py +131 -0
- llmstack/core/health.py +23 -0
- llmstack/core/resolver.py +49 -0
- llmstack/core/stack.py +207 -0
- llmstack/docker/__init__.py +0 -0
- llmstack/docker/manager.py +134 -0
- llmstack/gateway/Dockerfile +16 -0
- llmstack/gateway/__init__.py +0 -0
- llmstack/gateway/main.py +52 -0
- llmstack/gateway/middleware/__init__.py +0 -0
- llmstack/gateway/middleware/auth.py +32 -0
- llmstack/gateway/middleware/metrics.py +115 -0
- llmstack/gateway/proxy.py +58 -0
- llmstack/gateway/routes/__init__.py +0 -0
- llmstack/gateway/routes/chat.py +27 -0
- llmstack/gateway/routes/embeddings.py +17 -0
- llmstack/gateway/routes/health.py +55 -0
- llmstack/gateway/routes/models.py +16 -0
- llmstack/plugins/__init__.py +0 -0
- llmstack/plugins/loader.py +5 -0
- llmstack/plugins/spec.py +20 -0
- llmstack/services/__init__.py +0 -0
- llmstack/services/base.py +65 -0
- llmstack/services/cache/__init__.py +0 -0
- llmstack/services/cache/redis.py +33 -0
- llmstack/services/embeddings/__init__.py +0 -0
- llmstack/services/embeddings/tei.py +49 -0
- llmstack/services/gateway/__init__.py +0 -0
- llmstack/services/gateway/service.py +47 -0
- llmstack/services/inference/__init__.py +0 -0
- llmstack/services/inference/ollama.py +60 -0
- llmstack/services/inference/vllm.py +57 -0
- llmstack/services/observe/__init__.py +0 -0
- llmstack/services/observe/prometheus.py +168 -0
- llmstack/services/registry.py +53 -0
- llmstack/services/vectordb/__init__.py +0 -0
- llmstack/services/vectordb/qdrant.py +33 -0
- llmstack_cli-0.1.0.dist-info/METADATA +252 -0
- llmstack_cli-0.1.0.dist-info/RECORD +62 -0
- llmstack_cli-0.1.0.dist-info/WHEEL +4 -0
- llmstack_cli-0.1.0.dist-info/entry_points.txt +2 -0
- llmstack_cli-0.1.0.dist-info/licenses/LICENSE +201 -0
llmstack/gateway/middleware/metrics.py
ADDED
@@ -0,0 +1,115 @@
"""Metrics middleware — tracks request counts, latencies, and tokens in Prometheus format."""

from __future__ import annotations

import time
from collections import defaultdict
from threading import Lock

from starlette.middleware.base import BaseHTTPMiddleware
from starlette.requests import Request

_lock = Lock()
_request_count: dict[str, int] = defaultdict(int)
_error_count: dict[str, int] = defaultdict(int)
_tokens_in: int = 0
_tokens_out: int = 0

# Histogram buckets for latency
_BUCKETS = [0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0]
_latency_buckets: dict[str, list[int]] = defaultdict(lambda: [0] * (len(_BUCKETS) + 1))
_latency_sum: dict[str, float] = defaultdict(float)
_latency_count: dict[str, int] = defaultdict(int)


class MetricsMiddleware(BaseHTTPMiddleware):
    async def dispatch(self, request: Request, call_next):
        path = request.url.path
        if path in ("/metrics", "/healthz"):
            return await call_next(request)

        start = time.monotonic()
        response = await call_next(request)
        duration = time.monotonic() - start

        with _lock:
            _request_count[path] += 1
            _latency_sum[path] += duration
            _latency_count[path] += 1

            # Bucket assignment: every bucket whose bound covers the duration is
            # incremented, so the stored counts are already cumulative (Prometheus
            # semantics). The last slot is the +Inf bucket and equals the total count.
            buckets = _latency_buckets[path]
            for i, bound in enumerate(_BUCKETS):
                if duration <= bound:
                    buckets[i] += 1
            buckets[-1] += 1  # +Inf

            if response.status_code >= 400:
                _error_count[path] += 1

        return response


def record_tokens(input_tokens: int = 0, output_tokens: int = 0) -> None:
    """Record token usage from a chat completion response."""
    global _tokens_in, _tokens_out
    with _lock:
        _tokens_in += input_tokens
        _tokens_out += output_tokens


def get_metrics() -> dict:
    """Return metrics as JSON (for the /metrics JSON endpoint)."""
    with _lock:
        result = {}
        for path in _request_count:
            result[path] = {
                "requests": _request_count[path],
                "errors": _error_count.get(path, 0),
                "latency_avg_ms": round(
                    (_latency_sum[path] / _latency_count[path]) * 1000, 1
                ) if _latency_count[path] else 0,
            }
        result["tokens"] = {"input": _tokens_in, "output": _tokens_out}
        return result


def get_prometheus_metrics() -> str:
    """Return metrics in Prometheus exposition format."""
    lines: list[str] = []

    with _lock:
        # Request counter
        lines.append("# HELP llmstack_requests_total Total HTTP requests")
        lines.append("# TYPE llmstack_requests_total counter")
        for path, count in _request_count.items():
            lines.append(f'llmstack_requests_total{{path="{path}"}} {count}')

        # Error counter
        lines.append("# HELP llmstack_errors_total Total HTTP errors (4xx/5xx)")
        lines.append("# TYPE llmstack_errors_total counter")
        for path, count in _error_count.items():
            lines.append(f'llmstack_errors_total{{path="{path}"}} {count}')

        # Latency histogram — bucket counts are stored cumulatively above, so they
        # are emitted as-is; re-summing them here would double-count observations.
        lines.append("# HELP llmstack_request_duration_seconds Request latency histogram")
        lines.append("# TYPE llmstack_request_duration_seconds histogram")
        for path in _latency_count:
            buckets = _latency_buckets[path]
            for i, bound in enumerate(_BUCKETS):
                lines.append(f'llmstack_request_duration_seconds_bucket{{path="{path}",le="{bound}"}} {buckets[i]}')
            lines.append(f'llmstack_request_duration_seconds_bucket{{path="{path}",le="+Inf"}} {buckets[-1]}')
            lines.append(f'llmstack_request_duration_seconds_sum{{path="{path}"}} {_latency_sum[path]:.4f}')
            lines.append(f'llmstack_request_duration_seconds_count{{path="{path}"}} {_latency_count[path]}')

        # Token counter
        lines.append("# HELP llmstack_tokens_total Total tokens processed")
        lines.append("# TYPE llmstack_tokens_total counter")
        lines.append(f'llmstack_tokens_total{{type="input"}} {_tokens_in}')
        lines.append(f'llmstack_tokens_total{{type="output"}} {_tokens_out}')

    lines.append("")
    return "\n".join(lines)
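All state in this module is module-level and guarded by a single lock, so the only integration points are registering the middleware on the FastAPI app and feeding record_tokens() with the usage block of a completion. A minimal sketch of that wiring (the real llmstack/gateway/main.py ships in the wheel but is not shown in this diff, so the app setup and helper below are assumptions):

# Hypothetical wiring sketch — main.py is not shown in this excerpt, and the
# /metrics route itself lives in llmstack/gateway/routes/health.py.
from fastapi import FastAPI

from llmstack.gateway.middleware.metrics import MetricsMiddleware, record_tokens

app = FastAPI()
app.add_middleware(MetricsMiddleware)  # counts every request except /metrics and /healthz


def track_usage(completion: dict) -> None:
    """Feed the token counters from an OpenAI-style completion payload."""
    usage = completion.get("usage", {})
    record_tokens(
        input_tokens=usage.get("prompt_tokens", 0),
        output_tokens=usage.get("completion_tokens", 0),
    )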
llmstack/gateway/proxy.py
ADDED
@@ -0,0 +1,58 @@
"""Proxy layer — forwards requests to inference and embedding backends."""

from __future__ import annotations

import os
from typing import AsyncIterator

import httpx

INFERENCE_URL = os.getenv("LLMSTACK_INFERENCE_URL", "http://llmstack-ollama:11434/v1")
EMBEDDINGS_URL = os.getenv("LLMSTACK_EMBEDDINGS_URL", "")

# Timeout for inference (can be long for large models)
REQUEST_TIMEOUT = int(os.getenv("LLMSTACK_REQUEST_TIMEOUT", "120"))


async def proxy_chat_completion(payload: dict, stream: bool = False) -> dict | AsyncIterator[bytes]:
    """Forward a chat completion request to the inference backend."""
    url = f"{INFERENCE_URL}/chat/completions"
    timeout = httpx.Timeout(REQUEST_TIMEOUT, connect=10)

    if stream:
        return _stream_response(url, payload, timeout)
    else:
        async with httpx.AsyncClient(timeout=timeout) as client:
            resp = await client.post(url, json=payload)
            resp.raise_for_status()
            return resp.json()


async def _stream_response(url: str, payload: dict, timeout: httpx.Timeout) -> AsyncIterator[bytes]:
    """Stream SSE chunks from the inference backend."""
    async with httpx.AsyncClient(timeout=timeout) as client:
        async with client.stream("POST", url, json=payload) as resp:
            resp.raise_for_status()
            async for chunk in resp.aiter_bytes():
                yield chunk


async def proxy_embeddings(payload: dict) -> dict:
    """Forward an embeddings request to the embedding backend."""
    url = EMBEDDINGS_URL or INFERENCE_URL
    if not url.endswith("/embeddings"):
        url = f"{url}/embeddings"

    async with httpx.AsyncClient(timeout=30) as client:
        resp = await client.post(url, json=payload)
        resp.raise_for_status()
        return resp.json()


async def proxy_models() -> dict:
    """List available models from the inference backend."""
    url = f"{INFERENCE_URL}/models"
    async with httpx.AsyncClient(timeout=10) as client:
        resp = await client.get(url)
        resp.raise_for_status()
        return resp.json()
llmstack/gateway/routes/chat.py
ADDED
@@ -0,0 +1,27 @@
"""POST /v1/chat/completions — OpenAI-compatible chat endpoint."""

from __future__ import annotations

from fastapi import APIRouter, Request
from fastapi.responses import JSONResponse, StreamingResponse

from llmstack.gateway.proxy import proxy_chat_completion

router = APIRouter()


@router.post("/chat/completions")
async def chat_completions(request: Request):
    payload = await request.json()
    stream = payload.get("stream", False)

    if stream:
        chunks = await proxy_chat_completion(payload, stream=True)
        return StreamingResponse(
            chunks,
            media_type="text/event-stream",
            headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
        )
    else:
        result = await proxy_chat_completion(payload, stream=False)
        return JSONResponse(content=result)
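Since the route mirrors the OpenAI request/response schema and simply forwards the JSON body, any OpenAI-compatible client pointed at the gateway should work against it. An illustrative client-side sketch (the gateway port, API key, and model name are assumptions, not values taken from this diff):

# Illustrative client call; port, API key, and model name are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="sk-local-example")

resp = client.chat.completions.create(
    model="llama3",  # whatever model the inference backend is serving
    messages=[{"role": "user", "content": "Hello!"}],
)
print(resp.choices[0].message.content)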
llmstack/gateway/routes/embeddings.py
ADDED
@@ -0,0 +1,17 @@
"""POST /v1/embeddings — OpenAI-compatible embeddings endpoint."""

from __future__ import annotations

from fastapi import APIRouter, Request
from fastapi.responses import JSONResponse

from llmstack.gateway.proxy import proxy_embeddings

router = APIRouter()


@router.post("/embeddings")
async def embeddings(request: Request):
    payload = await request.json()
    result = await proxy_embeddings(payload)
    return JSONResponse(content=result)
llmstack/gateway/routes/health.py
ADDED
@@ -0,0 +1,55 @@
"""GET /healthz — gateway health check."""

from __future__ import annotations

import os

import httpx
from fastapi import APIRouter
from fastapi.responses import JSONResponse

router = APIRouter()

INFERENCE_URL = os.getenv("LLMSTACK_INFERENCE_URL", "")
QDRANT_URL = os.getenv("LLMSTACK_QDRANT_URL", "")
REDIS_URL = os.getenv("LLMSTACK_REDIS_URL", "")


async def _check_url(url: str) -> bool:
    if not url:
        return False
    try:
        async with httpx.AsyncClient(timeout=5) as client:
            resp = await client.get(url)
            return resp.status_code == 200
    except Exception:
        return False


@router.get("/healthz")
async def healthz():
    checks = {}

    if INFERENCE_URL:
        checks["inference"] = await _check_url(INFERENCE_URL.replace("/v1", "/health"))

    if QDRANT_URL:
        checks["qdrant"] = await _check_url(f"{QDRANT_URL}/healthz")

    all_ok = all(checks.values()) if checks else True
    status_code = 200 if all_ok else 503

    return JSONResponse(
        content={"status": "ok" if all_ok else "degraded", "services": checks},
        status_code=status_code,
    )


@router.get("/metrics")
async def metrics():
    from llmstack.gateway.middleware.metrics import get_prometheus_metrics
    from fastapi.responses import PlainTextResponse

    return PlainTextResponse(
        content=get_prometheus_metrics(),
        media_type="text/plain; version=0.0.4; charset=utf-8",
    )
llmstack/gateway/routes/models.py
ADDED
@@ -0,0 +1,16 @@
"""GET /v1/models — list available models."""

from __future__ import annotations

from fastapi import APIRouter
from fastapi.responses import JSONResponse

from llmstack.gateway.proxy import proxy_models

router = APIRouter()


@router.get("/models")
async def list_models():
    result = await proxy_models()
    return JSONResponse(content=result)
llmstack/plugins/spec.py
ADDED
@@ -0,0 +1,20 @@
"""Plugin specification.

To create an llmstack plugin:

1. Create a class that extends ServiceBase
2. Set `name` and `category` class attributes
3. Implement: container_spec(), health_url()
4. Optionally implement: post_start(), openai_base_url()
5. Register via entry_points in pyproject.toml:

   [project.entry-points."llmstack.services"]
   my_service = "my_package:MyService"

6. Publish to PyPI. Users install with:
   pip install llmstack-plugin-myservice
"""

from llmstack.services.base import ServiceBase

__all__ = ["ServiceBase"]
llmstack/services/base.py
ADDED
@@ -0,0 +1,65 @@
"""Base class for all managed services."""

from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import Enum
from typing import Any


class ServiceState(str, Enum):
    STOPPED = "stopped"
    STARTING = "starting"
    RUNNING = "running"
    UNHEALTHY = "unhealthy"
    ERROR = "error"


@dataclass
class ServiceStatus:
    name: str
    state: ServiceState
    port: int | None = None
    container_id: str | None = None
    message: str = ""


class ServiceBase(ABC):
    """Every llmstack service (inference, vectordb, cache, etc.) implements this."""

    name: str
    category: str  # inference, vectordb, cache, embeddings, gateway, observe

    @abstractmethod
    def container_spec(self) -> dict[str, Any]:
        """Return kwargs for docker.containers.run().

        Must include at least: image, ports, environment.
        May include: volumes, device_requests, healthcheck, command.
        """

    @abstractmethod
    def health_url(self) -> str:
        """HTTP URL to GET for health checks (from the host)."""

    async def post_start(self) -> None:
        """Hook called after the container is healthy.

        Override for actions like pulling a model.
        """

    def openai_base_url(self) -> str | None:
        """If this service exposes an OpenAI-compatible API, return its internal Docker URL."""
        return None

    def internal_url(self) -> str:
        """Return the URL reachable from other containers on the Docker network."""
        spec = self.container_spec()
        ports = spec.get("ports", {})
        # Other containers reach this service by container name and container port.
        # The container port is the key of the ports mapping (e.g. "11434/tcp");
        # the value is the host port, which is not reachable from the network.
        if isinstance(ports, dict) and ports:
            container_port = next(iter(ports)).split("/")[0]
            return f"http://{self.name}:{container_port}"
        return f"http://{self.name}"
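Taken together with the steps in llmstack/plugins/spec.py, a third-party service plugin is just a ServiceBase subclass plus an entry point. A hypothetical sketch (the image name, port, category, and health path are illustrative assumptions, not part of the package):

# Hypothetical plugin following llmstack/plugins/spec.py; image, port, and
# health path are made up for illustration.
from typing import Any

from llmstack.services.base import ServiceBase


class MySearchService(ServiceBase):
    name = "mysearch"
    category = "vectordb"

    def __init__(self, port: int = 9200):
        self.port = port

    def container_spec(self) -> dict[str, Any]:
        # kwargs handed to docker.containers.run()
        return {
            "image": "example/mysearch:latest",
            "name": "llmstack-mysearch",
            "ports": {"9200/tcp": self.port},
            "environment": {},
        }

    def health_url(self) -> str:
        return f"http://localhost:{self.port}/health"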
llmstack/services/cache/redis.py
ADDED
@@ -0,0 +1,33 @@
"""Redis cache service."""

from __future__ import annotations

from typing import Any

from llmstack.config.schema import CacheConfig
from llmstack.services.base import ServiceBase


class RedisService(ServiceBase):
    name = "redis"
    category = "cache"

    def __init__(self, config: CacheConfig):
        self.config = config

    def container_spec(self) -> dict[str, Any]:
        return {
            "image": "redis:7-alpine",
            "name": "llmstack-redis",
            "ports": {"6379/tcp": self.config.port},
            "command": [
                "redis-server",
                "--maxmemory", self.config.max_memory,
                "--maxmemory-policy", "allkeys-lru",
            ],
            "environment": {},
        }

    def health_url(self) -> str:
        # Redis has no HTTP health endpoint; it is checked over TCP instead.
        return f"http://localhost:{self.config.port}"
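The URL above only records the host and port; the actual liveness probe has to open a socket rather than issue an HTTP GET. That check lives in llmstack/core/health.py, which this excerpt does not show, so the following is only a plausible sketch of such a TCP probe:

# Hypothetical TCP probe; the real implementation in llmstack/core/health.py
# is not shown in this diff.
import socket


def tcp_alive(host: str, port: int, timeout: float = 2.0) -> bool:
    """Return True if a TCP connection to host:port succeeds within timeout."""
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False


# e.g. tcp_alive("localhost", 6379) for the llmstack-redis container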
llmstack/services/embeddings/tei.py
ADDED
@@ -0,0 +1,49 @@
"""HuggingFace Text Embeddings Inference (TEI) service."""

from __future__ import annotations

from typing import Any

from llmstack.config.schema import EmbeddingSpec
from llmstack.core.hardware import HardwareProfile
from llmstack.services.base import ServiceBase


class TEIService(ServiceBase):
    name = "tei"
    category = "embeddings"

    def __init__(self, spec: EmbeddingSpec, hw: HardwareProfile):
        self.spec = spec
        self.hw = hw
        self.host_port = 8002

    def container_spec(self) -> dict[str, Any]:
        cmd = ["--model-id", self.spec.name, "--port", "80"]

        spec: dict[str, Any] = {
            "image": "ghcr.io/huggingface/text-embeddings-inference:cpu-latest",
            "name": "llmstack-tei",
            "ports": {"80/tcp": self.host_port},
            "command": cmd,
            "volumes": {
                "llmstack_tei_cache": {"bind": "/data", "mode": "rw"},
            },
            "environment": {},
        }

        # Use the GPU image if an NVIDIA GPU is available
        if self.hw.gpu_vendor == "nvidia":
            import docker

            spec["image"] = "ghcr.io/huggingface/text-embeddings-inference:latest"
            spec["device_requests"] = [
                docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])
            ]

        return spec

    def health_url(self) -> str:
        return f"http://localhost:{self.host_port}/health"

    def openai_base_url(self) -> str:
        return "http://llmstack-tei:80/v1"
llmstack/services/gateway/service.py
ADDED
@@ -0,0 +1,47 @@
"""Gateway service — runs the FastAPI proxy as a Docker container."""

from __future__ import annotations

from typing import Any

from llmstack.config.schema import GatewayConfig
from llmstack.services.base import ServiceBase


class GatewayService(ServiceBase):
    name = "gateway"
    category = "gateway"

    def __init__(
        self,
        config: GatewayConfig,
        inference_url: str,
        embeddings_url: str,
        qdrant_url: str = "",
        redis_url: str = "",
    ):
        self.config = config
        self.inference_url = inference_url
        self.embeddings_url = embeddings_url
        self.qdrant_url = qdrant_url
        self.redis_url = redis_url

    def container_spec(self) -> dict[str, Any]:
        return {
            "image": "ghcr.io/mara-werils/llmstack-gateway:latest",
            "name": "llmstack-gateway",
            "ports": {"8000/tcp": self.config.port},
            "environment": {
                "LLMSTACK_INFERENCE_URL": self.inference_url,
                "LLMSTACK_EMBEDDINGS_URL": self.embeddings_url,
                "LLMSTACK_QDRANT_URL": self.qdrant_url,
                "LLMSTACK_REDIS_URL": self.redis_url,
                "LLMSTACK_API_KEYS": ",".join(self.config.api_keys),
                "LLMSTACK_CORS_ORIGINS": ",".join(self.config.cors),
                "LLMSTACK_REQUEST_TIMEOUT": str(self.config.request_timeout),
                "LLMSTACK_RATE_LIMIT": self.config.rate_limit,
            },
        }

    def health_url(self) -> str:
        return f"http://localhost:{self.config.port}/healthz"
llmstack/services/inference/ollama.py
ADDED
@@ -0,0 +1,60 @@
"""Ollama inference service."""

from __future__ import annotations

from typing import Any

import httpx

from llmstack.config.schema import ModelSpec
from llmstack.core.hardware import HardwareProfile
from llmstack.services.base import ServiceBase


class OllamaService(ServiceBase):
    name = "ollama"
    category = "inference"

    def __init__(self, model: ModelSpec, hw: HardwareProfile):
        self.model = model
        self.hw = hw
        self.host_port = 11434

    def container_spec(self) -> dict[str, Any]:
        spec: dict[str, Any] = {
            "image": "ollama/ollama:latest",
            "name": "llmstack-ollama",
            "ports": {"11434/tcp": self.host_port},
            "volumes": {
                "llmstack_ollama_data": {"bind": "/root/.ollama", "mode": "rw"},
            },
            "environment": {},
        }

        # GPU passthrough for NVIDIA
        if self.hw.gpu_vendor == "nvidia":
            import docker

            spec["device_requests"] = [
                docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])
            ]

        return spec

    def health_url(self) -> str:
        return f"http://localhost:{self.host_port}"

    async def post_start(self) -> None:
        """Pull the model after Ollama is healthy."""
        model_name = self.model.name
        if self.model.quantization:
            model_name = f"{self.model.name}:{self.model.quantization}"

        async with httpx.AsyncClient(timeout=600) as client:
            resp = await client.post(
                f"http://localhost:{self.host_port}/api/pull",
                json={"name": model_name, "stream": False},
            )
            resp.raise_for_status()

    def openai_base_url(self) -> str:
        return f"http://llmstack-ollama:{self.host_port}/v1"
llmstack/services/inference/vllm.py
ADDED
@@ -0,0 +1,57 @@
"""vLLM inference service."""

from __future__ import annotations

from typing import Any

from llmstack.config.schema import ModelSpec
from llmstack.core.hardware import HardwareProfile
from llmstack.services.base import ServiceBase


class VllmService(ServiceBase):
    name = "vllm"
    category = "inference"

    def __init__(self, model: ModelSpec, hw: HardwareProfile):
        self.model = model
        self.hw = hw
        self.host_port = 8001

    def container_spec(self) -> dict[str, Any]:
        import docker

        cmd = [
            "--model", self.model.name,
            "--host", "0.0.0.0",
            "--port", "8000",
            "--max-model-len", str(self.model.context_length),
        ]

        if self.model.quantization:
            cmd.extend(["--quantization", self.model.quantization])

        spec: dict[str, Any] = {
            "image": "vllm/vllm-openai:latest",
            "name": "llmstack-vllm",
            "ports": {"8000/tcp": self.host_port},
            "command": cmd,
            "environment": {
                "HUGGING_FACE_HUB_TOKEN": "",
            },
            "volumes": {
                "llmstack_vllm_cache": {"bind": "/root/.cache/huggingface", "mode": "rw"},
            },
            "device_requests": [
                docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])
            ],
            "shm_size": "4g",
        }

        return spec

    def health_url(self) -> str:
        return f"http://localhost:{self.host_port}/health"

    def openai_base_url(self) -> str:
        return "http://llmstack-vllm:8000/v1"