llmstack-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmstack/__init__.py +3 -0
- llmstack/__main__.py +5 -0
- llmstack/cli/__init__.py +0 -0
- llmstack/cli/app.py +87 -0
- llmstack/cli/commands/__init__.py +0 -0
- llmstack/cli/commands/doctor.py +72 -0
- llmstack/cli/commands/down.py +25 -0
- llmstack/cli/commands/init.py +66 -0
- llmstack/cli/commands/logs.py +25 -0
- llmstack/cli/commands/status.py +45 -0
- llmstack/cli/commands/up.py +30 -0
- llmstack/cli/console.py +13 -0
- llmstack/config/__init__.py +4 -0
- llmstack/config/loader.py +44 -0
- llmstack/config/presets/__init__.py +11 -0
- llmstack/config/presets/agent.py +13 -0
- llmstack/config/presets/chat.py +14 -0
- llmstack/config/presets/rag.py +10 -0
- llmstack/config/schema.py +76 -0
- llmstack/core/__init__.py +0 -0
- llmstack/core/hardware.py +131 -0
- llmstack/core/health.py +23 -0
- llmstack/core/resolver.py +49 -0
- llmstack/core/stack.py +207 -0
- llmstack/docker/__init__.py +0 -0
- llmstack/docker/manager.py +134 -0
- llmstack/gateway/Dockerfile +16 -0
- llmstack/gateway/__init__.py +0 -0
- llmstack/gateway/main.py +52 -0
- llmstack/gateway/middleware/__init__.py +0 -0
- llmstack/gateway/middleware/auth.py +32 -0
- llmstack/gateway/middleware/metrics.py +115 -0
- llmstack/gateway/proxy.py +58 -0
- llmstack/gateway/routes/__init__.py +0 -0
- llmstack/gateway/routes/chat.py +27 -0
- llmstack/gateway/routes/embeddings.py +17 -0
- llmstack/gateway/routes/health.py +55 -0
- llmstack/gateway/routes/models.py +16 -0
- llmstack/plugins/__init__.py +0 -0
- llmstack/plugins/loader.py +5 -0
- llmstack/plugins/spec.py +20 -0
- llmstack/services/__init__.py +0 -0
- llmstack/services/base.py +65 -0
- llmstack/services/cache/__init__.py +0 -0
- llmstack/services/cache/redis.py +33 -0
- llmstack/services/embeddings/__init__.py +0 -0
- llmstack/services/embeddings/tei.py +49 -0
- llmstack/services/gateway/__init__.py +0 -0
- llmstack/services/gateway/service.py +47 -0
- llmstack/services/inference/__init__.py +0 -0
- llmstack/services/inference/ollama.py +60 -0
- llmstack/services/inference/vllm.py +57 -0
- llmstack/services/observe/__init__.py +0 -0
- llmstack/services/observe/prometheus.py +168 -0
- llmstack/services/registry.py +53 -0
- llmstack/services/vectordb/__init__.py +0 -0
- llmstack/services/vectordb/qdrant.py +33 -0
- llmstack_cli-0.1.0.dist-info/METADATA +252 -0
- llmstack_cli-0.1.0.dist-info/RECORD +62 -0
- llmstack_cli-0.1.0.dist-info/WHEEL +4 -0
- llmstack_cli-0.1.0.dist-info/entry_points.txt +2 -0
- llmstack_cli-0.1.0.dist-info/licenses/LICENSE +201 -0

**llmstack/services/observe/prometheus.py**

@@ -0,0 +1,168 @@

"""Prometheus + Grafana observability services."""

from __future__ import annotations

import json
from typing import Any

from llmstack.config.schema import ObserveConfig
from llmstack.services.base import ServiceBase


# Prometheus config that scrapes the gateway /metrics endpoint
PROMETHEUS_CONFIG = """
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'llmstack-gateway'
    metrics_path: '/metrics'
    static_configs:
      - targets: ['llmstack-gateway:8000']
    scrape_interval: 5s

  - job_name: 'qdrant'
    metrics_path: '/metrics'
    static_configs:
      - targets: ['llmstack-qdrant:6333']
    scrape_interval: 15s
"""

# Grafana datasource provisioning
GRAFANA_DATASOURCE = {
    "apiVersion": 1,
    "datasources": [{
        "name": "Prometheus",
        "type": "prometheus",
        "access": "proxy",
        "url": "http://llmstack-prometheus:9090",
        "isDefault": True,
    }],
}

# Grafana dashboard provisioning config
GRAFANA_DASHBOARD_PROVIDER = {
    "apiVersion": 1,
    "providers": [{
        "name": "LLMStack",
        "type": "file",
        "options": {"path": "/var/lib/grafana/dashboards"},
    }],
}

# Pre-built Grafana dashboard JSON
GRAFANA_DASHBOARD = {
    "dashboard": {
        "title": "LLMStack Overview",
        "uid": "llmstack-overview",
        "timezone": "browser",
        "refresh": "10s",
        "panels": [
            {
                "title": "Request Rate",
                "type": "timeseries",
                "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
                "targets": [{"expr": "rate(llmstack_requests_total[1m])", "legendFormat": "{{path}}"}],
            },
            {
                "title": "Latency (p50 / p99)",
                "type": "timeseries",
                "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
                "targets": [
                    {"expr": "histogram_quantile(0.5, rate(llmstack_request_duration_seconds_bucket[5m]))", "legendFormat": "p50"},
                    {"expr": "histogram_quantile(0.99, rate(llmstack_request_duration_seconds_bucket[5m]))", "legendFormat": "p99"},
                ],
            },
            {
                "title": "Error Rate",
                "type": "stat",
                "gridPos": {"h": 4, "w": 6, "x": 0, "y": 8},
                "targets": [{"expr": "sum(rate(llmstack_errors_total[5m]))"}],
            },
            {
                "title": "Active Services",
                "type": "stat",
                "gridPos": {"h": 4, "w": 6, "x": 6, "y": 8},
                "targets": [{"expr": "up"}],
            },
            {
                "title": "Token Throughput",
                "type": "timeseries",
                "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
                "targets": [{"expr": "rate(llmstack_tokens_total[1m])", "legendFormat": "{{type}}"}],
            },
        ],
    },
}


class PrometheusService(ServiceBase):
    name = "prometheus"
    category = "observe"

    def __init__(self, config: ObserveConfig):
        self.config = config
        self.host_port = 9090

    def container_spec(self) -> dict[str, Any]:
        return {
            "image": "prom/prometheus:latest",
            "name": "llmstack-prometheus",
            "ports": {"9090/tcp": self.host_port},
            "command": [
                "--config.file=/etc/prometheus/prometheus.yml",
                f"--storage.tsdb.retention.time={self.config.retention}",
                "--web.enable-lifecycle",
            ],
            "volumes": {
                "llmstack_prometheus_config": {"bind": "/etc/prometheus", "mode": "rw"},
                "llmstack_prometheus_data": {"bind": "/prometheus", "mode": "rw"},
            },
            "environment": {},
        }

    def health_url(self) -> str:
        return f"http://localhost:{self.host_port}/-/healthy"

    def get_config_yaml(self) -> str:
        """Return the prometheus.yml content."""
        return PROMETHEUS_CONFIG


class GrafanaService(ServiceBase):
    name = "grafana"
    category = "observe"

    def __init__(self, config: ObserveConfig):
        self.config = config
        self.host_port = config.dashboard_port

    def container_spec(self) -> dict[str, Any]:
        return {
            "image": "grafana/grafana:latest",
            "name": "llmstack-grafana",
            "ports": {"3000/tcp": self.host_port},
            "environment": {
                "GF_SECURITY_ADMIN_USER": "admin",
                "GF_SECURITY_ADMIN_PASSWORD": "llmstack",
                "GF_AUTH_ANONYMOUS_ENABLED": "true",
                "GF_AUTH_ANONYMOUS_ORG_ROLE": "Viewer",
                "GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH": "/var/lib/grafana/dashboards/llmstack.json",
            },
            "volumes": {
                "llmstack_grafana_data": {"bind": "/var/lib/grafana", "mode": "rw"},
            },
        }

    def health_url(self) -> str:
        return f"http://localhost:{self.host_port}/api/health"

    def get_datasource_json(self) -> str:
        return json.dumps(GRAFANA_DATASOURCE, indent=2)

    def get_dashboard_provider_json(self) -> str:
        return json.dumps(GRAFANA_DASHBOARD_PROVIDER, indent=2)

    def get_dashboard_json(self) -> str:
        return json.dumps(GRAFANA_DASHBOARD, indent=2)

**llmstack/services/registry.py**

@@ -0,0 +1,53 @@

"""Service registry — discovers built-in and plugin services."""

from __future__ import annotations

from importlib.metadata import entry_points
from typing import Type

from llmstack.services.base import ServiceBase
from llmstack.services.inference.ollama import OllamaService
from llmstack.services.inference.vllm import VllmService
from llmstack.services.vectordb.qdrant import QdrantService
from llmstack.services.cache.redis import RedisService
from llmstack.services.embeddings.tei import TEIService


class ServiceRegistry:
    """Discovers all built-in + plugin services."""

    def __init__(self):
        self._services: dict[str, Type[ServiceBase]] = {}
        self._load_builtins()
        self._load_plugins()

    def _load_builtins(self) -> None:
        for cls in [OllamaService, VllmService, QdrantService, RedisService, TEIService]:
            self._services[cls.name] = cls

    def _load_plugins(self) -> None:
        try:
            eps = entry_points(group="llmstack.services")
        except TypeError:
            # Python 3.11 compat
            eps = entry_points().get("llmstack.services", [])

        for ep in eps:
            try:
                cls = ep.load()
                if hasattr(cls, "name"):
                    self._services[cls.name] = cls
            except Exception:
                pass

    def get(self, name: str) -> Type[ServiceBase]:
        if name not in self._services:
            available = ", ".join(sorted(self._services.keys()))
            raise KeyError(f"Unknown service '{name}'. Available: {available}")
        return self._services[name]

    def list_by_category(self, category: str) -> list[Type[ServiceBase]]:
        return [s for s in self._services.values() if s.category == category]

    def all_names(self) -> list[str]:
        return sorted(self._services.keys())
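
For orientation, a minimal usage sketch of the registry defined above; the names all come from the code in this hunk, but the exact way the CLI calls it is not part of this diff:

```python
from llmstack.services.registry import ServiceRegistry

registry = ServiceRegistry()                      # loads built-ins, then any entry-point plugins
print(registry.all_names())                       # sorted names: ollama, qdrant, redis, tei, vllm
QdrantCls = registry.get("qdrant")                # unknown names raise KeyError listing what exists
vector_backends = registry.list_by_category("vectordb")
```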

File without changes

**llmstack/services/vectordb/qdrant.py**

@@ -0,0 +1,33 @@

"""Qdrant vector database service."""

from __future__ import annotations

from typing import Any

from llmstack.config.schema import VectorDBConfig
from llmstack.services.base import ServiceBase


class QdrantService(ServiceBase):
    name = "qdrant"
    category = "vectordb"

    def __init__(self, config: VectorDBConfig):
        self.config = config

    def container_spec(self) -> dict[str, Any]:
        return {
            "image": "qdrant/qdrant:latest",
            "name": "llmstack-qdrant",
            "ports": {
                "6333/tcp": self.config.port,
                "6334/tcp": self.config.port + 1,
            },
            "volumes": {
                "llmstack_qdrant_data": {"bind": "/qdrant/storage", "mode": "rw"},
            },
            "environment": {},
        }

    def health_url(self) -> str:
        return f"http://localhost:{self.config.port}/healthz"

**llmstack_cli-0.1.0.dist-info/METADATA**

@@ -0,0 +1,252 @@

Metadata-Version: 2.4
Name: llmstack-cli
Version: 0.1.0
Summary: One command. Full LLM stack. Zero config.
Author: mara-werils
License-Expression: Apache-2.0
License-File: LICENSE
Keywords: ai,cli,docker,inference,llm,openai,rag
Classifier: Development Status :: 3 - Alpha
Classifier: Environment :: Console
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.11
Requires-Dist: docker>=7.0
Requires-Dist: httpx>=0.27
Requires-Dist: psutil>=5.9
Requires-Dist: pydantic>=2.0
Requires-Dist: pyyaml>=6.0
Requires-Dist: rich>=13.0
Requires-Dist: typer>=0.12
Provides-Extra: dev
Requires-Dist: fastapi>=0.115; extra == 'dev'
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
Requires-Dist: pytest>=8.0; extra == 'dev'
Requires-Dist: ruff>=0.4; extra == 'dev'
Requires-Dist: starlette>=0.40; extra == 'dev'
Provides-Extra: gateway
Requires-Dist: fastapi>=0.115; extra == 'gateway'
Requires-Dist: starlette>=0.40; extra == 'gateway'
Requires-Dist: uvicorn[standard]>=0.30; extra == 'gateway'
Description-Content-Type: text/markdown

<p align="center">
  <h1 align="center">llmstack</h1>
  <p align="center"><strong>One command. Full LLM stack. Zero config.</strong></p>
  <p align="center">Stop wiring Docker containers. Start building AI apps.</p>
</p>

<p align="center">
  <a href="https://pypi.org/project/llmstack-cli/"><img src="https://img.shields.io/pypi/v/llmstack-cli?color=blue" alt="PyPI"></a>
  <a href="https://github.com/mara-werils/llmstack/actions/workflows/ci.yml"><img src="https://github.com/mara-werils/llmstack/actions/workflows/ci.yml/badge.svg" alt="CI"></a>
  <a href="https://github.com/mara-werils/llmstack/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-Apache%202.0-green" alt="License"></a>
  <a href="https://www.python.org/"><img src="https://img.shields.io/badge/python-3.11+-blue" alt="Python"></a>
</p>

---

**llmstack** spins up a production-grade LLM stack locally with a single command. It auto-detects your hardware, picks the optimal inference backend, and wires everything together.

```bash
pip install llmstack-cli
llmstack init
llmstack up
```

That's it. You now have a full LLM API running locally.

## Architecture

```
                llmstack up
                     |
          +----------v----------+
          |   Hardware Detect   |
          | NVIDIA / Apple / CPU|
          +----------+----------+
                     |
     +-------+-------+-------+-------+
     |       |       |       |       |
+----v--+ +--v---+ +v-----+ +v----+ +v--------+
|Qdrant | |Redis | |Ollama| | TEI | | Gateway |
|Vector | |Cache | |  or  | |Embed| | FastAPI |
|  DB   | |      | | vLLM | |     | |  OpenAI |
+-------+ +------+ +------+ +-----+ |compatible|
  :6333     :6379   :11434   :8002  +----+-----+
                                         |:8000
                                    +----v------+
                                    |Prometheus |
                                    | + Grafana |
                                    +-----------+
                                        :8080
```

## What you get

| Layer | Service | Default | Port |
|-------|---------|---------|------|
| Inference | Ollama / vLLM (auto) | llama3.2 | 11434 |
| Embeddings | TEI / Ollama (auto) | bge-m3 | 8002 |
| Vector DB | Qdrant | - | 6333 |
| Cache | Redis | 256MB LRU | 6379 |
| API Gateway | FastAPI (OpenAI-compatible) | auth + rate limit | 8000 |
| Dashboard | Grafana + Prometheus | pre-built panels | 8080 |

## How it works

```
llmstack init          # Detects hardware, generates llmstack.yaml
                       # Picks optimal backend: vLLM for NVIDIA 16GB+, Ollama otherwise

llmstack up            # Boots services in order with health checks:
                       # Qdrant -> Redis -> Inference -> Embeddings -> Gateway -> Metrics

llmstack status        # Shows health of all running services
llmstack logs ollama   # Stream inference logs
llmstack down          # Stops everything
```

### Use the API

```bash
curl http://localhost:8000/v1/chat/completions \
  -H "Authorization: Bearer YOUR_KEY" \
  -H "Content-Type: application/json" \
  -d '{"model":"llama3.2","messages":[{"role":"user","content":"Hello!"}]}'
```

Works with **any OpenAI-compatible client**: LangChain, LlamaIndex, Vercel AI SDK, openai-python.

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="YOUR_KEY")
response = client.chat.completions.create(
    model="llama3.2",
    messages=[{"role": "user", "content": "Explain quantum computing"}]
)
```

## Auto hardware detection

| Your hardware | Backend | Why |
|---|---|---|
| NVIDIA GPU 16GB+ VRAM | vLLM | Max throughput, PagedAttention |
| NVIDIA GPU <16GB | Ollama | Lower memory overhead |
| Apple Silicon (M1-M4) | Ollama | Metal acceleration |
| CPU only | Ollama | GGUF quantized models |
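
The table boils down to one short rule. A rough sketch of that decision, with an invented helper name (the shipped logic lives in `llmstack/core/hardware.py` and `llmstack/core/resolver.py`, which are not shown in this diff):

```python
def pick_backend(gpu_vendor: str | None, vram_gb: float) -> str:
    """Illustrative restatement of the table above, not the shipped resolver."""
    if gpu_vendor == "nvidia" and vram_gb >= 16:
        return "vllm"    # max throughput via PagedAttention
    return "ollama"      # NVIDIA <16GB, Apple Silicon (Metal), or CPU (GGUF)
```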
## Presets

```bash
llmstack init --preset chat    # Minimal: inference + cache + gateway
llmstack init --preset rag     # + Qdrant + embeddings for RAG apps
llmstack init --preset agent   # 70B model + 16K context + longer timeouts
```

## Configuration

One file: `llmstack.yaml`

```yaml
version: "1"

models:
  chat:
    name: llama3.2
    backend: auto          # auto | ollama | vllm
    context_length: 8192
  embeddings:
    name: bge-m3

services:
  vectors:
    provider: qdrant
    port: 6333
  cache:
    provider: redis
    max_memory: 256mb

gateway:
  port: 8000
  auth: api_key
  rate_limit: 100/min
  cors: ["*"]

observe:
  metrics: true
  dashboard_port: 8080
```

## CLI

| Command | Description |
|---------|-------------|
| `llmstack init [--preset]` | Create config with smart defaults |
| `llmstack up [--attach]` | Start all services |
| `llmstack down [--volumes]` | Stop and clean up |
| `llmstack status` | Health check all services |
| `llmstack logs <service>` | Stream service logs |
| `llmstack doctor` | Diagnose system issues |

## Observability

When `observe.metrics: true`, llmstack boots Prometheus + Grafana with a pre-built dashboard:

- **Request rate** per endpoint
- **Latency** p50 / p99 histograms
- **Token throughput** (input + output)
- **Error rate** (4xx / 5xx)
- **Service health** (up/down)

Access at `http://localhost:8080` (login: admin / llmstack)
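
To confirm metrics are flowing, you can hit the gateway's scrape endpoint or query Prometheus directly. The ports and metric name below are taken from the service definitions earlier in this diff (gateway on 8000, Prometheus on 9090); treat this as a sketch rather than documented output:

```bash
# Raw gateway metrics (what Prometheus scrapes every 5s)
curl http://localhost:8000/metrics

# The same query the "Request Rate" dashboard panel uses
curl -G 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=rate(llmstack_requests_total[1m])'
```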
## Plugins

Extend llmstack with new backends via pip:

```bash
pip install llmstack-cli-plugin-chromadb
# Now: vectors.provider: chromadb in llmstack.yaml
```

Create your own: implement `ServiceBase`, register via entry_points. See [CONTRIBUTING.md](CONTRIBUTING.md).
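
As a rough illustration of what that involves: the class, image, and endpoint below are invented, and only the `ServiceBase` shape visible elsewhere in this diff (`name`, `category`, `container_spec()`, `health_url()`) plus the `llmstack.services` entry-point group used by the registry are taken from the package:

```python
# my_plugin/service.py (hypothetical third-party backend)
from typing import Any

from llmstack.services.base import ServiceBase


class ChromaDBService(ServiceBase):
    name = "chromadb"          # value you would put under vectors.provider
    category = "vectordb"

    def __init__(self, config):
        self.config = config

    def container_spec(self) -> dict[str, Any]:
        return {
            "image": "chromadb/chroma:latest",
            "name": "llmstack-chromadb",
            "ports": {"8000/tcp": self.config.port},
            "volumes": {},
            "environment": {},
        }

    def health_url(self) -> str:
        return f"http://localhost:{self.config.port}/api/v1/heartbeat"
```

Registered through the entry-point group that `ServiceRegistry._load_plugins()` scans:

```toml
# pyproject.toml of the plugin package (hypothetical)
[project.entry-points."llmstack.services"]
chromadb = "my_plugin.service:ChromaDBService"
```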
## Why llmstack?

| | llmstack | Ollama | Harbor | AnythingLLM | LiteLLM |
|---|---|---|---|---|---|
| One-command full stack | Yes | No (inference only) | Partial | Partial | No (proxy only) |
| Auto hardware detection | Yes | No | No | No | No |
| OpenAI-compatible API | Yes | Yes | Varies | No | Yes |
| Built-in vector DB | Yes | No | Config needed | Bundled | No |
| Built-in embeddings | Yes | No | No | Bundled | No |
| Caching (Redis) | Yes | No | No | No | No |
| Auth + rate limiting | Yes | No | No | Yes | Yes |
| Observability dashboard | Yes | No | Partial | No | Partial |
| Plugin ecosystem | Yes | No | No | No | No |
| SSE streaming | Yes | Yes | Yes | Yes | Yes |

## Tech stack

- **CLI**: [Typer](https://typer.tiangolo.com/) + [Rich](https://rich.readthedocs.io/)
- **Config**: [Pydantic v2](https://docs.pydantic.dev/)
- **Gateway**: [FastAPI](https://fastapi.tiangolo.com/)
- **Containers**: [Docker SDK for Python](https://docker-py.readthedocs.io/)
- **Metrics**: Prometheus + Grafana

## Requirements

- Python 3.11+
- Docker

## Contributing

See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup and guidelines.

## License

Apache-2.0

**llmstack_cli-0.1.0.dist-info/RECORD**

@@ -0,0 +1,62 @@

llmstack/__init__.py,sha256=eeHMsyABNyNRnVgYQqyU_R4RZpbJ7Kz3HzPWsisW5zo,84
llmstack/__main__.py,sha256=XhknKG6tlsgyslTC57MpF3IGUu_cIo6G5rSIz4kxuyc,86
llmstack/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
llmstack/cli/app.py,sha256=qcrUcYkJApyX2JYVCFsu2wXZx81b0ma5aBXRNVNukSw,2378
llmstack/cli/console.py,sha256=qnwJ2g39BwXhA_WhGtJW5E1ZdbTau4npG9j9mBo2zX0,259
llmstack/cli/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
llmstack/cli/commands/doctor.py,sha256=71PM5uG-a8A4nNf4ywMXmVIL8w6W5MwLgLco9DxbnG8,2440
llmstack/cli/commands/down.py,sha256=51XiP8n_Nq3oX4gbeLd_vCvXB1OP7dZEi3392eYgSJM,752
llmstack/cli/commands/init.py,sha256=1olPvaPLapdSkuSBVIum4kzXh2BMtkRw-HNZKs1slKc,2311
llmstack/cli/commands/logs.py,sha256=CGZO9kFiqc224LtpYH22uf0k5E26sCmAucrozjtb7BQ,840
llmstack/cli/commands/status.py,sha256=ahnhKsw0C6-x2umOVUNsCnTU5XQS-nGIjKOFxEZVjdA,1382
llmstack/cli/commands/up.py,sha256=plZvcqZfA5eIVIDu8n6iMIum3jN-QAhhHfLR4JRFomg,854
llmstack/config/__init__.py,sha256=7D4wkOUIE-BmuMJPBUg_d_g3QRpSKEvHC3mfdxBb_Zw,136
llmstack/config/loader.py,sha256=d4XnRBMoGGF7tvQRMlbFQE-8za2SBkElEdrjTXvLAVk,1375
llmstack/config/schema.py,sha256=p9dwVFSoE0tECuUzvtAoRKv3aLPY8bdpT1ouf5hsTCg,2133
llmstack/config/presets/__init__.py,sha256=Z1JX3ZGvQHWZLYbY7yb-ntcg9_-wB4AMmYiXO1epzZs,317
llmstack/config/presets/agent.py,sha256=VOIg088i_CR9Dv7JXPgQvQ84xSUE0IJ1V-38XyJ1r0Q,449
llmstack/config/presets/chat.py,sha256=OoJ_MqSFlbPa_nUeAvyUU3O1OsrfBW95Gw9bXjvoV38,385
llmstack/config/presets/rag.py,sha256=_aisSEegOHDCaieNKamoHJxdW7CG2N9T48qEWQjiG4I,340
llmstack/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
llmstack/core/hardware.py,sha256=dO2VRD2OZ4BnDETkHBRw1YeTERz6XhFByQpy8z7-jWc,3844
llmstack/core/health.py,sha256=BxH-Y1DsoDFc0zNxYVTdYEmKTC1dIWiBxhCLDLbUyr0,706
llmstack/core/resolver.py,sha256=MPUNLAoe0U_htLtMy789RbBir6xljWnw9p0AGOHYrAc,1633
llmstack/core/stack.py,sha256=v9kPijvUm5IF-B608b0fUw71wDv14vg0K1ovffSuHf4,8310
llmstack/docker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
llmstack/docker/manager.py,sha256=OzfY8H46dEw7acSe8sUglayo4hgxO0tj3SxEC47Xmmg,4728
llmstack/gateway/Dockerfile,sha256=Mtiy9fcfK1DikZ6v-I4zBI_Op1pABVm1nYZ-LT8J9fg,308
llmstack/gateway/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
llmstack/gateway/main.py,sha256=PYT48w4MuBNCqJ3YTq27OF_yC0dRRpnC61bpos1aveY,1483
llmstack/gateway/proxy.py,sha256=WLErhSUICVK1XWu6Qt9v5gBWYtwalP3LKJE_D5Dj6C4,2077
llmstack/gateway/middleware/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
llmstack/gateway/middleware/auth.py,sha256=k49XxcGuoZEEf_upkvrbhev5K0Y9ZZiyvJU8ibx_-Sw,1088
llmstack/gateway/middleware/metrics.py,sha256=nFeEyF5kBDbd9PCgMrt1-FAPZxXiLS4WMvP8tEnq-io,4387
llmstack/gateway/routes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
llmstack/gateway/routes/chat.py,sha256=qVbXYRdiQihawMF0qxzVeL7Ud5O-UyPjQjmOwEbN4H8,837
llmstack/gateway/routes/embeddings.py,sha256=H8FCv0cPsgwAMejC6ytCHXwy88vGktIMH_sKimEMoUk,453
llmstack/gateway/routes/health.py,sha256=K4EGOMVVMvOfAJDGEOFrVOvnT5OiZkH2U_u1WqdWP08,1449
llmstack/gateway/routes/models.py,sha256=0jBHyop_HJyArvKXKOZKZwzj6rVCeuLZhj6AdIR8Hwc,353
llmstack/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
llmstack/plugins/loader.py,sha256=cg_aldCDSNJWM_Es7iUI-1ycwJwhzNLf9G0CYJrw8Ow,152
llmstack/plugins/spec.py,sha256=nteKuPgDjok8b1rSKsN6rFVXGjhYwwfuRpiubKEIfqc,546
llmstack/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
llmstack/services/base.py,sha256=qe2c6uZbSwaaItvkEq4T9kxujOGEgS2KL0giKrea1g8,1888
llmstack/services/registry.py,sha256=pbDscM_YezzKMikuHqbO7cuekmiP84IyOKmwctkeAow,1855
llmstack/services/cache/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
llmstack/services/cache/redis.py,sha256=_6_JlFxFEvVutkmFwo-qzola6heODG_fxrfGVt0x-1k,898
llmstack/services/embeddings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
llmstack/services/embeddings/tei.py,sha256=GNJL6PxQVUu7FQEzR0jj1N2jDCkhZoO1ezKLcEdo5EI,1502
llmstack/services/gateway/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
llmstack/services/gateway/service.py,sha256=4PSED82DHUlkqP4FxL-lNxY5kO1E-ODTN-x6msEY3HE,1580
llmstack/services/inference/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
llmstack/services/inference/ollama.py,sha256=0A-4V3Ks3fiyQwnJkBhLtj9oIvKAYPK-QgsbcAsuqFI,1814
llmstack/services/inference/vllm.py,sha256=fhi7xa2RitNG0FNnhypUYfjEemvylnnxOIksRfBzJcE,1607
llmstack/services/observe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
llmstack/services/observe/prometheus.py,sha256=6NG8yDY2JQqYCWtSldb6LEKbOqLdHiPB9u9G0c2dJpw,5343
llmstack/services/vectordb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
llmstack/services/vectordb/qdrant.py,sha256=zIOqf5km-4pItXOW8Y-qdxHLiU9lFfhDz33nBrTgPhM,900
llmstack_cli-0.1.0.dist-info/METADATA,sha256=aWalxTiNMqv0lz5GjICnaOhTC1QgYXRR8Tq7Jyk4LQE,8128
llmstack_cli-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
llmstack_cli-0.1.0.dist-info/entry_points.txt,sha256=i2BIacwqAqUaN1yAe-MaJZ22unHqAAUkTopk9M_iZPo,50
llmstack_cli-0.1.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
llmstack_cli-0.1.0.dist-info/RECORD,,