llmstack-cli 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmstack_cli-0.2.0/.github/ISSUE_TEMPLATE/bug_report.yml +38 -0
- llmstack_cli-0.2.0/.github/ISSUE_TEMPLATE/feature_request.yml +23 -0
- llmstack_cli-0.2.0/.github/PULL_REQUEST_TEMPLATE.md +17 -0
- llmstack_cli-0.2.0/.github/SECURITY.md +17 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/CHANGELOG.md +13 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/PKG-INFO +102 -44
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/README.md +101 -43
- llmstack_cli-0.2.0/assets/social-preview.png +0 -0
- llmstack_cli-0.2.0/demo.gif +0 -0
- llmstack_cli-0.2.0/demo.tape +51 -0
- llmstack_cli-0.2.0/examples/output.txt +86 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/pyproject.toml +4 -1
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/__init__.py +1 -1
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/app.py +18 -0
- llmstack_cli-0.2.0/src/llmstack/cli/commands/chat.py +104 -0
- llmstack_cli-0.2.0/src/llmstack/cli/commands/export.py +190 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/core/stack.py +12 -1
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/docker/manager.py +23 -7
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/Dockerfile +3 -1
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/gateway/service.py +16 -1
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/observe/prometheus.py +76 -23
- llmstack_cli-0.2.0/tests/unit/test_export.py +65 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/tests/unit/test_services.py +21 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/.github/workflows/ci.yml +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/.github/workflows/release.yml +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/.gitignore +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/CONTRIBUTING.md +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/LICENSE +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/Makefile +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/__main__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/commands/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/commands/doctor.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/commands/down.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/commands/init.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/commands/logs.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/commands/status.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/commands/up.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/console.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/config/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/config/loader.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/config/presets/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/config/presets/agent.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/config/presets/chat.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/config/presets/rag.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/config/schema.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/core/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/core/hardware.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/core/health.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/core/resolver.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/docker/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/main.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/middleware/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/middleware/auth.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/middleware/metrics.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/proxy.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/routes/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/routes/chat.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/routes/embeddings.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/routes/health.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/routes/models.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/plugins/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/plugins/loader.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/plugins/spec.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/base.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/cache/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/cache/redis.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/embeddings/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/embeddings/tei.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/gateway/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/inference/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/inference/ollama.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/inference/vllm.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/observe/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/registry.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/vectordb/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/vectordb/qdrant.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/tests/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/tests/unit/__init__.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/tests/unit/test_config.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/tests/unit/test_gateway.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/tests/unit/test_hardware.py +0 -0
- {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/tests/unit/test_resolver.py +0 -0
.github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,38 @@
+name: Bug Report
+description: Report a bug or unexpected behavior
+labels: ["bug"]
+body:
+  - type: textarea
+    id: description
+    attributes:
+      label: What happened?
+      description: A clear description of the bug.
+    validations:
+      required: true
+  - type: textarea
+    id: reproduce
+    attributes:
+      label: Steps to reproduce
+      description: Minimal steps to reproduce the issue.
+      placeholder: |
+        1. llmstack init --preset rag
+        2. llmstack up
+        3. ...
+    validations:
+      required: true
+  - type: textarea
+    id: expected
+    attributes:
+      label: Expected behavior
+      description: What did you expect to happen?
+  - type: textarea
+    id: environment
+    attributes:
+      label: Environment
+      description: Paste the output of `llmstack doctor`
+      render: shell
+  - type: input
+    id: version
+    attributes:
+      label: llmstack version
+      placeholder: "0.1.0"
.github/ISSUE_TEMPLATE/feature_request.yml
@@ -0,0 +1,23 @@
+name: Feature Request
+description: Suggest a new feature or improvement
+labels: ["enhancement"]
+body:
+  - type: textarea
+    id: problem
+    attributes:
+      label: Problem
+      description: What problem does this solve? What's your use case?
+    validations:
+      required: true
+  - type: textarea
+    id: solution
+    attributes:
+      label: Proposed solution
+      description: How would you like this to work?
+    validations:
+      required: true
+  - type: textarea
+    id: alternatives
+    attributes:
+      label: Alternatives considered
+      description: Any other approaches you've considered?
.github/SECURITY.md
@@ -0,0 +1,17 @@
+# Security Policy
+
+## Reporting a Vulnerability
+
+If you discover a security vulnerability, please report it responsibly:
+
+1. **Do not** open a public issue
+2. Email: [open a private security advisory](https://github.com/mara-werils/llmstack/security/advisories/new)
+3. Include steps to reproduce and potential impact
+
+We will respond within 48 hours and aim to release a fix within 7 days.
+
+## Supported Versions
+
+| Version | Supported |
+|---------|-----------|
+| 0.1.x   | Yes       |
CHANGELOG.md
@@ -1,5 +1,18 @@
 # Changelog
 
+## [0.2.0] - 2026-05-07
+
+### Added
+- `llmstack chat` — interactive terminal chat with streaming responses
+- `llmstack export` — generate standalone docker-compose.yml from llmstack.yaml
+- GitHub issue templates, PR template, security policy
+
+### Fixed
+- Gateway Docker image now builds locally (no longer requires ghcr.io)
+- Prometheus and Grafana configs are written to disk before container start
+- Generated API keys persist to llmstack.yaml across restarts
+- Clear error messages for port conflicts
+
 ## [0.1.0] - 2026-05-07
 
 ### Added
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmstack-cli
-Version: 0.1.0
+Version: 0.2.0
 Summary: One command. Full LLM stack. Zero config.
 Author: mara-werils
 License-Expression: Apache-2.0
@@ -44,21 +44,43 @@ Description-Content-Type: text/markdown
 <a href="https://github.com/mara-werils/llmstack/actions/workflows/ci.yml"><img src="https://github.com/mara-werils/llmstack/actions/workflows/ci.yml/badge.svg" alt="CI"></a>
 <a href="https://github.com/mara-werils/llmstack/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-Apache%202.0-green" alt="License"></a>
 <a href="https://www.python.org/"><img src="https://img.shields.io/badge/python-3.11+-blue" alt="Python"></a>
+<a href="https://github.com/mara-werils/llmstack/stargazers"><img src="https://img.shields.io/github/stars/mara-werils/llmstack?style=social" alt="Stars"></a>
 </p>
 
 ---
 
-
+<p align="center">
+  <img src="demo.gif" alt="llmstack demo" width="800">
+</p>
+
+## Quick Start
 
 ```bash
 pip install llmstack-cli
-llmstack init
+llmstack init --preset rag
 llmstack up
 ```
 
-That's it. You now have a full LLM API running locally.
+That's it. You now have **7 services** running: inference, embeddings, vector DB, cache, API gateway, Prometheus, and Grafana.
+
+```bash
+# Test it immediately
+curl http://localhost:8000/v1/chat/completions \
+  -H "Authorization: Bearer YOUR_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"model":"llama3.2","messages":[{"role":"user","content":"Hello!"}]}'
+```
+
+Works with **any OpenAI-compatible client**: LangChain, LlamaIndex, Vercel AI SDK, openai-python.
+
+## Who is this for?
 
-
+- **AI app developers** who want local inference without Docker boilerplate
+- **Teams** who need an OpenAI-compatible API backed by local models
+- **Hobbyists** running LLMs locally who want vector search, caching, and monitoring out of the box
+- **Anyone** tired of writing 200+ lines of docker-compose.yml every time
+
+## What you get
 
 ```
 llmstack up
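The Quick Start added above ends with a raw curl call; the same gateway endpoint accepts any OpenAI-style SDK once the base URL is overridden. Below is a minimal sketch using LangChain, one of the clients the README names. The `langchain-openai` package and the assumption that the `YOUR_KEY` placeholder maps to the key llmstack writes into llmstack.yaml are illustrative assumptions, not something this diff verifies.

```python
# Sketch: pointing a LangChain chat model at the llmstack gateway.
# Assumes `pip install langchain-openai` and a running `llmstack up` stack;
# the key and model name are the placeholders used in the README above.
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    base_url="http://localhost:8000/v1",  # llmstack gateway, not api.openai.com
    api_key="YOUR_KEY",                   # key generated into llmstack.yaml
    model="llama3.2",
)

print(llm.invoke("Say hello in one sentence.").content)
```

LlamaIndex, the Vercel AI SDK, and plain openai-python should follow the same pattern: keep the model name, swap the base URL and key.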
@@ -84,8 +106,6 @@ That's it. You now have a full LLM API running locally.
 :8080
 ```
 
-## What you get
-
 | Layer | Service | Default | Port |
 |-------|---------|---------|------|
 | Inference | Ollama / vLLM (auto) | llama3.2 | 11434 |
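The table restored in this hunk lists an embeddings layer alongside inference, and the file list above shows a gateway route for it (src/llmstack/gateway/routes/embeddings.py). A small sketch of calling that layer through the gateway with openai-python follows; the standard `/v1/embeddings` path and the `bge-m3` model name (taken from examples/output.txt) are assumptions, not guarantees of this release.

```python
# Sketch: calling the stack's embeddings layer through the gateway.
# Assumes the gateway proxies the standard OpenAI /v1/embeddings route and
# that the TEI model is addressed as "bge-m3" (the name shown in
# examples/output.txt); both are assumptions for illustration.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="YOUR_KEY")
resp = client.embeddings.create(model="bge-m3", input=["quantum computing"])
print(len(resp.data[0].embedding))  # dimensionality of the returned vector
```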
@@ -97,7 +117,7 @@ That's it. You now have a full LLM API running locally.
 
 ## How it works
 
-```
+```bash
 llmstack init        # Detects hardware, generates llmstack.yaml
                      # Picks optimal backend: vLLM for NVIDIA 16GB+, Ollama otherwise
 
@@ -109,27 +129,6 @@ llmstack logs ollama # Stream inference logs
 llmstack down        # Stops everything
 ```
 
-### Use the API
-
-```bash
-curl http://localhost:8000/v1/chat/completions \
-  -H "Authorization: Bearer YOUR_KEY" \
-  -H "Content-Type: application/json" \
-  -d '{"model":"llama3.2","messages":[{"role":"user","content":"Hello!"}]}'
-```
-
-Works with **any OpenAI-compatible client**: LangChain, LlamaIndex, Vercel AI SDK, openai-python.
-
-```python
-from openai import OpenAI
-
-client = OpenAI(base_url="http://localhost:8000/v1", api_key="YOUR_KEY")
-response = client.chat.completions.create(
-    model="llama3.2",
-    messages=[{"role": "user", "content": "Explain quantum computing"}]
-)
-```
-
 ## Auto hardware detection
 
 | Your hardware | Backend | Why |
@@ -181,6 +180,50 @@ observe:
   dashboard_port: 8080
 ```
 
+## Interactive Chat
+
+```bash
+llmstack chat
+```
+
+```
+LLMStack Chat — model: llama3.2
+Type 'exit' or Ctrl+C to quit. '/clear' to reset conversation.
+
+You: What is quantum computing?
+Assistant: Quantum computing uses quantum mechanical phenomena like
+superposition and entanglement to process information...
+
+You: /clear
+Conversation cleared.
+```
+
+Streaming responses, conversation history, works with any model in your stack.
+
+## Export to Docker Compose
+
+Don't want to install llmstack? Generate a standalone `docker-compose.yml`:
+
+```bash
+llmstack export
+# Exported 7 services to docker-compose.yml
+# Run with: docker compose up -d
+```
+
+Share the generated file with your team — no llmstack dependency required.
+
+## Use the API
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="YOUR_KEY")
+response = client.chat.completions.create(
+    model="llama3.2",
+    messages=[{"role": "user", "content": "Explain quantum computing"}]
+)
+```
+
 ## CLI
 
 | Command | Description |
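The new Interactive Chat section above advertises streaming responses, and the same behaviour is reachable from code, since the gateway exposes the OpenAI chat protocol. A minimal sketch follows, assuming the gateway forwards `stream=True` requests as standard SSE chunks the way the openai-python client expects; `YOUR_KEY` and `llama3.2` are the README's placeholders.

```python
# Sketch: streaming a completion from the llmstack gateway token by token,
# roughly what the `llmstack chat` command does in the terminal.
# Assumption: the gateway relays OpenAI-style streaming chunks unchanged.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="YOUR_KEY")
stream = client.chat.completions.create(
    model="llama3.2",
    messages=[{"role": "user", "content": "Explain quantum computing"}],
    stream=True,
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()
```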
@@ -189,6 +232,8 @@ observe:
 | `llmstack up [--attach]` | Start all services |
 | `llmstack down [--volumes]` | Stop and clean up |
 | `llmstack status` | Health check all services |
+| `llmstack chat [--model]` | Interactive terminal chat |
+| `llmstack export [--output]` | Generate docker-compose.yml |
 | `llmstack logs <service>` | Stream service logs |
 | `llmstack doctor` | Diagnose system issues |
 
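The CLI table now lists `llmstack export [--output]`, which writes a standalone docker-compose.yml. A quick sketch for sanity-checking that file after export; it only assumes the output is ordinary Compose YAML with a top-level `services` mapping (the generated file's exact contents are not shown in this diff) and that PyYAML is installed.

```python
# Sketch: verifying how many services `llmstack export` wrote out.
# Assumption: the export produces standard Compose YAML with a "services" key.
import yaml  # pip install pyyaml

with open("docker-compose.yml") as f:
    compose = yaml.safe_load(f)

services = compose.get("services", {})
print(f"{len(services)} services:", ", ".join(sorted(services)))
```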
@@ -204,6 +249,34 @@ When `observe.metrics: true`, llmstack boots Prometheus + Grafana with a pre-built
 
 Access at `http://localhost:8080` (login: admin / llmstack)
 
+## Why not just Docker Compose?
+
+Here's what llmstack replaces:
+
+```yaml
+# Without llmstack: ~200 lines of docker-compose.yml
+# You have to configure each service, write health checks,
+# set up networking, manage GPU passthrough, create Prometheus
+# scrape configs, provision Grafana dashboards...
+
+# With llmstack:
+llmstack init && llmstack up
+```
+
+## Comparison
+
+| | llmstack | Ollama | LocalAI | AnythingLLM | LiteLLM |
+|---|---|---|---|---|---|
+| One-command full stack | **Yes** | No (inference only) | No | Partial | No (proxy only) |
+| Auto hardware detection | **Yes** | No | No | No | No |
+| OpenAI-compatible API | **Yes** | Yes | Yes | No | Yes |
+| Built-in vector DB | **Yes** | No | No | Bundled | No |
+| Built-in embeddings | **Yes** | No | No | Bundled | No |
+| Caching (Redis) | **Yes** | No | No | No | No |
+| Auth + rate limiting | **Yes** | No | No | Yes | Yes |
+| Observability dashboard | **Yes** | No | Partial | No | Partial |
+| Plugin ecosystem | **Yes** | No | No | No | No |
+
 ## Plugins
 
 Extend llmstack with new backends via pip:
@@ -215,21 +288,6 @@ pip install llmstack-cli-plugin-chromadb
 
 Create your own: implement `ServiceBase`, register via entry_points. See [CONTRIBUTING.md](CONTRIBUTING.md).
 
-## Why llmstack?
-
-| | llmstack | Ollama | Harbor | AnythingLLM | LiteLLM |
-|---|---|---|---|---|---|
-| One-command full stack | Yes | No (inference only) | Partial | Partial | No (proxy only) |
-| Auto hardware detection | Yes | No | No | No | No |
-| OpenAI-compatible API | Yes | Yes | Varies | No | Yes |
-| Built-in vector DB | Yes | No | Config needed | Bundled | No |
-| Built-in embeddings | Yes | No | No | Bundled | No |
-| Caching (Redis) | Yes | No | No | No | No |
-| Auth + rate limiting | Yes | No | No | Yes | Yes |
-| Observability dashboard | Yes | No | Partial | No | Partial |
-| Plugin ecosystem | Yes | No | No | No | No |
-| SSE streaming | Yes | Yes | Yes | Yes | Yes |
-
 ## Tech stack
 
 - **CLI**: [Typer](https://typer.tiangolo.com/) + [Rich](https://rich.readthedocs.io/)
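The Plugins section kept in this hunk says plugins implement `ServiceBase` and register via entry_points. The real loader (src/llmstack/plugins/loader.py) is unchanged in this release and not shown here, so the sketch below only illustrates the generic entry-point discovery pattern; the group name `llmstack.plugins` is a placeholder for illustration, not necessarily the project's actual group.

```python
# Sketch: how entry_points-based plugin discovery generally works on Python 3.11+.
# Assumption: the group name "llmstack.plugins" is hypothetical; the actual
# loader and ServiceBase contract live in this package's source, not this diff.
from importlib.metadata import entry_points

for ep in entry_points(group="llmstack.plugins"):
    plugin_cls = ep.load()  # e.g. a ServiceBase subclass provided by the plugin
    print(f"discovered plugin: {ep.name} -> {plugin_cls.__qualname__}")
```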
README.md
@@ -9,21 +9,43 @@
 <a href="https://github.com/mara-werils/llmstack/actions/workflows/ci.yml"><img src="https://github.com/mara-werils/llmstack/actions/workflows/ci.yml/badge.svg" alt="CI"></a>
 <a href="https://github.com/mara-werils/llmstack/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-Apache%202.0-green" alt="License"></a>
 <a href="https://www.python.org/"><img src="https://img.shields.io/badge/python-3.11+-blue" alt="Python"></a>
+<a href="https://github.com/mara-werils/llmstack/stargazers"><img src="https://img.shields.io/github/stars/mara-werils/llmstack?style=social" alt="Stars"></a>
 </p>
 
 ---
 
-
+<p align="center">
+  <img src="demo.gif" alt="llmstack demo" width="800">
+</p>
+
+## Quick Start
 
 ```bash
 pip install llmstack-cli
-llmstack init
+llmstack init --preset rag
 llmstack up
 ```
 
-That's it. You now have a full LLM API running locally.
+That's it. You now have **7 services** running: inference, embeddings, vector DB, cache, API gateway, Prometheus, and Grafana.
+
+```bash
+# Test it immediately
+curl http://localhost:8000/v1/chat/completions \
+  -H "Authorization: Bearer YOUR_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"model":"llama3.2","messages":[{"role":"user","content":"Hello!"}]}'
+```
+
+Works with **any OpenAI-compatible client**: LangChain, LlamaIndex, Vercel AI SDK, openai-python.
+
+## Who is this for?
 
-
+- **AI app developers** who want local inference without Docker boilerplate
+- **Teams** who need an OpenAI-compatible API backed by local models
+- **Hobbyists** running LLMs locally who want vector search, caching, and monitoring out of the box
+- **Anyone** tired of writing 200+ lines of docker-compose.yml every time
+
+## What you get
 
 ```
 llmstack up
@@ -49,8 +71,6 @@ That's it. You now have a full LLM API running locally.
 :8080
 ```
 
-## What you get
-
 | Layer | Service | Default | Port |
 |-------|---------|---------|------|
 | Inference | Ollama / vLLM (auto) | llama3.2 | 11434 |
@@ -62,7 +82,7 @@ That's it. You now have a full LLM API running locally.
 
 ## How it works
 
-```
+```bash
 llmstack init        # Detects hardware, generates llmstack.yaml
                      # Picks optimal backend: vLLM for NVIDIA 16GB+, Ollama otherwise
 
@@ -74,27 +94,6 @@ llmstack logs ollama # Stream inference logs
 llmstack down        # Stops everything
 ```
 
-### Use the API
-
-```bash
-curl http://localhost:8000/v1/chat/completions \
-  -H "Authorization: Bearer YOUR_KEY" \
-  -H "Content-Type: application/json" \
-  -d '{"model":"llama3.2","messages":[{"role":"user","content":"Hello!"}]}'
-```
-
-Works with **any OpenAI-compatible client**: LangChain, LlamaIndex, Vercel AI SDK, openai-python.
-
-```python
-from openai import OpenAI
-
-client = OpenAI(base_url="http://localhost:8000/v1", api_key="YOUR_KEY")
-response = client.chat.completions.create(
-    model="llama3.2",
-    messages=[{"role": "user", "content": "Explain quantum computing"}]
-)
-```
-
 ## Auto hardware detection
 
 | Your hardware | Backend | Why |
@@ -146,6 +145,50 @@ observe:
   dashboard_port: 8080
 ```
 
+## Interactive Chat
+
+```bash
+llmstack chat
+```
+
+```
+LLMStack Chat — model: llama3.2
+Type 'exit' or Ctrl+C to quit. '/clear' to reset conversation.
+
+You: What is quantum computing?
+Assistant: Quantum computing uses quantum mechanical phenomena like
+superposition and entanglement to process information...
+
+You: /clear
+Conversation cleared.
+```
+
+Streaming responses, conversation history, works with any model in your stack.
+
+## Export to Docker Compose
+
+Don't want to install llmstack? Generate a standalone `docker-compose.yml`:
+
+```bash
+llmstack export
+# Exported 7 services to docker-compose.yml
+# Run with: docker compose up -d
+```
+
+Share the generated file with your team — no llmstack dependency required.
+
+## Use the API
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="YOUR_KEY")
+response = client.chat.completions.create(
+    model="llama3.2",
+    messages=[{"role": "user", "content": "Explain quantum computing"}]
+)
+```
+
 ## CLI
 
 | Command | Description |
@@ -154,6 +197,8 @@ observe:
 | `llmstack up [--attach]` | Start all services |
 | `llmstack down [--volumes]` | Stop and clean up |
 | `llmstack status` | Health check all services |
+| `llmstack chat [--model]` | Interactive terminal chat |
+| `llmstack export [--output]` | Generate docker-compose.yml |
 | `llmstack logs <service>` | Stream service logs |
 | `llmstack doctor` | Diagnose system issues |
 
@@ -169,6 +214,34 @@ When `observe.metrics: true`, llmstack boots Prometheus + Grafana with a pre-built
 
 Access at `http://localhost:8080` (login: admin / llmstack)
 
+## Why not just Docker Compose?
+
+Here's what llmstack replaces:
+
+```yaml
+# Without llmstack: ~200 lines of docker-compose.yml
+# You have to configure each service, write health checks,
+# set up networking, manage GPU passthrough, create Prometheus
+# scrape configs, provision Grafana dashboards...
+
+# With llmstack:
+llmstack init && llmstack up
+```
+
+## Comparison
+
+| | llmstack | Ollama | LocalAI | AnythingLLM | LiteLLM |
+|---|---|---|---|---|---|
+| One-command full stack | **Yes** | No (inference only) | No | Partial | No (proxy only) |
+| Auto hardware detection | **Yes** | No | No | No | No |
+| OpenAI-compatible API | **Yes** | Yes | Yes | No | Yes |
+| Built-in vector DB | **Yes** | No | No | Bundled | No |
+| Built-in embeddings | **Yes** | No | No | Bundled | No |
+| Caching (Redis) | **Yes** | No | No | No | No |
+| Auth + rate limiting | **Yes** | No | No | Yes | Yes |
+| Observability dashboard | **Yes** | No | Partial | No | Partial |
+| Plugin ecosystem | **Yes** | No | No | No | No |
+
 ## Plugins
 
 Extend llmstack with new backends via pip:
@@ -180,21 +253,6 @@ pip install llmstack-cli-plugin-chromadb
 
 Create your own: implement `ServiceBase`, register via entry_points. See [CONTRIBUTING.md](CONTRIBUTING.md).
 
-## Why llmstack?
-
-| | llmstack | Ollama | Harbor | AnythingLLM | LiteLLM |
-|---|---|---|---|---|---|
-| One-command full stack | Yes | No (inference only) | Partial | Partial | No (proxy only) |
-| Auto hardware detection | Yes | No | No | No | No |
-| OpenAI-compatible API | Yes | Yes | Varies | No | Yes |
-| Built-in vector DB | Yes | No | Config needed | Bundled | No |
-| Built-in embeddings | Yes | No | No | Bundled | No |
-| Caching (Redis) | Yes | No | No | No | No |
-| Auth + rate limiting | Yes | No | No | Yes | Yes |
-| Observability dashboard | Yes | No | Partial | No | Partial |
-| Plugin ecosystem | Yes | No | No | No | No |
-| SSE streaming | Yes | Yes | Yes | Yes | Yes |
-
 ## Tech stack
 
 - **CLI**: [Typer](https://typer.tiangolo.com/) + [Rich](https://rich.readthedocs.io/)
assets/social-preview.png: Binary file (added)
demo.gif: Binary file (added)
demo.tape
@@ -0,0 +1,51 @@
+# VHS tape for recording llmstack demo GIF
+# Pre-requisite: cp scripts/fake-llmstack /tmp/llmstack
+# Run: vhs demo.tape
+
+Output demo.gif
+
+Set FontSize 16
+Set Width 1200
+Set Height 800
+Set Theme "Catppuccin Mocha"
+Set Padding 20
+Set TypingSpeed 40ms
+Set Shell "bash"
+
+# Set PATH so /tmp/llmstack is found, then clear screen
+Hide
+Type "export PATH=/tmp:$PATH"
+Enter
+Sleep 300ms
+Type "export PS1='$ '"
+Enter
+Sleep 300ms
+Type "clear"
+Enter
+Sleep 500ms
+Show
+
+# --- Scene 1: Show version ---
+Type "llmstack --version"
+Enter
+Sleep 1.5s
+
+# --- Scene 2: Initialize with RAG preset ---
+Type "llmstack init --preset rag"
+Enter
+Sleep 3s
+
+# --- Scene 3: Start the stack ---
+Type "llmstack up"
+Enter
+Sleep 4s
+
+# --- Scene 4: Check status ---
+Type "llmstack status"
+Enter
+Sleep 3s
+
+# --- Scene 5: Tear down ---
+Type "llmstack down"
+Enter
+Sleep 3s
examples/output.txt
@@ -0,0 +1,86 @@
+$ llmstack --version
+llmstack 0.1.0
+
+$ llmstack init --preset rag
+
+Hardware detected:
+  CPU: 10 cores
+  RAM: 32 GB
+  GPU: Apple M2 Pro (16 GB VRAM)
+
+Using preset: rag
+Backend: Ollama
+
+Created llmstack.yaml
+Next: edit the config if needed, then run llmstack up
+
+$ llmstack up
+
+Starting LLMStack...
+
+✓ qdrant      running  :6333
+✓ redis       running  :6379
+✓ ollama      running  :11434  (pulling llama3.2...)
+✓ tei         running  :8002   (loading bge-m3...)
+✓ gateway     running  :8000
+✓ prometheus  running  :9090
+✓ grafana     running  :8080
+
+Stack is ready! 7 services running.
+API:       http://localhost:8000/v1
+Dashboard: http://localhost:8080
+
+$ llmstack status
+
+LLMStack Status
+┌─────────────┬──────────┬─────────┬───────────────┐
+│ Service     │ Container│ Status  │ Ports         │
+├─────────────┼──────────┼─────────┼───────────────┤
+│ qdrant      │ a3f1..   │ running │ 6333->6333    │
+│ redis       │ b7e2..   │ running │ 6379->6379    │
+│ ollama      │ c9d4..   │ running │ 11434->11434  │
+│ tei         │ d2a8..   │ running │ 8002->8002    │
+│ gateway     │ e5c1..   │ running │ 8000->8000    │
+│ prometheus  │ f8b3..   │ running │ 9090->9090    │
+│ grafana     │ 1a7e..   │ running │ 8080->8080    │
+└─────────────┴──────────┴─────────┴───────────────┘
+
+$ curl -s http://localhost:8000/v1/chat/completions \
+  -H "Authorization: Bearer llmstack-key" \
+  -H "Content-Type: application/json" \
+  -d '{"model":"llama3.2","messages":[{"role":"user","content":"Hello!"}]}' | jq .
+{
+  "id": "chatcmpl-abc123",
+  "object": "chat.completion",
+  "created": 1715090400,
+  "model": "llama3.2",
+  "choices": [
+    {
+      "index": 0,
+      "message": {
+        "role": "assistant",
+        "content": "Hello! How can I help you today?"
+      },
+      "finish_reason": "stop"
+    }
+  ],
+  "usage": {
+    "prompt_tokens": 11,
+    "completion_tokens": 9,
+    "total_tokens": 20
+  }
+}
+
+$ llmstack down
+
+Stopping LLMStack...
+
+✓ grafana stopped
+✓ prometheus stopped
+✓ gateway stopped
+✓ tei stopped
+✓ ollama stopped
+✓ redis stopped
+✓ qdrant stopped
+
+All services stopped.