llmstack-cli 0.1.0__tar.gz → 0.2.0__tar.gz

This diff shows the content of publicly available package versions as released to their respective public registries. It is provided for informational purposes only.
Files changed (85)
  1. llmstack_cli-0.2.0/.github/ISSUE_TEMPLATE/bug_report.yml +38 -0
  2. llmstack_cli-0.2.0/.github/ISSUE_TEMPLATE/feature_request.yml +23 -0
  3. llmstack_cli-0.2.0/.github/PULL_REQUEST_TEMPLATE.md +17 -0
  4. llmstack_cli-0.2.0/.github/SECURITY.md +17 -0
  5. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/CHANGELOG.md +13 -0
  6. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/PKG-INFO +102 -44
  7. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/README.md +101 -43
  8. llmstack_cli-0.2.0/assets/social-preview.png +0 -0
  9. llmstack_cli-0.2.0/demo.gif +0 -0
  10. llmstack_cli-0.2.0/demo.tape +51 -0
  11. llmstack_cli-0.2.0/examples/output.txt +86 -0
  12. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/pyproject.toml +4 -1
  13. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/__init__.py +1 -1
  14. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/app.py +18 -0
  15. llmstack_cli-0.2.0/src/llmstack/cli/commands/chat.py +104 -0
  16. llmstack_cli-0.2.0/src/llmstack/cli/commands/export.py +190 -0
  17. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/core/stack.py +12 -1
  18. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/docker/manager.py +23 -7
  19. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/Dockerfile +3 -1
  20. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/gateway/service.py +16 -1
  21. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/observe/prometheus.py +76 -23
  22. llmstack_cli-0.2.0/tests/unit/test_export.py +65 -0
  23. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/tests/unit/test_services.py +21 -0
  24. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/.github/workflows/ci.yml +0 -0
  25. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/.github/workflows/release.yml +0 -0
  26. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/.gitignore +0 -0
  27. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/CONTRIBUTING.md +0 -0
  28. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/LICENSE +0 -0
  29. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/Makefile +0 -0
  30. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/__main__.py +0 -0
  31. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/__init__.py +0 -0
  32. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/commands/__init__.py +0 -0
  33. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/commands/doctor.py +0 -0
  34. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/commands/down.py +0 -0
  35. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/commands/init.py +0 -0
  36. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/commands/logs.py +0 -0
  37. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/commands/status.py +0 -0
  38. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/commands/up.py +0 -0
  39. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/cli/console.py +0 -0
  40. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/config/__init__.py +0 -0
  41. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/config/loader.py +0 -0
  42. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/config/presets/__init__.py +0 -0
  43. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/config/presets/agent.py +0 -0
  44. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/config/presets/chat.py +0 -0
  45. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/config/presets/rag.py +0 -0
  46. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/config/schema.py +0 -0
  47. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/core/__init__.py +0 -0
  48. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/core/hardware.py +0 -0
  49. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/core/health.py +0 -0
  50. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/core/resolver.py +0 -0
  51. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/docker/__init__.py +0 -0
  52. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/__init__.py +0 -0
  53. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/main.py +0 -0
  54. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/middleware/__init__.py +0 -0
  55. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/middleware/auth.py +0 -0
  56. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/middleware/metrics.py +0 -0
  57. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/proxy.py +0 -0
  58. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/routes/__init__.py +0 -0
  59. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/routes/chat.py +0 -0
  60. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/routes/embeddings.py +0 -0
  61. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/routes/health.py +0 -0
  62. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/gateway/routes/models.py +0 -0
  63. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/plugins/__init__.py +0 -0
  64. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/plugins/loader.py +0 -0
  65. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/plugins/spec.py +0 -0
  66. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/__init__.py +0 -0
  67. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/base.py +0 -0
  68. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/cache/__init__.py +0 -0
  69. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/cache/redis.py +0 -0
  70. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/embeddings/__init__.py +0 -0
  71. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/embeddings/tei.py +0 -0
  72. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/gateway/__init__.py +0 -0
  73. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/inference/__init__.py +0 -0
  74. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/inference/ollama.py +0 -0
  75. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/inference/vllm.py +0 -0
  76. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/observe/__init__.py +0 -0
  77. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/registry.py +0 -0
  78. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/vectordb/__init__.py +0 -0
  79. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/src/llmstack/services/vectordb/qdrant.py +0 -0
  80. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/tests/__init__.py +0 -0
  81. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/tests/unit/__init__.py +0 -0
  82. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/tests/unit/test_config.py +0 -0
  83. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/tests/unit/test_gateway.py +0 -0
  84. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/tests/unit/test_hardware.py +0 -0
  85. {llmstack_cli-0.1.0 → llmstack_cli-0.2.0}/tests/unit/test_resolver.py +0 -0
@@ -0,0 +1,38 @@
+ name: Bug Report
+ description: Report a bug or unexpected behavior
+ labels: ["bug"]
+ body:
+ - type: textarea
+ id: description
+ attributes:
+ label: What happened?
+ description: A clear description of the bug.
+ validations:
+ required: true
+ - type: textarea
+ id: reproduce
+ attributes:
+ label: Steps to reproduce
+ description: Minimal steps to reproduce the issue.
+ placeholder: |
+ 1. llmstack init --preset rag
+ 2. llmstack up
+ 3. ...
+ validations:
+ required: true
+ - type: textarea
+ id: expected
+ attributes:
+ label: Expected behavior
+ description: What did you expect to happen?
+ - type: textarea
+ id: environment
+ attributes:
+ label: Environment
+ description: Paste the output of `llmstack doctor`
+ render: shell
+ - type: input
+ id: version
+ attributes:
+ label: llmstack version
+ placeholder: "0.1.0"
@@ -0,0 +1,23 @@
+ name: Feature Request
+ description: Suggest a new feature or improvement
+ labels: ["enhancement"]
+ body:
+ - type: textarea
+ id: problem
+ attributes:
+ label: Problem
+ description: What problem does this solve? What's your use case?
+ validations:
+ required: true
+ - type: textarea
+ id: solution
+ attributes:
+ label: Proposed solution
+ description: How would you like this to work?
+ validations:
+ required: true
+ - type: textarea
+ id: alternatives
+ attributes:
+ label: Alternatives considered
+ description: Any other approaches you've considered?
@@ -0,0 +1,17 @@
+ ## Summary
+
+ <!-- What does this PR do? -->
+
+ ## Changes
+
+ -
+
+ ## Test plan
+
+ - [ ] `make test` passes
+ - [ ] `make lint` passes
+ - [ ] Tested manually with `llmstack up`
+
+ ## Related issues
+
+ <!-- Fixes #123 -->
@@ -0,0 +1,17 @@
+ # Security Policy
+
+ ## Reporting a Vulnerability
+
+ If you discover a security vulnerability, please report it responsibly:
+
+ 1. **Do not** open a public issue
+ 2. Email: [open a private security advisory](https://github.com/mara-werils/llmstack/security/advisories/new)
+ 3. Include steps to reproduce and potential impact
+
+ We will respond within 48 hours and aim to release a fix within 7 days.
+
+ ## Supported Versions
+
+ | Version | Supported |
+ |---------|-----------|
+ | 0.1.x | Yes |
@@ -1,5 +1,18 @@
  # Changelog
 
+ ## [0.2.0] - 2026-05-07
+
+ ### Added
+ - `llmstack chat` — interactive terminal chat with streaming responses
+ - `llmstack export` — generate standalone docker-compose.yml from llmstack.yaml
+ - GitHub issue templates, PR template, security policy
+
+ ### Fixed
+ - Gateway Docker image now builds locally (no longer requires ghcr.io)
+ - Prometheus and Grafana configs are written to disk before container start
+ - Generated API keys persist to llmstack.yaml across restarts
+ - Clear error messages for port conflicts
+
  ## [0.1.0] - 2026-05-07
 
  ### Added
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: llmstack-cli
- Version: 0.1.0
+ Version: 0.2.0
  Summary: One command. Full LLM stack. Zero config.
  Author: mara-werils
  License-Expression: Apache-2.0
@@ -44,21 +44,43 @@ Description-Content-Type: text/markdown
  <a href="https://github.com/mara-werils/llmstack/actions/workflows/ci.yml"><img src="https://github.com/mara-werils/llmstack/actions/workflows/ci.yml/badge.svg" alt="CI"></a>
  <a href="https://github.com/mara-werils/llmstack/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-Apache%202.0-green" alt="License"></a>
  <a href="https://www.python.org/"><img src="https://img.shields.io/badge/python-3.11+-blue" alt="Python"></a>
+ <a href="https://github.com/mara-werils/llmstack/stargazers"><img src="https://img.shields.io/github/stars/mara-werils/llmstack?style=social" alt="Stars"></a>
  </p>
 
  ---
 
- **llmstack** spins up a production-grade LLM stack locally with a single command. It auto-detects your hardware, picks the optimal inference backend, and wires everything together.
+ <p align="center">
+ <img src="demo.gif" alt="llmstack demo" width="800">
+ </p>
+
+ ## Quick Start
 
  ```bash
  pip install llmstack-cli
- llmstack init
+ llmstack init --preset rag
  llmstack up
  ```
 
- That's it. You now have a full LLM API running locally.
+ That's it. You now have **7 services** running: inference, embeddings, vector DB, cache, API gateway, Prometheus, and Grafana.
+
+ ```bash
+ # Test it immediately
+ curl http://localhost:8000/v1/chat/completions \
+ -H "Authorization: Bearer YOUR_KEY" \
+ -H "Content-Type: application/json" \
+ -d '{"model":"llama3.2","messages":[{"role":"user","content":"Hello!"}]}'
+ ```
+
+ Works with **any OpenAI-compatible client**: LangChain, LlamaIndex, Vercel AI SDK, openai-python.
+
+ ## Who is this for?
 
- ## Architecture
+ - **AI app developers** who want local inference without Docker boilerplate
+ - **Teams** who need an OpenAI-compatible API backed by local models
+ - **Hobbyists** running LLMs locally who want vector search, caching, and monitoring out of the box
+ - **Anyone** tired of writing 200+ lines of docker-compose.yml every time
+
+ ## What you get
 
  ```
  llmstack up
@@ -84,8 +106,6 @@ That's it. You now have a full LLM API running locally.
  :8080
  ```
 
- ## What you get
-
  | Layer | Service | Default | Port |
  |-------|---------|---------|------|
  | Inference | Ollama / vLLM (auto) | llama3.2 | 11434 |
@@ -97,7 +117,7 @@ That's it. You now have a full LLM API running locally.
 
  ## How it works
 
- ```
+ ```bash
  llmstack init # Detects hardware, generates llmstack.yaml
  # Picks optimal backend: vLLM for NVIDIA 16GB+, Ollama otherwise
 
@@ -109,27 +129,6 @@ llmstack logs ollama # Stream inference logs
  llmstack down # Stops everything
  ```
 
- ### Use the API
-
- ```bash
- curl http://localhost:8000/v1/chat/completions \
- -H "Authorization: Bearer YOUR_KEY" \
- -H "Content-Type: application/json" \
- -d '{"model":"llama3.2","messages":[{"role":"user","content":"Hello!"}]}'
- ```
-
- Works with **any OpenAI-compatible client**: LangChain, LlamaIndex, Vercel AI SDK, openai-python.
-
- ```python
- from openai import OpenAI
-
- client = OpenAI(base_url="http://localhost:8000/v1", api_key="YOUR_KEY")
- response = client.chat.completions.create(
- model="llama3.2",
- messages=[{"role": "user", "content": "Explain quantum computing"}]
- )
- ```
-
  ## Auto hardware detection
 
  | Your hardware | Backend | Why |
@@ -181,6 +180,50 @@ observe:
  dashboard_port: 8080
  ```
 
+ ## Interactive Chat
+
+ ```bash
+ llmstack chat
+ ```
+
+ ```
+ LLMStack Chat — model: llama3.2
+ Type 'exit' or Ctrl+C to quit. '/clear' to reset conversation.
+
+ You: What is quantum computing?
+ Assistant: Quantum computing uses quantum mechanical phenomena like
+ superposition and entanglement to process information...
+
+ You: /clear
+ Conversation cleared.
+ ```
+
+ Streaming responses, conversation history, works with any model in your stack.
+
+ ## Export to Docker Compose
+
+ Don't want to install llmstack? Generate a standalone `docker-compose.yml`:
+
+ ```bash
+ llmstack export
+ # Exported 7 services to docker-compose.yml
+ # Run with: docker compose up -d
+ ```
+
+ Share the generated file with your team — no llmstack dependency required.
+
+ ## Use the API
+
+ ```python
+ from openai import OpenAI
+
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="YOUR_KEY")
+ response = client.chat.completions.create(
+ model="llama3.2",
+ messages=[{"role": "user", "content": "Explain quantum computing"}]
+ )
+ ```
+
  ## CLI
 
  | Command | Description |
@@ -189,6 +232,8 @@ observe:
  | `llmstack up [--attach]` | Start all services |
  | `llmstack down [--volumes]` | Stop and clean up |
  | `llmstack status` | Health check all services |
+ | `llmstack chat [--model]` | Interactive terminal chat |
+ | `llmstack export [--output]` | Generate docker-compose.yml |
  | `llmstack logs <service>` | Stream service logs |
  | `llmstack doctor` | Diagnose system issues |
 
@@ -204,6 +249,34 @@ When `observe.metrics: true`, llmstack boots Prometheus + Grafana with a pre-bui
 
  Access at `http://localhost:8080` (login: admin / llmstack)
 
+ ## Why not just Docker Compose?
+
+ Here's what llmstack replaces:
+
+ ```yaml
+ # Without llmstack: ~200 lines of docker-compose.yml
+ # You have to configure each service, write health checks,
+ # set up networking, manage GPU passthrough, create Prometheus
+ # scrape configs, provision Grafana dashboards...
+
+ # With llmstack:
+ llmstack init && llmstack up
+ ```
+
+ ## Comparison
+
+ | | llmstack | Ollama | LocalAI | AnythingLLM | LiteLLM |
+ |---|---|---|---|---|---|
+ | One-command full stack | **Yes** | No (inference only) | No | Partial | No (proxy only) |
+ | Auto hardware detection | **Yes** | No | No | No | No |
+ | OpenAI-compatible API | **Yes** | Yes | Yes | No | Yes |
+ | Built-in vector DB | **Yes** | No | No | Bundled | No |
+ | Built-in embeddings | **Yes** | No | No | Bundled | No |
+ | Caching (Redis) | **Yes** | No | No | No | No |
+ | Auth + rate limiting | **Yes** | No | No | Yes | Yes |
+ | Observability dashboard | **Yes** | No | Partial | No | Partial |
+ | Plugin ecosystem | **Yes** | No | No | No | No |
+
  ## Plugins
 
  Extend llmstack with new backends via pip:
@@ -215,21 +288,6 @@ pip install llmstack-cli-plugin-chromadb
 
  Create your own: implement `ServiceBase`, register via entry_points. See [CONTRIBUTING.md](CONTRIBUTING.md).
 
- ## Why llmstack?
-
- | | llmstack | Ollama | Harbor | AnythingLLM | LiteLLM |
- |---|---|---|---|---|---|
- | One-command full stack | Yes | No (inference only) | Partial | Partial | No (proxy only) |
- | Auto hardware detection | Yes | No | No | No | No |
- | OpenAI-compatible API | Yes | Yes | Varies | No | Yes |
- | Built-in vector DB | Yes | No | Config needed | Bundled | No |
- | Built-in embeddings | Yes | No | No | Bundled | No |
- | Caching (Redis) | Yes | No | No | No | No |
- | Auth + rate limiting | Yes | No | No | Yes | Yes |
- | Observability dashboard | Yes | No | Partial | No | Partial |
- | Plugin ecosystem | Yes | No | No | No | No |
- | SSE streaming | Yes | Yes | Yes | Yes | Yes |
-
  ## Tech stack
 
  - **CLI**: [Typer](https://typer.tiangolo.com/) + [Rich](https://rich.readthedocs.io/)
@@ -9,21 +9,43 @@
  <a href="https://github.com/mara-werils/llmstack/actions/workflows/ci.yml"><img src="https://github.com/mara-werils/llmstack/actions/workflows/ci.yml/badge.svg" alt="CI"></a>
  <a href="https://github.com/mara-werils/llmstack/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-Apache%202.0-green" alt="License"></a>
  <a href="https://www.python.org/"><img src="https://img.shields.io/badge/python-3.11+-blue" alt="Python"></a>
+ <a href="https://github.com/mara-werils/llmstack/stargazers"><img src="https://img.shields.io/github/stars/mara-werils/llmstack?style=social" alt="Stars"></a>
  </p>
 
  ---
 
- **llmstack** spins up a production-grade LLM stack locally with a single command. It auto-detects your hardware, picks the optimal inference backend, and wires everything together.
+ <p align="center">
+ <img src="demo.gif" alt="llmstack demo" width="800">
+ </p>
+
+ ## Quick Start
 
  ```bash
  pip install llmstack-cli
- llmstack init
+ llmstack init --preset rag
  llmstack up
  ```
 
- That's it. You now have a full LLM API running locally.
+ That's it. You now have **7 services** running: inference, embeddings, vector DB, cache, API gateway, Prometheus, and Grafana.
+
+ ```bash
+ # Test it immediately
+ curl http://localhost:8000/v1/chat/completions \
+ -H "Authorization: Bearer YOUR_KEY" \
+ -H "Content-Type: application/json" \
+ -d '{"model":"llama3.2","messages":[{"role":"user","content":"Hello!"}]}'
+ ```
+
+ Works with **any OpenAI-compatible client**: LangChain, LlamaIndex, Vercel AI SDK, openai-python.
+
+ ## Who is this for?
 
- ## Architecture
+ - **AI app developers** who want local inference without Docker boilerplate
+ - **Teams** who need an OpenAI-compatible API backed by local models
+ - **Hobbyists** running LLMs locally who want vector search, caching, and monitoring out of the box
+ - **Anyone** tired of writing 200+ lines of docker-compose.yml every time
+
+ ## What you get
 
  ```
  llmstack up
@@ -49,8 +71,6 @@ That's it. You now have a full LLM API running locally.
  :8080
  ```
 
- ## What you get
-
  | Layer | Service | Default | Port |
  |-------|---------|---------|------|
  | Inference | Ollama / vLLM (auto) | llama3.2 | 11434 |
@@ -62,7 +82,7 @@ That's it. You now have a full LLM API running locally.
 
  ## How it works
 
- ```
+ ```bash
  llmstack init # Detects hardware, generates llmstack.yaml
  # Picks optimal backend: vLLM for NVIDIA 16GB+, Ollama otherwise
 
@@ -74,27 +94,6 @@ llmstack logs ollama # Stream inference logs
  llmstack down # Stops everything
  ```
 
- ### Use the API
-
- ```bash
- curl http://localhost:8000/v1/chat/completions \
- -H "Authorization: Bearer YOUR_KEY" \
- -H "Content-Type: application/json" \
- -d '{"model":"llama3.2","messages":[{"role":"user","content":"Hello!"}]}'
- ```
-
- Works with **any OpenAI-compatible client**: LangChain, LlamaIndex, Vercel AI SDK, openai-python.
-
- ```python
- from openai import OpenAI
-
- client = OpenAI(base_url="http://localhost:8000/v1", api_key="YOUR_KEY")
- response = client.chat.completions.create(
- model="llama3.2",
- messages=[{"role": "user", "content": "Explain quantum computing"}]
- )
- ```
-
  ## Auto hardware detection
 
  | Your hardware | Backend | Why |
@@ -146,6 +145,50 @@ observe:
  dashboard_port: 8080
  ```
 
+ ## Interactive Chat
+
+ ```bash
+ llmstack chat
+ ```
+
+ ```
+ LLMStack Chat — model: llama3.2
+ Type 'exit' or Ctrl+C to quit. '/clear' to reset conversation.
+
+ You: What is quantum computing?
+ Assistant: Quantum computing uses quantum mechanical phenomena like
+ superposition and entanglement to process information...
+
+ You: /clear
+ Conversation cleared.
+ ```
+
+ Streaming responses, conversation history, works with any model in your stack.
+
+ ## Export to Docker Compose
+
+ Don't want to install llmstack? Generate a standalone `docker-compose.yml`:
+
+ ```bash
+ llmstack export
+ # Exported 7 services to docker-compose.yml
+ # Run with: docker compose up -d
+ ```
+
+ Share the generated file with your team — no llmstack dependency required.
+
+ ## Use the API
+
+ ```python
+ from openai import OpenAI
+
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="YOUR_KEY")
+ response = client.chat.completions.create(
+ model="llama3.2",
+ messages=[{"role": "user", "content": "Explain quantum computing"}]
+ )
+ ```
+
  ## CLI
 
  | Command | Description |
@@ -154,6 +197,8 @@ observe:
  | `llmstack up [--attach]` | Start all services |
  | `llmstack down [--volumes]` | Stop and clean up |
  | `llmstack status` | Health check all services |
+ | `llmstack chat [--model]` | Interactive terminal chat |
+ | `llmstack export [--output]` | Generate docker-compose.yml |
  | `llmstack logs <service>` | Stream service logs |
  | `llmstack doctor` | Diagnose system issues |
 
@@ -169,6 +214,34 @@ When `observe.metrics: true`, llmstack boots Prometheus + Grafana with a pre-bui
 
  Access at `http://localhost:8080` (login: admin / llmstack)
 
+ ## Why not just Docker Compose?
+
+ Here's what llmstack replaces:
+
+ ```yaml
+ # Without llmstack: ~200 lines of docker-compose.yml
+ # You have to configure each service, write health checks,
+ # set up networking, manage GPU passthrough, create Prometheus
+ # scrape configs, provision Grafana dashboards...
+
+ # With llmstack:
+ llmstack init && llmstack up
+ ```
+
+ ## Comparison
+
+ | | llmstack | Ollama | LocalAI | AnythingLLM | LiteLLM |
+ |---|---|---|---|---|---|
+ | One-command full stack | **Yes** | No (inference only) | No | Partial | No (proxy only) |
+ | Auto hardware detection | **Yes** | No | No | No | No |
+ | OpenAI-compatible API | **Yes** | Yes | Yes | No | Yes |
+ | Built-in vector DB | **Yes** | No | No | Bundled | No |
+ | Built-in embeddings | **Yes** | No | No | Bundled | No |
+ | Caching (Redis) | **Yes** | No | No | No | No |
+ | Auth + rate limiting | **Yes** | No | No | Yes | Yes |
+ | Observability dashboard | **Yes** | No | Partial | No | Partial |
+ | Plugin ecosystem | **Yes** | No | No | No | No |
+
  ## Plugins
 
  Extend llmstack with new backends via pip:
@@ -180,21 +253,6 @@ pip install llmstack-cli-plugin-chromadb
 
  Create your own: implement `ServiceBase`, register via entry_points. See [CONTRIBUTING.md](CONTRIBUTING.md).
 
- ## Why llmstack?
-
- | | llmstack | Ollama | Harbor | AnythingLLM | LiteLLM |
- |---|---|---|---|---|---|
- | One-command full stack | Yes | No (inference only) | Partial | Partial | No (proxy only) |
- | Auto hardware detection | Yes | No | No | No | No |
- | OpenAI-compatible API | Yes | Yes | Varies | No | Yes |
- | Built-in vector DB | Yes | No | Config needed | Bundled | No |
- | Built-in embeddings | Yes | No | No | Bundled | No |
- | Caching (Redis) | Yes | No | No | No | No |
- | Auth + rate limiting | Yes | No | No | Yes | Yes |
- | Observability dashboard | Yes | No | Partial | No | Partial |
- | Plugin ecosystem | Yes | No | No | No | No |
- | SSE streaming | Yes | Yes | Yes | Yes | Yes |
-
  ## Tech stack
 
  - **CLI**: [Typer](https://typer.tiangolo.com/) + [Rich](https://rich.readthedocs.io/)
Binary file
@@ -0,0 +1,51 @@
+ # VHS tape for recording llmstack demo GIF
+ # Pre-requisite: cp scripts/fake-llmstack /tmp/llmstack
+ # Run: vhs demo.tape
+
+ Output demo.gif
+
+ Set FontSize 16
+ Set Width 1200
+ Set Height 800
+ Set Theme "Catppuccin Mocha"
+ Set Padding 20
+ Set TypingSpeed 40ms
+ Set Shell "bash"
+
+ # Set PATH so /tmp/llmstack is found, then clear screen
+ Hide
+ Type "export PATH=/tmp:$PATH"
+ Enter
+ Sleep 300ms
+ Type "export PS1='$ '"
+ Enter
+ Sleep 300ms
+ Type "clear"
+ Enter
+ Sleep 500ms
+ Show
+
+ # --- Scene 1: Show version ---
+ Type "llmstack --version"
+ Enter
+ Sleep 1.5s
+
+ # --- Scene 2: Initialize with RAG preset ---
+ Type "llmstack init --preset rag"
+ Enter
+ Sleep 3s
+
+ # --- Scene 3: Start the stack ---
+ Type "llmstack up"
+ Enter
+ Sleep 4s
+
+ # --- Scene 4: Check status ---
+ Type "llmstack status"
+ Enter
+ Sleep 3s
+
+ # --- Scene 5: Tear down ---
+ Type "llmstack down"
+ Enter
+ Sleep 3s
@@ -0,0 +1,86 @@
+ $ llmstack --version
+ llmstack 0.1.0
+
+ $ llmstack init --preset rag
+
+ Hardware detected:
+ CPU: 10 cores
+ RAM: 32 GB
+ GPU: Apple M2 Pro (16 GB VRAM)
+
+ Using preset: rag
+ Backend: Ollama
+
+ Created llmstack.yaml
+ Next: edit the config if needed, then run llmstack up
+
+ $ llmstack up
+
+ Starting LLMStack...
+
+ ✓ qdrant running :6333
+ ✓ redis running :6379
+ ✓ ollama running :11434 (pulling llama3.2...)
+ ✓ tei running :8002 (loading bge-m3...)
+ ✓ gateway running :8000
+ ✓ prometheus running :9090
+ ✓ grafana running :8080
+
+ Stack is ready! 7 services running.
+ API: http://localhost:8000/v1
+ Dashboard: http://localhost:8080
+
+ $ llmstack status
+
+ LLMStack Status
+ ┌─────────────┬──────────┬─────────┬───────────────┐
+ │ Service │ Container│ Status │ Ports │
+ ├─────────────┼──────────┼─────────┼───────────────┤
+ │ qdrant │ a3f1.. │ running │ 6333->6333 │
+ │ redis │ b7e2.. │ running │ 6379->6379 │
+ │ ollama │ c9d4.. │ running │ 11434->11434 │
+ │ tei │ d2a8.. │ running │ 8002->8002 │
+ │ gateway │ e5c1.. │ running │ 8000->8000 │
+ │ prometheus │ f8b3.. │ running │ 9090->9090 │
+ │ grafana │ 1a7e.. │ running │ 8080->8080 │
+ └─────────────┴──────────┴─────────┴───────────────┘
+
+ $ curl -s http://localhost:8000/v1/chat/completions \
+ -H "Authorization: Bearer llmstack-key" \
+ -H "Content-Type: application/json" \
+ -d '{"model":"llama3.2","messages":[{"role":"user","content":"Hello!"}]}' | jq .
+ {
+ "id": "chatcmpl-abc123",
+ "object": "chat.completion",
+ "created": 1715090400,
+ "model": "llama3.2",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": "Hello! How can I help you today?"
+ },
+ "finish_reason": "stop"
+ }
+ ],
+ "usage": {
+ "prompt_tokens": 11,
+ "completion_tokens": 9,
+ "total_tokens": 20
+ }
+ }
+
+ $ llmstack down
+
+ Stopping LLMStack...
+
+ ✓ grafana stopped
+ ✓ prometheus stopped
+ ✓ gateway stopped
+ ✓ tei stopped
+ ✓ ollama stopped
+ ✓ redis stopped
+ ✓ qdrant stopped
+
+ All services stopped.