vllmd-0.2.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vllmd-0.2.0/.github/workflows/ci.yml +19 -0
- vllmd-0.2.0/.github/workflows/publish.yml +29 -0
- vllmd-0.2.0/.gitignore +12 -0
- vllmd-0.2.0/Dockerfile +13 -0
- vllmd-0.2.0/LICENSE +21 -0
- vllmd-0.2.0/PKG-INFO +238 -0
- vllmd-0.2.0/README.md +223 -0
- vllmd-0.2.0/docker-compose.yaml +10 -0
- vllmd-0.2.0/pyproject.toml +42 -0
- vllmd-0.2.0/src/vllmd/__init__.py +5 -0
- vllmd-0.2.0/src/vllmd/cli.py +292 -0
- vllmd-0.2.0/src/vllmd/runner.py +239 -0
- vllmd-0.2.0/src/vllmd/sessions/__init__.py +5 -0
- vllmd-0.2.0/src/vllmd/sessions/chat.py +123 -0
- vllmd-0.2.0/src/vllmd/sessions/cli.py +367 -0
- vllmd-0.2.0/src/vllmd/sessions/session.py +101 -0
- vllmd-0.2.0/src/vllmd/vectordb/__init__.py +5 -0
- vllmd-0.2.0/src/vllmd/vectordb/cli.py +223 -0
- vllmd-0.2.0/src/vllmd/vectordb/embeddings.py +24 -0
- vllmd-0.2.0/src/vllmd/vectordb/store.py +259 -0
- vllmd-0.2.0/src/vllmd/vectordb/sync.py +24 -0
- vllmd-0.2.0/tests/__init__.py +0 -0
- vllmd-0.2.0/tests/test_runner.py +97 -0
- vllmd-0.2.0/tests/test_sessions.py +260 -0
- vllmd-0.2.0/uv.lock +2357 -0
vllmd-0.2.0/.github/workflows/ci.yml
ADDED
@@ -0,0 +1,19 @@
+name: CI
+
+on:
+  push:
+    branches: ["**"]
+  pull_request:
+  workflow_call:
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v4
+      - run: uv venv
+      - run: uv pip install -e ".[dev]"
+      - run: uv run ruff check src/ tests/
+      - run: uv run ruff format --check src/ tests/
+      - run: uv run pytest

vllmd-0.2.0/.github/workflows/publish.yml
ADDED
@@ -0,0 +1,29 @@
+name: Publish to PyPI
+
+on:
+  push:
+    tags:
+      - "v*"
+
+permissions:
+  id-token: write  # trusted publishing (no API token needed)
+
+jobs:
+  test:
+    uses: ./.github/workflows/ci.yml
+
+  publish:
+    needs: test
+    runs-on: ubuntu-latest
+    environment: pypi
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v4
+
+      - name: Build package
+        run: uv build
+
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          attestations: false

vllmd-0.2.0/.gitignore
ADDED
vllmd-0.2.0/Dockerfile
ADDED
@@ -0,0 +1,13 @@
+FROM python:3.12-slim
+
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        docker.io \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+COPY . /app
+RUN uv pip install --system --no-cache .
+
+ENTRYPOINT ["agent-runner"]

vllmd-0.2.0/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 sroomberg
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

vllmd-0.2.0/PKG-INFO
ADDED
@@ -0,0 +1,238 @@
+Metadata-Version: 2.4
+Name: vllmd
+Version: 0.2.0
+Summary: Run local models via vLLM in Docker containers
+License-Expression: MIT
+License-File: LICENSE
+Requires-Python: >=3.10
+Requires-Dist: chromadb>=0.5
+Requires-Dist: rich>=13.0
+Requires-Dist: typer>=0.9
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0; extra == 'dev'
+Requires-Dist: ruff>=0.4; extra == 'dev'
+Description-Content-Type: text/markdown
+
+# vllmd
+
+> **⚠️ Experimental** — This project is under active development. APIs, config format, and CLI flags may change without notice.
+
+Run a model that is already on the machine in a [vLLM](https://github.com/vllm-project/vllm) container and serve it on a specified port with an OpenAI-compatible API.
+
+## Install
+
+```bash
+uv pip install -e ".[dev]"
+```
+
+Requires Docker with the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) for GPU support.
+
+## Quick Start
+
+```bash
+# Serve a model on port 8000 (foreground — streams vLLM logs until Ctrl+C)
+vllmd run --model /path/to/my-model --port 8000
+
+# Run in the background; wait for the API to be ready, then return
+vllmd run --model /path/to/my-model --port 8000 -d
+
+# Run in the background and return immediately without waiting
+vllmd run --model /path/to/my-model --port 8000 -d --no-wait
+
+# CPU-only (no GPU)
+vllmd run --model /path/to/my-model --port 8000 --no-gpu
+
+# Check if the container is up and the API is healthy
+vllmd status
+
+# Stream container logs
+vllmd logs --follow
+
+# Stop the container
+vllmd stop
+```
+
+## Multiple Models
+
+Multiple models can run concurrently, each in its own container on a different port. The container name defaults to `vllmd-<model-dir-name>`.
+
+```bash
+# Start two models on different ports
+vllmd run --model /models/llama3 --port 8001 -d
+vllmd run --model /models/mistral --port 8002 -d
+
+# List all running vllmd containers
+vllmd ps
+
+# Check health of all containers at once
+vllmd status
+
+# Stop a specific container
+vllmd stop --name vllmd-llama3
+
+# Stop all vllmd containers
+vllmd stop --all
+```
+
+When only one container is running, `stop`, `status`, `logs`, and `session create` all auto-resolve to it without needing `--name`.
+
+## How It Works
+
+1. `vllmd run` resolves the model path and pulls `vllm/vllm-openai:latest` if needed
+2. A Docker container is started with the model directory mounted read-only at `/model`
+3. vLLM serves the model on port 8000 inside the container, mapped to `--port` on the host
+4. The served model ID is the directory name of the model path
+5. The endpoint exposes a standard OpenAI-compatible API at `http://localhost:<port>/v1`
+
+## Commands
+
+| Command | Description |
+|---------|-------------|
+| `run` | Start a vLLM container for a model |
+| `ps` | List all running vllmd containers |
+| `stop` | Stop a container (`--all` to stop every managed container) |
+| `status` | Show container and API health (all containers if no `--name`) |
+| `logs` | Print container logs |
+| `session create` | Create a persistent chat session |
+| `session chat` | Send a one-shot message in a session |
+| `session attach` | Open an interactive REPL for a session |
+| `session list` | List all sessions |
+| `session history` | Print conversation history |
+| `session clear` | Clear conversation history |
+| `session delete` | Delete a session |
+| `db ingest` | Add documents or code to the vector database |
+| `db search` | Query the vector database for relevant context |
+| `db history` | Store a conversation message |
+| `db summarize` | Replace a session's history with an abridged summary |
+| `db sync` | Sync the vector DB to/from S3 |
+| `db stats` | Show collection sizes |
+
+## Options
+
+### `run`
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--model`, `-m` | (required) | Path to the model directory |
+| `--port`, `-p` | `8000` | Host port for the vLLM API |
+| `--name`, `-n` | `vllmd-<model-dir>` | Docker container name |
+| `--gpu/--no-gpu` | `--gpu` | Enable/disable GPU passthrough |
+| `--dtype` | `auto` | Model dtype (`auto`, `float16`, `bfloat16`, `float32`) |
+| `--max-model-len` | — | Override max context length |
+| `--detach`, `-d` | `false` | Start in background |
+| `--wait/--no-wait` | `--wait` | Wait for API to be ready (implies background start) |
+
+Extra positional arguments are forwarded verbatim to vLLM.
+
+## Sessions
+
+Sessions are persistent, named conversations tied to a running model. Each session maintains sequential conversation history and optionally retrieves semantic context from the vector database.
+
+Sessions are stored as JSON files in `~/.vllmd/sessions/` (override with `--sessions-dir`).
+
+```bash
+# Create a session (auto-resolves endpoint if one container is running)
+vllmd session create my-session
+
+# Create a session bound to a specific container, with context retrieval
+vllmd session create my-session \
+  --container vllmd-llama3 \
+  --embedding-model llama3 \
+  --system-prompt "You are a helpful coding assistant."
+
+# One-shot message
+vllmd session chat my-session "Explain the main training loop"
+
+# Interactive REPL (supports /history, /context <query>, /reset, /exit)
+vllmd session attach my-session
+
+# View conversation history
+vllmd session history my-session --last 10
+
+# List all sessions
+vllmd session list
+
+# Clear history (keeps session config)
+vllmd session clear my-session
+
+# Delete a session
+vllmd session delete my-session
+```
+
+### Context retrieval
+
+When a session is created with `--embedding-model`, each message automatically retrieves the most relevant chunks from the session's vector store (documents and code) and injects them as system context before the conversation history. Exchanges are also stored in the ChromaDB history collection for future semantic search.
+
+If the embedding endpoint is unavailable, retrieval is silently skipped and the session continues with history-only context.
+
+## Vector Context Database
+
+vllmd includes a local vector database (backed by [ChromaDB](https://docs.trychroma.com/)) that stores documents, code, and conversation history as embeddings. Embeddings are generated using the same vLLM server the model runs on.
+
+```bash
+# Ingest a directory of documents
+vllmd db ingest ./docs --type documents --model my-model
+
+# Ingest a codebase
+vllmd db ingest ./src --type code --model my-model
+
+# Search for relevant context
+vllmd db search "how does auth work" --collection code --model my-model
+
+# Store a conversation message
+vllmd db history "Explain the main loop" --role user --session my-session --model my-model
+
+# Abridge old history with a summary
+vllmd db summarize --session my-session "Previous conversation covered auth and the main loop." --model my-model
+
+# Push DB to S3
+vllmd db sync s3://my-bucket/vectordb --direction push
+
+# Pull DB from S3
+vllmd db sync s3://my-bucket/vectordb --direction pull
+
+# Show collection sizes
+vllmd db stats
+```
+
+The DB directory (`./vectordb` by default, override with `--db-path`) can be mounted as a Docker volume for persistence and shared across machines via S3 sync.
+
+## Using the API
+
+Once running, the endpoint is OpenAI-compatible:
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "my-model",
+    "messages": [{"role": "user", "content": "Hello!"}]
+  }'
+```
+
+Works out of the box with [AgentTester](https://github.com/sroomberg/agenttester):
+
+```yaml
+# agent-tester.yaml
+agents:
+  my-model:
+    command: 'agent-tester query http://localhost:8000 my-model {prompt}'
+    host: localhost
+    commit_style: manual
+    timeout: 120
+```
+
+## Development
+
+```bash
+uv pip install -e ".[dev]"
+ruff check src/ tests/
+ruff format src/ tests/
+pytest
+```
+
+## Docker
+
+```bash
+MODEL_PATH=/path/to/my-model docker compose run --rm vllmd run --model /model --port 8000
+```

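The "Using the API" section above shows the OpenAI-compatible endpoint with curl. As a complement, here is a minimal Python sketch of the same request using only the standard library; it assumes a server started per the Quick Start on port 8000 and a model directory named `my-model`, both of which are assumptions rather than anything fixed by the package.

```python
# Minimal chat-completions call against a vllmd-served model.
# Assumes the Quick Start server is listening on localhost:8000 and the
# served model ID (the model directory name) is "my-model".
import json
import urllib.request

payload = {
    "model": "my-model",
    "messages": [{"role": "user", "content": "Hello!"}],
}
req = urllib.request.Request(
    "http://localhost:8000/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = json.load(resp)

# OpenAI-compatible servers return the reply under choices[0].message.content.
print(body["choices"][0]["message"]["content"])
```

Any OpenAI-compatible client pointed at `http://localhost:<port>/v1` should work the same way.
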
vllmd-0.2.0/README.md
ADDED
@@ -0,0 +1,223 @@
+# vllmd
+
+> **⚠️ Experimental** — This project is under active development. APIs, config format, and CLI flags may change without notice.
+
+Run a model that is already on the machine in a [vLLM](https://github.com/vllm-project/vllm) container and serve it on a specified port with an OpenAI-compatible API.
+
+## Install
+
+```bash
+uv pip install -e ".[dev]"
+```
+
+Requires Docker with the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) for GPU support.
+
+## Quick Start
+
+```bash
+# Serve a model on port 8000 (foreground — streams vLLM logs until Ctrl+C)
+vllmd run --model /path/to/my-model --port 8000
+
+# Run in the background; wait for the API to be ready, then return
+vllmd run --model /path/to/my-model --port 8000 -d
+
+# Run in the background and return immediately without waiting
+vllmd run --model /path/to/my-model --port 8000 -d --no-wait
+
+# CPU-only (no GPU)
+vllmd run --model /path/to/my-model --port 8000 --no-gpu
+
+# Check if the container is up and the API is healthy
+vllmd status
+
+# Stream container logs
+vllmd logs --follow
+
+# Stop the container
+vllmd stop
+```
+
+## Multiple Models
+
+Multiple models can run concurrently, each in its own container on a different port. The container name defaults to `vllmd-<model-dir-name>`.
+
+```bash
+# Start two models on different ports
+vllmd run --model /models/llama3 --port 8001 -d
+vllmd run --model /models/mistral --port 8002 -d
+
+# List all running vllmd containers
+vllmd ps
+
+# Check health of all containers at once
+vllmd status
+
+# Stop a specific container
+vllmd stop --name vllmd-llama3
+
+# Stop all vllmd containers
+vllmd stop --all
+```
+
+When only one container is running, `stop`, `status`, `logs`, and `session create` all auto-resolve to it without needing `--name`.
+
+## How It Works
+
+1. `vllmd run` resolves the model path and pulls `vllm/vllm-openai:latest` if needed
+2. A Docker container is started with the model directory mounted read-only at `/model`
+3. vLLM serves the model on port 8000 inside the container, mapped to `--port` on the host
+4. The served model ID is the directory name of the model path
+5. The endpoint exposes a standard OpenAI-compatible API at `http://localhost:<port>/v1`
+
+## Commands
+
+| Command | Description |
+|---------|-------------|
+| `run` | Start a vLLM container for a model |
+| `ps` | List all running vllmd containers |
+| `stop` | Stop a container (`--all` to stop every managed container) |
+| `status` | Show container and API health (all containers if no `--name`) |
+| `logs` | Print container logs |
+| `session create` | Create a persistent chat session |
+| `session chat` | Send a one-shot message in a session |
+| `session attach` | Open an interactive REPL for a session |
+| `session list` | List all sessions |
+| `session history` | Print conversation history |
+| `session clear` | Clear conversation history |
+| `session delete` | Delete a session |
+| `db ingest` | Add documents or code to the vector database |
+| `db search` | Query the vector database for relevant context |
+| `db history` | Store a conversation message |
+| `db summarize` | Replace a session's history with an abridged summary |
+| `db sync` | Sync the vector DB to/from S3 |
+| `db stats` | Show collection sizes |
+
+## Options
+
+### `run`
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--model`, `-m` | (required) | Path to the model directory |
+| `--port`, `-p` | `8000` | Host port for the vLLM API |
+| `--name`, `-n` | `vllmd-<model-dir>` | Docker container name |
+| `--gpu/--no-gpu` | `--gpu` | Enable/disable GPU passthrough |
+| `--dtype` | `auto` | Model dtype (`auto`, `float16`, `bfloat16`, `float32`) |
+| `--max-model-len` | — | Override max context length |
+| `--detach`, `-d` | `false` | Start in background |
+| `--wait/--no-wait` | `--wait` | Wait for API to be ready (implies background start) |
+
+Extra positional arguments are forwarded verbatim to vLLM.
+
+## Sessions
+
+Sessions are persistent, named conversations tied to a running model. Each session maintains sequential conversation history and optionally retrieves semantic context from the vector database.
+
+Sessions are stored as JSON files in `~/.vllmd/sessions/` (override with `--sessions-dir`).
+
+```bash
+# Create a session (auto-resolves endpoint if one container is running)
+vllmd session create my-session
+
+# Create a session bound to a specific container, with context retrieval
+vllmd session create my-session \
+  --container vllmd-llama3 \
+  --embedding-model llama3 \
+  --system-prompt "You are a helpful coding assistant."
+
+# One-shot message
+vllmd session chat my-session "Explain the main training loop"
+
+# Interactive REPL (supports /history, /context <query>, /reset, /exit)
+vllmd session attach my-session
+
+# View conversation history
+vllmd session history my-session --last 10
+
+# List all sessions
+vllmd session list
+
+# Clear history (keeps session config)
+vllmd session clear my-session
+
+# Delete a session
+vllmd session delete my-session
+```
+
+### Context retrieval
+
+When a session is created with `--embedding-model`, each message automatically retrieves the most relevant chunks from the session's vector store (documents and code) and injects them as system context before the conversation history. Exchanges are also stored in the ChromaDB history collection for future semantic search.
+
+If the embedding endpoint is unavailable, retrieval is silently skipped and the session continues with history-only context.
+
+## Vector Context Database
+
+vllmd includes a local vector database (backed by [ChromaDB](https://docs.trychroma.com/)) that stores documents, code, and conversation history as embeddings. Embeddings are generated using the same vLLM server the model runs on.
+
+```bash
+# Ingest a directory of documents
+vllmd db ingest ./docs --type documents --model my-model
+
+# Ingest a codebase
+vllmd db ingest ./src --type code --model my-model
+
+# Search for relevant context
+vllmd db search "how does auth work" --collection code --model my-model
+
+# Store a conversation message
+vllmd db history "Explain the main loop" --role user --session my-session --model my-model
+
+# Abridge old history with a summary
+vllmd db summarize --session my-session "Previous conversation covered auth and the main loop." --model my-model
+
+# Push DB to S3
+vllmd db sync s3://my-bucket/vectordb --direction push
+
+# Pull DB from S3
+vllmd db sync s3://my-bucket/vectordb --direction pull
+
+# Show collection sizes
+vllmd db stats
+```
+
+The DB directory (`./vectordb` by default, override with `--db-path`) can be mounted as a Docker volume for persistence and shared across machines via S3 sync.
+
+## Using the API
+
+Once running, the endpoint is OpenAI-compatible:
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "my-model",
+    "messages": [{"role": "user", "content": "Hello!"}]
+  }'
+```
+
+Works out of the box with [AgentTester](https://github.com/sroomberg/agenttester):
+
+```yaml
+# agent-tester.yaml
+agents:
+  my-model:
+    command: 'agent-tester query http://localhost:8000 my-model {prompt}'
+    host: localhost
+    commit_style: manual
+    timeout: 120
+```
+
+## Development
+
+```bash
+uv pip install -e ".[dev]"
+ruff check src/ tests/
+ruff format src/ tests/
+pytest
+```
+
+## Docker
+
+```bash
+MODEL_PATH=/path/to/my-model docker compose run --rm vllmd run --model /model --port 8000
+```

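The vector database described in the README above is a plain ChromaDB directory (`./vectordb` unless `--db-path` is given), so it can also be inspected straight from Python. The sketch below mirrors what `vllmd db stats` reports; the path and the existence of previously ingested collections are assumptions, and it tolerates the different `list_collections` return types across chromadb releases.

```python
# Inspect the ChromaDB store behind `vllmd db` and print per-collection sizes,
# roughly what `vllmd db stats` shows. Assumes the default ./vectordb path.
import chromadb

client = chromadb.PersistentClient(path="./vectordb")

for item in client.list_collections():
    # Older chromadb releases return Collection objects, newer ones return names.
    name = item if isinstance(item, str) else item.name
    collection = client.get_collection(name)
    print(f"{name}: {collection.count()} items")
```
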
vllmd-0.2.0/docker-compose.yaml
ADDED
@@ -0,0 +1,10 @@
+services:
+  agent-runner:
+    build: .
+    volumes:
+      # Mount host Docker socket so AgentRunner can manage containers
+      - /var/run/docker.sock:/var/run/docker.sock
+      # Mount the model directory into the tool container
+      - ${MODEL_PATH:-/tmp}:/model:ro
+    environment:
+      - MODEL_PATH=/model

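The compose file mounts the host Docker socket because the CLI drives Docker itself rather than running vLLM in-process. Going by the "How It Works" steps in the README, the container start issued by `vllmd run` is roughly equivalent to the sketch below; this illustrates the documented behaviour only, it is not the actual code in `src/vllmd/runner.py`, and the exact flags are assumptions.

```python
# Rough, illustrative equivalent of `vllmd run --model /models/llama3 --port 8001 -d`:
# mount the model read-only at /model, map the host port to vLLM's port 8000,
# and serve it under the model directory's name. Flag details are assumptions.
import subprocess
from pathlib import Path

model_path = Path("/models/llama3").resolve()
name = f"vllmd-{model_path.name}"

subprocess.run(
    [
        "docker", "run", "-d",
        "--name", name,
        "--gpus", "all",  # dropped when --no-gpu is used
        "-v", f"{model_path}:/model:ro",
        "-p", "8001:8000",
        "vllm/vllm-openai:latest",
        "--model", "/model",
        "--served-model-name", model_path.name,
    ],
    check=True,
)
```
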
vllmd-0.2.0/pyproject.toml
ADDED
@@ -0,0 +1,42 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "vllmd"
+version = "0.2.0"
+description = "Run local models via vLLM in Docker containers"
+readme = "README.md"
+requires-python = ">=3.10"
+license = "MIT"
+dependencies = [
+    "typer>=0.9",
+    "rich>=13.0",
+    "chromadb>=0.5",
+]
+
+[project.optional-dependencies]
+dev = [
+    "ruff>=0.4",
+    "pytest>=8.0",
+]
+
+[project.scripts]
+vllmd = "vllmd.cli:app"
+
+[tool.hatch.build.targets.wheel]
+include = ["src/vllmd/**"]
+
+[tool.ruff]
+target-version = "py310"
+line-length = 88
+
+[tool.ruff.lint]
+select = ["E", "F", "I", "N", "UP", "B", "SIM", "RUF"]
+
+[tool.ruff.lint.isort]
+known-first-party = ["vllmd"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+pythonpath = ["src"]

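The `[project.scripts]` table maps the `vllmd` command to `vllmd.cli:app`, and `typer` is the CLI framework in the dependency list. Below is a hypothetical skeleton of how such an entry point is typically wired with Typer; `src/vllmd/cli.py` itself is not part of this view, so the command body is illustrative only.

```python
# Hypothetical skeleton matching the `vllmd = "vllmd.cli:app"` entry point.
# The real cli.py (292 lines) is not shown in this diff; only the wiring is sketched.
import typer

app = typer.Typer(help="Run local models via vLLM in Docker containers")


@app.command()
def run(
    model: str = typer.Option(..., "--model", "-m", help="Path to the model directory"),
    port: int = typer.Option(8000, "--port", "-p", help="Host port for the vLLM API"),
) -> None:
    """Start a vLLM container for a model (stub)."""
    typer.echo(f"would start a vLLM container for {model} on port {port}")


if __name__ == "__main__":
    app()
```

A Typer instance is callable, which is what lets the console script point directly at `app`.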