vllmd-0.2.0.tar.gz

vllmd-0.2.0/.github/workflows/ci.yml ADDED
@@ -0,0 +1,19 @@
+ name: CI
+
+ on:
+   push:
+     branches: ["**"]
+   pull_request:
+   workflow_call:
+
+ jobs:
+   test:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v4
+       - uses: astral-sh/setup-uv@v4
+       - run: uv venv
+       - run: uv pip install -e ".[dev]"
+       - run: uv run ruff check src/ tests/
+       - run: uv run ruff format --check src/ tests/
+       - run: uv run pytest
@@ -0,0 +1,29 @@
+ name: Publish to PyPI
+
+ on:
+   push:
+     tags:
+       - "v*"
+
+ permissions:
+   id-token: write  # trusted publishing (no API token needed)
+
+ jobs:
+   test:
+     uses: ./.github/workflows/ci.yml
+
+   publish:
+     needs: test
+     runs-on: ubuntu-latest
+     environment: pypi
+     steps:
+       - uses: actions/checkout@v4
+       - uses: astral-sh/setup-uv@v4
+
+       - name: Build package
+         run: uv build
+
+       - name: Publish to PyPI
+         uses: pypa/gh-action-pypi-publish@release/v1
+         with:
+           attestations: false
vllmd-0.2.0/.gitignore ADDED
@@ -0,0 +1,12 @@
+ .claude/
+ __pycache__/
+ *.py[cod]
+ *.egg-info/
+ dist/
+ build/
+ .venv/
+ venv/
+ .env
+ *.log
+ .pytest_cache/
+ .ruff_cache/
vllmd-0.2.0/Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.12-slim
+
+ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
+
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     docker.io \
+     && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+ COPY . /app
+ RUN uv pip install --system --no-cache .
+
+ ENTRYPOINT ["vllmd"]
vllmd-0.2.0/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 sroomberg
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
vllmd-0.2.0/PKG-INFO ADDED
@@ -0,0 +1,238 @@
+ Metadata-Version: 2.4
+ Name: vllmd
+ Version: 0.2.0
+ Summary: Run local models via vLLM in Docker containers
+ License-Expression: MIT
+ License-File: LICENSE
+ Requires-Python: >=3.10
+ Requires-Dist: chromadb>=0.5
+ Requires-Dist: rich>=13.0
+ Requires-Dist: typer>=0.9
+ Provides-Extra: dev
+ Requires-Dist: pytest>=8.0; extra == 'dev'
+ Requires-Dist: ruff>=0.4; extra == 'dev'
+ Description-Content-Type: text/markdown
+
+ # vllmd
+
+ > **⚠️ Experimental** — This project is under active development. APIs, config format, and CLI flags may change without notice.
+
+ Run a model that is already on the machine in a [vLLM](https://github.com/vllm-project/vllm) container and serve it on a specified port with an OpenAI-compatible API.
+
+ ## Install
+
+ ```bash
+ uv pip install -e ".[dev]"
+ ```
+
+ Requires Docker with the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) for GPU support.
+
+ ## Quick Start
+
+ ```bash
+ # Serve a model on port 8000 (foreground — streams vLLM logs until Ctrl+C)
+ vllmd run --model /path/to/my-model --port 8000
+
+ # Run in the background; wait for the API to be ready, then return
+ vllmd run --model /path/to/my-model --port 8000 -d
+
+ # Run in the background and return immediately without waiting
+ vllmd run --model /path/to/my-model --port 8000 -d --no-wait
+
+ # CPU-only (no GPU)
+ vllmd run --model /path/to/my-model --port 8000 --no-gpu
+
+ # Check if the container is up and the API is healthy
+ vllmd status
+
+ # Stream container logs
+ vllmd logs --follow
+
+ # Stop the container
+ vllmd stop
+ ```
+
+ ## Multiple Models
+
+ Multiple models can run concurrently, each in its own container on a different port. The container name defaults to `vllmd-<model-dir-name>`.
+
+ ```bash
+ # Start two models on different ports
+ vllmd run --model /models/llama3 --port 8001 -d
+ vllmd run --model /models/mistral --port 8002 -d
+
+ # List all running vllmd containers
+ vllmd ps
+
+ # Check health of all containers at once
+ vllmd status
+
+ # Stop a specific container
+ vllmd stop --name vllmd-llama3
+
+ # Stop all vllmd containers
+ vllmd stop --all
+ ```
+
+ When only one container is running, `stop`, `status`, `logs`, and `session create` all auto-resolve to it without needing `--name`.
+
+ ## How It Works
+
+ 1. `vllmd run` resolves the model path and pulls `vllm/vllm-openai:latest` if needed
+ 2. A Docker container is started with the model directory mounted read-only at `/model`
+ 3. vLLM serves the model on port 8000 inside the container, mapped to `--port` on the host
+ 4. The served model ID is the directory name of the model path
+ 5. The endpoint exposes a standard OpenAI-compatible API at `http://localhost:<port>/v1`
+
+ ## Commands
+
+ | Command | Description |
+ |---------|-------------|
+ | `run` | Start a vLLM container for a model |
+ | `ps` | List all running vllmd containers |
+ | `stop` | Stop a container (`--all` to stop every managed container) |
+ | `status` | Show container and API health (all containers if no `--name`) |
+ | `logs` | Print container logs |
+ | `session create` | Create a persistent chat session |
+ | `session chat` | Send a one-shot message in a session |
+ | `session attach` | Open an interactive REPL for a session |
+ | `session list` | List all sessions |
+ | `session history` | Print conversation history |
+ | `session clear` | Clear conversation history |
+ | `session delete` | Delete a session |
+ | `db ingest` | Add documents or code to the vector database |
+ | `db search` | Query the vector database for relevant context |
+ | `db history` | Store a conversation message |
+ | `db summarize` | Replace a session's history with an abridged summary |
+ | `db sync` | Sync the vector DB to/from S3 |
+ | `db stats` | Show collection sizes |
+
+ ## Options
+
+ ### `run`
+
+ | Flag | Default | Description |
+ |------|---------|-------------|
+ | `--model`, `-m` | (required) | Path to the model directory |
+ | `--port`, `-p` | `8000` | Host port for the vLLM API |
+ | `--name`, `-n` | `vllmd-<model-dir>` | Docker container name |
+ | `--gpu/--no-gpu` | `--gpu` | Enable/disable GPU passthrough |
+ | `--dtype` | `auto` | Model dtype (`auto`, `float16`, `bfloat16`, `float32`) |
+ | `--max-model-len` | — | Override max context length |
+ | `--detach`, `-d` | `false` | Start in background |
+ | `--wait/--no-wait` | `--wait` | Wait for API to be ready (implies background start) |
+
+ Extra positional arguments are forwarded verbatim to vLLM.
+
+ ## Sessions
+
+ Sessions are persistent, named conversations tied to a running model. Each session maintains sequential conversation history and optionally retrieves semantic context from the vector database.
+
+ Sessions are stored as JSON files in `~/.vllmd/sessions/` (override with `--sessions-dir`).
+
+ ```bash
+ # Create a session (auto-resolves endpoint if one container is running)
+ vllmd session create my-session
+
+ # Create a session bound to a specific container, with context retrieval
+ vllmd session create my-session \
+   --container vllmd-llama3 \
+   --embedding-model llama3 \
+   --system-prompt "You are a helpful coding assistant."
+
+ # One-shot message
+ vllmd session chat my-session "Explain the main training loop"
+
+ # Interactive REPL (supports /history, /context <query>, /reset, /exit)
+ vllmd session attach my-session
+
+ # View conversation history
+ vllmd session history my-session --last 10
+
+ # List all sessions
+ vllmd session list
+
+ # Clear history (keeps session config)
+ vllmd session clear my-session
+
+ # Delete a session
+ vllmd session delete my-session
+ ```
+
+ ### Context retrieval
+
+ When a session is created with `--embedding-model`, each message automatically retrieves the most relevant chunks from the session's vector store (documents and code) and injects them as system context before the conversation history. Exchanges are also stored in the ChromaDB history collection for future semantic search.
+
+ If the embedding endpoint is unavailable, retrieval is silently skipped and the session continues with history-only context.
+
+ ## Vector Context Database
+
+ vllmd includes a local vector database (backed by [ChromaDB](https://docs.trychroma.com/)) that stores documents, code, and conversation history as embeddings. Embeddings are generated using the same vLLM server the model runs on.
+
+ ```bash
+ # Ingest a directory of documents
+ vllmd db ingest ./docs --type documents --model my-model
+
+ # Ingest a codebase
+ vllmd db ingest ./src --type code --model my-model
+
+ # Search for relevant context
+ vllmd db search "how does auth work" --collection code --model my-model
+
+ # Store a conversation message
+ vllmd db history "Explain the main loop" --role user --session my-session --model my-model
+
+ # Abridge old history with a summary
+ vllmd db summarize --session my-session "Previous conversation covered auth and the main loop." --model my-model
+
+ # Push DB to S3
+ vllmd db sync s3://my-bucket/vectordb --direction push
+
+ # Pull DB from S3
+ vllmd db sync s3://my-bucket/vectordb --direction pull
+
+ # Show collection sizes
+ vllmd db stats
+ ```
+
+ The DB directory (`./vectordb` by default, override with `--db-path`) can be mounted as a Docker volume for persistence and shared across machines via S3 sync.
+
+ ## Using the API
+
+ Once running, the endpoint is OpenAI-compatible:
+
+ ```bash
+ curl http://localhost:8000/v1/chat/completions \
+   -H "Content-Type: application/json" \
+   -d '{
+     "model": "my-model",
+     "messages": [{"role": "user", "content": "Hello!"}]
+   }'
+ ```
+
+ Works out of the box with [AgentTester](https://github.com/sroomberg/agenttester):
+
+ ```yaml
+ # agent-tester.yaml
+ agents:
+   my-model:
+     command: 'agent-tester query http://localhost:8000 my-model {prompt}'
+     host: localhost
+     commit_style: manual
+     timeout: 120
+ ```
+
+ ## Development
+
+ ```bash
+ uv pip install -e ".[dev]"
+ ruff check src/ tests/
+ ruff format src/ tests/
+ pytest
+ ```
+
+ ## Docker
+
+ ```bash
+ MODEL_PATH=/path/to/my-model docker compose run --rm vllmd run --model /model --port 8000
+ ```
vllmd-0.2.0/README.md ADDED
@@ -0,0 +1,223 @@
+ # vllmd
+
+ > **⚠️ Experimental** — This project is under active development. APIs, config format, and CLI flags may change without notice.
+
+ Run a model that is already on the machine in a [vLLM](https://github.com/vllm-project/vllm) container and serve it on a specified port with an OpenAI-compatible API.
+
+ ## Install
+
+ ```bash
+ uv pip install -e ".[dev]"
+ ```
+
+ Requires Docker with the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) for GPU support.
+
+ ## Quick Start
+
+ ```bash
+ # Serve a model on port 8000 (foreground — streams vLLM logs until Ctrl+C)
+ vllmd run --model /path/to/my-model --port 8000
+
+ # Run in the background; wait for the API to be ready, then return
+ vllmd run --model /path/to/my-model --port 8000 -d
+
+ # Run in the background and return immediately without waiting
+ vllmd run --model /path/to/my-model --port 8000 -d --no-wait
+
+ # CPU-only (no GPU)
+ vllmd run --model /path/to/my-model --port 8000 --no-gpu
+
+ # Check if the container is up and the API is healthy
+ vllmd status
+
+ # Stream container logs
+ vllmd logs --follow
+
+ # Stop the container
+ vllmd stop
+ ```
+
+ ## Multiple Models
+
+ Multiple models can run concurrently, each in its own container on a different port. The container name defaults to `vllmd-<model-dir-name>`.
+
+ ```bash
+ # Start two models on different ports
+ vllmd run --model /models/llama3 --port 8001 -d
+ vllmd run --model /models/mistral --port 8002 -d
+
+ # List all running vllmd containers
+ vllmd ps
+
+ # Check health of all containers at once
+ vllmd status
+
+ # Stop a specific container
+ vllmd stop --name vllmd-llama3
+
+ # Stop all vllmd containers
+ vllmd stop --all
+ ```
+
+ When only one container is running, `stop`, `status`, `logs`, and `session create` all auto-resolve to it without needing `--name`.
+
+ ## How It Works
+
+ 1. `vllmd run` resolves the model path and pulls `vllm/vllm-openai:latest` if needed
+ 2. A Docker container is started with the model directory mounted read-only at `/model`
+ 3. vLLM serves the model on port 8000 inside the container, mapped to `--port` on the host
+ 4. The served model ID is the directory name of the model path
+ 5. The endpoint exposes a standard OpenAI-compatible API at `http://localhost:<port>/v1`
+
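+ As a quick sanity check (a sketch, not part of vllmd), the model-ID behaviour in step 4 can be verified against the standard OpenAI-compatible `/v1/models` endpoint:
+
+ ```python
+ import json
+ from urllib.request import urlopen
+
+ # Assumes a container started with --port 8000; the served model ID should
+ # equal the model directory's name (e.g. "my-model" for /path/to/my-model).
+ with urlopen("http://localhost:8000/v1/models") as resp:
+     models = json.load(resp)
+
+ for model in models["data"]:
+     print(model["id"])
+ ```
+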
+ ## Commands
+
+ | Command | Description |
+ |---------|-------------|
+ | `run` | Start a vLLM container for a model |
+ | `ps` | List all running vllmd containers |
+ | `stop` | Stop a container (`--all` to stop every managed container) |
+ | `status` | Show container and API health (all containers if no `--name`) |
+ | `logs` | Print container logs |
+ | `session create` | Create a persistent chat session |
+ | `session chat` | Send a one-shot message in a session |
+ | `session attach` | Open an interactive REPL for a session |
+ | `session list` | List all sessions |
+ | `session history` | Print conversation history |
+ | `session clear` | Clear conversation history |
+ | `session delete` | Delete a session |
+ | `db ingest` | Add documents or code to the vector database |
+ | `db search` | Query the vector database for relevant context |
+ | `db history` | Store a conversation message |
+ | `db summarize` | Replace a session's history with an abridged summary |
+ | `db sync` | Sync the vector DB to/from S3 |
+ | `db stats` | Show collection sizes |
+
+ ## Options
+
+ ### `run`
+
+ | Flag | Default | Description |
+ |------|---------|-------------|
+ | `--model`, `-m` | (required) | Path to the model directory |
+ | `--port`, `-p` | `8000` | Host port for the vLLM API |
+ | `--name`, `-n` | `vllmd-<model-dir>` | Docker container name |
+ | `--gpu/--no-gpu` | `--gpu` | Enable/disable GPU passthrough |
+ | `--dtype` | `auto` | Model dtype (`auto`, `float16`, `bfloat16`, `float32`) |
+ | `--max-model-len` | — | Override max context length |
+ | `--detach`, `-d` | `false` | Start in background |
+ | `--wait/--no-wait` | `--wait` | Wait for API to be ready (implies background start) |
+
+ Extra positional arguments are forwarded verbatim to vLLM.
+
+ ## Sessions
+
+ Sessions are persistent, named conversations tied to a running model. Each session maintains sequential conversation history and optionally retrieves semantic context from the vector database.
+
+ Sessions are stored as JSON files in `~/.vllmd/sessions/` (override with `--sessions-dir`).
+
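+ Because each session is just a JSON file in that directory, it can also be inspected outside the CLI. A minimal sketch (not part of vllmd, and making no assumptions about the files' schema):
+
+ ```python
+ from pathlib import Path
+
+ # List stored sessions by filename; `vllmd session list` is the supported way.
+ sessions_dir = Path.home() / ".vllmd" / "sessions"
+ for path in sorted(sessions_dir.glob("*.json")):
+     print(f"{path.stem:<30} {path.stat().st_size} bytes")
+ ```
+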
+ ```bash
+ # Create a session (auto-resolves endpoint if one container is running)
+ vllmd session create my-session
+
+ # Create a session bound to a specific container, with context retrieval
+ vllmd session create my-session \
+   --container vllmd-llama3 \
+   --embedding-model llama3 \
+   --system-prompt "You are a helpful coding assistant."
+
+ # One-shot message
+ vllmd session chat my-session "Explain the main training loop"
+
+ # Interactive REPL (supports /history, /context <query>, /reset, /exit)
+ vllmd session attach my-session
+
+ # View conversation history
+ vllmd session history my-session --last 10
+
+ # List all sessions
+ vllmd session list
+
+ # Clear history (keeps session config)
+ vllmd session clear my-session
+
+ # Delete a session
+ vllmd session delete my-session
+ ```
+
+ ### Context retrieval
+
+ When a session is created with `--embedding-model`, each message automatically retrieves the most relevant chunks from the session's vector store (documents and code) and injects them as system context before the conversation history. Exchanges are also stored in the ChromaDB history collection for future semantic search.
+
+ If the embedding endpoint is unavailable, retrieval is silently skipped and the session continues with history-only context.
+
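+ A rough sketch of that assembly order (illustrative only, not vllmd's actual code): retrieved chunks, when present, go into a system message ahead of the stored history.
+
+ ```python
+ def build_messages(system_prompt, retrieved_chunks, history, user_message):
+     """Assemble an OpenAI-style message list: system prompt, retrieved context, history, new turn."""
+     messages = [{"role": "system", "content": system_prompt}]
+     if retrieved_chunks:  # empty when the embedding endpoint is unavailable
+         context = "\n\n".join(retrieved_chunks)
+         messages.append({"role": "system", "content": f"Relevant context:\n{context}"})
+     messages.extend(history)                                    # prior user/assistant turns
+     messages.append({"role": "user", "content": user_message})  # the new message
+     return messages
+ ```
+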
+ ## Vector Context Database
+
+ vllmd includes a local vector database (backed by [ChromaDB](https://docs.trychroma.com/)) that stores documents, code, and conversation history as embeddings. Embeddings are generated using the same vLLM server the model runs on.
+
+ ```bash
+ # Ingest a directory of documents
+ vllmd db ingest ./docs --type documents --model my-model
+
+ # Ingest a codebase
+ vllmd db ingest ./src --type code --model my-model
+
+ # Search for relevant context
+ vllmd db search "how does auth work" --collection code --model my-model
+
+ # Store a conversation message
+ vllmd db history "Explain the main loop" --role user --session my-session --model my-model
+
+ # Abridge old history with a summary
+ vllmd db summarize --session my-session "Previous conversation covered auth and the main loop." --model my-model
+
+ # Push DB to S3
+ vllmd db sync s3://my-bucket/vectordb --direction push
+
+ # Pull DB from S3
+ vllmd db sync s3://my-bucket/vectordb --direction pull
+
+ # Show collection sizes
+ vllmd db stats
+ ```
+
+ The DB directory (`./vectordb` by default, override with `--db-path`) can be mounted as a Docker volume for persistence and shared across machines via S3 sync.
+
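+ Since the store is plain ChromaDB on disk, it can also be opened directly with the `chromadb` client. A minimal sketch; the collection name is an assumption based on the "history collection" mentioned above:
+
+ ```python
+ import chromadb
+
+ client = chromadb.PersistentClient(path="./vectordb")  # the default --db-path
+ history = client.get_or_create_collection("history")   # assumed collection name
+ print("stored history entries:", history.count())
+ ```
+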
+ ## Using the API
+
+ Once running, the endpoint is OpenAI-compatible:
+
+ ```bash
+ curl http://localhost:8000/v1/chat/completions \
+   -H "Content-Type: application/json" \
+   -d '{
+     "model": "my-model",
+     "messages": [{"role": "user", "content": "Hello!"}]
+   }'
+ ```
+
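+ The same request can be made from Python with the `openai` client (a sketch; the client is not a vllmd dependency and must be installed separately):
+
+ ```python
+ from openai import OpenAI
+
+ # vLLM's OpenAI-compatible server does not require a real API key by default.
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")
+ response = client.chat.completions.create(
+     model="my-model",  # the model directory name, per "How It Works"
+     messages=[{"role": "user", "content": "Hello!"}],
+ )
+ print(response.choices[0].message.content)
+ ```
+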
+ Works out of the box with [AgentTester](https://github.com/sroomberg/agenttester):
+
+ ```yaml
+ # agent-tester.yaml
+ agents:
+   my-model:
+     command: 'agent-tester query http://localhost:8000 my-model {prompt}'
+     host: localhost
+     commit_style: manual
+     timeout: 120
+ ```
+
+ ## Development
+
+ ```bash
+ uv pip install -e ".[dev]"
+ ruff check src/ tests/
+ ruff format src/ tests/
+ pytest
+ ```
+
+ ## Docker
+
+ ```bash
+ MODEL_PATH=/path/to/my-model docker compose run --rm vllmd run --model /model --port 8000
+ ```
@@ -0,0 +1,10 @@
+ services:
+   vllmd:
+     build: .
+     volumes:
+       # Mount the host Docker socket so vllmd can manage containers
+       - /var/run/docker.sock:/var/run/docker.sock
+       # Mount the model directory into the tool container
+       - ${MODEL_PATH:-/tmp}:/model:ro
+     environment:
+       - MODEL_PATH=/model
@@ -0,0 +1,42 @@
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [project]
+ name = "vllmd"
+ version = "0.2.0"
+ description = "Run local models via vLLM in Docker containers"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ license = "MIT"
+ dependencies = [
+     "typer>=0.9",
+     "rich>=13.0",
+     "chromadb>=0.5",
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "ruff>=0.4",
+     "pytest>=8.0",
+ ]
+
+ [project.scripts]
+ vllmd = "vllmd.cli:app"
+
+ [tool.hatch.build.targets.wheel]
+ include = ["src/vllmd/**"]
+
+ [tool.ruff]
+ target-version = "py310"
+ line-length = 88
+
+ [tool.ruff.lint]
+ select = ["E", "F", "I", "N", "UP", "B", "SIM", "RUF"]
+
+ [tool.ruff.lint.isort]
+ known-first-party = ["vllmd"]
+
+ [tool.pytest.ini_options]
+ testpaths = ["tests"]
+ pythonpath = ["src"]
@@ -0,0 +1,5 @@
+ """vllmd — run local models via vLLM in Docker containers."""
+
+ from .runner import RunConfig, logs, start, status, stop
+
+ __all__ = ["RunConfig", "logs", "start", "status", "stop"]