vserve 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vserve-0.1.0/.githooks/pre-commit +15 -0
- vserve-0.1.0/.github/workflows/ci.yml +19 -0
- vserve-0.1.0/.github/workflows/publish.yml +46 -0
- vserve-0.1.0/.gitignore +10 -0
- vserve-0.1.0/LICENSE +21 -0
- vserve-0.1.0/PKG-INFO +156 -0
- vserve-0.1.0/README.md +129 -0
- vserve-0.1.0/docs/troubleshooting.md +142 -0
- vserve-0.1.0/pyproject.toml +47 -0
- vserve-0.1.0/src/vserve/__init__.py +1 -0
- vserve-0.1.0/src/vserve/cli.py +1485 -0
- vserve-0.1.0/src/vserve/compare.py +43 -0
- vserve-0.1.0/src/vserve/config.py +266 -0
- vserve-0.1.0/src/vserve/fan.py +334 -0
- vserve-0.1.0/src/vserve/gpu.py +117 -0
- vserve-0.1.0/src/vserve/lock.py +196 -0
- vserve-0.1.0/src/vserve/models.py +136 -0
- vserve-0.1.0/src/vserve/probe.py +86 -0
- vserve-0.1.0/src/vserve/serve.py +39 -0
- vserve-0.1.0/src/vserve/variants.py +193 -0
- vserve-0.1.0/src/vserve/welcome.sh +146 -0
- vserve-0.1.0/tests/conftest.py +84 -0
- vserve-0.1.0/tests/test_cli.py +117 -0
- vserve-0.1.0/tests/test_cli_tune.py +121 -0
- vserve-0.1.0/tests/test_compare.py +53 -0
- vserve-0.1.0/tests/test_config.py +214 -0
- vserve-0.1.0/tests/test_fan.py +268 -0
- vserve-0.1.0/tests/test_gpu.py +74 -0
- vserve-0.1.0/tests/test_imports.py +84 -0
- vserve-0.1.0/tests/test_lock.py +246 -0
- vserve-0.1.0/tests/test_models.py +187 -0
- vserve-0.1.0/tests/test_probe.py +212 -0
- vserve-0.1.0/tests/test_serve.py +73 -0
- vserve-0.1.0/tests/test_variants.py +332 -0
- vserve-0.1.0/uv.lock +576 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
set -e
|
|
3
|
+
|
|
4
|
+
cd "$(git rev-parse --show-toplevel)"
|
|
5
|
+
|
|
6
|
+
echo "=== ruff ==="
|
|
7
|
+
uv run ruff check src/ tests/
|
|
8
|
+
|
|
9
|
+
echo "=== mypy ==="
|
|
10
|
+
uv run mypy src/vserve/ --ignore-missing-imports
|
|
11
|
+
|
|
12
|
+
echo "=== pytest ==="
|
|
13
|
+
uv run pytest tests/ -q --tb=short
|
|
14
|
+
|
|
15
|
+
echo "=== all checks passed ==="
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v6
|
|
14
|
+
- uses: astral-sh/setup-uv@v7
|
|
15
|
+
- run: uv python install 3.13
|
|
16
|
+
- run: uv sync --dev
|
|
17
|
+
- run: uv run ruff check src/ tests/
|
|
18
|
+
- run: uv run mypy src/vserve/ --ignore-missing-imports
|
|
19
|
+
- run: uv run pytest tests/ -q --tb=short
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v6
|
|
13
|
+
- uses: astral-sh/setup-uv@v7
|
|
14
|
+
- run: uv python install 3.13
|
|
15
|
+
- run: uv sync --dev
|
|
16
|
+
- run: uv run ruff check src/ tests/
|
|
17
|
+
- run: uv run mypy src/vserve/ --ignore-missing-imports
|
|
18
|
+
- run: uv run pytest tests/ -q --tb=short
|
|
19
|
+
|
|
20
|
+
build:
|
|
21
|
+
runs-on: ubuntu-latest
|
|
22
|
+
needs: test
|
|
23
|
+
steps:
|
|
24
|
+
- uses: actions/checkout@v6
|
|
25
|
+
- uses: astral-sh/setup-uv@v7
|
|
26
|
+
- run: uv python install 3.13
|
|
27
|
+
- run: uv build
|
|
28
|
+
- uses: actions/upload-artifact@v4
|
|
29
|
+
with:
|
|
30
|
+
name: dist
|
|
31
|
+
path: dist/
|
|
32
|
+
|
|
33
|
+
publish:
|
|
34
|
+
runs-on: ubuntu-latest
|
|
35
|
+
needs: build
|
|
36
|
+
environment:
|
|
37
|
+
name: pypi
|
|
38
|
+
url: https://pypi.org/p/vserve
|
|
39
|
+
permissions:
|
|
40
|
+
id-token: write
|
|
41
|
+
steps:
|
|
42
|
+
- uses: actions/download-artifact@v4
|
|
43
|
+
with:
|
|
44
|
+
name: dist
|
|
45
|
+
path: dist/
|
|
46
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
vserve-0.1.0/.gitignore
ADDED
vserve-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mohan Qiao
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
vserve-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vserve
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CLI for managing vLLM inference on GPU workstations
|
|
5
|
+
Project-URL: Homepage, https://github.com/Gavin-Qiao/vserve
|
|
6
|
+
Project-URL: Repository, https://github.com/Gavin-Qiao/vserve
|
|
7
|
+
Project-URL: Issues, https://github.com/Gavin-Qiao/vserve/issues
|
|
8
|
+
Author: Mohan Qiao
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Environment :: GPU :: NVIDIA CUDA
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.12
|
|
21
|
+
Requires-Dist: huggingface-hub>=0.30
|
|
22
|
+
Requires-Dist: nvidia-ml-py>=13.595.45
|
|
23
|
+
Requires-Dist: pyyaml>=6
|
|
24
|
+
Requires-Dist: rich>=14
|
|
25
|
+
Requires-Dist: typer>=0.15
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
<div align="center">
|
|
29
|
+
|
|
30
|
+
# vserve
|
|
31
|
+
|
|
32
|
+
**A CLI for managing vLLM inference on GPU workstations.**
|
|
33
|
+
|
|
34
|
+
Download models. Calculate limits. Serve with one command. Control fans.
|
|
35
|
+
|
|
36
|
+

|
|
37
|
+

|
|
38
|
+

|
|
39
|
+

|
|
40
|
+
|
|
41
|
+
</div>
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Install
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
uv tool install vserve
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Or with pip:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install vserve
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
## Quick Start
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
vserve init # scan GPU, vLLM, CUDA, systemd — write config
|
|
63
|
+
vserve download # search HuggingFace, pick variant, download
|
|
64
|
+
vserve tune <model> # calculate context/concurrency limits for your GPU
|
|
65
|
+
vserve start <model> # interactive config → serve via systemd
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## What It Does
|
|
71
|
+
|
|
72
|
+
**vserve** manages the full lifecycle of serving LLMs with vLLM on a GPU workstation:
|
|
73
|
+
|
|
74
|
+
- **Download** — search HuggingFace, see available weight variants (MXFP4, BF16, GGUF) with sizes, download only what you need
|
|
75
|
+
- **Tune** — calculate exactly what context lengths and concurrency your GPU can handle, based on model architecture and available VRAM
|
|
76
|
+
- **Start/Stop** — interactive config wizard, systemd service management, health check with timeout
|
|
77
|
+
- **Fan control** — temperature-based curve daemon with quiet hours, or hold a fixed speed
|
|
78
|
+
- **Multi-user** — file-based locking, terminal notifications when another user holds the GPU
|
|
79
|
+
- **Doctor** — diagnose GPU, CUDA, vLLM, systemd issues with actionable fix suggestions
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## Commands
|
|
84
|
+
|
|
85
|
+
| Command | Description |
|
|
86
|
+
|:--------|:------------|
|
|
87
|
+
| `vserve` | Dashboard — GPU, models, status |
|
|
88
|
+
| `vserve init` | Auto-discover vLLM and write config |
|
|
89
|
+
| `vserve download [model]` | Search and download from HuggingFace with variant picker |
|
|
90
|
+
| `vserve models [name]` | List models or show detail (fuzzy match) |
|
|
91
|
+
| `vserve tune <model>` | Calculate context/concurrency limits for your GPU |
|
|
92
|
+
| `vserve start [model]` | Configure and start serving |
|
|
93
|
+
| `vserve stop` | Stop the vLLM service |
|
|
94
|
+
| `vserve status` | Show current serving config |
|
|
95
|
+
| `vserve fan [auto\|off\|30-100]` | GPU fan control with temp-based curve |
|
|
96
|
+
| `vserve doctor` | Check system readiness |
|
|
97
|
+
|
|
98
|
+
All commands support **fuzzy matching** — `vserve start qwen fp8` finds the right model.
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Prerequisites
|
|
103
|
+
|
|
104
|
+
| Requirement | Check | Install |
|
|
105
|
+
|------------|-------|---------|
|
|
106
|
+
| NVIDIA GPU + drivers | `nvidia-smi` | [nvidia.com/drivers](https://www.nvidia.com/drivers) |
|
|
107
|
+
| CUDA toolkit | `nvcc --version` | `sudo apt install nvidia-cuda-toolkit` |
|
|
108
|
+
| vLLM 0.18+ | `vllm --version` | [docs.vllm.ai](https://docs.vllm.ai/en/latest/getting_started/installation.html) |
|
|
109
|
+
| systemd | (most Linux servers) | See [troubleshooting](docs/troubleshooting.md) |
|
|
110
|
+
| sudo access | for systemctl, fan control | |
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## Configuration
|
|
115
|
+
|
|
116
|
+
Auto-discovered on first run. Override at `~/.config/vserve/config.yaml`:
|
|
117
|
+
|
|
118
|
+
```yaml
|
|
119
|
+
vllm_root: /opt/vllm
|
|
120
|
+
cuda_home: /usr/local/cuda
|
|
121
|
+
service_name: vllm
|
|
122
|
+
service_user: vllm
|
|
123
|
+
port: 8888
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
## Fan Control
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
vserve fan # show status, interactive menu
|
|
132
|
+
vserve fan auto # temp-based curve with quiet hours
|
|
133
|
+
vserve fan 80 # hold at 80% (persistent daemon)
|
|
134
|
+
vserve fan off # stop daemon, restore NVIDIA auto
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
The auto curve ramps with temperature and caps fan speed during quiet hours (configurable). An emergency override at 88°C ignores quiet hours.
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
## Development
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
git clone https://github.com/Gavin-Qiao/vserve.git
|
|
145
|
+
cd vserve
|
|
146
|
+
uv sync --dev
|
|
147
|
+
uv run pytest tests/ # 175 tests
|
|
148
|
+
uv run ruff check src/ tests/ # lint
|
|
149
|
+
uv run mypy src/vserve/ # type check
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## License
|
|
155
|
+
|
|
156
|
+
[MIT](LICENSE)
|
vserve-0.1.0/README.md
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
# vserve
|
|
4
|
+
|
|
5
|
+
**A CLI for managing vLLM inference on GPU workstations.**
|
|
6
|
+
|
|
7
|
+
Download models. Calculate limits. Serve with one command. Control fans.
|
|
8
|
+
|
|
9
|
+

|
|
10
|
+

|
|
11
|
+

|
|
12
|
+

|
|
13
|
+
|
|
14
|
+
</div>
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## Install
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
uv tool install vserve
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Or with pip:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install vserve
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## Quick Start
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
vserve init # scan GPU, vLLM, CUDA, systemd — write config
|
|
36
|
+
vserve download # search HuggingFace, pick variant, download
|
|
37
|
+
vserve tune <model> # calculate context/concurrency limits for your GPU
|
|
38
|
+
vserve start <model> # interactive config → serve via systemd
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## What It Does
|
|
44
|
+
|
|
45
|
+
**vserve** manages the full lifecycle of serving LLMs with vLLM on a GPU workstation:
|
|
46
|
+
|
|
47
|
+
- **Download** — search HuggingFace, see available weight variants (MXFP4, BF16, GGUF) with sizes, download only what you need
|
|
48
|
+
- **Tune** — calculate exactly what context lengths and concurrency your GPU can handle, based on model architecture and available VRAM
|
|
49
|
+
- **Start/Stop** — interactive config wizard, systemd service management, health check with timeout
|
|
50
|
+
- **Fan control** — temperature-based curve daemon with quiet hours, or hold a fixed speed
|
|
51
|
+
- **Multi-user** — file-based locking, terminal notifications when another user holds the GPU
|
|
52
|
+
- **Doctor** — diagnose GPU, CUDA, vLLM, systemd issues with actionable fix suggestions
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Commands
|
|
57
|
+
|
|
58
|
+
| Command | Description |
|
|
59
|
+
|:--------|:------------|
|
|
60
|
+
| `vserve` | Dashboard — GPU, models, status |
|
|
61
|
+
| `vserve init` | Auto-discover vLLM and write config |
|
|
62
|
+
| `vserve download [model]` | Search and download from HuggingFace with variant picker |
|
|
63
|
+
| `vserve models [name]` | List models or show detail (fuzzy match) |
|
|
64
|
+
| `vserve tune <model>` | Calculate context/concurrency limits for your GPU |
|
|
65
|
+
| `vserve start [model]` | Configure and start serving |
|
|
66
|
+
| `vserve stop` | Stop the vLLM service |
|
|
67
|
+
| `vserve status` | Show current serving config |
|
|
68
|
+
| `vserve fan [auto\|off\|30-100]` | GPU fan control with temp-based curve |
|
|
69
|
+
| `vserve doctor` | Check system readiness |
|
|
70
|
+
|
|
71
|
+
All commands support **fuzzy matching** — `vserve start qwen fp8` finds the right model.
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Prerequisites
|
|
76
|
+
|
|
77
|
+
| Requirement | Check | Install |
|
|
78
|
+
|------------|-------|---------|
|
|
79
|
+
| NVIDIA GPU + drivers | `nvidia-smi` | [nvidia.com/drivers](https://www.nvidia.com/drivers) |
|
|
80
|
+
| CUDA toolkit | `nvcc --version` | `sudo apt install nvidia-cuda-toolkit` |
|
|
81
|
+
| vLLM 0.18+ | `vllm --version` | [docs.vllm.ai](https://docs.vllm.ai/en/latest/getting_started/installation.html) |
|
|
82
|
+
| systemd | (most Linux servers) | See [troubleshooting](docs/troubleshooting.md) |
|
|
83
|
+
| sudo access | for systemctl, fan control | |
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## Configuration
|
|
88
|
+
|
|
89
|
+
Auto-discovered on first run. Override at `~/.config/vserve/config.yaml`:
|
|
90
|
+
|
|
91
|
+
```yaml
|
|
92
|
+
vllm_root: /opt/vllm
|
|
93
|
+
cuda_home: /usr/local/cuda
|
|
94
|
+
service_name: vllm
|
|
95
|
+
service_user: vllm
|
|
96
|
+
port: 8888
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## Fan Control
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
vserve fan # show status, interactive menu
|
|
105
|
+
vserve fan auto # temp-based curve with quiet hours
|
|
106
|
+
vserve fan 80 # hold at 80% (persistent daemon)
|
|
107
|
+
vserve fan off # stop daemon, restore NVIDIA auto
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
The auto curve ramps with temperature and caps fan speed during quiet hours (configurable). An emergency override at 88°C ignores quiet hours.
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## Development
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
git clone https://github.com/Gavin-Qiao/vserve.git
|
|
118
|
+
cd vserve
|
|
119
|
+
uv sync --dev
|
|
120
|
+
uv run pytest tests/ # 175 tests
|
|
121
|
+
uv run ruff check src/ tests/ # lint
|
|
122
|
+
uv run mypy src/vserve/ # type check
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
---
|
|
126
|
+
|
|
127
|
+
## License
|
|
128
|
+
|
|
129
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
# Troubleshooting
|
|
2
|
+
|
|
3
|
+
Hard-won lessons from running vLLM on NVIDIA workstation GPUs.
|
|
4
|
+
|
|
5
|
+
## GPU Crashes (Xid Errors)
|
|
6
|
+
|
|
7
|
+
### Xid 8 — GPU Stopped Processing
|
|
8
|
+
|
|
9
|
+
**Symptom:** vLLM dies after hours of stable inference. Kernel log shows:
|
|
10
|
+
```
|
|
11
|
+
NVRM: krcWatchdog_IMPL: RC watchdog: GPU is probably locked!
|
|
12
|
+
NVRM: Xid (PCI:0000:02:00): 8, pid=..., name=VLLM::EngineCor
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
**Root cause:** The NVIDIA driver's RC (Robust Channel) watchdog detected a GPU hang. On Blackwell (SM120), this correlates with CUDA graphs under sustained FP8 load (see [vllm-project/vllm#35659](https://github.com/vllm-project/vllm/issues/35659)).
|
|
16
|
+
|
|
17
|
+
**Recovery:**
|
|
18
|
+
1. The GPU usually recovers automatically after the faulting process exits — no reset needed.
|
|
19
|
+
2. `nvidia-smi --gpu-reset` is deprecated as of driver 570+.
|
|
20
|
+
3. If the GPU is unresponsive, reload kernel modules: `sudo rmmod nvidia_uvm nvidia_drm nvidia_modeset nvidia && sudo modprobe nvidia`
|
|
21
|
+
4. Last resort: reboot.
|
|
22
|
+
|
|
23
|
+
**Prevention:**
|
|
24
|
+
- Use the production driver branch (595.x), not the new-feature branch (590.x).
|
|
25
|
+
- `vserve fan auto` keeps temps below throttle threshold.
|
|
26
|
+
- systemd `Restart=always` with exponential backoff recovers automatically.
|
|
27
|
+
- If crashes persist, try `--enforce-eager` (disables CUDA graphs, ~2.3x throughput cost).
|
|
28
|
+
|
|
29
|
+
### vLLM exits cleanly but systemd doesn't restart
|
|
30
|
+
|
|
31
|
+
**Cause:** vLLM's APIServer shuts down cleanly (exit 0) after an EngineCore crash. `Restart=on-failure` ignores exit 0.
|
|
32
|
+
|
|
33
|
+
**Fix:** Use `Restart=always` in the systemd unit.
|
|
34
|
+
|
|
35
|
+
## Fan Control
|
|
36
|
+
|
|
37
|
+
### Why override NVIDIA's auto curve?
|
|
38
|
+
|
|
39
|
+
The default auto curve on workstation GPUs (RTX PRO series) caps fan speed conservatively for noise. Under sustained 300W inference, this can allow temps to reach 85-90°C, causing:
|
|
40
|
+
- Thermal throttling (reduced inference throughput)
|
|
41
|
+
- Accelerated component aging (electromigration, capacitor degradation)
|
|
42
|
+
- Increased risk of Xid errors
|
|
43
|
+
|
|
44
|
+
### Coolbits setup
|
|
45
|
+
|
|
46
|
+
Fan control via `nvidia-settings` requires Coolbits enabled in X11 config:
|
|
47
|
+
|
|
48
|
+
```text
|
|
49
|
+
# /etc/X11/xorg.conf
|
|
50
|
+
Section "Device"
|
|
51
|
+
Identifier "GPU0"
|
|
52
|
+
Driver "nvidia"
|
|
53
|
+
BusID "PCI:2:0:0" # check with: nvidia-smi --query-gpu=gpu_bus_id --format=csv,noheader
|
|
54
|
+
Option "Coolbits" "4" # bit 2 = manual fan control
|
|
55
|
+
EndSection
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
On headless systems, `vserve fan` uses a temporary Xvfb virtual display — no persistent X server needed.
|
|
59
|
+
|
|
60
|
+
### Hardware thermal failsafe
|
|
61
|
+
|
|
62
|
+
Even if the fan daemon crashes and the fan is stuck at 30%, the GPU has hardware protection:
|
|
63
|
+
- GPU Boost reduces clocks starting at ~83°C
|
|
64
|
+
- Aggressive power limiting at ~90°C
|
|
65
|
+
- Hardware shutdown at ~100-105°C (cannot be overridden by software)
|
|
66
|
+
|
|
67
|
+
The `vserve fan` emergency override (100% fan at 88°C regardless of quiet hours) is software-level defense. The hardware failsafe is always present underneath.
|
|
68
|
+
|
|
69
|
+
## JIT Compilation
|
|
70
|
+
|
|
71
|
+
### FlashInfer JIT cache
|
|
72
|
+
|
|
73
|
+
vLLM 0.18+ uses FlashInfer with just-in-time compiled CUDA kernels. First run for each model/KV-dtype combination triggers JIT compilation that takes 2-5 minutes.
|
|
74
|
+
|
|
75
|
+
**Where the cache lives:** `$VLLM_ROOT/.cache/flashinfer/`
|
|
76
|
+
|
|
77
|
+
**Problem:** If vLLM runs as a different user (e.g., systemd `User=vllm`), the JIT cache must be writable by that user. First-time startup will be slow.
|
|
78
|
+
|
|
79
|
+
**The preheat solution:** `vserve tune` runs a preheat step that starts vLLM briefly as the service user to build the JIT cache before benchmarking. This uses `sudo -u <service_user>` with the correct `CUDA_HOME`, `PATH`, and `HF_HOME` environment.
|
|
80
|
+
|
|
81
|
+
### torch.compile cache
|
|
82
|
+
|
|
83
|
+
vLLM also uses `torch.compile` which has its own cache at `$VLLM_ROOT/.cache/vllm/torch_compile_cache/`. Same ownership considerations apply.
|
|
84
|
+
|
|
85
|
+
### CUDA_HOME and PATH
|
|
86
|
+
|
|
87
|
+
The JIT compiler needs:
|
|
88
|
+
- `nvcc` accessible (CUDA toolkit bin directory in PATH)
|
|
89
|
+
- `CUDA_HOME` set to the CUDA toolkit root
|
|
90
|
+
- Matching CUDA version between the toolkit and the PyTorch build
|
|
91
|
+
|
|
92
|
+
If JIT compilation fails with `nvcc not found` or architecture mismatches, check:
|
|
93
|
+
```bash
|
|
94
|
+
which nvcc
|
|
95
|
+
nvcc --version
|
|
96
|
+
python -c "import torch; print(torch.version.cuda)"
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### ProtectSystem=strict breaks JIT
|
|
100
|
+
|
|
101
|
+
Do NOT use `ProtectSystem=strict` in the vLLM systemd unit — it mounts the entire file system hierarchy read-only (everything except `/dev`, `/proc`, and `/sys`, not just `/usr`), preventing nvcc from writing temporary files during JIT compilation. The `gpu-fan.service` can use it (fan control doesn't need JIT), but the main `vllm.service` cannot.
|
|
102
|
+
|
|
103
|
+
## Driver Management
|
|
104
|
+
|
|
105
|
+
### Which driver branch to use
|
|
106
|
+
|
|
107
|
+
| Branch | Status | Use |
|
|
108
|
+
|--------|--------|-----|
|
|
109
|
+
| 570.x | Old production | Pre-Blackwell only |
|
|
110
|
+
| 575.x | Early Blackwell | Known issues, avoid |
|
|
111
|
+
| 590.x | New-feature branch | Experimental, not for production |
|
|
112
|
+
| 595.x | **Current production** | Recommended for Blackwell |
|
|
113
|
+
|
|
114
|
+
Check your branch: `nvidia-smi` shows the driver version in the header.
|
|
115
|
+
|
|
116
|
+
### Open vs. proprietary kernel modules
|
|
117
|
+
|
|
118
|
+
Blackwell GPUs (RTX 50-series, RTX PRO 5000/6000) **require open kernel modules**. The proprietary modules do not support Blackwell at all. Always use packages with `-open` suffix (e.g., `nvidia-driver-595-server-open` or `nvidia-open` from the CUDA repo).
|
|
119
|
+
|
|
120
|
+
### nvidia-settings version mismatch
|
|
121
|
+
|
|
122
|
+
If `nvidia-settings --version` shows a different version than `nvidia-smi`, you have a partial driver upgrade. This can cause subtle issues. Fix by upgrading the full driver stack to match.
|
|
123
|
+
|
|
124
|
+
## Headless Operation
|
|
125
|
+
|
|
126
|
+
### Disabling the display manager
|
|
127
|
+
|
|
128
|
+
For 24/7 inference, disable the display manager to free GPU memory and simplify driver reloads:
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
sudo systemctl disable gdm # or lightdm/sddm
|
|
132
|
+
sudo systemctl set-default multi-user.target
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
GUI is still available on demand: `sudo systemctl start gdm`
|
|
136
|
+
|
|
137
|
+
### GPU memory with no display
|
|
138
|
+
|
|
139
|
+
With no display manager: ~2 MiB GPU memory used (driver overhead only).
|
|
140
|
+
With Xorg/GDM running: ~15 MiB (Xorg frame buffer).
|
|
141
|
+
|
|
142
|
+
This is negligible for 48 GB, but matters for driver reload — fewer processes holding the GPU open means cleaner `rmmod`.
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "vserve"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "CLI for managing vLLM inference on GPU workstations"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = "MIT"
|
|
7
|
+
requires-python = ">=3.12"
|
|
8
|
+
authors = [{ name = "Mohan Qiao" }]
|
|
9
|
+
classifiers = [
|
|
10
|
+
"Development Status :: 4 - Beta",
|
|
11
|
+
"Environment :: Console",
|
|
12
|
+
"Environment :: GPU :: NVIDIA CUDA",
|
|
13
|
+
"Intended Audience :: Developers",
|
|
14
|
+
"Intended Audience :: Science/Research",
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Programming Language :: Python :: 3.12",
|
|
17
|
+
"Programming Language :: Python :: 3.13",
|
|
18
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
19
|
+
]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"typer>=0.15",
|
|
22
|
+
"rich>=14",
|
|
23
|
+
"huggingface-hub>=0.30",
|
|
24
|
+
"pyyaml>=6",
|
|
25
|
+
"nvidia-ml-py>=13.595.45",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[dependency-groups]
|
|
29
|
+
dev = ["pytest>=8", "pytest-mock>=3", "ruff>=0.15", "mypy>=1.15", "types-PyYAML>=6", "types-requests>=2.33"]
|
|
30
|
+
|
|
31
|
+
[project.urls]
|
|
32
|
+
Homepage = "https://github.com/Gavin-Qiao/vserve"
|
|
33
|
+
Repository = "https://github.com/Gavin-Qiao/vserve"
|
|
34
|
+
Issues = "https://github.com/Gavin-Qiao/vserve/issues"
|
|
35
|
+
|
|
36
|
+
[project.scripts]
|
|
37
|
+
vserve = "vserve.cli:app"
|
|
38
|
+
|
|
39
|
+
[build-system]
|
|
40
|
+
requires = ["hatchling"]
|
|
41
|
+
build-backend = "hatchling.build"
|
|
42
|
+
|
|
43
|
+
[tool.hatch.build.targets.wheel]
|
|
44
|
+
packages = ["src/vserve"]
|
|
45
|
+
|
|
46
|
+
[tool.pytest.ini_options]
|
|
47
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|