vllm-speculative-autoconfig 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vllm_speculative_autoconfig-0.1.1/LICENSE +22 -0
- vllm_speculative_autoconfig-0.1.1/MANIFEST.in +7 -0
- vllm_speculative_autoconfig-0.1.1/PKG-INFO +230 -0
- vllm_speculative_autoconfig-0.1.1/README.md +203 -0
- vllm_speculative_autoconfig-0.1.1/pyproject.toml +41 -0
- vllm_speculative_autoconfig-0.1.1/setup.cfg +4 -0
- vllm_speculative_autoconfig-0.1.1/src/vllm_autoconfig/__init__.py +1 -0
- vllm_speculative_autoconfig-0.1.1/src/vllm_autoconfig/cache.py +30 -0
- vllm_speculative_autoconfig-0.1.1/src/vllm_autoconfig/client.py +247 -0
- vllm_speculative_autoconfig-0.1.1/src/vllm_autoconfig/gpu_probe.py +41 -0
- vllm_speculative_autoconfig-0.1.1/src/vllm_autoconfig/kv_math.py +73 -0
- vllm_speculative_autoconfig-0.1.1/src/vllm_autoconfig/model_probe.py +93 -0
- vllm_speculative_autoconfig-0.1.1/src/vllm_autoconfig/planner.py +366 -0
- vllm_speculative_autoconfig-0.1.1/src/vllm_autoconfig/temp_client.py +185 -0
- vllm_speculative_autoconfig-0.1.1/src/vllm_speculative_autoconfig.egg-info/PKG-INFO +230 -0
- vllm_speculative_autoconfig-0.1.1/src/vllm_speculative_autoconfig.egg-info/SOURCES.txt +17 -0
- vllm_speculative_autoconfig-0.1.1/src/vllm_speculative_autoconfig.egg-info/dependency_links.txt +1 -0
- vllm_speculative_autoconfig-0.1.1/src/vllm_speculative_autoconfig.egg-info/requires.txt +6 -0
- vllm_speculative_autoconfig-0.1.1/src/vllm_speculative_autoconfig.egg-info/top_level.txt +1 -0

vllm_speculative_autoconfig-0.1.1/LICENSE
@@ -0,0 +1,22 @@
+MIT License
+
+Copyright (c) 2024 vllm-autoconfig contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+

vllm_speculative_autoconfig-0.1.1/PKG-INFO
@@ -0,0 +1,230 @@
+Metadata-Version: 2.4
+Name: vllm-speculative-autoconfig
+Version: 0.1.1
+Summary: Automatic configuration planner for vLLM with PyTorch-based GPU probing and intelligent memory management
+Author: vllm-speculative-autoconfig contributors
+License-Expression: MIT
+Keywords: vllm,llm,inference,optimization,gpu,deep-learning,transformer
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: torch>=2.0.0
+Requires-Dist: transformers>=4.40.0
+Requires-Dist: vllm>=0.6.0
+Requires-Dist: tqdm>=4.65.0
+Requires-Dist: huggingface-hub>=0.23.0
+Requires-Dist: platformdirs>=4.0.0
+Dynamic: license-file
+
+# vllm-autoconfig
+
+**Automatic configuration planner for vLLM** - Eliminate the guesswork of configuring vLLM by automatically determining optimal parameters based on your GPU hardware and model requirements.
+
+## 🚀 Features
+
+- **Zero-configuration vLLM setup**: Automatically calculates optimal `max_model_len`, `gpu_memory_utilization`, and other vLLM parameters
+- **Hardware-aware planning**: Probes GPU memory and capabilities using PyTorch to ensure configurations fit your hardware
+- **Model-specific optimizations**: Applies model-family-specific settings (Mistral, Llama, Qwen, etc.)
+- **KV cache sizing**: Intelligently calculates memory requirements for attention key-value caches
+- **Configuration caching**: Saves computed plans to avoid redundant calculations
+- **Performance modes**: Choose between `throughput` and `latency` optimization strategies
+- **FP8 KV cache support**: Automatically enables FP8 quantization for KV caches when beneficial
+- **Simple API**: Just specify your model name and desired context length - everything else is handled automatically
+
+## 📦 Installation
+
+```bash
+pip install vllm-autoconfig
+```
+
+**Requirements:**
+- Python >= 3.10
+- PyTorch with CUDA support
+- vLLM
+- Access to CUDA-capable GPU(s)
+
+## 🎯 Quick Start
+
+### Python API
+
+```python
+from vllm_autoconfig import AutoVLLMClient, SamplingConfig
+
+# Initialize with your model and desired context length
+client = AutoVLLMClient(
+    model_name="meta-llama/Llama-3.1-8B-Instruct",
+    context_len=1024,  # The ONLY parameter you need to set!
+)
+
+# Prepare your prompts
+prompts = [
+    {
+        "messages": [
+            {"role": "user", "content": "What is the capital of France?"}
+        ],
+        "metadata": {"id": 1},
+    }
+]
+
+# Run inference
+results = client.run_batch(
+    prompts,
+    SamplingConfig(max_tokens=100, temperature=0.7)
+)
+
+print(results)
+client.close()
+```
+
+### Advanced Usage
+
+```python
+from vllm_autoconfig import AutoVLLMClient, SamplingConfig
+
+# Fine-tune the configuration
+client = AutoVLLMClient(
+    model_name="mistralai/Mistral-7B-Instruct-v0.3",
+    context_len=2048,
+    perf_mode="latency",       # or "throughput" (default)
+    prefer_fp8_kv_cache=True,  # Enable FP8 KV cache if supported
+    trust_remote_code=False,   # Set to True for models requiring custom code
+    debug=True,                # Enable detailed logging
+)
+
+# Check the computed plan
+print(f"Plan cache key: {client.plan.cache_key}")
+print(f"vLLM kwargs: {client.plan.vllm_kwargs}")
+print(f"Notes: {client.plan.notes}")
+
+# Run inference with custom sampling
+sampling = SamplingConfig(
+    temperature=0.8,
+    top_p=0.95,
+    max_tokens=256,
+    stop=["###", "\n\n"]
+)
+
+results = client.run_batch(prompts, sampling)
+client.close()
+```
+
+## 🛠️ How It Works
+
+1. **GPU Probing**: Detects available GPU memory and capabilities (BF16 support, compute capability)
+2. **Model Analysis**: Downloads model configuration from HuggingFace Hub and analyzes architecture
+3. **Weight Calculation**: Computes actual model weight size from checkpoint files
+4. **Memory Planning**: Calculates KV cache memory requirements based on context length and batch size
+5. **Configuration Generation**: Produces optimal vLLM initialization parameters within hardware constraints
+6. **Caching**: Saves the computed plan for reuse with the same configuration
+
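
The six steps above can be sketched with public PyTorch APIs. The helper below is a minimal illustration of steps 1, 4, and 5 only; the function name, the 90% headroom cap, and the exact KV-cache formula are assumptions made for this sketch, not the package's actual implementation (which lives in `gpu_probe.py`, `kv_math.py`, and `planner.py`).

```python
import torch


def sketch_plan(
    weight_bytes: int,           # model weight size from checkpoint files (step 3)
    num_layers: int,
    num_kv_heads: int,
    head_dim: int,
    context_len: int,
    max_seqs: int = 1,
    kv_bytes_per_elem: int = 2,  # bf16/fp16 take 2 bytes per element, fp8 takes 1
    device_index: int = 0,
) -> dict:
    # Step 1: probe GPU memory and capabilities with PyTorch.
    free_bytes, total_bytes = torch.cuda.mem_get_info(device_index)
    bf16_ok = torch.cuda.is_bf16_supported()

    # Step 4: every token stores one K and one V vector per layer per KV head.
    kv_cache_bytes = (
        2 * num_layers * num_kv_heads * head_dim
        * kv_bytes_per_elem * context_len * max_seqs
    )

    # Step 5: keep the plan within hardware constraints, leaving ~10% headroom
    # for activations and allocator fragmentation (illustrative cap).
    needed = weight_bytes + kv_cache_bytes
    if needed > free_bytes:
        raise MemoryError("weights + KV cache would not fit on this GPU")
    gpu_memory_utilization = min(0.90, needed / total_bytes)

    return {
        "max_model_len": context_len,
        "gpu_memory_utilization": round(gpu_memory_utilization, 2),
        "dtype": "bfloat16" if bf16_ok else "float16",
    }
```

For a Llama-3.1-8B-like shape (32 layers, 8 KV heads, head dimension 128) at `context_len=1024`, the KV term works out to roughly 128 MiB per sequence in bf16, which is why short contexts leave most of the card free for weights and batching.
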
+## 📊 Configuration Parameters
+
+The `AutoVLLMClient` automatically configures:
+
+- `model`: Model name/path
+- `max_model_len`: Maximum sequence length
+- `gpu_memory_utilization`: GPU memory usage fraction
+- `dtype`: Weight precision (bfloat16 or float16)
+- `kv_cache_dtype`: KV cache precision (including FP8 when beneficial)
+- `enforce_eager`: Whether to use eager mode (affects compilation)
+- `trust_remote_code`: Whether to trust remote code execution
+- Model-specific parameters (e.g., `tokenizer_mode`, `load_format` for Mistral)
+
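
These keys correspond to keyword arguments of `vllm.LLM`, so a computed plan can be passed straight to the engine. The values below are purely illustrative (the planner derives the real ones from your GPU and model); only the parameter names come from the list above.

```python
from vllm import LLM

# Illustrative plan; AutoVLLMClient builds the real equivalent internally.
plan_kwargs = {
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "max_model_len": 1024,
    "gpu_memory_utilization": 0.90,
    "dtype": "bfloat16",
    "kv_cache_dtype": "auto",
    "enforce_eager": False,
    "trust_remote_code": False,
}

llm = LLM(**plan_kwargs)  # requires a CUDA-capable GPU
```
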
+## 🎛️ API Reference
+
+### `AutoVLLMClient`
+
+```python
+AutoVLLMClient(
+    model_name: str,                # HuggingFace model name or local path
+    context_len: int,               # Desired context length
+    device_index: int = 0,          # GPU device index
+    perf_mode: str = "throughput",  # "throughput" or "latency"
+    trust_remote_code: bool = False,
+    prefer_fp8_kv_cache: bool = False,
+    enforce_eager: bool = False,
+    local_files_only: bool = False,
+    cache_plan: bool = True,        # Cache computed plans
+    debug: bool = False,            # Enable debug logging
+    vllm_logging_level: str = None, # vLLM logging level
+)
+```
+
+### `SamplingConfig`
+
+```python
+SamplingConfig(
+    temperature: float = 0.0,  # Sampling temperature
+    top_p: float = 1.0,        # Nucleus sampling threshold
+    max_tokens: int = 32,      # Maximum tokens to generate
+    stop: List[str] = None,    # Stop sequences
+)
+```
+
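
These fields mirror the like-named arguments of vLLM's `SamplingParams`; a hand-rolled equivalent of the defaults above would presumably look like this:

```python
from vllm import SamplingParams

# Assumption: SamplingConfig's fields map one-to-one onto SamplingParams.
params = SamplingParams(temperature=0.0, top_p=1.0, max_tokens=32, stop=None)
```
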
+### Methods
+
+- `run_batch(prompts, sampling, output_field="output")`: Run inference on a batch of prompts
+- `close()`: Clean up resources and free GPU memory
+
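
The `output_field` argument suggests that `run_batch` hands back each input record with the generated text attached under that key. A hedged usage sketch, reusing the `prompts` list from the Quick Start (the return shape is an assumption, not documented above):

```python
rows = client.run_batch(prompts, SamplingConfig(max_tokens=64), output_field="answer")
for row in rows:
    # Assumes each result echoes the input record plus the completion.
    print(row.get("metadata"), row.get("answer"))
```
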
+## 🏗️ Project Structure
+
+```
+vllm-autoconfig/
+├── src/vllm_autoconfig/
+│   ├── __init__.py     # Package exports
+│   ├── client.py       # AutoVLLMClient implementation
+│   ├── planner.py      # Configuration planning logic
+│   ├── gpu_probe.py    # GPU detection and probing
+│   ├── model_probe.py  # Model analysis utilities
+│   ├── kv_math.py      # KV cache memory calculations
+│   └── cache.py        # Plan caching utilities
+├── examples/
+│   └── simple_run.py   # Usage examples
+└── pyproject.toml
+```
+
+## 🤝 Contributing
+
+Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
+
+## 📝 License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## 🙏 Acknowledgments
+
+- Built on top of [vLLM](https://github.com/vllm-project/vllm) - the high-performance LLM inference engine
+- Uses [HuggingFace Transformers](https://github.com/huggingface/transformers) for model configuration
+
+## 📚 Citation
+
+If you use vllm-autoconfig in your research or production systems, please cite:
+
+```bibtex
+@software{vllm_autoconfig,
+  title = {vllm-autoconfig: Automatic Configuration Planning for vLLM},
+  author = {Your Name},
+  year = {2024},
+  url = {https://github.com/yourusername/vllm-autoconfig}
+}
+```
+
+## 🐛 Issues and Support
+
+For issues, questions, or feature requests, please open an issue on [GitHub Issues](https://github.com/yourusername/vllm-autoconfig/issues).
+
+## 🔗 Links
+
+- [Documentation](https://github.com/yourusername/vllm-autoconfig)
+- [PyPI Package](https://pypi.org/project/vllm-autoconfig/)
+- [vLLM Documentation](https://docs.vllm.ai/)
+

vllm_speculative_autoconfig-0.1.1/README.md
@@ -0,0 +1,203 @@
+# vllm-autoconfig
+
+**Automatic configuration planner for vLLM** - Eliminate the guesswork of configuring vLLM by automatically determining optimal parameters based on your GPU hardware and model requirements.
+
+## 🚀 Features
+
+- **Zero-configuration vLLM setup**: Automatically calculates optimal `max_model_len`, `gpu_memory_utilization`, and other vLLM parameters
+- **Hardware-aware planning**: Probes GPU memory and capabilities using PyTorch to ensure configurations fit your hardware
+- **Model-specific optimizations**: Applies model-family-specific settings (Mistral, Llama, Qwen, etc.)
+- **KV cache sizing**: Intelligently calculates memory requirements for attention key-value caches
+- **Configuration caching**: Saves computed plans to avoid redundant calculations
+- **Performance modes**: Choose between `throughput` and `latency` optimization strategies
+- **FP8 KV cache support**: Automatically enables FP8 quantization for KV caches when beneficial
+- **Simple API**: Just specify your model name and desired context length - everything else is handled automatically
+
+## 📦 Installation
+
+```bash
+pip install vllm-autoconfig
+```
+
+**Requirements:**
+- Python >= 3.10
+- PyTorch with CUDA support
+- vLLM
+- Access to CUDA-capable GPU(s)
+
+## 🎯 Quick Start
+
+### Python API
+
+```python
+from vllm_autoconfig import AutoVLLMClient, SamplingConfig
+
+# Initialize with your model and desired context length
+client = AutoVLLMClient(
+    model_name="meta-llama/Llama-3.1-8B-Instruct",
+    context_len=1024,  # The ONLY parameter you need to set!
+)
+
+# Prepare your prompts
+prompts = [
+    {
+        "messages": [
+            {"role": "user", "content": "What is the capital of France?"}
+        ],
+        "metadata": {"id": 1},
+    }
+]
+
+# Run inference
+results = client.run_batch(
+    prompts,
+    SamplingConfig(max_tokens=100, temperature=0.7)
+)
+
+print(results)
+client.close()
+```
+
+### Advanced Usage
+
+```python
+from vllm_autoconfig import AutoVLLMClient, SamplingConfig
+
+# Fine-tune the configuration
+client = AutoVLLMClient(
+    model_name="mistralai/Mistral-7B-Instruct-v0.3",
+    context_len=2048,
+    perf_mode="latency",       # or "throughput" (default)
+    prefer_fp8_kv_cache=True,  # Enable FP8 KV cache if supported
+    trust_remote_code=False,   # Set to True for models requiring custom code
+    debug=True,                # Enable detailed logging
+)
+
+# Check the computed plan
+print(f"Plan cache key: {client.plan.cache_key}")
+print(f"vLLM kwargs: {client.plan.vllm_kwargs}")
+print(f"Notes: {client.plan.notes}")
+
+# Run inference with custom sampling
+sampling = SamplingConfig(
+    temperature=0.8,
+    top_p=0.95,
+    max_tokens=256,
+    stop=["###", "\n\n"]
+)
+
+results = client.run_batch(prompts, sampling)
+client.close()
+```
+
+## 🛠️ How It Works
+
+1. **GPU Probing**: Detects available GPU memory and capabilities (BF16 support, compute capability)
+2. **Model Analysis**: Downloads model configuration from HuggingFace Hub and analyzes architecture
+3. **Weight Calculation**: Computes actual model weight size from checkpoint files
+4. **Memory Planning**: Calculates KV cache memory requirements based on context length and batch size
+5. **Configuration Generation**: Produces optimal vLLM initialization parameters within hardware constraints
+6. **Caching**: Saves the computed plan for reuse with the same configuration
+
+## 📊 Configuration Parameters
+
+The `AutoVLLMClient` automatically configures:
+
+- `model`: Model name/path
+- `max_model_len`: Maximum sequence length
+- `gpu_memory_utilization`: GPU memory usage fraction
+- `dtype`: Weight precision (bfloat16 or float16)
+- `kv_cache_dtype`: KV cache precision (including FP8 when beneficial)
+- `enforce_eager`: Whether to use eager mode (affects compilation)
+- `trust_remote_code`: Whether to trust remote code execution
+- Model-specific parameters (e.g., `tokenizer_mode`, `load_format` for Mistral)
+
+## 🎛️ API Reference
+
+### `AutoVLLMClient`
+
+```python
+AutoVLLMClient(
+    model_name: str,                # HuggingFace model name or local path
+    context_len: int,               # Desired context length
+    device_index: int = 0,          # GPU device index
+    perf_mode: str = "throughput",  # "throughput" or "latency"
+    trust_remote_code: bool = False,
+    prefer_fp8_kv_cache: bool = False,
+    enforce_eager: bool = False,
+    local_files_only: bool = False,
+    cache_plan: bool = True,        # Cache computed plans
+    debug: bool = False,            # Enable debug logging
+    vllm_logging_level: str = None, # vLLM logging level
+)
+```
+
+### `SamplingConfig`
+
+```python
+SamplingConfig(
+    temperature: float = 0.0,  # Sampling temperature
+    top_p: float = 1.0,        # Nucleus sampling threshold
+    max_tokens: int = 32,      # Maximum tokens to generate
+    stop: List[str] = None,    # Stop sequences
+)
+```
+
+### Methods
+
+- `run_batch(prompts, sampling, output_field="output")`: Run inference on a batch of prompts
+- `close()`: Clean up resources and free GPU memory
+
+## 🏗️ Project Structure
+
+```
+vllm-autoconfig/
+├── src/vllm_autoconfig/
+│   ├── __init__.py     # Package exports
+│   ├── client.py       # AutoVLLMClient implementation
+│   ├── planner.py      # Configuration planning logic
+│   ├── gpu_probe.py    # GPU detection and probing
+│   ├── model_probe.py  # Model analysis utilities
+│   ├── kv_math.py      # KV cache memory calculations
+│   └── cache.py        # Plan caching utilities
+├── examples/
+│   └── simple_run.py   # Usage examples
+└── pyproject.toml
+```
+
+## 🤝 Contributing
+
+Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
+
+## 📝 License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## 🙏 Acknowledgments
+
+- Built on top of [vLLM](https://github.com/vllm-project/vllm) - the high-performance LLM inference engine
+- Uses [HuggingFace Transformers](https://github.com/huggingface/transformers) for model configuration
+
+## 📚 Citation
+
+If you use vllm-autoconfig in your research or production systems, please cite:
+
+```bibtex
+@software{vllm_autoconfig,
+  title = {vllm-autoconfig: Automatic Configuration Planning for vLLM},
+  author = {Your Name},
+  year = {2024},
+  url = {https://github.com/yourusername/vllm-autoconfig}
+}
+```
+
+## 🐛 Issues and Support
+
+For issues, questions, or feature requests, please open an issue on [GitHub Issues](https://github.com/yourusername/vllm-autoconfig/issues).
+
+## 🔗 Links
+
+- [Documentation](https://github.com/yourusername/vllm-autoconfig)
+- [PyPI Package](https://pypi.org/project/vllm-autoconfig/)
+- [vLLM Documentation](https://docs.vllm.ai/)
+

vllm_speculative_autoconfig-0.1.1/pyproject.toml
@@ -0,0 +1,41 @@
+[project]
+name = "vllm-speculative-autoconfig"
+version = "0.1.1"
+description = "Automatic configuration planner for vLLM with PyTorch-based GPU probing and intelligent memory management"
+readme = "README.md"
+requires-python = ">=3.10"
+license = "MIT"
+authors = [
+    {name = "vllm-speculative-autoconfig contributors"}
+]
+keywords = ["vllm", "llm", "inference", "optimization", "gpu", "deep-learning", "transformer"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+dependencies = [
+    "torch>=2.0.0",
+    "transformers>=4.40.0",
+    "vllm>=0.6.0",
+    "tqdm>=4.65.0",
+    "huggingface-hub>=0.23.0",
+    "platformdirs>=4.0.0",
+]
+
+[build-system]
+requires = ["setuptools>=68"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools]
+package-dir = {"" = "src"}
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
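
Note the naming split this build configuration creates: the distribution is published as `vllm-speculative-autoconfig`, while the importable package under `src/` is `vllm_autoconfig`. A quick way to see both names after installation:

```python
import importlib.metadata

import vllm_autoconfig  # import name, from src/vllm_autoconfig

# Distribution name, as registered on the package index.
print(importlib.metadata.version("vllm-speculative-autoconfig"))  # -> 0.1.1
```
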

vllm_speculative_autoconfig-0.1.1/src/vllm_autoconfig/__init__.py
@@ -0,0 +1 @@
+from .client import AutoVLLMClient, SamplingConfig

vllm_speculative_autoconfig-0.1.1/src/vllm_autoconfig/cache.py
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+import hashlib
+import json
+from dataclasses import asdict
+from pathlib import Path
+from typing import Any
+
+
+def _cache_dir() -> Path:
+    return Path.home() / ".cache" / "vllm_autoconfig"
+
+
+def make_cache_key(payload: dict[str, Any]) -> str:
+    blob = json.dumps(payload, sort_keys=True, separators=(",", ":")).encode("utf-8")
+    return hashlib.sha256(blob).hexdigest()[:24]
+
+
+def load_cached_plan(key: str) -> dict[str, Any] | None:
+    path = _cache_dir() / "plans" / f"{key}.json"
+    if not path.exists():
+        return None
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def save_cached_plan(key: str, plan: dict[str, Any]) -> None:
+    plans_dir = _cache_dir() / "plans"
+    plans_dir.mkdir(parents=True, exist_ok=True)
+    path = plans_dir / f"{key}.json"
+    path.write_text(json.dumps(plan, indent=2, sort_keys=True), encoding="utf-8")
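
To round out the picture, here is a hedged sketch of how these cache helpers fit together. The payload fields are assumptions about what the planner hashes (the real key may include GPU and dtype details), and the placeholder plan is not a real output:

```python
from vllm_autoconfig.cache import load_cached_plan, make_cache_key, save_cached_plan

key = make_cache_key({
    "model_name": "meta-llama/Llama-3.1-8B-Instruct",
    "context_len": 1024,
    "perf_mode": "throughput",
})

plan = load_cached_plan(key)
if plan is None:
    plan = {"max_model_len": 1024, "gpu_memory_utilization": 0.90}  # placeholder
    save_cached_plan(key, plan)  # -> ~/.cache/vllm_autoconfig/plans/<key>.json
```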