vllm-speculative-autoconfig 0.1.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
+ MIT License
+
+ Copyright (c) 2024 vllm-autoconfig contributors
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
@@ -0,0 +1,7 @@
+ include README.md
+ include LICENSE
+ include pyproject.toml
+ recursive-include src *.py
+ recursive-exclude * __pycache__
+ recursive-exclude * *.py[co]
+
@@ -0,0 +1,230 @@
+ Metadata-Version: 2.4
+ Name: vllm-speculative-autoconfig
+ Version: 0.1.1
+ Summary: Automatic configuration planner for vLLM with PyTorch-based GPU probing and intelligent memory management
+ Author: vllm-speculative-autoconfig contributors
+ License-Expression: MIT
+ Keywords: vllm,llm,inference,optimization,gpu,deep-learning,transformer
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: torch>=2.0.0
+ Requires-Dist: transformers>=4.40.0
+ Requires-Dist: vllm>=0.6.0
+ Requires-Dist: tqdm>=4.65.0
+ Requires-Dist: huggingface-hub>=0.23.0
+ Requires-Dist: platformdirs>=4.0.0
+ Dynamic: license-file
+
+ # vllm-autoconfig
+
+ **Automatic configuration planner for vLLM** - Eliminate the guesswork of configuring vLLM by automatically determining optimal parameters based on your GPU hardware and model requirements.
+
+ ## 🚀 Features
+
+ - **Zero-configuration vLLM setup**: Automatically calculates optimal `max_model_len`, `gpu_memory_utilization`, and other vLLM parameters
+ - **Hardware-aware planning**: Probes GPU memory and capabilities using PyTorch to ensure configurations fit your hardware
+ - **Model-specific optimizations**: Applies model-family-specific settings (Mistral, Llama, Qwen, etc.)
+ - **KV cache sizing**: Intelligently calculates memory requirements for attention key-value caches
+ - **Configuration caching**: Saves computed plans to avoid redundant calculations
+ - **Performance modes**: Choose between `throughput` and `latency` optimization strategies
+ - **FP8 KV cache support**: Automatically enables FP8 quantization for KV caches when beneficial
+ - **Simple API**: Just specify your model name and desired context length - everything else is handled automatically
+
+ ## 📦 Installation
+
+ ```bash
+ pip install vllm-speculative-autoconfig
+ ```
+
+ **Requirements:**
+ - Python >= 3.10
+ - PyTorch with CUDA support
+ - vLLM
+ - Access to CUDA-capable GPU(s)
+
+ ## 🎯 Quick Start
+
+ ### Python API
+
+ ```python
+ from vllm_autoconfig import AutoVLLMClient, SamplingConfig
+
+ # Initialize with your model and desired context length
+ client = AutoVLLMClient(
+     model_name="meta-llama/Llama-3.1-8B-Instruct",
+     context_len=1024,  # The ONLY parameter you need to set!
+ )
+
+ # Prepare your prompts
+ prompts = [
+     {
+         "messages": [
+             {"role": "user", "content": "What is the capital of France?"}
+         ],
+         "metadata": {"id": 1},
+     }
+ ]
+
+ # Run inference
+ results = client.run_batch(
+     prompts,
+     SamplingConfig(max_tokens=100, temperature=0.7)
+ )
+
+ print(results)
+ client.close()
+ ```
+
+ ### Advanced Usage
+
+ ```python
+ from vllm_autoconfig import AutoVLLMClient, SamplingConfig
+
+ # Fine-tune the configuration
+ client = AutoVLLMClient(
+     model_name="mistralai/Mistral-7B-Instruct-v0.3",
+     context_len=2048,
+     perf_mode="latency",  # or "throughput" (default)
+     prefer_fp8_kv_cache=True,  # Enable FP8 KV cache if supported
+     trust_remote_code=False,  # set True only for models that require custom code
+     debug=True,  # Enable detailed logging
+ )
+
+ # Check the computed plan
+ print(f"Plan cache key: {client.plan.cache_key}")
+ print(f"vLLM kwargs: {client.plan.vllm_kwargs}")
+ print(f"Notes: {client.plan.notes}")
+
+ # Run inference with custom sampling
+ sampling = SamplingConfig(
+     temperature=0.8,
+     top_p=0.95,
+     max_tokens=256,
+     stop=["###", "\n\n"]
+ )
+
+ results = client.run_batch(prompts, sampling)
+ client.close()
+ ```
+
+ ## 🛠️ How It Works
+
+ 1. **GPU Probing**: Detects available GPU memory and capabilities (BF16 support, compute capability)
+ 2. **Model Analysis**: Downloads model configuration from HuggingFace Hub and analyzes architecture
+ 3. **Weight Calculation**: Computes actual model weight size from checkpoint files
+ 4. **Memory Planning**: Calculates KV cache memory requirements based on context length and batch size
+ 5. **Configuration Generation**: Produces optimal vLLM initialization parameters within hardware constraints
+ 6. **Caching**: Saves the computed plan for reuse with the same configuration
+
+ ## 📊 Configuration Parameters
+
+ The `AutoVLLMClient` automatically configures:
+
+ - `model`: Model name/path
+ - `max_model_len`: Maximum sequence length
+ - `gpu_memory_utilization`: GPU memory usage fraction
+ - `dtype`: Weight precision (bfloat16 or float16)
+ - `kv_cache_dtype`: KV cache precision (including FP8 when beneficial)
+ - `enforce_eager`: Whether to use eager mode (affects compilation)
+ - `trust_remote_code`: Whether to trust remote code execution
+ - Model-specific parameters (e.g., `tokenizer_mode`, `load_format` for Mistral)
+
+ ## 🎛️ API Reference
+
+ ### `AutoVLLMClient`
+
+ ```python
+ AutoVLLMClient(
+     model_name: str,  # HuggingFace model name or local path
+     context_len: int,  # Desired context length
+     device_index: int = 0,  # GPU device index
+     perf_mode: str = "throughput",  # "throughput" or "latency"
+     trust_remote_code: bool = False,
+     prefer_fp8_kv_cache: bool = False,
+     enforce_eager: bool = False,
+     local_files_only: bool = False,
+     cache_plan: bool = True,  # Cache computed plans
+     debug: bool = False,  # Enable debug logging
+     vllm_logging_level: str | None = None,  # vLLM logging level
+ )
+ ```
+
+ ### `SamplingConfig`
+
+ ```python
+ SamplingConfig(
+     temperature: float = 0.0,  # Sampling temperature
+     top_p: float = 1.0,  # Nucleus sampling threshold
+     max_tokens: int = 32,  # Maximum tokens to generate
+     stop: List[str] | None = None,  # Stop sequences
+ )
+ ```
+
+ ### Methods
+
+ - `run_batch(prompts, sampling, output_field="output")`: Run inference on a batch of prompts
+ - `close()`: Clean up resources and free GPU memory
+
+ ## 🏗️ Project Structure
+
+ ```
+ vllm-autoconfig/
+ ├── src/vllm_autoconfig/
+ │   ├── __init__.py      # Package exports
+ │   ├── client.py        # AutoVLLMClient implementation
+ │   ├── planner.py       # Configuration planning logic
+ │   ├── gpu_probe.py     # GPU detection and probing
+ │   ├── model_probe.py   # Model analysis utilities
+ │   ├── kv_math.py       # KV cache memory calculations
+ │   └── cache.py         # Plan caching utilities
+ ├── examples/
+ │   └── simple_run.py    # Usage examples
+ └── pyproject.toml
+ ```
+
+ ## 🤝 Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
+
+ ## 📝 License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+ ## 🙏 Acknowledgments
+
+ - Built on top of [vLLM](https://github.com/vllm-project/vllm) - the high-performance LLM inference engine
+ - Uses [HuggingFace Transformers](https://github.com/huggingface/transformers) for model configuration
+
+ ## 📚 Citation
+
+ If you use vllm-autoconfig in your research or production systems, please cite:
+
+ ```bibtex
+ @software{vllm_autoconfig,
+   title = {vllm-autoconfig: Automatic Configuration Planning for vLLM},
+   author = {Your Name},
+   year = {2024},
+   url = {https://github.com/yourusername/vllm-autoconfig}
+ }
+ ```
+
+ ## 🐛 Issues and Support
+
+ For issues, questions, or feature requests, please open an issue on [GitHub Issues](https://github.com/yourusername/vllm-autoconfig/issues).
+
+ ## 🔗 Links
+
+ - [Documentation](https://github.com/yourusername/vllm-autoconfig)
+ - [PyPI Package](https://pypi.org/project/vllm-speculative-autoconfig/)
+ - [vLLM Documentation](https://docs.vllm.ai/)
+
@@ -0,0 +1,203 @@
+ # vllm-autoconfig
+
+ **Automatic configuration planner for vLLM** - Eliminate the guesswork of configuring vLLM by automatically determining optimal parameters based on your GPU hardware and model requirements.
+
+ ## 🚀 Features
+
+ - **Zero-configuration vLLM setup**: Automatically calculates optimal `max_model_len`, `gpu_memory_utilization`, and other vLLM parameters
+ - **Hardware-aware planning**: Probes GPU memory and capabilities using PyTorch to ensure configurations fit your hardware
+ - **Model-specific optimizations**: Applies model-family-specific settings (Mistral, Llama, Qwen, etc.)
+ - **KV cache sizing**: Intelligently calculates memory requirements for attention key-value caches
+ - **Configuration caching**: Saves computed plans to avoid redundant calculations
+ - **Performance modes**: Choose between `throughput` and `latency` optimization strategies
+ - **FP8 KV cache support**: Automatically enables FP8 quantization for KV caches when beneficial
+ - **Simple API**: Just specify your model name and desired context length - everything else is handled automatically
+
+ ## 📦 Installation
+
+ ```bash
+ pip install vllm-speculative-autoconfig
+ ```
+
+ **Requirements:**
+ - Python >= 3.10
+ - PyTorch with CUDA support
+ - vLLM
+ - Access to CUDA-capable GPU(s)
+
+ ## 🎯 Quick Start
+
+ ### Python API
+
+ ```python
+ from vllm_autoconfig import AutoVLLMClient, SamplingConfig
+
+ # Initialize with your model and desired context length
+ client = AutoVLLMClient(
+     model_name="meta-llama/Llama-3.1-8B-Instruct",
+     context_len=1024,  # The ONLY parameter you need to set!
+ )
+
+ # Prepare your prompts
+ prompts = [
+     {
+         "messages": [
+             {"role": "user", "content": "What is the capital of France?"}
+         ],
+         "metadata": {"id": 1},
+     }
+ ]
+
+ # Run inference
+ results = client.run_batch(
+     prompts,
+     SamplingConfig(max_tokens=100, temperature=0.7)
+ )
+
+ print(results)
+ client.close()
+ ```
+
+ ### Advanced Usage
+
+ ```python
+ from vllm_autoconfig import AutoVLLMClient, SamplingConfig
+
+ # Fine-tune the configuration
+ client = AutoVLLMClient(
+     model_name="mistralai/Mistral-7B-Instruct-v0.3",
+     context_len=2048,
+     perf_mode="latency",  # or "throughput" (default)
+     prefer_fp8_kv_cache=True,  # Enable FP8 KV cache if supported
+     trust_remote_code=False,  # set True only for models that require custom code
+     debug=True,  # Enable detailed logging
+ )
+
+ # Check the computed plan
+ print(f"Plan cache key: {client.plan.cache_key}")
+ print(f"vLLM kwargs: {client.plan.vllm_kwargs}")
+ print(f"Notes: {client.plan.notes}")
+
+ # Run inference with custom sampling
+ sampling = SamplingConfig(
+     temperature=0.8,
+     top_p=0.95,
+     max_tokens=256,
+     stop=["###", "\n\n"]
+ )
+
+ results = client.run_batch(prompts, sampling)
+ client.close()
+ ```
+
+ ## 🛠️ How It Works
+
+ 1. **GPU Probing**: Detects available GPU memory and capabilities (BF16 support, compute capability)
+ 2. **Model Analysis**: Downloads model configuration from HuggingFace Hub and analyzes architecture
+ 3. **Weight Calculation**: Computes actual model weight size from checkpoint files
+ 4. **Memory Planning**: Calculates KV cache memory requirements based on context length and batch size (see the sketch after this list)
+ 5. **Configuration Generation**: Produces optimal vLLM initialization parameters within hardware constraints
+ 6. **Caching**: Saves the computed plan for reuse with the same configuration
+
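+ As a rough illustration of steps 1 and 4, the hedged sketch below shows how a planner could probe a CUDA device and estimate KV-cache memory. It is not the package's `gpu_probe.py` or `kv_math.py` code: `probe_gpu` and `kv_cache_bytes` are illustrative names, while the PyTorch calls (`torch.cuda.mem_get_info`, `torch.cuda.get_device_capability`, `torch.cuda.is_bf16_supported`) are real APIs.
+
+ ```python
+ import torch
+
+ def probe_gpu(device_index: int = 0) -> dict:
+     """Free/total memory and capability for one CUDA device (step 1)."""
+     free_b, total_b = torch.cuda.mem_get_info(device_index)
+     major, minor = torch.cuda.get_device_capability(device_index)
+     return {
+         "free_gib": free_b / 2**30,
+         "total_gib": total_b / 2**30,
+         "compute_capability": (major, minor),
+         "bf16_supported": torch.cuda.is_bf16_supported(),
+     }
+
+ def kv_cache_bytes(num_layers: int, num_kv_heads: int, head_dim: int,
+                    context_len: int, batch_size: int = 1,
+                    bytes_per_value: int = 2) -> int:
+     """K and V per layer: 2 * layers * kv_heads * head_dim * tokens * dtype bytes (step 4)."""
+     return 2 * num_layers * num_kv_heads * head_dim * context_len * batch_size * bytes_per_value
+
+ # Example: Llama-3.1-8B-style geometry (32 layers, 8 KV heads, head_dim 128)
+ # at 1024 tokens in FP16: 2 * 32 * 8 * 128 * 1024 * 2 bytes = 128 MiB per sequence.
+ print(probe_gpu())
+ print(kv_cache_bytes(32, 8, 128, 1024) / 2**30, "GiB")
+ ```
+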
+ ## 📊 Configuration Parameters
+
+ The `AutoVLLMClient` automatically configures the following (an illustrative plan follows the list):
+
+ - `model`: Model name/path
+ - `max_model_len`: Maximum sequence length
+ - `gpu_memory_utilization`: GPU memory usage fraction
+ - `dtype`: Weight precision (bfloat16 or float16)
+ - `kv_cache_dtype`: KV cache precision (including FP8 when beneficial)
+ - `enforce_eager`: Whether to use eager mode (affects compilation)
+ - `trust_remote_code`: Whether to trust remote code execution
+ - Model-specific parameters (e.g., `tokenizer_mode`, `load_format` for Mistral)
+
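+ For orientation only, a computed plan's `vllm_kwargs` might resemble the example below. This is a hypothetical illustration, not captured library output; every value is invented and will differ per GPU and model.
+
+ ```python
+ # Hypothetical plan output; real values come from the GPU probe and the model config.
+ example_vllm_kwargs = {
+     "model": "meta-llama/Llama-3.1-8B-Instruct",
+     "max_model_len": 1024,
+     "gpu_memory_utilization": 0.90,
+     "dtype": "bfloat16",
+     "kv_cache_dtype": "fp8",  # only when prefer_fp8_kv_cache=True and the GPU supports it
+     "enforce_eager": False,
+     "trust_remote_code": False,
+ }
+ ```
+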
+ ## 🎛️ API Reference
+
+ ### `AutoVLLMClient`
+
+ ```python
+ AutoVLLMClient(
+     model_name: str,  # HuggingFace model name or local path
+     context_len: int,  # Desired context length
+     device_index: int = 0,  # GPU device index
+     perf_mode: str = "throughput",  # "throughput" or "latency"
+     trust_remote_code: bool = False,
+     prefer_fp8_kv_cache: bool = False,
+     enforce_eager: bool = False,
+     local_files_only: bool = False,
+     cache_plan: bool = True,  # Cache computed plans
+     debug: bool = False,  # Enable debug logging
+     vllm_logging_level: str | None = None,  # vLLM logging level
+ )
+ ```
+
+ ### `SamplingConfig`
+
+ ```python
+ SamplingConfig(
+     temperature: float = 0.0,  # Sampling temperature
+     top_p: float = 1.0,  # Nucleus sampling threshold
+     max_tokens: int = 32,  # Maximum tokens to generate
+     stop: List[str] | None = None,  # Stop sequences
+ )
+ ```
+
+ ### Methods
+
+ - `run_batch(prompts, sampling, output_field="output")`: Run inference on a batch of prompts (see the sketch below)
+ - `close()`: Clean up resources and free GPU memory
+
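+ A hedged sketch of `output_field`: based on the parameter name, it is assumed here that `run_batch` returns the input records with the generated text attached under that key; the key name `"answer"` is arbitrary.
+
+ ```python
+ # Assumption: each result mirrors its input dict plus the generated text under `output_field`.
+ results = client.run_batch(prompts, SamplingConfig(max_tokens=64), output_field="answer")
+ for item in results:
+     print(item["metadata"]["id"], item["answer"])
+ ```
+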
+ ## 🏗️ Project Structure
+
+ ```
+ vllm-autoconfig/
+ ├── src/vllm_autoconfig/
+ │   ├── __init__.py      # Package exports
+ │   ├── client.py        # AutoVLLMClient implementation
+ │   ├── planner.py       # Configuration planning logic
+ │   ├── gpu_probe.py     # GPU detection and probing
+ │   ├── model_probe.py   # Model analysis utilities
+ │   ├── kv_math.py       # KV cache memory calculations
+ │   └── cache.py         # Plan caching utilities
+ ├── examples/
+ │   └── simple_run.py    # Usage examples
+ └── pyproject.toml
+ ```
+
+ ## 🤝 Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
+
+ ## 📝 License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+ ## 🙏 Acknowledgments
+
+ - Built on top of [vLLM](https://github.com/vllm-project/vllm) - the high-performance LLM inference engine
+ - Uses [HuggingFace Transformers](https://github.com/huggingface/transformers) for model configuration
+
+ ## 📚 Citation
+
+ If you use vllm-autoconfig in your research or production systems, please cite:
+
+ ```bibtex
+ @software{vllm_autoconfig,
+   title = {vllm-autoconfig: Automatic Configuration Planning for vLLM},
+   author = {Your Name},
+   year = {2024},
+   url = {https://github.com/yourusername/vllm-autoconfig}
+ }
+ ```
+
+ ## 🐛 Issues and Support
+
+ For issues, questions, or feature requests, please open an issue on [GitHub Issues](https://github.com/yourusername/vllm-autoconfig/issues).
+
+ ## 🔗 Links
+
+ - [Documentation](https://github.com/yourusername/vllm-autoconfig)
+ - [PyPI Package](https://pypi.org/project/vllm-speculative-autoconfig/)
+ - [vLLM Documentation](https://docs.vllm.ai/)
+
@@ -0,0 +1,41 @@
+ [project]
+ name = "vllm-speculative-autoconfig"
+ version = "0.1.1"
+ description = "Automatic configuration planner for vLLM with PyTorch-based GPU probing and intelligent memory management"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ license = "MIT"
+ authors = [
+     {name = "vllm-speculative-autoconfig contributors"}
+ ]
+ keywords = ["vllm", "llm", "inference", "optimization", "gpu", "deep-learning", "transformer"]
+ classifiers = [
+     "Development Status :: 3 - Alpha",
+     "Intended Audience :: Developers",
+     "Intended Audience :: Science/Research",
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Python :: 3.10",
+     "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+     "Topic :: Software Development :: Libraries :: Python Modules",
+ ]
+ dependencies = [
+     "torch>=2.0.0",
+     "transformers>=4.40.0",
+     "vllm>=0.6.0",
+     "tqdm>=4.65.0",
+     "huggingface-hub>=0.23.0",
+     "platformdirs>=4.0.0",
+ ]
+
+ [build-system]
+ requires = ["setuptools>=68"]
+ build-backend = "setuptools.build_meta"
+
+ [tool.setuptools]
+ package-dir = {"" = "src"}
+
+ [tool.setuptools.packages.find]
+ where = ["src"]
+
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1 @@
+ from .client import AutoVLLMClient, SamplingConfig
@@ -0,0 +1,30 @@
+ from __future__ import annotations
+
+ import hashlib
+ import json
+ from dataclasses import asdict
+ from pathlib import Path
+ from typing import Any
+
+
+ def _cache_dir() -> Path:
+     return Path.home() / ".cache" / "vllm_autoconfig"
+
+
+ def make_cache_key(payload: dict[str, Any]) -> str:
+     blob = json.dumps(payload, sort_keys=True, separators=(",", ":")).encode("utf-8")
+     return hashlib.sha256(blob).hexdigest()[:24]
+
+
+ def load_cached_plan(key: str) -> dict[str, Any] | None:
+     path = _cache_dir() / "plans" / f"{key}.json"
+     if not path.exists():
+         return None
+     return json.loads(path.read_text(encoding="utf-8"))
+
+
+ def save_cached_plan(key: str, plan: dict[str, Any]) -> None:
+     plans_dir = _cache_dir() / "plans"
+     plans_dir.mkdir(parents=True, exist_ok=True)
+     path = plans_dir / f"{key}.json"
+     path.write_text(json.dumps(plan, indent=2, sort_keys=True), encoding="utf-8")
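+
+
+ # --- Hedged usage sketch (editorial illustration, not part of the released file) ---
+ # Round-trips a plan through the helpers defined above; the payload keys and the
+ # placeholder plan are arbitrary.
+ if __name__ == "__main__":
+     payload = {"model": "meta-llama/Llama-3.1-8B-Instruct", "context_len": 1024}
+     key = make_cache_key(payload)        # 24-hex-char SHA-256 prefix, independent of key order
+     plan = load_cached_plan(key)         # None until a plan has been saved for this key
+     if plan is None:
+         plan = {"max_model_len": 1024}   # placeholder plan, for illustration only
+         save_cached_plan(key, plan)      # written to ~/.cache/vllm_autoconfig/plans/<key>.json
+     print(key, plan)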