vllm-mlx 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vllm_mlx-0.2.0/PKG-INFO +291 -0
- vllm_mlx-0.2.0/README.md +219 -0
- vllm_mlx-0.2.0/pyproject.toml +131 -0
- vllm_mlx-0.2.0/setup.cfg +4 -0
- vllm_mlx-0.2.0/tests/test_audio.py +283 -0
- vllm_mlx-0.2.0/tests/test_batching.py +477 -0
- vllm_mlx-0.2.0/tests/test_batching_deterministic.py +437 -0
- vllm_mlx-0.2.0/tests/test_continuous_batching.py +282 -0
- vllm_mlx-0.2.0/tests/test_llm.py +116 -0
- vllm_mlx-0.2.0/tests/test_mllm.py +398 -0
- vllm_mlx-0.2.0/tests/test_model_registry.py +259 -0
- vllm_mlx-0.2.0/tests/test_optimizations.py +110 -0
- vllm_mlx-0.2.0/tests/test_paged_cache.py +726 -0
- vllm_mlx-0.2.0/tests/test_paged_cache_benefits.py +441 -0
- vllm_mlx-0.2.0/tests/test_paged_cache_real_inference.py +272 -0
- vllm_mlx-0.2.0/tests/test_paged_cache_real_model.py +547 -0
- vllm_mlx-0.2.0/tests/test_platform.py +98 -0
- vllm_mlx-0.2.0/tests/test_prefix_cache.py +513 -0
- vllm_mlx-0.2.0/tests/test_server.py +338 -0
- vllm_mlx-0.2.0/tests/test_streaming_latency.py +321 -0
- vllm_mlx-0.2.0/tests/test_structured_output.py +393 -0
- vllm_mlx-0.2.0/tests/test_vlm_cache.py +906 -0
- vllm_mlx-0.2.0/vllm_mlx/__init__.py +80 -0
- vllm_mlx-0.2.0/vllm_mlx/api/__init__.py +119 -0
- vllm_mlx-0.2.0/vllm_mlx/api/models.py +341 -0
- vllm_mlx-0.2.0/vllm_mlx/api/tool_calling.py +392 -0
- vllm_mlx-0.2.0/vllm_mlx/api/utils.py +195 -0
- vllm_mlx-0.2.0/vllm_mlx/attention.py +243 -0
- vllm_mlx-0.2.0/vllm_mlx/audio/__init__.py +25 -0
- vllm_mlx-0.2.0/vllm_mlx/audio/processor.py +212 -0
- vllm_mlx-0.2.0/vllm_mlx/audio/stt.py +158 -0
- vllm_mlx-0.2.0/vllm_mlx/audio/tts.py +298 -0
- vllm_mlx-0.2.0/vllm_mlx/benchmark.py +1569 -0
- vllm_mlx-0.2.0/vllm_mlx/cli.py +323 -0
- vllm_mlx-0.2.0/vllm_mlx/engine/__init__.py +28 -0
- vllm_mlx-0.2.0/vllm_mlx/engine/base.py +182 -0
- vllm_mlx-0.2.0/vllm_mlx/engine/batched.py +362 -0
- vllm_mlx-0.2.0/vllm_mlx/engine/simple.py +357 -0
- vllm_mlx-0.2.0/vllm_mlx/engine_core.py +562 -0
- vllm_mlx-0.2.0/vllm_mlx/gradio_app.py +372 -0
- vllm_mlx-0.2.0/vllm_mlx/gradio_text_app.py +171 -0
- vllm_mlx-0.2.0/vllm_mlx/mcp/__init__.py +54 -0
- vllm_mlx-0.2.0/vllm_mlx/mcp/client.py +320 -0
- vllm_mlx-0.2.0/vllm_mlx/mcp/config.py +181 -0
- vllm_mlx-0.2.0/vllm_mlx/mcp/executor.py +216 -0
- vllm_mlx-0.2.0/vllm_mlx/mcp/manager.py +299 -0
- vllm_mlx-0.2.0/vllm_mlx/mcp/tools.py +173 -0
- vllm_mlx-0.2.0/vllm_mlx/mcp/types.py +152 -0
- vllm_mlx-0.2.0/vllm_mlx/model_registry.py +190 -0
- vllm_mlx-0.2.0/vllm_mlx/model_runner.py +468 -0
- vllm_mlx-0.2.0/vllm_mlx/models/__init__.py +15 -0
- vllm_mlx-0.2.0/vllm_mlx/models/llm.py +329 -0
- vllm_mlx-0.2.0/vllm_mlx/models/mllm.py +1107 -0
- vllm_mlx-0.2.0/vllm_mlx/optimizations.py +203 -0
- vllm_mlx-0.2.0/vllm_mlx/output_collector.py +206 -0
- vllm_mlx-0.2.0/vllm_mlx/paged_cache.py +1191 -0
- vllm_mlx-0.2.0/vllm_mlx/platform.py +331 -0
- vllm_mlx-0.2.0/vllm_mlx/plugin.py +153 -0
- vllm_mlx-0.2.0/vllm_mlx/prefix_cache.py +948 -0
- vllm_mlx-0.2.0/vllm_mlx/request.py +205 -0
- vllm_mlx-0.2.0/vllm_mlx/scheduler.py +829 -0
- vllm_mlx-0.2.0/vllm_mlx/server.py +816 -0
- vllm_mlx-0.2.0/vllm_mlx/vlm_cache.py +249 -0
- vllm_mlx-0.2.0/vllm_mlx/worker.py +260 -0
- vllm_mlx-0.2.0/vllm_mlx.egg-info/PKG-INFO +291 -0
- vllm_mlx-0.2.0/vllm_mlx.egg-info/SOURCES.txt +68 -0
- vllm_mlx-0.2.0/vllm_mlx.egg-info/dependency_links.txt +1 -0
- vllm_mlx-0.2.0/vllm_mlx.egg-info/entry_points.txt +8 -0
- vllm_mlx-0.2.0/vllm_mlx.egg-info/requires.txt +52 -0
- vllm_mlx-0.2.0/vllm_mlx.egg-info/top_level.txt +1 -0
vllm_mlx-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vllm-mlx
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: vLLM-like inference for Apple Silicon - GPU-accelerated Text, Image, Video & Audio on Mac
|
|
5
|
+
Author: vllm-mlx contributors
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/waybarrios/vllm-mlx
|
|
8
|
+
Project-URL: Documentation, https://github.com/waybarrios/vllm-mlx#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/waybarrios/vllm-mlx
|
|
10
|
+
Keywords: llm,mlx,apple-silicon,vllm,inference,transformers
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
15
|
+
Classifier: Operating System :: MacOS
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
Requires-Dist: mlx>=0.29.0
|
|
25
|
+
Requires-Dist: mlx-lm>=0.20.0
|
|
26
|
+
Requires-Dist: mlx-vlm>=0.1.0
|
|
27
|
+
Requires-Dist: transformers<5.0.0,>=4.40.0
|
|
28
|
+
Requires-Dist: tokenizers>=0.19.0
|
|
29
|
+
Requires-Dist: huggingface-hub>=0.23.0
|
|
30
|
+
Requires-Dist: numpy>=1.24.0
|
|
31
|
+
Requires-Dist: pillow>=10.0.0
|
|
32
|
+
Requires-Dist: tqdm>=4.66.0
|
|
33
|
+
Requires-Dist: pyyaml>=6.0
|
|
34
|
+
Requires-Dist: gradio>=4.0.0
|
|
35
|
+
Requires-Dist: requests>=2.28.0
|
|
36
|
+
Requires-Dist: tabulate>=0.9.0
|
|
37
|
+
Requires-Dist: opencv-python>=4.8.0
|
|
38
|
+
Requires-Dist: psutil>=5.9.0
|
|
39
|
+
Requires-Dist: fastapi>=0.100.0
|
|
40
|
+
Requires-Dist: uvicorn>=0.23.0
|
|
41
|
+
Requires-Dist: mcp>=1.0.0
|
|
42
|
+
Requires-Dist: jsonschema>=4.0.0
|
|
43
|
+
Requires-Dist: mlx-audio>=0.2.9
|
|
44
|
+
Provides-Extra: dev
|
|
45
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
46
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
47
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
48
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
49
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
50
|
+
Provides-Extra: vllm
|
|
51
|
+
Requires-Dist: vllm>=0.4.0; extra == "vllm"
|
|
52
|
+
Provides-Extra: vision
|
|
53
|
+
Requires-Dist: torch>=2.3.0; extra == "vision"
|
|
54
|
+
Requires-Dist: torchvision>=0.18.0; extra == "vision"
|
|
55
|
+
Provides-Extra: audio
|
|
56
|
+
Requires-Dist: mlx-audio>=0.2.9; extra == "audio"
|
|
57
|
+
Requires-Dist: sounddevice>=0.4.0; extra == "audio"
|
|
58
|
+
Requires-Dist: soundfile>=0.12.0; extra == "audio"
|
|
59
|
+
Requires-Dist: scipy>=1.10.0; extra == "audio"
|
|
60
|
+
Requires-Dist: numba>=0.57.0; extra == "audio"
|
|
61
|
+
Requires-Dist: tiktoken>=0.5.0; extra == "audio"
|
|
62
|
+
Requires-Dist: misaki[ja,zh]>=0.5.0; extra == "audio"
|
|
63
|
+
Requires-Dist: spacy>=3.7.0; extra == "audio"
|
|
64
|
+
Requires-Dist: num2words>=0.5.0; extra == "audio"
|
|
65
|
+
Requires-Dist: loguru>=0.7.0; extra == "audio"
|
|
66
|
+
Requires-Dist: phonemizer>=3.2.0; extra == "audio"
|
|
67
|
+
Requires-Dist: ordered_set>=4.1.0; extra == "audio"
|
|
68
|
+
Requires-Dist: cn2an>=0.5.0; extra == "audio"
|
|
69
|
+
Requires-Dist: fugashi>=1.3.0; extra == "audio"
|
|
70
|
+
Requires-Dist: unidic-lite>=1.0.0; extra == "audio"
|
|
71
|
+
Requires-Dist: jieba>=0.42.0; extra == "audio"
|
|
72
|
+
|
|
73
|
+
# vLLM-MLX
|
|
74
|
+
|
|
75
|
+
**vLLM-like inference for Apple Silicon** - GPU-accelerated Text, Image, Video & Audio on Mac
|
|
76
|
+
|
|
77
|
+
[License: Apache 2.0](LICENSE)
|
|
78
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
79
|
+
[Apple Silicon](https://support.apple.com/en-us/HT211814)
|
|
80
|
+
[GitHub: waybarrios/vllm-mlx](https://github.com/waybarrios/vllm-mlx)
|
|
81
|
+
|
|
82
|
+
## Overview
|
|
83
|
+
|
|
84
|
+
vllm-mlx brings native Apple Silicon GPU acceleration to vLLM by integrating:
|
|
85
|
+
|
|
86
|
+
- **[MLX](https://github.com/ml-explore/mlx)**: Apple's ML framework with unified memory and Metal kernels
|
|
87
|
+
- **[mlx-lm](https://github.com/ml-explore/mlx-lm)**: Optimized LLM inference with KV cache and quantization
|
|
88
|
+
- **[mlx-vlm](https://github.com/Blaizzy/mlx-vlm)**: Vision-language models for multimodal inference
|
|
89
|
+
- **[mlx-audio](https://github.com/Blaizzy/mlx-audio)**: Speech-to-Text and Text-to-Speech with native voices
|
|
90
|
+
|
|
91
|
+
## Features
|
|
92
|
+
|
|
93
|
+
- **Multimodal** - Text, Image, Video & Audio in one platform
|
|
94
|
+
- **Native GPU acceleration** on Apple Silicon (M1, M2, M3, M4)
|
|
95
|
+
- **Native TTS voices** - Spanish, French, Chinese, Japanese + 5 more languages
|
|
96
|
+
- **OpenAI API compatible** - drop-in replacement for OpenAI client
|
|
97
|
+
- **MCP Tool Calling** - integrate external tools via Model Context Protocol
|
|
98
|
+
- **Paged KV Cache** - memory-efficient caching with prefix sharing
|
|
99
|
+
- **Continuous Batching** - high throughput for multiple concurrent users
|
|
100
|
+
|
|
101
|
+
## Quick Start
|
|
102
|
+
|
|
103
|
+
### Installation
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
git clone https://github.com/waybarrios/vllm-mlx.git
|
|
107
|
+
cd vllm-mlx
|
|
108
|
+
pip install -e .
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Start Server
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
# Simple mode (single user, max throughput)
|
|
115
|
+
vllm-mlx serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000
|
|
116
|
+
|
|
117
|
+
# Continuous batching (multiple users)
|
|
118
|
+
vllm-mlx serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 --continuous-batching
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Use with OpenAI SDK
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from openai import OpenAI
|
|
125
|
+
|
|
126
|
+
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
|
|
127
|
+
|
|
128
|
+
response = client.chat.completions.create(
|
|
129
|
+
model="default",
|
|
130
|
+
messages=[{"role": "user", "content": "Hello!"}],
|
|
131
|
+
)
|
|
132
|
+
print(response.choices[0].message.content)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### Multimodal (Images & Video)
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
vllm-mlx serve mlx-community/Qwen3-VL-4B-Instruct-3bit --port 8000
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
response = client.chat.completions.create(
|
|
143
|
+
model="default",
|
|
144
|
+
messages=[{
|
|
145
|
+
"role": "user",
|
|
146
|
+
"content": [
|
|
147
|
+
{"type": "text", "text": "What's in this image?"},
|
|
148
|
+
{"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}
|
|
149
|
+
]
|
|
150
|
+
}]
|
|
151
|
+
)
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Audio (TTS/STT)
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
# Install audio dependencies
|
|
158
|
+
pip install "vllm-mlx[audio]"
|
|
159
|
+
python -m spacy download en_core_web_sm
|
|
160
|
+
brew install espeak-ng # macOS, for non-English languages
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
# Text-to-Speech (English)
|
|
165
|
+
python examples/tts_example.py "Hello, how are you?" --play
|
|
166
|
+
|
|
167
|
+
# Text-to-Speech (Spanish)
|
|
168
|
+
python examples/tts_multilingual.py "Hola mundo" --lang es --play
|
|
169
|
+
|
|
170
|
+
# List available models and languages
|
|
171
|
+
python examples/tts_multilingual.py --list-models
|
|
172
|
+
python examples/tts_multilingual.py --list-languages
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
**Supported TTS Models:**
|
|
176
|
+
| Model | Languages | Description |
|
|
177
|
+
|-------|-----------|-------------|
|
|
178
|
+
| Kokoro | EN, ES, FR, JA, ZH, IT, PT, HI | Fast, 82M params, 11 voices |
|
|
179
|
+
| Chatterbox | 15+ languages | Expressive, voice cloning |
|
|
180
|
+
| VibeVoice | EN | Realtime, low latency |
|
|
181
|
+
| VoxCPM | ZH, EN | High quality Chinese/English |
|
|
182
|
+
|
|
183
|
+
## Documentation
|
|
184
|
+
|
|
185
|
+
For full documentation, see the [docs](docs/) directory:
|
|
186
|
+
|
|
187
|
+
- **Getting Started**
|
|
188
|
+
- [Installation](docs/getting-started/installation.md)
|
|
189
|
+
- [Quick Start](docs/getting-started/quickstart.md)
|
|
190
|
+
|
|
191
|
+
- **User Guides**
|
|
192
|
+
- [OpenAI-Compatible Server](docs/guides/server.md)
|
|
193
|
+
- [Python API](docs/guides/python-api.md)
|
|
194
|
+
- [Multimodal (Images & Video)](docs/guides/multimodal.md)
|
|
195
|
+
- [Audio (STT/TTS)](docs/guides/audio.md)
|
|
196
|
+
- [MCP & Tool Calling](docs/guides/mcp-tools.md)
|
|
197
|
+
- [Continuous Batching](docs/guides/continuous-batching.md)
|
|
198
|
+
|
|
199
|
+
- **Reference**
|
|
200
|
+
- [CLI Commands](docs/reference/cli.md)
|
|
201
|
+
- [Supported Models](docs/reference/models.md)
|
|
202
|
+
- [Configuration](docs/reference/configuration.md)
|
|
203
|
+
|
|
204
|
+
- **Benchmarks**
|
|
205
|
+
- [LLM Benchmarks](docs/benchmarks/llm.md)
|
|
206
|
+
- [Image Benchmarks](docs/benchmarks/image.md)
|
|
207
|
+
- [Video Benchmarks](docs/benchmarks/video.md)
|
|
208
|
+
|
|
209
|
+
## Architecture
|
|
210
|
+
|
|
211
|
+
```
|
|
212
|
+
┌─────────────────────────────────────────────┐
|
|
213
|
+
│ vLLM API Layer │
|
|
214
|
+
│ (OpenAI-compatible interface) │
|
|
215
|
+
└─────────────────────────────────────────────┘
|
|
216
|
+
│
|
|
217
|
+
▼
|
|
218
|
+
┌─────────────────────────────────────────────┐
|
|
219
|
+
│ MLXPlatform │
|
|
220
|
+
│ (vLLM platform plugin for Apple Silicon) │
|
|
221
|
+
└─────────────────────────────────────────────┘
|
|
222
|
+
│
|
|
223
|
+
┌───────────┴───────────┐
|
|
224
|
+
▼ ▼
|
|
225
|
+
┌──────────────────┐ ┌──────────────────┐
|
|
226
|
+
│ mlx-lm │ │ mlx-vlm │
|
|
227
|
+
│ (LLM inference) │ │ (MLLM inference) │
|
|
228
|
+
└──────────────────┘ └──────────────────┘
|
|
229
|
+
│ │
|
|
230
|
+
└───────────┬───────────┘
|
|
231
|
+
▼
|
|
232
|
+
┌─────────────────────────────────────────────┐
|
|
233
|
+
│ MLX │
|
|
234
|
+
│ (Apple ML Framework - Metal kernels) │
|
|
235
|
+
└─────────────────────────────────────────────┘
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
## Performance
|
|
239
|
+
|
|
240
|
+
**LLM Performance (M4 Max, 128GB):**
|
|
241
|
+
|
|
242
|
+
| Model | Speed | Memory |
|
|
243
|
+
|-------|-------|--------|
|
|
244
|
+
| Qwen3-0.6B-8bit | 402 tok/s | 0.7 GB |
|
|
245
|
+
| Llama-3.2-1B-4bit | 464 tok/s | 0.7 GB |
|
|
246
|
+
| Llama-3.2-3B-4bit | 200 tok/s | 1.8 GB |
|
|
247
|
+
|
|
248
|
+
**Continuous Batching (5 concurrent requests):**
|
|
249
|
+
|
|
250
|
+
| Model | Single | Batched | Speedup |
|
|
251
|
+
|-------|--------|---------|---------|
|
|
252
|
+
| Qwen3-0.6B-8bit | 328 tok/s | 1112 tok/s | **3.4x** |
|
|
253
|
+
| Llama-3.2-1B-4bit | 299 tok/s | 613 tok/s | **2.0x** |
|
|
254
|
+
|
|
255
|
+
See [benchmarks](docs/benchmarks/) for detailed results.
|
|
256
|
+
|
|
257
|
+
## Contributing
|
|
258
|
+
|
|
259
|
+
We welcome contributions! See [Contributing Guide](docs/development/contributing.md) for details.
|
|
260
|
+
|
|
261
|
+
- Bug fixes and improvements
|
|
262
|
+
- Performance optimizations
|
|
263
|
+
- Documentation improvements
|
|
264
|
+
- Benchmarks on different Apple Silicon chips
|
|
265
|
+
|
|
266
|
+
Submit PRs to: [https://github.com/waybarrios/vllm-mlx](https://github.com/waybarrios/vllm-mlx)
|
|
267
|
+
|
|
268
|
+
## License
|
|
269
|
+
|
|
270
|
+
Apache 2.0 - see [LICENSE](LICENSE) for details.
|
|
271
|
+
|
|
272
|
+
## Citation
|
|
273
|
+
|
|
274
|
+
If you use vLLM-MLX in your research or project, please cite:
|
|
275
|
+
|
|
276
|
+
```bibtex
|
|
277
|
+
@software{vllm_mlx2025,
|
|
278
|
+
author = {Barrios, Wayner},
|
|
279
|
+
title = {vLLM-MLX: Apple Silicon MLX Backend for vLLM},
|
|
280
|
+
year = {2025},
|
|
281
|
+
url = {https://github.com/waybarrios/vllm-mlx},
|
|
282
|
+
note = {Native GPU-accelerated LLM and vision-language model inference on Apple Silicon}
|
|
283
|
+
}
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
## Acknowledgments
|
|
287
|
+
|
|
288
|
+
- [MLX](https://github.com/ml-explore/mlx) - Apple's ML framework
|
|
289
|
+
- [mlx-lm](https://github.com/ml-explore/mlx-lm) - LLM inference library
|
|
290
|
+
- [mlx-vlm](https://github.com/Blaizzy/mlx-vlm) - Vision-language models
|
|
291
|
+
- [vLLM](https://github.com/vllm-project/vllm) - High-throughput LLM serving
|
vllm_mlx-0.2.0/README.md
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
# vLLM-MLX
|
|
2
|
+
|
|
3
|
+
**vLLM-like inference for Apple Silicon** - GPU-accelerated Text, Image, Video & Audio on Mac
|
|
4
|
+
|
|
5
|
+
[License: Apache 2.0](LICENSE)
|
|
6
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
7
|
+
[Apple Silicon](https://support.apple.com/en-us/HT211814)
|
|
8
|
+
[GitHub: waybarrios/vllm-mlx](https://github.com/waybarrios/vllm-mlx)
|
|
9
|
+
|
|
10
|
+
## Overview
|
|
11
|
+
|
|
12
|
+
vllm-mlx brings native Apple Silicon GPU acceleration to vLLM by integrating:
|
|
13
|
+
|
|
14
|
+
- **[MLX](https://github.com/ml-explore/mlx)**: Apple's ML framework with unified memory and Metal kernels
|
|
15
|
+
- **[mlx-lm](https://github.com/ml-explore/mlx-lm)**: Optimized LLM inference with KV cache and quantization
|
|
16
|
+
- **[mlx-vlm](https://github.com/Blaizzy/mlx-vlm)**: Vision-language models for multimodal inference
|
|
17
|
+
- **[mlx-audio](https://github.com/Blaizzy/mlx-audio)**: Speech-to-Text and Text-to-Speech with native voices
|
|
18
|
+
|
|
19
|
+
## Features
|
|
20
|
+
|
|
21
|
+
- **Multimodal** - Text, Image, Video & Audio in one platform
|
|
22
|
+
- **Native GPU acceleration** on Apple Silicon (M1, M2, M3, M4)
|
|
23
|
+
- **Native TTS voices** - Spanish, French, Chinese, Japanese + 5 more languages
|
|
24
|
+
- **OpenAI API compatible** - drop-in replacement for OpenAI client
|
|
25
|
+
- **MCP Tool Calling** - integrate external tools via Model Context Protocol
|
|
26
|
+
- **Paged KV Cache** - memory-efficient caching with prefix sharing
|
|
27
|
+
- **Continuous Batching** - high throughput for multiple concurrent users
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
### Installation
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
git clone https://github.com/waybarrios/vllm-mlx.git
|
|
35
|
+
cd vllm-mlx
|
|
36
|
+
pip install -e .
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### Start Server
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
# Simple mode (single user, max throughput)
|
|
43
|
+
vllm-mlx serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000
|
|
44
|
+
|
|
45
|
+
# Continuous batching (multiple users)
|
|
46
|
+
vllm-mlx serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 --continuous-batching
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Use with OpenAI SDK
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from openai import OpenAI
|
|
53
|
+
|
|
54
|
+
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
|
|
55
|
+
|
|
56
|
+
response = client.chat.completions.create(
|
|
57
|
+
model="default",
|
|
58
|
+
messages=[{"role": "user", "content": "Hello!"}],
|
|
59
|
+
)
|
|
60
|
+
print(response.choices[0].message.content)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Multimodal (Images & Video)
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
vllm-mlx serve mlx-community/Qwen3-VL-4B-Instruct-3bit --port 8000
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
response = client.chat.completions.create(
|
|
71
|
+
model="default",
|
|
72
|
+
messages=[{
|
|
73
|
+
"role": "user",
|
|
74
|
+
"content": [
|
|
75
|
+
{"type": "text", "text": "What's in this image?"},
|
|
76
|
+
{"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}
|
|
77
|
+
]
|
|
78
|
+
}]
|
|
79
|
+
)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Audio (TTS/STT)
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
# Install audio dependencies
|
|
86
|
+
pip install "vllm-mlx[audio]"
|
|
87
|
+
python -m spacy download en_core_web_sm
|
|
88
|
+
brew install espeak-ng # macOS, for non-English languages
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
# Text-to-Speech (English)
|
|
93
|
+
python examples/tts_example.py "Hello, how are you?" --play
|
|
94
|
+
|
|
95
|
+
# Text-to-Speech (Spanish)
|
|
96
|
+
python examples/tts_multilingual.py "Hola mundo" --lang es --play
|
|
97
|
+
|
|
98
|
+
# List available models and languages
|
|
99
|
+
python examples/tts_multilingual.py --list-models
|
|
100
|
+
python examples/tts_multilingual.py --list-languages
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
**Supported TTS Models:**
|
|
104
|
+
| Model | Languages | Description |
|
|
105
|
+
|-------|-----------|-------------|
|
|
106
|
+
| Kokoro | EN, ES, FR, JA, ZH, IT, PT, HI | Fast, 82M params, 11 voices |
|
|
107
|
+
| Chatterbox | 15+ languages | Expressive, voice cloning |
|
|
108
|
+
| VibeVoice | EN | Realtime, low latency |
|
|
109
|
+
| VoxCPM | ZH, EN | High quality Chinese/English |
|
|
110
|
+
|
|
111
|
+
## Documentation
|
|
112
|
+
|
|
113
|
+
For full documentation, see the [docs](docs/) directory:
|
|
114
|
+
|
|
115
|
+
- **Getting Started**
|
|
116
|
+
- [Installation](docs/getting-started/installation.md)
|
|
117
|
+
- [Quick Start](docs/getting-started/quickstart.md)
|
|
118
|
+
|
|
119
|
+
- **User Guides**
|
|
120
|
+
- [OpenAI-Compatible Server](docs/guides/server.md)
|
|
121
|
+
- [Python API](docs/guides/python-api.md)
|
|
122
|
+
- [Multimodal (Images & Video)](docs/guides/multimodal.md)
|
|
123
|
+
- [Audio (STT/TTS)](docs/guides/audio.md)
|
|
124
|
+
- [MCP & Tool Calling](docs/guides/mcp-tools.md)
|
|
125
|
+
- [Continuous Batching](docs/guides/continuous-batching.md)
|
|
126
|
+
|
|
127
|
+
- **Reference**
|
|
128
|
+
- [CLI Commands](docs/reference/cli.md)
|
|
129
|
+
- [Supported Models](docs/reference/models.md)
|
|
130
|
+
- [Configuration](docs/reference/configuration.md)
|
|
131
|
+
|
|
132
|
+
- **Benchmarks**
|
|
133
|
+
- [LLM Benchmarks](docs/benchmarks/llm.md)
|
|
134
|
+
- [Image Benchmarks](docs/benchmarks/image.md)
|
|
135
|
+
- [Video Benchmarks](docs/benchmarks/video.md)
|
|
136
|
+
|
|
137
|
+
## Architecture
|
|
138
|
+
|
|
139
|
+
```
|
|
140
|
+
┌─────────────────────────────────────────────┐
|
|
141
|
+
│ vLLM API Layer │
|
|
142
|
+
│ (OpenAI-compatible interface) │
|
|
143
|
+
└─────────────────────────────────────────────┘
|
|
144
|
+
│
|
|
145
|
+
▼
|
|
146
|
+
┌─────────────────────────────────────────────┐
|
|
147
|
+
│ MLXPlatform │
|
|
148
|
+
│ (vLLM platform plugin for Apple Silicon) │
|
|
149
|
+
└─────────────────────────────────────────────┘
|
|
150
|
+
│
|
|
151
|
+
┌───────────┴───────────┐
|
|
152
|
+
▼ ▼
|
|
153
|
+
┌──────────────────┐ ┌──────────────────┐
|
|
154
|
+
│ mlx-lm │ │ mlx-vlm │
|
|
155
|
+
│ (LLM inference) │ │ (MLLM inference) │
|
|
156
|
+
└──────────────────┘ └──────────────────┘
|
|
157
|
+
│ │
|
|
158
|
+
└───────────┬───────────┘
|
|
159
|
+
▼
|
|
160
|
+
┌─────────────────────────────────────────────┐
|
|
161
|
+
│ MLX │
|
|
162
|
+
│ (Apple ML Framework - Metal kernels) │
|
|
163
|
+
└─────────────────────────────────────────────┘
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## Performance
|
|
167
|
+
|
|
168
|
+
**LLM Performance (M4 Max, 128GB):**
|
|
169
|
+
|
|
170
|
+
| Model | Speed | Memory |
|
|
171
|
+
|-------|-------|--------|
|
|
172
|
+
| Qwen3-0.6B-8bit | 402 tok/s | 0.7 GB |
|
|
173
|
+
| Llama-3.2-1B-4bit | 464 tok/s | 0.7 GB |
|
|
174
|
+
| Llama-3.2-3B-4bit | 200 tok/s | 1.8 GB |
|
|
175
|
+
|
|
176
|
+
**Continuous Batching (5 concurrent requests):**
|
|
177
|
+
|
|
178
|
+
| Model | Single | Batched | Speedup |
|
|
179
|
+
|-------|--------|---------|---------|
|
|
180
|
+
| Qwen3-0.6B-8bit | 328 tok/s | 1112 tok/s | **3.4x** |
|
|
181
|
+
| Llama-3.2-1B-4bit | 299 tok/s | 613 tok/s | **2.0x** |
|
|
182
|
+
|
|
183
|
+
See [benchmarks](docs/benchmarks/) for detailed results.
|
|
184
|
+
|
|
185
|
+
## Contributing
|
|
186
|
+
|
|
187
|
+
We welcome contributions! See [Contributing Guide](docs/development/contributing.md) for details.
|
|
188
|
+
|
|
189
|
+
- Bug fixes and improvements
|
|
190
|
+
- Performance optimizations
|
|
191
|
+
- Documentation improvements
|
|
192
|
+
- Benchmarks on different Apple Silicon chips
|
|
193
|
+
|
|
194
|
+
Submit PRs to: [https://github.com/waybarrios/vllm-mlx](https://github.com/waybarrios/vllm-mlx)
|
|
195
|
+
|
|
196
|
+
## License
|
|
197
|
+
|
|
198
|
+
Apache 2.0 - see [LICENSE](LICENSE) for details.
|
|
199
|
+
|
|
200
|
+
## Citation
|
|
201
|
+
|
|
202
|
+
If you use vLLM-MLX in your research or project, please cite:
|
|
203
|
+
|
|
204
|
+
```bibtex
|
|
205
|
+
@software{vllm_mlx2025,
|
|
206
|
+
author = {Barrios, Wayner},
|
|
207
|
+
title = {vLLM-MLX: Apple Silicon MLX Backend for vLLM},
|
|
208
|
+
year = {2025},
|
|
209
|
+
url = {https://github.com/waybarrios/vllm-mlx},
|
|
210
|
+
note = {Native GPU-accelerated LLM and vision-language model inference on Apple Silicon}
|
|
211
|
+
}
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
## Acknowledgments
|
|
215
|
+
|
|
216
|
+
- [MLX](https://github.com/ml-explore/mlx) - Apple's ML framework
|
|
217
|
+
- [mlx-lm](https://github.com/ml-explore/mlx-lm) - LLM inference library
|
|
218
|
+
- [mlx-vlm](https://github.com/Blaizzy/mlx-vlm) - Vision-language models
|
|
219
|
+
- [vLLM](https://github.com/vllm-project/vllm) - High-throughput LLM serving
|
vllm_mlx-0.2.0/pyproject.toml
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vllm-mlx"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "vLLM-like inference for Apple Silicon - GPU-accelerated Text, Image, Video & Audio on Mac"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "Apache-2.0"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "vllm-mlx contributors"}
|
|
14
|
+
]
|
|
15
|
+
keywords = ["llm", "mlx", "apple-silicon", "vllm", "inference", "transformers"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Intended Audience :: Science/Research",
|
|
20
|
+
"License :: OSI Approved :: Apache Software License",
|
|
21
|
+
"Operating System :: MacOS",
|
|
22
|
+
"Programming Language :: Python :: 3",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
"Programming Language :: Python :: 3.13",
|
|
27
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
dependencies = [
|
|
31
|
+
"mlx>=0.29.0",
|
|
32
|
+
"mlx-lm>=0.20.0",
|
|
33
|
+
"mlx-vlm>=0.1.0", # VLM support
|
|
34
|
+
"transformers>=4.40.0,<5.0.0",
|
|
35
|
+
"tokenizers>=0.19.0",
|
|
36
|
+
"huggingface-hub>=0.23.0",
|
|
37
|
+
"numpy>=1.24.0",
|
|
38
|
+
"pillow>=10.0.0",
|
|
39
|
+
"tqdm>=4.66.0",
|
|
40
|
+
"pyyaml>=6.0",
|
|
41
|
+
"gradio>=4.0.0",
|
|
42
|
+
"requests>=2.28.0",
|
|
43
|
+
"tabulate>=0.9.0",
|
|
44
|
+
# Video processing for VLM
|
|
45
|
+
"opencv-python>=4.8.0",
|
|
46
|
+
# Resource monitoring
|
|
47
|
+
"psutil>=5.9.0",
|
|
48
|
+
# Server
|
|
49
|
+
"fastapi>=0.100.0",
|
|
50
|
+
"uvicorn>=0.23.0",
|
|
51
|
+
# MCP (Model Context Protocol) support
|
|
52
|
+
"mcp>=1.0.0",
|
|
53
|
+
# JSON Schema validation for structured output
|
|
54
|
+
"jsonschema>=4.0.0",
|
|
55
|
+
# Audio support (STT, TTS, audio processing)
|
|
56
|
+
"mlx-audio>=0.2.9",
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
[project.optional-dependencies]
|
|
60
|
+
dev = [
|
|
61
|
+
"pytest>=7.0.0",
|
|
62
|
+
"pytest-asyncio>=0.21.0",
|
|
63
|
+
"black>=23.0.0",
|
|
64
|
+
"ruff>=0.1.0",
|
|
65
|
+
"mypy>=1.0.0",
|
|
66
|
+
]
|
|
67
|
+
vllm = [
|
|
68
|
+
"vllm>=0.4.0",
|
|
69
|
+
]
|
|
70
|
+
vision = [
|
|
71
|
+
"torch>=2.3.0",
|
|
72
|
+
"torchvision>=0.18.0",
|
|
73
|
+
]
|
|
74
|
+
# Audio dependencies for TTS/STT (mlx-audio)
|
|
75
|
+
audio = [
|
|
76
|
+
"mlx-audio>=0.2.9",
|
|
77
|
+
"sounddevice>=0.4.0",
|
|
78
|
+
"soundfile>=0.12.0",
|
|
79
|
+
"scipy>=1.10.0",
|
|
80
|
+
"numba>=0.57.0",
|
|
81
|
+
"tiktoken>=0.5.0",
|
|
82
|
+
"misaki[zh,ja]>=0.5.0", # Chinese (zh) and Japanese (ja) support
|
|
83
|
+
"spacy>=3.7.0",
|
|
84
|
+
"num2words>=0.5.0",
|
|
85
|
+
"loguru>=0.7.0",
|
|
86
|
+
"phonemizer>=3.2.0",
|
|
87
|
+
# Additional multilingual dependencies
|
|
88
|
+
"ordered_set>=4.1.0", # Required for Chinese TTS
|
|
89
|
+
"cn2an>=0.5.0", # Chinese number conversion
|
|
90
|
+
"fugashi>=1.3.0", # Japanese tokenizer
|
|
91
|
+
"unidic-lite>=1.0.0", # Japanese dictionary for fugashi
|
|
92
|
+
"jieba>=0.42.0", # Chinese word segmentation
|
|
93
|
+
]
|
|
94
|
+
|
|
95
|
+
[project.urls]
|
|
96
|
+
Homepage = "https://github.com/waybarrios/vllm-mlx"
|
|
97
|
+
Documentation = "https://github.com/waybarrios/vllm-mlx#readme"
|
|
98
|
+
Repository = "https://github.com/waybarrios/vllm-mlx"
|
|
99
|
+
|
|
100
|
+
[project.entry-points."vllm.platform_plugins"]
|
|
101
|
+
mlx = "vllm_mlx.plugin:mlx_platform_plugin"
|
|
102
|
+
|
|
103
|
+
[project.scripts]
|
|
104
|
+
vllm-mlx = "vllm_mlx.cli:main"
|
|
105
|
+
vllm-mlx-serve = "vllm_mlx.server:main"
|
|
106
|
+
vllm-mlx-chat = "vllm_mlx.gradio_app:main"
|
|
107
|
+
vllm-mlx-bench = "vllm_mlx.benchmark:main"
|
|
108
|
+
|
|
109
|
+
[tool.setuptools.packages.find]
|
|
110
|
+
where = ["."]
|
|
111
|
+
include = ["vllm_mlx*"]
|
|
112
|
+
|
|
113
|
+
[tool.black]
|
|
114
|
+
line-length = 88
|
|
115
|
+
target-version = ["py310", "py311", "py312", "py313"]
|
|
116
|
+
|
|
117
|
+
[tool.ruff]
|
|
118
|
+
line-length = 88
|
|
119
|
+
select = ["E", "F", "W", "I", "N", "UP", "B", "SIM"]
|
|
120
|
+
ignore = ["E501", "B905"]
|
|
121
|
+
|
|
122
|
+
[tool.mypy]
|
|
123
|
+
python_version = "3.10"
|
|
124
|
+
warn_return_any = true
|
|
125
|
+
warn_unused_configs = true
|
|
126
|
+
ignore_missing_imports = true
|
|
127
|
+
|
|
128
|
+
[tool.pytest.ini_options]
|
|
129
|
+
testpaths = ["tests"]
|
|
130
|
+
python_files = ["test_*.py"]
|
|
131
|
+
asyncio_mode = "auto"
|
vllm_mlx-0.2.0/setup.cfg
ADDED