vllm-mlx 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. vllm_mlx-0.2.0/PKG-INFO +291 -0
  2. vllm_mlx-0.2.0/README.md +219 -0
  3. vllm_mlx-0.2.0/pyproject.toml +131 -0
  4. vllm_mlx-0.2.0/setup.cfg +4 -0
  5. vllm_mlx-0.2.0/tests/test_audio.py +283 -0
  6. vllm_mlx-0.2.0/tests/test_batching.py +477 -0
  7. vllm_mlx-0.2.0/tests/test_batching_deterministic.py +437 -0
  8. vllm_mlx-0.2.0/tests/test_continuous_batching.py +282 -0
  9. vllm_mlx-0.2.0/tests/test_llm.py +116 -0
  10. vllm_mlx-0.2.0/tests/test_mllm.py +398 -0
  11. vllm_mlx-0.2.0/tests/test_model_registry.py +259 -0
  12. vllm_mlx-0.2.0/tests/test_optimizations.py +110 -0
  13. vllm_mlx-0.2.0/tests/test_paged_cache.py +726 -0
  14. vllm_mlx-0.2.0/tests/test_paged_cache_benefits.py +441 -0
  15. vllm_mlx-0.2.0/tests/test_paged_cache_real_inference.py +272 -0
  16. vllm_mlx-0.2.0/tests/test_paged_cache_real_model.py +547 -0
  17. vllm_mlx-0.2.0/tests/test_platform.py +98 -0
  18. vllm_mlx-0.2.0/tests/test_prefix_cache.py +513 -0
  19. vllm_mlx-0.2.0/tests/test_server.py +338 -0
  20. vllm_mlx-0.2.0/tests/test_streaming_latency.py +321 -0
  21. vllm_mlx-0.2.0/tests/test_structured_output.py +393 -0
  22. vllm_mlx-0.2.0/tests/test_vlm_cache.py +906 -0
  23. vllm_mlx-0.2.0/vllm_mlx/__init__.py +80 -0
  24. vllm_mlx-0.2.0/vllm_mlx/api/__init__.py +119 -0
  25. vllm_mlx-0.2.0/vllm_mlx/api/models.py +341 -0
  26. vllm_mlx-0.2.0/vllm_mlx/api/tool_calling.py +392 -0
  27. vllm_mlx-0.2.0/vllm_mlx/api/utils.py +195 -0
  28. vllm_mlx-0.2.0/vllm_mlx/attention.py +243 -0
  29. vllm_mlx-0.2.0/vllm_mlx/audio/__init__.py +25 -0
  30. vllm_mlx-0.2.0/vllm_mlx/audio/processor.py +212 -0
  31. vllm_mlx-0.2.0/vllm_mlx/audio/stt.py +158 -0
  32. vllm_mlx-0.2.0/vllm_mlx/audio/tts.py +298 -0
  33. vllm_mlx-0.2.0/vllm_mlx/benchmark.py +1569 -0
  34. vllm_mlx-0.2.0/vllm_mlx/cli.py +323 -0
  35. vllm_mlx-0.2.0/vllm_mlx/engine/__init__.py +28 -0
  36. vllm_mlx-0.2.0/vllm_mlx/engine/base.py +182 -0
  37. vllm_mlx-0.2.0/vllm_mlx/engine/batched.py +362 -0
  38. vllm_mlx-0.2.0/vllm_mlx/engine/simple.py +357 -0
  39. vllm_mlx-0.2.0/vllm_mlx/engine_core.py +562 -0
  40. vllm_mlx-0.2.0/vllm_mlx/gradio_app.py +372 -0
  41. vllm_mlx-0.2.0/vllm_mlx/gradio_text_app.py +171 -0
  42. vllm_mlx-0.2.0/vllm_mlx/mcp/__init__.py +54 -0
  43. vllm_mlx-0.2.0/vllm_mlx/mcp/client.py +320 -0
  44. vllm_mlx-0.2.0/vllm_mlx/mcp/config.py +181 -0
  45. vllm_mlx-0.2.0/vllm_mlx/mcp/executor.py +216 -0
  46. vllm_mlx-0.2.0/vllm_mlx/mcp/manager.py +299 -0
  47. vllm_mlx-0.2.0/vllm_mlx/mcp/tools.py +173 -0
  48. vllm_mlx-0.2.0/vllm_mlx/mcp/types.py +152 -0
  49. vllm_mlx-0.2.0/vllm_mlx/model_registry.py +190 -0
  50. vllm_mlx-0.2.0/vllm_mlx/model_runner.py +468 -0
  51. vllm_mlx-0.2.0/vllm_mlx/models/__init__.py +15 -0
  52. vllm_mlx-0.2.0/vllm_mlx/models/llm.py +329 -0
  53. vllm_mlx-0.2.0/vllm_mlx/models/mllm.py +1107 -0
  54. vllm_mlx-0.2.0/vllm_mlx/optimizations.py +203 -0
  55. vllm_mlx-0.2.0/vllm_mlx/output_collector.py +206 -0
  56. vllm_mlx-0.2.0/vllm_mlx/paged_cache.py +1191 -0
  57. vllm_mlx-0.2.0/vllm_mlx/platform.py +331 -0
  58. vllm_mlx-0.2.0/vllm_mlx/plugin.py +153 -0
  59. vllm_mlx-0.2.0/vllm_mlx/prefix_cache.py +948 -0
  60. vllm_mlx-0.2.0/vllm_mlx/request.py +205 -0
  61. vllm_mlx-0.2.0/vllm_mlx/scheduler.py +829 -0
  62. vllm_mlx-0.2.0/vllm_mlx/server.py +816 -0
  63. vllm_mlx-0.2.0/vllm_mlx/vlm_cache.py +249 -0
  64. vllm_mlx-0.2.0/vllm_mlx/worker.py +260 -0
  65. vllm_mlx-0.2.0/vllm_mlx.egg-info/PKG-INFO +291 -0
  66. vllm_mlx-0.2.0/vllm_mlx.egg-info/SOURCES.txt +68 -0
  67. vllm_mlx-0.2.0/vllm_mlx.egg-info/dependency_links.txt +1 -0
  68. vllm_mlx-0.2.0/vllm_mlx.egg-info/entry_points.txt +8 -0
  69. vllm_mlx-0.2.0/vllm_mlx.egg-info/requires.txt +52 -0
  70. vllm_mlx-0.2.0/vllm_mlx.egg-info/top_level.txt +1 -0
@@ -0,0 +1,291 @@
1
+ Metadata-Version: 2.4
2
+ Name: vllm-mlx
3
+ Version: 0.2.0
4
+ Summary: vLLM-like inference for Apple Silicon - GPU-accelerated Text, Image, Video & Audio on Mac
5
+ Author: vllm-mlx contributors
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/waybarrios/vllm-mlx
8
+ Project-URL: Documentation, https://github.com/waybarrios/vllm-mlx#readme
9
+ Project-URL: Repository, https://github.com/waybarrios/vllm-mlx
10
+ Keywords: llm,mlx,apple-silicon,vllm,inference,transformers
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Operating System :: MacOS
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ Requires-Dist: mlx>=0.29.0
25
+ Requires-Dist: mlx-lm>=0.20.0
26
+ Requires-Dist: mlx-vlm>=0.1.0
27
+ Requires-Dist: transformers<5.0.0,>=4.40.0
28
+ Requires-Dist: tokenizers>=0.19.0
29
+ Requires-Dist: huggingface-hub>=0.23.0
30
+ Requires-Dist: numpy>=1.24.0
31
+ Requires-Dist: pillow>=10.0.0
32
+ Requires-Dist: tqdm>=4.66.0
33
+ Requires-Dist: pyyaml>=6.0
34
+ Requires-Dist: gradio>=4.0.0
35
+ Requires-Dist: requests>=2.28.0
36
+ Requires-Dist: tabulate>=0.9.0
37
+ Requires-Dist: opencv-python>=4.8.0
38
+ Requires-Dist: psutil>=5.9.0
39
+ Requires-Dist: fastapi>=0.100.0
40
+ Requires-Dist: uvicorn>=0.23.0
41
+ Requires-Dist: mcp>=1.0.0
42
+ Requires-Dist: jsonschema>=4.0.0
43
+ Requires-Dist: mlx-audio>=0.2.9
44
+ Provides-Extra: dev
45
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
46
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
47
+ Requires-Dist: black>=23.0.0; extra == "dev"
48
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
49
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
50
+ Provides-Extra: vllm
51
+ Requires-Dist: vllm>=0.4.0; extra == "vllm"
52
+ Provides-Extra: vision
53
+ Requires-Dist: torch>=2.3.0; extra == "vision"
54
+ Requires-Dist: torchvision>=0.18.0; extra == "vision"
55
+ Provides-Extra: audio
56
+ Requires-Dist: mlx-audio>=0.2.9; extra == "audio"
57
+ Requires-Dist: sounddevice>=0.4.0; extra == "audio"
58
+ Requires-Dist: soundfile>=0.12.0; extra == "audio"
59
+ Requires-Dist: scipy>=1.10.0; extra == "audio"
60
+ Requires-Dist: numba>=0.57.0; extra == "audio"
61
+ Requires-Dist: tiktoken>=0.5.0; extra == "audio"
62
+ Requires-Dist: misaki[ja,zh]>=0.5.0; extra == "audio"
63
+ Requires-Dist: spacy>=3.7.0; extra == "audio"
64
+ Requires-Dist: num2words>=0.5.0; extra == "audio"
65
+ Requires-Dist: loguru>=0.7.0; extra == "audio"
66
+ Requires-Dist: phonemizer>=3.2.0; extra == "audio"
67
+ Requires-Dist: ordered_set>=4.1.0; extra == "audio"
68
+ Requires-Dist: cn2an>=0.5.0; extra == "audio"
69
+ Requires-Dist: fugashi>=1.3.0; extra == "audio"
70
+ Requires-Dist: unidic-lite>=1.0.0; extra == "audio"
71
+ Requires-Dist: jieba>=0.42.0; extra == "audio"
72
+
73
+ # vLLM-MLX
74
+
75
+ **vLLM-like inference for Apple Silicon** - GPU-accelerated Text, Image, Video & Audio on Mac
76
+
77
+ [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](LICENSE)
78
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
79
+ [![Apple Silicon](https://img.shields.io/badge/Apple-Silicon-black.svg)](https://support.apple.com/en-us/HT211814)
80
+ [![GitHub](https://img.shields.io/badge/GitHub-waybarrios%2Fvllm--mlx-blue?logo=github)](https://github.com/waybarrios/vllm-mlx)
81
+
82
+ ## Overview
83
+
84
+ vllm-mlx brings native Apple Silicon GPU acceleration to vLLM by integrating:
85
+
86
+ - **[MLX](https://github.com/ml-explore/mlx)**: Apple's ML framework with unified memory and Metal kernels
87
+ - **[mlx-lm](https://github.com/ml-explore/mlx-lm)**: Optimized LLM inference with KV cache and quantization
88
+ - **[mlx-vlm](https://github.com/Blaizzy/mlx-vlm)**: Vision-language models for multimodal inference
89
+ - **[mlx-audio](https://github.com/Blaizzy/mlx-audio)**: Speech-to-Text and Text-to-Speech with native voices
90
+
91
+ ## Features
92
+
93
+ - **Multimodal** - Text, Image, Video & Audio in one platform
94
+ - **Native GPU acceleration** on Apple Silicon (M1, M2, M3, M4)
95
+ - **Native TTS voices** - Spanish, French, Chinese, Japanese + 5 more languages
96
+ - **OpenAI API compatible** - drop-in replacement for OpenAI client
97
+ - **MCP Tool Calling** - integrate external tools via Model Context Protocol
98
+ - **Paged KV Cache** - memory-efficient caching with prefix sharing
99
+ - **Continuous Batching** - high throughput for multiple concurrent users
100
+
101
+ ## Quick Start
102
+
103
+ ### Installation
104
+
105
+ ```bash
106
+ git clone https://github.com/waybarrios/vllm-mlx.git
107
+ cd vllm-mlx
108
+ pip install -e .
109
+ ```
110
+
111
+ ### Start Server
112
+
113
+ ```bash
114
+ # Simple mode (single user, max throughput)
115
+ vllm-mlx serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000
116
+
117
+ # Continuous batching (multiple users)
118
+ vllm-mlx serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 --continuous-batching
119
+ ```
120
+
121
+ ### Use with OpenAI SDK
122
+
123
+ ```python
124
+ from openai import OpenAI
125
+
126
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
127
+
128
+ response = client.chat.completions.create(
129
+ model="default",
130
+ messages=[{"role": "user", "content": "Hello!"}],
131
+ )
132
+ print(response.choices[0].message.content)
133
+ ```
134
+
135
+ ### Multimodal (Images & Video)
136
+
137
+ ```bash
138
+ vllm-mlx serve mlx-community/Qwen3-VL-4B-Instruct-3bit --port 8000
139
+ ```
140
+
141
+ ```python
142
+ response = client.chat.completions.create(
143
+ model="default",
144
+ messages=[{
145
+ "role": "user",
146
+ "content": [
147
+ {"type": "text", "text": "What's in this image?"},
148
+ {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}
149
+ ]
150
+ }]
151
+ )
152
+ ```
153
+
154
+ ### Audio (TTS/STT)
155
+
156
+ ```bash
157
+ # Install audio dependencies
158
+ pip install "vllm-mlx[audio]"  # quotes required in zsh (default macOS shell)
159
+ python -m spacy download en_core_web_sm
160
+ brew install espeak-ng # macOS, for non-English languages
161
+ ```
162
+
163
+ ```bash
164
+ # Text-to-Speech (English)
165
+ python examples/tts_example.py "Hello, how are you?" --play
166
+
167
+ # Text-to-Speech (Spanish)
168
+ python examples/tts_multilingual.py "Hola mundo" --lang es --play
169
+
170
+ # List available models and languages
171
+ python examples/tts_multilingual.py --list-models
172
+ python examples/tts_multilingual.py --list-languages
173
+ ```
174
+
175
+ **Supported TTS Models:**
176
+ | Model | Languages | Description |
177
+ |-------|-----------|-------------|
178
+ | Kokoro | EN, ES, FR, JA, ZH, IT, PT, HI | Fast, 82M params, 11 voices |
179
+ | Chatterbox | 15+ languages | Expressive, voice cloning |
180
+ | VibeVoice | EN | Realtime, low latency |
181
+ | VoxCPM | ZH, EN | High quality Chinese/English |
182
+
183
+ ## Documentation
184
+
185
+ For full documentation, see the [docs](docs/) directory:
186
+
187
+ - **Getting Started**
188
+ - [Installation](docs/getting-started/installation.md)
189
+ - [Quick Start](docs/getting-started/quickstart.md)
190
+
191
+ - **User Guides**
192
+ - [OpenAI-Compatible Server](docs/guides/server.md)
193
+ - [Python API](docs/guides/python-api.md)
194
+ - [Multimodal (Images & Video)](docs/guides/multimodal.md)
195
+ - [Audio (STT/TTS)](docs/guides/audio.md)
196
+ - [MCP & Tool Calling](docs/guides/mcp-tools.md)
197
+ - [Continuous Batching](docs/guides/continuous-batching.md)
198
+
199
+ - **Reference**
200
+ - [CLI Commands](docs/reference/cli.md)
201
+ - [Supported Models](docs/reference/models.md)
202
+ - [Configuration](docs/reference/configuration.md)
203
+
204
+ - **Benchmarks**
205
+ - [LLM Benchmarks](docs/benchmarks/llm.md)
206
+ - [Image Benchmarks](docs/benchmarks/image.md)
207
+ - [Video Benchmarks](docs/benchmarks/video.md)
208
+
209
+ ## Architecture
210
+
211
+ ```
212
+ ┌─────────────────────────────────────────────┐
213
+ │ vLLM API Layer │
214
+ │ (OpenAI-compatible interface) │
215
+ └─────────────────────────────────────────────┘
216
+
217
+
218
+ ┌─────────────────────────────────────────────┐
219
+ │ MLXPlatform │
220
+ │ (vLLM platform plugin for Apple Silicon) │
221
+ └─────────────────────────────────────────────┘
222
+
223
+ ┌───────────┴───────────┐
224
+ ▼ ▼
225
+ ┌──────────────────┐ ┌──────────────────┐
226
+ │ mlx-lm │ │ mlx-vlm │
227
+ │ (LLM inference) │ │ (MLLM inference) │
228
+ └──────────────────┘ └──────────────────┘
229
+ │ │
230
+ └───────────┬───────────┘
231
+
232
+ ┌─────────────────────────────────────────────┐
233
+ │ MLX │
234
+ │ (Apple ML Framework - Metal kernels) │
235
+ └─────────────────────────────────────────────┘
236
+ ```
237
+
238
+ ## Performance
239
+
240
+ **LLM Performance (M4 Max, 128GB):**
241
+
242
+ | Model | Speed | Memory |
243
+ |-------|-------|--------|
244
+ | Qwen3-0.6B-8bit | 402 tok/s | 0.7 GB |
245
+ | Llama-3.2-1B-4bit | 464 tok/s | 0.7 GB |
246
+ | Llama-3.2-3B-4bit | 200 tok/s | 1.8 GB |
247
+
248
+ **Continuous Batching (5 concurrent requests):**
249
+
250
+ | Model | Single | Batched | Speedup |
251
+ |-------|--------|---------|---------|
252
+ | Qwen3-0.6B-8bit | 328 tok/s | 1112 tok/s | **3.4x** |
253
+ | Llama-3.2-1B-4bit | 299 tok/s | 613 tok/s | **2.0x** |
254
+
255
+ See [benchmarks](docs/benchmarks/) for detailed results.
256
+
257
+ ## Contributing
258
+
259
+ We welcome contributions! See [Contributing Guide](docs/development/contributing.md) for details.
260
+
261
+ - Bug fixes and improvements
262
+ - Performance optimizations
263
+ - Documentation improvements
264
+ - Benchmarks on different Apple Silicon chips
265
+
266
+ Submit PRs to: [https://github.com/waybarrios/vllm-mlx](https://github.com/waybarrios/vllm-mlx)
267
+
268
+ ## License
269
+
270
+ Apache 2.0 - see [LICENSE](LICENSE) for details.
271
+
272
+ ## Citation
273
+
274
+ If you use vLLM-MLX in your research or project, please cite:
275
+
276
+ ```bibtex
277
+ @software{vllm_mlx2025,
278
+ author = {Barrios, Wayner},
279
+ title = {vLLM-MLX: Apple Silicon MLX Backend for vLLM},
280
+ year = {2025},
281
+ url = {https://github.com/waybarrios/vllm-mlx},
282
+ note = {Native GPU-accelerated LLM and vision-language model inference on Apple Silicon}
283
+ }
284
+ ```
285
+
286
+ ## Acknowledgments
287
+
288
+ - [MLX](https://github.com/ml-explore/mlx) - Apple's ML framework
289
+ - [mlx-lm](https://github.com/ml-explore/mlx-lm) - LLM inference library
290
+ - [mlx-vlm](https://github.com/Blaizzy/mlx-vlm) - Vision-language models
291
+ - [vLLM](https://github.com/vllm-project/vllm) - High-throughput LLM serving
@@ -0,0 +1,219 @@
1
+ # vLLM-MLX
2
+
3
+ **vLLM-like inference for Apple Silicon** - GPU-accelerated Text, Image, Video & Audio on Mac
4
+
5
+ [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](LICENSE)
6
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
7
+ [![Apple Silicon](https://img.shields.io/badge/Apple-Silicon-black.svg)](https://support.apple.com/en-us/HT211814)
8
+ [![GitHub](https://img.shields.io/badge/GitHub-waybarrios%2Fvllm--mlx-blue?logo=github)](https://github.com/waybarrios/vllm-mlx)
9
+
10
+ ## Overview
11
+
12
+ vllm-mlx brings native Apple Silicon GPU acceleration to vLLM by integrating:
13
+
14
+ - **[MLX](https://github.com/ml-explore/mlx)**: Apple's ML framework with unified memory and Metal kernels
15
+ - **[mlx-lm](https://github.com/ml-explore/mlx-lm)**: Optimized LLM inference with KV cache and quantization
16
+ - **[mlx-vlm](https://github.com/Blaizzy/mlx-vlm)**: Vision-language models for multimodal inference
17
+ - **[mlx-audio](https://github.com/Blaizzy/mlx-audio)**: Speech-to-Text and Text-to-Speech with native voices
18
+
19
+ ## Features
20
+
21
+ - **Multimodal** - Text, Image, Video & Audio in one platform
22
+ - **Native GPU acceleration** on Apple Silicon (M1, M2, M3, M4)
23
+ - **Native TTS voices** - Spanish, French, Chinese, Japanese + 5 more languages
24
+ - **OpenAI API compatible** - drop-in replacement for OpenAI client
25
+ - **MCP Tool Calling** - integrate external tools via Model Context Protocol
26
+ - **Paged KV Cache** - memory-efficient caching with prefix sharing
27
+ - **Continuous Batching** - high throughput for multiple concurrent users
28
+
29
+ ## Quick Start
30
+
31
+ ### Installation
32
+
33
+ ```bash
34
+ git clone https://github.com/waybarrios/vllm-mlx.git
35
+ cd vllm-mlx
36
+ pip install -e .
37
+ ```
38
+
39
+ ### Start Server
40
+
41
+ ```bash
42
+ # Simple mode (single user, max throughput)
43
+ vllm-mlx serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000
44
+
45
+ # Continuous batching (multiple users)
46
+ vllm-mlx serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 --continuous-batching
47
+ ```
48
+
49
+ ### Use with OpenAI SDK
50
+
51
+ ```python
52
+ from openai import OpenAI
53
+
54
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
55
+
56
+ response = client.chat.completions.create(
57
+ model="default",
58
+ messages=[{"role": "user", "content": "Hello!"}],
59
+ )
60
+ print(response.choices[0].message.content)
61
+ ```
62
+
63
+ ### Multimodal (Images & Video)
64
+
65
+ ```bash
66
+ vllm-mlx serve mlx-community/Qwen3-VL-4B-Instruct-3bit --port 8000
67
+ ```
68
+
69
+ ```python
70
+ response = client.chat.completions.create(
71
+ model="default",
72
+ messages=[{
73
+ "role": "user",
74
+ "content": [
75
+ {"type": "text", "text": "What's in this image?"},
76
+ {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}
77
+ ]
78
+ }]
79
+ )
80
+ ```
81
+
82
+ ### Audio (TTS/STT)
83
+
84
+ ```bash
85
+ # Install audio dependencies
86
+ pip install vllm-mlx[audio]
87
+ python -m spacy download en_core_web_sm
88
+ brew install espeak-ng # macOS, for non-English languages
89
+ ```
90
+
91
+ ```bash
92
+ # Text-to-Speech (English)
93
+ python examples/tts_example.py "Hello, how are you?" --play
94
+
95
+ # Text-to-Speech (Spanish)
96
+ python examples/tts_multilingual.py "Hola mundo" --lang es --play
97
+
98
+ # List available models and languages
99
+ python examples/tts_multilingual.py --list-models
100
+ python examples/tts_multilingual.py --list-languages
101
+ ```
102
+
103
+ **Supported TTS Models:**
104
+ | Model | Languages | Description |
105
+ |-------|-----------|-------------|
106
+ | Kokoro | EN, ES, FR, JA, ZH, IT, PT, HI | Fast, 82M params, 11 voices |
107
+ | Chatterbox | 15+ languages | Expressive, voice cloning |
108
+ | VibeVoice | EN | Realtime, low latency |
109
+ | VoxCPM | ZH, EN | High quality Chinese/English |
110
+
111
+ ## Documentation
112
+
113
+ For full documentation, see the [docs](docs/) directory:
114
+
115
+ - **Getting Started**
116
+ - [Installation](docs/getting-started/installation.md)
117
+ - [Quick Start](docs/getting-started/quickstart.md)
118
+
119
+ - **User Guides**
120
+ - [OpenAI-Compatible Server](docs/guides/server.md)
121
+ - [Python API](docs/guides/python-api.md)
122
+ - [Multimodal (Images & Video)](docs/guides/multimodal.md)
123
+ - [Audio (STT/TTS)](docs/guides/audio.md)
124
+ - [MCP & Tool Calling](docs/guides/mcp-tools.md)
125
+ - [Continuous Batching](docs/guides/continuous-batching.md)
126
+
127
+ - **Reference**
128
+ - [CLI Commands](docs/reference/cli.md)
129
+ - [Supported Models](docs/reference/models.md)
130
+ - [Configuration](docs/reference/configuration.md)
131
+
132
+ - **Benchmarks**
133
+ - [LLM Benchmarks](docs/benchmarks/llm.md)
134
+ - [Image Benchmarks](docs/benchmarks/image.md)
135
+ - [Video Benchmarks](docs/benchmarks/video.md)
136
+
137
+ ## Architecture
138
+
139
+ ```
140
+ ┌─────────────────────────────────────────────┐
141
+ │ vLLM API Layer │
142
+ │ (OpenAI-compatible interface) │
143
+ └─────────────────────────────────────────────┘
144
+
145
+
146
+ ┌─────────────────────────────────────────────┐
147
+ │ MLXPlatform │
148
+ │ (vLLM platform plugin for Apple Silicon) │
149
+ └─────────────────────────────────────────────┘
150
+
151
+ ┌───────────┴───────────┐
152
+ ▼ ▼
153
+ ┌──────────────────┐ ┌──────────────────┐
154
+ │ mlx-lm │ │ mlx-vlm │
155
+ │ (LLM inference) │ │ (MLLM inference) │
156
+ └──────────────────┘ └──────────────────┘
157
+ │ │
158
+ └───────────┬───────────┘
159
+
160
+ ┌─────────────────────────────────────────────┐
161
+ │ MLX │
162
+ │ (Apple ML Framework - Metal kernels) │
163
+ └─────────────────────────────────────────────┘
164
+ ```
165
+
166
+ ## Performance
167
+
168
+ **LLM Performance (M4 Max, 128GB):**
169
+
170
+ | Model | Speed | Memory |
171
+ |-------|-------|--------|
172
+ | Qwen3-0.6B-8bit | 402 tok/s | 0.7 GB |
173
+ | Llama-3.2-1B-4bit | 464 tok/s | 0.7 GB |
174
+ | Llama-3.2-3B-4bit | 200 tok/s | 1.8 GB |
175
+
176
+ **Continuous Batching (5 concurrent requests):**
177
+
178
+ | Model | Single | Batched | Speedup |
179
+ |-------|--------|---------|---------|
180
+ | Qwen3-0.6B-8bit | 328 tok/s | 1112 tok/s | **3.4x** |
181
+ | Llama-3.2-1B-4bit | 299 tok/s | 613 tok/s | **2.0x** |
182
+
183
+ See [benchmarks](docs/benchmarks/) for detailed results.
184
+
185
+ ## Contributing
186
+
187
+ We welcome contributions! See [Contributing Guide](docs/development/contributing.md) for details.
188
+
189
+ - Bug fixes and improvements
190
+ - Performance optimizations
191
+ - Documentation improvements
192
+ - Benchmarks on different Apple Silicon chips
193
+
194
+ Submit PRs to: [https://github.com/waybarrios/vllm-mlx](https://github.com/waybarrios/vllm-mlx)
195
+
196
+ ## License
197
+
198
+ Apache 2.0 - see [LICENSE](LICENSE) for details.
199
+
200
+ ## Citation
201
+
202
+ If you use vLLM-MLX in your research or project, please cite:
203
+
204
+ ```bibtex
205
+ @software{vllm_mlx2025,
206
+ author = {Barrios, Wayner},
207
+ title = {vLLM-MLX: Apple Silicon MLX Backend for vLLM},
208
+ year = {2025},
209
+ url = {https://github.com/waybarrios/vllm-mlx},
210
+ note = {Native GPU-accelerated LLM and vision-language model inference on Apple Silicon}
211
+ }
212
+ ```
213
+
214
+ ## Acknowledgments
215
+
216
+ - [MLX](https://github.com/ml-explore/mlx) - Apple's ML framework
217
+ - [mlx-lm](https://github.com/ml-explore/mlx-lm) - LLM inference library
218
+ - [mlx-vlm](https://github.com/Blaizzy/mlx-vlm) - Vision-language models
219
+ - [vLLM](https://github.com/vllm-project/vllm) - High-throughput LLM serving
@@ -0,0 +1,131 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "vllm-mlx"
7
+ version = "0.2.0"
8
+ description = "vLLM-like inference for Apple Silicon - GPU-accelerated Text, Image, Video & Audio on Mac"
9
+ readme = "README.md"
10
+ license = {text = "Apache-2.0"}
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ {name = "vllm-mlx contributors"}
14
+ ]
15
+ keywords = ["llm", "mlx", "apple-silicon", "vllm", "inference", "transformers"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: Developers",
19
+ "Intended Audience :: Science/Research",
20
+ "License :: OSI Approved :: Apache Software License",
21
+ "Operating System :: MacOS",
22
+ "Programming Language :: Python :: 3",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Programming Language :: Python :: 3.13",
27
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
28
+ ]
29
+
30
+ dependencies = [
31
+ "mlx>=0.29.0",
32
+ "mlx-lm>=0.20.0",
33
+ "mlx-vlm>=0.1.0", # VLM support
34
+ "transformers>=4.40.0,<5.0.0",
35
+ "tokenizers>=0.19.0",
36
+ "huggingface-hub>=0.23.0",
37
+ "numpy>=1.24.0",
38
+ "pillow>=10.0.0",
39
+ "tqdm>=4.66.0",
40
+ "pyyaml>=6.0",
41
+ "gradio>=4.0.0",
42
+ "requests>=2.28.0",
43
+ "tabulate>=0.9.0",
44
+ # Video processing for VLM
45
+ "opencv-python>=4.8.0",
46
+ # Resource monitoring
47
+ "psutil>=5.9.0",
48
+ # Server
49
+ "fastapi>=0.100.0",
50
+ "uvicorn>=0.23.0",
51
+ # MCP (Model Context Protocol) support
52
+ "mcp>=1.0.0",
53
+ # JSON Schema validation for structured output
54
+ "jsonschema>=4.0.0",
55
+ # Audio support (STT, TTS, audio processing)
56
+ "mlx-audio>=0.2.9",
57
+ ]
58
+
59
+ [project.optional-dependencies]
60
+ dev = [
61
+ "pytest>=7.0.0",
62
+ "pytest-asyncio>=0.21.0",
63
+ "black>=23.0.0",
64
+ "ruff>=0.1.0",
65
+ "mypy>=1.0.0",
66
+ ]
67
+ vllm = [
68
+ "vllm>=0.4.0",
69
+ ]
70
+ vision = [
71
+ "torch>=2.3.0",
72
+ "torchvision>=0.18.0",
73
+ ]
74
+ # Audio dependencies for TTS/STT (mlx-audio)
75
+ audio = [
76
+ "mlx-audio>=0.2.9",
77
+ "sounddevice>=0.4.0",
78
+ "soundfile>=0.12.0",
79
+ "scipy>=1.10.0",
80
+ "numba>=0.57.0",
81
+ "tiktoken>=0.5.0",
82
+ "misaki[zh,ja]>=0.5.0", # Chinese (zh) and Japanese (ja) support
83
+ "spacy>=3.7.0",
84
+ "num2words>=0.5.0",
85
+ "loguru>=0.7.0",
86
+ "phonemizer>=3.2.0",
87
+ # Additional multilingual dependencies
88
+ "ordered_set>=4.1.0", # Required for Chinese TTS
89
+ "cn2an>=0.5.0", # Chinese number conversion
90
+ "fugashi>=1.3.0", # Japanese tokenizer
91
+ "unidic-lite>=1.0.0", # Japanese dictionary for fugashi
92
+ "jieba>=0.42.0", # Chinese word segmentation
93
+ ]
94
+
95
+ [project.urls]
96
+ Homepage = "https://github.com/waybarrios/vllm-mlx"
97
+ Documentation = "https://github.com/waybarrios/vllm-mlx#readme"
98
+ Repository = "https://github.com/waybarrios/vllm-mlx"
99
+
100
+ [project.entry-points."vllm.platform_plugins"]
101
+ mlx = "vllm_mlx.plugin:mlx_platform_plugin"
102
+
103
+ [project.scripts]
104
+ vllm-mlx = "vllm_mlx.cli:main"
105
+ vllm-mlx-serve = "vllm_mlx.server_v2:main"
106
+ vllm-mlx-chat = "vllm_mlx.gradio_app:main"
107
+ vllm-mlx-bench = "vllm_mlx.benchmark:main"
108
+
109
+ [tool.setuptools.packages.find]
110
+ where = ["."]
111
+ include = ["vllm_mlx*"]
112
+
113
+ [tool.black]
114
+ line-length = 88
115
+ target-version = ["py310", "py311", "py312", "py313"]
116
+
117
+ [tool.ruff]
118
+ line-length = 88
119
+ select = ["E", "F", "W", "I", "N", "UP", "B", "SIM"]
120
+ ignore = ["E501", "B905"]
121
+
122
+ [tool.mypy]
123
+ python_version = "3.10"
124
+ warn_return_any = true
125
+ warn_unused_configs = true
126
+ ignore_missing_imports = true
127
+
128
+ [tool.pytest.ini_options]
129
+ testpaths = ["tests"]
130
+ python_files = ["test_*.py"]
131
+ asyncio_mode = "auto"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+