vmlx 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. vmlx-1.0.0.dist-info/METADATA +340 -0
  2. vmlx-1.0.0.dist-info/RECORD +99 -0
  3. vmlx-1.0.0.dist-info/WHEEL +5 -0
  4. vmlx-1.0.0.dist-info/entry_points.txt +9 -0
  5. vmlx-1.0.0.dist-info/top_level.txt +1 -0
  6. vmlx_engine/__init__.py +138 -0
  7. vmlx_engine/api/__init__.py +121 -0
  8. vmlx_engine/api/anthropic_adapter.py +609 -0
  9. vmlx_engine/api/models.py +757 -0
  10. vmlx_engine/api/streaming.py +210 -0
  11. vmlx_engine/api/tool_calling.py +677 -0
  12. vmlx_engine/api/utils.py +314 -0
  13. vmlx_engine/attention.py +245 -0
  14. vmlx_engine/audio/__init__.py +25 -0
  15. vmlx_engine/audio/processor.py +214 -0
  16. vmlx_engine/audio/stt.py +167 -0
  17. vmlx_engine/audio/tts.py +322 -0
  18. vmlx_engine/benchmark.py +1654 -0
  19. vmlx_engine/block_disk_store.py +770 -0
  20. vmlx_engine/cli.py +1287 -0
  21. vmlx_engine/commands/__init__.py +2 -0
  22. vmlx_engine/commands/convert.py +510 -0
  23. vmlx_engine/commands/doctor.py +309 -0
  24. vmlx_engine/commands/info.py +30 -0
  25. vmlx_engine/commands/list.py +38 -0
  26. vmlx_engine/disk_cache.py +468 -0
  27. vmlx_engine/embedding.py +109 -0
  28. vmlx_engine/engine/__init__.py +28 -0
  29. vmlx_engine/engine/base.py +201 -0
  30. vmlx_engine/engine/batched.py +810 -0
  31. vmlx_engine/engine/simple.py +721 -0
  32. vmlx_engine/engine_core.py +720 -0
  33. vmlx_engine/gradio_app.py +390 -0
  34. vmlx_engine/gradio_text_app.py +176 -0
  35. vmlx_engine/image_gen.py +275 -0
  36. vmlx_engine/mcp/__init__.py +85 -0
  37. vmlx_engine/mcp/client.py +370 -0
  38. vmlx_engine/mcp/config.py +186 -0
  39. vmlx_engine/mcp/executor.py +500 -0
  40. vmlx_engine/mcp/manager.py +302 -0
  41. vmlx_engine/mcp/security.py +699 -0
  42. vmlx_engine/mcp/tools.py +174 -0
  43. vmlx_engine/mcp/types.py +189 -0
  44. vmlx_engine/memory_cache.py +660 -0
  45. vmlx_engine/mllm_batch_generator.py +1800 -0
  46. vmlx_engine/mllm_cache.py +467 -0
  47. vmlx_engine/mllm_scheduler.py +2074 -0
  48. vmlx_engine/mlx_platform.py +333 -0
  49. vmlx_engine/model_config_registry.py +224 -0
  50. vmlx_engine/model_configs.py +684 -0
  51. vmlx_engine/model_registry.py +185 -0
  52. vmlx_engine/model_runner.py +456 -0
  53. vmlx_engine/models/__init__.py +15 -0
  54. vmlx_engine/models/llm.py +366 -0
  55. vmlx_engine/models/mllm.py +1965 -0
  56. vmlx_engine/multimodal_processor.py +200 -0
  57. vmlx_engine/optimizations.py +139 -0
  58. vmlx_engine/output_collector.py +247 -0
  59. vmlx_engine/paged_cache.py +1364 -0
  60. vmlx_engine/plugin.py +155 -0
  61. vmlx_engine/prefix_cache.py +1257 -0
  62. vmlx_engine/reasoning/__init__.py +101 -0
  63. vmlx_engine/reasoning/base.py +110 -0
  64. vmlx_engine/reasoning/deepseek_r1_parser.py +113 -0
  65. vmlx_engine/reasoning/gptoss_parser.py +336 -0
  66. vmlx_engine/reasoning/qwen3_parser.py +73 -0
  67. vmlx_engine/reasoning/think_parser.py +227 -0
  68. vmlx_engine/request.py +219 -0
  69. vmlx_engine/reranker.py +221 -0
  70. vmlx_engine/scheduler.py +2202 -0
  71. vmlx_engine/server.py +4700 -0
  72. vmlx_engine/simple.py +445 -0
  73. vmlx_engine/speculative.py +257 -0
  74. vmlx_engine/tool_parsers/__init__.py +83 -0
  75. vmlx_engine/tool_parsers/abstract_tool_parser.py +290 -0
  76. vmlx_engine/tool_parsers/auto_tool_parser.py +379 -0
  77. vmlx_engine/tool_parsers/deepseek_tool_parser.py +165 -0
  78. vmlx_engine/tool_parsers/functionary_tool_parser.py +188 -0
  79. vmlx_engine/tool_parsers/glm47_tool_parser.py +213 -0
  80. vmlx_engine/tool_parsers/granite_tool_parser.py +142 -0
  81. vmlx_engine/tool_parsers/hermes_tool_parser.py +235 -0
  82. vmlx_engine/tool_parsers/kimi_tool_parser.py +155 -0
  83. vmlx_engine/tool_parsers/llama_tool_parser.py +123 -0
  84. vmlx_engine/tool_parsers/minimax_tool_parser.py +338 -0
  85. vmlx_engine/tool_parsers/mistral_tool_parser.py +262 -0
  86. vmlx_engine/tool_parsers/nemotron_tool_parser.py +161 -0
  87. vmlx_engine/tool_parsers/qwen_tool_parser.py +152 -0
  88. vmlx_engine/tool_parsers/step3p5_tool_parser.py +232 -0
  89. vmlx_engine/tool_parsers/xlam_tool_parser.py +172 -0
  90. vmlx_engine/utils/__init__.py +6 -0
  91. vmlx_engine/utils/cache_types.py +196 -0
  92. vmlx_engine/utils/chat_templates.py +231 -0
  93. vmlx_engine/utils/jang_loader.py +567 -0
  94. vmlx_engine/utils/mamba_cache.py +327 -0
  95. vmlx_engine/utils/model_inspector.py +592 -0
  96. vmlx_engine/utils/nemotron_latent_moe.py +226 -0
  97. vmlx_engine/utils/tokenizer.py +247 -0
  98. vmlx_engine/vision_embedding_cache.py +219 -0
  99. vmlx_engine/worker.py +266 -0
@@ -0,0 +1,340 @@
1
+ Metadata-Version: 2.4
2
+ Name: vmlx
3
+ Version: 1.0.0
4
+ Summary: Local AI inference for Apple Silicon — Text, Image, Video & Audio generation on Mac
5
+ Author-email: Jinho Jang <eric@jangq.ai>
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/vmlxllm/vmlx
8
+ Project-URL: Documentation, https://github.com/vmlxllm/vmlx#readme
9
+ Project-URL: Repository, https://github.com/vmlxllm/vmlx
10
+ Keywords: llm,mlx,apple-silicon,vllm,inference,transformers
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Operating System :: MacOS
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ Requires-Dist: mlx>=0.29.0
25
+ Requires-Dist: mlx-lm>=0.30.2
26
+ Requires-Dist: mlx-vlm>=0.1.0
27
+ Requires-Dist: transformers>=4.40.0
28
+ Requires-Dist: tokenizers>=0.19.0
29
+ Requires-Dist: huggingface-hub>=0.23.0
30
+ Requires-Dist: numpy>=1.24.0
31
+ Requires-Dist: pillow>=10.0.0
32
+ Requires-Dist: tqdm>=4.66.0
33
+ Requires-Dist: pyyaml>=6.0
34
+ Requires-Dist: requests>=2.28.0
35
+ Requires-Dist: tabulate>=0.9.0
36
+ Requires-Dist: opencv-python-headless>=4.8.0
37
+ Requires-Dist: psutil>=5.9.0
38
+ Requires-Dist: fastapi>=0.100.0
39
+ Requires-Dist: uvicorn>=0.23.0
40
+ Requires-Dist: mcp>=1.0.0
41
+ Requires-Dist: jsonschema>=4.0.0
42
+ Requires-Dist: mlx-embeddings>=0.0.5
43
+ Provides-Extra: ui
44
+ Requires-Dist: gradio>=4.0.0; extra == "ui"
45
+ Requires-Dist: pytz>=2024.1; extra == "ui"
46
+ Provides-Extra: dev
47
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
48
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
49
+ Requires-Dist: black>=23.0.0; extra == "dev"
50
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
51
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
52
+ Provides-Extra: vllm
53
+ Requires-Dist: vllm>=0.4.0; extra == "vllm"
54
+ Provides-Extra: vision
55
+ Requires-Dist: torch>=2.3.0; extra == "vision"
56
+ Requires-Dist: torchvision>=0.18.0; extra == "vision"
57
+ Provides-Extra: audio
58
+ Requires-Dist: mlx-audio>=0.2.9; extra == "audio"
59
+ Requires-Dist: sounddevice>=0.4.0; extra == "audio"
60
+ Requires-Dist: soundfile>=0.12.0; extra == "audio"
61
+ Requires-Dist: scipy>=1.10.0; extra == "audio"
62
+ Requires-Dist: numba>=0.57.0; extra == "audio"
63
+ Requires-Dist: tiktoken>=0.5.0; extra == "audio"
64
+ Requires-Dist: misaki[ja,zh]>=0.5.0; extra == "audio"
65
+ Requires-Dist: spacy>=3.7.0; extra == "audio"
66
+ Requires-Dist: num2words>=0.5.0; extra == "audio"
67
+ Requires-Dist: loguru>=0.7.0; extra == "audio"
68
+ Requires-Dist: phonemizer>=3.2.0; extra == "audio"
69
+ Requires-Dist: ordered_set>=4.1.0; extra == "audio"
70
+ Requires-Dist: cn2an>=0.5.0; extra == "audio"
71
+ Requires-Dist: fugashi>=1.3.0; extra == "audio"
72
+ Requires-Dist: unidic-lite>=1.0.0; extra == "audio"
73
+ Requires-Dist: jieba>=0.42.0; extra == "audio"
74
+ Provides-Extra: jang
75
+ Requires-Dist: jang>=1.0.0; extra == "jang"
76
+ Provides-Extra: image
77
+ Requires-Dist: mflux>=0.16.0; extra == "image"
78
+
79
+ <p align="center">
80
+ <picture>
81
+ <source media="(prefers-color-scheme: dark)" srcset="https://vmlx.net/logos/png/wordmark-dark-600x150.png">
82
+ <source media="(prefers-color-scheme: light)" srcset="https://vmlx.net/logos/png/wordmark-light-600x150.png">
83
+ <img alt="vMLX" src="https://vmlx.net/logos/png/wordmark-transparent-600x150.png" width="400">
84
+ </picture>
85
+ </p>
86
+
87
+ <p align="center">
88
+ <strong>Native macOS AI inference — local models, remote endpoints, zero config</strong>
89
+ </p>
90
+
91
+ <p align="center">
92
+ <a href="https://vmlx.net">Website</a> · <a href="panel/CHANGELOG.md">Panel Changelog</a> · <a href="CHANGELOG.md">Engine Changelog</a> · <a href="docs/">Documentation</a>
93
+ </p>
94
+
95
+ ---
96
+
97
+ ## What is vMLX?
98
+
99
+ vMLX is a native macOS application for running AI models on Apple Silicon. It bundles a custom inference engine with a full-featured desktop interface — manage sessions, chat with models, download from HuggingFace, connect to remote APIs, and use agentic tool-calling workflows.
100
+
101
+ - **Local inference** with GPU acceleration via MLX
102
+ - **Remote endpoints** — connect to any OpenAI-compatible API
103
+ - **HuggingFace downloader** — search, download, and serve models in-app
104
+ - **Built-in tools** — file I/O, shell, search, image reading, ask_user interrupt
105
+ - **MCP integration** — Model Context Protocol tool servers (local sessions)
106
+
107
+ ---
108
+
109
+ ## Key Features
110
+
111
+ ### Inference Engine (v0.2.18)
112
+
113
+ | Feature | Description |
114
+ |---------|-------------|
115
+ | **Paged KV Cache** | Memory-efficient caching with prefix sharing and block-level reuse |
116
+ | **KV Cache Quantization** | Q4/Q8 quantized cache storage (2–4× memory savings) |
117
+ | **Prefix Cache** | Token-level prefix matching for fast prompt reuse across requests |
118
+ | **Continuous Batching** | Concurrent request handling with slot management |
119
+ | **VLM Caching** | Full KV cache pipeline for vision-language models (Qwen-VL, Gemma 3, etc.) |
120
+ | **Mamba Hybrid Support** | Auto-detects mixed KVCache + MambaCache models (Qwen3.5-VL, Qwen3-Coder-Next, Nemotron) |
121
+ | **Streaming Detokenizer** | Per-request UTF-8 buffering — emoji, CJK, Arabic render correctly |
122
+ | **Request Cancellation** | Stop inference mid-stream via API or connection close |
123
+ | **OpenAI-Compatible API** | Chat Completions + Responses API with full streaming support |
124
+ | **Speculative Decoding** | Draft model acceleration (20-90% speedup, zero quality loss) |
125
+
126
+ ### Desktop App (Panel v1.2.1)
127
+
128
+ | Feature | Description |
129
+ |---------|-------------|
130
+ | **Multi-session** | Run multiple models simultaneously on different ports |
131
+ | **Remote endpoints** | Connect to OpenAI, Groq, local vLLM, or any compatible API |
132
+ | **HuggingFace browser** | Search, download, and install MLX models with progress tracking |
133
+ | **Agentic tools** | File I/O, shell, search, image reading with auto-continue loops (up to 10 iterations) |
134
+ | **Per-chat settings** | Temperature, Top P/K, Min P, Repeat Penalty, Stop Sequences, Max Tokens |
135
+ | **Reasoning display** | Collapsible thinking sections for Qwen3, DeepSeek-R1, GLM-4.7 |
136
+ | **Tool parsers** | hermes, pythonic, llama3, mistral, minimax, qwen3, nemotron, step3p5, and more |
137
+ | **Auto-detection** | Reads model config JSON for automatic parser and cache type selection |
138
+ | **Persistent history** | SQLite-backed chat history with metrics, tool calls, and reasoning content |
139
+ | **Live metrics** | TTFT, tokens/sec, prompt processing speed, prefix cache hits |
140
+
141
+ ---
142
+
143
+ ## Quick Start
144
+
145
+ ### Desktop App (recommended)
146
+
147
+ ```bash
148
+ # Clone and build
149
+ git clone https://github.com/vmlxllm/vmlx.git
150
+ cd vmlx/panel
151
+
152
+ # Install dependencies
153
+ npm install
154
+
155
+ # Development mode
156
+ npm run dev
157
+
158
+ # Build and install to /Applications
159
+ bash scripts/build-and-install.sh
160
+ ```
161
+
162
+ ### Engine Only (CLI)
163
+
164
+ ```bash
165
+ # Install
166
+ uv tool install git+https://github.com/vmlxllm/vmlx.git
167
+ # or
168
+ pip install git+https://github.com/vmlxllm/vmlx.git
169
+
170
+ # Start server
171
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000
172
+
173
+ # With continuous batching
174
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 --continuous-batching
175
+
176
+ # With API key
177
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 --api-key your-key
178
+
179
+ # With speculative decoding (20-90% faster)
180
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 \
181
+ --speculative-model mlx-community/Llama-3.2-1B-Instruct-4bit \
182
+ --num-draft-tokens 3
183
+ ```
184
+
185
+ ### Use with OpenAI SDK
186
+
187
+ ```python
188
+ from openai import OpenAI
189
+
190
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
191
+
192
+ response = client.chat.completions.create(
193
+ model="default",
194
+ messages=[{"role": "user", "content": "Hello!"}],
195
+ )
196
+ print(response.choices[0].message.content)
197
+ ```
198
+
199
+ ---
200
+
201
+ ## API Endpoints
202
+
203
+ | Endpoint | Description |
204
+ |----------|-------------|
205
+ | `POST /v1/chat/completions` | Chat Completions API (streaming) |
206
+ | `POST /v1/responses` | Responses API (streaming) |
207
+ | `GET /v1/models` | List loaded models |
208
+ | `GET /health` | Server health + model info |
209
+ | `POST /v1/mcp/execute` | Execute MCP tool |
210
+ | `GET /v1/cache/stats` | Prefix cache statistics |
211
+ | `POST /v1/cache/warm` | Pre-warm cache with prompt |
212
+ | `DELETE /v1/cache` | Clear prefix cache |
213
+ | `POST /v1/chat/completions/{id}/cancel` | Cancel an in-progress inference request (stops GPU work) |
214
+ | `POST /v1/embeddings` | Text embeddings (mlx-embeddings) |
215
+
216
+ ---
217
+
218
+ ## Reasoning Models
219
+
220
+ Extract the thinking process from reasoning-capable models:
221
+
222
+ ```bash
223
+ vmlx-engine serve mlx-community/Qwen3-8B-4bit --reasoning-parser qwen3
224
+ ```
225
+
226
+ | Parser | Models | Format |
227
+ |--------|--------|--------|
228
+ | `qwen3` | Qwen3, QwQ, MiniMax M2/M2.5, StepFun | `<think>` / `</think>` tags |
229
+ | `deepseek_r1` | DeepSeek-R1, Gemma 3, Phi-4 Reasoning, GLM-4.7, GLM-Z1 | Lenient `<think>` (handles missing open tag) |
230
+ | `openai_gptoss` | GLM-4.7 Flash, GPT-OSS | Harmony `<\|channel\|>analysis/final` protocol |
231
+
232
+ ---
233
+
234
+ ## Tool Calling
235
+
236
+ Built-in agentic tools available in the desktop app:
237
+
238
+ | Category | Tools |
239
+ |----------|-------|
240
+ | **File** | read_file, write_file, edit_file, patch_file, batch_edit, copy, move, delete, create_directory, list_directory, read_image |
241
+ | **Search** | search_files, find_files, file_info, get_diagnostics, get_tree, diff_files |
242
+ | **Shell** | run_command, spawn_process, get_process_output |
243
+ | **Web** | fetchUrl, brave_search |
244
+ | **Utility** | ask_user (interactive interrupt) |
245
+
246
+ Plus MCP tool server passthrough for local sessions.
247
+
248
+ ---
249
+
250
+ ## Architecture
251
+
252
+ ```
253
+ ┌─────────────────────────────────────────────────────────┐
254
+ │ vMLX Desktop App │
255
+ │ (Electron + React + TypeScript) │
256
+ └─────────────────────────────────────────────────────────┘
257
+
258
+ ┌────────────┴────────────┐
259
+ ▼ ▼
260
+ ┌──────────────────────┐ ┌──────────────────────┐
261
+ │ Local vmlx-engine │ │ Remote Endpoints │
262
+ │ (spawned process) │ │ (OpenAI, Groq, etc.) │
263
+ └──────────────────────┘ └──────────────────────┘
264
+
265
+
266
+ ┌─────────────────────────────────────────────────────────┐
267
+ │ vMLX Engine │
268
+ │ (FastAPI + MLX inference + caching) │
269
+ └─────────────────────────────────────────────────────────┘
270
+
271
+ ┌─────────┼──────────┬──────────┐
272
+ ▼ ▼ ▼ ▼
273
+ ┌────────┐┌────────┐┌────────┐┌────────────┐
274
+ │ mlx-lm ││mlx-vlm ││mlx-aud ││mlx-embed │
275
+ │ (LLMs) ││(Vision)││(Audio) ││(Embeddings)│
276
+ └────────┘└────────┘└────────┘└────────────┘
277
+
278
+
279
+ ┌─────────────────────────────────────────────────────────┐
280
+ │ Apple MLX │
281
+ │ (Metal GPU + Unified Memory) │
282
+ └─────────────────────────────────────────────────────────┘
283
+ ```
284
+
285
+ ---
286
+
287
+ ## Tech Stack
288
+
289
+ | Layer | Technology |
290
+ |-------|-----------|
291
+ | Desktop app | Electron 28 + React 18 + TypeScript |
292
+ | Styling | Tailwind CSS |
293
+ | Database | SQLite (WAL mode, better-sqlite3) |
294
+ | Inference engine | vMLX Engine v0.2.18 (Python, FastAPI) |
295
+ | ML framework | Apple MLX (Metal GPU acceleration) |
296
+ | Build | electron-vite + electron-builder |
297
+ | Tests | Vitest (panel: 542 tests), pytest (engine: 1595 tests) |
298
+ | Python | Bundled relocatable Python 3.12 |
299
+
300
+ ---
301
+
302
+ ## Recent Changes
303
+
304
+ ### Panel v1.2.1 / Engine v0.2.18 (2026-03-09)
305
+ - **Tool calling fix**: `enableAutoToolChoice` default changed from `false` to `undefined` (auto-detect) — MCP and built-in tools now work out of the box without manual enable
306
+ - **MCP tool result truncation**: MCP tool results now capped at same limit as built-in tools (50KB default) to prevent context overflow
307
+ - **Command preview parity**: `buildCommandPreview` in SessionSettings now matches actual `buildArgs` logic for auto-tool-choice flags
308
+ - **Old config migration**: Stored sessions with `enableAutoToolChoice: false` auto-migrate to `undefined` on load
309
+ - **2137 total tests**: 1595 engine + 542 panel (12 new regression tests for tool calling and MCP)
310
+
311
+ ### Panel v1.2.0 / Engine v0.2.18 (2026-03-09)
312
+ - **HuggingFace download fix**: Download progress no longer stuck at 0% — tqdm `\r` chunk splitting, ANSI stripping, highest-percent extraction
313
+ - **HF browser NaN/Unknown fix**: Model ages and authors display correctly (uses `createdAt` fallback, extracts author from modelId)
314
+ - **macOS 15 launch fix**: `minimumSystemVersion` corrected from 26.0.0 to 14.0.0 (fixes GitHub #10)
315
+ - **Deep stability audit**: 14 fixes across paged cache block lifecycle, KV dequantize safety, reasoning marker detection, tool fallback, Mistral JSON validation
316
+ - **CancelledError SSE hang**: Engine cancellation now unblocks all waiting SSE consumers
317
+ - **2125 total tests**: 1595 engine + 530 panel with full regression coverage
318
+
319
+ ### Panel v1.1.4 / Engine v0.2.12 (2026-03-07)
320
+ - **tool_choice="none" fix**: Content no longer swallowed when tool markers detected with tools suppressed
321
+ - **suppress_reasoning**: Reasoning leaks plugged in both API paths
322
+ - **First-launch UX**: Auto-creates initial chat, dynamic About page version
323
+ - **1571 engine tests**, **530 panel tests** across 6 vitest suites
324
+
325
+ See [Panel Changelog](panel/CHANGELOG.md) and [Engine Changelog](CHANGELOG.md) for full history.
326
+
327
+ ---
328
+
329
+ ## Current Version
330
+
331
+ **Engine v0.2.18** / **Panel v1.2.1** — requires macOS 26+ (Tahoe) for local inference or macOS 14+ for remote endpoints, on Apple Silicon (M1, M2, M3, M4).
332
+
333
+ ## Links
334
+
335
+ - **Website**: [vmlx.net](https://vmlx.net)
336
+ - **Contact**: admin@vmlx.net
337
+
338
+ ## License
339
+
340
+ Apache 2.0 — see [LICENSE](LICENSE) for details.
@@ -0,0 +1,99 @@
1
+ vmlx_engine/__init__.py,sha256=fdRdlj9A2xlAgdLmTeJLsrQCwMKK2I8fYQdG3SVNkv8,3786
2
+ vmlx_engine/attention.py,sha256=OnPcOUsK0Kzs9u_K4VS6Xa6Ss8piMsNlHVkxsAB2nlg,7139
3
+ vmlx_engine/benchmark.py,sha256=_GC86qPz76E2NPHLDYInYrvJ_By8JIq65SjXsBk6xS4,55298
4
+ vmlx_engine/block_disk_store.py,sha256=xy1ls9RmxEGIHSS72Zzmw_twFXf32cVaqQ-65ql5ois,29097
5
+ vmlx_engine/cli.py,sha256=Snr_kbaDnVJdGjP-Tq2kZmxt1JJckFsqlxGc9iUuJz0,51056
6
+ vmlx_engine/disk_cache.py,sha256=x4S15n2FjSFNfL67fS21D4Fv-JGS-fcBDxcP53w-97s,17037
7
+ vmlx_engine/embedding.py,sha256=dkO6YjP2O4tLJEeaig4v4wkTqByuDMG3MJuZlSFVmTw,3397
8
+ vmlx_engine/engine_core.py,sha256=bToRsn8ybXPkl1aSz6_A_zTeGwDNUBLGEECY_-xC7Qc,27504
9
+ vmlx_engine/gradio_app.py,sha256=5kEuDf-gkLBHydorh8DCDxLT0qrvQT6K9loGhlnV7gQ,13118
10
+ vmlx_engine/gradio_text_app.py,sha256=_0q5OevibP-KSER82qAE7PwvheekStS0PBSG6BemEAE,5088
11
+ vmlx_engine/image_gen.py,sha256=OSfvJk17sDHgyJDw6ZoApLeRXzwiELnv_P_eK5QDOSk,9232
12
+ vmlx_engine/memory_cache.py,sha256=W7oLaMspIgq-yhLGrFDsu5-KthR3LgmBxtIp4ccYL38,23958
13
+ vmlx_engine/mllm_batch_generator.py,sha256=0JQk-OJMFtCYAZ783NBQgSPxerEmMg3fPoXE3-edJh4,80639
14
+ vmlx_engine/mllm_cache.py,sha256=E2nJx-7L1rMj-uzMsszP4mNfYLYjcfWY0RxdbNrvJSU,16548
15
+ vmlx_engine/mllm_scheduler.py,sha256=SLj006oozcz3UKUMianf_SIIf1mCs4-Sx9k2Jh3TvlU,89599
16
+ vmlx_engine/mlx_platform.py,sha256=CfzG8mIQP7OK9pX_vWRUy7OZLruWsjTnBPxoMF-unCY,10116
17
+ vmlx_engine/model_config_registry.py,sha256=6_qUByzetyf248iONNqGhZd7TMfXm-vu52BegCleKbQ,7856
18
+ vmlx_engine/model_configs.py,sha256=0SQhq_7IqF3fYmWZ5ggQ51dekiw1h_vBVgAGilsk7cM,18640
19
+ vmlx_engine/model_registry.py,sha256=bOgEVbUgRZvl4jA-qnnsNrH3aFvYFwHeeI_8ejKWANw,5827
20
+ vmlx_engine/model_runner.py,sha256=5oakkkNSqtzTap9A1Xw8scEpc0AEnRHOn0FMx9zaG7k,15681
21
+ vmlx_engine/multimodal_processor.py,sha256=qzDcZzszmQ5DxPZ7-RtwdTWt3D0j-QqgCJNd_FI8X1Y,6722
22
+ vmlx_engine/optimizations.py,sha256=xiVkF2qXhe9rEraHMJmLqCDmpZZLVFDwTAa80hQn5UU,4103
23
+ vmlx_engine/output_collector.py,sha256=aNUIf5Lmgeu-jdl40n67m50B8iDHvoxxBwqCKSOAQ84,8075
24
+ vmlx_engine/paged_cache.py,sha256=rQgcFixssVrtUL8ZvcMGrRbPwA9_onrdkbOv25aeNEQ,47190
25
+ vmlx_engine/plugin.py,sha256=78NTg1Pi2D4VamUKlZhUAdhR9q3SNH_rYLDmHZYYkN8,3926
26
+ vmlx_engine/prefix_cache.py,sha256=Yr9zcOfs8hNwMzMXK7vU6aA3csDpfw4c9R543txjBdk,48108
27
+ vmlx_engine/request.py,sha256=eZOzNX6sTj07RVSTxvyUKa2MYARftKPqbNpCY5-sfX8,7550
28
+ vmlx_engine/reranker.py,sha256=TjBCAI8qoRndeuUvrG105CcooU6jooH9FFvXun3dkeE,7301
29
+ vmlx_engine/scheduler.py,sha256=Ot7sj2snRJ-qu_p5E6Cwec3wR5yGyRkjLGl8yf0eTGQ,101206
30
+ vmlx_engine/server.py,sha256=lgL69yBWwrRE84WzomFpp_xKkhi4vMgHRDusMs5w5zc,193534
31
+ vmlx_engine/simple.py,sha256=ibUBsfInVD_r-qQTLr9LATBrw24ZfyVnPJjW-myOUa4,15110
32
+ vmlx_engine/speculative.py,sha256=fcMC8_ZhL3XRGixNDMa4PLbcrT4fiUvzBZvfVCs5rGE,8183
33
+ vmlx_engine/vision_embedding_cache.py,sha256=VK5Gxc75UPTfYxKoVUsRjLLLOyqn2e8QYON1Fz3KKh8,6873
34
+ vmlx_engine/worker.py,sha256=RPVRtSg9nlQIQNu-vuKLG3Kdp7uMh3KlVEjWZ0RnGjg,8627
35
+ vmlx_engine/api/__init__.py,sha256=-qlYDRLM_ehkJzOatKGtAECZyjoaacWTPI4me2Xcm6I,2614
36
+ vmlx_engine/api/anthropic_adapter.py,sha256=_UjQlaP5gERYif69cwra5iRioZuaFJya8KVgSPYLKR8,22239
37
+ vmlx_engine/api/models.py,sha256=jI0IkgdDG3fG3VqV76xikr1p1fpqM0KN9raxag145eA,22697
38
+ vmlx_engine/api/streaming.py,sha256=8Vbix5ulFgZNlN294bXwaimvtSUt-DyVhXYVd7RoqSI,7372
39
+ vmlx_engine/api/tool_calling.py,sha256=nw6XW1RkPlpQL1-fzmmUNXuPgbNacqThb7MVzS_1bk0,23111
40
+ vmlx_engine/api/utils.py,sha256=JHL0in1jJAL2Eols9B0yvEbpvRmWHsUbAaCLTN_3u0g,11611
41
+ vmlx_engine/audio/__init__.py,sha256=veFwcZDlL3E4YE9ewCr1-ehM0EoN7Jo-2HVtuqfXAsw,578
42
+ vmlx_engine/audio/processor.py,sha256=b7rIKNbsiA23O0anzZ0YdLGBEKp3sTDLkNWiIZqqixc,6561
43
+ vmlx_engine/audio/stt.py,sha256=6l4_Crg8KpKmItsIOwkLQ5dTiYge0hdyWYOx3lhLsU0,5076
44
+ vmlx_engine/audio/tts.py,sha256=TrrNFjo4_2_-rirMTUZVpJBfg0oYRAN6AgsX_7TcPa4,8942
45
+ vmlx_engine/commands/__init__.py,sha256=BWEp1IBfddVrcik90AHCtBlVbtsJO3uor6Eo5qGHFAk,74
46
+ vmlx_engine/commands/convert.py,sha256=oGI2JM19CXK2bhmRmOeRFJU4esgkSG-ZY1j-FsNQrZ0,17574
47
+ vmlx_engine/commands/doctor.py,sha256=BgsVGhpQZSuJ54NsaJQqUjJG_vaBd9lcETuWtRDm4eU,10138
48
+ vmlx_engine/commands/info.py,sha256=NyrrLnRoxD9IjSZ90x3kgMshTZ2GNaxZmOcffjKXoJg,671
49
+ vmlx_engine/commands/list.py,sha256=gGuc88GNYbv1J2OJwR2eaOnMISPNSrBuSjFnRh6zvv8,1149
50
+ vmlx_engine/engine/__init__.py,sha256=Jm93jgs2EVplFYqXKF6k1N3wsZ1q0szM1EgIBKeg-hI,771
51
+ vmlx_engine/engine/base.py,sha256=_qW-_7_qtQzdVl6bZFpgSUZqBpHAUK0SEpMsg-DiO7Q,5541
52
+ vmlx_engine/engine/batched.py,sha256=1KV50l2oSBtRiffy87MTGKaroufB1la4DsdiJUDBOKw,32660
53
+ vmlx_engine/engine/simple.py,sha256=YN2Hha1nvTKeVvbWrtPcD2iQRrlNt_Kymkk_VNRNMXk,28797
54
+ vmlx_engine/mcp/__init__.py,sha256=mpX1cOkGxezkMFg2ASdE8pW6Anfo-i1fYp4tlC39a2g,2124
55
+ vmlx_engine/mcp/client.py,sha256=nP62shUP2kf5txLJjuogxoqgLQ0y4JY6mol6c1FbftM,12363
56
+ vmlx_engine/mcp/config.py,sha256=ZXp6w6YBINmZHWf7PNZBZyyCuPk-DVMwSq8XO8E2Zdw,5229
57
+ vmlx_engine/mcp/executor.py,sha256=5Qe-hieGPW_COoX2MDs-Y6h248OAgF6wGHLHlE542h0,16459
58
+ vmlx_engine/mcp/manager.py,sha256=SvHmOHIQT-PWsg0na_Xo6Yyh-FQiw5Z0qHvVnAxMLJQ,8991
59
+ vmlx_engine/mcp/security.py,sha256=-B-PDwix7YUllc8SFku3zuI4hHHFsS8gt2h3Qfl0AQM,23719
60
+ vmlx_engine/mcp/tools.py,sha256=gX6CDAD6scQulaHj_p4KexDjS8upGZmifyVg7_MJtAc,4405
61
+ vmlx_engine/mcp/types.py,sha256=IU6zWy9kyZFIciNPnm3FICmCuq9Lq2xhgF9i9wJ2Nt8,5170
62
+ vmlx_engine/models/__init__.py,sha256=Y0tba7FfCsZqr_OA4FaAW7ndXmom13b0CgdJVoI8kPI,442
63
+ vmlx_engine/models/llm.py,sha256=T-h2yeLkQTpQkak80ZGdeYnRcuWVd7xdHYoTrqPNuSo,12042
64
+ vmlx_engine/models/mllm.py,sha256=JQSeruPgVpYOo6_frsOLtqy1FMd-KCagQWVQuzE0Mj0,70665
65
+ vmlx_engine/reasoning/__init__.py,sha256=IhiUam8mASfxMakZ0xDeQtRtpZ1tyHm4o40hfBYswD0,2773
66
+ vmlx_engine/reasoning/base.py,sha256=Po-W6B5fPxl1MlFnddYLtR6vH2PxDQBJXgp_W51GbLQ,3462
67
+ vmlx_engine/reasoning/deepseek_r1_parser.py,sha256=xhIHgCDYxf1bfwK4ejikx4QRp-iBfgrZrNES93jWFyk,4119
68
+ vmlx_engine/reasoning/gptoss_parser.py,sha256=wvbFYneLzPib8ehxijtZUUNcEWyTGJYUag-hZk9vKnM,14324
69
+ vmlx_engine/reasoning/qwen3_parser.py,sha256=k8CqjArFDZlYl810SCcrA9GzlzYNINm33VxzadLZDgE,2545
70
+ vmlx_engine/reasoning/think_parser.py,sha256=zIce3GaAm1ZGcTqjWx0UZslZ4rIh_UzHk74Dos8XbPg,8888
71
+ vmlx_engine/tool_parsers/__init__.py,sha256=sMy9yR9kOM4EDvqsJ1pJN91dwzf_MSk3t59BTzuGBrA,2799
72
+ vmlx_engine/tool_parsers/abstract_tool_parser.py,sha256=NiwhncFKGx0IpCYrLFt0CFnI-4y66Kgff7JpI_V2Wew,9597
73
+ vmlx_engine/tool_parsers/auto_tool_parser.py,sha256=wdRYsuDw1SiTHzul-PfhRUGy2p9VIt8z_g7Zbp3t8Lo,14144
74
+ vmlx_engine/tool_parsers/deepseek_tool_parser.py,sha256=zm4B8uYEEZtgJ_3zQTVfWeVl0grfjxq9l5pNTHjHydE,5639
75
+ vmlx_engine/tool_parsers/functionary_tool_parser.py,sha256=SXiNBErgr79wQRijeFDhh_wEJ6fXO1kMReL8nC-0Tfg,6494
76
+ vmlx_engine/tool_parsers/glm47_tool_parser.py,sha256=HYjk4auMSAZGhtPHIz4qoW7zvEPxoK8evMI3eIRpUxw,7909
77
+ vmlx_engine/tool_parsers/granite_tool_parser.py,sha256=kDYG2vuCm0xDczy58pyDDQiiZMctP7UfwDSSwAr_5qE,4740
78
+ vmlx_engine/tool_parsers/hermes_tool_parser.py,sha256=USwc1d0FZ8hUestH_NkI1m8586F3zxxilfiX7xhqkTo,9559
79
+ vmlx_engine/tool_parsers/kimi_tool_parser.py,sha256=Qhp7ys5FDSPuEPvsBdJandkhoe-2ET5mamVsp8Ghf4Y,5331
80
+ vmlx_engine/tool_parsers/llama_tool_parser.py,sha256=uU8CilBQbWy8pYMjUeLYxbMbZFGToRITm5Ua_Zq-1eI,4004
81
+ vmlx_engine/tool_parsers/minimax_tool_parser.py,sha256=zFIUtJy6AOws86qE5wGUlnUu0PCwSQ7y05UrW0Vduvc,12276
82
+ vmlx_engine/tool_parsers/mistral_tool_parser.py,sha256=0H2HtR2HXRWm_GEs-e-HHzRmf9SaRMm5vyXfHkzSM-E,9455
83
+ vmlx_engine/tool_parsers/nemotron_tool_parser.py,sha256=YlHsxc_NypHJF_fpnGSxEziIe6Xywf-IDhcO1vfPogE,5364
84
+ vmlx_engine/tool_parsers/qwen_tool_parser.py,sha256=kQldPdxL2bqRMOMACF5OUp2p5GDBgX4rRPYQo_vQIPs,5324
85
+ vmlx_engine/tool_parsers/step3p5_tool_parser.py,sha256=OXy-PjGkR-zSwjOGBd5_fTlA5aeruPfOX3DB62buMRA,7857
86
+ vmlx_engine/tool_parsers/xlam_tool_parser.py,sha256=Xkxeq8R3jZl4Ovc_gXnDWC8sUWztqURthexQur-V8S4,5754
87
+ vmlx_engine/utils/__init__.py,sha256=CzYWXj2cfLI59cCbkpFTiSsHHgnlT6RFfof6ruJLNyY,166
88
+ vmlx_engine/utils/cache_types.py,sha256=NJ-6u2jfyBshbFNIJ0Wfm5FTiUczrom5RhpbpczgMlM,6191
89
+ vmlx_engine/utils/chat_templates.py,sha256=q4kyDOq7NW3xUbTegn-7eXTF_0NBIwS4tVxGW5uqUM0,11528
90
+ vmlx_engine/utils/jang_loader.py,sha256=j6BKtKN05tBiauMpmdyvZKQ08lZBRSuOnJxHE0LrF1A,21898
91
+ vmlx_engine/utils/mamba_cache.py,sha256=Mjju8utGS9pTfK4Os4kd9QsQX4Tzt6qRRi8gYzqa_W8,12567
92
+ vmlx_engine/utils/model_inspector.py,sha256=L600Cw0vm4ZLui_Qxib_zINsGDu5SUCv1GR-26PN5UQ,21151
93
+ vmlx_engine/utils/nemotron_latent_moe.py,sha256=dDY8ENbFhE-vnirEokWWdg7hhqfQ4pNyG6Ud0xbi7AE,8764
94
+ vmlx_engine/utils/tokenizer.py,sha256=emzGrBwscFiP0Ci5ZFgVxlRhJa64hiUHrQvkDpPKQSA,9317
95
+ vmlx-1.0.0.dist-info/METADATA,sha256=7Apcm4VI6NXvmmGMIP8W8hG9aUhQfg6xYUDA7yHQYo8,15010
96
+ vmlx-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
97
+ vmlx-1.0.0.dist-info/entry_points.txt,sha256=8mdqB1DfpzN621JwfowYRrbiYziJohYxXdpynFujhrQ,279
98
+ vmlx-1.0.0.dist-info/top_level.txt,sha256=lpCt_1sm95ED0QLpzudgm3VGOkaD9q9MZCkC8NDTkak,12
99
+ vmlx-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,9 @@
1
+ [console_scripts]
2
+ vmlx = vmlx_engine.cli:main
3
+ vmlx-engine = vmlx_engine.cli:main
4
+ vmlx-engine-bench = vmlx_engine.benchmark:main
5
+ vmlx-engine-chat = vmlx_engine.gradio_app:main
6
+ vmlx-serve = vmlx_engine.cli:main
7
+
8
+ [vllm.platform_plugins]
9
+ mlx = vmlx_engine.plugin:mlx_platform_plugin
@@ -0,0 +1 @@
1
+ vmlx_engine
@@ -0,0 +1,138 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """
3
+ vmlx-engine: Apple Silicon MLX backend for vLLM
4
+
5
+ This package provides native Apple Silicon GPU acceleration for vLLM
6
+ using Apple's MLX framework, mlx-lm for LLMs, and mlx-vlm for
7
+ vision-language models.
8
+
9
+ Features:
10
+ - Continuous batching via vLLM-style scheduler
11
+ - OpenAI-compatible API server
12
+ - Support for LLM and multimodal models
13
+ """
14
+
15
+ __version__ = "0.2.18"
16
+
17
+ # All imports are lazy to allow usage on non-Apple Silicon platforms
18
+ # (e.g., CI running on Linux) where mlx_lm is not available.
19
+
20
+
21
# Lazily exported public names, mapped to ``(submodule, attribute)`` inside
# the ``vmlx_engine`` package. The table holds only strings, so building it
# at import time never pulls in any of the submodules themselves.
_LAZY_EXPORTS = {
    # Request management
    "Request": ("request", "Request"),
    "RequestOutput": ("request", "RequestOutput"),
    "RequestStatus": ("request", "RequestStatus"),
    "SamplingParams": ("request", "SamplingParams"),
    # Scheduler
    "Scheduler": ("scheduler", "Scheduler"),
    "SchedulerConfig": ("scheduler", "SchedulerConfig"),
    "SchedulerOutput": ("scheduler", "SchedulerOutput"),
    # Engine
    "EngineCore": ("engine_core", "EngineCore"),
    "AsyncEngineCore": ("engine_core", "AsyncEngineCore"),
    "EngineConfig": ("engine_core", "EngineConfig"),
    # Prefix cache
    "PrefixCacheManager": ("prefix_cache", "PrefixCacheManager"),
    "PrefixCacheStats": ("prefix_cache", "PrefixCacheStats"),
    "BlockAwarePrefixCache": ("prefix_cache", "BlockAwarePrefixCache"),
    # Paged cache
    "PagedCacheManager": ("paged_cache", "PagedCacheManager"),
    "CacheBlock": ("paged_cache", "CacheBlock"),
    "BlockTable": ("paged_cache", "BlockTable"),
    "CacheStats": ("paged_cache", "CacheStats"),
    # MLLM cache
    "MLLMCacheManager": ("mllm_cache", "MLLMCacheManager"),
    "MLLMCacheStats": ("mllm_cache", "MLLMCacheStats"),
    # Legacy VLM aliases: the old public names resolve to the MLLM classes.
    "VLMCacheManager": ("mllm_cache", "MLLMCacheManager"),
    "VLMCacheStats": ("mllm_cache", "MLLMCacheStats"),
    # Model registry
    "get_registry": ("model_registry", "get_registry"),
    "ModelOwnershipError": ("model_registry", "ModelOwnershipError"),
    # Model config registry
    "get_model_config_registry": (
        "model_config_registry",
        "get_model_config_registry",
    ),
    "ModelConfigRegistry": ("model_config_registry", "ModelConfigRegistry"),
    "ModelConfig": ("model_config_registry", "ModelConfig"),
    # vLLM integration components (require torch)
    "MLXPlatform": ("mlx_platform", "MLXPlatform"),
    "MLXWorker": ("worker", "MLXWorker"),
    "MLXModelRunner": ("model_runner", "MLXModelRunner"),
    "MLXAttentionBackend": ("attention", "MLXAttentionBackend"),
}


def __getattr__(name):
    """Resolve a public package attribute on first access (PEP 562 hook).

    Python calls this only when ``name`` is not found in the module's
    namespace. The owning submodule is imported at that point rather than at
    package import time, so the package itself can be imported on platforms
    where mlx_lm is not available.

    Args:
        name: Attribute requested on the package, e.g. ``"Scheduler"`` or the
            legacy alias ``"VLMCacheManager"``.

    Returns:
        The object exported under ``name`` by its owning submodule. Legacy
        ``VLM*`` names return the corresponding ``MLLM*`` object.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names,
            or the owning submodule does not define the target attribute.
        ImportError: If the owning submodule (or one of its dependencies)
            cannot be imported.
    """
    try:
        submodule, attribute = _LAZY_EXPORTS[name]
    except KeyError:
        # ``from None`` keeps the internal KeyError out of the traceback;
        # callers (and ``hasattr``) only care about the AttributeError.
        raise AttributeError(
            f"module {__name__!r} has no attribute {name!r}"
        ) from None

    # Imported locally so the hook is self-contained; importlib is already
    # loaded by the interpreter, so this is only a name binding.
    import importlib

    module = importlib.import_module(f"vmlx_engine.{submodule}")
    return getattr(module, attribute)
97
+
98
+
99
# Public API of the package. Apart from ``__version__``, every name listed
# here is resolved on first access by the module-level ``__getattr__``, so
# this list must stay in sync with the names that hook serves.
__all__ = [
    # vLLM integration components (lazy loaded, require torch)
    "MLXPlatform",
    "MLXWorker",
    "MLXModelRunner",
    "MLXAttentionBackend",
    # Request management
    "Request",
    "RequestOutput",
    "RequestStatus",
    "SamplingParams",
    # Scheduler
    "Scheduler",
    "SchedulerConfig",
    "SchedulerOutput",
    # Engine
    "EngineCore",
    "AsyncEngineCore",
    "EngineConfig",
    # Model registry
    "get_registry",
    "ModelOwnershipError",
    # Model config registry (served by __getattr__ but previously unlisted)
    "get_model_config_registry",
    "ModelConfigRegistry",
    "ModelConfig",
    # Prefix cache (LLM)
    "PrefixCacheManager",
    "PrefixCacheStats",
    "BlockAwarePrefixCache",
    # Paged cache (memory efficiency)
    "PagedCacheManager",
    "CacheBlock",
    "BlockTable",
    "CacheStats",
    # MLLM cache (images/videos)
    "MLLMCacheManager",
    "MLLMCacheStats",
    # Legacy aliases
    "VLMCacheManager",
    "VLMCacheStats",
    # Version
    "__version__",
]