vmlx 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164) hide show
  1. vmlx-1.0.0/PKG-INFO +340 -0
  2. vmlx-1.0.0/README.md +262 -0
  3. vmlx-1.0.0/pyproject.toml +146 -0
  4. vmlx-1.0.0/setup.cfg +4 -0
  5. vmlx-1.0.0/tests/test_anthropic_adapter.py +1004 -0
  6. vmlx-1.0.0/tests/test_api_models.py +798 -0
  7. vmlx-1.0.0/tests/test_api_utils.py +429 -0
  8. vmlx-1.0.0/tests/test_audio.py +362 -0
  9. vmlx-1.0.0/tests/test_audit_fixes.py +609 -0
  10. vmlx-1.0.0/tests/test_batching.py +648 -0
  11. vmlx-1.0.0/tests/test_batching_deterministic.py +459 -0
  12. vmlx-1.0.0/tests/test_cache_isolation.py +197 -0
  13. vmlx-1.0.0/tests/test_cache_types.py +253 -0
  14. vmlx-1.0.0/tests/test_cancellation.py +63 -0
  15. vmlx-1.0.0/tests/test_cli_commands.py +292 -0
  16. vmlx-1.0.0/tests/test_continuous_batching.py +308 -0
  17. vmlx-1.0.0/tests/test_disk_cache_unit.py +239 -0
  18. vmlx-1.0.0/tests/test_embeddings.py +293 -0
  19. vmlx-1.0.0/tests/test_emoji_comprehensive.py +310 -0
  20. vmlx-1.0.0/tests/test_engine_audit.py +2155 -0
  21. vmlx-1.0.0/tests/test_gemma3_27b_comprehensive.py +384 -0
  22. vmlx-1.0.0/tests/test_health_endpoint.py +170 -0
  23. vmlx-1.0.0/tests/test_hybrid_batching.py +1044 -0
  24. vmlx-1.0.0/tests/test_jit_toggle.py +116 -0
  25. vmlx-1.0.0/tests/test_kv_quant.py +145 -0
  26. vmlx-1.0.0/tests/test_llm.py +115 -0
  27. vmlx-1.0.0/tests/test_mcp_security.py +760 -0
  28. vmlx-1.0.0/tests/test_medgemma_comprehensive.py +494 -0
  29. vmlx-1.0.0/tests/test_memory_cache.py +623 -0
  30. vmlx-1.0.0/tests/test_mllm.py +668 -0
  31. vmlx-1.0.0/tests/test_mllm_cache.py +1230 -0
  32. vmlx-1.0.0/tests/test_mllm_continuous_batching.py +484 -0
  33. vmlx-1.0.0/tests/test_mllm_message_serialization.py +1373 -0
  34. vmlx-1.0.0/tests/test_mllm_scheduler_cache.py +649 -0
  35. vmlx-1.0.0/tests/test_mllm_scheduler_stability.py +223 -0
  36. vmlx-1.0.0/tests/test_model_config_registry.py +867 -0
  37. vmlx-1.0.0/tests/test_model_inspector.py +505 -0
  38. vmlx-1.0.0/tests/test_model_name.py +682 -0
  39. vmlx-1.0.0/tests/test_model_registry.py +257 -0
  40. vmlx-1.0.0/tests/test_native_tool_format.py +358 -0
  41. vmlx-1.0.0/tests/test_optimizations.py +64 -0
  42. vmlx-1.0.0/tests/test_paged_cache.py +727 -0
  43. vmlx-1.0.0/tests/test_paged_cache_benefits.py +464 -0
  44. vmlx-1.0.0/tests/test_paged_cache_real_inference.py +269 -0
  45. vmlx-1.0.0/tests/test_paged_cache_real_model.py +587 -0
  46. vmlx-1.0.0/tests/test_paged_cache_unit.py +184 -0
  47. vmlx-1.0.0/tests/test_platform.py +111 -0
  48. vmlx-1.0.0/tests/test_prefix_cache.py +547 -0
  49. vmlx-1.0.0/tests/test_reasoning_parser.py +1352 -0
  50. vmlx-1.0.0/tests/test_reasoning_tool_interaction.py +780 -0
  51. vmlx-1.0.0/tests/test_request.py +475 -0
  52. vmlx-1.0.0/tests/test_request_cancellation.py +59 -0
  53. vmlx-1.0.0/tests/test_reranker_endpoint.py +94 -0
  54. vmlx-1.0.0/tests/test_server.py +759 -0
  55. vmlx-1.0.0/tests/test_simple_engine.py +213 -0
  56. vmlx-1.0.0/tests/test_speculative.py +845 -0
  57. vmlx-1.0.0/tests/test_streaming_detokenizer.py +259 -0
  58. vmlx-1.0.0/tests/test_streaming_json_encoder.py +438 -0
  59. vmlx-1.0.0/tests/test_streaming_latency.py +338 -0
  60. vmlx-1.0.0/tests/test_streaming_reasoning.py +1536 -0
  61. vmlx-1.0.0/tests/test_structured_output.py +379 -0
  62. vmlx-1.0.0/tests/test_tool_fallback_injection.py +202 -0
  63. vmlx-1.0.0/tests/test_tool_format.py +595 -0
  64. vmlx-1.0.0/tests/test_tool_parsers.py +984 -0
  65. vmlx-1.0.0/vmlx.egg-info/PKG-INFO +340 -0
  66. vmlx-1.0.0/vmlx.egg-info/SOURCES.txt +162 -0
  67. vmlx-1.0.0/vmlx.egg-info/dependency_links.txt +1 -0
  68. vmlx-1.0.0/vmlx.egg-info/entry_points.txt +9 -0
  69. vmlx-1.0.0/vmlx.egg-info/requires.txt +61 -0
  70. vmlx-1.0.0/vmlx.egg-info/top_level.txt +1 -0
  71. vmlx-1.0.0/vmlx_engine/__init__.py +138 -0
  72. vmlx-1.0.0/vmlx_engine/api/__init__.py +121 -0
  73. vmlx-1.0.0/vmlx_engine/api/anthropic_adapter.py +609 -0
  74. vmlx-1.0.0/vmlx_engine/api/models.py +757 -0
  75. vmlx-1.0.0/vmlx_engine/api/streaming.py +210 -0
  76. vmlx-1.0.0/vmlx_engine/api/tool_calling.py +677 -0
  77. vmlx-1.0.0/vmlx_engine/api/utils.py +314 -0
  78. vmlx-1.0.0/vmlx_engine/attention.py +245 -0
  79. vmlx-1.0.0/vmlx_engine/audio/__init__.py +25 -0
  80. vmlx-1.0.0/vmlx_engine/audio/processor.py +214 -0
  81. vmlx-1.0.0/vmlx_engine/audio/stt.py +167 -0
  82. vmlx-1.0.0/vmlx_engine/audio/tts.py +322 -0
  83. vmlx-1.0.0/vmlx_engine/benchmark.py +1654 -0
  84. vmlx-1.0.0/vmlx_engine/block_disk_store.py +770 -0
  85. vmlx-1.0.0/vmlx_engine/cli.py +1287 -0
  86. vmlx-1.0.0/vmlx_engine/commands/__init__.py +2 -0
  87. vmlx-1.0.0/vmlx_engine/commands/convert.py +510 -0
  88. vmlx-1.0.0/vmlx_engine/commands/doctor.py +309 -0
  89. vmlx-1.0.0/vmlx_engine/commands/info.py +30 -0
  90. vmlx-1.0.0/vmlx_engine/commands/list.py +38 -0
  91. vmlx-1.0.0/vmlx_engine/disk_cache.py +468 -0
  92. vmlx-1.0.0/vmlx_engine/embedding.py +109 -0
  93. vmlx-1.0.0/vmlx_engine/engine/__init__.py +28 -0
  94. vmlx-1.0.0/vmlx_engine/engine/base.py +201 -0
  95. vmlx-1.0.0/vmlx_engine/engine/batched.py +810 -0
  96. vmlx-1.0.0/vmlx_engine/engine/simple.py +721 -0
  97. vmlx-1.0.0/vmlx_engine/engine_core.py +720 -0
  98. vmlx-1.0.0/vmlx_engine/gradio_app.py +390 -0
  99. vmlx-1.0.0/vmlx_engine/gradio_text_app.py +176 -0
  100. vmlx-1.0.0/vmlx_engine/image_gen.py +275 -0
  101. vmlx-1.0.0/vmlx_engine/mcp/__init__.py +85 -0
  102. vmlx-1.0.0/vmlx_engine/mcp/client.py +370 -0
  103. vmlx-1.0.0/vmlx_engine/mcp/config.py +186 -0
  104. vmlx-1.0.0/vmlx_engine/mcp/executor.py +500 -0
  105. vmlx-1.0.0/vmlx_engine/mcp/manager.py +302 -0
  106. vmlx-1.0.0/vmlx_engine/mcp/security.py +699 -0
  107. vmlx-1.0.0/vmlx_engine/mcp/tools.py +174 -0
  108. vmlx-1.0.0/vmlx_engine/mcp/types.py +189 -0
  109. vmlx-1.0.0/vmlx_engine/memory_cache.py +660 -0
  110. vmlx-1.0.0/vmlx_engine/mllm_batch_generator.py +1800 -0
  111. vmlx-1.0.0/vmlx_engine/mllm_cache.py +467 -0
  112. vmlx-1.0.0/vmlx_engine/mllm_scheduler.py +2074 -0
  113. vmlx-1.0.0/vmlx_engine/mlx_platform.py +333 -0
  114. vmlx-1.0.0/vmlx_engine/model_config_registry.py +224 -0
  115. vmlx-1.0.0/vmlx_engine/model_configs.py +684 -0
  116. vmlx-1.0.0/vmlx_engine/model_registry.py +185 -0
  117. vmlx-1.0.0/vmlx_engine/model_runner.py +456 -0
  118. vmlx-1.0.0/vmlx_engine/models/__init__.py +15 -0
  119. vmlx-1.0.0/vmlx_engine/models/llm.py +366 -0
  120. vmlx-1.0.0/vmlx_engine/models/mllm.py +1965 -0
  121. vmlx-1.0.0/vmlx_engine/multimodal_processor.py +200 -0
  122. vmlx-1.0.0/vmlx_engine/optimizations.py +139 -0
  123. vmlx-1.0.0/vmlx_engine/output_collector.py +247 -0
  124. vmlx-1.0.0/vmlx_engine/paged_cache.py +1364 -0
  125. vmlx-1.0.0/vmlx_engine/plugin.py +155 -0
  126. vmlx-1.0.0/vmlx_engine/prefix_cache.py +1257 -0
  127. vmlx-1.0.0/vmlx_engine/reasoning/__init__.py +101 -0
  128. vmlx-1.0.0/vmlx_engine/reasoning/base.py +110 -0
  129. vmlx-1.0.0/vmlx_engine/reasoning/deepseek_r1_parser.py +113 -0
  130. vmlx-1.0.0/vmlx_engine/reasoning/gptoss_parser.py +336 -0
  131. vmlx-1.0.0/vmlx_engine/reasoning/qwen3_parser.py +73 -0
  132. vmlx-1.0.0/vmlx_engine/reasoning/think_parser.py +227 -0
  133. vmlx-1.0.0/vmlx_engine/request.py +219 -0
  134. vmlx-1.0.0/vmlx_engine/reranker.py +221 -0
  135. vmlx-1.0.0/vmlx_engine/scheduler.py +2202 -0
  136. vmlx-1.0.0/vmlx_engine/server.py +4700 -0
  137. vmlx-1.0.0/vmlx_engine/simple.py +445 -0
  138. vmlx-1.0.0/vmlx_engine/speculative.py +257 -0
  139. vmlx-1.0.0/vmlx_engine/tool_parsers/__init__.py +83 -0
  140. vmlx-1.0.0/vmlx_engine/tool_parsers/abstract_tool_parser.py +290 -0
  141. vmlx-1.0.0/vmlx_engine/tool_parsers/auto_tool_parser.py +379 -0
  142. vmlx-1.0.0/vmlx_engine/tool_parsers/deepseek_tool_parser.py +165 -0
  143. vmlx-1.0.0/vmlx_engine/tool_parsers/functionary_tool_parser.py +188 -0
  144. vmlx-1.0.0/vmlx_engine/tool_parsers/glm47_tool_parser.py +213 -0
  145. vmlx-1.0.0/vmlx_engine/tool_parsers/granite_tool_parser.py +142 -0
  146. vmlx-1.0.0/vmlx_engine/tool_parsers/hermes_tool_parser.py +235 -0
  147. vmlx-1.0.0/vmlx_engine/tool_parsers/kimi_tool_parser.py +155 -0
  148. vmlx-1.0.0/vmlx_engine/tool_parsers/llama_tool_parser.py +123 -0
  149. vmlx-1.0.0/vmlx_engine/tool_parsers/minimax_tool_parser.py +338 -0
  150. vmlx-1.0.0/vmlx_engine/tool_parsers/mistral_tool_parser.py +262 -0
  151. vmlx-1.0.0/vmlx_engine/tool_parsers/nemotron_tool_parser.py +161 -0
  152. vmlx-1.0.0/vmlx_engine/tool_parsers/qwen_tool_parser.py +152 -0
  153. vmlx-1.0.0/vmlx_engine/tool_parsers/step3p5_tool_parser.py +232 -0
  154. vmlx-1.0.0/vmlx_engine/tool_parsers/xlam_tool_parser.py +172 -0
  155. vmlx-1.0.0/vmlx_engine/utils/__init__.py +6 -0
  156. vmlx-1.0.0/vmlx_engine/utils/cache_types.py +196 -0
  157. vmlx-1.0.0/vmlx_engine/utils/chat_templates.py +231 -0
  158. vmlx-1.0.0/vmlx_engine/utils/jang_loader.py +567 -0
  159. vmlx-1.0.0/vmlx_engine/utils/mamba_cache.py +327 -0
  160. vmlx-1.0.0/vmlx_engine/utils/model_inspector.py +592 -0
  161. vmlx-1.0.0/vmlx_engine/utils/nemotron_latent_moe.py +226 -0
  162. vmlx-1.0.0/vmlx_engine/utils/tokenizer.py +247 -0
  163. vmlx-1.0.0/vmlx_engine/vision_embedding_cache.py +219 -0
  164. vmlx-1.0.0/vmlx_engine/worker.py +266 -0
vmlx-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,340 @@
1
+ Metadata-Version: 2.4
2
+ Name: vmlx
3
+ Version: 1.0.0
4
+ Summary: Local AI inference for Apple Silicon — Text, Image, Video & Audio generation on Mac
5
+ Author-email: Jinho Jang <eric@jangq.ai>
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/vmlxllm/vmlx
8
+ Project-URL: Documentation, https://github.com/vmlxllm/vmlx#readme
9
+ Project-URL: Repository, https://github.com/vmlxllm/vmlx
10
+ Keywords: llm,mlx,apple-silicon,vllm,inference,transformers
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Operating System :: MacOS
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ Requires-Dist: mlx>=0.29.0
25
+ Requires-Dist: mlx-lm>=0.30.2
26
+ Requires-Dist: mlx-vlm>=0.1.0
27
+ Requires-Dist: transformers>=4.40.0
28
+ Requires-Dist: tokenizers>=0.19.0
29
+ Requires-Dist: huggingface-hub>=0.23.0
30
+ Requires-Dist: numpy>=1.24.0
31
+ Requires-Dist: pillow>=10.0.0
32
+ Requires-Dist: tqdm>=4.66.0
33
+ Requires-Dist: pyyaml>=6.0
34
+ Requires-Dist: requests>=2.28.0
35
+ Requires-Dist: tabulate>=0.9.0
36
+ Requires-Dist: opencv-python-headless>=4.8.0
37
+ Requires-Dist: psutil>=5.9.0
38
+ Requires-Dist: fastapi>=0.100.0
39
+ Requires-Dist: uvicorn>=0.23.0
40
+ Requires-Dist: mcp>=1.0.0
41
+ Requires-Dist: jsonschema>=4.0.0
42
+ Requires-Dist: mlx-embeddings>=0.0.5
43
+ Provides-Extra: ui
44
+ Requires-Dist: gradio>=4.0.0; extra == "ui"
45
+ Requires-Dist: pytz>=2024.1; extra == "ui"
46
+ Provides-Extra: dev
47
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
48
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
49
+ Requires-Dist: black>=23.0.0; extra == "dev"
50
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
51
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
52
+ Provides-Extra: vllm
53
+ Requires-Dist: vllm>=0.4.0; extra == "vllm"
54
+ Provides-Extra: vision
55
+ Requires-Dist: torch>=2.3.0; extra == "vision"
56
+ Requires-Dist: torchvision>=0.18.0; extra == "vision"
57
+ Provides-Extra: audio
58
+ Requires-Dist: mlx-audio>=0.2.9; extra == "audio"
59
+ Requires-Dist: sounddevice>=0.4.0; extra == "audio"
60
+ Requires-Dist: soundfile>=0.12.0; extra == "audio"
61
+ Requires-Dist: scipy>=1.10.0; extra == "audio"
62
+ Requires-Dist: numba>=0.57.0; extra == "audio"
63
+ Requires-Dist: tiktoken>=0.5.0; extra == "audio"
64
+ Requires-Dist: misaki[ja,zh]>=0.5.0; extra == "audio"
65
+ Requires-Dist: spacy>=3.7.0; extra == "audio"
66
+ Requires-Dist: num2words>=0.5.0; extra == "audio"
67
+ Requires-Dist: loguru>=0.7.0; extra == "audio"
68
+ Requires-Dist: phonemizer>=3.2.0; extra == "audio"
69
+ Requires-Dist: ordered_set>=4.1.0; extra == "audio"
70
+ Requires-Dist: cn2an>=0.5.0; extra == "audio"
71
+ Requires-Dist: fugashi>=1.3.0; extra == "audio"
72
+ Requires-Dist: unidic-lite>=1.0.0; extra == "audio"
73
+ Requires-Dist: jieba>=0.42.0; extra == "audio"
74
+ Provides-Extra: jang
75
+ Requires-Dist: jang>=1.0.0; extra == "jang"
76
+ Provides-Extra: image
77
+ Requires-Dist: mflux>=0.16.0; extra == "image"
78
+
79
+ <p align="center">
80
+ <picture>
81
+ <source media="(prefers-color-scheme: dark)" srcset="https://vmlx.net/logos/png/wordmark-dark-600x150.png">
82
+ <source media="(prefers-color-scheme: light)" srcset="https://vmlx.net/logos/png/wordmark-light-600x150.png">
83
+ <img alt="vMLX" src="https://vmlx.net/logos/png/wordmark-transparent-600x150.png" width="400">
84
+ </picture>
85
+ </p>
86
+
87
+ <p align="center">
88
+ <strong>Native macOS AI inference — local models, remote endpoints, zero config</strong>
89
+ </p>
90
+
91
+ <p align="center">
92
+ <a href="https://vmlx.net">Website</a> · <a href="panel/CHANGELOG.md">Panel Changelog</a> · <a href="CHANGELOG.md">Engine Changelog</a> · <a href="docs/">Documentation</a>
93
+ </p>
94
+
95
+ ---
96
+
97
+ ## What is vMLX?
98
+
99
+ vMLX is a native macOS application for running AI models on Apple Silicon. It bundles a custom inference engine with a full-featured desktop interface — manage sessions, chat with models, download from HuggingFace, connect to remote APIs, and use agentic tool-calling workflows.
100
+
101
+ - **Local inference** with GPU acceleration via MLX
102
+ - **Remote endpoints** — connect to any OpenAI-compatible API
103
+ - **HuggingFace downloader** — search, download, and serve models in-app
104
+ - **Built-in tools** — file I/O, shell, search, image reading, ask_user interrupt
105
+ - **MCP integration** — Model Context Protocol tool servers (local sessions)
106
+
107
+ ---
108
+
109
+ ## Key Features
110
+
111
+ ### Inference Engine (v0.2.18)
112
+
113
+ | Feature | Description |
114
+ |---------|-------------|
115
+ | **Paged KV Cache** | Memory-efficient caching with prefix sharing and block-level reuse |
116
+ | **KV Cache Quantization** | Q4/Q8 quantized cache storage (2–4× memory savings) |
117
+ | **Prefix Cache** | Token-level prefix matching for fast prompt reuse across requests |
118
+ | **Continuous Batching** | Concurrent request handling with slot management |
119
+ | **VLM Caching** | Full KV cache pipeline for vision-language models (Qwen-VL, Gemma 3, etc.) |
120
+ | **Mamba Hybrid Support** | Auto-detects mixed KVCache + MambaCache models (Qwen3.5-VL, Qwen3-Coder-Next, Nemotron) |
121
+ | **Streaming Detokenizer** | Per-request UTF-8 buffering — emoji, CJK, Arabic render correctly |
122
+ | **Request Cancellation** | Stop inference mid-stream via API or connection close |
123
+ | **OpenAI-Compatible API** | Chat Completions + Responses API with full streaming support |
124
+ | **Speculative Decoding** | Draft model acceleration (20–90% speedup, zero quality loss) |
125
+
126
+ ### Desktop App (Panel v1.2.1)
127
+
128
+ | Feature | Description |
129
+ |---------|-------------|
130
+ | **Multi-session** | Run multiple models simultaneously on different ports |
131
+ | **Remote endpoints** | Connect to OpenAI, Groq, local vLLM, or any compatible API |
132
+ | **HuggingFace browser** | Search, download, and install MLX models with progress tracking |
133
+ | **Agentic tools** | File I/O, shell, search, image reading with auto-continue loops (up to 10 iterations) |
134
+ | **Per-chat settings** | Temperature, Top P/K, Min P, Repeat Penalty, Stop Sequences, Max Tokens |
135
+ | **Reasoning display** | Collapsible thinking sections for Qwen3, DeepSeek-R1, GLM-4.7 |
136
+ | **Tool parsers** | hermes, pythonic, llama3, mistral, minimax, qwen3, nemotron, step3p5, and more |
137
+ | **Auto-detection** | Reads model config JSON for automatic parser and cache type selection |
138
+ | **Persistent history** | SQLite-backed chat history with metrics, tool calls, and reasoning content |
139
+ | **Live metrics** | TTFT, tokens/sec, prompt processing speed, prefix cache hits |
140
+
141
+ ---
142
+
143
+ ## Quick Start
144
+
145
+ ### Desktop App (recommended)
146
+
147
+ ```bash
148
+ # Clone and build
149
+ git clone https://github.com/vmlxllm/vmlx.git
150
+ cd vmlx/panel
151
+
152
+ # Install dependencies
153
+ npm install
154
+
155
+ # Development mode
156
+ npm run dev
157
+
158
+ # Build and install to /Applications
159
+ bash scripts/build-and-install.sh
160
+ ```
161
+
162
+ ### Engine Only (CLI)
163
+
164
+ ```bash
165
+ # Install
166
+ uv tool install git+https://github.com/vmlxllm/vmlx.git
167
+ # or
168
+ pip install git+https://github.com/vmlxllm/vmlx.git
169
+
170
+ # Start server
171
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000
172
+
173
+ # With continuous batching
174
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 --continuous-batching
175
+
176
+ # With API key
177
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 --api-key your-key
178
+
179
+ # With speculative decoding (20–90% faster)
180
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 \
181
+ --speculative-model mlx-community/Llama-3.2-1B-Instruct-4bit \
182
+ --num-draft-tokens 3
183
+ ```
184
+
185
+ ### Use with OpenAI SDK
186
+
187
+ ```python
188
+ from openai import OpenAI
189
+
190
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
191
+
192
+ response = client.chat.completions.create(
193
+ model="default",
194
+ messages=[{"role": "user", "content": "Hello!"}],
195
+ )
196
+ print(response.choices[0].message.content)
197
+ ```
198
+
199
+ ---
200
+
201
+ ## API Endpoints
202
+
203
+ | Endpoint | Description |
204
+ |----------|-------------|
205
+ | `POST /v1/chat/completions` | Chat Completions API (streaming) |
206
+ | `POST /v1/responses` | Responses API (streaming) |
207
+ | `GET /v1/models` | List loaded models |
208
+ | `GET /health` | Server health + model info |
209
+ | `POST /v1/mcp/execute` | Execute MCP tool |
210
+ | `GET /v1/cache/stats` | Prefix cache statistics |
211
+ | `POST /v1/cache/warm` | Pre-warm cache with prompt |
212
+ | `DELETE /v1/cache` | Clear prefix cache |
213
+ | `POST /v1/chat/completions/{id}/cancel` | Cancel an in-flight request (frees GPU resources) |
214
+ | `POST /v1/embeddings` | Text embeddings (mlx-embeddings) |
215
+
216
+ ---
217
+
218
+ ## Reasoning Models
219
+
220
+ Extract thinking process from reasoning-capable models:
221
+
222
+ ```bash
223
+ vmlx-engine serve mlx-community/Qwen3-8B-4bit --reasoning-parser qwen3
224
+ ```
225
+
226
+ | Parser | Models | Format |
227
+ |--------|--------|--------|
228
+ | `qwen3` | Qwen3, QwQ, MiniMax M2/M2.5, StepFun | `<think>` / `</think>` tags |
229
+ | `deepseek_r1` | DeepSeek-R1, Gemma 3, Phi-4 Reasoning, GLM-4.7, GLM-Z1 | Lenient `<think>` (handles missing open tag) |
230
+ | `openai_gptoss` | GLM-4.7 Flash, GPT-OSS | Harmony `<\|channel\|>analysis/final` protocol |
231
+
232
+ ---
233
+
234
+ ## Tool Calling
235
+
236
+ Built-in agentic tools available in the desktop app:
237
+
238
+ | Category | Tools |
239
+ |----------|-------|
240
+ | **File** | read_file, write_file, edit_file, patch_file, batch_edit, copy, move, delete, create_directory, list_directory, read_image |
241
+ | **Search** | search_files, find_files, file_info, get_diagnostics, get_tree, diff_files |
242
+ | **Shell** | run_command, spawn_process, get_process_output |
243
+ | **Web** | fetchUrl, brave_search |
244
+ | **Utility** | ask_user (interactive interrupt) |
245
+
246
+ Plus MCP tool server passthrough for local sessions.
247
+
248
+ ---
249
+
250
+ ## Architecture
251
+
252
+ ```
253
+ ┌─────────────────────────────────────────────────────────┐
254
+ │ vMLX Desktop App │
255
+ │ (Electron + React + TypeScript) │
256
+ └─────────────────────────────────────────────────────────┘
257
+
258
+ ┌────────────┴────────────┐
259
+ ▼ ▼
260
+ ┌──────────────────────┐ ┌──────────────────────┐
261
+ │ Local vmlx-engine │ │ Remote Endpoints │
262
+ │ (spawned process) │ │ (OpenAI, Groq, etc.) │
263
+ └──────────────────────┘ └──────────────────────┘
264
+
265
+
266
+ ┌─────────────────────────────────────────────────────────┐
267
+ │ vMLX Engine │
268
+ │ (FastAPI + MLX inference + caching) │
269
+ └─────────────────────────────────────────────────────────┘
270
+
271
+ ┌─────────┼──────────┬──────────┐
272
+ ▼ ▼ ▼ ▼
273
+ ┌────────┐┌────────┐┌────────┐┌────────────┐
274
+ │ mlx-lm ││mlx-vlm ││mlx-aud ││mlx-embed │
275
+ │ (LLMs) ││(Vision)││(Audio) ││(Embeddings)│
276
+ └────────┘└────────┘└────────┘└────────────┘
277
+
278
+
279
+ ┌─────────────────────────────────────────────────────────┐
280
+ │ Apple MLX │
281
+ │ (Metal GPU + Unified Memory) │
282
+ └─────────────────────────────────────────────────────────┘
283
+ ```
284
+
285
+ ---
286
+
287
+ ## Tech Stack
288
+
289
+ | Layer | Technology |
290
+ |-------|-----------|
291
+ | Desktop app | Electron 28 + React 18 + TypeScript |
292
+ | Styling | Tailwind CSS |
293
+ | Database | SQLite (WAL mode, better-sqlite3) |
294
+ | Inference engine | vMLX Engine v0.2.18 (Python, FastAPI) |
295
+ | ML framework | Apple MLX (Metal GPU acceleration) |
296
+ | Build | electron-vite + electron-builder |
297
+ | Tests | Vitest (panel: 542 tests), pytest (engine: 1595 tests) |
298
+ | Python | Bundled relocatable Python 3.12 |
299
+
300
+ ---
301
+
302
+ ## Recent Changes
303
+
304
+ ### Panel v1.2.1 / Engine v0.2.18 (2026-03-09)
305
+ - **Tool calling fix**: `enableAutoToolChoice` default changed from `false` to `undefined` (auto-detect) — MCP and built-in tools now work out of the box without manual enable
306
+ - **MCP tool result truncation**: MCP tool results now capped at same limit as built-in tools (50KB default) to prevent context overflow
307
+ - **Command preview parity**: `buildCommandPreview` in SessionSettings now matches actual `buildArgs` logic for auto-tool-choice flags
308
+ - **Old config migration**: Stored sessions with `enableAutoToolChoice: false` auto-migrate to `undefined` on load
309
+ - **2137 total tests**: 1595 engine + 542 panel (12 new regression tests for tool calling and MCP)
310
+
311
+ ### Panel v1.2.0 / Engine v0.2.18 (2026-03-09)
312
+ - **HuggingFace download fix**: Download progress no longer stuck at 0% — tqdm `\r` chunk splitting, ANSI stripping, highest-percent extraction
313
+ - **HF browser NaN/Unknown fix**: Model ages and authors display correctly (uses `createdAt` fallback, extracts author from modelId)
314
+ - **macOS 15 launch fix**: `minimumSystemVersion` corrected from 26.0.0 to 14.0.0 (fixes GitHub #10)
315
+ - **Deep stability audit**: 14 fixes across paged cache block lifecycle, KV dequantize safety, reasoning marker detection, tool fallback, Mistral JSON validation
316
+ - **CancelledError SSE hang**: Engine cancellation now unblocks all waiting SSE consumers
317
+ - **2125 total tests**: 1595 engine + 530 panel with full regression coverage
318
+
319
+ ### Panel v1.1.4 / Engine v0.2.12 (2026-03-07)
320
+ - **tool_choice="none" fix**: Content no longer swallowed when tool markers detected with tools suppressed
321
+ - **suppress_reasoning**: Reasoning leaks plugged in both API paths
322
+ - **First-launch UX**: Auto-creates initial chat, dynamic About page version
323
+ - **1571 engine tests**, **530 panel tests** across 6 vitest suites
324
+
325
+ See [Panel Changelog](panel/CHANGELOG.md) and [Engine Changelog](CHANGELOG.md) for full history.
326
+
327
+ ---
328
+
329
+ ## Current Version
330
+
331
+ **Engine v0.2.18** / **Panel v1.2.1** — macOS 26+ (Tahoe) for local inference, macOS 14+ for remote endpoints. Requires Apple Silicon (M1, M2, M3, or M4).
332
+
333
+ ## Links
334
+
335
+ - **Website**: [vmlx.net](https://vmlx.net)
336
+ - **Contact**: admin@vmlx.net
337
+
338
+ ## License
339
+
340
+ Apache 2.0 — see [LICENSE](LICENSE) for details.
vmlx-1.0.0/README.md ADDED
@@ -0,0 +1,262 @@
1
+ <p align="center">
2
+ <picture>
3
+ <source media="(prefers-color-scheme: dark)" srcset="https://vmlx.net/logos/png/wordmark-dark-600x150.png">
4
+ <source media="(prefers-color-scheme: light)" srcset="https://vmlx.net/logos/png/wordmark-light-600x150.png">
5
+ <img alt="vMLX" src="https://vmlx.net/logos/png/wordmark-transparent-600x150.png" width="400">
6
+ </picture>
7
+ </p>
8
+
9
+ <p align="center">
10
+ <strong>Native macOS AI inference — local models, remote endpoints, zero config</strong>
11
+ </p>
12
+
13
+ <p align="center">
14
+ <a href="https://vmlx.net">Website</a> · <a href="panel/CHANGELOG.md">Panel Changelog</a> · <a href="CHANGELOG.md">Engine Changelog</a> · <a href="docs/">Documentation</a>
15
+ </p>
16
+
17
+ ---
18
+
19
+ ## What is vMLX?
20
+
21
+ vMLX is a native macOS application for running AI models on Apple Silicon. It bundles a custom inference engine with a full-featured desktop interface — manage sessions, chat with models, download from HuggingFace, connect to remote APIs, and use agentic tool-calling workflows.
22
+
23
+ - **Local inference** with GPU acceleration via MLX
24
+ - **Remote endpoints** — connect to any OpenAI-compatible API
25
+ - **HuggingFace downloader** — search, download, and serve models in-app
26
+ - **Built-in tools** — file I/O, shell, search, image reading, ask_user interrupt
27
+ - **MCP integration** — Model Context Protocol tool servers (local sessions)
28
+
29
+ ---
30
+
31
+ ## Key Features
32
+
33
+ ### Inference Engine (v0.2.18)
34
+
35
+ | Feature | Description |
36
+ |---------|-------------|
37
+ | **Paged KV Cache** | Memory-efficient caching with prefix sharing and block-level reuse |
38
+ | **KV Cache Quantization** | Q4/Q8 quantized cache storage (2–4× memory savings) |
39
+ | **Prefix Cache** | Token-level prefix matching for fast prompt reuse across requests |
40
+ | **Continuous Batching** | Concurrent request handling with slot management |
41
+ | **VLM Caching** | Full KV cache pipeline for vision-language models (Qwen-VL, Gemma 3, etc.) |
42
+ | **Mamba Hybrid Support** | Auto-detects mixed KVCache + MambaCache models (Qwen3.5-VL, Qwen3-Coder-Next, Nemotron) |
43
+ | **Streaming Detokenizer** | Per-request UTF-8 buffering — emoji, CJK, Arabic render correctly |
44
+ | **Request Cancellation** | Stop inference mid-stream via API or connection close |
45
+ | **OpenAI-Compatible API** | Chat Completions + Responses API with full streaming support |
46
+ | **Speculative Decoding** | Draft model acceleration (20–90% speedup, zero quality loss) |
47
+
48
+ ### Desktop App (Panel v1.2.1)
49
+
50
+ | Feature | Description |
51
+ |---------|-------------|
52
+ | **Multi-session** | Run multiple models simultaneously on different ports |
53
+ | **Remote endpoints** | Connect to OpenAI, Groq, local vLLM, or any compatible API |
54
+ | **HuggingFace browser** | Search, download, and install MLX models with progress tracking |
55
+ | **Agentic tools** | File I/O, shell, search, image reading with auto-continue loops (up to 10 iterations) |
56
+ | **Per-chat settings** | Temperature, Top P/K, Min P, Repeat Penalty, Stop Sequences, Max Tokens |
57
+ | **Reasoning display** | Collapsible thinking sections for Qwen3, DeepSeek-R1, GLM-4.7 |
58
+ | **Tool parsers** | hermes, pythonic, llama3, mistral, minimax, qwen3, nemotron, step3p5, and more |
59
+ | **Auto-detection** | Reads model config JSON for automatic parser and cache type selection |
60
+ | **Persistent history** | SQLite-backed chat history with metrics, tool calls, and reasoning content |
61
+ | **Live metrics** | TTFT, tokens/sec, prompt processing speed, prefix cache hits |
62
+
63
+ ---
64
+
65
+ ## Quick Start
66
+
67
+ ### Desktop App (recommended)
68
+
69
+ ```bash
70
+ # Clone and build
71
+ git clone https://github.com/vmlxllm/vmlx.git
72
+ cd vmlx/panel
73
+
74
+ # Install dependencies
75
+ npm install
76
+
77
+ # Development mode
78
+ npm run dev
79
+
80
+ # Build and install to /Applications
81
+ bash scripts/build-and-install.sh
82
+ ```
83
+
84
+ ### Engine Only (CLI)
85
+
86
+ ```bash
87
+ # Install
88
+ uv tool install git+https://github.com/vmlxllm/vmlx.git
89
+ # or
90
+ pip install git+https://github.com/vmlxllm/vmlx.git
91
+
92
+ # Start server
93
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000
94
+
95
+ # With continuous batching
96
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 --continuous-batching
97
+
98
+ # With API key
99
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 --api-key your-key
100
+
101
+ # With speculative decoding (20–90% faster)
102
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 \
103
+ --speculative-model mlx-community/Llama-3.2-1B-Instruct-4bit \
104
+ --num-draft-tokens 3
105
+ ```
106
+
107
+ ### Use with OpenAI SDK
108
+
109
+ ```python
110
+ from openai import OpenAI
111
+
112
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
113
+
114
+ response = client.chat.completions.create(
115
+ model="default",
116
+ messages=[{"role": "user", "content": "Hello!"}],
117
+ )
118
+ print(response.choices[0].message.content)
119
+ ```
120
+
121
+ ---
122
+
123
+ ## API Endpoints
124
+
125
+ | Endpoint | Description |
126
+ |----------|-------------|
127
+ | `POST /v1/chat/completions` | Chat Completions API (streaming) |
128
+ | `POST /v1/responses` | Responses API (streaming) |
129
+ | `GET /v1/models` | List loaded models |
130
+ | `GET /health` | Server health + model info |
131
+ | `POST /v1/mcp/execute` | Execute MCP tool |
132
+ | `GET /v1/cache/stats` | Prefix cache statistics |
133
+ | `POST /v1/cache/warm` | Pre-warm cache with prompt |
134
+ | `DELETE /v1/cache` | Clear prefix cache |
135
+ | `POST /v1/chat/completions/{id}/cancel` | Cancel an in-flight request (frees GPU resources) |
136
+ | `POST /v1/embeddings` | Text embeddings (mlx-embeddings) |
137
+
138
+ ---
139
+
140
+ ## Reasoning Models
141
+
142
+ Extract thinking process from reasoning-capable models:
143
+
144
+ ```bash
145
+ vmlx-engine serve mlx-community/Qwen3-8B-4bit --reasoning-parser qwen3
146
+ ```
147
+
148
+ | Parser | Models | Format |
149
+ |--------|--------|--------|
150
+ | `qwen3` | Qwen3, QwQ, MiniMax M2/M2.5, StepFun | `<think>` / `</think>` tags |
151
+ | `deepseek_r1` | DeepSeek-R1, Gemma 3, Phi-4 Reasoning, GLM-4.7, GLM-Z1 | Lenient `<think>` (handles missing open tag) |
152
+ | `openai_gptoss` | GLM-4.7 Flash, GPT-OSS | Harmony `<\|channel\|>analysis/final` protocol |
153
+
154
+ ---
155
+
156
+ ## Tool Calling
157
+
158
+ Built-in agentic tools available in the desktop app:
159
+
160
+ | Category | Tools |
161
+ |----------|-------|
162
+ | **File** | read_file, write_file, edit_file, patch_file, batch_edit, copy, move, delete, create_directory, list_directory, read_image |
163
+ | **Search** | search_files, find_files, file_info, get_diagnostics, get_tree, diff_files |
164
+ | **Shell** | run_command, spawn_process, get_process_output |
165
+ | **Web** | fetchUrl, brave_search |
166
+ | **Utility** | ask_user (interactive interrupt) |
167
+
168
+ Plus MCP tool server passthrough for local sessions.
169
+
170
+ ---
171
+
172
+ ## Architecture
173
+
174
+ ```
175
+ ┌─────────────────────────────────────────────────────────┐
176
+ │ vMLX Desktop App │
177
+ │ (Electron + React + TypeScript) │
178
+ └─────────────────────────────────────────────────────────┘
179
+
180
+ ┌────────────┴────────────┐
181
+ ▼ ▼
182
+ ┌──────────────────────┐ ┌──────────────────────┐
183
+ │ Local vmlx-engine │ │ Remote Endpoints │
184
+ │ (spawned process) │ │ (OpenAI, Groq, etc.) │
185
+ └──────────────────────┘ └──────────────────────┘
186
+
187
+
188
+ ┌─────────────────────────────────────────────────────────┐
189
+ │ vMLX Engine │
190
+ │ (FastAPI + MLX inference + caching) │
191
+ └─────────────────────────────────────────────────────────┘
192
+
193
+ ┌─────────┼──────────┬──────────┐
194
+ ▼ ▼ ▼ ▼
195
+ ┌────────┐┌────────┐┌────────┐┌────────────┐
196
+ │ mlx-lm ││mlx-vlm ││mlx-aud ││mlx-embed │
197
+ │ (LLMs) ││(Vision)││(Audio) ││(Embeddings)│
198
+ └────────┘└────────┘└────────┘└────────────┘
199
+
200
+
201
+ ┌─────────────────────────────────────────────────────────┐
202
+ │ Apple MLX │
203
+ │ (Metal GPU + Unified Memory) │
204
+ └─────────────────────────────────────────────────────────┘
205
+ ```
206
+
207
+ ---
208
+
209
+ ## Tech Stack
210
+
211
+ | Layer | Technology |
212
+ |-------|-----------|
213
+ | Desktop app | Electron 28 + React 18 + TypeScript |
214
+ | Styling | Tailwind CSS |
215
+ | Database | SQLite (WAL mode, better-sqlite3) |
216
+ | Inference engine | vMLX Engine v0.2.18 (Python, FastAPI) |
217
+ | ML framework | Apple MLX (Metal GPU acceleration) |
218
+ | Build | electron-vite + electron-builder |
219
+ | Tests | Vitest (panel: 542 tests), pytest (engine: 1595 tests) |
220
+ | Python | Bundled relocatable Python 3.12 |
221
+
222
+ ---
223
+
224
+ ## Recent Changes
225
+
226
+ ### Panel v1.2.1 / Engine v0.2.18 (2026-03-09)
227
+ - **Tool calling fix**: `enableAutoToolChoice` default changed from `false` to `undefined` (auto-detect) — MCP and built-in tools now work out of the box without manually enabling them
228
+ - **MCP tool result truncation**: MCP tool results now capped at same limit as built-in tools (50KB default) to prevent context overflow
229
+ - **Command preview parity**: `buildCommandPreview` in SessionSettings now matches actual `buildArgs` logic for auto-tool-choice flags
230
+ - **Old config migration**: Stored sessions with `enableAutoToolChoice: false` auto-migrate to `undefined` on load
231
+ - **2137 total tests**: 1595 engine + 542 panel (12 new regression tests for tool calling and MCP)
232
+
233
+ ### Panel v1.2.0 / Engine v0.2.18 (2026-03-09)
234
+ - **HuggingFace download fix**: Download progress is no longer stuck at 0% — tqdm `\r` chunk splitting, ANSI stripping, highest-percent extraction
235
+ - **HF browser NaN/Unknown fix**: Model ages and authors display correctly (uses `createdAt` fallback, extracts author from modelId)
236
+ - **macOS 15 launch fix**: `minimumSystemVersion` corrected from 26.0.0 to 14.0.0 (fixes GitHub #10)
237
+ - **Deep stability audit**: 14 fixes across paged cache block lifecycle, KV dequantize safety, reasoning marker detection, tool fallback, Mistral JSON validation
238
+ - **CancelledError SSE hang**: Engine cancellation now unblocks all waiting SSE consumers
239
+ - **2125 total tests**: 1595 engine + 530 panel with full regression coverage
240
+
241
+ ### Panel v1.1.4 / Engine v0.2.12 (2026-03-07)
242
+ - **tool_choice="none" fix**: Content no longer swallowed when tool markers detected with tools suppressed
243
+ - **suppress_reasoning**: Reasoning leaks plugged in both API paths
244
+ - **First-launch UX**: Auto-creates initial chat, dynamic About page version
245
+ - **1571 engine tests**, **530 panel tests** across 6 vitest suites
246
+
247
+ See [Panel Changelog](panel/CHANGELOG.md) and [Engine Changelog](CHANGELOG.md) for full history.
248
+
249
+ ---
250
+
251
+ ## Current Version
252
+
253
+ **Engine v0.2.18** / **Panel v1.2.1** — macOS 26+ (Tahoe) for local inference, macOS 14+ for remote endpoints. Apple Silicon (M1, M2, M3, M4).
254
+
255
+ ## Links
256
+
257
+ - **Website**: [vmlx.net](https://vmlx.net)
258
+ - **Contact**: admin@vmlx.net
259
+
260
+ ## License
261
+
262
+ Apache 2.0 — see [LICENSE](LICENSE) for details.