rapid-mlx 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. rapid_mlx-0.3.2/PKG-INFO +556 -0
  2. rapid_mlx-0.3.2/README.md +477 -0
  3. rapid_mlx-0.3.2/pyproject.toml +167 -0
  4. rapid_mlx-0.3.2/rapid_mlx.egg-info/PKG-INFO +556 -0
  5. rapid_mlx-0.3.2/rapid_mlx.egg-info/SOURCES.txt +164 -0
  6. rapid_mlx-0.3.2/rapid_mlx.egg-info/dependency_links.txt +1 -0
  7. rapid_mlx-0.3.2/rapid_mlx.egg-info/entry_points.txt +8 -0
  8. rapid_mlx-0.3.2/rapid_mlx.egg-info/requires.txt +63 -0
  9. rapid_mlx-0.3.2/rapid_mlx.egg-info/top_level.txt +1 -0
  10. rapid_mlx-0.3.2/setup.cfg +4 -0
  11. rapid_mlx-0.3.2/tests/test_anthropic_adapter.py +457 -0
  12. rapid_mlx-0.3.2/tests/test_anthropic_models.py +360 -0
  13. rapid_mlx-0.3.2/tests/test_api_models.py +669 -0
  14. rapid_mlx-0.3.2/tests/test_api_utils.py +594 -0
  15. rapid_mlx-0.3.2/tests/test_audio.py +275 -0
  16. rapid_mlx-0.3.2/tests/test_batching.py +481 -0
  17. rapid_mlx-0.3.2/tests/test_batching_deterministic.py +460 -0
  18. rapid_mlx-0.3.2/tests/test_cloud_router.py +841 -0
  19. rapid_mlx-0.3.2/tests/test_continuous_batching.py +320 -0
  20. rapid_mlx-0.3.2/tests/test_deltanet_cache.py +251 -0
  21. rapid_mlx-0.3.2/tests/test_deltanet_snapshot.py +369 -0
  22. rapid_mlx-0.3.2/tests/test_embeddings.py +289 -0
  23. rapid_mlx-0.3.2/tests/test_engine_parity.py +344 -0
  24. rapid_mlx-0.3.2/tests/test_event_loop.py +281 -0
  25. rapid_mlx-0.3.2/tests/test_guided.py +508 -0
  26. rapid_mlx-0.3.2/tests/test_harmony_parsers.py +1325 -0
  27. rapid_mlx-0.3.2/tests/test_hybrid.py +643 -0
  28. rapid_mlx-0.3.2/tests/test_kv_cache_quantization.py +342 -0
  29. rapid_mlx-0.3.2/tests/test_llm.py +115 -0
  30. rapid_mlx-0.3.2/tests/test_llm_cache.py +443 -0
  31. rapid_mlx-0.3.2/tests/test_mcp_security.py +762 -0
  32. rapid_mlx-0.3.2/tests/test_memory_cache.py +558 -0
  33. rapid_mlx-0.3.2/tests/test_memory_stability.py +271 -0
  34. rapid_mlx-0.3.2/tests/test_minimax_reasoning_parser.py +650 -0
  35. rapid_mlx-0.3.2/tests/test_minimax_tool_parser.py +447 -0
  36. rapid_mlx-0.3.2/tests/test_mllm.py +401 -0
  37. rapid_mlx-0.3.2/tests/test_mllm_cache.py +1123 -0
  38. rapid_mlx-0.3.2/tests/test_mllm_continuous_batching.py +974 -0
  39. rapid_mlx-0.3.2/tests/test_mllm_stream_lock.py +227 -0
  40. rapid_mlx-0.3.2/tests/test_model_auto_config.py +168 -0
  41. rapid_mlx-0.3.2/tests/test_model_registry.py +259 -0
  42. rapid_mlx-0.3.2/tests/test_native_tool_format.py +369 -0
  43. rapid_mlx-0.3.2/tests/test_optimizations.py +110 -0
  44. rapid_mlx-0.3.2/tests/test_paged_cache.py +728 -0
  45. rapid_mlx-0.3.2/tests/test_paged_cache_benefits.py +463 -0
  46. rapid_mlx-0.3.2/tests/test_paged_cache_real_inference.py +269 -0
  47. rapid_mlx-0.3.2/tests/test_paged_cache_real_model.py +587 -0
  48. rapid_mlx-0.3.2/tests/test_platform.py +100 -0
  49. rapid_mlx-0.3.2/tests/test_prefix_cache.py +704 -0
  50. rapid_mlx-0.3.2/tests/test_prompt_lookup.py +343 -0
  51. rapid_mlx-0.3.2/tests/test_prompt_lookup_bench.py +300 -0
  52. rapid_mlx-0.3.2/tests/test_reasoning_parser.py +1045 -0
  53. rapid_mlx-0.3.2/tests/test_reasoning_parsers.py +984 -0
  54. rapid_mlx-0.3.2/tests/test_request.py +475 -0
  55. rapid_mlx-0.3.2/tests/test_server.py +783 -0
  56. rapid_mlx-0.3.2/tests/test_server_utils.py +396 -0
  57. rapid_mlx-0.3.2/tests/test_simple_engine.py +214 -0
  58. rapid_mlx-0.3.2/tests/test_simple_engine_unit.py +368 -0
  59. rapid_mlx-0.3.2/tests/test_streaming.py +195 -0
  60. rapid_mlx-0.3.2/tests/test_streaming_detokenizer.py +259 -0
  61. rapid_mlx-0.3.2/tests/test_streaming_json_encoder.py +441 -0
  62. rapid_mlx-0.3.2/tests/test_streaming_latency.py +339 -0
  63. rapid_mlx-0.3.2/tests/test_streaming_newlines.py +212 -0
  64. rapid_mlx-0.3.2/tests/test_streaming_simulator.py +650 -0
  65. rapid_mlx-0.3.2/tests/test_structured_output.py +381 -0
  66. rapid_mlx-0.3.2/tests/test_tool_call_e2e.py +594 -0
  67. rapid_mlx-0.3.2/tests/test_tool_calling.py +472 -0
  68. rapid_mlx-0.3.2/tests/test_tool_injection.py +237 -0
  69. rapid_mlx-0.3.2/tests/test_tool_logits.py +502 -0
  70. rapid_mlx-0.3.2/tests/test_tool_parsers.py +1583 -0
  71. rapid_mlx-0.3.2/tests/test_upstream_regression.py +1282 -0
  72. rapid_mlx-0.3.2/vllm_mlx/__init__.py +132 -0
  73. rapid_mlx-0.3.2/vllm_mlx/api/__init__.py +129 -0
  74. rapid_mlx-0.3.2/vllm_mlx/api/anthropic_adapter.py +312 -0
  75. rapid_mlx-0.3.2/vllm_mlx/api/anthropic_models.py +105 -0
  76. rapid_mlx-0.3.2/vllm_mlx/api/guided.py +239 -0
  77. rapid_mlx-0.3.2/vllm_mlx/api/harmony_tools.py +109 -0
  78. rapid_mlx-0.3.2/vllm_mlx/api/models.py +482 -0
  79. rapid_mlx-0.3.2/vllm_mlx/api/streaming.py +208 -0
  80. rapid_mlx-0.3.2/vllm_mlx/api/tool_calling.py +707 -0
  81. rapid_mlx-0.3.2/vllm_mlx/api/tool_logits.py +448 -0
  82. rapid_mlx-0.3.2/vllm_mlx/api/utils.py +450 -0
  83. rapid_mlx-0.3.2/vllm_mlx/attention.py +245 -0
  84. rapid_mlx-0.3.2/vllm_mlx/audio/__init__.py +25 -0
  85. rapid_mlx-0.3.2/vllm_mlx/audio/processor.py +213 -0
  86. rapid_mlx-0.3.2/vllm_mlx/audio/stt.py +159 -0
  87. rapid_mlx-0.3.2/vllm_mlx/audio/tts.py +313 -0
  88. rapid_mlx-0.3.2/vllm_mlx/benchmark.py +1697 -0
  89. rapid_mlx-0.3.2/vllm_mlx/cli.py +1114 -0
  90. rapid_mlx-0.3.2/vllm_mlx/cloud_router.py +192 -0
  91. rapid_mlx-0.3.2/vllm_mlx/embedding.py +105 -0
  92. rapid_mlx-0.3.2/vllm_mlx/engine/__init__.py +29 -0
  93. rapid_mlx-0.3.2/vllm_mlx/engine/base.py +200 -0
  94. rapid_mlx-0.3.2/vllm_mlx/engine/batched.py +869 -0
  95. rapid_mlx-0.3.2/vllm_mlx/engine/hybrid.py +523 -0
  96. rapid_mlx-0.3.2/vllm_mlx/engine/simple.py +764 -0
  97. rapid_mlx-0.3.2/vllm_mlx/engine_core.py +696 -0
  98. rapid_mlx-0.3.2/vllm_mlx/gradio_app.py +395 -0
  99. rapid_mlx-0.3.2/vllm_mlx/gradio_text_app.py +181 -0
  100. rapid_mlx-0.3.2/vllm_mlx/mcp/__init__.py +85 -0
  101. rapid_mlx-0.3.2/vllm_mlx/mcp/client.py +328 -0
  102. rapid_mlx-0.3.2/vllm_mlx/mcp/config.py +186 -0
  103. rapid_mlx-0.3.2/vllm_mlx/mcp/executor.py +500 -0
  104. rapid_mlx-0.3.2/vllm_mlx/mcp/manager.py +301 -0
  105. rapid_mlx-0.3.2/vllm_mlx/mcp/security.py +698 -0
  106. rapid_mlx-0.3.2/vllm_mlx/mcp/tools.py +174 -0
  107. rapid_mlx-0.3.2/vllm_mlx/mcp/types.py +189 -0
  108. rapid_mlx-0.3.2/vllm_mlx/memory_cache.py +1043 -0
  109. rapid_mlx-0.3.2/vllm_mlx/mllm_batch_generator.py +904 -0
  110. rapid_mlx-0.3.2/vllm_mlx/mllm_cache.py +461 -0
  111. rapid_mlx-0.3.2/vllm_mlx/mllm_scheduler.py +981 -0
  112. rapid_mlx-0.3.2/vllm_mlx/model_auto_config.py +115 -0
  113. rapid_mlx-0.3.2/vllm_mlx/model_registry.py +185 -0
  114. rapid_mlx-0.3.2/vllm_mlx/model_runner.py +476 -0
  115. rapid_mlx-0.3.2/vllm_mlx/models/__init__.py +15 -0
  116. rapid_mlx-0.3.2/vllm_mlx/models/llm.py +798 -0
  117. rapid_mlx-0.3.2/vllm_mlx/models/mllm.py +1886 -0
  118. rapid_mlx-0.3.2/vllm_mlx/multimodal_processor.py +452 -0
  119. rapid_mlx-0.3.2/vllm_mlx/optimizations.py +209 -0
  120. rapid_mlx-0.3.2/vllm_mlx/output_collector.py +211 -0
  121. rapid_mlx-0.3.2/vllm_mlx/paged_cache.py +1333 -0
  122. rapid_mlx-0.3.2/vllm_mlx/patches/__init__.py +1 -0
  123. rapid_mlx-0.3.2/vllm_mlx/patches/qwen3_next_mtp.py +96 -0
  124. rapid_mlx-0.3.2/vllm_mlx/platform.py +332 -0
  125. rapid_mlx-0.3.2/vllm_mlx/plugin.py +155 -0
  126. rapid_mlx-0.3.2/vllm_mlx/prefix_cache.py +1080 -0
  127. rapid_mlx-0.3.2/vllm_mlx/reasoning/__init__.py +104 -0
  128. rapid_mlx-0.3.2/vllm_mlx/reasoning/base.py +128 -0
  129. rapid_mlx-0.3.2/vllm_mlx/reasoning/deepseek_r1_parser.py +152 -0
  130. rapid_mlx-0.3.2/vllm_mlx/reasoning/gpt_oss_parser.py +214 -0
  131. rapid_mlx-0.3.2/vllm_mlx/reasoning/harmony_parser.py +163 -0
  132. rapid_mlx-0.3.2/vllm_mlx/reasoning/minimax_parser.py +288 -0
  133. rapid_mlx-0.3.2/vllm_mlx/reasoning/qwen3_parser.py +87 -0
  134. rapid_mlx-0.3.2/vllm_mlx/reasoning/think_parser.py +223 -0
  135. rapid_mlx-0.3.2/vllm_mlx/request.py +217 -0
  136. rapid_mlx-0.3.2/vllm_mlx/scheduler.py +2519 -0
  137. rapid_mlx-0.3.2/vllm_mlx/server.py +3271 -0
  138. rapid_mlx-0.3.2/vllm_mlx/speculative/__init__.py +8 -0
  139. rapid_mlx-0.3.2/vllm_mlx/speculative/prompt_lookup.py +324 -0
  140. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/__init__.py +91 -0
  141. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/abstract_tool_parser.py +372 -0
  142. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/auto_tool_parser.py +361 -0
  143. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/deepseek_tool_parser.py +170 -0
  144. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/deepseekv31_tool_parser.py +309 -0
  145. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/functionary_tool_parser.py +193 -0
  146. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/glm47_tool_parser.py +188 -0
  147. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/granite_tool_parser.py +147 -0
  148. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/harmony_tool_parser.py +236 -0
  149. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/hermes_tool_parser.py +336 -0
  150. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/kimi_tool_parser.py +160 -0
  151. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/llama_tool_parser.py +131 -0
  152. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/minimax_tool_parser.py +243 -0
  153. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/mistral_tool_parser.py +262 -0
  154. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/nemotron_tool_parser.py +166 -0
  155. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/qwen3coder_tool_parser.py +538 -0
  156. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/qwen_tool_parser.py +157 -0
  157. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/seed_oss_tool_parser.py +541 -0
  158. rapid_mlx-0.3.2/vllm_mlx/tool_parsers/xlam_tool_parser.py +177 -0
  159. rapid_mlx-0.3.2/vllm_mlx/utils/__init__.py +6 -0
  160. rapid_mlx-0.3.2/vllm_mlx/utils/chat_template.py +158 -0
  161. rapid_mlx-0.3.2/vllm_mlx/utils/chat_templates.py +225 -0
  162. rapid_mlx-0.3.2/vllm_mlx/utils/decode.py +69 -0
  163. rapid_mlx-0.3.2/vllm_mlx/utils/mamba_cache.py +214 -0
  164. rapid_mlx-0.3.2/vllm_mlx/utils/tokenizer.py +155 -0
  165. rapid_mlx-0.3.2/vllm_mlx/vision_embedding_cache.py +411 -0
  166. rapid_mlx-0.3.2/vllm_mlx/worker.py +266 -0
@@ -0,0 +1,556 @@
1
+ Metadata-Version: 2.4
2
+ Name: rapid-mlx
3
+ Version: 0.3.2
4
+ Summary: Rapid-MLX — AI inference for Apple Silicon. Drop-in OpenAI API, 2-4x faster than Ollama.
5
+ Author: vllm-mlx contributors
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/raullenchai/Rapid-MLX
8
+ Project-URL: Documentation, https://github.com/raullenchai/Rapid-MLX#readme
9
+ Project-URL: Repository, https://github.com/raullenchai/Rapid-MLX
10
+ Keywords: llm,mlx,apple-silicon,vllm,inference,transformers
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Operating System :: MacOS
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ Requires-Dist: mlx>=0.29.0
25
+ Requires-Dist: mlx-lm>=0.30.5
26
+ Requires-Dist: transformers>=5.0.0
27
+ Requires-Dist: tokenizers>=0.19.0
28
+ Requires-Dist: huggingface-hub>=0.23.0
29
+ Requires-Dist: numpy>=1.24.0
30
+ Requires-Dist: pillow>=10.0.0
31
+ Requires-Dist: tqdm>=4.66.0
32
+ Requires-Dist: pyyaml>=6.0
33
+ Requires-Dist: requests>=2.28.0
34
+ Requires-Dist: tabulate>=0.9.0
35
+ Requires-Dist: psutil>=5.9.0
36
+ Requires-Dist: fastapi>=0.100.0
37
+ Requires-Dist: uvicorn>=0.23.0
38
+ Requires-Dist: mcp>=1.0.0
39
+ Requires-Dist: jsonschema>=4.0.0
40
+ Provides-Extra: vision
41
+ Requires-Dist: mlx-vlm>=0.1.0; extra == "vision"
42
+ Requires-Dist: opencv-python>=4.8.0; extra == "vision"
43
+ Requires-Dist: torch>=2.3.0; extra == "vision"
44
+ Requires-Dist: torchvision>=0.18.0; extra == "vision"
45
+ Provides-Extra: embeddings
46
+ Requires-Dist: mlx-embeddings>=0.0.5; extra == "embeddings"
47
+ Provides-Extra: chat
48
+ Requires-Dist: gradio>=4.0.0; extra == "chat"
49
+ Requires-Dist: pytz>=2024.1; extra == "chat"
50
+ Provides-Extra: all
51
+ Requires-Dist: rapid-mlx[chat,embeddings,vision]; extra == "all"
52
+ Provides-Extra: dev
53
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
54
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
55
+ Requires-Dist: black>=23.0.0; extra == "dev"
56
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
57
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
58
+ Provides-Extra: vllm
59
+ Requires-Dist: vllm>=0.4.0; extra == "vllm"
60
+ Provides-Extra: guided
61
+ Requires-Dist: outlines[mlxlm]>=1.0.0; extra == "guided"
62
+ Provides-Extra: audio
63
+ Requires-Dist: mlx-audio>=0.2.9; extra == "audio"
64
+ Requires-Dist: sounddevice>=0.4.0; extra == "audio"
65
+ Requires-Dist: soundfile>=0.12.0; extra == "audio"
66
+ Requires-Dist: scipy>=1.10.0; extra == "audio"
67
+ Requires-Dist: numba>=0.57.0; extra == "audio"
68
+ Requires-Dist: tiktoken>=0.5.0; extra == "audio"
69
+ Requires-Dist: misaki[ja,zh]>=0.5.0; extra == "audio"
70
+ Requires-Dist: spacy>=3.7.0; extra == "audio"
71
+ Requires-Dist: num2words>=0.5.0; extra == "audio"
72
+ Requires-Dist: loguru>=0.7.0; extra == "audio"
73
+ Requires-Dist: phonemizer>=3.2.0; extra == "audio"
74
+ Requires-Dist: ordered_set>=4.1.0; extra == "audio"
75
+ Requires-Dist: cn2an>=0.5.0; extra == "audio"
76
+ Requires-Dist: fugashi>=1.3.0; extra == "audio"
77
+ Requires-Dist: unidic-lite>=1.0.0; extra == "audio"
78
+ Requires-Dist: jieba>=0.42.0; extra == "audio"
79
+
80
+ # Rapid-MLX
81
+
82
+ **Run AI on your Mac. Faster than anything else.**
83
+
84
+ [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](LICENSE)
85
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
86
+ [![Tests](https://img.shields.io/badge/tests-1900%2B-brightgreen.svg)](tests/)
87
+ [![Apple Silicon](https://img.shields.io/badge/Apple_Silicon-M1%20|%20M2%20|%20M3%20|%20M4-black.svg?logo=apple)](https://support.apple.com/en-us/HT211814)
88
+
89
+ Drop-in OpenAI API replacement for Apple Silicon. 2-4x faster than Ollama, 100% tool-calling, sub-200ms cached TTFT.
90
+
91
+ <p align="center">
92
+ <img src="https://raw.githubusercontent.com/raullenchai/Rapid-MLX/main/docs/assets/demo.gif" alt="Rapid-MLX vs Ollama — 2.4x faster on Qwen3.5-9B" width="800">
93
+ <br>
94
+ <em>Same model (Qwen3.5-9B), same Mac, head-to-head. Rapid-MLX: 79 tok/s vs Ollama: 33 tok/s.</em>
95
+ </p>
96
+
97
+ | | Your Mac runs AI | How fast | What works |
98
+ |:---|:---:|:---:|:---:|
99
+ | **16 GB MacBook Air** | Qwen3.5-4B | 168 tok/s | Chat, coding, tools |
100
+ | **64 GB Mac Mini / Studio** | Qwen3.5-35B | 83 tok/s | Best balance of smart + fast |
101
+ | **96+ GB Mac Studio / Pro** | Qwen3.5-122B | 57 tok/s | Frontier-level intelligence |
102
+
103
+ ---
104
+
105
+ ## Quick Start
106
+
107
+ ```bash
108
+ # 1. Install (one command, checks Apple Silicon + Python automatically)
109
+ curl -fsSL https://raw.githubusercontent.com/raullenchai/Rapid-MLX/main/install.sh | bash
110
+ source ~/.zshrc # or restart your terminal
111
+
112
+ # 2. Pick a model and start serving (first run downloads ~5 GB)
113
+ rapid-mlx serve mlx-community/Qwen3.5-9B-4bit --port 8000
114
+
115
+ # 3. Test it — in a new terminal:
116
+ curl http://localhost:8000/v1/chat/completions \
117
+ -H "Content-Type: application/json" \
118
+ -d '{"model":"default","messages":[{"role":"user","content":"Hello!"}]}'
119
+ ```
120
+
121
+ That's it — you now have an AI server on `localhost:8000`. Works with Claude Code, Cursor, Aider, Open WebUI, or any app that speaks the OpenAI API.
122
+
123
+ <details>
124
+ <summary>Other install methods</summary>
125
+
126
+ **Homebrew:**
127
+ ```bash
128
+ brew install raullenchai/rapid-mlx/rapid-mlx
129
+ ```
130
+
131
+ **pip:**
132
+ ```bash
133
+ pip install rapid-mlx
134
+ ```
135
+
136
+ **From source** (for development):
137
+ ```bash
138
+ git clone https://github.com/raullenchai/Rapid-MLX.git
139
+ cd Rapid-MLX && pip install -e .
140
+ ```
141
+
142
+ **Vision models** (adds torch + torchvision, ~2.5 GB extra):
143
+ ```bash
144
+ pip install 'rapid-mlx[vision] @ git+https://github.com/raullenchai/Rapid-MLX.git'
145
+ ```
146
+ </details>
147
+
148
+ **Try it with Python:**
149
+
150
+ ```python
151
+ from openai import OpenAI
152
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
153
+
154
+ response = client.chat.completions.create(
155
+ model="default",
156
+ messages=[{"role": "user", "content": "Hello!"}],
157
+ )
158
+ print(response.choices[0].message.content)
159
+ ```
160
+
161
+ ---
162
+
163
+ ## Works With
164
+
165
+ | Client | Status | Notes |
166
+ |--------|--------|-------|
167
+ | [Claude Code](https://claude.ai/claude-code) | Verified | Env var config, streaming tools |
168
+ | [Cursor](https://cursor.com) | Verified | Settings UI config |
169
+ | [Aider](https://aider.chat) | Verified | Code editing agent |
170
+ | [Open WebUI](https://github.com/open-webui/open-webui) | Verified | Self-hosted ChatGPT UI, Docker one-liner |
171
+ | [Continue.dev](https://continue.dev) | Verified | YAML config, VS Code + JetBrains |
172
+ | [OpenClaw](https://github.com/nicepkg/openclaw) | Verified | 14 tools, multi-round, streaming |
173
+ | [OpenCode](https://github.com/opencode-ai/opencode) | Verified | JSON config |
174
+ | [LangChain](https://langchain.com) | Compatible | Standard OpenAI client |
175
+ | Any OpenAI SDK client | Compatible | Drop-in `base_url` swap |
176
+
177
+ <details>
178
+ <summary><strong>Client setup instructions</strong></summary>
179
+
180
+ **Claude Code:**
181
+ ```bash
182
+ OPENAI_BASE_URL=http://localhost:8000/v1 claude
183
+ # Or add to ~/.claude/settings.json:
184
+ # { "env": { "OPENAI_BASE_URL": "http://localhost:8000/v1" } }
185
+ ```
186
+
187
+ **Cursor:** Settings → Models → OpenAI API Base → `http://localhost:8000/v1`
188
+
189
+ **Continue.dev** (`~/.continue/config.yaml`):
190
+ ```yaml
191
+ models:
192
+ - name: rapid-mlx
193
+ provider: openai
194
+ model: default
195
+ apiBase: http://localhost:8000/v1
196
+ apiKey: not-needed
197
+ ```
198
+
199
+ **Aider:**
200
+ ```bash
201
+ aider --openai-api-base http://localhost:8000/v1 --openai-api-key not-needed
202
+ ```
203
+
204
+ **Open WebUI** (Docker one-liner):
205
+ ```bash
206
+ docker run -d -p 3000:8080 \
207
+ --add-host=host.docker.internal:host-gateway \
208
+ -e ENABLE_OLLAMA_API=False \
209
+ -e OPENAI_API_BASE_URL=http://host.docker.internal:8000/v1 \
210
+ -e OPENAI_API_KEY=not-needed \
211
+ -v open-webui:/app/backend/data \
212
+ --name open-webui \
213
+ ghcr.io/open-webui/open-webui:main
214
+ ```
215
+
216
+ **OpenCode** (`~/.config/opencode/opencode.json`):
217
+ ```json
218
+ {
219
+ "provider": {
220
+ "openai-compatible": {
221
+ "apiKey": "not-needed",
222
+ "models": {
223
+ "default": {
224
+ "id": "default",
225
+ "name": "rapid-mlx local",
226
+ "api_base": "http://localhost:8000/v1"
227
+ }
228
+ }
229
+ }
230
+ }
231
+ }
232
+ ```
233
+
234
+ </details>
235
+
236
+ ---
237
+
238
+ ## Choose Your Model
239
+
240
+ ### What fits my Mac?
241
+
242
+ Model weights must fit in unified memory. If Activity Monitor shows red memory pressure, the model is too big — switch to a smaller one or a lower quantization.
243
+
244
+ | Your Mac | Best Model | RAM Used | Speed | Quality |
245
+ |----------|-----------|---------|-------|---------|
246
+ | **16 GB** MacBook Air/Pro | [Qwen3.5-4B 4bit](https://huggingface.co/mlx-community/Qwen3.5-4B-MLX-4bit) | 2.4 GB | 168 tok/s | Good for chat and simple tasks |
247
+ | **24 GB** MacBook Pro | [Qwen3.5-9B 4bit](https://huggingface.co/mlx-community/Qwen3.5-9B-4bit) | 5.1 GB | 108 tok/s | Great all-rounder |
248
+ | **32 GB** Mac Mini / Studio | [Qwen3.5-27B 4bit](https://huggingface.co/mlx-community/Qwen3.5-27B-4bit) | 15.3 GB | 39 tok/s | Solid coding model |
249
+ | **64 GB** Mac Mini / Studio | [Qwen3.5-35B-A3B 8bit](https://huggingface.co/mlx-community/Qwen3.5-35B-A3B-8bit) | 37 GB | 83 tok/s | **Sweet spot** — smart + fast |
250
+ | **96 GB** Mac Studio / Pro | [Qwen3.5-122B mxfp4](https://huggingface.co/nightmedia/Qwen3.5-122B-A10B-Text-mxfp4-mlx) | 65 GB | 57 tok/s | Best model, fits comfortably |
251
+ | **96+ GB** Mac Studio / Pro | [Qwen3.5-122B 8bit](https://huggingface.co/mlx-community/Qwen3.5-122B-A10B-8bit) | 130 GB | 44 tok/s | Maximum quality |
252
+
253
+ ### Copy-paste commands
254
+
255
+ Pick the one that matches your Mac:
256
+
257
+ ```bash
258
+ # 16 GB — lightweight, fast
259
+ rapid-mlx serve mlx-community/Qwen3.5-4B-MLX-4bit --port 8000
260
+
261
+ # 24 GB — best small model
262
+ rapid-mlx serve mlx-community/Qwen3.5-9B-4bit --port 8000
263
+
264
+ # 64 GB — sweet spot
265
+ rapid-mlx serve mlx-community/Qwen3.5-35B-A3B-8bit --prefill-step-size 8192 --port 8000
266
+
267
+ # 96+ GB — best model
268
+ rapid-mlx serve nightmedia/Qwen3.5-122B-A10B-Text-mxfp4-mlx --kv-bits 8 --prefill-step-size 8192 --port 8000
269
+
270
+ # Coding agent — fast MoE, great for Claude Code / Cursor
271
+ rapid-mlx serve lmstudio-community/Qwen3-Coder-Next-MLX-4bit --prefill-step-size 8192 --port 8000
272
+
273
+ # Vision — image understanding (requires: pip install 'rapid-mlx[vision]')
274
+ rapid-mlx serve mlx-community/Qwen3-VL-4B-Instruct-MLX-4bit --mllm --port 8000
275
+ ```
276
+
277
+ <details>
278
+ <summary><strong>Parser auto-detection & manual overrides</strong></summary>
279
+
280
+ Parsers are **auto-detected from the model name** — you don't need to specify `--tool-call-parser` or `--reasoning-parser` for supported families. Explicit flags always override auto-detection.
281
+
282
+ | Model Family | Auto-detected `--tool-call-parser` | Auto-detected `--reasoning-parser` | Notes |
283
+ |-------------|---------------------|---------------------|-------|
284
+ | Qwen3.5 (all sizes) | `hermes` | `qwen3` | **Recommended** — 100% tool calling |
285
+ | Qwen3-Coder-Next | `hermes` | *(none)* | Fast coding, non-thinking mode |
286
+ | DeepSeek R1-0528 / V3.1 | `deepseek_v31` | `deepseek_r1` | Dedicated V3.1 parser |
287
+ | DeepSeek R1 (older) | `deepseek` | `deepseek_r1` | With reasoning |
288
+ | DeepSeek V3 / V2.5 | `deepseek` | *(none)* | No reasoning parser |
289
+ | GLM-4.7 | `glm47` | *(none)* | 100% tool calling |
290
+ | MiniMax-M2.5 | `minimax` | `minimax` | XML tool format |
291
+ | GPT-OSS | `harmony` | `harmony` | Native format |
292
+ | Kimi-Linear | `kimi` | *(none)* | Kimi tool format |
293
+ | Llama 3.x | `llama` | *(none)* | JSON tool format |
294
+ | Mistral / Devstral | `hermes` | *(none)* | Hermes-compatible |
295
+ | Gemma | `hermes` | *(none)* | Hermes-compatible |
296
+ | Phi-3/4 | `hermes` | *(none)* | Hermes-compatible |
297
+
298
+ All 17 parsers include automatic recovery — if a quantized model outputs broken tool calls as text, they're auto-converted back to structured format.
299
+
300
+ </details>
301
+
302
+ ---
303
+
304
+ ## Benchmarks
305
+
306
+ 22 models tested across 6 engines on **Mac Studio M3 Ultra (256GB)**. Rapid-MLX uses Apple's [MLX framework](https://github.com/ml-explore/mlx) — purpose-built for unified memory with native Metal compute kernels — which is why it beats C++-based engines (Ollama, llama.cpp) on most models. **#1 on 16 of 18 benchmarked models.**
307
+
308
+ | Model | Rapid-MLX | Best Alternative | Speedup |
309
+ |-------|----------|-----------------|---------|
310
+ | **Phi-4 Mini 14B** | **180** tok/s | 77 (mlx-lm) / 56 (Ollama) | **2.3x** / **3.2x** |
311
+ | **Qwen3.5-4B** | **168** tok/s | 155 (upstream) | **1.1x** |
312
+ | **GPT-OSS 20B** | **127** tok/s · 100% tools | 79 (upstream) | **1.6x** |
313
+ | **Qwen3.5-9B** | **108** tok/s | 46 (Ollama) | **2.3x** |
314
+ | **Kimi-Linear-48B** | **94** tok/s · 100% tools | — (only engine) | — |
315
+ | **Qwen3.5-35B-A3B** | **83** tok/s · 100% tools | 75 (oMLX) | **1.1x** |
316
+ | **Qwen3-Coder 80B** | **74** tok/s · 100% tools | 69 (upstream) | **1.1x** |
317
+ | **Qwen3.5-122B** | **44** tok/s · 100% tools | 43 (upstream) | ~1.0x |
318
+
319
+ *Full benchmark data with all 18 models, TTFT tables, DeltaNet snapshots, and engine comparison below.*
320
+
321
+ <details>
322
+ <summary><strong>TTFT — Prompt Cache Advantage</strong></summary>
323
+
324
+ Prompt cache keeps multi-turn conversations fast. For standard transformers, KV cache trimming gives sub-100ms TTFT. For hybrid RNN models (Qwen3.5 DeltaNet), we use state snapshots — the first technique to bring prompt cache to non-trimmable architectures on MLX.
325
+
326
+ **Pure KV cache (transformers):**
327
+
328
+ | Model | Rapid-MLX (cached) | vllm-mlx (upstream) | Speedup |
329
+ |-------|-------------------|-------------------|---------|
330
+ | Kimi-Linear-48B | **0.08s** | — | — |
331
+ | Llama 3.2 3B | **0.10s** | — | — |
332
+ | Hermes-3-Llama 8B | **0.10s** | 0.18s | 1.8x |
333
+ | Phi-4 Mini 14B | **0.13s** | 0.15s | 1.2x |
334
+ | Devstral-Small-2 24B | **0.13s** | 0.38s | 2.9x |
335
+ | Mistral Small 24B | **0.13s** | 0.38s | 2.9x |
336
+ | GLM-4.7-Flash 9B | **0.13s** | 0.23s | 1.8x |
337
+ | GLM-4.5-Air | **0.14s** | 0.47s | 3.4x |
338
+ | Qwen3-Coder-Next 80B | **0.16s** | 0.27s | 1.7x |
339
+ | GPT-OSS 20B | **0.16s** | 0.27s | 1.7x |
340
+ | Qwen3.5-9B | **0.22s** | 0.26s | 1.2x |
341
+
342
+ **DeltaNet state snapshots (hybrid RNN + attention):**
343
+
344
+ Qwen3.5 uses Gated DeltaNet (75% RNN) + full attention (25% KV). Other engines recreate the entire cache from scratch every request — we snapshot the RNN state at the system prompt boundary, restoring in ~0.1ms instead of re-running hundreds of tokens through the recurrent layers.
345
+
346
+ | Model | Cold TTFT | Snapshot TTFT | Speedup |
347
+ |-------|-----------|---------------|---------|
348
+ | Qwen3-Coder-Next 6bit (48L) | 0.66s | **0.16s** | **4.3x** |
349
+ | Qwen3.5-35B-A3B 8bit (40L) | 0.49s | **0.19s** | **2.6x** |
350
+ | Qwen3.5-27B 4bit (40L) | 0.58s | **0.27s** | **2.1x** |
351
+ | Qwen3.5-9B 4bit (40L) | 0.27s | **0.22s** | **1.2x** |
352
+ | Qwen3.5-4B 4bit (32L) | 0.24s | **0.16s** | **1.5x** |
353
+
354
+ </details>
355
+
356
+ <details>
357
+ <summary><strong>Capability Comparison</strong></summary>
358
+
359
+ | Feature | Rapid-MLX | oMLX | Ollama | llama.cpp | mlx-lm |
360
+ |---------|-----------|------|--------|-----------|--------|
361
+ | **Tool calling** | 100% (Qwen/GLM/GPT-OSS/Kimi) | N/A | 100% (Qwen) | 80% (Phi-4) | N/A |
362
+ | **Tool call recovery** | 100% | N/A | 100% | 100% | N/A |
363
+ | **Tool injection fallback** | Yes | No | No | No | No |
364
+ | **Think-tag leak** | 0% | N/A | 0% | 0% | N/A |
365
+ | **Prompt cache** | KV + DeltaNet | No | No | No | No |
366
+ | **Vision** | Yes | Yes | Yes | No | No |
367
+ | **Audio (STT/TTS)** | Yes | No | No | No | No |
368
+ | **17 tool parsers** | Yes | No | No | No | No |
369
+ | **Cloud routing** | Yes | No | No | No | No |
370
+ | **Streaming** | Yes | Yes | Yes | Yes | No |
371
+ | **OpenAI API** | Yes | Yes | Yes | Yes | No |
372
+
373
+ </details>
374
+
375
+ <details>
376
+ <summary><strong>Optimization Techniques Per Model</strong></summary>
377
+
378
+ | Technique | What it does | Models |
379
+ |-----------|-------------|--------|
380
+ | **KV prompt cache** | Trim KV cache to common prefix, skip re-prefill | All transformer models |
381
+ | **DeltaNet state snapshots** | Deep-copy RNN state at prefix boundary, restore in ~0.1ms | Qwen3.5 (4B, 9B, 27B, 35B, 122B), Qwen3-Coder-Next |
382
+ | **Hybrid cache sync** | Keep trimmable KV + non-trimmable RNN layers in sync | Qwen3.5 (Gated DeltaNet + attention) |
383
+ | **Tool logits bias** | Jump-forward decoding — bias logits toward structured tokens | All models with `--enable-tool-logits-bias` |
384
+ | **Auto tool recovery** | Detect broken text-format tool calls, convert to structured | All 17 parser formats |
385
+ | **Speculative decoding** | Draft model generates candidates, main model verifies | Any model + `--draft-model` |
386
+ | **KV quantization** | 4/8-bit KV cache for longer contexts in less memory | All models with `--kv-bits` |
387
+ | **Prefill chunking** | Configurable step size for large-prompt throughput | All models |
388
+ | **Cloud routing** | Offload high-token requests to cloud LLM when local is slow | All models with `--cloud-model` |
389
+
390
+ </details>
391
+
392
+ <details>
393
+ <summary><strong>Eval benchmarks (17 models, 4 suites)</strong></summary>
394
+
395
+ 17 models across tool calling (30 scenarios), coding (HumanEval+), reasoning (MATH-500), and general knowledge (MMLU-Pro). All with `enable_thinking: false` on M3 Ultra.
396
+
397
+ | Model | Quant | RAM | Decode | Tools | Code | Reason | General | Avg |
398
+ |-------|-------|-----|--------|-------|------|--------|---------|-----|
399
+ | Qwen3.5-122B-A10B | 8bit | 129.8 GB | 44 t/s | 87% | **90%** | **90%** | **90%** | **89%** |
400
+ | Qwen3.5-122B-A10B | mxfp4 | 65.0 GB | 57 t/s | **90%** | **90%** | 80% | **90%** | 88% |
401
+ | Qwen3.5-35B-A3B | 8bit | 36.9 GB | 83 t/s | **90%** | **90%** | 80% | 80% | 85% |
402
+ | Qwen3-Coder-Next | 6bit | 64.8 GB | 66 t/s | 87% | **90%** | 80% | 70% | 82% |
403
+ | Qwen3-Coder-Next | 4bit | 44.9 GB | 74 t/s | **90%** | **90%** | 70% | 70% | 80% |
404
+ | GLM-4.5-Air | 4bit | 60.3 GB | 46 t/s | 73% | **90%** | 70% | 80% | 78% |
405
+ | GLM-4.7-Flash | 8bit | 31.9 GB | 58 t/s | 73% | **100%** | **90%** | 50% | 78% |
406
+ | Qwen3.5-27B | 4bit | 15.3 GB | 39 t/s | 83% | **90%** | 50% | 80% | 76% |
407
+ | Qwen3.5-35B-A3B | 4bit | 19.6 GB | 95 t/s | 87% | **90%** | 50% | 70% | 74% |
408
+ | Qwen3.5-9B | 4bit | 5.1 GB | 108 t/s | 83% | 70% | 60% | 70% | 71% |
409
+ | MiniMax-M2.5 | 4bit | 128.9 GB | 52 t/s | 87% | 10%\* | 80% | **90%** | 67% |
410
+ | Devstral-Small-2 | 4bit | 13.4 GB | 49 t/s | 17% | **90%** | 70% | 70% | 62% |
411
+ | GPT-OSS-20B | mxfp4-q8 | 12.1 GB | 127 t/s | 80% | 20% | 60% | **90%** | 62% |
412
+ | Qwen3.5-4B | 4bit | 2.4 GB | 168 t/s | 73% | 50% | 50% | 50% | 56% |
413
+ | Mistral-Small-3.2 | 4bit | 13.4 GB | 49 t/s | 17% | 80% | 60% | 60% | 54% |
414
+ | Hermes-3-Llama-8B | 4bit | 4.6 GB | 127 t/s | 17% | 20% | 30% | 40% | 27% |
415
+ | Qwen3-0.6B | 4bit | 0.4 GB | 365 t/s | 30% | 20% | 20% | 30% | 25% |
416
+
417
+ \* *MiniMax coding score likely affected by a code extraction parser issue, not model capability.*
418
+
419
+ </details>
420
+
421
+ *Benchmark script: [`scripts/benchmark_engines.py`](scripts/benchmark_engines.py). Run your own: `python scripts/benchmark_engines.py --engine rapid-mlx ollama --runs 3`. Eval suites: [evals/](evals/)*
422
+
423
+ ---
424
+
425
+ ## Features
426
+
427
+ ### Tool Calling
428
+
429
+ Full OpenAI-compatible tool calling with 17 parser formats and **automatic recovery when quantized models break**. Models at 4-bit degrade after multiple tool rounds — Rapid-MLX auto-detects broken output and converts it back to structured `tool_calls`.
430
+
431
+ ### Reasoning Separation
432
+
433
+ Models with chain-of-thought (Qwen3, DeepSeek-R1) output reasoning in a separate `reasoning_content` field — never mixed into `content`. 0% leak rate.
434
+
435
+ ### Prompt Cache
436
+
437
+ Persistent cache across requests — only new tokens are prefilled on each turn. For standard transformers, KV cache trimming. For hybrid models (Qwen3.5 DeltaNet), RNN state snapshots restore non-trimmable layers from memory instead of re-computing. 2-5x faster TTFT on all architectures. Always on, no flags needed.
438
+
439
+ ### Smart Cloud Routing
440
+
441
+ Large-context requests auto-route to a cloud LLM (GPT-5, Claude, etc.) when local prefill would be slow. Routing is based on the number of new tokens remaining after a cache hit. `--cloud-model openai/gpt-5 --cloud-threshold 20000`
442
+
443
+ ### Multimodal
444
+
445
+ Vision, audio (STT/TTS), video understanding, and text embeddings — all through the same OpenAI-compatible API.
446
+
447
+ <details>
448
+ <summary><strong>All features (37 total)</strong></summary>
449
+
450
+ **Tool Calling (15):** Text-format recovery, 17 parsers, streaming, tool logits bias (2-5x faster structured output), disconnect guard, think-tag filter, chunk-boundary leak fix, developer role normalization, logprobs API, system prompt tool injection fallback for incompatible chat templates, end-to-end agent simulation tests.
451
+
452
+ **Reasoning (3):** MiniMax/Qwen3/DeepSeek parsers, Chinese reasoning pattern recognition, clean `reasoning_content` field.
453
+
454
+ **Performance (9):** Prompt cache (KV trim + DeltaNet state snapshots), SSE template pre-computation, MTP (multi-token prediction), configurable prefill step size, KV cache quantization (4/8 bit), speculative decoding, cloud routing, frequency-aware cache eviction.
455
+
456
+ **Reliability (6):** Accurate `prompt_tokens` reporting, EOS cache fix, crash prevention on malformed `response_format`, GC control during generation, system prompt pinning, 1900+ tests.
457
+
458
+ **Multimodal (4):** Vision (Qwen-VL), audio STT (Whisper), audio TTS (Kokoro), text embeddings.
459
+
460
+ </details>
461
+
462
+ ---
463
+
464
+ <details>
465
+ <summary><strong>Server Flags Reference</strong></summary>
466
+
467
+ ### Core
468
+
469
+ | Flag | Description | Default |
470
+ |------|-------------|---------|
471
+ | `--model` | HuggingFace model name or local path | *(required)* |
472
+ | `--host` | Host to bind to | `0.0.0.0` |
473
+ | `--port` | Port to bind to | `8000` |
474
+ | `--max-tokens` | Default max tokens for generation | `32768` |
475
+ | `--continuous-batching` | Multi-user mode with scheduler | off |
476
+
477
+ ### Tool Calling & Reasoning
478
+
479
+ | Flag | Description | Default |
480
+ |------|-------------|---------|
481
+ | `--tool-call-parser` | Parser: `hermes`, `minimax`, `qwen`, `llama`, `deepseek`, etc. | *(auto-detected)* |
482
+ | `--reasoning-parser` | Parser: `qwen3`, `deepseek_r1`, `minimax`, `gpt_oss` | *(auto-detected)* |
483
+ | `--enable-tool-logits-bias` | Jump-forward decoding for faster tool calls | off |
484
+
485
+ ### Performance
486
+
487
+ | Flag | Description | Default |
488
+ |------|-------------|---------|
489
+ | `--prefill-step-size` | Tokens per prefill chunk | `2048` |
490
+ | `--kv-bits` | KV cache quantization: `4` or `8` bit | *(full precision)* |
491
+ | `--draft-model` | Draft model for speculative decoding | *(none)* |
492
+ | `--num-draft-tokens` | Speculative tokens per step | `4` |
493
+
494
+ ### Cloud Routing
495
+
496
+ | Flag | Description | Default |
497
+ |------|-------------|---------|
498
+ | `--cloud-model` | litellm model string (e.g. `openai/gpt-5`) | *(disabled)* |
499
+ | `--cloud-threshold` | New token threshold to trigger cloud routing | `20000` |
500
+
501
+ ### Security & Other
502
+
503
+ | Flag | Description | Default |
504
+ |------|-------------|---------|
505
+ | `--api-key` | API key for authentication | *(no auth)* |
506
+ | `--rate-limit` | Requests per minute per client | *(unlimited)* |
507
+ | `--timeout` | Request timeout in seconds | `300` |
508
+ | `--mllm` | Force multimodal (vision) mode | auto-detect |
509
+ | `--mcp-config` | MCP configuration file for tool integration | *(none)* |
510
+ | `--embedding-model` | Pre-load embedding model at startup | *(none)* |
511
+
512
+ </details>
513
+
514
+ <details>
515
+ <summary><strong>Troubleshooting</strong></summary>
516
+
517
+ **"parameters not found in model" warnings at startup** — Normal for VLMs. Vision weights are auto-skipped.
518
+
519
+ **Out of memory / very slow (<5 tok/s)** — Model too big. Check [What fits my Mac?](#what-fits-my-mac) Use `--kv-bits 4` for long contexts. Close other apps.
520
+
521
+ **Empty responses** — Remove `--reasoning-parser` for non-thinking models. Only use it with Qwen3 (thinking), MiniMax, DeepSeek-R1.
522
+
523
+ **Tool calls as plain text** — Set the correct `--tool-call-parser` for your model. Even without it, Rapid-MLX auto-recovers most cases.
524
+
525
+ **Slow first response** — Cold start is normal. Subsequent turns hit prompt cache (10-30x faster). Use `--prefill-step-size 8192` to speed up cold starts.
526
+
527
+ **Server hangs after client disconnect** — Fixed in this fork. Upgrade to latest.
528
+
529
+ </details>
530
+
531
+ ---
532
+
533
+ ## Roadmap
534
+
535
+ | Technique | Expected Gain | Status |
536
+ |-----------|---------------|--------|
537
+ | **DeltaNet state snapshots** — hybrid RNN cache reuse for Qwen3.5 | 1.5-4.3x TTFT | **Done** |
538
+ | **SSE streaming optimization** — pre-computed templates, micro-opts | +10.5% composite | **Done** |
539
+ | **Tool injection fallback** — system prompt injection for broken templates | 0→100% tools | **Done** |
540
+ | [MTP in SimpleEngine](https://arxiv.org/abs/2404.19737) — multi-token prediction | 1.4x decode | **Done** |
541
+ | [Standard Speculative Decode](https://arxiv.org/abs/2302.01318) — draft model acceleration | 1.5-2.3x decode | Not started |
542
+ | [EAGLE-3](https://arxiv.org/abs/2503.01840) — feature-level draft on Metal | 3-6.5x decode | Not started |
543
+ | [ReDrafter](https://arxiv.org/abs/2403.09919) — Apple's RNN draft head | 1.4-1.5x decode | Not started |
544
+ | Auto-optimization per model — zero-config best settings | N/A | Not started |
545
+
546
+ ---
547
+
548
+ ## Contributing
549
+
550
+ Issues and PRs welcome at [github.com/raullenchai/Rapid-MLX](https://github.com/raullenchai/Rapid-MLX).
551
+
552
+ We need community data — hardware benchmarks, client verifications, model reports. If you test a model on your Mac, [open an issue](https://github.com/raullenchai/Rapid-MLX/issues/new) with your hardware, model, decode speed, and what worked.
553
+
554
+ ## License
555
+
556
+ Apache 2.0 — see [LICENSE](LICENSE).