vmlx 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164) hide show
  1. vmlx-1.0.0/PKG-INFO +340 -0
  2. vmlx-1.0.0/README.md +262 -0
  3. vmlx-1.0.0/pyproject.toml +146 -0
  4. vmlx-1.0.0/setup.cfg +4 -0
  5. vmlx-1.0.0/tests/test_anthropic_adapter.py +1004 -0
  6. vmlx-1.0.0/tests/test_api_models.py +798 -0
  7. vmlx-1.0.0/tests/test_api_utils.py +429 -0
  8. vmlx-1.0.0/tests/test_audio.py +362 -0
  9. vmlx-1.0.0/tests/test_audit_fixes.py +609 -0
  10. vmlx-1.0.0/tests/test_batching.py +648 -0
  11. vmlx-1.0.0/tests/test_batching_deterministic.py +459 -0
  12. vmlx-1.0.0/tests/test_cache_isolation.py +197 -0
  13. vmlx-1.0.0/tests/test_cache_types.py +253 -0
  14. vmlx-1.0.0/tests/test_cancellation.py +63 -0
  15. vmlx-1.0.0/tests/test_cli_commands.py +292 -0
  16. vmlx-1.0.0/tests/test_continuous_batching.py +308 -0
  17. vmlx-1.0.0/tests/test_disk_cache_unit.py +239 -0
  18. vmlx-1.0.0/tests/test_embeddings.py +293 -0
  19. vmlx-1.0.0/tests/test_emoji_comprehensive.py +310 -0
  20. vmlx-1.0.0/tests/test_engine_audit.py +2155 -0
  21. vmlx-1.0.0/tests/test_gemma3_27b_comprehensive.py +384 -0
  22. vmlx-1.0.0/tests/test_health_endpoint.py +170 -0
  23. vmlx-1.0.0/tests/test_hybrid_batching.py +1044 -0
  24. vmlx-1.0.0/tests/test_jit_toggle.py +116 -0
  25. vmlx-1.0.0/tests/test_kv_quant.py +145 -0
  26. vmlx-1.0.0/tests/test_llm.py +115 -0
  27. vmlx-1.0.0/tests/test_mcp_security.py +760 -0
  28. vmlx-1.0.0/tests/test_medgemma_comprehensive.py +494 -0
  29. vmlx-1.0.0/tests/test_memory_cache.py +623 -0
  30. vmlx-1.0.0/tests/test_mllm.py +668 -0
  31. vmlx-1.0.0/tests/test_mllm_cache.py +1230 -0
  32. vmlx-1.0.0/tests/test_mllm_continuous_batching.py +484 -0
  33. vmlx-1.0.0/tests/test_mllm_message_serialization.py +1373 -0
  34. vmlx-1.0.0/tests/test_mllm_scheduler_cache.py +649 -0
  35. vmlx-1.0.0/tests/test_mllm_scheduler_stability.py +223 -0
  36. vmlx-1.0.0/tests/test_model_config_registry.py +867 -0
  37. vmlx-1.0.0/tests/test_model_inspector.py +505 -0
  38. vmlx-1.0.0/tests/test_model_name.py +682 -0
  39. vmlx-1.0.0/tests/test_model_registry.py +257 -0
  40. vmlx-1.0.0/tests/test_native_tool_format.py +358 -0
  41. vmlx-1.0.0/tests/test_optimizations.py +64 -0
  42. vmlx-1.0.0/tests/test_paged_cache.py +727 -0
  43. vmlx-1.0.0/tests/test_paged_cache_benefits.py +464 -0
  44. vmlx-1.0.0/tests/test_paged_cache_real_inference.py +269 -0
  45. vmlx-1.0.0/tests/test_paged_cache_real_model.py +587 -0
  46. vmlx-1.0.0/tests/test_paged_cache_unit.py +184 -0
  47. vmlx-1.0.0/tests/test_platform.py +111 -0
  48. vmlx-1.0.0/tests/test_prefix_cache.py +547 -0
  49. vmlx-1.0.0/tests/test_reasoning_parser.py +1352 -0
  50. vmlx-1.0.0/tests/test_reasoning_tool_interaction.py +780 -0
  51. vmlx-1.0.0/tests/test_request.py +475 -0
  52. vmlx-1.0.0/tests/test_request_cancellation.py +59 -0
  53. vmlx-1.0.0/tests/test_reranker_endpoint.py +94 -0
  54. vmlx-1.0.0/tests/test_server.py +759 -0
  55. vmlx-1.0.0/tests/test_simple_engine.py +213 -0
  56. vmlx-1.0.0/tests/test_speculative.py +845 -0
  57. vmlx-1.0.0/tests/test_streaming_detokenizer.py +259 -0
  58. vmlx-1.0.0/tests/test_streaming_json_encoder.py +438 -0
  59. vmlx-1.0.0/tests/test_streaming_latency.py +338 -0
  60. vmlx-1.0.0/tests/test_streaming_reasoning.py +1536 -0
  61. vmlx-1.0.0/tests/test_structured_output.py +379 -0
  62. vmlx-1.0.0/tests/test_tool_fallback_injection.py +202 -0
  63. vmlx-1.0.0/tests/test_tool_format.py +595 -0
  64. vmlx-1.0.0/tests/test_tool_parsers.py +984 -0
  65. vmlx-1.0.0/vmlx.egg-info/PKG-INFO +340 -0
  66. vmlx-1.0.0/vmlx.egg-info/SOURCES.txt +162 -0
  67. vmlx-1.0.0/vmlx.egg-info/dependency_links.txt +1 -0
  68. vmlx-1.0.0/vmlx.egg-info/entry_points.txt +9 -0
  69. vmlx-1.0.0/vmlx.egg-info/requires.txt +61 -0
  70. vmlx-1.0.0/vmlx.egg-info/top_level.txt +1 -0
  71. vmlx-1.0.0/vmlx_engine/__init__.py +138 -0
  72. vmlx-1.0.0/vmlx_engine/api/__init__.py +121 -0
  73. vmlx-1.0.0/vmlx_engine/api/anthropic_adapter.py +609 -0
  74. vmlx-1.0.0/vmlx_engine/api/models.py +757 -0
  75. vmlx-1.0.0/vmlx_engine/api/streaming.py +210 -0
  76. vmlx-1.0.0/vmlx_engine/api/tool_calling.py +677 -0
  77. vmlx-1.0.0/vmlx_engine/api/utils.py +314 -0
  78. vmlx-1.0.0/vmlx_engine/attention.py +245 -0
  79. vmlx-1.0.0/vmlx_engine/audio/__init__.py +25 -0
  80. vmlx-1.0.0/vmlx_engine/audio/processor.py +214 -0
  81. vmlx-1.0.0/vmlx_engine/audio/stt.py +167 -0
  82. vmlx-1.0.0/vmlx_engine/audio/tts.py +322 -0
  83. vmlx-1.0.0/vmlx_engine/benchmark.py +1654 -0
  84. vmlx-1.0.0/vmlx_engine/block_disk_store.py +770 -0
  85. vmlx-1.0.0/vmlx_engine/cli.py +1287 -0
  86. vmlx-1.0.0/vmlx_engine/commands/__init__.py +2 -0
  87. vmlx-1.0.0/vmlx_engine/commands/convert.py +510 -0
  88. vmlx-1.0.0/vmlx_engine/commands/doctor.py +309 -0
  89. vmlx-1.0.0/vmlx_engine/commands/info.py +30 -0
  90. vmlx-1.0.0/vmlx_engine/commands/list.py +38 -0
  91. vmlx-1.0.0/vmlx_engine/disk_cache.py +468 -0
  92. vmlx-1.0.0/vmlx_engine/embedding.py +109 -0
  93. vmlx-1.0.0/vmlx_engine/engine/__init__.py +28 -0
  94. vmlx-1.0.0/vmlx_engine/engine/base.py +201 -0
  95. vmlx-1.0.0/vmlx_engine/engine/batched.py +810 -0
  96. vmlx-1.0.0/vmlx_engine/engine/simple.py +721 -0
  97. vmlx-1.0.0/vmlx_engine/engine_core.py +720 -0
  98. vmlx-1.0.0/vmlx_engine/gradio_app.py +390 -0
  99. vmlx-1.0.0/vmlx_engine/gradio_text_app.py +176 -0
  100. vmlx-1.0.0/vmlx_engine/image_gen.py +275 -0
  101. vmlx-1.0.0/vmlx_engine/mcp/__init__.py +85 -0
  102. vmlx-1.0.0/vmlx_engine/mcp/client.py +370 -0
  103. vmlx-1.0.0/vmlx_engine/mcp/config.py +186 -0
  104. vmlx-1.0.0/vmlx_engine/mcp/executor.py +500 -0
  105. vmlx-1.0.0/vmlx_engine/mcp/manager.py +302 -0
  106. vmlx-1.0.0/vmlx_engine/mcp/security.py +699 -0
  107. vmlx-1.0.0/vmlx_engine/mcp/tools.py +174 -0
  108. vmlx-1.0.0/vmlx_engine/mcp/types.py +189 -0
  109. vmlx-1.0.0/vmlx_engine/memory_cache.py +660 -0
  110. vmlx-1.0.0/vmlx_engine/mllm_batch_generator.py +1800 -0
  111. vmlx-1.0.0/vmlx_engine/mllm_cache.py +467 -0
  112. vmlx-1.0.0/vmlx_engine/mllm_scheduler.py +2074 -0
  113. vmlx-1.0.0/vmlx_engine/mlx_platform.py +333 -0
  114. vmlx-1.0.0/vmlx_engine/model_config_registry.py +224 -0
  115. vmlx-1.0.0/vmlx_engine/model_configs.py +684 -0
  116. vmlx-1.0.0/vmlx_engine/model_registry.py +185 -0
  117. vmlx-1.0.0/vmlx_engine/model_runner.py +456 -0
  118. vmlx-1.0.0/vmlx_engine/models/__init__.py +15 -0
  119. vmlx-1.0.0/vmlx_engine/models/llm.py +366 -0
  120. vmlx-1.0.0/vmlx_engine/models/mllm.py +1965 -0
  121. vmlx-1.0.0/vmlx_engine/multimodal_processor.py +200 -0
  122. vmlx-1.0.0/vmlx_engine/optimizations.py +139 -0
  123. vmlx-1.0.0/vmlx_engine/output_collector.py +247 -0
  124. vmlx-1.0.0/vmlx_engine/paged_cache.py +1364 -0
  125. vmlx-1.0.0/vmlx_engine/plugin.py +155 -0
  126. vmlx-1.0.0/vmlx_engine/prefix_cache.py +1257 -0
  127. vmlx-1.0.0/vmlx_engine/reasoning/__init__.py +101 -0
  128. vmlx-1.0.0/vmlx_engine/reasoning/base.py +110 -0
  129. vmlx-1.0.0/vmlx_engine/reasoning/deepseek_r1_parser.py +113 -0
  130. vmlx-1.0.0/vmlx_engine/reasoning/gptoss_parser.py +336 -0
  131. vmlx-1.0.0/vmlx_engine/reasoning/qwen3_parser.py +73 -0
  132. vmlx-1.0.0/vmlx_engine/reasoning/think_parser.py +227 -0
  133. vmlx-1.0.0/vmlx_engine/request.py +219 -0
  134. vmlx-1.0.0/vmlx_engine/reranker.py +221 -0
  135. vmlx-1.0.0/vmlx_engine/scheduler.py +2202 -0
  136. vmlx-1.0.0/vmlx_engine/server.py +4700 -0
  137. vmlx-1.0.0/vmlx_engine/simple.py +445 -0
  138. vmlx-1.0.0/vmlx_engine/speculative.py +257 -0
  139. vmlx-1.0.0/vmlx_engine/tool_parsers/__init__.py +83 -0
  140. vmlx-1.0.0/vmlx_engine/tool_parsers/abstract_tool_parser.py +290 -0
  141. vmlx-1.0.0/vmlx_engine/tool_parsers/auto_tool_parser.py +379 -0
  142. vmlx-1.0.0/vmlx_engine/tool_parsers/deepseek_tool_parser.py +165 -0
  143. vmlx-1.0.0/vmlx_engine/tool_parsers/functionary_tool_parser.py +188 -0
  144. vmlx-1.0.0/vmlx_engine/tool_parsers/glm47_tool_parser.py +213 -0
  145. vmlx-1.0.0/vmlx_engine/tool_parsers/granite_tool_parser.py +142 -0
  146. vmlx-1.0.0/vmlx_engine/tool_parsers/hermes_tool_parser.py +235 -0
  147. vmlx-1.0.0/vmlx_engine/tool_parsers/kimi_tool_parser.py +155 -0
  148. vmlx-1.0.0/vmlx_engine/tool_parsers/llama_tool_parser.py +123 -0
  149. vmlx-1.0.0/vmlx_engine/tool_parsers/minimax_tool_parser.py +338 -0
  150. vmlx-1.0.0/vmlx_engine/tool_parsers/mistral_tool_parser.py +262 -0
  151. vmlx-1.0.0/vmlx_engine/tool_parsers/nemotron_tool_parser.py +161 -0
  152. vmlx-1.0.0/vmlx_engine/tool_parsers/qwen_tool_parser.py +152 -0
  153. vmlx-1.0.0/vmlx_engine/tool_parsers/step3p5_tool_parser.py +232 -0
  154. vmlx-1.0.0/vmlx_engine/tool_parsers/xlam_tool_parser.py +172 -0
  155. vmlx-1.0.0/vmlx_engine/utils/__init__.py +6 -0
  156. vmlx-1.0.0/vmlx_engine/utils/cache_types.py +196 -0
  157. vmlx-1.0.0/vmlx_engine/utils/chat_templates.py +231 -0
  158. vmlx-1.0.0/vmlx_engine/utils/jang_loader.py +567 -0
  159. vmlx-1.0.0/vmlx_engine/utils/mamba_cache.py +327 -0
  160. vmlx-1.0.0/vmlx_engine/utils/model_inspector.py +592 -0
  161. vmlx-1.0.0/vmlx_engine/utils/nemotron_latent_moe.py +226 -0
  162. vmlx-1.0.0/vmlx_engine/utils/tokenizer.py +247 -0
  163. vmlx-1.0.0/vmlx_engine/vision_embedding_cache.py +219 -0
  164. vmlx-1.0.0/vmlx_engine/worker.py +266 -0
vmlx-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,340 @@
1
+ Metadata-Version: 2.4
2
+ Name: vmlx
3
+ Version: 1.0.0
4
+ Summary: Local AI inference for Apple Silicon — Text, Image, Video & Audio generation on Mac
5
+ Author-email: Jinho Jang <eric@jangq.ai>
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/vmlxllm/vmlx
8
+ Project-URL: Documentation, https://github.com/vmlxllm/vmlx#readme
9
+ Project-URL: Repository, https://github.com/vmlxllm/vmlx
10
+ Keywords: llm,mlx,apple-silicon,vllm,inference,transformers
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Operating System :: MacOS
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ Requires-Dist: mlx>=0.29.0
25
+ Requires-Dist: mlx-lm>=0.30.2
26
+ Requires-Dist: mlx-vlm>=0.1.0
27
+ Requires-Dist: transformers>=4.40.0
28
+ Requires-Dist: tokenizers>=0.19.0
29
+ Requires-Dist: huggingface-hub>=0.23.0
30
+ Requires-Dist: numpy>=1.24.0
31
+ Requires-Dist: pillow>=10.0.0
32
+ Requires-Dist: tqdm>=4.66.0
33
+ Requires-Dist: pyyaml>=6.0
34
+ Requires-Dist: requests>=2.28.0
35
+ Requires-Dist: tabulate>=0.9.0
36
+ Requires-Dist: opencv-python-headless>=4.8.0
37
+ Requires-Dist: psutil>=5.9.0
38
+ Requires-Dist: fastapi>=0.100.0
39
+ Requires-Dist: uvicorn>=0.23.0
40
+ Requires-Dist: mcp>=1.0.0
41
+ Requires-Dist: jsonschema>=4.0.0
42
+ Requires-Dist: mlx-embeddings>=0.0.5
43
+ Provides-Extra: ui
44
+ Requires-Dist: gradio>=4.0.0; extra == "ui"
45
+ Requires-Dist: pytz>=2024.1; extra == "ui"
46
+ Provides-Extra: dev
47
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
48
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
49
+ Requires-Dist: black>=23.0.0; extra == "dev"
50
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
51
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
52
+ Provides-Extra: vllm
53
+ Requires-Dist: vllm>=0.4.0; extra == "vllm"
54
+ Provides-Extra: vision
55
+ Requires-Dist: torch>=2.3.0; extra == "vision"
56
+ Requires-Dist: torchvision>=0.18.0; extra == "vision"
57
+ Provides-Extra: audio
58
+ Requires-Dist: mlx-audio>=0.2.9; extra == "audio"
59
+ Requires-Dist: sounddevice>=0.4.0; extra == "audio"
60
+ Requires-Dist: soundfile>=0.12.0; extra == "audio"
61
+ Requires-Dist: scipy>=1.10.0; extra == "audio"
62
+ Requires-Dist: numba>=0.57.0; extra == "audio"
63
+ Requires-Dist: tiktoken>=0.5.0; extra == "audio"
64
+ Requires-Dist: misaki[ja,zh]>=0.5.0; extra == "audio"
65
+ Requires-Dist: spacy>=3.7.0; extra == "audio"
66
+ Requires-Dist: num2words>=0.5.0; extra == "audio"
67
+ Requires-Dist: loguru>=0.7.0; extra == "audio"
68
+ Requires-Dist: phonemizer>=3.2.0; extra == "audio"
69
+ Requires-Dist: ordered_set>=4.1.0; extra == "audio"
70
+ Requires-Dist: cn2an>=0.5.0; extra == "audio"
71
+ Requires-Dist: fugashi>=1.3.0; extra == "audio"
72
+ Requires-Dist: unidic-lite>=1.0.0; extra == "audio"
73
+ Requires-Dist: jieba>=0.42.0; extra == "audio"
74
+ Provides-Extra: jang
75
+ Requires-Dist: jang>=1.0.0; extra == "jang"
76
+ Provides-Extra: image
77
+ Requires-Dist: mflux>=0.16.0; extra == "image"
78
+
79
+ <p align="center">
80
+ <picture>
81
+ <source media="(prefers-color-scheme: dark)" srcset="https://vmlx.net/logos/png/wordmark-dark-600x150.png">
82
+ <source media="(prefers-color-scheme: light)" srcset="https://vmlx.net/logos/png/wordmark-light-600x150.png">
83
+ <img alt="vMLX" src="https://vmlx.net/logos/png/wordmark-transparent-600x150.png" width="400">
84
+ </picture>
85
+ </p>
86
+
87
+ <p align="center">
88
+ <strong>Native macOS AI inference — local models, remote endpoints, zero config</strong>
89
+ </p>
90
+
91
+ <p align="center">
92
+ <a href="https://vmlx.net">Website</a> · <a href="panel/CHANGELOG.md">Panel Changelog</a> · <a href="CHANGELOG.md">Engine Changelog</a> · <a href="docs/">Documentation</a>
93
+ </p>
94
+
95
+ ---
96
+
97
+ ## What is vMLX?
98
+
99
+ vMLX is a native macOS application for running AI models on Apple Silicon. It bundles a custom inference engine with a full-featured desktop interface — manage sessions, chat with models, download from HuggingFace, connect to remote APIs, and use agentic tool-calling workflows.
100
+
101
+ - **Local inference** with GPU acceleration via MLX
102
+ - **Remote endpoints** — connect to any OpenAI-compatible API
103
+ - **HuggingFace downloader** — search, download, and serve models in-app
104
+ - **Built-in tools** — file I/O, shell, search, image reading, ask_user interrupt
105
+ - **MCP integration** — Model Context Protocol tool servers (local sessions)
106
+
107
+ ---
108
+
109
+ ## Key Features
110
+
111
+ ### Inference Engine (v0.2.18)
112
+
113
+ | Feature | Description |
114
+ |---------|-------------|
115
+ | **Paged KV Cache** | Memory-efficient caching with prefix sharing and block-level reuse |
116
+ | **KV Cache Quantization** | Q4/Q8 quantized cache storage (2–4× memory savings) |
117
+ | **Prefix Cache** | Token-level prefix matching for fast prompt reuse across requests |
118
+ | **Continuous Batching** | Concurrent request handling with slot management |
119
+ | **VLM Caching** | Full KV cache pipeline for vision-language models (Qwen-VL, Gemma 3, etc.) |
120
+ | **Mamba Hybrid Support** | Auto-detects mixed KVCache + MambaCache models (Qwen3.5-VL, Qwen3-Coder-Next, Nemotron) |
121
+ | **Streaming Detokenizer** | Per-request UTF-8 buffering — emoji, CJK, Arabic render correctly |
122
+ | **Request Cancellation** | Stop inference mid-stream via API or connection close |
123
+ | **OpenAI-Compatible API** | Chat Completions + Responses API with full streaming support |
124
+ | **Speculative Decoding** | Draft model acceleration (20–90% speedup, zero quality loss) |
125
+
126
+ ### Desktop App (Panel v1.2.1)
127
+
128
+ | Feature | Description |
129
+ |---------|-------------|
130
+ | **Multi-session** | Run multiple models simultaneously on different ports |
131
+ | **Remote endpoints** | Connect to OpenAI, Groq, local vLLM, or any compatible API |
132
+ | **HuggingFace browser** | Search, download, and install MLX models with progress tracking |
133
+ | **Agentic tools** | File I/O, shell, search, image reading with auto-continue loops (up to 10 iterations) |
134
+ | **Per-chat settings** | Temperature, Top P/K, Min P, Repeat Penalty, Stop Sequences, Max Tokens |
135
+ | **Reasoning display** | Collapsible thinking sections for Qwen3, DeepSeek-R1, GLM-4.7 |
136
+ | **Tool parsers** | hermes, pythonic, llama3, mistral, minimax, qwen3, nemotron, step3p5, and more |
137
+ | **Auto-detection** | Reads model config JSON for automatic parser and cache type selection |
138
+ | **Persistent history** | SQLite-backed chat history with metrics, tool calls, and reasoning content |
139
+ | **Live metrics** | TTFT, tokens/sec, prompt processing speed, prefix cache hits |
140
+
141
+ ---
142
+
143
+ ## Quick Start
144
+
145
+ ### Desktop App (recommended)
146
+
147
+ ```bash
148
+ # Clone and build
149
+ git clone https://github.com/vmlxllm/vmlx.git
150
+ cd vmlx/panel
151
+
152
+ # Install dependencies
153
+ npm install
154
+
155
+ # Development mode
156
+ npm run dev
157
+
158
+ # Build and install to /Applications
159
+ bash scripts/build-and-install.sh
160
+ ```
161
+
162
+ ### Engine Only (CLI)
163
+
164
+ ```bash
165
+ # Install
166
+ uv tool install git+https://github.com/vmlxllm/vmlx.git
167
+ # or
168
+ pip install git+https://github.com/vmlxllm/vmlx.git
169
+
170
+ # Start server
171
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000
172
+
173
+ # With continuous batching
174
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 --continuous-batching
175
+
176
+ # With API key
177
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 --api-key your-key
178
+
179
+ # With speculative decoding (20–90% faster)
180
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 \
181
+ --speculative-model mlx-community/Llama-3.2-1B-Instruct-4bit \
182
+ --num-draft-tokens 3
183
+ ```
184
+
185
+ ### Use with OpenAI SDK
186
+
187
+ ```python
188
+ from openai import OpenAI
189
+
190
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
191
+
192
+ response = client.chat.completions.create(
193
+ model="default",
194
+ messages=[{"role": "user", "content": "Hello!"}],
195
+ )
196
+ print(response.choices[0].message.content)
197
+ ```
198
+
199
+ ---
200
+
201
+ ## API Endpoints
202
+
203
+ | Endpoint | Description |
204
+ |----------|-------------|
205
+ | `POST /v1/chat/completions` | Chat Completions API (streaming) |
206
+ | `POST /v1/responses` | Responses API (streaming) |
207
+ | `GET /v1/models` | List loaded models |
208
+ | `GET /health` | Server health + model info |
209
+ | `POST /v1/mcp/execute` | Execute MCP tool |
210
+ | `GET /v1/cache/stats` | Prefix cache statistics |
211
+ | `POST /v1/cache/warm` | Pre-warm cache with prompt |
212
+ | `DELETE /v1/cache` | Clear prefix cache |
213
+ | `POST /v1/chat/completions/{id}/cancel` | Cancel an in-flight request (frees GPU resources) |
214
+ | `POST /v1/embeddings` | Text embeddings (mlx-embeddings) |
215
+
216
+ ---
217
+
218
+ ## Reasoning Models
219
+
220
+ Extract thinking process from reasoning-capable models:
221
+
222
+ ```bash
223
+ vmlx-engine serve mlx-community/Qwen3-8B-4bit --reasoning-parser qwen3
224
+ ```
225
+
226
+ | Parser | Models | Format |
227
+ |--------|--------|--------|
228
+ | `qwen3` | Qwen3, QwQ, MiniMax M2/M2.5, StepFun | `<think>` / `</think>` tags |
229
+ | `deepseek_r1` | DeepSeek-R1, Gemma 3, Phi-4 Reasoning, GLM-4.7, GLM-Z1 | Lenient `<think>` (handles missing open tag) |
230
+ | `openai_gptoss` | GLM-4.7 Flash, GPT-OSS | Harmony `<\|channel\|>analysis/final` protocol |
231
+
232
+ ---
233
+
234
+ ## Tool Calling
235
+
236
+ Built-in agentic tools available in the desktop app:
237
+
238
+ | Category | Tools |
239
+ |----------|-------|
240
+ | **File** | read_file, write_file, edit_file, patch_file, batch_edit, copy, move, delete, create_directory, list_directory, read_image |
241
+ | **Search** | search_files, find_files, file_info, get_diagnostics, get_tree, diff_files |
242
+ | **Shell** | run_command, spawn_process, get_process_output |
243
+ | **Web** | fetchUrl, brave_search |
244
+ | **Utility** | ask_user (interactive interrupt) |
245
+
246
+ Plus MCP tool server passthrough for local sessions.
247
+
248
+ ---
249
+
250
+ ## Architecture
251
+
252
+ ```
253
+ ┌─────────────────────────────────────────────────────────┐
254
+ │ vMLX Desktop App │
255
+ │ (Electron + React + TypeScript) │
256
+ └─────────────────────────────────────────────────────────┘
257
+
258
+ ┌────────────┴────────────┐
259
+ ▼ ▼
260
+ ┌──────────────────────┐ ┌──────────────────────┐
261
+ │ Local vmlx-engine │ │ Remote Endpoints │
262
+ │ (spawned process) │ │ (OpenAI, Groq, etc.) │
263
+ └──────────────────────┘ └──────────────────────┘
264
+
265
+
266
+ ┌─────────────────────────────────────────────────────────┐
267
+ │ vMLX Engine │
268
+ │ (FastAPI + MLX inference + caching) │
269
+ └─────────────────────────────────────────────────────────┘
270
+
271
+ ┌─────────┼──────────┬──────────┐
272
+ ▼ ▼ ▼ ▼
273
+ ┌────────┐┌────────┐┌────────┐┌────────────┐
274
+ │ mlx-lm ││mlx-vlm ││mlx-aud ││mlx-embed │
275
+ │ (LLMs) ││(Vision)││(Audio) ││(Embeddings)│
276
+ └────────┘└────────┘└────────┘└────────────┘
277
+
278
+
279
+ ┌─────────────────────────────────────────────────────────┐
280
+ │ Apple MLX │
281
+ │ (Metal GPU + Unified Memory) │
282
+ └─────────────────────────────────────────────────────────┘
283
+ ```
284
+
285
+ ---
286
+
287
+ ## Tech Stack
288
+
289
+ | Layer | Technology |
290
+ |-------|-----------|
291
+ | Desktop app | Electron 28 + React 18 + TypeScript |
292
+ | Styling | Tailwind CSS |
293
+ | Database | SQLite (WAL mode, better-sqlite3) |
294
+ | Inference engine | vMLX Engine v0.2.18 (Python, FastAPI) |
295
+ | ML framework | Apple MLX (Metal GPU acceleration) |
296
+ | Build | electron-vite + electron-builder |
297
+ | Tests | Vitest (panel: 542 tests), pytest (engine: 1595 tests) |
298
+ | Python | Bundled relocatable Python 3.12 |
299
+
300
+ ---
301
+
302
+ ## Recent Changes
303
+
304
+ ### Panel v1.2.1 / Engine v0.2.18 (2026-03-09)
305
+ - **Tool calling fix**: `enableAutoToolChoice` default changed from `false` to `undefined` (auto-detect) — MCP and built-in tools now work out of the box without manual enable
306
+ - **MCP tool result truncation**: MCP tool results now capped at same limit as built-in tools (50KB default) to prevent context overflow
307
+ - **Command preview parity**: `buildCommandPreview` in SessionSettings now matches actual `buildArgs` logic for auto-tool-choice flags
308
+ - **Old config migration**: Stored sessions with `enableAutoToolChoice: false` auto-migrate to `undefined` on load
309
+ - **2137 total tests**: 1595 engine + 542 panel (12 new regression tests for tool calling and MCP)
310
+
311
+ ### Panel v1.2.0 / Engine v0.2.18 (2026-03-09)
312
+ - **HuggingFace download fix**: Download progress no longer stuck at 0% — tqdm `\r` chunk splitting, ANSI stripping, highest-percent extraction
313
+ - **HF browser NaN/Unknown fix**: Model ages and authors display correctly (uses `createdAt` fallback, extracts author from modelId)
314
+ - **macOS 15 launch fix**: `minimumSystemVersion` corrected from 26.0.0 to 14.0.0 (fixes GitHub #10)
315
+ - **Deep stability audit**: 14 fixes across paged cache block lifecycle, KV dequantize safety, reasoning marker detection, tool fallback, Mistral JSON validation
316
+ - **CancelledError SSE hang**: Engine cancellation now unblocks all waiting SSE consumers
317
+ - **2125 total tests**: 1595 engine + 530 panel with full regression coverage
318
+
319
+ ### Panel v1.1.4 / Engine v0.2.12 (2026-03-07)
320
+ - **tool_choice="none" fix**: Content no longer swallowed when tool markers detected with tools suppressed
321
+ - **suppress_reasoning**: Reasoning leaks plugged in both API paths
322
+ - **First-launch UX**: Auto-creates initial chat, dynamic About page version
323
+ - **1571 engine tests**, **530 panel tests** across 6 vitest suites
324
+
325
+ See [Panel Changelog](panel/CHANGELOG.md) and [Engine Changelog](CHANGELOG.md) for full history.
326
+
327
+ ---
328
+
329
+ ## Current Version
330
+
331
+ **Engine v0.2.18** / **Panel v1.2.1** — macOS 26+ (Tahoe) for local inference, macOS 14+ for remote endpoints. Requires Apple Silicon (M1, M2, M3, or M4).
332
+
333
+ ## Links
334
+
335
+ - **Website**: [vmlx.net](https://vmlx.net)
336
+ - **Contact**: admin@vmlx.net
337
+
338
+ ## License
339
+
340
+ Apache 2.0 — see [LICENSE](LICENSE) for details.
vmlx-1.0.0/README.md ADDED
@@ -0,0 +1,262 @@
1
+ <p align="center">
2
+ <picture>
3
+ <source media="(prefers-color-scheme: dark)" srcset="https://vmlx.net/logos/png/wordmark-dark-600x150.png">
4
+ <source media="(prefers-color-scheme: light)" srcset="https://vmlx.net/logos/png/wordmark-light-600x150.png">
5
+ <img alt="vMLX" src="https://vmlx.net/logos/png/wordmark-transparent-600x150.png" width="400">
6
+ </picture>
7
+ </p>
8
+
9
+ <p align="center">
10
+ <strong>Native macOS AI inference — local models, remote endpoints, zero config</strong>
11
+ </p>
12
+
13
+ <p align="center">
14
+ <a href="https://vmlx.net">Website</a> · <a href="panel/CHANGELOG.md">Panel Changelog</a> · <a href="CHANGELOG.md">Engine Changelog</a> · <a href="docs/">Documentation</a>
15
+ </p>
16
+
17
+ ---
18
+
19
+ ## What is vMLX?
20
+
21
+ vMLX is a native macOS application for running AI models on Apple Silicon. It bundles a custom inference engine with a full-featured desktop interface — manage sessions, chat with models, download from HuggingFace, connect to remote APIs, and use agentic tool-calling workflows.
22
+
23
+ - **Local inference** with GPU acceleration via MLX
24
+ - **Remote endpoints** — connect to any OpenAI-compatible API
25
+ - **HuggingFace downloader** — search, download, and serve models in-app
26
+ - **Built-in tools** — file I/O, shell, search, image reading, ask_user interrupt
27
+ - **MCP integration** — Model Context Protocol tool servers (local sessions)
28
+
29
+ ---
30
+
31
+ ## Key Features
32
+
33
+ ### Inference Engine (v0.2.18)
34
+
35
+ | Feature | Description |
36
+ |---------|-------------|
37
+ | **Paged KV Cache** | Memory-efficient caching with prefix sharing and block-level reuse |
38
+ | **KV Cache Quantization** | Q4/Q8 quantized cache storage (2–4× memory savings) |
39
+ | **Prefix Cache** | Token-level prefix matching for fast prompt reuse across requests |
40
+ | **Continuous Batching** | Concurrent request handling with slot management |
41
+ | **VLM Caching** | Full KV cache pipeline for vision-language models (Qwen-VL, Gemma 3, etc.) |
42
+ | **Mamba Hybrid Support** | Auto-detects mixed KVCache + MambaCache models (Qwen3.5-VL, Qwen3-Coder-Next, Nemotron) |
43
+ | **Streaming Detokenizer** | Per-request UTF-8 buffering — emoji, CJK, Arabic render correctly |
44
+ | **Request Cancellation** | Stop inference mid-stream via API or connection close |
45
+ | **OpenAI-Compatible API** | Chat Completions + Responses API with full streaming support |
46
+ | **Speculative Decoding** | Draft model acceleration (20–90% speedup, zero quality loss) |
47
+
48
+ ### Desktop App (Panel v1.2.1)
49
+
50
+ | Feature | Description |
51
+ |---------|-------------|
52
+ | **Multi-session** | Run multiple models simultaneously on different ports |
53
+ | **Remote endpoints** | Connect to OpenAI, Groq, local vLLM, or any compatible API |
54
+ | **HuggingFace browser** | Search, download, and install MLX models with progress tracking |
55
+ | **Agentic tools** | File I/O, shell, search, image reading with auto-continue loops (up to 10 iterations) |
56
+ | **Per-chat settings** | Temperature, Top P/K, Min P, Repeat Penalty, Stop Sequences, Max Tokens |
57
+ | **Reasoning display** | Collapsible thinking sections for Qwen3, DeepSeek-R1, GLM-4.7 |
58
+ | **Tool parsers** | hermes, pythonic, llama3, mistral, minimax, qwen3, nemotron, step3p5, and more |
59
+ | **Auto-detection** | Reads model config JSON for automatic parser and cache type selection |
60
+ | **Persistent history** | SQLite-backed chat history with metrics, tool calls, and reasoning content |
61
+ | **Live metrics** | TTFT, tokens/sec, prompt processing speed, prefix cache hits |
62
+
63
+ ---
64
+
65
+ ## Quick Start
66
+
67
+ ### Desktop App (recommended)
68
+
69
+ ```bash
70
+ # Clone and build
71
+ git clone https://github.com/vmlxllm/vmlx.git
72
+ cd vmlx/panel
73
+
74
+ # Install dependencies
75
+ npm install
76
+
77
+ # Development mode
78
+ npm run dev
79
+
80
+ # Build and install to /Applications
81
+ bash scripts/build-and-install.sh
82
+ ```
83
+
84
+ ### Engine Only (CLI)
85
+
86
+ ```bash
87
+ # Install
88
+ uv tool install git+https://github.com/vmlxllm/vmlx.git
89
+ # or
90
+ pip install git+https://github.com/vmlxllm/vmlx.git
91
+
92
+ # Start server
93
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000
94
+
95
+ # With continuous batching
96
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 --continuous-batching
97
+
98
+ # With API key
99
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 --api-key your-key
100
+
101
+ # With speculative decoding (20–90% faster)
102
+ vmlx-engine serve mlx-community/Llama-3.2-3B-Instruct-4bit --port 8000 \
103
+ --speculative-model mlx-community/Llama-3.2-1B-Instruct-4bit \
104
+ --num-draft-tokens 3
105
+ ```
106
+
107
+ ### Use with OpenAI SDK
108
+
109
+ ```python
110
+ from openai import OpenAI
111
+
112
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
113
+
114
+ response = client.chat.completions.create(
115
+ model="default",
116
+ messages=[{"role": "user", "content": "Hello!"}],
117
+ )
118
+ print(response.choices[0].message.content)
119
+ ```
120
+
121
+ ---
122
+
123
+ ## API Endpoints
124
+
125
+ | Endpoint | Description |
126
+ |----------|-------------|
127
+ | `POST /v1/chat/completions` | Chat Completions API (streaming) |
128
+ | `POST /v1/responses` | Responses API (streaming) |
129
+ | `GET /v1/models` | List loaded models |
130
+ | `GET /health` | Server health + model info |
131
+ | `POST /v1/mcp/execute` | Execute MCP tool |
132
+ | `GET /v1/cache/stats` | Prefix cache statistics |
133
+ | `POST /v1/cache/warm` | Pre-warm cache with prompt |
134
+ | `DELETE /v1/cache` | Clear prefix cache |
135
+ | `POST /v1/chat/completions/{id}/cancel` | Cancel an in-flight request (frees GPU resources) |
136
+ | `POST /v1/embeddings` | Text embeddings (mlx-embeddings) |
137
+
138
+ ---
139
+
140
+ ## Reasoning Models
141
+
142
+ Extract thinking process from reasoning-capable models:
143
+
144
+ ```bash
145
+ vmlx-engine serve mlx-community/Qwen3-8B-4bit --reasoning-parser qwen3
146
+ ```
147
+
148
+ | Parser | Models | Format |
149
+ |--------|--------|--------|
150
+ | `qwen3` | Qwen3, QwQ, MiniMax M2/M2.5, StepFun | `<think>` / `</think>` tags |
151
+ | `deepseek_r1` | DeepSeek-R1, Gemma 3, Phi-4 Reasoning, GLM-4.7, GLM-Z1 | Lenient `<think>` (handles missing open tag) |
152
+ | `openai_gptoss` | GLM-4.7 Flash, GPT-OSS | Harmony `<\|channel\|>analysis/final` protocol |
153
+
154
+ ---
155
+
156
+ ## Tool Calling
157
+
158
+ Built-in agentic tools available in the desktop app:
159
+
160
+ | Category | Tools |
161
+ |----------|-------|
162
+ | **File** | read_file, write_file, edit_file, patch_file, batch_edit, copy, move, delete, create_directory, list_directory, read_image |
163
+ | **Search** | search_files, find_files, file_info, get_diagnostics, get_tree, diff_files |
164
+ | **Shell** | run_command, spawn_process, get_process_output |
165
+ | **Web** | fetchUrl, brave_search |
166
+ | **Utility** | ask_user (interactive interrupt) |
167
+
168
+ Plus MCP tool server passthrough for local sessions.
169
+
170
+ ---
171
+
172
+ ## Architecture
173
+
174
+ ```
175
+ ┌─────────────────────────────────────────────────────────┐
176
+ │ vMLX Desktop App │
177
+ │ (Electron + React + TypeScript) │
178
+ └─────────────────────────────────────────────────────────┘
179
+
180
+ ┌────────────┴────────────┐
181
+ ▼ ▼
182
+ ┌──────────────────────┐ ┌──────────────────────┐
183
+ │ Local vmlx-engine │ │ Remote Endpoints │
184
+ │ (spawned process) │ │ (OpenAI, Groq, etc.) │
185
+ └──────────────────────┘ └──────────────────────┘
186
+
187
+
188
+ ┌─────────────────────────────────────────────────────────┐
189
+ │ vMLX Engine │
190
+ │ (FastAPI + MLX inference + caching) │
191
+ └─────────────────────────────────────────────────────────┘
192
+
193
+ ┌─────────┼──────────┬──────────┐
194
+ ▼ ▼ ▼ ▼
195
+ ┌────────┐┌────────┐┌────────┐┌────────────┐
196
+ │ mlx-lm ││mlx-vlm ││mlx-aud ││mlx-embed │
197
+ │ (LLMs) ││(Vision)││(Audio) ││(Embeddings)│
198
+ └────────┘└────────┘└────────┘└────────────┘
199
+
200
+
201
+ ┌─────────────────────────────────────────────────────────┐
202
+ │ Apple MLX │
203
+ │ (Metal GPU + Unified Memory) │
204
+ └─────────────────────────────────────────────────────────┘
205
+ ```
206
+
207
+ ---
208
+
209
+ ## Tech Stack
210
+
211
+ | Layer | Technology |
212
+ |-------|-----------|
213
+ | Desktop app | Electron 28 + React 18 + TypeScript |
214
+ | Styling | Tailwind CSS |
215
+ | Database | SQLite (WAL mode, better-sqlite3) |
216
+ | Inference engine | vMLX Engine v0.2.18 (Python, FastAPI) |
217
+ | ML framework | Apple MLX (Metal GPU acceleration) |
218
+ | Build | electron-vite + electron-builder |
219
+ | Tests | Vitest (panel: 542 tests), pytest (engine: 1595 tests) |
220
+ | Python | Bundled relocatable Python 3.12 |
221
+
222
+ ---
223
+
224
+ ## Recent Changes
225
+
226
+ ### Panel v1.2.1 / Engine v0.2.18 (2026-03-09)
227
+ - **Tool calling fix**: `enableAutoToolChoice` default changed from `false` to `undefined` (auto-detect) — MCP and built-in tools now work out of the box without manually enabling them
228
+ - **MCP tool result truncation**: MCP tool results now capped at same limit as built-in tools (50KB default) to prevent context overflow
229
+ - **Command preview parity**: `buildCommandPreview` in SessionSettings now matches actual `buildArgs` logic for auto-tool-choice flags
230
+ - **Old config migration**: Stored sessions with `enableAutoToolChoice: false` auto-migrate to `undefined` on load
231
+ - **2137 total tests**: 1595 engine + 542 panel (12 new regression tests for tool calling and MCP)
232
+
233
+ ### Panel v1.2.0 / Engine v0.2.18 (2026-03-09)
234
+ - **HuggingFace download fix**: Download progress is no longer stuck at 0% — tqdm `\r` chunk splitting, ANSI stripping, highest-percent extraction
235
+ - **HF browser NaN/Unknown fix**: Model ages and authors display correctly (uses `createdAt` fallback, extracts author from modelId)
236
+ - **macOS 15 launch fix**: `minimumSystemVersion` corrected from 26.0.0 to 14.0.0 (fixes GitHub #10)
237
+ - **Deep stability audit**: 14 fixes across paged cache block lifecycle, KV dequantize safety, reasoning marker detection, tool fallback, Mistral JSON validation
238
+ - **CancelledError SSE hang**: Engine cancellation now unblocks all waiting SSE consumers
239
+ - **2125 total tests**: 1595 engine + 530 panel with full regression coverage
240
+
241
+ ### Panel v1.1.4 / Engine v0.2.12 (2026-03-07)
242
+ - **tool_choice="none" fix**: Content no longer swallowed when tool markers detected with tools suppressed
243
+ - **suppress_reasoning**: Reasoning leaks plugged in both API paths
244
+ - **First-launch UX**: Auto-creates initial chat, dynamic About page version
245
+ - **1571 engine tests**, **530 panel tests** across 6 vitest suites
246
+
247
+ See [Panel Changelog](panel/CHANGELOG.md) and [Engine Changelog](CHANGELOG.md) for full history.
248
+
249
+ ---
250
+
251
+ ## Current Version
252
+
253
+ **Engine v0.2.18** / **Panel v1.2.1** — macOS 26+ (Tahoe) for local inference, macOS 14+ for remote endpoints. Apple Silicon (M1, M2, M3, M4).
254
+
255
+ ## Links
256
+
257
+ - **Website**: [vmlx.net](https://vmlx.net)
258
+ - **Contact**: admin@vmlx.net
259
+
260
+ ## License
261
+
262
+ Apache 2.0 — see [LICENSE](LICENSE) for details.