cortex-llm 1.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cortex_llm-1.0.4/LICENSE +21 -0
- cortex_llm-1.0.4/PKG-INFO +275 -0
- cortex_llm-1.0.4/README.md +216 -0
- cortex_llm-1.0.4/cortex/__init__.py +73 -0
- cortex_llm-1.0.4/cortex/__main__.py +90 -0
- cortex_llm-1.0.4/cortex/config.py +329 -0
- cortex_llm-1.0.4/cortex/conversation_manager.py +468 -0
- cortex_llm-1.0.4/cortex/fine_tuning/__init__.py +8 -0
- cortex_llm-1.0.4/cortex/fine_tuning/dataset.py +332 -0
- cortex_llm-1.0.4/cortex/fine_tuning/mlx_lora_trainer.py +502 -0
- cortex_llm-1.0.4/cortex/fine_tuning/trainer.py +957 -0
- cortex_llm-1.0.4/cortex/fine_tuning/wizard.py +707 -0
- cortex_llm-1.0.4/cortex/gpu_validator.py +467 -0
- cortex_llm-1.0.4/cortex/inference_engine.py +729 -0
- cortex_llm-1.0.4/cortex/metal/__init__.py +275 -0
- cortex_llm-1.0.4/cortex/metal/gpu_validator.py +177 -0
- cortex_llm-1.0.4/cortex/metal/memory_pool.py +886 -0
- cortex_llm-1.0.4/cortex/metal/mlx_accelerator.py +680 -0
- cortex_llm-1.0.4/cortex/metal/mlx_compat.py +105 -0
- cortex_llm-1.0.4/cortex/metal/mlx_converter.py +638 -0
- cortex_llm-1.0.4/cortex/metal/mps_optimizer.py +417 -0
- cortex_llm-1.0.4/cortex/metal/optimizer.py +665 -0
- cortex_llm-1.0.4/cortex/metal/performance_profiler.py +364 -0
- cortex_llm-1.0.4/cortex/model_downloader.py +130 -0
- cortex_llm-1.0.4/cortex/model_manager.py +2187 -0
- cortex_llm-1.0.4/cortex/quantization/__init__.py +5 -0
- cortex_llm-1.0.4/cortex/quantization/dynamic_quantizer.py +736 -0
- cortex_llm-1.0.4/cortex/template_registry/__init__.py +15 -0
- cortex_llm-1.0.4/cortex/template_registry/auto_detector.py +144 -0
- cortex_llm-1.0.4/cortex/template_registry/config_manager.py +234 -0
- cortex_llm-1.0.4/cortex/template_registry/interactive.py +260 -0
- cortex_llm-1.0.4/cortex/template_registry/registry.py +347 -0
- cortex_llm-1.0.4/cortex/template_registry/template_profiles/__init__.py +5 -0
- cortex_llm-1.0.4/cortex/template_registry/template_profiles/base.py +142 -0
- cortex_llm-1.0.4/cortex/template_registry/template_profiles/complex/__init__.py +5 -0
- cortex_llm-1.0.4/cortex/template_registry/template_profiles/complex/reasoning.py +263 -0
- cortex_llm-1.0.4/cortex/template_registry/template_profiles/standard/__init__.py +9 -0
- cortex_llm-1.0.4/cortex/template_registry/template_profiles/standard/alpaca.py +73 -0
- cortex_llm-1.0.4/cortex/template_registry/template_profiles/standard/chatml.py +82 -0
- cortex_llm-1.0.4/cortex/template_registry/template_profiles/standard/gemma.py +103 -0
- cortex_llm-1.0.4/cortex/template_registry/template_profiles/standard/llama.py +87 -0
- cortex_llm-1.0.4/cortex/template_registry/template_profiles/standard/simple.py +65 -0
- cortex_llm-1.0.4/cortex/ui/__init__.py +120 -0
- cortex_llm-1.0.4/cortex/ui/cli.py +1685 -0
- cortex_llm-1.0.4/cortex/ui/markdown_render.py +185 -0
- cortex_llm-1.0.4/cortex/ui/terminal_app.py +534 -0
- cortex_llm-1.0.4/cortex_llm.egg-info/PKG-INFO +275 -0
- cortex_llm-1.0.4/cortex_llm.egg-info/SOURCES.txt +56 -0
- cortex_llm-1.0.4/cortex_llm.egg-info/dependency_links.txt +1 -0
- cortex_llm-1.0.4/cortex_llm.egg-info/entry_points.txt +2 -0
- cortex_llm-1.0.4/cortex_llm.egg-info/not-zip-safe +1 -0
- cortex_llm-1.0.4/cortex_llm.egg-info/requires.txt +31 -0
- cortex_llm-1.0.4/cortex_llm.egg-info/top_level.txt +1 -0
- cortex_llm-1.0.4/pyproject.toml +112 -0
- cortex_llm-1.0.4/setup.cfg +4 -0
- cortex_llm-1.0.4/setup.py +82 -0
- cortex_llm-1.0.4/tests/test_apple_silicon.py +602 -0
- cortex_llm-1.0.4/tests/test_metal_optimization.py +279 -0
cortex_llm-1.0.4/LICENSE
ADDED
@@ -0,0 +1,21 @@

MIT License

Copyright (c) 2025 Faisal Mumtaz

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
cortex_llm-1.0.4/PKG-INFO
ADDED
@@ -0,0 +1,275 @@

Metadata-Version: 2.4
Name: cortex-llm
Version: 1.0.4
Summary: GPU-Accelerated LLM Terminal for Apple Silicon
Home-page: https://github.com/faisalmumtaz/Cortex
Author: Cortex Development Team
License: MIT
Project-URL: Homepage, https://github.com/faisalmumtaz/Cortex
Project-URL: Bug Tracker, https://github.com/faisalmumtaz/Cortex/issues
Project-URL: Documentation, https://github.com/faisalmumtaz/Cortex/wiki
Keywords: llm,gpu,metal,mps,apple-silicon,ai,machine-learning,terminal,mlx,pytorch
Platform: darwin
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Operating System :: MacOS
Classifier: Environment :: Console
Classifier: Environment :: GPU
Requires-Python: >=3.11
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: torch>=2.1.0
Requires-Dist: mlx>=0.10.0
Requires-Dist: mlx-lm>=0.10.0
Requires-Dist: transformers>=4.36.0
Requires-Dist: safetensors>=0.4.0
Requires-Dist: huggingface-hub>=0.19.0
Requires-Dist: accelerate>=0.25.0
Requires-Dist: llama-cpp-python>=0.2.0
Requires-Dist: pyyaml>=6.0
Requires-Dist: pydantic>=2.5.0
Requires-Dist: rich>=13.0.0
Requires-Dist: psutil>=5.9.0
Requires-Dist: numpy>=1.24.0
Requires-Dist: packaging>=23.0
Requires-Dist: requests>=2.31.0
Provides-Extra: dev
Requires-Dist: pytest>=7.4.0; extra == "dev"
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
Requires-Dist: black>=23.0.0; extra == "dev"
Requires-Dist: ruff>=0.1.0; extra == "dev"
Requires-Dist: mypy>=1.8.0; extra == "dev"
Provides-Extra: optional
Requires-Dist: sentencepiece>=0.1.99; extra == "optional"
Requires-Dist: auto-gptq>=0.7.0; extra == "optional"
Requires-Dist: autoawq>=0.2.0; extra == "optional"
Requires-Dist: bitsandbytes>=0.41.0; extra == "optional"
Requires-Dist: optimum>=1.16.0; extra == "optional"
Requires-Dist: torchvision>=0.16.0; extra == "optional"
Requires-Dist: torchaudio>=2.1.0; extra == "optional"
Dynamic: home-page
Dynamic: license-file
Dynamic: platform
Dynamic: requires-python

# Cortex - LLM Terminal Client for Apple Silicon

Cortex is an LLM terminal interface designed for Apple Silicon, using the MLX and PyTorch MPS frameworks for GPU-accelerated inference.

## What It Does

- **GPU-accelerated inference** via MLX (primary) and PyTorch MPS backends
- **Apple Silicon required** - leverages the unified memory architecture
- **Multiple model formats** - MLX, GGUF, SafeTensors, PyTorch, GPTQ, AWQ
- **Built-in fine-tuning** - LoRA-based model customization via an interactive wizard
- **Chat template auto-detection** - automatic format detection with confidence scoring
- **Conversation persistence** - SQLite-backed chat history with branching

## Features

- **GPU-Accelerated Inference** - Delegates to MLX and PyTorch MPS for Metal-based execution
- **Apple Silicon Only** - Requires a Metal GPU; exits if GPU acceleration is unavailable
- **Model Format Support**:
  - MLX (Apple's format, loaded via `mlx_lm`)
  - GGUF (via `llama-cpp-python` with the Metal backend)
  - SafeTensors (via HuggingFace `transformers`)
  - PyTorch models (via HuggingFace `transformers` with the MPS device)
  - GPTQ quantized (via `auto-gptq`)
  - AWQ quantized (via `awq`)
- **Quantization** - 4-bit, 5-bit, 8-bit, and mixed-precision quantization via the MLX conversion pipeline
- **Model Conversion** - Convert HuggingFace models to MLX format with configurable quantization recipes
- **Template Registry** - Automatic detection of chat templates (ChatML, Llama, Alpaca, Gemma, Reasoning) with confidence scoring and real-time token filtering for reasoning models (see the sketch after this list)
- **Rotating KV Cache** - MLX-based KV cache for long context handling (default 4096 tokens)
- **Fine-Tuning** - LoRA-based model customization with an interactive 6-step wizard
- **Terminal UI** - ANSI terminal interface with streaming output
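Template detection works by looking for format-specific control tokens in a model's chat template and scoring how confidently each profile matches. As a rough illustration of the idea only (the marker table, function name, and scoring rule below are hypothetical, not Cortex's actual registry code):

```python
# Hypothetical sketch of confidence-scored template detection.
# The marker strings are the well-known control tokens of each format;
# the scoring rule is illustrative, not Cortex's implementation.
TEMPLATE_MARKERS = {
    "chatml": ["<|im_start|>", "<|im_end|>"],
    "llama": ["[INST]", "[/INST]"],
    "alpaca": ["### Instruction:", "### Response:"],
    "gemma": ["<start_of_turn>", "<end_of_turn>"],
}

def detect_template(chat_template: str) -> tuple[str, float]:
    """Return (profile_name, confidence) for a raw chat template string."""
    best, best_score = "simple", 0.0
    for name, markers in TEMPLATE_MARKERS.items():
        # Confidence = fraction of this profile's markers found in the template.
        score = sum(m in chat_template for m in markers) / len(markers)
        if score > best_score:
            best, best_score = name, score
    return best, best_score
```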
## Installation

```bash
# Clone and install
git clone https://github.com/faisalmumtaz/Cortex.git
cd Cortex
./install.sh
```

The installer:
- Checks for Apple Silicon (arm64) compatibility
- Creates a Python virtual environment
- Installs dependencies via `pip install -e .` (from `pyproject.toml`)
- Sets up the `cortex` command in your PATH

### Quick Install (pipx)

If you just want the CLI without cloning the repo, use pipx:

```bash
pipx install cortex-llm
```

## Quick Start

```bash
# After installation, just run:
cortex
```

### Downloading Models

```bash
# Inside Cortex, use the download command:
cortex
# Then type: /download
```

The download feature:
- **HuggingFace integration** - download any model by repository ID (a scripted equivalent is sketched below)
- **Automatic loading** - option to load the model immediately after download
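Under the hood, models come from the Hugging Face Hub via `huggingface-hub`, which is a declared dependency. A minimal sketch of scripting the same fetch directly (the repository ID and target directory are example values, not Cortex defaults):

```python
from pathlib import Path
from huggingface_hub import snapshot_download

# Download every file of a Hub repository by its ID (example values).
path = snapshot_download(
    repo_id="mlx-community/Mistral-7B-Instruct-v0.2-4bit",
    local_dir=Path.home() / "models" / "mistral-7b-4bit",
)
print(f"Model files saved to: {path}")
```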
## Documentation

### User Documentation
- **[Installation Guide](docs/installation.md)** - Complete setup instructions
- **[CLI Reference](docs/cli.md)** - Commands and user interface
- **[Configuration](docs/configuration.md)** - System settings and optimization
- **[Model Management](docs/model-management.md)** - Loading and managing models
- **[Template Registry](docs/template-registry.md)** - Automatic chat template detection and management
- **[Fine-Tuning Guide](docs/fine-tuning.md)** - Customize models with LoRA
- **[Troubleshooting](docs/troubleshooting.md)** - Common issues and solutions

### Technical Documentation
- **[MLX Acceleration](docs/mlx-acceleration.md)** - MLX framework integration and optimization
- **[GPU Validation](docs/gpu-validation.md)** - Hardware requirements and detection
- **[Inference Engine](docs/inference-engine.md)** - Text generation architecture
- **[Conversation Management](docs/conversation-management.md)** - Chat history and persistence
- **[Development Guide](docs/development.md)** - Contributing and architecture

## System Requirements

- Apple Silicon Mac (M1/M2/M3/M4 - all variants supported)
- macOS 13.3+ (required by the MLX framework)
- Python 3.11+
- 16GB+ unified memory (24GB+ recommended for larger models)
- Xcode Command Line Tools

## Performance

Performance depends on your Apple Silicon chip, model size, and quantization level. The inference engine measures tokens/second, first-token latency, and memory usage at runtime.

To check that GPU acceleration is working:

```bash
source venv/bin/activate
python tests/test_apple_silicon.py
```

You should see:
- All validation checks passing
- Measured GFLOPS from matrix operations (the sketch after this list shows the idea)
- Confirmation of Metal and MLX availability
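The GFLOPS figure comes from timing large matrix multiplications. For intuition, here is a standalone sketch of such a measurement using MLX directly (the matrix size is arbitrary, and this is not the bundled test itself):

```python
import time
import mlx.core as mx

n = 2048
a = mx.random.normal((n, n))
b = mx.random.normal((n, n))
mx.eval(a, b)  # materialize inputs before timing (MLX evaluates lazily)

start = time.perf_counter()
c = mx.matmul(a, b)
mx.eval(c)  # force the computation to actually run
elapsed = time.perf_counter() - start

# Multiplying two n x n matrices costs about 2*n^3 floating-point operations.
print(f"~{2 * n**3 / elapsed / 1e9:.1f} GFLOPS on {mx.default_device()}")
```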
## GPU Acceleration Architecture

Cortex uses a multi-layer approach, delegating all GPU computation to established frameworks:

1. **MLX Framework (Primary Backend)**
   - Apple's ML framework with native Metal support
   - Quantization support (4-bit, 5-bit, 8-bit, mixed-precision)
   - Rotating KV cache for long contexts
   - JIT compilation via `mx.compile`
   - Operation fusion for reduced kernel launches

2. **PyTorch MPS Backend**
   - Metal Performance Shaders for PyTorch models
   - FP16 optimization and channels-last tensor format

3. **llama.cpp (GGUF Backend)**
   - Metal-accelerated inference for GGUF models

4. **Memory Management**
   - Pre-allocated memory pools with best-fit/first-fit allocation strategies
   - Automatic pool sizing (60% of available memory, capped at 75% of total; see the sketch after this list)
   - Defragmentation support
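The pool-sizing rule in item 4 is plain arithmetic over system memory. A sketch of how it works out, using `psutil` (a declared dependency); the function name is illustrative, not Cortex's API:

```python
import psutil

def default_pool_bytes() -> int:
    """Pool = 60% of currently available memory, capped at 75% of total."""
    vm = psutil.virtual_memory()
    return int(min(0.60 * vm.available, 0.75 * vm.total))

# On a 16 GB machine with 10 GB free: min(6.0 GB, 12.0 GB) -> a 6.0 GB pool.
print(f"{default_pool_bytes() / 1024**3:.1f} GiB")
```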
### Understanding "Skipping Kernel" Messages

When loading GGUF models, you may see messages like:
```
ggml_metal_init: skipping kernel_xxx_bf16 (not supported)
```

**These messages are normal.** They indicate:
- BF16 kernels are being skipped (your GPU uses FP16 instead)
- GPU acceleration is still fully active
- The system automatically uses optimal alternatives

## Troubleshooting

If you suspect the GPU isn't being used:

1. **Run validation**: `python tests/test_apple_silicon.py`
2. **Check output**: You should see passing checks and measured GFLOPS
3. **Monitor tokens/sec**: Displayed during inference
4. **Verify Metal**: Ensure the Xcode Command Line Tools are installed

Common issues:
- **Low performance**: Run `python tests/test_apple_silicon.py` to diagnose
- **Memory errors**: Reduce `gpu_memory_fraction` in config.yaml (example below)
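Of the config.yaml keys, only `gpu_memory_fraction` is named by this README; the snippet below is an illustrative excerpt, not the file's documented schema:

```yaml
# Illustrative config.yaml excerpt - only gpu_memory_fraction is
# confirmed by this README; lower it if you hit memory errors.
gpu_memory_fraction: 0.6
```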
## MLX Model Conversion

Cortex includes an MLX model converter:

```python
from cortex.metal.mlx_converter import MLXConverter, ConversionConfig, QuantizationRecipe

converter = MLXConverter()
config = ConversionConfig(
    quantization=QuantizationRecipe.SPEED_4BIT,  # 4-bit quantization
    compile_model=True  # JIT compilation
)

success, message, output_path = converter.convert_model(
    "microsoft/DialoGPT-medium",
    config=config
)
```

### Quantization Options

- **4-bit**: Maximum speed, 75% size reduction
- **5-bit**: Balanced speed and quality
- **8-bit**: Higher quality, 50% size reduction
- **Mixed Precision**: Custom per-layer quantization (the size math is worked through after this list)
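The size-reduction figures follow from bits per weight relative to an FP16 baseline (ignoring quantization metadata such as per-group scales). Worked out for a hypothetical 7B-parameter model:

```python
params = 7e9  # hypothetical 7B-parameter model
for bits in (16, 8, 5, 4):
    size_gb = params * bits / 8 / 1e9  # bytes per weight = bits / 8
    reduction = 1 - bits / 16          # savings vs. the FP16 baseline
    print(f"{bits}-bit: ~{size_gb:.1f} GB ({reduction:.0%} smaller than FP16)")
# 16-bit ~14.0 GB; 8-bit ~7.0 GB (50%); 5-bit ~4.4 GB (69%); 4-bit ~3.5 GB (75%)
```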
## MLX as Primary Backend

Cortex uses MLX (Apple's machine learning framework) as the primary acceleration backend (see the example after this list):
- **Metal Support**: GPU execution via MLX's built-in Metal operations
- **Quantization**: Support for 4-bit, 5-bit, 8-bit, and mixed-precision quantization
- **Model Conversion**: Convert HuggingFace models to MLX format
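Since MLX-format models are loaded via `mlx_lm`, the delegation ultimately reduces to calls like the following. A minimal sketch using `mlx_lm`'s public `load`/`generate` helpers (the repository ID is an example, and exact signatures vary across `mlx-lm` versions):

```python
from mlx_lm import load, generate

# Load an MLX-format model and its tokenizer (example repo ID).
model, tokenizer = load("mlx-community/Mistral-7B-Instruct-v0.2-4bit")

text = generate(
    model,
    tokenizer,
    prompt="Explain unified memory in one sentence.",
    max_tokens=128,
)
print(text)
```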
## Built With

- [MLX](https://github.com/ml-explore/mlx) - Apple's machine learning framework
- [mlx-lm](https://github.com/ml-explore/mlx-examples) - LLM utilities and LoRA fine-tuning for MLX
- [PyTorch](https://pytorch.org/) - With Metal Performance Shaders backend
- [llama.cpp](https://github.com/ggerganov/llama.cpp) - Metal-accelerated GGUF support
- [Rich](https://github.com/Textualize/rich) - Terminal formatting
- [HuggingFace](https://huggingface.co/) - Model hub and transformers

## Contributing

We welcome contributions! Please see the [Development Guide](docs/development.md) for contributing guidelines and setup instructions.

## License

MIT License - See [LICENSE](LICENSE) for details.

---

**Note**: Cortex requires Apple Silicon. Intel Macs are not supported.
cortex_llm-1.0.4/cortex/__init__.py
ADDED
@@ -0,0 +1,73 @@

"""
Cortex - GPU-Accelerated LLM Terminal for Apple Silicon

A high-performance terminal interface for running Hugging Face LLMs locally
with exclusive GPU acceleration via Metal Performance Shaders (MPS) and MLX.
"""

__version__ = "1.0.4"
__author__ = "Cortex Development Team"
__license__ = "MIT"

from typing import Optional, Dict, Any
import platform
import sys

MINIMUM_PYTHON_VERSION = (3, 11)
SUPPORTED_PLATFORM = "darwin"

def verify_system_requirements() -> Dict[str, Any]:
    """Verify that the system meets Cortex requirements."""
    requirements = {
        "python_version": sys.version_info >= MINIMUM_PYTHON_VERSION,
        "platform": platform.system().lower() == SUPPORTED_PLATFORM,
        "architecture": platform.machine() == "arm64",
        "errors": []
    }

    if not requirements["python_version"]:
        requirements["errors"].append(
            f"Python {MINIMUM_PYTHON_VERSION[0]}.{MINIMUM_PYTHON_VERSION[1]}+ required, "
            f"found {sys.version_info.major}.{sys.version_info.minor}"
        )

    if not requirements["platform"]:
        requirements["errors"].append(
            f"macOS required, found {platform.system()}"
        )

    if not requirements["architecture"]:
        requirements["errors"].append(
            f"ARM64 architecture required, found {platform.machine()}"
        )

    requirements["valid"] = len(requirements["errors"]) == 0
    return requirements

def initialize_cortex() -> bool:
    """Initialize Cortex and verify system compatibility."""
    requirements = verify_system_requirements()

    if not requirements["valid"]:
        for error in requirements["errors"]:
            print(f"❌ {error}", file=sys.stderr)
        return False

    return True

from cortex.config import Config
from cortex.gpu_validator import GPUValidator
from cortex.model_manager import ModelManager
from cortex.inference_engine import InferenceEngine
from cortex.conversation_manager import ConversationManager

__all__ = [
    "__version__",
    "Config",
    "GPUValidator",
    "ModelManager",
    "InferenceEngine",
    "ConversationManager",
    "initialize_cortex",
    "verify_system_requirements"
]
cortex_llm-1.0.4/cortex/__main__.py
ADDED
@@ -0,0 +1,90 @@

"""Main entry point for Cortex."""

import sys
from pathlib import Path
import os
import warnings

# Disable multiprocessing resource tracking before any imports that might use it
# This prevents the semaphore leak warning from the transformers library
os.environ['PYTHONWARNINGS'] = 'ignore::UserWarning:multiprocessing.resource_tracker'

# Apply MLX compatibility shims before any MLX/MLX-LM imports.
try:
    from cortex.metal.mlx_compat import patch_mlx_lm_device_info
    patch_mlx_lm_device_info()
except Exception:
    pass

# Alternative: Monkey-patch the resource tracker before it's used
try:
    from multiprocessing import resource_tracker
    def dummy_register(*args, **kwargs):
        pass
    def dummy_unregister(*args, **kwargs):
        pass
    resource_tracker.register = dummy_register
    resource_tracker.unregister = dummy_unregister
except ImportError:
    pass

from cortex.config import Config
from cortex.gpu_validator import GPUValidator
from cortex.model_manager import ModelManager
from cortex.inference_engine import InferenceEngine
from cortex.conversation_manager import ConversationManager
from cortex.ui.cli import CortexCLI


def main():
    """Main entry point."""

    inference_engine = None
    try:
        # Load configuration
        config = Config()

        # Initialize GPU validator
        gpu_validator = GPUValidator()

        # Validate GPU
        is_valid, gpu_info, errors = gpu_validator.validate()
        if not is_valid:
            print("Error: GPU validation failed. Cortex requires Apple Silicon with Metal support.", file=sys.stderr)
            for error in errors:
                print(f"  - {error}", file=sys.stderr)
            sys.exit(1)

        # Initialize components
        model_manager = ModelManager(config, gpu_validator)
        inference_engine = InferenceEngine(config, model_manager)
        conversation_manager = ConversationManager(config)

        # Create and run the CLI
        cli = CortexCLI(
            config=config,
            gpu_validator=gpu_validator,
            model_manager=model_manager,
            inference_engine=inference_engine,
            conversation_manager=conversation_manager
        )

        cli.run()
    finally:
        # Clean up resources
        if inference_engine is not None and hasattr(inference_engine, 'memory_pool') and inference_engine.memory_pool:
            inference_engine.memory_pool.cleanup()

        # Force PyTorch cleanup
        try:
            import torch
            if torch.backends.mps.is_available():
                torch.mps.synchronize()
                if hasattr(torch.mps, 'empty_cache'):
                    torch.mps.empty_cache()
        except Exception:
            pass  # Ignore cleanup errors


if __name__ == "__main__":
    main()