cortex-llm 1.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. cortex_llm-1.0.3/LICENSE +21 -0
  2. cortex_llm-1.0.3/PKG-INFO +275 -0
  3. cortex_llm-1.0.3/README.md +216 -0
  4. cortex_llm-1.0.3/cortex/__init__.py +73 -0
  5. cortex_llm-1.0.3/cortex/__main__.py +89 -0
  6. cortex_llm-1.0.3/cortex/config.py +329 -0
  7. cortex_llm-1.0.3/cortex/conversation_manager.py +468 -0
  8. cortex_llm-1.0.3/cortex/fine_tuning/__init__.py +8 -0
  9. cortex_llm-1.0.3/cortex/fine_tuning/dataset.py +332 -0
  10. cortex_llm-1.0.3/cortex/fine_tuning/mlx_lora_trainer.py +502 -0
  11. cortex_llm-1.0.3/cortex/fine_tuning/trainer.py +957 -0
  12. cortex_llm-1.0.3/cortex/fine_tuning/wizard.py +707 -0
  13. cortex_llm-1.0.3/cortex/gpu_validator.py +467 -0
  14. cortex_llm-1.0.3/cortex/inference_engine.py +729 -0
  15. cortex_llm-1.0.3/cortex/metal/__init__.py +275 -0
  16. cortex_llm-1.0.3/cortex/metal/gpu_validator.py +177 -0
  17. cortex_llm-1.0.3/cortex/metal/memory_pool.py +886 -0
  18. cortex_llm-1.0.3/cortex/metal/mlx_accelerator.py +680 -0
  19. cortex_llm-1.0.3/cortex/metal/mlx_compat.py +90 -0
  20. cortex_llm-1.0.3/cortex/metal/mlx_converter.py +638 -0
  21. cortex_llm-1.0.3/cortex/metal/mps_optimizer.py +417 -0
  22. cortex_llm-1.0.3/cortex/metal/optimizer.py +665 -0
  23. cortex_llm-1.0.3/cortex/metal/performance_profiler.py +364 -0
  24. cortex_llm-1.0.3/cortex/model_downloader.py +130 -0
  25. cortex_llm-1.0.3/cortex/model_manager.py +2187 -0
  26. cortex_llm-1.0.3/cortex/quantization/__init__.py +5 -0
  27. cortex_llm-1.0.3/cortex/quantization/dynamic_quantizer.py +736 -0
  28. cortex_llm-1.0.3/cortex/template_registry/__init__.py +15 -0
  29. cortex_llm-1.0.3/cortex/template_registry/auto_detector.py +144 -0
  30. cortex_llm-1.0.3/cortex/template_registry/config_manager.py +234 -0
  31. cortex_llm-1.0.3/cortex/template_registry/interactive.py +260 -0
  32. cortex_llm-1.0.3/cortex/template_registry/registry.py +347 -0
  33. cortex_llm-1.0.3/cortex/template_registry/template_profiles/__init__.py +5 -0
  34. cortex_llm-1.0.3/cortex/template_registry/template_profiles/base.py +142 -0
  35. cortex_llm-1.0.3/cortex/template_registry/template_profiles/complex/__init__.py +5 -0
  36. cortex_llm-1.0.3/cortex/template_registry/template_profiles/complex/reasoning.py +263 -0
  37. cortex_llm-1.0.3/cortex/template_registry/template_profiles/standard/__init__.py +9 -0
  38. cortex_llm-1.0.3/cortex/template_registry/template_profiles/standard/alpaca.py +73 -0
  39. cortex_llm-1.0.3/cortex/template_registry/template_profiles/standard/chatml.py +82 -0
  40. cortex_llm-1.0.3/cortex/template_registry/template_profiles/standard/gemma.py +103 -0
  41. cortex_llm-1.0.3/cortex/template_registry/template_profiles/standard/llama.py +87 -0
  42. cortex_llm-1.0.3/cortex/template_registry/template_profiles/standard/simple.py +65 -0
  43. cortex_llm-1.0.3/cortex/ui/__init__.py +120 -0
  44. cortex_llm-1.0.3/cortex/ui/cli.py +1685 -0
  45. cortex_llm-1.0.3/cortex/ui/markdown_render.py +185 -0
  46. cortex_llm-1.0.3/cortex/ui/terminal_app.py +534 -0
  47. cortex_llm-1.0.3/cortex_llm.egg-info/PKG-INFO +275 -0
  48. cortex_llm-1.0.3/cortex_llm.egg-info/SOURCES.txt +56 -0
  49. cortex_llm-1.0.3/cortex_llm.egg-info/dependency_links.txt +1 -0
  50. cortex_llm-1.0.3/cortex_llm.egg-info/entry_points.txt +2 -0
  51. cortex_llm-1.0.3/cortex_llm.egg-info/not-zip-safe +1 -0
  52. cortex_llm-1.0.3/cortex_llm.egg-info/requires.txt +31 -0
  53. cortex_llm-1.0.3/cortex_llm.egg-info/top_level.txt +1 -0
  54. cortex_llm-1.0.3/pyproject.toml +112 -0
  55. cortex_llm-1.0.3/setup.cfg +4 -0
  56. cortex_llm-1.0.3/setup.py +82 -0
  57. cortex_llm-1.0.3/tests/test_apple_silicon.py +602 -0
  58. cortex_llm-1.0.3/tests/test_metal_optimization.py +279 -0

cortex_llm-1.0.3/LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Faisal Mumtaz

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

cortex_llm-1.0.3/PKG-INFO
@@ -0,0 +1,275 @@
Metadata-Version: 2.4
Name: cortex-llm
Version: 1.0.3
Summary: GPU-Accelerated LLM Terminal for Apple Silicon
Home-page: https://github.com/faisalmumtaz/Cortex
Author: Cortex Development Team
License: MIT
Project-URL: Homepage, https://github.com/faisalmumtaz/Cortex
Project-URL: Bug Tracker, https://github.com/faisalmumtaz/Cortex/issues
Project-URL: Documentation, https://github.com/faisalmumtaz/Cortex/wiki
Keywords: llm,gpu,metal,mps,apple-silicon,ai,machine-learning,terminal,mlx,pytorch
Platform: darwin
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Operating System :: MacOS
Classifier: Environment :: Console
Classifier: Environment :: GPU
Requires-Python: >=3.11
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: torch>=2.1.0
Requires-Dist: mlx>=0.10.0
Requires-Dist: mlx-lm>=0.10.0
Requires-Dist: transformers>=4.36.0
Requires-Dist: safetensors>=0.4.0
Requires-Dist: huggingface-hub>=0.19.0
Requires-Dist: accelerate>=0.25.0
Requires-Dist: llama-cpp-python>=0.2.0
Requires-Dist: pyyaml>=6.0
Requires-Dist: pydantic>=2.5.0
Requires-Dist: rich>=13.0.0
Requires-Dist: psutil>=5.9.0
Requires-Dist: numpy>=1.24.0
Requires-Dist: packaging>=23.0
Requires-Dist: requests>=2.31.0
Provides-Extra: dev
Requires-Dist: pytest>=7.4.0; extra == "dev"
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
Requires-Dist: black>=23.0.0; extra == "dev"
Requires-Dist: ruff>=0.1.0; extra == "dev"
Requires-Dist: mypy>=1.8.0; extra == "dev"
Provides-Extra: optional
Requires-Dist: sentencepiece>=0.1.99; extra == "optional"
Requires-Dist: auto-gptq>=0.7.0; extra == "optional"
Requires-Dist: autoawq>=0.2.0; extra == "optional"
Requires-Dist: bitsandbytes>=0.41.0; extra == "optional"
Requires-Dist: optimum>=1.16.0; extra == "optional"
Requires-Dist: torchvision>=0.16.0; extra == "optional"
Requires-Dist: torchaudio>=2.1.0; extra == "optional"
Dynamic: home-page
Dynamic: license-file
Dynamic: platform
Dynamic: requires-python

cortex_llm-1.0.3/README.md
@@ -0,0 +1,216 @@
# Cortex - LLM Terminal Client for Apple Silicon

Cortex is an LLM terminal interface designed for Apple Silicon, using the MLX and PyTorch MPS frameworks for GPU-accelerated inference.

## What It Does

- **GPU-accelerated inference** via MLX (primary) and PyTorch MPS backends
- **Apple Silicon required** - leverages the unified memory architecture
- **Multiple model formats** - MLX, GGUF, SafeTensors, PyTorch, GPTQ, AWQ
- **Built-in fine-tuning** - LoRA-based model customization via an interactive wizard
- **Chat template auto-detection** - automatic format detection with confidence scoring
- **Conversation persistence** - SQLite-backed chat history with branching

## Features

- **GPU-Accelerated Inference** - Delegates to MLX and PyTorch MPS for Metal-based execution
- **Apple Silicon Only** - Requires a Metal GPU; exits if GPU acceleration is unavailable
- **Model Format Support**:
  - MLX (Apple's format, loaded via `mlx_lm`)
  - GGUF (via `llama-cpp-python` with Metal backend)
  - SafeTensors (via HuggingFace `transformers`)
  - PyTorch models (via HuggingFace `transformers` with MPS device)
  - GPTQ quantized (via `auto-gptq`)
  - AWQ quantized (via `autoawq`)
- **Quantization** - 4-bit, 5-bit, 8-bit, and mixed-precision quantization via the MLX conversion pipeline
- **Model Conversion** - Convert HuggingFace models to MLX format with configurable quantization recipes
- **Template Registry** - Automatic detection of chat templates (ChatML, Llama, Alpaca, Gemma, Reasoning) with confidence scoring and real-time token filtering for reasoning models (see the sketch after this list)
- **Rotating KV Cache** - MLX-based KV cache for long context handling (default 4096 tokens)
- **Fine-Tuning** - LoRA-based model customization with an interactive 6-step wizard
- **Terminal UI** - ANSI terminal interface with streaming output
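
Template auto-detection reduces to a small scoring loop: check each profile's characteristic markers against the model's chat template and keep the best-scoring profile. The sketch below is illustrative: the marker strings are the real delimiters of each format, but the scoring function is a stand-in for the package's actual detector in `cortex/template_registry/auto_detector.py`.

```python
# Illustrative confidence scoring for chat-template detection.
# Marker strings are the conventional delimiters for each format;
# the function itself is a sketch, not Cortex's actual detector.
TEMPLATE_MARKERS = {
    "chatml": ["<|im_start|>", "<|im_end|>"],
    "llama": ["[INST]", "[/INST]"],
    "alpaca": ["### Instruction:", "### Response:"],
    "gemma": ["<start_of_turn>", "<end_of_turn>"],
}

def detect_template(chat_template: str) -> tuple[str, float]:
    """Return (profile_name, confidence) for the best-matching profile."""
    best_name, best_score = "simple", 0.0
    for name, markers in TEMPLATE_MARKERS.items():
        hits = sum(marker in chat_template for marker in markers)
        score = hits / len(markers)  # fraction of this profile's markers present
        if score > best_score:
            best_name, best_score = name, score
    return best_name, best_score
```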

## Installation

```bash
# Clone and install
git clone https://github.com/faisalmumtaz/Cortex.git
cd Cortex
./install.sh
```

The installer:
- Checks for Apple Silicon (arm64) compatibility
- Creates a Python virtual environment
- Installs dependencies via `pip install -e .` (from `pyproject.toml`)
- Sets up the `cortex` command in your PATH

### Quick Install (pipx)

If you just want the CLI without cloning the repo, use pipx:

```bash
pipx install cortex-llm
```

## Quick Start

```bash
# After installation, just run:
cortex
```

### Downloading Models

```bash
# Inside Cortex, use the download command:
cortex
# Then type: /download
```

The download feature:
- **HuggingFace integration** - download any model by repository ID (see the sketch below)
- **Automatic loading** - option to load the model immediately after download
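
Under the hood this is a Hugging Face Hub download, and the same result can be had programmatically with the `huggingface-hub` library Cortex already depends on (standard `huggingface_hub` API; the repository ID below is just an example):

```python
from huggingface_hub import snapshot_download

# Fetch every file in the repository into the local HF cache
# and return the path to the downloaded snapshot.
local_path = snapshot_download(repo_id="mlx-community/Mistral-7B-Instruct-v0.3-4bit")
print(f"Model files at: {local_path}")
```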

## Documentation

### User Documentation
- **[Installation Guide](docs/installation.md)** - Complete setup instructions
- **[CLI Reference](docs/cli.md)** - Commands and user interface
- **[Configuration](docs/configuration.md)** - System settings and optimization
- **[Model Management](docs/model-management.md)** - Loading and managing models
- **[Template Registry](docs/template-registry.md)** - Automatic chat template detection and management
- **[Fine-Tuning Guide](docs/fine-tuning.md)** - Customize models with LoRA
- **[Troubleshooting](docs/troubleshooting.md)** - Common issues and solutions

### Technical Documentation
- **[MLX Acceleration](docs/mlx-acceleration.md)** - MLX framework integration and optimization
- **[GPU Validation](docs/gpu-validation.md)** - Hardware requirements and detection
- **[Inference Engine](docs/inference-engine.md)** - Text generation architecture
- **[Conversation Management](docs/conversation-management.md)** - Chat history and persistence
- **[Development Guide](docs/development.md)** - Contributing and architecture

## System Requirements

- Apple Silicon Mac (M1/M2/M3/M4 - all variants supported)
- macOS 13.3+ (required by the MLX framework)
- Python 3.11+
- 16GB+ unified memory (24GB+ recommended for larger models)
- Xcode Command Line Tools
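
The package ships a requirements check (defined in `cortex/__init__.py`, shown later in this diff) that reports exactly which of these constraints fails:

```python
from cortex import verify_system_requirements

report = verify_system_requirements()
if report["valid"]:
    print("System OK: macOS on arm64 with a supported Python")
else:
    for error in report["errors"]:
        print(f"Requirement not met: {error}")
```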

## Performance

Performance depends on your Apple Silicon chip, model size, and quantization level. The inference engine measures tokens/second, first-token latency, and memory usage at runtime.

To check that GPU acceleration is working:

```bash
source venv/bin/activate
python tests/test_apple_silicon.py
```

You should see:
- All validation checks passing
- Measured GFLOPS from matrix operations
- Confirmation of Metal and MLX availability
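
The GFLOPS figure comes from timing large matrix multiplications. A minimal MLX version of that measurement looks like this (a rough stand-in for what the test does, not the test itself; note that MLX is lazy, so `mx.eval` is what forces GPU execution):

```python
import time
import mlx.core as mx

n = 2048
a = mx.random.normal((n, n))
b = mx.random.normal((n, n))
mx.eval(a, b)  # materialize inputs before timing

start = time.perf_counter()
c = mx.matmul(a, b)
mx.eval(c)  # force the lazy computation to actually run
elapsed = time.perf_counter() - start

# An n x n matmul costs roughly 2 * n^3 floating-point operations.
gflops = (2 * n**3) / elapsed / 1e9
print(f"{gflops:.1f} GFLOPS")
```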

## GPU Acceleration Architecture

Cortex uses a multi-layer approach, delegating all GPU computation to established frameworks:

1. **MLX Framework (Primary Backend)**
   - Apple's ML framework with native Metal support
   - Quantization support (4-bit, 5-bit, 8-bit, mixed-precision)
   - Rotating KV cache for long contexts
   - JIT compilation via `mx.compile`
   - Operation fusion for reduced kernel launches

2. **PyTorch MPS Backend**
   - Metal Performance Shaders for PyTorch models
   - FP16 optimization and channels-last tensor format

3. **llama.cpp (GGUF Backend)**
   - Metal-accelerated inference for GGUF models

4. **Memory Management**
   - Pre-allocated memory pools with best-fit/first-fit allocation strategies
   - Automatic pool sizing (60% of available memory, capped at 75% of total; see the sketch after this list)
   - Defragmentation support
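
The pool-sizing rule is plain arithmetic over the numbers `psutil` reports; a minimal sketch of it (the real pool logic lives in `cortex/metal/memory_pool.py`):

```python
import psutil

def default_pool_bytes() -> int:
    """Sizing rule from above: 60% of available memory, capped at 75% of total."""
    vm = psutil.virtual_memory()
    return min(int(vm.available * 0.60), int(vm.total * 0.75))

print(f"Pool size: {default_pool_bytes() / 1024**3:.1f} GiB")
```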

### Understanding "Skipping Kernel" Messages

When loading GGUF models, you may see messages like:
```
ggml_metal_init: skipping kernel_xxx_bf16 (not supported)
```

**These messages are normal.** They indicate:
- BF16 kernels are being skipped (your GPU uses FP16 instead)
- GPU acceleration is still fully active
- The system automatically uses optimal alternatives

## Troubleshooting

If you suspect the GPU isn't being used:

1. **Run validation**: `python tests/test_apple_silicon.py`
2. **Check output**: You should see passing checks and measured GFLOPS
3. **Monitor tokens/sec**: Displayed during inference
4. **Verify Metal**: Ensure the Xcode Command Line Tools are installed

Common issues:
- **Low performance**: Run `python tests/test_apple_silicon.py` to diagnose
- **Memory errors**: Reduce `gpu_memory_fraction` in config.yaml

## MLX Model Conversion

Cortex includes an MLX model converter:

```python
from cortex.metal.mlx_converter import MLXConverter, ConversionConfig, QuantizationRecipe

converter = MLXConverter()
config = ConversionConfig(
    quantization=QuantizationRecipe.SPEED_4BIT,  # 4-bit quantization
    compile_model=True  # JIT compilation
)

success, message, output_path = converter.convert_model(
    "microsoft/DialoGPT-medium",
    config=config
)
```

### Quantization Options

- **4-bit**: Maximum speed, 75% size reduction
- **5-bit**: Balanced speed and quality
- **8-bit**: Higher quality, 50% size reduction
- **Mixed Precision**: Custom per-layer quantization
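
The size reductions follow directly from bits per weight relative to FP16 (ignoring the small overhead of quantization scales). For a 7B-parameter model:

```python
params = 7_000_000_000  # 7B-parameter model

for bits in (16, 8, 5, 4):
    gib = params * bits / 8 / 1024**3  # weight bytes -> GiB
    reduction = 1 - bits / 16          # savings vs. FP16
    print(f"{bits:>2}-bit: ~{gib:4.1f} GiB ({reduction:.0%} smaller than FP16)")

# 16-bit: ~13.0 GiB (0%), 8-bit: ~6.5 GiB (50%), 4-bit: ~3.3 GiB (75%)
```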

## MLX as Primary Backend

Cortex uses MLX (Apple's machine learning framework) as the primary acceleration backend:
- **Metal Support**: GPU execution via MLX's built-in Metal operations
- **Quantization**: Support for 4-bit, 5-bit, 8-bit, and mixed-precision quantization
- **Model Conversion**: Convert HuggingFace models to MLX format
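
For MLX-format models, loading and generation are delegated to `mlx-lm`. Outside the Cortex UI you can exercise that backend directly with the standard `mlx_lm` API (the model ID below is an example):

```python
from mlx_lm import load, generate

# Load an MLX-format model from a local path or a Hub repo ID.
model, tokenizer = load("mlx-community/Mistral-7B-Instruct-v0.3-4bit")

text = generate(
    model,
    tokenizer,
    prompt="Explain unified memory in one sentence.",
    max_tokens=64,
)
print(text)
```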

## Built With

- [MLX](https://github.com/ml-explore/mlx) - Apple's machine learning framework
- [mlx-lm](https://github.com/ml-explore/mlx-examples) - LLM utilities and LoRA fine-tuning for MLX
- [PyTorch](https://pytorch.org/) - With Metal Performance Shaders backend
- [llama.cpp](https://github.com/ggerganov/llama.cpp) - Metal-accelerated GGUF support
- [Rich](https://github.com/Textualize/rich) - Terminal formatting
- [HuggingFace](https://huggingface.co/) - Model hub and transformers

## Contributing

We welcome contributions! Please see the [Development Guide](docs/development.md) for contributing guidelines and setup instructions.

## License

MIT License - See [LICENSE](LICENSE) for details.

---

**Note**: Cortex requires Apple Silicon. Intel Macs are not supported.

cortex_llm-1.0.3/cortex/__init__.py
@@ -0,0 +1,73 @@
"""
Cortex - GPU-Accelerated LLM Terminal for Apple Silicon

A high-performance terminal interface for running Hugging Face LLMs locally
with exclusive GPU acceleration via Metal Performance Shaders (MPS) and MLX.
"""

__version__ = "1.0.3"
__author__ = "Cortex Development Team"
__license__ = "MIT"

from typing import Dict, Any
import platform
import sys

MINIMUM_PYTHON_VERSION = (3, 11)
SUPPORTED_PLATFORM = "darwin"

def verify_system_requirements() -> Dict[str, Any]:
    """Verify that the system meets Cortex requirements."""
    requirements = {
        "python_version": sys.version_info >= MINIMUM_PYTHON_VERSION,
        "platform": platform.system().lower() == SUPPORTED_PLATFORM,
        "architecture": platform.machine() == "arm64",
        "errors": []
    }

    if not requirements["python_version"]:
        requirements["errors"].append(
            f"Python {MINIMUM_PYTHON_VERSION[0]}.{MINIMUM_PYTHON_VERSION[1]}+ required, "
            f"found {sys.version_info.major}.{sys.version_info.minor}"
        )

    if not requirements["platform"]:
        requirements["errors"].append(
            f"macOS required, found {platform.system()}"
        )

    if not requirements["architecture"]:
        requirements["errors"].append(
            f"ARM64 architecture required, found {platform.machine()}"
        )

    requirements["valid"] = len(requirements["errors"]) == 0
    return requirements

def initialize_cortex() -> bool:
    """Initialize Cortex and verify system compatibility."""
    requirements = verify_system_requirements()

    if not requirements["valid"]:
        for error in requirements["errors"]:
            print(f"❌ {error}", file=sys.stderr)
        return False

    return True

from cortex.config import Config
from cortex.gpu_validator import GPUValidator
from cortex.model_manager import ModelManager
from cortex.inference_engine import InferenceEngine
from cortex.conversation_manager import ConversationManager

__all__ = [
    "__version__",
    "Config",
    "GPUValidator",
    "ModelManager",
    "InferenceEngine",
    "ConversationManager",
    "initialize_cortex",
    "verify_system_requirements"
]

cortex_llm-1.0.3/cortex/__main__.py
@@ -0,0 +1,89 @@
"""Main entry point for Cortex."""

import sys
import os
import warnings

# Disable multiprocessing resource tracking before any imports that might use it.
# This prevents the semaphore leak warning from the transformers library.
os.environ['PYTHONWARNINGS'] = 'ignore::UserWarning:multiprocessing.resource_tracker'

# Silence a known MLX deprecation warning surfaced during generation.
warnings.filterwarnings(
    "ignore",
    message=r"mx\.metal\.device_info is deprecated.*",
)

# Alternative: monkey-patch the resource tracker before it's used
try:
    from multiprocessing import resource_tracker

    def dummy_register(*args, **kwargs):
        pass

    def dummy_unregister(*args, **kwargs):
        pass

    resource_tracker.register = dummy_register
    resource_tracker.unregister = dummy_unregister
except ImportError:
    pass

from cortex.config import Config
from cortex.gpu_validator import GPUValidator
from cortex.model_manager import ModelManager
from cortex.inference_engine import InferenceEngine
from cortex.conversation_manager import ConversationManager
from cortex.ui.cli import CortexCLI


def main():
    """Main entry point."""

    inference_engine = None
    try:
        # Load configuration
        config = Config()

        # Initialize GPU validator
        gpu_validator = GPUValidator()

        # Validate GPU
        is_valid, gpu_info, errors = gpu_validator.validate()
        if not is_valid:
            print("Error: GPU validation failed. Cortex requires Apple Silicon with Metal support.", file=sys.stderr)
            for error in errors:
                print(f"  - {error}", file=sys.stderr)
            sys.exit(1)

        # Initialize components
        model_manager = ModelManager(config, gpu_validator)
        inference_engine = InferenceEngine(config, model_manager)
        conversation_manager = ConversationManager(config)

        # Create and run the CLI
        cli = CortexCLI(
            config=config,
            gpu_validator=gpu_validator,
            model_manager=model_manager,
            inference_engine=inference_engine,
            conversation_manager=conversation_manager
        )

        cli.run()
    finally:
        # Clean up resources
        if inference_engine is not None and hasattr(inference_engine, 'memory_pool') and inference_engine.memory_pool:
            inference_engine.memory_pool.cleanup()

        # Force PyTorch cleanup
        try:
            import torch
            if torch.backends.mps.is_available():
                torch.mps.synchronize()
                if hasattr(torch.mps, 'empty_cache'):
                    torch.mps.empty_cache()
        except Exception:
            pass  # Ignore cleanup errors


if __name__ == "__main__":
    main()