cortex-llm 1.0.5__tar.gz → 1.0.7__tar.gz
This diff shows the content of publicly released package versions as they appear in their public registries; it is provided for informational purposes only and reflects the changes between those versions.
- cortex_llm-1.0.7/PKG-INFO +169 -0
- cortex_llm-1.0.7/README.md +111 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/__init__.py +1 -1
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/config.py +1 -1
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/inference_engine.py +17 -3
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/model_manager.py +45 -1
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/quantization/dynamic_quantizer.py +8 -5
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/ui/cli.py +59 -41
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/ui/markdown_render.py +68 -3
- cortex_llm-1.0.7/cortex_llm.egg-info/PKG-INFO +169 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/pyproject.toml +3 -4
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/setup.py +4 -4
- cortex_llm-1.0.5/PKG-INFO +0 -275
- cortex_llm-1.0.5/README.md +0 -216
- cortex_llm-1.0.5/cortex_llm.egg-info/PKG-INFO +0 -275
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/LICENSE +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/__main__.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/conversation_manager.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/fine_tuning/__init__.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/fine_tuning/dataset.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/fine_tuning/mlx_lora_trainer.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/fine_tuning/trainer.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/fine_tuning/wizard.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/gpu_validator.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/metal/__init__.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/metal/gpu_validator.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/metal/memory_pool.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/metal/mlx_accelerator.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/metal/mlx_compat.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/metal/mlx_converter.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/metal/mps_optimizer.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/metal/optimizer.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/metal/performance_profiler.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/model_downloader.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/quantization/__init__.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/__init__.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/auto_detector.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/config_manager.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/interactive.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/registry.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/__init__.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/base.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/complex/__init__.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/complex/reasoning.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/standard/__init__.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/standard/alpaca.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/standard/chatml.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/standard/gemma.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/standard/llama.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/standard/simple.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/ui/__init__.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/ui/terminal_app.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex_llm.egg-info/SOURCES.txt +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex_llm.egg-info/dependency_links.txt +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex_llm.egg-info/entry_points.txt +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex_llm.egg-info/not-zip-safe +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex_llm.egg-info/requires.txt +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex_llm.egg-info/top_level.txt +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/setup.cfg +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/tests/test_apple_silicon.py +0 -0
- {cortex_llm-1.0.5 → cortex_llm-1.0.7}/tests/test_metal_optimization.py +0 -0
cortex_llm-1.0.7/PKG-INFO
@@ -0,0 +1,169 @@
+Metadata-Version: 2.4
+Name: cortex-llm
+Version: 1.0.7
+Summary: GPU-Accelerated LLM Terminal for Apple Silicon
+Home-page: https://github.com/faisalmumtaz/Cortex
+Author: Cortex Development Team
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/faisalmumtaz/Cortex
+Project-URL: Bug Tracker, https://github.com/faisalmumtaz/Cortex/issues
+Project-URL: Documentation, https://github.com/faisalmumtaz/Cortex/wiki
+Keywords: llm,gpu,metal,mps,apple-silicon,ai,machine-learning,terminal,mlx,pytorch
+Platform: darwin
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Operating System :: MacOS
+Classifier: Environment :: Console
+Classifier: Environment :: GPU
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: torch>=2.1.0
+Requires-Dist: mlx>=0.30.4
+Requires-Dist: mlx-lm>=0.30.5
+Requires-Dist: transformers>=4.36.0
+Requires-Dist: safetensors>=0.4.0
+Requires-Dist: huggingface-hub>=0.19.0
+Requires-Dist: accelerate>=0.25.0
+Requires-Dist: llama-cpp-python>=0.2.0
+Requires-Dist: pyyaml>=6.0
+Requires-Dist: pydantic>=2.5.0
+Requires-Dist: rich>=13.0.0
+Requires-Dist: psutil>=5.9.0
+Requires-Dist: numpy>=1.24.0
+Requires-Dist: packaging>=23.0
+Requires-Dist: requests>=2.31.0
+Provides-Extra: dev
+Requires-Dist: pytest>=7.4.0; extra == "dev"
+Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
+Requires-Dist: black>=23.0.0; extra == "dev"
+Requires-Dist: ruff>=0.1.0; extra == "dev"
+Requires-Dist: mypy>=1.8.0; extra == "dev"
+Provides-Extra: optional
+Requires-Dist: sentencepiece>=0.1.99; extra == "optional"
+Requires-Dist: auto-gptq>=0.7.0; extra == "optional"
+Requires-Dist: autoawq>=0.2.0; extra == "optional"
+Requires-Dist: bitsandbytes>=0.41.0; extra == "optional"
+Requires-Dist: optimum>=1.16.0; extra == "optional"
+Requires-Dist: torchvision>=0.16.0; extra == "optional"
+Requires-Dist: torchaudio>=2.1.0; extra == "optional"
+Dynamic: home-page
+Dynamic: license-file
+Dynamic: platform
+Dynamic: requires-python
+
+# Cortex
+
+GPU-accelerated local LLMs on Apple Silicon, built for the terminal.
+
+Cortex is a fast, native CLI for running and fine-tuning LLMs on Apple Silicon using MLX and Metal. It automatically detects chat templates, supports multiple model formats, and keeps your workflow inside the terminal.
+
+## Highlights
+
+- Apple Silicon GPU acceleration via MLX (primary) and PyTorch MPS
+- Multi-format model support: MLX, GGUF, SafeTensors, PyTorch, GPTQ, AWQ
+- Built-in LoRA fine-tuning wizard
+- Chat template auto-detection (ChatML, Llama, Alpaca, Gemma, Reasoning)
+- Conversation history with autosave and export
+
+## Quick Start
+
+```bash
+pipx install cortex-llm
+cortex
+```
+
+Inside Cortex:
+
+- `/download` to fetch a model from HuggingFace
+- `/model` to load or manage models
+- `/status` to confirm GPU acceleration and current settings
+
+## Installation
+
+### Option A: pipx (recommended)
+
+```bash
+pipx install cortex-llm
+```
+
+### Option B: from source
+
+```bash
+git clone https://github.com/faisalmumtaz/Cortex.git
+cd Cortex
+./install.sh
+```
+
+The installer checks Apple Silicon compatibility, creates a venv, installs dependencies from `pyproject.toml`, and sets up the `cortex` command.
+
+## Requirements
+
+- Apple Silicon Mac (M1/M2/M3/M4)
+- macOS 13.3+
+- Python 3.11+
+- 16GB+ unified memory (24GB+ recommended for larger models)
+- Xcode Command Line Tools
+
+## Model Support
+
+Cortex supports:
+
+- **MLX** (recommended)
+- **GGUF** (llama.cpp + Metal)
+- **SafeTensors**
+- **PyTorch** (Transformers + MPS)
+- **GPTQ** / **AWQ** quantized models
+
+## Advanced Features
+
+- **Dynamic quantization fallback** for PyTorch/SafeTensors models that do not fit GPU memory (INT8 preferred, INT4 fallback)
+  - `docs/dynamic-quantization.md`
+- **MLX conversion with quantization recipes** (4/5/8-bit, mixed precision) for speed vs quality control
+  - `docs/mlx-acceleration.md`
+- **LoRA fine-tuning wizard** for local adapters (`/finetune`)
+  - `docs/fine-tuning.md`
+- **Template registry and auto-detection** for chat formatting (ChatML, Llama, Alpaca, Gemma, Reasoning)
+  - `docs/template-registry.md`
+- **Inference engine details** and backend behavior
+  - `docs/inference-engine.md`
+
+## Configuration
+
+Cortex reads `config.yaml` from the current working directory. For tuning GPU memory limits, quantization defaults, and inference parameters, see:
+
+- `docs/configuration.md`
+
+## Documentation
+
+Start here:
+
+- `docs/installation.md`
+- `docs/cli.md`
+- `docs/model-management.md`
+- `docs/troubleshooting.md`
+
+Advanced topics:
+
+- `docs/mlx-acceleration.md`
+- `docs/inference-engine.md`
+- `docs/dynamic-quantization.md`
+- `docs/template-registry.md`
+- `docs/fine-tuning.md`
+- `docs/development.md`
+
+## Contributing
+
+Contributions are welcome. See `docs/development.md` for setup and workflow.
+
+## License
+
+MIT License. See `LICENSE`.
+
+---
+
+Note: Cortex requires Apple Silicon. Intel Macs are not supported.
cortex_llm-1.0.7/README.md
@@ -0,0 +1,111 @@
+# Cortex
+
+GPU-accelerated local LLMs on Apple Silicon, built for the terminal.
+
+Cortex is a fast, native CLI for running and fine-tuning LLMs on Apple Silicon using MLX and Metal. It automatically detects chat templates, supports multiple model formats, and keeps your workflow inside the terminal.
+
+## Highlights
+
+- Apple Silicon GPU acceleration via MLX (primary) and PyTorch MPS
+- Multi-format model support: MLX, GGUF, SafeTensors, PyTorch, GPTQ, AWQ
+- Built-in LoRA fine-tuning wizard
+- Chat template auto-detection (ChatML, Llama, Alpaca, Gemma, Reasoning)
+- Conversation history with autosave and export
+
+## Quick Start
+
+```bash
+pipx install cortex-llm
+cortex
+```
+
+Inside Cortex:
+
+- `/download` to fetch a model from HuggingFace
+- `/model` to load or manage models
+- `/status` to confirm GPU acceleration and current settings
+
+## Installation
+
+### Option A: pipx (recommended)
+
+```bash
+pipx install cortex-llm
+```
+
+### Option B: from source
+
+```bash
+git clone https://github.com/faisalmumtaz/Cortex.git
+cd Cortex
+./install.sh
+```
+
+The installer checks Apple Silicon compatibility, creates a venv, installs dependencies from `pyproject.toml`, and sets up the `cortex` command.
+
+## Requirements
+
+- Apple Silicon Mac (M1/M2/M3/M4)
+- macOS 13.3+
+- Python 3.11+
+- 16GB+ unified memory (24GB+ recommended for larger models)
+- Xcode Command Line Tools
+
+## Model Support
+
+Cortex supports:
+
+- **MLX** (recommended)
+- **GGUF** (llama.cpp + Metal)
+- **SafeTensors**
+- **PyTorch** (Transformers + MPS)
+- **GPTQ** / **AWQ** quantized models
+
+## Advanced Features
+
+- **Dynamic quantization fallback** for PyTorch/SafeTensors models that do not fit GPU memory (INT8 preferred, INT4 fallback)
+  - `docs/dynamic-quantization.md`
+- **MLX conversion with quantization recipes** (4/5/8-bit, mixed precision) for speed vs quality control
+  - `docs/mlx-acceleration.md`
+- **LoRA fine-tuning wizard** for local adapters (`/finetune`)
+  - `docs/fine-tuning.md`
+- **Template registry and auto-detection** for chat formatting (ChatML, Llama, Alpaca, Gemma, Reasoning)
+  - `docs/template-registry.md`
+- **Inference engine details** and backend behavior
+  - `docs/inference-engine.md`
+
+## Configuration
+
+Cortex reads `config.yaml` from the current working directory. For tuning GPU memory limits, quantization defaults, and inference parameters, see:
+
+- `docs/configuration.md`
+
+## Documentation
+
+Start here:
+
+- `docs/installation.md`
+- `docs/cli.md`
+- `docs/model-management.md`
+- `docs/troubleshooting.md`
+
+Advanced topics:
+
+- `docs/mlx-acceleration.md`
+- `docs/inference-engine.md`
+- `docs/dynamic-quantization.md`
+- `docs/template-registry.md`
+- `docs/fine-tuning.md`
+- `docs/development.md`
+
+## Contributing
+
+Contributions are welcome. See `docs/development.md` for setup and workflow.
+
+## License
+
+MIT License. See `LICENSE`.
+
+---
+
+Note: Cortex requires Apple Silicon. Intel Macs are not supported.
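The "dynamic quantization fallback" highlighted in the README above refers to quantizing weights on the fly when a full-precision model does not fit in GPU memory. A minimal, generic sketch of that idea using stock PyTorch dynamic quantization (this is not Cortex's own fallback code, which additionally handles INT4, caching, and MPS-specific logic):

```python
# Generic illustration of INT8 dynamic quantization with stock PyTorch.
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(4096, 4096), nn.ReLU(), nn.Linear(4096, 4096))

quantized = torch.ao.quantization.quantize_dynamic(
    model, {nn.Linear}, dtype=torch.qint8  # INT8 weights, activations stay float
)

x = torch.randn(1, 4096)
print(quantized(x).shape)  # same interface, smaller weight footprint
```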
{cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/__init__.py
@@ -5,7 +5,7 @@ A high-performance terminal interface for running Hugging Face LLMs locally
 with exclusive GPU acceleration via Metal Performance Shaders (MPS) and MLX.
 """
 
-__version__ = "1.0.5"
+__version__ = "1.0.7"
 __author__ = "Cortex Development Team"
 __license__ = "MIT"
 

{cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/config.py
@@ -74,7 +74,7 @@ class InferenceConfig(BaseModel):
     top_p: float = Field(default=0.95, ge=0.0, le=1.0)
     top_k: int = Field(default=40, ge=0)
     repetition_penalty: float = Field(default=1.1, ge=0.0, le=2.0)
-    max_tokens: int = Field(default=
+    max_tokens: int = Field(default=4096, ge=1)
     stream_output: bool = True
     seed: int = Field(default=-1)
 
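The config.py change above raises the default generation budget to 4096 tokens. A minimal sketch of how the new default and the `ge=1` bound behave, assuming the pydantic v2 `BaseModel`/`Field` usage shown in the hunk (only `max_tokens` is taken from the diff; the other field is illustrative):

```python
# Sketch of the InferenceConfig behaviour after the change (pydantic v2 assumed).
from pydantic import BaseModel, Field, ValidationError

class InferenceConfig(BaseModel):
    max_tokens: int = Field(default=4096, ge=1)  # new default from the diff
    stream_output: bool = True

print(InferenceConfig().max_tokens)                  # 4096, the new default
print(InferenceConfig(max_tokens=8192).max_tokens)   # explicit override still works

try:
    InferenceConfig(max_tokens=0)                    # rejected by the ge=1 constraint
except ValidationError as err:
    print(err.errors()[0]["type"])                   # "greater_than_equal"
```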
{cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/inference_engine.py
@@ -138,7 +138,7 @@ class InferenceEngine:
             use_fp16=True,
             use_channels_last=True,
             optimize_memory=True,
-            max_batch_size=self.config.performance.
+            max_batch_size=self.config.performance.max_batch_size
         )
         self.mps_optimizer = MPSOptimizer(mps_config)
 
@@ -153,7 +153,7 @@ class InferenceEngine:
             fuse_operations=True,
             lazy_evaluation=True,
             rotating_kv_cache=True,
-            kv_cache_size=self.config.
+            kv_cache_size=self.config.performance.context_length,
             quantization_bits=4
         )
         self.mlx_accelerator = MLXAccelerator(mlx_config)
@@ -204,6 +204,9 @@ class InferenceEngine:
             yield from self._generate_pytorch(model, tokenizer, request)
         elif model_info.format == ModelFormat.SAFETENSORS:
             yield from self._generate_safetensors(model, tokenizer, request)
+        elif model_info.format == ModelFormat.QUANTIZED:
+            # Quantized models are loaded as PyTorch-compatible modules
+            yield from self._generate_pytorch(model, tokenizer, request)
         elif model_info.format == ModelFormat.GGUF:
             yield from self._generate_gguf(model, tokenizer, request)
         else:
@@ -401,7 +404,18 @@ class InferenceEngine:
         last_metrics_update = time.time()
 
         try:
-            device
+            # Use the model's device when available (quantized models may be CPU-only on macOS)
+            device = None
+            try:
+                first_param = next(model.parameters())
+                device = first_param.device
+            except Exception:
+                device = None
+
+            if device is None or str(device) == "meta":
+                device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
+            elif device.type == "mps" and not torch.backends.mps.is_available():
+                device = torch.device("cpu")
 
             inputs = tokenizer(request.prompt, return_tensors="pt").to(device)
 
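For reference, a standalone sketch of the device-resolution fallback the last hunk adds: prefer the model's own parameter device, and fall back to MPS or CPU when the model exposes no parameters, sits on the `meta` device, or MPS is unavailable (a free function for illustration, not Cortex's actual API):

```python
# Standalone sketch of the device-selection logic added in the hunk above.
import torch

def resolve_device(model: torch.nn.Module) -> torch.device:
    try:
        device = next(model.parameters()).device
    except Exception:
        device = None  # model exposes no parameters

    if device is None or str(device) == "meta":
        # No usable device info: pick MPS when available, otherwise CPU.
        return torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    if device.type == "mps" and not torch.backends.mps.is_available():
        return torch.device("cpu")
    return device
```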
{cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/model_manager.py
@@ -133,7 +133,8 @@ class ModelManager:
         self.quantizer = DynamicQuantizer(QuantizationConfig(
             mode=QuantizationMode.DYNAMIC,
             per_channel=True,
-            cache_quantized=True
+            cache_quantized=True,
+            cache_dir=self.config.model.quantization_cache
         ))
 
         # Initialize MLX converter for native conversion
@@ -201,6 +202,39 @@ class ModelManager:
         level = getattr(self.config.gpu, "gpu_optimization_level", "maximum")
         level = str(level).lower().strip()
         return level in {"maximum", "max", "speed", "fast", "performance"}
+
+    def _get_default_quant_recipe(self) -> Optional[QuantizationRecipe]:
+        """Map configured default_quantization to an MLX quantization recipe."""
+        raw = getattr(self.config.model, "default_quantization", "") or ""
+        value = str(raw).strip().lower()
+        if not value or value == "auto":
+            return None
+
+        mapping = {
+            "q4_k_m": QuantizationRecipe.SPEED_4BIT,
+            "q5_k_m": QuantizationRecipe.BALANCED_5BIT,
+            "q6_k": QuantizationRecipe.QUALITY_8BIT,  # closest available MLX recipe
+            "q8_0": QuantizationRecipe.QUALITY_8BIT,
+            "4bit": QuantizationRecipe.SPEED_4BIT,
+            "5bit": QuantizationRecipe.BALANCED_5BIT,
+            "8bit": QuantizationRecipe.QUALITY_8BIT,
+            "mixed": QuantizationRecipe.MIXED_PRECISION,
+            "none": QuantizationRecipe.NONE,
+        }
+
+        recipe = mapping.get(value)
+        if recipe is None:
+            logger.warning("Unknown default_quantization value: %s", raw)
+            return None
+
+        supported = getattr(self.config.model, "supported_quantizations", None)
+        if supported:
+            supported_norm = {str(s).strip().lower() for s in supported}
+            if value.startswith("q") and value not in supported_norm:
+                logger.warning("default_quantization '%s' not in supported_quantizations", raw)
+                return None
+
+        return recipe
 
     def load_model(
         self,
@@ -374,6 +408,10 @@ class ModelManager:
         except Exception as e:
             logger.warning(f"Could not estimate model parameters: {e}, defaulting to 4-bit")
             quant_recipe = QuantizationRecipe.SPEED_4BIT  # Fallback
+
+        default_recipe = self._get_default_quant_recipe()
+        if default_recipe is not None:
+            quant_recipe = default_recipe
 
         if quantization:
             quant_map = {
@@ -452,6 +490,10 @@ class ModelManager:
         else:
             quant_recipe = QuantizationRecipe.SPEED_4BIT  # Default for larger models
 
+        default_recipe = self._get_default_quant_recipe()
+        if default_recipe is not None:
+            quant_recipe = default_recipe
+
         if quantization:
             quant_map = {
                 "4bit": QuantizationRecipe.SPEED_4BIT,
@@ -563,6 +605,8 @@ class ModelManager:
         )
 
         if not can_load and can_apply_quantization:
+            if not getattr(self.config.model, "auto_quantize", True):
+                return False, f"GPU incompatible: {message} (auto_quantize disabled)"
             # Check if quantization would help
             gpu_status = self.gpu_validator.get_gpu_memory_status()
             available_gb = gpu_status['available_gb']
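A standalone sketch of the name-to-recipe lookup that `_get_default_quant_recipe` introduces above, with a stand-in enum in place of Cortex's `QuantizationRecipe` (the mapping entries are copied from the hunk; the surrounding scaffolding is illustrative):

```python
# Stand-in illustration of the default_quantization -> recipe mapping above.
from enum import Enum
from typing import Optional

class Recipe(Enum):  # stand-in for cortex's QuantizationRecipe
    SPEED_4BIT = "4bit"
    BALANCED_5BIT = "5bit"
    QUALITY_8BIT = "8bit"
    MIXED_PRECISION = "mixed"
    NONE = "none"

_MAPPING = {
    "q4_k_m": Recipe.SPEED_4BIT, "q5_k_m": Recipe.BALANCED_5BIT,
    "q6_k": Recipe.QUALITY_8BIT, "q8_0": Recipe.QUALITY_8BIT,
    "4bit": Recipe.SPEED_4BIT, "5bit": Recipe.BALANCED_5BIT,
    "8bit": Recipe.QUALITY_8BIT, "mixed": Recipe.MIXED_PRECISION,
    "none": Recipe.NONE,
}

def default_recipe(value: str) -> Optional[Recipe]:
    value = value.strip().lower()
    if not value or value == "auto":
        return None             # "auto" defers to the size-based default
    return _MAPPING.get(value)  # unknown names also return None (Cortex logs a warning)

assert default_recipe("Q4_K_M") is Recipe.SPEED_4BIT
assert default_recipe("auto") is None
```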
{cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/quantization/dynamic_quantizer.py
@@ -3,7 +3,7 @@
 import torch
 import torch.nn as nn
 from typing import Dict, Any, Optional, Tuple, Union
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from enum import Enum
 import gc
 from pathlib import Path
@@ -40,6 +40,7 @@ class QuantizationConfig:
     cache_quantized: bool = True  # Cache quantized models to disk
     compress_cache: bool = False  # Compress cached models (slower but smaller)
     validate_quantization: bool = True  # Validate quantized models work correctly
+    cache_dir: Path = field(default_factory=lambda: Path.home() / ".cortex" / "quantized_models")
 
     def to_dict(self) -> Dict[str, Any]:
         """Convert to dictionary for serialization."""
@@ -118,6 +119,8 @@ class DynamicQuantizer:
     def __init__(self, config: Optional[QuantizationConfig] = None):
         """Initialize quantizer with configuration."""
         self.config = config or QuantizationConfig()
+        self.config.cache_dir = Path(self.config.cache_dir).expanduser()
+        self.config.cache_dir.mkdir(parents=True, exist_ok=True)
         self.device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
         self._quantization_cache: Dict[str, Dict[str, Any]] = {}
 
@@ -681,10 +684,10 @@ class DynamicQuantizer:
 
         # Generate cache key including model metadata
         cache_key = hashlib.md5(
-            f"{model_path}_{model_mtime}_{model_size}_{json.dumps(
+            f"{model_path}_{model_mtime}_{model_size}_{json.dumps(self.config.to_dict())}".encode()
         ).hexdigest()
 
-        cache_dir =
+        cache_dir = self.config.cache_dir
         cache_dir.mkdir(parents=True, exist_ok=True)
 
         cache_path = cache_dir / f"{cache_key}.pt"
@@ -723,7 +726,7 @@ class DynamicQuantizer:
             f"{model_path}_{model_mtime}_{model_size}_{json.dumps(config.to_dict())}".encode()
         ).hexdigest()
 
-        cache_path = Path.
+        cache_path = Path(self.config.cache_dir) / f"{cache_key}.pt"
 
         if cache_path.exists():
             try:
@@ -733,4 +736,4 @@ class DynamicQuantizer:
                 # Cache corrupted, will re-quantize
                 cache_path.unlink()
 
-        return None
+        return None
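The quantizer changes above make the cache location configurable and complete the cache-key construction. A minimal sketch of that key scheme: hash the model path, its mtime and size, and the serialized quantization config, so changing either the source file or the settings invalidates the cached artifact (the helper name here is illustrative, not part of Cortex):

```python
# Sketch of the quantization cache-key scheme used in the hunks above.
import hashlib
import json
from pathlib import Path

def quantized_cache_path(model_path: Path, config_dict: dict, cache_dir: Path) -> Path:
    stat = model_path.stat()
    cache_key = hashlib.md5(
        f"{model_path}_{stat.st_mtime}_{stat.st_size}_{json.dumps(config_dict)}".encode()
    ).hexdigest()
    cache_dir = cache_dir.expanduser()
    cache_dir.mkdir(parents=True, exist_ok=True)
    return cache_dir / f"{cache_key}.pt"
```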
{cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/ui/cli.py
@@ -30,7 +30,7 @@ from cortex.conversation_manager import ConversationManager, MessageRole
 from cortex.model_downloader import ModelDownloader
 from cortex.template_registry import TemplateRegistry
 from cortex.fine_tuning import FineTuneWizard
-from cortex.ui.markdown_render import ThinkMarkdown, PrefixedRenderable
+from cortex.ui.markdown_render import ThinkMarkdown, PrefixedRenderable, render_plain_with_think
 
 
 class CortexCLI:
@@ -1135,15 +1135,15 @@ class CortexCLI:
             logger.debug(f"Could not get stop sequences: {e}")
 
         # Create generation request with formatted prompt
-        # Use lower temperature for more focused responses
         request = GenerationRequest(
             prompt=formatted_prompt,
             max_tokens=self.config.inference.max_tokens,
-            temperature=
-            top_p=
+            temperature=self.config.inference.temperature,
+            top_p=self.config.inference.top_p,
             top_k=self.config.inference.top_k,
             repetition_penalty=self.config.inference.repetition_penalty,
-            stream=
+            stream=self.config.inference.stream_output,
+            seed=self.config.inference.seed if self.config.inference.seed >= 0 else None,
             stop_sequences=stop_sequences
         )
 
@@ -1167,50 +1167,65 @@ class CortexCLI:
         prefix_style = Style(color="cyan")
 
         def build_renderable(text: str):
-
-
+            if getattr(self.config.ui, "markdown_rendering", True):
+                markdown = ThinkMarkdown(
+                    text,
+                    code_theme="monokai",
+                    use_line_numbers=False,
+                    syntax_highlighting=getattr(self.config.ui, "syntax_highlighting", True),
+                )
+                renderable = markdown
+            else:
+                renderable = render_plain_with_think(text)
 
-
-            build_renderable(""),
-            console=self.console,
-            refresh_per_second=20,
-            transient=False,
-        ) as live:
-            for token in self.inference_engine.generate(request):
-                if first_token_time is None:
-                    first_token_time = time.time()
+            return PrefixedRenderable(renderable, prefix="⏺ ", prefix_style=prefix_style, indent=" ")
 
-
-
+        original_console_width = self.console._width
+        target_width = max(40, int(self.get_terminal_width() * 0.75))
+        self.console.width = target_width
+        try:
+            with Live(
+                build_renderable(""),
+                console=self.console,
+                auto_refresh=False,
+                refresh_per_second=20,
+                transient=False,
+                vertical_overflow="visible",
+            ) as live:
+                for token in self.inference_engine.generate(request):
+                    if first_token_time is None:
+                        first_token_time = time.time()
 
-
-
-
-
-        )
-
-
-
+                    generated_text += token
+                    token_count += 1
+
+                    display_token = token
+                    if uses_reasoning_template and template_profile and template_profile.supports_streaming():
+                        display_token, should_display = template_profile.process_streaming_response(
+                            token, accumulated_response
+                        )
+                        accumulated_response += token
+                        if not should_display:
+                            display_token = ""
 
-
-
+                    if display_token:
+                        display_text += display_token
 
-
-
-
-
+                    now = time.time()
+                    if display_token and ("\n" in display_token or now - last_render_time >= render_interval):
+                        live.update(build_renderable(display_text), refresh=True)
+                        last_render_time = now
 
-
-
-
-
-
+                if uses_reasoning_template and template_profile:
+                    final_text = template_profile.process_response(generated_text)
+                    generated_text = final_text
+                    if not template_profile.config.show_reasoning:
+                        display_text = final_text
 
-
+                live.update(build_renderable(display_text), refresh=True)
+        finally:
+            self.console._width = original_console_width
 
-        # Add blank line for spacing between response and metrics
-        print()
-
         # Display final metrics in a clean, professional way
         elapsed = time.time() - start_time
         if token_count > 0 and elapsed > 0:
@@ -1238,6 +1253,9 @@ class CortexCLI:
             metrics_line = " · ".join(metrics_parts)
             print(f" \033[2m{metrics_line}\033[0m")
 
+            if token_count >= request.max_tokens:
+                print(f" \033[2m(output truncated at max_tokens={request.max_tokens}; increase in config.yaml)\033[0m")
+
         # Add assistant message to conversation history
         self.conversation_manager.add_message(MessageRole.ASSISTANT, generated_text)
 
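The CLI hunk above switches streaming output to a width-capped, manually refreshed rich `Live` display so Markdown is not re-rendered on every token. A minimal sketch of that throttling pattern with plain rich objects (generic code, not Cortex's classes):

```python
# Sketch of throttled streaming rendering with rich.Live, as in the CLI change above.
import time
from rich.console import Console
from rich.live import Live
from rich.markdown import Markdown

def stream_markdown(tokens, render_interval: float = 0.1) -> str:
    console = Console()
    text, last_render = "", 0.0
    with Live(Markdown(""), console=console, auto_refresh=False,
              transient=False, vertical_overflow="visible") as live:
        for token in tokens:
            text += token
            now = time.time()
            # Re-render on newlines or once per interval, not on every token.
            if "\n" in token or now - last_render >= render_interval:
                live.update(Markdown(text), refresh=True)
                last_render = now
        live.update(Markdown(text), refresh=True)  # final full render
    return text

# Usage: stream_markdown(engine.generate(request))  # hypothetical token iterator
```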