cortex-llm 1.0.0__py3-none-any.whl → 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cortex/__init__.py +2 -2
- cortex/__main__.py +8 -1
- cortex/config.py +1 -1
- cortex/inference_engine.py +20 -4
- cortex/metal/mlx_accelerator.py +3 -1
- cortex/metal/mlx_compat.py +105 -0
- cortex/metal/mlx_converter.py +105 -4
- cortex/model_manager.py +45 -1
- cortex/quantization/dynamic_quantizer.py +8 -5
- cortex/ui/cli.py +61 -43
- cortex/ui/markdown_render.py +68 -3
- cortex_llm-1.0.8.dist-info/METADATA +169 -0
- {cortex_llm-1.0.0.dist-info → cortex_llm-1.0.8.dist-info}/RECORD +17 -16
- cortex_llm-1.0.0.dist-info/METADATA +0 -275
- {cortex_llm-1.0.0.dist-info → cortex_llm-1.0.8.dist-info}/WHEEL +0 -0
- {cortex_llm-1.0.0.dist-info → cortex_llm-1.0.8.dist-info}/entry_points.txt +0 -0
- {cortex_llm-1.0.0.dist-info → cortex_llm-1.0.8.dist-info}/licenses/LICENSE +0 -0
- {cortex_llm-1.0.0.dist-info → cortex_llm-1.0.8.dist-info}/top_level.txt +0 -0
cortex/__init__.py
CHANGED
@@ -5,7 +5,7 @@ A high-performance terminal interface for running Hugging Face LLMs locally
 with exclusive GPU acceleration via Metal Performance Shaders (MPS) and MLX.
 """
 
-__version__ = "1.0.
+__version__ = "1.0.8"
 __author__ = "Cortex Development Team"
 __license__ = "MIT"
 
@@ -70,4 +70,4 @@ __all__ = [
     "ConversationManager",
     "initialize_cortex",
     "verify_system_requirements"
-]
+]
cortex/__main__.py
CHANGED
@@ -9,6 +9,13 @@ import warnings
 # This prevents the semaphore leak warning from transformers library
 os.environ['PYTHONWARNINGS'] = 'ignore::UserWarning:multiprocessing.resource_tracker'
 
+# Apply MLX compatibility shims before any MLX/MLX-LM imports.
+try:
+    from cortex.metal.mlx_compat import patch_mlx_lm_device_info
+    patch_mlx_lm_device_info()
+except Exception:
+    pass
+
 # Alternative: Monkey-patch the resource tracker before it's used
 try:
     from multiprocessing import resource_tracker
@@ -80,4 +87,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
cortex/config.py
CHANGED
@@ -74,7 +74,7 @@ class InferenceConfig(BaseModel):
     top_p: float = Field(default=0.95, ge=0.0, le=1.0)
     top_k: int = Field(default=40, ge=0)
     repetition_penalty: float = Field(default=1.1, ge=0.0, le=2.0)
-    max_tokens: int = Field(default=
+    max_tokens: int = Field(default=4096, ge=1)
     stream_output: bool = True
     seed: int = Field(default=-1)
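The raised `max_tokens` default is the user-visible change here. A minimal sketch of the new defaults, assuming `InferenceConfig` is importable from `cortex.config` as the hunk above suggests:

```python
# Minimal sketch, assuming InferenceConfig is a pydantic model importable as shown above.
from cortex.config import InferenceConfig

cfg = InferenceConfig()
assert cfg.max_tokens == 4096          # new default in 1.0.8 (the 1.0.0 value is truncated in this view)
assert cfg.top_p == 0.95 and cfg.top_k == 40
```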
cortex/inference_engine.py
CHANGED
@@ -25,6 +25,8 @@ try:
 except ImportError:
     mlx_generate = None
     mlx_stream_generate = None
+    from cortex.metal.mlx_compat import patch_mlx_lm_device_info
+    patch_mlx_lm_device_info()
 
 from cortex.config import Config
 from cortex.model_manager import ModelManager, ModelFormat
@@ -136,7 +138,7 @@ class InferenceEngine:
             use_fp16=True,
             use_channels_last=True,
             optimize_memory=True,
-            max_batch_size=self.config.performance.
+            max_batch_size=self.config.performance.max_batch_size
         )
         self.mps_optimizer = MPSOptimizer(mps_config)
 
@@ -151,7 +153,7 @@ class InferenceEngine:
             fuse_operations=True,
             lazy_evaluation=True,
             rotating_kv_cache=True,
-            kv_cache_size=self.config.
+            kv_cache_size=self.config.performance.context_length,
             quantization_bits=4
         )
         self.mlx_accelerator = MLXAccelerator(mlx_config)
@@ -202,6 +204,9 @@ class InferenceEngine:
             yield from self._generate_pytorch(model, tokenizer, request)
         elif model_info.format == ModelFormat.SAFETENSORS:
             yield from self._generate_safetensors(model, tokenizer, request)
+        elif model_info.format == ModelFormat.QUANTIZED:
+            # Quantized models are loaded as PyTorch-compatible modules
+            yield from self._generate_pytorch(model, tokenizer, request)
         elif model_info.format == ModelFormat.GGUF:
             yield from self._generate_gguf(model, tokenizer, request)
         else:
@@ -399,7 +404,18 @@ class InferenceEngine:
         last_metrics_update = time.time()
 
         try:
-            device
+            # Use the model's device when available (quantized models may be CPU-only on macOS)
+            device = None
+            try:
+                first_param = next(model.parameters())
+                device = first_param.device
+            except Exception:
+                device = None
+
+            if device is None or str(device) == "meta":
+                device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
+            elif device.type == "mps" and not torch.backends.mps.is_available():
+                device = torch.device("cpu")
 
             inputs = tokenizer(request.prompt, return_tensors="pt").to(device)
 
@@ -724,4 +740,4 @@ class InferenceEngine:
                 pass
 
         except Exception as e:
-            print(f"Warning: GPU warmup failed: {e}")
+            print(f"Warning: GPU warmup failed: {e}")
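The PyTorch path now resolves the generation device from the model itself before tokenizing, falling back to MPS or CPU. A standalone sketch of that fallback; `resolve_device` is an illustrative name, not part of the diff:

```python
# Standalone sketch of the device-selection fallback added above.
import torch

def resolve_device(model) -> torch.device:
    try:
        device = next(model.parameters()).device      # prefer the model's own device
    except Exception:
        device = None
    if device is None or str(device) == "meta":
        return torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    if device.type == "mps" and not torch.backends.mps.is_available():
        return torch.device("cpu")                    # quantized models may be CPU-only on macOS
    return device
```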
cortex/metal/mlx_accelerator.py
CHANGED
@@ -21,6 +21,8 @@ except ImportError:
     # Fallback if mlx_lm is not available
     generate = None
     stream_generate = None
+    from cortex.metal.mlx_compat import patch_mlx_lm_device_info
+    patch_mlx_lm_device_info()
 
 @dataclass
 class MLXConfig:
@@ -675,4 +677,4 @@ class MLXAccelerator:
         }
 
         logger.debug(f"Benchmark results: {result}")
-        return result
+        return result
cortex/metal/mlx_compat.py
ADDED
@@ -0,0 +1,105 @@
"""Compatibility helpers for MLX / mlx_lm API changes."""

from __future__ import annotations

import contextlib
from typing import Optional, List, Any


def _get_device_info(mx) -> dict:
    try:
        return mx.device_info()
    except Exception:
        return {}

def patch_mlx_device_info() -> None:
    """Redirect deprecated mx.metal.device_info to mx.device_info when possible."""
    try:
        import mlx.core as mx
    except Exception:
        return

    if hasattr(mx, "device_info") and hasattr(mx, "metal") and hasattr(mx.metal, "device_info"):
        try:
            mx.metal.device_info = mx.device_info  # type: ignore[attr-defined]
        except Exception:
            pass


def patch_mlx_lm_device_info() -> None:
    """Patch mlx_lm call sites to use mx.device_info() instead of mx.metal.device_info()."""
    try:
        import mlx.core as mx
        from mlx.utils import tree_reduce
    except Exception:
        return

    if not hasattr(mx, "device_info"):
        return

    patch_mlx_device_info()

    try:
        import mlx_lm.generate as mlx_generate
    except Exception:
        mlx_generate = None

    try:
        import mlx_lm.server as mlx_server
    except Exception:
        mlx_server = None

    if mlx_generate is not None and getattr(mlx_generate, "__cortex_patched__", False) is False:
        @contextlib.contextmanager
        def wired_limit(model: Any, streams: Optional[List[Any]] = None):
            if not mx.metal.is_available():
                try:
                    yield
                finally:
                    pass
                return

            model_bytes = tree_reduce(
                lambda acc, x: acc + x.nbytes if isinstance(x, mx.array) else acc, model, 0
            )
            info = _get_device_info(mx)
            max_rec_size = info.get("max_recommended_working_set_size")

            if max_rec_size and model_bytes > 0.9 * max_rec_size:
                model_mb = model_bytes // 2**20
                max_rec_mb = max_rec_size // 2**20
                print(
                    f"[WARNING] Generating with a model that requires {model_mb} MB "
                    f"which is close to the maximum recommended size of {max_rec_mb} "
                    "MB. This can be slow. See the documentation for possible work-arounds: "
                    "https://github.com/ml-explore/mlx-lm/tree/main#large-models"
                )

            old_limit = None
            if max_rec_size:
                old_limit = mx.set_wired_limit(max_rec_size)

            try:
                yield
            finally:
                if streams is not None:
                    for s in streams:
                        mx.synchronize(s)
                else:
                    mx.synchronize()
                if old_limit is not None:
                    mx.set_wired_limit(old_limit)

        mlx_generate.wired_limit = wired_limit
        mlx_generate.__cortex_patched__ = True

    if mlx_server is not None and getattr(mlx_server, "__cortex_patched__", False) is False:
        def get_system_fingerprint():
            gpu_arch = ""
            if mx.metal.is_available():
                info = _get_device_info(mx)
                gpu_arch = info.get("architecture", "") if isinstance(info, dict) else ""
            return f"{mlx_server.__version__}-{mx.__version__}-{mlx_server.platform.platform()}-{gpu_arch}"

        mlx_server.get_system_fingerprint = get_system_fingerprint
        mlx_server.__cortex_patched__ = True
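The shim is meant to run before any `mlx_lm` import, which is exactly what the `cortex/__main__.py` hunk above does. A minimal usage sketch; the `load`/`stream_generate` imports are assumed to be the usual public mlx-lm helpers:

```python
# Sketch: apply the compatibility shim before importing mlx_lm, mirroring cortex/__main__.py.
try:
    from cortex.metal.mlx_compat import patch_mlx_lm_device_info
    patch_mlx_lm_device_info()  # redirects deprecated mx.metal.device_info() to mx.device_info()
except Exception:
    pass  # MLX not installed, or nothing to patch

from mlx_lm import load, stream_generate  # assumed public mlx-lm helpers; safe to import after patching
```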
cortex/metal/mlx_converter.py
CHANGED
@@ -66,9 +66,22 @@ class MLXConverter:
         self.cache_dir.mkdir(parents=True, exist_ok=True)
         self.conversion_cache = self.cache_dir / "conversion_cache.json"
         self._load_conversion_cache()
+        self._warned_mlx_lm_compat = False
 
         logger.info(f"MLX Converter initialized with cache dir: {self.cache_dir}")
         logger.info(f"MLX LM available: {mlx_utils is not None and load is not None}")
+
+    def _warn_mlx_lm_compat(self, missing: str) -> None:
+        """Warn once when mlx-lm is missing newer helper APIs."""
+        if self._warned_mlx_lm_compat:
+            return
+        self._warned_mlx_lm_compat = True
+        message = (
+            f"[WARN] mlx-lm is missing '{missing}'. Using compatibility fallback. "
+            "For best support, upgrade mlx-lm to a newer version."
+        )
+        logger.warning(message)
+        print(message)
 
     def _load_conversion_cache(self) -> None:
         """Load conversion cache metadata."""
@@ -206,6 +219,83 @@ class MLXConverter:
 
         return download_dir
 
+    def _mlx_get_model_path(self, source_path: Path) -> Tuple[Path, Optional[str]]:
+        """Resolve model path with MLX LM compatibility fallbacks."""
+        if mlx_utils is not None and hasattr(mlx_utils, "get_model_path"):
+            return mlx_utils.get_model_path(str(source_path))
+        self._warn_mlx_lm_compat("get_model_path")
+
+        # Fallback: local path or direct HF download.
+        model_path = Path(source_path)
+        if model_path.exists():
+            hf_repo = None
+            try:
+                from huggingface_hub import ModelCard
+
+                card_path = model_path / "README.md"
+                if card_path.is_file():
+                    card = ModelCard.load(card_path)
+                    hf_repo = getattr(card.data, "base_model", None)
+            except Exception:
+                hf_repo = None
+            return model_path, hf_repo
+
+        try:
+            model_path = Path(
+                snapshot_download(
+                    str(source_path),
+                    allow_patterns=[
+                        "*.json",
+                        "model*.safetensors",
+                        "*.py",
+                        "tokenizer.model",
+                        "*.tiktoken",
+                        "tiktoken.model",
+                        "*.txt",
+                        "*.jsonl",
+                        "*.jinja",
+                    ],
+                )
+            )
+        except Exception as e:
+            raise RuntimeError(f"Failed to download model from Hugging Face: {e}") from e
+
+        return model_path, str(source_path)
+
+    def _mlx_fetch_from_hub(
+        self,
+        model_path: Path,
+        trust_remote_code: bool = False
+    ) -> Tuple[Any, Dict[str, Any], Any]:
+        """Fetch model/config/tokenizer with MLX LM compatibility fallbacks."""
+        if mlx_utils is not None and hasattr(mlx_utils, "fetch_from_hub"):
+            return mlx_utils.fetch_from_hub(
+                model_path,
+                lazy=True,
+                trust_remote_code=trust_remote_code
+            )
+        self._warn_mlx_lm_compat("fetch_from_hub")
+
+        if mlx_utils is not None and hasattr(mlx_utils, "load_model") and hasattr(mlx_utils, "load_tokenizer"):
+            model, model_config = mlx_utils.load_model(model_path, lazy=True)
+            try:
+                tokenizer = mlx_utils.load_tokenizer(
+                    model_path,
+                    eos_token_ids=model_config.get("eos_token_id", None),
+                    tokenizer_config_extra={"trust_remote_code": trust_remote_code},
+                )
+            except TypeError:
+                tokenizer = mlx_utils.load_tokenizer(
+                    model_path,
+                    eos_token_ids=model_config.get("eos_token_id", None),
+                )
+            return model, model_config, tokenizer
+
+        raise RuntimeError(
+            "mlx_lm.utils is missing required helpers (fetch_from_hub/load_model). "
+            "Upgrade mlx-lm to a newer version."
+        )
+
     def _requires_sentencepiece(self, model_path: Path) -> bool:
         """Return True if the model likely needs SentencePiece."""
         # If a fast tokenizer is present, SentencePiece should not be required.
@@ -379,10 +469,17 @@ class MLXConverter:
         # Build quantization configuration
         quantize_config = self._build_quantization_config(config)
 
-
-
-
-
+        try:
+            model_path, hf_repo = self._mlx_get_model_path(Path(source_path))
+        except Exception as e:
+            return False, f"Model path resolution failed: {e}", None
+
+        try:
+            model, model_config, tokenizer = self._mlx_fetch_from_hub(
+                model_path, trust_remote_code=False
+            )
+        except Exception as e:
+            return False, f"Model fetch failed: {e}", None
 
         dtype = model_config.get("torch_dtype", None)
         if dtype in ["float16", "bfloat16", "float32"]:
@@ -398,6 +495,8 @@ class MLXConverter:
         model.update(tree_map_with_path(set_dtype, model.parameters()))
 
         if config.quantization != QuantizationRecipe.NONE:
+            if mlx_utils is None or not hasattr(mlx_utils, "quantize_model"):
+                return False, "MLX LM quantize_model not available; upgrade mlx-lm.", None
             quant_predicate = None
             if quantize_config and "quant_predicate" in quantize_config:
                 quant_predicate = quantize_config["quant_predicate"]
@@ -411,6 +510,8 @@ class MLXConverter:
         )
 
         normalized_hf_repo = self._normalize_hf_repo(hf_repo)
+        if mlx_utils is None or not hasattr(mlx_utils, "save"):
+            return False, "MLX LM save() not available; upgrade mlx-lm.", None
         mlx_utils.save(output_path, model_path, model, tokenizer, model_config, hf_repo=normalized_hf_repo)
         logger.info("MLX conversion completed")
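A short sketch of driving the converter with these fallbacks in place; the `ConversionConfig`/`QuantizationRecipe` names follow the 1.0.0 README example reproduced later in this diff:

```python
# Sketch of a conversion call; names follow the 1.0.0 README example later in this diff.
from cortex.metal.mlx_converter import MLXConverter, ConversionConfig, QuantizationRecipe

converter = MLXConverter()
config = ConversionConfig(quantization=QuantizationRecipe.SPEED_4BIT)

# With an older mlx-lm, path resolution and model fetch use the compatibility fallbacks
# (warning once); quantize_model/save still require a recent mlx-lm per the hunks above.
success, message, output_path = converter.convert_model("microsoft/DialoGPT-medium", config=config)
```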
cortex/model_manager.py
CHANGED
@@ -133,7 +133,8 @@ class ModelManager:
         self.quantizer = DynamicQuantizer(QuantizationConfig(
             mode=QuantizationMode.DYNAMIC,
             per_channel=True,
-            cache_quantized=True
+            cache_quantized=True,
+            cache_dir=self.config.model.quantization_cache
         ))
 
         # Initialize MLX converter for native conversion
@@ -201,6 +202,39 @@ class ModelManager:
         level = getattr(self.config.gpu, "gpu_optimization_level", "maximum")
         level = str(level).lower().strip()
         return level in {"maximum", "max", "speed", "fast", "performance"}
+
+    def _get_default_quant_recipe(self) -> Optional[QuantizationRecipe]:
+        """Map configured default_quantization to an MLX quantization recipe."""
+        raw = getattr(self.config.model, "default_quantization", "") or ""
+        value = str(raw).strip().lower()
+        if not value or value == "auto":
+            return None
+
+        mapping = {
+            "q4_k_m": QuantizationRecipe.SPEED_4BIT,
+            "q5_k_m": QuantizationRecipe.BALANCED_5BIT,
+            "q6_k": QuantizationRecipe.QUALITY_8BIT,  # closest available MLX recipe
+            "q8_0": QuantizationRecipe.QUALITY_8BIT,
+            "4bit": QuantizationRecipe.SPEED_4BIT,
+            "5bit": QuantizationRecipe.BALANCED_5BIT,
+            "8bit": QuantizationRecipe.QUALITY_8BIT,
+            "mixed": QuantizationRecipe.MIXED_PRECISION,
+            "none": QuantizationRecipe.NONE,
+        }
+
+        recipe = mapping.get(value)
+        if recipe is None:
+            logger.warning("Unknown default_quantization value: %s", raw)
+            return None
+
+        supported = getattr(self.config.model, "supported_quantizations", None)
+        if supported:
+            supported_norm = {str(s).strip().lower() for s in supported}
+            if value.startswith("q") and value not in supported_norm:
+                logger.warning("default_quantization '%s' not in supported_quantizations", raw)
+                return None
+
+        return recipe
 
     def load_model(
         self,
@@ -374,6 +408,10 @@ class ModelManager:
         except Exception as e:
             logger.warning(f"Could not estimate model parameters: {e}, defaulting to 4-bit")
             quant_recipe = QuantizationRecipe.SPEED_4BIT  # Fallback
+
+        default_recipe = self._get_default_quant_recipe()
+        if default_recipe is not None:
+            quant_recipe = default_recipe
 
         if quantization:
             quant_map = {
@@ -452,6 +490,10 @@ class ModelManager:
         else:
             quant_recipe = QuantizationRecipe.SPEED_4BIT  # Default for larger models
 
+        default_recipe = self._get_default_quant_recipe()
+        if default_recipe is not None:
+            quant_recipe = default_recipe
+
         if quantization:
             quant_map = {
                 "4bit": QuantizationRecipe.SPEED_4BIT,
@@ -563,6 +605,8 @@ class ModelManager:
         )
 
         if not can_load and can_apply_quantization:
+            if not getattr(self.config.model, "auto_quantize", True):
+                return False, f"GPU incompatible: {message} (auto_quantize disabled)"
             # Check if quantization would help
             gpu_status = self.gpu_validator.get_gpu_memory_status()
             available_gb = gpu_status['available_gb']
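A hypothetical walk-through of how `_get_default_quant_recipe` resolves configured values, with `manager` standing in for an initialized `ModelManager`:

```python
# Hypothetical illustration of the mapping added above; `manager` is a placeholder object
# whose config exposes model.default_quantization.
manager.config.model.default_quantization = "q5_k_m"
assert manager._get_default_quant_recipe() == QuantizationRecipe.BALANCED_5BIT

manager.config.model.default_quantization = "auto"
assert manager._get_default_quant_recipe() is None  # keep the size-based default recipe
```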
cortex/quantization/dynamic_quantizer.py
CHANGED
@@ -3,7 +3,7 @@
 import torch
 import torch.nn as nn
 from typing import Dict, Any, Optional, Tuple, Union
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from enum import Enum
 import gc
 from pathlib import Path
@@ -40,6 +40,7 @@ class QuantizationConfig:
     cache_quantized: bool = True  # Cache quantized models to disk
     compress_cache: bool = False  # Compress cached models (slower but smaller)
     validate_quantization: bool = True  # Validate quantized models work correctly
+    cache_dir: Path = field(default_factory=lambda: Path.home() / ".cortex" / "quantized_models")
 
     def to_dict(self) -> Dict[str, Any]:
         """Convert to dictionary for serialization."""
@@ -118,6 +119,8 @@ class DynamicQuantizer:
     def __init__(self, config: Optional[QuantizationConfig] = None):
         """Initialize quantizer with configuration."""
         self.config = config or QuantizationConfig()
+        self.config.cache_dir = Path(self.config.cache_dir).expanduser()
+        self.config.cache_dir.mkdir(parents=True, exist_ok=True)
         self.device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
         self._quantization_cache: Dict[str, Dict[str, Any]] = {}
@@ -681,10 +684,10 @@ class DynamicQuantizer:
 
         # Generate cache key including model metadata
         cache_key = hashlib.md5(
-            f"{model_path}_{model_mtime}_{model_size}_{json.dumps(
+            f"{model_path}_{model_mtime}_{model_size}_{json.dumps(self.config.to_dict())}".encode()
         ).hexdigest()
 
-        cache_dir =
+        cache_dir = self.config.cache_dir
         cache_dir.mkdir(parents=True, exist_ok=True)
 
         cache_path = cache_dir / f"{cache_key}.pt"
@@ -723,7 +726,7 @@ class DynamicQuantizer:
             f"{model_path}_{model_mtime}_{model_size}_{json.dumps(config.to_dict())}".encode()
         ).hexdigest()
 
-        cache_path = Path.
+        cache_path = Path(self.config.cache_dir) / f"{cache_key}.pt"
 
         if cache_path.exists():
             try:
@@ -733,4 +736,4 @@ class DynamicQuantizer:
                 # Cache corrupted, will re-quantize
                 cache_path.unlink()
 
-        return None
+        return None
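A sketch of constructing the quantizer with the new `cache_dir` field, mirroring the `ModelManager` hunk above; the path value is illustrative (the default is `~/.cortex/quantized_models`):

```python
# Sketch of the new cache_dir field; the concrete path below is illustrative only.
from pathlib import Path
from cortex.quantization.dynamic_quantizer import (
    DynamicQuantizer, QuantizationConfig, QuantizationMode,
)

quantizer = DynamicQuantizer(QuantizationConfig(
    mode=QuantizationMode.DYNAMIC,
    per_channel=True,
    cache_quantized=True,
    cache_dir=Path("~/.cache/cortex-quant"),  # expanded and created in __init__ per the diff
))
```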
cortex/ui/cli.py
CHANGED
@@ -30,7 +30,7 @@ from cortex.conversation_manager import ConversationManager, MessageRole
 from cortex.model_downloader import ModelDownloader
 from cortex.template_registry import TemplateRegistry
 from cortex.fine_tuning import FineTuneWizard
-from cortex.ui.markdown_render import ThinkMarkdown, PrefixedRenderable
+from cortex.ui.markdown_render import ThinkMarkdown, PrefixedRenderable, render_plain_with_think
 
 
 class CortexCLI:
@@ -1135,15 +1135,15 @@
             logger.debug(f"Could not get stop sequences: {e}")
 
         # Create generation request with formatted prompt
-        # Use lower temperature for more focused responses
         request = GenerationRequest(
             prompt=formatted_prompt,
             max_tokens=self.config.inference.max_tokens,
-            temperature=
-            top_p=
+            temperature=self.config.inference.temperature,
+            top_p=self.config.inference.top_p,
             top_k=self.config.inference.top_k,
             repetition_penalty=self.config.inference.repetition_penalty,
-            stream=
+            stream=self.config.inference.stream_output,
+            seed=self.config.inference.seed if self.config.inference.seed >= 0 else None,
             stop_sequences=stop_sequences
         )
@@ -1167,50 +1167,65 @@
         prefix_style = Style(color="cyan")
 
         def build_renderable(text: str):
-
-
+            if getattr(self.config.ui, "markdown_rendering", True):
+                markdown = ThinkMarkdown(
+                    text,
+                    code_theme="monokai",
+                    use_line_numbers=False,
+                    syntax_highlighting=getattr(self.config.ui, "syntax_highlighting", True),
+                )
+                renderable = markdown
+            else:
+                renderable = render_plain_with_think(text)
 
-
-            build_renderable(""),
-            console=self.console,
-            refresh_per_second=20,
-            transient=False,
-        ) as live:
-            for token in self.inference_engine.generate(request):
-                if first_token_time is None:
-                    first_token_time = time.time()
+            return PrefixedRenderable(renderable, prefix="⏺ ", prefix_style=prefix_style, indent="  ")
 
-
-
+        original_console_width = self.console._width
+        target_width = max(40, int(self.get_terminal_width() * 0.75))
+        self.console.width = target_width
+        try:
+            with Live(
+                build_renderable(""),
+                console=self.console,
+                auto_refresh=False,
+                refresh_per_second=20,
+                transient=False,
+                vertical_overflow="visible",
+            ) as live:
+                for token in self.inference_engine.generate(request):
+                    if first_token_time is None:
+                        first_token_time = time.time()
 
-
-
-
-
-            )
-
-
-
+                    generated_text += token
+                    token_count += 1
+
+                    display_token = token
+                    if uses_reasoning_template and template_profile and template_profile.supports_streaming():
+                        display_token, should_display = template_profile.process_streaming_response(
+                            token, accumulated_response
+                        )
+                        accumulated_response += token
+                        if not should_display:
+                            display_token = ""
 
-
-
+                    if display_token:
+                        display_text += display_token
 
-
-
-
-
+                    now = time.time()
+                    if display_token and ("\n" in display_token or now - last_render_time >= render_interval):
+                        live.update(build_renderable(display_text), refresh=True)
+                        last_render_time = now
 
-
-
-
-
-
+                if uses_reasoning_template and template_profile:
+                    final_text = template_profile.process_response(generated_text)
+                    generated_text = final_text
+                    if not template_profile.config.show_reasoning:
+                        display_text = final_text
 
-
+                live.update(build_renderable(display_text), refresh=True)
+        finally:
+            self.console._width = original_console_width
 
-        # Add blank line for spacing between response and metrics
-        print()
-
         # Display final metrics in a clean, professional way
         elapsed = time.time() - start_time
         if token_count > 0 and elapsed > 0:
@@ -1238,6 +1253,9 @@
             metrics_line = " · ".join(metrics_parts)
             print(f" \033[2m{metrics_line}\033[0m")
 
+            if token_count >= request.max_tokens:
+                print(f" \033[2m(output truncated at max_tokens={request.max_tokens}; increase in config.yaml)\033[0m")
+
         # Add assistant message to conversation history
         self.conversation_manager.add_message(MessageRole.ASSISTANT, generated_text)
@@ -1405,8 +1423,8 @@
         else:
            print()  # Empty line if no model loaded
 
-        # Move cursor to input position inside the box
-        sys.stdout.write("\033[
+        # Move cursor to input position inside the box (center of 3 interior lines)
+        sys.stdout.write("\033[4A")  # Move up 4 lines to the input line
         sys.stdout.write(f"\r{DIM}│{RESET} > ")  # Position at prompt
         sys.stdout.flush()
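The streaming loop now throttles `Live` refreshes to newlines or a minimum interval instead of re-rendering on every token. A distilled sketch of that check; the 0.05 s interval is an assumed value, the diff does not show the actual `render_interval`:

```python
# Distilled form of the refresh throttle in the Live loop above; interval value is illustrative.
import time

def should_refresh(display_token: str, last_render_time: float, render_interval: float = 0.05) -> bool:
    now = time.time()
    return bool(display_token) and ("\n" in display_token or now - last_render_time >= render_interval)
```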
cortex/ui/markdown_render.py
CHANGED
@@ -3,10 +3,12 @@
 from typing import List
 
 from rich.console import Console
+from rich.cells import cell_len
 from rich.markdown import Markdown
 from rich.segment import Segment
 from rich.style import Style
 from rich.syntax import Syntax
+from rich.text import Text
 
 THINK_START_MARKER = "[[[THINK_START]]]"
 THINK_END_MARKER = "[[[THINK_END]]]"
@@ -45,6 +47,14 @@ class CodeBlockWithLineNumbers(Markdown.elements["fence"]):
         yield syntax
 
 
+class CodeBlockPlain(Markdown.elements["fence"]):
+    """Markdown code block rendered as plain text (no syntax highlighting)."""
+
+    def __rich_console__(self, console: Console, options):
+        code = str(self.text).rstrip()
+        yield Text(code)
+
+
 class MarkdownWithLineNumbers(Markdown):
     """Markdown renderer that keeps line numbers for fenced code blocks."""
 
@@ -55,6 +65,26 @@ class MarkdownWithLineNumbers(Markdown):
     })
 
 
+class MarkdownPlainCode(Markdown):
+    """Markdown renderer that disables syntax highlighting for code blocks."""
+
+    elements = Markdown.elements.copy()
+    elements.update({
+        "fence": CodeBlockPlain,
+        "code_block": CodeBlockPlain,
+    })
+
+
+class MarkdownPlainCodeWithLineNumbers(Markdown):
+    """Markdown renderer with plain code blocks and line numbers."""
+
+    elements = MarkdownWithLineNumbers.elements.copy()
+    elements.update({
+        "fence": CodeBlockPlain,
+        "code_block": CodeBlockPlain,
+    })
+
+
 class ThinkMarkdown:
     """Markdown renderer that dims content inside <think> tags."""
 
@@ -63,10 +93,15 @@
         markup: str,
         code_theme: str = "monokai",
         use_line_numbers: bool = False,
+        syntax_highlighting: bool = True,
     ) -> None:
         marked = _mark_think_sections(markup)
-
-
+        if syntax_highlighting:
+            markdown_cls = MarkdownWithLineNumbers if use_line_numbers else Markdown
+            self._markdown = markdown_cls(marked, code_theme=code_theme)
+        else:
+            markdown_cls = MarkdownPlainCodeWithLineNumbers if use_line_numbers else MarkdownPlainCode
+            self._markdown = markdown_cls(marked)
 
     def __rich_console__(self, console: Console, options):
         segments = console.render(self._markdown, options)
@@ -162,9 +197,15 @@ class PrefixedRenderable:
         self.indent = indent if indent is not None else " " * len(prefix)
 
     def __rich_console__(self, console: Console, options):
+        prefix_width = cell_len(self.prefix)
+        indent_width = cell_len(self.indent) if self.indent is not None else prefix_width
+        offset = max(prefix_width, indent_width)
+        inner_width = max(1, options.max_width - offset)
+        inner_options = options.update_width(inner_width)
+
         yield Segment(self.prefix, self.prefix_style)
 
-        for segment in console.render(self.renderable,
+        for segment in console.render(self.renderable, inner_options):
             if segment.control:
                 yield segment
                 continue
@@ -183,3 +224,27 @@ class PrefixedRenderable:
             if index < len(parts) - 1:
                 yield Segment("\n", style)
                 yield Segment(self.indent, None)
+
+
+def render_plain_with_think(text: str) -> Text:
+    """Render plain text while dimming content inside <think> tags."""
+    output = Text()
+    dim_style = Style(dim=True)
+    idx = 0
+    in_think = False
+
+    while idx < len(text):
+        if text.startswith("<think>", idx):
+            in_think = True
+            idx += len("<think>")
+            continue
+        if text.startswith("</think>", idx):
+            in_think = False
+            idx += len("</think>")
+            continue
+
+        char = text[idx]
+        output.append(char, dim_style if in_think else None)
+        idx += 1
+
+    return output
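A quick illustration of the two rendering paths the CLI now chooses between, using the new `syntax_highlighting` flag and `render_plain_with_think`:

```python
# Illustration of the two render paths selected by the cli.py hunks above.
from rich.console import Console
from cortex.ui.markdown_render import ThinkMarkdown, render_plain_with_think

console = Console()
text = "<think>planning the answer</think>\nFinal answer with **bold** text."

console.print(ThinkMarkdown(text, syntax_highlighting=True))   # markdown, code fences highlighted
console.print(render_plain_with_think(text))                   # plain text, <think> content dimmed
```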
cortex_llm-1.0.8.dist-info/METADATA
ADDED
@@ -0,0 +1,169 @@
Metadata-Version: 2.4
Name: cortex-llm
Version: 1.0.8
Summary: GPU-Accelerated LLM Terminal for Apple Silicon
Home-page: https://github.com/faisalmumtaz/Cortex
Author: Cortex Development Team
License-Expression: MIT
Project-URL: Homepage, https://github.com/faisalmumtaz/Cortex
Project-URL: Bug Tracker, https://github.com/faisalmumtaz/Cortex/issues
Project-URL: Documentation, https://github.com/faisalmumtaz/Cortex/wiki
Keywords: llm,gpu,metal,mps,apple-silicon,ai,machine-learning,terminal,mlx,pytorch
Platform: darwin
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Operating System :: MacOS
Classifier: Environment :: Console
Classifier: Environment :: GPU
Requires-Python: >=3.11
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: torch>=2.1.0
Requires-Dist: mlx>=0.30.4
Requires-Dist: mlx-lm>=0.30.5
Requires-Dist: transformers>=4.36.0
Requires-Dist: safetensors>=0.4.0
Requires-Dist: huggingface-hub>=0.19.0
Requires-Dist: accelerate>=0.25.0
Requires-Dist: llama-cpp-python>=0.2.0
Requires-Dist: pyyaml>=6.0
Requires-Dist: pydantic>=2.5.0
Requires-Dist: rich>=13.0.0
Requires-Dist: psutil>=5.9.0
Requires-Dist: numpy>=1.24.0
Requires-Dist: packaging>=23.0
Requires-Dist: requests>=2.31.0
Provides-Extra: dev
Requires-Dist: pytest>=7.4.0; extra == "dev"
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
Requires-Dist: black>=23.0.0; extra == "dev"
Requires-Dist: ruff>=0.1.0; extra == "dev"
Requires-Dist: mypy>=1.8.0; extra == "dev"
Provides-Extra: optional
Requires-Dist: sentencepiece>=0.1.99; extra == "optional"
Requires-Dist: auto-gptq>=0.7.0; extra == "optional"
Requires-Dist: autoawq>=0.2.0; extra == "optional"
Requires-Dist: bitsandbytes>=0.41.0; extra == "optional"
Requires-Dist: optimum>=1.16.0; extra == "optional"
Requires-Dist: torchvision>=0.16.0; extra == "optional"
Requires-Dist: torchaudio>=2.1.0; extra == "optional"
Dynamic: home-page
Dynamic: license-file
Dynamic: platform
Dynamic: requires-python

# Cortex

GPU-accelerated local LLMs on Apple Silicon, built for the terminal.

Cortex is a fast, native CLI for running and fine-tuning LLMs on Apple Silicon using MLX and Metal. It automatically detects chat templates, supports multiple model formats, and keeps your workflow inside the terminal.

## Highlights

- Apple Silicon GPU acceleration via MLX (primary) and PyTorch MPS
- Multi-format model support: MLX, GGUF, SafeTensors, PyTorch, GPTQ, AWQ
- Built-in LoRA fine-tuning wizard
- Chat template auto-detection (ChatML, Llama, Alpaca, Gemma, Reasoning)
- Conversation history with autosave and export

## Quick Start

```bash
pipx install cortex-llm
cortex
```

Inside Cortex:

- `/download` to fetch a model from HuggingFace
- `/model` to load or manage models
- `/status` to confirm GPU acceleration and current settings

## Installation

### Option A: pipx (recommended)

```bash
pipx install cortex-llm
```

### Option B: from source

```bash
git clone https://github.com/faisalmumtaz/Cortex.git
cd Cortex
./install.sh
```

The installer checks Apple Silicon compatibility, creates a venv, installs dependencies from `pyproject.toml`, and sets up the `cortex` command.

## Requirements

- Apple Silicon Mac (M1/M2/M3/M4)
- macOS 13.3+
- Python 3.11+
- 16GB+ unified memory (24GB+ recommended for larger models)
- Xcode Command Line Tools

## Model Support

Cortex supports:

- **MLX** (recommended)
- **GGUF** (llama.cpp + Metal)
- **SafeTensors**
- **PyTorch** (Transformers + MPS)
- **GPTQ** / **AWQ** quantized models

## Advanced Features

- **Dynamic quantization fallback** for PyTorch/SafeTensors models that do not fit GPU memory (INT8 preferred, INT4 fallback)
  - `docs/dynamic-quantization.md`
- **MLX conversion with quantization recipes** (4/5/8-bit, mixed precision) for speed vs quality control
  - `docs/mlx-acceleration.md`
- **LoRA fine-tuning wizard** for local adapters (`/finetune`)
  - `docs/fine-tuning.md`
- **Template registry and auto-detection** for chat formatting (ChatML, Llama, Alpaca, Gemma, Reasoning)
  - `docs/template-registry.md`
- **Inference engine details** and backend behavior
  - `docs/inference-engine.md`

## Configuration

Cortex reads `config.yaml` from the current working directory. For tuning GPU memory limits, quantization defaults, and inference parameters, see:

- `docs/configuration.md`

## Documentation

Start here:

- `docs/installation.md`
- `docs/cli.md`
- `docs/model-management.md`
- `docs/troubleshooting.md`

Advanced topics:

- `docs/mlx-acceleration.md`
- `docs/inference-engine.md`
- `docs/dynamic-quantization.md`
- `docs/template-registry.md`
- `docs/fine-tuning.md`
- `docs/development.md`

## Contributing

Contributions are welcome. See `docs/development.md` for setup and workflow.

## License

MIT License. See `LICENSE`.

---

Note: Cortex requires Apple Silicon. Intel Macs are not supported.
{cortex_llm-1.0.0.dist-info → cortex_llm-1.0.8.dist-info}/RECORD
CHANGED
@@ -1,11 +1,11 @@
-cortex/__init__.py,sha256=
-cortex/__main__.py,sha256=
-cortex/config.py,sha256=
+cortex/__init__.py,sha256=lj0fAHYDEHTqAY8WK0eDX-_prqihmKLlsH_ELz6DxMs,2202
+cortex/__main__.py,sha256=I7Njt7BjGoHtPhftDoA44OyOYbwWNNaPwP_qlJSn0J4,2857
+cortex/config.py,sha256=IQnMaXznTflTSvr91aybtPMnNW088r-BYeVMhxny63w,13444
 cortex/conversation_manager.py,sha256=aSTdGjVttsMKIiRPzztP0tOXlqZBkWtgZDNCZGyaR-c,17177
 cortex/gpu_validator.py,sha256=un6vMQ78MWMnKWIz8n-92v9Fb4g_YXqU_E1pUPinncY,16582
-cortex/inference_engine.py,sha256=
+cortex/inference_engine.py,sha256=bklCjmiMn3psFp14EZxRzePEuA33NCHJ1bQdsbvMlfg,29343
 cortex/model_downloader.py,sha256=VuPhvxq_66qKjsPjEWcLW-VmUHzOHik6LBMiGDk-cX8,4977
-cortex/model_manager.py,sha256=
+cortex/model_manager.py,sha256=Ra21TjhtFS-7_hRzDMh9m0BUazIGWoKr7Gye3GiVRJM,102671
 cortex/fine_tuning/__init__.py,sha256=IXKQqNqN1C3mha3na35i7KI-hMnsqqrmUgV4NrPKHy0,269
 cortex/fine_tuning/dataset.py,sha256=hIz_dfFSaJoiFzWZ6vwlwqjpTfdsnFNIEmwhhTD2d9k,15414
 cortex/fine_tuning/mlx_lora_trainer.py,sha256=idNzKtVG8pObwsnSrP0N1rU1EanhrIRvHiNL1asdzr8,22438
@@ -14,13 +14,14 @@ cortex/fine_tuning/wizard.py,sha256=eIRUM3zTqKKATJEbQrBsaOfFfRWfY9BV5FkSAzT82QM,
 cortex/metal/__init__.py,sha256=Ycs81qVOsaYV4UJocCFGW3rPPBySMPy7eOHKzfc4Q7o,8780
 cortex/metal/gpu_validator.py,sha256=1YHKJXqicXvTwKIdSj34n1DgKoluy9yho6S1jWt1UAs,5818
 cortex/metal/memory_pool.py,sha256=g5PFQAiouQe4TyX-SVi-Di1MLysb3YBF77uR4nAEomo,34698
-cortex/metal/mlx_accelerator.py,sha256=
-cortex/metal/
+cortex/metal/mlx_accelerator.py,sha256=f3tfHAaRQqc5KteXNkf7n610SxOLo2hoGj5_GgqEL2Y,25726
+cortex/metal/mlx_compat.py,sha256=oZ_RNjJzWs6h6Q3mSNK-K2--BhJwKt7d_tYt3uMN1pM,3531
+cortex/metal/mlx_converter.py,sha256=6tiVDFWOZOmhwTN2uS93_Ny4iM1X0NTrGp6AAik6l0Q,30823
 cortex/metal/mps_optimizer.py,sha256=4r6dj-_KAr3vedCwwu7lR-nIaF4g4D4kkOoF2KiQ0FQ,15307
 cortex/metal/optimizer.py,sha256=9ixKj8ca1iovF-mFHYGa9_DUHcqgGyzLoP_lIRAzfMM,21996
 cortex/metal/performance_profiler.py,sha256=GMxxqwqE2kVJ4WePwVdUp2ADqhrV6wCCNrFnaMfBDpI,12274
 cortex/quantization/__init__.py,sha256=ElLP3ZO_XItddTl-PeoJ5GPb16RYIAk8m5sqwfAVE9s,184
-cortex/quantization/dynamic_quantizer.py,sha256=
+cortex/quantization/dynamic_quantizer.py,sha256=vV0RSPMoWeOPALwFOs0DzqIA2MkGpeEpqB2vTeudhW0,31934
 cortex/template_registry/__init__.py,sha256=O5BWmHRmfMSK-Ukpu8UqFO_kaN0kum-d-Wsz0Ds-sC0,491
 cortex/template_registry/auto_detector.py,sha256=lqI19Ef_w6ClZvD5dzDw1i5gnf2AUN_L4WjCMvW99Yg,5432
 cortex/template_registry/config_manager.py,sha256=vh7cXAUTJ4dLY74u5EHTpTa46jXxj34BlMyWsC_ZIaM,8658
@@ -37,12 +38,12 @@ cortex/template_registry/template_profiles/standard/gemma.py,sha256=D4wZN3_6QzUj
 cortex/template_registry/template_profiles/standard/llama.py,sha256=jz4MyvmISSPtIAcffPE7LrTosHvlC0NoJhzTw1DCvpY,3209
 cortex/template_registry/template_profiles/standard/simple.py,sha256=dGOOcL6HRoJFxkixLrYC4w7c63h-QmOOWC2TsOihYog,2422
 cortex/ui/__init__.py,sha256=t3GrHJMHTVgBEKh2_qt4B9mS594V5jriTDqc3eZKMGc,3409
-cortex/ui/cli.py,sha256=
-cortex/ui/markdown_render.py,sha256=
+cortex/ui/cli.py,sha256=QZhiV9z8hP9Fu5mvpzURSWLptDDRaJLmNLm2AqTGlqE,75734
+cortex/ui/markdown_render.py,sha256=D4gSvv0TERFIAXYs3e76eaPsuvvD2cNT98PDKyUPnWI,7776
 cortex/ui/terminal_app.py,sha256=SF3KqcGFyZ4hpTmgX21idPzOTJLdKGkt4QdA-wwUBNE,18317
-cortex_llm-1.0.
-cortex_llm-1.0.
-cortex_llm-1.0.
-cortex_llm-1.0.
-cortex_llm-1.0.
-cortex_llm-1.0.
+cortex_llm-1.0.8.dist-info/licenses/LICENSE,sha256=_frJ3VsZWQGhMznZw2Tgjk7xwfAfDZRcBl43uZh8_4E,1070
+cortex_llm-1.0.8.dist-info/METADATA,sha256=p_zPy3yz5xrZO-oIkujQm_agneaE1RNR2smb5nBreQc,5119
+cortex_llm-1.0.8.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+cortex_llm-1.0.8.dist-info/entry_points.txt,sha256=g83Nuz3iFrNdMLHxGLR2LnscdM7rdQRchuL3WGobQC8,48
+cortex_llm-1.0.8.dist-info/top_level.txt,sha256=79LAeTJJ_pMIBy3mkF7uNaN0mdBRt5tGrnne5N_iAio,7
+cortex_llm-1.0.8.dist-info/RECORD,,
cortex_llm-1.0.0.dist-info/METADATA
REMOVED
@@ -1,275 +0,0 @@
Metadata-Version: 2.4
Name: cortex-llm
Version: 1.0.0
Summary: GPU-Accelerated LLM Terminal for Apple Silicon
Home-page: https://github.com/faisalmumtaz/Cortex
Author: Cortex Development Team
License: MIT
Project-URL: Homepage, https://github.com/faisalmumtaz/Cortex
Project-URL: Bug Tracker, https://github.com/faisalmumtaz/Cortex/issues
Project-URL: Documentation, https://github.com/faisalmumtaz/Cortex/wiki
Keywords: llm,gpu,metal,mps,apple-silicon,ai,machine-learning,terminal,mlx,pytorch
Platform: darwin
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Operating System :: MacOS
Classifier: Environment :: Console
Classifier: Environment :: GPU
Requires-Python: >=3.11
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: torch>=2.1.0
Requires-Dist: mlx>=0.10.0
Requires-Dist: mlx-lm>=0.10.0
Requires-Dist: transformers>=4.36.0
Requires-Dist: safetensors>=0.4.0
Requires-Dist: huggingface-hub>=0.19.0
Requires-Dist: accelerate>=0.25.0
Requires-Dist: llama-cpp-python>=0.2.0
Requires-Dist: pyyaml>=6.0
Requires-Dist: pydantic>=2.5.0
Requires-Dist: rich>=13.0.0
Requires-Dist: psutil>=5.9.0
Requires-Dist: numpy>=1.24.0
Requires-Dist: packaging>=23.0
Requires-Dist: requests>=2.31.0
Provides-Extra: dev
Requires-Dist: pytest>=7.4.0; extra == "dev"
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
Requires-Dist: black>=23.0.0; extra == "dev"
Requires-Dist: ruff>=0.1.0; extra == "dev"
Requires-Dist: mypy>=1.8.0; extra == "dev"
Provides-Extra: optional
Requires-Dist: sentencepiece>=0.1.99; extra == "optional"
Requires-Dist: auto-gptq>=0.7.0; extra == "optional"
Requires-Dist: autoawq>=0.2.0; extra == "optional"
Requires-Dist: bitsandbytes>=0.41.0; extra == "optional"
Requires-Dist: optimum>=1.16.0; extra == "optional"
Requires-Dist: torchvision>=0.16.0; extra == "optional"
Requires-Dist: torchaudio>=2.1.0; extra == "optional"
Dynamic: home-page
Dynamic: license-file
Dynamic: platform
Dynamic: requires-python

# Cortex - LLM Terminal Client for Apple Silicon

Cortex is an LLM terminal interface designed for Apple Silicon, using MLX and PyTorch MPS frameworks for GPU-accelerated inference.

## What It Does

- **GPU-accelerated inference** via MLX (primary) and PyTorch MPS backends
- **Apple Silicon required** - leverages unified memory architecture
- **Multiple model formats** - MLX, GGUF, SafeTensors, PyTorch, GPTQ, AWQ
- **Built-in fine-tuning** - LoRA-based model customization via interactive wizard
- **Chat template auto-detection** - automatic format detection with confidence scoring
- **Conversation persistence** - SQLite-backed chat history with branching

## Features

- **GPU-Accelerated Inference** - Delegates to MLX and PyTorch MPS for Metal-based execution
- **Apple Silicon Only** - Requires Metal GPU; exits if GPU acceleration is unavailable
- **Model Format Support**:
  - MLX (Apple's format, loaded via `mlx_lm`)
  - GGUF (via `llama-cpp-python` with Metal backend)
  - SafeTensors (via HuggingFace `transformers`)
  - PyTorch models (via HuggingFace `transformers` with MPS device)
  - GPTQ quantized (via `auto-gptq`)
  - AWQ quantized (via `awq`)
- **Quantization** - 4-bit, 5-bit, 8-bit, and mixed-precision quantization via MLX conversion pipeline
- **Model Conversion** - Convert HuggingFace models to MLX format with configurable quantization recipes
- **Template Registry** - Automatic detection of chat templates (ChatML, Llama, Alpaca, Gemma, Reasoning) with confidence scoring and real-time token filtering for reasoning models
- **Rotating KV Cache** - MLX-based KV cache for long context handling (default 4096 tokens)
- **Fine-Tuning** - LoRA-based model customization with interactive 6-step wizard
- **Terminal UI** - ANSI terminal interface with streaming output

## Installation

```bash
# Clone and install
git clone https://github.com/faisalmumtaz/Cortex.git
cd Cortex
./install.sh
```

The installer:
- Checks for Apple Silicon (arm64) compatibility
- Creates a Python virtual environment
- Installs dependencies via `pip install -e .` (from `pyproject.toml`)
- Sets up the `cortex` command in your PATH

### Quick Install (pipx)

If you just want the CLI without cloning the repo, use pipx:

```bash
pipx install cortex-llm
```

## Quick Start

```bash
# After installation, just run:
cortex
```

### Downloading Models

```bash
# Inside Cortex, use the download command:
cortex
# Then type: /download
```

The download feature:
- **HuggingFace integration** - download any model by repository ID
- **Automatic loading** - option to load model immediately after download

## Documentation

### User Documentation
- **[Installation Guide](docs/installation.md)** - Complete setup instructions
- **[CLI Reference](docs/cli.md)** - Commands and user interface
- **[Configuration](docs/configuration.md)** - System settings and optimization
- **[Model Management](docs/model-management.md)** - Loading and managing models
- **[Template Registry](docs/template-registry.md)** - Automatic chat template detection and management
- **[Fine-Tuning Guide](docs/fine-tuning.md)** - Customize models with LoRA
- **[Troubleshooting](docs/troubleshooting.md)** - Common issues and solutions

### Technical Documentation
- **[MLX Acceleration](docs/mlx-acceleration.md)** - MLX framework integration and optimization
- **[GPU Validation](docs/gpu-validation.md)** - Hardware requirements and detection
- **[Inference Engine](docs/inference-engine.md)** - Text generation architecture
- **[Conversation Management](docs/conversation-management.md)** - Chat history and persistence
- **[Development Guide](docs/development.md)** - Contributing and architecture

## System Requirements

- Apple Silicon Mac (M1/M2/M3/M4 - all variants supported)
- macOS 13.3+ (required by MLX framework)
- Python 3.11+
- 16GB+ unified memory (24GB+ recommended for larger models)
- Xcode Command Line Tools

## Performance

Performance depends on your Apple Silicon chip, model size, and quantization level. The inference engine measures tokens/second, first-token latency, and memory usage at runtime.

To check that GPU acceleration is working:

```bash
source venv/bin/activate
python tests/test_apple_silicon.py
```

You should see:
- All validation checks passing
- Measured GFLOPS from matrix operations
- Confirmation of Metal and MLX availability

## GPU Acceleration Architecture

Cortex uses a multi-layer approach, delegating all GPU computation to established frameworks:

1. **MLX Framework (Primary Backend)**
   - Apple's ML framework with native Metal support
   - Quantization support (4-bit, 5-bit, 8-bit, mixed-precision)
   - Rotating KV cache for long contexts
   - JIT compilation via `mx.compile`
   - Operation fusion for reduced kernel launches

2. **PyTorch MPS Backend**
   - Metal Performance Shaders for PyTorch models
   - FP16 optimization and channels-last tensor format

3. **llama.cpp (GGUF Backend)**
   - Metal-accelerated inference for GGUF models

4. **Memory Management**
   - Pre-allocated memory pools with best-fit/first-fit allocation strategies
   - Automatic pool sizing (60% of available memory, capped at 75% of total)
   - Defragmentation support

### Understanding "Skipping Kernel" Messages

When loading GGUF models, you may see messages like:
```
ggml_metal_init: skipping kernel_xxx_bf16 (not supported)
```

**These are NORMAL!** They indicate:
- BF16 kernels being skipped (your GPU uses FP16 instead)
- GPU acceleration is still fully active
- The system automatically uses optimal alternatives

## Troubleshooting

If you suspect GPU isn't being used:

1. **Run validation**: `python tests/test_apple_silicon.py`
2. **Check output**: Should see passing checks and measured GFLOPS
3. **Monitor tokens/sec**: Displayed during inference
4. **Verify Metal**: Ensure Xcode Command Line Tools installed

Common issues:
- **Low performance**: Run `python tests/test_apple_silicon.py` to diagnose
- **Memory errors**: Reduce `gpu_memory_fraction` in config.yaml

## MLX Model Conversion

Cortex includes an MLX model converter:

```python
from cortex.metal.mlx_converter import MLXConverter, ConversionConfig, QuantizationRecipe

converter = MLXConverter()
config = ConversionConfig(
    quantization=QuantizationRecipe.SPEED_4BIT,  # 4-bit quantization
    compile_model=True  # JIT compilation
)

success, message, output_path = converter.convert_model(
    "microsoft/DialoGPT-medium",
    config=config
)
```

### Quantization Options

- **4-bit**: Maximum speed, 75% size reduction
- **5-bit**: Balanced speed and quality
- **8-bit**: Higher quality, 50% size reduction
- **Mixed Precision**: Custom per-layer quantization

## MLX as Primary Backend

Cortex uses MLX (Apple's machine learning framework) as the primary acceleration backend:
- **Metal Support**: GPU execution via MLX's built-in Metal operations
- **Quantization**: Support for 4-bit, 5-bit, 8-bit, and mixed-precision quantization
- **Model Conversion**: Convert HuggingFace models to MLX format

## Built With

- [MLX](https://github.com/ml-explore/mlx) - Apple's machine learning framework
- [mlx-lm](https://github.com/ml-explore/mlx-examples) - LLM utilities and LoRA fine-tuning for MLX
- [PyTorch](https://pytorch.org/) - With Metal Performance Shaders backend
- [llama.cpp](https://github.com/ggerganov/llama.cpp) - Metal-accelerated GGUF support
- [Rich](https://github.com/Textualize/rich) - Terminal formatting
- [HuggingFace](https://huggingface.co/) - Model hub and transformers

## Contributing

We welcome contributions! Please see the [Development Guide](docs/development.md) for contributing guidelines and setup instructions.

## License

MIT License - See [LICENSE](LICENSE) for details.

---

**Note**: Cortex requires Apple Silicon. Intel Macs are not supported.
{cortex_llm-1.0.0.dist-info → cortex_llm-1.0.8.dist-info}/WHEEL
File without changes
{cortex_llm-1.0.0.dist-info → cortex_llm-1.0.8.dist-info}/entry_points.txt
File without changes
{cortex_llm-1.0.0.dist-info → cortex_llm-1.0.8.dist-info}/licenses/LICENSE
File without changes
{cortex_llm-1.0.0.dist-info → cortex_llm-1.0.8.dist-info}/top_level.txt
File without changes