cortex-llm 1.0.5__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff shows the content changes between publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
cortex/__init__.py CHANGED
@@ -5,7 +5,7 @@ A high-performance terminal interface for running Hugging Face LLMs locally
5
5
  with exclusive GPU acceleration via Metal Performance Shaders (MPS) and MLX.
6
6
  """
7
7
 
8
- __version__ = "1.0.5"
8
+ __version__ = "1.0.7"
9
9
  __author__ = "Cortex Development Team"
10
10
  __license__ = "MIT"
11
11
 
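The only change here is the version bump. A quick, hedged way to confirm which build is installed after upgrading (standard-library `importlib.metadata`, not an API documented by this package):

```python
# Check the installed cortex-llm version without importing the package itself.
from importlib.metadata import version, PackageNotFoundError

try:
    print(version("cortex-llm"))  # expected to print "1.0.7" after the upgrade
except PackageNotFoundError:
    print("cortex-llm is not installed in this environment")
```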
cortex/config.py CHANGED
@@ -74,7 +74,7 @@ class InferenceConfig(BaseModel):
74
74
  top_p: float = Field(default=0.95, ge=0.0, le=1.0)
75
75
  top_k: int = Field(default=40, ge=0)
76
76
  repetition_penalty: float = Field(default=1.1, ge=0.0, le=2.0)
77
- max_tokens: int = Field(default=2048, ge=1)
77
+ max_tokens: int = Field(default=4096, ge=1)
78
78
  stream_output: bool = True
79
79
  seed: int = Field(default=-1)
80
80
 
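This hunk raises the default `max_tokens` from 2048 to 4096 while keeping the `ge=1` bound. A minimal sketch of how such a pydantic v2 field behaves; the `InferenceConfig` below is a stand-in containing only the changed field, not the package's full model:

```python
from pydantic import BaseModel, Field, ValidationError

class InferenceConfig(BaseModel):
    # Mirrors the changed field: default of 4096, must be at least 1.
    max_tokens: int = Field(default=4096, ge=1)

print(InferenceConfig().max_tokens)                   # 4096 (new default)
print(InferenceConfig(max_tokens=2048).max_tokens)    # explicit values still override

try:
    InferenceConfig(max_tokens=0)                     # violates ge=1
except ValidationError:
    print("max_tokens must be >= 1")
```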
@@ -138,7 +138,7 @@ class InferenceEngine:
138
138
  use_fp16=True,
139
139
  use_channels_last=True,
140
140
  optimize_memory=True,
141
- max_batch_size=self.config.performance.batch_size
141
+ max_batch_size=self.config.performance.max_batch_size
142
142
  )
143
143
  self.mps_optimizer = MPSOptimizer(mps_config)
144
144
 
@@ -153,7 +153,7 @@ class InferenceEngine:
153
153
  fuse_operations=True,
154
154
  lazy_evaluation=True,
155
155
  rotating_kv_cache=True,
156
- kv_cache_size=self.config.model.context_length if hasattr(self.config.model, 'context_length') else 4096,
156
+ kv_cache_size=self.config.performance.context_length,
157
157
  quantization_bits=4
158
158
  )
159
159
  self.mlx_accelerator = MLXAccelerator(mlx_config)
@@ -204,6 +204,9 @@ class InferenceEngine:
204
204
  yield from self._generate_pytorch(model, tokenizer, request)
205
205
  elif model_info.format == ModelFormat.SAFETENSORS:
206
206
  yield from self._generate_safetensors(model, tokenizer, request)
207
+ elif model_info.format == ModelFormat.QUANTIZED:
208
+ # Quantized models are loaded as PyTorch-compatible modules
209
+ yield from self._generate_pytorch(model, tokenizer, request)
207
210
  elif model_info.format == ModelFormat.GGUF:
208
211
  yield from self._generate_gguf(model, tokenizer, request)
209
212
  else:
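The new branch routes `ModelFormat.QUANTIZED` through the same PyTorch generation path. As a design note, this `elif` chain could equally be written as a dispatch table; a hedged sketch with stand-in names (the real `ModelFormat` members and generator signatures live in the package and are not reproduced here):

```python
from enum import Enum, auto

class ModelFormat(Enum):
    PYTORCH = auto()
    SAFETENSORS = auto()
    QUANTIZED = auto()
    GGUF = auto()

def generate_pytorch(prompt): yield f"[pytorch] {prompt}"
def generate_safetensors(prompt): yield f"[safetensors] {prompt}"
def generate_gguf(prompt): yield f"[gguf] {prompt}"

# Quantized models reuse the PyTorch generator, mirroring the new elif branch.
DISPATCH = {
    ModelFormat.PYTORCH: generate_pytorch,
    ModelFormat.SAFETENSORS: generate_safetensors,
    ModelFormat.QUANTIZED: generate_pytorch,
    ModelFormat.GGUF: generate_gguf,
}

print(list(DISPATCH[ModelFormat.QUANTIZED]("hello")))
```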
@@ -401,7 +404,18 @@ class InferenceEngine:
401
404
  last_metrics_update = time.time()
402
405
 
403
406
  try:
404
- device = torch.device("mps")
407
+ # Use the model's device when available (quantized models may be CPU-only on macOS)
408
+ device = None
409
+ try:
410
+ first_param = next(model.parameters())
411
+ device = first_param.device
412
+ except Exception:
413
+ device = None
414
+
415
+ if device is None or str(device) == "meta":
416
+ device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
417
+ elif device.type == "mps" and not torch.backends.mps.is_available():
418
+ device = torch.device("cpu")
405
419
 
406
420
  inputs = tokenizer(request.prompt, return_tensors="pt").to(device)
407
421
 
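The rewritten block prefers the model's own parameter device and only falls back to probing MPS/CPU. A standalone sketch of that fallback order, assuming a standard PyTorch `nn.Module` (quantized modules on macOS may report CPU, and models loaded lazily can report a `meta` device):

```python
import torch
import torch.nn as nn

def resolve_device(model: nn.Module) -> torch.device:
    """Prefer the model's own device; fall back to MPS if available, else CPU."""
    device = None
    try:
        device = next(model.parameters()).device  # StopIteration if the model has no parameters
    except Exception:
        device = None

    if device is None or str(device) == "meta":
        # No usable parameter device: probe for MPS, otherwise use CPU.
        return torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    if device.type == "mps" and not torch.backends.mps.is_available():
        return torch.device("cpu")
    return device

# Example: a plain CPU module resolves to its own device.
print(resolve_device(nn.Linear(4, 4)))
```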
cortex/model_manager.py CHANGED
@@ -133,7 +133,8 @@ class ModelManager:
133
133
  self.quantizer = DynamicQuantizer(QuantizationConfig(
134
134
  mode=QuantizationMode.DYNAMIC,
135
135
  per_channel=True,
136
- cache_quantized=True
136
+ cache_quantized=True,
137
+ cache_dir=self.config.model.quantization_cache
137
138
  ))
138
139
 
139
140
  # Initialize MLX converter for native conversion
@@ -201,6 +202,39 @@ class ModelManager:
201
202
  level = getattr(self.config.gpu, "gpu_optimization_level", "maximum")
202
203
  level = str(level).lower().strip()
203
204
  return level in {"maximum", "max", "speed", "fast", "performance"}
205
+
206
+ def _get_default_quant_recipe(self) -> Optional[QuantizationRecipe]:
207
+ """Map configured default_quantization to an MLX quantization recipe."""
208
+ raw = getattr(self.config.model, "default_quantization", "") or ""
209
+ value = str(raw).strip().lower()
210
+ if not value or value == "auto":
211
+ return None
212
+
213
+ mapping = {
214
+ "q4_k_m": QuantizationRecipe.SPEED_4BIT,
215
+ "q5_k_m": QuantizationRecipe.BALANCED_5BIT,
216
+ "q6_k": QuantizationRecipe.QUALITY_8BIT, # closest available MLX recipe
217
+ "q8_0": QuantizationRecipe.QUALITY_8BIT,
218
+ "4bit": QuantizationRecipe.SPEED_4BIT,
219
+ "5bit": QuantizationRecipe.BALANCED_5BIT,
220
+ "8bit": QuantizationRecipe.QUALITY_8BIT,
221
+ "mixed": QuantizationRecipe.MIXED_PRECISION,
222
+ "none": QuantizationRecipe.NONE,
223
+ }
224
+
225
+ recipe = mapping.get(value)
226
+ if recipe is None:
227
+ logger.warning("Unknown default_quantization value: %s", raw)
228
+ return None
229
+
230
+ supported = getattr(self.config.model, "supported_quantizations", None)
231
+ if supported:
232
+ supported_norm = {str(s).strip().lower() for s in supported}
233
+ if value.startswith("q") and value not in supported_norm:
234
+ logger.warning("default_quantization '%s' not in supported_quantizations", raw)
235
+ return None
236
+
237
+ return recipe
204
238
 
205
239
  def load_model(
206
240
  self,
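The new `_get_default_quant_recipe` helper normalizes the configured `default_quantization` string and maps it onto an MLX quantization recipe. A compact sketch of the same normalize-then-map pattern; only the string keys come from the diff, and the enum below stands in for `QuantizationRecipe`:

```python
import logging
from enum import Enum
from typing import Optional

logger = logging.getLogger(__name__)

class Recipe(Enum):
    SPEED_4BIT = "4bit"
    BALANCED_5BIT = "5bit"
    QUALITY_8BIT = "8bit"
    MIXED_PRECISION = "mixed"
    NONE = "none"

MAPPING = {
    "q4_k_m": Recipe.SPEED_4BIT,
    "q5_k_m": Recipe.BALANCED_5BIT,
    "q6_k": Recipe.QUALITY_8BIT,   # closest available recipe
    "q8_0": Recipe.QUALITY_8BIT,
    "4bit": Recipe.SPEED_4BIT,
    "5bit": Recipe.BALANCED_5BIT,
    "8bit": Recipe.QUALITY_8BIT,
    "mixed": Recipe.MIXED_PRECISION,
    "none": Recipe.NONE,
}

def default_recipe(raw: str) -> Optional[Recipe]:
    value = (raw or "").strip().lower()
    if not value or value == "auto":
        return None  # "auto" defers to the size-based heuristic
    recipe = MAPPING.get(value)
    if recipe is None:
        logger.warning("Unknown default_quantization value: %s", raw)
    return recipe

print(default_recipe("Q4_K_M"))  # Recipe.SPEED_4BIT
print(default_recipe("auto"))    # None
```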
@@ -374,6 +408,10 @@ class ModelManager:
374
408
  except Exception as e:
375
409
  logger.warning(f"Could not estimate model parameters: {e}, defaulting to 4-bit")
376
410
  quant_recipe = QuantizationRecipe.SPEED_4BIT # Fallback
411
+
412
+ default_recipe = self._get_default_quant_recipe()
413
+ if default_recipe is not None:
414
+ quant_recipe = default_recipe
377
415
 
378
416
  if quantization:
379
417
  quant_map = {
@@ -452,6 +490,10 @@ class ModelManager:
452
490
  else:
453
491
  quant_recipe = QuantizationRecipe.SPEED_4BIT # Default for larger models
454
492
 
493
+ default_recipe = self._get_default_quant_recipe()
494
+ if default_recipe is not None:
495
+ quant_recipe = default_recipe
496
+
455
497
  if quantization:
456
498
  quant_map = {
457
499
  "4bit": QuantizationRecipe.SPEED_4BIT,
@@ -563,6 +605,8 @@ class ModelManager:
563
605
  )
564
606
 
565
607
  if not can_load and can_apply_quantization:
608
+ if not getattr(self.config.model, "auto_quantize", True):
609
+ return False, f"GPU incompatible: {message} (auto_quantize disabled)"
566
610
  # Check if quantization would help
567
611
  gpu_status = self.gpu_validator.get_gpu_memory_status()
568
612
  available_gb = gpu_status['available_gb']
cortex/quantization/dynamic_quantizer.py CHANGED
@@ -3,7 +3,7 @@
3
3
  import torch
4
4
  import torch.nn as nn
5
5
  from typing import Dict, Any, Optional, Tuple, Union
6
- from dataclasses import dataclass
6
+ from dataclasses import dataclass, field
7
7
  from enum import Enum
8
8
  import gc
9
9
  from pathlib import Path
@@ -40,6 +40,7 @@ class QuantizationConfig:
40
40
  cache_quantized: bool = True # Cache quantized models to disk
41
41
  compress_cache: bool = False # Compress cached models (slower but smaller)
42
42
  validate_quantization: bool = True # Validate quantized models work correctly
43
+ cache_dir: Path = field(default_factory=lambda: Path.home() / ".cortex" / "quantized_models")
43
44
 
44
45
  def to_dict(self) -> Dict[str, Any]:
45
46
  """Convert to dictionary for serialization."""
@@ -118,6 +119,8 @@ class DynamicQuantizer:
118
119
  def __init__(self, config: Optional[QuantizationConfig] = None):
119
120
  """Initialize quantizer with configuration."""
120
121
  self.config = config or QuantizationConfig()
122
+ self.config.cache_dir = Path(self.config.cache_dir).expanduser()
123
+ self.config.cache_dir.mkdir(parents=True, exist_ok=True)
121
124
  self.device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
122
125
  self._quantization_cache: Dict[str, Dict[str, Any]] = {}
123
126
 
@@ -681,10 +684,10 @@ class DynamicQuantizer:
681
684
 
682
685
  # Generate cache key including model metadata
683
686
  cache_key = hashlib.md5(
684
- f"{model_path}_{model_mtime}_{model_size}_{json.dumps(quantization_info)}".encode()
687
+ f"{model_path}_{model_mtime}_{model_size}_{json.dumps(self.config.to_dict())}".encode()
685
688
  ).hexdigest()
686
689
 
687
- cache_dir = Path.home() / ".cortex" / "quantized_cache"
690
+ cache_dir = self.config.cache_dir
688
691
  cache_dir.mkdir(parents=True, exist_ok=True)
689
692
 
690
693
  cache_path = cache_dir / f"{cache_key}.pt"
@@ -723,7 +726,7 @@ class DynamicQuantizer:
723
726
  f"{model_path}_{model_mtime}_{model_size}_{json.dumps(config.to_dict())}".encode()
724
727
  ).hexdigest()
725
728
 
726
- cache_path = Path.home() / ".cortex" / "quantized_cache" / f"{cache_key}.pt"
729
+ cache_path = Path(self.config.cache_dir) / f"{cache_key}.pt"
727
730
 
728
731
  if cache_path.exists():
729
732
  try:
@@ -733,4 +736,4 @@ class DynamicQuantizer:
733
736
  # Cache corrupted, will re-quantize
734
737
  cache_path.unlink()
735
738
 
736
- return None
739
+ return None
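These hunks route the quantized-model cache through `config.cache_dir` and fold the full quantizer config into the cache key, so changing any quantization setting produces a different cache entry. A minimal sketch of that keying scheme, assuming a dict-like config; the paths and field names here are illustrative:

```python
import hashlib
import json
import tempfile
from pathlib import Path

def cache_path_for(model_path: str, model_mtime: float, model_size: int,
                   config: dict, cache_dir: Path) -> Path:
    """Derive a cache file name that changes whenever the model or the config changes."""
    key = hashlib.md5(
        f"{model_path}_{model_mtime}_{model_size}_{json.dumps(config, sort_keys=True)}".encode()
    ).hexdigest()
    cache_dir = cache_dir.expanduser()
    cache_dir.mkdir(parents=True, exist_ok=True)
    return cache_dir / f"{key}.pt"

path = cache_path_for(
    "models/example-llm", 1700000000.0, 4_000_000_000,
    {"mode": "dynamic", "per_channel": True},
    Path(tempfile.gettempdir()) / "cortex_quant_cache",
)
print(path.name)  # <md5 hex>.pt
```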
cortex/ui/cli.py CHANGED
@@ -30,7 +30,7 @@ from cortex.conversation_manager import ConversationManager, MessageRole
30
30
  from cortex.model_downloader import ModelDownloader
31
31
  from cortex.template_registry import TemplateRegistry
32
32
  from cortex.fine_tuning import FineTuneWizard
33
- from cortex.ui.markdown_render import ThinkMarkdown, PrefixedRenderable
33
+ from cortex.ui.markdown_render import ThinkMarkdown, PrefixedRenderable, render_plain_with_think
34
34
 
35
35
 
36
36
  class CortexCLI:
@@ -1135,15 +1135,15 @@ class CortexCLI:
1135
1135
  logger.debug(f"Could not get stop sequences: {e}")
1136
1136
 
1137
1137
  # Create generation request with formatted prompt
1138
- # Use lower temperature for more focused responses
1139
1138
  request = GenerationRequest(
1140
1139
  prompt=formatted_prompt,
1141
1140
  max_tokens=self.config.inference.max_tokens,
1142
- temperature=0.3, # Lower temperature for less randomness
1143
- top_p=0.9, # Slightly lower top_p
1141
+ temperature=self.config.inference.temperature,
1142
+ top_p=self.config.inference.top_p,
1144
1143
  top_k=self.config.inference.top_k,
1145
1144
  repetition_penalty=self.config.inference.repetition_penalty,
1146
- stream=True,
1145
+ stream=self.config.inference.stream_output,
1146
+ seed=self.config.inference.seed if self.config.inference.seed >= 0 else None,
1147
1147
  stop_sequences=stop_sequences
1148
1148
  )
1149
1149
 
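The request now takes its sampling parameters from `config.inference` instead of hard-coded values, and treats a negative `seed` as "unseeded". A small sketch of that sentinel convention; the field names come from the hunk, while the config object below is a stand-in:

```python
from dataclasses import dataclass

@dataclass
class InferenceSettings:
    temperature: float = 0.7   # illustrative default, not taken from the package
    top_p: float = 0.95
    max_tokens: int = 4096
    stream_output: bool = True
    seed: int = -1             # -1 means "no fixed seed"

def request_kwargs(cfg: InferenceSettings) -> dict:
    return {
        "max_tokens": cfg.max_tokens,
        "temperature": cfg.temperature,
        "top_p": cfg.top_p,
        "stream": cfg.stream_output,
        # Only pass a seed when a non-negative one was configured.
        "seed": cfg.seed if cfg.seed >= 0 else None,
    }

print(request_kwargs(InferenceSettings()))          # seed is None
print(request_kwargs(InferenceSettings(seed=42)))   # seed is 42
```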
@@ -1167,50 +1167,65 @@ class CortexCLI:
1167
1167
  prefix_style = Style(color="cyan")
1168
1168
 
1169
1169
  def build_renderable(text: str):
1170
- markdown = ThinkMarkdown(text, code_theme="monokai", use_line_numbers=False)
1171
- return PrefixedRenderable(markdown, prefix="⏺ ", prefix_style=prefix_style, indent=" ")
1170
+ if getattr(self.config.ui, "markdown_rendering", True):
1171
+ markdown = ThinkMarkdown(
1172
+ text,
1173
+ code_theme="monokai",
1174
+ use_line_numbers=False,
1175
+ syntax_highlighting=getattr(self.config.ui, "syntax_highlighting", True),
1176
+ )
1177
+ renderable = markdown
1178
+ else:
1179
+ renderable = render_plain_with_think(text)
1172
1180
 
1173
- with Live(
1174
- build_renderable(""),
1175
- console=self.console,
1176
- refresh_per_second=20,
1177
- transient=False,
1178
- ) as live:
1179
- for token in self.inference_engine.generate(request):
1180
- if first_token_time is None:
1181
- first_token_time = time.time()
1181
+ return PrefixedRenderable(renderable, prefix="⏺ ", prefix_style=prefix_style, indent=" ")
1182
1182
 
1183
- generated_text += token
1184
- token_count += 1
1183
+ original_console_width = self.console._width
1184
+ target_width = max(40, int(self.get_terminal_width() * 0.75))
1185
+ self.console.width = target_width
1186
+ try:
1187
+ with Live(
1188
+ build_renderable(""),
1189
+ console=self.console,
1190
+ auto_refresh=False,
1191
+ refresh_per_second=20,
1192
+ transient=False,
1193
+ vertical_overflow="visible",
1194
+ ) as live:
1195
+ for token in self.inference_engine.generate(request):
1196
+ if first_token_time is None:
1197
+ first_token_time = time.time()
1185
1198
 
1186
- display_token = token
1187
- if uses_reasoning_template and template_profile and template_profile.supports_streaming():
1188
- display_token, should_display = template_profile.process_streaming_response(
1189
- token, accumulated_response
1190
- )
1191
- accumulated_response += token
1192
- if not should_display:
1193
- display_token = ""
1199
+ generated_text += token
1200
+ token_count += 1
1201
+
1202
+ display_token = token
1203
+ if uses_reasoning_template and template_profile and template_profile.supports_streaming():
1204
+ display_token, should_display = template_profile.process_streaming_response(
1205
+ token, accumulated_response
1206
+ )
1207
+ accumulated_response += token
1208
+ if not should_display:
1209
+ display_token = ""
1194
1210
 
1195
- if display_token:
1196
- display_text += display_token
1211
+ if display_token:
1212
+ display_text += display_token
1197
1213
 
1198
- now = time.time()
1199
- if display_token and ("\n" in display_token or now - last_render_time >= render_interval):
1200
- live.update(build_renderable(display_text))
1201
- last_render_time = now
1214
+ now = time.time()
1215
+ if display_token and ("\n" in display_token or now - last_render_time >= render_interval):
1216
+ live.update(build_renderable(display_text), refresh=True)
1217
+ last_render_time = now
1202
1218
 
1203
- if uses_reasoning_template and template_profile:
1204
- final_text = template_profile.process_response(generated_text)
1205
- generated_text = final_text
1206
- if not template_profile.config.show_reasoning:
1207
- display_text = final_text
1219
+ if uses_reasoning_template and template_profile:
1220
+ final_text = template_profile.process_response(generated_text)
1221
+ generated_text = final_text
1222
+ if not template_profile.config.show_reasoning:
1223
+ display_text = final_text
1208
1224
 
1209
- live.update(build_renderable(display_text))
1225
+ live.update(build_renderable(display_text), refresh=True)
1226
+ finally:
1227
+ self.console._width = original_console_width
1210
1228
 
1211
- # Add blank line for spacing between response and metrics
1212
- print()
1213
-
1214
1229
  # Display final metrics in a clean, professional way
1215
1230
  elapsed = time.time() - start_time
1216
1231
  if token_count > 0 and elapsed > 0:
@@ -1238,6 +1253,9 @@ class CortexCLI:
1238
1253
  metrics_line = " · ".join(metrics_parts)
1239
1254
  print(f" \033[2m{metrics_line}\033[0m")
1240
1255
 
1256
+ if token_count >= request.max_tokens:
1257
+ print(f" \033[2m(output truncated at max_tokens={request.max_tokens}; increase in config.yaml)\033[0m")
1258
+
1241
1259
  # Add assistant message to conversation history
1242
1260
  self.conversation_manager.add_message(MessageRole.ASSISTANT, generated_text)
1243
1261
 
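The streaming loop now drives Rich's `Live` manually (`auto_refresh=False` plus explicit `live.update(..., refresh=True)`), caps the console width, and restores it in a `finally` block. A self-contained sketch of the throttled-refresh pattern, with a fake token stream standing in for the inference engine:

```python
import time
from rich.console import Console
from rich.live import Live
from rich.markdown import Markdown

console = Console()

def fake_tokens():
    for word in "Streaming output is redrawn at most a few times per second .".split():
        yield word + " "
        time.sleep(0.05)

text = ""
last_render = 0.0
render_interval = 0.1  # seconds between redraws, mirroring the throttling in the hunk

with Live(Markdown(""), console=console, auto_refresh=False, transient=False) as live:
    for token in fake_tokens():
        text += token
        now = time.time()
        if "\n" in token or now - last_render >= render_interval:
            live.update(Markdown(text), refresh=True)  # manual refresh only when needed
            last_render = now
    live.update(Markdown(text), refresh=True)  # final redraw with the complete text
```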
cortex/ui/markdown_render.py CHANGED
@@ -3,10 +3,12 @@
3
3
  from typing import List
4
4
 
5
5
  from rich.console import Console
6
+ from rich.cells import cell_len
6
7
  from rich.markdown import Markdown
7
8
  from rich.segment import Segment
8
9
  from rich.style import Style
9
10
  from rich.syntax import Syntax
11
+ from rich.text import Text
10
12
 
11
13
  THINK_START_MARKER = "[[[THINK_START]]]"
12
14
  THINK_END_MARKER = "[[[THINK_END]]]"
@@ -45,6 +47,14 @@ class CodeBlockWithLineNumbers(Markdown.elements["fence"]):
45
47
  yield syntax
46
48
 
47
49
 
50
+ class CodeBlockPlain(Markdown.elements["fence"]):
51
+ """Markdown code block rendered as plain text (no syntax highlighting)."""
52
+
53
+ def __rich_console__(self, console: Console, options):
54
+ code = str(self.text).rstrip()
55
+ yield Text(code)
56
+
57
+
48
58
  class MarkdownWithLineNumbers(Markdown):
49
59
  """Markdown renderer that keeps line numbers for fenced code blocks."""
50
60
 
@@ -55,6 +65,26 @@ class MarkdownWithLineNumbers(Markdown):
55
65
  })
56
66
 
57
67
 
68
+ class MarkdownPlainCode(Markdown):
69
+ """Markdown renderer that disables syntax highlighting for code blocks."""
70
+
71
+ elements = Markdown.elements.copy()
72
+ elements.update({
73
+ "fence": CodeBlockPlain,
74
+ "code_block": CodeBlockPlain,
75
+ })
76
+
77
+
78
+ class MarkdownPlainCodeWithLineNumbers(Markdown):
79
+ """Markdown renderer with plain code blocks and line numbers."""
80
+
81
+ elements = MarkdownWithLineNumbers.elements.copy()
82
+ elements.update({
83
+ "fence": CodeBlockPlain,
84
+ "code_block": CodeBlockPlain,
85
+ })
86
+
87
+
58
88
  class ThinkMarkdown:
59
89
  """Markdown renderer that dims content inside <think> tags."""
60
90
 
@@ -63,10 +93,15 @@ class ThinkMarkdown:
63
93
  markup: str,
64
94
  code_theme: str = "monokai",
65
95
  use_line_numbers: bool = False,
96
+ syntax_highlighting: bool = True,
66
97
  ) -> None:
67
98
  marked = _mark_think_sections(markup)
68
- markdown_cls = MarkdownWithLineNumbers if use_line_numbers else Markdown
69
- self._markdown = markdown_cls(marked, code_theme=code_theme)
99
+ if syntax_highlighting:
100
+ markdown_cls = MarkdownWithLineNumbers if use_line_numbers else Markdown
101
+ self._markdown = markdown_cls(marked, code_theme=code_theme)
102
+ else:
103
+ markdown_cls = MarkdownPlainCodeWithLineNumbers if use_line_numbers else MarkdownPlainCode
104
+ self._markdown = markdown_cls(marked)
70
105
 
71
106
  def __rich_console__(self, console: Console, options):
72
107
  segments = console.render(self._markdown, options)
@@ -162,9 +197,15 @@ class PrefixedRenderable:
162
197
  self.indent = indent if indent is not None else " " * len(prefix)
163
198
 
164
199
  def __rich_console__(self, console: Console, options):
200
+ prefix_width = cell_len(self.prefix)
201
+ indent_width = cell_len(self.indent) if self.indent is not None else prefix_width
202
+ offset = max(prefix_width, indent_width)
203
+ inner_width = max(1, options.max_width - offset)
204
+ inner_options = options.update_width(inner_width)
205
+
165
206
  yield Segment(self.prefix, self.prefix_style)
166
207
 
167
- for segment in console.render(self.renderable, options):
208
+ for segment in console.render(self.renderable, inner_options):
168
209
  if segment.control:
169
210
  yield segment
170
211
  continue
@@ -183,3 +224,27 @@ class PrefixedRenderable:
183
224
  if index < len(parts) - 1:
184
225
  yield Segment("\n", style)
185
226
  yield Segment(self.indent, None)
227
+
228
+
229
+ def render_plain_with_think(text: str) -> Text:
230
+ """Render plain text while dimming content inside <think> tags."""
231
+ output = Text()
232
+ dim_style = Style(dim=True)
233
+ idx = 0
234
+ in_think = False
235
+
236
+ while idx < len(text):
237
+ if text.startswith("<think>", idx):
238
+ in_think = True
239
+ idx += len("<think>")
240
+ continue
241
+ if text.startswith("</think>", idx):
242
+ in_think = False
243
+ idx += len("</think>")
244
+ continue
245
+
246
+ char = text[idx]
247
+ output.append(char, dim_style if in_think else None)
248
+ idx += 1
249
+
250
+ return output
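`render_plain_with_think` walks the text once and dims anything between `<think>` and `</think>` markers. A short usage sketch, assuming the helper is importable as added in this diff (the import path matches the one used in `cli.py`):

```python
from rich.console import Console

# Hypothetical usage of the helper added above.
from cortex.ui.markdown_render import render_plain_with_think

console = Console()
console.print(render_plain_with_think("<think>weighing options...</think>Final answer: 42"))
# The reasoning span prints dimmed; the rest prints with default styling.
```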
cortex_llm-1.0.7.dist-info/METADATA ADDED
@@ -0,0 +1,169 @@
1
+ Metadata-Version: 2.4
2
+ Name: cortex-llm
3
+ Version: 1.0.7
4
+ Summary: GPU-Accelerated LLM Terminal for Apple Silicon
5
+ Home-page: https://github.com/faisalmumtaz/Cortex
6
+ Author: Cortex Development Team
7
+ License-Expression: MIT
8
+ Project-URL: Homepage, https://github.com/faisalmumtaz/Cortex
9
+ Project-URL: Bug Tracker, https://github.com/faisalmumtaz/Cortex/issues
10
+ Project-URL: Documentation, https://github.com/faisalmumtaz/Cortex/wiki
11
+ Keywords: llm,gpu,metal,mps,apple-silicon,ai,machine-learning,terminal,mlx,pytorch
12
+ Platform: darwin
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Operating System :: MacOS
20
+ Classifier: Environment :: Console
21
+ Classifier: Environment :: GPU
22
+ Requires-Python: >=3.11
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: torch>=2.1.0
26
+ Requires-Dist: mlx>=0.30.4
27
+ Requires-Dist: mlx-lm>=0.30.5
28
+ Requires-Dist: transformers>=4.36.0
29
+ Requires-Dist: safetensors>=0.4.0
30
+ Requires-Dist: huggingface-hub>=0.19.0
31
+ Requires-Dist: accelerate>=0.25.0
32
+ Requires-Dist: llama-cpp-python>=0.2.0
33
+ Requires-Dist: pyyaml>=6.0
34
+ Requires-Dist: pydantic>=2.5.0
35
+ Requires-Dist: rich>=13.0.0
36
+ Requires-Dist: psutil>=5.9.0
37
+ Requires-Dist: numpy>=1.24.0
38
+ Requires-Dist: packaging>=23.0
39
+ Requires-Dist: requests>=2.31.0
40
+ Provides-Extra: dev
41
+ Requires-Dist: pytest>=7.4.0; extra == "dev"
42
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
43
+ Requires-Dist: black>=23.0.0; extra == "dev"
44
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
45
+ Requires-Dist: mypy>=1.8.0; extra == "dev"
46
+ Provides-Extra: optional
47
+ Requires-Dist: sentencepiece>=0.1.99; extra == "optional"
48
+ Requires-Dist: auto-gptq>=0.7.0; extra == "optional"
49
+ Requires-Dist: autoawq>=0.2.0; extra == "optional"
50
+ Requires-Dist: bitsandbytes>=0.41.0; extra == "optional"
51
+ Requires-Dist: optimum>=1.16.0; extra == "optional"
52
+ Requires-Dist: torchvision>=0.16.0; extra == "optional"
53
+ Requires-Dist: torchaudio>=2.1.0; extra == "optional"
54
+ Dynamic: home-page
55
+ Dynamic: license-file
56
+ Dynamic: platform
57
+ Dynamic: requires-python
58
+
59
+ # Cortex
60
+
61
+ GPU-accelerated local LLMs on Apple Silicon, built for the terminal.
62
+
63
+ Cortex is a fast, native CLI for running and fine-tuning LLMs on Apple Silicon using MLX and Metal. It automatically detects chat templates, supports multiple model formats, and keeps your workflow inside the terminal.
64
+
65
+ ## Highlights
66
+
67
+ - Apple Silicon GPU acceleration via MLX (primary) and PyTorch MPS
68
+ - Multi-format model support: MLX, GGUF, SafeTensors, PyTorch, GPTQ, AWQ
69
+ - Built-in LoRA fine-tuning wizard
70
+ - Chat template auto-detection (ChatML, Llama, Alpaca, Gemma, Reasoning)
71
+ - Conversation history with autosave and export
72
+
73
+ ## Quick Start
74
+
75
+ ```bash
76
+ pipx install cortex-llm
77
+ cortex
78
+ ```
79
+
80
+ Inside Cortex:
81
+
82
+ - `/download` to fetch a model from HuggingFace
83
+ - `/model` to load or manage models
84
+ - `/status` to confirm GPU acceleration and current settings
85
+
86
+ ## Installation
87
+
88
+ ### Option A: pipx (recommended)
89
+
90
+ ```bash
91
+ pipx install cortex-llm
92
+ ```
93
+
94
+ ### Option B: from source
95
+
96
+ ```bash
97
+ git clone https://github.com/faisalmumtaz/Cortex.git
98
+ cd Cortex
99
+ ./install.sh
100
+ ```
101
+
102
+ The installer checks Apple Silicon compatibility, creates a venv, installs dependencies from `pyproject.toml`, and sets up the `cortex` command.
103
+
104
+ ## Requirements
105
+
106
+ - Apple Silicon Mac (M1/M2/M3/M4)
107
+ - macOS 13.3+
108
+ - Python 3.11+
109
+ - 16GB+ unified memory (24GB+ recommended for larger models)
110
+ - Xcode Command Line Tools
111
+
112
+ ## Model Support
113
+
114
+ Cortex supports:
115
+
116
+ - **MLX** (recommended)
117
+ - **GGUF** (llama.cpp + Metal)
118
+ - **SafeTensors**
119
+ - **PyTorch** (Transformers + MPS)
120
+ - **GPTQ** / **AWQ** quantized models
121
+
122
+ ## Advanced Features
123
+
124
+ - **Dynamic quantization fallback** for PyTorch/SafeTensors models that do not fit GPU memory (INT8 preferred, INT4 fallback)
125
+ - `docs/dynamic-quantization.md`
126
+ - **MLX conversion with quantization recipes** (4/5/8-bit, mixed precision) for speed vs quality control
127
+ - `docs/mlx-acceleration.md`
128
+ - **LoRA fine-tuning wizard** for local adapters (`/finetune`)
129
+ - `docs/fine-tuning.md`
130
+ - **Template registry and auto-detection** for chat formatting (ChatML, Llama, Alpaca, Gemma, Reasoning)
131
+ - `docs/template-registry.md`
132
+ - **Inference engine details** and backend behavior
133
+ - `docs/inference-engine.md`
134
+
135
+ ## Configuration
136
+
137
+ Cortex reads `config.yaml` from the current working directory. For tuning GPU memory limits, quantization defaults, and inference parameters, see:
138
+
139
+ - `docs/configuration.md`
140
+
141
+ ## Documentation
142
+
143
+ Start here:
144
+
145
+ - `docs/installation.md`
146
+ - `docs/cli.md`
147
+ - `docs/model-management.md`
148
+ - `docs/troubleshooting.md`
149
+
150
+ Advanced topics:
151
+
152
+ - `docs/mlx-acceleration.md`
153
+ - `docs/inference-engine.md`
154
+ - `docs/dynamic-quantization.md`
155
+ - `docs/template-registry.md`
156
+ - `docs/fine-tuning.md`
157
+ - `docs/development.md`
158
+
159
+ ## Contributing
160
+
161
+ Contributions are welcome. See `docs/development.md` for setup and workflow.
162
+
163
+ ## License
164
+
165
+ MIT License. See `LICENSE`.
166
+
167
+ ---
168
+
169
+ Note: Cortex requires Apple Silicon. Intel Macs are not supported.
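The Configuration section above points at a `config.yaml` in the working directory. A hedged sketch of reading such a file with the nested sections that appear elsewhere in this diff (`inference`, `model`, `ui`); the key names are inferred from config attributes used in the code and are illustrative, not a documented schema:

```python
import yaml  # provided by the pyyaml dependency

# Illustrative only: key names are inferred from config attributes seen in this diff.
sample = """
inference:
  max_tokens: 4096
  top_p: 0.95
  top_k: 40
model:
  default_quantization: auto
  auto_quantize: true
ui:
  markdown_rendering: true
"""

config = yaml.safe_load(sample)
print(config["inference"]["max_tokens"])        # 4096
print(config["model"]["default_quantization"])  # auto
```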
cortex_llm-1.0.7.dist-info/RECORD CHANGED
@@ -1,11 +1,11 @@
1
- cortex/__init__.py,sha256=MV1R79rhxMlbGTjL6X9AedyOGPdotx-PJf2RviGbXUM,2202
1
+ cortex/__init__.py,sha256=zd80dwfLqU5IbsIPvILFhFEI58aI4oOjk1jpzzqMKKw,2202
2
2
  cortex/__main__.py,sha256=I7Njt7BjGoHtPhftDoA44OyOYbwWNNaPwP_qlJSn0J4,2857
3
- cortex/config.py,sha256=txmpJXy3kUEKULZyu1OWb_jkNQRHZClm5ovZfCTX_Zc,13444
3
+ cortex/config.py,sha256=IQnMaXznTflTSvr91aybtPMnNW088r-BYeVMhxny63w,13444
4
4
  cortex/conversation_manager.py,sha256=aSTdGjVttsMKIiRPzztP0tOXlqZBkWtgZDNCZGyaR-c,17177
5
5
  cortex/gpu_validator.py,sha256=un6vMQ78MWMnKWIz8n-92v9Fb4g_YXqU_E1pUPinncY,16582
6
- cortex/inference_engine.py,sha256=pcoSBw8ooqdJmQtPP8Y-DrBusf6VGWZjPRik9NLSRrg,28632
6
+ cortex/inference_engine.py,sha256=bklCjmiMn3psFp14EZxRzePEuA33NCHJ1bQdsbvMlfg,29343
7
7
  cortex/model_downloader.py,sha256=VuPhvxq_66qKjsPjEWcLW-VmUHzOHik6LBMiGDk-cX8,4977
8
- cortex/model_manager.py,sha256=Blk-JA_kajJcDp-h2A4tplECijHPw8LZ8c_fbq0FGFg,100670
8
+ cortex/model_manager.py,sha256=Ra21TjhtFS-7_hRzDMh9m0BUazIGWoKr7Gye3GiVRJM,102671
9
9
  cortex/fine_tuning/__init__.py,sha256=IXKQqNqN1C3mha3na35i7KI-hMnsqqrmUgV4NrPKHy0,269
10
10
  cortex/fine_tuning/dataset.py,sha256=hIz_dfFSaJoiFzWZ6vwlwqjpTfdsnFNIEmwhhTD2d9k,15414
11
11
  cortex/fine_tuning/mlx_lora_trainer.py,sha256=idNzKtVG8pObwsnSrP0N1rU1EanhrIRvHiNL1asdzr8,22438
@@ -21,7 +21,7 @@ cortex/metal/mps_optimizer.py,sha256=4r6dj-_KAr3vedCwwu7lR-nIaF4g4D4kkOoF2KiQ0FQ
21
21
  cortex/metal/optimizer.py,sha256=9ixKj8ca1iovF-mFHYGa9_DUHcqgGyzLoP_lIRAzfMM,21996
22
22
  cortex/metal/performance_profiler.py,sha256=GMxxqwqE2kVJ4WePwVdUp2ADqhrV6wCCNrFnaMfBDpI,12274
23
23
  cortex/quantization/__init__.py,sha256=ElLP3ZO_XItddTl-PeoJ5GPb16RYIAk8m5sqwfAVE9s,184
24
- cortex/quantization/dynamic_quantizer.py,sha256=sAoHoQ6wfs6FvejG-iehB2Qij-0WC9qSTlBfj3D1pTI,31724
24
+ cortex/quantization/dynamic_quantizer.py,sha256=vV0RSPMoWeOPALwFOs0DzqIA2MkGpeEpqB2vTeudhW0,31934
25
25
  cortex/template_registry/__init__.py,sha256=O5BWmHRmfMSK-Ukpu8UqFO_kaN0kum-d-Wsz0Ds-sC0,491
26
26
  cortex/template_registry/auto_detector.py,sha256=lqI19Ef_w6ClZvD5dzDw1i5gnf2AUN_L4WjCMvW99Yg,5432
27
27
  cortex/template_registry/config_manager.py,sha256=vh7cXAUTJ4dLY74u5EHTpTa46jXxj34BlMyWsC_ZIaM,8658
@@ -38,12 +38,12 @@ cortex/template_registry/template_profiles/standard/gemma.py,sha256=D4wZN3_6QzUj
38
38
  cortex/template_registry/template_profiles/standard/llama.py,sha256=jz4MyvmISSPtIAcffPE7LrTosHvlC0NoJhzTw1DCvpY,3209
39
39
  cortex/template_registry/template_profiles/standard/simple.py,sha256=dGOOcL6HRoJFxkixLrYC4w7c63h-QmOOWC2TsOihYog,2422
40
40
  cortex/ui/__init__.py,sha256=t3GrHJMHTVgBEKh2_qt4B9mS594V5jriTDqc3eZKMGc,3409
41
- cortex/ui/cli.py,sha256=ExzP56n1yV4bdA1EOqHSDFRWhpgpX0lkghq0H0FXw7Q,74661
42
- cortex/ui/markdown_render.py,sha256=bXt60vkNYT_jbpKeIg_1OlcrxssmdbMO7RB2E1sWw3E,5759
41
+ cortex/ui/cli.py,sha256=QZhiV9z8hP9Fu5mvpzURSWLptDDRaJLmNLm2AqTGlqE,75734
42
+ cortex/ui/markdown_render.py,sha256=D4gSvv0TERFIAXYs3e76eaPsuvvD2cNT98PDKyUPnWI,7776
43
43
  cortex/ui/terminal_app.py,sha256=SF3KqcGFyZ4hpTmgX21idPzOTJLdKGkt4QdA-wwUBNE,18317
44
- cortex_llm-1.0.5.dist-info/licenses/LICENSE,sha256=_frJ3VsZWQGhMznZw2Tgjk7xwfAfDZRcBl43uZh8_4E,1070
45
- cortex_llm-1.0.5.dist-info/METADATA,sha256=YMeikmD3YDjQhf8DjPd2pBSywHsNiqZJmMNNxQLAH-w,10087
46
- cortex_llm-1.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
47
- cortex_llm-1.0.5.dist-info/entry_points.txt,sha256=g83Nuz3iFrNdMLHxGLR2LnscdM7rdQRchuL3WGobQC8,48
48
- cortex_llm-1.0.5.dist-info/top_level.txt,sha256=79LAeTJJ_pMIBy3mkF7uNaN0mdBRt5tGrnne5N_iAio,7
49
- cortex_llm-1.0.5.dist-info/RECORD,,
44
+ cortex_llm-1.0.7.dist-info/licenses/LICENSE,sha256=_frJ3VsZWQGhMznZw2Tgjk7xwfAfDZRcBl43uZh8_4E,1070
45
+ cortex_llm-1.0.7.dist-info/METADATA,sha256=jUwV2nVs0EL01Iqap64U3mI5QFPrHv3pt5sE1SvmAA0,5119
46
+ cortex_llm-1.0.7.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
47
+ cortex_llm-1.0.7.dist-info/entry_points.txt,sha256=g83Nuz3iFrNdMLHxGLR2LnscdM7rdQRchuL3WGobQC8,48
48
+ cortex_llm-1.0.7.dist-info/top_level.txt,sha256=79LAeTJJ_pMIBy3mkF7uNaN0mdBRt5tGrnne5N_iAio,7
49
+ cortex_llm-1.0.7.dist-info/RECORD,,
cortex_llm-1.0.7.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
cortex_llm-1.0.5.dist-info/METADATA DELETED
@@ -1,275 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: cortex-llm
3
- Version: 1.0.5
4
- Summary: GPU-Accelerated LLM Terminal for Apple Silicon
5
- Home-page: https://github.com/faisalmumtaz/Cortex
6
- Author: Cortex Development Team
7
- License: MIT
8
- Project-URL: Homepage, https://github.com/faisalmumtaz/Cortex
9
- Project-URL: Bug Tracker, https://github.com/faisalmumtaz/Cortex/issues
10
- Project-URL: Documentation, https://github.com/faisalmumtaz/Cortex/wiki
11
- Keywords: llm,gpu,metal,mps,apple-silicon,ai,machine-learning,terminal,mlx,pytorch
12
- Platform: darwin
13
- Classifier: Development Status :: 4 - Beta
14
- Classifier: Intended Audience :: Developers
15
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
- Classifier: License :: OSI Approved :: MIT License
17
- Classifier: Programming Language :: Python :: 3
18
- Classifier: Programming Language :: Python :: 3.11
19
- Classifier: Programming Language :: Python :: 3.12
20
- Classifier: Operating System :: MacOS
21
- Classifier: Environment :: Console
22
- Classifier: Environment :: GPU
23
- Requires-Python: >=3.11
24
- Description-Content-Type: text/markdown
25
- License-File: LICENSE
26
- Requires-Dist: torch>=2.1.0
27
- Requires-Dist: mlx>=0.30.4
28
- Requires-Dist: mlx-lm>=0.30.5
29
- Requires-Dist: transformers>=4.36.0
30
- Requires-Dist: safetensors>=0.4.0
31
- Requires-Dist: huggingface-hub>=0.19.0
32
- Requires-Dist: accelerate>=0.25.0
33
- Requires-Dist: llama-cpp-python>=0.2.0
34
- Requires-Dist: pyyaml>=6.0
35
- Requires-Dist: pydantic>=2.5.0
36
- Requires-Dist: rich>=13.0.0
37
- Requires-Dist: psutil>=5.9.0
38
- Requires-Dist: numpy>=1.24.0
39
- Requires-Dist: packaging>=23.0
40
- Requires-Dist: requests>=2.31.0
41
- Provides-Extra: dev
42
- Requires-Dist: pytest>=7.4.0; extra == "dev"
43
- Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
44
- Requires-Dist: black>=23.0.0; extra == "dev"
45
- Requires-Dist: ruff>=0.1.0; extra == "dev"
46
- Requires-Dist: mypy>=1.8.0; extra == "dev"
47
- Provides-Extra: optional
48
- Requires-Dist: sentencepiece>=0.1.99; extra == "optional"
49
- Requires-Dist: auto-gptq>=0.7.0; extra == "optional"
50
- Requires-Dist: autoawq>=0.2.0; extra == "optional"
51
- Requires-Dist: bitsandbytes>=0.41.0; extra == "optional"
52
- Requires-Dist: optimum>=1.16.0; extra == "optional"
53
- Requires-Dist: torchvision>=0.16.0; extra == "optional"
54
- Requires-Dist: torchaudio>=2.1.0; extra == "optional"
55
- Dynamic: home-page
56
- Dynamic: license-file
57
- Dynamic: platform
58
- Dynamic: requires-python
59
-
60
- # Cortex - LLM Terminal Client for Apple Silicon
61
-
62
- Cortex is an LLM terminal interface designed for Apple Silicon, using MLX and PyTorch MPS frameworks for GPU-accelerated inference.
63
-
64
- ## What It Does
65
-
66
- - **GPU-accelerated inference** via MLX (primary) and PyTorch MPS backends
67
- - **Apple Silicon required** - leverages unified memory architecture
68
- - **Multiple model formats** - MLX, GGUF, SafeTensors, PyTorch, GPTQ, AWQ
69
- - **Built-in fine-tuning** - LoRA-based model customization via interactive wizard
70
- - **Chat template auto-detection** - automatic format detection with confidence scoring
71
- - **Conversation persistence** - SQLite-backed chat history with branching
72
-
73
- ## Features
74
-
75
- - **GPU-Accelerated Inference** - Delegates to MLX and PyTorch MPS for Metal-based execution
76
- - **Apple Silicon Only** - Requires Metal GPU; exits if GPU acceleration is unavailable
77
- - **Model Format Support**:
78
- - MLX (Apple's format, loaded via `mlx_lm`)
79
- - GGUF (via `llama-cpp-python` with Metal backend)
80
- - SafeTensors (via HuggingFace `transformers`)
81
- - PyTorch models (via HuggingFace `transformers` with MPS device)
82
- - GPTQ quantized (via `auto-gptq`)
83
- - AWQ quantized (via `awq`)
84
- - **Quantization** - 4-bit, 5-bit, 8-bit, and mixed-precision quantization via MLX conversion pipeline
85
- - **Model Conversion** - Convert HuggingFace models to MLX format with configurable quantization recipes
86
- - **Template Registry** - Automatic detection of chat templates (ChatML, Llama, Alpaca, Gemma, Reasoning) with confidence scoring and real-time token filtering for reasoning models
87
- - **Rotating KV Cache** - MLX-based KV cache for long context handling (default 4096 tokens)
88
- - **Fine-Tuning** - LoRA-based model customization with interactive 6-step wizard
89
- - **Terminal UI** - ANSI terminal interface with streaming output
90
-
91
- ## Installation
92
-
93
- ```bash
94
- # Clone and install
95
- git clone https://github.com/faisalmumtaz/Cortex.git
96
- cd Cortex
97
- ./install.sh
98
- ```
99
-
100
- The installer:
101
- - Checks for Apple Silicon (arm64) compatibility
102
- - Creates a Python virtual environment
103
- - Installs dependencies via `pip install -e .` (from `pyproject.toml`)
104
- - Sets up the `cortex` command in your PATH
105
-
106
- ### Quick Install (pipx)
107
-
108
- If you just want the CLI without cloning the repo, use pipx:
109
-
110
- ```bash
111
- pipx install cortex-llm
112
- ```
113
-
114
- ## Quick Start
115
-
116
- ```bash
117
- # After installation, just run:
118
- cortex
119
- ```
120
-
121
- ### Downloading Models
122
-
123
- ```bash
124
- # Inside Cortex, use the download command:
125
- cortex
126
- # Then type: /download
127
- ```
128
-
129
- The download feature:
130
- - **HuggingFace integration** - download any model by repository ID
131
- - **Automatic loading** - option to load model immediately after download
132
-
133
- ## Documentation
134
-
135
- ### User Documentation
136
- - **[Installation Guide](docs/installation.md)** - Complete setup instructions
137
- - **[CLI Reference](docs/cli.md)** - Commands and user interface
138
- - **[Configuration](docs/configuration.md)** - System settings and optimization
139
- - **[Model Management](docs/model-management.md)** - Loading and managing models
140
- - **[Template Registry](docs/template-registry.md)** - Automatic chat template detection and management
141
- - **[Fine-Tuning Guide](docs/fine-tuning.md)** - Customize models with LoRA
142
- - **[Troubleshooting](docs/troubleshooting.md)** - Common issues and solutions
143
-
144
- ### Technical Documentation
145
- - **[MLX Acceleration](docs/mlx-acceleration.md)** - MLX framework integration and optimization
146
- - **[GPU Validation](docs/gpu-validation.md)** - Hardware requirements and detection
147
- - **[Inference Engine](docs/inference-engine.md)** - Text generation architecture
148
- - **[Conversation Management](docs/conversation-management.md)** - Chat history and persistence
149
- - **[Development Guide](docs/development.md)** - Contributing and architecture
150
-
151
- ## System Requirements
152
-
153
- - Apple Silicon Mac (M1/M2/M3/M4 - all variants supported)
154
- - macOS 13.3+ (required by MLX framework)
155
- - Python 3.11+
156
- - 16GB+ unified memory (24GB+ recommended for larger models)
157
- - Xcode Command Line Tools
158
-
159
- ## Performance
160
-
161
- Performance depends on your Apple Silicon chip, model size, and quantization level. The inference engine measures tokens/second, first-token latency, and memory usage at runtime.
162
-
163
- To check that GPU acceleration is working:
164
-
165
- ```bash
166
- source venv/bin/activate
167
- python tests/test_apple_silicon.py
168
- ```
169
-
170
- You should see:
171
- - All validation checks passing
172
- - Measured GFLOPS from matrix operations
173
- - Confirmation of Metal and MLX availability
174
-
175
- ## GPU Acceleration Architecture
176
-
177
- Cortex uses a multi-layer approach, delegating all GPU computation to established frameworks:
178
-
179
- 1. **MLX Framework (Primary Backend)**
180
- - Apple's ML framework with native Metal support
181
- - Quantization support (4-bit, 5-bit, 8-bit, mixed-precision)
182
- - Rotating KV cache for long contexts
183
- - JIT compilation via `mx.compile`
184
- - Operation fusion for reduced kernel launches
185
-
186
- 2. **PyTorch MPS Backend**
187
- - Metal Performance Shaders for PyTorch models
188
- - FP16 optimization and channels-last tensor format
189
-
190
- 3. **llama.cpp (GGUF Backend)**
191
- - Metal-accelerated inference for GGUF models
192
-
193
- 4. **Memory Management**
194
- - Pre-allocated memory pools with best-fit/first-fit allocation strategies
195
- - Automatic pool sizing (60% of available memory, capped at 75% of total)
196
- - Defragmentation support
197
-
198
- ### Understanding "Skipping Kernel" Messages
199
-
200
- When loading GGUF models, you may see messages like:
201
- ```
202
- ggml_metal_init: skipping kernel_xxx_bf16 (not supported)
203
- ```
204
-
205
- **These are NORMAL!** They indicate:
206
- - BF16 kernels being skipped (your GPU uses FP16 instead)
207
- - GPU acceleration is still fully active
208
- - The system automatically uses optimal alternatives
209
-
210
- ## Troubleshooting
211
-
212
- If you suspect GPU isn't being used:
213
-
214
- 1. **Run validation**: `python tests/test_apple_silicon.py`
215
- 2. **Check output**: Should see passing checks and measured GFLOPS
216
- 3. **Monitor tokens/sec**: Displayed during inference
217
- 4. **Verify Metal**: Ensure Xcode Command Line Tools installed
218
-
219
- Common issues:
220
- - **Low performance**: Run `python tests/test_apple_silicon.py` to diagnose
221
- - **Memory errors**: Reduce `gpu_memory_fraction` in config.yaml
222
-
223
- ## MLX Model Conversion
224
-
225
- Cortex includes an MLX model converter:
226
-
227
- ```python
228
- from cortex.metal.mlx_converter import MLXConverter, ConversionConfig, QuantizationRecipe
229
-
230
- converter = MLXConverter()
231
- config = ConversionConfig(
232
- quantization=QuantizationRecipe.SPEED_4BIT, # 4-bit quantization
233
- compile_model=True # JIT compilation
234
- )
235
-
236
- success, message, output_path = converter.convert_model(
237
- "microsoft/DialoGPT-medium",
238
- config=config
239
- )
240
- ```
241
-
242
- ### Quantization Options
243
-
244
- - **4-bit**: Maximum speed, 75% size reduction
245
- - **5-bit**: Balanced speed and quality
246
- - **8-bit**: Higher quality, 50% size reduction
247
- - **Mixed Precision**: Custom per-layer quantization
248
-
249
- ## MLX as Primary Backend
250
-
251
- Cortex uses MLX (Apple's machine learning framework) as the primary acceleration backend:
252
- - **Metal Support**: GPU execution via MLX's built-in Metal operations
253
- - **Quantization**: Support for 4-bit, 5-bit, 8-bit, and mixed-precision quantization
254
- - **Model Conversion**: Convert HuggingFace models to MLX format
255
-
256
- ## Built With
257
-
258
- - [MLX](https://github.com/ml-explore/mlx) - Apple's machine learning framework
259
- - [mlx-lm](https://github.com/ml-explore/mlx-examples) - LLM utilities and LoRA fine-tuning for MLX
260
- - [PyTorch](https://pytorch.org/) - With Metal Performance Shaders backend
261
- - [llama.cpp](https://github.com/ggerganov/llama.cpp) - Metal-accelerated GGUF support
262
- - [Rich](https://github.com/Textualize/rich) - Terminal formatting
263
- - [HuggingFace](https://huggingface.co/) - Model hub and transformers
264
-
265
- ## Contributing
266
-
267
- We welcome contributions! Please see the [Development Guide](docs/development.md) for contributing guidelines and setup instructions.
268
-
269
- ## License
270
-
271
- MIT License - See [LICENSE](LICENSE) for details.
272
-
273
- ---
274
-
275
- **Note**: Cortex requires Apple Silicon. Intel Macs are not supported.