cortex-llm 1.0.5.tar.gz → 1.0.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. cortex_llm-1.0.7/PKG-INFO +169 -0
  2. cortex_llm-1.0.7/README.md +111 -0
  3. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/__init__.py +1 -1
  4. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/config.py +1 -1
  5. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/inference_engine.py +17 -3
  6. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/model_manager.py +45 -1
  7. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/quantization/dynamic_quantizer.py +8 -5
  8. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/ui/cli.py +59 -41
  9. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/ui/markdown_render.py +68 -3
  10. cortex_llm-1.0.7/cortex_llm.egg-info/PKG-INFO +169 -0
  11. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/pyproject.toml +3 -4
  12. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/setup.py +4 -4
  13. cortex_llm-1.0.5/PKG-INFO +0 -275
  14. cortex_llm-1.0.5/README.md +0 -216
  15. cortex_llm-1.0.5/cortex_llm.egg-info/PKG-INFO +0 -275
  16. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/LICENSE +0 -0
  17. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/__main__.py +0 -0
  18. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/conversation_manager.py +0 -0
  19. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/fine_tuning/__init__.py +0 -0
  20. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/fine_tuning/dataset.py +0 -0
  21. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/fine_tuning/mlx_lora_trainer.py +0 -0
  22. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/fine_tuning/trainer.py +0 -0
  23. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/fine_tuning/wizard.py +0 -0
  24. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/gpu_validator.py +0 -0
  25. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/metal/__init__.py +0 -0
  26. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/metal/gpu_validator.py +0 -0
  27. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/metal/memory_pool.py +0 -0
  28. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/metal/mlx_accelerator.py +0 -0
  29. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/metal/mlx_compat.py +0 -0
  30. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/metal/mlx_converter.py +0 -0
  31. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/metal/mps_optimizer.py +0 -0
  32. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/metal/optimizer.py +0 -0
  33. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/metal/performance_profiler.py +0 -0
  34. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/model_downloader.py +0 -0
  35. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/quantization/__init__.py +0 -0
  36. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/__init__.py +0 -0
  37. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/auto_detector.py +0 -0
  38. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/config_manager.py +0 -0
  39. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/interactive.py +0 -0
  40. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/registry.py +0 -0
  41. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/__init__.py +0 -0
  42. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/base.py +0 -0
  43. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/complex/__init__.py +0 -0
  44. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/complex/reasoning.py +0 -0
  45. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/standard/__init__.py +0 -0
  46. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/standard/alpaca.py +0 -0
  47. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/standard/chatml.py +0 -0
  48. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/standard/gemma.py +0 -0
  49. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/standard/llama.py +0 -0
  50. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/template_registry/template_profiles/standard/simple.py +0 -0
  51. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/ui/__init__.py +0 -0
  52. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex/ui/terminal_app.py +0 -0
  53. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex_llm.egg-info/SOURCES.txt +0 -0
  54. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex_llm.egg-info/dependency_links.txt +0 -0
  55. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex_llm.egg-info/entry_points.txt +0 -0
  56. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex_llm.egg-info/not-zip-safe +0 -0
  57. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex_llm.egg-info/requires.txt +0 -0
  58. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/cortex_llm.egg-info/top_level.txt +0 -0
  59. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/setup.cfg +0 -0
  60. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/tests/test_apple_silicon.py +0 -0
  61. {cortex_llm-1.0.5 → cortex_llm-1.0.7}/tests/test_metal_optimization.py +0 -0
cortex_llm-1.0.7/PKG-INFO
@@ -0,0 +1,169 @@
+ Metadata-Version: 2.4
+ Name: cortex-llm
+ Version: 1.0.7
+ Summary: GPU-Accelerated LLM Terminal for Apple Silicon
+ Home-page: https://github.com/faisalmumtaz/Cortex
+ Author: Cortex Development Team
+ License-Expression: MIT
+ Project-URL: Homepage, https://github.com/faisalmumtaz/Cortex
+ Project-URL: Bug Tracker, https://github.com/faisalmumtaz/Cortex/issues
+ Project-URL: Documentation, https://github.com/faisalmumtaz/Cortex/wiki
+ Keywords: llm,gpu,metal,mps,apple-silicon,ai,machine-learning,terminal,mlx,pytorch
+ Platform: darwin
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Operating System :: MacOS
+ Classifier: Environment :: Console
+ Classifier: Environment :: GPU
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: torch>=2.1.0
+ Requires-Dist: mlx>=0.30.4
+ Requires-Dist: mlx-lm>=0.30.5
+ Requires-Dist: transformers>=4.36.0
+ Requires-Dist: safetensors>=0.4.0
+ Requires-Dist: huggingface-hub>=0.19.0
+ Requires-Dist: accelerate>=0.25.0
+ Requires-Dist: llama-cpp-python>=0.2.0
+ Requires-Dist: pyyaml>=6.0
+ Requires-Dist: pydantic>=2.5.0
+ Requires-Dist: rich>=13.0.0
+ Requires-Dist: psutil>=5.9.0
+ Requires-Dist: numpy>=1.24.0
+ Requires-Dist: packaging>=23.0
+ Requires-Dist: requests>=2.31.0
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.4.0; extra == "dev"
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
+ Requires-Dist: black>=23.0.0; extra == "dev"
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
+ Requires-Dist: mypy>=1.8.0; extra == "dev"
+ Provides-Extra: optional
+ Requires-Dist: sentencepiece>=0.1.99; extra == "optional"
+ Requires-Dist: auto-gptq>=0.7.0; extra == "optional"
+ Requires-Dist: autoawq>=0.2.0; extra == "optional"
+ Requires-Dist: bitsandbytes>=0.41.0; extra == "optional"
+ Requires-Dist: optimum>=1.16.0; extra == "optional"
+ Requires-Dist: torchvision>=0.16.0; extra == "optional"
+ Requires-Dist: torchaudio>=2.1.0; extra == "optional"
+ Dynamic: home-page
+ Dynamic: license-file
+ Dynamic: platform
+ Dynamic: requires-python
+
+ # Cortex
+
+ GPU-accelerated local LLMs on Apple Silicon, built for the terminal.
+
+ Cortex is a fast, native CLI for running and fine-tuning LLMs on Apple Silicon using MLX and Metal. It automatically detects chat templates, supports multiple model formats, and keeps your workflow inside the terminal.
+
+ ## Highlights
+
+ - Apple Silicon GPU acceleration via MLX (primary) and PyTorch MPS
+ - Multi-format model support: MLX, GGUF, SafeTensors, PyTorch, GPTQ, AWQ
+ - Built-in LoRA fine-tuning wizard
+ - Chat template auto-detection (ChatML, Llama, Alpaca, Gemma, Reasoning)
+ - Conversation history with autosave and export
+
+ ## Quick Start
+
+ ```bash
+ pipx install cortex-llm
+ cortex
+ ```
+
+ Inside Cortex:
+
+ - `/download` to fetch a model from HuggingFace
+ - `/model` to load or manage models
+ - `/status` to confirm GPU acceleration and current settings
+
+ ## Installation
+
+ ### Option A: pipx (recommended)
+
+ ```bash
+ pipx install cortex-llm
+ ```
+
+ ### Option B: from source
+
+ ```bash
+ git clone https://github.com/faisalmumtaz/Cortex.git
+ cd Cortex
+ ./install.sh
+ ```
+
+ The installer checks Apple Silicon compatibility, creates a venv, installs dependencies from `pyproject.toml`, and sets up the `cortex` command.
+
+ ## Requirements
+
+ - Apple Silicon Mac (M1/M2/M3/M4)
+ - macOS 13.3+
+ - Python 3.11+
+ - 16GB+ unified memory (24GB+ recommended for larger models)
+ - Xcode Command Line Tools
+
+ ## Model Support
+
+ Cortex supports:
+
+ - **MLX** (recommended)
+ - **GGUF** (llama.cpp + Metal)
+ - **SafeTensors**
+ - **PyTorch** (Transformers + MPS)
+ - **GPTQ** / **AWQ** quantized models
+
+ ## Advanced Features
+
+ - **Dynamic quantization fallback** for PyTorch/SafeTensors models that do not fit GPU memory (INT8 preferred, INT4 fallback)
+ - `docs/dynamic-quantization.md`
+ - **MLX conversion with quantization recipes** (4/5/8-bit, mixed precision) for speed vs quality control
+ - `docs/mlx-acceleration.md`
+ - **LoRA fine-tuning wizard** for local adapters (`/finetune`)
+ - `docs/fine-tuning.md`
+ - **Template registry and auto-detection** for chat formatting (ChatML, Llama, Alpaca, Gemma, Reasoning)
+ - `docs/template-registry.md`
+ - **Inference engine details** and backend behavior
+ - `docs/inference-engine.md`
+
+ ## Configuration
+
+ Cortex reads `config.yaml` from the current working directory. For tuning GPU memory limits, quantization defaults, and inference parameters, see:
+
+ - `docs/configuration.md`
+
+ ## Documentation
+
+ Start here:
+
+ - `docs/installation.md`
+ - `docs/cli.md`
+ - `docs/model-management.md`
+ - `docs/troubleshooting.md`
+
+ Advanced topics:
+
+ - `docs/mlx-acceleration.md`
+ - `docs/inference-engine.md`
+ - `docs/dynamic-quantization.md`
+ - `docs/template-registry.md`
+ - `docs/fine-tuning.md`
+ - `docs/development.md`
+
+ ## Contributing
+
+ Contributions are welcome. See `docs/development.md` for setup and workflow.
+
+ ## License
+
+ MIT License. See `LICENSE`.
+
+ ---
+
+ Note: Cortex requires Apple Silicon. Intel Macs are not supported.
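The dynamic quantization fallback called out under Advanced Features comes down to quantizing weights on the fly when a full-precision model will not fit in unified memory. A minimal sketch of the idea using PyTorch's dynamic-quantization API (illustrative only; the function name is hypothetical, and the real logic in `cortex/quantization/dynamic_quantizer.py` adds caching, validation, and an INT4 path):

```python
import torch
import torch.nn as nn

def quantize_if_needed(model: nn.Module, fits_in_gpu_memory: bool) -> nn.Module:
    """Illustrative INT8 fallback: leave the model alone if it fits,
    otherwise dynamically quantize its Linear layers to INT8."""
    if fits_in_gpu_memory:
        return model
    return torch.ao.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)
```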
cortex_llm-1.0.7/README.md
@@ -0,0 +1,111 @@
+ # Cortex
+
+ GPU-accelerated local LLMs on Apple Silicon, built for the terminal.
+
+ Cortex is a fast, native CLI for running and fine-tuning LLMs on Apple Silicon using MLX and Metal. It automatically detects chat templates, supports multiple model formats, and keeps your workflow inside the terminal.
+
+ ## Highlights
+
+ - Apple Silicon GPU acceleration via MLX (primary) and PyTorch MPS
+ - Multi-format model support: MLX, GGUF, SafeTensors, PyTorch, GPTQ, AWQ
+ - Built-in LoRA fine-tuning wizard
+ - Chat template auto-detection (ChatML, Llama, Alpaca, Gemma, Reasoning)
+ - Conversation history with autosave and export
+
+ ## Quick Start
+
+ ```bash
+ pipx install cortex-llm
+ cortex
+ ```
+
+ Inside Cortex:
+
+ - `/download` to fetch a model from HuggingFace
+ - `/model` to load or manage models
+ - `/status` to confirm GPU acceleration and current settings
+
+ ## Installation
+
+ ### Option A: pipx (recommended)
+
+ ```bash
+ pipx install cortex-llm
+ ```
+
+ ### Option B: from source
+
+ ```bash
+ git clone https://github.com/faisalmumtaz/Cortex.git
+ cd Cortex
+ ./install.sh
+ ```
+
+ The installer checks Apple Silicon compatibility, creates a venv, installs dependencies from `pyproject.toml`, and sets up the `cortex` command.
+
+ ## Requirements
+
+ - Apple Silicon Mac (M1/M2/M3/M4)
+ - macOS 13.3+
+ - Python 3.11+
+ - 16GB+ unified memory (24GB+ recommended for larger models)
+ - Xcode Command Line Tools
+
+ ## Model Support
+
+ Cortex supports:
+
+ - **MLX** (recommended)
+ - **GGUF** (llama.cpp + Metal)
+ - **SafeTensors**
+ - **PyTorch** (Transformers + MPS)
+ - **GPTQ** / **AWQ** quantized models
+
+ ## Advanced Features
+
+ - **Dynamic quantization fallback** for PyTorch/SafeTensors models that do not fit GPU memory (INT8 preferred, INT4 fallback)
+ - `docs/dynamic-quantization.md`
+ - **MLX conversion with quantization recipes** (4/5/8-bit, mixed precision) for speed vs quality control
+ - `docs/mlx-acceleration.md`
+ - **LoRA fine-tuning wizard** for local adapters (`/finetune`)
+ - `docs/fine-tuning.md`
+ - **Template registry and auto-detection** for chat formatting (ChatML, Llama, Alpaca, Gemma, Reasoning)
+ - `docs/template-registry.md`
+ - **Inference engine details** and backend behavior
+ - `docs/inference-engine.md`
+
+ ## Configuration
+
+ Cortex reads `config.yaml` from the current working directory. For tuning GPU memory limits, quantization defaults, and inference parameters, see:
+
+ - `docs/configuration.md`
+
+ ## Documentation
+
+ Start here:
+
+ - `docs/installation.md`
+ - `docs/cli.md`
+ - `docs/model-management.md`
+ - `docs/troubleshooting.md`
+
+ Advanced topics:
+
+ - `docs/mlx-acceleration.md`
+ - `docs/inference-engine.md`
+ - `docs/dynamic-quantization.md`
+ - `docs/template-registry.md`
+ - `docs/fine-tuning.md`
+ - `docs/development.md`
+
+ ## Contributing
+
+ Contributions are welcome. See `docs/development.md` for setup and workflow.
+
+ ## License
+
+ MIT License. See `LICENSE`.
+
+ ---
+
+ Note: Cortex requires Apple Silicon. Intel Macs are not supported.
cortex/__init__.py
@@ -5,7 +5,7 @@ A high-performance terminal interface for running Hugging Face LLMs locally
  with exclusive GPU acceleration via Metal Performance Shaders (MPS) and MLX.
  """

- __version__ = "1.0.5"
+ __version__ = "1.0.7"
  __author__ = "Cortex Development Team"
  __license__ = "MIT"

cortex/config.py
@@ -74,7 +74,7 @@ class InferenceConfig(BaseModel):
  top_p: float = Field(default=0.95, ge=0.0, le=1.0)
  top_k: int = Field(default=40, ge=0)
  repetition_penalty: float = Field(default=1.1, ge=0.0, le=2.0)
- max_tokens: int = Field(default=2048, ge=1)
+ max_tokens: int = Field(default=4096, ge=1)
  stream_output: bool = True
  seed: int = Field(default=-1)

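For context on the `max_tokens` change: `InferenceConfig` is a pydantic model, so `Field(default=4096, ge=1)` raises the default while keeping the lower-bound validation; configurations that set `max_tokens` explicitly are unaffected. A small self-contained sketch of that behavior, assuming pydantic v2 (per the `pydantic>=2.5.0` requirement) and a hypothetical model name:

```python
from pydantic import BaseModel, Field, ValidationError

class InferenceConfigSketch(BaseModel):
    # Mirrors the changed line: default raised from 2048 to 4096, still must be >= 1
    max_tokens: int = Field(default=4096, ge=1)

print(InferenceConfigSketch().max_tokens)        # 4096
print(InferenceConfigSketch(max_tokens=2048).max_tokens)  # explicit values win
try:
    InferenceConfigSketch(max_tokens=0)          # rejected by the ge=1 constraint
except ValidationError as exc:
    print(exc.error_count(), "validation error")
```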
cortex/inference_engine.py
@@ -138,7 +138,7 @@ class InferenceEngine:
  use_fp16=True,
  use_channels_last=True,
  optimize_memory=True,
- max_batch_size=self.config.performance.batch_size
+ max_batch_size=self.config.performance.max_batch_size
  )
  self.mps_optimizer = MPSOptimizer(mps_config)

@@ -153,7 +153,7 @@ class InferenceEngine:
  fuse_operations=True,
  lazy_evaluation=True,
  rotating_kv_cache=True,
- kv_cache_size=self.config.model.context_length if hasattr(self.config.model, 'context_length') else 4096,
+ kv_cache_size=self.config.performance.context_length,
  quantization_bits=4
  )
  self.mlx_accelerator = MLXAccelerator(mlx_config)
@@ -204,6 +204,9 @@ class InferenceEngine:
  yield from self._generate_pytorch(model, tokenizer, request)
  elif model_info.format == ModelFormat.SAFETENSORS:
  yield from self._generate_safetensors(model, tokenizer, request)
+ elif model_info.format == ModelFormat.QUANTIZED:
+ # Quantized models are loaded as PyTorch-compatible modules
+ yield from self._generate_pytorch(model, tokenizer, request)
  elif model_info.format == ModelFormat.GGUF:
  yield from self._generate_gguf(model, tokenizer, request)
  else:
@@ -401,7 +404,18 @@
  last_metrics_update = time.time()

  try:
- device = torch.device("mps")
+ # Use the model's device when available (quantized models may be CPU-only on macOS)
+ device = None
+ try:
+ first_param = next(model.parameters())
+ device = first_param.device
+ except Exception:
+ device = None
+
+ if device is None or str(device) == "meta":
+ device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
+ elif device.type == "mps" and not torch.backends.mps.is_available():
+ device = torch.device("cpu")

  inputs = tokenizer(request.prompt, return_tensors="pt").to(device)

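Pulled out of context, the device-selection fallback this hunk introduces (replacing a hard-coded `torch.device("mps")`) looks roughly like the sketch below, assuming a standard `torch.nn.Module`; the helper name is illustrative:

```python
import torch
import torch.nn as nn

def resolve_generation_device(model: nn.Module) -> torch.device:
    """Pick the device generation should run on, falling back safely:
    prefer where the model's parameters already live, treat 'meta' as
    unresolved, and only use MPS when the backend is actually available."""
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = None  # parameterless module, e.g. a wrapper

    if device is None or device.type == "meta":
        return torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    if device.type == "mps" and not torch.backends.mps.is_available():
        return torch.device("cpu")
    return device
```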
cortex/model_manager.py
@@ -133,7 +133,8 @@ class ModelManager:
  self.quantizer = DynamicQuantizer(QuantizationConfig(
  mode=QuantizationMode.DYNAMIC,
  per_channel=True,
- cache_quantized=True
+ cache_quantized=True,
+ cache_dir=self.config.model.quantization_cache
  ))

  # Initialize MLX converter for native conversion
@@ -201,6 +202,39 @@
  level = getattr(self.config.gpu, "gpu_optimization_level", "maximum")
  level = str(level).lower().strip()
  return level in {"maximum", "max", "speed", "fast", "performance"}
+
+ def _get_default_quant_recipe(self) -> Optional[QuantizationRecipe]:
+ """Map configured default_quantization to an MLX quantization recipe."""
+ raw = getattr(self.config.model, "default_quantization", "") or ""
+ value = str(raw).strip().lower()
+ if not value or value == "auto":
+ return None
+
+ mapping = {
+ "q4_k_m": QuantizationRecipe.SPEED_4BIT,
+ "q5_k_m": QuantizationRecipe.BALANCED_5BIT,
+ "q6_k": QuantizationRecipe.QUALITY_8BIT, # closest available MLX recipe
+ "q8_0": QuantizationRecipe.QUALITY_8BIT,
+ "4bit": QuantizationRecipe.SPEED_4BIT,
+ "5bit": QuantizationRecipe.BALANCED_5BIT,
+ "8bit": QuantizationRecipe.QUALITY_8BIT,
+ "mixed": QuantizationRecipe.MIXED_PRECISION,
+ "none": QuantizationRecipe.NONE,
+ }
+
+ recipe = mapping.get(value)
+ if recipe is None:
+ logger.warning("Unknown default_quantization value: %s", raw)
+ return None
+
+ supported = getattr(self.config.model, "supported_quantizations", None)
+ if supported:
+ supported_norm = {str(s).strip().lower() for s in supported}
+ if value.startswith("q") and value not in supported_norm:
+ logger.warning("default_quantization '%s' not in supported_quantizations", raw)
+ return None
+
+ return recipe

  def load_model(
  self,
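The new `_get_default_quant_recipe` helper lets a GGUF-style name in `config.yaml` (for example `q4_k_m`) select an MLX recipe, while an empty or `auto` value defers to the size-based heuristic. A stripped-down sketch of the same lookup, using a hypothetical stand-in enum rather than the real `QuantizationRecipe`:

```python
from enum import Enum
from typing import Optional

class Recipe(Enum):  # hypothetical stand-in for QuantizationRecipe
    SPEED_4BIT = "4bit"
    BALANCED_5BIT = "5bit"
    QUALITY_8BIT = "8bit"

_ALIASES = {
    "q4_k_m": Recipe.SPEED_4BIT,
    "q5_k_m": Recipe.BALANCED_5BIT,
    "q8_0": Recipe.QUALITY_8BIT,
}

def resolve_default(value: str) -> Optional[Recipe]:
    """Return None for ''/'auto' so the size-based default stays in charge."""
    value = value.strip().lower()
    if value in ("", "auto"):
        return None
    return _ALIASES.get(value)

assert resolve_default("Q5_K_M") is Recipe.BALANCED_5BIT
assert resolve_default("auto") is None
```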
@@ -374,6 +408,10 @@
  except Exception as e:
  logger.warning(f"Could not estimate model parameters: {e}, defaulting to 4-bit")
  quant_recipe = QuantizationRecipe.SPEED_4BIT # Fallback
+
+ default_recipe = self._get_default_quant_recipe()
+ if default_recipe is not None:
+ quant_recipe = default_recipe

  if quantization:
  quant_map = {
@@ -452,6 +490,10 @@
  else:
  quant_recipe = QuantizationRecipe.SPEED_4BIT # Default for larger models

+ default_recipe = self._get_default_quant_recipe()
+ if default_recipe is not None:
+ quant_recipe = default_recipe
+
  if quantization:
  quant_map = {
  "4bit": QuantizationRecipe.SPEED_4BIT,
@@ -563,6 +605,8 @@
  )

  if not can_load and can_apply_quantization:
+ if not getattr(self.config.model, "auto_quantize", True):
+ return False, f"GPU incompatible: {message} (auto_quantize disabled)"
  # Check if quantization would help
  gpu_status = self.gpu_validator.get_gpu_memory_status()
  available_gb = gpu_status['available_gb']
cortex/quantization/dynamic_quantizer.py
@@ -3,7 +3,7 @@
  import torch
  import torch.nn as nn
  from typing import Dict, Any, Optional, Tuple, Union
- from dataclasses import dataclass
+ from dataclasses import dataclass, field
  from enum import Enum
  import gc
  from pathlib import Path
@@ -40,6 +40,7 @@ class QuantizationConfig:
  cache_quantized: bool = True # Cache quantized models to disk
  compress_cache: bool = False # Compress cached models (slower but smaller)
  validate_quantization: bool = True # Validate quantized models work correctly
+ cache_dir: Path = field(default_factory=lambda: Path.home() / ".cortex" / "quantized_models")

  def to_dict(self) -> Dict[str, Any]:
  """Convert to dictionary for serialization."""
@@ -118,6 +119,8 @@ class DynamicQuantizer:
  def __init__(self, config: Optional[QuantizationConfig] = None):
  """Initialize quantizer with configuration."""
  self.config = config or QuantizationConfig()
+ self.config.cache_dir = Path(self.config.cache_dir).expanduser()
+ self.config.cache_dir.mkdir(parents=True, exist_ok=True)
  self.device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
  self._quantization_cache: Dict[str, Dict[str, Any]] = {}

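The two added `__init__` lines make the configurable cache directory safe to use however it was supplied: `~` is expanded and the directory is created up front. A minimal standalone sketch of the same pattern (class and variable names are illustrative):

```python
from dataclasses import dataclass, field
from pathlib import Path

@dataclass
class CacheConfig:  # illustrative stand-in for QuantizationConfig
    cache_dir: Path = field(default_factory=lambda: Path.home() / ".cortex" / "quantized_models")

cfg = CacheConfig(cache_dir=Path("~/custom/quant_cache"))
cfg.cache_dir = Path(cfg.cache_dir).expanduser()   # "~" -> absolute home path
cfg.cache_dir.mkdir(parents=True, exist_ok=True)   # ensure the directory exists before caching
print(cfg.cache_dir)
```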
@@ -681,10 +684,10 @@

  # Generate cache key including model metadata
  cache_key = hashlib.md5(
- f"{model_path}_{model_mtime}_{model_size}_{json.dumps(quantization_info)}".encode()
+ f"{model_path}_{model_mtime}_{model_size}_{json.dumps(self.config.to_dict())}".encode()
  ).hexdigest()

- cache_dir = Path.home() / ".cortex" / "quantized_cache"
+ cache_dir = self.config.cache_dir
  cache_dir.mkdir(parents=True, exist_ok=True)

  cache_path = cache_dir / f"{cache_key}.pt"
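With this change the cache key is derived from the quantizer's own configuration rather than a loose `quantization_info` dict, so changing any quantization setting produces a different cache file. A sketch of the keying scheme (the function name is illustrative):

```python
import hashlib
import json
from pathlib import Path
from typing import Any, Dict

def quantized_cache_path(model_path: Path, cache_dir: Path, config_dict: Dict[str, Any]) -> Path:
    """Fold the model's identity (path, mtime, size) and the quantization
    settings into one MD5 key; any settings change invalidates the cache."""
    stat = model_path.stat()
    key = hashlib.md5(
        f"{model_path}_{stat.st_mtime}_{stat.st_size}_{json.dumps(config_dict)}".encode()
    ).hexdigest()
    return cache_dir / f"{key}.pt"
```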
@@ -723,7 +726,7 @@
  f"{model_path}_{model_mtime}_{model_size}_{json.dumps(config.to_dict())}".encode()
  ).hexdigest()

- cache_path = Path.home() / ".cortex" / "quantized_cache" / f"{cache_key}.pt"
+ cache_path = Path(self.config.cache_dir) / f"{cache_key}.pt"

  if cache_path.exists():
  try:
@@ -733,4 +736,4 @@
  # Cache corrupted, will re-quantize
  cache_path.unlink()

- return None
+ return None
cortex/ui/cli.py
@@ -30,7 +30,7 @@ from cortex.conversation_manager import ConversationManager, MessageRole
  from cortex.model_downloader import ModelDownloader
  from cortex.template_registry import TemplateRegistry
  from cortex.fine_tuning import FineTuneWizard
- from cortex.ui.markdown_render import ThinkMarkdown, PrefixedRenderable
+ from cortex.ui.markdown_render import ThinkMarkdown, PrefixedRenderable, render_plain_with_think


  class CortexCLI:
@@ -1135,15 +1135,15 @@
  logger.debug(f"Could not get stop sequences: {e}")

  # Create generation request with formatted prompt
- # Use lower temperature for more focused responses
  request = GenerationRequest(
  prompt=formatted_prompt,
  max_tokens=self.config.inference.max_tokens,
- temperature=0.3, # Lower temperature for less randomness
- top_p=0.9, # Slightly lower top_p
+ temperature=self.config.inference.temperature,
+ top_p=self.config.inference.top_p,
  top_k=self.config.inference.top_k,
  repetition_penalty=self.config.inference.repetition_penalty,
- stream=True,
+ stream=self.config.inference.stream_output,
+ seed=self.config.inference.seed if self.config.inference.seed >= 0 else None,
  stop_sequences=stop_sequences
  )

@@ -1167,50 +1167,65 @@
  prefix_style = Style(color="cyan")

  def build_renderable(text: str):
- markdown = ThinkMarkdown(text, code_theme="monokai", use_line_numbers=False)
- return PrefixedRenderable(markdown, prefix="⏺ ", prefix_style=prefix_style, indent=" ")
+ if getattr(self.config.ui, "markdown_rendering", True):
+ markdown = ThinkMarkdown(
+ text,
+ code_theme="monokai",
+ use_line_numbers=False,
+ syntax_highlighting=getattr(self.config.ui, "syntax_highlighting", True),
+ )
+ renderable = markdown
+ else:
+ renderable = render_plain_with_think(text)

- with Live(
- build_renderable(""),
- console=self.console,
- refresh_per_second=20,
- transient=False,
- ) as live:
- for token in self.inference_engine.generate(request):
- if first_token_time is None:
- first_token_time = time.time()
+ return PrefixedRenderable(renderable, prefix="⏺ ", prefix_style=prefix_style, indent=" ")

- generated_text += token
- token_count += 1
+ original_console_width = self.console._width
+ target_width = max(40, int(self.get_terminal_width() * 0.75))
+ self.console.width = target_width
+ try:
+ with Live(
+ build_renderable(""),
+ console=self.console,
+ auto_refresh=False,
+ refresh_per_second=20,
+ transient=False,
+ vertical_overflow="visible",
+ ) as live:
+ for token in self.inference_engine.generate(request):
+ if first_token_time is None:
+ first_token_time = time.time()

- display_token = token
- if uses_reasoning_template and template_profile and template_profile.supports_streaming():
- display_token, should_display = template_profile.process_streaming_response(
- token, accumulated_response
- )
- accumulated_response += token
- if not should_display:
- display_token = ""
+ generated_text += token
+ token_count += 1
+
+ display_token = token
+ if uses_reasoning_template and template_profile and template_profile.supports_streaming():
+ display_token, should_display = template_profile.process_streaming_response(
+ token, accumulated_response
+ )
+ accumulated_response += token
+ if not should_display:
+ display_token = ""

- if display_token:
- display_text += display_token
+ if display_token:
+ display_text += display_token

- now = time.time()
- if display_token and ("\n" in display_token or now - last_render_time >= render_interval):
- live.update(build_renderable(display_text))
- last_render_time = now
+ now = time.time()
+ if display_token and ("\n" in display_token or now - last_render_time >= render_interval):
+ live.update(build_renderable(display_text), refresh=True)
+ last_render_time = now

- if uses_reasoning_template and template_profile:
- final_text = template_profile.process_response(generated_text)
- generated_text = final_text
- if not template_profile.config.show_reasoning:
- display_text = final_text
+ if uses_reasoning_template and template_profile:
+ final_text = template_profile.process_response(generated_text)
+ generated_text = final_text
+ if not template_profile.config.show_reasoning:
+ display_text = final_text

- live.update(build_renderable(display_text))
+ live.update(build_renderable(display_text), refresh=True)
+ finally:
+ self.console._width = original_console_width

- # Add blank line for spacing between response and metrics
- print()
-
  # Display final metrics in a clean, professional way
  elapsed = time.time() - start_time
  if token_count > 0 and elapsed > 0:
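The restructured streaming loop switches rich's `Live` display to manual refreshes (`auto_refresh=False` plus `live.update(..., refresh=True)`) and temporarily narrows the console so output wraps at roughly 75% of the terminal width, restoring the original width in `finally`. A condensed sketch of that pattern outside Cortex, where the token list stands in for the streaming generator (note it pokes the same private `_width` attribute the diff uses for restoration):

```python
from rich.console import Console
from rich.live import Live
from rich.markdown import Markdown

console = Console()
original_width = console._width                       # saved so the clamp can be undone
console.width = max(40, int(console.size.width * 0.75))
try:
    with Live(Markdown(""), console=console, auto_refresh=False,
              transient=False, vertical_overflow="visible") as live:
        text = ""
        for token in ["Streaming ", "tokens ", "render ", "incrementally."]:
            text += token
            live.update(Markdown(text), refresh=True)  # explicit refresh since auto_refresh is off
finally:
    console._width = original_width                    # undo the temporary width clamp
```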
@@ -1238,6 +1253,9 @@
  metrics_line = " · ".join(metrics_parts)
  print(f" \033[2m{metrics_line}\033[0m")

+ if token_count >= request.max_tokens:
+ print(f" \033[2m(output truncated at max_tokens={request.max_tokens}; increase in config.yaml)\033[0m")
+
  # Add assistant message to conversation history
  self.conversation_manager.add_message(MessageRole.ASSISTANT, generated_text)