cortex_llm-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. cortex/__init__.py +73 -0
  2. cortex/__main__.py +83 -0
  3. cortex/config.py +329 -0
  4. cortex/conversation_manager.py +468 -0
  5. cortex/fine_tuning/__init__.py +8 -0
  6. cortex/fine_tuning/dataset.py +332 -0
  7. cortex/fine_tuning/mlx_lora_trainer.py +502 -0
  8. cortex/fine_tuning/trainer.py +957 -0
  9. cortex/fine_tuning/wizard.py +707 -0
  10. cortex/gpu_validator.py +467 -0
  11. cortex/inference_engine.py +727 -0
  12. cortex/metal/__init__.py +275 -0
  13. cortex/metal/gpu_validator.py +177 -0
  14. cortex/metal/memory_pool.py +886 -0
  15. cortex/metal/mlx_accelerator.py +678 -0
  16. cortex/metal/mlx_converter.py +638 -0
  17. cortex/metal/mps_optimizer.py +417 -0
  18. cortex/metal/optimizer.py +665 -0
  19. cortex/metal/performance_profiler.py +364 -0
  20. cortex/model_downloader.py +130 -0
  21. cortex/model_manager.py +2187 -0
  22. cortex/quantization/__init__.py +5 -0
  23. cortex/quantization/dynamic_quantizer.py +736 -0
  24. cortex/template_registry/__init__.py +15 -0
  25. cortex/template_registry/auto_detector.py +144 -0
  26. cortex/template_registry/config_manager.py +234 -0
  27. cortex/template_registry/interactive.py +260 -0
  28. cortex/template_registry/registry.py +347 -0
  29. cortex/template_registry/template_profiles/__init__.py +5 -0
  30. cortex/template_registry/template_profiles/base.py +142 -0
  31. cortex/template_registry/template_profiles/complex/__init__.py +5 -0
  32. cortex/template_registry/template_profiles/complex/reasoning.py +263 -0
  33. cortex/template_registry/template_profiles/standard/__init__.py +9 -0
  34. cortex/template_registry/template_profiles/standard/alpaca.py +73 -0
  35. cortex/template_registry/template_profiles/standard/chatml.py +82 -0
  36. cortex/template_registry/template_profiles/standard/gemma.py +103 -0
  37. cortex/template_registry/template_profiles/standard/llama.py +87 -0
  38. cortex/template_registry/template_profiles/standard/simple.py +65 -0
  39. cortex/ui/__init__.py +120 -0
  40. cortex/ui/cli.py +1685 -0
  41. cortex/ui/markdown_render.py +185 -0
  42. cortex/ui/terminal_app.py +534 -0
  43. cortex_llm-1.0.0.dist-info/METADATA +275 -0
  44. cortex_llm-1.0.0.dist-info/RECORD +48 -0
  45. cortex_llm-1.0.0.dist-info/WHEEL +5 -0
  46. cortex_llm-1.0.0.dist-info/entry_points.txt +2 -0
  47. cortex_llm-1.0.0.dist-info/licenses/LICENSE +21 -0
  48. cortex_llm-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,364 @@
+ """Performance profiler for GPU operations with Metal/MPS."""
+
+ import time
+ import json
+ import psutil
+ import subprocess
+ from pathlib import Path
+ from typing import Dict, Any, Optional, List, Tuple, Callable
+ from dataclasses import dataclass, asdict
+ from datetime import datetime
+ from collections import deque
+ import threading
+
+ @dataclass
+ class ProfileResult:
+     """Result from a profiling session."""
+     operation_name: str
+     start_time: datetime
+     end_time: datetime
+     duration_ms: float
+     gpu_utilization: float
+     memory_used_mb: float
+     memory_bandwidth_gb: float
+     tokens_per_second: float
+     flops: float
+     metadata: Dict[str, Any]
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary."""
+         result = asdict(self)
+         result['start_time'] = self.start_time.isoformat()
+         result['end_time'] = self.end_time.isoformat()
+         return result
+
+ class PerformanceProfiler:
+     """Profile GPU performance for Metal/MPS operations."""
+
+     def __init__(self, sample_interval: float = 0.1):
+         """
+         Initialize performance profiler.
+
+         Args:
+             sample_interval: Sampling interval in seconds
+         """
+         self.sample_interval = sample_interval
+         self.results: List[ProfileResult] = []
+         self.current_profile: Optional[ProfileResult] = None
+         self._monitoring = False
+         self._monitor_thread: Optional[threading.Thread] = None
+         self._gpu_samples: deque = deque(maxlen=1000)
+         self._memory_samples: deque = deque(maxlen=1000)
+
+     def start_profiling(
+         self,
+         operation_name: str,
+         metadata: Optional[Dict[str, Any]] = None
+     ) -> None:
+         """Start profiling an operation."""
+         self.current_profile = ProfileResult(
+             operation_name=operation_name,
+             start_time=datetime.now(),
+             end_time=datetime.now(),
+             duration_ms=0,
+             gpu_utilization=0,
+             memory_used_mb=0,
+             memory_bandwidth_gb=0,
+             tokens_per_second=0,
+             flops=0,
+             metadata=metadata or {}
+         )
+
+         self._start_monitoring()
+
+     def get_current_metrics(self) -> Optional[ProfileResult]:
+         """Get current metrics without stopping the profiling session."""
+         if not self.current_profile:
+             return None
+
+         current_time = datetime.now()
+         duration_ms = (current_time - self.current_profile.start_time).total_seconds() * 1000
+
+         gpu_utilization = 0
+         if self._gpu_samples:
+             # Average the last 10 samples for smoother metrics
+             recent_samples = list(self._gpu_samples)[-10:]
+             gpu_utilization = sum(recent_samples) / len(recent_samples)
+
+         memory_used_mb = 0
+         if self._memory_samples:
+             recent_memory = list(self._memory_samples)[-10:]
+             memory_used_mb = sum(recent_memory) / len(recent_memory)
+
+         return ProfileResult(
+             operation_name=self.current_profile.operation_name,
+             start_time=self.current_profile.start_time,
+             end_time=current_time,
+             duration_ms=duration_ms,
+             gpu_utilization=gpu_utilization,
+             memory_used_mb=memory_used_mb,
+             memory_bandwidth_gb=0,
+             tokens_per_second=0,
+             flops=0,
+             metadata=self.current_profile.metadata
+         )
+
+     def stop_profiling(self) -> ProfileResult:
+         """Stop profiling and return results."""
+         if not self.current_profile:
+             raise RuntimeError("No profiling session active")
+
+         self._stop_monitoring()
+
+         self.current_profile.end_time = datetime.now()
+         self.current_profile.duration_ms = (
+             self.current_profile.end_time - self.current_profile.start_time
+         ).total_seconds() * 1000
+
+         if self._gpu_samples:
+             self.current_profile.gpu_utilization = sum(self._gpu_samples) / len(self._gpu_samples)
+
+         if self._memory_samples:
+             self.current_profile.memory_used_mb = max(self._memory_samples)
+
+         self.results.append(self.current_profile)
+         result = self.current_profile
+         self.current_profile = None
+
+         return result
+
+     def _start_monitoring(self) -> None:
+         """Start GPU monitoring thread."""
+         self._monitoring = True
+         self._gpu_samples.clear()
+         self._memory_samples.clear()
+
+         self._monitor_thread = threading.Thread(target=self._monitor_loop)
+         self._monitor_thread.daemon = True
+         self._monitor_thread.start()
+
+     def _stop_monitoring(self) -> None:
+         """Stop GPU monitoring thread."""
+         self._monitoring = False
+         if self._monitor_thread:
+             self._monitor_thread.join(timeout=1.0)
+             self._monitor_thread = None
+
+     def _monitor_loop(self) -> None:
+         """Monitoring loop for GPU metrics."""
+         while self._monitoring:
+             try:
+                 gpu_util = self._get_gpu_utilization()
+                 memory_mb = self._get_memory_usage()
+
+                 self._gpu_samples.append(gpu_util)
+                 self._memory_samples.append(memory_mb)
+
+                 time.sleep(self.sample_interval)
+             except Exception:
+                 # Sampling failures are non-fatal; drop the sample and keep polling
+                 pass
+
+     def _get_gpu_utilization(self) -> float:
+         """Get current GPU utilization percentage."""
+         try:
+             result = subprocess.run(
+                 ["ioreg", "-l", "-w", "0"],
+                 capture_output=True,
+                 text=True,
+                 timeout=1
+             )
+
+             lines = result.stdout.split('\n')
+             for line in lines:
+                 if "PercentGPUUtilization" in line:
+                     parts = line.split('=')
+                     if len(parts) > 1:
+                         return float(parts[1].strip())
+
+             # Fallback heuristic: if ioreg reports nothing, scale CPU load
+             # as a rough proxy for GPU activity
+             cpu_percent = psutil.cpu_percent(interval=0.1)
+             return min(cpu_percent * 1.5, 100.0)
+
+         except Exception:
+             return 0.0
+
+     def _get_memory_usage(self) -> float:
+         """Get current memory usage in MB."""
+         try:
+             vm = psutil.virtual_memory()
+             return vm.used / (1024 * 1024)
+         except Exception:
+             return 0.0
+
+     def profile_operation(
+         self,
+         operation: Callable,
+         operation_name: str,
+         args: tuple = (),
+         kwargs: Optional[dict] = None,
+         warmup_runs: int = 3,
+         profile_runs: int = 10
+     ) -> ProfileResult:
+         """
+         Profile a specific operation.
+
+         Args:
+             operation: Operation to profile
+             operation_name: Name for the operation
+             args: Arguments for operation
+             kwargs: Keyword arguments for operation
+             warmup_runs: Number of warmup runs
+             profile_runs: Number of profiling runs
+
+         Returns:
+             Profile result
+         """
+         kwargs = kwargs or {}
+
+         for _ in range(warmup_runs):
+             operation(*args, **kwargs)
+
+         self.start_profiling(operation_name, {
+             "warmup_runs": warmup_runs,
+             "profile_runs": profile_runs
+         })
+
+         start = time.perf_counter()
+         for _ in range(profile_runs):
+             operation(*args, **kwargs)
+         end = time.perf_counter()
+
+         result = self.stop_profiling()
+
+         # Report the average per-run time rather than total wall-clock time
+         avg_time = (end - start) / profile_runs
+         result.duration_ms = avg_time * 1000
+
+         return result
+
+     def compare_operations(
+         self,
+         operations: List[Tuple[Callable, str]],
+         args: tuple = (),
+         kwargs: Optional[dict] = None
+     ) -> Dict[str, ProfileResult]:
+         """
+         Compare performance of multiple operations.
+
+         Args:
+             operations: List of (operation, name) tuples
+             args: Common arguments
+             kwargs: Common keyword arguments
+
+         Returns:
+             Dictionary of results by operation name
+         """
+         results = {}
+
+         for operation, name in operations:
+             result = self.profile_operation(
+                 operation, name, args, kwargs
+             )
+             results[name] = result
+
+         return results
+
+     def profile_model_inference(
+         self,
+         model: Any,
+         input_data: Any,
+         num_iterations: int = 100
+     ) -> ProfileResult:
+         """Profile model inference performance."""
+         def inference():
+             return model(input_data)
+
+         return self.profile_operation(
+             inference,
+             "model_inference",
+             warmup_runs=5,
+             profile_runs=num_iterations
+         )
+
+     def estimate_flops(
+         self,
+         operation_type: str,
+         input_shape: Tuple[int, ...],
+         duration_ms: float
+     ) -> float:
+         """Estimate FLOPS for an operation."""
+         # Each entry guards against shapes with too few dimensions
+         flops_map = {
+             "matmul": lambda shape: 2 * shape[0] * shape[1] * shape[2] if len(shape) >= 3 else 0,
+             "attention": lambda shape: 4 * shape[0] * shape[1] * shape[1] * shape[2] if len(shape) >= 3 else 0,
+             "layernorm": lambda shape: 3 * shape[0] * shape[1] if len(shape) >= 2 else 0,
+             "gelu": lambda shape: 10 * shape[0] * shape[1] if len(shape) >= 2 else 0,
+             "softmax": lambda shape: 3 * shape[0] * shape[1] if len(shape) >= 2 else 0
+         }
+
+         if operation_type in flops_map and duration_ms > 0:
+             total_flops = flops_map[operation_type](input_shape)
+             return total_flops / (duration_ms / 1000)
+
+         return 0.0
+
+     def generate_report(self) -> Dict[str, Any]:
+         """Generate performance report from all results."""
+         if not self.results:
+             return {"error": "No profiling results available"}
+
+         total_time = sum(r.duration_ms for r in self.results)
+         avg_gpu = sum(r.gpu_utilization for r in self.results) / len(self.results)
+         peak_memory = max(r.memory_used_mb for r in self.results)
+
+         operations_summary = []
+         for result in self.results:
+             operations_summary.append({
+                 "name": result.operation_name,
+                 "duration_ms": result.duration_ms,
+                 "gpu_utilization": result.gpu_utilization,
+                 "memory_mb": result.memory_used_mb
+             })
+
+         return {
+             "total_operations": len(self.results),
+             "total_time_ms": total_time,
+             "average_gpu_utilization": avg_gpu,
+             "peak_memory_mb": peak_memory,
+             "operations": operations_summary,
+             "timestamp": datetime.now().isoformat()
+         }
+
+     def save_results(self, filepath: Path) -> None:
+         """Save profiling results to JSON file."""
+         report = self.generate_report()
+
+         with open(filepath, 'w') as f:
+             json.dump(report, f, indent=2)
+
+     def clear_results(self) -> None:
+         """Clear all profiling results."""
+         self.results.clear()
+         self._gpu_samples.clear()
+         self._memory_samples.clear()
+
+     def get_optimization_suggestions(self) -> List[str]:
+         """Get optimization suggestions based on profiling results."""
+         suggestions = []
+
+         if not self.results:
+             return ["No profiling data available"]
+
+         avg_gpu = sum(r.gpu_utilization for r in self.results) / len(self.results)
+
+         if avg_gpu < 50:
+             suggestions.append("Low GPU utilization - consider increasing batch size")
+
+         if avg_gpu > 95:
+             suggestions.append("Very high GPU utilization - may be throttling")
+
+         peak_memory = max(r.memory_used_mb for r in self.results)
+         if peak_memory > 18000:  # 18 GB threshold for high-memory systems
+             suggestions.append("High memory usage - consider model quantization")
+
+         slow_ops = [r for r in self.results if r.duration_ms > 100]
+         if slow_ops:
+             suggestions.append(f"Found {len(slow_ops)} slow operations (>100ms)")
+
+         return suggestions if suggestions else ["Performance looks optimal"]
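
Taken together, profile_operation bundles the measurement flow (warmup runs, a timed loop, background GPU/memory sampling, an averaged per-run duration), and estimate_flops converts that duration into achieved FLOPS using a per-operation cost model (2*m*k*n for a matmul). A minimal sketch of how the two compose; the NumPy workload and the matrix shapes are illustrative assumptions, not part of the package:

    # Sketch only: NumPy and the chosen shapes are assumptions for illustration.
    import numpy as np
    from cortex.metal.performance_profiler import PerformanceProfiler

    m, k, n = 1024, 1024, 1024
    a = np.random.rand(m, k).astype(np.float32)
    b = np.random.rand(k, n).astype(np.float32)

    profiler = PerformanceProfiler(sample_interval=0.05)

    # 3 warmup runs, then the average over 10 timed runs (the defaults)
    result = profiler.profile_operation(lambda: a @ b, "matmul_1024")
    print(f"avg: {result.duration_ms:.2f} ms, GPU: {result.gpu_utilization:.0f}%")

    # 2*m*k*n floating-point ops per matmul, divided by the measured time
    gflops = profiler.estimate_flops("matmul", (m, k, n), result.duration_ms) / 1e9
    print(f"~{gflops:.1f} GFLOPS")

    print(profiler.get_optimization_suggestions())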
@@ -0,0 +1,130 @@
+ """Simple model downloader for Cortex."""
+
+ import os
+ import sys
+ from pathlib import Path
+ from typing import Optional, Tuple
+ import requests
+
+ try:
+     from huggingface_hub import snapshot_download, hf_hub_download, HfApi
+     from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
+     HF_HUB_AVAILABLE = True
+ except ImportError:
+     HF_HUB_AVAILABLE = False
+
+
+ class ModelDownloader:
+     """Simple model downloader from HuggingFace."""
+
+     def __init__(self, model_path: Path):
+         """Initialize downloader with model directory."""
+         self.model_path = Path(model_path).expanduser().resolve()
+         self.model_path.mkdir(parents=True, exist_ok=True)
+
+     def check_auth_status(self) -> Tuple[bool, Optional[str]]:
+         """Check if user is authenticated with HuggingFace.
+
+         Returns:
+             Tuple of (is_authenticated, username)
+         """
+         if not HF_HUB_AVAILABLE:
+             return False, None
+
+         try:
+             api = HfApi()
+             user_info = api.whoami()
+             if user_info:
+                 return True, user_info.get('name', 'Unknown')
+         except Exception:
+             # Any failure (offline, no token) is treated as unauthenticated
+             pass
+
+         return False, None
+
+     def download_model(self, repo_id: str, filename: Optional[str] = None) -> Tuple[bool, str, Optional[Path]]:
+         """
+         Download a model from HuggingFace.
+
+         Args:
+             repo_id: HuggingFace repository ID (e.g., "meta-llama/Llama-3.1-8B-Instruct")
+             filename: Optional specific file to download (for GGUF models)
+
+         Returns:
+             Tuple of (success, message, local_path)
+         """
+         if not HF_HUB_AVAILABLE:
+             return False, "huggingface-hub not installed. Install with: pip install huggingface-hub", None
+
+         try:
+             if filename:
+                 # Download single file
+                 print(f"Downloading {filename} from {repo_id}...")
+                 local_path = self.model_path / filename
+
+                 if local_path.exists():
+                     return False, f"File already exists: {local_path}", local_path
+
+                 downloaded_path = hf_hub_download(
+                     repo_id=repo_id,
+                     filename=filename,
+                     local_dir=self.model_path
+                     # Downloads always resume when possible by default
+                 )
+
+                 return True, f"Downloaded to {local_path}", Path(downloaded_path)
+
+             else:
+                 # Download entire repository
+                 model_name = repo_id.split('/')[-1]
+                 local_path = self.model_path / model_name
+
+                 print(f"Downloading repository {repo_id}...")
+
+                 if local_path.exists() and any(local_path.iterdir()):
+                     return False, f"Model already exists: {local_path}", local_path
+
+                 downloaded_path = snapshot_download(
+                     repo_id=repo_id,
+                     local_dir=local_path
+                     # Downloads always resume when possible by default
+                 )
+
+                 return True, f"Downloaded to {local_path}", local_path
+
+         except GatedRepoError:
+             # Check if user is logged in
+             is_auth, username = self.check_auth_status()
+             if is_auth:
+                 return False, f"Model {repo_id} is gated. You're logged in as {username} but may need to accept the model's license agreement at https://huggingface.co/{repo_id}", None
+             else:
+                 return False, f"Model {repo_id} requires authentication. Please use /login command to authenticate with HuggingFace", None
+         except RepositoryNotFoundError:
+             return False, f"Repository {repo_id} not found on HuggingFace", None
+         except Exception as e:
+             return False, f"Download failed: {str(e)}", None
+
+     def list_downloaded_models(self) -> list:
+         """List all downloaded models."""
+         models = []
+
+         if not self.model_path.exists():
+             return models
+
+         for item in self.model_path.iterdir():
+             if item.is_file() and item.suffix in ['.gguf', '.ggml', '.bin']:
+                 size_gb = item.stat().st_size / (1024**3)
+                 models.append({
+                     'name': item.name,
+                     'path': str(item),
+                     'size_gb': round(size_gb, 2)
+                 })
+             elif item.is_dir() and any(item.iterdir()):
+                 total_size = sum(f.stat().st_size for f in item.rglob('*') if f.is_file())
+                 size_gb = total_size / (1024**3)
+                 models.append({
+                     'name': item.name,
+                     'path': str(item),
+                     'size_gb': round(size_gb, 2)
+                 })
+
+         return models
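
The downloader's flow is: construct with a target directory (expanded and created on init), optionally verify HuggingFace authentication, fetch either a single GGUF file or a full repository snapshot, then enumerate what landed on disk. A short sketch; the repository ID, filename, and local path are illustrative assumptions:

    # Sketch only: repo ID, filename, and path are illustrative.
    from pathlib import Path
    from cortex.model_downloader import ModelDownloader

    downloader = ModelDownloader(Path("~/models"))

    is_auth, username = downloader.check_auth_status()
    print(f"HF authenticated: {is_auth} ({username})")

    # Single-file download, as used for GGUF models
    ok, message, local_path = downloader.download_model(
        "TheBloke/Llama-2-7B-GGUF",
        filename="llama-2-7b.Q4_K_M.gguf",
    )
    print(message)

    for model in downloader.list_downloaded_models():
        print(f"{model['name']}: {model['size_gb']} GB")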