cortex_llm-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. cortex/__init__.py +73 -0
  2. cortex/__main__.py +83 -0
  3. cortex/config.py +329 -0
  4. cortex/conversation_manager.py +468 -0
  5. cortex/fine_tuning/__init__.py +8 -0
  6. cortex/fine_tuning/dataset.py +332 -0
  7. cortex/fine_tuning/mlx_lora_trainer.py +502 -0
  8. cortex/fine_tuning/trainer.py +957 -0
  9. cortex/fine_tuning/wizard.py +707 -0
  10. cortex/gpu_validator.py +467 -0
  11. cortex/inference_engine.py +727 -0
  12. cortex/metal/__init__.py +275 -0
  13. cortex/metal/gpu_validator.py +177 -0
  14. cortex/metal/memory_pool.py +886 -0
  15. cortex/metal/mlx_accelerator.py +678 -0
  16. cortex/metal/mlx_converter.py +638 -0
  17. cortex/metal/mps_optimizer.py +417 -0
  18. cortex/metal/optimizer.py +665 -0
  19. cortex/metal/performance_profiler.py +364 -0
  20. cortex/model_downloader.py +130 -0
  21. cortex/model_manager.py +2187 -0
  22. cortex/quantization/__init__.py +5 -0
  23. cortex/quantization/dynamic_quantizer.py +736 -0
  24. cortex/template_registry/__init__.py +15 -0
  25. cortex/template_registry/auto_detector.py +144 -0
  26. cortex/template_registry/config_manager.py +234 -0
  27. cortex/template_registry/interactive.py +260 -0
  28. cortex/template_registry/registry.py +347 -0
  29. cortex/template_registry/template_profiles/__init__.py +5 -0
  30. cortex/template_registry/template_profiles/base.py +142 -0
  31. cortex/template_registry/template_profiles/complex/__init__.py +5 -0
  32. cortex/template_registry/template_profiles/complex/reasoning.py +263 -0
  33. cortex/template_registry/template_profiles/standard/__init__.py +9 -0
  34. cortex/template_registry/template_profiles/standard/alpaca.py +73 -0
  35. cortex/template_registry/template_profiles/standard/chatml.py +82 -0
  36. cortex/template_registry/template_profiles/standard/gemma.py +103 -0
  37. cortex/template_registry/template_profiles/standard/llama.py +87 -0
  38. cortex/template_registry/template_profiles/standard/simple.py +65 -0
  39. cortex/ui/__init__.py +120 -0
  40. cortex/ui/cli.py +1685 -0
  41. cortex/ui/markdown_render.py +185 -0
  42. cortex/ui/terminal_app.py +534 -0
  43. cortex_llm-1.0.0.dist-info/METADATA +275 -0
  44. cortex_llm-1.0.0.dist-info/RECORD +48 -0
  45. cortex_llm-1.0.0.dist-info/WHEEL +5 -0
  46. cortex_llm-1.0.0.dist-info/entry_points.txt +2 -0
  47. cortex_llm-1.0.0.dist-info/licenses/LICENSE +21 -0
  48. cortex_llm-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,364 @@
+ """Performance profiler for GPU operations with Metal/MPS."""
+
+ import time
+ import json
+ import psutil
+ import subprocess
+ from pathlib import Path
+ from typing import Dict, Any, Optional, List, Tuple, Callable
+ from dataclasses import dataclass, asdict
+ from datetime import datetime
+ from collections import deque
+ import threading
+
+ @dataclass
+ class ProfileResult:
+     """Result from a profiling session."""
+     operation_name: str
+     start_time: datetime
+     end_time: datetime
+     duration_ms: float
+     gpu_utilization: float
+     memory_used_mb: float
+     memory_bandwidth_gb: float
+     tokens_per_second: float
+     flops: float
+     metadata: Dict[str, Any]
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary."""
+         result = asdict(self)
+         result['start_time'] = self.start_time.isoformat()
+         result['end_time'] = self.end_time.isoformat()
+         return result
+
+ class PerformanceProfiler:
+     """Profile GPU performance for Metal/MPS operations."""
+
+     def __init__(self, sample_interval: float = 0.1):
+         """
+         Initialize performance profiler.
+
+         Args:
+             sample_interval: Sampling interval in seconds
+         """
+         self.sample_interval = sample_interval
+         self.results: List[ProfileResult] = []
+         self.current_profile: Optional[ProfileResult] = None
+         self._monitoring = False
+         self._monitor_thread: Optional[threading.Thread] = None
+         self._gpu_samples: deque = deque(maxlen=1000)
+         self._memory_samples: deque = deque(maxlen=1000)
+
+     def start_profiling(
+         self,
+         operation_name: str,
+         metadata: Optional[Dict[str, Any]] = None
+     ) -> None:
+         """Start profiling an operation."""
+         self.current_profile = ProfileResult(
+             operation_name=operation_name,
+             start_time=datetime.now(),
+             end_time=datetime.now(),
+             duration_ms=0,
+             gpu_utilization=0,
+             memory_used_mb=0,
+             memory_bandwidth_gb=0,
+             tokens_per_second=0,
+             flops=0,
+             metadata=metadata or {}
+         )
+
+         self._start_monitoring()
+
+     def get_current_metrics(self) -> Optional[ProfileResult]:
+         """Get current metrics without stopping the profiling session."""
+         if not self.current_profile:
+             return None
+
+         current_time = datetime.now()
+         duration_ms = (current_time - self.current_profile.start_time).total_seconds() * 1000
+
+         gpu_utilization = 0
+         if self._gpu_samples:
+             # Average the last 10 samples for smoother metrics
+             recent_samples = list(self._gpu_samples)[-10:]
+             gpu_utilization = sum(recent_samples) / len(recent_samples)
+
+         memory_used_mb = 0
+         if self._memory_samples:
+             recent_memory = list(self._memory_samples)[-10:]
+             memory_used_mb = sum(recent_memory) / len(recent_memory)
+
+         return ProfileResult(
+             operation_name=self.current_profile.operation_name,
+             start_time=self.current_profile.start_time,
+             end_time=current_time,
+             duration_ms=duration_ms,
+             gpu_utilization=gpu_utilization,
+             memory_used_mb=memory_used_mb,
+             memory_bandwidth_gb=0,
+             tokens_per_second=0,
+             flops=0,
+             metadata=self.current_profile.metadata
+         )
+
+     def stop_profiling(self) -> ProfileResult:
+         """Stop profiling and return results."""
+         if not self.current_profile:
+             raise RuntimeError("No profiling session active")
+
+         self._stop_monitoring()
+
+         self.current_profile.end_time = datetime.now()
+         self.current_profile.duration_ms = (
+             self.current_profile.end_time - self.current_profile.start_time
+         ).total_seconds() * 1000
+
+         if self._gpu_samples:
+             self.current_profile.gpu_utilization = sum(self._gpu_samples) / len(self._gpu_samples)
+
+         if self._memory_samples:
+             self.current_profile.memory_used_mb = max(self._memory_samples)
+
+         self.results.append(self.current_profile)
+         result = self.current_profile
+         self.current_profile = None
+
+         return result
+
+     def _start_monitoring(self) -> None:
+         """Start GPU monitoring thread."""
+         self._monitoring = True
+         self._gpu_samples.clear()
+         self._memory_samples.clear()
+
+         self._monitor_thread = threading.Thread(target=self._monitor_loop)
+         self._monitor_thread.daemon = True
+         self._monitor_thread.start()
+
+     def _stop_monitoring(self) -> None:
+         """Stop GPU monitoring thread."""
+         self._monitoring = False
+         if self._monitor_thread:
+             self._monitor_thread.join(timeout=1.0)
+             self._monitor_thread = None
+
+     def _monitor_loop(self) -> None:
+         """Monitoring loop for GPU metrics."""
+         while self._monitoring:
+             try:
+                 gpu_util = self._get_gpu_utilization()
+                 memory_mb = self._get_memory_usage()
+
+                 self._gpu_samples.append(gpu_util)
+                 self._memory_samples.append(memory_mb)
+
+                 time.sleep(self.sample_interval)
+             except Exception:
+                 # Sampling failures are non-fatal; drop the sample and keep polling
+                 pass
+
+     def _get_gpu_utilization(self) -> float:
+         """Get current GPU utilization percentage."""
+         try:
+             result = subprocess.run(
+                 ["ioreg", "-l", "-w", "0"],
+                 capture_output=True,
+                 text=True,
+                 timeout=1
+             )
+
+             lines = result.stdout.split('\n')
+             for line in lines:
+                 if "PercentGPUUtilization" in line:
+                     parts = line.split('=')
+                     if len(parts) > 1:
+                         return float(parts[1].strip())
+
+             # Fallback heuristic: if ioreg reports nothing, scale CPU load
+             # as a rough proxy for GPU activity
+             cpu_percent = psutil.cpu_percent(interval=0.1)
+             return min(cpu_percent * 1.5, 100.0)
+
+         except Exception:
+             return 0.0
+
+     def _get_memory_usage(self) -> float:
+         """Get current memory usage in MB."""
+         try:
+             vm = psutil.virtual_memory()
+             return vm.used / (1024 * 1024)
+         except Exception:
+             return 0.0
+
+     def profile_operation(
+         self,
+         operation: Callable,
+         operation_name: str,
+         args: tuple = (),
+         kwargs: Optional[dict] = None,
+         warmup_runs: int = 3,
+         profile_runs: int = 10
+     ) -> ProfileResult:
+         """
+         Profile a specific operation.
+
+         Args:
+             operation: Operation to profile
+             operation_name: Name for the operation
+             args: Arguments for operation
+             kwargs: Keyword arguments for operation
+             warmup_runs: Number of warmup runs
+             profile_runs: Number of profiling runs
+
+         Returns:
+             Profile result
+         """
+         kwargs = kwargs or {}
+
+         for _ in range(warmup_runs):
+             operation(*args, **kwargs)
+
+         self.start_profiling(operation_name, {
+             "warmup_runs": warmup_runs,
+             "profile_runs": profile_runs
+         })
+
+         start = time.perf_counter()
+         for _ in range(profile_runs):
+             operation(*args, **kwargs)
+         end = time.perf_counter()
+
+         result = self.stop_profiling()
+
+         # Report the average per-run time rather than total wall-clock time
+         avg_time = (end - start) / profile_runs
+         result.duration_ms = avg_time * 1000
+
+         return result
+
+     def compare_operations(
+         self,
+         operations: List[Tuple[Callable, str]],
+         args: tuple = (),
+         kwargs: Optional[dict] = None
+     ) -> Dict[str, ProfileResult]:
+         """
+         Compare performance of multiple operations.
+
+         Args:
+             operations: List of (operation, name) tuples
+             args: Common arguments
+             kwargs: Common keyword arguments
+
+         Returns:
+             Dictionary of results by operation name
+         """
+         results = {}
+
+         for operation, name in operations:
+             result = self.profile_operation(
+                 operation, name, args, kwargs
+             )
+             results[name] = result
+
+         return results
+
+     def profile_model_inference(
+         self,
+         model: Any,
+         input_data: Any,
+         num_iterations: int = 100
+     ) -> ProfileResult:
+         """Profile model inference performance."""
+         def inference():
+             return model(input_data)
+
+         return self.profile_operation(
+             inference,
+             "model_inference",
+             warmup_runs=5,
+             profile_runs=num_iterations
+         )
+
+     def estimate_flops(
+         self,
+         operation_type: str,
+         input_shape: Tuple[int, ...],
+         duration_ms: float
+     ) -> float:
+         """Estimate FLOPS for an operation."""
+         # Each entry guards against shapes with too few dimensions
+         flops_map = {
+             "matmul": lambda shape: 2 * shape[0] * shape[1] * shape[2] if len(shape) >= 3 else 0,
+             "attention": lambda shape: 4 * shape[0] * shape[1] * shape[1] * shape[2] if len(shape) >= 3 else 0,
+             "layernorm": lambda shape: 3 * shape[0] * shape[1] if len(shape) >= 2 else 0,
+             "gelu": lambda shape: 10 * shape[0] * shape[1] if len(shape) >= 2 else 0,
+             "softmax": lambda shape: 3 * shape[0] * shape[1] if len(shape) >= 2 else 0
+         }
+
+         if operation_type in flops_map and duration_ms > 0:
+             total_flops = flops_map[operation_type](input_shape)
+             return total_flops / (duration_ms / 1000)
+
+         return 0.0
+
+     def generate_report(self) -> Dict[str, Any]:
+         """Generate performance report from all results."""
+         if not self.results:
+             return {"error": "No profiling results available"}
+
+         total_time = sum(r.duration_ms for r in self.results)
+         avg_gpu = sum(r.gpu_utilization for r in self.results) / len(self.results)
+         peak_memory = max(r.memory_used_mb for r in self.results)
+
+         operations_summary = []
+         for result in self.results:
+             operations_summary.append({
+                 "name": result.operation_name,
+                 "duration_ms": result.duration_ms,
+                 "gpu_utilization": result.gpu_utilization,
+                 "memory_mb": result.memory_used_mb
+             })
+
+         return {
+             "total_operations": len(self.results),
+             "total_time_ms": total_time,
+             "average_gpu_utilization": avg_gpu,
+             "peak_memory_mb": peak_memory,
+             "operations": operations_summary,
+             "timestamp": datetime.now().isoformat()
+         }
+
+     def save_results(self, filepath: Path) -> None:
+         """Save profiling results to JSON file."""
+         report = self.generate_report()
+
+         with open(filepath, 'w') as f:
+             json.dump(report, f, indent=2)
+
+     def clear_results(self) -> None:
+         """Clear all profiling results."""
+         self.results.clear()
+         self._gpu_samples.clear()
+         self._memory_samples.clear()
+
+     def get_optimization_suggestions(self) -> List[str]:
+         """Get optimization suggestions based on profiling results."""
+         suggestions = []
+
+         if not self.results:
+             return ["No profiling data available"]
+
+         avg_gpu = sum(r.gpu_utilization for r in self.results) / len(self.results)
+
+         if avg_gpu < 50:
+             suggestions.append("Low GPU utilization - consider increasing batch size")
+
+         if avg_gpu > 95:
+             suggestions.append("Very high GPU utilization - may be throttling")
+
+         peak_memory = max(r.memory_used_mb for r in self.results)
+         if peak_memory > 18000:  # 18 GB threshold for high-memory systems
+             suggestions.append("High memory usage - consider model quantization")
+
+         slow_ops = [r for r in self.results if r.duration_ms > 100]
+         if slow_ops:
+             suggestions.append(f"Found {len(slow_ops)} slow operations (>100ms)")
+
+         return suggestions if suggestions else ["Performance looks optimal"]
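
Taken together, profile_operation bundles the measurement flow (warmup runs, a timed loop, background GPU/memory sampling, an averaged per-run duration), and estimate_flops converts that duration into achieved FLOPS using a per-operation cost model (2*m*k*n for a matmul). A minimal sketch of how the two compose; the NumPy workload and the matrix shapes are illustrative assumptions, not part of the package:

    # Sketch only: NumPy and the chosen shapes are assumptions for illustration.
    import numpy as np
    from cortex.metal.performance_profiler import PerformanceProfiler

    m, k, n = 1024, 1024, 1024
    a = np.random.rand(m, k).astype(np.float32)
    b = np.random.rand(k, n).astype(np.float32)

    profiler = PerformanceProfiler(sample_interval=0.05)

    # 3 warmup runs, then the average over 10 timed runs (the defaults)
    result = profiler.profile_operation(lambda: a @ b, "matmul_1024")
    print(f"avg: {result.duration_ms:.2f} ms, GPU: {result.gpu_utilization:.0f}%")

    # 2*m*k*n floating-point ops per matmul, divided by the measured time
    gflops = profiler.estimate_flops("matmul", (m, k, n), result.duration_ms) / 1e9
    print(f"~{gflops:.1f} GFLOPS")

    print(profiler.get_optimization_suggestions())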
@@ -0,0 +1,130 @@
+ """Simple model downloader for Cortex."""
+
+ import os
+ import sys
+ from pathlib import Path
+ from typing import Optional, Tuple
+ import requests
+
+ try:
+     from huggingface_hub import snapshot_download, hf_hub_download, HfApi
+     from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
+     HF_HUB_AVAILABLE = True
+ except ImportError:
+     HF_HUB_AVAILABLE = False
+
+
+ class ModelDownloader:
+     """Simple model downloader from HuggingFace."""
+
+     def __init__(self, model_path: Path):
+         """Initialize downloader with model directory."""
+         self.model_path = Path(model_path).expanduser().resolve()
+         self.model_path.mkdir(parents=True, exist_ok=True)
+
+     def check_auth_status(self) -> Tuple[bool, Optional[str]]:
+         """Check if user is authenticated with HuggingFace.
+
+         Returns:
+             Tuple of (is_authenticated, username)
+         """
+         if not HF_HUB_AVAILABLE:
+             return False, None
+
+         try:
+             api = HfApi()
+             user_info = api.whoami()
+             if user_info:
+                 return True, user_info.get('name', 'Unknown')
+         except Exception:
+             # Any failure (offline, no token) is treated as unauthenticated
+             pass
+
+         return False, None
+
+     def download_model(self, repo_id: str, filename: Optional[str] = None) -> Tuple[bool, str, Optional[Path]]:
+         """
+         Download a model from HuggingFace.
+
+         Args:
+             repo_id: HuggingFace repository ID (e.g., "meta-llama/Llama-3.1-8B-Instruct")
+             filename: Optional specific file to download (for GGUF models)
+
+         Returns:
+             Tuple of (success, message, local_path)
+         """
+         if not HF_HUB_AVAILABLE:
+             return False, "huggingface-hub not installed. Install with: pip install huggingface-hub", None
+
+         try:
+             if filename:
+                 # Download single file
+                 print(f"Downloading {filename} from {repo_id}...")
+                 local_path = self.model_path / filename
+
+                 if local_path.exists():
+                     return False, f"File already exists: {local_path}", local_path
+
+                 downloaded_path = hf_hub_download(
+                     repo_id=repo_id,
+                     filename=filename,
+                     local_dir=self.model_path
+                     # Downloads always resume when possible by default
+                 )
+
+                 return True, f"Downloaded to {local_path}", Path(downloaded_path)
+
+             else:
+                 # Download entire repository
+                 model_name = repo_id.split('/')[-1]
+                 local_path = self.model_path / model_name
+
+                 print(f"Downloading repository {repo_id}...")
+
+                 if local_path.exists() and any(local_path.iterdir()):
+                     return False, f"Model already exists: {local_path}", local_path
+
+                 downloaded_path = snapshot_download(
+                     repo_id=repo_id,
+                     local_dir=local_path
+                     # Downloads always resume when possible by default
+                 )
+
+                 return True, f"Downloaded to {local_path}", local_path
+
+         except GatedRepoError:
+             # Check if user is logged in
+             is_auth, username = self.check_auth_status()
+             if is_auth:
+                 return False, f"Model {repo_id} is gated. You're logged in as {username} but may need to accept the model's license agreement at https://huggingface.co/{repo_id}", None
+             else:
+                 return False, f"Model {repo_id} requires authentication. Please use /login command to authenticate with HuggingFace", None
+         except RepositoryNotFoundError:
+             return False, f"Repository {repo_id} not found on HuggingFace", None
+         except Exception as e:
+             return False, f"Download failed: {str(e)}", None
+
+     def list_downloaded_models(self) -> list:
+         """List all downloaded models."""
+         models = []
+
+         if not self.model_path.exists():
+             return models
+
+         for item in self.model_path.iterdir():
+             if item.is_file() and item.suffix in ['.gguf', '.ggml', '.bin']:
+                 size_gb = item.stat().st_size / (1024**3)
+                 models.append({
+                     'name': item.name,
+                     'path': str(item),
+                     'size_gb': round(size_gb, 2)
+                 })
+             elif item.is_dir() and any(item.iterdir()):
+                 total_size = sum(f.stat().st_size for f in item.rglob('*') if f.is_file())
+                 size_gb = total_size / (1024**3)
+                 models.append({
+                     'name': item.name,
+                     'path': str(item),
+                     'size_gb': round(size_gb, 2)
+                 })
+
+         return models
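
The downloader's flow is: construct with a target directory (expanded and created on init), optionally verify HuggingFace authentication, fetch either a single GGUF file or a full repository snapshot, then enumerate what landed on disk. A short sketch; the repository ID, filename, and local path are illustrative assumptions:

    # Sketch only: repo ID, filename, and path are illustrative.
    from pathlib import Path
    from cortex.model_downloader import ModelDownloader

    downloader = ModelDownloader(Path("~/models"))

    is_auth, username = downloader.check_auth_status()
    print(f"HF authenticated: {is_auth} ({username})")

    # Single-file download, as used for GGUF models
    ok, message, local_path = downloader.download_model(
        "TheBloke/Llama-2-7B-GGUF",
        filename="llama-2-7b.Q4_K_M.gguf",
    )
    print(message)

    for model in downloader.list_downloaded_models():
        print(f"{model['name']}: {model['size_gb']} GB")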