cortex_llm-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. cortex/__init__.py +73 -0
  2. cortex/__main__.py +83 -0
  3. cortex/config.py +329 -0
  4. cortex/conversation_manager.py +468 -0
  5. cortex/fine_tuning/__init__.py +8 -0
  6. cortex/fine_tuning/dataset.py +332 -0
  7. cortex/fine_tuning/mlx_lora_trainer.py +502 -0
  8. cortex/fine_tuning/trainer.py +957 -0
  9. cortex/fine_tuning/wizard.py +707 -0
  10. cortex/gpu_validator.py +467 -0
  11. cortex/inference_engine.py +727 -0
  12. cortex/metal/__init__.py +275 -0
  13. cortex/metal/gpu_validator.py +177 -0
  14. cortex/metal/memory_pool.py +886 -0
  15. cortex/metal/mlx_accelerator.py +678 -0
  16. cortex/metal/mlx_converter.py +638 -0
  17. cortex/metal/mps_optimizer.py +417 -0
  18. cortex/metal/optimizer.py +665 -0
  19. cortex/metal/performance_profiler.py +364 -0
  20. cortex/model_downloader.py +130 -0
  21. cortex/model_manager.py +2187 -0
  22. cortex/quantization/__init__.py +5 -0
  23. cortex/quantization/dynamic_quantizer.py +736 -0
  24. cortex/template_registry/__init__.py +15 -0
  25. cortex/template_registry/auto_detector.py +144 -0
  26. cortex/template_registry/config_manager.py +234 -0
  27. cortex/template_registry/interactive.py +260 -0
  28. cortex/template_registry/registry.py +347 -0
  29. cortex/template_registry/template_profiles/__init__.py +5 -0
  30. cortex/template_registry/template_profiles/base.py +142 -0
  31. cortex/template_registry/template_profiles/complex/__init__.py +5 -0
  32. cortex/template_registry/template_profiles/complex/reasoning.py +263 -0
  33. cortex/template_registry/template_profiles/standard/__init__.py +9 -0
  34. cortex/template_registry/template_profiles/standard/alpaca.py +73 -0
  35. cortex/template_registry/template_profiles/standard/chatml.py +82 -0
  36. cortex/template_registry/template_profiles/standard/gemma.py +103 -0
  37. cortex/template_registry/template_profiles/standard/llama.py +87 -0
  38. cortex/template_registry/template_profiles/standard/simple.py +65 -0
  39. cortex/ui/__init__.py +120 -0
  40. cortex/ui/cli.py +1685 -0
  41. cortex/ui/markdown_render.py +185 -0
  42. cortex/ui/terminal_app.py +534 -0
  43. cortex_llm-1.0.0.dist-info/METADATA +275 -0
  44. cortex_llm-1.0.0.dist-info/RECORD +48 -0
  45. cortex_llm-1.0.0.dist-info/WHEEL +5 -0
  46. cortex_llm-1.0.0.dist-info/entry_points.txt +2 -0
  47. cortex_llm-1.0.0.dist-info/licenses/LICENSE +21 -0
  48. cortex_llm-1.0.0.dist-info/top_level.txt +1 -0
cortex/metal/mps_optimizer.py
@@ -0,0 +1,417 @@
+ """MPS backend optimization for PyTorch models on Metal."""
+
+ import torch
+ import torch.nn as nn
+ from typing import Dict, Any, Optional, List, Tuple
+ from dataclasses import dataclass
+ import warnings
+ import sys
+ import os
+
+ # Add the package root to sys.path so gpu_validator can be imported
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+ from gpu_validator import GPUValidator
+
+ @dataclass
+ class MPSConfig:
+     """Configuration for MPS optimization."""
+     use_channels_last: bool = True
+     use_fp16: Optional[bool] = None  # None = auto-detect based on hardware
+     use_bfloat16: Optional[bool] = None  # None = auto-detect based on hardware
+     use_jit: bool = False  # JIT not fully supported on MPS yet
+     use_graph_mode: bool = True
+     fuse_operations: bool = True
+     optimize_memory: bool = True
+     max_batch_size: int = 8
+     prefetch_factor: int = 2
+     num_workers: int = 0  # MPS works best with the main thread
+     auto_detect_dtype: bool = True  # Automatically select the best dtype for the hardware
+
+ class MPSOptimizer:
+     """Optimize PyTorch models for the Metal Performance Shaders backend."""
+
+     FUSED_OPERATIONS = {
+         "conv_bn_relu": ["Conv2d", "BatchNorm2d", "ReLU"],
+         "linear_relu": ["Linear", "ReLU"],
+         "linear_gelu": ["Linear", "GELU"],
+         "layer_norm_linear": ["LayerNorm", "Linear"],
+     }
+
+     def __init__(self, config: Optional[MPSConfig] = None):
+         """Initialize MPS optimizer."""
+         self.config = config or MPSConfig()
+
+         if not torch.backends.mps.is_available():
+             raise RuntimeError("MPS backend not available")
+
+         if not torch.backends.mps.is_built():
+             raise RuntimeError("PyTorch not built with MPS support")
+
+         self.device = torch.device("mps")
+
+         # Initialize GPU validator for hardware detection
+         self.gpu_validator = GPUValidator()
+         self.gpu_validator.validate()
+
+         # Auto-detect optimal dtype if enabled
+         if (
+             self.config.auto_detect_dtype
+             and self.config.use_bfloat16 is None
+             and self.config.use_fp16 is None
+         ):
+             self._auto_detect_dtype()
+         elif self.config.use_bfloat16 is None and self.config.use_fp16 is None:
+             # Default to fp16 if auto-detect is disabled but no dtype is specified
+             self.config.use_bfloat16 = False
+             self.config.use_fp16 = True
+
+     def _auto_detect_dtype(self) -> None:
+         """Automatically detect optimal dtype based on hardware."""
+         if self.gpu_validator.check_bfloat16_support():
+             # Check if PyTorch supports bfloat16 on MPS via a tiny smoke test
+             try:
+                 torch.tensor([1.0], dtype=torch.bfloat16, device=self.device)
+                 self.config.use_bfloat16 = True
+                 self.config.use_fp16 = False  # Prefer bfloat16 over fp16
+             except (RuntimeError, TypeError):
+                 # PyTorch doesn't support bfloat16 on MPS yet, fall back to fp16
+                 self.config.use_bfloat16 = False
+                 self.config.use_fp16 = True
+         else:
+             # Hardware doesn't support bfloat16, use fp16
+             self.config.use_bfloat16 = False
+             self.config.use_fp16 = True
+
+     def optimize_model(
+         self,
+         model: nn.Module,
+         example_input: Optional[torch.Tensor] = None
+     ) -> nn.Module:
+         """
+         Optimize a PyTorch model for the MPS backend.
+
+         Args:
+             model: PyTorch model to optimize
+             example_input: Example input for shape inference
+
+         Returns:
+             Optimized model
+         """
+         model = model.to(self.device)
+
+         if self.config.use_channels_last:
+             model = self._convert_to_channels_last(model)
+
+         # Convert to optimal dtype (bfloat16 or fp16)
+         if self.config.use_bfloat16 or self.config.use_fp16:
+             model = self._convert_dtype(model)
+
+         if self.config.fuse_operations:
+             model = self._fuse_operations(model)
+
+         if self.config.optimize_memory:
+             model = self._optimize_memory_layout(model)
+
+         if self.config.use_graph_mode and example_input is not None:
+             model = self._enable_graph_mode(model, example_input)
+
+         model.eval()
+
+         return model
+
+     def _convert_to_channels_last(self, model: nn.Module) -> nn.Module:
+         """Convert model to channels_last memory format for better performance."""
+         def convert_layer(module):
+             if isinstance(module, (nn.Conv2d, nn.BatchNorm2d)):
+                 # nn.Module.to() converts in place and returns the module itself
+                 module.to(memory_format=torch.channels_last)
+             return module
+
+         model.apply(convert_layer)
+         return model
+
+     def _convert_dtype(self, model: nn.Module) -> nn.Module:
+         """
+         Convert model to optimal dtype (bfloat16 or fp16) for faster computation.
+
+         Args:
+             model: Model to convert
+
+         Returns:
+             Converted model
+         """
+         # Determine target dtype
+         target_dtype = torch.bfloat16 if self.config.use_bfloat16 else torch.float16
+
+         def should_convert(module):
+             # These layers should stay in float32 for numerical stability
+             exclude_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.LayerNorm)
+             return not isinstance(module, exclude_types)
+
+         for name, module in model.named_modules():
+             if should_convert(module):
+                 if hasattr(module, 'weight') and module.weight is not None:
+                     if module.weight.dtype == torch.float32:
+                         # nn.Module.to() converts parameters and buffers in place
+                         module.to(dtype=target_dtype)
+
+         return model
+
+     def get_optimal_dtype(self) -> torch.dtype:
+         """
+         Get the optimal dtype for the current hardware.
+
+         Returns:
+             torch.bfloat16, torch.float16, or torch.float32
+         """
+         if self.config.use_bfloat16:
+             return torch.bfloat16
+         elif self.config.use_fp16:
+             return torch.float16
+         else:
+             return torch.float32
+
+     def _fuse_operations(self, model: nn.Module) -> nn.Module:
+         """Fuse compatible operations for better performance."""
+         # Note: Conv-BN fusion requires specific module pairs, not the whole model,
+         # so only Sequential children are scanned for known fusion patterns.
+         for name, module in model.named_children():
+             if isinstance(module, nn.Sequential):
+                 fused = self._try_fuse_sequential(module)
+                 if fused is not None:
+                     setattr(model, name, fused)
+
+         return model
+
+     def _try_fuse_sequential(self, sequential: nn.Sequential) -> Optional[nn.Module]:
+         """Try to fuse operations in a sequential module."""
+         layers = list(sequential.children())
+
+         if len(layers) < 2:
+             return None
+
+         for pattern_name, pattern in self.FUSED_OPERATIONS.items():
+             if self._matches_pattern(layers, pattern):
+                 return self._create_fused_module(layers, pattern_name)
+
+         return None
+
+     def _matches_pattern(self, layers: List[nn.Module], pattern: List[str]) -> bool:
+         """Check if layers match a fusion pattern."""
+         # Require an exact length match: a prefix match would silently drop
+         # the trailing layers when the fused module is created.
+         if len(layers) != len(pattern):
+             return False
+
+         for i, expected_type in enumerate(pattern):
+             if not hasattr(nn, expected_type):
+                 return False
+             expected_class = getattr(nn, expected_type)
+             if not isinstance(layers[i], expected_class):
+                 return False
+
+         return True
+
+     def _create_fused_module(self, layers: List[nn.Module], pattern_name: str) -> nn.Module:
+         """Create a fused module from layers."""
+         if pattern_name == "conv_bn_relu":
+             return ConvBNReLU(layers[0], layers[1])
+         elif pattern_name == "linear_relu":
+             return LinearReLU(layers[0])
+         elif pattern_name == "linear_gelu":
+             return LinearGELU(layers[0])
+         else:
+             return nn.Sequential(*layers)
+
+     def _optimize_memory_layout(self, model: nn.Module) -> nn.Module:
+         """Optimize memory layout for MPS."""
+         def optimize_layer(module):
+             # Make non-contiguous parameters contiguous in place; assigning a
+             # plain tensor to module.weight would fail since it is an nn.Parameter.
+             if hasattr(module, 'weight'):
+                 if module.weight is not None and not module.weight.is_contiguous():
+                     module.weight.data = module.weight.data.contiguous()
+
+             if hasattr(module, 'bias'):
+                 if module.bias is not None and not module.bias.is_contiguous():
+                     module.bias.data = module.bias.data.contiguous()
+
+             return module
+
+         model.apply(optimize_layer)
+         return model
+
+     def _enable_graph_mode(
+         self,
+         model: nn.Module,
+         example_input: torch.Tensor
+     ) -> nn.Module:
+         """Enable graph mode optimization (experimental)."""
+         try:
+             with warnings.catch_warnings():
+                 warnings.simplefilter("ignore")
+
+                 example_input = example_input.to(self.device)
+
+                 # Convert input to optimal dtype
+                 if self.config.use_bfloat16:
+                     example_input = example_input.to(dtype=torch.bfloat16)
+                 elif self.config.use_fp16:
+                     example_input = example_input.half()
+
+                 # Warm-up pass so MPS can capture and cache the compute graph
+                 with torch.no_grad():
+                     _ = model(example_input)
+
+         except Exception as e:
+             warnings.warn(f"Graph mode optimization failed: {e}")
+
+         return model
+
+     def optimize_dataloader(
+         self,
+         dataloader: torch.utils.data.DataLoader
+     ) -> torch.utils.data.DataLoader:
+         """Optimize DataLoader for the MPS backend."""
+         # DataLoader does not expose a `shuffle` attribute; infer it from the sampler
+         shuffle = isinstance(dataloader.sampler, torch.utils.data.RandomSampler)
+
+         optimized_dataloader = torch.utils.data.DataLoader(
+             dataloader.dataset,
+             batch_size=min(dataloader.batch_size, self.config.max_batch_size),
+             shuffle=shuffle,
+             num_workers=self.config.num_workers,
+             pin_memory=False,  # Not needed for unified memory
+             prefetch_factor=self.config.prefetch_factor if self.config.num_workers > 0 else None
+         )
+
+         return optimized_dataloader
+
+     def profile_model(
+         self,
+         model: nn.Module,
+         input_shape: Tuple[int, ...],
+         num_iterations: int = 100
+     ) -> Dict[str, Any]:
+         """Profile model performance on MPS."""
+         import time
+
+         model.eval()
+
+         dummy_input = torch.randn(input_shape).to(self.device)
+
+         # Convert to optimal dtype
+         if self.config.use_bfloat16:
+             dummy_input = dummy_input.to(dtype=torch.bfloat16)
+         elif self.config.use_fp16:
+             dummy_input = dummy_input.half()
+
+         torch.mps.synchronize()
+
+         # Warm up before timing
+         warmup_iterations = 10
+         for _ in range(warmup_iterations):
+             with torch.no_grad():
+                 _ = model(dummy_input)
+
+         torch.mps.synchronize()
+
+         start_time = time.perf_counter()
+         for _ in range(num_iterations):
+             with torch.no_grad():
+                 _ = model(dummy_input)
+
+         torch.mps.synchronize()
+         end_time = time.perf_counter()
+
+         avg_time = (end_time - start_time) / num_iterations
+         throughput = input_shape[0] / avg_time if avg_time > 0 else 0
+
+         memory_allocated = torch.mps.current_allocated_memory() if hasattr(torch.mps, 'current_allocated_memory') else 0
+
+         return {
+             "avg_inference_time": avg_time,
+             "throughput": throughput,
+             "memory_allocated": memory_allocated,
+             "device": "mps",
+             "dtype": str(self.get_optimal_dtype()),
+             "fp16": self.config.use_fp16,
+             "bfloat16": self.config.use_bfloat16,
+             "batch_size": input_shape[0]
+         }
+
+     @staticmethod
+     def get_mps_info() -> Dict[str, Any]:
+         """Get MPS backend information."""
+         info = {
+             "available": torch.backends.mps.is_available(),
+             "built": torch.backends.mps.is_built()
+         }
+
+         if info["available"]:
+             info["current_allocated_memory"] = torch.mps.current_allocated_memory() if hasattr(torch.mps, 'current_allocated_memory') else 0
+             info["driver_allocated_memory"] = torch.mps.driver_allocated_memory() if hasattr(torch.mps, 'driver_allocated_memory') else 0
+
+             # Check bfloat16 support
+             try:
+                 torch.tensor([1.0], dtype=torch.bfloat16, device="mps")
+                 info["bfloat16_supported"] = True
+             except (RuntimeError, TypeError):
+                 info["bfloat16_supported"] = False
+
+         return info
+
+     def get_optimization_summary(self) -> Dict[str, Any]:
+         """
+         Get a summary of the optimizations applied.
+
+         Returns:
+             Dictionary with optimization details
+         """
+         optimal_dtype = self.get_optimal_dtype()
+
+         return {
+             "device": str(self.device),
+             "optimal_dtype": str(optimal_dtype),
+             "dtype_bits": 16 if optimal_dtype in [torch.float16, torch.bfloat16] else 32,
+             "memory_reduction": "50%" if optimal_dtype in [torch.float16, torch.bfloat16] else "0%",
+             "hardware_features": {
+                 "bfloat16": self.config.use_bfloat16,
+                 "channels_last": self.config.use_channels_last,
+                 "graph_mode": self.config.use_graph_mode,
+                 "fused_operations": self.config.fuse_operations,
+                 "memory_optimized": self.config.optimize_memory
+             },
+             "gpu_family": self.gpu_validator.gpu_info.gpu_family if self.gpu_validator.gpu_info else "unknown",
+             "expected_speedup": "10-15%" if self.config.use_bfloat16 else "baseline"
+         }
+
+ class ConvBNReLU(nn.Module):
+     """Fused Conv-BatchNorm-ReLU module."""
+
+     def __init__(self, conv: nn.Conv2d, bn: nn.BatchNorm2d):
+         super().__init__()
+         self.conv = conv
+         self.bn = bn
+         self.relu = nn.ReLU(inplace=True)
+
+     def forward(self, x):
+         return self.relu(self.bn(self.conv(x)))
+
+ class LinearReLU(nn.Module):
+     """Fused Linear-ReLU module."""
+
+     def __init__(self, linear: nn.Linear):
+         super().__init__()
+         self.linear = linear
+         self.relu = nn.ReLU(inplace=True)
+
+     def forward(self, x):
+         return self.relu(self.linear(x))
+
+ class LinearGELU(nn.Module):
+     """Fused Linear-GELU module."""
+
+     def __init__(self, linear: nn.Linear):
+         super().__init__()
+         self.linear = linear
+         self.gelu = nn.GELU()
+
+     def forward(self, x):
+         return self.gelu(self.linear(x))
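
For reference, a minimal usage sketch of the module above (illustrative only, not part of the package diff; it assumes macOS on Apple silicon with an MPS-enabled PyTorch build, since MPSOptimizer raises RuntimeError when the backend is unavailable):

# Usage sketch, not shipped in the wheel; requires an MPS-capable machine.
import torch
import torch.nn as nn

from cortex.metal.mps_optimizer import MPSConfig, MPSOptimizer

# A toy model; optimize_model moves it to MPS and converts eligible
# parameters to the auto-detected 16-bit dtype.
model = nn.Sequential(
    nn.Linear(128, 256),
    nn.ReLU(),
)

config = MPSConfig()             # dtype is auto-detected by default
optimizer = MPSOptimizer(config)

example = torch.randn(8, 128)    # example input enables the graph-mode warm-up
optimized = optimizer.optimize_model(model, example_input=example)

stats = optimizer.profile_model(optimized, input_shape=(8, 128), num_iterations=20)
print(stats["avg_inference_time"], stats["throughput"], stats["dtype"])
print(optimizer.get_optimization_summary())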