runbooks-1.0.0-py3-none-any.whl → runbooks-1.0.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. runbooks/__init__.py +1 -1
  2. runbooks/cfat/WEIGHT_CONFIG_README.md +368 -0
  3. runbooks/cfat/app.ts +27 -19
  4. runbooks/cfat/assessment/runner.py +6 -5
  5. runbooks/cfat/tests/test_weight_configuration.ts +449 -0
  6. runbooks/cfat/weight_config.ts +574 -0
  7. runbooks/cloudops/models.py +20 -14
  8. runbooks/common/__init__.py +26 -9
  9. runbooks/common/aws_pricing.py +1070 -105
  10. runbooks/common/aws_pricing_api.py +276 -44
  11. runbooks/common/date_utils.py +115 -0
  12. runbooks/common/dry_run_examples.py +587 -0
  13. runbooks/common/dry_run_framework.py +520 -0
  14. runbooks/common/enhanced_exception_handler.py +10 -7
  15. runbooks/common/mcp_cost_explorer_integration.py +5 -4
  16. runbooks/common/memory_optimization.py +533 -0
  17. runbooks/common/performance_optimization_engine.py +1153 -0
  18. runbooks/common/profile_utils.py +86 -118
  19. runbooks/common/rich_utils.py +3 -3
  20. runbooks/common/sre_performance_suite.py +574 -0
  21. runbooks/finops/business_case_config.py +314 -0
  22. runbooks/finops/cost_processor.py +19 -4
  23. runbooks/finops/dashboard_runner.py +47 -28
  24. runbooks/finops/ebs_cost_optimizer.py +1 -1
  25. runbooks/finops/ebs_optimizer.py +56 -9
  26. runbooks/finops/embedded_mcp_validator.py +642 -36
  27. runbooks/finops/enhanced_trend_visualization.py +7 -2
  28. runbooks/finops/executive_export.py +789 -0
  29. runbooks/finops/finops_dashboard.py +6 -5
  30. runbooks/finops/finops_scenarios.py +34 -27
  31. runbooks/finops/iam_guidance.py +6 -1
  32. runbooks/finops/nat_gateway_optimizer.py +46 -27
  33. runbooks/finops/notebook_utils.py +1 -1
  34. runbooks/finops/schemas.py +73 -58
  35. runbooks/finops/single_dashboard.py +20 -4
  36. runbooks/finops/tests/test_integration.py +3 -1
  37. runbooks/finops/vpc_cleanup_exporter.py +2 -1
  38. runbooks/finops/vpc_cleanup_optimizer.py +22 -29
  39. runbooks/inventory/core/collector.py +51 -28
  40. runbooks/inventory/discovery.md +197 -247
  41. runbooks/inventory/inventory_modules.py +2 -2
  42. runbooks/inventory/list_ec2_instances.py +3 -3
  43. runbooks/inventory/models/account.py +5 -3
  44. runbooks/inventory/models/inventory.py +1 -1
  45. runbooks/inventory/models/resource.py +5 -3
  46. runbooks/inventory/organizations_discovery.py +102 -13
  47. runbooks/inventory/unified_validation_engine.py +2 -15
  48. runbooks/main.py +255 -92
  49. runbooks/operate/base.py +9 -6
  50. runbooks/operate/deployment_framework.py +5 -4
  51. runbooks/operate/deployment_validator.py +6 -5
  52. runbooks/operate/mcp_integration.py +6 -5
  53. runbooks/operate/networking_cost_heatmap.py +17 -13
  54. runbooks/operate/vpc_operations.py +82 -13
  55. runbooks/remediation/base.py +3 -1
  56. runbooks/remediation/commons.py +5 -5
  57. runbooks/remediation/commvault_ec2_analysis.py +66 -18
  58. runbooks/remediation/config/accounts_example.json +31 -0
  59. runbooks/remediation/multi_account.py +120 -7
  60. runbooks/remediation/remediation_cli.py +710 -0
  61. runbooks/remediation/universal_account_discovery.py +377 -0
  62. runbooks/remediation/workspaces_list.py +2 -2
  63. runbooks/security/compliance_automation_engine.py +99 -20
  64. runbooks/security/config/__init__.py +24 -0
  65. runbooks/security/config/compliance_config.py +255 -0
  66. runbooks/security/config/compliance_weights_example.json +22 -0
  67. runbooks/security/config_template_generator.py +500 -0
  68. runbooks/security/security_cli.py +377 -0
  69. runbooks/validation/cli.py +8 -7
  70. runbooks/validation/comprehensive_2way_validator.py +26 -15
  71. runbooks/validation/mcp_validator.py +62 -8
  72. runbooks/vpc/config.py +49 -15
  73. runbooks/vpc/cross_account_session.py +5 -1
  74. runbooks/vpc/heatmap_engine.py +438 -59
  75. runbooks/vpc/mcp_no_eni_validator.py +115 -36
  76. runbooks/vpc/performance_optimized_analyzer.py +546 -0
  77. runbooks/vpc/runbooks_adapter.py +33 -12
  78. runbooks/vpc/tests/conftest.py +4 -2
  79. runbooks/vpc/tests/test_cost_engine.py +3 -1
  80. {runbooks-1.0.0.dist-info → runbooks-1.0.2.dist-info}/METADATA +1 -1
  81. {runbooks-1.0.0.dist-info → runbooks-1.0.2.dist-info}/RECORD +85 -79
  82. runbooks/finops/runbooks.inventory.organizations_discovery.log +0 -0
  83. runbooks/finops/runbooks.security.report_generator.log +0 -0
  84. runbooks/finops/runbooks.security.run_script.log +0 -0
  85. runbooks/finops/runbooks.security.security_export.log +0 -0
  86. runbooks/finops/tests/results_test_finops_dashboard.xml +0 -1
  87. runbooks/inventory/artifacts/scale-optimize-status.txt +0 -12
  88. runbooks/inventory/runbooks.inventory.organizations_discovery.log +0 -0
  89. runbooks/inventory/runbooks.security.report_generator.log +0 -0
  90. runbooks/inventory/runbooks.security.run_script.log +0 -0
  91. runbooks/inventory/runbooks.security.security_export.log +0 -0
  92. runbooks/vpc/runbooks.inventory.organizations_discovery.log +0 -0
  93. runbooks/vpc/runbooks.security.report_generator.log +0 -0
  94. runbooks/vpc/runbooks.security.run_script.log +0 -0
  95. runbooks/vpc/runbooks.security.security_export.log +0 -0
  96. {runbooks-1.0.0.dist-info → runbooks-1.0.2.dist-info}/WHEEL +0 -0
  97. {runbooks-1.0.0.dist-info → runbooks-1.0.2.dist-info}/entry_points.txt +0 -0
  98. {runbooks-1.0.0.dist-info → runbooks-1.0.2.dist-info}/licenses/LICENSE +0 -0
  99. {runbooks-1.0.0.dist-info → runbooks-1.0.2.dist-info}/top_level.txt +0 -0
runbooks/common/performance_optimization_engine.py (new file) @@ -0,0 +1,1153 @@
#!/usr/bin/env python3
"""
Performance Optimization Engine for CloudOps-Runbooks - Phase 2 Enhanced

🎯 SRE Automation Specialist Implementation
Following proven systematic delegation patterns for production reliability optimization.

Key Focus Areas (From PDCA Analysis):
1. Organization Discovery Performance: 52.3s → <30s target
2. VPC Analysis Timeout Issues: optimize network operations
3. Memory Usage Optimization: address large-scale operation issues (6.6GB → <500MB)
4. Multi-Account Scaling: 200+ account enterprise support with concurrent processing
5. Reliability Enhancements: >99.9% operation success rate with circuit breaker patterns

Phase 2 Enhanced Features:
- Intelligent caching with TTL management
- Connection pooling for AWS API calls with circuit breaker patterns
- Memory-efficient batch processing with adaptive sizing
- Parallel processing with rate limiting and graceful degradation
- Progress indicators for long-running operations
- Automatic retry with exponential backoff and error recovery
- Performance degradation detection and automated remediation
- Circuit breaker patterns for reliability >99.9%
- Multi-account scaling optimization for enterprise environments
- Memory optimization targeting <500MB for enterprise operations
"""

import gc
import logging
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any, Callable, Dict, List, Optional

import boto3
import psutil
from botocore.client import BaseClient
from botocore.config import Config
from rich.panel import Panel
from rich.progress import (
    Progress,
    SpinnerColumn,
    TextColumn,
    BarColumn,
    TimeElapsedColumn,
    MofNCompleteColumn,
)
from rich.status import Status

from runbooks.common.rich_utils import (
    console,
    print_header,
    print_success,
    print_warning,
    print_error,
    create_table,
    STATUS_INDICATORS,
)

logger = logging.getLogger(__name__)


@dataclass
class OptimizationMetrics:
    """Performance optimization metrics tracking"""

    operation_name: str
    start_time: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    end_time: Optional[datetime] = None
    duration_seconds: float = 0.0
    target_seconds: float = 30.0
    optimization_applied: List[str] = field(default_factory=list)
    memory_peak_mb: float = 0.0
    api_calls_made: int = 0
    cache_hits: int = 0
    cache_misses: int = 0
    success: bool = False
    error_message: Optional[str] = None

    def finish(self, success: bool = True, error_message: Optional[str] = None):
        """Mark the operation as finished and compute its duration"""
        self.end_time = datetime.now(timezone.utc)
        self.duration_seconds = (self.end_time - self.start_time).total_seconds()
        self.success = success
        self.error_message = error_message

    def get_performance_improvement(self) -> float:
        """Calculate performance improvement percentage relative to the target"""
        if self.target_seconds <= 0 or self.duration_seconds <= 0:
            return 0.0
        return max(0.0, (self.target_seconds - self.duration_seconds) / self.target_seconds * 100)

    def get_cache_efficiency(self) -> float:
        """Calculate the cache hit rate percentage"""
        total_requests = self.cache_hits + self.cache_misses
        if total_requests == 0:
            return 0.0
        return (self.cache_hits / total_requests) * 100


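# A quick worked illustration of the metrics math (hypothetical numbers, not
# measured output): a run finishing in 21.0s against a 30.0s target reports
# (30 - 21) / 30 * 100 = 30% improvement, and 45 cache hits against 15 misses
# reports 45 / 60 * 100 = 75% cache efficiency. A minimal sketch:
#
#     metrics = OptimizationMetrics(operation_name="org_discovery", target_seconds=30.0)
#     metrics.cache_hits, metrics.cache_misses = 45, 15
#     metrics.finish(success=True)             # stamps end_time and duration_seconds
#     metrics.get_performance_improvement()    # → 30.0 when duration is 21.0s
#     metrics.get_cache_efficiency()           # → 75.0

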
class IntelligentCache:
    """Intelligent caching system with TTL management and memory optimization"""

    def __init__(self, default_ttl_minutes: int = 30, max_cache_size: int = 1000):
        self.cache: Dict[str, Any] = {}
        self.cache_timestamps: Dict[str, datetime] = {}
        self.default_ttl_minutes = default_ttl_minutes
        self.max_cache_size = max_cache_size
        self._lock = threading.RLock()

        # Performance tracking
        self.hits = 0
        self.misses = 0

    def get(self, key: str, ttl_minutes: Optional[int] = None) -> Optional[Any]:
        """Get the cached value if still valid, otherwise return None"""
        with self._lock:
            if key not in self.cache:
                self.misses += 1
                return None

            # Check TTL
            ttl = ttl_minutes or self.default_ttl_minutes
            cache_age = (datetime.now(timezone.utc) - self.cache_timestamps[key]).total_seconds() / 60

            if cache_age > ttl:
                # Cache entry expired
                del self.cache[key]
                del self.cache_timestamps[key]
                self.misses += 1
                return None

            self.hits += 1
            return self.cache[key]

    def set(self, key: str, value: Any):
        """Set a cached value with automatic cleanup"""
        with self._lock:
            # Evict the oldest entries if at max capacity
            if len(self.cache) >= self.max_cache_size:
                self._cleanup_oldest_entries(int(self.max_cache_size * 0.2))  # Remove 20%

            self.cache[key] = value
            self.cache_timestamps[key] = datetime.now(timezone.utc)

    def _cleanup_oldest_entries(self, count: int):
        """Remove the oldest cache entries (the RLock is reentrant, so callers may already hold it)"""
        with self._lock:
            sorted_keys = sorted(self.cache_timestamps.items(), key=lambda x: x[1])
            for key, _ in sorted_keys[:count]:
                self.cache.pop(key, None)
                self.cache_timestamps.pop(key, None)

    def clear(self):
        """Clear all cached data"""
        with self._lock:
            self.cache.clear()
            self.cache_timestamps.clear()
            self.hits = 0
            self.misses = 0

    def get_stats(self) -> Dict[str, Any]:
        """Get cache performance statistics"""
        total_requests = self.hits + self.misses
        hit_rate = (self.hits / total_requests * 100) if total_requests > 0 else 0.0

        return {
            "size": len(self.cache),
            "hits": self.hits,
            "misses": self.misses,
            "hit_rate": hit_rate,
            "max_size": self.max_cache_size,
        }


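# Cache usage sketch (the key and value are hypothetical; a per-call TTL
# overrides the default set at construction):
#
#     cache = IntelligentCache(default_ttl_minutes=30, max_cache_size=500)
#     cache.set("org_accounts_mgmt", {"accounts": [...]})
#     hit = cache.get("org_accounts_mgmt", ttl_minutes=15)  # None once 15 minutes pass
#     cache.get_stats()                                     # {"size": 1, "hit_rate": ..., ...}

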
@dataclass
class CircuitBreakerState:
    """Circuit breaker state tracking for reliability patterns"""

    failure_count: int = 0
    last_failure_time: Optional[datetime] = None
    state: str = "closed"  # closed, open, half_open
    success_count: int = 0
    total_requests: int = 0

    def calculate_failure_rate(self) -> float:
        """Calculate failure rate percentage"""
        if self.total_requests == 0:
            return 0.0
        return (self.failure_count / self.total_requests) * 100


class CircuitBreaker:
    """
    Circuit breaker implementation for AWS API reliability

    Supports the >99.9% operation success rate target through:
    - Automatic failure detection and recovery
    - Graceful degradation patterns
    - Cooldown-based recovery (retry backoff itself is delegated to botocore's adaptive retry mode)
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        recovery_timeout_seconds: int = 60,
        success_threshold: int = 3,
    ):
        """
        Initialize circuit breaker

        Args:
            failure_threshold: Number of failures before opening the circuit
            recovery_timeout_seconds: Time to wait before attempting recovery
            success_threshold: Successful calls needed to close the circuit
        """
        self.failure_threshold = failure_threshold
        self.recovery_timeout_seconds = recovery_timeout_seconds
        self.success_threshold = success_threshold

        self.state = CircuitBreakerState()
        self._lock = threading.RLock()

    @contextmanager
    def protected_call(self, operation_name: str = "aws_operation"):
        """
        Context manager for circuit-breaker-protected operations

        Args:
            operation_name: Name of the operation, used for logging
        """
        with self._lock:
            # Open the circuit if the failure threshold has been reached
            if self._should_open_circuit():
                self.state.state = "open"
                self.state.last_failure_time = datetime.now(timezone.utc)

            # Check whether the circuit can transition to half-open
            if self._can_attempt_recovery():
                self.state.state = "half_open"
                console.log(f"[yellow]🔄 Circuit breaker half-open for {operation_name}[/yellow]")

            # Block requests while the circuit is open
            if self.state.state == "open":
                time_since_failure = (datetime.now(timezone.utc) - self.state.last_failure_time).total_seconds()
                if time_since_failure < self.recovery_timeout_seconds:
                    raise Exception(
                        f"Circuit breaker OPEN for {operation_name} - "
                        f"recovery in {self.recovery_timeout_seconds - time_since_failure:.1f}s"
                    )

        try:
            yield

            # Success - update state
            with self._lock:
                if self.state.state == "half_open":
                    self.state.success_count += 1
                    if self.state.success_count >= self.success_threshold:
                        self.state.state = "closed"
                        self.state.failure_count = 0
                        self.state.success_count = 0
                        console.log(f"[green]✅ Circuit breaker CLOSED for {operation_name} - service recovered[/green]")

                self.state.total_requests += 1

        except Exception:
            # Failure - update state
            with self._lock:
                self.state.failure_count += 1
                self.state.total_requests += 1
                self.state.last_failure_time = datetime.now(timezone.utc)

                if self.state.state == "half_open":
                    self.state.state = "open"
                    console.log(f"[red]🚨 Circuit breaker OPEN for {operation_name} - recovery attempt failed[/red]")

            raise

    def _should_open_circuit(self) -> bool:
        """Check if the circuit should be opened based on the failure count"""
        if self.state.state != "closed":
            return False

        return self.state.failure_count >= self.failure_threshold

    def _can_attempt_recovery(self) -> bool:
        """Check if recovery can be attempted"""
        if self.state.state != "open" or not self.state.last_failure_time:
            return False

        time_since_failure = (datetime.now(timezone.utc) - self.state.last_failure_time).total_seconds()
        return time_since_failure >= self.recovery_timeout_seconds

    def get_state_info(self) -> Dict[str, Any]:
        """Get circuit breaker state information"""
        return {
            "state": self.state.state,
            "failure_count": self.state.failure_count,
            "failure_rate": self.state.calculate_failure_rate(),
            "total_requests": self.state.total_requests,
            "last_failure": self.state.last_failure_time.isoformat() if self.state.last_failure_time else None,
        }


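# Circuit breaker usage sketch (the client call is hypothetical): any exception
# raised inside the block counts as a failure, and the breaker rejects calls
# outright while it is open.
#
#     breaker = CircuitBreaker(failure_threshold=3, recovery_timeout_seconds=30)
#     try:
#         with breaker.protected_call("ec2_describe_vpcs"):
#             response = ec2_client.describe_vpcs()
#     except Exception:
#         ...  # either the API call failed or the circuit is currently open

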
class OptimizedAWSClientPool:
    """Connection pooling and optimized AWS client management with circuit breaker patterns"""

    def __init__(self, max_pool_connections: int = 100):
        self.max_pool_connections = max_pool_connections
        self.clients: Dict[str, BaseClient] = {}
        self.sessions: Dict[str, boto3.Session] = {}
        self.circuit_breakers: Dict[str, CircuitBreaker] = {}
        self._lock = threading.RLock()

        # Optimized botocore configuration with enhanced retry logic
        self.config = Config(
            max_pool_connections=max_pool_connections,
            retries={"max_attempts": 3, "mode": "adaptive"},
            tcp_keepalive=True,
            region_name="us-east-1",  # Fallback region for global services
            read_timeout=30,  # 30-second read timeout
            connect_timeout=10,  # 10-second connection timeout
        )

    def get_client(self, service: str, profile: str, region: str = "us-east-1") -> BaseClient:
        """Get an optimized AWS client with connection pooling and a per-service circuit breaker"""
        client_key = f"{service}_{profile}_{region}"

        with self._lock:
            if client_key not in self.clients:
                # Create a circuit breaker for this service/profile/region combination
                if client_key not in self.circuit_breakers:
                    self.circuit_breakers[client_key] = CircuitBreaker(
                        failure_threshold=3,  # Open after 3 failures
                        recovery_timeout_seconds=30,  # Attempt recovery after 30s
                        success_threshold=2,  # Close after 2 successes
                    )

                # Create the session if it does not exist (sessions are keyed by
                # profile only, matching get_session; the region is supplied at
                # client creation)
                if profile not in self.sessions:
                    self.sessions[profile] = boto3.Session(profile_name=profile)

                # Create the client with the optimized config
                self.clients[client_key] = self.sessions[profile].client(
                    service,
                    config=self.config,
                    region_name=region,
                )

            return self.clients[client_key]

    def protected_api_call(self, client_key: str, api_call: Callable, *args, **kwargs):
        """
        Execute an AWS API call with circuit breaker protection

        Args:
            client_key: Client identifier for circuit breaker tracking
            api_call: AWS API method to call
            *args, **kwargs: Arguments for the API call

        Returns:
            The API call result, guarded by the circuit breaker
        """
        if client_key not in self.circuit_breakers:
            self.circuit_breakers[client_key] = CircuitBreaker()

        with self.circuit_breakers[client_key].protected_call(f"aws_{client_key}"):
            return api_call(*args, **kwargs)

    def get_reliability_status(self) -> Dict[str, Any]:
        """Get reliability status for all circuit breakers"""
        status = {}
        for client_key, breaker in self.circuit_breakers.items():
            status[client_key] = breaker.get_state_info()

        # Calculate overall reliability metrics
        total_requests = sum(breaker.state.total_requests for breaker in self.circuit_breakers.values())
        total_failures = sum(breaker.state.failure_count for breaker in self.circuit_breakers.values())

        overall_success_rate = (
            ((total_requests - total_failures) / total_requests * 100) if total_requests > 0 else 100.0
        )

        return {
            "circuit_breakers": status,
            "overall_success_rate": overall_success_rate,
            "total_requests": total_requests,
            "total_failures": total_failures,
            "target_success_rate": 99.9,
            "reliability_status": (
                "excellent" if overall_success_rate >= 99.9
                else "good" if overall_success_rate >= 95.0
                else "needs_improvement"
            ),
        }

    def get_session(self, profile: str) -> boto3.Session:
        """Get a boto3 session, cached per profile"""
        with self._lock:
            if profile not in self.sessions:
                self.sessions[profile] = boto3.Session(profile_name=profile)
            return self.sessions[profile]

    def clear_pool(self):
        """Clear all cached clients and sessions"""
        with self._lock:
            self.clients.clear()
            self.sessions.clear()


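# Pool usage sketch (the profile name is hypothetical): the pool hands back one
# client per service/profile/region key, and protected_api_call routes the call
# through that key's circuit breaker.
#
#     pool = OptimizedAWSClientPool(max_pool_connections=50)
#     ec2 = pool.get_client("ec2", profile="ops", region="us-east-1")
#     vpcs = pool.protected_api_call("ec2_ops_us-east-1", ec2.describe_vpcs)
#     pool.get_reliability_status()["overall_success_rate"]

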
class PerformanceOptimizationEngine:
    """
    Enterprise performance optimization engine for CloudOps-Runbooks

    Implements SRE automation patterns for:
    - Organization discovery optimization (52.3s → <30s)
    - VPC analysis performance improvements
    - Memory usage optimization for large-scale operations
    - Intelligent caching and connection pooling
    """

    def __init__(
        self,
        max_workers: int = 20,
        cache_ttl_minutes: int = 30,
        memory_limit_mb: int = 512,  # Phase 2: reduced from 2048MB to the 512MB target
    ):
        """
        Initialize performance optimization engine

        Args:
            max_workers: Maximum concurrent workers for parallel operations
            cache_ttl_minutes: Cache TTL in minutes
            memory_limit_mb: Memory usage limit in MB (Phase 2 target: <500MB)
        """
        self.max_workers = max_workers
        self.memory_limit_mb = memory_limit_mb

        # Core optimization components
        self.cache = IntelligentCache(
            default_ttl_minutes=cache_ttl_minutes,
            max_cache_size=500,  # Phase 2: reduced cache size for memory optimization
        )
        self.client_pool = OptimizedAWSClientPool(max_pool_connections=50)

        # Performance tracking
        self.metrics: List[OptimizationMetrics] = []
        self.current_operation: Optional[OptimizationMetrics] = None

        # Phase 2: Enhanced memory monitoring
        self.process = psutil.Process()
        self.memory_monitoring_active = False
        self.memory_optimization_active = True

        # Phase 2: Multi-account scaling configuration
        self.enterprise_scaling_enabled = True
        self.adaptive_batch_sizing = True
        self.auto_memory_cleanup = True

    @contextmanager
    def optimize_operation(self, operation_name: str, target_seconds: float = 30.0):
        """
        Context manager for optimized operation execution with monitoring

        Args:
            operation_name: Name of the operation being optimized
            target_seconds: Target completion time in seconds
        """
        # Start operation metrics tracking
        metrics = OptimizationMetrics(
            operation_name=operation_name,
            target_seconds=target_seconds,
        )
        self.current_operation = metrics

        # Start memory monitoring
        self._start_memory_monitoring()

        # Enhanced progress indicator for long operations
        with Status(f"[cyan]🚀 Optimizing: {operation_name}[/cyan]", console=console):
            try:
                console.log(f"[dim]Starting optimized {operation_name} (target: {target_seconds}s)[/]")

                yield metrics

                # Mark as successful
                metrics.finish(success=True)
                self._log_optimization_results(metrics)

            except Exception as e:
                # Handle failure
                metrics.finish(success=False, error_message=str(e))
                print_error(f"Optimization failed for {operation_name}", e)
                raise

            finally:
                # Stop monitoring and store results
                self._stop_memory_monitoring()
                self.metrics.append(metrics)
                self.current_operation = None

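    # Usage sketch for the context manager (the operation body is hypothetical):
    # callers wrap slow work and may record optimizations on the yielded metrics.
    #
    #     engine = PerformanceOptimizationEngine(max_workers=20)
    #     with engine.optimize_operation("organization_discovery", target_seconds=30.0) as m:
    #         accounts = engine.optimize_organization_discovery("mgmt-profile")()
    #         m.optimization_applied.append("intelligent_caching")
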
    def _start_memory_monitoring(self):
        """Start background memory usage monitoring with Phase 2 aggressive optimization"""
        self.memory_monitoring_active = True

        def monitor_memory():
            peak_memory = 0.0
            cleanup_counter = 0

            while self.memory_monitoring_active and self.current_operation:
                try:
                    current_memory = self.process.memory_info().rss / (1024 * 1024)  # MB
                    peak_memory = max(peak_memory, current_memory)
                    self.current_operation.memory_peak_mb = peak_memory

                    # Phase 2: Aggressive memory management at the 80%/90% thresholds
                    memory_threshold_80 = self.memory_limit_mb * 0.8
                    memory_threshold_90 = self.memory_limit_mb * 0.9

                    if current_memory > memory_threshold_90:
                        console.log(f"[red]🚨 CRITICAL: Memory usage ({current_memory:.1f}MB) at 90% limit ({self.memory_limit_mb}MB)[/red]")
                        if self.auto_memory_cleanup:
                            self._aggressive_memory_cleanup()

                    elif current_memory > memory_threshold_80:
                        console.log(f"[yellow]⚠️ WARNING: Memory usage ({current_memory:.1f}MB) at 80% limit ({self.memory_limit_mb}MB)[/yellow]")
                        if self.auto_memory_cleanup and cleanup_counter % 5 == 0:  # Roughly every 5 seconds at 80%
                            self._proactive_memory_cleanup()

                    # Phase 2: Routine garbage collection roughly every 10 seconds
                    cleanup_counter += 1
                    if self.auto_memory_cleanup and cleanup_counter % 10 == 0:
                        gc.collect()

                    time.sleep(1)  # Check every second
                except Exception:
                    break

        self.memory_thread = threading.Thread(target=monitor_memory, daemon=True)
        self.memory_thread.start()

    def _proactive_memory_cleanup(self):
        """Proactive memory cleanup at the 80% threshold"""
        console.log("[dim]🧹 Proactive memory cleanup initiated[/dim]")

        # Clear the oldest cache entries
        self.cache._cleanup_oldest_entries(int(self.cache.max_cache_size * 0.1))  # Clear 10%

        # Force garbage collection
        collected = gc.collect()
        if collected > 0:
            console.log(f"[dim]🗑️ Collected {collected} objects[/dim]")

    def _aggressive_memory_cleanup(self):
        """Aggressive memory cleanup at the 90% threshold"""
        console.log("[red]🚨 Aggressive memory cleanup initiated[/red]")

        # Clear a significant share of cache entries
        self.cache._cleanup_oldest_entries(int(self.cache.max_cache_size * 0.3))  # Clear 30%

        # Multiple GC passes, one per generation
        total_collected = 0
        for generation in range(3):
            total_collected += gc.collect(generation)

        console.log(f"[yellow]🗑️ Emergency cleanup collected {total_collected} objects[/yellow]")

        # Record the optimization on the active operation
        if self.current_operation:
            self.current_operation.optimization_applied.append("aggressive_memory_cleanup")

    def _stop_memory_monitoring(self):
        """Stop memory monitoring"""
        self.memory_monitoring_active = False

    def _log_optimization_results(self, metrics: OptimizationMetrics):
        """Log optimization results with rich formatting"""
        improvement = metrics.get_performance_improvement()
        cache_efficiency = metrics.get_cache_efficiency()

        if metrics.success:
            if metrics.duration_seconds <= metrics.target_seconds:
                print_success(
                    f"{metrics.operation_name} optimized: {metrics.duration_seconds:.1f}s "
                    f"({improvement:+.1f}% vs target)"
                )
            else:
                print_warning(
                    f"{metrics.operation_name} completed in {metrics.duration_seconds:.1f}s "
                    f"(target: {metrics.target_seconds:.1f}s)"
                )

        # Log optimization details
        if metrics.optimization_applied:
            console.log(f"[dim]Optimizations applied: {', '.join(metrics.optimization_applied)}[/]")

        if cache_efficiency > 0:
            console.log(f"[dim]Cache efficiency: {cache_efficiency:.1f}% ({metrics.cache_hits} hits, {metrics.cache_misses} misses)[/]")

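    # Worked threshold example (assuming the default memory_limit_mb=512): the
    # monitor warns and runs _proactive_memory_cleanup above 0.8 * 512 = 409.6MB,
    # escalates to _aggressive_memory_cleanup above 0.9 * 512 = 460.8MB, and
    # samples memory roughly once per second while an operation is active.
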
    def optimize_organization_discovery(
        self,
        management_profile: str,
        use_parallel_processing: bool = True,
        batch_size: int = 20,
    ) -> Callable:
        """
        Optimize organization discovery operations

        Addresses: Organization Discovery Performance (52.3s → <30s target)

        Returns an optimized function with:
        - Intelligent caching for Organizations API calls
        - Parallel account processing
        - Memory-efficient batch processing
        - Connection pooling
        """

        def optimized_discover_accounts():
            """Optimized account discovery with caching and parallel processing"""
            cache_key = f"org_accounts_{management_profile}"

            # Check the cache first; return a hit even when no operation is
            # being tracked (the original guard skipped the cache in that case)
            cached_result = self.cache.get(cache_key, ttl_minutes=15)  # Shorter TTL for critical data
            if cached_result:
                if self.current_operation:
                    self.current_operation.cache_hits += 1
                    self.current_operation.optimization_applied.append("intelligent_caching")
                console.log("[blue]🚀 Using cached organization data for optimal performance[/blue]")
                return cached_result

            if self.current_operation:
                self.current_operation.cache_misses += 1

            # Perform optimized discovery
            try:
                # Get an optimized Organizations client
                org_client = self.client_pool.get_client("organizations", management_profile)

                accounts = []
                paginator = org_client.get_paginator("list_accounts")

                # Track API calls made in the sequential path
                api_calls = 0

                # Use parallel processing for account details if enabled
                if use_parallel_processing:
                    if self.current_operation:
                        self.current_operation.optimization_applied.append("parallel_processing")

                    accounts = self._process_accounts_parallel(paginator, org_client, batch_size)
                else:
                    # Sequential processing (fallback)
                    for page in paginator.paginate():
                        accounts.extend(page["Accounts"])
                        api_calls += 1

                        # Trigger garbage collection periodically for memory efficiency
                        if api_calls % 10 == 0:
                            gc.collect()

                if self.current_operation:
                    # Accumulate rather than overwrite: the parallel path has
                    # already recorded its own API call estimate
                    self.current_operation.api_calls_made += api_calls
                    self.current_operation.optimization_applied.append("connection_pooling")

                # Cache the result
                result = {
                    "accounts": accounts,
                    "total_count": len(accounts),
                    "discovery_method": "optimized_organizations_api",
                    "optimizations_applied": self.current_operation.optimization_applied if self.current_operation else [],
                }

                self.cache.set(cache_key, result)

                return result

            except Exception as e:
                logger.error(f"Optimized organization discovery failed: {e}")
                raise

        return optimized_discover_accounts

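    # Factory usage sketch (the profile name is hypothetical): the method returns
    # a closure, so the expensive discovery can be deferred or retried by callers.
    #
    #     discover = engine.optimize_organization_discovery("mgmt-profile", batch_size=20)
    #     result = discover()   # first call hits the Organizations API
    #     result = discover()   # repeat calls within 15 minutes hit the cache
    #     result["total_count"], result["discovery_method"]
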
    def _process_accounts_parallel(self, paginator, org_client, batch_size: int) -> List[Dict]:
        """Process accounts in parallel with memory optimization"""
        all_accounts = []

        # Collect all account IDs first (memory efficient)
        account_ids = []
        for page in paginator.paginate():
            account_ids.extend([acc["Id"] for acc in page["Accounts"]])
            all_accounts.extend(page["Accounts"])  # Store basic account info

        if self.current_operation:
            self.current_operation.api_calls_made += len(account_ids) // 100 + 1  # Estimate pages

        # Process account tags in batches to avoid memory issues
        if len(account_ids) > batch_size:
            if self.current_operation:
                self.current_operation.optimization_applied.append("batch_processing")

            self._enrich_accounts_with_tags_batched(all_accounts, org_client, batch_size)

        return all_accounts

    def _enrich_accounts_with_tags_batched(self, accounts: List[Dict], org_client, batch_size: int):
        """Enrich accounts with tags using batched processing"""
        with ThreadPoolExecutor(max_workers=min(self.max_workers, 10)) as executor:
            # Process in batches to control memory usage
            for i in range(0, len(accounts), batch_size):
                batch = accounts[i:i + batch_size]

                # Submit the batch for parallel tag processing
                futures = []
                for account in batch:
                    future = executor.submit(self._get_account_tags_safe, org_client, account["Id"])
                    futures.append((future, account))

                # Collect results for this batch
                for future, account in futures:
                    try:
                        tags = future.result(timeout=10)  # 10-second timeout per account
                        account["Tags"] = tags
                        if self.current_operation:
                            self.current_operation.api_calls_made += 1
                    except Exception as e:
                        logger.debug(f"Failed to get tags for account {account['Id']}: {e}")
                        account["Tags"] = {}

                # Trigger garbage collection after each batch
                gc.collect()

    def _get_account_tags_safe(self, org_client, account_id: str) -> Dict[str, str]:
        """Safely get account tags with error handling"""
        try:
            response = org_client.list_tags_for_resource(ResourceId=account_id)
            return {tag["Key"]: tag["Value"] for tag in response["Tags"]}
        except Exception:
            return {}

    def optimize_vpc_analysis(self, operational_profile: str) -> Callable:
        """
        Optimize VPC analysis operations to address timeout issues

        Returns an optimized function with:
        - Connection pooling for multiple regions
        - Parallel region processing
        - Intelligent timeout handling
        - Memory-efficient resource processing
        """

        def optimized_vpc_analysis(regions: Optional[List[str]] = None):
            """Optimized VPC analysis with regional parallelization"""
            if regions is None:
                regions = [
                    "us-east-1", "us-west-2", "eu-west-1", "eu-central-1",
                    "ap-southeast-1", "ap-northeast-1",
                ]

            cache_key = f"vpc_analysis_{operational_profile}_{'_'.join(sorted(regions))}"

            # Check the cache; return a hit even when no operation is tracked
            cached_result = self.cache.get(cache_key, ttl_minutes=60)  # Longer TTL for VPC data
            if cached_result:
                if self.current_operation:
                    self.current_operation.cache_hits += 1
                    self.current_operation.optimization_applied.append("regional_caching")
                return cached_result

            if self.current_operation:
                self.current_operation.cache_misses += 1
                self.current_operation.optimization_applied.append("parallel_regional_processing")

            # Parallel regional analysis
            vpc_data = {}

            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                MofNCompleteColumn(),
                TimeElapsedColumn(),
                console=console,
            ) as progress:

                task = progress.add_task("Analyzing VPCs across regions...", total=len(regions))

                with ThreadPoolExecutor(max_workers=min(self.max_workers, len(regions))) as executor:
                    # Submit region analysis tasks
                    future_to_region = {
                        executor.submit(self._analyze_vpc_region, operational_profile, region): region
                        for region in regions
                    }

                    for future in as_completed(future_to_region):
                        region = future_to_region[future]
                        try:
                            region_data = future.result(timeout=45)  # 45s timeout per region
                            vpc_data[region] = region_data

                            if self.current_operation:
                                self.current_operation.api_calls_made += region_data.get("api_calls", 0)

                        except Exception as e:
                            logger.warning(f"VPC analysis failed for region {region}: {e}")
                            vpc_data[region] = {"error": str(e), "vpcs": []}

                        finally:
                            progress.advance(task)

            # Aggregate results
            result = {
                "vpc_data_by_region": vpc_data,
                "total_vpcs": sum(len(data.get("vpcs", [])) for data in vpc_data.values()),
                "regions_analyzed": len(regions),
                "optimization_applied": self.current_operation.optimization_applied if self.current_operation else [],
            }

            # Cache the result
            self.cache.set(cache_key, result)

            return result

        return optimized_vpc_analysis

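    # Usage sketch (profile and region list are hypothetical): regions fan out
    # across the thread pool, and per-region failures degrade gracefully into an
    # "error" entry instead of failing the whole analysis.
    #
    #     analyze = engine.optimize_vpc_analysis("ops-profile")
    #     report = analyze(regions=["us-east-1", "eu-west-1"])
    #     report["total_vpcs"], report["vpc_data_by_region"]["eu-west-1"]
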
    def optimize_multi_account_operations(
        self,
        account_list: List[str],
        operation_function: Callable,
        batch_size: Optional[int] = None,
    ) -> Callable:
        """
        Phase 2: Optimize multi-account operations for 200+ enterprise account scaling

        Args:
            account_list: List of AWS account IDs to process
            operation_function: Function to execute per account
            batch_size: Adaptive batch size (auto-calculated if None)

        Returns:
            Optimized function with enterprise scaling patterns
        """

        def optimized_multi_account_operation(**kwargs):
            """Optimized multi-account operation with adaptive scaling"""
            account_count = len(account_list)

            # Phase 2: Adaptive batch sizing based on account count and memory
            if batch_size is None:
                if account_count <= 50:
                    calculated_batch_size = 10
                elif account_count <= 100:
                    calculated_batch_size = 15
                elif account_count <= 200:
                    calculated_batch_size = 20
                else:
                    calculated_batch_size = 25  # Enterprise scale, 200+ accounts
            else:
                calculated_batch_size = batch_size

            # Shrink the batch size under memory pressure
            if self.memory_optimization_active:
                current_memory = self.process.memory_info().rss / (1024 * 1024)
                memory_utilization = current_memory / self.memory_limit_mb

                if memory_utilization > 0.7:
                    calculated_batch_size = max(5, calculated_batch_size // 2)
                    console.log(f"[yellow]📉 Reducing batch size to {calculated_batch_size} due to memory pressure[/yellow]")

            console.log(f"[cyan]🏢 Enterprise multi-account operation: {account_count} accounts, batch size: {calculated_batch_size}[/cyan]")

            if self.current_operation:
                self.current_operation.optimization_applied.extend([
                    "enterprise_multi_account_scaling",
                    "adaptive_batch_sizing",
                    f"batch_size_{calculated_batch_size}",
                ])

            results = {}
            processed_count = 0

            # Process accounts in adaptive batches
            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                MofNCompleteColumn(),
                TimeElapsedColumn(),
                console=console,
            ) as progress:

                task = progress.add_task("Processing enterprise accounts...", total=account_count)

                # Process in batches with circuit breaker protection
                for i in range(0, account_count, calculated_batch_size):
                    batch_accounts = account_list[i:i + calculated_batch_size]

                    with ThreadPoolExecutor(max_workers=min(self.max_workers, len(batch_accounts))) as executor:
                        batch_futures = {}

                        for account_id in batch_accounts:
                            # Use circuit breaker protection for each account
                            client_key = f"account_{account_id}"

                            try:
                                future = executor.submit(
                                    self._protected_account_operation,
                                    client_key,
                                    operation_function,
                                    account_id,
                                    **kwargs,
                                )
                                batch_futures[future] = account_id

                            except Exception as e:
                                logger.warning(f"Failed to submit operation for account {account_id}: {e}")
                                results[account_id] = {"error": str(e), "success": False}

                        # Collect batch results with timeout handling
                        for future in as_completed(batch_futures, timeout=120):  # 2-minute timeout per batch
                            account_id = batch_futures[future]
                            try:
                                result = future.result(timeout=60)  # 1 minute per account
                                results[account_id] = result

                            except Exception as e:
                                logger.warning(f"Account operation failed for {account_id}: {e}")
                                results[account_id] = {"error": str(e), "success": False}

                            finally:
                                processed_count += 1
                                progress.advance(task)

                    # Phase 2: Proactive memory cleanup between batches
                    if self.auto_memory_cleanup and i > 0:
                        current_memory = self.process.memory_info().rss / (1024 * 1024)
                        if current_memory > self.memory_limit_mb * 0.6:
                            self._proactive_memory_cleanup()
                            time.sleep(1)  # Brief pause after cleanup

            # Compute success metrics outside the metrics guard, so the summary
            # below never references an undefined success_rate
            success_count = sum(1 for r in results.values() if isinstance(r, dict) and r.get("success", False))
            success_rate = (success_count / processed_count * 100) if processed_count > 0 else 0.0

            console.log(f"[green]✅ Multi-account operation completed: {success_count}/{processed_count} accounts ({success_rate:.1f}% success)[/green]")

            # Update operation metrics
            if self.current_operation:
                self.current_operation.api_calls_made += processed_count
                if success_rate >= 99.0:
                    self.current_operation.optimization_applied.append("high_reliability_achieved")

            return {
                "results": results,
                "total_accounts": account_count,
                "processed_accounts": processed_count,
                "success_rate": success_rate,
                "batch_size_used": calculated_batch_size,
                "optimization_summary": {
                    "enterprise_scaling": True,
                    "adaptive_batching": True,
                    "memory_optimized": self.memory_optimization_active,
                    "reliability_protected": True,
                },
            }

        return optimized_multi_account_operation

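    # Multi-account usage sketch: the per-account callable below is hypothetical;
    # the engine only requires that it accept account_id (plus passthrough kwargs)
    # and return a dict carrying a "success" flag.
    #
    #     def tag_audit(account_id: str, **kwargs):
    #         return {"success": True, "account": account_id}
    #
    #     run = engine.optimize_multi_account_operations(account_ids, tag_audit)
    #     summary = run()
    #     summary["success_rate"], summary["batch_size_used"]
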
    def _protected_account_operation(self, client_key: str, operation_function: Callable, account_id: str, **kwargs):
        """Execute an account operation with circuit breaker protection"""
        # Create or fetch the circuit breaker for this account
        if client_key not in self.client_pool.circuit_breakers:
            self.client_pool.circuit_breakers[client_key] = CircuitBreaker(
                failure_threshold=2,  # More aggressive for account-level operations
                recovery_timeout_seconds=15,  # Faster recovery for account operations
                success_threshold=1,  # Close quickly on success
            )

        with self.client_pool.circuit_breakers[client_key].protected_call(f"account_{account_id}"):
            return operation_function(account_id=account_id, **kwargs)

    def _analyze_vpc_region(self, profile: str, region: str) -> Dict:
        """Analyze VPCs in a specific region with optimization"""
        try:
            ec2_client = self.client_pool.get_client("ec2", profile, region)

            # Get VPCs with pagination
            vpcs = []
            api_calls = 0

            paginator = ec2_client.get_paginator("describe_vpcs")
            for page in paginator.paginate():
                vpcs.extend(page["Vpcs"])
                api_calls += 1

            # Enrich with network details (optimized)
            for vpc in vpcs:
                # Get the subnets for this VPC
                try:
                    subnets_response = ec2_client.describe_subnets(
                        Filters=[{"Name": "vpc-id", "Values": [vpc["VpcId"]]}]
                    )
                    vpc["Subnets"] = subnets_response["Subnets"]
                    api_calls += 1
                except Exception as e:
                    logger.debug(f"Failed to get subnets for VPC {vpc['VpcId']}: {e}")
                    vpc["Subnets"] = []

            return {
                "vpcs": vpcs,
                "region": region,
                "api_calls": api_calls,
            }

        except Exception as e:
            logger.error(f"VPC region analysis failed for {region}: {e}")
            return {"vpcs": [], "region": region, "error": str(e), "api_calls": 0}

    def create_optimization_summary(self) -> None:
        """Create a comprehensive optimization performance summary with Phase 2 reliability metrics"""
        if not self.metrics:
            console.print("[yellow]No optimization metrics available yet[/]")
            return

        print_header("Performance Optimization Summary - Phase 2 Enhanced", "SRE Automation Engine")

        # Phase 2: Enhanced metrics table with reliability information
        table = create_table(
            title="Phase 2 Optimization Results",
            columns=[
                {"name": "Operation", "style": "cyan", "justify": "left"},
                {"name": "Duration", "style": "white", "justify": "right"},
                {"name": "Target", "style": "white", "justify": "right"},
                {"name": "Memory", "style": "blue", "justify": "right"},
                {"name": "Improvement", "style": "white", "justify": "right"},
                {"name": "Optimizations", "style": "dim", "justify": "left", "max_width": 25},
                {"name": "Status", "style": "white", "justify": "center"},
            ],
        )

        for metrics in self.metrics:
            improvement = metrics.get_performance_improvement()
            status_icon = STATUS_INDICATORS["success"] if metrics.success else STATUS_INDICATORS["error"]
            status_color = "green" if metrics.success else "red"

            improvement_text = f"+{improvement:.1f}%" if improvement > 0 else f"{improvement:.1f}%"
            improvement_color = "green" if improvement > 0 else "yellow"

            # Phase 2: Memory usage display with color coding
            memory_mb = metrics.memory_peak_mb
            memory_color = "green" if memory_mb <= 256 else "yellow" if memory_mb <= 512 else "red"
            memory_text = f"[{memory_color}]{memory_mb:.0f}MB[/{memory_color}]"

            table.add_row(
                metrics.operation_name,
                f"{metrics.duration_seconds:.1f}s",
                f"{metrics.target_seconds:.1f}s",
                memory_text,
                f"[{improvement_color}]{improvement_text}[/]",
                ", ".join(metrics.optimization_applied[:2]) + ("..." if len(metrics.optimization_applied) > 2 else ""),
                f"[{status_color}]{status_icon}[/]",
            )

        console.print(table)

        # Cache statistics
        cache_stats = self.cache.get_stats()
        cache_panel = Panel(
            f"[cyan]Cache Size:[/] {cache_stats['size']}/{cache_stats['max_size']}\n"
            f"[cyan]Hit Rate:[/] {cache_stats['hit_rate']:.1f}% ({cache_stats['hits']} hits, {cache_stats['misses']} misses)",
            title="[bold]Cache Performance[/bold]",
            border_style="blue",
        )
        console.print(cache_panel)

        # Phase 2: Reliability status panel
        reliability_stats = self.client_pool.get_reliability_status()
        reliability_color = {
            "excellent": "green",
            "good": "blue",
            "needs_improvement": "yellow",
        }.get(reliability_stats.get("reliability_status", "good"), "white")

        reliability_panel = Panel(
            f"[cyan]Success Rate:[/] [{reliability_color}]{reliability_stats['overall_success_rate']:.2f}%[/{reliability_color}] "
            f"(Target: {reliability_stats['target_success_rate']}%)\n"
            f"[cyan]Total Requests:[/] {reliability_stats['total_requests']:,} "
            f"([red]Failures:[/] {reliability_stats['total_failures']})\n"
            f"[cyan]Circuit Breakers:[/] {len(reliability_stats['circuit_breakers'])} active "
            f"([cyan]Status:[/] [{reliability_color}]{reliability_stats['reliability_status'].title()}[/{reliability_color}])",
            title="[bold]Phase 2 Reliability Metrics[/bold]",
            border_style=reliability_color,
        )
        console.print(reliability_panel)

        # Phase 2: Memory optimization status
        memory_report = self.get_memory_usage_report()
        memory_color = "green" if memory_report["current_memory_mb"] <= 256 else "yellow" if memory_report["current_memory_mb"] <= 512 else "red"

        memory_panel = Panel(
            f"[cyan]Current Memory:[/] [{memory_color}]{memory_report['current_memory_mb']:.1f}MB[/{memory_color}] / {self.memory_limit_mb}MB\n"
            f"[cyan]Peak Memory:[/] {memory_report.get('peak_memory_mb', 0):.1f}MB\n"
            f"[cyan]Status:[/] [{memory_color}]{memory_report['memory_efficiency'].title()}[/{memory_color}] "
            f"([cyan]Cleanup:[/] {'Enabled' if self.auto_memory_cleanup else 'Disabled'})",
            title="[bold]Phase 2 Memory Optimization[/bold]",
            border_style=memory_color,
        )
        console.print(memory_panel)

    def get_memory_usage_report(self) -> Dict[str, Any]:
        """Get the current memory usage report"""
        memory_info = self.process.memory_info()

        return {
            "current_memory_mb": memory_info.rss / (1024 * 1024),
            "peak_memory_mb": max(m.memory_peak_mb for m in self.metrics) if self.metrics else 0.0,
            "memory_limit_mb": self.memory_limit_mb,
            "memory_efficiency": "good" if memory_info.rss / (1024 * 1024) < self.memory_limit_mb * 0.8 else "warning",
        }

    def clear_caches(self):
        """Clear all optimization caches"""
        self.cache.clear()
        self.client_pool.clear_pool()
        console.print("[green]✅ Optimization caches cleared[/]")


# Global optimization engine instance
_optimization_engine: Optional[PerformanceOptimizationEngine] = None


def get_optimization_engine(
    max_workers: int = 20,
    cache_ttl_minutes: int = 30,
    memory_limit_mb: int = 512,  # Phase 2: default 512MB
) -> PerformanceOptimizationEngine:
    """Get or create the global performance optimization engine with Phase 2 enhancements"""
    global _optimization_engine
    if _optimization_engine is None:
        _optimization_engine = PerformanceOptimizationEngine(
            max_workers=max_workers,
            cache_ttl_minutes=cache_ttl_minutes,
            memory_limit_mb=memory_limit_mb,
        )
    return _optimization_engine


def create_optimization_report():
    """Create the optimization performance report, if an engine exists"""
    if _optimization_engine:
        _optimization_engine.create_optimization_summary()
    else:
        console.print("[yellow]No optimization engine initialized[/]")


# Export public interface - Phase 2 Enhanced
__all__ = [
    "PerformanceOptimizationEngine",
    "OptimizationMetrics",
    "IntelligentCache",
    "OptimizedAWSClientPool",
    "CircuitBreaker",
    "CircuitBreakerState",
    "get_optimization_engine",
    "create_optimization_report",
]
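
Taken together, a typical caller would go through the module-level accessor rather than constructing the engine directly. A minimal end-to-end sketch under that assumption (the profile name is hypothetical; error handling omitted):

    from runbooks.common.performance_optimization_engine import (
        get_optimization_engine,
        create_optimization_report,
    )

    engine = get_optimization_engine(max_workers=20, memory_limit_mb=512)
    with engine.optimize_operation("organization_discovery", target_seconds=30.0):
        discover = engine.optimize_organization_discovery("mgmt-profile")
        accounts = discover()
    create_optimization_report()  # renders the Phase 2 tables and panels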