claude-mpm 4.0.9__py3-none-any.whl → 4.0.11__py3-none-any.whl
This diff shows the changes between these publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- claude_mpm/BUILD_NUMBER +1 -0
- claude_mpm/VERSION +1 -1
- claude_mpm/core/framework_loader.py +36 -20
- claude_mpm/core/interactive_session.py +2 -2
- claude_mpm/dashboard/static/js/socket-client.js +270 -3
- claude_mpm/hooks/claude_hooks/hook_handler.py +3 -1
- claude_mpm/hooks/claude_hooks/hook_wrapper.sh +3 -3
- claude_mpm/services/agents/memory/agent_memory_manager.py +1 -1
- claude_mpm/services/socketio/handlers/connection.py +284 -1
- claude_mpm/services/socketio/server/broadcaster.py +276 -10
- claude_mpm/services/socketio/server/main.py +15 -1
- claude_mpm/services/version_service.py +18 -11
- {claude_mpm-4.0.9.dist-info → claude_mpm-4.0.11.dist-info}/METADATA +1 -1
- {claude_mpm-4.0.9.dist-info → claude_mpm-4.0.11.dist-info}/RECORD +18 -17
- {claude_mpm-4.0.9.dist-info → claude_mpm-4.0.11.dist-info}/WHEEL +0 -0
- {claude_mpm-4.0.9.dist-info → claude_mpm-4.0.11.dist-info}/entry_points.txt +0 -0
- {claude_mpm-4.0.9.dist-info → claude_mpm-4.0.11.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-4.0.9.dist-info → claude_mpm-4.0.11.dist-info}/top_level.txt +0 -0
claude_mpm/services/socketio/handlers/connection.py

@@ -5,13 +5,87 @@ disconnect, status requests, and history management. Separating these
 from other handlers makes connection management more maintainable.
 """
 
+import asyncio
+import functools
+import time
 from datetime import datetime
-from typing import Any, Dict, List, Optional, Set
+from typing import Any, Callable, Dict, List, Optional, Set
 
 from ....core.typing_utils import ClaudeStatus, EventData, SocketId
 from .base import BaseEventHandler
 
 
+def timeout_handler(timeout_seconds: float = 5.0):
+    """Decorator to add timeout protection to async handlers.
+
+    WHY: Network operations can hang indefinitely, causing resource leaks
+    and poor user experience. This decorator ensures handlers complete
+    within a reasonable time or fail gracefully.
+
+    Args:
+        timeout_seconds: Maximum time allowed for handler execution (default: 5s)
+    """
+    def decorator(func: Callable) -> Callable:
+        @functools.wraps(func)
+        async def wrapper(self, *args, **kwargs):
+            handler_name = func.__name__
+            start_time = time.time()
+
+            try:
+                # Create a task with timeout
+                result = await asyncio.wait_for(
+                    func(self, *args, **kwargs),
+                    timeout=timeout_seconds
+                )
+
+                elapsed = time.time() - start_time
+                if elapsed > timeout_seconds * 0.8:  # Warn if close to timeout
+                    self.logger.warning(
+                        f"⚠️ Handler {handler_name} took {elapsed:.2f}s "
+                        f"(close to {timeout_seconds}s timeout)"
+                    )
+
+                return result
+
+            except asyncio.TimeoutError:
+                elapsed = time.time() - start_time
+                self.logger.error(
+                    f"❌ Handler {handler_name} timed out after {elapsed:.2f}s"
+                )
+
+                # Try to send error response to client if we have their sid
+                if args and isinstance(args[0], str):  # First arg is usually sid
+                    sid = args[0]
+                    try:
+                        # Use a short timeout for error response
+                        await asyncio.wait_for(
+                            self.emit_to_client(
+                                sid,
+                                "error",
+                                {
+                                    "message": f"Handler {handler_name} timed out",
+                                    "handler": handler_name,
+                                    "timeout": timeout_seconds
+                                }
+                            ),
+                            timeout=1.0
+                        )
+                    except:
+                        pass  # Best effort error notification
+
+                return None
+
+            except Exception as e:
+                elapsed = time.time() - start_time
+                self.logger.error(
+                    f"❌ Handler {handler_name} failed after {elapsed:.2f}s: {e}"
+                )
+                raise
+
+        return wrapper
+    return decorator
+
+
 class ConnectionEventHandler(BaseEventHandler):
     """Handles Socket.IO connection lifecycle events.
 
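The decorator is written for bound async handler methods that expose `self.logger` and `self.emit_to_client`; on timeout it logs, best-effort notifies the client, and returns `None`. A minimal sketch of that behavior in isolation — the `DemoHandler` class below is hypothetical, only `timeout_handler` and its import path come from this release:

```python
import asyncio
import logging

# Import path assumed from the wheel layout shown in the file list above.
from claude_mpm.services.socketio.handlers.connection import timeout_handler


class DemoHandler:
    """Hypothetical stand-in with the attributes the wrapper relies on."""

    def __init__(self):
        self.logger = logging.getLogger("demo")

    async def emit_to_client(self, sid, event, data):
        # Stand-in for the real emit helper; just record the error payload.
        self.logger.info("emit %s to %s: %s", event, sid, data)

    @timeout_handler(timeout_seconds=0.5)
    async def slow_handler(self, sid):
        await asyncio.sleep(2)  # Exceeds the 0.5s budget
        return "never reached"


async def main():
    logging.basicConfig(level=logging.INFO)
    result = await DemoHandler().slow_handler("client-sid-123")
    print(result)  # None: the wrapper logged the timeout and notified the client


asyncio.run(main())
```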
@@ -19,11 +93,189 @@ class ConnectionEventHandler(BaseEventHandler):
     that deserves its own focused handler. This includes client connections,
     disconnections, status updates, and event history management.
     """
+
+    def __init__(self, server):
+        """Initialize connection handler with health monitoring.
+
+        WHY: We need to track connection health metrics and implement
+        ping/pong mechanism for detecting stale connections.
+        """
+        super().__init__(server)
+
+        # Connection health tracking
+        self.connection_metrics = {}
+        self.last_ping_times = {}
+        self.ping_interval = 30  # seconds
+        self.ping_timeout = 10  # seconds
+        self.stale_check_interval = 60  # seconds
+
+        # Health monitoring tasks (will be started after event registration)
+        self.ping_task = None
+        self.stale_check_task = None
 
+    def _start_health_monitoring(self):
+        """Start background tasks for connection health monitoring.
+
+        WHY: We need to actively monitor connection health to detect
+        and clean up stale connections, ensuring reliable event delivery.
+        """
+        # Only start if we have a valid event loop and tasks aren't already running
+        if hasattr(self.server, 'core') and hasattr(self.server.core, 'loop'):
+            loop = self.server.core.loop
+            if loop and not loop.is_closed():
+                if not self.ping_task or self.ping_task.done():
+                    self.ping_task = asyncio.run_coroutine_threadsafe(
+                        self._periodic_ping(), loop
+                    )
+                    self.logger.info("🏓 Started connection ping monitoring")
+
+                if not self.stale_check_task or self.stale_check_task.done():
+                    self.stale_check_task = asyncio.run_coroutine_threadsafe(
+                        self._check_stale_connections(), loop
+                    )
+                    self.logger.info("🧹 Started stale connection checker")
+
+    def stop_health_monitoring(self):
+        """Stop health monitoring tasks.
+
+        WHY: Clean shutdown requires stopping background tasks to
+        prevent errors and resource leaks.
+        """
+        if self.ping_task and not self.ping_task.done():
+            self.ping_task.cancel()
+            self.logger.info("🚫 Stopped connection ping monitoring")
+
+        if self.stale_check_task and not self.stale_check_task.done():
+            self.stale_check_task.cancel()
+            self.logger.info("🚫 Stopped stale connection checker")
+
+    async def _periodic_ping(self):
+        """Send periodic pings to all connected clients.
+
+        WHY: WebSocket connections can silently fail. Regular pings
+        help detect dead connections and maintain connection state.
+        """
+        while True:
+            try:
+                await asyncio.sleep(self.ping_interval)
+
+                if not self.clients:
+                    continue
+
+                current_time = time.time()
+                disconnected = []
+
+                for sid in list(self.clients):
+                    try:
+                        # Send ping and record time
+                        await self.sio.emit('ping', {'timestamp': current_time}, room=sid)
+                        self.last_ping_times[sid] = current_time
+
+                        # Update connection metrics
+                        if sid not in self.connection_metrics:
+                            self.connection_metrics[sid] = {
+                                'connected_at': current_time,
+                                'reconnects': 0,
+                                'failures': 0,
+                                'last_activity': current_time
+                            }
+                        self.connection_metrics[sid]['last_activity'] = current_time
+
+                    except Exception as e:
+                        self.logger.warning(f"Failed to ping client {sid}: {e}")
+                        disconnected.append(sid)
+
+                # Clean up failed connections
+                for sid in disconnected:
+                    await self._cleanup_stale_connection(sid)
+
+                if self.clients:
+                    self.logger.debug(
+                        f"🏓 Sent pings to {len(self.clients)} clients, "
+                        f"{len(disconnected)} failed"
+                    )
+
+            except Exception as e:
+                self.logger.error(f"Error in periodic ping: {e}")
+
+    async def _check_stale_connections(self):
+        """Check for and clean up stale connections.
+
+        WHY: Some clients may not properly disconnect, leaving zombie
+        connections that consume resources and prevent proper cleanup.
+        """
+        while True:
+            try:
+                await asyncio.sleep(self.stale_check_interval)
+
+                current_time = time.time()
+                stale_threshold = current_time - (self.ping_timeout + self.ping_interval)
+                stale_sids = []
+
+                for sid in list(self.clients):
+                    last_ping = self.last_ping_times.get(sid, 0)
+
+                    if last_ping < stale_threshold:
+                        stale_sids.append(sid)
+                        self.logger.warning(
+                            f"🧟 Detected stale connection {sid} "
+                            f"(last ping: {current_time - last_ping:.1f}s ago)"
+                        )
+
+                # Clean up stale connections
+                for sid in stale_sids:
+                    await self._cleanup_stale_connection(sid)
+
+                if stale_sids:
+                    self.logger.info(
+                        f"🧹 Cleaned up {len(stale_sids)} stale connections"
+                    )
+
+            except Exception as e:
+                self.logger.error(f"Error checking stale connections: {e}")
+
+    async def _cleanup_stale_connection(self, sid: str):
+        """Clean up a stale or dead connection.
+
+        WHY: Proper cleanup prevents memory leaks and ensures
+        accurate connection tracking.
+        """
+        try:
+            if sid in self.clients:
+                self.clients.remove(sid)
+
+            if sid in self.last_ping_times:
+                del self.last_ping_times[sid]
+
+            if sid in self.connection_metrics:
+                metrics = self.connection_metrics[sid]
+                uptime = time.time() - metrics.get('connected_at', 0)
+                self.logger.info(
+                    f"📊 Connection {sid} stats - uptime: {uptime:.1f}s, "
+                    f"reconnects: {metrics.get('reconnects', 0)}, "
+                    f"failures: {metrics.get('failures', 0)}"
+                )
+                del self.connection_metrics[sid]
+
+            # Force disconnect if still connected
+            try:
+                await self.sio.disconnect(sid)
+            except:
+                pass  # Already disconnected
+
+            self.logger.info(f"🔌 Cleaned up stale connection: {sid}")
+
+        except Exception as e:
+            self.logger.error(f"Error cleaning up connection {sid}: {e}")
+
     def register_events(self) -> None:
         """Register connection-related event handlers."""
+
+        # Start health monitoring now that we're registering events
+        self._start_health_monitoring()
 
         @self.sio.event
+        @timeout_handler(timeout_seconds=5.0)
         async def connect(sid, environ, *args):
             """Handle client connection.
 
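With the defaults above (`ping_interval=30`, `ping_timeout=10`, `stale_check_interval=60`), a client is flagged as stale once its last successful ping is more than 40 seconds old, and the sweep runs once a minute. A tiny self-contained illustration of that rule (the sids and ages below are made up):

```python
import time

ping_interval, ping_timeout = 30, 10  # same defaults as __init__ above
now = time.time()
last_ping_times = {
    "sid-active": now - 5,    # pinged 5s ago -> healthy
    "sid-zombie": now - 120,  # silent for 2 minutes -> stale
}

stale_threshold = now - (ping_timeout + ping_interval)  # anything older than 40s ago
stale = [sid for sid, last in last_ping_times.items() if last < stale_threshold]
print(stale)  # ['sid-zombie']
```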
@@ -72,6 +324,7 @@ class ConnectionEventHandler(BaseEventHandler):
                 self.log_error(f"sending welcome to client {sid}", e)
 
         @self.sio.event
+        @timeout_handler(timeout_seconds=3.0)
         async def disconnect(sid):
             """Handle client disconnection.
 
@@ -86,8 +339,15 @@ class ConnectionEventHandler(BaseEventHandler):
                 self.logger.warning(
                     f"⚠️ Attempted to disconnect unknown client: {sid}"
                 )
+
+            # Clean up health tracking
+            if sid in self.last_ping_times:
+                del self.last_ping_times[sid]
+            if sid in self.connection_metrics:
+                del self.connection_metrics[sid]
 
         @self.sio.event
+        @timeout_handler(timeout_seconds=3.0)
         async def get_status(sid):
             """Handle status request.
 
@@ -105,6 +365,7 @@ class ConnectionEventHandler(BaseEventHandler):
             await self.emit_to_client(sid, "status", status_data)
 
         @self.sio.event
+        @timeout_handler(timeout_seconds=5.0)
         async def get_history(sid, data=None):
             """Handle history request.
 
@@ -118,6 +379,7 @@ class ConnectionEventHandler(BaseEventHandler):
             await self._send_event_history(sid, event_types=event_types, limit=limit)
 
         @self.sio.event
+        @timeout_handler(timeout_seconds=5.0)
         async def request_history(sid, data=None):
             """Handle legacy history request (for client compatibility).
 
@@ -131,6 +393,7 @@ class ConnectionEventHandler(BaseEventHandler):
             await self._send_event_history(sid, event_types=event_types, limit=limit)
 
         @self.sio.event
+        @timeout_handler(timeout_seconds=3.0)
         async def subscribe(sid, data=None):
             """Handle subscription request.
 
@@ -141,6 +404,7 @@ class ConnectionEventHandler(BaseEventHandler):
             await self.emit_to_client(sid, "subscribed", {"channels": channels})
 
         @self.sio.event
+        @timeout_handler(timeout_seconds=5.0)
         async def claude_event(sid, data):
             """Handle events from client proxies.
 
@@ -198,6 +462,25 @@ class ConnectionEventHandler(BaseEventHandler):
             self.logger.info(f"📡 Broadcasting claude_event to all clients except {sid}")
             await self.broadcast_event("claude_event", data, skip_sid=sid)
             self.logger.info(f"✅ Broadcast complete")
+
+        @self.sio.event
+        async def pong(sid, data=None):
+            """Handle pong response from client.
+
+            WHY: Clients respond to our pings with pongs, confirming
+            they're still alive and the connection is healthy.
+            """
+            current_time = time.time()
+
+            # Update last activity time
+            if sid in self.connection_metrics:
+                self.connection_metrics[sid]['last_activity'] = current_time
+
+            # Calculate round-trip time if timestamp provided
+            if data and 'timestamp' in data:
+                rtt = current_time - data['timestamp']
+                if rtt < 10:  # Reasonable RTT
+                    self.logger.debug(f"🏓 Pong from {sid}, RTT: {rtt*1000:.1f}ms")
 
     def _normalize_event(self, event_data: Dict[str, Any]) -> Dict[str, Any]:
         """Normalize event format to ensure consistency.
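The client half of this handshake lives in `socket-client.js`, which is also updated in this release but not shown here. A rough Python equivalent of what a client needs to do, sketched with the `python-socketio` client and an assumed local server URL:

```python
import asyncio

import socketio

sio = socketio.AsyncClient()


@sio.on("ping")
async def on_ping(data):
    # Echo the server's timestamp back so it can log the round-trip time.
    await sio.emit("pong", {"timestamp": data.get("timestamp")})


async def main():
    await sio.connect("http://localhost:8765")  # port assumed; use your server's URL
    await sio.wait()


asyncio.run(main())
```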
claude_mpm/services/socketio/server/broadcaster.py

@@ -10,12 +10,144 @@ to create focused, testable modules with single responsibilities.
 """
 
 import asyncio
+import time
+from collections import deque
+from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Dict, List, Optional, Set
+from typing import Any, Deque, Dict, List, Optional, Set
 
 from ....core.logging_config import get_logger
 
 
+@dataclass
+class RetryableEvent:
+    """Represents an event that can be retried on failure.
+
+    WHY: Network failures are common and transient. By tracking retry
+    attempts, we can recover from temporary issues while avoiding
+    infinite retry loops.
+    """
+    event_type: str
+    data: Dict[str, Any]
+    attempt_count: int = 0
+    max_retries: int = 3
+    created_at: float = None
+    last_attempt: float = None
+    skip_sid: Optional[str] = None
+
+    def __post_init__(self):
+        if self.created_at is None:
+            self.created_at = time.time()
+        if self.last_attempt is None:
+            self.last_attempt = time.time()
+
+    def should_retry(self) -> bool:
+        """Check if this event should be retried.
+
+        WHY: We need to balance reliability with resource usage.
+        Events older than 30 seconds or with too many attempts
+        should be abandoned.
+        """
+        if self.attempt_count >= self.max_retries:
+            return False
+
+        # Don't retry events older than 30 seconds
+        if time.time() - self.created_at > 30:
+            return False
+
+        return True
+
+    def get_backoff_delay(self) -> float:
+        """Calculate exponential backoff delay.
+
+        WHY: Exponential backoff prevents overwhelming the system
+        during recovery from failures.
+        """
+        base_delay = 1.0  # 1 second
+        max_delay = 8.0  # 8 seconds max
+
+        delay = min(base_delay * (2 ** self.attempt_count), max_delay)
+        return delay
+
+
+class RetryQueue:
+    """Manages retry queue for failed event broadcasts.
+
+    WHY: Transient network issues shouldn't cause event loss.
+    This queue provides resilient event delivery with backoff.
+    """
+
+    def __init__(self, max_size: int = 1000):
+        self.queue: Deque[RetryableEvent] = deque(maxlen=max_size)
+        self.lock = asyncio.Lock()
+        self.stats = {
+            'queued': 0,
+            'retried': 0,
+            'succeeded': 0,
+            'abandoned': 0
+        }
+
+    async def add(self, event: RetryableEvent) -> None:
+        """Add an event to the retry queue."""
+        async with self.lock:
+            self.queue.append(event)
+            self.stats['queued'] += 1
+
+    async def get_ready_events(self) -> List[RetryableEvent]:
+        """Get events that are ready for retry.
+
+        WHY: We need to respect backoff delays to avoid
+        overwhelming the system during recovery.
+        """
+        async with self.lock:
+            current_time = time.time()
+            ready = []
+
+            # Check each event in queue
+            remaining = []
+            for event in self.queue:
+                if not event.should_retry():
+                    self.stats['abandoned'] += 1
+                    continue
+
+                # First attempt (attempt_count == 0) should be immediate
+                if event.attempt_count == 0:
+                    ready.append(event)
+                else:
+                    # For retries, check backoff delay
+                    time_since_attempt = current_time - event.last_attempt
+                    if time_since_attempt >= event.get_backoff_delay():
+                        ready.append(event)
+                    else:
+                        remaining.append(event)
+
+            # Update queue with events not ready yet
+            self.queue.clear()
+            self.queue.extend(remaining)
+
+            return ready
+
+    async def mark_success(self, event: RetryableEvent) -> None:
+        """Mark an event as successfully sent."""
+        self.stats['succeeded'] += 1
+
+    async def mark_retry(self, event: RetryableEvent) -> None:
+        """Mark an event for retry."""
+        event.attempt_count += 1
+        event.last_attempt = time.time()
+        self.stats['retried'] += 1
+
+        if event.should_retry():
+            await self.add(event)
+
+    def get_stats(self) -> Dict[str, int]:
+        """Get retry queue statistics."""
+        return {
+            **self.stats,
+            'queue_size': len(self.queue)
+        }
+
+
 class SocketIOEventBroadcaster:
     """Handles broadcasting events to connected Socket.IO clients.
 
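The backoff schedule defined by `get_backoff_delay()` doubles from 1 s and caps at 8 s, while `should_retry()` gives up after three attempts or 30 seconds of age. A quick check of those numbers (import path assumed from the wheel layout):

```python
from claude_mpm.services.socketio.server.broadcaster import RetryableEvent

event = RetryableEvent(event_type="claude_event", data={"demo": True})
for attempt in range(5):
    event.attempt_count = attempt
    print(attempt, event.get_backoff_delay(), event.should_retry())
# 0 1.0 True
# 1 2.0 True
# 2 4.0 True
# 3 8.0 False   <- max_retries reached
# 4 8.0 False   <- delay stays capped at max_delay
```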
@@ -41,9 +173,113 @@ class SocketIOEventBroadcaster:
         self.logger = logger
         self.loop = None  # Will be set by main server
         self.server = server  # Reference to main server for event history
-
-
-
+
+        # Initialize retry queue for resilient delivery
+        self.retry_queue = RetryQueue(max_size=1000)
+        self.retry_task = None
+        self.retry_interval = 2.0  # Process retry queue every 2 seconds
+
+    def start_retry_processor(self):
+        """Start the background retry processor.
+
+        WHY: Failed broadcasts need to be retried automatically
+        to ensure reliable event delivery.
+        """
+        if self.loop and not self.retry_task:
+            self.retry_task = asyncio.create_task(self._process_retry_queue())
+            self.logger.info("🔄 Started retry queue processor")
+
+    def stop_retry_processor(self):
+        """Stop the background retry processor."""
+        if self.retry_task:
+            self.retry_task.cancel()
+            self.retry_task = None
+            self.logger.info("🚫 Stopped retry queue processor")
+
+    async def _process_retry_queue(self):
+        """Process the retry queue periodically.
+
+        WHY: Regular processing ensures failed events are retried
+        with appropriate backoff delays.
+        """
+        while True:
+            try:
+                await asyncio.sleep(self.retry_interval)
+
+                # Get events ready for retry
+                ready_events = await self.retry_queue.get_ready_events()
+
+                if ready_events:
+                    self.logger.debug(
+                        f"🔄 Processing {len(ready_events)} events from retry queue"
+                    )
+
+                    for event in ready_events:
+                        success = await self._retry_broadcast(event)
+
+                        if success:
+                            await self.retry_queue.mark_success(event)
+                        else:
+                            await self.retry_queue.mark_retry(event)
+
+                # Log stats periodically
+                stats = self.retry_queue.get_stats()
+                if stats['retried'] > 0 or stats['abandoned'] > 0:
+                    self.logger.info(
+                        f"📊 Retry queue stats - "
+                        f"queued: {stats['queued']}, "
+                        f"retried: {stats['retried']}, "
+                        f"succeeded: {stats['succeeded']}, "
+                        f"abandoned: {stats['abandoned']}, "
+                        f"current size: {stats['queue_size']}"
+                    )
+
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                self.logger.error(f"Error processing retry queue: {e}")
+
+    async def _retry_broadcast(self, event: RetryableEvent) -> bool:
+        """Retry broadcasting a failed event.
+
+        WHY: Isolated retry logic allows for special handling
+        and metrics tracking of retry attempts.
+        """
+        try:
+            self.logger.debug(
+                f"🔄 Retrying {event.event_type} (attempt {event.attempt_count + 1}/{event.max_retries})"
+            )
+
+            # Reconstruct the full event
+            full_event = {
+                "type": event.event_type,
+                "timestamp": datetime.now().isoformat(),
+                "data": event.data,
+                "retry_attempt": event.attempt_count + 1
+            }
+
+            # Attempt broadcast
+            if event.skip_sid:
+                await self.sio.emit("claude_event", full_event, skip_sid=event.skip_sid)
+            else:
+                await self.sio.emit("claude_event", full_event)
+
+            self.logger.debug(f"✅ Successfully retried {event.event_type}")
+            return True
+
+        except Exception as e:
+            self.logger.warning(
+                f"⚠️ Retry failed for {event.event_type} "
+                f"(attempt {event.attempt_count + 1}): {e}"
+            )
+            return False
+
+    def broadcast_event(self, event_type: str, data: Dict[str, Any], skip_sid: Optional[str] = None):
+        """Broadcast an event to all connected clients with retry support.
+
+        WHY: Enhanced with retry queue to ensure reliable delivery
+        even during transient network issues.
+        """
         if not self.sio:
             return
 
@@ -65,15 +301,27 @@ class SocketIOEventBroadcaster:
             self.logger.debug(f"Added {event_type} to history (total: {len(self.server.event_history)})")
 
         # Broadcast to all connected clients
+        broadcast_success = False
         try:
             # Use run_coroutine_threadsafe to safely call from any thread
             if hasattr(self, "loop") and self.loop and not self.loop.is_closed():
-
-
-
-
-
-
+                # Create broadcast coroutine
+                if skip_sid:
+                    coro = self.sio.emit("claude_event", event, skip_sid=skip_sid)
+                else:
+                    coro = self.sio.emit("claude_event", event)
+
+                future = asyncio.run_coroutine_threadsafe(coro, self.loop)
+
+                # Wait briefly to see if broadcast succeeds
+                try:
+                    future.result(timeout=0.5)  # 500ms timeout
+                    broadcast_success = True
+                    self.stats["events_sent"] += 1
+                    self.logger.debug(f"Broadcasted event: {event_type}")
+                except:
+                    # Will be added to retry queue below
+                    pass
             else:
                 self.logger.warning(
                     f"Cannot broadcast {event_type}: server loop not available"
@@ -81,6 +329,24 @@ class SocketIOEventBroadcaster:
 
         except Exception as e:
            self.logger.error(f"Failed to broadcast event {event_type}: {e}")
+
+        # Add to retry queue if broadcast failed
+        if not broadcast_success and self.loop:
+            retryable_event = RetryableEvent(
+                event_type=event_type,
+                data=data,
+                skip_sid=skip_sid
+            )
+
+            # Queue for retry
+            asyncio.run_coroutine_threadsafe(
+                self.retry_queue.add(retryable_event),
+                self.loop
+            )
+
+            self.logger.warning(
+                f"⚠️ Queued {event_type} for retry (queue size: {len(self.retry_queue.queue)})"
+            )
 
     def session_started(self, session_id: str, launch_method: str, working_dir: str):
         """Notify that a session has started."""