claude-mpm 4.0.9__py3-none-any.whl → 4.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,13 +5,87 @@ disconnect, status requests, and history management. Separating these
  from other handlers makes connection management more maintainable.
  """

+ import asyncio
+ import functools
+ import time
  from datetime import datetime
- from typing import Any, Dict, List, Optional, Set
+ from typing import Any, Callable, Dict, List, Optional, Set

  from ....core.typing_utils import ClaudeStatus, EventData, SocketId
  from .base import BaseEventHandler


+ def timeout_handler(timeout_seconds: float = 5.0):
+     """Decorator to add timeout protection to async handlers.
+
+     WHY: Network operations can hang indefinitely, causing resource leaks
+     and poor user experience. This decorator ensures handlers complete
+     within a reasonable time or fail gracefully.
+
+     Args:
+         timeout_seconds: Maximum time allowed for handler execution (default: 5s)
+     """
+     def decorator(func: Callable) -> Callable:
+         @functools.wraps(func)
+         async def wrapper(self, *args, **kwargs):
+             handler_name = func.__name__
+             start_time = time.time()
+
+             try:
+                 # Create a task with timeout
+                 result = await asyncio.wait_for(
+                     func(self, *args, **kwargs),
+                     timeout=timeout_seconds
+                 )
+
+                 elapsed = time.time() - start_time
+                 if elapsed > timeout_seconds * 0.8: # Warn if close to timeout
+                     self.logger.warning(
+                         f"⚠️ Handler {handler_name} took {elapsed:.2f}s "
+                         f"(close to {timeout_seconds}s timeout)"
+                     )
+
+                 return result
+
+             except asyncio.TimeoutError:
+                 elapsed = time.time() - start_time
+                 self.logger.error(
+                     f"❌ Handler {handler_name} timed out after {elapsed:.2f}s"
+                 )
+
+                 # Try to send error response to client if we have their sid
+                 if args and isinstance(args[0], str): # First arg is usually sid
+                     sid = args[0]
+                     try:
+                         # Use a short timeout for error response
+                         await asyncio.wait_for(
+                             self.emit_to_client(
+                                 sid,
+                                 "error",
+                                 {
+                                     "message": f"Handler {handler_name} timed out",
+                                     "handler": handler_name,
+                                     "timeout": timeout_seconds
+                                 }
+                             ),
+                             timeout=1.0
+                         )
+                     except:
+                         pass # Best effort error notification
+
+                 return None
+
+             except Exception as e:
+                 elapsed = time.time() - start_time
+                 self.logger.error(
+                     f"❌ Handler {handler_name} failed after {elapsed:.2f}s: {e}"
+                 )
+                 raise
+
+         return wrapper
+     return decorator
+
+
  class ConnectionEventHandler(BaseEventHandler):
      """Handles Socket.IO connection lifecycle events.

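The decorator above is the standard asyncio.wait_for wrapping pattern: the wrapped coroutine is cancelled once the deadline passes and the handler returns None instead of hanging. A minimal standalone sketch of the same pattern (the with_timeout name and slow_handler are illustrative, not part of the package):

import asyncio
import functools

def with_timeout(timeout_seconds: float = 5.0):
    """Illustrative sketch: cut off an async handler that runs too long."""
    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            try:
                return await asyncio.wait_for(func(*args, **kwargs), timeout=timeout_seconds)
            except asyncio.TimeoutError:
                return None  # fail gracefully instead of hanging
        return wrapper
    return decorator

@with_timeout(timeout_seconds=0.1)
async def slow_handler(sid):
    await asyncio.sleep(1)  # stands in for a hung network operation
    return "done"

print(asyncio.run(slow_handler("sid-1")))  # prints None: the handler was cancelled at 0.1s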
@@ -19,11 +93,189 @@ class ConnectionEventHandler(BaseEventHandler):
      that deserves its own focused handler. This includes client connections,
      disconnections, status updates, and event history management.
      """
+
+     def __init__(self, server):
+         """Initialize connection handler with health monitoring.
+
+         WHY: We need to track connection health metrics and implement
+         ping/pong mechanism for detecting stale connections.
+         """
+         super().__init__(server)
+
+         # Connection health tracking
+         self.connection_metrics = {}
+         self.last_ping_times = {}
+         self.ping_interval = 30 # seconds
+         self.ping_timeout = 10 # seconds
+         self.stale_check_interval = 60 # seconds
+
+         # Health monitoring tasks (will be started after event registration)
+         self.ping_task = None
+         self.stale_check_task = None

+     def _start_health_monitoring(self):
+         """Start background tasks for connection health monitoring.
+
+         WHY: We need to actively monitor connection health to detect
+         and clean up stale connections, ensuring reliable event delivery.
+         """
+         # Only start if we have a valid event loop and tasks aren't already running
+         if hasattr(self.server, 'core') and hasattr(self.server.core, 'loop'):
+             loop = self.server.core.loop
+             if loop and not loop.is_closed():
+                 if not self.ping_task or self.ping_task.done():
+                     self.ping_task = asyncio.run_coroutine_threadsafe(
+                         self._periodic_ping(), loop
+                     )
+                     self.logger.info("🏓 Started connection ping monitoring")
+
+                 if not self.stale_check_task or self.stale_check_task.done():
+                     self.stale_check_task = asyncio.run_coroutine_threadsafe(
+                         self._check_stale_connections(), loop
+                     )
+                     self.logger.info("🧹 Started stale connection checker")
+
+     def stop_health_monitoring(self):
+         """Stop health monitoring tasks.
+
+         WHY: Clean shutdown requires stopping background tasks to
+         prevent errors and resource leaks.
+         """
+         if self.ping_task and not self.ping_task.done():
+             self.ping_task.cancel()
+             self.logger.info("🚫 Stopped connection ping monitoring")
+
+         if self.stale_check_task and not self.stale_check_task.done():
+             self.stale_check_task.cancel()
+             self.logger.info("🚫 Stopped stale connection checker")
+
+     async def _periodic_ping(self):
+         """Send periodic pings to all connected clients.
+
+         WHY: WebSocket connections can silently fail. Regular pings
+         help detect dead connections and maintain connection state.
+         """
+         while True:
+             try:
+                 await asyncio.sleep(self.ping_interval)
+
+                 if not self.clients:
+                     continue
+
+                 current_time = time.time()
+                 disconnected = []
+
+                 for sid in list(self.clients):
+                     try:
+                         # Send ping and record time
+                         await self.sio.emit('ping', {'timestamp': current_time}, room=sid)
+                         self.last_ping_times[sid] = current_time
+
+                         # Update connection metrics
+                         if sid not in self.connection_metrics:
+                             self.connection_metrics[sid] = {
+                                 'connected_at': current_time,
+                                 'reconnects': 0,
+                                 'failures': 0,
+                                 'last_activity': current_time
+                             }
+                         self.connection_metrics[sid]['last_activity'] = current_time
+
+                     except Exception as e:
+                         self.logger.warning(f"Failed to ping client {sid}: {e}")
+                         disconnected.append(sid)
+
+                 # Clean up failed connections
+                 for sid in disconnected:
+                     await self._cleanup_stale_connection(sid)
+
+                 if self.clients:
+                     self.logger.debug(
+                         f"🏓 Sent pings to {len(self.clients)} clients, "
+                         f"{len(disconnected)} failed"
+                     )
+
+             except Exception as e:
+                 self.logger.error(f"Error in periodic ping: {e}")
+
+     async def _check_stale_connections(self):
+         """Check for and clean up stale connections.
+
+         WHY: Some clients may not properly disconnect, leaving zombie
+         connections that consume resources and prevent proper cleanup.
+         """
+         while True:
+             try:
+                 await asyncio.sleep(self.stale_check_interval)
+
+                 current_time = time.time()
+                 stale_threshold = current_time - (self.ping_timeout + self.ping_interval)
+                 stale_sids = []
+
+                 for sid in list(self.clients):
+                     last_ping = self.last_ping_times.get(sid, 0)
+
+                     if last_ping < stale_threshold:
+                         stale_sids.append(sid)
+                         self.logger.warning(
+                             f"🧟 Detected stale connection {sid} "
+                             f"(last ping: {current_time - last_ping:.1f}s ago)"
+                         )
+
+                 # Clean up stale connections
+                 for sid in stale_sids:
+                     await self._cleanup_stale_connection(sid)
+
+                 if stale_sids:
+                     self.logger.info(
+                         f"🧹 Cleaned up {len(stale_sids)} stale connections"
+                     )
+
+             except Exception as e:
+                 self.logger.error(f"Error checking stale connections: {e}")
+
+     async def _cleanup_stale_connection(self, sid: str):
+         """Clean up a stale or dead connection.
+
+         WHY: Proper cleanup prevents memory leaks and ensures
+         accurate connection tracking.
+         """
+         try:
+             if sid in self.clients:
+                 self.clients.remove(sid)
+
+             if sid in self.last_ping_times:
+                 del self.last_ping_times[sid]
+
+             if sid in self.connection_metrics:
+                 metrics = self.connection_metrics[sid]
+                 uptime = time.time() - metrics.get('connected_at', 0)
+                 self.logger.info(
+                     f"📊 Connection {sid} stats - uptime: {uptime:.1f}s, "
+                     f"reconnects: {metrics.get('reconnects', 0)}, "
+                     f"failures: {metrics.get('failures', 0)}"
+                 )
+                 del self.connection_metrics[sid]
+
+             # Force disconnect if still connected
+             try:
+                 await self.sio.disconnect(sid)
+             except:
+                 pass # Already disconnected
+
+             self.logger.info(f"🔌 Cleaned up stale connection: {sid}")
+
+         except Exception as e:
+             self.logger.error(f"Error cleaning up connection {sid}: {e}")
+
      def register_events(self) -> None:
          """Register connection-related event handlers."""
+
+         # Start health monitoring now that we're registering events
+         self._start_health_monitoring()

          @self.sio.event
+         @timeout_handler(timeout_seconds=5.0)
          async def connect(sid, environ, *args):
              """Handle client connection.

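The staleness rule used by _check_stale_connections is simple arithmetic: a client is considered stale when its last recorded ping is older than ping_interval + ping_timeout (40 s with the defaults above). A small illustrative check, not part of the package:

import time

PING_INTERVAL = 30  # seconds between pings (default above)
PING_TIMEOUT = 10   # grace period for the pong (default above)

def is_stale(last_ping_time: float, now: float) -> bool:
    """True when the last ping is older than interval + timeout."""
    return last_ping_time < now - (PING_TIMEOUT + PING_INTERVAL)

now = time.time()
print(is_stale(now - 25, now))  # False: pinged 25s ago, inside the 40s window
print(is_stale(now - 45, now))  # True: 45s ago exceeds the 40s threshold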
@@ -72,6 +324,7 @@ class ConnectionEventHandler(BaseEventHandler):
                  self.log_error(f"sending welcome to client {sid}", e)

          @self.sio.event
+         @timeout_handler(timeout_seconds=3.0)
          async def disconnect(sid):
              """Handle client disconnection.

@@ -86,8 +339,15 @@ class ConnectionEventHandler(BaseEventHandler):
                  self.logger.warning(
                      f"⚠️ Attempted to disconnect unknown client: {sid}"
                  )
+
+             # Clean up health tracking
+             if sid in self.last_ping_times:
+                 del self.last_ping_times[sid]
+             if sid in self.connection_metrics:
+                 del self.connection_metrics[sid]

          @self.sio.event
+         @timeout_handler(timeout_seconds=3.0)
          async def get_status(sid):
              """Handle status request.

@@ -105,6 +365,7 @@ class ConnectionEventHandler(BaseEventHandler):
              await self.emit_to_client(sid, "status", status_data)

          @self.sio.event
+         @timeout_handler(timeout_seconds=5.0)
          async def get_history(sid, data=None):
              """Handle history request.

@@ -118,6 +379,7 @@ class ConnectionEventHandler(BaseEventHandler):
              await self._send_event_history(sid, event_types=event_types, limit=limit)

          @self.sio.event
+         @timeout_handler(timeout_seconds=5.0)
          async def request_history(sid, data=None):
              """Handle legacy history request (for client compatibility).

@@ -131,6 +393,7 @@ class ConnectionEventHandler(BaseEventHandler):
              await self._send_event_history(sid, event_types=event_types, limit=limit)

          @self.sio.event
+         @timeout_handler(timeout_seconds=3.0)
          async def subscribe(sid, data=None):
              """Handle subscription request.

@@ -141,6 +404,7 @@ class ConnectionEventHandler(BaseEventHandler):
              await self.emit_to_client(sid, "subscribed", {"channels": channels})

          @self.sio.event
+         @timeout_handler(timeout_seconds=5.0)
          async def claude_event(sid, data):
              """Handle events from client proxies.

@@ -198,6 +462,25 @@ class ConnectionEventHandler(BaseEventHandler):
              self.logger.info(f"📡 Broadcasting claude_event to all clients except {sid}")
              await self.broadcast_event("claude_event", data, skip_sid=sid)
              self.logger.info(f"✅ Broadcast complete")
+
+         @self.sio.event
+         async def pong(sid, data=None):
+             """Handle pong response from client.
+
+             WHY: Clients respond to our pings with pongs, confirming
+             they're still alive and the connection is healthy.
+             """
+             current_time = time.time()
+
+             # Update last activity time
+             if sid in self.connection_metrics:
+                 self.connection_metrics[sid]['last_activity'] = current_time
+
+             # Calculate round-trip time if timestamp provided
+             if data and 'timestamp' in data:
+                 rtt = current_time - data['timestamp']
+                 if rtt < 10: # Reasonable RTT
+                     self.logger.debug(f"🏓 Pong from {sid}, RTT: {rtt*1000:.1f}ms")

      def _normalize_event(self, event_data: Dict[str, Any]) -> Dict[str, Any]:
          """Normalize event format to ensure consistency.
@@ -10,12 +10,144 @@ to create focused, testable modules with single responsibilities.
  """

  import asyncio
+ import time
+ from collections import deque
+ from dataclasses import dataclass
  from datetime import datetime
- from typing import Any, Dict, List, Optional, Set
+ from typing import Any, Deque, Dict, List, Optional, Set

  from ....core.logging_config import get_logger


+ @dataclass
+ class RetryableEvent:
+     """Represents an event that can be retried on failure.
+
+     WHY: Network failures are common and transient. By tracking retry
+     attempts, we can recover from temporary issues while avoiding
+     infinite retry loops.
+     """
+     event_type: str
+     data: Dict[str, Any]
+     attempt_count: int = 0
+     max_retries: int = 3
+     created_at: float = None
+     last_attempt: float = None
+     skip_sid: Optional[str] = None
+
+     def __post_init__(self):
+         if self.created_at is None:
+             self.created_at = time.time()
+         if self.last_attempt is None:
+             self.last_attempt = time.time()
+
+     def should_retry(self) -> bool:
+         """Check if this event should be retried.
+
+         WHY: We need to balance reliability with resource usage.
+         Events older than 30 seconds or with too many attempts
+         should be abandoned.
+         """
+         if self.attempt_count >= self.max_retries:
+             return False
+
+         # Don't retry events older than 30 seconds
+         if time.time() - self.created_at > 30:
+             return False
+
+         return True
+
+     def get_backoff_delay(self) -> float:
+         """Calculate exponential backoff delay.
+
+         WHY: Exponential backoff prevents overwhelming the system
+         during recovery from failures.
+         """
+         base_delay = 1.0 # 1 second
+         max_delay = 8.0 # 8 seconds max
+
+         delay = min(base_delay * (2 ** self.attempt_count), max_delay)
+         return delay
+
+
+ class RetryQueue:
+     """Manages retry queue for failed event broadcasts.
+
+     WHY: Transient network issues shouldn't cause event loss.
+     This queue provides resilient event delivery with backoff.
+     """
+
+     def __init__(self, max_size: int = 1000):
+         self.queue: Deque[RetryableEvent] = deque(maxlen=max_size)
+         self.lock = asyncio.Lock()
+         self.stats = {
+             'queued': 0,
+             'retried': 0,
+             'succeeded': 0,
+             'abandoned': 0
+         }
+
+     async def add(self, event: RetryableEvent) -> None:
+         """Add an event to the retry queue."""
+         async with self.lock:
+             self.queue.append(event)
+             self.stats['queued'] += 1
+
+     async def get_ready_events(self) -> List[RetryableEvent]:
+         """Get events that are ready for retry.
+
+         WHY: We need to respect backoff delays to avoid
+         overwhelming the system during recovery.
+         """
+         async with self.lock:
+             current_time = time.time()
+             ready = []
+
+             # Check each event in queue
+             remaining = []
+             for event in self.queue:
+                 if not event.should_retry():
+                     self.stats['abandoned'] += 1
+                     continue
+
+                 # First attempt (attempt_count == 0) should be immediate
+                 if event.attempt_count == 0:
+                     ready.append(event)
+                 else:
+                     # For retries, check backoff delay
+                     time_since_attempt = current_time - event.last_attempt
+                     if time_since_attempt >= event.get_backoff_delay():
+                         ready.append(event)
+                     else:
+                         remaining.append(event)
+
+             # Update queue with events not ready yet
+             self.queue.clear()
+             self.queue.extend(remaining)
+
+             return ready
+
+     async def mark_success(self, event: RetryableEvent) -> None:
+         """Mark an event as successfully sent."""
+         self.stats['succeeded'] += 1
+
+     async def mark_retry(self, event: RetryableEvent) -> None:
+         """Mark an event for retry."""
+         event.attempt_count += 1
+         event.last_attempt = time.time()
+         self.stats['retried'] += 1
+
+         if event.should_retry():
+             await self.add(event)
+
+     def get_stats(self) -> Dict[str, int]:
+         """Get retry queue statistics."""
+         return {
+             **self.stats,
+             'queue_size': len(self.queue)
+         }
+
+
  class SocketIOEventBroadcaster:
      """Handles broadcasting events to connected Socket.IO clients.

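The retry schedule implied by get_backoff_delay and should_retry: delays double from a 1 s base and cap at 8 s, and an event is abandoned after max_retries attempts or once it is more than 30 s old. A short illustrative calculation, not part of the package:

BASE_DELAY = 1.0   # seconds (matches base_delay above)
MAX_DELAY = 8.0    # cap (matches max_delay above)
MAX_RETRIES = 3    # attempts before should_retry() gives up

for attempt_count in range(MAX_RETRIES + 1):
    delay = min(BASE_DELAY * (2 ** attempt_count), MAX_DELAY)
    print(f"attempt_count={attempt_count}: wait {delay:.0f}s before the next try")
# attempt_count=0: 1s, 1: 2s, 2: 4s, 3: 8s (should_retry() also stops after 30s of age)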
@@ -41,9 +173,113 @@ class SocketIOEventBroadcaster:
          self.logger = logger
          self.loop = None # Will be set by main server
          self.server = server # Reference to main server for event history
-
-     def broadcast_event(self, event_type: str, data: Dict[str, Any]):
-         """Broadcast an event to all connected clients."""
+
+         # Initialize retry queue for resilient delivery
+         self.retry_queue = RetryQueue(max_size=1000)
+         self.retry_task = None
+         self.retry_interval = 2.0 # Process retry queue every 2 seconds
+
+     def start_retry_processor(self):
+         """Start the background retry processor.
+
+         WHY: Failed broadcasts need to be retried automatically
+         to ensure reliable event delivery.
+         """
+         if self.loop and not self.retry_task:
+             self.retry_task = asyncio.create_task(self._process_retry_queue())
+             self.logger.info("🔄 Started retry queue processor")
+
+     def stop_retry_processor(self):
+         """Stop the background retry processor."""
+         if self.retry_task:
+             self.retry_task.cancel()
+             self.retry_task = None
+             self.logger.info("🚫 Stopped retry queue processor")
+
+     async def _process_retry_queue(self):
+         """Process the retry queue periodically.
+
+         WHY: Regular processing ensures failed events are retried
+         with appropriate backoff delays.
+         """
+         while True:
+             try:
+                 await asyncio.sleep(self.retry_interval)
+
+                 # Get events ready for retry
+                 ready_events = await self.retry_queue.get_ready_events()
+
+                 if ready_events:
+                     self.logger.debug(
+                         f"🔄 Processing {len(ready_events)} events from retry queue"
+                     )
+
+                     for event in ready_events:
+                         success = await self._retry_broadcast(event)
+
+                         if success:
+                             await self.retry_queue.mark_success(event)
+                         else:
+                             await self.retry_queue.mark_retry(event)
+
+                 # Log stats periodically
+                 stats = self.retry_queue.get_stats()
+                 if stats['retried'] > 0 or stats['abandoned'] > 0:
+                     self.logger.info(
+                         f"📊 Retry queue stats - "
+                         f"queued: {stats['queued']}, "
+                         f"retried: {stats['retried']}, "
+                         f"succeeded: {stats['succeeded']}, "
+                         f"abandoned: {stats['abandoned']}, "
+                         f"current size: {stats['queue_size']}"
+                     )
+
+             except asyncio.CancelledError:
+                 break
+             except Exception as e:
+                 self.logger.error(f"Error processing retry queue: {e}")
+
+     async def _retry_broadcast(self, event: RetryableEvent) -> bool:
+         """Retry broadcasting a failed event.
+
+         WHY: Isolated retry logic allows for special handling
+         and metrics tracking of retry attempts.
+         """
+         try:
+             self.logger.debug(
+                 f"🔄 Retrying {event.event_type} (attempt {event.attempt_count + 1}/{event.max_retries})"
+             )
+
+             # Reconstruct the full event
+             full_event = {
+                 "type": event.event_type,
+                 "timestamp": datetime.now().isoformat(),
+                 "data": event.data,
+                 "retry_attempt": event.attempt_count + 1
+             }
+
+             # Attempt broadcast
+             if event.skip_sid:
+                 await self.sio.emit("claude_event", full_event, skip_sid=event.skip_sid)
+             else:
+                 await self.sio.emit("claude_event", full_event)
+
+             self.logger.debug(f"✅ Successfully retried {event.event_type}")
+             return True
+
+         except Exception as e:
+             self.logger.warning(
+                 f"⚠️ Retry failed for {event.event_type} "
+                 f"(attempt {event.attempt_count + 1}): {e}"
+             )
+             return False
+
+     def broadcast_event(self, event_type: str, data: Dict[str, Any], skip_sid: Optional[str] = None):
+         """Broadcast an event to all connected clients with retry support.
+
+         WHY: Enhanced with retry queue to ensure reliable delivery
+         even during transient network issues.
+         """
          if not self.sio:
              return

@@ -65,15 +301,27 @@ class SocketIOEventBroadcaster:
              self.logger.debug(f"Added {event_type} to history (total: {len(self.server.event_history)})")

          # Broadcast to all connected clients
+         broadcast_success = False
          try:
              # Use run_coroutine_threadsafe to safely call from any thread
              if hasattr(self, "loop") and self.loop and not self.loop.is_closed():
-                 future = asyncio.run_coroutine_threadsafe(
-                     self.sio.emit("claude_event", event), self.loop
-                 )
-                 # Don't wait for the result to avoid blocking
-                 self.stats["events_sent"] += 1
-                 self.logger.debug(f"Broadcasted event: {event_type}")
+                 # Create broadcast coroutine
+                 if skip_sid:
+                     coro = self.sio.emit("claude_event", event, skip_sid=skip_sid)
+                 else:
+                     coro = self.sio.emit("claude_event", event)
+
+                 future = asyncio.run_coroutine_threadsafe(coro, self.loop)
+
+                 # Wait briefly to see if broadcast succeeds
+                 try:
+                     future.result(timeout=0.5) # 500ms timeout
+                     broadcast_success = True
+                     self.stats["events_sent"] += 1
+                     self.logger.debug(f"Broadcasted event: {event_type}")
+                 except:
+                     # Will be added to retry queue below
+                     pass
              else:
                  self.logger.warning(
                      f"Cannot broadcast {event_type}: server loop not available"
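The hunk above switches the broadcaster from fire-and-forget to a short synchronous wait: the emit coroutine is submitted to the server's loop from the calling thread and given 0.5 s to complete, and the next hunk queues a RetryableEvent whenever that wait fails. A self-contained sketch of the submit-and-wait pattern (emit and broadcast_from_worker_thread are illustrative names, not the package's API):

import asyncio
import concurrent.futures
import threading

async def emit(payload):
    """Stand-in for an async Socket.IO emit that is too slow to finish in time."""
    await asyncio.sleep(1)
    return payload

def broadcast_from_worker_thread(loop, payload) -> bool:
    # Schedule the coroutine on the server loop from a plain thread,
    # then wait briefly; on timeout the caller would queue the event for retry.
    future = asyncio.run_coroutine_threadsafe(emit(payload), loop)
    try:
        future.result(timeout=0.5)  # mirrors the 500ms wait above
        return True
    except concurrent.futures.TimeoutError:
        return False

loop = asyncio.new_event_loop()
threading.Thread(target=loop.run_forever, daemon=True).start()
print(broadcast_from_worker_thread(loop, {"type": "claude_event"}))  # False: falls back to retry
loop.call_soon_threadsafe(loop.stop)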
@@ -81,6 +329,24 @@ class SocketIOEventBroadcaster:

          except Exception as e:
              self.logger.error(f"Failed to broadcast event {event_type}: {e}")
+
+         # Add to retry queue if broadcast failed
+         if not broadcast_success and self.loop:
+             retryable_event = RetryableEvent(
+                 event_type=event_type,
+                 data=data,
+                 skip_sid=skip_sid
+             )
+
+             # Queue for retry
+             asyncio.run_coroutine_threadsafe(
+                 self.retry_queue.add(retryable_event),
+                 self.loop
+             )
+
+             self.logger.warning(
+                 f"⚠️ Queued {event_type} for retry (queue size: {len(self.retry_queue.queue)})"
+             )

      def session_started(self, session_id: str, launch_method: str, working_dir: str):
          """Notify that a session has started."""