claude-mpm 4.0.9__py3-none-any.whl → 4.0.11__py3-none-any.whl
This diff shows the changes between these publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- claude_mpm/BUILD_NUMBER +1 -0
- claude_mpm/VERSION +1 -1
- claude_mpm/core/framework_loader.py +36 -20
- claude_mpm/core/interactive_session.py +2 -2
- claude_mpm/dashboard/static/js/socket-client.js +270 -3
- claude_mpm/hooks/claude_hooks/hook_handler.py +3 -1
- claude_mpm/hooks/claude_hooks/hook_wrapper.sh +3 -3
- claude_mpm/services/agents/memory/agent_memory_manager.py +1 -1
- claude_mpm/services/socketio/handlers/connection.py +284 -1
- claude_mpm/services/socketio/server/broadcaster.py +276 -10
- claude_mpm/services/socketio/server/main.py +15 -1
- claude_mpm/services/version_service.py +18 -11
- {claude_mpm-4.0.9.dist-info → claude_mpm-4.0.11.dist-info}/METADATA +1 -1
- {claude_mpm-4.0.9.dist-info → claude_mpm-4.0.11.dist-info}/RECORD +18 -17
- {claude_mpm-4.0.9.dist-info → claude_mpm-4.0.11.dist-info}/WHEEL +0 -0
- {claude_mpm-4.0.9.dist-info → claude_mpm-4.0.11.dist-info}/entry_points.txt +0 -0
- {claude_mpm-4.0.9.dist-info → claude_mpm-4.0.11.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-4.0.9.dist-info → claude_mpm-4.0.11.dist-info}/top_level.txt +0 -0
claude_mpm/services/socketio/handlers/connection.py

@@ -5,13 +5,87 @@ disconnect, status requests, and history management. Separating these
 from other handlers makes connection management more maintainable.
 """
 
+import asyncio
+import functools
+import time
 from datetime import datetime
-from typing import Any, Dict, List, Optional, Set
+from typing import Any, Callable, Dict, List, Optional, Set
 
 from ....core.typing_utils import ClaudeStatus, EventData, SocketId
 from .base import BaseEventHandler
 
 
+def timeout_handler(timeout_seconds: float = 5.0):
+    """Decorator to add timeout protection to async handlers.
+
+    WHY: Network operations can hang indefinitely, causing resource leaks
+    and poor user experience. This decorator ensures handlers complete
+    within a reasonable time or fail gracefully.
+
+    Args:
+        timeout_seconds: Maximum time allowed for handler execution (default: 5s)
+    """
+    def decorator(func: Callable) -> Callable:
+        @functools.wraps(func)
+        async def wrapper(self, *args, **kwargs):
+            handler_name = func.__name__
+            start_time = time.time()
+
+            try:
+                # Create a task with timeout
+                result = await asyncio.wait_for(
+                    func(self, *args, **kwargs),
+                    timeout=timeout_seconds
+                )
+
+                elapsed = time.time() - start_time
+                if elapsed > timeout_seconds * 0.8:  # Warn if close to timeout
+                    self.logger.warning(
+                        f"⚠️ Handler {handler_name} took {elapsed:.2f}s "
+                        f"(close to {timeout_seconds}s timeout)"
+                    )
+
+                return result
+
+            except asyncio.TimeoutError:
+                elapsed = time.time() - start_time
+                self.logger.error(
+                    f"❌ Handler {handler_name} timed out after {elapsed:.2f}s"
+                )
+
+                # Try to send error response to client if we have their sid
+                if args and isinstance(args[0], str):  # First arg is usually sid
+                    sid = args[0]
+                    try:
+                        # Use a short timeout for error response
+                        await asyncio.wait_for(
+                            self.emit_to_client(
+                                sid,
+                                "error",
+                                {
+                                    "message": f"Handler {handler_name} timed out",
+                                    "handler": handler_name,
+                                    "timeout": timeout_seconds
+                                }
+                            ),
+                            timeout=1.0
+                        )
+                    except:
+                        pass  # Best effort error notification
+
+                return None
+
+            except Exception as e:
+                elapsed = time.time() - start_time
+                self.logger.error(
+                    f"❌ Handler {handler_name} failed after {elapsed:.2f}s: {e}"
+                )
+                raise
+
+        return wrapper
+    return decorator
+
+
 class ConnectionEventHandler(BaseEventHandler):
     """Handles Socket.IO connection lifecycle events.
 
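The decorator is written for bound async handler methods that expose `self.logger` and `self.emit_to_client`; on timeout it logs, best-effort notifies the client, and returns `None`. A minimal sketch of that behavior in isolation — the `DemoHandler` class below is hypothetical, only `timeout_handler` and its import path come from this release:

```python
import asyncio
import logging

# Import path assumed from the wheel layout shown in the file list above.
from claude_mpm.services.socketio.handlers.connection import timeout_handler


class DemoHandler:
    """Hypothetical stand-in with the attributes the wrapper relies on."""

    def __init__(self):
        self.logger = logging.getLogger("demo")

    async def emit_to_client(self, sid, event, data):
        # Stand-in for the real emit helper; just record the error payload.
        self.logger.info("emit %s to %s: %s", event, sid, data)

    @timeout_handler(timeout_seconds=0.5)
    async def slow_handler(self, sid):
        await asyncio.sleep(2)  # Exceeds the 0.5s budget
        return "never reached"


async def main():
    logging.basicConfig(level=logging.INFO)
    result = await DemoHandler().slow_handler("client-sid-123")
    print(result)  # None: the wrapper logged the timeout and notified the client


asyncio.run(main())
```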
@@ -19,11 +93,189 @@ class ConnectionEventHandler(BaseEventHandler):
     that deserves its own focused handler. This includes client connections,
     disconnections, status updates, and event history management.
     """
+
+    def __init__(self, server):
+        """Initialize connection handler with health monitoring.
+
+        WHY: We need to track connection health metrics and implement
+        ping/pong mechanism for detecting stale connections.
+        """
+        super().__init__(server)
+
+        # Connection health tracking
+        self.connection_metrics = {}
+        self.last_ping_times = {}
+        self.ping_interval = 30  # seconds
+        self.ping_timeout = 10  # seconds
+        self.stale_check_interval = 60  # seconds
+
+        # Health monitoring tasks (will be started after event registration)
+        self.ping_task = None
+        self.stale_check_task = None
 
+    def _start_health_monitoring(self):
+        """Start background tasks for connection health monitoring.
+
+        WHY: We need to actively monitor connection health to detect
+        and clean up stale connections, ensuring reliable event delivery.
+        """
+        # Only start if we have a valid event loop and tasks aren't already running
+        if hasattr(self.server, 'core') and hasattr(self.server.core, 'loop'):
+            loop = self.server.core.loop
+            if loop and not loop.is_closed():
+                if not self.ping_task or self.ping_task.done():
+                    self.ping_task = asyncio.run_coroutine_threadsafe(
+                        self._periodic_ping(), loop
+                    )
+                    self.logger.info("🏓 Started connection ping monitoring")
+
+                if not self.stale_check_task or self.stale_check_task.done():
+                    self.stale_check_task = asyncio.run_coroutine_threadsafe(
+                        self._check_stale_connections(), loop
+                    )
+                    self.logger.info("🧹 Started stale connection checker")
+
+    def stop_health_monitoring(self):
+        """Stop health monitoring tasks.
+
+        WHY: Clean shutdown requires stopping background tasks to
+        prevent errors and resource leaks.
+        """
+        if self.ping_task and not self.ping_task.done():
+            self.ping_task.cancel()
+            self.logger.info("🚫 Stopped connection ping monitoring")
+
+        if self.stale_check_task and not self.stale_check_task.done():
+            self.stale_check_task.cancel()
+            self.logger.info("🚫 Stopped stale connection checker")
+
+    async def _periodic_ping(self):
+        """Send periodic pings to all connected clients.
+
+        WHY: WebSocket connections can silently fail. Regular pings
+        help detect dead connections and maintain connection state.
+        """
+        while True:
+            try:
+                await asyncio.sleep(self.ping_interval)
+
+                if not self.clients:
+                    continue
+
+                current_time = time.time()
+                disconnected = []
+
+                for sid in list(self.clients):
+                    try:
+                        # Send ping and record time
+                        await self.sio.emit('ping', {'timestamp': current_time}, room=sid)
+                        self.last_ping_times[sid] = current_time
+
+                        # Update connection metrics
+                        if sid not in self.connection_metrics:
+                            self.connection_metrics[sid] = {
+                                'connected_at': current_time,
+                                'reconnects': 0,
+                                'failures': 0,
+                                'last_activity': current_time
+                            }
+                        self.connection_metrics[sid]['last_activity'] = current_time
+
+                    except Exception as e:
+                        self.logger.warning(f"Failed to ping client {sid}: {e}")
+                        disconnected.append(sid)
+
+                # Clean up failed connections
+                for sid in disconnected:
+                    await self._cleanup_stale_connection(sid)
+
+                if self.clients:
+                    self.logger.debug(
+                        f"🏓 Sent pings to {len(self.clients)} clients, "
+                        f"{len(disconnected)} failed"
+                    )
+
+            except Exception as e:
+                self.logger.error(f"Error in periodic ping: {e}")
+
+    async def _check_stale_connections(self):
+        """Check for and clean up stale connections.
+
+        WHY: Some clients may not properly disconnect, leaving zombie
+        connections that consume resources and prevent proper cleanup.
+        """
+        while True:
+            try:
+                await asyncio.sleep(self.stale_check_interval)
+
+                current_time = time.time()
+                stale_threshold = current_time - (self.ping_timeout + self.ping_interval)
+                stale_sids = []
+
+                for sid in list(self.clients):
+                    last_ping = self.last_ping_times.get(sid, 0)
+
+                    if last_ping < stale_threshold:
+                        stale_sids.append(sid)
+                        self.logger.warning(
+                            f"🧟 Detected stale connection {sid} "
+                            f"(last ping: {current_time - last_ping:.1f}s ago)"
+                        )
+
+                # Clean up stale connections
+                for sid in stale_sids:
+                    await self._cleanup_stale_connection(sid)
+
+                if stale_sids:
+                    self.logger.info(
+                        f"🧹 Cleaned up {len(stale_sids)} stale connections"
+                    )
+
+            except Exception as e:
+                self.logger.error(f"Error checking stale connections: {e}")
+
+    async def _cleanup_stale_connection(self, sid: str):
+        """Clean up a stale or dead connection.
+
+        WHY: Proper cleanup prevents memory leaks and ensures
+        accurate connection tracking.
+        """
+        try:
+            if sid in self.clients:
+                self.clients.remove(sid)
+
+            if sid in self.last_ping_times:
+                del self.last_ping_times[sid]
+
+            if sid in self.connection_metrics:
+                metrics = self.connection_metrics[sid]
+                uptime = time.time() - metrics.get('connected_at', 0)
+                self.logger.info(
+                    f"📊 Connection {sid} stats - uptime: {uptime:.1f}s, "
+                    f"reconnects: {metrics.get('reconnects', 0)}, "
+                    f"failures: {metrics.get('failures', 0)}"
+                )
+                del self.connection_metrics[sid]
+
+            # Force disconnect if still connected
+            try:
+                await self.sio.disconnect(sid)
+            except:
+                pass  # Already disconnected
+
+            self.logger.info(f"🔌 Cleaned up stale connection: {sid}")
+
+        except Exception as e:
+            self.logger.error(f"Error cleaning up connection {sid}: {e}")
+
     def register_events(self) -> None:
         """Register connection-related event handlers."""
+
+        # Start health monitoring now that we're registering events
+        self._start_health_monitoring()
 
         @self.sio.event
+        @timeout_handler(timeout_seconds=5.0)
         async def connect(sid, environ, *args):
             """Handle client connection.
 
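With the defaults above (`ping_interval=30`, `ping_timeout=10`, `stale_check_interval=60`), a client is flagged as stale once its last successful ping is more than 40 seconds old, and the sweep runs once a minute. A tiny self-contained illustration of that rule (the sids and ages below are made up):

```python
import time

ping_interval, ping_timeout = 30, 10  # same defaults as __init__ above
now = time.time()
last_ping_times = {
    "sid-active": now - 5,    # pinged 5s ago -> healthy
    "sid-zombie": now - 120,  # silent for 2 minutes -> stale
}

stale_threshold = now - (ping_timeout + ping_interval)  # anything older than 40s ago
stale = [sid for sid, last in last_ping_times.items() if last < stale_threshold]
print(stale)  # ['sid-zombie']
```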
@@ -72,6 +324,7 @@ class ConnectionEventHandler(BaseEventHandler):
                 self.log_error(f"sending welcome to client {sid}", e)
 
         @self.sio.event
+        @timeout_handler(timeout_seconds=3.0)
         async def disconnect(sid):
             """Handle client disconnection.
 
@@ -86,8 +339,15 @@ class ConnectionEventHandler(BaseEventHandler):
                 self.logger.warning(
                     f"⚠️ Attempted to disconnect unknown client: {sid}"
                 )
+
+            # Clean up health tracking
+            if sid in self.last_ping_times:
+                del self.last_ping_times[sid]
+            if sid in self.connection_metrics:
+                del self.connection_metrics[sid]
 
         @self.sio.event
+        @timeout_handler(timeout_seconds=3.0)
         async def get_status(sid):
             """Handle status request.
 
@@ -105,6 +365,7 @@ class ConnectionEventHandler(BaseEventHandler):
             await self.emit_to_client(sid, "status", status_data)
 
         @self.sio.event
+        @timeout_handler(timeout_seconds=5.0)
         async def get_history(sid, data=None):
             """Handle history request.
 
@@ -118,6 +379,7 @@ class ConnectionEventHandler(BaseEventHandler):
             await self._send_event_history(sid, event_types=event_types, limit=limit)
 
         @self.sio.event
+        @timeout_handler(timeout_seconds=5.0)
         async def request_history(sid, data=None):
             """Handle legacy history request (for client compatibility).
 
@@ -131,6 +393,7 @@ class ConnectionEventHandler(BaseEventHandler):
             await self._send_event_history(sid, event_types=event_types, limit=limit)
 
         @self.sio.event
+        @timeout_handler(timeout_seconds=3.0)
         async def subscribe(sid, data=None):
             """Handle subscription request.
 
@@ -141,6 +404,7 @@ class ConnectionEventHandler(BaseEventHandler):
             await self.emit_to_client(sid, "subscribed", {"channels": channels})
 
         @self.sio.event
+        @timeout_handler(timeout_seconds=5.0)
         async def claude_event(sid, data):
             """Handle events from client proxies.
 
@@ -198,6 +462,25 @@ class ConnectionEventHandler(BaseEventHandler):
             self.logger.info(f"📡 Broadcasting claude_event to all clients except {sid}")
             await self.broadcast_event("claude_event", data, skip_sid=sid)
             self.logger.info(f"✅ Broadcast complete")
+
+        @self.sio.event
+        async def pong(sid, data=None):
+            """Handle pong response from client.
+
+            WHY: Clients respond to our pings with pongs, confirming
+            they're still alive and the connection is healthy.
+            """
+            current_time = time.time()
+
+            # Update last activity time
+            if sid in self.connection_metrics:
+                self.connection_metrics[sid]['last_activity'] = current_time
+
+            # Calculate round-trip time if timestamp provided
+            if data and 'timestamp' in data:
+                rtt = current_time - data['timestamp']
+                if rtt < 10:  # Reasonable RTT
+                    self.logger.debug(f"🏓 Pong from {sid}, RTT: {rtt*1000:.1f}ms")
 
     def _normalize_event(self, event_data: Dict[str, Any]) -> Dict[str, Any]:
         """Normalize event format to ensure consistency.
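The client half of this handshake lives in `socket-client.js`, which is also updated in this release but not shown here. A rough Python equivalent of what a client needs to do, sketched with the `python-socketio` client and an assumed local server URL:

```python
import asyncio

import socketio

sio = socketio.AsyncClient()


@sio.on("ping")
async def on_ping(data):
    # Echo the server's timestamp back so it can log the round-trip time.
    await sio.emit("pong", {"timestamp": data.get("timestamp")})


async def main():
    await sio.connect("http://localhost:8765")  # port assumed; use your server's URL
    await sio.wait()


asyncio.run(main())
```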
claude_mpm/services/socketio/server/broadcaster.py

@@ -10,12 +10,144 @@ to create focused, testable modules with single responsibilities.
 """
 
 import asyncio
+import time
+from collections import deque
+from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Dict, List, Optional, Set
+from typing import Any, Deque, Dict, List, Optional, Set
 
 from ....core.logging_config import get_logger
 
 
+@dataclass
+class RetryableEvent:
+    """Represents an event that can be retried on failure.
+
+    WHY: Network failures are common and transient. By tracking retry
+    attempts, we can recover from temporary issues while avoiding
+    infinite retry loops.
+    """
+    event_type: str
+    data: Dict[str, Any]
+    attempt_count: int = 0
+    max_retries: int = 3
+    created_at: float = None
+    last_attempt: float = None
+    skip_sid: Optional[str] = None
+
+    def __post_init__(self):
+        if self.created_at is None:
+            self.created_at = time.time()
+        if self.last_attempt is None:
+            self.last_attempt = time.time()
+
+    def should_retry(self) -> bool:
+        """Check if this event should be retried.
+
+        WHY: We need to balance reliability with resource usage.
+        Events older than 30 seconds or with too many attempts
+        should be abandoned.
+        """
+        if self.attempt_count >= self.max_retries:
+            return False
+
+        # Don't retry events older than 30 seconds
+        if time.time() - self.created_at > 30:
+            return False
+
+        return True
+
+    def get_backoff_delay(self) -> float:
+        """Calculate exponential backoff delay.
+
+        WHY: Exponential backoff prevents overwhelming the system
+        during recovery from failures.
+        """
+        base_delay = 1.0  # 1 second
+        max_delay = 8.0  # 8 seconds max
+
+        delay = min(base_delay * (2 ** self.attempt_count), max_delay)
+        return delay
+
+
+class RetryQueue:
+    """Manages retry queue for failed event broadcasts.
+
+    WHY: Transient network issues shouldn't cause event loss.
+    This queue provides resilient event delivery with backoff.
+    """
+
+    def __init__(self, max_size: int = 1000):
+        self.queue: Deque[RetryableEvent] = deque(maxlen=max_size)
+        self.lock = asyncio.Lock()
+        self.stats = {
+            'queued': 0,
+            'retried': 0,
+            'succeeded': 0,
+            'abandoned': 0
+        }
+
+    async def add(self, event: RetryableEvent) -> None:
+        """Add an event to the retry queue."""
+        async with self.lock:
+            self.queue.append(event)
+            self.stats['queued'] += 1
+
+    async def get_ready_events(self) -> List[RetryableEvent]:
+        """Get events that are ready for retry.
+
+        WHY: We need to respect backoff delays to avoid
+        overwhelming the system during recovery.
+        """
+        async with self.lock:
+            current_time = time.time()
+            ready = []
+
+            # Check each event in queue
+            remaining = []
+            for event in self.queue:
+                if not event.should_retry():
+                    self.stats['abandoned'] += 1
+                    continue
+
+                # First attempt (attempt_count == 0) should be immediate
+                if event.attempt_count == 0:
+                    ready.append(event)
+                else:
+                    # For retries, check backoff delay
+                    time_since_attempt = current_time - event.last_attempt
+                    if time_since_attempt >= event.get_backoff_delay():
+                        ready.append(event)
+                    else:
+                        remaining.append(event)
+
+            # Update queue with events not ready yet
+            self.queue.clear()
+            self.queue.extend(remaining)
+
+            return ready
+
+    async def mark_success(self, event: RetryableEvent) -> None:
+        """Mark an event as successfully sent."""
+        self.stats['succeeded'] += 1
+
+    async def mark_retry(self, event: RetryableEvent) -> None:
+        """Mark an event for retry."""
+        event.attempt_count += 1
+        event.last_attempt = time.time()
+        self.stats['retried'] += 1
+
+        if event.should_retry():
+            await self.add(event)
+
+    def get_stats(self) -> Dict[str, int]:
+        """Get retry queue statistics."""
+        return {
+            **self.stats,
+            'queue_size': len(self.queue)
+        }
+
+
 class SocketIOEventBroadcaster:
     """Handles broadcasting events to connected Socket.IO clients.
 
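The backoff schedule defined by `get_backoff_delay()` doubles from 1 s and caps at 8 s, while `should_retry()` gives up after three attempts or 30 seconds of age. A quick check of those numbers (import path assumed from the wheel layout):

```python
from claude_mpm.services.socketio.server.broadcaster import RetryableEvent

event = RetryableEvent(event_type="claude_event", data={"demo": True})
for attempt in range(5):
    event.attempt_count = attempt
    print(attempt, event.get_backoff_delay(), event.should_retry())
# 0 1.0 True
# 1 2.0 True
# 2 4.0 True
# 3 8.0 False   <- max_retries reached
# 4 8.0 False   <- delay stays capped at max_delay
```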
@@ -41,9 +173,113 @@ class SocketIOEventBroadcaster:
         self.logger = logger
         self.loop = None  # Will be set by main server
         self.server = server  # Reference to main server for event history
-
-
-
+
+        # Initialize retry queue for resilient delivery
+        self.retry_queue = RetryQueue(max_size=1000)
+        self.retry_task = None
+        self.retry_interval = 2.0  # Process retry queue every 2 seconds
+
+    def start_retry_processor(self):
+        """Start the background retry processor.
+
+        WHY: Failed broadcasts need to be retried automatically
+        to ensure reliable event delivery.
+        """
+        if self.loop and not self.retry_task:
+            self.retry_task = asyncio.create_task(self._process_retry_queue())
+            self.logger.info("🔄 Started retry queue processor")
+
+    def stop_retry_processor(self):
+        """Stop the background retry processor."""
+        if self.retry_task:
+            self.retry_task.cancel()
+            self.retry_task = None
+            self.logger.info("🚫 Stopped retry queue processor")
+
+    async def _process_retry_queue(self):
+        """Process the retry queue periodically.
+
+        WHY: Regular processing ensures failed events are retried
+        with appropriate backoff delays.
+        """
+        while True:
+            try:
+                await asyncio.sleep(self.retry_interval)
+
+                # Get events ready for retry
+                ready_events = await self.retry_queue.get_ready_events()
+
+                if ready_events:
+                    self.logger.debug(
+                        f"🔄 Processing {len(ready_events)} events from retry queue"
+                    )
+
+                    for event in ready_events:
+                        success = await self._retry_broadcast(event)
+
+                        if success:
+                            await self.retry_queue.mark_success(event)
+                        else:
+                            await self.retry_queue.mark_retry(event)
+
+                # Log stats periodically
+                stats = self.retry_queue.get_stats()
+                if stats['retried'] > 0 or stats['abandoned'] > 0:
+                    self.logger.info(
+                        f"📊 Retry queue stats - "
+                        f"queued: {stats['queued']}, "
+                        f"retried: {stats['retried']}, "
+                        f"succeeded: {stats['succeeded']}, "
+                        f"abandoned: {stats['abandoned']}, "
+                        f"current size: {stats['queue_size']}"
+                    )
+
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                self.logger.error(f"Error processing retry queue: {e}")
+
+    async def _retry_broadcast(self, event: RetryableEvent) -> bool:
+        """Retry broadcasting a failed event.
+
+        WHY: Isolated retry logic allows for special handling
+        and metrics tracking of retry attempts.
+        """
+        try:
+            self.logger.debug(
+                f"🔄 Retrying {event.event_type} (attempt {event.attempt_count + 1}/{event.max_retries})"
+            )
+
+            # Reconstruct the full event
+            full_event = {
+                "type": event.event_type,
+                "timestamp": datetime.now().isoformat(),
+                "data": event.data,
+                "retry_attempt": event.attempt_count + 1
+            }
+
+            # Attempt broadcast
+            if event.skip_sid:
+                await self.sio.emit("claude_event", full_event, skip_sid=event.skip_sid)
+            else:
+                await self.sio.emit("claude_event", full_event)
+
+            self.logger.debug(f"✅ Successfully retried {event.event_type}")
+            return True
+
+        except Exception as e:
+            self.logger.warning(
+                f"⚠️ Retry failed for {event.event_type} "
+                f"(attempt {event.attempt_count + 1}): {e}"
+            )
+            return False
+
+    def broadcast_event(self, event_type: str, data: Dict[str, Any], skip_sid: Optional[str] = None):
+        """Broadcast an event to all connected clients with retry support.
+
+        WHY: Enhanced with retry queue to ensure reliable delivery
+        even during transient network issues.
+        """
         if not self.sio:
             return
 
@@ -65,15 +301,27 @@ class SocketIOEventBroadcaster:
             self.logger.debug(f"Added {event_type} to history (total: {len(self.server.event_history)})")
 
         # Broadcast to all connected clients
+        broadcast_success = False
         try:
             # Use run_coroutine_threadsafe to safely call from any thread
             if hasattr(self, "loop") and self.loop and not self.loop.is_closed():
-
-
-
-
-
-
+                # Create broadcast coroutine
+                if skip_sid:
+                    coro = self.sio.emit("claude_event", event, skip_sid=skip_sid)
+                else:
+                    coro = self.sio.emit("claude_event", event)
+
+                future = asyncio.run_coroutine_threadsafe(coro, self.loop)
+
+                # Wait briefly to see if broadcast succeeds
+                try:
+                    future.result(timeout=0.5)  # 500ms timeout
+                    broadcast_success = True
+                    self.stats["events_sent"] += 1
+                    self.logger.debug(f"Broadcasted event: {event_type}")
+                except:
+                    # Will be added to retry queue below
+                    pass
             else:
                 self.logger.warning(
                     f"Cannot broadcast {event_type}: server loop not available"
@@ -81,6 +329,24 @@ class SocketIOEventBroadcaster:
 
         except Exception as e:
            self.logger.error(f"Failed to broadcast event {event_type}: {e}")
+
+        # Add to retry queue if broadcast failed
+        if not broadcast_success and self.loop:
+            retryable_event = RetryableEvent(
+                event_type=event_type,
+                data=data,
+                skip_sid=skip_sid
+            )
+
+            # Queue for retry
+            asyncio.run_coroutine_threadsafe(
+                self.retry_queue.add(retryable_event),
+                self.loop
+            )
+
+            self.logger.warning(
+                f"⚠️ Queued {event_type} for retry (queue size: {len(self.retry_queue.queue)})"
+            )
 
     def session_started(self, session_id: str, launch_method: str, working_dir: str):
         """Notify that a session has started."""