claude-mpm 4.1.5__py3-none-any.whl → 4.1.7__py3-none-any.whl
This diff compares the contents of two package versions that have been publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- claude_mpm/VERSION +1 -1
- claude_mpm/agents/templates/agent-manager.json +1 -1
- claude_mpm/agents/templates/agent-manager.md +111 -34
- claude_mpm/agents/templates/research.json +39 -13
- claude_mpm/cli/__init__.py +2 -0
- claude_mpm/cli/commands/__init__.py +2 -0
- claude_mpm/cli/commands/configure.py +1221 -0
- claude_mpm/cli/commands/configure_tui.py +1921 -0
- claude_mpm/cli/parsers/base_parser.py +7 -0
- claude_mpm/cli/parsers/configure_parser.py +119 -0
- claude_mpm/cli/startup_logging.py +39 -12
- claude_mpm/config/socketio_config.py +33 -4
- claude_mpm/constants.py +1 -0
- claude_mpm/core/socketio_pool.py +35 -3
- claude_mpm/dashboard/static/css/connection-status.css +370 -0
- claude_mpm/dashboard/static/js/components/connection-debug.js +654 -0
- claude_mpm/dashboard/static/js/connection-manager.js +536 -0
- claude_mpm/dashboard/static/js/socket-client.js +40 -16
- claude_mpm/dashboard/templates/index.html +11 -0
- claude_mpm/hooks/claude_hooks/services/__init__.py +3 -1
- claude_mpm/hooks/claude_hooks/services/connection_manager.py +17 -0
- claude_mpm/hooks/claude_hooks/services/connection_manager_http.py +190 -0
- claude_mpm/services/diagnostics/checks/__init__.py +2 -0
- claude_mpm/services/diagnostics/checks/instructions_check.py +418 -0
- claude_mpm/services/diagnostics/diagnostic_runner.py +15 -2
- claude_mpm/services/event_bus/direct_relay.py +230 -0
- claude_mpm/services/socketio/handlers/connection_handler.py +330 -0
- claude_mpm/services/socketio/server/broadcaster.py +32 -1
- claude_mpm/services/socketio/server/connection_manager.py +547 -0
- claude_mpm/services/socketio/server/core.py +78 -7
- claude_mpm/services/socketio/server/eventbus_integration.py +20 -9
- claude_mpm/services/socketio/server/main.py +74 -19
- {claude_mpm-4.1.5.dist-info → claude_mpm-4.1.7.dist-info}/METADATA +3 -1
- {claude_mpm-4.1.5.dist-info → claude_mpm-4.1.7.dist-info}/RECORD +38 -41
- claude_mpm/agents/OUTPUT_STYLE.md +0 -73
- claude_mpm/agents/backups/INSTRUCTIONS.md +0 -352
- claude_mpm/agents/templates/OPTIMIZATION_REPORT.md +0 -156
- claude_mpm/agents/templates/backup/data_engineer_agent_20250726_234551.json +0 -79
- claude_mpm/agents/templates/backup/documentation_agent_20250726_234551.json +0 -68
- claude_mpm/agents/templates/backup/engineer_agent_20250726_234551.json +0 -77
- claude_mpm/agents/templates/backup/ops_agent_20250726_234551.json +0 -78
- claude_mpm/agents/templates/backup/qa_agent_20250726_234551.json +0 -67
- claude_mpm/agents/templates/backup/research_agent_2025011_234551.json +0 -88
- claude_mpm/agents/templates/backup/research_agent_20250726_234551.json +0 -72
- claude_mpm/agents/templates/backup/research_memory_efficient.json +0 -88
- claude_mpm/agents/templates/backup/security_agent_20250726_234551.json +0 -78
- claude_mpm/agents/templates/backup/version_control_agent_20250726_234551.json +0 -62
- claude_mpm/agents/templates/vercel_ops_instructions.md +0 -582
- {claude_mpm-4.1.5.dist-info → claude_mpm-4.1.7.dist-info}/WHEEL +0 -0
- {claude_mpm-4.1.5.dist-info → claude_mpm-4.1.7.dist-info}/entry_points.txt +0 -0
- {claude_mpm-4.1.5.dist-info → claude_mpm-4.1.7.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-4.1.5.dist-info → claude_mpm-4.1.7.dist-info}/top_level.txt +0 -0
claude_mpm/services/socketio/server/connection_manager.py (new file)

@@ -0,0 +1,547 @@
+"""
+Enhanced Connection Manager for SocketIO Server.
+
+WHY: This module provides robust connection management with state tracking,
+health monitoring, event buffering for disconnected clients, and automatic
+recovery from connection failures.
+
+DESIGN DECISION: Centralized connection management ensures consistent handling
+of client states, proper event delivery, and automatic recovery mechanisms.
+"""
+
+import asyncio
+import time
+from collections import deque
+from dataclasses import dataclass, field
+from datetime import datetime
+from enum import Enum
+from typing import Any, Deque, Dict, List, Optional
+from uuid import uuid4
+
+from ....core.logging_config import get_logger
+
+
+class ConnectionState(Enum):
+    """Connection states for tracking client lifecycle."""
+
+    CONNECTING = "connecting"
+    CONNECTED = "connected"
+    DISCONNECTING = "disconnecting"
+    DISCONNECTED = "disconnected"
+    RECONNECTING = "reconnecting"
+    STALE = "stale"  # Connected but not responding
+
+
+@dataclass
+class ConnectionMetrics:
+    """Metrics for a single connection."""
+
+    connect_count: int = 0
+    disconnect_count: int = 0
+    reconnect_count: int = 0
+    events_sent: int = 0
+    events_acked: int = 0
+    events_buffered: int = 0
+    events_dropped: int = 0
+    last_activity: float = field(default_factory=time.time)
+    total_uptime: float = 0.0
+    total_downtime: float = 0.0
+    connection_quality: float = 1.0  # 0-1 quality score
+
+
+@dataclass
+class ClientConnection:
+    """Represents a client connection with full state tracking."""
+
+    sid: str  # Socket ID
+    client_id: str  # Persistent client ID across reconnections
+    state: ConnectionState
+    connected_at: float
+    disconnected_at: Optional[float] = None
+    last_ping: Optional[float] = None
+    last_pong: Optional[float] = None
+    last_event: Optional[float] = None
+    event_buffer: Deque[Dict[str, Any]] = field(
+        default_factory=lambda: deque(maxlen=1000)
+    )
+    event_sequence: int = 0
+    last_acked_sequence: int = 0
+    pending_acks: Dict[int, Dict[str, Any]] = field(default_factory=dict)
+    metrics: ConnectionMetrics = field(default_factory=ConnectionMetrics)
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+    def is_healthy(self, timeout: float = 180.0) -> bool:
+        """Check if connection is healthy based on activity.
+
+        Args:
+            timeout: Seconds before considering connection unhealthy (default 180s)
+        """
+        if self.state != ConnectionState.CONNECTED:
+            return False
+
+        now = time.time()
+
+        # Check last activity (ping, pong, or event)
+        # Include metrics.last_activity for more comprehensive tracking
+        last_activity = max(
+            self.last_ping or 0,
+            self.last_pong or 0,
+            self.last_event or 0,
+            self.metrics.last_activity or 0,
+            self.connected_at,
+        )
+
+        # Add grace period for network hiccups (additional 10% of timeout)
+        grace_period = timeout * 1.1
+        return (now - last_activity) < grace_period
+
+    def calculate_quality(self) -> float:
+        """Calculate connection quality score (0-1)."""
+        if self.state != ConnectionState.CONNECTED:
+            return 0.0
+
+        # Factors for quality calculation
+        factors = []
+
+        # Reconnection rate (lower is better)
+        if self.metrics.connect_count > 0:
+            reconnect_rate = self.metrics.reconnect_count / self.metrics.connect_count
+            factors.append(1.0 - min(reconnect_rate, 1.0))
+
+        # Event acknowledgment rate
+        if self.metrics.events_sent > 0:
+            ack_rate = self.metrics.events_acked / self.metrics.events_sent
+            factors.append(ack_rate)
+
+        # Uptime ratio
+        total_time = self.metrics.total_uptime + self.metrics.total_downtime
+        if total_time > 0:
+            uptime_ratio = self.metrics.total_uptime / total_time
+            factors.append(uptime_ratio)
+
+        # Recent activity (exponential decay over 5 minutes)
+        now = time.time()
+        time_since_activity = now - self.metrics.last_activity
+        activity_score = max(0, 1.0 - (time_since_activity / 300))
+        factors.append(activity_score)
+
+        # Calculate average quality
+        if factors:
+            quality = sum(factors) / len(factors)
+        else:
+            quality = 1.0 if self.state == ConnectionState.CONNECTED else 0.0
+
+        self.metrics.connection_quality = quality
+        return quality
+
+
+class ConnectionManager:
+    """
+    Enhanced connection manager with robust state tracking and recovery.
+
+    Features:
+    - Persistent client IDs across reconnections
+    - Event buffering for disconnected clients
+    - Sequence numbers for event ordering
+    - Health monitoring with automatic stale detection
+    - Connection quality metrics
+    - Automatic event replay on reconnection
+    """
+
+    def __init__(self, max_buffer_size: int = None, event_ttl: int = None):
+        """
+        Initialize connection manager with centralized configuration.
+
+        Args:
+            max_buffer_size: Maximum events to buffer per client (uses config if None)
+            event_ttl: Time-to-live for buffered events in seconds (uses config if None)
+        """
+        from ....config.socketio_config import CONNECTION_CONFIG
+
+        self.logger = get_logger(__name__)
+        self.connections: Dict[str, ClientConnection] = {}
+        self.client_mapping: Dict[str, str] = {}  # client_id -> current sid
+
+        # Use centralized configuration with optional overrides
+        self.max_buffer_size = max_buffer_size or CONNECTION_CONFIG['max_events_buffer']
+        self.event_ttl = event_ttl or CONNECTION_CONFIG['event_ttl']
+        self.global_sequence = 0
+        self.health_check_interval = CONNECTION_CONFIG['health_check_interval']  # 30 seconds
+        self.stale_timeout = CONNECTION_CONFIG['stale_timeout']  # 180 seconds (was 90)
+        self.health_task = None
+        self._lock = asyncio.Lock()
+
+    async def register_connection(
+        self, sid: str, client_id: Optional[str] = None
+    ) -> ClientConnection:
+        """
+        Register a new connection or reconnection.
+
+        Args:
+            sid: Socket ID
+            client_id: Optional persistent client ID for reconnection
+
+        Returns:
+            ClientConnection object
+        """
+        async with self._lock:
+            now = time.time()
+
+            # Check if this is a reconnection
+            if client_id and client_id in self.client_mapping:
+                old_sid = self.client_mapping[client_id]
+                if old_sid in self.connections:
+                    old_conn = self.connections[old_sid]
+
+                    # Create new connection with history
+                    conn = ClientConnection(
+                        sid=sid,
+                        client_id=client_id,
+                        state=ConnectionState.CONNECTED,
+                        connected_at=now,
+                        event_buffer=old_conn.event_buffer,
+                        event_sequence=old_conn.event_sequence,
+                        last_acked_sequence=old_conn.last_acked_sequence,
+                        metrics=old_conn.metrics,
+                    )
+
+                    # Update metrics
+                    conn.metrics.reconnect_count += 1
+                    conn.metrics.connect_count += 1
+                    if old_conn.disconnected_at:
+                        conn.metrics.total_downtime += now - old_conn.disconnected_at
+
+                    # Clean up old connection
+                    del self.connections[old_sid]
+
+                    self.logger.info(
+                        f"Client {client_id} reconnected (new sid: {sid}, "
+                        f"buffered events: {len(conn.event_buffer)})"
+                    )
+                else:
+                    # No old connection found, create new
+                    client_id = client_id or str(uuid4())
+                    conn = self._create_new_connection(sid, client_id, now)
+            else:
+                # New client
+                client_id = client_id or str(uuid4())
+                conn = self._create_new_connection(sid, client_id, now)
+
+            # Register connection
+            self.connections[sid] = conn
+            self.client_mapping[client_id] = sid
+
+            return conn
+
+    def _create_new_connection(
+        self, sid: str, client_id: str, now: float
+    ) -> ClientConnection:
+        """Create a new connection object."""
+        conn = ClientConnection(
+            sid=sid,
+            client_id=client_id,
+            state=ConnectionState.CONNECTED,
+            connected_at=now,
+        )
+        conn.metrics.connect_count = 1
+        self.logger.info(f"New client connected: {client_id} (sid: {sid})")
+        return conn
+
+    async def unregister_connection(self, sid: str, reason: str = "unknown") -> None:
+        """
+        Unregister a connection but keep state for reconnection.
+
+        Args:
+            sid: Socket ID
+            reason: Disconnection reason
+        """
+        async with self._lock:
+            if sid not in self.connections:
+                return
+
+            conn = self.connections[sid]
+            now = time.time()
+
+            # Update connection state
+            conn.state = ConnectionState.DISCONNECTED
+            conn.disconnected_at = now
+            conn.metrics.disconnect_count += 1
+
+            # Update uptime
+            if conn.connected_at:
+                conn.metrics.total_uptime += now - conn.connected_at
+
+            self.logger.info(
+                f"Client {conn.client_id} disconnected (sid: {sid}, reason: {reason}, "
+                f"buffered events: {len(conn.event_buffer)})"
+            )
+
+            # Keep connection for potential reconnection
+            # It will be cleaned up by health check if not reconnected
+
+    async def buffer_event(self, sid: str, event: Dict[str, Any]) -> bool:
+        """
+        Buffer an event for a client.
+
+        Args:
+            sid: Socket ID
+            event: Event to buffer
+
+        Returns:
+            True if buffered successfully
+        """
+        async with self._lock:
+            if sid not in self.connections:
+                return False
+
+            conn = self.connections[sid]
+
+            # Add sequence number
+            self.global_sequence += 1
+            event["sequence"] = self.global_sequence
+            event["timestamp"] = time.time()
+
+            # Buffer the event
+            conn.event_buffer.append(event)
+            conn.event_sequence = self.global_sequence
+            conn.metrics.events_buffered += 1
+
+            # Drop old events if buffer is full
+            if len(conn.event_buffer) >= self.max_buffer_size:
+                conn.metrics.events_dropped += 1
+
+            return True
+
+    async def get_replay_events(
+        self, sid: str, last_sequence: int = 0
+    ) -> List[Dict[str, Any]]:
+        """
+        Get events to replay for a client after reconnection.
+
+        Args:
+            sid: Socket ID
+            last_sequence: Last sequence number received by client
+
+        Returns:
+            List of events to replay
+        """
+        async with self._lock:
+            if sid not in self.connections:
+                return []
+
+            conn = self.connections[sid]
+            now = time.time()
+
+            # Filter events by sequence and TTL
+            replay_events = []
+            for event in conn.event_buffer:
+                if event.get("sequence", 0) > last_sequence:
+                    # Check TTL
+                    event_age = now - event.get("timestamp", 0)
+                    if event_age < self.event_ttl:
+                        replay_events.append(event)
+
+            self.logger.info(
+                f"Replaying {len(replay_events)} events for {conn.client_id} "
+                f"(from sequence {last_sequence})"
+            )
+
+            return replay_events
+
+    async def acknowledge_event(self, sid: str, sequence: int) -> None:
+        """
+        Acknowledge receipt of an event by a client.
+
+        Args:
+            sid: Socket ID
+            sequence: Sequence number of acknowledged event
+        """
+        async with self._lock:
+            if sid not in self.connections:
+                return
+
+            conn = self.connections[sid]
+            conn.last_acked_sequence = max(conn.last_acked_sequence, sequence)
+            conn.metrics.events_acked += 1
+
+            # Remove from pending acks
+            if sequence in conn.pending_acks:
+                del conn.pending_acks[sequence]
+
+    async def update_activity(self, sid: str, activity_type: str = "event") -> None:
+        """
+        Update last activity time for a connection.
+
+        Args:
+            sid: Socket ID
+            activity_type: Type of activity (event, ping, pong)
+        """
+        if sid not in self.connections:
+            return
+
+        conn = self.connections[sid]
+        now = time.time()
+
+        if activity_type == "ping":
+            conn.last_ping = now
+        elif activity_type == "pong":
+            conn.last_pong = now
+        else:
+            conn.last_event = now
+
+        conn.metrics.last_activity = now
+
+    async def start_health_monitoring(self) -> None:
+        """Start the health monitoring task."""
+        if self.health_task:
+            return
+
+        self.health_task = asyncio.create_task(self._health_check_loop())
+        self.logger.info("Started connection health monitoring")
+
+    async def stop_health_monitoring(self) -> None:
+        """Stop the health monitoring task."""
+        if self.health_task:
+            self.health_task.cancel()
+            try:
+                await self.health_task
+            except asyncio.CancelledError:
+                pass
+            self.health_task = None
+            self.logger.info("Stopped connection health monitoring")
+
+    async def _health_check_loop(self) -> None:
+        """Periodic health check for all connections."""
+        while True:
+            try:
+                await asyncio.sleep(self.health_check_interval)
+                await self.check_connection_health()
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                self.logger.error(f"Error in health check loop: {e}")
+
+    async def check_connection_health(self) -> Dict[str, Any]:
+        """
+        Check health of all connections and clean up stale ones.
+
+        Returns:
+            Health status report
+        """
+        async with self._lock:
+            now = time.time()
+            report = {
+                "timestamp": datetime.now().isoformat(),
+                "total_connections": len(self.connections),
+                "healthy": 0,
+                "stale": 0,
+                "disconnected": 0,
+                "cleaned": 0,
+                "quality_scores": {},
+            }
+
+            to_clean = []
+
+            for sid, conn in self.connections.items():
+                # Calculate quality
+                quality = conn.calculate_quality()
+                report["quality_scores"][conn.client_id] = quality
+
+                if conn.state == ConnectionState.CONNECTED:
+                    if conn.is_healthy(self.stale_timeout):
+                        report["healthy"] += 1
+                    else:
+                        # Mark as stale only if really stale (no grace period activity)
+                        last_activity = max(
+                            conn.last_ping or 0,
+                            conn.last_pong or 0,
+                            conn.last_event or 0,
+                            conn.metrics.last_activity or 0,
+                            conn.connected_at,
+                        )
+                        time_since_activity = now - last_activity
+
+                        # Only mark as stale if significantly over timeout (2x)
+                        if time_since_activity > (self.stale_timeout * 2):
+                            conn.state = ConnectionState.STALE
+                            report["stale"] += 1
+                            self.logger.warning(
+                                f"Connection {conn.client_id} marked as stale "
+                                f"(last activity: {time_since_activity:.1f}s ago)"
+                            )
+                        else:
+                            # Connection is borderline - keep it alive but log
+                            report["healthy"] += 1
+                            self.logger.debug(
+                                f"Connection {conn.client_id} borderline "
+                                f"(last activity: {time_since_activity:.1f}s ago)"
+                            )
+
+                elif conn.state == ConnectionState.DISCONNECTED:
+                    report["disconnected"] += 1
+
+                    # Clean up old disconnected connections (be conservative)
+                    if (
+                        conn.disconnected_at
+                        and (now - conn.disconnected_at) > (self.event_ttl * 2)  # Double the TTL
+                    ):
+                        to_clean.append(sid)
+
+            # Clean up old connections
+            for sid in to_clean:
+                conn = self.connections[sid]
+                del self.connections[sid]
+                if conn.client_id in self.client_mapping:
+                    del self.client_mapping[conn.client_id]
+                report["cleaned"] += 1
+                self.logger.info(f"Cleaned up old connection: {conn.client_id}")
+
+            if report["stale"] > 0 or report["cleaned"] > 0:
+                self.logger.info(
+                    f"Health check: {report['healthy']} healthy, "
+                    f"{report['stale']} stale, {report['disconnected']} disconnected, "
+                    f"{report['cleaned']} cleaned"
+                )
+
+            return report
+
+    def get_connection(self, sid: str) -> Optional[ClientConnection]:
+        """Get connection by socket ID."""
+        return self.connections.get(sid)
+
+    def get_all_connections(self) -> Dict[str, ClientConnection]:
+        """Get all connections."""
+        return self.connections.copy()
+
+    def get_metrics(self) -> Dict[str, Any]:
+        """Get overall connection metrics."""
+        total_events_sent = sum(
+            c.metrics.events_sent for c in self.connections.values()
+        )
+        total_events_acked = sum(
+            c.metrics.events_acked for c in self.connections.values()
+        )
+        total_events_buffered = sum(
+            c.metrics.events_buffered for c in self.connections.values()
+        )
+        total_events_dropped = sum(
+            c.metrics.events_dropped for c in self.connections.values()
+        )
+        avg_quality = sum(
+            c.metrics.connection_quality for c in self.connections.values()
+        ) / max(len(self.connections), 1)
+
+        return {
+            "total_connections": len(self.connections),
+            "active_connections": sum(
+                1
+                for c in self.connections.values()
+                if c.state == ConnectionState.CONNECTED
+            ),
+            "total_events_sent": total_events_sent,
+            "total_events_acked": total_events_acked,
+            "total_events_buffered": total_events_buffered,
+            "total_events_dropped": total_events_dropped,
+            "average_quality": avg_quality,
+            "global_sequence": self.global_sequence,
+        }
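For orientation, the sketch below shows how server code might drive this ConnectionManager through one client lifecycle, using only the methods defined in the new file. It is an illustrative usage sketch, not code from the package: the socket IDs, client ID, event payload, and buffer/TTL values are made up, and the import path is assumed to follow the file path listed above.

    import asyncio

    from claude_mpm.services.socketio.server.connection_manager import ConnectionManager


    async def demo() -> None:
        # Small buffer and TTL just for the demo; the real server uses CONNECTION_CONFIG.
        manager = ConnectionManager(max_buffer_size=100, event_ttl=300)
        await manager.start_health_monitoring()

        # A dashboard client connects; "sid-1" / "client-abc" are made-up IDs.
        await manager.register_connection("sid-1", client_id="client-abc")

        # Buffer an event for that client and record activity.
        await manager.buffer_event("sid-1", {"subtype": "demo", "data": {"n": 1}})
        await manager.update_activity("sid-1", activity_type="event")

        # The client drops and later reconnects with a new socket ID but the same
        # persistent client_id, so its buffer, sequence, and metrics carry over.
        await manager.unregister_connection("sid-1", reason="transport closed")
        await manager.register_connection("sid-2", client_id="client-abc")

        # Replay anything the client missed (it reports the last sequence it saw),
        # then acknowledge each replayed event.
        missed = await manager.get_replay_events("sid-2", last_sequence=0)
        for event in missed:
            await manager.acknowledge_event("sid-2", event["sequence"])

        print(manager.get_metrics())
        await manager.stop_health_monitoring()


    if __name__ == "__main__":
        asyncio.run(demo())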
claude_mpm/services/socketio/server/core.py

@@ -158,20 +158,27 @@ class SocketIOServerCore:
     async def _start_server(self):
         """Start the Socket.IO server with aiohttp."""
         try:
-            #
+            # Import centralized configuration for consistency
+            from ....config.socketio_config import CONNECTION_CONFIG
+
+            # Create Socket.IO server with centralized configuration
+            # CRITICAL: These values MUST match client settings to prevent disconnections
             self.sio = socketio.AsyncServer(
                 cors_allowed_origins="*",
                 logger=False,  # Disable Socket.IO's own logging
                 engineio_logger=False,
-                ping_interval=
-                ping_timeout=
-                max_http_buffer_size=
+                ping_interval=CONNECTION_CONFIG['ping_interval'],  # 45 seconds from config
+                ping_timeout=CONNECTION_CONFIG['ping_timeout'],  # 20 seconds from config
+                max_http_buffer_size=CONNECTION_CONFIG['max_http_buffer_size'],  # 100MB from config
             )

             # Create aiohttp application
             self.app = web.Application()
             self.sio.attach(self.app)

+            # Setup HTTP API endpoints for receiving events from hook handlers
+            self._setup_http_api()
+
             # Find and serve static files
             self._setup_static_files()

@@ -193,9 +200,13 @@ class SocketIOServerCore:
             if self.static_path:
                 self.logger.info(f"Serving static files from: {self.static_path}")

-            #
-
-
+            # Conditionally start heartbeat task based on configuration
+            from ....config.socketio_config import CONNECTION_CONFIG
+            if CONNECTION_CONFIG.get('enable_extra_heartbeat', False):
+                self.heartbeat_task = asyncio.create_task(self._heartbeat_loop())
+                self.logger.info("Started system heartbeat task")
+            else:
+                self.logger.info("System heartbeat disabled (using Socket.IO ping/pong instead)")

             # Keep the server running
             while self.running:
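Both hunks above pull their tuning values from CONNECTION_CONFIG in claude_mpm/config/socketio_config.py (also changed in this release, +33 -4). That file's contents are not shown in this section, so the dict below is only an illustrative sketch of the keys referenced in the diff: the timing values come from the inline comments (45 s ping interval, 20 s ping timeout, 100 MB buffer, 30 s health check, 180 s stale timeout), while the remaining entries are placeholder assumptions.

    # Illustrative sketch only -- the real CONNECTION_CONFIG lives in
    # claude_mpm/config/socketio_config.py and may define different or additional keys.
    CONNECTION_CONFIG = {
        "ping_interval": 45,                         # seconds, per comment in the diff
        "ping_timeout": 20,                          # seconds, per comment in the diff
        "max_http_buffer_size": 100 * 1024 * 1024,   # 100MB, per comment in the diff
        "health_check_interval": 30,                 # seconds, per comment in the diff
        "stale_timeout": 180,                        # seconds (was 90), per comment in the diff
        "max_events_buffer": 1000,                   # assumption: per-client event buffer size
        "event_ttl": 300,                            # assumption: seconds to keep buffered events
        "enable_extra_heartbeat": False,             # default used by the .get() call above
    }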
@@ -229,6 +240,48 @@ class SocketIOServerCore:
         except Exception as e:
             self.logger.error(f"Error stopping Socket.IO server: {e}")

+    def _setup_http_api(self):
+        """Setup HTTP API endpoints for receiving events from hook handlers.
+
+        WHY: Hook handlers are ephemeral processes that spawn and die quickly.
+        Using HTTP POST allows them to send events without managing persistent
+        connections, eliminating disconnection issues.
+        """
+
+        async def api_events_handler(request):
+            """Handle POST /api/events from hook handlers."""
+            try:
+                # Parse JSON payload
+                event_data = await request.json()
+
+                # Log receipt if debugging
+                event_type = event_data.get("subtype", "unknown")
+                self.logger.debug(f"Received HTTP event: {event_type}")
+
+                # Broadcast to all connected dashboard clients via SocketIO
+                if self.sio:
+                    # The event is already in claude_event format from the hook handler
+                    await self.sio.emit("claude_event", event_data)
+
+                    # Update stats
+                    self.stats["events_sent"] = self.stats.get("events_sent", 0) + 1
+
+                    # Add to event buffer for late-joining clients
+                    with self.buffer_lock:
+                        self.event_buffer.append(event_data)
+                        self.stats["events_buffered"] = len(self.event_buffer)
+
+                # Return 204 No Content for success
+                return web.Response(status=204)
+
+            except Exception as e:
+                self.logger.error(f"Error handling HTTP event: {e}")
+                return web.Response(status=500, text=str(e))
+
+        # Register the HTTP POST endpoint
+        self.app.router.add_post("/api/events", api_events_handler)
+        self.logger.info("✅ HTTP API endpoint registered at /api/events")
+
     def _setup_static_files(self):
         """Setup static file serving for the dashboard."""
         try:
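Because hook handlers are ephemeral, a single HTTP POST is all a sender needs against the /api/events endpoint registered in the hunk above. The snippet below is a hypothetical sender, not part of the package: the host/port and the payload fields other than "subtype" are assumptions; the endpoint itself only requires a JSON body and returns 204 on success.

    import json
    import urllib.request

    # Assumed address; the actual host/port come from the claude-mpm Socket.IO server config.
    URL = "http://localhost:8765/api/events"

    # Minimal illustrative payload in the claude_event shape; only "subtype" is read
    # explicitly by the server (for debug logging) before the event is broadcast.
    event = {"event": "claude_event", "subtype": "demo_event", "data": {"message": "hello"}}

    req = urllib.request.Request(
        URL,
        data=json.dumps(event).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        assert resp.status == 204  # server replies 204 No Content on success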
@@ -261,6 +314,24 @@ class SocketIOServerCore:

         self.app.router.add_get("/", index_handler)

+        # Serve the actual dashboard template at /dashboard
+        async def dashboard_handler(request):
+            dashboard_template = (
+                self.dashboard_path.parent / "templates" / "index.html"
+            )
+            if dashboard_template.exists():
+                self.logger.debug(
+                    f"Serving dashboard template from: {dashboard_template}"
+                )
+                return web.FileResponse(dashboard_template)
+            # Fallback to the main index if template doesn't exist
+            self.logger.warning(
+                f"Dashboard template not found at: {dashboard_template}, falling back to index"
+            )
+            return await index_handler(request)
+
+        self.app.router.add_get("/dashboard", dashboard_handler)
+
         # Serve version.json from dashboard directory
         async def version_handler(request):
             version_file = self.dashboard_path / "version.json"