claude-mpm 4.1.5__py3-none-any.whl → 4.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. claude_mpm/VERSION +1 -1
  2. claude_mpm/agents/templates/agent-manager.json +1 -1
  3. claude_mpm/agents/templates/agent-manager.md +111 -34
  4. claude_mpm/agents/templates/research.json +39 -13
  5. claude_mpm/cli/__init__.py +2 -0
  6. claude_mpm/cli/commands/__init__.py +2 -0
  7. claude_mpm/cli/commands/configure.py +1221 -0
  8. claude_mpm/cli/commands/configure_tui.py +1921 -0
  9. claude_mpm/cli/parsers/base_parser.py +7 -0
  10. claude_mpm/cli/parsers/configure_parser.py +119 -0
  11. claude_mpm/cli/startup_logging.py +39 -12
  12. claude_mpm/config/socketio_config.py +33 -4
  13. claude_mpm/constants.py +1 -0
  14. claude_mpm/core/socketio_pool.py +35 -3
  15. claude_mpm/dashboard/static/css/connection-status.css +370 -0
  16. claude_mpm/dashboard/static/js/components/connection-debug.js +654 -0
  17. claude_mpm/dashboard/static/js/connection-manager.js +536 -0
  18. claude_mpm/dashboard/static/js/socket-client.js +40 -16
  19. claude_mpm/dashboard/templates/index.html +11 -0
  20. claude_mpm/hooks/claude_hooks/services/__init__.py +3 -1
  21. claude_mpm/hooks/claude_hooks/services/connection_manager.py +17 -0
  22. claude_mpm/hooks/claude_hooks/services/connection_manager_http.py +190 -0
  23. claude_mpm/services/diagnostics/checks/__init__.py +2 -0
  24. claude_mpm/services/diagnostics/checks/instructions_check.py +418 -0
  25. claude_mpm/services/diagnostics/diagnostic_runner.py +15 -2
  26. claude_mpm/services/event_bus/direct_relay.py +230 -0
  27. claude_mpm/services/socketio/handlers/connection_handler.py +330 -0
  28. claude_mpm/services/socketio/server/broadcaster.py +32 -1
  29. claude_mpm/services/socketio/server/connection_manager.py +547 -0
  30. claude_mpm/services/socketio/server/core.py +78 -7
  31. claude_mpm/services/socketio/server/eventbus_integration.py +20 -9
  32. claude_mpm/services/socketio/server/main.py +74 -19
  33. {claude_mpm-4.1.5.dist-info → claude_mpm-4.1.7.dist-info}/METADATA +3 -1
  34. {claude_mpm-4.1.5.dist-info → claude_mpm-4.1.7.dist-info}/RECORD +38 -41
  35. claude_mpm/agents/OUTPUT_STYLE.md +0 -73
  36. claude_mpm/agents/backups/INSTRUCTIONS.md +0 -352
  37. claude_mpm/agents/templates/OPTIMIZATION_REPORT.md +0 -156
  38. claude_mpm/agents/templates/backup/data_engineer_agent_20250726_234551.json +0 -79
  39. claude_mpm/agents/templates/backup/documentation_agent_20250726_234551.json +0 -68
  40. claude_mpm/agents/templates/backup/engineer_agent_20250726_234551.json +0 -77
  41. claude_mpm/agents/templates/backup/ops_agent_20250726_234551.json +0 -78
  42. claude_mpm/agents/templates/backup/qa_agent_20250726_234551.json +0 -67
  43. claude_mpm/agents/templates/backup/research_agent_2025011_234551.json +0 -88
  44. claude_mpm/agents/templates/backup/research_agent_20250726_234551.json +0 -72
  45. claude_mpm/agents/templates/backup/research_memory_efficient.json +0 -88
  46. claude_mpm/agents/templates/backup/security_agent_20250726_234551.json +0 -78
  47. claude_mpm/agents/templates/backup/version_control_agent_20250726_234551.json +0 -62
  48. claude_mpm/agents/templates/vercel_ops_instructions.md +0 -582
  49. {claude_mpm-4.1.5.dist-info → claude_mpm-4.1.7.dist-info}/WHEEL +0 -0
  50. {claude_mpm-4.1.5.dist-info → claude_mpm-4.1.7.dist-info}/entry_points.txt +0 -0
  51. {claude_mpm-4.1.5.dist-info → claude_mpm-4.1.7.dist-info}/licenses/LICENSE +0 -0
  52. {claude_mpm-4.1.5.dist-info → claude_mpm-4.1.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,547 @@
1
+ """
2
+ Enhanced Connection Manager for SocketIO Server.
3
+
4
+ WHY: This module provides robust connection management with state tracking,
5
+ health monitoring, event buffering for disconnected clients, and automatic
6
+ recovery from connection failures.
7
+
8
+ DESIGN DECISION: Centralized connection management ensures consistent handling
9
+ of client states, proper event delivery, and automatic recovery mechanisms.
10
+ """
11
+
12
+ import asyncio
13
+ import time
14
+ from collections import deque
15
+ from dataclasses import dataclass, field
16
+ from datetime import datetime
17
+ from enum import Enum
18
+ from typing import Any, Deque, Dict, List, Optional
19
+ from uuid import uuid4
20
+
21
+ from ....core.logging_config import get_logger
22
+
23
+
24
class ConnectionState(Enum):
    """Connection states for tracking client lifecycle.

    State transitions observed in this module:
    CONNECTING -> CONNECTED -> DISCONNECTING/DISCONNECTED -> RECONNECTING,
    with STALE applied by the health checker when a CONNECTED client stops
    showing any activity for well over the stale timeout.
    """

    CONNECTING = "connecting"        # handshake in progress
    CONNECTED = "connected"          # active, healthy socket
    DISCONNECTING = "disconnecting"  # graceful teardown started
    DISCONNECTED = "disconnected"    # socket gone; state kept for replay on reconnect
    RECONNECTING = "reconnecting"    # client attempting to re-establish
    STALE = "stale"  # Connected but not responding
34
+
35
@dataclass
class ConnectionMetrics:
    """Metrics for a single connection.

    A single instance is carried across reconnections of the same client
    (see ConnectionManager.register_connection), so counters accumulate
    over the client's whole lifetime, not just one socket session.
    """

    connect_count: int = 0       # successful connections (incl. reconnections)
    disconnect_count: int = 0    # observed disconnects
    reconnect_count: int = 0     # reconnections with a preserved client_id
    events_sent: int = 0         # events emitted to this client
    events_acked: int = 0        # events the client acknowledged
    events_buffered: int = 0     # events queued while buffering
    events_dropped: int = 0      # events lost to buffer overflow
    last_activity: float = field(default_factory=time.time)  # epoch seconds of last ping/pong/event
    total_uptime: float = 0.0    # cumulative seconds in CONNECTED state
    total_downtime: float = 0.0  # cumulative seconds between disconnect and reconnect
    connection_quality: float = 1.0  # 0-1 quality score
51
+
52
@dataclass
class ClientConnection:
    """Represents a client connection with full state tracking."""

    sid: str  # Socket ID
    client_id: str  # Persistent client ID across reconnections
    state: ConnectionState
    connected_at: float
    disconnected_at: Optional[float] = None
    last_ping: Optional[float] = None
    last_pong: Optional[float] = None
    last_event: Optional[float] = None
    event_buffer: Deque[Dict[str, Any]] = field(
        default_factory=lambda: deque(maxlen=1000)
    )
    event_sequence: int = 0
    last_acked_sequence: int = 0
    pending_acks: Dict[int, Dict[str, Any]] = field(default_factory=dict)
    metrics: ConnectionMetrics = field(default_factory=ConnectionMetrics)
    metadata: Dict[str, Any] = field(default_factory=dict)

    def is_healthy(self, timeout: float = 180.0) -> bool:
        """Check if connection is healthy based on activity.

        Args:
            timeout: Seconds before considering connection unhealthy (default 180s)
        """
        if self.state != ConnectionState.CONNECTED:
            return False

        # Most recent sign of life across pings, pongs, events, the metrics
        # tracker, and the connect time itself (None timestamps count as 0).
        timestamps = (
            self.last_ping,
            self.last_pong,
            self.last_event,
            self.metrics.last_activity,
            self.connected_at,
        )
        most_recent = max(ts or 0 for ts in timestamps)

        # Allow an extra 10% of the timeout as grace for network hiccups.
        return (time.time() - most_recent) < timeout * 1.1

    def calculate_quality(self) -> float:
        """Calculate connection quality score (0-1)."""
        if self.state != ConnectionState.CONNECTED:
            return 0.0

        m = self.metrics
        scores: List[float] = []

        # Reconnection rate (lower is better).
        if m.connect_count > 0:
            scores.append(1.0 - min(m.reconnect_count / m.connect_count, 1.0))

        # Event acknowledgment rate.
        if m.events_sent > 0:
            scores.append(m.events_acked / m.events_sent)

        # Uptime ratio over the whole observed lifetime.
        lifetime = m.total_uptime + m.total_downtime
        if lifetime > 0:
            scores.append(m.total_uptime / lifetime)

        # Recent activity, linearly decayed to 0 over 5 minutes of idleness.
        idle = time.time() - m.last_activity
        scores.append(max(0, 1.0 - (idle / 300)))

        if scores:
            quality = sum(scores) / len(scores)
        else:
            quality = 1.0 if self.state == ConnectionState.CONNECTED else 0.0

        m.connection_quality = quality
        return quality
138
class ConnectionManager:
    """
    Enhanced connection manager with robust state tracking and recovery.

    Features:
    - Persistent client IDs across reconnections
    - Event buffering for disconnected clients
    - Sequence numbers for event ordering
    - Health monitoring with automatic stale detection
    - Connection quality metrics
    - Automatic event replay on reconnection
    """

    def __init__(
        self,
        max_buffer_size: Optional[int] = None,
        event_ttl: Optional[int] = None,
    ):
        """
        Initialize connection manager with centralized configuration.

        Args:
            max_buffer_size: Maximum events to buffer per client (uses config if None)
            event_ttl: Time-to-live for buffered events in seconds (uses config if None)
        """
        from ....config.socketio_config import CONNECTION_CONFIG

        self.logger = get_logger(__name__)
        self.connections: Dict[str, ClientConnection] = {}
        self.client_mapping: Dict[str, str] = {}  # client_id -> current sid

        # Use centralized configuration with optional overrides.
        # FIX: test "is None" rather than truthiness so an explicit override
        # of 0 is honored instead of silently falling back to the config.
        self.max_buffer_size = (
            CONNECTION_CONFIG["max_events_buffer"]
            if max_buffer_size is None
            else max_buffer_size
        )
        self.event_ttl = (
            CONNECTION_CONFIG["event_ttl"] if event_ttl is None else event_ttl
        )
        self.global_sequence = 0  # monotonically increasing event sequence number
        self.health_check_interval = CONNECTION_CONFIG["health_check_interval"]  # 30 seconds
        self.stale_timeout = CONNECTION_CONFIG["stale_timeout"]  # 180 seconds (was 90)
        self.health_task: Optional[asyncio.Task] = None
        # Guards connections / client_mapping mutations across coroutines.
        self._lock = asyncio.Lock()

    async def register_connection(
        self, sid: str, client_id: Optional[str] = None
    ) -> ClientConnection:
        """
        Register a new connection or reconnection.

        On reconnection (a known client_id), the old connection's event
        buffer, sequence counters, and metrics are carried over so buffered
        events can be replayed.

        Args:
            sid: Socket ID
            client_id: Optional persistent client ID for reconnection

        Returns:
            ClientConnection object
        """
        async with self._lock:
            now = time.time()

            # Check if this is a reconnection
            if client_id and client_id in self.client_mapping:
                old_sid = self.client_mapping[client_id]
                if old_sid in self.connections:
                    old_conn = self.connections[old_sid]

                    # Create new connection carrying over the old history.
                    conn = ClientConnection(
                        sid=sid,
                        client_id=client_id,
                        state=ConnectionState.CONNECTED,
                        connected_at=now,
                        event_buffer=old_conn.event_buffer,
                        event_sequence=old_conn.event_sequence,
                        last_acked_sequence=old_conn.last_acked_sequence,
                        metrics=old_conn.metrics,
                    )

                    # Update metrics
                    conn.metrics.reconnect_count += 1
                    conn.metrics.connect_count += 1
                    if old_conn.disconnected_at:
                        conn.metrics.total_downtime += now - old_conn.disconnected_at

                    # Clean up old connection
                    del self.connections[old_sid]

                    self.logger.info(
                        f"Client {client_id} reconnected (new sid: {sid}, "
                        f"buffered events: {len(conn.event_buffer)})"
                    )
                else:
                    # No old connection found, create new
                    client_id = client_id or str(uuid4())
                    conn = self._create_new_connection(sid, client_id, now)
            else:
                # New client
                client_id = client_id or str(uuid4())
                conn = self._create_new_connection(sid, client_id, now)

            # Register connection
            self.connections[sid] = conn
            self.client_mapping[client_id] = sid

            return conn

    def _create_new_connection(
        self, sid: str, client_id: str, now: float
    ) -> ClientConnection:
        """Create a new connection object with first-connect metrics."""
        conn = ClientConnection(
            sid=sid,
            client_id=client_id,
            state=ConnectionState.CONNECTED,
            connected_at=now,
        )
        conn.metrics.connect_count = 1
        self.logger.info(f"New client connected: {client_id} (sid: {sid})")
        return conn

    async def unregister_connection(self, sid: str, reason: str = "unknown") -> None:
        """
        Unregister a connection but keep state for reconnection.

        The connection object is intentionally retained (marked DISCONNECTED)
        so a reconnecting client can resume; the health checker removes it
        later if no reconnection happens.

        Args:
            sid: Socket ID
            reason: Disconnection reason
        """
        async with self._lock:
            if sid not in self.connections:
                return

            conn = self.connections[sid]
            now = time.time()

            # Update connection state
            conn.state = ConnectionState.DISCONNECTED
            conn.disconnected_at = now
            conn.metrics.disconnect_count += 1

            # Update uptime
            if conn.connected_at:
                conn.metrics.total_uptime += now - conn.connected_at

            self.logger.info(
                f"Client {conn.client_id} disconnected (sid: {sid}, reason: {reason}, "
                f"buffered events: {len(conn.event_buffer)})"
            )

            # Keep connection for potential reconnection
            # It will be cleaned up by health check if not reconnected

    async def buffer_event(self, sid: str, event: Dict[str, Any]) -> bool:
        """
        Buffer an event for a client.

        Stamps the event with a global sequence number and timestamp, then
        appends it to the client's bounded buffer.

        Args:
            sid: Socket ID
            event: Event to buffer (mutated in place with sequence/timestamp)

        Returns:
            True if buffered successfully
        """
        async with self._lock:
            if sid not in self.connections:
                return False

            conn = self.connections[sid]

            # Add sequence number
            self.global_sequence += 1
            event["sequence"] = self.global_sequence
            event["timestamp"] = time.time()

            # FIX: the deque's maxlen governs eviction, so count the drop
            # when the append actually pushes out the oldest event. The old
            # code compared len() against self.max_buffer_size *after*
            # appending, which both miscounted and disagreed with the
            # deque's hardcoded capacity.
            buf = conn.event_buffer
            if buf.maxlen is not None and len(buf) == buf.maxlen:
                conn.metrics.events_dropped += 1

            buf.append(event)
            conn.event_sequence = self.global_sequence
            conn.metrics.events_buffered += 1

            return True

    async def get_replay_events(
        self, sid: str, last_sequence: int = 0
    ) -> List[Dict[str, Any]]:
        """
        Get events to replay for a client after reconnection.

        Args:
            sid: Socket ID
            last_sequence: Last sequence number received by client

        Returns:
            List of events to replay (newer than last_sequence and younger
            than the configured event TTL)
        """
        async with self._lock:
            if sid not in self.connections:
                return []

            conn = self.connections[sid]
            now = time.time()

            # Filter events by sequence and TTL
            replay_events = []
            for event in conn.event_buffer:
                if event.get("sequence", 0) > last_sequence:
                    # Check TTL
                    event_age = now - event.get("timestamp", 0)
                    if event_age < self.event_ttl:
                        replay_events.append(event)

            self.logger.info(
                f"Replaying {len(replay_events)} events for {conn.client_id} "
                f"(from sequence {last_sequence})"
            )

            return replay_events

    async def acknowledge_event(self, sid: str, sequence: int) -> None:
        """
        Acknowledge receipt of an event by a client.

        Args:
            sid: Socket ID
            sequence: Sequence number of acknowledged event
        """
        async with self._lock:
            if sid not in self.connections:
                return

            conn = self.connections[sid]
            # Acks may arrive out of order; only advance the high-water mark.
            conn.last_acked_sequence = max(conn.last_acked_sequence, sequence)
            conn.metrics.events_acked += 1

            # Remove from pending acks
            if sequence in conn.pending_acks:
                del conn.pending_acks[sequence]

    async def update_activity(self, sid: str, activity_type: str = "event") -> None:
        """
        Update last activity time for a connection.

        NOTE: intentionally lock-free; it only writes timestamps on one
        connection and runs on the event loop, so it cannot interleave with
        the locked sections mid-statement.

        Args:
            sid: Socket ID
            activity_type: Type of activity (event, ping, pong)
        """
        if sid not in self.connections:
            return

        conn = self.connections[sid]
        now = time.time()

        if activity_type == "ping":
            conn.last_ping = now
        elif activity_type == "pong":
            conn.last_pong = now
        else:
            conn.last_event = now

        conn.metrics.last_activity = now

    async def start_health_monitoring(self) -> None:
        """Start the health monitoring task (idempotent)."""
        if self.health_task:
            return

        self.health_task = asyncio.create_task(self._health_check_loop())
        self.logger.info("Started connection health monitoring")

    async def stop_health_monitoring(self) -> None:
        """Stop the health monitoring task, awaiting its cancellation."""
        if self.health_task:
            self.health_task.cancel()
            try:
                await self.health_task
            except asyncio.CancelledError:
                pass
            self.health_task = None
            self.logger.info("Stopped connection health monitoring")

    async def _health_check_loop(self) -> None:
        """Periodic health check for all connections."""
        while True:
            try:
                await asyncio.sleep(self.health_check_interval)
                await self.check_connection_health()
            except asyncio.CancelledError:
                break
            except Exception as e:
                # Keep the loop alive; a single failed sweep must not stop
                # health monitoring for the whole server.
                self.logger.error(f"Error in health check loop: {e}")

    async def check_connection_health(self) -> Dict[str, Any]:
        """
        Check health of all connections and clean up stale ones.

        Returns:
            Health status report with counts and per-client quality scores
        """
        async with self._lock:
            now = time.time()
            report = {
                "timestamp": datetime.now().isoformat(),
                "total_connections": len(self.connections),
                "healthy": 0,
                "stale": 0,
                "disconnected": 0,
                "cleaned": 0,
                "quality_scores": {},
            }

            to_clean = []

            for sid, conn in self.connections.items():
                # Calculate quality
                quality = conn.calculate_quality()
                report["quality_scores"][conn.client_id] = quality

                if conn.state == ConnectionState.CONNECTED:
                    if conn.is_healthy(self.stale_timeout):
                        report["healthy"] += 1
                    else:
                        # Mark as stale only if really stale (no grace period activity)
                        last_activity = max(
                            conn.last_ping or 0,
                            conn.last_pong or 0,
                            conn.last_event or 0,
                            conn.metrics.last_activity or 0,
                            conn.connected_at,
                        )
                        time_since_activity = now - last_activity

                        # Only mark as stale if significantly over timeout (2x)
                        if time_since_activity > (self.stale_timeout * 2):
                            conn.state = ConnectionState.STALE
                            report["stale"] += 1
                            self.logger.warning(
                                f"Connection {conn.client_id} marked as stale "
                                f"(last activity: {time_since_activity:.1f}s ago)"
                            )
                        else:
                            # Connection is borderline - keep it alive but log
                            report["healthy"] += 1
                            self.logger.debug(
                                f"Connection {conn.client_id} borderline "
                                f"(last activity: {time_since_activity:.1f}s ago)"
                            )

                elif conn.state == ConnectionState.DISCONNECTED:
                    report["disconnected"] += 1

                    # Clean up old disconnected connections (be conservative)
                    if (
                        conn.disconnected_at
                        and (now - conn.disconnected_at) > (self.event_ttl * 2)  # Double the TTL
                    ):
                        to_clean.append(sid)

            # Clean up old connections (after iteration so we never mutate
            # the dict while iterating it).
            for sid in to_clean:
                conn = self.connections[sid]
                del self.connections[sid]
                if conn.client_id in self.client_mapping:
                    del self.client_mapping[conn.client_id]
                report["cleaned"] += 1
                self.logger.info(f"Cleaned up old connection: {conn.client_id}")

            if report["stale"] > 0 or report["cleaned"] > 0:
                self.logger.info(
                    f"Health check: {report['healthy']} healthy, "
                    f"{report['stale']} stale, {report['disconnected']} disconnected, "
                    f"{report['cleaned']} cleaned"
                )

            return report

    def get_connection(self, sid: str) -> Optional[ClientConnection]:
        """Get connection by socket ID."""
        return self.connections.get(sid)

    def get_all_connections(self) -> Dict[str, ClientConnection]:
        """Get a shallow copy of all connections keyed by socket ID."""
        return self.connections.copy()

    def get_metrics(self) -> Dict[str, Any]:
        """Get overall connection metrics aggregated across all clients."""
        total_events_sent = sum(
            c.metrics.events_sent for c in self.connections.values()
        )
        total_events_acked = sum(
            c.metrics.events_acked for c in self.connections.values()
        )
        total_events_buffered = sum(
            c.metrics.events_buffered for c in self.connections.values()
        )
        total_events_dropped = sum(
            c.metrics.events_dropped for c in self.connections.values()
        )
        avg_quality = sum(
            c.metrics.connection_quality for c in self.connections.values()
        ) / max(len(self.connections), 1)

        return {
            "total_connections": len(self.connections),
            "active_connections": sum(
                1
                for c in self.connections.values()
                if c.state == ConnectionState.CONNECTED
            ),
            "total_events_sent": total_events_sent,
            "total_events_acked": total_events_acked,
            "total_events_buffered": total_events_buffered,
            "total_events_dropped": total_events_dropped,
            "average_quality": avg_quality,
            "global_sequence": self.global_sequence,
        }
@@ -158,20 +158,27 @@ class SocketIOServerCore:
158
158
  async def _start_server(self):
159
159
  """Start the Socket.IO server with aiohttp."""
160
160
  try:
161
- # Create Socket.IO server with proper ping/pong configuration
161
+ # Import centralized configuration for consistency
162
+ from ....config.socketio_config import CONNECTION_CONFIG
163
+
164
+ # Create Socket.IO server with centralized configuration
165
+ # CRITICAL: These values MUST match client settings to prevent disconnections
162
166
  self.sio = socketio.AsyncServer(
163
167
  cors_allowed_origins="*",
164
168
  logger=False, # Disable Socket.IO's own logging
165
169
  engineio_logger=False,
166
- ping_interval=25, # Send ping every 25 seconds
167
- ping_timeout=60, # Wait 60 seconds for pong response
168
- max_http_buffer_size=1e8, # 100MB max buffer
170
+ ping_interval=CONNECTION_CONFIG['ping_interval'], # 45 seconds from config
171
+ ping_timeout=CONNECTION_CONFIG['ping_timeout'], # 20 seconds from config
172
+ max_http_buffer_size=CONNECTION_CONFIG['max_http_buffer_size'], # 100MB from config
169
173
  )
170
174
 
171
175
  # Create aiohttp application
172
176
  self.app = web.Application()
173
177
  self.sio.attach(self.app)
174
178
 
179
+ # Setup HTTP API endpoints for receiving events from hook handlers
180
+ self._setup_http_api()
181
+
175
182
  # Find and serve static files
176
183
  self._setup_static_files()
177
184
 
@@ -193,9 +200,13 @@ class SocketIOServerCore:
193
200
  if self.static_path:
194
201
  self.logger.info(f"Serving static files from: {self.static_path}")
195
202
 
196
- # Start heartbeat task
197
- self.heartbeat_task = asyncio.create_task(self._heartbeat_loop())
198
- self.logger.info("Started system heartbeat task")
203
+ # Conditionally start heartbeat task based on configuration
204
+ from ....config.socketio_config import CONNECTION_CONFIG
205
+ if CONNECTION_CONFIG.get('enable_extra_heartbeat', False):
206
+ self.heartbeat_task = asyncio.create_task(self._heartbeat_loop())
207
+ self.logger.info("Started system heartbeat task")
208
+ else:
209
+ self.logger.info("System heartbeat disabled (using Socket.IO ping/pong instead)")
199
210
 
200
211
  # Keep the server running
201
212
  while self.running:
@@ -229,6 +240,48 @@ class SocketIOServerCore:
229
240
  except Exception as e:
230
241
  self.logger.error(f"Error stopping Socket.IO server: {e}")
231
242
 
243
    def _setup_http_api(self):
        """Setup HTTP API endpoints for receiving events from hook handlers.

        WHY: Hook handlers are ephemeral processes that spawn and die quickly.
        Using HTTP POST allows them to send events without managing persistent
        connections, eliminating disconnection issues.
        """

        async def api_events_handler(request):
            """Handle POST /api/events from hook handlers.

            Accepts a JSON body already shaped as a claude_event, rebroadcasts
            it to connected dashboard clients, and buffers it for late joiners.
            Returns 204 on success, 500 with the error text on failure.
            """
            try:
                # Parse JSON payload
                event_data = await request.json()

                # Log receipt if debugging
                event_type = event_data.get("subtype", "unknown")
                self.logger.debug(f"Received HTTP event: {event_type}")

                # Broadcast to all connected dashboard clients via SocketIO
                if self.sio:
                    # The event is already in claude_event format from the hook handler
                    await self.sio.emit("claude_event", event_data)

                # Update stats
                self.stats["events_sent"] = self.stats.get("events_sent", 0) + 1

                # Add to event buffer for late-joining clients
                # NOTE(review): buffer_lock looks like a synchronous lock used
                # inside an async handler — presumably the append is fast
                # enough not to block the loop; confirm.
                with self.buffer_lock:
                    self.event_buffer.append(event_data)
                    self.stats["events_buffered"] = len(self.event_buffer)

                # Return 204 No Content for success
                return web.Response(status=204)

            except Exception as e:
                # Broad catch is deliberate: a malformed event must not kill
                # the endpoint; report the failure to the sender instead.
                self.logger.error(f"Error handling HTTP event: {e}")
                return web.Response(status=500, text=str(e))

        # Register the HTTP POST endpoint
        self.app.router.add_post("/api/events", api_events_handler)
        self.logger.info("✅ HTTP API endpoint registered at /api/events")
232
285
  def _setup_static_files(self):
233
286
  """Setup static file serving for the dashboard."""
234
287
  try:
@@ -261,6 +314,24 @@ class SocketIOServerCore:
261
314
 
262
315
  self.app.router.add_get("/", index_handler)
263
316
 
317
+ # Serve the actual dashboard template at /dashboard
318
+ async def dashboard_handler(request):
319
+ dashboard_template = (
320
+ self.dashboard_path.parent / "templates" / "index.html"
321
+ )
322
+ if dashboard_template.exists():
323
+ self.logger.debug(
324
+ f"Serving dashboard template from: {dashboard_template}"
325
+ )
326
+ return web.FileResponse(dashboard_template)
327
+ # Fallback to the main index if template doesn't exist
328
+ self.logger.warning(
329
+ f"Dashboard template not found at: {dashboard_template}, falling back to index"
330
+ )
331
+ return await index_handler(request)
332
+
333
+ self.app.router.add_get("/dashboard", dashboard_handler)
334
+
264
335
  # Serve version.json from dashboard directory
265
336
  async def version_handler(request):
266
337
  version_file = self.dashboard_path / "version.json"