claude-mpm 4.2.9-py3-none-any.whl → 4.2.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. claude_mpm/VERSION +1 -1
  2. claude_mpm/cli/commands/dashboard.py +59 -126
  3. claude_mpm/cli/commands/monitor.py +71 -212
  4. claude_mpm/cli/commands/run.py +33 -33
  5. claude_mpm/dashboard/static/css/code-tree.css +8 -16
  6. claude_mpm/dashboard/static/dist/components/code-tree.js +1 -1
  7. claude_mpm/dashboard/static/dist/components/file-viewer.js +2 -0
  8. claude_mpm/dashboard/static/dist/components/module-viewer.js +1 -1
  9. claude_mpm/dashboard/static/dist/components/unified-data-viewer.js +1 -1
  10. claude_mpm/dashboard/static/dist/dashboard.js +1 -1
  11. claude_mpm/dashboard/static/dist/socket-client.js +1 -1
  12. claude_mpm/dashboard/static/js/components/code-tree.js +692 -114
  13. claude_mpm/dashboard/static/js/components/file-viewer.js +538 -0
  14. claude_mpm/dashboard/static/js/components/module-viewer.js +26 -0
  15. claude_mpm/dashboard/static/js/components/unified-data-viewer.js +166 -14
  16. claude_mpm/dashboard/static/js/dashboard.js +108 -91
  17. claude_mpm/dashboard/static/js/socket-client.js +9 -7
  18. claude_mpm/dashboard/templates/index.html +2 -7
  19. claude_mpm/hooks/claude_hooks/hook_handler.py +1 -11
  20. claude_mpm/hooks/claude_hooks/services/connection_manager.py +54 -59
  21. claude_mpm/hooks/claude_hooks/services/connection_manager_http.py +112 -72
  22. claude_mpm/services/agents/deployment/agent_template_builder.py +0 -1
  23. claude_mpm/services/cli/unified_dashboard_manager.py +354 -0
  24. claude_mpm/services/monitor/__init__.py +20 -0
  25. claude_mpm/services/monitor/daemon.py +256 -0
  26. claude_mpm/services/monitor/event_emitter.py +279 -0
  27. claude_mpm/services/monitor/handlers/__init__.py +20 -0
  28. claude_mpm/services/monitor/handlers/code_analysis.py +334 -0
  29. claude_mpm/services/monitor/handlers/dashboard.py +298 -0
  30. claude_mpm/services/monitor/handlers/hooks.py +491 -0
  31. claude_mpm/services/monitor/management/__init__.py +18 -0
  32. claude_mpm/services/monitor/management/health.py +124 -0
  33. claude_mpm/services/monitor/management/lifecycle.py +298 -0
  34. claude_mpm/services/monitor/server.py +442 -0
  35. claude_mpm/tools/code_tree_analyzer.py +33 -17
  36. {claude_mpm-4.2.9.dist-info → claude_mpm-4.2.11.dist-info}/METADATA +1 -1
  37. {claude_mpm-4.2.9.dist-info → claude_mpm-4.2.11.dist-info}/RECORD +41 -36
  38. claude_mpm/cli/commands/socketio_monitor.py +0 -233
  39. claude_mpm/scripts/socketio_daemon.py +0 -571
  40. claude_mpm/scripts/socketio_daemon_hardened.py +0 -937
  41. claude_mpm/scripts/socketio_daemon_wrapper.py +0 -78
  42. claude_mpm/scripts/socketio_server_manager.py +0 -349
  43. claude_mpm/services/cli/dashboard_launcher.py +0 -423
  44. claude_mpm/services/cli/socketio_manager.py +0 -595
  45. claude_mpm/services/dashboard/stable_server.py +0 -1020
  46. claude_mpm/services/socketio/monitor_server.py +0 -505
  47. {claude_mpm-4.2.9.dist-info → claude_mpm-4.2.11.dist-info}/WHEEL +0 -0
  48. {claude_mpm-4.2.9.dist-info → claude_mpm-4.2.11.dist-info}/entry_points.txt +0 -0
  49. {claude_mpm-4.2.9.dist-info → claude_mpm-4.2.11.dist-info}/licenses/LICENSE +0 -0
  50. {claude_mpm-4.2.9.dist-info → claude_mpm-4.2.11.dist-info}/top_level.txt +0 -0
claude_mpm/scripts/socketio_daemon_hardened.py
@@ -1,937 +0,0 @@
- #!/usr/bin/env python3
- """
- Production-hardened Socket.IO daemon with automatic recovery and monitoring.
-
- WHY: Production environments require robust daemon management with automatic
- recovery, comprehensive monitoring, and graceful degradation under load.
-
- FEATURES:
- - Automatic retry with exponential backoff
- - Supervisor pattern for crash recovery
- - Comprehensive error handling and logging
- - Resource management and cleanup
- - Process management with PID files
- - Signal handling for graceful shutdown
- - Health monitoring and metrics
- - Configuration through environment variables
- """
-
- import json
- import os
- import signal
- import subprocess
- import sys
- import threading
- import time
- import traceback
- from datetime import datetime
- from pathlib import Path
- from typing import Optional
-
-
- # Detect and use virtual environment Python if available
- def get_python_executable():
-     """Get the appropriate Python executable, preferring virtual environment."""
-     if hasattr(sys, "real_prefix") or (
-         hasattr(sys, "base_prefix") and sys.base_prefix != sys.prefix
-     ):
-         return sys.executable
-
-     venv_path = os.environ.get("VIRTUAL_ENV")
-     if venv_path:
-         venv_python = Path(venv_path) / "bin" / "python"
-         if venv_python.exists():
-             return str(venv_python)
-
-     exe_path = Path(sys.executable).resolve()
-     for parent in exe_path.parents:
-         if parent.name in ("venv", ".venv", "env", ".env"):
-             return sys.executable
-         if parent.name == "bin" and (parent.parent / "pyvenv.cfg").exists():
-             return sys.executable
-         if parent.name == "Scripts" and (parent.parent / "pyvenv.cfg").exists():
-             return sys.executable
-
-     script_path = Path(__file__).resolve()
-     for parent in script_path.parents:
-         if parent.name == "src" or not (parent / "src").exists():
-             for venv_name in ("venv", ".venv", "env", ".env"):
-                 venv_dir = parent / venv_name
-                 if venv_dir.exists():
-                     venv_python = venv_dir / "bin" / "python"
-                     if venv_python.exists():
-                         return str(venv_python)
-             break
-
-     return sys.executable
-
-
- PYTHON_EXECUTABLE = get_python_executable()
-
-
- # Configuration from environment variables
- class Config:
-     """Centralized configuration with environment variable support."""
-
-     # Retry configuration
-     MAX_RETRIES = int(os.environ.get("SOCKETIO_MAX_RETRIES", "10"))
-     INITIAL_RETRY_DELAY = float(os.environ.get("SOCKETIO_INITIAL_RETRY_DELAY", "1.0"))
-     MAX_RETRY_DELAY = float(os.environ.get("SOCKETIO_MAX_RETRY_DELAY", "60.0"))
-     BACKOFF_FACTOR = float(os.environ.get("SOCKETIO_BACKOFF_FACTOR", "2.0"))
-
-     # Health check configuration
-     HEALTH_CHECK_INTERVAL = float(
-         os.environ.get("SOCKETIO_HEALTH_CHECK_INTERVAL", "30.0")
-     )
-     HEALTH_CHECK_TIMEOUT = float(os.environ.get("SOCKETIO_HEALTH_CHECK_TIMEOUT", "5.0"))
-     UNHEALTHY_THRESHOLD = int(os.environ.get("SOCKETIO_UNHEALTHY_THRESHOLD", "3"))
-
-     # Process management
-     STARTUP_TIMEOUT = float(os.environ.get("SOCKETIO_STARTUP_TIMEOUT", "30.0"))
-     SHUTDOWN_TIMEOUT = float(os.environ.get("SOCKETIO_SHUTDOWN_TIMEOUT", "10.0"))
-     FORCE_KILL_TIMEOUT = float(os.environ.get("SOCKETIO_FORCE_KILL_TIMEOUT", "5.0"))
-
-     # Port configuration
-     PORT_RANGE_START = int(os.environ.get("SOCKETIO_PORT_START", "8765"))
-     PORT_RANGE_END = int(os.environ.get("SOCKETIO_PORT_END", "8785"))
-
-     # Logging
-     LOG_LEVEL = os.environ.get("SOCKETIO_LOG_LEVEL", "INFO")
-     LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
-
-     # Monitoring
-     METRICS_ENABLED = (
-         os.environ.get("SOCKETIO_METRICS_ENABLED", "true").lower() == "true"
-     )
-     METRICS_FILE = os.environ.get(
-         "SOCKETIO_METRICS_FILE", ".claude-mpm/socketio-metrics.json"
-     )
-
-
- # Setup structured logging
- import contextlib
- import logging
-
- logging.basicConfig(level=getattr(logging, Config.LOG_LEVEL), format=Config.LOG_FORMAT)
- logger = logging.getLogger("socketio-daemon")
-
- try:
-     import psutil
- except ImportError:
-     logger.info(f"Installing psutil using {PYTHON_EXECUTABLE}...")
-     subprocess.check_call([PYTHON_EXECUTABLE, "-m", "pip", "install", "psutil"])
-     import psutil
-
- # Import project modules
- try:
-     from claude_mpm.core.unified_paths import get_project_root
-     from claude_mpm.services.port_manager import PortManager
-     from claude_mpm.services.socketio.server.main import SocketIOServer
- except ImportError:
-     script_path = Path(__file__).resolve()
-     if "site-packages" in str(script_path):
-         parts = script_path.parts
-         site_packages_idx = next(
-             i for i, part in enumerate(parts) if part == "site-packages"
-         )
-         site_packages_path = Path(*parts[: site_packages_idx + 1])
-         if site_packages_path.exists() and str(site_packages_path) not in sys.path:
-             sys.path.insert(0, str(site_packages_path))
-     else:
-         src_path = script_path.parent.parent.parent
-         if (
-             src_path.exists()
-             and (src_path / "claude_mpm").exists()
-             and str(src_path) not in sys.path
-         ):
-             sys.path.insert(0, str(src_path))
-
-     from claude_mpm.core.unified_paths import get_project_root
-     from claude_mpm.services.port_manager import PortManager
-     from claude_mpm.services.socketio.server.main import SocketIOServer
-
-
- class DaemonMetrics:
-     """Track and persist daemon metrics for monitoring."""
-
-     def __init__(self, metrics_file: Path):
-         self.metrics_file = metrics_file
-         self.metrics = {
-             "start_time": None,
-             "restarts": 0,
-             "total_failures": 0,
-             "last_failure": None,
-             "health_checks_passed": 0,
-             "health_checks_failed": 0,
-             "uptime_seconds": 0,
-             "last_health_check": None,
-             "status": "initializing",
-         }
-         self.lock = threading.Lock()
-         self.load()
-
-     def load(self):
-         """Load metrics from file if exists."""
-         if self.metrics_file.exists():
-             try:
-                 with open(self.metrics_file) as f:
-                     saved = json.load(f)
-                     self.metrics.update(saved)
-             except Exception as e:
-                 logger.warning(f"Could not load metrics: {e}")
-
-     def save(self):
-         """Persist metrics to file."""
-         try:
-             self.metrics_file.parent.mkdir(parents=True, exist_ok=True)
-             with self.lock, open(self.metrics_file, "w") as f:
-                 json.dump(self.metrics, f, indent=2, default=str)
-         except Exception as e:
-             logger.error(f"Could not save metrics: {e}")
-
-     def update(self, **kwargs):
-         """Update metrics atomically."""
-         with self.lock:
-             self.metrics.update(kwargs)
-             if self.metrics["start_time"]:
-                 start = datetime.fromisoformat(str(self.metrics["start_time"]))
-                 self.metrics["uptime_seconds"] = int(
-                     (datetime.now() - start).total_seconds()
-                 )
-         self.save()
-
-     def increment(self, key: str, amount: int = 1):
-         """Increment a counter metric."""
-         with self.lock:
-             self.metrics[key] = self.metrics.get(key, 0) + amount
-         self.save()
-
-
- class ExponentialBackoff:
-     """Implement exponential backoff with jitter for retry logic."""
-
-     def __init__(
-         self, initial_delay: float = 1.0, max_delay: float = 60.0, factor: float = 2.0
-     ):
-         self.initial_delay = initial_delay
-         self.max_delay = max_delay
-         self.factor = factor
-         self.current_delay = initial_delay
-         self.attempt = 0
-
-     def next_delay(self) -> float:
-         """Get the next delay with jitter."""
-         import random
-
-         self.attempt += 1
-
-         # Calculate exponential delay
-         delay = min(self.initial_delay * (self.factor**self.attempt), self.max_delay)
-
-         # Add jitter (±25% randomization)
-         jitter = delay * 0.25 * (2 * random.random() - 1)
-         actual_delay = max(0.1, delay + jitter)
-
-         logger.debug(f"Backoff attempt {self.attempt}: {actual_delay:.2f}s")
-         return actual_delay
-
-     def reset(self):
-         """Reset the backoff counter."""
-         self.attempt = 0
-         self.current_delay = self.initial_delay
-
-
- class HealthMonitor:
-     """Monitor daemon health and trigger recovery if needed."""
-
-     def __init__(self, port: int, metrics: DaemonMetrics):
-         self.port = port
-         self.metrics = metrics
-         self.consecutive_failures = 0
-         self.running = False
-         self.thread = None
-
-     def start(self):
-         """Start health monitoring in background thread."""
-         if self.running:
-             return
-
-         self.running = True
-         self.thread = threading.Thread(target=self._monitor_loop, daemon=True)
-         self.thread.start()
-         logger.info("Health monitor started")
-
-     def stop(self):
-         """Stop health monitoring."""
-         self.running = False
-         if self.thread:
-             self.thread.join(timeout=5)
-         logger.info("Health monitor stopped")
-
-     def _monitor_loop(self):
-         """Main health check loop."""
-         while self.running:
-             try:
-                 time.sleep(Config.HEALTH_CHECK_INTERVAL)
-
-                 if self._check_health():
-                     self.consecutive_failures = 0
-                     self.metrics.increment("health_checks_passed")
-                     self.metrics.update(
-                         last_health_check=datetime.now(), status="healthy"
-                     )
-                 else:
-                     self.consecutive_failures += 1
-                     self.metrics.increment("health_checks_failed")
-                     self.metrics.update(
-                         last_health_check=datetime.now(), status="unhealthy"
-                     )
-
-                     if self.consecutive_failures >= Config.UNHEALTHY_THRESHOLD:
-                         logger.error(
-                             f"Health check failed {self.consecutive_failures} times - daemon unhealthy"
-                         )
-                         # Supervisor will handle restart
-
-             except Exception as e:
-                 logger.error(f"Health monitor error: {e}")
-
-     def _check_health(self) -> bool:
-         """Perform health check on the daemon."""
-         try:
-             import socket
-
-             # Try to connect to the socket
-             sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-             sock.settimeout(Config.HEALTH_CHECK_TIMEOUT)
-             result = sock.connect_ex(("localhost", self.port))
-             sock.close()
-
-             if result != 0:
-                 logger.warning(
-                     f"Health check failed: cannot connect to port {self.port}"
-                 )
-                 return False
-
-             # Try to make an HTTP health request if possible
-             try:
-                 import urllib.request
-
-                 url = f"http://localhost:{self.port}/health"
-                 with urllib.request.urlopen(
-                     url, timeout=Config.HEALTH_CHECK_TIMEOUT
-                 ) as response:
-                     if response.status == 200:
-                         return True
-             except:
-                 # Fall back to simple port check
-                 pass
-
-             return True
-
-         except Exception as e:
-             logger.error(f"Health check error: {e}")
-             return False
-
-
- class DaemonSupervisor:
-     """Supervise the daemon process and handle automatic recovery."""
-
-     def __init__(self):
-         self.deployment_root = get_project_root()
-         self.pid_file = self.deployment_root / ".claude-mpm" / "socketio-server.pid"
-         self.log_file = self.deployment_root / ".claude-mpm" / "socketio-server.log"
-         self.lock_file = self.deployment_root / ".claude-mpm" / "socketio-server.lock"
-         self.supervisor_pid_file = (
-             self.deployment_root / ".claude-mpm" / "socketio-supervisor.pid"
-         )
-
-         # Metrics tracking
-         metrics_file = self.deployment_root / ".claude-mpm" / Config.METRICS_FILE
-         self.metrics = DaemonMetrics(metrics_file)
-
-         # Recovery state
-         self.backoff = ExponentialBackoff(
-             Config.INITIAL_RETRY_DELAY, Config.MAX_RETRY_DELAY, Config.BACKOFF_FACTOR
-         )
-
-         self.port_manager = PortManager()
-         self.server_process = None
-         self.selected_port = None
-         self.health_monitor = None
-         self.shutdown_requested = False
-
-     def ensure_dirs(self):
-         """Ensure required directories exist."""
-         self.pid_file.parent.mkdir(parents=True, exist_ok=True)
-
-     def acquire_lock(self) -> bool:
-         """Acquire exclusive lock to prevent multiple instances."""
-         try:
-             self.ensure_dirs()
-
-             # Check for existing lock
-             if self.lock_file.exists():
-                 try:
-                     with open(self.lock_file) as f:
-                         old_pid = int(f.read().strip())
-
-                     # Check if old process is still running
-                     if psutil.pid_exists(old_pid):
-                         process = psutil.Process(old_pid)
-                         if process.is_running():
-                             logger.warning(
-                                 f"Another supervisor is running (PID: {old_pid})"
-                             )
-                             return False
-                 except:
-                     pass
-
-                 # Clean up stale lock
-                 self.lock_file.unlink(missing_ok=True)
-
-             # Create new lock
-             with open(self.lock_file, "w") as f:
-                 f.write(str(os.getpid()))
-
-             return True
-
-         except Exception as e:
-             logger.error(f"Could not acquire lock: {e}")
-             return False
-
-     def release_lock(self):
-         """Release the exclusive lock."""
-         self.lock_file.unlink(missing_ok=True)
-
-     def find_available_port(self) -> Optional[int]:
-         """Find an available port for the server."""
-         self.port_manager.cleanup_dead_instances()
-         port = self.port_manager.find_available_port()
-
-         if not port:
-             # Try extended range if configured
-             for p in range(Config.PORT_RANGE_START, Config.PORT_RANGE_END + 1):
-                 import socket
-
-                 try:
-                     sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-                     result = sock.connect_ex(("localhost", p))
-                     sock.close()
-                     if result != 0:
-                         return p
-                 except:
-                     pass
-
-         return port
-
-     def start_server_process(self) -> bool:
-         """Start the actual Socket.IO server process."""
-         try:
-             # Find available port
-             self.selected_port = self.find_available_port()
-             if not self.selected_port:
-                 logger.error("No available ports")
-                 return False
-
-             logger.info(f"Starting server on port {self.selected_port}")
-
-             # Fork to create daemon process
-             pid = os.fork()
-             if pid > 0:
-                 # Parent process - supervisor
-                 self.server_process = pid
-
-                 # Save PID files
-                 with open(self.pid_file, "w") as f:
-                     f.write(str(pid))
-
-                 with open(self.supervisor_pid_file, "w") as f:
-                     f.write(str(os.getpid()))
-
-                 # Save port info
-                 port_file = self.pid_file.parent / "socketio-port"
-                 with open(port_file, "w") as f:
-                     f.write(str(self.selected_port))
-
-                 # Register with port manager
-                 self.port_manager.register_instance(self.selected_port, pid)
-
-                 # Wait for server to start
-                 if self._wait_for_server_start():
-                     logger.info(f"Server started successfully (PID: {pid})")
-                     self.metrics.update(start_time=datetime.now(), status="running")
-                     self.backoff.reset()
-                     return True
-                 logger.error("Server failed to start within timeout")
-                 self._cleanup_failed_server(pid)
-                 return False
-
-             # Child process - actual server
-             self._run_server_process()
-
-         except Exception as e:
-             logger.error(f"Failed to start server: {e}")
-             logger.debug(traceback.format_exc())
-             return False
-
-     def _run_server_process(self):
-         """Run the Socket.IO server in the child process."""
-         try:
-             # Become a proper daemon
-             os.setsid()
-             os.umask(0)
-
-             # Redirect output to log file
-             with open(self.log_file, "a") as log:
-                 os.dup2(log.fileno(), sys.stdout.fileno())
-                 os.dup2(log.fileno(), sys.stderr.fileno())
-
-             # Log startup info
-             print(
-                 f"[{datetime.now()}] Starting Socket.IO server on port {self.selected_port}"
-             )
-             print(f"[{datetime.now()}] Python: {sys.executable}")
-             print(f"[{datetime.now()}] Version: {sys.version}")
-
-             # Create and start server with error handling
-             server = None
-             try:
-                 server = SocketIOServer(host="localhost", port=self.selected_port)
-
-                 # Setup signal handlers
-                 def signal_handler(signum, frame):
-                     print(
-                         f"[{datetime.now()}] Received signal {signum}, shutting down..."
-                     )
-                     if server:
-                         with contextlib.suppress(Exception):
-                             server.stop_sync()
-                     sys.exit(0)
-
-                 signal.signal(signal.SIGTERM, signal_handler)
-                 signal.signal(signal.SIGINT, signal_handler)
-
-                 # Start server
-                 server.start_sync()
-
-                 # Keep running
-                 while True:
-                     time.sleep(1)
-
-             except KeyboardInterrupt:
-                 if server:
-                     server.stop_sync()
-                 sys.exit(0)
-             except Exception as e:
-                 print(f"[{datetime.now()}] Server error: {e}")
-                 print(traceback.format_exc())
-                 sys.exit(1)
-
-         except Exception as e:
-             print(f"[{datetime.now()}] Fatal error: {e}")
-             sys.exit(1)
-
-     def _wait_for_server_start(self) -> bool:
-         """Wait for the server to become responsive."""
-         import socket
-
-         start_time = time.time()
-         while time.time() - start_time < Config.STARTUP_TIMEOUT:
-             # Check if process is still alive
-             if not self._is_process_alive(self.server_process):
-                 return False
-
-             # Try to connect
-             try:
-                 sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-                 result = sock.connect_ex(("localhost", self.selected_port))
-                 sock.close()
-
-                 if result == 0:
-                     return True
-             except:
-                 pass
-
-             time.sleep(0.5)
-
-         return False
-
-     def _is_process_alive(self, pid: int) -> bool:
-         """Check if a process is alive."""
-         try:
-             process = psutil.Process(pid)
-             return process.is_running()
-         except (psutil.NoSuchProcess, psutil.AccessDenied):
-             return False
-
-     def _cleanup_failed_server(self, pid: int):
-         """Clean up after a failed server start."""
-         try:
-             if self._is_process_alive(pid):
-                 os.kill(pid, signal.SIGKILL)
-         except:
-             pass
-
-         self.pid_file.unlink(missing_ok=True)
-
-         if self.selected_port:
-             instances = self.port_manager.load_instances()
-             for instance_id, info in instances.items():
-                 if info.get("pid") == pid:
-                     self.port_manager.remove_instance(instance_id)
-                     break
-
-     def monitor_and_restart(self):
-         """Monitor the server and restart if it crashes."""
-         retry_count = 0
-
-         while retry_count < Config.MAX_RETRIES and not self.shutdown_requested:
-             try:
-                 # Start the server
-                 if self.start_server_process():
-                     # Start health monitoring
-                     if Config.METRICS_ENABLED and self.selected_port:
-                         self.health_monitor = HealthMonitor(
-                             self.selected_port, self.metrics
-                         )
-                         self.health_monitor.start()
-
-                     # Monitor the process
-                     while not self.shutdown_requested:
-                         time.sleep(5)
-
-                         # Check if process is still alive
-                         if not self._is_process_alive(self.server_process):
-                             logger.error("Server process died unexpectedly")
-                             self.metrics.increment("total_failures")
-                             self.metrics.update(
-                                 last_failure=datetime.now(), status="crashed"
-                             )
-                             break
-
-                         # Check health status
-                         if (
-                             self.health_monitor
-                             and self.health_monitor.consecutive_failures
-                             >= Config.UNHEALTHY_THRESHOLD
-                         ):
-                             logger.error("Server is unhealthy, restarting...")
-                             self._stop_server_process()
-                             break
-
-                     if self.shutdown_requested:
-                         break
-
-                     # Stop health monitor before restart
-                     if self.health_monitor:
-                         self.health_monitor.stop()
-                         self.health_monitor = None
-
-                     # Server crashed, apply backoff before restart
-                     retry_count += 1
-                     delay = self.backoff.next_delay()
-                     logger.info(
-                         f"Restarting in {delay:.1f}s (attempt {retry_count}/{Config.MAX_RETRIES})"
-                     )
-                     time.sleep(delay)
-                     self.metrics.increment("restarts")
-
-                 else:
-                     # Failed to start
-                     retry_count += 1
-                     delay = self.backoff.next_delay()
-                     logger.error(
-                         f"Failed to start, retrying in {delay:.1f}s (attempt {retry_count}/{Config.MAX_RETRIES})"
-                     )
-                     time.sleep(delay)
-
-             except KeyboardInterrupt:
-                 logger.info("Supervisor interrupted")
-                 break
-             except Exception as e:
-                 logger.error(f"Supervisor error: {e}")
-                 logger.debug(traceback.format_exc())
-                 retry_count += 1
-                 time.sleep(self.backoff.next_delay())
-
-         if retry_count >= Config.MAX_RETRIES:
-             logger.error(f"Max retries ({Config.MAX_RETRIES}) exceeded, giving up")
-             self.metrics.update(status="failed")
-
-         self.cleanup()
-
-     def _stop_server_process(self):
-         """Stop the server process gracefully."""
-         if not self.server_process:
-             return
-
-         try:
-             # Try graceful shutdown
-             os.kill(self.server_process, signal.SIGTERM)
-
-             # Wait for shutdown
-             start_time = time.time()
-             while time.time() - start_time < Config.SHUTDOWN_TIMEOUT:
-                 if not self._is_process_alive(self.server_process):
-                     logger.info("Server stopped gracefully")
-                     return
-                 time.sleep(0.5)
-
-             # Force kill if still running
-             logger.warning("Server didn't stop gracefully, forcing...")
-             os.kill(self.server_process, signal.SIGKILL)
-             time.sleep(Config.FORCE_KILL_TIMEOUT)
-
-         except Exception as e:
-             logger.error(f"Error stopping server: {e}")
-
-     def cleanup(self):
-         """Clean up resources on shutdown."""
-         logger.info("Cleaning up supervisor resources")
-
-         # Stop health monitor
-         if self.health_monitor:
-             self.health_monitor.stop()
-
-         # Stop server process
-         if self.server_process:
-             self._stop_server_process()
-
-         # Clean up port registration
-         if self.selected_port:
-             instances = self.port_manager.load_instances()
-             for instance_id, info in instances.items():
-                 if info.get("pid") == self.server_process:
-                     self.port_manager.remove_instance(instance_id)
-                     break
-
-         # Remove PID files
-         self.pid_file.unlink(missing_ok=True)
-         self.supervisor_pid_file.unlink(missing_ok=True)
-
-         # Update metrics
-         self.metrics.update(status="stopped")
-
-         # Release lock
-         self.release_lock()
-
-     def handle_shutdown(self, signum, frame):
-         """Handle shutdown signals."""
-         logger.info(f"Received signal {signum}, initiating shutdown...")
-         self.shutdown_requested = True
-
-
- def start_daemon():
-     """Start the hardened daemon with supervisor."""
-     supervisor = DaemonSupervisor()
-
-     # Check if already running
-     if supervisor.pid_file.exists():
-         try:
-             with open(supervisor.pid_file) as f:
-                 old_pid = int(f.read().strip())
-
-             if supervisor._is_process_alive(old_pid):
-                 print(f"Socket.IO daemon is already running (PID: {old_pid})")
-                 return
-         except:
-             pass
-
-         # Clean up stale PID file
-         supervisor.pid_file.unlink(missing_ok=True)
-
-     # Acquire lock
-     if not supervisor.acquire_lock():
-         print("Could not acquire lock - another instance may be running")
-         return
-
-     print("Starting hardened Socket.IO daemon with supervisor...")
-     print(f"Python: {PYTHON_EXECUTABLE}")
-     print(f"Max retries: {Config.MAX_RETRIES}")
-     print(f"Health checks: {'enabled' if Config.METRICS_ENABLED else 'disabled'}")
-
-     # Setup signal handlers
-     signal.signal(signal.SIGTERM, supervisor.handle_shutdown)
-     signal.signal(signal.SIGINT, supervisor.handle_shutdown)
-
-     try:
-         # Start monitoring and auto-restart loop
-         supervisor.monitor_and_restart()
-     finally:
-         supervisor.cleanup()
-
-     print("Socket.IO daemon stopped")
-
-
- def stop_daemon():
-     """Stop the hardened daemon."""
-     deployment_root = get_project_root()
-     pid_file = deployment_root / ".claude-mpm" / "socketio-server.pid"
-     supervisor_pid_file = deployment_root / ".claude-mpm" / "socketio-supervisor.pid"
-
-     # Try to stop supervisor first
-     if supervisor_pid_file.exists():
-         try:
-             with open(supervisor_pid_file) as f:
-                 supervisor_pid = int(f.read().strip())
-
-             print(f"Stopping supervisor (PID: {supervisor_pid})...")
-             os.kill(supervisor_pid, signal.SIGTERM)
-
-             # Wait for supervisor to stop
-             for _ in range(20):
-                 if not psutil.pid_exists(supervisor_pid):
-                     print("Supervisor stopped successfully")
-                     supervisor_pid_file.unlink(missing_ok=True)
-                     return
-                 time.sleep(0.5)
-
-             # Force kill if needed
-             print("Supervisor didn't stop gracefully, forcing...")
-             os.kill(supervisor_pid, signal.SIGKILL)
-             supervisor_pid_file.unlink(missing_ok=True)
-
-         except Exception as e:
-             print(f"Error stopping supervisor: {e}")
-
-     # Also try to stop server directly if supervisor failed
-     if pid_file.exists():
-         try:
-             with open(pid_file) as f:
-                 server_pid = int(f.read().strip())
-
-             if psutil.pid_exists(server_pid):
-                 print(f"Stopping server (PID: {server_pid})...")
-                 os.kill(server_pid, signal.SIGTERM)
-                 time.sleep(2)
-
-                 if psutil.pid_exists(server_pid):
-                     os.kill(server_pid, signal.SIGKILL)
-
-             pid_file.unlink(missing_ok=True)
-
-         except Exception as e:
-             print(f"Error stopping server: {e}")
-
-
- def status_daemon():
-     """Show detailed daemon status."""
-     deployment_root = get_project_root()
-     pid_file = deployment_root / ".claude-mpm" / "socketio-server.pid"
-     supervisor_pid_file = deployment_root / ".claude-mpm" / "socketio-supervisor.pid"
-     metrics_file = deployment_root / ".claude-mpm" / Config.METRICS_FILE
-
-     print("Socket.IO Daemon Status")
-     print("=" * 50)
-
-     # Check supervisor
-     if supervisor_pid_file.exists():
-         try:
-             with open(supervisor_pid_file) as f:
-                 supervisor_pid = int(f.read().strip())
-
-             if psutil.pid_exists(supervisor_pid):
-                 process = psutil.Process(supervisor_pid)
-                 print(f"✅ Supervisor: RUNNING (PID: {supervisor_pid})")
-                 print(f"   Memory: {process.memory_info().rss / 1024 / 1024:.1f} MB")
-                 print(f"   CPU: {process.cpu_percent()}%")
-             else:
-                 print(f"❌ Supervisor: NOT RUNNING (stale PID: {supervisor_pid})")
-         except:
-             print("❌ Supervisor: ERROR reading status")
-     else:
-         print("❌ Supervisor: NOT RUNNING")
-
-     # Check server
-     if pid_file.exists():
-         try:
-             with open(pid_file) as f:
-                 server_pid = int(f.read().strip())
-
-             if psutil.pid_exists(server_pid):
-                 process = psutil.Process(server_pid)
-                 print(f"✅ Server: RUNNING (PID: {server_pid})")
-                 print(f"   Memory: {process.memory_info().rss / 1024 / 1024:.1f} MB")
-                 print(f"   CPU: {process.cpu_percent()}%")
-
-                 # Check port
-                 port_file = deployment_root / ".claude-mpm" / "socketio-port"
-                 if port_file.exists():
-                     with open(port_file) as f:
-                         port = int(f.read().strip())
-                     print(f"   Port: {port}")
-
-                     # Test connection
-                     import socket
-
-                     sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-                     result = sock.connect_ex(("localhost", port))
-                     sock.close()
-
-                     if result == 0:
-                         print(f"   ✅ Listening on port {port}")
-                     else:
-                         print(f"   ❌ Not responding on port {port}")
-             else:
-                 print(f"❌ Server: NOT RUNNING (stale PID: {server_pid})")
-         except:
-             print("❌ Server: ERROR reading status")
-     else:
-         print("❌ Server: NOT RUNNING")
-
-     # Show metrics
-     if metrics_file.exists():
-         try:
-             with open(metrics_file) as f:
-                 metrics = json.load(f)
-
-             print("\n📊 Metrics:")
-             print(f"   Status: {metrics.get('status', 'unknown')}")
-             print(f"   Uptime: {metrics.get('uptime_seconds', 0)} seconds")
-             print(f"   Restarts: {metrics.get('restarts', 0)}")
-             print(f"   Failures: {metrics.get('total_failures', 0)}")
-             print(f"   Health Checks Passed: {metrics.get('health_checks_passed', 0)}")
-             print(f"   Health Checks Failed: {metrics.get('health_checks_failed', 0)}")
-
-             if metrics.get("last_failure"):
-                 print(f"   Last Failure: {metrics['last_failure']}")
-             if metrics.get("last_health_check"):
-                 print(f"   Last Health Check: {metrics['last_health_check']}")
-
-         except Exception as e:
-             print(f"\n❌ Could not read metrics: {e}")
-
-     print("\n🔧 Configuration:")
-     print(f"   Max Retries: {Config.MAX_RETRIES}")
-     print(f"   Health Check Interval: {Config.HEALTH_CHECK_INTERVAL}s")
-     print(f"   Port Range: {Config.PORT_RANGE_START}-{Config.PORT_RANGE_END}")
-     print(f"   Log Level: {Config.LOG_LEVEL}")
-
-
- def main():
-     """Main entry point."""
-     if len(sys.argv) < 2:
-         print("Usage: socketio-daemon-hardened.py {start|stop|restart|status}")
-         sys.exit(1)
-
-     command = sys.argv[1]
-
-     if command == "start":
-         start_daemon()
-     elif command == "stop":
-         stop_daemon()
-     elif command == "restart":
-         stop_daemon()
-         time.sleep(2)
-         start_daemon()
-     elif command == "status":
-         status_daemon()
-     else:
-         print(f"Unknown command: {command}")
-         print("Usage: socketio-daemon-hardened.py {start|stop|restart|status}")
-         sys.exit(1)
-
-
- if __name__ == "__main__":
-     main()