claude-mpm 4.0.20__py3-none-any.whl → 4.0.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. claude_mpm/BUILD_NUMBER +1 -1
  2. claude_mpm/VERSION +1 -1
  3. claude_mpm/agents/INSTRUCTIONS.md +74 -0
  4. claude_mpm/agents/WORKFLOW.md +308 -4
  5. claude_mpm/agents/agents_metadata.py +52 -0
  6. claude_mpm/agents/base_agent_loader.py +75 -19
  7. claude_mpm/agents/templates/__init__.py +4 -0
  8. claude_mpm/agents/templates/api_qa.json +206 -0
  9. claude_mpm/agents/templates/research.json +24 -16
  10. claude_mpm/agents/templates/ticketing.json +18 -5
  11. claude_mpm/agents/templates/vercel_ops_agent.json +281 -0
  12. claude_mpm/agents/templates/vercel_ops_instructions.md +582 -0
  13. claude_mpm/cli/commands/mcp_command_router.py +87 -1
  14. claude_mpm/cli/commands/mcp_install_commands.py +207 -26
  15. claude_mpm/cli/parsers/mcp_parser.py +23 -0
  16. claude_mpm/constants.py +1 -0
  17. claude_mpm/core/base_service.py +7 -1
  18. claude_mpm/core/config.py +64 -39
  19. claude_mpm/core/framework_loader.py +68 -28
  20. claude_mpm/core/interactive_session.py +28 -17
  21. claude_mpm/scripts/socketio_daemon.py +67 -7
  22. claude_mpm/scripts/socketio_daemon_hardened.py +897 -0
  23. claude_mpm/services/agents/deployment/agent_deployment.py +65 -3
  24. claude_mpm/services/agents/deployment/async_agent_deployment.py +65 -1
  25. claude_mpm/services/agents/memory/agent_memory_manager.py +42 -203
  26. claude_mpm/services/memory_hook_service.py +62 -4
  27. claude_mpm/services/runner_configuration_service.py +5 -9
  28. claude_mpm/services/socketio/server/broadcaster.py +32 -1
  29. claude_mpm/services/socketio/server/core.py +4 -0
  30. claude_mpm/services/socketio/server/main.py +23 -4
  31. {claude_mpm-4.0.20.dist-info → claude_mpm-4.0.22.dist-info}/METADATA +1 -1
  32. {claude_mpm-4.0.20.dist-info → claude_mpm-4.0.22.dist-info}/RECORD +36 -32
  33. {claude_mpm-4.0.20.dist-info → claude_mpm-4.0.22.dist-info}/WHEEL +0 -0
  34. {claude_mpm-4.0.20.dist-info → claude_mpm-4.0.22.dist-info}/entry_points.txt +0 -0
  35. {claude_mpm-4.0.20.dist-info → claude_mpm-4.0.22.dist-info}/licenses/LICENSE +0 -0
  36. {claude_mpm-4.0.20.dist-info → claude_mpm-4.0.22.dist-info}/top_level.txt +0 -0
claude_mpm/scripts/socketio_daemon_hardened.py
@@ -0,0 +1,897 @@
+ #!/usr/bin/env python3
+ """
+ Production-hardened Socket.IO daemon with automatic recovery and monitoring.
+
+ WHY: Production environments require robust daemon management with automatic
+ recovery, comprehensive monitoring, and graceful degradation under load.
+
+ FEATURES:
+ - Automatic retry with exponential backoff
+ - Supervisor pattern for crash recovery
+ - Comprehensive error handling and logging
+ - Resource management and cleanup
+ - Process management with PID files
+ - Signal handling for graceful shutdown
+ - Health monitoring and metrics
+ - Configuration through environment variables
+ """
+
+ import json
+ import os
+ import signal
+ import subprocess
+ import sys
+ import time
+ import traceback
+ from pathlib import Path
+ from datetime import datetime
+ from typing import Optional, Dict, Any
+ import threading
+ import queue
+
+ # Detect and use virtual environment Python if available
+ def get_python_executable():
+     """Get the appropriate Python executable, preferring virtual environment."""
+     if hasattr(sys, 'real_prefix') or (hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix):
+         return sys.executable
+
+     venv_path = os.environ.get('VIRTUAL_ENV')
+     if venv_path:
+         venv_python = Path(venv_path) / 'bin' / 'python'
+         if venv_python.exists():
+             return str(venv_python)
+
+     exe_path = Path(sys.executable).resolve()
+     for parent in exe_path.parents:
+         if parent.name in ('venv', '.venv', 'env', '.env'):
+             return sys.executable
+         if parent.name == 'bin' and (parent.parent / 'pyvenv.cfg').exists():
+             return sys.executable
+         if parent.name == 'Scripts' and (parent.parent / 'pyvenv.cfg').exists():
+             return sys.executable
+
+     script_path = Path(__file__).resolve()
+     for parent in script_path.parents:
+         if parent.name == 'src' or not (parent / 'src').exists():
+             for venv_name in ('venv', '.venv', 'env', '.env'):
+                 venv_dir = parent / venv_name
+                 if venv_dir.exists():
+                     venv_python = venv_dir / 'bin' / 'python'
+                     if venv_python.exists():
+                         return str(venv_python)
+             break
+
+     return sys.executable
+
+ PYTHON_EXECUTABLE = get_python_executable()
+
+ # Configuration from environment variables
+ class Config:
+     """Centralized configuration with environment variable support."""
+
+     # Retry configuration
+     MAX_RETRIES = int(os.environ.get('SOCKETIO_MAX_RETRIES', '10'))
+     INITIAL_RETRY_DELAY = float(os.environ.get('SOCKETIO_INITIAL_RETRY_DELAY', '1.0'))
+     MAX_RETRY_DELAY = float(os.environ.get('SOCKETIO_MAX_RETRY_DELAY', '60.0'))
+     BACKOFF_FACTOR = float(os.environ.get('SOCKETIO_BACKOFF_FACTOR', '2.0'))
+
+     # Health check configuration
+     HEALTH_CHECK_INTERVAL = float(os.environ.get('SOCKETIO_HEALTH_CHECK_INTERVAL', '30.0'))
+     HEALTH_CHECK_TIMEOUT = float(os.environ.get('SOCKETIO_HEALTH_CHECK_TIMEOUT', '5.0'))
+     UNHEALTHY_THRESHOLD = int(os.environ.get('SOCKETIO_UNHEALTHY_THRESHOLD', '3'))
+
+     # Process management
+     STARTUP_TIMEOUT = float(os.environ.get('SOCKETIO_STARTUP_TIMEOUT', '30.0'))
+     SHUTDOWN_TIMEOUT = float(os.environ.get('SOCKETIO_SHUTDOWN_TIMEOUT', '10.0'))
+     FORCE_KILL_TIMEOUT = float(os.environ.get('SOCKETIO_FORCE_KILL_TIMEOUT', '5.0'))
+
+     # Port configuration
+     PORT_RANGE_START = int(os.environ.get('SOCKETIO_PORT_START', '8765'))
+     PORT_RANGE_END = int(os.environ.get('SOCKETIO_PORT_END', '8785'))
+
+     # Logging
+     LOG_LEVEL = os.environ.get('SOCKETIO_LOG_LEVEL', 'INFO')
+     LOG_FORMAT = '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
+
+     # Monitoring
+     METRICS_ENABLED = os.environ.get('SOCKETIO_METRICS_ENABLED', 'true').lower() == 'true'
+     METRICS_FILE = os.environ.get('SOCKETIO_METRICS_FILE', '.claude-mpm/socketio-metrics.json')
+
+ # Setup structured logging
+ import logging
+ logging.basicConfig(
+     level=getattr(logging, Config.LOG_LEVEL),
+     format=Config.LOG_FORMAT
+ )
+ logger = logging.getLogger('socketio-daemon')
+
+ try:
+     import psutil
+ except ImportError:
+     logger.info(f"Installing psutil using {PYTHON_EXECUTABLE}...")
+     subprocess.check_call([PYTHON_EXECUTABLE, "-m", "pip", "install", "psutil"])
+     import psutil
+
+ # Import project modules
+ try:
+     from claude_mpm.services.port_manager import PortManager
+     from claude_mpm.services.socketio.server.main import SocketIOServer
+     from claude_mpm.core.unified_paths import get_project_root
+ except ImportError:
+     script_path = Path(__file__).resolve()
+     if "site-packages" in str(script_path):
+         parts = script_path.parts
+         site_packages_idx = next(i for i, part in enumerate(parts) if part == "site-packages")
+         site_packages_path = Path(*parts[: site_packages_idx + 1])
+         if site_packages_path.exists() and str(site_packages_path) not in sys.path:
+             sys.path.insert(0, str(site_packages_path))
+     else:
+         src_path = script_path.parent.parent.parent
+         if src_path.exists() and (src_path / "claude_mpm").exists() and str(src_path) not in sys.path:
+             sys.path.insert(0, str(src_path))
+
+     from claude_mpm.services.port_manager import PortManager
+     from claude_mpm.services.socketio.server.main import SocketIOServer
+     from claude_mpm.core.unified_paths import get_project_root
+
+
+ class DaemonMetrics:
+     """Track and persist daemon metrics for monitoring."""
+
+     def __init__(self, metrics_file: Path):
+         self.metrics_file = metrics_file
+         self.metrics = {
+             'start_time': None,
+             'restarts': 0,
+             'total_failures': 0,
+             'last_failure': None,
+             'health_checks_passed': 0,
+             'health_checks_failed': 0,
+             'uptime_seconds': 0,
+             'last_health_check': None,
+             'status': 'initializing'
+         }
+         self.lock = threading.Lock()
+         self.load()
+
+     def load(self):
+         """Load metrics from file if exists."""
+         if self.metrics_file.exists():
+             try:
+                 with open(self.metrics_file, 'r') as f:
+                     saved = json.load(f)
+                     self.metrics.update(saved)
+             except Exception as e:
+                 logger.warning(f"Could not load metrics: {e}")
+
+     def save(self):
+         """Persist metrics to file."""
+         try:
+             self.metrics_file.parent.mkdir(parents=True, exist_ok=True)
+             with self.lock:
+                 with open(self.metrics_file, 'w') as f:
+                     json.dump(self.metrics, f, indent=2, default=str)
+         except Exception as e:
+             logger.error(f"Could not save metrics: {e}")
+
+     def update(self, **kwargs):
+         """Update metrics atomically."""
+         with self.lock:
+             self.metrics.update(kwargs)
+             if self.metrics['start_time']:
+                 start = datetime.fromisoformat(str(self.metrics['start_time']))
+                 self.metrics['uptime_seconds'] = int((datetime.now() - start).total_seconds())
+         self.save()
+
+     def increment(self, key: str, amount: int = 1):
+         """Increment a counter metric."""
+         with self.lock:
+             self.metrics[key] = self.metrics.get(key, 0) + amount
+         self.save()
+
+
+ class ExponentialBackoff:
+     """Implement exponential backoff with jitter for retry logic."""
+
+     def __init__(self, initial_delay: float = 1.0, max_delay: float = 60.0, factor: float = 2.0):
+         self.initial_delay = initial_delay
+         self.max_delay = max_delay
+         self.factor = factor
+         self.current_delay = initial_delay
+         self.attempt = 0
+
+     def next_delay(self) -> float:
+         """Get the next delay with jitter."""
+         import random
+         self.attempt += 1
+
+         # Calculate exponential delay
+         delay = min(self.initial_delay * (self.factor ** self.attempt), self.max_delay)
+
+         # Add jitter (±25% randomization)
+         jitter = delay * 0.25 * (2 * random.random() - 1)
+         actual_delay = max(0.1, delay + jitter)
+
+         logger.debug(f"Backoff attempt {self.attempt}: {actual_delay:.2f}s")
+         return actual_delay
+
+     def reset(self):
+         """Reset the backoff counter."""
+         self.attempt = 0
+         self.current_delay = self.initial_delay
+
+
+ class HealthMonitor:
+     """Monitor daemon health and trigger recovery if needed."""
+
+     def __init__(self, port: int, metrics: DaemonMetrics):
+         self.port = port
+         self.metrics = metrics
+         self.consecutive_failures = 0
+         self.running = False
+         self.thread = None
+
+     def start(self):
+         """Start health monitoring in background thread."""
+         if self.running:
+             return
+
+         self.running = True
+         self.thread = threading.Thread(target=self._monitor_loop, daemon=True)
+         self.thread.start()
+         logger.info("Health monitor started")
+
+     def stop(self):
+         """Stop health monitoring."""
+         self.running = False
+         if self.thread:
+             self.thread.join(timeout=5)
+         logger.info("Health monitor stopped")
+
+     def _monitor_loop(self):
+         """Main health check loop."""
+         while self.running:
+             try:
+                 time.sleep(Config.HEALTH_CHECK_INTERVAL)
+
+                 if self._check_health():
+                     self.consecutive_failures = 0
+                     self.metrics.increment('health_checks_passed')
+                     self.metrics.update(last_health_check=datetime.now(), status='healthy')
+                 else:
+                     self.consecutive_failures += 1
+                     self.metrics.increment('health_checks_failed')
+                     self.metrics.update(last_health_check=datetime.now(), status='unhealthy')
+
+                     if self.consecutive_failures >= Config.UNHEALTHY_THRESHOLD:
+                         logger.error(f"Health check failed {self.consecutive_failures} times - daemon unhealthy")
+                         # Supervisor will handle restart
+
+             except Exception as e:
+                 logger.error(f"Health monitor error: {e}")
+
+     def _check_health(self) -> bool:
+         """Perform health check on the daemon."""
+         try:
+             import socket
+             import json
+
+             # Try to connect to the socket
+             sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+             sock.settimeout(Config.HEALTH_CHECK_TIMEOUT)
+             result = sock.connect_ex(('localhost', self.port))
+             sock.close()
+
+             if result != 0:
+                 logger.warning(f"Health check failed: cannot connect to port {self.port}")
+                 return False
+
+             # Try to make an HTTP health request if possible
+             try:
+                 import urllib.request
+                 url = f'http://localhost:{self.port}/health'
+                 with urllib.request.urlopen(url, timeout=Config.HEALTH_CHECK_TIMEOUT) as response:
+                     if response.status == 200:
+                         return True
+             except:
+                 # Fall back to simple port check
+                 pass
+
+             return True
+
+         except Exception as e:
+             logger.error(f"Health check error: {e}")
+             return False
+
+
+ class DaemonSupervisor:
+     """Supervise the daemon process and handle automatic recovery."""
+
+     def __init__(self):
+         self.deployment_root = get_project_root()
+         self.pid_file = self.deployment_root / ".claude-mpm" / "socketio-server.pid"
+         self.log_file = self.deployment_root / ".claude-mpm" / "socketio-server.log"
+         self.lock_file = self.deployment_root / ".claude-mpm" / "socketio-server.lock"
+         self.supervisor_pid_file = self.deployment_root / ".claude-mpm" / "socketio-supervisor.pid"
+
+         # Metrics tracking
+         metrics_file = self.deployment_root / ".claude-mpm" / Config.METRICS_FILE
+         self.metrics = DaemonMetrics(metrics_file)
+
+         # Recovery state
+         self.backoff = ExponentialBackoff(
+             Config.INITIAL_RETRY_DELAY,
+             Config.MAX_RETRY_DELAY,
+             Config.BACKOFF_FACTOR
+         )
+
+         self.port_manager = PortManager()
+         self.server_process = None
+         self.selected_port = None
+         self.health_monitor = None
+         self.shutdown_requested = False
+
+     def ensure_dirs(self):
+         """Ensure required directories exist."""
+         self.pid_file.parent.mkdir(parents=True, exist_ok=True)
+
+     def acquire_lock(self) -> bool:
+         """Acquire exclusive lock to prevent multiple instances."""
+         try:
+             self.ensure_dirs()
+
+             # Check for existing lock
+             if self.lock_file.exists():
+                 try:
+                     with open(self.lock_file, 'r') as f:
+                         old_pid = int(f.read().strip())
+
+                     # Check if old process is still running
+                     if psutil.pid_exists(old_pid):
+                         process = psutil.Process(old_pid)
+                         if process.is_running():
+                             logger.warning(f"Another supervisor is running (PID: {old_pid})")
+                             return False
+                 except:
+                     pass
+
+                 # Clean up stale lock
+                 self.lock_file.unlink(missing_ok=True)
+
+             # Create new lock
+             with open(self.lock_file, 'w') as f:
+                 f.write(str(os.getpid()))
+
+             return True
+
+         except Exception as e:
+             logger.error(f"Could not acquire lock: {e}")
+             return False
+
+     def release_lock(self):
+         """Release the exclusive lock."""
+         self.lock_file.unlink(missing_ok=True)
+
+     def find_available_port(self) -> Optional[int]:
+         """Find an available port for the server."""
+         self.port_manager.cleanup_dead_instances()
+         port = self.port_manager.find_available_port()
+
+         if not port:
+             # Try extended range if configured
+             for p in range(Config.PORT_RANGE_START, Config.PORT_RANGE_END + 1):
+                 import socket
+                 try:
+                     sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+                     result = sock.connect_ex(('localhost', p))
+                     sock.close()
+                     if result != 0:
+                         return p
+                 except:
+                     pass
+
+         return port
+
+     def start_server_process(self) -> bool:
+         """Start the actual Socket.IO server process."""
+         try:
+             # Find available port
+             self.selected_port = self.find_available_port()
+             if not self.selected_port:
+                 logger.error("No available ports")
+                 return False
+
+             logger.info(f"Starting server on port {self.selected_port}")
+
+             # Fork to create daemon process
+             pid = os.fork()
+             if pid > 0:
+                 # Parent process - supervisor
+                 self.server_process = pid
+
+                 # Save PID files
+                 with open(self.pid_file, 'w') as f:
+                     f.write(str(pid))
+
+                 with open(self.supervisor_pid_file, 'w') as f:
+                     f.write(str(os.getpid()))
+
+                 # Save port info
+                 port_file = self.pid_file.parent / "socketio-port"
+                 with open(port_file, 'w') as f:
+                     f.write(str(self.selected_port))
+
+                 # Register with port manager
+                 self.port_manager.register_instance(self.selected_port, pid)
+
+                 # Wait for server to start
+                 if self._wait_for_server_start():
+                     logger.info(f"Server started successfully (PID: {pid})")
+                     self.metrics.update(
+                         start_time=datetime.now(),
+                         status='running'
+                     )
+                     self.backoff.reset()
+                     return True
+                 else:
+                     logger.error("Server failed to start within timeout")
+                     self._cleanup_failed_server(pid)
+                     return False
+
+             else:
+                 # Child process - actual server
+                 self._run_server_process()
+
+         except Exception as e:
+             logger.error(f"Failed to start server: {e}")
+             logger.debug(traceback.format_exc())
+             return False
+
+     def _run_server_process(self):
+         """Run the Socket.IO server in the child process."""
+         try:
+             # Become a proper daemon
+             os.setsid()
+             os.umask(0)
+
+             # Redirect output to log file
+             with open(self.log_file, 'a') as log:
+                 os.dup2(log.fileno(), sys.stdout.fileno())
+                 os.dup2(log.fileno(), sys.stderr.fileno())
+
+             # Log startup info
+             print(f"[{datetime.now()}] Starting Socket.IO server on port {self.selected_port}")
+             print(f"[{datetime.now()}] Python: {sys.executable}")
+             print(f"[{datetime.now()}] Version: {sys.version}")
+
+             # Create and start server with error handling
+             server = None
+             try:
+                 server = SocketIOServer(host="localhost", port=self.selected_port)
+
+                 # Setup signal handlers
+                 def signal_handler(signum, frame):
+                     print(f"[{datetime.now()}] Received signal {signum}, shutting down...")
+                     if server:
+                         try:
+                             server.stop_sync()
+                         except:
+                             pass
+                     sys.exit(0)
+
+                 signal.signal(signal.SIGTERM, signal_handler)
+                 signal.signal(signal.SIGINT, signal_handler)
+
+                 # Start server
+                 server.start_sync()
+
+                 # Keep running
+                 while True:
+                     time.sleep(1)
+
+             except KeyboardInterrupt:
+                 if server:
+                     server.stop_sync()
+                 sys.exit(0)
+             except Exception as e:
+                 print(f"[{datetime.now()}] Server error: {e}")
+                 print(traceback.format_exc())
+                 sys.exit(1)
+
+         except Exception as e:
+             print(f"[{datetime.now()}] Fatal error: {e}")
+             sys.exit(1)
+
+     def _wait_for_server_start(self) -> bool:
+         """Wait for the server to become responsive."""
+         import socket
+
+         start_time = time.time()
+         while time.time() - start_time < Config.STARTUP_TIMEOUT:
+             # Check if process is still alive
+             if not self._is_process_alive(self.server_process):
+                 return False
+
+             # Try to connect
+             try:
+                 sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+                 result = sock.connect_ex(('localhost', self.selected_port))
+                 sock.close()
+
+                 if result == 0:
+                     return True
+             except:
+                 pass
+
+             time.sleep(0.5)
+
+         return False
+
+     def _is_process_alive(self, pid: int) -> bool:
+         """Check if a process is alive."""
+         try:
+             process = psutil.Process(pid)
+             return process.is_running()
+         except (psutil.NoSuchProcess, psutil.AccessDenied):
+             return False
+
+     def _cleanup_failed_server(self, pid: int):
+         """Clean up after a failed server start."""
+         try:
+             if self._is_process_alive(pid):
+                 os.kill(pid, signal.SIGKILL)
+         except:
+             pass
+
+         self.pid_file.unlink(missing_ok=True)
+
+         if self.selected_port:
+             instances = self.port_manager.load_instances()
+             for instance_id, info in instances.items():
+                 if info.get('pid') == pid:
+                     self.port_manager.remove_instance(instance_id)
+                     break
+
+     def monitor_and_restart(self):
+         """Monitor the server and restart if it crashes."""
+         retry_count = 0
+
+         while retry_count < Config.MAX_RETRIES and not self.shutdown_requested:
+             try:
+                 # Start the server
+                 if self.start_server_process():
+                     # Start health monitoring
+                     if Config.METRICS_ENABLED and self.selected_port:
+                         self.health_monitor = HealthMonitor(self.selected_port, self.metrics)
+                         self.health_monitor.start()
+
+                     # Monitor the process
+                     while not self.shutdown_requested:
+                         time.sleep(5)
+
+                         # Check if process is still alive
+                         if not self._is_process_alive(self.server_process):
+                             logger.error("Server process died unexpectedly")
+                             self.metrics.increment('total_failures')
+                             self.metrics.update(
+                                 last_failure=datetime.now(),
+                                 status='crashed'
+                             )
+                             break
+
+                         # Check health status
+                         if self.health_monitor and self.health_monitor.consecutive_failures >= Config.UNHEALTHY_THRESHOLD:
+                             logger.error("Server is unhealthy, restarting...")
+                             self._stop_server_process()
+                             break
+
+                     if self.shutdown_requested:
+                         break
+
+                     # Stop health monitor before restart
+                     if self.health_monitor:
+                         self.health_monitor.stop()
+                         self.health_monitor = None
+
+                     # Server crashed, apply backoff before restart
+                     retry_count += 1
+                     delay = self.backoff.next_delay()
+                     logger.info(f"Restarting in {delay:.1f}s (attempt {retry_count}/{Config.MAX_RETRIES})")
+                     time.sleep(delay)
+                     self.metrics.increment('restarts')
+
+                 else:
+                     # Failed to start
+                     retry_count += 1
+                     delay = self.backoff.next_delay()
+                     logger.error(f"Failed to start, retrying in {delay:.1f}s (attempt {retry_count}/{Config.MAX_RETRIES})")
+                     time.sleep(delay)
+
+             except KeyboardInterrupt:
+                 logger.info("Supervisor interrupted")
+                 break
+             except Exception as e:
+                 logger.error(f"Supervisor error: {e}")
+                 logger.debug(traceback.format_exc())
+                 retry_count += 1
+                 time.sleep(self.backoff.next_delay())
+
+         if retry_count >= Config.MAX_RETRIES:
+             logger.error(f"Max retries ({Config.MAX_RETRIES}) exceeded, giving up")
+             self.metrics.update(status='failed')
+
+         self.cleanup()
+
+     def _stop_server_process(self):
+         """Stop the server process gracefully."""
+         if not self.server_process:
+             return
+
+         try:
+             # Try graceful shutdown
+             os.kill(self.server_process, signal.SIGTERM)
+
+             # Wait for shutdown
+             start_time = time.time()
+             while time.time() - start_time < Config.SHUTDOWN_TIMEOUT:
+                 if not self._is_process_alive(self.server_process):
+                     logger.info("Server stopped gracefully")
+                     return
+                 time.sleep(0.5)
+
+             # Force kill if still running
+             logger.warning("Server didn't stop gracefully, forcing...")
+             os.kill(self.server_process, signal.SIGKILL)
+             time.sleep(Config.FORCE_KILL_TIMEOUT)
+
+         except Exception as e:
+             logger.error(f"Error stopping server: {e}")
+
+     def cleanup(self):
+         """Clean up resources on shutdown."""
+         logger.info("Cleaning up supervisor resources")
+
+         # Stop health monitor
+         if self.health_monitor:
+             self.health_monitor.stop()
+
+         # Stop server process
+         if self.server_process:
+             self._stop_server_process()
+
+         # Clean up port registration
+         if self.selected_port:
+             instances = self.port_manager.load_instances()
+             for instance_id, info in instances.items():
+                 if info.get('pid') == self.server_process:
+                     self.port_manager.remove_instance(instance_id)
+                     break
+
+         # Remove PID files
+         self.pid_file.unlink(missing_ok=True)
+         self.supervisor_pid_file.unlink(missing_ok=True)
+
+         # Update metrics
+         self.metrics.update(status='stopped')
+
+         # Release lock
+         self.release_lock()
+
+     def handle_shutdown(self, signum, frame):
+         """Handle shutdown signals."""
+         logger.info(f"Received signal {signum}, initiating shutdown...")
+         self.shutdown_requested = True
+
+
+ def start_daemon():
+     """Start the hardened daemon with supervisor."""
+     supervisor = DaemonSupervisor()
+
+     # Check if already running
+     if supervisor.pid_file.exists():
+         try:
+             with open(supervisor.pid_file, 'r') as f:
+                 old_pid = int(f.read().strip())
+
+             if supervisor._is_process_alive(old_pid):
+                 print(f"Socket.IO daemon is already running (PID: {old_pid})")
+                 return
+         except:
+             pass
+
+         # Clean up stale PID file
+         supervisor.pid_file.unlink(missing_ok=True)
+
+     # Acquire lock
+     if not supervisor.acquire_lock():
+         print("Could not acquire lock - another instance may be running")
+         return
+
+     print("Starting hardened Socket.IO daemon with supervisor...")
+     print(f"Python: {PYTHON_EXECUTABLE}")
+     print(f"Max retries: {Config.MAX_RETRIES}")
+     print(f"Health checks: {'enabled' if Config.METRICS_ENABLED else 'disabled'}")
+
+     # Setup signal handlers
+     signal.signal(signal.SIGTERM, supervisor.handle_shutdown)
+     signal.signal(signal.SIGINT, supervisor.handle_shutdown)
+
+     try:
+         # Start monitoring and auto-restart loop
+         supervisor.monitor_and_restart()
+     finally:
+         supervisor.cleanup()
+
+     print("Socket.IO daemon stopped")
+
+
+ def stop_daemon():
+     """Stop the hardened daemon."""
+     deployment_root = get_project_root()
+     pid_file = deployment_root / ".claude-mpm" / "socketio-server.pid"
+     supervisor_pid_file = deployment_root / ".claude-mpm" / "socketio-supervisor.pid"
+
+     # Try to stop supervisor first
+     if supervisor_pid_file.exists():
+         try:
+             with open(supervisor_pid_file, 'r') as f:
+                 supervisor_pid = int(f.read().strip())
+
+             print(f"Stopping supervisor (PID: {supervisor_pid})...")
+             os.kill(supervisor_pid, signal.SIGTERM)
+
+             # Wait for supervisor to stop
+             for _ in range(20):
+                 if not psutil.pid_exists(supervisor_pid):
+                     print("Supervisor stopped successfully")
+                     supervisor_pid_file.unlink(missing_ok=True)
+                     return
+                 time.sleep(0.5)
+
+             # Force kill if needed
+             print("Supervisor didn't stop gracefully, forcing...")
+             os.kill(supervisor_pid, signal.SIGKILL)
+             supervisor_pid_file.unlink(missing_ok=True)
+
+         except Exception as e:
+             print(f"Error stopping supervisor: {e}")
+
+     # Also try to stop server directly if supervisor failed
+     if pid_file.exists():
+         try:
+             with open(pid_file, 'r') as f:
+                 server_pid = int(f.read().strip())
+
+             if psutil.pid_exists(server_pid):
+                 print(f"Stopping server (PID: {server_pid})...")
+                 os.kill(server_pid, signal.SIGTERM)
+                 time.sleep(2)
+
+                 if psutil.pid_exists(server_pid):
+                     os.kill(server_pid, signal.SIGKILL)
+
+             pid_file.unlink(missing_ok=True)
+
+         except Exception as e:
+             print(f"Error stopping server: {e}")
+
+
+ def status_daemon():
+     """Show detailed daemon status."""
+     deployment_root = get_project_root()
+     pid_file = deployment_root / ".claude-mpm" / "socketio-server.pid"
+     supervisor_pid_file = deployment_root / ".claude-mpm" / "socketio-supervisor.pid"
+     metrics_file = deployment_root / ".claude-mpm" / Config.METRICS_FILE
+
+     print("Socket.IO Daemon Status")
+     print("=" * 50)
+
+     # Check supervisor
+     if supervisor_pid_file.exists():
+         try:
+             with open(supervisor_pid_file, 'r') as f:
+                 supervisor_pid = int(f.read().strip())
+
+             if psutil.pid_exists(supervisor_pid):
+                 process = psutil.Process(supervisor_pid)
+                 print(f"✅ Supervisor: RUNNING (PID: {supervisor_pid})")
+                 print(f" Memory: {process.memory_info().rss / 1024 / 1024:.1f} MB")
+                 print(f" CPU: {process.cpu_percent()}%")
+             else:
+                 print(f"❌ Supervisor: NOT RUNNING (stale PID: {supervisor_pid})")
+         except:
+             print("❌ Supervisor: ERROR reading status")
+     else:
+         print("❌ Supervisor: NOT RUNNING")
+
+     # Check server
+     if pid_file.exists():
+         try:
+             with open(pid_file, 'r') as f:
+                 server_pid = int(f.read().strip())
+
+             if psutil.pid_exists(server_pid):
+                 process = psutil.Process(server_pid)
+                 print(f"✅ Server: RUNNING (PID: {server_pid})")
+                 print(f" Memory: {process.memory_info().rss / 1024 / 1024:.1f} MB")
+                 print(f" CPU: {process.cpu_percent()}%")
+
+                 # Check port
+                 port_file = deployment_root / ".claude-mpm" / "socketio-port"
+                 if port_file.exists():
+                     with open(port_file, 'r') as f:
+                         port = int(f.read().strip())
+                     print(f" Port: {port}")
+
+                     # Test connection
+                     import socket
+                     sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+                     result = sock.connect_ex(('localhost', port))
+                     sock.close()
+
+                     if result == 0:
+                         print(f" ✅ Listening on port {port}")
+                     else:
+                         print(f" ❌ Not responding on port {port}")
+             else:
+                 print(f"❌ Server: NOT RUNNING (stale PID: {server_pid})")
+         except:
+             print("❌ Server: ERROR reading status")
+     else:
+         print("❌ Server: NOT RUNNING")
+
+     # Show metrics
+     if metrics_file.exists():
+         try:
+             with open(metrics_file, 'r') as f:
+                 metrics = json.load(f)
+
+             print("\n📊 Metrics:")
+             print(f" Status: {metrics.get('status', 'unknown')}")
+             print(f" Uptime: {metrics.get('uptime_seconds', 0)} seconds")
+             print(f" Restarts: {metrics.get('restarts', 0)}")
+             print(f" Failures: {metrics.get('total_failures', 0)}")
+             print(f" Health Checks Passed: {metrics.get('health_checks_passed', 0)}")
+             print(f" Health Checks Failed: {metrics.get('health_checks_failed', 0)}")
+
+             if metrics.get('last_failure'):
+                 print(f" Last Failure: {metrics['last_failure']}")
+             if metrics.get('last_health_check'):
+                 print(f" Last Health Check: {metrics['last_health_check']}")
+
+         except Exception as e:
+             print(f"\n❌ Could not read metrics: {e}")
+
+     print("\n🔧 Configuration:")
+     print(f" Max Retries: {Config.MAX_RETRIES}")
+     print(f" Health Check Interval: {Config.HEALTH_CHECK_INTERVAL}s")
+     print(f" Port Range: {Config.PORT_RANGE_START}-{Config.PORT_RANGE_END}")
+     print(f" Log Level: {Config.LOG_LEVEL}")
+
+
+ def main():
+     """Main entry point."""
+     if len(sys.argv) < 2:
+         print("Usage: socketio-daemon-hardened.py {start|stop|restart|status}")
+         sys.exit(1)
+
+     command = sys.argv[1]
+
+     if command == "start":
+         start_daemon()
+     elif command == "stop":
+         stop_daemon()
+     elif command == "restart":
+         stop_daemon()
+         time.sleep(2)
+         start_daemon()
+     elif command == "status":
+         status_daemon()
+     else:
+         print(f"Unknown command: {command}")
+         print("Usage: socketio-daemon-hardened.py {start|stop|restart|status}")
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
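
For reference, the hardened daemon added above is configured entirely through its SOCKETIO_* environment variables and driven by the start/stop/restart/status commands in main(). The snippet below is a minimal sketch of exercising that CLI from Python; it assumes the installed wheel makes the script importable as the module claude_mpm.scripts.socketio_daemon_hardened, which is an inference from the file layout rather than something the package documents.

import os
import subprocess
import sys
import time

env = os.environ.copy()
env.update({
    "SOCKETIO_MAX_RETRIES": "5",            # supervisor gives up after 5 failed starts
    "SOCKETIO_INITIAL_RETRY_DELAY": "2.0",  # first backoff delay, in seconds
    "SOCKETIO_HEALTH_CHECK_INTERVAL": "15.0",
    "SOCKETIO_LOG_LEVEL": "DEBUG",
})

# Assumed invocation path; adjust if the package exposes a dedicated entry point.
daemon = [sys.executable, "-m", "claude_mpm.scripts.socketio_daemon_hardened"]

# "start" keeps the supervisor loop in the foreground (only the server child
# daemonizes itself via fork/setsid), so launch it as a background process.
supervisor = subprocess.Popen(daemon + ["start"], env=env)

time.sleep(5)  # give the forked server time to bind a port
subprocess.run(daemon + ["status"], env=env, check=False)

# "stop" signals the PIDs recorded under .claude-mpm/, then the supervisor exits.
subprocess.run(daemon + ["stop"], env=env, check=False)
supervisor.wait(timeout=30)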