claude-mpm 3.4.0__py3-none-any.whl → 3.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -33,6 +33,68 @@ from pathlib import Path
33
33
  from typing import Dict, Any, Optional, List, Set
34
34
  from collections import deque
35
35
  import importlib.metadata
36
import platform

# fcntl is Unix-only. Importing it unconditionally would make this module
# fail to import on Windows before the msvcrt fallback below is ever
# reached, so a missing module is recorded as None instead.
try:
    import fcntl  # Unix file locking
except ImportError:
    fcntl = None

# Import health monitoring and recovery systems
try:
    from .health_monitor import (
        AdvancedHealthMonitor, ProcessResourceChecker,
        NetworkConnectivityChecker, ServiceHealthChecker,
        HealthStatus, HealthCheckResult
    )
    from .recovery_manager import RecoveryManager, RecoveryEvent
    HEALTH_MONITORING_AVAILABLE = True
except ImportError:
    HEALTH_MONITORING_AVAILABLE = False

    # Create stub classes to prevent errors
    class AdvancedHealthMonitor:
        """No-op stand-in used when the health monitoring module is missing."""

        def __init__(self, *args, **kwargs): pass
        def add_checker(self, *args): pass
        def add_health_callback(self, *args): pass
        def start_monitoring(self): pass
        async def stop_monitoring(self): pass
        def get_current_status(self): return None
        def export_diagnostics(self): return {}

    class RecoveryManager:
        """No-op stand-in used when the recovery manager module is missing."""

        def __init__(self, *args, **kwargs): pass
        def handle_health_result(self, *args): return None
        def get_recovery_status(self): return {}

    class HealthCheckResult:
        """Placeholder so annotations naming HealthCheckResult still resolve."""

# Import enhanced error classes
try:
    from .exceptions import (
        DaemonConflictError, PortConflictError, StaleProcessError,
        RecoveryFailedError, HealthCheckError, format_troubleshooting_guide
    )
    ENHANCED_ERRORS_AVAILABLE = True
except ImportError:
    ENHANCED_ERRORS_AVAILABLE = False

    # Create stub classes to prevent errors
    class DaemonConflictError(Exception): pass
    class PortConflictError(Exception): pass
    class StaleProcessError(Exception): pass
    class RecoveryFailedError(Exception): pass
    class HealthCheckError(Exception): pass
    def format_troubleshooting_guide(error): return str(error)

try:
    import psutil
    PSUTIL_AVAILABLE = True
except ImportError:
    PSUTIL_AVAILABLE = False
    psutil = None

# Windows file locking support. Both names are bound up front so that they
# exist on every platform and on every import outcome.
msvcrt = None
WINDOWS_LOCKING = False
if platform.system() == 'Windows':
    try:
        import msvcrt
        WINDOWS_LOCKING = True
    except ImportError:
        pass
36
98
 
37
99
  try:
38
100
  import socketio
@@ -65,7 +127,13 @@ COMPATIBILITY_MATRIX = {
65
127
  "version_compatibility",
66
128
  "process_isolation",
67
129
  "health_monitoring",
68
- "event_namespacing"
130
+ "advanced_health_monitoring",
131
+ "automatic_recovery",
132
+ "circuit_breaker",
133
+ "resource_monitoring",
134
+ "event_namespacing",
135
+ "comprehensive_diagnostics",
136
+ "metrics_export"
69
137
  ]
70
138
  }
71
139
  }
@@ -115,14 +183,35 @@ class StandaloneSocketIOServer:
115
183
  # Process management
116
184
  self.pid = os.getpid()
117
185
  self.pidfile_path = self._get_pidfile_path()
186
+ self.pidfile_lock = None # File lock object
187
+ self.process_start_time = None
188
+ if PSUTIL_AVAILABLE:
189
+ try:
190
+ current_process = psutil.Process(self.pid)
191
+ self.process_start_time = current_process.create_time()
192
+ except Exception as e:
193
+ self.logger.warning(f"Could not get process start time: {e}")
118
194
 
119
195
  if not SOCKETIO_AVAILABLE:
120
196
  self.logger.error("Socket.IO dependencies not available. Install with: pip install python-socketio aiohttp")
121
197
  return
122
198
 
199
+ # Log initialization with comprehensive info
123
200
  self.logger.info(f"Standalone Socket.IO server v{self.server_version} initialized")
124
201
  self.logger.info(f"Server ID: {self.server_id}, PID: {self.pid}")
125
202
  self.logger.info(f"Using python-socketio v{SOCKETIO_VERSION}")
203
+ self.logger.info(f"Enhanced validation: psutil {'available' if PSUTIL_AVAILABLE else 'not available'}")
204
+ self.logger.info(f"File locking: {platform.system()} {'supported' if (platform.system() != 'Windows' or WINDOWS_LOCKING) else 'not supported'}")
205
+ self.logger.info(f"Health monitoring: {'available' if HEALTH_MONITORING_AVAILABLE else 'not available'}")
206
+
207
+ # Initialize health monitoring system
208
+ self.health_monitor = None
209
+ self.recovery_manager = None
210
+ if HEALTH_MONITORING_AVAILABLE:
211
+ self._initialize_health_monitoring()
212
+
213
+ if self.process_start_time:
214
+ self.logger.debug(f"Process start time: {self.process_start_time}")
126
215
 
127
216
  def _setup_logging(self) -> logging.Logger:
128
217
  """Setup dedicated logging for standalone server."""
@@ -139,6 +228,92 @@ class StandaloneSocketIOServer:
139
228
 
140
229
  return logger
141
230
 
231
def _initialize_health_monitoring(self):
    """Build the health monitor and recovery manager and connect them.

    Any failure is logged instead of raised; in that case both
    ``self.health_monitor`` and ``self.recovery_manager`` end up as None.
    """
    try:
        # 30 s between checks, 100 retained results, 300 s (5 minute)
        # aggregation window.
        monitor_settings = dict(
            check_interval=30,
            history_size=100,
            aggregation_window=300,
        )
        self.health_monitor = AdvancedHealthMonitor(monitor_settings)

        # Resource usage of this process can only be checked with psutil.
        if PSUTIL_AVAILABLE:
            resource_probe = ProcessResourceChecker(
                pid=self.pid,
                cpu_threshold=80.0,        # percent CPU
                memory_threshold_mb=500,   # megabytes
                fd_threshold=1000          # open file descriptors
            )
            self.health_monitor.add_checker(resource_probe)

        # Connectivity probe against the server's own host/port.
        connectivity_probe = NetworkConnectivityChecker(
            host=self.host,
            port=self.port,
            timeout=2.0
        )
        self.health_monitor.add_checker(connectivity_probe)

        # The service-level checker is registered later, in start_async,
        # once health_stats is fully initialized.

        breaker_settings = dict(
            failure_threshold=5,
            timeout_seconds=300,
            success_threshold=3,
        )
        strategy_settings = dict(
            warning_threshold=2,
            critical_threshold=1,
            failure_window_seconds=300,
            min_recovery_interval=60,
        )
        recovery_settings = dict(
            enabled=True,
            check_interval=60,
            max_recovery_attempts=5,
            recovery_timeout=30,
            circuit_breaker=breaker_settings,
            strategy=strategy_settings,
        )
        self.recovery_manager = RecoveryManager(recovery_settings, self)

        # Every health result is routed through _handle_health_result, which
        # hands it to the recovery manager.
        self.health_monitor.add_health_callback(self._handle_health_result)

        self.logger.info("Health monitoring and recovery systems initialized")

    except Exception as init_error:
        self.logger.error(f"Failed to initialize health monitoring: {init_error}")
        self.health_monitor = None
        self.recovery_manager = None
296
+
297
def _handle_health_result(self, health_result: "HealthCheckResult"):
    """Handle a health check result and trigger recovery if needed.

    Args:
        health_result: Result object delivered by the health monitor
            callback.

    The annotation is quoted on purpose: ``HealthCheckResult`` is only bound
    when the optional health monitoring import succeeds, and an unquoted
    annotation is evaluated when the method is defined.

    Never raises: failures in recovery handling and in error reporting are
    logged so that a reporting problem cannot break the monitor's callback
    chain.
    """
    try:
        if self.recovery_manager:
            recovery_event = self.recovery_manager.handle_health_result(health_result)
            if recovery_event:
                self.logger.info(f"Recovery triggered: {recovery_event.action.value}")
    except Exception as e:
        self.logger.error(f"Error handling health result: {e}")

    # Enhanced error reporting for health check failures
    if not ENHANCED_ERRORS_AVAILABLE:
        return

    failure_states = ('critical', 'failed')
    status = getattr(health_result, 'status', None)
    # NOTE(review): status may be an enum member rather than a plain string
    # (confirm against health_monitor.HealthStatus); both forms are accepted.
    status_value = getattr(status, 'value', status)
    if status not in failure_states and status_value not in failure_states:
        return

    try:
        health_error = HealthCheckError(
            check_name=getattr(health_result, 'check_name', 'unknown'),
            check_status=getattr(health_result, 'status', 'failed'),
            check_details=getattr(health_result, 'details', {})
        )
        self.logger.error(f"\nHealth Check Failure Details:\n{health_error}")
    except Exception as e:
        self.logger.error(f"Error reporting health check failure: {e}")
316
+
142
317
  def _get_pidfile_path(self) -> Path:
143
318
  """Get path for PID file to track running server."""
144
319
  # Use system temp directory or user home
@@ -182,52 +357,442 @@ class StandaloneSocketIOServer:
182
357
 
183
358
  return result
184
359
 
185
- def is_already_running(self) -> bool:
186
- """Check if another server instance is already running on this port."""
360
def _validate_process_identity(self, pid: int, expected_cmdline_patterns: Optional[List[str]] = None) -> Dict[str, Any]:
    """Validate that a process is actually our Socket.IO server.

    Args:
        pid: Process ID to validate
        expected_cmdline_patterns: Substrings, matched case-insensitively,
            of which at least one must occur in the process command line.
            Defaults to "socketio", "standalone_socketio_server",
            "claude-mpm" and this server's port number.

    Returns:
        Dict with the keys ``is_valid`` (process exists and is not a
        zombie), ``is_zombie``, ``is_our_server`` (command line matched a
        pattern), ``process_info`` and ``validation_errors``.
    """
    validation_result = {
        "is_valid": False,
        "is_zombie": False,
        "is_our_server": False,
        "process_info": {},
        "validation_errors": []
    }

    if not PSUTIL_AVAILABLE:
        validation_result["validation_errors"].append("psutil not available for enhanced validation")
        # Fallback to basic process existence check
        try:
            os.kill(pid, 0)
        except PermissionError:
            # EPERM means the process exists but we may not signal it
            # (e.g. it belongs to another user); it is not a stale PID.
            process_exists = True
        except OSError:
            process_exists = False
        else:
            process_exists = True

        if process_exists:
            validation_result["is_valid"] = True
            validation_result["process_info"] = {"pid": pid, "method": "basic_os_check"}
        else:
            validation_result["validation_errors"].append(f"Process {pid} does not exist")
        return validation_result

    try:
        process = psutil.Process(pid)
        # Read the status once so the recorded value and the zombie check
        # cannot disagree.
        status = process.status()

        # Basic process info
        process_info = {
            "pid": pid,
            "status": status,
            "create_time": process.create_time(),
            "name": process.name(),
            "cwd": None,
            "cmdline": [],
            "memory_info": None
        }

        # Check if process is zombie
        if status == psutil.STATUS_ZOMBIE:
            validation_result["is_zombie"] = True
            validation_result["validation_errors"].append(f"Process {pid} is a zombie")
            validation_result["process_info"] = process_info
            return validation_result

        # Get additional process details; these are best-effort and leave
        # the defaults above in place when access is refused.
        try:
            process_info["cwd"] = process.cwd()
            process_info["cmdline"] = process.cmdline()
            process_info["memory_info"] = process.memory_info()._asdict()
        except (psutil.AccessDenied, psutil.NoSuchProcess) as e:
            validation_result["validation_errors"].append(f"Access denied getting process details: {e}")

        validation_result["process_info"] = process_info
        validation_result["is_valid"] = True

        # Validate this is likely our server process
        cmdline = process_info.get("cmdline", [])
        cmdline_str = " ".join(cmdline).lower()

        # Default patterns for our Socket.IO server
        if expected_cmdline_patterns is None:
            expected_cmdline_patterns = [
                "socketio",
                "standalone_socketio_server",
                "claude-mpm",
                str(self.port)
            ]

        # A single matching pattern is enough to accept the process.
        matched_patterns = [
            pattern for pattern in expected_cmdline_patterns
            if pattern.lower() in cmdline_str
        ]
        if matched_patterns:
            validation_result["is_our_server"] = True
            self.logger.debug(f"Process {pid} matches server patterns: {matched_patterns}")
        else:
            validation_result["validation_errors"].append(
                f"Process {pid} command line '{cmdline_str}' does not match expected patterns: {expected_cmdline_patterns}"
            )
            self.logger.warning(f"Process {pid} does not appear to be our server: {cmdline}")

    except psutil.NoSuchProcess:
        validation_result["validation_errors"].append(f"Process {pid} no longer exists")
    except psutil.AccessDenied as e:
        validation_result["validation_errors"].append(f"Access denied to process {pid}: {e}")
    except Exception as e:
        validation_result["validation_errors"].append(f"Error validating process {pid}: {e}")

    return validation_result
453
+
454
+ def _acquire_pidfile_lock(self, pidfile_fd) -> bool:
455
+ """Acquire exclusive lock on PID file.
456
+
457
+ Args:
458
+ pidfile_fd: Open file descriptor for PID file
459
+
460
+ Returns:
461
+ True if lock acquired successfully, False otherwise
462
+ """
463
+ try:
464
+ if platform.system() == 'Windows' and WINDOWS_LOCKING:
465
+ # Windows file locking
466
+ msvcrt.locking(pidfile_fd.fileno(), msvcrt.LK_NBLCK, 1)
467
+ return True
468
+ else:
469
+ # Unix file locking
470
+ fcntl.flock(pidfile_fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
471
+ return True
472
+ except (IOError, OSError) as e:
473
+ self.logger.debug(f"Could not acquire PID file lock: {e}")
474
+ return False
475
+
476
+ def _release_pidfile_lock(self, pidfile_fd):
477
+ """Release lock on PID file.
478
+
479
+ Args:
480
+ pidfile_fd: Open file descriptor for PID file
481
+ """
482
+ try:
483
+ if platform.system() == 'Windows' and WINDOWS_LOCKING:
484
+ msvcrt.locking(pidfile_fd.fileno(), msvcrt.LK_UNLCK, 1)
485
+ else:
486
+ fcntl.flock(pidfile_fd.fileno(), fcntl.LOCK_UN)
487
+ except (IOError, OSError) as e:
488
+ self.logger.debug(f"Error releasing PID file lock: {e}")
489
+
490
+ def _validate_pidfile_timestamp(self, pidfile_path: Path, process_start_time: float) -> bool:
491
+ """Validate that PID file was created around the same time as the process.
492
+
493
+ Args:
494
+ pidfile_path: Path to PID file
495
+ process_start_time: Process start time from psutil
496
+
497
+ Returns:
498
+ True if timestamps are reasonably close, False otherwise
499
+ """
500
+ try:
501
+ pidfile_mtime = pidfile_path.stat().st_mtime
502
+ time_diff = abs(pidfile_mtime - process_start_time)
503
+
504
+ # Allow up to 5 seconds difference (process start vs file creation)
505
+ if time_diff <= 5.0:
506
+ return True
507
+ else:
508
+ self.logger.warning(
509
+ f"PID file timestamp ({pidfile_mtime}) and process start time ({process_start_time}) "
510
+ f"differ by {time_diff:.2f} seconds"
511
+ )
512
+ return False
513
+ except Exception as e:
514
+ self.logger.warning(f"Could not validate PID file timestamp: {e}")
515
+ return False
516
+
517
def is_already_running(self, raise_on_conflict: bool = False) -> bool:
    """Enhanced check if another server instance is already running on this port.

    This method performs comprehensive validation including:
    - PID file existence and validity
    - Process identity verification (command line, start time)
    - Zombie process detection
    - Port availability check
    - Automatic cleanup of stale PID files

    Args:
        raise_on_conflict: When True and the enhanced error classes are
            available, raise a descriptive error instead of returning.

    Returns:
        True if a valid server is already running, False otherwise

    Raises:
        StaleProcessError: raise_on_conflict is set and the PID file points
            at a missing or zombie process.
        DaemonConflictError: raise_on_conflict is set and a valid server is
            running.
        PortConflictError: propagated from the port-only check.
    """
    self.logger.debug(f"Checking if server is already running on {self.host}:{self.port}")

    try:
        # Step 1: Check PID file existence
        if not self.pidfile_path.exists():
            self.logger.debug("No PID file found")
            return self._check_port_only(raise_on_conflict)

        self.logger.debug(f"Found PID file: {self.pidfile_path}")

        # Step 2: Read PID from file with support for both JSON and legacy formats
        server_id = "unknown"
        old_pid = None
        try:
            with open(self.pidfile_path, 'r') as f:
                pid_content = f.read().strip()

            if not pid_content:
                self.logger.warning("Empty PID file")
                self._cleanup_stale_pidfile("empty_file")
                return self._check_port_only(raise_on_conflict)

            # Try JSON format first (new format)
            try:
                pidfile_data = json.loads(pid_content)
                old_pid = pidfile_data["pid"]
                server_id = pidfile_data.get("server_id", "unknown")
                self.logger.debug(f"Found PID {old_pid} for server {server_id} in JSON format")
            except (json.JSONDecodeError, KeyError, TypeError):
                # Fallback to legacy format (plain PID number)
                if pid_content.isdigit():
                    old_pid = int(pid_content)
                    self.logger.debug(f"Found PID {old_pid} in legacy format")

            # Only a positive integer is a usable PID. Zero and negative
            # values address process groups in os.kill(), and strings or
            # floats from a hand-edited JSON file are not PIDs at all.
            pid_is_usable = (
                isinstance(old_pid, int)
                and not isinstance(old_pid, bool)
                and old_pid > 0
            )
            if not pid_is_usable:
                self.logger.warning(f"Invalid PID content in file: '{pid_content[:100]}...' (truncated)")
                self._cleanup_stale_pidfile("invalid_content")
                return self._check_port_only(raise_on_conflict)

        except (IOError, ValueError) as e:
            self.logger.warning(f"Could not read PID file: {e}")
            self._cleanup_stale_pidfile("read_error")
            return self._check_port_only(raise_on_conflict)

        # Step 3: Enhanced process validation
        validation = self._validate_process_identity(old_pid)

        if not validation["is_valid"]:
            self.logger.info(f"Process {old_pid} is not valid: {validation['validation_errors']}")
            if raise_on_conflict and ENHANCED_ERRORS_AVAILABLE:
                raise StaleProcessError(
                    pid=old_pid,
                    pidfile_path=self.pidfile_path,
                    process_status="not_found",
                    validation_errors=validation['validation_errors']
                )
            self._cleanup_stale_pidfile("invalid_process")
            return self._check_port_only(raise_on_conflict)

        if validation["is_zombie"]:
            self.logger.info(f"Process {old_pid} is a zombie, cleaning up")
            if raise_on_conflict and ENHANCED_ERRORS_AVAILABLE:
                raise StaleProcessError(
                    pid=old_pid,
                    pidfile_path=self.pidfile_path,
                    process_status="zombie",
                    validation_errors=["Process is a zombie (terminated but not reaped)"]
                )
            self._cleanup_stale_pidfile("zombie_process")
            return self._check_port_only(raise_on_conflict)

        process_info = validation['process_info']

        # Step 4: Verify this is actually our server process
        if not validation["is_our_server"]:
            self.logger.warning(
                f"Process {old_pid} exists but does not appear to be our Socket.IO server. "
                f"Command line: {process_info.get('cmdline', 'unknown')}"
            )
            # Don't automatically clean up - might be another legitimate process
            return self._check_port_only(raise_on_conflict)

        # Step 5: Validate process start time against PID file timestamp.
        # create_time is only present when the validation could query it.
        process_start_time = process_info.get('create_time')
        if process_start_time is not None:
            if not self._validate_pidfile_timestamp(self.pidfile_path, process_start_time):
                self.logger.warning("PID file timestamp does not match process start time")
                # Continue anyway - timestamp validation is not critical

        # Step 6: All validations passed
        self.logger.info(
            f"Found valid running server: PID {old_pid}, "
            f"status: {process_info.get('status', 'unknown')}, "
            f"name: {process_info.get('name', 'unknown')}"
        )

        if raise_on_conflict and ENHANCED_ERRORS_AVAILABLE:
            # server_id was captured while parsing the PID file in step 2.
            raise DaemonConflictError(
                port=self.port,
                existing_pid=old_pid,
                existing_server_id=server_id,
                process_info=process_info,
                pidfile_path=self.pidfile_path
            )

        return True

    except (DaemonConflictError, StaleProcessError, PortConflictError):
        # Re-raise our enhanced errors instead of catching them
        raise
    except Exception as e:
        self.logger.error(f"Error during enhanced server check: {e}")
        # Fallback to basic port check on unexpected errors
        return self._check_port_only(raise_on_conflict)
651
+
652
+ def _check_port_only(self, raise_on_conflict: bool = False) -> bool:
653
+ """Fallback method to check if port is in use.
654
+
655
+ Args:
656
+ raise_on_conflict: If True, raises PortConflictError instead of returning True
657
+
658
+ Returns:
659
+ True if port is in use, False otherwise
201
660
 
202
- # Check if port is in use
661
+ Raises:
662
+ PortConflictError: If raise_on_conflict=True and port is in use
663
+ """
664
+ try:
203
665
  with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
204
666
  s.settimeout(1.0)
205
667
  result = s.connect_ex((self.host, self.port))
206
668
  if result == 0:
207
- self.logger.info(f"Port {self.port} is already in use")
208
- return True
669
+ self.logger.info(f"Port {self.port} is in use by some process")
209
670
 
671
+ if raise_on_conflict and ENHANCED_ERRORS_AVAILABLE:
672
+ # Try to identify the conflicting process if psutil is available
673
+ conflicting_process = {}
674
+ if PSUTIL_AVAILABLE:
675
+ try:
676
+ import psutil
677
+ for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
678
+ try:
679
+ for conn in proc.connections():
680
+ if (conn.laddr.ip == self.host or conn.laddr.ip == '0.0.0.0') and conn.laddr.port == self.port:
681
+ conflicting_process = {
682
+ 'pid': proc.info['pid'],
683
+ 'name': proc.info['name'],
684
+ 'cmdline': proc.info['cmdline']
685
+ }
686
+ break
687
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
688
+ continue
689
+ if conflicting_process:
690
+ break
691
+ except Exception:
692
+ pass # Ignore errors in process discovery
693
+
694
+ raise PortConflictError(
695
+ port=self.port,
696
+ host=self.host,
697
+ conflicting_process=conflicting_process
698
+ )
699
+
700
+ return True
210
701
  except Exception as e:
211
- self.logger.debug(f"Error checking for existing server: {e}")
702
+ if not isinstance(e, PortConflictError): # Don't mask our own exceptions
703
+ self.logger.debug(f"Error checking port availability: {e}")
212
704
 
213
705
  return False
214
706
 
707
+ def _cleanup_stale_pidfile(self, reason: str):
708
+ """Clean up stale PID file with logging.
709
+
710
+ Args:
711
+ reason: Reason for cleanup (for logging)
712
+ """
713
+ try:
714
+ if self.pidfile_path.exists():
715
+ self.pidfile_path.unlink()
716
+ self.logger.info(f"Cleaned up stale PID file (reason: {reason}): {self.pidfile_path}")
717
+ except Exception as e:
718
+ self.logger.error(f"Failed to clean up stale PID file: {e}")
719
+
215
720
  def create_pidfile(self):
216
- """Create PID file to track this server instance."""
721
+ """Create PID file with exclusive locking to track this server instance.
722
+
723
+ This method creates a PID file with exclusive locking to prevent race conditions
724
+ and ensures only one server instance can hold the lock at a time.
725
+ """
217
726
  try:
218
727
  self.pidfile_path.parent.mkdir(parents=True, exist_ok=True)
219
- with open(self.pidfile_path, 'w') as f:
220
- f.write(str(self.pid))
221
- self.logger.info(f"Created PID file: {self.pidfile_path}")
728
+
729
+ # Open file for writing with exclusive creation
730
+ pidfile_fd = open(self.pidfile_path, 'w')
731
+
732
+ # Try to acquire exclusive lock
733
+ if not self._acquire_pidfile_lock(pidfile_fd):
734
+ pidfile_fd.close()
735
+ if ENHANCED_ERRORS_AVAILABLE:
736
+ raise DaemonConflictError(
737
+ port=self.port,
738
+ existing_pid=0, # Unknown PID since we can't get lock
739
+ existing_server_id="unknown",
740
+ pidfile_path=self.pidfile_path
741
+ )
742
+ else:
743
+ raise RuntimeError("Could not acquire exclusive lock on PID file")
744
+
745
+ # Write PID and additional metadata
746
+ pidfile_content = {
747
+ "pid": self.pid,
748
+ "server_id": self.server_id,
749
+ "server_version": self.server_version,
750
+ "port": self.port,
751
+ "host": self.host,
752
+ "start_time": self.start_time.isoformat() + "Z",
753
+ "process_start_time": self.process_start_time if self.process_start_time else None,
754
+ "python_version": sys.version.split()[0],
755
+ "platform": platform.system(),
756
+ "created_at": datetime.utcnow().isoformat() + "Z"
757
+ }
758
+
759
+ # Write JSON format for better validation
760
+ pidfile_fd.write(json.dumps(pidfile_content, indent=2))
761
+ pidfile_fd.flush()
762
+
763
+ # Keep file descriptor open to maintain lock
764
+ self.pidfile_lock = pidfile_fd
765
+
766
+ self.logger.info(f"Created PID file with exclusive lock: {self.pidfile_path}")
767
+ self.logger.debug(f"PID file content: {pidfile_content}")
768
+
222
769
  except Exception as e:
223
770
  self.logger.error(f"Failed to create PID file: {e}")
771
+ if 'pidfile_fd' in locals():
772
+ try:
773
+ pidfile_fd.close()
774
+ except:
775
+ pass
776
+ raise
224
777
 
225
778
  def remove_pidfile(self):
226
- """Remove PID file on shutdown."""
779
+ """Remove PID file and release lock on shutdown."""
227
780
  try:
781
+ # Release file lock first
782
+ if self.pidfile_lock:
783
+ try:
784
+ self._release_pidfile_lock(self.pidfile_lock)
785
+ self.pidfile_lock.close()
786
+ self.pidfile_lock = None
787
+ self.logger.debug("Released PID file lock")
788
+ except Exception as e:
789
+ self.logger.warning(f"Error releasing PID file lock: {e}")
790
+
791
+ # Remove PID file
228
792
  if self.pidfile_path.exists():
229
793
  self.pidfile_path.unlink()
230
794
  self.logger.info(f"Removed PID file: {self.pidfile_path}")
795
+
231
796
  except Exception as e:
232
797
  self.logger.error(f"Failed to remove PID file: {e}")
233
798
 
@@ -247,7 +812,11 @@ class StandaloneSocketIOServer:
247
812
  async def start_async(self):
248
813
  """Start the server asynchronously."""
249
814
  if not SOCKETIO_AVAILABLE:
250
- raise RuntimeError("Socket.IO dependencies not available")
815
+ error_msg = "Socket.IO dependencies not available. Install with: pip install python-socketio aiohttp"
816
+ if ENHANCED_ERRORS_AVAILABLE:
817
+ raise RuntimeError(error_msg + "\n\nInstallation steps:\n 1. pip install python-socketio aiohttp\n 2. Restart the server\n 3. Verify installation: python -c 'import socketio; print(socketio.__version__)'")
818
+ else:
819
+ raise RuntimeError(error_msg)
251
820
 
252
821
  self.logger.info(f"Starting standalone Socket.IO server v{self.server_version}")
253
822
 
@@ -271,14 +840,53 @@ class StandaloneSocketIOServer:
271
840
  self._setup_event_handlers()
272
841
 
273
842
  # Start the server
274
- self.runner = web.AppRunner(self.app)
275
- await self.runner.setup()
276
-
277
- self.site = web.TCPSite(self.runner, self.host, self.port)
278
- await self.site.start()
279
-
280
- self.running = True
281
- self.create_pidfile()
843
+ try:
844
+ self.runner = web.AppRunner(self.app)
845
+ await self.runner.setup()
846
+
847
+ self.site = web.TCPSite(self.runner, self.host, self.port)
848
+ await self.site.start()
849
+
850
+ self.running = True
851
+
852
+ # Create PID file after successful server start
853
+ self.create_pidfile()
854
+
855
+ # Start health monitoring
856
+ if self.health_monitor:
857
+ # Add service health checker now that stats are available
858
+ service_checker = ServiceHealthChecker(
859
+ service_stats=self.health_stats,
860
+ max_clients=1000,
861
+ max_error_rate=0.1
862
+ )
863
+ self.health_monitor.add_checker(service_checker)
864
+
865
+ # Start monitoring
866
+ self.health_monitor.start_monitoring()
867
+ self.logger.info("Health monitoring started")
868
+
869
+ except Exception as e:
870
+ self.logger.error(f"Failed to start server: {e}")
871
+ # Clean up partial initialization
872
+ if hasattr(self, 'runner') and self.runner:
873
+ try:
874
+ await self.runner.cleanup()
875
+ except:
876
+ pass
877
+
878
+ # Enhanced error handling for common startup failures
879
+ if ENHANCED_ERRORS_AVAILABLE:
880
+ if "Address already in use" in str(e) or "Permission denied" in str(e):
881
+ # This is likely a port conflict
882
+ try:
883
+ # Check if port is in use and raise appropriate error
884
+ self._check_port_only(raise_on_conflict=True)
885
+ except PortConflictError:
886
+ # Re-raise the more specific error
887
+ raise
888
+
889
+ raise
282
890
 
283
891
  self.logger.info(f"🚀 Standalone Socket.IO server STARTED on http://{self.host}:{self.port}")
284
892
  self.logger.info(f"🔧 Server ID: {self.server_id}")
@@ -346,6 +954,11 @@ class StandaloneSocketIOServer:
346
954
  async def _shutdown_async(self):
347
955
  """Async shutdown process."""
348
956
  try:
957
+ # Stop health monitoring
958
+ if self.health_monitor:
959
+ await self.health_monitor.stop_monitoring()
960
+ self.logger.info("Health monitoring stopped")
961
+
349
962
  # Close all client connections
350
963
  if self.sio:
351
964
  await self.sio.shutdown()
@@ -534,6 +1147,12 @@ class StandaloneSocketIOServer:
534
1147
  except Exception as e:
535
1148
  self.logger.error(f"Error handling claude_event: {e}")
536
1149
  self.health_stats["errors"] += 1
1150
+
1151
+ # Check if error rate is becoming concerning
1152
+ if ENHANCED_ERRORS_AVAILABLE and self.health_stats["errors"] > 0:
1153
+ error_rate = self.health_stats["errors"] / max(self.health_stats["events_processed"], 1)
1154
+ if error_rate > 0.1: # More than 10% error rate
1155
+ self.logger.warning(f"⚠️ High error rate detected: {error_rate:.2%} ({self.health_stats['errors']} errors out of {self.health_stats['events_processed']} events)")
537
1156
 
538
1157
  @self.sio.event
539
1158
  async def get_history(sid, data=None):
@@ -571,6 +1190,8 @@ class StandaloneSocketIOServer:
571
1190
  def main():
572
1191
  """Main entry point for standalone server execution."""
573
1192
  import argparse
1193
+ import json
1194
+ import time
574
1195
 
575
1196
  parser = argparse.ArgumentParser(description="Standalone Claude MPM Socket.IO Server")
576
1197
  parser.add_argument("--host", default="localhost", help="Host to bind to")
@@ -605,12 +1226,60 @@ def main():
605
1226
 
606
1227
  if args.stop:
607
1228
  if server.is_already_running():
608
- # Send termination signal to running server
1229
+ # Send termination signal to running server with enhanced validation
609
1230
  try:
1231
+ # Read and validate PID file
610
1232
  with open(server.pidfile_path, 'r') as f:
611
- pid = int(f.read().strip())
1233
+ content = f.read().strip()
1234
+
1235
+ # Try to parse as JSON first (new format), fallback to plain PID
1236
+ try:
1237
+ pidfile_data = json.loads(content)
1238
+ pid = pidfile_data["pid"]
1239
+ server_id = pidfile_data.get("server_id", "unknown")
1240
+ print(f"Found server {server_id} with PID {pid}")
1241
+ except (json.JSONDecodeError, KeyError):
1242
+ # Fallback to old format
1243
+ pid = int(content)
1244
+ server_id = "unknown"
1245
+
1246
+ # Validate the process before attempting to stop it
1247
+ validation = server._validate_process_identity(pid)
1248
+ if not validation["is_valid"]:
1249
+ print(f"Process {pid} is not valid or no longer exists")
1250
+ server._cleanup_stale_pidfile("stop_command_invalid_process")
1251
+ print("Cleaned up stale PID file")
1252
+ sys.exit(1)
1253
+
1254
+ if validation["is_zombie"]:
1255
+ print(f"Process {pid} is a zombie, cleaning up PID file")
1256
+ server._cleanup_stale_pidfile("stop_command_zombie")
1257
+ sys.exit(0)
1258
+
1259
+ if not validation["is_our_server"]:
1260
+ print(f"Warning: Process {pid} may not be our Socket.IO server")
1261
+ print(f"Command line: {validation['process_info'].get('cmdline', 'unknown')}")
1262
+ response = input("Stop it anyway? [y/N]: ")
1263
+ if response.lower() != 'y':
1264
+ print("Aborted")
1265
+ sys.exit(1)
1266
+
1267
+ # Send termination signal
612
1268
  os.kill(pid, signal.SIGTERM)
613
1269
  print(f"Sent stop signal to server (PID: {pid})")
1270
+
1271
+ # Wait a moment for graceful shutdown
1272
+ time.sleep(2)
1273
+
1274
+ # Check if process is still running
1275
+ try:
1276
+ os.kill(pid, 0)
1277
+ print(f"Server is still running, sending SIGKILL...")
1278
+ os.kill(pid, signal.SIGKILL)
1279
+ time.sleep(1)
1280
+ except OSError:
1281
+ print("Server stopped successfully")
1282
+
614
1283
  except Exception as e:
615
1284
  print(f"Error stopping server: {e}")
616
1285
  sys.exit(1)