claude-mpm 3.4.0__py3-none-any.whl → 3.4.2__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
- claude_mpm/cli/commands/memory.py +6 -1
- claude_mpm/core/config.py +160 -0
- claude_mpm/hooks/claude_hooks/hook_wrapper.sh +1 -1
- claude_mpm/scripts/socketio_daemon.py +49 -9
- claude_mpm/scripts/socketio_server_manager.py +370 -45
- claude_mpm/services/__init__.py +18 -0
- claude_mpm/services/agent_memory_manager.py +7 -5
- claude_mpm/services/exceptions.py +677 -0
- claude_mpm/services/health_monitor.py +892 -0
- claude_mpm/services/memory_builder.py +4 -2
- claude_mpm/services/memory_optimizer.py +6 -2
- claude_mpm/services/recovery_manager.py +670 -0
- claude_mpm/services/socketio_server.py +188 -11
- claude_mpm/services/standalone_socketio_server.py +703 -34
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.2.dist-info}/METADATA +1 -1
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.2.dist-info}/RECORD +21 -18
- /claude_mpm/{web → dashboard}/open_dashboard.py +0 -0
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.2.dist-info}/WHEEL +0 -0
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.2.dist-info}/entry_points.txt +0 -0
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.2.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.2.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,68 @@ from pathlib import Path
 from typing import Dict, Any, Optional, List, Set
 from collections import deque
 import importlib.metadata
+import fcntl # Unix file locking
+import platform
+
+# Import health monitoring and recovery systems
+try:
+    from .health_monitor import (
+        AdvancedHealthMonitor, ProcessResourceChecker,
+        NetworkConnectivityChecker, ServiceHealthChecker,
+        HealthStatus, HealthCheckResult
+    )
+    from .recovery_manager import RecoveryManager, RecoveryEvent
+    HEALTH_MONITORING_AVAILABLE = True
+except ImportError as e:
+    HEALTH_MONITORING_AVAILABLE = False
+    # Create stub classes to prevent errors
+    class AdvancedHealthMonitor:
+        def __init__(self, *args, **kwargs): pass
+        def add_checker(self, *args): pass
+        def start_monitoring(self): pass
+        async def stop_monitoring(self): pass
+        def get_current_status(self): return None
+        def export_diagnostics(self): return {}
+
+    class RecoveryManager:
+        def __init__(self, *args, **kwargs): pass
+        def handle_health_result(self, *args): return None
+        def get_recovery_status(self): return {}
+
+# Import enhanced error classes
+try:
+    from .exceptions import (
+        DaemonConflictError, PortConflictError, StaleProcessError,
+        RecoveryFailedError, HealthCheckError, format_troubleshooting_guide
+    )
+    ENHANCED_ERRORS_AVAILABLE = True
+except ImportError as e:
+    ENHANCED_ERRORS_AVAILABLE = False
+    # Create stub classes to prevent errors
+    class DaemonConflictError(Exception): pass
+    class PortConflictError(Exception): pass
+    class StaleProcessError(Exception): pass
+    class RecoveryFailedError(Exception): pass
+    class HealthCheckError(Exception): pass
+    def format_troubleshooting_guide(error): return str(error)
+
+try:
+    import psutil
+    PSUTIL_AVAILABLE = True
+except ImportError:
+    PSUTIL_AVAILABLE = False
+    psutil = None
+
+# Windows file locking support
+if platform.system() == 'Windows':
+    try:
+        import msvcrt
+        WINDOWS_LOCKING = True
+    except ImportError:
+        WINDOWS_LOCKING = False
+else:
+    WINDOWS_LOCKING = False
+    msvcrt = None

 try:
     import socketio
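The guarded imports added above all follow one pattern: try the import, record the outcome in a *_AVAILABLE flag, and fall back to no-op stubs so later attribute access never fails. A minimal sketch of the same pattern (the module name fancy_metrics is hypothetical, used only for illustration):

    # Guarded-import sketch; "fancy_metrics" is a hypothetical module.
    try:
        from fancy_metrics import MetricsCollector
        METRICS_AVAILABLE = True
    except ImportError:
        METRICS_AVAILABLE = False

        class MetricsCollector:  # no-op stub so callers never crash
            def __init__(self, *args, **kwargs): pass
            def record(self, *args, **kwargs): pass

    collector = MetricsCollector()
    collector.record("startup")  # works whether or not the real module exists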
@@ -65,7 +127,13 @@ COMPATIBILITY_MATRIX = {
             "version_compatibility",
             "process_isolation",
             "health_monitoring",
-            "
+            "advanced_health_monitoring",
+            "automatic_recovery",
+            "circuit_breaker",
+            "resource_monitoring",
+            "event_namespacing",
+            "comprehensive_diagnostics",
+            "metrics_export"
         ]
     }
 }
@@ -115,14 +183,35 @@ class StandaloneSocketIOServer:
         # Process management
         self.pid = os.getpid()
         self.pidfile_path = self._get_pidfile_path()
+        self.pidfile_lock = None # File lock object
+        self.process_start_time = None
+        if PSUTIL_AVAILABLE:
+            try:
+                current_process = psutil.Process(self.pid)
+                self.process_start_time = current_process.create_time()
+            except Exception as e:
+                self.logger.warning(f"Could not get process start time: {e}")

         if not SOCKETIO_AVAILABLE:
             self.logger.error("Socket.IO dependencies not available. Install with: pip install python-socketio aiohttp")
             return

+        # Log initialization with comprehensive info
         self.logger.info(f"Standalone Socket.IO server v{self.server_version} initialized")
         self.logger.info(f"Server ID: {self.server_id}, PID: {self.pid}")
         self.logger.info(f"Using python-socketio v{SOCKETIO_VERSION}")
+        self.logger.info(f"Enhanced validation: psutil {'available' if PSUTIL_AVAILABLE else 'not available'}")
+        self.logger.info(f"File locking: {platform.system()} {'supported' if (platform.system() != 'Windows' or WINDOWS_LOCKING) else 'not supported'}")
+        self.logger.info(f"Health monitoring: {'available' if HEALTH_MONITORING_AVAILABLE else 'not available'}")
+
+        # Initialize health monitoring system
+        self.health_monitor = None
+        self.recovery_manager = None
+        if HEALTH_MONITORING_AVAILABLE:
+            self._initialize_health_monitoring()
+
+        if self.process_start_time:
+            self.logger.debug(f"Process start time: {self.process_start_time}")

     def _setup_logging(self) -> logging.Logger:
         """Setup dedicated logging for standalone server."""
@@ -139,6 +228,92 @@ class StandaloneSocketIOServer:

         return logger

+    def _initialize_health_monitoring(self):
+        """Initialize health monitoring and recovery systems."""
+        try:
+            # Health monitoring configuration
+            health_config = {
+                'check_interval': 30, # Check every 30 seconds
+                'history_size': 100, # Keep 100 health check results
+                'aggregation_window': 300 # 5 minute aggregation window
+            }
+
+            self.health_monitor = AdvancedHealthMonitor(health_config)
+
+            # Add health checkers
+
+            # Process resource monitoring
+            if PSUTIL_AVAILABLE:
+                process_checker = ProcessResourceChecker(
+                    pid=self.pid,
+                    cpu_threshold=80.0, # 80% CPU threshold
+                    memory_threshold_mb=500, # 500MB memory threshold
+                    fd_threshold=1000 # 1000 file descriptor threshold
+                )
+                self.health_monitor.add_checker(process_checker)
+
+            # Network connectivity monitoring
+            network_checker = NetworkConnectivityChecker(
+                host=self.host,
+                port=self.port,
+                timeout=2.0
+            )
+            self.health_monitor.add_checker(network_checker)
+
+            # Service health monitoring (will be initialized after server stats are available)
+            # This is added later in start_async after health_stats is fully initialized
+
+            # Recovery manager configuration
+            recovery_config = {
+                'enabled': True,
+                'check_interval': 60,
+                'max_recovery_attempts': 5,
+                'recovery_timeout': 30,
+                'circuit_breaker': {
+                    'failure_threshold': 5,
+                    'timeout_seconds': 300,
+                    'success_threshold': 3
+                },
+                'strategy': {
+                    'warning_threshold': 2,
+                    'critical_threshold': 1,
+                    'failure_window_seconds': 300,
+                    'min_recovery_interval': 60
+                }
+            }
+
+            self.recovery_manager = RecoveryManager(recovery_config, self)
+
+            # Link health monitor and recovery manager
+            self.health_monitor.add_health_callback(self._handle_health_result)
+
+            self.logger.info("Health monitoring and recovery systems initialized")
+
+        except Exception as e:
+            self.logger.error(f"Failed to initialize health monitoring: {e}")
+            self.health_monitor = None
+            self.recovery_manager = None
+
+    def _handle_health_result(self, health_result: HealthCheckResult):
+        """Handle health check results and trigger recovery if needed."""
+        try:
+            if self.recovery_manager:
+                recovery_event = self.recovery_manager.handle_health_result(health_result)
+                if recovery_event:
+                    self.logger.info(f"Recovery triggered: {recovery_event.action.value}")
+        except Exception as e:
+            self.logger.error(f"Error handling health result: {e}")
+
+        # Enhanced error reporting for health check failures
+        if ENHANCED_ERRORS_AVAILABLE:
+            if hasattr(health_result, 'status') and health_result.status in ['critical', 'failed']:
+                health_error = HealthCheckError(
+                    check_name=getattr(health_result, 'check_name', 'unknown'),
+                    check_status=getattr(health_result, 'status', 'failed'),
+                    check_details=getattr(health_result, 'details', {})
+                )
+                self.logger.error(f"\nHealth Check Failure Details:\n{health_error}")
+
     def _get_pidfile_path(self) -> Path:
         """Get path for PID file to track running server."""
         # Use system temp directory or user home
@@ -182,52 +357,442 @@ class StandaloneSocketIOServer:

         return result

-    def
-        """
+    def _validate_process_identity(self, pid: int, expected_cmdline_patterns: List[str] = None) -> Dict[str, Any]:
+        """Validate that a process is actually our Socket.IO server.
+
+        Args:
+            pid: Process ID to validate
+            expected_cmdline_patterns: Command line patterns that should match our server
+
+        Returns:
+            Dict with validation results and process info
+        """
+        validation_result = {
+            "is_valid": False,
+            "is_zombie": False,
+            "is_our_server": False,
+            "process_info": {},
+            "validation_errors": []
+        }
+
+        if not PSUTIL_AVAILABLE:
+            validation_result["validation_errors"].append("psutil not available for enhanced validation")
+            # Fallback to basic process existence check
+            try:
+                os.kill(pid, 0)
+                validation_result["is_valid"] = True
+                validation_result["process_info"] = {"pid": pid, "method": "basic_os_check"}
+            except OSError:
+                validation_result["validation_errors"].append(f"Process {pid} does not exist")
+            return validation_result
+
         try:
-
-
+            process = psutil.Process(pid)
+
+            # Basic process info
+            process_info = {
+                "pid": pid,
+                "status": process.status(),
+                "create_time": process.create_time(),
+                "name": process.name(),
+                "cwd": None,
+                "cmdline": [],
+                "memory_info": None
+            }
+
+            # Check if process is zombie
+            if process.status() == psutil.STATUS_ZOMBIE:
+                validation_result["is_zombie"] = True
+                validation_result["validation_errors"].append(f"Process {pid} is a zombie")
+                validation_result["process_info"] = process_info
+                return validation_result
+
+            # Get additional process details
+            try:
+                process_info["cwd"] = process.cwd()
+                process_info["cmdline"] = process.cmdline()
+                process_info["memory_info"] = process.memory_info()._asdict()
+            except (psutil.AccessDenied, psutil.NoSuchProcess) as e:
+                validation_result["validation_errors"].append(f"Access denied getting process details: {e}")
+
+            validation_result["process_info"] = process_info
+            validation_result["is_valid"] = True
+
+            # Validate this is likely our server process
+            cmdline = process_info.get("cmdline", [])
+            cmdline_str = " ".join(cmdline).lower()
+
+            # Default patterns for our Socket.IO server
+            if expected_cmdline_patterns is None:
+                expected_cmdline_patterns = [
+                    "socketio",
+                    "standalone_socketio_server",
+                    "claude-mpm",
+                    str(self.port)
+                ]
+
+            # Check if any patterns match the command line
+            matches = [pattern.lower() in cmdline_str for pattern in expected_cmdline_patterns]
+            if any(matches):
+                validation_result["is_our_server"] = True
+                self.logger.debug(f"Process {pid} matches server patterns: {[p for p, m in zip(expected_cmdline_patterns, matches) if m]}")
+            else:
+                validation_result["validation_errors"].append(
+                    f"Process {pid} command line '{cmdline_str}' does not match expected patterns: {expected_cmdline_patterns}"
+                )
+                self.logger.warning(f"Process {pid} does not appear to be our server: {cmdline}")
+
+        except psutil.NoSuchProcess:
+            validation_result["validation_errors"].append(f"Process {pid} no longer exists")
+        except psutil.AccessDenied as e:
+            validation_result["validation_errors"].append(f"Access denied to process {pid}: {e}")
+        except Exception as e:
+            validation_result["validation_errors"].append(f"Error validating process {pid}: {e}")
+
+        return validation_result
+
+    def _acquire_pidfile_lock(self, pidfile_fd) -> bool:
+        """Acquire exclusive lock on PID file.
+
+        Args:
+            pidfile_fd: Open file descriptor for PID file
+
+        Returns:
+            True if lock acquired successfully, False otherwise
+        """
+        try:
+            if platform.system() == 'Windows' and WINDOWS_LOCKING:
+                # Windows file locking
+                msvcrt.locking(pidfile_fd.fileno(), msvcrt.LK_NBLCK, 1)
+                return True
+            else:
+                # Unix file locking
+                fcntl.flock(pidfile_fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
+                return True
+        except (IOError, OSError) as e:
+            self.logger.debug(f"Could not acquire PID file lock: {e}")
+            return False
+
+    def _release_pidfile_lock(self, pidfile_fd):
+        """Release lock on PID file.
+
+        Args:
+            pidfile_fd: Open file descriptor for PID file
+        """
+        try:
+            if platform.system() == 'Windows' and WINDOWS_LOCKING:
+                msvcrt.locking(pidfile_fd.fileno(), msvcrt.LK_UNLCK, 1)
+            else:
+                fcntl.flock(pidfile_fd.fileno(), fcntl.LOCK_UN)
+        except (IOError, OSError) as e:
+            self.logger.debug(f"Error releasing PID file lock: {e}")
+
+    def _validate_pidfile_timestamp(self, pidfile_path: Path, process_start_time: float) -> bool:
+        """Validate that PID file was created around the same time as the process.
+
+        Args:
+            pidfile_path: Path to PID file
+            process_start_time: Process start time from psutil
+
+        Returns:
+            True if timestamps are reasonably close, False otherwise
+        """
+        try:
+            pidfile_mtime = pidfile_path.stat().st_mtime
+            time_diff = abs(pidfile_mtime - process_start_time)
+
+            # Allow up to 5 seconds difference (process start vs file creation)
+            if time_diff <= 5.0:
+                return True
+            else:
+                self.logger.warning(
+                    f"PID file timestamp ({pidfile_mtime}) and process start time ({process_start_time}) "
+                    f"differ by {time_diff:.2f} seconds"
+                )
+                return False
+        except Exception as e:
+            self.logger.warning(f"Could not validate PID file timestamp: {e}")
+            return False
+
+    def is_already_running(self, raise_on_conflict: bool = False) -> bool:
+        """Enhanced check if another server instance is already running on this port.
+
+        This method performs comprehensive validation including:
+        - PID file existence and validity
+        - Process identity verification (command line, start time)
+        - Zombie process detection
+        - Port availability check
+        - Automatic cleanup of stale PID files
+
+        Returns:
+            True if a valid server is already running, False otherwise
+        """
+        self.logger.debug(f"Checking if server is already running on {self.host}:{self.port}")
+
+        try:
+            # Step 1: Check PID file existence
+            if not self.pidfile_path.exists():
+                self.logger.debug("No PID file found")
+                return self._check_port_only(raise_on_conflict)
+
+            self.logger.debug(f"Found PID file: {self.pidfile_path}")
+
+            # Step 2: Read PID from file with support for both JSON and legacy formats
+            try:
                 with open(self.pidfile_path, 'r') as f:
-
-
-
+                    pid_content = f.read().strip()
+
+                if not pid_content:
+                    self.logger.warning("Empty PID file")
+                    self._cleanup_stale_pidfile("empty_file")
+                    return self._check_port_only(raise_on_conflict)
+
+                # Try JSON format first (new format)
+                try:
+                    pidfile_data = json.loads(pid_content)
+                    old_pid = pidfile_data["pid"]
+                    server_id = pidfile_data.get("server_id", "unknown")
+                    self.logger.debug(f"Found PID {old_pid} for server {server_id} in JSON format")
+                except (json.JSONDecodeError, KeyError, TypeError):
+                    # Fallback to legacy format (plain PID number)
+                    if pid_content.isdigit():
+                        old_pid = int(pid_content)
+                        self.logger.debug(f"Found PID {old_pid} in legacy format")
+                    else:
+                        self.logger.warning(f"Invalid PID content in file: '{pid_content[:100]}...' (truncated)")
+                        self._cleanup_stale_pidfile("invalid_content")
+                        return self._check_port_only(raise_on_conflict)
+
+            except (IOError, ValueError) as e:
+                self.logger.warning(f"Could not read PID file: {e}")
+                self._cleanup_stale_pidfile("read_error")
+                return self._check_port_only(raise_on_conflict)
+
+            # Step 3: Enhanced process validation
+            validation = self._validate_process_identity(old_pid)
+
+            if not validation["is_valid"]:
+                self.logger.info(f"Process {old_pid} is not valid: {validation['validation_errors']}")
+                if raise_on_conflict and ENHANCED_ERRORS_AVAILABLE:
+                    raise StaleProcessError(
+                        pid=old_pid,
+                        pidfile_path=self.pidfile_path,
+                        process_status="not_found",
+                        validation_errors=validation['validation_errors']
+                    )
+                self._cleanup_stale_pidfile("invalid_process")
+                return self._check_port_only(raise_on_conflict)
+
+            if validation["is_zombie"]:
+                self.logger.info(f"Process {old_pid} is a zombie, cleaning up")
+                if raise_on_conflict and ENHANCED_ERRORS_AVAILABLE:
+                    raise StaleProcessError(
+                        pid=old_pid,
+                        pidfile_path=self.pidfile_path,
+                        process_status="zombie",
+                        validation_errors=["Process is a zombie (terminated but not reaped)"]
+                    )
+                self._cleanup_stale_pidfile("zombie_process")
+                return self._check_port_only(raise_on_conflict)
+
+            # Step 4: Verify this is actually our server process
+            if not validation["is_our_server"]:
+                self.logger.warning(
+                    f"Process {old_pid} exists but does not appear to be our Socket.IO server. "
+                    f"Command line: {validation['process_info'].get('cmdline', 'unknown')}"
+                )
+                # Don't automatically clean up - might be another legitimate process
+                return self._check_port_only(raise_on_conflict)
+
+            # Step 5: Validate process start time against PID file timestamp
+            if PSUTIL_AVAILABLE and 'create_time' in validation['process_info']:
+                process_start_time = validation['process_info']['create_time']
+                if not self._validate_pidfile_timestamp(self.pidfile_path, process_start_time):
+                    self.logger.warning("PID file timestamp does not match process start time")
+                    # Continue anyway - timestamp validation is not critical
+
+            # Step 6: All validations passed
+            process_info = validation['process_info']
+            self.logger.info(
+                f"Found valid running server: PID {old_pid}, "
+                f"status: {process_info.get('status', 'unknown')}, "
+                f"name: {process_info.get('name', 'unknown')}"
+            )
+
+            if raise_on_conflict and ENHANCED_ERRORS_AVAILABLE:
+                # Try to extract server ID from PID file if available
+                server_id = "unknown"
                 try:
-
-
-
-
-
-
+                    with open(self.pidfile_path, 'r') as f:
+                        content = f.read().strip()
+                    if content.startswith('{'):
+                        pidfile_data = json.loads(content)
+                        server_id = pidfile_data.get("server_id", "unknown")
+                except:
+                    pass
+
+                raise DaemonConflictError(
+                    port=self.port,
+                    existing_pid=old_pid,
+                    existing_server_id=server_id,
+                    process_info=process_info,
+                    pidfile_path=self.pidfile_path
+                )
+
+            return True
+
+        except (DaemonConflictError, StaleProcessError, PortConflictError) as e:
+            # Re-raise our enhanced errors instead of catching them
+            raise
+        except Exception as e:
+            self.logger.error(f"Error during enhanced server check: {e}")
+            # Fallback to basic port check on unexpected errors
+            return self._check_port_only(raise_on_conflict)
+
+    def _check_port_only(self, raise_on_conflict: bool = False) -> bool:
+        """Fallback method to check if port is in use.
+
+        Args:
+            raise_on_conflict: If True, raises PortConflictError instead of returning True
+
+        Returns:
+            True if port is in use, False otherwise

-
+        Raises:
+            PortConflictError: If raise_on_conflict=True and port is in use
+        """
+        try:
             with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                 s.settimeout(1.0)
                 result = s.connect_ex((self.host, self.port))
                 if result == 0:
-                    self.logger.info(f"Port {self.port} is
-                    return True
+                    self.logger.info(f"Port {self.port} is in use by some process")

+                    if raise_on_conflict and ENHANCED_ERRORS_AVAILABLE:
+                        # Try to identify the conflicting process if psutil is available
+                        conflicting_process = {}
+                        if PSUTIL_AVAILABLE:
+                            try:
+                                import psutil
+                                for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
+                                    try:
+                                        for conn in proc.connections():
+                                            if (conn.laddr.ip == self.host or conn.laddr.ip == '0.0.0.0') and conn.laddr.port == self.port:
+                                                conflicting_process = {
+                                                    'pid': proc.info['pid'],
+                                                    'name': proc.info['name'],
+                                                    'cmdline': proc.info['cmdline']
+                                                }
+                                                break
+                                    except (psutil.NoSuchProcess, psutil.AccessDenied):
+                                        continue
+                                    if conflicting_process:
+                                        break
+                            except Exception:
+                                pass # Ignore errors in process discovery
+
+                        raise PortConflictError(
+                            port=self.port,
+                            host=self.host,
+                            conflicting_process=conflicting_process
+                        )
+
+                    return True
         except Exception as e:
-
+            if not isinstance(e, PortConflictError): # Don't mask our own exceptions
+                self.logger.debug(f"Error checking port availability: {e}")

         return False

+    def _cleanup_stale_pidfile(self, reason: str):
+        """Clean up stale PID file with logging.
+
+        Args:
+            reason: Reason for cleanup (for logging)
+        """
+        try:
+            if self.pidfile_path.exists():
+                self.pidfile_path.unlink()
+                self.logger.info(f"Cleaned up stale PID file (reason: {reason}): {self.pidfile_path}")
+        except Exception as e:
+            self.logger.error(f"Failed to clean up stale PID file: {e}")
+
     def create_pidfile(self):
-        """Create PID file to track this server instance.
+        """Create PID file with exclusive locking to track this server instance.
+
+        This method creates a PID file with exclusive locking to prevent race conditions
+        and ensures only one server instance can hold the lock at a time.
+        """
         try:
             self.pidfile_path.parent.mkdir(parents=True, exist_ok=True)
-
-
-
+
+            # Open file for writing with exclusive creation
+            pidfile_fd = open(self.pidfile_path, 'w')
+
+            # Try to acquire exclusive lock
+            if not self._acquire_pidfile_lock(pidfile_fd):
+                pidfile_fd.close()
+                if ENHANCED_ERRORS_AVAILABLE:
+                    raise DaemonConflictError(
+                        port=self.port,
+                        existing_pid=0, # Unknown PID since we can't get lock
+                        existing_server_id="unknown",
+                        pidfile_path=self.pidfile_path
+                    )
+                else:
+                    raise RuntimeError("Could not acquire exclusive lock on PID file")
+
+            # Write PID and additional metadata
+            pidfile_content = {
+                "pid": self.pid,
+                "server_id": self.server_id,
+                "server_version": self.server_version,
+                "port": self.port,
+                "host": self.host,
+                "start_time": self.start_time.isoformat() + "Z",
+                "process_start_time": self.process_start_time if self.process_start_time else None,
+                "python_version": sys.version.split()[0],
+                "platform": platform.system(),
+                "created_at": datetime.utcnow().isoformat() + "Z"
+            }
+
+            # Write JSON format for better validation
+            pidfile_fd.write(json.dumps(pidfile_content, indent=2))
+            pidfile_fd.flush()
+
+            # Keep file descriptor open to maintain lock
+            self.pidfile_lock = pidfile_fd
+
+            self.logger.info(f"Created PID file with exclusive lock: {self.pidfile_path}")
+            self.logger.debug(f"PID file content: {pidfile_content}")
+
         except Exception as e:
             self.logger.error(f"Failed to create PID file: {e}")
+            if 'pidfile_fd' in locals():
+                try:
+                    pidfile_fd.close()
+                except:
+                    pass
+            raise

     def remove_pidfile(self):
-        """Remove PID file on shutdown."""
+        """Remove PID file and release lock on shutdown."""
         try:
+            # Release file lock first
+            if self.pidfile_lock:
+                try:
+                    self._release_pidfile_lock(self.pidfile_lock)
+                    self.pidfile_lock.close()
+                    self.pidfile_lock = None
+                    self.logger.debug("Released PID file lock")
+                except Exception as e:
+                    self.logger.warning(f"Error releasing PID file lock: {e}")
+
+            # Remove PID file
             if self.pidfile_path.exists():
                 self.pidfile_path.unlink()
                 self.logger.info(f"Removed PID file: {self.pidfile_path}")
+
         except Exception as e:
             self.logger.error(f"Failed to remove PID file: {e}")

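The new create_pidfile/remove_pidfile pair holds a non-blocking advisory lock on the PID file for the lifetime of the process, so a second instance fails fast instead of overwriting the file. A minimal standalone sketch of that behavior, assuming a POSIX system (the msvcrt branch on Windows is analogous); the path is illustrative, not the one _get_pidfile_path() actually returns:

    import fcntl
    import json
    import os

    # POSIX-only sketch of the advisory-lock pattern used by create_pidfile.
    # The path below is illustrative, not the package's real PID file location.
    PIDFILE = "/tmp/example_socketio_server.pid"

    fd = open(PIDFILE, "w")
    try:
        # LOCK_EX | LOCK_NB: fail immediately if another process holds the lock
        fcntl.flock(fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except OSError:
        fd.close()
        raise SystemExit("another instance already holds the PID file lock")

    # Write JSON metadata and keep fd open; closing it would drop the lock
    fd.write(json.dumps({"pid": os.getpid()}, indent=2))
    fd.flush()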
@@ -247,7 +812,11 @@ class StandaloneSocketIOServer:
     async def start_async(self):
         """Start the server asynchronously."""
         if not SOCKETIO_AVAILABLE:
-
+            error_msg = "Socket.IO dependencies not available. Install with: pip install python-socketio aiohttp"
+            if ENHANCED_ERRORS_AVAILABLE:
+                raise RuntimeError(error_msg + "\n\nInstallation steps:\n 1. pip install python-socketio aiohttp\n 2. Restart the server\n 3. Verify installation: python -c 'import socketio; print(socketio.__version__)'")
+            else:
+                raise RuntimeError(error_msg)

         self.logger.info(f"Starting standalone Socket.IO server v{self.server_version}")

@@ -271,14 +840,53 @@ class StandaloneSocketIOServer:
         self._setup_event_handlers()

         # Start the server
-
-
-
-
-
-
-
-
+        try:
+            self.runner = web.AppRunner(self.app)
+            await self.runner.setup()
+
+            self.site = web.TCPSite(self.runner, self.host, self.port)
+            await self.site.start()
+
+            self.running = True
+
+            # Create PID file after successful server start
+            self.create_pidfile()
+
+            # Start health monitoring
+            if self.health_monitor:
+                # Add service health checker now that stats are available
+                service_checker = ServiceHealthChecker(
+                    service_stats=self.health_stats,
+                    max_clients=1000,
+                    max_error_rate=0.1
+                )
+                self.health_monitor.add_checker(service_checker)
+
+                # Start monitoring
+                self.health_monitor.start_monitoring()
+                self.logger.info("Health monitoring started")
+
+        except Exception as e:
+            self.logger.error(f"Failed to start server: {e}")
+            # Clean up partial initialization
+            if hasattr(self, 'runner') and self.runner:
+                try:
+                    await self.runner.cleanup()
+                except:
+                    pass
+
+            # Enhanced error handling for common startup failures
+            if ENHANCED_ERRORS_AVAILABLE:
+                if "Address already in use" in str(e) or "Permission denied" in str(e):
+                    # This is likely a port conflict
+                    try:
+                        # Check if port is in use and raise appropriate error
+                        self._check_port_only(raise_on_conflict=True)
+                    except PortConflictError:
+                        # Re-raise the more specific error
+                        raise
+
+            raise

         self.logger.info(f"🚀 Standalone Socket.IO server STARTED on http://{self.host}:{self.port}")
         self.logger.info(f"🔧 Server ID: {self.server_id}")
@@ -346,6 +954,11 @@ class StandaloneSocketIOServer:
     async def _shutdown_async(self):
         """Async shutdown process."""
         try:
+            # Stop health monitoring
+            if self.health_monitor:
+                await self.health_monitor.stop_monitoring()
+                self.logger.info("Health monitoring stopped")
+
             # Close all client connections
             if self.sio:
                 await self.sio.shutdown()
@@ -534,6 +1147,12 @@ class StandaloneSocketIOServer:
             except Exception as e:
                 self.logger.error(f"Error handling claude_event: {e}")
                 self.health_stats["errors"] += 1
+
+                # Check if error rate is becoming concerning
+                if ENHANCED_ERRORS_AVAILABLE and self.health_stats["errors"] > 0:
+                    error_rate = self.health_stats["errors"] / max(self.health_stats["events_processed"], 1)
+                    if error_rate > 0.1: # More than 10% error rate
+                        self.logger.warning(f"⚠️ High error rate detected: {error_rate:.2%} ({self.health_stats['errors']} errors out of {self.health_stats['events_processed']} events)")

         @self.sio.event
         async def get_history(sid, data=None):
@@ -571,6 +1190,8 @@ class StandaloneSocketIOServer:
 def main():
     """Main entry point for standalone server execution."""
     import argparse
+    import json
+    import time

     parser = argparse.ArgumentParser(description="Standalone Claude MPM Socket.IO Server")
     parser.add_argument("--host", default="localhost", help="Host to bind to")
@@ -605,12 +1226,60 @@ def main():

     if args.stop:
         if server.is_already_running():
-            # Send termination signal to running server
+            # Send termination signal to running server with enhanced validation
             try:
+                # Read and validate PID file
                 with open(server.pidfile_path, 'r') as f:
-
+                    content = f.read().strip()
+
+                # Try to parse as JSON first (new format), fallback to plain PID
+                try:
+                    pidfile_data = json.loads(content)
+                    pid = pidfile_data["pid"]
+                    server_id = pidfile_data.get("server_id", "unknown")
+                    print(f"Found server {server_id} with PID {pid}")
+                except (json.JSONDecodeError, KeyError):
+                    # Fallback to old format
+                    pid = int(content)
+                    server_id = "unknown"
+
+                # Validate the process before attempting to stop it
+                validation = server._validate_process_identity(pid)
+                if not validation["is_valid"]:
+                    print(f"Process {pid} is not valid or no longer exists")
+                    server._cleanup_stale_pidfile("stop_command_invalid_process")
+                    print("Cleaned up stale PID file")
+                    sys.exit(1)
+
+                if validation["is_zombie"]:
+                    print(f"Process {pid} is a zombie, cleaning up PID file")
+                    server._cleanup_stale_pidfile("stop_command_zombie")
+                    sys.exit(0)
+
+                if not validation["is_our_server"]:
+                    print(f"Warning: Process {pid} may not be our Socket.IO server")
+                    print(f"Command line: {validation['process_info'].get('cmdline', 'unknown')}")
+                    response = input("Stop it anyway? [y/N]: ")
+                    if response.lower() != 'y':
+                        print("Aborted")
+                        sys.exit(1)
+
+                # Send termination signal
                 os.kill(pid, signal.SIGTERM)
                 print(f"Sent stop signal to server (PID: {pid})")
+
+                # Wait a moment for graceful shutdown
+                time.sleep(2)
+
+                # Check if process is still running
+                try:
+                    os.kill(pid, 0)
+                    print(f"Server is still running, sending SIGKILL...")
+                    os.kill(pid, signal.SIGKILL)
+                    time.sleep(1)
+                except OSError:
+                    print("Server stopped successfully")
+
             except Exception as e:
                 print(f"Error stopping server: {e}")
                 sys.exit(1)