claude-mpm 3.4.0__py3-none-any.whl → 3.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_mpm/cli/commands/memory.py +6 -1
- claude_mpm/core/config.py +160 -0
- claude_mpm/hooks/claude_hooks/hook_wrapper.sh +1 -1
- claude_mpm/scripts/socketio_daemon.py +49 -9
- claude_mpm/scripts/socketio_server_manager.py +370 -45
- claude_mpm/services/__init__.py +18 -0
- claude_mpm/services/agent_memory_manager.py +7 -5
- claude_mpm/services/exceptions.py +677 -0
- claude_mpm/services/health_monitor.py +892 -0
- claude_mpm/services/memory_builder.py +4 -2
- claude_mpm/services/memory_optimizer.py +6 -2
- claude_mpm/services/recovery_manager.py +670 -0
- claude_mpm/services/socketio_server.py +188 -11
- claude_mpm/services/standalone_socketio_server.py +703 -34
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.2.dist-info}/METADATA +1 -1
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.2.dist-info}/RECORD +21 -18
- /claude_mpm/{web → dashboard}/open_dashboard.py +0 -0
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.2.dist-info}/WHEEL +0 -0
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.2.dist-info}/entry_points.txt +0 -0
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.2.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.2.dist-info}/top_level.txt +0 -0
|
@@ -10,6 +10,7 @@ with other command modules like agents.py.
|
|
|
10
10
|
"""
|
|
11
11
|
|
|
12
12
|
import json
|
|
13
|
+
import os
|
|
13
14
|
from datetime import datetime
|
|
14
15
|
from pathlib import Path
|
|
15
16
|
|
|
@@ -38,7 +39,11 @@ def manage_memory(args):
|
|
|
38
39
|
try:
|
|
39
40
|
# Load configuration for memory manager
|
|
40
41
|
config = Config()
|
|
41
|
-
|
|
42
|
+
# Use CLAUDE_MPM_USER_PWD if available (when called via shell script),
|
|
43
|
+
# otherwise use current working directory
|
|
44
|
+
user_pwd = os.environ.get('CLAUDE_MPM_USER_PWD', os.getcwd())
|
|
45
|
+
current_dir = Path(user_pwd)
|
|
46
|
+
memory_manager = AgentMemoryManager(config, current_dir)
|
|
42
47
|
|
|
43
48
|
if not args.memory_command:
|
|
44
49
|
# No subcommand - show status
|
claude_mpm/core/config.py
CHANGED
|
@@ -166,9 +166,38 @@ class Config:
|
|
|
166
166
|
# Health monitoring
|
|
167
167
|
"enable_health_monitoring": True,
|
|
168
168
|
"health_check_interval": 30,
|
|
169
|
+
"health_history_size": 100,
|
|
170
|
+
"health_aggregation_window": 300,
|
|
169
171
|
# Metrics
|
|
170
172
|
"enable_metrics": True,
|
|
171
173
|
"metrics_interval": 60,
|
|
174
|
+
# Advanced health monitoring thresholds
|
|
175
|
+
"health_thresholds": {
|
|
176
|
+
"cpu_percent": 80.0,
|
|
177
|
+
"memory_mb": 500,
|
|
178
|
+
"file_descriptors": 1000,
|
|
179
|
+
"max_clients": 1000,
|
|
180
|
+
"max_error_rate": 0.1,
|
|
181
|
+
"network_timeout": 2.0
|
|
182
|
+
},
|
|
183
|
+
# Automatic recovery configuration
|
|
184
|
+
"recovery": {
|
|
185
|
+
"enabled": True,
|
|
186
|
+
"check_interval": 60,
|
|
187
|
+
"max_recovery_attempts": 5,
|
|
188
|
+
"recovery_timeout": 30,
|
|
189
|
+
"circuit_breaker": {
|
|
190
|
+
"failure_threshold": 5,
|
|
191
|
+
"timeout_seconds": 300,
|
|
192
|
+
"success_threshold": 3
|
|
193
|
+
},
|
|
194
|
+
"strategy": {
|
|
195
|
+
"warning_threshold": 2,
|
|
196
|
+
"critical_threshold": 1,
|
|
197
|
+
"failure_window_seconds": 300,
|
|
198
|
+
"min_recovery_interval": 60
|
|
199
|
+
}
|
|
200
|
+
},
|
|
172
201
|
# Service management
|
|
173
202
|
"graceful_shutdown_timeout": 30,
|
|
174
203
|
"startup_timeout": 60,
|
|
@@ -247,6 +276,47 @@ class Config:
|
|
|
247
276
|
"auto_learning": True # Enable auto learning
|
|
248
277
|
}
|
|
249
278
|
}
|
|
279
|
+
},
|
|
280
|
+
# Socket.IO server health and recovery configuration
|
|
281
|
+
"socketio_server": {
|
|
282
|
+
"host": "localhost",
|
|
283
|
+
"port": 8765,
|
|
284
|
+
"enable_health_monitoring": True,
|
|
285
|
+
"enable_recovery": True,
|
|
286
|
+
"health_monitoring": {
|
|
287
|
+
"check_interval": 30,
|
|
288
|
+
"history_size": 100,
|
|
289
|
+
"aggregation_window": 300,
|
|
290
|
+
"thresholds": {
|
|
291
|
+
"cpu_percent": 80.0,
|
|
292
|
+
"memory_mb": 500,
|
|
293
|
+
"file_descriptors": 1000,
|
|
294
|
+
"max_clients": 1000,
|
|
295
|
+
"max_error_rate": 0.1
|
|
296
|
+
}
|
|
297
|
+
},
|
|
298
|
+
"recovery": {
|
|
299
|
+
"enabled": True,
|
|
300
|
+
"max_attempts": 5,
|
|
301
|
+
"timeout": 30,
|
|
302
|
+
"circuit_breaker": {
|
|
303
|
+
"failure_threshold": 5,
|
|
304
|
+
"timeout_seconds": 300,
|
|
305
|
+
"success_threshold": 3
|
|
306
|
+
},
|
|
307
|
+
"strategy": {
|
|
308
|
+
"warning_threshold": 2,
|
|
309
|
+
"critical_threshold": 1,
|
|
310
|
+
"failure_window_seconds": 300,
|
|
311
|
+
"min_recovery_interval": 60
|
|
312
|
+
},
|
|
313
|
+
"actions": {
|
|
314
|
+
"log_warning": True,
|
|
315
|
+
"clear_connections": True,
|
|
316
|
+
"restart_service": True,
|
|
317
|
+
"emergency_stop": True
|
|
318
|
+
}
|
|
319
|
+
}
|
|
250
320
|
}
|
|
251
321
|
}
|
|
252
322
|
|
|
@@ -254,6 +324,9 @@ class Config:
|
|
|
254
324
|
for key, default_value in defaults.items():
|
|
255
325
|
if key not in self._config:
|
|
256
326
|
self._config[key] = default_value
|
|
327
|
+
|
|
328
|
+
# Validate health and recovery configuration
|
|
329
|
+
self._validate_health_recovery_config()
|
|
257
330
|
|
|
258
331
|
def get(self, key: str, default: Any = None) -> Any:
|
|
259
332
|
"""Get configuration value."""
|
|
@@ -349,6 +422,93 @@ class Config:
|
|
|
349
422
|
"""Check if configuration contains a key."""
|
|
350
423
|
return self.get(key) is not None
|
|
351
424
|
|
|
425
|
+
def _validate_health_recovery_config(self) -> None:
    """Validate health monitoring and recovery settings, resetting bad values.

    Every setting listed below is checked independently. A value that is
    out of range, or that cannot be compared with a number at all (for
    example a string or ``None``), is replaced by its default through
    ``self.set`` and a warning is logged. An exception raised while
    reading, checking or resetting one setting is logged as an error and
    does not prevent the remaining settings from being validated.

    A missing value is treated as ``0``; consequently the range-checked
    settings are left alone when absent, while the "must be positive"
    settings are reset to their default.

    Settings checked:
        health_thresholds.cpu_percent               0-100, default 80.0
        health_thresholds.memory_mb                 > 0, default 500
        health_thresholds.max_error_rate            0-1, default 0.1
        recovery.max_recovery_attempts              > 0, default 5
        recovery.circuit_breaker.failure_threshold  > 0, default 5
        recovery.circuit_breaker.timeout_seconds    > 0, default 300
    """
    def _as_mapping(value):
        # A missing or malformed section is treated as empty so that the
        # per-setting lookups below cannot fail on a non-dict value.
        return value if isinstance(value, dict) else {}

    def _outside(low, high):
        return lambda value: value < low or value > high

    def _not_positive(value):
        return value <= 0

    try:
        thresholds = _as_mapping(self.get('health_thresholds', {}))
        recovery_config = _as_mapping(self.get('recovery', {}))
        cb_config = _as_mapping(recovery_config.get('circuit_breaker', {}))
    except Exception as e:
        logger.error(f"Error validating health/recovery configuration: {e}")
        return

    # (section, setting name, key passed to self.set, "is invalid" test,
    #  default value, warning message)
    # NOTE(review): the dotted keys assume self.set accepts dotted paths,
    # exactly as the previous implementation did -- confirm against set().
    rules = (
        (thresholds, 'cpu_percent', 'health_thresholds.cpu_percent',
         _outside(0, 100), 80.0,
         "CPU threshold should be between 0-100, using default 80"),
        (thresholds, 'memory_mb', 'health_thresholds.memory_mb',
         _not_positive, 500,
         "Memory threshold should be positive, using default 500MB"),
        (thresholds, 'max_error_rate', 'health_thresholds.max_error_rate',
         _outside(0, 1), 0.1,
         "Error rate threshold should be between 0-1, using default 0.1"),
        (recovery_config, 'max_recovery_attempts',
         'recovery.max_recovery_attempts',
         _not_positive, 5,
         "Max recovery attempts should be positive, using default 5"),
        (cb_config, 'failure_threshold',
         'recovery.circuit_breaker.failure_threshold',
         _not_positive, 5,
         "Circuit breaker failure threshold should be positive, using default 5"),
        (cb_config, 'timeout_seconds',
         'recovery.circuit_breaker.timeout_seconds',
         _not_positive, 300,
         "Circuit breaker timeout should be positive, using default 300"),
    )

    for section, name, path, is_invalid, default, message in rules:
        try:
            try:
                invalid = is_invalid(section.get(name, 0))
            except TypeError:
                # Values that cannot be ordered against a number cannot be
                # range-checked, so they are treated as invalid.
                invalid = True
            if invalid:
                logger.warning(message)
                self.set(path, default)
        except Exception as e:
            # Keep going: one broken setting must not skip the others.
            logger.error(f"Error validating health/recovery configuration: {e}")
|
|
460
|
+
|
|
461
|
+
def get_health_monitoring_config(self) -> Dict[str, Any]:
    """Return the effective health monitoring configuration.

    The result is built from the top-level settings
    ``enable_health_monitoring``, ``health_check_interval``,
    ``health_history_size``, ``health_aggregation_window`` and
    ``health_thresholds`` (each with a built-in fallback), and is then
    overlaid with whatever is stored under
    ``socketio_server.health_monitoring``.

    The nested ``thresholds`` mapping is merged key by key instead of being
    replaced wholesale, so an override that lists only some thresholds does
    not drop the remaining ones (for example ``network_timeout``). The
    merge works on a copy, so the stored ``health_thresholds`` value is not
    modified.

    Returns:
        A new dictionary with the keys ``enabled``, ``check_interval``,
        ``history_size``, ``aggregation_window`` and ``thresholds``, plus
        any additional keys found in the Socket.IO-specific section.
    """
    default_thresholds = {
        'cpu_percent': 80.0,
        'memory_mb': 500,
        'file_descriptors': 1000,
        'max_clients': 1000,
        'max_error_rate': 0.1,
        'network_timeout': 2.0
    }
    thresholds = self.get('health_thresholds', default_thresholds)
    if isinstance(thresholds, dict):
        # Copy so that merging overrides never mutates the stored config.
        thresholds = dict(thresholds)

    base_config = {
        'enabled': self.get('enable_health_monitoring', True),
        'check_interval': self.get('health_check_interval', 30),
        'history_size': self.get('health_history_size', 100),
        'aggregation_window': self.get('health_aggregation_window', 300),
        'thresholds': thresholds
    }

    # Merge with socketio-specific config if available
    socketio_config = self.get('socketio_server.health_monitoring', {})
    if socketio_config:
        for key, value in socketio_config.items():
            mergeable = (
                key == 'thresholds'
                and isinstance(value, dict)
                and isinstance(base_config.get(key), dict)
            )
            if mergeable:
                # Overlay individual thresholds; keep the ones not overridden.
                base_config[key].update(value)
            else:
                base_config[key] = value

    return base_config
|
|
484
|
+
|
|
485
|
+
def get_recovery_config(self) -> Dict[str, Any]:
    """Return the recovery configuration.

    The ``recovery`` setting is read with the built-in values below passed
    as the fallback to ``self.get``. When ``socketio_server.recovery``
    holds a non-empty value, the two are combined through
    ``self._config_mgr.merge_configs`` and that merged result is returned;
    otherwise the ``recovery`` value is returned as is.
    """
    fallback = {
        'enabled': True,
        'check_interval': 60,
        'max_recovery_attempts': 5,
        'recovery_timeout': 30,
        'circuit_breaker': {
            'failure_threshold': 5,
            'timeout_seconds': 300,
            'success_threshold': 3
        },
        'strategy': {
            'warning_threshold': 2,
            'critical_threshold': 1,
            'failure_window_seconds': 300,
            'min_recovery_interval': 60
        }
    }
    recovery_settings = self.get('recovery', fallback)

    # Layer the Socket.IO-specific overrides on top when any are configured.
    overrides = self.get('socketio_server.recovery', {})
    if not overrides:
        return recovery_settings

    return self._config_mgr.merge_configs(recovery_settings, overrides)
|
|
511
|
+
|
|
352
512
|
def __repr__(self) -> str:
|
|
353
513
|
"""String representation of configuration."""
|
|
354
514
|
return f"<Config({len(self._config)} keys)>"
|
|
@@ -43,11 +43,26 @@ def is_running():
|
|
|
43
43
|
return False
|
|
44
44
|
|
|
45
45
|
def start_server():
|
|
46
|
-
"""Start the Socket.IO server as a daemon."""
|
|
46
|
+
"""Start the Socket.IO server as a daemon with conflict detection."""
|
|
47
47
|
if is_running():
|
|
48
|
-
print("Socket.IO server is already running.")
|
|
48
|
+
print("Socket.IO daemon server is already running.")
|
|
49
|
+
print(f"Use '{__file__} status' for details")
|
|
49
50
|
return
|
|
50
51
|
|
|
52
|
+
# Check for HTTP-managed server conflict
|
|
53
|
+
try:
|
|
54
|
+
import requests
|
|
55
|
+
response = requests.get("http://localhost:8765/health", timeout=1.0)
|
|
56
|
+
if response.status_code == 200:
|
|
57
|
+
data = response.json()
|
|
58
|
+
if 'server_id' in data:
|
|
59
|
+
print(f"⚠️ HTTP-managed server already running: {data.get('server_id')}")
|
|
60
|
+
print(f" Stop it first: socketio_server_manager.py stop --port 8765")
|
|
61
|
+
print(f" Or diagnose: socketio_server_manager.py diagnose")
|
|
62
|
+
return
|
|
63
|
+
except:
|
|
64
|
+
pass # No HTTP server, continue
|
|
65
|
+
|
|
51
66
|
ensure_dirs()
|
|
52
67
|
|
|
53
68
|
# Fork to create daemon
|
|
@@ -96,9 +111,10 @@ def start_server():
|
|
|
96
111
|
signal_handler(signal.SIGINT, None)
|
|
97
112
|
|
|
98
113
|
def stop_server():
|
|
99
|
-
"""Stop the Socket.IO server."""
|
|
114
|
+
"""Stop the Socket.IO daemon server."""
|
|
100
115
|
if not is_running():
|
|
101
|
-
print("Socket.IO server is not running.")
|
|
116
|
+
print("Socket.IO daemon server is not running.")
|
|
117
|
+
print(f"Check for other servers: socketio_server_manager.py status")
|
|
102
118
|
return
|
|
103
119
|
|
|
104
120
|
try:
|
|
@@ -125,11 +141,12 @@ def stop_server():
|
|
|
125
141
|
print(f"Error stopping server: {e}")
|
|
126
142
|
|
|
127
143
|
def status_server():
|
|
128
|
-
"""Check server status."""
|
|
144
|
+
"""Check server status with manager integration info."""
|
|
129
145
|
if is_running():
|
|
130
146
|
with open(PID_FILE) as f:
|
|
131
147
|
pid = int(f.read().strip())
|
|
132
|
-
print(f"Socket.IO server is running (PID: {pid})")
|
|
148
|
+
print(f"Socket.IO daemon server is running (PID: {pid})")
|
|
149
|
+
print(f"PID file: {PID_FILE}")
|
|
133
150
|
|
|
134
151
|
# Check if port is listening
|
|
135
152
|
try:
|
|
@@ -138,13 +155,36 @@ def status_server():
|
|
|
138
155
|
result = sock.connect_ex(('localhost', 8765))
|
|
139
156
|
sock.close()
|
|
140
157
|
if result == 0:
|
|
141
|
-
print("Server is listening on port 8765")
|
|
158
|
+
print("✅ Server is listening on port 8765")
|
|
159
|
+
print("🔧 Management style: daemon")
|
|
142
160
|
else:
|
|
143
|
-
print("WARNING: Server process exists but port 8765 is not accessible")
|
|
161
|
+
print("⚠️ WARNING: Server process exists but port 8765 is not accessible")
|
|
162
|
+
except:
|
|
163
|
+
pass
|
|
164
|
+
|
|
165
|
+
# Show management commands
|
|
166
|
+
print("\n🔧 Management Commands:")
|
|
167
|
+
print(f" • Stop: {__file__} stop")
|
|
168
|
+
print(f" • Restart: {__file__} restart")
|
|
169
|
+
|
|
170
|
+
# Check for manager conflicts
|
|
171
|
+
try:
|
|
172
|
+
import requests
|
|
173
|
+
response = requests.get("http://localhost:8765/health", timeout=1.0)
|
|
174
|
+
if response.status_code == 200:
|
|
175
|
+
data = response.json()
|
|
176
|
+
if 'server_id' in data and data.get('server_id') != 'daemon-socketio':
|
|
177
|
+
print(f"\n⚠️ POTENTIAL CONFLICT: HTTP-managed server also detected")
|
|
178
|
+
print(f" Server ID: {data.get('server_id')}")
|
|
179
|
+
print(f" Use 'socketio_server_manager.py diagnose' to resolve")
|
|
144
180
|
except:
|
|
145
181
|
pass
|
|
182
|
+
|
|
146
183
|
else:
|
|
147
|
-
print("Socket.IO server is not running")
|
|
184
|
+
print("Socket.IO daemon server is not running")
|
|
185
|
+
print(f"\n🔧 Start Commands:")
|
|
186
|
+
print(f" • Daemon: {__file__} start")
|
|
187
|
+
print(f" • HTTP-managed: socketio_server_manager.py start")
|
|
148
188
|
|
|
149
189
|
def main():
|
|
150
190
|
"""Main entry point."""
|