claude-mpm 3.4.0__py3-none-any.whl → 3.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_mpm/agents/templates/.claude-mpm/memories/README.md +36 -0
- claude_mpm/cli/commands/memory.py +6 -1
- claude_mpm/core/config.py +160 -0
- claude_mpm/hooks/claude_hooks/hook_wrapper.sh +1 -1
- claude_mpm/scripts/socketio_daemon.py +49 -9
- claude_mpm/scripts/socketio_server_manager.py +370 -45
- claude_mpm/services/__init__.py +18 -0
- claude_mpm/services/agent_memory_manager.py +7 -5
- claude_mpm/services/exceptions.py +677 -0
- claude_mpm/services/health_monitor.py +892 -0
- claude_mpm/services/memory_builder.py +4 -2
- claude_mpm/services/memory_optimizer.py +6 -2
- claude_mpm/services/memory_router.py +99 -6
- claude_mpm/services/recovery_manager.py +670 -0
- claude_mpm/services/socketio_server.py +188 -11
- claude_mpm/services/standalone_socketio_server.py +703 -34
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.3.dist-info}/METADATA +1 -1
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.3.dist-info}/RECORD +23 -19
- /claude_mpm/{web → dashboard}/open_dashboard.py +0 -0
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.3.dist-info}/WHEEL +0 -0
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.3.dist-info}/entry_points.txt +0 -0
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.3.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-3.4.0.dist-info → claude_mpm-3.4.3.dist-info}/top_level.txt +0 -0
claude_mpm/services/exceptions.py (new file, +677 lines)
@@ -0,0 +1,677 @@
```python
"""Enhanced error classes for daemon conflict and process management in claude-mpm Socket.IO server.

These error classes provide detailed context and actionable guidance for users to resolve
common issues like daemon conflicts, port conflicts, stale processes, and recovery failures.

Design Principles:
1. Clear, actionable error messages with exact commands
2. Process details (PID, port, start time) for debugging
3. Resolution steps with specific troubleshooting guidance
4. Health status information when available
5. Structured error data for programmatic handling
"""

import os
import sys
import time
import platform
from datetime import datetime
from typing import Dict, Any, Optional, List
from pathlib import Path


class SocketIOServerError(Exception):
    """Base exception for Socket.IO server errors.

    Provides common functionality for all server-related errors including
    structured error data and detailed context information.
    """

    def __init__(self, message: str, error_code: str = None, context: Dict[str, Any] = None):
        """Initialize base server error.

        Args:
            message: Human-readable error message
            error_code: Machine-readable error code for programmatic handling
            context: Additional context data for debugging and resolution
        """
        super().__init__(message)
        self.message = message
        self.error_code = error_code or self.__class__.__name__.lower()
        self.context = context or {}
        self.timestamp = datetime.utcnow().isoformat() + "Z"

    def to_dict(self) -> Dict[str, Any]:
        """Convert error to dictionary format for structured logging/handling."""
        return {
            "error_type": self.__class__.__name__,
            "error_code": self.error_code,
            "message": self.message,
            "context": self.context,
            "timestamp": self.timestamp
        }
```
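The base class gives every server error a machine-readable code, a context dict, and a UTC timestamp, so callers can log failures as structured records. A minimal sketch of that pattern, assuming the module is importable as `claude_mpm.services.exceptions` from the installed wheel; the error code and context values here are invented for illustration:

```python
import json

from claude_mpm.services.exceptions import SocketIOServerError

try:
    raise SocketIOServerError(
        "server failed to start",
        error_code="startup_failed",  # hypothetical code, not a shipped constant
        context={"port": 8765},
    )
except SocketIOServerError as exc:
    # to_dict() yields a JSON-serializable record: error_type, error_code,
    # message, context, and an ISO-8601 UTC timestamp.
    print(json.dumps(exc.to_dict(), indent=2))
```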
```python
class DaemonConflictError(SocketIOServerError):
    """Error raised when attempting to start the server while another instance is already running.

    This error provides detailed information about the conflicting process and
    actionable steps to resolve the conflict.
    """

    def __init__(self,
                 port: int,
                 existing_pid: int,
                 existing_server_id: str = None,
                 process_info: Dict[str, Any] = None,
                 pidfile_path: Path = None):
        """Initialize daemon conflict error with detailed context.

        Args:
            port: Port number where conflict occurred
            existing_pid: PID of the existing server process
            existing_server_id: Server ID of the existing instance (if available)
            process_info: Detailed process information from psutil or similar
            pidfile_path: Path to the PID file of the existing server
        """
        self.port = port
        self.existing_pid = existing_pid
        self.existing_server_id = existing_server_id or "unknown"
        self.process_info = process_info or {}
        self.pidfile_path = pidfile_path

        # Build detailed error message with resolution steps
        message = self._build_error_message()

        context = {
            "port": port,
            "existing_pid": existing_pid,
            "existing_server_id": self.existing_server_id,
            "process_info": process_info,
            "pidfile_path": str(pidfile_path) if pidfile_path else None,
            "resolution_steps": self._get_resolution_steps()
        }

        super().__init__(message, "daemon_conflict", context)

    def _build_error_message(self) -> str:
        """Build comprehensive error message with process details."""
        lines = [
            f"🚫 Socket.IO server conflict detected on port {self.port}",
            f"",
            f"CONFLICT DETAILS:",
            f" • Existing PID: {self.existing_pid}",
            f" • Server ID: {self.existing_server_id}",
        ]

        # Add process information if available
        if self.process_info:
            status = self.process_info.get('status', 'unknown')
            name = self.process_info.get('name', 'unknown')
            create_time = self.process_info.get('create_time')
            memory_info = self.process_info.get('memory_info', {})

            lines.extend([
                f" • Process Status: {status}",
                f" • Process Name: {name}",
            ])

            if create_time:
                start_time = datetime.fromtimestamp(create_time).strftime('%Y-%m-%d %H:%M:%S')
                uptime = time.time() - create_time
                lines.append(f" • Started: {start_time} (uptime: {uptime:.0f}s)")

            if memory_info.get('rss'):
                memory_mb = memory_info['rss'] / (1024 * 1024)
                lines.append(f" • Memory Usage: {memory_mb:.1f} MB")

        # Add PID file information
        if self.pidfile_path:
            lines.extend([
                f" • PID File: {self.pidfile_path}",
                f" • File Exists: {self.pidfile_path.exists() if isinstance(self.pidfile_path, Path) else 'unknown'}"
            ])

        lines.extend([
            f"",
            f"RESOLUTION STEPS:",
        ])

        # Add resolution steps
        for i, step in enumerate(self._get_resolution_steps(), 1):
            lines.append(f" {i}. {step}")

        return "\n".join(lines)

    def _get_resolution_steps(self) -> List[str]:
        """Get ordered list of resolution steps."""
        steps = [
            f"Check if the existing server is still needed: ps -p {self.existing_pid}",
            f"Stop the existing server gracefully: kill -TERM {self.existing_pid}",
            f"If graceful shutdown fails: kill -KILL {self.existing_pid}",
        ]

        if self.pidfile_path:
            steps.append(f"Remove stale PID file if needed: rm {self.pidfile_path}")

        steps.extend([
            f"Wait a few seconds for port cleanup",
            f"Try starting the server again on port {self.port}",
            f"Alternative: Use a different port with --port <new_port>"
        ])

        return steps
```
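To see the message format end to end, here is a hedged sketch that constructs the error directly. The PID, server ID, PID file path, and process details are invented for illustration; the docstring suggests `process_info` would normally come from psutil:

```python
from pathlib import Path

from claude_mpm.services.exceptions import DaemonConflictError

err = DaemonConflictError(
    port=8765,
    existing_pid=12345,                     # hypothetical PID
    existing_server_id="socketio-primary",  # hypothetical server ID
    process_info={
        "status": "running",
        "name": "python",
        "create_time": 1700000000.0,        # epoch seconds, psutil-style
        "memory_info": {"rss": 50 * 1024 * 1024},
    },
    pidfile_path=Path("/tmp/claude-mpm-socketio.pid"),  # hypothetical path
)
print(err)  # multi-line report: conflict details plus numbered resolution steps
print(err.context["resolution_steps"][0])  # the same steps, machine-readable
```

Because the resolution steps are duplicated into `context`, a supervisor process can act on them programmatically while the rendered message goes to the user.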
```python
class PortConflictError(SocketIOServerError):
    """Error raised when a network port is already in use by another process.

    This error helps identify what process is using the port and provides
    steps to resolve the conflict.
    """

    def __init__(self,
                 port: int,
                 host: str = "localhost",
                 conflicting_process: Dict[str, Any] = None):
        """Initialize port conflict error.

        Args:
            port: Port number that's in use
            host: Host address where the conflict occurred
            conflicting_process: Information about the process using the port
        """
        self.port = port
        self.host = host
        self.conflicting_process = conflicting_process or {}

        message = self._build_error_message()

        context = {
            "port": port,
            "host": host,
            "conflicting_process": conflicting_process,
            "resolution_steps": self._get_resolution_steps()
        }

        super().__init__(message, "port_conflict", context)

    def _build_error_message(self) -> str:
        """Build error message with port conflict details."""
        lines = [
            f"🔌 Network port conflict detected",
            f"",
            f"PORT CONFLICT DETAILS:",
            f" • Port: {self.port}",
            f" • Host: {self.host}",
            f" • Address: {self.host}:{self.port}",
        ]

        # Add information about conflicting process if available
        if self.conflicting_process:
            pid = self.conflicting_process.get('pid')
            name = self.conflicting_process.get('name', 'unknown')
            cmdline = self.conflicting_process.get('cmdline', [])

            lines.extend([
                f"",
                f"CONFLICTING PROCESS:",
                f" • PID: {pid or 'unknown'}",
                f" • Name: {name}",
            ])

            if cmdline:
                lines.append(f" • Command: {' '.join(cmdline)}")

        lines.extend([
            f"",
            f"RESOLUTION STEPS:",
        ])

        for i, step in enumerate(self._get_resolution_steps(), 1):
            lines.append(f" {i}. {step}")

        return "\n".join(lines)

    def _get_resolution_steps(self) -> List[str]:
        """Get resolution steps for port conflicts."""
        steps = [
            f"Check what process is using port {self.port}:"
        ]

        # Add platform-specific commands
        if platform.system() == "Darwin":  # macOS
            steps.extend([
                f" • lsof -i :{self.port}",
                f" • netstat -an | grep {self.port}"
            ])
        elif platform.system() == "Linux":
            steps.extend([
                f" • lsof -i :{self.port}",
                f" • netstat -tulpn | grep {self.port}",
                f" • ss -tulpn | grep {self.port}"
            ])
        elif platform.system() == "Windows":
            steps.extend([
                f" • netstat -ano | findstr {self.port}",
                f" • tasklist /fi \"PID eq <PID_FROM_NETSTAT>\""
            ])

        steps.extend([
            f"Stop the conflicting process if it's safe to do so",
            f"Wait for port cleanup (may take 30-60 seconds)",
            f"Try again with the same port",
            f"Alternative: Use a different port: --port {self.port + 1}"
        ])

        return steps
```
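The class expects the caller to supply `conflicting_process`. One plausible way to populate it, sketched with psutil (which the docstrings above name as the likely source of process details); `raise_if_port_taken` is a hypothetical helper, and `psutil.net_connections()` may require elevated privileges on some platforms:

```python
import psutil

from claude_mpm.services.exceptions import PortConflictError

def raise_if_port_taken(port: int, host: str = "localhost") -> None:
    """Hypothetical pre-flight check that raises PortConflictError if bound."""
    for conn in psutil.net_connections(kind="inet"):
        # Only listening sockets with a known owning process are conflicts.
        if conn.status != psutil.CONN_LISTEN or not conn.pid:
            continue
        if conn.laddr and conn.laddr.port == port:
            proc = psutil.Process(conn.pid)
            raise PortConflictError(
                port=port,
                host=host,
                conflicting_process={
                    "pid": conn.pid,
                    "name": proc.name(),
                    "cmdline": proc.cmdline(),
                },
            )
```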
```python
class StaleProcessError(SocketIOServerError):
    """Error raised when dealing with stale processes or PID files.

    This error occurs when a PID file exists but the associated process
    is no longer running, is a zombie, or has been replaced.
    """

    def __init__(self,
                 pid: int,
                 pidfile_path: Path = None,
                 process_status: str = "not_found",
                 validation_errors: List[str] = None):
        """Initialize stale process error.

        Args:
            pid: Process ID that's stale
            pidfile_path: Path to the stale PID file
            process_status: Status of the process (zombie, not_found, invalid, etc.)
            validation_errors: List of validation errors encountered
        """
        self.pid = pid
        self.pidfile_path = pidfile_path
        self.process_status = process_status
        self.validation_errors = validation_errors or []

        message = self._build_error_message()

        context = {
            "pid": pid,
            "pidfile_path": str(pidfile_path) if pidfile_path else None,
            "process_status": process_status,
            "validation_errors": validation_errors,
            "resolution_steps": self._get_resolution_steps()
        }

        super().__init__(message, "stale_process", context)

    def _build_error_message(self) -> str:
        """Build error message for stale process."""
        status_descriptions = {
            "not_found": "Process no longer exists",
            "zombie": "Process is a zombie (terminated but not reaped)",
            "invalid": "Process exists but is not the expected server",
            "access_denied": "Cannot access process information",
            "stale_pidfile": "PID file is stale or corrupted"
        }

        status_desc = status_descriptions.get(self.process_status, f"Process status: {self.process_status}")

        lines = [
            f"🧟 Stale process detected",
            f"",
            f"PROCESS DETAILS:",
            f" • PID: {self.pid}",
            f" • Status: {status_desc}",
        ]

        if self.pidfile_path:
            lines.extend([
                f" • PID File: {self.pidfile_path}",
                f" • File Exists: {self.pidfile_path.exists() if isinstance(self.pidfile_path, Path) else 'unknown'}"
            ])

        if self.validation_errors:
            lines.extend([
                f"",
                f"VALIDATION ERRORS:",
            ])
            for error in self.validation_errors:
                lines.append(f" • {error}")

        lines.extend([
            f"",
            f"RESOLUTION STEPS:",
        ])

        for i, step in enumerate(self._get_resolution_steps(), 1):
            lines.append(f" {i}. {step}")

        return "\n".join(lines)

    def _get_resolution_steps(self) -> List[str]:
        """Get resolution steps for stale processes."""
        steps = []

        if self.process_status == "zombie":
            steps.extend([
                "Wait for parent process to reap zombie (usually automatic)",
                f"If zombie persists, check parent process: ps -o ppid= -p {self.pid}",
                "Restart parent process if necessary"
            ])
        elif self.process_status == "not_found":
            steps.extend([
                f"Process {self.pid} no longer exists - safe to clean up"
            ])
        elif self.process_status == "invalid":
            steps.extend([
                f"Verify process {self.pid} is not a legitimate server:",
                f" • ps -p {self.pid} -o pid,ppid,cmd",
                "If it's not your server, it's safe to clean up the PID file"
            ])

        # Common cleanup steps
        if self.pidfile_path:
            steps.append(f"Remove stale PID file: rm {self.pidfile_path}")

        steps.extend([
            "Try starting the server again",
            "If issues persist, check for permission problems or disk space"
        ])

        return steps
```
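A sketch of how PID-file validation might feed this error. The helper name and PID-file layout are assumptions for illustration, and the `os.kill(pid, 0)` liveness probe is POSIX-specific:

```python
import os
from pathlib import Path

from claude_mpm.services.exceptions import StaleProcessError

def validate_pidfile(pidfile: Path) -> int:
    """Hypothetical check: return the PID if alive, else raise StaleProcessError."""
    pid = int(pidfile.read_text().strip())
    try:
        os.kill(pid, 0)  # signal 0 probes existence without sending a signal
    except ProcessLookupError:
        raise StaleProcessError(pid, pidfile_path=pidfile,
                                process_status="not_found",
                                validation_errors=["process does not exist"])
    except PermissionError:
        raise StaleProcessError(pid, pidfile_path=pidfile,
                                process_status="access_denied",
                                validation_errors=["signal 0 denied by kernel"])
    return pid
```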
```python
class RecoveryFailedError(SocketIOServerError):
    """Error raised when automatic recovery mechanisms fail.

    This error occurs when the health monitoring and recovery system
    cannot automatically resolve server issues.
    """

    def __init__(self,
                 recovery_action: str,
                 failure_reason: str,
                 attempt_count: int = 1,
                 health_status: Dict[str, Any] = None,
                 last_successful_recovery: str = None):
        """Initialize recovery failure error.

        Args:
            recovery_action: The recovery action that failed (e.g., 'restart', 'cleanup')
            failure_reason: Why the recovery failed
            attempt_count: Number of recovery attempts made
            health_status: Current health status information
            last_successful_recovery: Timestamp of last successful recovery
        """
        self.recovery_action = recovery_action
        self.failure_reason = failure_reason
        self.attempt_count = attempt_count
        self.health_status = health_status or {}
        self.last_successful_recovery = last_successful_recovery

        message = self._build_error_message()

        context = {
            "recovery_action": recovery_action,
            "failure_reason": failure_reason,
            "attempt_count": attempt_count,
            "health_status": health_status,
            "last_successful_recovery": last_successful_recovery,
            "resolution_steps": self._get_resolution_steps()
        }

        super().__init__(message, "recovery_failed", context)

    def _build_error_message(self) -> str:
        """Build error message for recovery failure."""
        lines = [
            f"🚨 Automatic recovery failed",
            f"",
            f"RECOVERY DETAILS:",
            f" • Failed Action: {self.recovery_action}",
            f" • Failure Reason: {self.failure_reason}",
            f" • Attempt Count: {self.attempt_count}",
        ]

        if self.last_successful_recovery:
            lines.append(f" • Last Successful Recovery: {self.last_successful_recovery}")

        # Add health status information
        if self.health_status:
            lines.extend([
                f"",
                f"CURRENT HEALTH STATUS:",
            ])

            # Common health metrics
            for key, value in self.health_status.items():
                if key in ['status', 'uptime', 'clients_connected', 'events_processed', 'errors']:
                    lines.append(f" • {key.replace('_', ' ').title()}: {value}")

        lines.extend([
            f"",
            f"MANUAL RESOLUTION REQUIRED:",
        ])

        for i, step in enumerate(self._get_resolution_steps(), 1):
            lines.append(f" {i}. {step}")

        return "\n".join(lines)

    def _get_resolution_steps(self) -> List[str]:
        """Get manual resolution steps for recovery failures."""
        steps = [
            "Check server logs for detailed error information",
            "Verify system resources (CPU, memory, disk space)",
            "Check network connectivity and port availability",
        ]

        if self.recovery_action == "restart":
            steps.extend([
                "Manually stop the server process",
                "Wait for complete shutdown (check process list)",
                "Remove any stale PID files",
                "Restart the server manually"
            ])
        elif self.recovery_action == "cleanup":
            steps.extend([
                "Manually identify and clean up stale resources",
                "Check for zombie processes",
                "Clear temporary files and logs if needed"
            ])
        elif self.recovery_action == "port_reset":
            steps.extend([
                "Check what's using the required port",
                "Stop conflicting processes",
                "Wait for port cleanup",
                "Consider using a different port temporarily"
            ])

        steps.extend([
            "Review health monitoring configuration",
            "Consider adjusting recovery thresholds if appropriate",
            "Monitor server stability after manual intervention"
        ])

        return steps
```
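A sketch of how a recovery loop might escalate to this error after exhausting retries; `restart_server` is a hypothetical callable and the retry policy is illustrative, not the daemon's actual logic:

```python
from claude_mpm.services.exceptions import RecoveryFailedError

def recover_with_retries(restart_server, max_attempts: int = 3) -> None:
    last_error = None
    for _attempt in range(max_attempts):
        try:
            restart_server()  # hypothetical restart hook
            return
        except Exception as exc:  # record and keep retrying
            last_error = exc
    # All attempts exhausted: surface a structured, user-facing failure.
    raise RecoveryFailedError(
        recovery_action="restart",
        failure_reason=str(last_error),
        attempt_count=max_attempts,
        health_status={"status": "unhealthy"},
    )
```

Since `recovery_action="restart"` matches a branch in `_get_resolution_steps`, the rendered message includes the restart-specific manual steps.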
```python
class HealthCheckError(SocketIOServerError):
    """Error raised when health monitoring detects critical issues.

    This error provides detailed health status information and guidance
    for addressing system health problems.
    """

    def __init__(self,
                 check_name: str,
                 check_status: str,
                 check_details: Dict[str, Any] = None,
                 threshold_exceeded: Dict[str, Any] = None):
        """Initialize health check error.

        Args:
            check_name: Name of the failed health check
            check_status: Status of the health check (critical, warning, failed)
            check_details: Detailed results from the health check
            threshold_exceeded: Information about exceeded thresholds
        """
        self.check_name = check_name
        self.check_status = check_status
        self.check_details = check_details or {}
        self.threshold_exceeded = threshold_exceeded or {}

        message = self._build_error_message()

        context = {
            "check_name": check_name,
            "check_status": check_status,
            "check_details": check_details,
            "threshold_exceeded": threshold_exceeded,
            "resolution_steps": self._get_resolution_steps()
        }

        super().__init__(message, "health_check_failed", context)

    def _build_error_message(self) -> str:
        """Build error message for health check failure."""
        status_emoji = {
            "critical": "🚨",
            "warning": "⚠️",
            "failed": "❌"
        }

        emoji = status_emoji.get(self.check_status, "🔍")

        lines = [
            f"{emoji} Health check failed: {self.check_name}",
            f"",
            f"CHECK DETAILS:",
            f" • Check: {self.check_name}",
            f" • Status: {self.check_status.upper()}",
        ]

        # Add check details
        if self.check_details:
            for key, value in self.check_details.items():
                if key not in ['raw_data', 'internal_state']:  # Skip internal data
                    lines.append(f" • {key.replace('_', ' ').title()}: {value}")

        # Add threshold information
        if self.threshold_exceeded:
            lines.extend([
                f"",
                f"THRESHOLDS EXCEEDED:",
            ])
            for metric, info in self.threshold_exceeded.items():
                current = info.get('current', 'unknown')
                threshold = info.get('threshold', 'unknown')
                lines.append(f" • {metric.title()}: {current} (threshold: {threshold})")

        lines.extend([
            f"",
            f"RECOMMENDED ACTIONS:",
        ])

        for i, step in enumerate(self._get_resolution_steps(), 1):
            lines.append(f" {i}. {step}")

        return "\n".join(lines)

    def _get_resolution_steps(self) -> List[str]:
        """Get resolution steps based on health check type."""
        steps = []

        if "cpu" in self.check_name.lower():
            steps.extend([
                "Check for runaway processes consuming CPU",
                "Consider adding rate limiting or request throttling",
                "Monitor CPU usage patterns over time"
            ])
        elif "memory" in self.check_name.lower():
            steps.extend([
                "Check for memory leaks in the application",
                "Monitor memory usage trends",
                "Consider restarting if memory usage is excessive",
                "Review event history size limits"
            ])
        elif "network" in self.check_name.lower() or "connectivity" in self.check_name.lower():
            steps.extend([
                "Check network connectivity to required services",
                "Verify firewall settings and port accessibility",
                "Test network latency and bandwidth"
            ])
        elif "disk" in self.check_name.lower() or "file" in self.check_name.lower():
            steps.extend([
                "Check available disk space",
                "Clean up old log files and temporary data",
                "Verify file permissions and access"
            ])

        # General steps for all health check failures
        steps.extend([
            "Review recent system changes or deployments",
            "Check system logs for related errors",
            "Consider adjusting health check thresholds if appropriate",
            "Monitor the issue to identify patterns or trends"
        ])

        return steps
```
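Because `_get_resolution_steps` keys off substrings of `check_name`, a check named `memory_usage` gets the memory-specific advice. A sketch of raising the error from a memory check; the check name, numbers, and thresholds here are illustrative, not the daemon's real defaults:

```python
from claude_mpm.services.exceptions import HealthCheckError

rss_mb, limit_mb = 512.0, 256.0  # illustrative values only
if rss_mb > limit_mb:
    raise HealthCheckError(
        check_name="memory_usage",  # "memory" substring selects memory advice
        check_status="critical",
        check_details={"rss_mb": rss_mb},
        threshold_exceeded={
            "memory": {"current": f"{rss_mb} MB", "threshold": f"{limit_mb} MB"},
        },
    )
```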
```python
def format_troubleshooting_guide(error: SocketIOServerError) -> str:
    """Format a comprehensive troubleshooting guide for any Socket.IO server error.

    Args:
        error: The error instance to create a troubleshooting guide for

    Returns:
        Formatted troubleshooting guide as a string
    """
    lines = [
        f"═══════════════════════════════════════════════════════════════",
        f"🔧 CLAUDE MPM SOCKET.IO SERVER TROUBLESHOOTING GUIDE",
        f"═══════════════════════════════════════════════════════════════",
        f"",
        f"ERROR TYPE: {error.__class__.__name__}",
        f"ERROR CODE: {error.error_code}",
        f"TIMESTAMP: {error.timestamp}",
        f"",
        str(error),
        f"",
        f"ADDITIONAL TROUBLESHOOTING:",
        f"",
        f"🔍 DIAGNOSTIC COMMANDS:",
        f" • Check running processes: ps aux | grep socketio",
        f" • Check port usage: lsof -i :{error.context.get('port', 'PORT')}",
        f" • Check system resources: top or htop",
        f" • Check disk space: df -h",
        f" • Check logs: tail -f /path/to/claude-mpm.log",
        f"",
        f"🛠️ COMMON SOLUTIONS:",
        f" 1. Restart the Socket.IO server completely",
        f" 2. Clear any stale PID files",
        f" 3. Check for zombie processes and clean them up",
        f" 4. Verify network port availability",
        f" 5. Check system resource availability (CPU, memory, disk)",
        f" 6. Review server configuration and permissions",
        f"",
        f"📞 GETTING HELP:",
        f" • Check the claude-mpm documentation",
        f" • Review server logs for additional context",
        f" • Report persistent issues with this error information",
        f"",
        f"🔗 ERROR CONTEXT DATA:",
    ]

    # Add structured context data
    for key, value in error.context.items():
        if key != "resolution_steps":  # Skip resolution steps as they're already shown
            lines.append(f" • {key}: {value}")

    lines.extend([
        f"",
        f"═══════════════════════════════════════════════════════════════",
    ])

    return "\n".join(lines)
```
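Usage is a single call on any caught server error. A minimal sketch, under the same import assumption as the earlier examples:

```python
from claude_mpm.services.exceptions import (
    PortConflictError,
    SocketIOServerError,
    format_troubleshooting_guide,
)

try:
    raise PortConflictError(port=8765)
except SocketIOServerError as exc:
    # Prints the banner, error details, diagnostic commands, and context data.
    print(format_troubleshooting_guide(exc))
```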