claude-mpm 3.3.2__py3-none-any.whl → 3.4.2__py3-none-any.whl

This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (48)
  1. claude_mpm/cli/commands/memory.py +192 -14
  2. claude_mpm/cli/parser.py +13 -1
  3. claude_mpm/constants.py +1 -0
  4. claude_mpm/core/claude_runner.py +61 -0
  5. claude_mpm/core/config.py +161 -1
  6. claude_mpm/core/simple_runner.py +61 -0
  7. claude_mpm/hooks/builtin/mpm_command_hook.py +5 -5
  8. claude_mpm/hooks/claude_hooks/hook_handler.py +211 -4
  9. claude_mpm/hooks/claude_hooks/hook_wrapper.sh +10 -3
  10. claude_mpm/hooks/memory_integration_hook.py +51 -5
  11. claude_mpm/scripts/socketio_daemon.py +49 -9
  12. claude_mpm/scripts/socketio_server_manager.py +370 -45
  13. claude_mpm/services/__init__.py +41 -5
  14. claude_mpm/services/agent_memory_manager.py +541 -51
  15. claude_mpm/services/exceptions.py +677 -0
  16. claude_mpm/services/health_monitor.py +892 -0
  17. claude_mpm/services/memory_builder.py +341 -7
  18. claude_mpm/services/memory_optimizer.py +6 -2
  19. claude_mpm/services/project_analyzer.py +771 -0
  20. claude_mpm/services/recovery_manager.py +670 -0
  21. claude_mpm/services/socketio_server.py +653 -36
  22. claude_mpm/services/standalone_socketio_server.py +703 -34
  23. claude_mpm/services/version_control/git_operations.py +26 -0
  24. {claude_mpm-3.3.2.dist-info → claude_mpm-3.4.2.dist-info}/METADATA +34 -10
  25. {claude_mpm-3.3.2.dist-info → claude_mpm-3.4.2.dist-info}/RECORD +30 -44
  26. claude_mpm/agents/agent-template.yaml +0 -83
  27. claude_mpm/agents/test_fix_deployment/.claude-pm/config/project.json +0 -6
  28. claude_mpm/cli/README.md +0 -109
  29. claude_mpm/cli_module/refactoring_guide.md +0 -253
  30. claude_mpm/core/agent_registry.py.bak +0 -312
  31. claude_mpm/core/base_service.py.bak +0 -406
  32. claude_mpm/hooks/README.md +0 -97
  33. claude_mpm/orchestration/SUBPROCESS_DESIGN.md +0 -66
  34. claude_mpm/schemas/README_SECURITY.md +0 -92
  35. claude_mpm/schemas/agent_schema.json +0 -395
  36. claude_mpm/schemas/agent_schema_documentation.md +0 -181
  37. claude_mpm/schemas/agent_schema_security_notes.md +0 -165
  38. claude_mpm/schemas/examples/standard_workflow.json +0 -505
  39. claude_mpm/schemas/ticket_workflow_documentation.md +0 -482
  40. claude_mpm/schemas/ticket_workflow_schema.json +0 -590
  41. claude_mpm/services/framework_claude_md_generator/README.md +0 -92
  42. claude_mpm/services/parent_directory_manager/README.md +0 -83
  43. claude_mpm/services/version_control/VERSION +0 -1
  44. /claude_mpm/{web → dashboard}/open_dashboard.py +0 -0
  45. {claude_mpm-3.3.2.dist-info → claude_mpm-3.4.2.dist-info}/WHEEL +0 -0
  46. {claude_mpm-3.3.2.dist-info → claude_mpm-3.4.2.dist-info}/entry_points.txt +0 -0
  47. {claude_mpm-3.3.2.dist-info → claude_mpm-3.4.2.dist-info}/licenses/LICENSE +0 -0
  48. {claude_mpm-3.3.2.dist-info → claude_mpm-3.4.2.dist-info}/top_level.txt +0 -0
claude_mpm/services/health_monitor.py (new file)
@@ -0,0 +1,892 @@
+"""Advanced health monitoring system for claude-mpm Socket.IO server.
+
+This module provides comprehensive health checking capabilities including:
+- Process resource monitoring (CPU, memory, file descriptors)
+- Service-specific health markers
+- Configurable thresholds and intervals
+- Health status aggregation and history
+- Integration with automatic recovery mechanisms
+
+Design Principles:
+- Minimal performance impact through efficient polling
+- Extensible metric collection system
+- Circuit breaker integration for failure detection
+- Comprehensive logging for debugging and diagnostics
+"""
+
+import asyncio
+import logging
+import time
+import threading
+from abc import ABC, abstractmethod
+from collections import deque
+from dataclasses import dataclass, asdict
+from datetime import datetime, timezone
+from enum import Enum
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Callable, Union
+import json
+import socket
+
+try:
+    import psutil
+    PSUTIL_AVAILABLE = True
+except ImportError:
+    PSUTIL_AVAILABLE = False
+    psutil = None
+
+
+class HealthStatus(Enum):
+    """Health status levels for monitoring."""
+    HEALTHY = "healthy"
+    WARNING = "warning"
+    CRITICAL = "critical"
+    UNKNOWN = "unknown"
+
+
+@dataclass
+class HealthMetric:
+    """Individual health metric data structure."""
+    name: str
+    value: Union[int, float, str, bool]
+    status: HealthStatus
+    threshold: Optional[Union[int, float]] = None
+    unit: Optional[str] = None
+    timestamp: float = None
+    message: Optional[str] = None
+
+    def __post_init__(self):
+        if self.timestamp is None:
+            self.timestamp = time.time()
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert metric to dictionary format."""
+        result = asdict(self)
+        result['status'] = self.status.value
+        result['timestamp_iso'] = datetime.fromtimestamp(self.timestamp, timezone.utc).isoformat()
+        return result
+
+
+@dataclass
+class HealthCheckResult:
+    """Result of a health check operation."""
+    overall_status: HealthStatus
+    metrics: List[HealthMetric]
+    timestamp: float
+    duration_ms: float
+    errors: List[str]
+
+    def __post_init__(self):
+        if not hasattr(self, 'timestamp') or self.timestamp is None:
+            self.timestamp = time.time()
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert health check result to dictionary format."""
+        return {
+            'overall_status': self.overall_status.value,
+            'metrics': [metric.to_dict() for metric in self.metrics],
+            'timestamp': self.timestamp,
+            'timestamp_iso': datetime.fromtimestamp(self.timestamp, timezone.utc).isoformat(),
+            'duration_ms': self.duration_ms,
+            'errors': self.errors,
+            'metric_count': len(self.metrics),
+            'healthy_metrics': len([m for m in self.metrics if m.status == HealthStatus.HEALTHY]),
+            'warning_metrics': len([m for m in self.metrics if m.status == HealthStatus.WARNING]),
+            'critical_metrics': len([m for m in self.metrics if m.status == HealthStatus.CRITICAL])
+        }
+
+
+class HealthChecker(ABC):
+    """Abstract base class for health checkers.
+
+    Health checkers implement specific monitoring logic for different aspects
+    of the system (process resources, network connectivity, service health, etc.).
+    """
+
+    @abstractmethod
+    def get_name(self) -> str:
+        """Get the name of this health checker."""
+        pass
+
+    @abstractmethod
+    async def check_health(self) -> List[HealthMetric]:
+        """Perform health check and return metrics."""
+        pass
+
+
+class ProcessResourceChecker(HealthChecker):
+    """Health checker for process resource usage.
+
+    Monitors:
+    - CPU usage percentage
+    - Memory usage (RSS, VMS)
+    - File descriptor count
+    - Thread count
+    - Process status
+    """
+
+    def __init__(self, pid: int, cpu_threshold: float = 80.0,
+                 memory_threshold_mb: int = 500, fd_threshold: int = 1000):
+        """Initialize process resource checker.
+
+        Args:
+            pid: Process ID to monitor
+            cpu_threshold: CPU usage threshold as percentage
+            memory_threshold_mb: Memory usage threshold in MB
+            fd_threshold: File descriptor count threshold
+        """
+        self.pid = pid
+        self.cpu_threshold = cpu_threshold
+        self.memory_threshold_mb = memory_threshold_mb
+        self.fd_threshold = fd_threshold
+        self.process = None
+        self.logger = logging.getLogger(f"{__name__}.ProcessResourceChecker")
+
+        if PSUTIL_AVAILABLE:
+            try:
+                self.process = psutil.Process(pid)
+            except psutil.NoSuchProcess:
+                self.logger.warning(f"Process {pid} not found for monitoring")
+
+    def get_name(self) -> str:
+        return f"process_resources_{self.pid}"
+
+    async def check_health(self) -> List[HealthMetric]:
+        """Check process resource usage."""
+        metrics = []
+
+        if not PSUTIL_AVAILABLE:
+            metrics.append(HealthMetric(
+                name="psutil_availability",
+                value=False,
+                status=HealthStatus.WARNING,
+                message="psutil not available for enhanced monitoring"
+            ))
+            return metrics
+
+        if not self.process:
+            metrics.append(HealthMetric(
+                name="process_exists",
+                value=False,
+                status=HealthStatus.CRITICAL,
+                message=f"Process {self.pid} not found"
+            ))
+            return metrics
+
+        try:
+            # Check if process still exists
+            if not self.process.is_running():
+                metrics.append(HealthMetric(
+                    name="process_exists",
+                    value=False,
+                    status=HealthStatus.CRITICAL,
+                    message=f"Process {self.pid} is no longer running"
+                ))
+                return metrics
+
+            # Process status
+            status = self.process.status()
+            process_healthy = status not in [psutil.STATUS_ZOMBIE, psutil.STATUS_DEAD, psutil.STATUS_STOPPED]
+            metrics.append(HealthMetric(
+                name="process_status",
+                value=status,
+                status=HealthStatus.HEALTHY if process_healthy else HealthStatus.CRITICAL,
+                message=f"Process status: {status}"
+            ))
+
+            # CPU usage
+            try:
+                cpu_percent = self.process.cpu_percent(interval=0.1)
+                cpu_status = HealthStatus.HEALTHY
+                if cpu_percent > self.cpu_threshold:
+                    cpu_status = HealthStatus.WARNING if cpu_percent < self.cpu_threshold * 1.2 else HealthStatus.CRITICAL
+
+                metrics.append(HealthMetric(
+                    name="cpu_usage_percent",
+                    value=round(cpu_percent, 2),
+                    status=cpu_status,
+                    threshold=self.cpu_threshold,
+                    unit="%"
+                ))
+            except Exception as e:
+                metrics.append(HealthMetric(
+                    name="cpu_usage_percent",
+                    value=-1,
+                    status=HealthStatus.UNKNOWN,
+                    message=f"Failed to get CPU usage: {e}"
+                ))
+
+            # Memory usage
+            try:
+                memory_info = self.process.memory_info()
+                memory_mb = memory_info.rss / (1024 * 1024)
+                memory_status = HealthStatus.HEALTHY
+                if memory_mb > self.memory_threshold_mb:
+                    memory_status = HealthStatus.WARNING if memory_mb < self.memory_threshold_mb * 1.2 else HealthStatus.CRITICAL
+
+                metrics.append(HealthMetric(
+                    name="memory_usage_mb",
+                    value=round(memory_mb, 2),
+                    status=memory_status,
+                    threshold=self.memory_threshold_mb,
+                    unit="MB"
+                ))
+
+                metrics.append(HealthMetric(
+                    name="memory_vms_mb",
+                    value=round(memory_info.vms / (1024 * 1024), 2),
+                    status=HealthStatus.HEALTHY,
+                    unit="MB"
+                ))
+            except Exception as e:
+                metrics.append(HealthMetric(
+                    name="memory_usage_mb",
+                    value=-1,
+                    status=HealthStatus.UNKNOWN,
+                    message=f"Failed to get memory usage: {e}"
+                ))
+
+            # File descriptors (Unix only)
+            if hasattr(self.process, 'num_fds'):
+                try:
+                    fd_count = self.process.num_fds()
+                    fd_status = HealthStatus.HEALTHY
+                    if fd_count > self.fd_threshold:
+                        fd_status = HealthStatus.WARNING if fd_count < self.fd_threshold * 1.2 else HealthStatus.CRITICAL
+
+                    metrics.append(HealthMetric(
+                        name="file_descriptors",
+                        value=fd_count,
+                        status=fd_status,
+                        threshold=self.fd_threshold
+                    ))
+                except Exception as e:
+                    metrics.append(HealthMetric(
+                        name="file_descriptors",
+                        value=-1,
+                        status=HealthStatus.UNKNOWN,
+                        message=f"Failed to get file descriptor count: {e}"
+                    ))
+
+            # Thread count
+            try:
+                thread_count = self.process.num_threads()
+                metrics.append(HealthMetric(
+                    name="thread_count",
+                    value=thread_count,
+                    status=HealthStatus.HEALTHY,
+                ))
+            except Exception as e:
+                metrics.append(HealthMetric(
+                    name="thread_count",
+                    value=-1,
+                    status=HealthStatus.UNKNOWN,
+                    message=f"Failed to get thread count: {e}"
+                ))
+
+            # Process create time (for validation)
+            try:
+                create_time = self.process.create_time()
+                metrics.append(HealthMetric(
+                    name="process_start_time",
+                    value=create_time,
+                    status=HealthStatus.HEALTHY,
+                    unit="timestamp"
+                ))
+            except Exception as e:
+                metrics.append(HealthMetric(
+                    name="process_start_time",
+                    value=-1,
+                    status=HealthStatus.UNKNOWN,
+                    message=f"Failed to get process start time: {e}"
+                ))
+
+        except psutil.NoSuchProcess:
+            metrics.append(HealthMetric(
+                name="process_exists",
+                value=False,
+                status=HealthStatus.CRITICAL,
+                message=f"Process {self.pid} no longer exists"
+            ))
+        except Exception as e:
+            self.logger.error(f"Error checking process health: {e}")
+            metrics.append(HealthMetric(
+                name="process_check_error",
+                value=str(e),
+                status=HealthStatus.UNKNOWN,
+                message=f"Unexpected error during process health check: {e}"
+            ))
+
+        return metrics
+
+
+class NetworkConnectivityChecker(HealthChecker):
+    """Health checker for network connectivity.
+
+    Monitors:
+    - Port availability and binding status
+    - Socket connection health
+    - Network interface status
+    """
+
+    def __init__(self, host: str, port: int, timeout: float = 1.0):
+        """Initialize network connectivity checker.
+
+        Args:
+            host: Host address to check
+            port: Port number to check
+            timeout: Connection timeout in seconds
+        """
+        self.host = host
+        self.port = port
+        self.timeout = timeout
+        self.logger = logging.getLogger(f"{__name__}.NetworkConnectivityChecker")
+
+    def get_name(self) -> str:
+        return f"network_connectivity_{self.host}_{self.port}"
+
+    async def check_health(self) -> List[HealthMetric]:
+        """Check network connectivity."""
+        metrics = []
+
+        # Check port binding
+        try:
+            # Try to connect to the port
+            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            sock.settimeout(self.timeout)
+            result = sock.connect_ex((self.host, self.port))
+            sock.close()
+
+            if result == 0:
+                metrics.append(HealthMetric(
+                    name="port_accessible",
+                    value=True,
+                    status=HealthStatus.HEALTHY,
+                    message=f"Port {self.port} is accessible on {self.host}"
+                ))
+            else:
+                metrics.append(HealthMetric(
+                    name="port_accessible",
+                    value=False,
+                    status=HealthStatus.CRITICAL,
+                    message=f"Port {self.port} is not accessible on {self.host}"
+                ))
+        except Exception as e:
+            metrics.append(HealthMetric(
+                name="port_accessible",
+                value=False,
+                status=HealthStatus.UNKNOWN,
+                message=f"Error checking port accessibility: {e}"
+            ))
+
+        # Check if we can create a socket (resource availability)
+        try:
+            test_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            test_sock.close()
+            metrics.append(HealthMetric(
+                name="socket_creation",
+                value=True,
+                status=HealthStatus.HEALTHY,
+                message="Socket creation successful"
+            ))
+        except Exception as e:
+            metrics.append(HealthMetric(
+                name="socket_creation",
+                value=False,
+                status=HealthStatus.CRITICAL,
+                message=f"Failed to create socket: {e}"
+            ))
+
+        return metrics
+
+
+class ServiceHealthChecker(HealthChecker):
+    """Health checker for service-specific metrics.
+
+    Monitors:
+    - Connected clients count
+    - Event processing rate
+    - Error rates
+    - Response times
+    """
+
+    def __init__(self, service_stats: Dict[str, Any],
+                 max_clients: int = 1000, max_error_rate: float = 0.1):
+        """Initialize service health checker.
+
+        Args:
+            service_stats: Reference to service statistics dictionary
+            max_clients: Maximum allowed connected clients
+            max_error_rate: Maximum allowed error rate (0.0-1.0)
+        """
+        self.service_stats = service_stats
+        self.max_clients = max_clients
+        self.max_error_rate = max_error_rate
+        self.last_check_time = time.time()
+        self.last_events_processed = 0
+        self.logger = logging.getLogger(f"{__name__}.ServiceHealthChecker")
+
+    def get_name(self) -> str:
+        return "service_health"
+
+    async def check_health(self) -> List[HealthMetric]:
+        """Check service-specific health metrics."""
+        metrics = []
+        current_time = time.time()
+
+        # Connected clients
+        try:
+            client_count = self.service_stats.get("clients_connected", 0)
+            client_status = HealthStatus.HEALTHY
+            if client_count > self.max_clients * 0.8:
+                client_status = HealthStatus.WARNING
+            if client_count > self.max_clients:
+                client_status = HealthStatus.CRITICAL
+
+            metrics.append(HealthMetric(
+                name="connected_clients",
+                value=client_count,
+                status=client_status,
+                threshold=self.max_clients
+            ))
+        except Exception as e:
+            metrics.append(HealthMetric(
+                name="connected_clients",
+                value=-1,
+                status=HealthStatus.UNKNOWN,
+                message=f"Failed to get client count: {e}"
+            ))
+
+        # Event processing rate
+        try:
+            events_processed = self.service_stats.get("events_processed", 0)
+            time_diff = current_time - self.last_check_time
+
+            if time_diff > 0 and self.last_events_processed > 0:
+                event_rate = (events_processed - self.last_events_processed) / time_diff
+                metrics.append(HealthMetric(
+                    name="event_processing_rate",
+                    value=round(event_rate, 2),
+                    status=HealthStatus.HEALTHY,
+                    unit="events/sec"
+                ))
+
+            self.last_events_processed = events_processed
+
+            # Total events processed
+            metrics.append(HealthMetric(
+                name="total_events_processed",
+                value=events_processed,
+                status=HealthStatus.HEALTHY
+            ))
+        except Exception as e:
+            metrics.append(HealthMetric(
+                name="event_processing_rate",
+                value=-1,
+                status=HealthStatus.UNKNOWN,
+                message=f"Failed to calculate event rate: {e}"
+            ))
+
+        # Error rate
+        try:
+            errors = self.service_stats.get("errors", 0)
+            total_events = self.service_stats.get("events_processed", 1)  # Avoid division by zero
+            error_rate = errors / max(total_events, 1)
+
+            error_status = HealthStatus.HEALTHY
+            if error_rate > self.max_error_rate * 0.5:
+                error_status = HealthStatus.WARNING
+            if error_rate > self.max_error_rate:
+                error_status = HealthStatus.CRITICAL
+
+            metrics.append(HealthMetric(
+                name="error_rate",
+                value=round(error_rate, 4),
+                status=error_status,
+                threshold=self.max_error_rate,
+                unit="ratio"
+            ))
+
+            metrics.append(HealthMetric(
+                name="total_errors",
+                value=errors,
+                status=HealthStatus.HEALTHY if errors == 0 else HealthStatus.WARNING
+            ))
+        except Exception as e:
+            metrics.append(HealthMetric(
+                name="error_rate",
+                value=-1,
+                status=HealthStatus.UNKNOWN,
+                message=f"Failed to calculate error rate: {e}"
+            ))
+
+        # Last activity timestamp
+        try:
+            last_activity = self.service_stats.get("last_activity")
+            if last_activity:
+                # Parse ISO timestamp or use as-is if numeric
+                if isinstance(last_activity, str):
+                    try:
+                        from dateutil.parser import parse
+                        last_activity_dt = parse(last_activity)
+                        last_activity_timestamp = last_activity_dt.timestamp()
+                    except ImportError:
+                        # Fallback: try to parse ISO format manually
+                        try:
+                            from datetime import datetime
+                            clean_timestamp = last_activity.rstrip('Z')
+                            last_activity_dt = datetime.fromisoformat(clean_timestamp.replace('T', ' '))
+                            last_activity_timestamp = last_activity_dt.timestamp()
+                        except Exception:
+                            # Final fallback: treat as current time
+                            last_activity_timestamp = current_time
+                else:
+                    last_activity_timestamp = float(last_activity)
+
+                time_since_activity = current_time - last_activity_timestamp
+                activity_status = HealthStatus.HEALTHY
+                if time_since_activity > 300:  # 5 minutes
+                    activity_status = HealthStatus.WARNING
+                if time_since_activity > 1800:  # 30 minutes
+                    activity_status = HealthStatus.CRITICAL
+
+                metrics.append(HealthMetric(
+                    name="time_since_last_activity",
+                    value=round(time_since_activity, 2),
+                    status=activity_status,
+                    unit="seconds"
+                ))
+            else:
+                metrics.append(HealthMetric(
+                    name="time_since_last_activity",
+                    value=-1,
+                    status=HealthStatus.WARNING,
+                    message="No last activity recorded"
+                ))
+        except Exception as e:
+            metrics.append(HealthMetric(
+                name="time_since_last_activity",
+                value=-1,
+                status=HealthStatus.UNKNOWN,
+                message=f"Failed to parse last activity: {e}"
+            ))
+
+        self.last_check_time = current_time
+        return metrics
+
+
+class AdvancedHealthMonitor:
+    """Advanced health monitoring system with configurable checks and thresholds.
+
+    Provides comprehensive health monitoring including:
+    - Multiple health checker integration
+    - Configurable check intervals and thresholds
+    - Health history tracking
+    - Status aggregation and reporting
+    - Integration with recovery systems
+    """
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """Initialize advanced health monitor.
+
+        Args:
+            config: Configuration dictionary for health monitoring
+        """
+        self.config = config or {}
+        self.logger = logging.getLogger(f"{__name__}.AdvancedHealthMonitor")
+
+        # Configuration with defaults
+        self.check_interval = self.config.get('check_interval', 30)
+        self.history_size = self.config.get('history_size', 100)
+        self.aggregation_window = self.config.get('aggregation_window', 300)  # 5 minutes
+
+        # Health checkers
+        self.checkers: List[HealthChecker] = []
+
+        # Health history
+        self.health_history: deque = deque(maxlen=self.history_size)
+
+        # Monitoring state
+        self.monitoring = False
+        self.monitor_task: Optional[asyncio.Task] = None
+        self.last_check_result: Optional[HealthCheckResult] = None
+
+        # Health callbacks for recovery integration
+        self.health_callbacks: List[Callable[[HealthCheckResult], None]] = []
+
+        # Initialize metrics
+        self.monitoring_stats = {
+            'checks_performed': 0,
+            'checks_failed': 0,
+            'average_check_duration_ms': 0,
+            'last_check_timestamp': None
+        }
+
+        self.logger.info("Advanced health monitor initialized")
+
+    def add_checker(self, checker: HealthChecker) -> None:
+        """Add a health checker to the monitoring system."""
+        self.checkers.append(checker)
+        self.logger.info(f"Added health checker: {checker.get_name()}")
+
+    def add_health_callback(self, callback: Callable[[HealthCheckResult], None]) -> None:
+        """Add a callback to be called when health checks complete.
+
+        Args:
+            callback: Function to call with HealthCheckResult
+        """
+        self.health_callbacks.append(callback)
+        self.logger.debug(f"Added health callback: {callback.__name__}")
+
+    async def perform_health_check(self) -> HealthCheckResult:
+        """Perform comprehensive health check using all registered checkers."""
+        start_time = time.time()
+        all_metrics = []
+        errors = []
+
+        # Run all health checkers
+        for checker in self.checkers:
+            try:
+                checker_start = time.time()
+                metrics = await checker.check_health()
+                checker_duration = (time.time() - checker_start) * 1000
+
+                all_metrics.extend(metrics)
+                self.logger.debug(f"Health checker {checker.get_name()} completed in {checker_duration:.2f}ms")
+
+            except Exception as e:
+                error_msg = f"Health checker {checker.get_name()} failed: {e}"
+                errors.append(error_msg)
+                self.logger.error(error_msg)
+
+                # Add error metric
+                all_metrics.append(HealthMetric(
+                    name=f"{checker.get_name()}_error",
+                    value=str(e),
+                    status=HealthStatus.UNKNOWN,
+                    message=error_msg
+                ))
+
+        # Determine overall status
+        overall_status = self._determine_overall_status(all_metrics)
+
+        # Create result
+        duration_ms = (time.time() - start_time) * 1000
+        result = HealthCheckResult(
+            overall_status=overall_status,
+            metrics=all_metrics,
+            timestamp=start_time,
+            duration_ms=duration_ms,
+            errors=errors
+        )
+
+        # Update statistics
+        self.monitoring_stats['checks_performed'] += 1
+        if errors:
+            self.monitoring_stats['checks_failed'] += 1
+
+        # Update average duration
+        current_avg = self.monitoring_stats['average_check_duration_ms']
+        checks_count = self.monitoring_stats['checks_performed']
+        self.monitoring_stats['average_check_duration_ms'] = (
+            (current_avg * (checks_count - 1) + duration_ms) / checks_count
+        )
+        self.monitoring_stats['last_check_timestamp'] = time.time()
+
+        # Store in history
+        self.health_history.append(result)
+        self.last_check_result = result
+
+        # Notify callbacks
+        for callback in self.health_callbacks:
+            try:
+                callback(result)
+            except Exception as e:
+                self.logger.error(f"Health callback {callback.__name__} failed: {e}")
+
+        self.logger.debug(f"Health check completed: {overall_status.value} "
+                          f"({len(all_metrics)} metrics, {len(errors)} errors, "
+                          f"{duration_ms:.2f}ms)")
+
+        return result
+
+    def _determine_overall_status(self, metrics: List[HealthMetric]) -> HealthStatus:
+        """Determine overall health status from individual metrics."""
+        if not metrics:
+            return HealthStatus.UNKNOWN
+
+        # Count metrics by status
+        status_counts = {status: 0 for status in HealthStatus}
+        for metric in metrics:
+            status_counts[metric.status] += 1
+
+        # Determine overall status based on counts
+        total_metrics = len(metrics)
+
+        # If any critical metrics, overall is critical
+        if status_counts[HealthStatus.CRITICAL] > 0:
+            return HealthStatus.CRITICAL
+
+        # If more than 30% warning metrics, overall is warning
+        warning_ratio = status_counts[HealthStatus.WARNING] / total_metrics
+        if warning_ratio > 0.3:
+            return HealthStatus.WARNING
+
+        # If any warning metrics but less than 30%, still healthy
+        if status_counts[HealthStatus.WARNING] > 0:
+            return HealthStatus.HEALTHY
+
+        # If any unknown metrics, overall is unknown
+        if status_counts[HealthStatus.UNKNOWN] > 0:
+            return HealthStatus.UNKNOWN
+
+        # All metrics healthy
+        return HealthStatus.HEALTHY
+
+    def start_monitoring(self) -> None:
+        """Start continuous health monitoring."""
+        if self.monitoring:
+            self.logger.warning("Health monitoring is already running")
+            return
+
+        self.monitoring = True
+        self.monitor_task = asyncio.create_task(self._monitoring_loop())
+        self.logger.info(f"Started health monitoring with {self.check_interval}s interval")
+
+    async def stop_monitoring(self) -> None:
+        """Stop continuous health monitoring."""
+        if not self.monitoring:
+            return
+
+        self.monitoring = False
+        if self.monitor_task:
+            self.monitor_task.cancel()
+            try:
+                await self.monitor_task
+            except asyncio.CancelledError:
+                pass
+            self.monitor_task = None
+
+        self.logger.info("Stopped health monitoring")
+
+    async def _monitoring_loop(self) -> None:
+        """Continuous health monitoring loop."""
+        try:
+            while self.monitoring:
+                try:
+                    await self.perform_health_check()
+                except Exception as e:
+                    self.logger.error(f"Error during health check: {e}")
+
+                # Wait for next check
+                await asyncio.sleep(self.check_interval)
+        except asyncio.CancelledError:
+            self.logger.debug("Health monitoring loop cancelled")
+        except Exception as e:
+            self.logger.error(f"Health monitoring loop error: {e}")
+
+    def get_current_status(self) -> Optional[HealthCheckResult]:
+        """Get the most recent health check result."""
+        return self.last_check_result
+
+    def get_health_history(self, limit: Optional[int] = None) -> List[HealthCheckResult]:
+        """Get health check history.
+
+        Args:
+            limit: Maximum number of results to return
+
+        Returns:
+            List of health check results, newest first
+        """
+        history = list(self.health_history)
+        history.reverse()  # Newest first
+
+        if limit:
+            history = history[:limit]
+
+        return history
+
+    def get_aggregated_status(self, window_seconds: Optional[int] = None) -> Dict[str, Any]:
+        """Get aggregated health status over a time window.
+
+        Args:
+            window_seconds: Time window for aggregation (defaults to configured window)
+
+        Returns:
+            Dictionary with aggregated health statistics
+        """
+        window_seconds = window_seconds or self.aggregation_window
+        current_time = time.time()
+        cutoff_time = current_time - window_seconds
+
+        # Filter history to time window
+        recent_results = [
+            result for result in self.health_history
+            if result.timestamp >= cutoff_time
+        ]
+
+        if not recent_results:
+            return {
+                'period': 'no_data',
+                'window_seconds': window_seconds,
+                'checks_count': 0,
+                'overall_status': HealthStatus.UNKNOWN.value
+            }
+
+        # Aggregate statistics
+        status_counts = {status: 0 for status in HealthStatus}
+        total_metrics = 0
+        total_errors = 0
+        total_duration_ms = 0
+
+        for result in recent_results:
+            status_counts[result.overall_status] += 1
+            total_metrics += len(result.metrics)
+            total_errors += len(result.errors)
+            total_duration_ms += result.duration_ms
+
+        checks_count = len(recent_results)
+
+        # Determine aggregated status
+        if status_counts[HealthStatus.CRITICAL] > 0:
+            aggregated_status = HealthStatus.CRITICAL
+        elif status_counts[HealthStatus.WARNING] > checks_count * 0.3:
+            aggregated_status = HealthStatus.WARNING
+        elif status_counts[HealthStatus.UNKNOWN] > checks_count * 0.5:
+            aggregated_status = HealthStatus.UNKNOWN
+        else:
+            aggregated_status = HealthStatus.HEALTHY
+
+        return {
+            'period': f'last_{window_seconds}_seconds',
+            'window_seconds': window_seconds,
+            'checks_count': checks_count,
+            'overall_status': aggregated_status.value,
+            'status_distribution': {status.value: count for status, count in status_counts.items()},
+            'average_metrics_per_check': round(total_metrics / checks_count, 2) if checks_count > 0 else 0,
+            'total_errors': total_errors,
+            'average_duration_ms': round(total_duration_ms / checks_count, 2) if checks_count > 0 else 0,
+            'monitoring_stats': dict(self.monitoring_stats)
+        }
+
+    def export_diagnostics(self) -> Dict[str, Any]:
+        """Export comprehensive diagnostics information."""
+        return {
+            'monitor_info': {
+                'check_interval': self.check_interval,
+                'history_size': self.history_size,
+                'aggregation_window': self.aggregation_window,
+                'monitoring_active': self.monitoring,
+                'checkers_count': len(self.checkers),
+                'callbacks_count': len(self.health_callbacks)
+            },
+            'checkers': [checker.get_name() for checker in self.checkers],
+            'current_status': self.last_check_result.to_dict() if self.last_check_result else None,
+            'aggregated_status': self.get_aggregated_status(),
+            'monitoring_stats': dict(self.monitoring_stats),
+            'history_summary': {
+                'total_checks': len(self.health_history),
+                'oldest_check': self.health_history[0].timestamp if self.health_history else None,
+                'newest_check': self.health_history[-1].timestamp if self.health_history else None
+            }
+        }
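
For orientation, the sketch below wires up the new health monitor classes using only the public methods visible in the diff above. It is illustrative and not code shipped in the package: the import path is inferred from the file list, and the port number and service_stats values are placeholder assumptions.

# Illustrative sketch, not part of the package diff above. Assumes the module
# is importable as claude_mpm.services.health_monitor (path taken from the
# file list); the port and stats values are placeholder assumptions.
import asyncio
import os

from claude_mpm.services.health_monitor import (
    AdvancedHealthMonitor,
    HealthCheckResult,
    HealthStatus,
    NetworkConnectivityChecker,
    ProcessResourceChecker,
    ServiceHealthChecker,
)


async def main() -> None:
    # Stats dictionary with the keys ServiceHealthChecker reads; a real
    # Socket.IO server would keep these counters up to date itself.
    service_stats = {"clients_connected": 2, "events_processed": 120, "errors": 0}

    monitor = AdvancedHealthMonitor({"check_interval": 30, "history_size": 100})
    monitor.add_checker(ProcessResourceChecker(os.getpid(), cpu_threshold=80.0,
                                               memory_threshold_mb=500))
    monitor.add_checker(NetworkConnectivityChecker("localhost", 8765))  # assumed port
    monitor.add_checker(ServiceHealthChecker(service_stats))

    # Recovery integration point: callbacks receive every HealthCheckResult.
    def on_health(result: HealthCheckResult) -> None:
        if result.overall_status == HealthStatus.CRITICAL:
            print("critical health check:", result.errors)

    monitor.add_health_callback(on_health)

    # One-shot check; monitor.start_monitoring() would instead poll every
    # check_interval seconds inside a running event loop.
    result = await monitor.perform_health_check()
    print(result.to_dict()["overall_status"], f"{result.duration_ms:.1f}ms")


if __name__ == "__main__":
    asyncio.run(main())

The health_callbacks hook is presumably where the new recovery_manager.py attaches, but that wiring lives outside this file.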