claude-mpm 3.7.8-py3-none-any.whl → 3.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. claude_mpm/VERSION +1 -1
  2. claude_mpm/agents/BASE_PM.md +0 -106
  3. claude_mpm/agents/INSTRUCTIONS.md +0 -96
  4. claude_mpm/agents/MEMORY.md +94 -0
  5. claude_mpm/agents/WORKFLOW.md +86 -0
  6. claude_mpm/agents/templates/code_analyzer.json +2 -2
  7. claude_mpm/agents/templates/data_engineer.json +1 -1
  8. claude_mpm/agents/templates/documentation.json +1 -1
  9. claude_mpm/agents/templates/engineer.json +1 -1
  10. claude_mpm/agents/templates/ops.json +1 -1
  11. claude_mpm/agents/templates/qa.json +1 -1
  12. claude_mpm/agents/templates/research.json +1 -1
  13. claude_mpm/agents/templates/security.json +1 -1
  14. claude_mpm/agents/templates/ticketing.json +3 -8
  15. claude_mpm/agents/templates/version_control.json +1 -1
  16. claude_mpm/agents/templates/web_qa.json +2 -2
  17. claude_mpm/agents/templates/web_ui.json +2 -2
  18. claude_mpm/cli/__init__.py +2 -2
  19. claude_mpm/cli/commands/__init__.py +2 -1
  20. claude_mpm/cli/commands/agents.py +8 -3
  21. claude_mpm/cli/commands/tickets.py +596 -19
  22. claude_mpm/cli/parser.py +217 -5
  23. claude_mpm/config/__init__.py +30 -39
  24. claude_mpm/config/socketio_config.py +8 -5
  25. claude_mpm/constants.py +13 -0
  26. claude_mpm/core/__init__.py +8 -18
  27. claude_mpm/core/cache.py +596 -0
  28. claude_mpm/core/claude_runner.py +166 -622
  29. claude_mpm/core/config.py +7 -3
  30. claude_mpm/core/constants.py +339 -0
  31. claude_mpm/core/container.py +548 -38
  32. claude_mpm/core/exceptions.py +392 -0
  33. claude_mpm/core/framework_loader.py +249 -93
  34. claude_mpm/core/interactive_session.py +479 -0
  35. claude_mpm/core/interfaces.py +424 -0
  36. claude_mpm/core/lazy.py +467 -0
  37. claude_mpm/core/logging_config.py +444 -0
  38. claude_mpm/core/oneshot_session.py +465 -0
  39. claude_mpm/core/optimized_agent_loader.py +485 -0
  40. claude_mpm/core/optimized_startup.py +490 -0
  41. claude_mpm/core/service_registry.py +52 -26
  42. claude_mpm/core/socketio_pool.py +162 -5
  43. claude_mpm/core/types.py +292 -0
  44. claude_mpm/core/typing_utils.py +477 -0
  45. claude_mpm/hooks/claude_hooks/hook_handler.py +213 -99
  46. claude_mpm/init.py +2 -1
  47. claude_mpm/services/__init__.py +78 -14
  48. claude_mpm/services/agent/__init__.py +24 -0
  49. claude_mpm/services/agent/deployment.py +2548 -0
  50. claude_mpm/services/agent/management.py +598 -0
  51. claude_mpm/services/agent/registry.py +813 -0
  52. claude_mpm/services/agents/deployment/agent_deployment.py +728 -308
  53. claude_mpm/services/agents/memory/agent_memory_manager.py +160 -4
  54. claude_mpm/services/async_session_logger.py +8 -3
  55. claude_mpm/services/communication/__init__.py +21 -0
  56. claude_mpm/services/communication/socketio.py +1933 -0
  57. claude_mpm/services/communication/websocket.py +479 -0
  58. claude_mpm/services/core/__init__.py +123 -0
  59. claude_mpm/services/core/base.py +247 -0
  60. claude_mpm/services/core/interfaces.py +951 -0
  61. claude_mpm/services/framework_claude_md_generator/__init__.py +10 -3
  62. claude_mpm/services/framework_claude_md_generator/deployment_manager.py +14 -11
  63. claude_mpm/services/framework_claude_md_generator/section_generators/todo_task_tools.py +23 -23
  64. claude_mpm/services/framework_claude_md_generator.py +3 -2
  65. claude_mpm/services/health_monitor.py +4 -3
  66. claude_mpm/services/hook_service.py +64 -4
  67. claude_mpm/services/infrastructure/__init__.py +21 -0
  68. claude_mpm/services/infrastructure/logging.py +202 -0
  69. claude_mpm/services/infrastructure/monitoring.py +893 -0
  70. claude_mpm/services/memory/indexed_memory.py +648 -0
  71. claude_mpm/services/project/__init__.py +21 -0
  72. claude_mpm/services/project/analyzer.py +864 -0
  73. claude_mpm/services/project/registry.py +608 -0
  74. claude_mpm/services/project_analyzer.py +95 -2
  75. claude_mpm/services/recovery_manager.py +15 -9
  76. claude_mpm/services/response_tracker.py +3 -5
  77. claude_mpm/services/socketio/__init__.py +25 -0
  78. claude_mpm/services/socketio/handlers/__init__.py +25 -0
  79. claude_mpm/services/socketio/handlers/base.py +121 -0
  80. claude_mpm/services/socketio/handlers/connection.py +198 -0
  81. claude_mpm/services/socketio/handlers/file.py +213 -0
  82. claude_mpm/services/socketio/handlers/git.py +723 -0
  83. claude_mpm/services/socketio/handlers/memory.py +27 -0
  84. claude_mpm/services/socketio/handlers/project.py +25 -0
  85. claude_mpm/services/socketio/handlers/registry.py +145 -0
  86. claude_mpm/services/socketio_client_manager.py +12 -7
  87. claude_mpm/services/socketio_server.py +156 -30
  88. claude_mpm/services/ticket_manager.py +172 -9
  89. claude_mpm/services/ticket_manager_di.py +1 -1
  90. claude_mpm/services/version_control/semantic_versioning.py +80 -7
  91. claude_mpm/services/version_control/version_parser.py +528 -0
  92. claude_mpm/utils/error_handler.py +1 -1
  93. claude_mpm/validation/agent_validator.py +27 -14
  94. claude_mpm/validation/frontmatter_validator.py +231 -0
  95. {claude_mpm-3.7.8.dist-info → claude_mpm-3.9.0.dist-info}/METADATA +38 -128
  96. {claude_mpm-3.7.8.dist-info → claude_mpm-3.9.0.dist-info}/RECORD +100 -59
  97. {claude_mpm-3.7.8.dist-info → claude_mpm-3.9.0.dist-info}/WHEEL +0 -0
  98. {claude_mpm-3.7.8.dist-info → claude_mpm-3.9.0.dist-info}/entry_points.txt +0 -0
  99. {claude_mpm-3.7.8.dist-info → claude_mpm-3.9.0.dist-info}/licenses/LICENSE +0 -0
  100. {claude_mpm-3.7.8.dist-info → claude_mpm-3.9.0.dist-info}/top_level.txt +0 -0
claude_mpm/services/infrastructure/monitoring.py
@@ -0,0 +1,893 @@
+ """Advanced health monitoring system for claude-mpm Socket.IO server.
+
+ This module provides comprehensive health checking capabilities including:
+ - Process resource monitoring (CPU, memory, file descriptors)
+ - Service-specific health markers
+ - Configurable thresholds and intervals
+ - Health status aggregation and history
+ - Integration with automatic recovery mechanisms
+
+ Design Principles:
+ - Minimal performance impact through efficient polling
+ - Extensible metric collection system
+ - Circuit breaker integration for failure detection
+ - Comprehensive logging for debugging and diagnostics
+ """
+
+ import asyncio
+ import logging
+ import time
+ import threading
+ from abc import ABC, abstractmethod
+ from collections import deque
+ from dataclasses import dataclass, asdict
+ from datetime import datetime, timezone
+ from enum import Enum
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Callable, Union
+ import json
+ import socket
+ from claude_mpm.core.constants import ResourceLimits, TimeoutConfig
+
+ try:
+     import psutil
+     PSUTIL_AVAILABLE = True
+ except ImportError:
+     PSUTIL_AVAILABLE = False
+     psutil = None
+
+
+ class HealthStatus(Enum):
+     """Health status levels for monitoring."""
+     HEALTHY = "healthy"
+     WARNING = "warning"
+     CRITICAL = "critical"
+     UNKNOWN = "unknown"
+
+
+ @dataclass
+ class HealthMetric:
+     """Individual health metric data structure."""
+     name: str
+     value: Union[int, float, str, bool]
+     status: HealthStatus
+     threshold: Optional[Union[int, float]] = None
+     unit: Optional[str] = None
+     timestamp: float = None
+     message: Optional[str] = None
+
+     def __post_init__(self):
+         if self.timestamp is None:
+             self.timestamp = time.time()
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert metric to dictionary format."""
+         result = asdict(self)
+         result['status'] = self.status.value
+         result['timestamp_iso'] = datetime.fromtimestamp(self.timestamp, timezone.utc).isoformat()
+         return result
+
+
+ @dataclass
+ class HealthCheckResult:
+     """Result of a health check operation."""
+     overall_status: HealthStatus
+     metrics: List[HealthMetric]
+     timestamp: float
+     duration_ms: float
+     errors: List[str]
+
+     def __post_init__(self):
+         if not hasattr(self, 'timestamp') or self.timestamp is None:
+             self.timestamp = time.time()
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert health check result to dictionary format."""
+         return {
+             'overall_status': self.overall_status.value,
+             'metrics': [metric.to_dict() for metric in self.metrics],
+             'timestamp': self.timestamp,
+             'timestamp_iso': datetime.fromtimestamp(self.timestamp, timezone.utc).isoformat(),
+             'duration_ms': self.duration_ms,
+             'errors': self.errors,
+             'metric_count': len(self.metrics),
+             'healthy_metrics': len([m for m in self.metrics if m.status == HealthStatus.HEALTHY]),
+             'warning_metrics': len([m for m in self.metrics if m.status == HealthStatus.WARNING]),
+             'critical_metrics': len([m for m in self.metrics if m.status == HealthStatus.CRITICAL])
+         }
+
+
+ class HealthChecker(ABC):
+     """Abstract base class for health checkers.
+
+     Health checkers implement specific monitoring logic for different aspects
+     of the system (process resources, network connectivity, service health, etc.).
+     """
+
+     @abstractmethod
+     def get_name(self) -> str:
+         """Get the name of this health checker."""
+         pass
+
+     @abstractmethod
+     async def check_health(self) -> List[HealthMetric]:
+         """Perform health check and return metrics."""
+         pass
+
+
+ class ProcessResourceChecker(HealthChecker):
+     """Health checker for process resource usage.
+
+     Monitors:
+     - CPU usage percentage
+     - Memory usage (RSS, VMS)
+     - File descriptor count
+     - Thread count
+     - Process status
+     """
+
+     def __init__(self, pid: int, cpu_threshold: float = 80.0,
+                  memory_threshold_mb: int = 500, fd_threshold: int = 1000):
+         """Initialize process resource checker.
+
+         Args:
+             pid: Process ID to monitor
+             cpu_threshold: CPU usage threshold as percentage
+             memory_threshold_mb: Memory usage threshold in MB
+             fd_threshold: File descriptor count threshold
+         """
+         self.pid = pid
+         self.cpu_threshold = cpu_threshold
+         self.memory_threshold_mb = memory_threshold_mb
+         self.fd_threshold = fd_threshold
+         self.process = None
+         self.logger = logging.getLogger(f"{__name__}.ProcessResourceChecker")
+
+         if PSUTIL_AVAILABLE:
+             try:
+                 self.process = psutil.Process(pid)
+             except psutil.NoSuchProcess:
+                 self.logger.warning(f"Process {pid} not found for monitoring")
+
+     def get_name(self) -> str:
+         return f"process_resources_{self.pid}"
+
+     async def check_health(self) -> List[HealthMetric]:
+         """Check process resource usage."""
+         metrics = []
+
+         if not PSUTIL_AVAILABLE:
+             metrics.append(HealthMetric(
+                 name="psutil_availability",
+                 value=False,
+                 status=HealthStatus.WARNING,
+                 message="psutil not available for enhanced monitoring"
+             ))
+             return metrics
+
+         if not self.process:
+             metrics.append(HealthMetric(
+                 name="process_exists",
+                 value=False,
+                 status=HealthStatus.CRITICAL,
+                 message=f"Process {self.pid} not found"
+             ))
+             return metrics
+
+         try:
+             # Check if process still exists
+             if not self.process.is_running():
+                 metrics.append(HealthMetric(
+                     name="process_exists",
+                     value=False,
+                     status=HealthStatus.CRITICAL,
+                     message=f"Process {self.pid} is no longer running"
+                 ))
+                 return metrics
+
+             # Process status
+             status = self.process.status()
+             process_healthy = status not in [psutil.STATUS_ZOMBIE, psutil.STATUS_DEAD, psutil.STATUS_STOPPED]
+             metrics.append(HealthMetric(
+                 name="process_status",
+                 value=status,
+                 status=HealthStatus.HEALTHY if process_healthy else HealthStatus.CRITICAL,
+                 message=f"Process status: {status}"
+             ))
+
+             # CPU usage
+             try:
+                 cpu_percent = self.process.cpu_percent(interval=TimeoutConfig.CPU_SAMPLE_INTERVAL)
+                 cpu_status = HealthStatus.HEALTHY
+                 if cpu_percent > self.cpu_threshold:
+                     cpu_status = HealthStatus.WARNING if cpu_percent < self.cpu_threshold * 1.2 else HealthStatus.CRITICAL
+
+                 metrics.append(HealthMetric(
+                     name="cpu_usage_percent",
+                     value=round(cpu_percent, 2),
+                     status=cpu_status,
+                     threshold=self.cpu_threshold,
+                     unit="%"
+                 ))
+             except Exception as e:
+                 metrics.append(HealthMetric(
+                     name="cpu_usage_percent",
+                     value=-1,
+                     status=HealthStatus.UNKNOWN,
+                     message=f"Failed to get CPU usage: {e}"
+                 ))
+
+             # Memory usage
+             try:
+                 memory_info = self.process.memory_info()
+                 memory_mb = memory_info.rss / ResourceLimits.BYTES_TO_MB
+                 memory_status = HealthStatus.HEALTHY
+                 if memory_mb > self.memory_threshold_mb:
+                     memory_status = HealthStatus.WARNING if memory_mb < self.memory_threshold_mb * 1.2 else HealthStatus.CRITICAL
+
+                 metrics.append(HealthMetric(
+                     name="memory_usage_mb",
+                     value=round(memory_mb, 2),
+                     status=memory_status,
+                     threshold=self.memory_threshold_mb,
+                     unit="MB"
+                 ))
+
+                 metrics.append(HealthMetric(
+                     name="memory_vms_mb",
+                     value=round(memory_info.vms / ResourceLimits.BYTES_TO_MB, 2),
+                     status=HealthStatus.HEALTHY,
+                     unit="MB"
+                 ))
+             except Exception as e:
+                 metrics.append(HealthMetric(
+                     name="memory_usage_mb",
+                     value=-1,
+                     status=HealthStatus.UNKNOWN,
+                     message=f"Failed to get memory usage: {e}"
+                 ))
+
+             # File descriptors (Unix only)
+             if hasattr(self.process, 'num_fds'):
+                 try:
+                     fd_count = self.process.num_fds()
+                     fd_status = HealthStatus.HEALTHY
+                     if fd_count > self.fd_threshold:
+                         fd_status = HealthStatus.WARNING if fd_count < self.fd_threshold * 1.2 else HealthStatus.CRITICAL
+
+                     metrics.append(HealthMetric(
+                         name="file_descriptors",
+                         value=fd_count,
+                         status=fd_status,
+                         threshold=self.fd_threshold
+                     ))
+                 except Exception as e:
+                     metrics.append(HealthMetric(
+                         name="file_descriptors",
+                         value=-1,
+                         status=HealthStatus.UNKNOWN,
+                         message=f"Failed to get file descriptor count: {e}"
+                     ))
+
+             # Thread count
+             try:
+                 thread_count = self.process.num_threads()
+                 metrics.append(HealthMetric(
+                     name="thread_count",
+                     value=thread_count,
+                     status=HealthStatus.HEALTHY,
+                 ))
+             except Exception as e:
+                 metrics.append(HealthMetric(
+                     name="thread_count",
+                     value=-1,
+                     status=HealthStatus.UNKNOWN,
+                     message=f"Failed to get thread count: {e}"
+                 ))
+
+             # Process create time (for validation)
+             try:
+                 create_time = self.process.create_time()
+                 metrics.append(HealthMetric(
+                     name="process_start_time",
+                     value=create_time,
+                     status=HealthStatus.HEALTHY,
+                     unit="timestamp"
+                 ))
+             except Exception as e:
+                 metrics.append(HealthMetric(
+                     name="process_start_time",
+                     value=-1,
+                     status=HealthStatus.UNKNOWN,
+                     message=f"Failed to get process start time: {e}"
+                 ))
+
+         except psutil.NoSuchProcess:
+             metrics.append(HealthMetric(
+                 name="process_exists",
+                 value=False,
+                 status=HealthStatus.CRITICAL,
+                 message=f"Process {self.pid} no longer exists"
+             ))
+         except Exception as e:
+             self.logger.error(f"Error checking process health: {e}")
+             metrics.append(HealthMetric(
+                 name="process_check_error",
+                 value=str(e),
+                 status=HealthStatus.UNKNOWN,
+                 message=f"Unexpected error during process health check: {e}"
+             ))
+
+         return metrics
+
+
+ class NetworkConnectivityChecker(HealthChecker):
+     """Health checker for network connectivity.
+
+     Monitors:
+     - Port availability and binding status
+     - Socket connection health
+     - Network interface status
+     """
+
+     def __init__(self, host: str, port: int, timeout: float = 1.0):
+         """Initialize network connectivity checker.
+
+         Args:
+             host: Host address to check
+             port: Port number to check
+             timeout: Connection timeout in seconds
+         """
+         self.host = host
+         self.port = port
+         self.timeout = timeout
+         self.logger = logging.getLogger(f"{__name__}.NetworkConnectivityChecker")
+
+     def get_name(self) -> str:
+         return f"network_connectivity_{self.host}_{self.port}"
+
+     async def check_health(self) -> List[HealthMetric]:
+         """Check network connectivity."""
+         metrics = []
+
+         # Check port binding
+         try:
+             # Try to connect to the port
+             sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+             sock.settimeout(self.timeout)
+             result = sock.connect_ex((self.host, self.port))
+             sock.close()
+
+             if result == 0:
+                 metrics.append(HealthMetric(
+                     name="port_accessible",
+                     value=True,
+                     status=HealthStatus.HEALTHY,
+                     message=f"Port {self.port} is accessible on {self.host}"
+                 ))
+             else:
+                 metrics.append(HealthMetric(
+                     name="port_accessible",
+                     value=False,
+                     status=HealthStatus.CRITICAL,
+                     message=f"Port {self.port} is not accessible on {self.host}"
+                 ))
+         except Exception as e:
+             metrics.append(HealthMetric(
+                 name="port_accessible",
+                 value=False,
+                 status=HealthStatus.UNKNOWN,
+                 message=f"Error checking port accessibility: {e}"
+             ))
+
+         # Check if we can create a socket (resource availability)
+         try:
+             test_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+             test_sock.close()
+             metrics.append(HealthMetric(
+                 name="socket_creation",
+                 value=True,
+                 status=HealthStatus.HEALTHY,
+                 message="Socket creation successful"
+             ))
+         except Exception as e:
+             metrics.append(HealthMetric(
+                 name="socket_creation",
+                 value=False,
+                 status=HealthStatus.CRITICAL,
+                 message=f"Failed to create socket: {e}"
+             ))
+
+         return metrics
+
+
+ class ServiceHealthChecker(HealthChecker):
+     """Health checker for service-specific metrics.
+
+     Monitors:
+     - Connected clients count
+     - Event processing rate
+     - Error rates
+     - Response times
+     """
+
+     def __init__(self, service_stats: Dict[str, Any],
+                  max_clients: int = 1000, max_error_rate: float = 0.1):
+         """Initialize service health checker.
+
+         Args:
+             service_stats: Reference to service statistics dictionary
+             max_clients: Maximum allowed connected clients
+             max_error_rate: Maximum allowed error rate (0.0-1.0)
+         """
+         self.service_stats = service_stats
+         self.max_clients = max_clients
+         self.max_error_rate = max_error_rate
+         self.last_check_time = time.time()
+         self.last_events_processed = 0
+         self.logger = logging.getLogger(f"{__name__}.ServiceHealthChecker")
+
+     def get_name(self) -> str:
+         return "service_health"
+
+     async def check_health(self) -> List[HealthMetric]:
+         """Check service-specific health metrics."""
+         metrics = []
+         current_time = time.time()
+
+         # Connected clients
+         try:
+             client_count = self.service_stats.get("clients_connected", 0)
+             client_status = HealthStatus.HEALTHY
+             if client_count > self.max_clients * 0.8:
+                 client_status = HealthStatus.WARNING
+             if client_count > self.max_clients:
+                 client_status = HealthStatus.CRITICAL
+
+             metrics.append(HealthMetric(
+                 name="connected_clients",
+                 value=client_count,
+                 status=client_status,
+                 threshold=self.max_clients
+             ))
+         except Exception as e:
+             metrics.append(HealthMetric(
+                 name="connected_clients",
+                 value=-1,
+                 status=HealthStatus.UNKNOWN,
+                 message=f"Failed to get client count: {e}"
+             ))
+
+         # Event processing rate
+         try:
+             events_processed = self.service_stats.get("events_processed", 0)
+             time_diff = current_time - self.last_check_time
+
+             if time_diff > 0 and self.last_events_processed > 0:
+                 event_rate = (events_processed - self.last_events_processed) / time_diff
+                 metrics.append(HealthMetric(
+                     name="event_processing_rate",
+                     value=round(event_rate, 2),
+                     status=HealthStatus.HEALTHY,
+                     unit="events/sec"
+                 ))
+
+             self.last_events_processed = events_processed
+
+             # Total events processed
+             metrics.append(HealthMetric(
+                 name="total_events_processed",
+                 value=events_processed,
+                 status=HealthStatus.HEALTHY
+             ))
+         except Exception as e:
+             metrics.append(HealthMetric(
+                 name="event_processing_rate",
+                 value=-1,
+                 status=HealthStatus.UNKNOWN,
+                 message=f"Failed to calculate event rate: {e}"
+             ))
+
+         # Error rate
+         try:
+             errors = self.service_stats.get("errors", 0)
+             total_events = self.service_stats.get("events_processed", 1)  # Avoid division by zero
+             error_rate = errors / max(total_events, 1)
+
+             error_status = HealthStatus.HEALTHY
+             if error_rate > self.max_error_rate * 0.5:
+                 error_status = HealthStatus.WARNING
+             if error_rate > self.max_error_rate:
+                 error_status = HealthStatus.CRITICAL
+
+             metrics.append(HealthMetric(
+                 name="error_rate",
+                 value=round(error_rate, 4),
+                 status=error_status,
+                 threshold=self.max_error_rate,
+                 unit="ratio"
+             ))
+
+             metrics.append(HealthMetric(
+                 name="total_errors",
+                 value=errors,
+                 status=HealthStatus.HEALTHY if errors == 0 else HealthStatus.WARNING
+             ))
+         except Exception as e:
+             metrics.append(HealthMetric(
+                 name="error_rate",
+                 value=-1,
+                 status=HealthStatus.UNKNOWN,
+                 message=f"Failed to calculate error rate: {e}"
+             ))
+
+         # Last activity timestamp
+         try:
+             last_activity = self.service_stats.get("last_activity")
+             if last_activity:
+                 # Parse ISO timestamp or use as-is if numeric
+                 if isinstance(last_activity, str):
+                     try:
+                         from dateutil.parser import parse
+                         last_activity_dt = parse(last_activity)
+                         last_activity_timestamp = last_activity_dt.timestamp()
+                     except ImportError:
+                         # Fallback: try to parse ISO format manually
+                         try:
+                             from datetime import datetime
+                             clean_timestamp = last_activity.rstrip('Z')
+                             last_activity_dt = datetime.fromisoformat(clean_timestamp.replace('T', ' '))
+                             last_activity_timestamp = last_activity_dt.timestamp()
+                         except Exception:
+                             # Final fallback: treat as current time
+                             last_activity_timestamp = current_time
+                 else:
+                     last_activity_timestamp = float(last_activity)
+
+                 time_since_activity = current_time - last_activity_timestamp
+                 activity_status = HealthStatus.HEALTHY
+                 if time_since_activity > 300:  # 5 minutes
+                     activity_status = HealthStatus.WARNING
+                 if time_since_activity > 1800:  # 30 minutes
+                     activity_status = HealthStatus.CRITICAL
+
+                 metrics.append(HealthMetric(
+                     name="time_since_last_activity",
+                     value=round(time_since_activity, 2),
+                     status=activity_status,
+                     unit="seconds"
+                 ))
+             else:
+                 metrics.append(HealthMetric(
+                     name="time_since_last_activity",
+                     value=-1,
+                     status=HealthStatus.WARNING,
+                     message="No last activity recorded"
+                 ))
+         except Exception as e:
+             metrics.append(HealthMetric(
+                 name="time_since_last_activity",
+                 value=-1,
+                 status=HealthStatus.UNKNOWN,
+                 message=f"Failed to parse last activity: {e}"
+             ))
+
+         self.last_check_time = current_time
+         return metrics
+
+
+ class AdvancedHealthMonitor:
+     """Advanced health monitoring system with configurable checks and thresholds.
+
+     Provides comprehensive health monitoring including:
+     - Multiple health checker integration
+     - Configurable check intervals and thresholds
+     - Health history tracking
+     - Status aggregation and reporting
+     - Integration with recovery systems
+     """
+
+     def __init__(self, config: Optional[Dict[str, Any]] = None):
+         """Initialize advanced health monitor.
+
+         Args:
+             config: Configuration dictionary for health monitoring
+         """
+         self.config = config or {}
+         self.logger = logging.getLogger(f"{__name__}.AdvancedHealthMonitor")
+
+         # Configuration with defaults
+         self.check_interval = self.config.get('check_interval', 30)
+         self.history_size = self.config.get('history_size', 100)
+         self.aggregation_window = self.config.get('aggregation_window', 300)  # 5 minutes
+
+         # Health checkers
+         self.checkers: List[HealthChecker] = []
+
+         # Health history
+         self.health_history: deque = deque(maxlen=self.history_size)
+
+         # Monitoring state
+         self.monitoring = False
+         self.monitor_task: Optional[asyncio.Task] = None
+         self.last_check_result: Optional[HealthCheckResult] = None
+
+         # Health callbacks for recovery integration
+         self.health_callbacks: List[Callable[[HealthCheckResult], None]] = []
+
+         # Initialize metrics
+         self.monitoring_stats = {
+             'checks_performed': 0,
+             'checks_failed': 0,
+             'average_check_duration_ms': 0,
+             'last_check_timestamp': None
+         }
+
+         self.logger.info("Advanced health monitor initialized")
+
+     def add_checker(self, checker: HealthChecker) -> None:
+         """Add a health checker to the monitoring system."""
+         self.checkers.append(checker)
+         self.logger.info(f"Added health checker: {checker.get_name()}")
+
+     def add_health_callback(self, callback: Callable[[HealthCheckResult], None]) -> None:
+         """Add a callback to be called when health checks complete.
+
+         Args:
+             callback: Function to call with HealthCheckResult
+         """
+         self.health_callbacks.append(callback)
+         self.logger.debug(f"Added health callback: {callback.__name__}")
+
+     async def perform_health_check(self) -> HealthCheckResult:
+         """Perform comprehensive health check using all registered checkers."""
+         start_time = time.time()
+         all_metrics = []
+         errors = []
+
+         # Run all health checkers
+         for checker in self.checkers:
+             try:
+                 checker_start = time.time()
+                 metrics = await checker.check_health()
+                 checker_duration = (time.time() - checker_start) * 1000
+
+                 all_metrics.extend(metrics)
+                 self.logger.debug(f"Health checker {checker.get_name()} completed in {checker_duration:.2f}ms")
+
+             except Exception as e:
+                 error_msg = f"Health checker {checker.get_name()} failed: {e}"
+                 errors.append(error_msg)
+                 self.logger.error(error_msg)
+
+                 # Add error metric
+                 all_metrics.append(HealthMetric(
+                     name=f"{checker.get_name()}_error",
+                     value=str(e),
+                     status=HealthStatus.UNKNOWN,
+                     message=error_msg
+                 ))
+
+         # Determine overall status
+         overall_status = self._determine_overall_status(all_metrics)
+
+         # Create result
+         duration_ms = (time.time() - start_time) * 1000
+         result = HealthCheckResult(
+             overall_status=overall_status,
+             metrics=all_metrics,
+             timestamp=start_time,
+             duration_ms=duration_ms,
+             errors=errors
+         )
+
+         # Update statistics
+         self.monitoring_stats['checks_performed'] += 1
+         if errors:
+             self.monitoring_stats['checks_failed'] += 1
+
+         # Update average duration
+         current_avg = self.monitoring_stats['average_check_duration_ms']
+         checks_count = self.monitoring_stats['checks_performed']
+         self.monitoring_stats['average_check_duration_ms'] = (
+             (current_avg * (checks_count - 1) + duration_ms) / checks_count
+         )
+         self.monitoring_stats['last_check_timestamp'] = time.time()
+
+         # Store in history
+         self.health_history.append(result)
+         self.last_check_result = result
+
+         # Notify callbacks
+         for callback in self.health_callbacks:
+             try:
+                 callback(result)
+             except Exception as e:
+                 self.logger.error(f"Health callback {callback.__name__} failed: {e}")
+
+         self.logger.debug(f"Health check completed: {overall_status.value} "
+                           f"({len(all_metrics)} metrics, {len(errors)} errors, "
+                           f"{duration_ms:.2f}ms)")
+
+         return result
+
+     def _determine_overall_status(self, metrics: List[HealthMetric]) -> HealthStatus:
+         """Determine overall health status from individual metrics."""
+         if not metrics:
+             return HealthStatus.UNKNOWN
+
+         # Count metrics by status
+         status_counts = {status: 0 for status in HealthStatus}
+         for metric in metrics:
+             status_counts[metric.status] += 1
+
+         # Determine overall status based on counts
+         total_metrics = len(metrics)
+
+         # If any critical metrics, overall is critical
+         if status_counts[HealthStatus.CRITICAL] > 0:
+             return HealthStatus.CRITICAL
+
+         # If more than 30% warning metrics, overall is warning
+         warning_ratio = status_counts[HealthStatus.WARNING] / total_metrics
+         if warning_ratio > 0.3:
+             return HealthStatus.WARNING
+
+         # If any warning metrics but less than 30%, still healthy
+         if status_counts[HealthStatus.WARNING] > 0:
+             return HealthStatus.HEALTHY
+
+         # If any unknown metrics, overall is unknown
+         if status_counts[HealthStatus.UNKNOWN] > 0:
+             return HealthStatus.UNKNOWN
+
+         # All metrics healthy
+         return HealthStatus.HEALTHY
+
+     def start_monitoring(self) -> None:
+         """Start continuous health monitoring."""
+         if self.monitoring:
+             self.logger.warning("Health monitoring is already running")
+             return
+
+         self.monitoring = True
+         self.monitor_task = asyncio.create_task(self._monitoring_loop())
+         self.logger.info(f"Started health monitoring with {self.check_interval}s interval")
+
+     async def stop_monitoring(self) -> None:
+         """Stop continuous health monitoring."""
+         if not self.monitoring:
+             return
+
+         self.monitoring = False
+         if self.monitor_task:
+             self.monitor_task.cancel()
+             try:
+                 await self.monitor_task
+             except asyncio.CancelledError:
+                 pass
+             self.monitor_task = None
+
+         self.logger.info("Stopped health monitoring")
+
+     async def _monitoring_loop(self) -> None:
+         """Continuous health monitoring loop."""
+         try:
+             while self.monitoring:
+                 try:
+                     await self.perform_health_check()
+                 except Exception as e:
+                     self.logger.error(f"Error during health check: {e}")
+
+                 # Wait for next check
+                 await asyncio.sleep(self.check_interval)
+         except asyncio.CancelledError:
+             self.logger.debug("Health monitoring loop cancelled")
+         except Exception as e:
+             self.logger.error(f"Health monitoring loop error: {e}")
+
+     def get_current_status(self) -> Optional[HealthCheckResult]:
+         """Get the most recent health check result."""
+         return self.last_check_result
+
+     def get_health_history(self, limit: Optional[int] = None) -> List[HealthCheckResult]:
+         """Get health check history.
+
+         Args:
+             limit: Maximum number of results to return
+
+         Returns:
+             List of health check results, newest first
+         """
+         history = list(self.health_history)
+         history.reverse()  # Newest first
+
+         if limit:
+             history = history[:limit]
+
+         return history
+
+     def get_aggregated_status(self, window_seconds: Optional[int] = None) -> Dict[str, Any]:
+         """Get aggregated health status over a time window.
+
+         Args:
+             window_seconds: Time window for aggregation (defaults to configured window)
+
+         Returns:
+             Dictionary with aggregated health statistics
+         """
+         window_seconds = window_seconds or self.aggregation_window
+         current_time = time.time()
+         cutoff_time = current_time - window_seconds
+
+         # Filter history to time window
+         recent_results = [
+             result for result in self.health_history
+             if result.timestamp >= cutoff_time
+         ]
+
+         if not recent_results:
+             return {
+                 'period': 'no_data',
+                 'window_seconds': window_seconds,
+                 'checks_count': 0,
+                 'overall_status': HealthStatus.UNKNOWN.value
+             }
+
+         # Aggregate statistics
+         status_counts = {status: 0 for status in HealthStatus}
+         total_metrics = 0
+         total_errors = 0
+         total_duration_ms = 0
+
+         for result in recent_results:
+             status_counts[result.overall_status] += 1
+             total_metrics += len(result.metrics)
+             total_errors += len(result.errors)
+             total_duration_ms += result.duration_ms
+
+         checks_count = len(recent_results)
+
+         # Determine aggregated status
+         if status_counts[HealthStatus.CRITICAL] > 0:
+             aggregated_status = HealthStatus.CRITICAL
+         elif status_counts[HealthStatus.WARNING] > checks_count * 0.3:
+             aggregated_status = HealthStatus.WARNING
+         elif status_counts[HealthStatus.UNKNOWN] > checks_count * 0.5:
+             aggregated_status = HealthStatus.UNKNOWN
+         else:
+             aggregated_status = HealthStatus.HEALTHY
+
+         return {
+             'period': f'last_{window_seconds}_seconds',
+             'window_seconds': window_seconds,
+             'checks_count': checks_count,
+             'overall_status': aggregated_status.value,
+             'status_distribution': {status.value: count for status, count in status_counts.items()},
+             'average_metrics_per_check': round(total_metrics / checks_count, 2) if checks_count > 0 else 0,
+             'total_errors': total_errors,
+             'average_duration_ms': round(total_duration_ms / checks_count, 2) if checks_count > 0 else 0,
+             'monitoring_stats': dict(self.monitoring_stats)
+         }
+
+     def export_diagnostics(self) -> Dict[str, Any]:
+         """Export comprehensive diagnostics information."""
+         return {
+             'monitor_info': {
+                 'check_interval': self.check_interval,
+                 'history_size': self.history_size,
+                 'aggregation_window': self.aggregation_window,
+                 'monitoring_active': self.monitoring,
+                 'checkers_count': len(self.checkers),
+                 'callbacks_count': len(self.health_callbacks)
+             },
+             'checkers': [checker.get_name() for checker in self.checkers],
+             'current_status': self.last_check_result.to_dict() if self.last_check_result else None,
+             'aggregated_status': self.get_aggregated_status(),
+             'monitoring_stats': dict(self.monitoring_stats),
+             'history_summary': {
+                 'total_checks': len(self.health_history),
+                 'oldest_check': self.health_history[0].timestamp if self.health_history else None,
+                 'newest_check': self.health_history[-1].timestamp if self.health_history else None
+             }
+         }
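
For orientation only (not part of the diff): a minimal sketch of how the new health-monitoring API introduced above might be wired up, assuming this hunk is the new claude_mpm/services/infrastructure/monitoring.py listed in the file table. The host, port, thresholds, and the contents of service_stats are illustrative placeholders rather than values taken from the package.

import asyncio
import os

from claude_mpm.services.infrastructure.monitoring import (
    AdvancedHealthMonitor,
    HealthCheckResult,
    HealthStatus,
    NetworkConnectivityChecker,
    ProcessResourceChecker,
    ServiceHealthChecker,
)


async def main() -> None:
    # Placeholder stats dict; in the real server the Socket.IO service would
    # own and update this as clients connect and events are processed.
    service_stats = {"clients_connected": 0, "events_processed": 0, "errors": 0}

    monitor = AdvancedHealthMonitor({"check_interval": 10, "history_size": 50})
    monitor.add_checker(ProcessResourceChecker(os.getpid(), cpu_threshold=80.0))
    monitor.add_checker(NetworkConnectivityChecker("127.0.0.1", 8765))  # assumed host/port
    monitor.add_checker(ServiceHealthChecker(service_stats))

    def on_health(result: HealthCheckResult) -> None:
        # Recovery seam: a real deployment would hand critical results to the
        # recovery machinery; here we only print the failing metric names.
        if result.overall_status == HealthStatus.CRITICAL:
            print("critical:", [m.name for m in result.metrics
                                if m.status == HealthStatus.CRITICAL])

    monitor.add_health_callback(on_health)

    # start_monitoring() schedules the polling loop on the running event loop.
    monitor.start_monitoring()
    await asyncio.sleep(30)
    await monitor.stop_monitoring()
    print(monitor.get_aggregated_status(window_seconds=60))


if __name__ == "__main__":
    asyncio.run(main())

The callback hook is the module's intended seam for automatic recovery (the file table also shows claude_mpm/services/recovery_manager.py changing in this release); the print above is just a stand-in for that integration.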