claude-mpm 4.1.4__py3-none-any.whl → 4.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. claude_mpm/VERSION +1 -1
  2. claude_mpm/agents/templates/research.json +39 -13
  3. claude_mpm/cli/__init__.py +2 -0
  4. claude_mpm/cli/commands/__init__.py +2 -0
  5. claude_mpm/cli/commands/configure.py +1221 -0
  6. claude_mpm/cli/commands/configure_tui.py +1921 -0
  7. claude_mpm/cli/commands/tickets.py +365 -784
  8. claude_mpm/cli/parsers/base_parser.py +7 -0
  9. claude_mpm/cli/parsers/configure_parser.py +119 -0
  10. claude_mpm/cli/startup_logging.py +39 -12
  11. claude_mpm/constants.py +1 -0
  12. claude_mpm/core/output_style_manager.py +24 -0
  13. claude_mpm/core/socketio_pool.py +35 -3
  14. claude_mpm/core/unified_agent_registry.py +46 -15
  15. claude_mpm/dashboard/static/css/connection-status.css +370 -0
  16. claude_mpm/dashboard/static/js/components/connection-debug.js +654 -0
  17. claude_mpm/dashboard/static/js/connection-manager.js +536 -0
  18. claude_mpm/dashboard/templates/index.html +11 -0
  19. claude_mpm/hooks/claude_hooks/services/__init__.py +3 -1
  20. claude_mpm/hooks/claude_hooks/services/connection_manager_http.py +190 -0
  21. claude_mpm/services/agents/deployment/agent_discovery_service.py +12 -3
  22. claude_mpm/services/agents/deployment/agent_lifecycle_manager.py +172 -233
  23. claude_mpm/services/agents/deployment/agent_lifecycle_manager_refactored.py +575 -0
  24. claude_mpm/services/agents/deployment/agent_operation_service.py +573 -0
  25. claude_mpm/services/agents/deployment/agent_record_service.py +419 -0
  26. claude_mpm/services/agents/deployment/agent_state_service.py +381 -0
  27. claude_mpm/services/agents/deployment/multi_source_deployment_service.py +4 -2
  28. claude_mpm/services/diagnostics/checks/__init__.py +2 -0
  29. claude_mpm/services/diagnostics/checks/instructions_check.py +418 -0
  30. claude_mpm/services/diagnostics/diagnostic_runner.py +15 -2
  31. claude_mpm/services/event_bus/direct_relay.py +173 -0
  32. claude_mpm/services/infrastructure/__init__.py +31 -5
  33. claude_mpm/services/infrastructure/monitoring/__init__.py +43 -0
  34. claude_mpm/services/infrastructure/monitoring/aggregator.py +437 -0
  35. claude_mpm/services/infrastructure/monitoring/base.py +130 -0
  36. claude_mpm/services/infrastructure/monitoring/legacy.py +203 -0
  37. claude_mpm/services/infrastructure/monitoring/network.py +218 -0
  38. claude_mpm/services/infrastructure/monitoring/process.py +342 -0
  39. claude_mpm/services/infrastructure/monitoring/resources.py +243 -0
  40. claude_mpm/services/infrastructure/monitoring/service.py +367 -0
  41. claude_mpm/services/infrastructure/monitoring.py +67 -1030
  42. claude_mpm/services/project/analyzer.py +13 -4
  43. claude_mpm/services/project/analyzer_refactored.py +450 -0
  44. claude_mpm/services/project/analyzer_v2.py +566 -0
  45. claude_mpm/services/project/architecture_analyzer.py +461 -0
  46. claude_mpm/services/project/dependency_analyzer.py +462 -0
  47. claude_mpm/services/project/language_analyzer.py +265 -0
  48. claude_mpm/services/project/metrics_collector.py +410 -0
  49. claude_mpm/services/socketio/handlers/connection_handler.py +345 -0
  50. claude_mpm/services/socketio/server/broadcaster.py +32 -1
  51. claude_mpm/services/socketio/server/connection_manager.py +516 -0
  52. claude_mpm/services/socketio/server/core.py +63 -0
  53. claude_mpm/services/socketio/server/eventbus_integration.py +20 -9
  54. claude_mpm/services/socketio/server/main.py +27 -1
  55. claude_mpm/services/ticket_manager.py +5 -1
  56. claude_mpm/services/ticket_services/__init__.py +26 -0
  57. claude_mpm/services/ticket_services/crud_service.py +328 -0
  58. claude_mpm/services/ticket_services/formatter_service.py +290 -0
  59. claude_mpm/services/ticket_services/search_service.py +324 -0
  60. claude_mpm/services/ticket_services/validation_service.py +303 -0
  61. claude_mpm/services/ticket_services/workflow_service.py +244 -0
  62. {claude_mpm-4.1.4.dist-info → claude_mpm-4.1.6.dist-info}/METADATA +3 -1
  63. {claude_mpm-4.1.4.dist-info → claude_mpm-4.1.6.dist-info}/RECORD +67 -46
  64. claude_mpm/agents/OUTPUT_STYLE.md +0 -73
  65. claude_mpm/agents/backups/INSTRUCTIONS.md +0 -352
  66. claude_mpm/agents/templates/OPTIMIZATION_REPORT.md +0 -156
  67. claude_mpm/agents/templates/backup/data_engineer_agent_20250726_234551.json +0 -79
  68. claude_mpm/agents/templates/backup/documentation_agent_20250726_234551.json +0 -68
  69. claude_mpm/agents/templates/backup/engineer_agent_20250726_234551.json +0 -77
  70. claude_mpm/agents/templates/backup/ops_agent_20250726_234551.json +0 -78
  71. claude_mpm/agents/templates/backup/qa_agent_20250726_234551.json +0 -67
  72. claude_mpm/agents/templates/backup/research_agent_2025011_234551.json +0 -88
  73. claude_mpm/agents/templates/backup/research_agent_20250726_234551.json +0 -72
  74. claude_mpm/agents/templates/backup/research_memory_efficient.json +0 -88
  75. claude_mpm/agents/templates/backup/security_agent_20250726_234551.json +0 -78
  76. claude_mpm/agents/templates/backup/version_control_agent_20250726_234551.json +0 -62
  77. claude_mpm/agents/templates/vercel_ops_instructions.md +0 -582
  78. {claude_mpm-4.1.4.dist-info → claude_mpm-4.1.6.dist-info}/WHEEL +0 -0
  79. {claude_mpm-4.1.4.dist-info → claude_mpm-4.1.6.dist-info}/entry_points.txt +0 -0
  80. {claude_mpm-4.1.4.dist-info → claude_mpm-4.1.6.dist-info}/licenses/LICENSE +0 -0
  81. {claude_mpm-4.1.4.dist-info → claude_mpm-4.1.6.dist-info}/top_level.txt +0 -0
claude_mpm/services/infrastructure/monitoring.py
@@ -1,1034 +1,71 @@
 """Advanced health monitoring system for claude-mpm Socket.IO server.
 
-This module provides comprehensive health checking capabilities including:
-- Process resource monitoring (CPU, memory, file descriptors)
-- Service-specific health markers
-- Configurable thresholds and intervals
-- Health status aggregation and history
-- Integration with automatic recovery mechanisms
-
-Design Principles:
-- Minimal performance impact through efficient polling
-- Extensible metric collection system
-- Circuit breaker integration for failure detection
-- Comprehensive logging for debugging and diagnostics
+This module has been refactored into a modular service-based architecture.
+All functionality is preserved through the monitoring package.
+
+The refactoring reduces complexity from 1,034 lines to under 100 lines
+by delegating to specialized services:
+- ResourceMonitorService: System resource monitoring
+- ProcessHealthService: Process-specific monitoring
+- ServiceHealthService: Application-level metrics
+- NetworkHealthService: Network connectivity checks
+- MonitoringAggregatorService: Orchestration and aggregation
+
+For new code, use the service-based API:
+    from claude_mpm.services.infrastructure.monitoring import (
+        ResourceMonitorService,
+        ProcessHealthService,
+        ServiceHealthService,
+        NetworkHealthService,
+        MonitoringAggregatorService,
+    )
+
+For backward compatibility, legacy classes are still available:
+    from claude_mpm.services.infrastructure.monitoring import (
+        ProcessResourceChecker,
+        NetworkConnectivityChecker,
+        ServiceHealthChecker,
+        AdvancedHealthMonitor,
+    )
 """
 
-import asyncio
-import contextlib
-import logging
-import socket
-import time
-from abc import ABC, abstractmethod
-from collections import deque
-from dataclasses import asdict, dataclass
-from datetime import datetime, timezone
-from enum import Enum
-from typing import Any, Callable, Dict, List, Optional, Union
-
-from claude_mpm.core.constants import ResourceLimits, TimeoutConfig
-
-try:
-    import psutil
-
-    PSUTIL_AVAILABLE = True
-except ImportError:
-    PSUTIL_AVAILABLE = False
-    psutil = None
-
-
-class HealthStatus(Enum):
-    """Health status levels for monitoring."""
-
-    HEALTHY = "healthy"
-    WARNING = "warning"
-    CRITICAL = "critical"
-    UNKNOWN = "unknown"
-
-
-@dataclass
-class HealthMetric:
-    """Individual health metric data structure."""
-
-    name: str
-    value: Union[int, float, str, bool]
-    status: HealthStatus
-    threshold: Optional[Union[int, float]] = None
-    unit: Optional[str] = None
-    timestamp: float = None
-    message: Optional[str] = None
-
-    def __post_init__(self):
-        if self.timestamp is None:
-            self.timestamp = time.time()
-
-    def to_dict(self) -> Dict[str, Any]:
-        """Convert metric to dictionary format."""
-        result = asdict(self)
-        result["status"] = self.status.value
-        result["timestamp_iso"] = datetime.fromtimestamp(
-            self.timestamp, timezone.utc
-        ).isoformat()
-        return result
-
-
-@dataclass
-class HealthCheckResult:
-    """Result of a health check operation."""
-
-    overall_status: HealthStatus
-    metrics: List[HealthMetric]
-    timestamp: float
-    duration_ms: float
-    errors: List[str]
-
-    def __post_init__(self):
-        if not hasattr(self, "timestamp") or self.timestamp is None:
-            self.timestamp = time.time()
-
-    def to_dict(self) -> Dict[str, Any]:
-        """Convert health check result to dictionary format."""
-        return {
-            "overall_status": self.overall_status.value,
-            "metrics": [metric.to_dict() for metric in self.metrics],
-            "timestamp": self.timestamp,
-            "timestamp_iso": datetime.fromtimestamp(
-                self.timestamp, timezone.utc
-            ).isoformat(),
-            "duration_ms": self.duration_ms,
-            "errors": self.errors,
-            "metric_count": len(self.metrics),
-            "healthy_metrics": len(
-                [m for m in self.metrics if m.status == HealthStatus.HEALTHY]
-            ),
-            "warning_metrics": len(
-                [m for m in self.metrics if m.status == HealthStatus.WARNING]
-            ),
-            "critical_metrics": len(
-                [m for m in self.metrics if m.status == HealthStatus.CRITICAL]
-            ),
-        }
-
-
-class HealthChecker(ABC):
-    """Abstract base class for health checkers.
-
-    Health checkers implement specific monitoring logic for different aspects
-    of the system (process resources, network connectivity, service health, etc.).
-    """
-
-    @abstractmethod
-    def get_name(self) -> str:
-        """Get the name of this health checker."""
-
-    @abstractmethod
-    async def check_health(self) -> List[HealthMetric]:
-        """Perform health check and return metrics."""
-
-
-class ProcessResourceChecker(HealthChecker):
-    """Health checker for process resource usage.
-
-    Monitors:
-    - CPU usage percentage
-    - Memory usage (RSS, VMS)
-    - File descriptor count
-    - Thread count
-    - Process status
-    """
-
-    def __init__(
-        self,
-        pid: int,
-        cpu_threshold: float = 80.0,
-        memory_threshold_mb: int = 500,
-        fd_threshold: int = 1000,
-    ):
-        """Initialize process resource checker.
-
-        Args:
-            pid: Process ID to monitor
-            cpu_threshold: CPU usage threshold as percentage
-            memory_threshold_mb: Memory usage threshold in MB
-            fd_threshold: File descriptor count threshold
-        """
-        self.pid = pid
-        self.cpu_threshold = cpu_threshold
-        self.memory_threshold_mb = memory_threshold_mb
-        self.fd_threshold = fd_threshold
-        self.process = None
-        self.logger = logging.getLogger(f"{__name__}.ProcessResourceChecker")
-
-        if PSUTIL_AVAILABLE:
-            try:
-                self.process = psutil.Process(pid)
-            except psutil.NoSuchProcess:
-                self.logger.warning(f"Process {pid} not found for monitoring")
-
-    def get_name(self) -> str:
-        return f"process_resources_{self.pid}"
-
-    async def check_health(self) -> List[HealthMetric]:
-        """Check process resource usage."""
-        metrics = []
-
-        if not PSUTIL_AVAILABLE:
-            metrics.append(
-                HealthMetric(
-                    name="psutil_availability",
-                    value=False,
-                    status=HealthStatus.WARNING,
-                    message="psutil not available for enhanced monitoring",
-                )
-            )
-            return metrics
-
-        if not self.process:
-            metrics.append(
-                HealthMetric(
-                    name="process_exists",
-                    value=False,
-                    status=HealthStatus.CRITICAL,
-                    message=f"Process {self.pid} not found",
-                )
-            )
-            return metrics
-
-        try:
-            # Check if process still exists
-            if not self.process.is_running():
-                metrics.append(
-                    HealthMetric(
-                        name="process_exists",
-                        value=False,
-                        status=HealthStatus.CRITICAL,
-                        message=f"Process {self.pid} is no longer running",
-                    )
-                )
-                return metrics
-
-            # Process status
-            status = self.process.status()
-            process_healthy = status not in [
-                psutil.STATUS_ZOMBIE,
-                psutil.STATUS_DEAD,
-                psutil.STATUS_STOPPED,
-            ]
-            metrics.append(
-                HealthMetric(
-                    name="process_status",
-                    value=status,
-                    status=(
-                        HealthStatus.HEALTHY
-                        if process_healthy
-                        else HealthStatus.CRITICAL
-                    ),
-                    message=f"Process status: {status}",
-                )
-            )
-
-            # CPU usage
-            try:
-                cpu_percent = self.process.cpu_percent(
-                    interval=TimeoutConfig.CPU_SAMPLE_INTERVAL
-                )
-                cpu_status = HealthStatus.HEALTHY
-                if cpu_percent > self.cpu_threshold:
-                    cpu_status = (
-                        HealthStatus.WARNING
-                        if cpu_percent < self.cpu_threshold * 1.2
-                        else HealthStatus.CRITICAL
-                    )
-
-                metrics.append(
-                    HealthMetric(
-                        name="cpu_usage_percent",
-                        value=round(cpu_percent, 2),
-                        status=cpu_status,
-                        threshold=self.cpu_threshold,
-                        unit="%",
-                    )
-                )
-            except Exception as e:
-                metrics.append(
-                    HealthMetric(
-                        name="cpu_usage_percent",
-                        value=-1,
-                        status=HealthStatus.UNKNOWN,
-                        message=f"Failed to get CPU usage: {e}",
-                    )
-                )
-
-            # Memory usage
-            try:
-                memory_info = self.process.memory_info()
-                memory_mb = memory_info.rss / ResourceLimits.BYTES_TO_MB
-                memory_status = HealthStatus.HEALTHY
-                if memory_mb > self.memory_threshold_mb:
-                    memory_status = (
-                        HealthStatus.WARNING
-                        if memory_mb < self.memory_threshold_mb * 1.2
-                        else HealthStatus.CRITICAL
-                    )
-
-                metrics.append(
-                    HealthMetric(
-                        name="memory_usage_mb",
-                        value=round(memory_mb, 2),
-                        status=memory_status,
-                        threshold=self.memory_threshold_mb,
-                        unit="MB",
-                    )
-                )
-
-                metrics.append(
-                    HealthMetric(
-                        name="memory_vms_mb",
-                        value=round(memory_info.vms / ResourceLimits.BYTES_TO_MB, 2),
-                        status=HealthStatus.HEALTHY,
-                        unit="MB",
-                    )
-                )
-            except Exception as e:
-                metrics.append(
-                    HealthMetric(
-                        name="memory_usage_mb",
-                        value=-1,
-                        status=HealthStatus.UNKNOWN,
-                        message=f"Failed to get memory usage: {e}",
-                    )
-                )
-
-            # File descriptors (Unix only)
-            if hasattr(self.process, "num_fds"):
-                try:
-                    fd_count = self.process.num_fds()
-                    fd_status = HealthStatus.HEALTHY
-                    if fd_count > self.fd_threshold:
-                        fd_status = (
-                            HealthStatus.WARNING
-                            if fd_count < self.fd_threshold * 1.2
-                            else HealthStatus.CRITICAL
-                        )
-
-                    metrics.append(
-                        HealthMetric(
-                            name="file_descriptors",
-                            value=fd_count,
-                            status=fd_status,
-                            threshold=self.fd_threshold,
-                        )
-                    )
-                except Exception as e:
-                    metrics.append(
-                        HealthMetric(
-                            name="file_descriptors",
-                            value=-1,
-                            status=HealthStatus.UNKNOWN,
-                            message=f"Failed to get file descriptor count: {e}",
-                        )
-                    )
-
-            # Thread count
-            try:
-                thread_count = self.process.num_threads()
-                metrics.append(
-                    HealthMetric(
-                        name="thread_count",
-                        value=thread_count,
-                        status=HealthStatus.HEALTHY,
-                    )
-                )
-            except Exception as e:
-                metrics.append(
-                    HealthMetric(
-                        name="thread_count",
-                        value=-1,
-                        status=HealthStatus.UNKNOWN,
-                        message=f"Failed to get thread count: {e}",
-                    )
-                )
-
-            # Process create time (for validation)
-            try:
-                create_time = self.process.create_time()
-                metrics.append(
-                    HealthMetric(
-                        name="process_start_time",
-                        value=create_time,
-                        status=HealthStatus.HEALTHY,
-                        unit="timestamp",
-                    )
-                )
-            except Exception as e:
-                metrics.append(
-                    HealthMetric(
-                        name="process_start_time",
-                        value=-1,
-                        status=HealthStatus.UNKNOWN,
-                        message=f"Failed to get process start time: {e}",
-                    )
-                )
-
-        except psutil.NoSuchProcess:
-            metrics.append(
-                HealthMetric(
-                    name="process_exists",
-                    value=False,
-                    status=HealthStatus.CRITICAL,
-                    message=f"Process {self.pid} no longer exists",
-                )
-            )
-        except Exception as e:
-            self.logger.error(f"Error checking process health: {e}")
-            metrics.append(
-                HealthMetric(
-                    name="process_check_error",
-                    value=str(e),
-                    status=HealthStatus.UNKNOWN,
-                    message=f"Unexpected error during process health check: {e}",
-                )
-            )
-
-        return metrics
-
-
-class NetworkConnectivityChecker(HealthChecker):
-    """Health checker for network connectivity.
-
-    Monitors:
-    - Port availability and binding status
-    - Socket connection health
-    - Network interface status
-    """
-
-    def __init__(self, host: str, port: int, timeout: float = 1.0):
-        """Initialize network connectivity checker.
-
-        Args:
-            host: Host address to check
-            port: Port number to check
-            timeout: Connection timeout in seconds
-        """
-        self.host = host
-        self.port = port
-        self.timeout = timeout
-        self.logger = logging.getLogger(f"{__name__}.NetworkConnectivityChecker")
-
-    def get_name(self) -> str:
-        return f"network_connectivity_{self.host}_{self.port}"
-
-    async def check_health(self) -> List[HealthMetric]:
-        """Check network connectivity."""
-        metrics = []
-
-        # Check port binding
-        try:
-            # Try to connect to the port
-            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-            sock.settimeout(self.timeout)
-            result = sock.connect_ex((self.host, self.port))
-            sock.close()
-
-            if result == 0:
-                metrics.append(
-                    HealthMetric(
-                        name="port_accessible",
-                        value=True,
-                        status=HealthStatus.HEALTHY,
-                        message=f"Port {self.port} is accessible on {self.host}",
-                    )
-                )
-            else:
-                metrics.append(
-                    HealthMetric(
-                        name="port_accessible",
-                        value=False,
-                        status=HealthStatus.CRITICAL,
-                        message=f"Port {self.port} is not accessible on {self.host}",
-                    )
-                )
-        except Exception as e:
-            metrics.append(
-                HealthMetric(
-                    name="port_accessible",
-                    value=False,
-                    status=HealthStatus.UNKNOWN,
-                    message=f"Error checking port accessibility: {e}",
-                )
-            )
-
-        # Check if we can create a socket (resource availability)
-        try:
-            test_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-            test_sock.close()
-            metrics.append(
-                HealthMetric(
-                    name="socket_creation",
-                    value=True,
-                    status=HealthStatus.HEALTHY,
-                    message="Socket creation successful",
-                )
-            )
-        except Exception as e:
-            metrics.append(
-                HealthMetric(
-                    name="socket_creation",
-                    value=False,
-                    status=HealthStatus.CRITICAL,
-                    message=f"Failed to create socket: {e}",
-                )
-            )
-
-        return metrics
-
-
-class ServiceHealthChecker(HealthChecker):
-    """Health checker for service-specific metrics.
-
-    Monitors:
-    - Connected clients count
-    - Event processing rate
-    - Error rates
-    - Response times
-    """
-
-    def __init__(
-        self,
-        service_stats: Dict[str, Any],
-        max_clients: int = 1000,
-        max_error_rate: float = 0.1,
-    ):
-        """Initialize service health checker.
-
-        Args:
-            service_stats: Reference to service statistics dictionary
-            max_clients: Maximum allowed connected clients
-            max_error_rate: Maximum allowed error rate (0.0-1.0)
-        """
-        self.service_stats = service_stats
-        self.max_clients = max_clients
-        self.max_error_rate = max_error_rate
-        self.last_check_time = time.time()
-        self.last_events_processed = 0
-        self.logger = logging.getLogger(f"{__name__}.ServiceHealthChecker")
-
-    def get_name(self) -> str:
-        return "service_health"
-
-    async def check_health(self) -> List[HealthMetric]:
-        """Check service-specific health metrics."""
-        metrics = []
-        current_time = time.time()
-
-        # Connected clients
-        try:
-            client_count = self.service_stats.get("clients_connected", 0)
-            client_status = HealthStatus.HEALTHY
-            if client_count > self.max_clients * 0.8:
-                client_status = HealthStatus.WARNING
-            if client_count > self.max_clients:
-                client_status = HealthStatus.CRITICAL
-
-            metrics.append(
-                HealthMetric(
-                    name="connected_clients",
-                    value=client_count,
-                    status=client_status,
-                    threshold=self.max_clients,
-                )
-            )
-        except Exception as e:
-            metrics.append(
-                HealthMetric(
-                    name="connected_clients",
-                    value=-1,
-                    status=HealthStatus.UNKNOWN,
-                    message=f"Failed to get client count: {e}",
-                )
-            )
-
-        # Event processing rate
-        try:
-            events_processed = self.service_stats.get("events_processed", 0)
-            time_diff = current_time - self.last_check_time
-
-            if time_diff > 0 and self.last_events_processed > 0:
-                event_rate = (events_processed - self.last_events_processed) / time_diff
-                metrics.append(
-                    HealthMetric(
-                        name="event_processing_rate",
-                        value=round(event_rate, 2),
-                        status=HealthStatus.HEALTHY,
-                        unit="events/sec",
-                    )
-                )
-
-            self.last_events_processed = events_processed
-
-            # Total events processed
-            metrics.append(
-                HealthMetric(
-                    name="total_events_processed",
-                    value=events_processed,
-                    status=HealthStatus.HEALTHY,
-                )
-            )
-        except Exception as e:
-            metrics.append(
-                HealthMetric(
-                    name="event_processing_rate",
-                    value=-1,
-                    status=HealthStatus.UNKNOWN,
-                    message=f"Failed to calculate event rate: {e}",
-                )
-            )
-
-        # Error rate
-        try:
-            errors = self.service_stats.get("errors", 0)
-            total_events = self.service_stats.get(
-                "events_processed", 1
-            )  # Avoid division by zero
-            error_rate = errors / max(total_events, 1)
-
-            error_status = HealthStatus.HEALTHY
-            if error_rate > self.max_error_rate * 0.5:
-                error_status = HealthStatus.WARNING
-            if error_rate > self.max_error_rate:
-                error_status = HealthStatus.CRITICAL
-
-            metrics.append(
-                HealthMetric(
-                    name="error_rate",
-                    value=round(error_rate, 4),
-                    status=error_status,
-                    threshold=self.max_error_rate,
-                    unit="ratio",
-                )
-            )
-
-            metrics.append(
-                HealthMetric(
-                    name="total_errors",
-                    value=errors,
-                    status=(
-                        HealthStatus.HEALTHY if errors == 0 else HealthStatus.WARNING
-                    ),
-                )
-            )
-        except Exception as e:
-            metrics.append(
-                HealthMetric(
-                    name="error_rate",
-                    value=-1,
-                    status=HealthStatus.UNKNOWN,
-                    message=f"Failed to calculate error rate: {e}",
-                )
-            )
-
-        # Last activity timestamp
-        try:
-            last_activity = self.service_stats.get("last_activity")
-            if last_activity:
-                # Parse ISO timestamp or use as-is if numeric
-                if isinstance(last_activity, str):
-                    try:
-                        from dateutil.parser import parse
-
-                        last_activity_dt = parse(last_activity)
-                        last_activity_timestamp = last_activity_dt.timestamp()
-                    except ImportError:
-                        # Fallback: try to parse ISO format manually
-                        try:
-                            from datetime import datetime
-
-                            clean_timestamp = last_activity.rstrip("Z")
-                            last_activity_dt = datetime.fromisoformat(
-                                clean_timestamp.replace("T", " ")
-                            )
-                            last_activity_timestamp = last_activity_dt.timestamp()
-                        except Exception:
-                            # Final fallback: treat as current time
-                            last_activity_timestamp = current_time
-                else:
-                    last_activity_timestamp = float(last_activity)
-
-                time_since_activity = current_time - last_activity_timestamp
-                activity_status = HealthStatus.HEALTHY
-                if time_since_activity > 300:  # 5 minutes
-                    activity_status = HealthStatus.WARNING
-                if time_since_activity > 1800:  # 30 minutes
-                    activity_status = HealthStatus.CRITICAL
-
-                metrics.append(
-                    HealthMetric(
-                        name="time_since_last_activity",
-                        value=round(time_since_activity, 2),
-                        status=activity_status,
-                        unit="seconds",
-                    )
-                )
-            else:
-                metrics.append(
-                    HealthMetric(
-                        name="time_since_last_activity",
-                        value=-1,
-                        status=HealthStatus.WARNING,
-                        message="No last activity recorded",
-                    )
-                )
-        except Exception as e:
-            metrics.append(
-                HealthMetric(
-                    name="time_since_last_activity",
-                    value=-1,
-                    status=HealthStatus.UNKNOWN,
-                    message=f"Failed to parse last activity: {e}",
-                )
-            )
-
-        self.last_check_time = current_time
-        return metrics
-
-
-class AdvancedHealthMonitor:
-    """Advanced health monitoring system with configurable checks and thresholds.
-
-    Provides comprehensive health monitoring including:
-    - Multiple health checker integration
-    - Configurable check intervals and thresholds
-    - Health history tracking
-    - Status aggregation and reporting
-    - Integration with recovery systems
-    """
-
-    def __init__(self, config: Optional[Dict[str, Any]] = None):
-        """Initialize advanced health monitor.
-
-        Args:
-            config: Configuration dictionary for health monitoring
-        """
-        self.config = config or {}
-        self.logger = logging.getLogger(f"{__name__}.AdvancedHealthMonitor")
-
-        # Configuration with defaults
-        self.check_interval = self.config.get("check_interval", 30)
-        self.history_size = self.config.get("history_size", 100)
-        self.aggregation_window = self.config.get(
-            "aggregation_window", 300
-        )  # 5 minutes
-
-        # Health checkers
-        self.checkers: List[HealthChecker] = []
-
-        # Health history
-        self.health_history: deque = deque(maxlen=self.history_size)
-
-        # Monitoring state
-        self.monitoring = False
-        self.monitor_task: Optional[asyncio.Task] = None
-        self.last_check_result: Optional[HealthCheckResult] = None
-
-        # Health callbacks for recovery integration
-        self.health_callbacks: List[Callable[[HealthCheckResult], None]] = []
-
-        # Initialize metrics
-        self.monitoring_stats = {
-            "checks_performed": 0,
-            "checks_failed": 0,
-            "average_check_duration_ms": 0,
-            "last_check_timestamp": None,
-        }
-
-        self.logger.info("Advanced health monitor initialized")
-
-    def add_checker(self, checker: HealthChecker) -> None:
-        """Add a health checker to the monitoring system."""
-        self.checkers.append(checker)
-        self.logger.info(f"Added health checker: {checker.get_name()}")
-
-    def add_health_callback(
-        self, callback: Callable[[HealthCheckResult], None]
-    ) -> None:
-        """Add a callback to be called when health checks complete.
-
-        Args:
-            callback: Function to call with HealthCheckResult
-        """
-        self.health_callbacks.append(callback)
-        self.logger.debug(f"Added health callback: {callback.__name__}")
-
-    async def perform_health_check(self) -> HealthCheckResult:
-        """Perform comprehensive health check using all registered checkers."""
-        start_time = time.time()
-        all_metrics = []
-        errors = []
-
-        # Run all health checkers
-        for checker in self.checkers:
-            try:
-                checker_start = time.time()
-                metrics = await checker.check_health()
-                checker_duration = (time.time() - checker_start) * 1000
-
-                all_metrics.extend(metrics)
-                self.logger.debug(
-                    f"Health checker {checker.get_name()} completed in {checker_duration:.2f}ms"
-                )
-
-            except Exception as e:
-                error_msg = f"Health checker {checker.get_name()} failed: {e}"
-                errors.append(error_msg)
-                self.logger.error(error_msg)
-
-                # Add error metric
-                all_metrics.append(
-                    HealthMetric(
-                        name=f"{checker.get_name()}_error",
-                        value=str(e),
-                        status=HealthStatus.UNKNOWN,
-                        message=error_msg,
-                    )
-                )
-
-        # Determine overall status
-        overall_status = self._determine_overall_status(all_metrics)
-
-        # Create result
-        duration_ms = (time.time() - start_time) * 1000
-        result = HealthCheckResult(
-            overall_status=overall_status,
-            metrics=all_metrics,
-            timestamp=start_time,
-            duration_ms=duration_ms,
-            errors=errors,
-        )
-
-        # Update statistics
-        self.monitoring_stats["checks_performed"] += 1
-        if errors:
-            self.monitoring_stats["checks_failed"] += 1
-
-        # Update average duration
-        current_avg = self.monitoring_stats["average_check_duration_ms"]
-        checks_count = self.monitoring_stats["checks_performed"]
-        self.monitoring_stats["average_check_duration_ms"] = (
-            current_avg * (checks_count - 1) + duration_ms
-        ) / checks_count
-        self.monitoring_stats["last_check_timestamp"] = time.time()
-
-        # Store in history
-        self.health_history.append(result)
-        self.last_check_result = result
-
-        # Notify callbacks
-        for callback in self.health_callbacks:
-            try:
-                callback(result)
-            except Exception as e:
-                self.logger.error(f"Health callback {callback.__name__} failed: {e}")
-
-        self.logger.debug(
-            f"Health check completed: {overall_status.value} "
-            f"({len(all_metrics)} metrics, {len(errors)} errors, "
-            f"{duration_ms:.2f}ms)"
-        )
-
-        return result
-
-    def _determine_overall_status(self, metrics: List[HealthMetric]) -> HealthStatus:
-        """Determine overall health status from individual metrics."""
-        if not metrics:
-            return HealthStatus.UNKNOWN
-
-        # Count metrics by status
-        status_counts = dict.fromkeys(HealthStatus, 0)
-        for metric in metrics:
-            status_counts[metric.status] += 1
-
-        # Determine overall status based on counts
-        total_metrics = len(metrics)
-
-        # If any critical metrics, overall is critical
-        if status_counts[HealthStatus.CRITICAL] > 0:
-            return HealthStatus.CRITICAL
-
-        # If more than 30% warning metrics, overall is warning
-        warning_ratio = status_counts[HealthStatus.WARNING] / total_metrics
-        if warning_ratio > 0.3:
-            return HealthStatus.WARNING
-
-        # If any warning metrics but less than 30%, still healthy
-        if status_counts[HealthStatus.WARNING] > 0:
-            return HealthStatus.HEALTHY
-
-        # If any unknown metrics, overall is unknown
-        if status_counts[HealthStatus.UNKNOWN] > 0:
-            return HealthStatus.UNKNOWN
-
-        # All metrics healthy
-        return HealthStatus.HEALTHY
-
-    def start_monitoring(self) -> None:
-        """Start continuous health monitoring."""
-        if self.monitoring:
-            self.logger.warning("Health monitoring is already running")
-            return
-
-        self.monitoring = True
-        self.monitor_task = asyncio.create_task(self._monitoring_loop())
-        self.logger.info(
-            f"Started health monitoring with {self.check_interval}s interval"
-        )
-
-    async def stop_monitoring(self) -> None:
-        """Stop continuous health monitoring."""
-        if not self.monitoring:
-            return
-
-        self.monitoring = False
-        if self.monitor_task:
-            self.monitor_task.cancel()
-            with contextlib.suppress(asyncio.CancelledError):
-                await self.monitor_task
-            self.monitor_task = None
-
-        self.logger.info("Stopped health monitoring")
-
-    async def _monitoring_loop(self) -> None:
-        """Continuous health monitoring loop."""
-        try:
-            while self.monitoring:
-                try:
-                    await self.perform_health_check()
-                except Exception as e:
-                    self.logger.error(f"Error during health check: {e}")
-
-                # Wait for next check
-                await asyncio.sleep(self.check_interval)
-        except asyncio.CancelledError:
-            self.logger.debug("Health monitoring loop cancelled")
-        except Exception as e:
-            self.logger.error(f"Health monitoring loop error: {e}")
-
-    def get_current_status(self) -> Optional[HealthCheckResult]:
-        """Get the most recent health check result."""
-        return self.last_check_result
-
-    def get_health_history(
-        self, limit: Optional[int] = None
-    ) -> List[HealthCheckResult]:
-        """Get health check history.
-
-        Args:
-            limit: Maximum number of results to return
-
-        Returns:
-            List of health check results, newest first
-        """
-        history = list(self.health_history)
-        history.reverse()  # Newest first
-
-        if limit:
-            history = history[:limit]
-
-        return history
-
-    def get_aggregated_status(
-        self, window_seconds: Optional[int] = None
-    ) -> Dict[str, Any]:
-        """Get aggregated health status over a time window.
-
-        Args:
-            window_seconds: Time window for aggregation (defaults to configured window)
-
-        Returns:
-            Dictionary with aggregated health statistics
-        """
-        window_seconds = window_seconds or self.aggregation_window
-        current_time = time.time()
-        cutoff_time = current_time - window_seconds
-
-        # Filter history to time window
-        recent_results = [
-            result for result in self.health_history if result.timestamp >= cutoff_time
-        ]
-
-        if not recent_results:
-            return {
-                "period": "no_data",
-                "window_seconds": window_seconds,
-                "checks_count": 0,
-                "overall_status": HealthStatus.UNKNOWN.value,
-            }
-
-        # Aggregate statistics
-        status_counts = dict.fromkeys(HealthStatus, 0)
-        total_metrics = 0
-        total_errors = 0
-        total_duration_ms = 0
-
-        for result in recent_results:
-            status_counts[result.overall_status] += 1
-            total_metrics += len(result.metrics)
-            total_errors += len(result.errors)
-            total_duration_ms += result.duration_ms
-
-        checks_count = len(recent_results)
-
-        # Determine aggregated status
-        if status_counts[HealthStatus.CRITICAL] > 0:
-            aggregated_status = HealthStatus.CRITICAL
-        elif status_counts[HealthStatus.WARNING] > checks_count * 0.3:
-            aggregated_status = HealthStatus.WARNING
-        elif status_counts[HealthStatus.UNKNOWN] > checks_count * 0.5:
-            aggregated_status = HealthStatus.UNKNOWN
-        else:
-            aggregated_status = HealthStatus.HEALTHY
-
-        return {
-            "period": f"last_{window_seconds}_seconds",
-            "window_seconds": window_seconds,
-            "checks_count": checks_count,
-            "overall_status": aggregated_status.value,
-            "status_distribution": {
-                status.value: count for status, count in status_counts.items()
-            },
-            "average_metrics_per_check": (
-                round(total_metrics / checks_count, 2) if checks_count > 0 else 0
-            ),
-            "total_errors": total_errors,
-            "average_duration_ms": (
-                round(total_duration_ms / checks_count, 2) if checks_count > 0 else 0
-            ),
-            "monitoring_stats": dict(self.monitoring_stats),
-        }
-
-    def export_diagnostics(self) -> Dict[str, Any]:
-        """Export comprehensive diagnostics information."""
-        return {
-            "monitor_info": {
-                "check_interval": self.check_interval,
-                "history_size": self.history_size,
-                "aggregation_window": self.aggregation_window,
-                "monitoring_active": self.monitoring,
-                "checkers_count": len(self.checkers),
-                "callbacks_count": len(self.health_callbacks),
-            },
-            "checkers": [checker.get_name() for checker in self.checkers],
-            "current_status": (
-                self.last_check_result.to_dict() if self.last_check_result else None
-            ),
-            "aggregated_status": self.get_aggregated_status(),
-            "monitoring_stats": dict(self.monitoring_stats),
-            "history_summary": {
-                "total_checks": len(self.health_history),
-                "oldest_check": (
-                    self.health_history[0].timestamp if self.health_history else None
-                ),
-                "newest_check": (
-                    self.health_history[-1].timestamp if self.health_history else None
-                ),
-            },
-        }
+# Re-export all components from the modular implementation
+from .monitoring import (  # noqa: F401; New service-based API; Base components; Legacy compatibility
+    AdvancedHealthMonitor,
+    HealthChecker,
+    HealthCheckResult,
+    HealthMetric,
+    HealthStatus,
+    MonitoringAggregatorService,
+    NetworkConnectivityChecker,
+    NetworkHealthService,
+    ProcessHealthService,
+    ProcessResourceChecker,
+    ResourceMonitorService,
+    ServiceHealthChecker,
+    ServiceHealthService,
+)
+
+__all__ = [
+    # New service-based API
+    "ResourceMonitorService",
+    "ProcessHealthService",
+    "ServiceHealthService",
+    "NetworkHealthService",
+    "MonitoringAggregatorService",
+    # Base components
+    "HealthStatus",
+    "HealthMetric",
+    "HealthCheckResult",
+    "HealthChecker",
+    # Legacy compatibility
+    "ProcessResourceChecker",
+    "NetworkConnectivityChecker",
+    "ServiceHealthChecker",
+    "AdvancedHealthMonitor",
+]
+
+# Module metadata
+__version__ = "2.0.0"
+__author__ = "Claude MPM Team"
+__description__ = "Refactored modular health monitoring system"
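
Because monitoring.py re-exports the legacy classes, callers written against the pre-4.1.6 module should keep working unchanged. A minimal usage sketch, using only class names and signatures visible in this diff; the wrapper script and the choice of os.getpid() are illustrative assumptions, not code from the package:

    import asyncio
    import os

    from claude_mpm.services.infrastructure.monitoring import (
        AdvancedHealthMonitor,
        ProcessResourceChecker,
    )

    async def main() -> None:
        # check_interval matches the monitor's documented default of 30s.
        monitor = AdvancedHealthMonitor({"check_interval": 30})
        # Watch this process; thresholds fall back to the checker's defaults
        # (80% CPU, 500 MB RSS, 1000 file descriptors).
        monitor.add_checker(ProcessResourceChecker(pid=os.getpid()))
        # One-shot check; start_monitoring()/stop_monitoring() run the same
        # check on a recurring interval.
        result = await monitor.perform_health_check()
        print(result.to_dict()["overall_status"])

    asyncio.run(main())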