claude-mpm 4.1.4__py3-none-any.whl → 4.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_mpm/VERSION +1 -1
- claude_mpm/cli/commands/tickets.py +365 -784
- claude_mpm/core/output_style_manager.py +24 -0
- claude_mpm/core/unified_agent_registry.py +46 -15
- claude_mpm/services/agents/deployment/agent_discovery_service.py +12 -3
- claude_mpm/services/agents/deployment/agent_lifecycle_manager.py +172 -233
- claude_mpm/services/agents/deployment/agent_lifecycle_manager_refactored.py +575 -0
- claude_mpm/services/agents/deployment/agent_operation_service.py +573 -0
- claude_mpm/services/agents/deployment/agent_record_service.py +419 -0
- claude_mpm/services/agents/deployment/agent_state_service.py +381 -0
- claude_mpm/services/agents/deployment/multi_source_deployment_service.py +4 -2
- claude_mpm/services/infrastructure/__init__.py +31 -5
- claude_mpm/services/infrastructure/monitoring/__init__.py +43 -0
- claude_mpm/services/infrastructure/monitoring/aggregator.py +437 -0
- claude_mpm/services/infrastructure/monitoring/base.py +130 -0
- claude_mpm/services/infrastructure/monitoring/legacy.py +203 -0
- claude_mpm/services/infrastructure/monitoring/network.py +218 -0
- claude_mpm/services/infrastructure/monitoring/process.py +342 -0
- claude_mpm/services/infrastructure/monitoring/resources.py +243 -0
- claude_mpm/services/infrastructure/monitoring/service.py +367 -0
- claude_mpm/services/infrastructure/monitoring.py +67 -1030
- claude_mpm/services/project/analyzer.py +13 -4
- claude_mpm/services/project/analyzer_refactored.py +450 -0
- claude_mpm/services/project/analyzer_v2.py +566 -0
- claude_mpm/services/project/architecture_analyzer.py +461 -0
- claude_mpm/services/project/dependency_analyzer.py +462 -0
- claude_mpm/services/project/language_analyzer.py +265 -0
- claude_mpm/services/project/metrics_collector.py +410 -0
- claude_mpm/services/ticket_manager.py +5 -1
- claude_mpm/services/ticket_services/__init__.py +26 -0
- claude_mpm/services/ticket_services/crud_service.py +328 -0
- claude_mpm/services/ticket_services/formatter_service.py +290 -0
- claude_mpm/services/ticket_services/search_service.py +324 -0
- claude_mpm/services/ticket_services/validation_service.py +303 -0
- claude_mpm/services/ticket_services/workflow_service.py +244 -0
- {claude_mpm-4.1.4.dist-info → claude_mpm-4.1.5.dist-info}/METADATA +1 -1
- {claude_mpm-4.1.4.dist-info → claude_mpm-4.1.5.dist-info}/RECORD +41 -17
- {claude_mpm-4.1.4.dist-info → claude_mpm-4.1.5.dist-info}/WHEEL +0 -0
- {claude_mpm-4.1.4.dist-info → claude_mpm-4.1.5.dist-info}/entry_points.txt +0 -0
- {claude_mpm-4.1.4.dist-info → claude_mpm-4.1.5.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-4.1.4.dist-info → claude_mpm-4.1.5.dist-info}/top_level.txt +0 -0
|
@@ -1,1034 +1,71 @@
|
|
|
1
1
|
"""Advanced health monitoring system for claude-mpm Socket.IO server.
|
|
2
2
|
|
|
3
|
-
This module
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
-
|
|
12
|
-
-
|
|
13
|
-
|
|
14
|
-
|
|
3
|
+
This module has been refactored into a modular service-based architecture.
|
|
4
|
+
All functionality is preserved through the monitoring package.
|
|
5
|
+
|
|
6
|
+
The refactoring reduces complexity from 1,034 lines to under 100 lines
|
|
7
|
+
by delegating to specialized services:
|
|
8
|
+
- ResourceMonitorService: System resource monitoring
|
|
9
|
+
- ProcessHealthService: Process-specific monitoring
|
|
10
|
+
- ServiceHealthService: Application-level metrics
|
|
11
|
+
- NetworkHealthService: Network connectivity checks
|
|
12
|
+
- MonitoringAggregatorService: Orchestration and aggregation
|
|
13
|
+
|
|
14
|
+
For new code, use the service-based API:
|
|
15
|
+
from claude_mpm.services.infrastructure.monitoring import (
|
|
16
|
+
ResourceMonitorService,
|
|
17
|
+
ProcessHealthService,
|
|
18
|
+
ServiceHealthService,
|
|
19
|
+
NetworkHealthService,
|
|
20
|
+
MonitoringAggregatorService,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
For backward compatibility, legacy classes are still available:
|
|
24
|
+
from claude_mpm.services.infrastructure.monitoring import (
|
|
25
|
+
ProcessResourceChecker,
|
|
26
|
+
NetworkConnectivityChecker,
|
|
27
|
+
ServiceHealthChecker,
|
|
28
|
+
AdvancedHealthMonitor,
|
|
29
|
+
)
|
|
15
30
|
"""
|
|
16
31
|
|
|
17
|
-
|
|
18
|
-
import
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
unit: Optional[str] = None
|
|
58
|
-
timestamp: float = None
|
|
59
|
-
message: Optional[str] = None
|
|
60
|
-
|
|
61
|
-
def __post_init__(self):
|
|
62
|
-
if self.timestamp is None:
|
|
63
|
-
self.timestamp = time.time()
|
|
64
|
-
|
|
65
|
-
def to_dict(self) -> Dict[str, Any]:
|
|
66
|
-
"""Convert metric to dictionary format."""
|
|
67
|
-
result = asdict(self)
|
|
68
|
-
result["status"] = self.status.value
|
|
69
|
-
result["timestamp_iso"] = datetime.fromtimestamp(
|
|
70
|
-
self.timestamp, timezone.utc
|
|
71
|
-
).isoformat()
|
|
72
|
-
return result
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
@dataclass
|
|
76
|
-
class HealthCheckResult:
|
|
77
|
-
"""Result of a health check operation."""
|
|
78
|
-
|
|
79
|
-
overall_status: HealthStatus
|
|
80
|
-
metrics: List[HealthMetric]
|
|
81
|
-
timestamp: float
|
|
82
|
-
duration_ms: float
|
|
83
|
-
errors: List[str]
|
|
84
|
-
|
|
85
|
-
def __post_init__(self):
|
|
86
|
-
if not hasattr(self, "timestamp") or self.timestamp is None:
|
|
87
|
-
self.timestamp = time.time()
|
|
88
|
-
|
|
89
|
-
def to_dict(self) -> Dict[str, Any]:
|
|
90
|
-
"""Convert health check result to dictionary format."""
|
|
91
|
-
return {
|
|
92
|
-
"overall_status": self.overall_status.value,
|
|
93
|
-
"metrics": [metric.to_dict() for metric in self.metrics],
|
|
94
|
-
"timestamp": self.timestamp,
|
|
95
|
-
"timestamp_iso": datetime.fromtimestamp(
|
|
96
|
-
self.timestamp, timezone.utc
|
|
97
|
-
).isoformat(),
|
|
98
|
-
"duration_ms": self.duration_ms,
|
|
99
|
-
"errors": self.errors,
|
|
100
|
-
"metric_count": len(self.metrics),
|
|
101
|
-
"healthy_metrics": len(
|
|
102
|
-
[m for m in self.metrics if m.status == HealthStatus.HEALTHY]
|
|
103
|
-
),
|
|
104
|
-
"warning_metrics": len(
|
|
105
|
-
[m for m in self.metrics if m.status == HealthStatus.WARNING]
|
|
106
|
-
),
|
|
107
|
-
"critical_metrics": len(
|
|
108
|
-
[m for m in self.metrics if m.status == HealthStatus.CRITICAL]
|
|
109
|
-
),
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
class HealthChecker(ABC):
|
|
114
|
-
"""Abstract base class for health checkers.
|
|
115
|
-
|
|
116
|
-
Health checkers implement specific monitoring logic for different aspects
|
|
117
|
-
of the system (process resources, network connectivity, service health, etc.).
|
|
118
|
-
"""
|
|
119
|
-
|
|
120
|
-
@abstractmethod
|
|
121
|
-
def get_name(self) -> str:
|
|
122
|
-
"""Get the name of this health checker."""
|
|
123
|
-
|
|
124
|
-
@abstractmethod
|
|
125
|
-
async def check_health(self) -> List[HealthMetric]:
|
|
126
|
-
"""Perform health check and return metrics."""
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
class ProcessResourceChecker(HealthChecker):
|
|
130
|
-
"""Health checker for process resource usage.
|
|
131
|
-
|
|
132
|
-
Monitors:
|
|
133
|
-
- CPU usage percentage
|
|
134
|
-
- Memory usage (RSS, VMS)
|
|
135
|
-
- File descriptor count
|
|
136
|
-
- Thread count
|
|
137
|
-
- Process status
|
|
138
|
-
"""
|
|
139
|
-
|
|
140
|
-
def __init__(
|
|
141
|
-
self,
|
|
142
|
-
pid: int,
|
|
143
|
-
cpu_threshold: float = 80.0,
|
|
144
|
-
memory_threshold_mb: int = 500,
|
|
145
|
-
fd_threshold: int = 1000,
|
|
146
|
-
):
|
|
147
|
-
"""Initialize process resource checker.
|
|
148
|
-
|
|
149
|
-
Args:
|
|
150
|
-
pid: Process ID to monitor
|
|
151
|
-
cpu_threshold: CPU usage threshold as percentage
|
|
152
|
-
memory_threshold_mb: Memory usage threshold in MB
|
|
153
|
-
fd_threshold: File descriptor count threshold
|
|
154
|
-
"""
|
|
155
|
-
self.pid = pid
|
|
156
|
-
self.cpu_threshold = cpu_threshold
|
|
157
|
-
self.memory_threshold_mb = memory_threshold_mb
|
|
158
|
-
self.fd_threshold = fd_threshold
|
|
159
|
-
self.process = None
|
|
160
|
-
self.logger = logging.getLogger(f"{__name__}.ProcessResourceChecker")
|
|
161
|
-
|
|
162
|
-
if PSUTIL_AVAILABLE:
|
|
163
|
-
try:
|
|
164
|
-
self.process = psutil.Process(pid)
|
|
165
|
-
except psutil.NoSuchProcess:
|
|
166
|
-
self.logger.warning(f"Process {pid} not found for monitoring")
|
|
167
|
-
|
|
168
|
-
def get_name(self) -> str:
|
|
169
|
-
return f"process_resources_{self.pid}"
|
|
170
|
-
|
|
171
|
-
async def check_health(self) -> List[HealthMetric]:
|
|
172
|
-
"""Check process resource usage."""
|
|
173
|
-
metrics = []
|
|
174
|
-
|
|
175
|
-
if not PSUTIL_AVAILABLE:
|
|
176
|
-
metrics.append(
|
|
177
|
-
HealthMetric(
|
|
178
|
-
name="psutil_availability",
|
|
179
|
-
value=False,
|
|
180
|
-
status=HealthStatus.WARNING,
|
|
181
|
-
message="psutil not available for enhanced monitoring",
|
|
182
|
-
)
|
|
183
|
-
)
|
|
184
|
-
return metrics
|
|
185
|
-
|
|
186
|
-
if not self.process:
|
|
187
|
-
metrics.append(
|
|
188
|
-
HealthMetric(
|
|
189
|
-
name="process_exists",
|
|
190
|
-
value=False,
|
|
191
|
-
status=HealthStatus.CRITICAL,
|
|
192
|
-
message=f"Process {self.pid} not found",
|
|
193
|
-
)
|
|
194
|
-
)
|
|
195
|
-
return metrics
|
|
196
|
-
|
|
197
|
-
try:
|
|
198
|
-
# Check if process still exists
|
|
199
|
-
if not self.process.is_running():
|
|
200
|
-
metrics.append(
|
|
201
|
-
HealthMetric(
|
|
202
|
-
name="process_exists",
|
|
203
|
-
value=False,
|
|
204
|
-
status=HealthStatus.CRITICAL,
|
|
205
|
-
message=f"Process {self.pid} is no longer running",
|
|
206
|
-
)
|
|
207
|
-
)
|
|
208
|
-
return metrics
|
|
209
|
-
|
|
210
|
-
# Process status
|
|
211
|
-
status = self.process.status()
|
|
212
|
-
process_healthy = status not in [
|
|
213
|
-
psutil.STATUS_ZOMBIE,
|
|
214
|
-
psutil.STATUS_DEAD,
|
|
215
|
-
psutil.STATUS_STOPPED,
|
|
216
|
-
]
|
|
217
|
-
metrics.append(
|
|
218
|
-
HealthMetric(
|
|
219
|
-
name="process_status",
|
|
220
|
-
value=status,
|
|
221
|
-
status=(
|
|
222
|
-
HealthStatus.HEALTHY
|
|
223
|
-
if process_healthy
|
|
224
|
-
else HealthStatus.CRITICAL
|
|
225
|
-
),
|
|
226
|
-
message=f"Process status: {status}",
|
|
227
|
-
)
|
|
228
|
-
)
|
|
229
|
-
|
|
230
|
-
# CPU usage
|
|
231
|
-
try:
|
|
232
|
-
cpu_percent = self.process.cpu_percent(
|
|
233
|
-
interval=TimeoutConfig.CPU_SAMPLE_INTERVAL
|
|
234
|
-
)
|
|
235
|
-
cpu_status = HealthStatus.HEALTHY
|
|
236
|
-
if cpu_percent > self.cpu_threshold:
|
|
237
|
-
cpu_status = (
|
|
238
|
-
HealthStatus.WARNING
|
|
239
|
-
if cpu_percent < self.cpu_threshold * 1.2
|
|
240
|
-
else HealthStatus.CRITICAL
|
|
241
|
-
)
|
|
242
|
-
|
|
243
|
-
metrics.append(
|
|
244
|
-
HealthMetric(
|
|
245
|
-
name="cpu_usage_percent",
|
|
246
|
-
value=round(cpu_percent, 2),
|
|
247
|
-
status=cpu_status,
|
|
248
|
-
threshold=self.cpu_threshold,
|
|
249
|
-
unit="%",
|
|
250
|
-
)
|
|
251
|
-
)
|
|
252
|
-
except Exception as e:
|
|
253
|
-
metrics.append(
|
|
254
|
-
HealthMetric(
|
|
255
|
-
name="cpu_usage_percent",
|
|
256
|
-
value=-1,
|
|
257
|
-
status=HealthStatus.UNKNOWN,
|
|
258
|
-
message=f"Failed to get CPU usage: {e}",
|
|
259
|
-
)
|
|
260
|
-
)
|
|
261
|
-
|
|
262
|
-
# Memory usage
|
|
263
|
-
try:
|
|
264
|
-
memory_info = self.process.memory_info()
|
|
265
|
-
memory_mb = memory_info.rss / ResourceLimits.BYTES_TO_MB
|
|
266
|
-
memory_status = HealthStatus.HEALTHY
|
|
267
|
-
if memory_mb > self.memory_threshold_mb:
|
|
268
|
-
memory_status = (
|
|
269
|
-
HealthStatus.WARNING
|
|
270
|
-
if memory_mb < self.memory_threshold_mb * 1.2
|
|
271
|
-
else HealthStatus.CRITICAL
|
|
272
|
-
)
|
|
273
|
-
|
|
274
|
-
metrics.append(
|
|
275
|
-
HealthMetric(
|
|
276
|
-
name="memory_usage_mb",
|
|
277
|
-
value=round(memory_mb, 2),
|
|
278
|
-
status=memory_status,
|
|
279
|
-
threshold=self.memory_threshold_mb,
|
|
280
|
-
unit="MB",
|
|
281
|
-
)
|
|
282
|
-
)
|
|
283
|
-
|
|
284
|
-
metrics.append(
|
|
285
|
-
HealthMetric(
|
|
286
|
-
name="memory_vms_mb",
|
|
287
|
-
value=round(memory_info.vms / ResourceLimits.BYTES_TO_MB, 2),
|
|
288
|
-
status=HealthStatus.HEALTHY,
|
|
289
|
-
unit="MB",
|
|
290
|
-
)
|
|
291
|
-
)
|
|
292
|
-
except Exception as e:
|
|
293
|
-
metrics.append(
|
|
294
|
-
HealthMetric(
|
|
295
|
-
name="memory_usage_mb",
|
|
296
|
-
value=-1,
|
|
297
|
-
status=HealthStatus.UNKNOWN,
|
|
298
|
-
message=f"Failed to get memory usage: {e}",
|
|
299
|
-
)
|
|
300
|
-
)
|
|
301
|
-
|
|
302
|
-
# File descriptors (Unix only)
|
|
303
|
-
if hasattr(self.process, "num_fds"):
|
|
304
|
-
try:
|
|
305
|
-
fd_count = self.process.num_fds()
|
|
306
|
-
fd_status = HealthStatus.HEALTHY
|
|
307
|
-
if fd_count > self.fd_threshold:
|
|
308
|
-
fd_status = (
|
|
309
|
-
HealthStatus.WARNING
|
|
310
|
-
if fd_count < self.fd_threshold * 1.2
|
|
311
|
-
else HealthStatus.CRITICAL
|
|
312
|
-
)
|
|
313
|
-
|
|
314
|
-
metrics.append(
|
|
315
|
-
HealthMetric(
|
|
316
|
-
name="file_descriptors",
|
|
317
|
-
value=fd_count,
|
|
318
|
-
status=fd_status,
|
|
319
|
-
threshold=self.fd_threshold,
|
|
320
|
-
)
|
|
321
|
-
)
|
|
322
|
-
except Exception as e:
|
|
323
|
-
metrics.append(
|
|
324
|
-
HealthMetric(
|
|
325
|
-
name="file_descriptors",
|
|
326
|
-
value=-1,
|
|
327
|
-
status=HealthStatus.UNKNOWN,
|
|
328
|
-
message=f"Failed to get file descriptor count: {e}",
|
|
329
|
-
)
|
|
330
|
-
)
|
|
331
|
-
|
|
332
|
-
# Thread count
|
|
333
|
-
try:
|
|
334
|
-
thread_count = self.process.num_threads()
|
|
335
|
-
metrics.append(
|
|
336
|
-
HealthMetric(
|
|
337
|
-
name="thread_count",
|
|
338
|
-
value=thread_count,
|
|
339
|
-
status=HealthStatus.HEALTHY,
|
|
340
|
-
)
|
|
341
|
-
)
|
|
342
|
-
except Exception as e:
|
|
343
|
-
metrics.append(
|
|
344
|
-
HealthMetric(
|
|
345
|
-
name="thread_count",
|
|
346
|
-
value=-1,
|
|
347
|
-
status=HealthStatus.UNKNOWN,
|
|
348
|
-
message=f"Failed to get thread count: {e}",
|
|
349
|
-
)
|
|
350
|
-
)
|
|
351
|
-
|
|
352
|
-
# Process create time (for validation)
|
|
353
|
-
try:
|
|
354
|
-
create_time = self.process.create_time()
|
|
355
|
-
metrics.append(
|
|
356
|
-
HealthMetric(
|
|
357
|
-
name="process_start_time",
|
|
358
|
-
value=create_time,
|
|
359
|
-
status=HealthStatus.HEALTHY,
|
|
360
|
-
unit="timestamp",
|
|
361
|
-
)
|
|
362
|
-
)
|
|
363
|
-
except Exception as e:
|
|
364
|
-
metrics.append(
|
|
365
|
-
HealthMetric(
|
|
366
|
-
name="process_start_time",
|
|
367
|
-
value=-1,
|
|
368
|
-
status=HealthStatus.UNKNOWN,
|
|
369
|
-
message=f"Failed to get process start time: {e}",
|
|
370
|
-
)
|
|
371
|
-
)
|
|
372
|
-
|
|
373
|
-
except psutil.NoSuchProcess:
|
|
374
|
-
metrics.append(
|
|
375
|
-
HealthMetric(
|
|
376
|
-
name="process_exists",
|
|
377
|
-
value=False,
|
|
378
|
-
status=HealthStatus.CRITICAL,
|
|
379
|
-
message=f"Process {self.pid} no longer exists",
|
|
380
|
-
)
|
|
381
|
-
)
|
|
382
|
-
except Exception as e:
|
|
383
|
-
self.logger.error(f"Error checking process health: {e}")
|
|
384
|
-
metrics.append(
|
|
385
|
-
HealthMetric(
|
|
386
|
-
name="process_check_error",
|
|
387
|
-
value=str(e),
|
|
388
|
-
status=HealthStatus.UNKNOWN,
|
|
389
|
-
message=f"Unexpected error during process health check: {e}",
|
|
390
|
-
)
|
|
391
|
-
)
|
|
392
|
-
|
|
393
|
-
return metrics
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
class NetworkConnectivityChecker(HealthChecker):
|
|
397
|
-
"""Health checker for network connectivity.
|
|
398
|
-
|
|
399
|
-
Monitors:
|
|
400
|
-
- Port availability and binding status
|
|
401
|
-
- Socket connection health
|
|
402
|
-
- Network interface status
|
|
403
|
-
"""
|
|
404
|
-
|
|
405
|
-
def __init__(self, host: str, port: int, timeout: float = 1.0):
|
|
406
|
-
"""Initialize network connectivity checker.
|
|
407
|
-
|
|
408
|
-
Args:
|
|
409
|
-
host: Host address to check
|
|
410
|
-
port: Port number to check
|
|
411
|
-
timeout: Connection timeout in seconds
|
|
412
|
-
"""
|
|
413
|
-
self.host = host
|
|
414
|
-
self.port = port
|
|
415
|
-
self.timeout = timeout
|
|
416
|
-
self.logger = logging.getLogger(f"{__name__}.NetworkConnectivityChecker")
|
|
417
|
-
|
|
418
|
-
def get_name(self) -> str:
|
|
419
|
-
return f"network_connectivity_{self.host}_{self.port}"
|
|
420
|
-
|
|
421
|
-
async def check_health(self) -> List[HealthMetric]:
|
|
422
|
-
"""Check network connectivity."""
|
|
423
|
-
metrics = []
|
|
424
|
-
|
|
425
|
-
# Check port binding
|
|
426
|
-
try:
|
|
427
|
-
# Try to connect to the port
|
|
428
|
-
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
|
429
|
-
sock.settimeout(self.timeout)
|
|
430
|
-
result = sock.connect_ex((self.host, self.port))
|
|
431
|
-
sock.close()
|
|
432
|
-
|
|
433
|
-
if result == 0:
|
|
434
|
-
metrics.append(
|
|
435
|
-
HealthMetric(
|
|
436
|
-
name="port_accessible",
|
|
437
|
-
value=True,
|
|
438
|
-
status=HealthStatus.HEALTHY,
|
|
439
|
-
message=f"Port {self.port} is accessible on {self.host}",
|
|
440
|
-
)
|
|
441
|
-
)
|
|
442
|
-
else:
|
|
443
|
-
metrics.append(
|
|
444
|
-
HealthMetric(
|
|
445
|
-
name="port_accessible",
|
|
446
|
-
value=False,
|
|
447
|
-
status=HealthStatus.CRITICAL,
|
|
448
|
-
message=f"Port {self.port} is not accessible on {self.host}",
|
|
449
|
-
)
|
|
450
|
-
)
|
|
451
|
-
except Exception as e:
|
|
452
|
-
metrics.append(
|
|
453
|
-
HealthMetric(
|
|
454
|
-
name="port_accessible",
|
|
455
|
-
value=False,
|
|
456
|
-
status=HealthStatus.UNKNOWN,
|
|
457
|
-
message=f"Error checking port accessibility: {e}",
|
|
458
|
-
)
|
|
459
|
-
)
|
|
460
|
-
|
|
461
|
-
# Check if we can create a socket (resource availability)
|
|
462
|
-
try:
|
|
463
|
-
test_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
|
464
|
-
test_sock.close()
|
|
465
|
-
metrics.append(
|
|
466
|
-
HealthMetric(
|
|
467
|
-
name="socket_creation",
|
|
468
|
-
value=True,
|
|
469
|
-
status=HealthStatus.HEALTHY,
|
|
470
|
-
message="Socket creation successful",
|
|
471
|
-
)
|
|
472
|
-
)
|
|
473
|
-
except Exception as e:
|
|
474
|
-
metrics.append(
|
|
475
|
-
HealthMetric(
|
|
476
|
-
name="socket_creation",
|
|
477
|
-
value=False,
|
|
478
|
-
status=HealthStatus.CRITICAL,
|
|
479
|
-
message=f"Failed to create socket: {e}",
|
|
480
|
-
)
|
|
481
|
-
)
|
|
482
|
-
|
|
483
|
-
return metrics
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
class ServiceHealthChecker(HealthChecker):
|
|
487
|
-
"""Health checker for service-specific metrics.
|
|
488
|
-
|
|
489
|
-
Monitors:
|
|
490
|
-
- Connected clients count
|
|
491
|
-
- Event processing rate
|
|
492
|
-
- Error rates
|
|
493
|
-
- Response times
|
|
494
|
-
"""
|
|
495
|
-
|
|
496
|
-
def __init__(
|
|
497
|
-
self,
|
|
498
|
-
service_stats: Dict[str, Any],
|
|
499
|
-
max_clients: int = 1000,
|
|
500
|
-
max_error_rate: float = 0.1,
|
|
501
|
-
):
|
|
502
|
-
"""Initialize service health checker.
|
|
503
|
-
|
|
504
|
-
Args:
|
|
505
|
-
service_stats: Reference to service statistics dictionary
|
|
506
|
-
max_clients: Maximum allowed connected clients
|
|
507
|
-
max_error_rate: Maximum allowed error rate (0.0-1.0)
|
|
508
|
-
"""
|
|
509
|
-
self.service_stats = service_stats
|
|
510
|
-
self.max_clients = max_clients
|
|
511
|
-
self.max_error_rate = max_error_rate
|
|
512
|
-
self.last_check_time = time.time()
|
|
513
|
-
self.last_events_processed = 0
|
|
514
|
-
self.logger = logging.getLogger(f"{__name__}.ServiceHealthChecker")
|
|
515
|
-
|
|
516
|
-
def get_name(self) -> str:
|
|
517
|
-
return "service_health"
|
|
518
|
-
|
|
519
|
-
async def check_health(self) -> List[HealthMetric]:
|
|
520
|
-
"""Check service-specific health metrics."""
|
|
521
|
-
metrics = []
|
|
522
|
-
current_time = time.time()
|
|
523
|
-
|
|
524
|
-
# Connected clients
|
|
525
|
-
try:
|
|
526
|
-
client_count = self.service_stats.get("clients_connected", 0)
|
|
527
|
-
client_status = HealthStatus.HEALTHY
|
|
528
|
-
if client_count > self.max_clients * 0.8:
|
|
529
|
-
client_status = HealthStatus.WARNING
|
|
530
|
-
if client_count > self.max_clients:
|
|
531
|
-
client_status = HealthStatus.CRITICAL
|
|
532
|
-
|
|
533
|
-
metrics.append(
|
|
534
|
-
HealthMetric(
|
|
535
|
-
name="connected_clients",
|
|
536
|
-
value=client_count,
|
|
537
|
-
status=client_status,
|
|
538
|
-
threshold=self.max_clients,
|
|
539
|
-
)
|
|
540
|
-
)
|
|
541
|
-
except Exception as e:
|
|
542
|
-
metrics.append(
|
|
543
|
-
HealthMetric(
|
|
544
|
-
name="connected_clients",
|
|
545
|
-
value=-1,
|
|
546
|
-
status=HealthStatus.UNKNOWN,
|
|
547
|
-
message=f"Failed to get client count: {e}",
|
|
548
|
-
)
|
|
549
|
-
)
|
|
550
|
-
|
|
551
|
-
# Event processing rate
|
|
552
|
-
try:
|
|
553
|
-
events_processed = self.service_stats.get("events_processed", 0)
|
|
554
|
-
time_diff = current_time - self.last_check_time
|
|
555
|
-
|
|
556
|
-
if time_diff > 0 and self.last_events_processed > 0:
|
|
557
|
-
event_rate = (events_processed - self.last_events_processed) / time_diff
|
|
558
|
-
metrics.append(
|
|
559
|
-
HealthMetric(
|
|
560
|
-
name="event_processing_rate",
|
|
561
|
-
value=round(event_rate, 2),
|
|
562
|
-
status=HealthStatus.HEALTHY,
|
|
563
|
-
unit="events/sec",
|
|
564
|
-
)
|
|
565
|
-
)
|
|
566
|
-
|
|
567
|
-
self.last_events_processed = events_processed
|
|
568
|
-
|
|
569
|
-
# Total events processed
|
|
570
|
-
metrics.append(
|
|
571
|
-
HealthMetric(
|
|
572
|
-
name="total_events_processed",
|
|
573
|
-
value=events_processed,
|
|
574
|
-
status=HealthStatus.HEALTHY,
|
|
575
|
-
)
|
|
576
|
-
)
|
|
577
|
-
except Exception as e:
|
|
578
|
-
metrics.append(
|
|
579
|
-
HealthMetric(
|
|
580
|
-
name="event_processing_rate",
|
|
581
|
-
value=-1,
|
|
582
|
-
status=HealthStatus.UNKNOWN,
|
|
583
|
-
message=f"Failed to calculate event rate: {e}",
|
|
584
|
-
)
|
|
585
|
-
)
|
|
586
|
-
|
|
587
|
-
# Error rate
|
|
588
|
-
try:
|
|
589
|
-
errors = self.service_stats.get("errors", 0)
|
|
590
|
-
total_events = self.service_stats.get(
|
|
591
|
-
"events_processed", 1
|
|
592
|
-
) # Avoid division by zero
|
|
593
|
-
error_rate = errors / max(total_events, 1)
|
|
594
|
-
|
|
595
|
-
error_status = HealthStatus.HEALTHY
|
|
596
|
-
if error_rate > self.max_error_rate * 0.5:
|
|
597
|
-
error_status = HealthStatus.WARNING
|
|
598
|
-
if error_rate > self.max_error_rate:
|
|
599
|
-
error_status = HealthStatus.CRITICAL
|
|
600
|
-
|
|
601
|
-
metrics.append(
|
|
602
|
-
HealthMetric(
|
|
603
|
-
name="error_rate",
|
|
604
|
-
value=round(error_rate, 4),
|
|
605
|
-
status=error_status,
|
|
606
|
-
threshold=self.max_error_rate,
|
|
607
|
-
unit="ratio",
|
|
608
|
-
)
|
|
609
|
-
)
|
|
610
|
-
|
|
611
|
-
metrics.append(
|
|
612
|
-
HealthMetric(
|
|
613
|
-
name="total_errors",
|
|
614
|
-
value=errors,
|
|
615
|
-
status=(
|
|
616
|
-
HealthStatus.HEALTHY if errors == 0 else HealthStatus.WARNING
|
|
617
|
-
),
|
|
618
|
-
)
|
|
619
|
-
)
|
|
620
|
-
except Exception as e:
|
|
621
|
-
metrics.append(
|
|
622
|
-
HealthMetric(
|
|
623
|
-
name="error_rate",
|
|
624
|
-
value=-1,
|
|
625
|
-
status=HealthStatus.UNKNOWN,
|
|
626
|
-
message=f"Failed to calculate error rate: {e}",
|
|
627
|
-
)
|
|
628
|
-
)
|
|
629
|
-
|
|
630
|
-
# Last activity timestamp
|
|
631
|
-
try:
|
|
632
|
-
last_activity = self.service_stats.get("last_activity")
|
|
633
|
-
if last_activity:
|
|
634
|
-
# Parse ISO timestamp or use as-is if numeric
|
|
635
|
-
if isinstance(last_activity, str):
|
|
636
|
-
try:
|
|
637
|
-
from dateutil.parser import parse
|
|
638
|
-
|
|
639
|
-
last_activity_dt = parse(last_activity)
|
|
640
|
-
last_activity_timestamp = last_activity_dt.timestamp()
|
|
641
|
-
except ImportError:
|
|
642
|
-
# Fallback: try to parse ISO format manually
|
|
643
|
-
try:
|
|
644
|
-
from datetime import datetime
|
|
645
|
-
|
|
646
|
-
clean_timestamp = last_activity.rstrip("Z")
|
|
647
|
-
last_activity_dt = datetime.fromisoformat(
|
|
648
|
-
clean_timestamp.replace("T", " ")
|
|
649
|
-
)
|
|
650
|
-
last_activity_timestamp = last_activity_dt.timestamp()
|
|
651
|
-
except Exception:
|
|
652
|
-
# Final fallback: treat as current time
|
|
653
|
-
last_activity_timestamp = current_time
|
|
654
|
-
else:
|
|
655
|
-
last_activity_timestamp = float(last_activity)
|
|
656
|
-
|
|
657
|
-
time_since_activity = current_time - last_activity_timestamp
|
|
658
|
-
activity_status = HealthStatus.HEALTHY
|
|
659
|
-
if time_since_activity > 300: # 5 minutes
|
|
660
|
-
activity_status = HealthStatus.WARNING
|
|
661
|
-
if time_since_activity > 1800: # 30 minutes
|
|
662
|
-
activity_status = HealthStatus.CRITICAL
|
|
663
|
-
|
|
664
|
-
metrics.append(
|
|
665
|
-
HealthMetric(
|
|
666
|
-
name="time_since_last_activity",
|
|
667
|
-
value=round(time_since_activity, 2),
|
|
668
|
-
status=activity_status,
|
|
669
|
-
unit="seconds",
|
|
670
|
-
)
|
|
671
|
-
)
|
|
672
|
-
else:
|
|
673
|
-
metrics.append(
|
|
674
|
-
HealthMetric(
|
|
675
|
-
name="time_since_last_activity",
|
|
676
|
-
value=-1,
|
|
677
|
-
status=HealthStatus.WARNING,
|
|
678
|
-
message="No last activity recorded",
|
|
679
|
-
)
|
|
680
|
-
)
|
|
681
|
-
except Exception as e:
|
|
682
|
-
metrics.append(
|
|
683
|
-
HealthMetric(
|
|
684
|
-
name="time_since_last_activity",
|
|
685
|
-
value=-1,
|
|
686
|
-
status=HealthStatus.UNKNOWN,
|
|
687
|
-
message=f"Failed to parse last activity: {e}",
|
|
688
|
-
)
|
|
689
|
-
)
|
|
690
|
-
|
|
691
|
-
self.last_check_time = current_time
|
|
692
|
-
return metrics
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
class AdvancedHealthMonitor:
|
|
696
|
-
"""Advanced health monitoring system with configurable checks and thresholds.
|
|
697
|
-
|
|
698
|
-
Provides comprehensive health monitoring including:
|
|
699
|
-
- Multiple health checker integration
|
|
700
|
-
- Configurable check intervals and thresholds
|
|
701
|
-
- Health history tracking
|
|
702
|
-
- Status aggregation and reporting
|
|
703
|
-
- Integration with recovery systems
|
|
704
|
-
"""
|
|
705
|
-
|
|
706
|
-
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
|
707
|
-
"""Initialize advanced health monitor.
|
|
708
|
-
|
|
709
|
-
Args:
|
|
710
|
-
config: Configuration dictionary for health monitoring
|
|
711
|
-
"""
|
|
712
|
-
self.config = config or {}
|
|
713
|
-
self.logger = logging.getLogger(f"{__name__}.AdvancedHealthMonitor")
|
|
714
|
-
|
|
715
|
-
# Configuration with defaults
|
|
716
|
-
self.check_interval = self.config.get("check_interval", 30)
|
|
717
|
-
self.history_size = self.config.get("history_size", 100)
|
|
718
|
-
self.aggregation_window = self.config.get(
|
|
719
|
-
"aggregation_window", 300
|
|
720
|
-
) # 5 minutes
|
|
721
|
-
|
|
722
|
-
# Health checkers
|
|
723
|
-
self.checkers: List[HealthChecker] = []
|
|
724
|
-
|
|
725
|
-
# Health history
|
|
726
|
-
self.health_history: deque = deque(maxlen=self.history_size)
|
|
727
|
-
|
|
728
|
-
# Monitoring state
|
|
729
|
-
self.monitoring = False
|
|
730
|
-
self.monitor_task: Optional[asyncio.Task] = None
|
|
731
|
-
self.last_check_result: Optional[HealthCheckResult] = None
|
|
732
|
-
|
|
733
|
-
# Health callbacks for recovery integration
|
|
734
|
-
self.health_callbacks: List[Callable[[HealthCheckResult], None]] = []
|
|
735
|
-
|
|
736
|
-
# Initialize metrics
|
|
737
|
-
self.monitoring_stats = {
|
|
738
|
-
"checks_performed": 0,
|
|
739
|
-
"checks_failed": 0,
|
|
740
|
-
"average_check_duration_ms": 0,
|
|
741
|
-
"last_check_timestamp": None,
|
|
742
|
-
}
|
|
743
|
-
|
|
744
|
-
self.logger.info("Advanced health monitor initialized")
|
|
745
|
-
|
|
746
|
-
def add_checker(self, checker: HealthChecker) -> None:
|
|
747
|
-
"""Add a health checker to the monitoring system."""
|
|
748
|
-
self.checkers.append(checker)
|
|
749
|
-
self.logger.info(f"Added health checker: {checker.get_name()}")
|
|
750
|
-
|
|
751
|
-
def add_health_callback(
|
|
752
|
-
self, callback: Callable[[HealthCheckResult], None]
|
|
753
|
-
) -> None:
|
|
754
|
-
"""Add a callback to be called when health checks complete.
|
|
755
|
-
|
|
756
|
-
Args:
|
|
757
|
-
callback: Function to call with HealthCheckResult
|
|
758
|
-
"""
|
|
759
|
-
self.health_callbacks.append(callback)
|
|
760
|
-
self.logger.debug(f"Added health callback: {callback.__name__}")
|
|
761
|
-
|
|
762
|
-
async def perform_health_check(self) -> HealthCheckResult:
|
|
763
|
-
"""Perform comprehensive health check using all registered checkers."""
|
|
764
|
-
start_time = time.time()
|
|
765
|
-
all_metrics = []
|
|
766
|
-
errors = []
|
|
767
|
-
|
|
768
|
-
# Run all health checkers
|
|
769
|
-
for checker in self.checkers:
|
|
770
|
-
try:
|
|
771
|
-
checker_start = time.time()
|
|
772
|
-
metrics = await checker.check_health()
|
|
773
|
-
checker_duration = (time.time() - checker_start) * 1000
|
|
774
|
-
|
|
775
|
-
all_metrics.extend(metrics)
|
|
776
|
-
self.logger.debug(
|
|
777
|
-
f"Health checker {checker.get_name()} completed in {checker_duration:.2f}ms"
|
|
778
|
-
)
|
|
779
|
-
|
|
780
|
-
except Exception as e:
|
|
781
|
-
error_msg = f"Health checker {checker.get_name()} failed: {e}"
|
|
782
|
-
errors.append(error_msg)
|
|
783
|
-
self.logger.error(error_msg)
|
|
784
|
-
|
|
785
|
-
# Add error metric
|
|
786
|
-
all_metrics.append(
|
|
787
|
-
HealthMetric(
|
|
788
|
-
name=f"{checker.get_name()}_error",
|
|
789
|
-
value=str(e),
|
|
790
|
-
status=HealthStatus.UNKNOWN,
|
|
791
|
-
message=error_msg,
|
|
792
|
-
)
|
|
793
|
-
)
|
|
794
|
-
|
|
795
|
-
# Determine overall status
|
|
796
|
-
overall_status = self._determine_overall_status(all_metrics)
|
|
797
|
-
|
|
798
|
-
# Create result
|
|
799
|
-
duration_ms = (time.time() - start_time) * 1000
|
|
800
|
-
result = HealthCheckResult(
|
|
801
|
-
overall_status=overall_status,
|
|
802
|
-
metrics=all_metrics,
|
|
803
|
-
timestamp=start_time,
|
|
804
|
-
duration_ms=duration_ms,
|
|
805
|
-
errors=errors,
|
|
806
|
-
)
|
|
807
|
-
|
|
808
|
-
# Update statistics
|
|
809
|
-
self.monitoring_stats["checks_performed"] += 1
|
|
810
|
-
if errors:
|
|
811
|
-
self.monitoring_stats["checks_failed"] += 1
|
|
812
|
-
|
|
813
|
-
# Update average duration
|
|
814
|
-
current_avg = self.monitoring_stats["average_check_duration_ms"]
|
|
815
|
-
checks_count = self.monitoring_stats["checks_performed"]
|
|
816
|
-
self.monitoring_stats["average_check_duration_ms"] = (
|
|
817
|
-
current_avg * (checks_count - 1) + duration_ms
|
|
818
|
-
) / checks_count
|
|
819
|
-
self.monitoring_stats["last_check_timestamp"] = time.time()
|
|
820
|
-
|
|
821
|
-
# Store in history
|
|
822
|
-
self.health_history.append(result)
|
|
823
|
-
self.last_check_result = result
|
|
824
|
-
|
|
825
|
-
# Notify callbacks
|
|
826
|
-
for callback in self.health_callbacks:
|
|
827
|
-
try:
|
|
828
|
-
callback(result)
|
|
829
|
-
except Exception as e:
|
|
830
|
-
self.logger.error(f"Health callback {callback.__name__} failed: {e}")
|
|
831
|
-
|
|
832
|
-
self.logger.debug(
|
|
833
|
-
f"Health check completed: {overall_status.value} "
|
|
834
|
-
f"({len(all_metrics)} metrics, {len(errors)} errors, "
|
|
835
|
-
f"{duration_ms:.2f}ms)"
|
|
836
|
-
)
|
|
837
|
-
|
|
838
|
-
return result
|
|
839
|
-
|
|
840
|
-
def _determine_overall_status(self, metrics: List[HealthMetric]) -> HealthStatus:
|
|
841
|
-
"""Determine overall health status from individual metrics."""
|
|
842
|
-
if not metrics:
|
|
843
|
-
return HealthStatus.UNKNOWN
|
|
844
|
-
|
|
845
|
-
# Count metrics by status
|
|
846
|
-
status_counts = dict.fromkeys(HealthStatus, 0)
|
|
847
|
-
for metric in metrics:
|
|
848
|
-
status_counts[metric.status] += 1
|
|
849
|
-
|
|
850
|
-
# Determine overall status based on counts
|
|
851
|
-
total_metrics = len(metrics)
|
|
852
|
-
|
|
853
|
-
# If any critical metrics, overall is critical
|
|
854
|
-
if status_counts[HealthStatus.CRITICAL] > 0:
|
|
855
|
-
return HealthStatus.CRITICAL
|
|
856
|
-
|
|
857
|
-
# If more than 30% warning metrics, overall is warning
|
|
858
|
-
warning_ratio = status_counts[HealthStatus.WARNING] / total_metrics
|
|
859
|
-
if warning_ratio > 0.3:
|
|
860
|
-
return HealthStatus.WARNING
|
|
861
|
-
|
|
862
|
-
# If any warning metrics but less than 30%, still healthy
|
|
863
|
-
if status_counts[HealthStatus.WARNING] > 0:
|
|
864
|
-
return HealthStatus.HEALTHY
|
|
865
|
-
|
|
866
|
-
# If any unknown metrics, overall is unknown
|
|
867
|
-
if status_counts[HealthStatus.UNKNOWN] > 0:
|
|
868
|
-
return HealthStatus.UNKNOWN
|
|
869
|
-
|
|
870
|
-
# All metrics healthy
|
|
871
|
-
return HealthStatus.HEALTHY
|
|
872
|
-
|
|
873
|
-
def start_monitoring(self) -> None:
|
|
874
|
-
"""Start continuous health monitoring."""
|
|
875
|
-
if self.monitoring:
|
|
876
|
-
self.logger.warning("Health monitoring is already running")
|
|
877
|
-
return
|
|
878
|
-
|
|
879
|
-
self.monitoring = True
|
|
880
|
-
self.monitor_task = asyncio.create_task(self._monitoring_loop())
|
|
881
|
-
self.logger.info(
|
|
882
|
-
f"Started health monitoring with {self.check_interval}s interval"
|
|
883
|
-
)
|
|
884
|
-
|
|
885
|
-
async def stop_monitoring(self) -> None:
|
|
886
|
-
"""Stop continuous health monitoring."""
|
|
887
|
-
if not self.monitoring:
|
|
888
|
-
return
|
|
889
|
-
|
|
890
|
-
self.monitoring = False
|
|
891
|
-
if self.monitor_task:
|
|
892
|
-
self.monitor_task.cancel()
|
|
893
|
-
with contextlib.suppress(asyncio.CancelledError):
|
|
894
|
-
await self.monitor_task
|
|
895
|
-
self.monitor_task = None
|
|
896
|
-
|
|
897
|
-
self.logger.info("Stopped health monitoring")
|
|
898
|
-
|
|
899
|
-
async def _monitoring_loop(self) -> None:
|
|
900
|
-
"""Continuous health monitoring loop."""
|
|
901
|
-
try:
|
|
902
|
-
while self.monitoring:
|
|
903
|
-
try:
|
|
904
|
-
await self.perform_health_check()
|
|
905
|
-
except Exception as e:
|
|
906
|
-
self.logger.error(f"Error during health check: {e}")
|
|
907
|
-
|
|
908
|
-
# Wait for next check
|
|
909
|
-
await asyncio.sleep(self.check_interval)
|
|
910
|
-
except asyncio.CancelledError:
|
|
911
|
-
self.logger.debug("Health monitoring loop cancelled")
|
|
912
|
-
except Exception as e:
|
|
913
|
-
self.logger.error(f"Health monitoring loop error: {e}")
|
|
914
|
-
|
|
915
|
-
    def get_current_status(self) -> Optional[HealthCheckResult]:
        """Return the most recent health check result, or None if no check
        has completed yet."""
        return self.last_check_result
|
|
918
|
-
|
|
919
|
-
def get_health_history(
|
|
920
|
-
self, limit: Optional[int] = None
|
|
921
|
-
) -> List[HealthCheckResult]:
|
|
922
|
-
"""Get health check history.
|
|
923
|
-
|
|
924
|
-
Args:
|
|
925
|
-
limit: Maximum number of results to return
|
|
926
|
-
|
|
927
|
-
Returns:
|
|
928
|
-
List of health check results, newest first
|
|
929
|
-
"""
|
|
930
|
-
history = list(self.health_history)
|
|
931
|
-
history.reverse() # Newest first
|
|
932
|
-
|
|
933
|
-
if limit:
|
|
934
|
-
history = history[:limit]
|
|
935
|
-
|
|
936
|
-
return history
|
|
937
|
-
|
|
938
|
-
def get_aggregated_status(
|
|
939
|
-
self, window_seconds: Optional[int] = None
|
|
940
|
-
) -> Dict[str, Any]:
|
|
941
|
-
"""Get aggregated health status over a time window.
|
|
942
|
-
|
|
943
|
-
Args:
|
|
944
|
-
window_seconds: Time window for aggregation (defaults to configured window)
|
|
945
|
-
|
|
946
|
-
Returns:
|
|
947
|
-
Dictionary with aggregated health statistics
|
|
948
|
-
"""
|
|
949
|
-
window_seconds = window_seconds or self.aggregation_window
|
|
950
|
-
current_time = time.time()
|
|
951
|
-
cutoff_time = current_time - window_seconds
|
|
952
|
-
|
|
953
|
-
# Filter history to time window
|
|
954
|
-
recent_results = [
|
|
955
|
-
result for result in self.health_history if result.timestamp >= cutoff_time
|
|
956
|
-
]
|
|
957
|
-
|
|
958
|
-
if not recent_results:
|
|
959
|
-
return {
|
|
960
|
-
"period": "no_data",
|
|
961
|
-
"window_seconds": window_seconds,
|
|
962
|
-
"checks_count": 0,
|
|
963
|
-
"overall_status": HealthStatus.UNKNOWN.value,
|
|
964
|
-
}
|
|
965
|
-
|
|
966
|
-
# Aggregate statistics
|
|
967
|
-
status_counts = dict.fromkeys(HealthStatus, 0)
|
|
968
|
-
total_metrics = 0
|
|
969
|
-
total_errors = 0
|
|
970
|
-
total_duration_ms = 0
|
|
971
|
-
|
|
972
|
-
for result in recent_results:
|
|
973
|
-
status_counts[result.overall_status] += 1
|
|
974
|
-
total_metrics += len(result.metrics)
|
|
975
|
-
total_errors += len(result.errors)
|
|
976
|
-
total_duration_ms += result.duration_ms
|
|
977
|
-
|
|
978
|
-
checks_count = len(recent_results)
|
|
979
|
-
|
|
980
|
-
# Determine aggregated status
|
|
981
|
-
if status_counts[HealthStatus.CRITICAL] > 0:
|
|
982
|
-
aggregated_status = HealthStatus.CRITICAL
|
|
983
|
-
elif status_counts[HealthStatus.WARNING] > checks_count * 0.3:
|
|
984
|
-
aggregated_status = HealthStatus.WARNING
|
|
985
|
-
elif status_counts[HealthStatus.UNKNOWN] > checks_count * 0.5:
|
|
986
|
-
aggregated_status = HealthStatus.UNKNOWN
|
|
987
|
-
else:
|
|
988
|
-
aggregated_status = HealthStatus.HEALTHY
|
|
989
|
-
|
|
990
|
-
return {
|
|
991
|
-
"period": f"last_{window_seconds}_seconds",
|
|
992
|
-
"window_seconds": window_seconds,
|
|
993
|
-
"checks_count": checks_count,
|
|
994
|
-
"overall_status": aggregated_status.value,
|
|
995
|
-
"status_distribution": {
|
|
996
|
-
status.value: count for status, count in status_counts.items()
|
|
997
|
-
},
|
|
998
|
-
"average_metrics_per_check": (
|
|
999
|
-
round(total_metrics / checks_count, 2) if checks_count > 0 else 0
|
|
1000
|
-
),
|
|
1001
|
-
"total_errors": total_errors,
|
|
1002
|
-
"average_duration_ms": (
|
|
1003
|
-
round(total_duration_ms / checks_count, 2) if checks_count > 0 else 0
|
|
1004
|
-
),
|
|
1005
|
-
"monitoring_stats": dict(self.monitoring_stats),
|
|
1006
|
-
}
|
|
1007
|
-
|
|
1008
|
-
def export_diagnostics(self) -> Dict[str, Any]:
|
|
1009
|
-
"""Export comprehensive diagnostics information."""
|
|
1010
|
-
return {
|
|
1011
|
-
"monitor_info": {
|
|
1012
|
-
"check_interval": self.check_interval,
|
|
1013
|
-
"history_size": self.history_size,
|
|
1014
|
-
"aggregation_window": self.aggregation_window,
|
|
1015
|
-
"monitoring_active": self.monitoring,
|
|
1016
|
-
"checkers_count": len(self.checkers),
|
|
1017
|
-
"callbacks_count": len(self.health_callbacks),
|
|
1018
|
-
},
|
|
1019
|
-
"checkers": [checker.get_name() for checker in self.checkers],
|
|
1020
|
-
"current_status": (
|
|
1021
|
-
self.last_check_result.to_dict() if self.last_check_result else None
|
|
1022
|
-
),
|
|
1023
|
-
"aggregated_status": self.get_aggregated_status(),
|
|
1024
|
-
"monitoring_stats": dict(self.monitoring_stats),
|
|
1025
|
-
"history_summary": {
|
|
1026
|
-
"total_checks": len(self.health_history),
|
|
1027
|
-
"oldest_check": (
|
|
1028
|
-
self.health_history[0].timestamp if self.health_history else None
|
|
1029
|
-
),
|
|
1030
|
-
"newest_check": (
|
|
1031
|
-
self.health_history[-1].timestamp if self.health_history else None
|
|
1032
|
-
),
|
|
1033
|
-
},
|
|
1034
|
-
}
|
|
32
|
+
# Re-export all components from the modular implementation so that existing
# `from ...infrastructure.monitoring import X` call sites keep working after
# the refactor into the `monitoring/` subpackage.
from .monitoring import (  # noqa: F401; New service-based API; Base components; Legacy compatibility
    AdvancedHealthMonitor,
    HealthChecker,
    HealthCheckResult,
    HealthMetric,
    HealthStatus,
    MonitoringAggregatorService,
    NetworkConnectivityChecker,
    NetworkHealthService,
    ProcessHealthService,
    ProcessResourceChecker,
    ResourceMonitorService,
    ServiceHealthChecker,
    ServiceHealthService,
)

# Explicit public API of this compatibility shim.
__all__ = [
    # New service-based API
    "ResourceMonitorService",
    "ProcessHealthService",
    "ServiceHealthService",
    "NetworkHealthService",
    "MonitoringAggregatorService",
    # Base components
    "HealthStatus",
    "HealthMetric",
    "HealthCheckResult",
    "HealthChecker",
    # Legacy compatibility
    "ProcessResourceChecker",
    "NetworkConnectivityChecker",
    "ServiceHealthChecker",
    "AdvancedHealthMonitor",
]

# Module metadata
__version__ = "2.0.0"
__author__ = "Claude MPM Team"
__description__ = "Refactored modular health monitoring system"
|