claude-mpm 4.1.4__py3-none-any.whl → 4.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_mpm/VERSION +1 -1
- claude_mpm/cli/commands/tickets.py +365 -784
- claude_mpm/core/output_style_manager.py +24 -0
- claude_mpm/core/unified_agent_registry.py +46 -15
- claude_mpm/services/agents/deployment/agent_discovery_service.py +12 -3
- claude_mpm/services/agents/deployment/agent_lifecycle_manager.py +172 -233
- claude_mpm/services/agents/deployment/agent_lifecycle_manager_refactored.py +575 -0
- claude_mpm/services/agents/deployment/agent_operation_service.py +573 -0
- claude_mpm/services/agents/deployment/agent_record_service.py +419 -0
- claude_mpm/services/agents/deployment/agent_state_service.py +381 -0
- claude_mpm/services/agents/deployment/multi_source_deployment_service.py +4 -2
- claude_mpm/services/infrastructure/__init__.py +31 -5
- claude_mpm/services/infrastructure/monitoring/__init__.py +43 -0
- claude_mpm/services/infrastructure/monitoring/aggregator.py +437 -0
- claude_mpm/services/infrastructure/monitoring/base.py +130 -0
- claude_mpm/services/infrastructure/monitoring/legacy.py +203 -0
- claude_mpm/services/infrastructure/monitoring/network.py +218 -0
- claude_mpm/services/infrastructure/monitoring/process.py +342 -0
- claude_mpm/services/infrastructure/monitoring/resources.py +243 -0
- claude_mpm/services/infrastructure/monitoring/service.py +367 -0
- claude_mpm/services/infrastructure/monitoring.py +67 -1030
- claude_mpm/services/project/analyzer.py +13 -4
- claude_mpm/services/project/analyzer_refactored.py +450 -0
- claude_mpm/services/project/analyzer_v2.py +566 -0
- claude_mpm/services/project/architecture_analyzer.py +461 -0
- claude_mpm/services/project/dependency_analyzer.py +462 -0
- claude_mpm/services/project/language_analyzer.py +265 -0
- claude_mpm/services/project/metrics_collector.py +410 -0
- claude_mpm/services/ticket_manager.py +5 -1
- claude_mpm/services/ticket_services/__init__.py +26 -0
- claude_mpm/services/ticket_services/crud_service.py +328 -0
- claude_mpm/services/ticket_services/formatter_service.py +290 -0
- claude_mpm/services/ticket_services/search_service.py +324 -0
- claude_mpm/services/ticket_services/validation_service.py +303 -0
- claude_mpm/services/ticket_services/workflow_service.py +244 -0
- {claude_mpm-4.1.4.dist-info → claude_mpm-4.1.5.dist-info}/METADATA +1 -1
- {claude_mpm-4.1.4.dist-info → claude_mpm-4.1.5.dist-info}/RECORD +41 -17
- {claude_mpm-4.1.4.dist-info → claude_mpm-4.1.5.dist-info}/WHEEL +0 -0
- {claude_mpm-4.1.4.dist-info → claude_mpm-4.1.5.dist-info}/entry_points.txt +0 -0
- {claude_mpm-4.1.4.dist-info → claude_mpm-4.1.5.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-4.1.4.dist-info → claude_mpm-4.1.5.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
"""Process health monitoring service.
|
|
2
|
+
|
|
3
|
+
Monitors individual process health including CPU, memory, file descriptors, and threads.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import List
|
|
7
|
+
|
|
8
|
+
from claude_mpm.core.constants import ResourceLimits, TimeoutConfig
|
|
9
|
+
|
|
10
|
+
from .base import BaseMonitoringService, HealthMetric, HealthStatus
|
|
11
|
+
|
|
12
|
+
try:
|
|
13
|
+
import psutil
|
|
14
|
+
|
|
15
|
+
PSUTIL_AVAILABLE = True
|
|
16
|
+
except ImportError:
|
|
17
|
+
PSUTIL_AVAILABLE = False
|
|
18
|
+
psutil = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ProcessHealthService(BaseMonitoringService):
    """Service for monitoring individual process health.

    Monitors:
    - Process existence and status
    - Process CPU usage
    - Process memory usage (RSS, VMS)
    - File descriptor count (Unix)
    - Thread count
    """

    def __init__(
        self,
        pid: int,
        cpu_threshold: float = 80.0,
        memory_threshold_mb: int = 500,
        fd_threshold: int = 1000,
    ):
        """Initialize process health service.

        Args:
            pid: Process ID to monitor
            cpu_threshold: CPU usage threshold as percentage
            memory_threshold_mb: Memory usage threshold in MB
            fd_threshold: File descriptor count threshold
        """
        super().__init__(f"ProcessHealth_{pid}")
        self.pid = pid
        self.cpu_threshold = cpu_threshold
        self.memory_threshold_mb = memory_threshold_mb
        self.fd_threshold = fd_threshold
        self.process = None

        if PSUTIL_AVAILABLE:
            try:
                self.process = psutil.Process(pid)
            except psutil.NoSuchProcess:
                self.logger.warning(f"Process {pid} not found for monitoring")

    async def check_health(self) -> List[HealthMetric]:
        """Check process health metrics.

        Returns:
            List of HealthMetric entries. Collapses to a single metric when
            psutil is unavailable or the target process cannot be found.
        """
        metrics = []

        if not PSUTIL_AVAILABLE:
            metrics.append(
                HealthMetric(
                    name="psutil_availability",
                    value=False,
                    status=HealthStatus.WARNING,
                    message="psutil not available for process monitoring",
                )
            )
            return metrics

        if not self.process:
            metrics.append(
                HealthMetric(
                    name="process_exists",
                    value=False,
                    status=HealthStatus.CRITICAL,
                    message=f"Process {self.pid} not found",
                )
            )
            return metrics

        try:
            # Bail out early if the process died since the handle was created.
            if not self.process.is_running():
                metrics.append(
                    HealthMetric(
                        name="process_exists",
                        value=False,
                        status=HealthStatus.CRITICAL,
                        message=f"Process {self.pid} is no longer running",
                    )
                )
                return metrics

            # Process status
            metrics.extend(self._check_process_status())

            # CPU usage
            metrics.extend(self._check_cpu_usage())

            # Memory usage
            metrics.extend(self._check_memory_usage())

            # File descriptors
            metrics.extend(self._check_file_descriptors())

            # Thread count
            metrics.extend(self._check_thread_count())

            # Process metadata
            metrics.extend(self._check_process_metadata())

        except psutil.NoSuchProcess:
            metrics.append(
                HealthMetric(
                    name="process_exists",
                    value=False,
                    status=HealthStatus.CRITICAL,
                    message=f"Process {self.pid} no longer exists",
                )
            )
        except Exception as e:
            self.logger.error(f"Error checking process health: {e}")
            metrics.append(
                HealthMetric(
                    name="process_check_error",
                    value=str(e),
                    status=HealthStatus.UNKNOWN,
                    message=f"Unexpected error during process health check: {e}",
                )
            )

        return metrics

    def _threshold_status(self, value: float, threshold: float) -> HealthStatus:
        """Map a measured value to a health status relative to its threshold.

        HEALTHY at or below the threshold, WARNING up to 20% above it,
        CRITICAL beyond that — the same banding the CPU, memory, and FD
        checks previously duplicated inline.
        """
        if value <= threshold:
            return HealthStatus.HEALTHY
        if value < threshold * 1.2:
            return HealthStatus.WARNING
        return HealthStatus.CRITICAL

    def _check_process_status(self) -> List[HealthMetric]:
        """Check process status."""
        metrics = []
        try:
            status = self.process.status()
            # Zombie/dead/stopped processes are not doing useful work.
            process_healthy = status not in [
                psutil.STATUS_ZOMBIE,
                psutil.STATUS_DEAD,
                psutil.STATUS_STOPPED,
            ]
            metrics.append(
                HealthMetric(
                    name="process_status",
                    value=status,
                    status=(
                        HealthStatus.HEALTHY
                        if process_healthy
                        else HealthStatus.CRITICAL
                    ),
                    message=f"Process status: {status}",
                )
            )
        except Exception as e:
            metrics.append(
                HealthMetric(
                    name="process_status",
                    value="unknown",
                    status=HealthStatus.UNKNOWN,
                    message=f"Failed to get process status: {e}",
                )
            )
        return metrics

    def _check_cpu_usage(self) -> List[HealthMetric]:
        """Check CPU usage."""
        metrics = []
        try:
            # NOTE(review): a non-zero interval makes cpu_percent() block for
            # the whole sampling window, which stalls the event loop when
            # invoked from the async check_health — confirm
            # CPU_SAMPLE_INTERVAL is small or move this to an executor.
            cpu_percent = self.process.cpu_percent(
                interval=TimeoutConfig.CPU_SAMPLE_INTERVAL
            )
            metrics.append(
                HealthMetric(
                    name="cpu_usage_percent",
                    value=round(cpu_percent, 2),
                    status=self._threshold_status(cpu_percent, self.cpu_threshold),
                    threshold=self.cpu_threshold,
                    unit="%",
                )
            )
        except Exception as e:
            metrics.append(
                HealthMetric(
                    name="cpu_usage_percent",
                    value=-1,
                    status=HealthStatus.UNKNOWN,
                    message=f"Failed to get CPU usage: {e}",
                )
            )
        return metrics

    def _check_memory_usage(self) -> List[HealthMetric]:
        """Check memory usage (RSS against threshold; VMS informational)."""
        metrics = []
        try:
            memory_info = self.process.memory_info()
            memory_mb = memory_info.rss / ResourceLimits.BYTES_TO_MB
            metrics.append(
                HealthMetric(
                    name="memory_usage_mb",
                    value=round(memory_mb, 2),
                    status=self._threshold_status(memory_mb, self.memory_threshold_mb),
                    threshold=self.memory_threshold_mb,
                    unit="MB",
                )
            )

            metrics.append(
                HealthMetric(
                    name="memory_vms_mb",
                    value=round(memory_info.vms / ResourceLimits.BYTES_TO_MB, 2),
                    status=HealthStatus.HEALTHY,
                    unit="MB",
                )
            )
        except Exception as e:
            metrics.append(
                HealthMetric(
                    name="memory_usage_mb",
                    value=-1,
                    status=HealthStatus.UNKNOWN,
                    message=f"Failed to get memory usage: {e}",
                )
            )
        return metrics

    def _check_file_descriptors(self) -> List[HealthMetric]:
        """Check file descriptor count (Unix only)."""
        metrics = []
        # num_fds is only available on POSIX platforms; elsewhere this
        # check silently contributes no metrics.
        if hasattr(self.process, "num_fds"):
            try:
                fd_count = self.process.num_fds()
                metrics.append(
                    HealthMetric(
                        name="file_descriptors",
                        value=fd_count,
                        status=self._threshold_status(fd_count, self.fd_threshold),
                        threshold=self.fd_threshold,
                    )
                )
            except Exception as e:
                metrics.append(
                    HealthMetric(
                        name="file_descriptors",
                        value=-1,
                        status=HealthStatus.UNKNOWN,
                        message=f"Failed to get file descriptor count: {e}",
                    )
                )
        return metrics

    def _check_thread_count(self) -> List[HealthMetric]:
        """Check thread count (informational; no threshold applied)."""
        metrics = []
        try:
            thread_count = self.process.num_threads()
            metrics.append(
                HealthMetric(
                    name="thread_count",
                    value=thread_count,
                    status=HealthStatus.HEALTHY,
                )
            )
        except Exception as e:
            metrics.append(
                HealthMetric(
                    name="thread_count",
                    value=-1,
                    status=HealthStatus.UNKNOWN,
                    message=f"Failed to get thread count: {e}",
                )
            )
        return metrics

    def _check_process_metadata(self) -> List[HealthMetric]:
        """Check process metadata (currently just the start timestamp)."""
        metrics = []
        try:
            create_time = self.process.create_time()
            metrics.append(
                HealthMetric(
                    name="process_start_time",
                    value=create_time,
                    status=HealthStatus.HEALTHY,
                    unit="timestamp",
                )
            )
        except Exception as e:
            metrics.append(
                HealthMetric(
                    name="process_start_time",
                    value=-1,
                    status=HealthStatus.UNKNOWN,
                    message=f"Failed to get process start time: {e}",
                )
            )
        return metrics

    def is_process_alive(self) -> bool:
        """Quick check if process is still alive.

        Returns:
            True if process exists and is running
        """
        if not PSUTIL_AVAILABLE or not self.process:
            return False

        try:
            return self.process.is_running()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            return False
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"""Resource monitoring service for system resources (CPU, memory, disk).
|
|
2
|
+
|
|
3
|
+
Monitors system-wide resource usage including CPU, memory, and disk utilization.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from typing import Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
from .base import BaseMonitoringService, HealthMetric, HealthStatus
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import psutil
|
|
12
|
+
|
|
13
|
+
PSUTIL_AVAILABLE = True
|
|
14
|
+
except ImportError:
|
|
15
|
+
PSUTIL_AVAILABLE = False
|
|
16
|
+
psutil = None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ResourceMonitorService(BaseMonitoringService):
    """Service for monitoring system resource usage.

    Monitors:
    - System CPU usage
    - System memory usage
    - Disk space utilization
    - System load average
    """

    def __init__(
        self,
        cpu_threshold: float = 80.0,
        memory_threshold: float = 85.0,
        disk_threshold: float = 90.0,
        disk_path: str = "/",
    ):
        """Initialize resource monitor service.

        Args:
            cpu_threshold: CPU usage warning threshold (%)
            memory_threshold: Memory usage warning threshold (%)
            disk_threshold: Disk usage warning threshold (%)
            disk_path: Filesystem path whose usage is sampled (default "/",
                matching the previously hard-coded mount point)
        """
        super().__init__("ResourceMonitor")
        self.cpu_threshold = cpu_threshold
        self.memory_threshold = memory_threshold
        self.disk_threshold = disk_threshold
        self.disk_path = disk_path

    async def check_health(self) -> List[HealthMetric]:
        """Check system resource health.

        Each subsystem is sampled independently so a failure in one does
        not suppress the metrics of the others.
        """
        metrics = []

        if not PSUTIL_AVAILABLE:
            metrics.append(
                HealthMetric(
                    name="psutil_availability",
                    value=False,
                    status=HealthStatus.WARNING,
                    message="psutil not available for resource monitoring",
                )
            )
            return metrics

        metrics.extend(self._check_cpu())
        metrics.extend(self._check_memory())
        metrics.extend(self._check_disk())
        metrics.extend(self._check_load_average())
        return metrics

    def _check_cpu(self) -> List[HealthMetric]:
        """Sample system-wide CPU usage plus core count for context."""
        metrics = []
        try:
            cpu_percent = psutil.cpu_percent(interval=0.1)
            metrics.append(
                HealthMetric(
                    name="system_cpu_usage",
                    value=round(cpu_percent, 2),
                    status=self._get_threshold_status(cpu_percent, self.cpu_threshold),
                    threshold=self.cpu_threshold,
                    unit="%",
                )
            )

            # CPU count for context
            metrics.append(
                HealthMetric(
                    name="cpu_count",
                    value=psutil.cpu_count(),
                    status=HealthStatus.HEALTHY,
                )
            )
        except Exception as e:
            self.logger.error(f"Failed to get CPU usage: {e}")
            metrics.append(
                HealthMetric(
                    name="system_cpu_usage",
                    value=-1,
                    status=HealthStatus.UNKNOWN,
                    message=str(e),
                )
            )
        return metrics

    def _check_memory(self) -> List[HealthMetric]:
        """Sample system memory usage plus available/total context."""
        metrics = []
        try:
            memory = psutil.virtual_memory()
            metrics.append(
                HealthMetric(
                    name="system_memory_usage",
                    value=round(memory.percent, 2),
                    status=self._get_threshold_status(
                        memory.percent, self.memory_threshold
                    ),
                    threshold=self.memory_threshold,
                    unit="%",
                )
            )

            # Memory details
            metrics.append(
                HealthMetric(
                    name="memory_available_gb",
                    value=round(memory.available / (1024**3), 2),
                    status=HealthStatus.HEALTHY,
                    unit="GB",
                )
            )

            metrics.append(
                HealthMetric(
                    name="memory_total_gb",
                    value=round(memory.total / (1024**3), 2),
                    status=HealthStatus.HEALTHY,
                    unit="GB",
                )
            )
        except Exception as e:
            self.logger.error(f"Failed to get memory usage: {e}")
            metrics.append(
                HealthMetric(
                    name="system_memory_usage",
                    value=-1,
                    status=HealthStatus.UNKNOWN,
                    message=str(e),
                )
            )
        return metrics

    def _check_disk(self) -> List[HealthMetric]:
        """Sample disk utilization for the configured mount point."""
        metrics = []
        try:
            disk = psutil.disk_usage(self.disk_path)
            metrics.append(
                HealthMetric(
                    name="disk_usage",
                    value=round(disk.percent, 2),
                    status=self._get_threshold_status(disk.percent, self.disk_threshold),
                    threshold=self.disk_threshold,
                    unit="%",
                )
            )

            metrics.append(
                HealthMetric(
                    name="disk_free_gb",
                    value=round(disk.free / (1024**3), 2),
                    status=HealthStatus.HEALTHY,
                    unit="GB",
                )
            )
        except Exception as e:
            self.logger.error(f"Failed to get disk usage: {e}")
            metrics.append(
                HealthMetric(
                    name="disk_usage",
                    value=-1,
                    status=HealthStatus.UNKNOWN,
                    message=str(e),
                )
            )
        return metrics

    def _check_load_average(self) -> List[HealthMetric]:
        """Sample 1- and 5-minute load averages (Unix only)."""
        metrics = []
        try:
            if hasattr(psutil, "getloadavg"):
                load1, load5, _load15 = psutil.getloadavg()
                # cpu_count() can return None on some platforms; fall back to
                # 1 so the comparisons below cannot raise TypeError.
                cpu_count = psutil.cpu_count() or 1

                # Load is concerning if > cpu_count
                load_status = HealthStatus.HEALTHY
                if load1 > cpu_count:
                    load_status = HealthStatus.WARNING
                if load1 > cpu_count * 1.5:
                    load_status = HealthStatus.CRITICAL

                metrics.append(
                    HealthMetric(
                        name="load_average_1min",
                        value=round(load1, 2),
                        status=load_status,
                        threshold=cpu_count,
                    )
                )

                metrics.append(
                    HealthMetric(
                        name="load_average_5min",
                        value=round(load5, 2),
                        status=HealthStatus.HEALTHY,
                    )
                )
        except Exception as e:
            self.logger.debug(f"Load average not available: {e}")
        return metrics

    def _get_threshold_status(self, value: float, threshold: float) -> HealthStatus:
        """Determine health status based on threshold.

        Args:
            value: Current value
            threshold: Warning threshold

        Returns:
            Health status based on value vs threshold
        """
        if value < threshold:
            return HealthStatus.HEALTHY
        if value < threshold * 1.1:  # 10% above threshold
            return HealthStatus.WARNING
        return HealthStatus.CRITICAL

    def get_resource_summary(self) -> Optional[Dict[str, float]]:
        """Get quick resource summary without full health check.

        Returns:
            Dictionary with current resource usage percentages, or None when
            psutil is unavailable or sampling fails.
        """
        if not PSUTIL_AVAILABLE:
            return None

        try:
            return {
                "cpu_percent": psutil.cpu_percent(interval=0.1),
                "memory_percent": psutil.virtual_memory().percent,
                "disk_percent": psutil.disk_usage(self.disk_path).percent,
            }
        except Exception as e:
            self.logger.error(f"Failed to get resource summary: {e}")
            return None
|