morphml 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of morphml might be problematic. Click here for more details.
- morphml/__init__.py +14 -0
- morphml/api/__init__.py +26 -0
- morphml/api/app.py +326 -0
- morphml/api/auth.py +193 -0
- morphml/api/client.py +338 -0
- morphml/api/models.py +132 -0
- morphml/api/rate_limit.py +192 -0
- morphml/benchmarking/__init__.py +36 -0
- morphml/benchmarking/comparison.py +430 -0
- morphml/benchmarks/__init__.py +56 -0
- morphml/benchmarks/comparator.py +409 -0
- morphml/benchmarks/datasets.py +280 -0
- morphml/benchmarks/metrics.py +199 -0
- morphml/benchmarks/openml_suite.py +201 -0
- morphml/benchmarks/problems.py +289 -0
- morphml/benchmarks/suite.py +318 -0
- morphml/cli/__init__.py +5 -0
- morphml/cli/commands/experiment.py +329 -0
- morphml/cli/main.py +457 -0
- morphml/cli/quickstart.py +312 -0
- morphml/config.py +278 -0
- morphml/constraints/__init__.py +19 -0
- morphml/constraints/handler.py +205 -0
- morphml/constraints/predicates.py +285 -0
- morphml/core/__init__.py +3 -0
- morphml/core/crossover.py +449 -0
- morphml/core/dsl/README.md +359 -0
- morphml/core/dsl/__init__.py +72 -0
- morphml/core/dsl/ast_nodes.py +364 -0
- morphml/core/dsl/compiler.py +318 -0
- morphml/core/dsl/layers.py +368 -0
- morphml/core/dsl/lexer.py +336 -0
- morphml/core/dsl/parser.py +455 -0
- morphml/core/dsl/search_space.py +386 -0
- morphml/core/dsl/syntax.py +199 -0
- morphml/core/dsl/type_system.py +361 -0
- morphml/core/dsl/validator.py +386 -0
- morphml/core/graph/__init__.py +40 -0
- morphml/core/graph/edge.py +124 -0
- morphml/core/graph/graph.py +507 -0
- morphml/core/graph/mutations.py +409 -0
- morphml/core/graph/node.py +196 -0
- morphml/core/graph/serialization.py +361 -0
- morphml/core/graph/visualization.py +431 -0
- morphml/core/objectives/__init__.py +20 -0
- morphml/core/search/__init__.py +33 -0
- morphml/core/search/individual.py +252 -0
- morphml/core/search/parameters.py +453 -0
- morphml/core/search/population.py +375 -0
- morphml/core/search/search_engine.py +340 -0
- morphml/distributed/__init__.py +76 -0
- morphml/distributed/fault_tolerance.py +497 -0
- morphml/distributed/health_monitor.py +348 -0
- morphml/distributed/master.py +709 -0
- morphml/distributed/proto/README.md +224 -0
- morphml/distributed/proto/__init__.py +74 -0
- morphml/distributed/proto/worker.proto +170 -0
- morphml/distributed/proto/worker_pb2.py +79 -0
- morphml/distributed/proto/worker_pb2_grpc.py +423 -0
- morphml/distributed/resource_manager.py +416 -0
- morphml/distributed/scheduler.py +567 -0
- morphml/distributed/storage/__init__.py +33 -0
- morphml/distributed/storage/artifacts.py +381 -0
- morphml/distributed/storage/cache.py +366 -0
- morphml/distributed/storage/checkpointing.py +329 -0
- morphml/distributed/storage/database.py +459 -0
- morphml/distributed/worker.py +549 -0
- morphml/evaluation/__init__.py +5 -0
- morphml/evaluation/heuristic.py +237 -0
- morphml/exceptions.py +55 -0
- morphml/execution/__init__.py +5 -0
- morphml/execution/local_executor.py +350 -0
- morphml/integrations/__init__.py +28 -0
- morphml/integrations/jax_adapter.py +206 -0
- morphml/integrations/pytorch_adapter.py +530 -0
- morphml/integrations/sklearn_adapter.py +206 -0
- morphml/integrations/tensorflow_adapter.py +230 -0
- morphml/logging_config.py +93 -0
- morphml/meta_learning/__init__.py +66 -0
- morphml/meta_learning/architecture_similarity.py +277 -0
- morphml/meta_learning/experiment_database.py +240 -0
- morphml/meta_learning/knowledge_base/__init__.py +19 -0
- morphml/meta_learning/knowledge_base/embedder.py +179 -0
- morphml/meta_learning/knowledge_base/knowledge_base.py +313 -0
- morphml/meta_learning/knowledge_base/meta_features.py +265 -0
- morphml/meta_learning/knowledge_base/vector_store.py +271 -0
- morphml/meta_learning/predictors/__init__.py +27 -0
- morphml/meta_learning/predictors/ensemble.py +221 -0
- morphml/meta_learning/predictors/gnn_predictor.py +552 -0
- morphml/meta_learning/predictors/learning_curve.py +231 -0
- morphml/meta_learning/predictors/proxy_metrics.py +261 -0
- morphml/meta_learning/strategy_evolution/__init__.py +27 -0
- morphml/meta_learning/strategy_evolution/adaptive_optimizer.py +226 -0
- morphml/meta_learning/strategy_evolution/bandit.py +276 -0
- morphml/meta_learning/strategy_evolution/portfolio.py +230 -0
- morphml/meta_learning/transfer.py +581 -0
- morphml/meta_learning/warm_start.py +286 -0
- morphml/optimizers/__init__.py +74 -0
- morphml/optimizers/adaptive_operators.py +399 -0
- morphml/optimizers/bayesian/__init__.py +52 -0
- morphml/optimizers/bayesian/acquisition.py +387 -0
- morphml/optimizers/bayesian/base.py +319 -0
- morphml/optimizers/bayesian/gaussian_process.py +635 -0
- morphml/optimizers/bayesian/smac.py +534 -0
- morphml/optimizers/bayesian/tpe.py +411 -0
- morphml/optimizers/differential_evolution.py +220 -0
- morphml/optimizers/evolutionary/__init__.py +61 -0
- morphml/optimizers/evolutionary/cma_es.py +416 -0
- morphml/optimizers/evolutionary/differential_evolution.py +556 -0
- morphml/optimizers/evolutionary/encoding.py +426 -0
- morphml/optimizers/evolutionary/particle_swarm.py +449 -0
- morphml/optimizers/genetic_algorithm.py +486 -0
- morphml/optimizers/gradient_based/__init__.py +22 -0
- morphml/optimizers/gradient_based/darts.py +550 -0
- morphml/optimizers/gradient_based/enas.py +585 -0
- morphml/optimizers/gradient_based/operations.py +474 -0
- morphml/optimizers/gradient_based/utils.py +601 -0
- morphml/optimizers/hill_climbing.py +169 -0
- morphml/optimizers/multi_objective/__init__.py +56 -0
- morphml/optimizers/multi_objective/indicators.py +504 -0
- morphml/optimizers/multi_objective/nsga2.py +647 -0
- morphml/optimizers/multi_objective/visualization.py +427 -0
- morphml/optimizers/nsga2.py +308 -0
- morphml/optimizers/random_search.py +172 -0
- morphml/optimizers/simulated_annealing.py +181 -0
- morphml/plugins/__init__.py +35 -0
- morphml/plugins/custom_evaluator_example.py +81 -0
- morphml/plugins/custom_optimizer_example.py +63 -0
- morphml/plugins/plugin_system.py +454 -0
- morphml/reports/__init__.py +30 -0
- morphml/reports/generator.py +362 -0
- morphml/tracking/__init__.py +7 -0
- morphml/tracking/experiment.py +309 -0
- morphml/tracking/logger.py +301 -0
- morphml/tracking/reporter.py +357 -0
- morphml/utils/__init__.py +6 -0
- morphml/utils/checkpoint.py +189 -0
- morphml/utils/comparison.py +390 -0
- morphml/utils/export.py +407 -0
- morphml/utils/progress.py +392 -0
- morphml/utils/validation.py +392 -0
- morphml/version.py +7 -0
- morphml/visualization/__init__.py +50 -0
- morphml/visualization/analytics.py +423 -0
- morphml/visualization/architecture_diagrams.py +353 -0
- morphml/visualization/architecture_plot.py +223 -0
- morphml/visualization/convergence_plot.py +174 -0
- morphml/visualization/crossover_viz.py +386 -0
- morphml/visualization/graph_viz.py +338 -0
- morphml/visualization/pareto_plot.py +149 -0
- morphml/visualization/plotly_dashboards.py +422 -0
- morphml/visualization/population.py +309 -0
- morphml/visualization/progress.py +260 -0
- morphml-1.0.0.dist-info/METADATA +434 -0
- morphml-1.0.0.dist-info/RECORD +158 -0
- morphml-1.0.0.dist-info/WHEEL +4 -0
- morphml-1.0.0.dist-info/entry_points.txt +3 -0
- morphml-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
"""System health monitoring for workers.
|
|
2
|
+
|
|
3
|
+
Tracks CPU, memory, GPU, disk, and network health metrics.
|
|
4
|
+
|
|
5
|
+
Author: Eshan Roy <eshanized@proton.me>
|
|
6
|
+
Organization: TONMOY INFRASTRUCTURE & VISION
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import platform
|
|
10
|
+
import time
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from typing import Any, Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
try:
|
|
15
|
+
import psutil
|
|
16
|
+
|
|
17
|
+
PSUTIL_AVAILABLE = True
|
|
18
|
+
except ImportError:
|
|
19
|
+
PSUTIL_AVAILABLE = False
|
|
20
|
+
|
|
21
|
+
try:
|
|
22
|
+
import pynvml
|
|
23
|
+
|
|
24
|
+
PYNVML_AVAILABLE = True
|
|
25
|
+
try:
|
|
26
|
+
pynvml.nvmlInit()
|
|
27
|
+
except Exception:
|
|
28
|
+
PYNVML_AVAILABLE = False
|
|
29
|
+
except ImportError:
|
|
30
|
+
PYNVML_AVAILABLE = False
|
|
31
|
+
|
|
32
|
+
from morphml.logging_config import get_logger
|
|
33
|
+
|
|
34
|
+
logger = get_logger(__name__)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class HealthMetrics:
    """Point-in-time snapshot of host and GPU health readings.

    Every field has a default, so ``HealthMetrics()`` produces a record
    stamped with the current time, zeroed readings, no GPU entries and
    ``is_healthy=True``. Field order is part of the positional constructor
    signature and must not be rearranged.
    """

    # --- when the snapshot was taken -------------------------------------
    # Seconds since the epoch, taken from time.time() at construction.
    timestamp: float = field(default_factory=time.time)

    # --- host readings ---------------------------------------------------
    # CPU utilization as a percentage.
    cpu_percent: float = 0.0
    # Memory in use as a percentage.
    memory_percent: float = 0.0
    # Memory still available, in GiB.
    memory_available_gb: float = 0.0
    # Disk space in use as a percentage.
    disk_percent: float = 0.0
    # Free disk space, in GiB.
    disk_free_gb: float = 0.0

    # --- accelerator / network readings ----------------------------------
    # One dictionary per GPU; a fresh list is created for every instance.
    gpu_stats: List[Dict[str, Any]] = field(default_factory=list)
    # Network latency in milliseconds; None when it was not measured.
    network_latency_ms: Optional[float] = None

    # --- verdict ---------------------------------------------------------
    # Overall verdict for this snapshot.
    is_healthy: bool = True
    # Human-readable descriptions of detected problems; fresh list per instance.
    issues: List[str] = field(default_factory=list)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class HealthMonitor:
    """
    Monitor system health metrics for workers.

    Tracks:
    - CPU utilization
    - Memory usage
    - GPU utilization, memory and temperature (when NVML is usable)
    - Disk space (root filesystem "/")

    Network latency is not measured by this class; the
    ``network_latency_ms`` field of the returned metrics is left at its
    default.

    Args:
        thresholds: Health thresholds dictionary. Missing keys fall back to
            the defaults below; a reading strictly greater than its
            threshold is reported as an issue.
            - cpu_critical: CPU % to mark unhealthy (default: 95)
            - memory_critical: Memory % to mark unhealthy (default: 95)
            - disk_critical: Disk % to mark unhealthy (default: 95)
            - gpu_temp_critical: GPU temperature °C (default: 85)
            - gpu_memory_critical: GPU memory % (default: 95)

    Example:
        >>> monitor = HealthMonitor()
        >>> metrics = monitor.get_health_metrics()
        >>> if not metrics.is_healthy:
        ...     print(f"Health issues: {metrics.issues}")
    """

    def __init__(self, thresholds: Optional[Dict[str, float]] = None):
        """Initialize health monitor.

        Reads the thresholds (see the class docstring) and, when NVML is
        usable, asks it for the number of GPUs. A failure to count GPUs is
        logged and leaves ``gpu_count`` at 0.
        """
        if not PSUTIL_AVAILABLE:
            logger.warning("psutil not available, health monitoring limited")

        thresholds = thresholds or {}
        self.cpu_critical = thresholds.get("cpu_critical", 95.0)
        self.memory_critical = thresholds.get("memory_critical", 95.0)
        self.disk_critical = thresholds.get("disk_critical", 95.0)
        self.gpu_temp_critical = thresholds.get("gpu_temp_critical", 85.0)
        self.gpu_memory_critical = thresholds.get("gpu_memory_critical", 95.0)

        # Initialize GPU monitoring
        self.gpu_count = 0
        if PYNVML_AVAILABLE:
            try:
                self.gpu_count = pynvml.nvmlDeviceGetCount()
                logger.info(f"Detected {self.gpu_count} GPUs")
            except Exception as e:
                logger.warning(f"Failed to detect GPUs: {e}")

        logger.info("Initialized HealthMonitor")

    def get_health_metrics(self) -> HealthMetrics:
        """
        Get current system health metrics.

        Host readings (CPU, memory, disk) are collected only when psutil is
        importable; otherwise they keep their zero defaults. GPU readings
        are collected whenever NVML is usable and at least one GPU was
        detected, independently of psutil, so a missing psutil no longer
        hides GPU problems.

        Returns:
            HealthMetrics object with ``is_healthy`` and ``issues`` filled in
        """
        metrics = HealthMetrics()

        if PSUTIL_AVAILABLE:
            self._collect_host_metrics(metrics)

        # GPU
        if PYNVML_AVAILABLE and self.gpu_count > 0:
            metrics.gpu_stats = self._get_gpu_stats()

        # Check health
        metrics.is_healthy, metrics.issues = self._check_health(metrics)

        return metrics

    def _collect_host_metrics(self, metrics: HealthMetrics) -> None:
        """Fill CPU, memory and disk readings of ``metrics`` in place via psutil.

        Each reading is collected independently: a failure is logged and
        leaves that reading at its previous value without affecting the
        others.
        """
        # CPU
        try:
            # interval=0.1 makes this call block for about 0.1 s while sampling.
            metrics.cpu_percent = psutil.cpu_percent(interval=0.1)
        except Exception as e:
            logger.warning(f"Failed to get CPU metrics: {e}")

        # Memory
        try:
            mem = psutil.virtual_memory()
            metrics.memory_percent = mem.percent
            metrics.memory_available_gb = mem.available / (1024**3)
        except Exception as e:
            logger.warning(f"Failed to get memory metrics: {e}")

        # Disk
        try:
            disk = psutil.disk_usage("/")
            metrics.disk_percent = disk.percent
            metrics.disk_free_gb = disk.free / (1024**3)
        except Exception as e:
            logger.warning(f"Failed to get disk metrics: {e}")

    @staticmethod
    def _device_name(handle: Any) -> str:
        """Return the GPU name for ``handle`` as text.

        The value handed back by ``nvmlDeviceGetName`` is decoded as UTF-8
        when it arrives as bytes and passed through otherwise.
        """
        name = pynvml.nvmlDeviceGetName(handle)
        if isinstance(name, bytes):
            name = name.decode("utf-8")
        return name

    def _get_gpu_stats(self) -> List[Dict[str, Any]]:
        """Get GPU statistics using pynvml.

        Returns:
            One dictionary per GPU that could be queried, with keys ``id``,
            ``name``, ``load``, ``memory_used_mb``, ``memory_total_mb``,
            ``memory_percent`` and ``temperature``. A GPU whose query raises
            is logged and omitted.
        """
        gpu_stats = []

        for i in range(self.gpu_count):
            try:
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)

                # Utilization
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)

                # Memory
                mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)

                # Temperature
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)

                # A reported total of 0 would otherwise raise ZeroDivisionError
                # and drop the whole record for this GPU.
                if mem_info.total:
                    memory_percent = (mem_info.used / mem_info.total) * 100
                else:
                    memory_percent = 0.0

                gpu_stats.append(
                    {
                        "id": i,
                        "name": self._device_name(handle),
                        "load": util.gpu,
                        "memory_used_mb": mem_info.used / (1024**2),
                        "memory_total_mb": mem_info.total / (1024**2),
                        "memory_percent": memory_percent,
                        "temperature": temp,
                    }
                )

            except Exception as e:
                logger.warning(f"Failed to get stats for GPU {i}: {e}")

        return gpu_stats

    def _check_health(self, metrics: HealthMetrics) -> tuple[bool, List[str]]:
        """
        Check if system is healthy based on metrics.

        A reading is an issue only when it is strictly greater than the
        corresponding threshold configured on this monitor.

        Returns:
            (is_healthy, list of issues); healthy means the list is empty
        """
        issues = []

        # Check CPU
        if metrics.cpu_percent > self.cpu_critical:
            issues.append(f"CPU overload: {metrics.cpu_percent:.1f}%")

        # Check memory
        if metrics.memory_percent > self.memory_critical:
            issues.append(f"Memory critical: {metrics.memory_percent:.1f}%")

        # Check disk
        if metrics.disk_percent > self.disk_critical:
            issues.append(
                f"Disk critical: {metrics.disk_percent:.1f}% "
                f"({metrics.disk_free_gb:.1f}GB free)"
            )

        # Check GPUs
        for gpu in metrics.gpu_stats:
            gpu_id = gpu["id"]

            # Temperature
            if gpu["temperature"] > self.gpu_temp_critical:
                issues.append(f"GPU {gpu_id} overheating: {gpu['temperature']}°C")

            # Memory
            if gpu["memory_percent"] > self.gpu_memory_critical:
                issues.append(f"GPU {gpu_id} memory critical: {gpu['memory_percent']:.1f}%")

        return not issues, issues

    def get_system_info(self) -> Dict[str, Any]:
        """
        Get static system information.

        Returns:
            System info dictionary. Platform keys are always present; CPU,
            memory and disk totals are added when psutil is importable and
            the queries succeed; ``gpus`` is always present and is a list
            (empty when no GPU information could be read).
        """
        info = {
            "platform": platform.system(),
            "platform_release": platform.release(),
            "platform_version": platform.version(),
            "architecture": platform.machine(),
            "processor": platform.processor(),
            "python_version": platform.python_version(),
        }

        if PSUTIL_AVAILABLE:
            try:
                info["cpu_count_logical"] = psutil.cpu_count(logical=True)
                info["cpu_count_physical"] = psutil.cpu_count(logical=False)
                info["memory_total_gb"] = psutil.virtual_memory().total / (1024**3)
                info["disk_total_gb"] = psutil.disk_usage("/").total / (1024**3)
            except Exception as e:
                logger.warning(f"Failed to get system info: {e}")

        gpu_info = []
        if PYNVML_AVAILABLE and self.gpu_count > 0:
            for i in range(self.gpu_count):
                try:
                    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                    name = self._device_name(handle)
                    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)

                    gpu_info.append(
                        {
                            "id": i,
                            "name": name,
                            "memory_total_gb": mem_info.total / (1024**3),
                        }
                    )
                except Exception as e:
                    logger.warning(f"Failed to get info for GPU {i}: {e}")

        info["gpus"] = gpu_info

        return info

    def monitor_continuously(
        self, interval: float = 60.0, callback: Optional[callable] = None
    ) -> None:
        """
        Continuously monitor health (blocking).

        Loops forever: collects metrics, logs a warning when they are
        unhealthy, invokes ``callback`` (if given) and sleeps. The loop
        ends quietly on KeyboardInterrupt; any exception raised by
        ``callback`` is not caught here and propagates to the caller.

        Args:
            interval: Monitoring interval in seconds
            callback: Optional callback function(metrics), called once per
                cycle with the HealthMetrics just collected
        """
        logger.info(f"Starting continuous monitoring (interval={interval}s)")

        try:
            while True:
                metrics = self.get_health_metrics()

                if not metrics.is_healthy:
                    logger.warning(f"Health issues detected: {metrics.issues}")

                if callback:
                    callback(metrics)

                time.sleep(interval)

        except KeyboardInterrupt:
            logger.info("Stopping continuous monitoring")

    def cleanup(self) -> None:
        """Cleanup resources.

        Calls ``pynvml.nvmlShutdown()`` when NVML is usable. A failing
        shutdown is logged rather than raised. Afterwards this monitor
        stops issuing GPU queries (``gpu_count`` is reset to 0), so later
        calls to ``get_health_metrics`` do not query a shut-down NVML.
        """
        if not PYNVML_AVAILABLE:
            return

        try:
            pynvml.nvmlShutdown()
        except Exception as e:
            # Best-effort: cleanup must not raise, but the failure is recorded.
            logger.debug(f"NVML shutdown failed: {e}")
        finally:
            self.gpu_count = 0
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def get_system_health(thresholds: Optional[Dict[str, float]] = None) -> Dict[str, Any]:
    """
    Convenience function to get system health.

    Builds a temporary HealthMonitor, takes one reading and flattens it into
    a plain dictionary.

    Args:
        thresholds: Optional custom thresholds, passed unchanged to
            HealthMonitor. Omitting it keeps the monitor's defaults, which
            is the behavior of the previous zero-argument form.

    Returns:
        Health metrics dictionary with keys ``timestamp``, ``cpu_percent``,
        ``memory_percent``, ``memory_available_gb``, ``disk_percent``,
        ``disk_free_gb``, ``gpus``, ``is_healthy`` and ``issues``. Note that
        the per-GPU list is exposed under ``gpus`` (not ``gpu_stats``).
    """
    monitor = HealthMonitor(thresholds)
    metrics = monitor.get_health_metrics()

    return {
        "timestamp": metrics.timestamp,
        "cpu_percent": metrics.cpu_percent,
        "memory_percent": metrics.memory_percent,
        "memory_available_gb": metrics.memory_available_gb,
        "disk_percent": metrics.disk_percent,
        "disk_free_gb": metrics.disk_free_gb,
        "gpus": metrics.gpu_stats,
        "is_healthy": metrics.is_healthy,
        "issues": metrics.issues,
    }
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def is_system_healthy(thresholds: Optional[Dict[str, float]] = None) -> bool:
    """
    Quick health check.

    Creates a throwaway HealthMonitor, takes a single reading and reports
    only its overall verdict.

    Args:
        thresholds: Optional custom thresholds, handed to HealthMonitor as-is

    Returns:
        True if system is healthy
    """
    snapshot = HealthMonitor(thresholds).get_health_metrics()
    return snapshot.is_healthy
|