isa-model 0.4.3__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,428 +0,0 @@
- """
- Local service health monitoring and management
-
- Provides health checking, monitoring, and management for local GPU services.
- """
-
- import asyncio
- import logging
- import time
- from typing import Dict, List, Optional, Any, Union
- from datetime import datetime, timedelta
- from dataclasses import dataclass, field
- from enum import Enum
-
- from ...utils.gpu_utils import get_gpu_manager
-
- logger = logging.getLogger(__name__)
-
-
- class ServiceStatus(Enum):
-     """Service status states"""
-     STOPPED = "stopped"
-     STARTING = "starting"
-     RUNNING = "running"
-     ERROR = "error"
-     UNHEALTHY = "unhealthy"
-     STOPPING = "stopping"
-
-
- @dataclass
- class HealthMetrics:
-     """Health metrics for a service"""
-     service_name: str
-     status: ServiceStatus
-     last_check: datetime
-     response_time_ms: Optional[float] = None
-     error_count: int = 0
-     consecutive_failures: int = 0
-     uptime_seconds: Optional[float] = None
-     memory_usage_mb: Optional[float] = None
-     gpu_utilization: Optional[float] = None
-     request_count: int = 0
-     last_error: Optional[str] = None
-     metadata: Dict[str, Any] = field(default_factory=dict)
-
-
- class LocalHealthChecker:
-     """Health checker for local GPU services"""
-
-     def __init__(self, check_interval: int = 30, failure_threshold: int = 3):
-         """
-         Initialize health checker.
-
-         Args:
-             check_interval: Health check interval in seconds
-             failure_threshold: Number of consecutive failures before marking unhealthy
-         """
-         self.check_interval = check_interval
-         self.failure_threshold = failure_threshold
-         self.gpu_manager = get_gpu_manager()
-
-         # Service tracking
-         self.services: Dict[str, Any] = {}  # service_name -> service instance
-         self.metrics: Dict[str, HealthMetrics] = {}  # service_name -> metrics
-         self.monitoring_tasks: Dict[str, asyncio.Task] = {}  # service_name -> task
-
-         # Global monitoring
-         self.monitoring_enabled = False
-         self.global_monitor_task: Optional[asyncio.Task] = None
-
-         logger.info("Local health checker initialized")
-
-     def register_service(self, service_name: str, service_instance: Any) -> bool:
-         """
-         Register a service for health monitoring.
-
-         Args:
-             service_name: Unique service name
-             service_instance: Service instance with health_check() method
-
-         Returns:
-             Registration success
-         """
-         try:
-             if not hasattr(service_instance, 'health_check'):
-                 logger.error(f"Service {service_name} does not have health_check method")
-                 return False
-
-             self.services[service_name] = service_instance
-             self.metrics[service_name] = HealthMetrics(
-                 service_name=service_name,
-                 status=ServiceStatus.STOPPED,
-                 last_check=datetime.now()
-             )
-
-             logger.info(f"Service registered for health monitoring: {service_name}")
-             return True
-
-         except Exception as e:
-             logger.error(f"Failed to register service {service_name}: {e}")
-             return False
-
-     def unregister_service(self, service_name: str) -> bool:
-         """
-         Unregister a service from health monitoring.
-
-         Args:
-             service_name: Service name to unregister
-
-         Returns:
-             Unregistration success
-         """
-         try:
-             # Stop monitoring task
-             if service_name in self.monitoring_tasks:
-                 self.monitoring_tasks[service_name].cancel()
-                 del self.monitoring_tasks[service_name]
-
-             # Remove from tracking
-             if service_name in self.services:
-                 del self.services[service_name]
-             if service_name in self.metrics:
-                 del self.metrics[service_name]
-
-             logger.info(f"Service unregistered from health monitoring: {service_name}")
-             return True
-
-         except Exception as e:
-             logger.error(f"Failed to unregister service {service_name}: {e}")
-             return False
-
-     async def start_monitoring(self, service_name: Optional[str] = None) -> bool:
-         """
-         Start health monitoring for a specific service or all services.
-
-         Args:
-             service_name: Service to monitor, or None for all services
-
-         Returns:
-             Start success
-         """
-         try:
-             if service_name:
-                 # Start monitoring for specific service
-                 if service_name not in self.services:
-                     logger.error(f"Service {service_name} not registered")
-                     return False
-
-                 if service_name not in self.monitoring_tasks:
-                     task = asyncio.create_task(self._monitor_service(service_name))
-                     self.monitoring_tasks[service_name] = task
-                     logger.info(f"Started monitoring service: {service_name}")
-
-             else:
-                 # Start monitoring for all services
-                 for svc_name in self.services:
-                     if svc_name not in self.monitoring_tasks:
-                         task = asyncio.create_task(self._monitor_service(svc_name))
-                         self.monitoring_tasks[svc_name] = task
-
-                 # Start global monitoring
-                 if not self.monitoring_enabled:
-                     self.monitoring_enabled = True
-                     self.global_monitor_task = asyncio.create_task(self._global_monitor())
-                     logger.info("Started global health monitoring")
-
-             return True
-
-         except Exception as e:
-             logger.error(f"Failed to start monitoring: {e}")
-             return False
-
-     async def stop_monitoring(self, service_name: Optional[str] = None) -> bool:
-         """
-         Stop health monitoring for a specific service or all services.
-
-         Args:
-             service_name: Service to stop monitoring, or None for all services
-
-         Returns:
-             Stop success
-         """
-         try:
-             if service_name:
-                 # Stop monitoring for specific service
-                 if service_name in self.monitoring_tasks:
-                     self.monitoring_tasks[service_name].cancel()
-                     del self.monitoring_tasks[service_name]
-                     logger.info(f"Stopped monitoring service: {service_name}")
-
-             else:
-                 # Stop all monitoring
-                 for task in self.monitoring_tasks.values():
-                     task.cancel()
-                 self.monitoring_tasks.clear()
-
-                 # Stop global monitoring
-                 self.monitoring_enabled = False
-                 if self.global_monitor_task:
-                     self.global_monitor_task.cancel()
-                     self.global_monitor_task = None
-
-                 logger.info("Stopped all health monitoring")
-
-             return True
-
-         except Exception as e:
-             logger.error(f"Failed to stop monitoring: {e}")
-             return False
-
-     async def check_service_health(self, service_name: str) -> Dict[str, Any]:
-         """
-         Perform immediate health check for a service.
-
-         Args:
-             service_name: Service to check
-
-         Returns:
-             Health check result
-         """
-         if service_name not in self.services:
-             return {
-                 "healthy": False,
-                 "error": f"Service {service_name} not registered"
-             }
-
-         try:
-             start_time = time.time()
-             service = self.services[service_name]
-
-             # Perform health check
-             health_result = await service.health_check()
-
-             response_time = (time.time() - start_time) * 1000  # ms
-
-             # Update metrics
-             metrics = self.metrics[service_name]
-             metrics.last_check = datetime.now()
-             metrics.response_time_ms = response_time
-
-             if health_result.get("healthy", False):
-                 metrics.status = ServiceStatus.RUNNING
-                 metrics.consecutive_failures = 0
-
-                 # Update additional metrics if available
-                 if "memory_usage_mb" in health_result:
-                     metrics.memory_usage_mb = health_result["memory_usage_mb"]
-                 if "gpu_utilization" in health_result:
-                     metrics.gpu_utilization = health_result["gpu_utilization"]
-                 if "uptime_seconds" in health_result:
-                     metrics.uptime_seconds = health_result["uptime_seconds"]
-                 if "request_count" in health_result:
-                     metrics.request_count = health_result["request_count"]
-
-             else:
-                 metrics.consecutive_failures += 1
-                 metrics.error_count += 1
-                 metrics.last_error = health_result.get("error", "Unknown error")
-
-                 if metrics.consecutive_failures >= self.failure_threshold:
-                     metrics.status = ServiceStatus.UNHEALTHY
-                 else:
-                     metrics.status = ServiceStatus.ERROR
-
-             return {
-                 **health_result,
-                 "response_time_ms": response_time,
-                 "consecutive_failures": metrics.consecutive_failures,
-                 "service_name": service_name
-             }
-
-         except Exception as e:
-             logger.error(f"Health check failed for {service_name}: {e}")
-
-             # Update metrics on exception
-             metrics = self.metrics[service_name]
-             metrics.last_check = datetime.now()
-             metrics.consecutive_failures += 1
-             metrics.error_count += 1
-             metrics.last_error = str(e)
-             metrics.status = ServiceStatus.ERROR
-
-             return {
-                 "healthy": False,
-                 "error": str(e),
-                 "service_name": service_name,
-                 "consecutive_failures": metrics.consecutive_failures
-             }
-
-     def get_service_metrics(self, service_name: str) -> Optional[HealthMetrics]:
-         """Get metrics for a specific service"""
-         return self.metrics.get(service_name)
-
-     def get_all_metrics(self) -> Dict[str, HealthMetrics]:
-         """Get metrics for all services"""
-         return self.metrics.copy()
-
-     def get_system_health(self) -> Dict[str, Any]:
-         """Get overall system health summary"""
-         total_services = len(self.services)
-         healthy_services = sum(1 for m in self.metrics.values() if m.status == ServiceStatus.RUNNING)
-         unhealthy_services = sum(1 for m in self.metrics.values() if m.status == ServiceStatus.UNHEALTHY)
-         error_services = sum(1 for m in self.metrics.values() if m.status == ServiceStatus.ERROR)
-
-         # Get GPU status
-         self.gpu_manager.refresh()
-         gpu_info = [
-             {
-                 "gpu_id": gpu.gpu_id,
-                 "name": gpu.name,
-                 "memory_used_mb": gpu.memory_used,
-                 "memory_total_mb": gpu.memory_total,
-                 "memory_free_mb": gpu.memory_free,
-                 "utilization_percent": gpu.utilization,
-                 "temperature_c": gpu.temperature
-             }
-             for gpu in self.gpu_manager.gpus
-         ]
-
-         overall_status = "healthy"
-         if unhealthy_services > 0:
-             overall_status = "degraded"
-         elif error_services > 0:
-             overall_status = "warning"
-         elif healthy_services == 0 and total_services > 0:
-             overall_status = "down"
-
-         return {
-             "overall_status": overall_status,
-             "timestamp": datetime.now().isoformat(),
-             "services": {
-                 "total": total_services,
-                 "healthy": healthy_services,
-                 "unhealthy": unhealthy_services,
-                 "error": error_services,
-                 "stopped": total_services - healthy_services - unhealthy_services - error_services
-             },
-             "gpu_info": gpu_info,
-             "monitoring_enabled": self.monitoring_enabled,
-             "check_interval": self.check_interval
-         }
-
-     async def restart_unhealthy_services(self) -> Dict[str, Any]:
-         """Attempt to restart unhealthy services"""
-         restart_results = {}
-
-         for service_name, metrics in self.metrics.items():
-             if metrics.status == ServiceStatus.UNHEALTHY:
-                 try:
-                     logger.info(f"Attempting to restart unhealthy service: {service_name}")
-                     service = self.services[service_name]
-
-                     # Check if service has restart method
-                     if hasattr(service, 'restart'):
-                         result = await service.restart()
-                         restart_results[service_name] = result
-                     elif hasattr(service, 'stop') and hasattr(service, 'start'):
-                         # Manual restart
-                         await service.stop()
-                         await asyncio.sleep(2)
-                         result = await service.start()
-                         restart_results[service_name] = result
-                     else:
-                         restart_results[service_name] = {
-                             "success": False,
-                             "error": "Service does not support restart"
-                         }
-
-                 except Exception as e:
-                     logger.error(f"Failed to restart service {service_name}: {e}")
-                     restart_results[service_name] = {
-                         "success": False,
-                         "error": str(e)
-                     }
-
-         return restart_results
-
-     async def _monitor_service(self, service_name: str):
-         """Background monitoring task for a service"""
-         logger.info(f"Starting background monitoring for service: {service_name}")
-
-         try:
-             while True:
-                 await self.check_service_health(service_name)
-                 await asyncio.sleep(self.check_interval)
-
-         except asyncio.CancelledError:
-             logger.info(f"Monitoring cancelled for service: {service_name}")
-         except Exception as e:
-             logger.error(f"Monitoring error for service {service_name}: {e}")
-
-     async def _global_monitor(self):
-         """Global monitoring task for system-wide health"""
-         logger.info("Starting global health monitoring")
-
-         try:
-             while self.monitoring_enabled:
-                 # Check system resources
-                 self.gpu_manager.refresh()
-
-                 # Log system health periodically
-                 system_health = self.get_system_health()
-                 if system_health["overall_status"] != "healthy":
-                     logger.warning(f"System health: {system_health['overall_status']}")
-
-                 # Auto-restart unhealthy services if configured
-                 unhealthy_count = system_health["services"]["unhealthy"]
-                 if unhealthy_count > 0:
-                     logger.info(f"Found {unhealthy_count} unhealthy services, attempting restart...")
-                     await self.restart_unhealthy_services()
-
-                 await asyncio.sleep(self.check_interval * 2)  # Less frequent than individual checks
-
-         except asyncio.CancelledError:
-             logger.info("Global monitoring cancelled")
-         except Exception as e:
-             logger.error(f"Global monitoring error: {e}")
-
-
- # Global health checker instance
- _health_checker = None
-
- def get_health_checker() -> LocalHealthChecker:
-     """Get global health checker instance"""
-     global _health_checker
-     if _health_checker is None:
-         _health_checker = LocalHealthChecker()
-     return _health_checker
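
For context, the removed module exposes a process-wide LocalHealthChecker obtained via get_health_checker(). The sketch below shows how a caller would have exercised that API, based only on the signatures visible in the diff above; the import path and the DummyService class are illustrative assumptions, not part of isa-model.

import asyncio

# Assumed import path -- the diff does not show where the removed module lived inside isa_model.
from isa_model.deployment.local.health import get_health_checker


class DummyService:
    """Illustrative stand-in exposing the async health_check() contract the checker expects."""

    async def health_check(self):
        # check_service_health() reads "healthy" plus optional metric keys such as these.
        return {"healthy": True, "memory_usage_mb": 512.0, "uptime_seconds": 42.0}


async def main():
    checker = get_health_checker()  # lazily created global instance
    checker.register_service("llm-worker", DummyService())

    # One-off check; the result echoes health_check() plus response_time_ms,
    # consecutive_failures and service_name.
    result = await checker.check_service_health("llm-worker")
    print(result["healthy"], result["response_time_ms"])

    # Background monitoring: per-service loops plus a global monitor that
    # auto-restarts services marked UNHEALTHY after failure_threshold misses.
    await checker.start_monitoring()
    print(checker.get_system_health()["overall_status"])
    await checker.stop_monitoring()


asyncio.run(main())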