isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/client.py +1166 -584
- isa_model/core/cache/redis_cache.py +410 -0
- isa_model/core/config/config_manager.py +282 -12
- isa_model/core/config.py +91 -1
- isa_model/core/database/__init__.py +1 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +297 -0
- isa_model/core/database/supabase_client.py +258 -0
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +46 -0
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_billing_tracker.py +60 -88
- isa_model/core/models/model_manager.py +66 -25
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +217 -55
- isa_model/core/models/model_statistics_tracker.py +234 -0
- isa_model/core/models/model_storage.py +0 -1
- isa_model/core/models/model_version_manager.py +959 -0
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/pricing_manager.py +2 -249
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/resilience/circuit_breaker.py +366 -0
- isa_model/core/security/secrets.py +358 -0
- isa_model/core/services/__init__.py +2 -4
- isa_model/core/services/intelligent_model_selector.py +479 -370
- isa_model/core/storage/hf_storage.py +2 -2
- isa_model/core/types.py +8 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -368
- isa_model/deployment/local/__init__.py +31 -0
- isa_model/deployment/local/config.py +248 -0
- isa_model/deployment/local/gpu_gateway.py +607 -0
- isa_model/deployment/local/health_checker.py +428 -0
- isa_model/deployment/local/provider.py +586 -0
- isa_model/deployment/local/tensorrt_service.py +621 -0
- isa_model/deployment/local/transformers_service.py +644 -0
- isa_model/deployment/local/vllm_service.py +527 -0
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/modal/deployer.py +894 -0
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
- isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
- isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
- isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
- isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
- isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
- isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +179 -16
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/__init__.py +21 -0
- isa_model/inference/services/audio/base_realtime_service.py +225 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/isa_tts_service.py +0 -0
- isa_model/inference/services/audio/openai_realtime_service.py +320 -124
- isa_model/inference/services/audio/openai_stt_service.py +53 -11
- isa_model/inference/services/base_service.py +17 -1
- isa_model/inference/services/custom_model_manager.py +277 -0
- isa_model/inference/services/embedding/__init__.py +13 -0
- isa_model/inference/services/embedding/base_embed_service.py +111 -8
- isa_model/inference/services/embedding/isa_embed_service.py +305 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/openai_embed_service.py +2 -4
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
- isa_model/inference/services/img/__init__.py +2 -2
- isa_model/inference/services/img/base_image_gen_service.py +24 -7
- isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
- isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
- isa_model/inference/services/img/services/replicate_flux.py +226 -0
- isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
- isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
- isa_model/inference/services/img/tests/test_img_client.py +297 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +361 -26
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/local_llm_service.py +747 -0
- isa_model/inference/services/llm/ollama_llm_service.py +11 -3
- isa_model/inference/services/llm/openai_llm_service.py +670 -56
- isa_model/inference/services/llm/yyds_llm_service.py +10 -3
- isa_model/inference/services/vision/__init__.py +27 -6
- isa_model/inference/services/vision/base_vision_service.py +118 -185
- isa_model/inference/services/vision/blip_vision_service.py +359 -0
- isa_model/inference/services/vision/helpers/image_utils.py +19 -10
- isa_model/inference/services/vision/isa_vision_service.py +634 -0
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +240 -18
- isa_model/serving/api/middleware/auth.py +317 -0
- isa_model/serving/api/middleware/security.py +268 -0
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +489 -0
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +475 -0
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/logs.py +430 -0
- isa_model/serving/api/routes/settings.py +582 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +992 -171
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +318 -0
- isa_model/serving/modal_proxy_server.py +249 -0
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
- isa_model-0.4.3.dist-info/RECORD +193 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
- isa_model/deployment/cloud/modal/register_models.py +0 -321
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks.py +0 -469
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -18
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/factory.py +0 -531
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/metrics.py +0 -798
- isa_model/inference/adapter/unified_api.py +0 -248
- isa_model/inference/services/helpers/stacked_config.py +0 -148
- isa_model/inference/services/img/flux_professional_service.py +0 -603
- isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/others/table_transformer_service.py +0 -61
- isa_model/inference/services/vision/doc_analysis_service.py +0 -640
- isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/vision/ui_analysis_service.py +0 -823
- isa_model/scripts/inference_tracker.py +0 -283
- isa_model/scripts/mlflow_manager.py +0 -379
- isa_model/scripts/model_registry.py +0 -465
- isa_model/scripts/register_models.py +0 -370
- isa_model/scripts/register_models_with_embeddings.py +0 -510
- isa_model/scripts/start_mlflow.py +0 -95
- isa_model/scripts/training_tracker.py +0 -257
- isa_model/training/__init__.py +0 -74
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -23
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/factory.py +0 -424
- isa_model-0.3.91.dist-info/RECORD +0 -138
- /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,428 @@
|
|
1
|
+
"""
|
2
|
+
Local service health monitoring and management
|
3
|
+
|
4
|
+
Provides health checking, monitoring, and management for local GPU services.
|
5
|
+
"""
|
6
|
+
|
7
|
+
import asyncio
|
8
|
+
import logging
|
9
|
+
import time
|
10
|
+
from typing import Dict, List, Optional, Any, Union
|
11
|
+
from datetime import datetime, timedelta
|
12
|
+
from dataclasses import dataclass, field
|
13
|
+
from enum import Enum
|
14
|
+
|
15
|
+
from ...utils.gpu_utils import get_gpu_manager
|
16
|
+
|
17
|
+
logger = logging.getLogger(__name__)
|
18
|
+
|
19
|
+
|
20
|
+
class ServiceStatus(Enum):
    """Lifecycle / health states a monitored local service can be in.

    Each member's value is the lowercase string form of its name.
    """

    # Not running; this is the state a service gets when it is first registered.
    STOPPED = "stopped"
    # Start-up in progress.
    STARTING = "starting"
    # Last health check reported the service as healthy.
    RUNNING = "running"
    # A health check failed.
    ERROR = "error"
    # Consecutive failed checks reached the checker's failure threshold.
    UNHEALTHY = "unhealthy"
    # Shutdown in progress.
    STOPPING = "stopping"
|
28
|
+
|
29
|
+
|
30
|
+
@dataclass
class HealthMetrics:
    """Mutable record of the most recent health observations for one service.

    Only ``service_name``, ``status`` and ``last_check`` are required; every
    other field starts at a neutral default and is filled in as health checks
    report data.
    """

    # --- identity and current state (required) ---
    service_name: str
    status: ServiceStatus
    last_check: datetime  # when the last health check was recorded

    # --- timing and failure accounting ---
    response_time_ms: Optional[float] = None  # duration of the last check, in ms
    error_count: int = 0  # total failed checks
    consecutive_failures: int = 0  # failed checks since the last healthy one
    uptime_seconds: Optional[float] = None

    # --- resource usage and load, as reported by the service ---
    memory_usage_mb: Optional[float] = None
    gpu_utilization: Optional[float] = None
    request_count: int = 0

    # --- diagnostics ---
    last_error: Optional[str] = None  # message from the most recent failure
    metadata: Dict[str, Any] = field(default_factory=dict)  # free-form extras
|
45
|
+
|
46
|
+
|
47
|
+
class LocalHealthChecker:
    """Health checker for local GPU services.

    Keeps a registry of service instances and their ``HealthMetrics``, runs
    one background polling task per monitored service, and can run a global
    task that logs non-healthy system status and attempts to restart services
    that have been marked ``UNHEALTHY``.
    """

    def __init__(self, check_interval: int = 30, failure_threshold: int = 3):
        """
        Initialize health checker.

        Args:
            check_interval: Health check interval in seconds
            failure_threshold: Number of consecutive failures before marking unhealthy
        """
        self.check_interval = check_interval
        self.failure_threshold = failure_threshold
        self.gpu_manager = get_gpu_manager()

        # Service tracking
        self.services: Dict[str, Any] = {}  # service_name -> service instance
        self.metrics: Dict[str, HealthMetrics] = {}  # service_name -> metrics
        self.monitoring_tasks: Dict[str, asyncio.Task] = {}  # service_name -> task

        # Global monitoring
        self.monitoring_enabled = False
        self.global_monitor_task: Optional[asyncio.Task] = None

        logger.info("Local health checker initialized")

    def register_service(self, service_name: str, service_instance: Any) -> bool:
        """
        Register a service for health monitoring.

        Registering a name that already exists replaces the stored instance
        and resets its metrics to a fresh ``STOPPED`` record.

        Args:
            service_name: Unique service name
            service_instance: Service instance with health_check() method

        Returns:
            Registration success
        """
        try:
            if not hasattr(service_instance, 'health_check'):
                logger.error(f"Service {service_name} does not have health_check method")
                return False

            self.services[service_name] = service_instance
            self.metrics[service_name] = HealthMetrics(
                service_name=service_name,
                status=ServiceStatus.STOPPED,
                last_check=datetime.now(),
            )

            logger.info(f"Service registered for health monitoring: {service_name}")
            return True

        except Exception as e:
            logger.error(f"Failed to register service {service_name}: {e}")
            return False

    def unregister_service(self, service_name: str) -> bool:
        """
        Unregister a service from health monitoring.

        Cancels the service's monitoring task (if any) and drops its instance
        and metrics. Unknown names are accepted and still report success.

        Args:
            service_name: Service name to unregister

        Returns:
            Unregistration success
        """
        try:
            # Stop monitoring task
            task = self.monitoring_tasks.pop(service_name, None)
            if task is not None:
                task.cancel()

            # Remove from tracking
            self.services.pop(service_name, None)
            self.metrics.pop(service_name, None)

            logger.info(f"Service unregistered from health monitoring: {service_name}")
            return True

        except Exception as e:
            logger.error(f"Failed to unregister service {service_name}: {e}")
            return False

    def _ensure_monitor_task(self, service_name: str) -> bool:
        """Create the polling task for a service unless one is still running.

        A task that has already finished (for example because the monitor loop
        hit an unexpected error) is replaced, so monitoring can be resumed.

        Returns:
            True if a new task was created, False if one was already running
        """
        existing = self.monitoring_tasks.get(service_name)
        if existing is not None and not existing.done():
            return False

        self.monitoring_tasks[service_name] = asyncio.create_task(
            self._monitor_service(service_name)
        )
        return True

    async def start_monitoring(self, service_name: Optional[str] = None) -> bool:
        """
        Start health monitoring for a specific service or all services.

        When no name is given, a polling task is started for every registered
        service and the global monitor task is started as well.

        Args:
            service_name: Service to monitor, or None for all services

        Returns:
            Start success
        """
        try:
            if service_name:
                # Start monitoring for specific service
                if service_name not in self.services:
                    logger.error(f"Service {service_name} not registered")
                    return False

                if self._ensure_monitor_task(service_name):
                    logger.info(f"Started monitoring service: {service_name}")

            else:
                # Start monitoring for all services
                for svc_name in self.services:
                    self._ensure_monitor_task(svc_name)

                # Start global monitoring; also replace a global task that
                # has already exited so it does not stay dead silently.
                global_task = self.global_monitor_task
                global_task_dead = global_task is not None and global_task.done()
                if not self.monitoring_enabled or global_task_dead:
                    self.monitoring_enabled = True
                    self.global_monitor_task = asyncio.create_task(self._global_monitor())
                    logger.info("Started global health monitoring")

            return True

        except Exception as e:
            logger.error(f"Failed to start monitoring: {e}")
            return False

    async def stop_monitoring(self, service_name: Optional[str] = None) -> bool:
        """
        Stop health monitoring for a specific service or all services.

        Tasks are cancelled but not awaited. When no name is given, the global
        monitor task is cancelled too.

        Args:
            service_name: Service to stop monitoring, or None for all services

        Returns:
            Stop success
        """
        try:
            if service_name:
                # Stop monitoring for specific service
                task = self.monitoring_tasks.pop(service_name, None)
                if task is not None:
                    task.cancel()
                    logger.info(f"Stopped monitoring service: {service_name}")

            else:
                # Stop all monitoring
                for task in self.monitoring_tasks.values():
                    task.cancel()
                self.monitoring_tasks.clear()

                # Stop global monitoring
                self.monitoring_enabled = False
                if self.global_monitor_task:
                    self.global_monitor_task.cancel()
                    self.global_monitor_task = None

                logger.info("Stopped all health monitoring")

            return True

        except Exception as e:
            logger.error(f"Failed to stop monitoring: {e}")
            return False

    def _record_failure(self, metrics: HealthMetrics, error: Optional[str]) -> None:
        """Account for one failed health check and derive the resulting status.

        Used for both "service reported unhealthy" and "health_check() raised",
        so the failure threshold is applied the same way in either case.
        """
        metrics.consecutive_failures += 1
        metrics.error_count += 1
        metrics.last_error = error

        if metrics.consecutive_failures >= self.failure_threshold:
            metrics.status = ServiceStatus.UNHEALTHY
        else:
            metrics.status = ServiceStatus.ERROR

    async def check_service_health(self, service_name: str) -> Dict[str, Any]:
        """
        Perform immediate health check for a service.

        Awaits the service's ``health_check()`` and updates the stored metrics.
        A check that raises is counted as a failure exactly like a check that
        reports ``healthy`` as false, including promotion to ``UNHEALTHY``
        once ``failure_threshold`` consecutive failures are reached.

        Args:
            service_name: Service to check

        Returns:
            Health check result
        """
        if service_name not in self.services:
            return {
                "healthy": False,
                "error": f"Service {service_name} not registered"
            }

        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by wall-clock adjustments.
        started = time.perf_counter()

        try:
            service = self.services[service_name]

            # Perform health check
            health_result = await service.health_check()

            response_time = (time.perf_counter() - started) * 1000  # ms

            # The service may have been unregistered while we were awaiting.
            metrics = self.metrics.get(service_name)
            if metrics is None:
                return {
                    "healthy": False,
                    "error": f"Service {service_name} not registered"
                }

            # Update metrics
            metrics.last_check = datetime.now()
            metrics.response_time_ms = response_time

            if health_result.get("healthy", False):
                metrics.status = ServiceStatus.RUNNING
                metrics.consecutive_failures = 0

                # Update additional metrics if available
                for key in ("memory_usage_mb", "gpu_utilization",
                            "uptime_seconds", "request_count"):
                    if key in health_result:
                        setattr(metrics, key, health_result[key])

            else:
                self._record_failure(metrics, health_result.get("error", "Unknown error"))

            return {
                **health_result,
                "response_time_ms": response_time,
                "consecutive_failures": metrics.consecutive_failures,
                "service_name": service_name
            }

        except Exception as e:
            logger.error(f"Health check failed for {service_name}: {e}")

            # Update metrics on exception (if the service is still tracked)
            consecutive_failures = 0
            metrics = self.metrics.get(service_name)
            if metrics is not None:
                metrics.last_check = datetime.now()
                self._record_failure(metrics, str(e))
                consecutive_failures = metrics.consecutive_failures

            return {
                "healthy": False,
                "error": str(e),
                "service_name": service_name,
                "consecutive_failures": consecutive_failures
            }

    def get_service_metrics(self, service_name: str) -> Optional[HealthMetrics]:
        """Get metrics for a specific service, or None if it is not tracked"""
        return self.metrics.get(service_name)

    def get_all_metrics(self) -> Dict[str, HealthMetrics]:
        """Get metrics for all services (shallow copy of the internal mapping)"""
        return self.metrics.copy()

    def get_system_health(self) -> Dict[str, Any]:
        """Get overall system health summary.

        Refreshes the GPU manager, counts services by status and derives an
        overall status: "degraded" if any service is unhealthy, otherwise
        "warning" if any is in error, otherwise "down" if services exist but
        none is running, otherwise "healthy".
        """
        total_services = len(self.services)
        statuses = [m.status for m in self.metrics.values()]
        healthy_services = statuses.count(ServiceStatus.RUNNING)
        unhealthy_services = statuses.count(ServiceStatus.UNHEALTHY)
        error_services = statuses.count(ServiceStatus.ERROR)

        # Get GPU status
        self.gpu_manager.refresh()
        gpu_info = [
            {
                "gpu_id": gpu.gpu_id,
                "name": gpu.name,
                "memory_used_mb": gpu.memory_used,
                "memory_total_mb": gpu.memory_total,
                "memory_free_mb": gpu.memory_free,
                "utilization_percent": gpu.utilization,
                "temperature_c": gpu.temperature
            }
            for gpu in self.gpu_manager.gpus
        ]

        overall_status = "healthy"
        if unhealthy_services > 0:
            overall_status = "degraded"
        elif error_services > 0:
            overall_status = "warning"
        elif healthy_services == 0 and total_services > 0:
            overall_status = "down"

        return {
            "overall_status": overall_status,
            "timestamp": datetime.now().isoformat(),
            "services": {
                "total": total_services,
                "healthy": healthy_services,
                "unhealthy": unhealthy_services,
                "error": error_services,
                # Everything that is not running/unhealthy/error is reported here.
                "stopped": total_services - healthy_services - unhealthy_services - error_services
            },
            "gpu_info": gpu_info,
            "monitoring_enabled": self.monitoring_enabled,
            "check_interval": self.check_interval
        }

    async def restart_unhealthy_services(self) -> Dict[str, Any]:
        """Attempt to restart unhealthy services.

        Uses the service's ``restart()`` when present, otherwise ``stop()``
        followed by ``start()`` after a short pause.

        Returns:
            Mapping of service name to the restart result, or to a
            ``{"success": False, "error": ...}`` dict when restart failed or
            is not supported
        """
        restart_results: Dict[str, Any] = {}

        # Iterate over a snapshot: the awaits below yield to the event loop,
        # and a concurrent register/unregister would otherwise raise
        # "dictionary changed size during iteration".
        for service_name, metrics in list(self.metrics.items()):
            if metrics.status != ServiceStatus.UNHEALTHY:
                continue

            service = self.services.get(service_name)
            if service is None:
                # Unregistered while earlier restarts were in progress.
                continue

            try:
                logger.info(f"Attempting to restart unhealthy service: {service_name}")

                # Check if service has restart method
                if hasattr(service, 'restart'):
                    restart_results[service_name] = await service.restart()
                elif hasattr(service, 'stop') and hasattr(service, 'start'):
                    # Manual restart
                    await service.stop()
                    await asyncio.sleep(2)
                    restart_results[service_name] = await service.start()
                else:
                    restart_results[service_name] = {
                        "success": False,
                        "error": "Service does not support restart"
                    }

            except Exception as e:
                logger.error(f"Failed to restart service {service_name}: {e}")
                restart_results[service_name] = {
                    "success": False,
                    "error": str(e)
                }

        return restart_results

    async def _monitor_service(self, service_name: str):
        """Background monitoring task for a service.

        Checks the service, sleeps ``check_interval`` seconds, and repeats
        until the task is cancelled or an unexpected error ends the loop.
        """
        logger.info(f"Starting background monitoring for service: {service_name}")

        try:
            while True:
                await self.check_service_health(service_name)
                await asyncio.sleep(self.check_interval)

        except asyncio.CancelledError:
            logger.info(f"Monitoring cancelled for service: {service_name}")
        except Exception as e:
            logger.error(f"Monitoring error for service {service_name}: {e}")

    async def _global_monitor(self):
        """Global monitoring task for system-wide health.

        While monitoring is enabled: logs a warning when the overall status is
        not "healthy", attempts to restart unhealthy services, then sleeps for
        twice ``check_interval``.
        """
        logger.info("Starting global health monitoring")

        try:
            while self.monitoring_enabled:
                # get_system_health() refreshes the GPU manager itself, so no
                # separate refresh is needed before calling it.
                system_health = self.get_system_health()
                if system_health["overall_status"] != "healthy":
                    logger.warning(f"System health: {system_health['overall_status']}")

                # Auto-restart unhealthy services if configured
                unhealthy_count = system_health["services"]["unhealthy"]
                if unhealthy_count > 0:
                    logger.info(f"Found {unhealthy_count} unhealthy services, attempting restart...")
                    await self.restart_unhealthy_services()

                await asyncio.sleep(self.check_interval * 2)  # Less frequent than individual checks

        except asyncio.CancelledError:
            logger.info("Global monitoring cancelled")
        except Exception as e:
            logger.error(f"Global monitoring error: {e}")
|
418
|
+
|
419
|
+
|
420
|
+
# Process-wide health checker, created lazily on first access
_health_checker = None


def get_health_checker() -> LocalHealthChecker:
    """Return the shared LocalHealthChecker, constructing it on first use.

    The instance is built with the class's default arguments and cached in
    the module-level ``_health_checker`` so later calls return the same object.
    """
    global _health_checker
    checker = _health_checker
    if checker is None:
        checker = LocalHealthChecker()
        _health_checker = checker
    return checker
|