PyPI - isa-model - Versions diffs - 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl - Mend

isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (228) hide show

isa_model/client.py +1166 -584
isa_model/core/cache/redis_cache.py +410 -0
isa_model/core/config/config_manager.py +282 -12
isa_model/core/config.py +91 -1
isa_model/core/database/__init__.py +1 -0
isa_model/core/database/direct_db_client.py +114 -0
isa_model/core/database/migration_manager.py +563 -0
isa_model/core/database/migrations.py +297 -0
isa_model/core/database/supabase_client.py +258 -0
isa_model/core/dependencies.py +316 -0
isa_model/core/discovery/__init__.py +19 -0
isa_model/core/discovery/consul_discovery.py +190 -0
isa_model/core/logging/__init__.py +54 -0
isa_model/core/logging/influx_logger.py +523 -0
isa_model/core/logging/loki_logger.py +160 -0
isa_model/core/models/__init__.py +46 -0
isa_model/core/models/config_models.py +625 -0
isa_model/core/models/deployment_billing_tracker.py +430 -0
isa_model/core/models/model_billing_tracker.py +60 -88
isa_model/core/models/model_manager.py +66 -25
isa_model/core/models/model_metadata.py +690 -0
isa_model/core/models/model_repo.py +217 -55
isa_model/core/models/model_statistics_tracker.py +234 -0
isa_model/core/models/model_storage.py +0 -1
isa_model/core/models/model_version_manager.py +959 -0
isa_model/core/models/system_models.py +857 -0
isa_model/core/pricing_manager.py +2 -249
isa_model/core/repositories/__init__.py +9 -0
isa_model/core/repositories/config_repository.py +912 -0
isa_model/core/resilience/circuit_breaker.py +366 -0
isa_model/core/security/secrets.py +358 -0
isa_model/core/services/__init__.py +2 -4
isa_model/core/services/intelligent_model_selector.py +479 -370
isa_model/core/storage/hf_storage.py +2 -2
isa_model/core/types.py +8 -0
isa_model/deployment/__init__.py +5 -48
isa_model/deployment/core/__init__.py +2 -31
isa_model/deployment/core/deployment_manager.py +1278 -368
isa_model/deployment/local/__init__.py +31 -0
isa_model/deployment/local/config.py +248 -0
isa_model/deployment/local/gpu_gateway.py +607 -0
isa_model/deployment/local/health_checker.py +428 -0
isa_model/deployment/local/provider.py +586 -0
isa_model/deployment/local/tensorrt_service.py +621 -0
isa_model/deployment/local/transformers_service.py +644 -0
isa_model/deployment/local/vllm_service.py +527 -0
isa_model/deployment/modal/__init__.py +8 -0
isa_model/deployment/modal/config.py +136 -0
isa_model/deployment/modal/deployer.py +894 -0
isa_model/deployment/modal/services/__init__.py +3 -0
isa_model/deployment/modal/services/audio/__init__.py +1 -0
isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
isa_model/deployment/modal/services/embedding/__init__.py +1 -0
isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
isa_model/deployment/modal/services/llm/__init__.py +1 -0
isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
isa_model/deployment/modal/services/video/__init__.py +1 -0
isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
isa_model/deployment/modal/services/vision/__init__.py +1 -0
isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
isa_model/deployment/storage/__init__.py +5 -0
isa_model/deployment/storage/deployment_repository.py +824 -0
isa_model/deployment/triton/__init__.py +10 -0
isa_model/deployment/triton/config.py +196 -0
isa_model/deployment/triton/configs/__init__.py +1 -0
isa_model/deployment/triton/provider.py +512 -0
isa_model/deployment/triton/scripts/__init__.py +1 -0
isa_model/deployment/triton/templates/__init__.py +1 -0
isa_model/inference/__init__.py +47 -1
isa_model/inference/ai_factory.py +179 -16
isa_model/inference/legacy_services/__init__.py +21 -0
isa_model/inference/legacy_services/model_evaluation.py +637 -0
isa_model/inference/legacy_services/model_service.py +573 -0
isa_model/inference/legacy_services/model_serving.py +717 -0
isa_model/inference/legacy_services/model_training.py +561 -0
isa_model/inference/models/__init__.py +21 -0
isa_model/inference/models/inference_config.py +551 -0
isa_model/inference/models/inference_record.py +675 -0
isa_model/inference/models/performance_models.py +714 -0
isa_model/inference/repositories/__init__.py +9 -0
isa_model/inference/repositories/inference_repository.py +828 -0
isa_model/inference/services/audio/__init__.py +21 -0
isa_model/inference/services/audio/base_realtime_service.py +225 -0
isa_model/inference/services/audio/base_stt_service.py +184 -11
isa_model/inference/services/audio/isa_tts_service.py +0 -0
isa_model/inference/services/audio/openai_realtime_service.py +320 -124
isa_model/inference/services/audio/openai_stt_service.py +53 -11
isa_model/inference/services/base_service.py +17 -1
isa_model/inference/services/custom_model_manager.py +277 -0
isa_model/inference/services/embedding/__init__.py +13 -0
isa_model/inference/services/embedding/base_embed_service.py +111 -8
isa_model/inference/services/embedding/isa_embed_service.py +305 -0
isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
isa_model/inference/services/embedding/openai_embed_service.py +2 -4
isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
isa_model/inference/services/img/__init__.py +2 -2
isa_model/inference/services/img/base_image_gen_service.py +24 -7
isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
isa_model/inference/services/img/services/replicate_flux.py +226 -0
isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
isa_model/inference/services/img/tests/test_img_client.py +297 -0
isa_model/inference/services/llm/__init__.py +10 -2
isa_model/inference/services/llm/base_llm_service.py +361 -26
isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
isa_model/inference/services/llm/local_llm_service.py +747 -0
isa_model/inference/services/llm/ollama_llm_service.py +11 -3
isa_model/inference/services/llm/openai_llm_service.py +670 -56
isa_model/inference/services/llm/yyds_llm_service.py +10 -3
isa_model/inference/services/vision/__init__.py +27 -6
isa_model/inference/services/vision/base_vision_service.py +118 -185
isa_model/inference/services/vision/blip_vision_service.py +359 -0
isa_model/inference/services/vision/helpers/image_utils.py +19 -10
isa_model/inference/services/vision/isa_vision_service.py +634 -0
isa_model/inference/services/vision/openai_vision_service.py +19 -10
isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
isa_model/serving/api/cache_manager.py +245 -0
isa_model/serving/api/dependencies/__init__.py +1 -0
isa_model/serving/api/dependencies/auth.py +194 -0
isa_model/serving/api/dependencies/database.py +139 -0
isa_model/serving/api/error_handlers.py +284 -0
isa_model/serving/api/fastapi_server.py +240 -18
isa_model/serving/api/middleware/auth.py +317 -0
isa_model/serving/api/middleware/security.py +268 -0
isa_model/serving/api/middleware/tenant_context.py +414 -0
isa_model/serving/api/routes/analytics.py +489 -0
isa_model/serving/api/routes/config.py +645 -0
isa_model/serving/api/routes/deployment_billing.py +315 -0
isa_model/serving/api/routes/deployments.py +475 -0
isa_model/serving/api/routes/gpu_gateway.py +440 -0
isa_model/serving/api/routes/health.py +32 -12
isa_model/serving/api/routes/inference_monitoring.py +486 -0
isa_model/serving/api/routes/local_deployments.py +448 -0
isa_model/serving/api/routes/logs.py +430 -0
isa_model/serving/api/routes/settings.py +582 -0
isa_model/serving/api/routes/tenants.py +575 -0
isa_model/serving/api/routes/unified.py +992 -171
isa_model/serving/api/routes/webhooks.py +479 -0
isa_model/serving/api/startup.py +318 -0
isa_model/serving/modal_proxy_server.py +249 -0
isa_model/utils/gpu_utils.py +311 -0
{isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
isa_model-0.4.3.dist-info/RECORD +193 -0
isa_model/deployment/cloud/__init__.py +0 -9
isa_model/deployment/cloud/modal/__init__.py +0 -10
isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
isa_model/deployment/cloud/modal/register_models.py +0 -321
isa_model/deployment/core/deployment_config.py +0 -356
isa_model/deployment/core/isa_deployment_service.py +0 -401
isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
isa_model/deployment/runtime/deployed_service.py +0 -338
isa_model/deployment/services/__init__.py +0 -9
isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
isa_model/deployment/services/model_service.py +0 -332
isa_model/deployment/services/service_monitor.py +0 -356
isa_model/deployment/services/service_registry.py +0 -527
isa_model/eval/__init__.py +0 -92
isa_model/eval/benchmarks.py +0 -469
isa_model/eval/config/__init__.py +0 -10
isa_model/eval/config/evaluation_config.py +0 -108
isa_model/eval/evaluators/__init__.py +0 -18
isa_model/eval/evaluators/base_evaluator.py +0 -503
isa_model/eval/evaluators/llm_evaluator.py +0 -472
isa_model/eval/factory.py +0 -531
isa_model/eval/infrastructure/__init__.py +0 -24
isa_model/eval/infrastructure/experiment_tracker.py +0 -466
isa_model/eval/metrics.py +0 -798
isa_model/inference/adapter/unified_api.py +0 -248
isa_model/inference/services/helpers/stacked_config.py +0 -148
isa_model/inference/services/img/flux_professional_service.py +0 -603
isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
isa_model/inference/services/others/table_transformer_service.py +0 -61
isa_model/inference/services/vision/doc_analysis_service.py +0 -640
isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
isa_model/inference/services/vision/ui_analysis_service.py +0 -823
isa_model/scripts/inference_tracker.py +0 -283
isa_model/scripts/mlflow_manager.py +0 -379
isa_model/scripts/model_registry.py +0 -465
isa_model/scripts/register_models.py +0 -370
isa_model/scripts/register_models_with_embeddings.py +0 -510
isa_model/scripts/start_mlflow.py +0 -95
isa_model/scripts/training_tracker.py +0 -257
isa_model/training/__init__.py +0 -74
isa_model/training/annotation/annotation_schema.py +0 -47
isa_model/training/annotation/processors/annotation_processor.py +0 -126
isa_model/training/annotation/storage/dataset_manager.py +0 -131
isa_model/training/annotation/storage/dataset_schema.py +0 -44
isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
isa_model/training/annotation/tests/test_minio copy.py +0 -113
isa_model/training/annotation/tests/test_minio_upload.py +0 -43
isa_model/training/annotation/views/annotation_controller.py +0 -158
isa_model/training/cloud/__init__.py +0 -22
isa_model/training/cloud/job_orchestrator.py +0 -402
isa_model/training/cloud/runpod_trainer.py +0 -454
isa_model/training/cloud/storage_manager.py +0 -482
isa_model/training/core/__init__.py +0 -23
isa_model/training/core/config.py +0 -181
isa_model/training/core/dataset.py +0 -222
isa_model/training/core/trainer.py +0 -720
isa_model/training/core/utils.py +0 -213
isa_model/training/factory.py +0 -424
isa_model-0.3.91.dist-info/RECORD +0 -138
/isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
/isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
{isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
{isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0

isa_model/deployment/local/health_checker.py ADDED Viewed

@@ -0,0 +1,428 @@
+"""
+Local service health monitoring and management
+Provides health checking, monitoring, and management for local GPU services.
+"""
+import asyncio
+import logging
+import time
+from typing import Dict, List, Optional, Any, Union
+from datetime import datetime, timedelta
+from dataclasses import dataclass, field
+from enum import Enum
+from ...utils.gpu_utils import get_gpu_manager
+logger = logging.getLogger(__name__)
+class ServiceStatus(Enum):
+    """
+    Lifecycle and health states recorded for a monitored service.
+    Each member's value is its lowercase string name.
+    """
+    # Initial state given to a service's metrics when it is registered
+    STOPPED = "stopped"
+    STARTING = "starting"
+    # Assigned after a health check reports healthy
+    RUNNING = "running"
+    # Assigned after a failed health check
+    ERROR = "error"
+    # Assigned once consecutive failures reach the checker's failure_threshold
+    UNHEALTHY = "unhealthy"
+    STOPPING = "stopping"
+@dataclass
+class HealthMetrics:
+    """
+    Health metrics for a service.
+    One record is created per registered service and updated in place by
+    LocalHealthChecker.check_service_health().
+    """
+    service_name: str
+    status: ServiceStatus
+    # Time of the most recent health check (naive local time from datetime.now())
+    last_check: datetime
+    # Duration of the most recent completed health_check() call, in milliseconds
+    response_time_ms: Optional[float] = None
+    # Running total of failed checks
+    error_count: int = 0
+    # Failed checks since the last healthy one; set back to 0 on a healthy check
+    consecutive_failures: int = 0
+    # The next four fields are copied from the health_check() result when it
+    # contains a key of the same name
+    uptime_seconds: Optional[float] = None
+    memory_usage_mb: Optional[float] = None
+    gpu_utilization: Optional[float] = None
+    request_count: int = 0
+    # Error text recorded by the most recent failed check
+    last_error: Optional[str] = None
+    # Free-form extras; not read or written by the checker in this module
+    metadata: Dict[str, Any] = field(default_factory=dict)
+class LocalHealthChecker:
+    """Health checker for local GPU services"""
+    def __init__(self, check_interval: int = 30, failure_threshold: int = 3):
+        """
+        Initialize health checker.
+        Args:
+            check_interval: Health check interval in seconds
+            failure_threshold: Number of consecutive failures before marking unhealthy
+        """
+        self.check_interval = check_interval
+        self.failure_threshold = failure_threshold
+        self.gpu_manager = get_gpu_manager()
+        # Service tracking
+        self.services: Dict[str, Any] = {}  # service_name -> service instance
+        self.metrics: Dict[str, HealthMetrics] = {}  # service_name -> metrics
+        self.monitoring_tasks: Dict[str, asyncio.Task] = {}  # service_name -> task
+        # Global monitoring
+        self.monitoring_enabled = False
+        self.global_monitor_task: Optional[asyncio.Task] = None
+        logger.info("Local health checker initialized")
+    def register_service(self, service_name: str, service_instance: Any) -> bool:
+        """
+        Register a service for health monitoring.
+        Args:
+            service_name: Unique service name
+            service_instance: Service instance with health_check() method
+        Returns:
+            Registration success
+        """
+        try:
+            if not hasattr(service_instance, 'health_check'):
+                logger.error(f"Service {service_name} does not have health_check method")
+                return False
+            self.services[service_name] = service_instance
+            self.metrics[service_name] = HealthMetrics(
+                service_name=service_name,
+                status=ServiceStatus.STOPPED,
+                last_check=datetime.now()
+            )
+            logger.info(f"Service registered for health monitoring: {service_name}")
+            return True
+        except Exception as e:
+            logger.error(f"Failed to register service {service_name}: {e}")
+            return False
+    def unregister_service(self, service_name: str) -> bool:
+        """
+        Unregister a service from health monitoring.
+        Args:
+            service_name: Service name to unregister
+        Returns:
+            Unregistration success
+        """
+        try:
+            # Stop monitoring task
+            if service_name in self.monitoring_tasks:
+                self.monitoring_tasks[service_name].cancel()
+                del self.monitoring_tasks[service_name]
+            # Remove from tracking
+            if service_name in self.services:
+                del self.services[service_name]
+            if service_name in self.metrics:
+                del self.metrics[service_name]
+            logger.info(f"Service unregistered from health monitoring: {service_name}")
+            return True
+        except Exception as e:
+            logger.error(f"Failed to unregister service {service_name}: {e}")
+            return False
+    async def start_monitoring(self, service_name: Optional[str] = None) -> bool:
+        """
+        Start health monitoring for a specific service or all services.
+        Args:
+            service_name: Service to monitor, or None for all services
+        Returns:
+            Start success
+        """
+        try:
+            if service_name:
+                # Start monitoring for specific service
+                if service_name not in self.services:
+                    logger.error(f"Service {service_name} not registered")
+                    return False
+                if service_name not in self.monitoring_tasks:
+                    task = asyncio.create_task(self._monitor_service(service_name))
+                    self.monitoring_tasks[service_name] = task
+                    logger.info(f"Started monitoring service: {service_name}")
+            else:
+                # Start monitoring for all services
+                for svc_name in self.services:
+                    if svc_name not in self.monitoring_tasks:
+                        task = asyncio.create_task(self._monitor_service(svc_name))
+                        self.monitoring_tasks[svc_name] = task
+                # Start global monitoring
+                if not self.monitoring_enabled:
+                    self.monitoring_enabled = True
+                    self.global_monitor_task = asyncio.create_task(self._global_monitor())
+                    logger.info("Started global health monitoring")
+            return True
+        except Exception as e:
+            logger.error(f"Failed to start monitoring: {e}")
+            return False
+    async def stop_monitoring(self, service_name: Optional[str] = None) -> bool:
+        """
+        Stop health monitoring for a specific service or all services.
+        Args:
+            service_name: Service to stop monitoring, or None for all services
+        Returns:
+            Stop success
+        """
+        try:
+            if service_name:
+                # Stop monitoring for specific service
+                if service_name in self.monitoring_tasks:
+                    self.monitoring_tasks[service_name].cancel()
+                    del self.monitoring_tasks[service_name]
+                    logger.info(f"Stopped monitoring service: {service_name}")
+            else:
+                # Stop all monitoring
+                for task in self.monitoring_tasks.values():
+                    task.cancel()
+                self.monitoring_tasks.clear()
+                # Stop global monitoring
+                self.monitoring_enabled = False
+                if self.global_monitor_task:
+                    self.global_monitor_task.cancel()
+                    self.global_monitor_task = None
+                logger.info("Stopped all health monitoring")
+            return True
+        except Exception as e:
+            logger.error(f"Failed to stop monitoring: {e}")
+            return False
+    async def check_service_health(self, service_name: str) -> Dict[str, Any]:
+        """
+        Perform immediate health check for a service.
+        Args:
+            service_name: Service to check
+        Returns:
+            Health check result
+        """
+        if service_name not in self.services:
+            return {
+                "healthy": False,
+                "error": f"Service {service_name} not registered"
+            }
+        try:
+            start_time = time.time()
+            service = self.services[service_name]
+            # Perform health check
+            health_result = await service.health_check()
+            response_time = (time.time() - start_time) * 1000  # ms
+            # Update metrics
+            metrics = self.metrics[service_name]
+            metrics.last_check = datetime.now()
+            metrics.response_time_ms = response_time
+            if health_result.get("healthy", False):
+                metrics.status = ServiceStatus.RUNNING
+                metrics.consecutive_failures = 0
+                # Update additional metrics if available
+                if "memory_usage_mb" in health_result:
+                    metrics.memory_usage_mb = health_result["memory_usage_mb"]
+                if "gpu_utilization" in health_result:
+                    metrics.gpu_utilization = health_result["gpu_utilization"]
+                if "uptime_seconds" in health_result:
+                    metrics.uptime_seconds = health_result["uptime_seconds"]
+                if "request_count" in health_result:
+                    metrics.request_count = health_result["request_count"]
+            else:
+                metrics.consecutive_failures += 1
+                metrics.error_count += 1
+                metrics.last_error = health_result.get("error", "Unknown error")
+                if metrics.consecutive_failures >= self.failure_threshold:
+                    metrics.status = ServiceStatus.UNHEALTHY
+                else:
+                    metrics.status = ServiceStatus.ERROR
+            return {
+                **health_result,
+                "response_time_ms": response_time,
+                "consecutive_failures": metrics.consecutive_failures,
+                "service_name": service_name
+            }
+        except Exception as e:
+            logger.error(f"Health check failed for {service_name}: {e}")
+            # Update metrics on exception
+            metrics = self.metrics[service_name]
+            metrics.last_check = datetime.now()
+            metrics.consecutive_failures += 1
+            metrics.error_count += 1
+            metrics.last_error = str(e)
+            metrics.status = ServiceStatus.ERROR
+            return {
+                "healthy": False,
+                "error": str(e),
+                "service_name": service_name,
+                "consecutive_failures": metrics.consecutive_failures
+            }
+    def get_service_metrics(self, service_name: str) -> Optional[HealthMetrics]:
+        """Get metrics for a specific service"""
+        return self.metrics.get(service_name)
+    def get_all_metrics(self) -> Dict[str, HealthMetrics]:
+        """Get metrics for all services"""
+        return self.metrics.copy()
+    def get_system_health(self) -> Dict[str, Any]:
+        """Get overall system health summary"""
+        total_services = len(self.services)
+        healthy_services = sum(1 for m in self.metrics.values() if m.status == ServiceStatus.RUNNING)
+        unhealthy_services = sum(1 for m in self.metrics.values() if m.status == ServiceStatus.UNHEALTHY)
+        error_services = sum(1 for m in self.metrics.values() if m.status == ServiceStatus.ERROR)
+        # Get GPU status
+        self.gpu_manager.refresh()
+        gpu_info = [
+            {
+                "gpu_id": gpu.gpu_id,
+                "name": gpu.name,
+                "memory_used_mb": gpu.memory_used,
+                "memory_total_mb": gpu.memory_total,
+                "memory_free_mb": gpu.memory_free,
+                "utilization_percent": gpu.utilization,
+                "temperature_c": gpu.temperature
+            }
+            for gpu in self.gpu_manager.gpus
+        ]
+        overall_status = "healthy"
+        if unhealthy_services > 0:
+            overall_status = "degraded"
+        elif error_services > 0:
+            overall_status = "warning"
+        elif healthy_services == 0 and total_services > 0:
+            overall_status = "down"
+        return {
+            "overall_status": overall_status,
+            "timestamp": datetime.now().isoformat(),
+            "services": {
+                "total": total_services,
+                "healthy": healthy_services,
+                "unhealthy": unhealthy_services,
+                "error": error_services,
+                "stopped": total_services - healthy_services - unhealthy_services - error_services
+            },
+            "gpu_info": gpu_info,
+            "monitoring_enabled": self.monitoring_enabled,
+            "check_interval": self.check_interval
+        }
+    async def restart_unhealthy_services(self) -> Dict[str, Any]:
+        """Attempt to restart unhealthy services"""
+        restart_results = {}
+        for service_name, metrics in self.metrics.items():
+            if metrics.status == ServiceStatus.UNHEALTHY:
+                try:
+                    logger.info(f"Attempting to restart unhealthy service: {service_name}")
+                    service = self.services[service_name]
+                    # Check if service has restart method
+                    if hasattr(service, 'restart'):
+                        result = await service.restart()
+                        restart_results[service_name] = result
+                    elif hasattr(service, 'stop') and hasattr(service, 'start'):
+                        # Manual restart
+                        await service.stop()
+                        await asyncio.sleep(2)
+                        result = await service.start()
+                        restart_results[service_name] = result
+                    else:
+                        restart_results[service_name] = {
+                            "success": False,
+                            "error": "Service does not support restart"
+                        }
+                except Exception as e:
+                    logger.error(f"Failed to restart service {service_name}: {e}")
+                    restart_results[service_name] = {
+                        "success": False,
+                        "error": str(e)
+                    }
+        return restart_results
+    async def _monitor_service(self, service_name: str):
+        """Background monitoring task for a service"""
+        logger.info(f"Starting background monitoring for service: {service_name}")
+        try:
+            while True:
+                await self.check_service_health(service_name)
+                await asyncio.sleep(self.check_interval)
+        except asyncio.CancelledError:
+            logger.info(f"Monitoring cancelled for service: {service_name}")
+        except Exception as e:
+            logger.error(f"Monitoring error for service {service_name}: {e}")
+    async def _global_monitor(self):
+        """Global monitoring task for system-wide health"""
+        logger.info("Starting global health monitoring")
+        try:
+            while self.monitoring_enabled:
+                # Check system resources
+                self.gpu_manager.refresh()
+                # Log system health periodically
+                system_health = self.get_system_health()
+                if system_health["overall_status"] != "healthy":
+                    logger.warning(f"System health: {system_health['overall_status']}")
+                # Auto-restart unhealthy services if configured
+                unhealthy_count = system_health["services"]["unhealthy"]
+                if unhealthy_count > 0:
+                    logger.info(f"Found {unhealthy_count} unhealthy services, attempting restart...")
+                    await self.restart_unhealthy_services()
+                await asyncio.sleep(self.check_interval * 2)  # Less frequent than individual checks
+        except asyncio.CancelledError:
+            logger.info("Global monitoring cancelled")
+        except Exception as e:
+            logger.error(f"Global monitoring error: {e}")
+# Global health checker instance
+_health_checker = None
+def get_health_checker() -> LocalHealthChecker:
+    """Get global health checker instance"""
+    global _health_checker
+    if _health_checker is None:
+        _health_checker = LocalHealthChecker()
+    return _health_checker

isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl

isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl