isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/client.py +466 -43
- isa_model/core/cache/redis_cache.py +12 -3
- isa_model/core/config/config_manager.py +230 -3
- isa_model/core/config.py +90 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +21 -1
- isa_model/core/database/supabase_client.py +154 -19
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +27 -18
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_manager.py +40 -17
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +174 -18
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/services/intelligent_model_selector.py +399 -21
- isa_model/core/storage/hf_storage.py +1 -1
- isa_model/core/types.py +1 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -370
- isa_model/deployment/local/__init__.py +31 -0
- isa_model/deployment/local/config.py +248 -0
- isa_model/deployment/local/gpu_gateway.py +607 -0
- isa_model/deployment/local/health_checker.py +428 -0
- isa_model/deployment/local/provider.py +586 -0
- isa_model/deployment/local/tensorrt_service.py +621 -0
- isa_model/deployment/local/transformers_service.py +644 -0
- isa_model/deployment/local/vllm_service.py +527 -0
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +137 -10
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/openai_stt_service.py +22 -6
- isa_model/inference/services/custom_model_manager.py +277 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +335 -24
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/local_llm_service.py +747 -0
- isa_model/inference/services/llm/ollama_llm_service.py +9 -2
- isa_model/inference/services/llm/openai_llm_service.py +33 -16
- isa_model/inference/services/llm/yyds_llm_service.py +8 -2
- isa_model/inference/services/vision/__init__.py +22 -1
- isa_model/inference/services/vision/blip_vision_service.py +359 -0
- isa_model/inference/services/vision/helpers/image_utils.py +8 -5
- isa_model/inference/services/vision/isa_vision_service.py +65 -4
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +172 -22
- isa_model/serving/api/middleware/auth.py +8 -2
- isa_model/serving/api/middleware/security.py +23 -33
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +4 -1
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +138 -2
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +680 -18
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +68 -54
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
- isa_model-0.4.3.dist-info/RECORD +193 -0
- isa_model/core/storage/minio_storage.py +0 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks/__init__.py +0 -27
- isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
- isa_model/eval/benchmarks.py +0 -701
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -24
- isa_model/eval/evaluators/audio_evaluator.py +0 -727
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/embedding_evaluator.py +0 -742
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/evaluators/vision_evaluator.py +0 -564
- isa_model/eval/example_evaluation.py +0 -395
- isa_model/eval/factory.py +0 -798
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/isa_benchmarks.py +0 -700
- isa_model/eval/isa_integration.py +0 -582
- isa_model/eval/metrics.py +0 -951
- isa_model/eval/tests/unit/test_basic.py +0 -396
- isa_model/serving/api/routes/evaluations.py +0 -579
- isa_model/training/__init__.py +0 -168
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -26
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/examples/intelligent_training_example.py +0 -281
- isa_model/training/factory.py +0 -424
- isa_model/training/intelligent/__init__.py +0 -25
- isa_model/training/intelligent/decision_engine.py +0 -643
- isa_model/training/intelligent/intelligent_factory.py +0 -888
- isa_model/training/intelligent/knowledge_base.py +0 -751
- isa_model/training/intelligent/resource_optimizer.py +0 -839
- isa_model/training/intelligent/task_classifier.py +0 -576
- isa_model/training/storage/__init__.py +0 -24
- isa_model/training/storage/core_integration.py +0 -439
- isa_model/training/storage/training_repository.py +0 -552
- isa_model/training/storage/training_storage.py +0 -628
- isa_model-0.4.0.dist-info/RECORD +0 -182
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,857 @@
|
|
1
|
+
"""
|
2
|
+
System Models
|
3
|
+
|
4
|
+
Data models for system health, resource usage, and service status monitoring,
|
5
|
+
following the ISA Model architecture pattern.
|
6
|
+
"""
|
7
|
+
|
8
|
+
import logging
|
9
|
+
from datetime import datetime, timezone, timedelta
|
10
|
+
from typing import Dict, List, Optional, Any, Union
|
11
|
+
from dataclasses import dataclass, field
|
12
|
+
from enum import Enum
|
13
|
+
import statistics
|
14
|
+
|
15
|
+
logger = logging.getLogger(__name__)
|
16
|
+
|
17
|
+
class HealthStatus(str, Enum):
    """Closed set of health states a system, component, or service can report.

    Members subclass ``str``, so each one compares equal to its plain
    string value (for example ``HealthStatus.HEALTHY == "healthy"``).
    """

    # Operating normally
    HEALTHY = "healthy"
    # Working, but impaired
    DEGRADED = "degraded"
    # Failing
    CRITICAL = "critical"

    # Not reachable / not running
    OFFLINE = "offline"
    # No status information available
    UNKNOWN = "unknown"
    # Under maintenance
    MAINTENANCE = "maintenance"
|
25
|
+
|
26
|
+
class ServiceType(str, Enum):
    """Closed set of service categories used to label monitored services.

    Members subclass ``str``, so each one compares equal to its plain
    string value (for example ``ServiceType.API == "api"``).
    """

    # Request-serving and data-holding services
    API = "api"
    DATABASE = "database"
    CACHE = "cache"
    QUEUE = "queue"
    STORAGE = "storage"

    # Processing, observability, and third-party services
    COMPUTE = "compute"
    MONITORING = "monitoring"
    EXTERNAL = "external"
|
36
|
+
|
37
|
+
class AlertSeverity(str, Enum):
    """Closed set of alert severity levels, listed from least to most severe.

    Members subclass ``str``, so each one compares equal to its plain
    string value (for example ``AlertSeverity.CRITICAL == "critical"``).
    """

    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
    CRITICAL = "critical"
|
43
|
+
|
44
|
+
@dataclass
class SystemHealth:
    """
    System health monitoring record

    Tracks overall system health including component status,
    performance metrics, and alert information.

    ``timestamp`` defaults to the current UTC time and ``last_health_check``
    defaults to ``timestamp`` (see ``__post_init__``).
    """
    health_id: str
    system_name: str
    overall_status: str = HealthStatus.HEALTHY
    timestamp: Optional[datetime] = None

    # Component health
    component_status: Dict[str, str] = field(default_factory=dict)
    component_metrics: Dict[str, Dict[str, float]] = field(default_factory=dict)
    failing_components: List[str] = field(default_factory=list)   # components reported CRITICAL
    degraded_components: List[str] = field(default_factory=list)  # components reported DEGRADED

    # Performance indicators
    response_time_ms: Optional[float] = None
    throughput_rps: Optional[float] = None
    error_rate_percent: Optional[float] = None
    availability_percent: Optional[float] = None
    uptime_seconds: Optional[int] = None

    # Resource utilization
    cpu_usage_percent: Optional[float] = None
    memory_usage_percent: Optional[float] = None
    disk_usage_percent: Optional[float] = None
    network_usage_mbps: Optional[float] = None

    # Health checks
    last_health_check: Optional[datetime] = None
    health_check_interval_seconds: int = 60
    consecutive_failures: int = 0
    consecutive_successes: int = 0

    # Alerts and issues
    active_alerts: List[Dict[str, Any]] = field(default_factory=list)
    resolved_alerts_24h: int = 0
    critical_issues: List[str] = field(default_factory=list)

    # Metadata
    version: Optional[str] = None
    environment: str = "production"
    region: Optional[str] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Fill in the timestamp defaults that cannot be expressed as literals."""
        if self.timestamp is None:
            self.timestamp = datetime.now(timezone.utc)
        if self.last_health_check is None:
            self.last_health_check = self.timestamp

    @property
    def is_healthy(self) -> bool:
        """Check if system is healthy"""
        return self.overall_status == HealthStatus.HEALTHY

    @property
    def health_score(self) -> float:
        """Calculate overall health score (0-100).

        Starts at 100 and is reduced by: the share of non-healthy
        components, the error rate (5 points per percent, capped at 50),
        availability (used as an upper bound), CPU / memory usage above
        90% (2 points per percent over), and 10 points per active
        critical alert. The result is clamped to [0, 100].
        """
        score = 100.0

        # Component health impact: the healthy share caps the score
        total_components = len(self.component_status)
        if total_components > 0:
            healthy_components = sum(
                1 for status in self.component_status.values()
                if status == HealthStatus.HEALTHY
            )
            component_score = (healthy_components / total_components) * 100
            score = min(score, component_score)

        # Performance impact
        if self.error_rate_percent is not None:
            score -= min(50, self.error_rate_percent * 5)  # Error rate penalty

        if self.availability_percent is not None:
            score = min(score, self.availability_percent)

        if self.cpu_usage_percent is not None and self.cpu_usage_percent > 90:
            score -= (self.cpu_usage_percent - 90) * 2

        if self.memory_usage_percent is not None and self.memory_usage_percent > 90:
            score -= (self.memory_usage_percent - 90) * 2

        # Alert impact
        critical_alert_count = sum(
            1 for alert in self.active_alerts
            if alert.get('severity') == AlertSeverity.CRITICAL
        )
        score -= critical_alert_count * 10

        return max(0.0, min(100.0, score))

    @property
    def needs_attention(self) -> bool:
        """Check if system needs immediate attention.

        True when the overall status is critical or degraded, when any
        critical issue is recorded, or when any active alert is critical.
        """
        return (self.overall_status in [HealthStatus.CRITICAL, HealthStatus.DEGRADED] or
                len(self.critical_issues) > 0 or
                any(alert.get('severity') == AlertSeverity.CRITICAL for alert in self.active_alerts))

    @property
    def time_since_last_check(self) -> int:
        """Get seconds since last health check (0 if none is recorded)."""
        if self.last_health_check:
            return int((datetime.now(timezone.utc) - self.last_health_check).total_seconds())
        return 0

    def update_component_status(self, component_name: str, status: str,
                                metrics: Optional[Dict[str, float]] = None):
        """Update status for a specific component.

        Records the new status (and metrics, when given), keeps
        ``failing_components`` / ``degraded_components`` in sync,
        recomputes ``overall_status`` and updates the consecutive
        failure / success counters.

        Args:
            component_name: Key of the component in ``component_status``.
            status: New status value (a ``HealthStatus`` or equal string).
            metrics: Optional metrics that replace the stored ones.
        """
        old_status = self.component_status.get(component_name)
        self.component_status[component_name] = status

        if metrics:
            self.component_metrics[component_name] = metrics

        # Update component lists: a component lives in at most one of them
        if status == HealthStatus.CRITICAL:
            if component_name not in self.failing_components:
                self.failing_components.append(component_name)
            if component_name in self.degraded_components:
                self.degraded_components.remove(component_name)
        elif status == HealthStatus.DEGRADED:
            if component_name not in self.degraded_components:
                self.degraded_components.append(component_name)
            if component_name in self.failing_components:
                self.failing_components.remove(component_name)
        else:  # Healthy or other
            if component_name in self.failing_components:
                self.failing_components.remove(component_name)
            if component_name in self.degraded_components:
                self.degraded_components.remove(component_name)

        # Update overall status
        self._calculate_overall_status()

        # Track consecutive failures/successes. The counters only move when
        # the component's status actually changed.
        if old_status != status:
            # OFFLINE is counted as a failure so the counters agree with
            # _calculate_overall_status, where an offline component
            # degrades the whole system.
            failure_statuses = (HealthStatus.CRITICAL, HealthStatus.DEGRADED,
                                HealthStatus.OFFLINE)
            if status in failure_statuses:
                self.consecutive_failures += 1
                self.consecutive_successes = 0
            else:
                self.consecutive_successes += 1
                self.consecutive_failures = 0

    def _calculate_overall_status(self):
        """Calculate overall system status from component statuses.

        No components -> UNKNOWN; any CRITICAL component -> CRITICAL;
        every component HEALTHY -> HEALTHY; anything else (degraded,
        offline, unknown, maintenance, ...) -> DEGRADED.
        """
        if not self.component_status:
            self.overall_status = HealthStatus.UNKNOWN
            return

        statuses = list(self.component_status.values())

        if HealthStatus.CRITICAL in statuses:
            self.overall_status = HealthStatus.CRITICAL
        elif all(status == HealthStatus.HEALTHY for status in statuses):
            self.overall_status = HealthStatus.HEALTHY
        else:
            self.overall_status = HealthStatus.DEGRADED

    def add_alert(self, alert_id: str, severity: str, message: str,
                  component: Optional[str] = None, **kwargs):
        """Add an active alert, or refresh the existing one with the same id.

        Args:
            alert_id: Identifier used to de-duplicate alerts.
            severity: Severity value (an ``AlertSeverity`` or equal string).
            message: Alert text.
            component: Optional name of the affected component.
            **kwargs: Extra keys merged into the alert dict.
        """
        alert = {
            "alert_id": alert_id,
            "severity": severity,
            "message": message,
            "component": component,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            **kwargs
        }

        # Check if alert already exists
        existing_alert = next((a for a in self.active_alerts if a.get("alert_id") == alert_id), None)
        if existing_alert:
            existing_alert.update(alert)
        else:
            self.active_alerts.append(alert)

    def resolve_alert(self, alert_id: str):
        """Resolve an active alert.

        Removes the alert with the given id from ``active_alerts`` and
        bumps ``resolved_alerts_24h``. Unknown ids are ignored, so the
        resolved counter only reflects alerts that were actually active.
        """
        remaining_alerts = [alert for alert in self.active_alerts
                            if alert.get("alert_id") != alert_id]
        if len(remaining_alerts) != len(self.active_alerts):
            self.active_alerts = remaining_alerts
            self.resolved_alerts_24h += 1

    def perform_health_check(self) -> Dict[str, Any]:
        """Perform comprehensive health check.

        Stamps ``last_health_check`` with the current UTC time and
        returns a summary dict with the overall status, health score,
        component counts, alert counts, and key performance figures.
        """
        self.last_health_check = datetime.now(timezone.utc)

        health_result = {
            "overall_status": self.overall_status,
            "health_score": self.health_score,
            "timestamp": self.last_health_check.isoformat(),
            "component_summary": {
                "total": len(self.component_status),
                "healthy": sum(1 for s in self.component_status.values() if s == HealthStatus.HEALTHY),
                "degraded": len(self.degraded_components),
                "critical": len(self.failing_components)
            },
            "alerts": {
                "active": len(self.active_alerts),
                "critical": sum(1 for a in self.active_alerts if a.get("severity") == AlertSeverity.CRITICAL),
                "resolved_24h": self.resolved_alerts_24h
            },
            "performance": {
                "response_time_ms": self.response_time_ms,
                "error_rate_percent": self.error_rate_percent,
                "availability_percent": self.availability_percent
            },
            "needs_attention": self.needs_attention
        }

        return health_result
|
261
|
+
|
262
|
+
@dataclass
class ResourceUsage:
    """
    Resource usage monitoring record

    Tracks resource consumption across different dimensions
    including compute, memory, storage, and network resources.

    ``timestamp`` defaults to the current UTC time, and
    ``current_usage_percent`` is derived from ``current_usage`` whenever a
    positive ``total_capacity`` is known (see ``__post_init__``).
    """
    usage_id: str
    resource_type: str  # cpu, memory, disk, network, gpu
    timestamp: Optional[datetime] = None
    measurement_period_seconds: int = 60

    # Current usage
    current_usage: float = 0.0
    current_usage_percent: Optional[float] = None

    # Historical data
    min_usage: float = 0.0
    max_usage: float = 0.0
    avg_usage: float = 0.0
    p95_usage: Optional[float] = None
    p99_usage: Optional[float] = None

    # Capacity and limits
    total_capacity: Optional[float] = None
    allocated_capacity: Optional[float] = None
    reserved_capacity: Optional[float] = None
    soft_limit: Optional[float] = None
    hard_limit: Optional[float] = None

    # Usage patterns
    usage_samples: List[float] = field(default_factory=list)
    peak_hours: List[int] = field(default_factory=list)
    low_hours: List[int] = field(default_factory=list)

    # Trends and predictions
    trend_direction: str = "stable"  # increasing, decreasing, stable, volatile
    predicted_usage_1h: Optional[float] = None
    predicted_usage_24h: Optional[float] = None
    time_to_capacity: Optional[int] = None  # seconds until capacity reached

    # Alerts and thresholds
    warning_threshold: Optional[float] = None
    critical_threshold: Optional[float] = None
    threshold_breaches_24h: int = 0

    # Metadata
    host: Optional[str] = None
    service: Optional[str] = None
    tags: Dict[str, str] = field(default_factory=dict)

    def __post_init__(self):
        """Fill in the timestamp default and derive the usage percentage."""
        if self.timestamp is None:
            self.timestamp = datetime.now(timezone.utc)

        # Calculate percentage if capacity is known
        if self.total_capacity and self.total_capacity > 0:
            self.current_usage_percent = (self.current_usage / self.total_capacity) * 100

    @property
    def is_at_capacity(self) -> bool:
        """Check if resource is at or near capacity (>= 95% used)."""
        if self.current_usage_percent:
            return self.current_usage_percent >= 95
        return False

    @property
    def is_over_soft_limit(self) -> bool:
        """Check if usage exceeds soft limit (False when no limit is set)."""
        if self.soft_limit:
            return self.current_usage > self.soft_limit
        return False

    @property
    def is_over_hard_limit(self) -> bool:
        """Check if usage exceeds hard limit (False when no limit is set)."""
        if self.hard_limit:
            return self.current_usage > self.hard_limit
        return False

    @property
    def utilization_efficiency(self) -> float:
        """Calculate utilization efficiency (0-100).

        Current usage as a percentage of allocated capacity, capped at
        100; 0.0 when no capacity is allocated.
        """
        if not self.allocated_capacity or self.allocated_capacity == 0:
            return 0.0

        return min(100.0, (self.current_usage / self.allocated_capacity) * 100)

    @property
    def waste_percentage(self) -> float:
        """Calculate resource waste percentage.

        Unused share of the allocated capacity; 0.0 when no capacity is
        allocated or usage meets / exceeds the allocation.
        """
        if not self.allocated_capacity or self.allocated_capacity == 0:
            return 0.0

        unused = max(0, self.allocated_capacity - self.current_usage)
        return (unused / self.allocated_capacity) * 100

    def add_usage_sample(self, usage_value: float, timestamp: Optional[datetime] = None):
        """Add a usage measurement sample.

        Appends the sample, refreshes the current / min / max values,
        recomputes statistics once at least 10 samples exist and, when a
        timestamp is given, records its hour as a peak or low hour.

        Args:
            usage_value: Measured usage, in the same unit as the capacity.
            timestamp: Optional time of the measurement; only its ``hour``
                is used.
        """
        # ``min_usage == 0`` doubles as "not set yet". Remember whether a
        # real zero sample was already recorded so a genuine minimum of 0
        # is not overwritten by the next, larger sample.
        zero_already_recorded = self.min_usage == 0 and 0 in self.usage_samples

        self.usage_samples.append(usage_value)

        # Update current values
        self.current_usage = usage_value
        if self.total_capacity and self.total_capacity > 0:
            self.current_usage_percent = (usage_value / self.total_capacity) * 100

        # Update min/max
        min_is_unset = self.min_usage == 0 and not zero_already_recorded
        if min_is_unset or usage_value < self.min_usage:
            self.min_usage = usage_value
        if usage_value > self.max_usage:
            self.max_usage = usage_value

        # Recalculate statistics if we have enough samples
        if len(self.usage_samples) >= 10:
            self._calculate_statistics()

        # Track peak hours. Skipped while no average is available yet:
        # against an average of 0 every positive sample would look like a peak.
        if timestamp is not None and self.avg_usage > 0:
            hour = timestamp.hour
            if usage_value > self.avg_usage * 1.5:
                if hour not in self.peak_hours:
                    self.peak_hours.append(hour)
            elif usage_value < self.avg_usage * 0.5:
                if hour not in self.low_hours:
                    self.low_hours.append(hour)

    def _calculate_statistics(self):
        """Calculate statistical measures from usage samples.

        Updates ``avg_usage``, ``p95_usage``, ``p99_usage`` and
        ``trend_direction``. Does nothing when there are no samples.
        """
        if not self.usage_samples:
            return

        self.avg_usage = statistics.mean(self.usage_samples)

        if len(self.usage_samples) > 1:
            sorted_samples = sorted(self.usage_samples)
            n = len(sorted_samples)

            self.p95_usage = sorted_samples[int(0.95 * n)]
            # With 100 samples or fewer the largest sample stands in for p99
            self.p99_usage = sorted_samples[int(0.99 * n)] if n > 100 else sorted_samples[-1]

        # Analyze trend: compare the last five samples with everything before
        if len(self.usage_samples) >= 5:
            recent_avg = statistics.mean(self.usage_samples[-5:])
            older_avg = statistics.mean(self.usage_samples[:-5]) if len(self.usage_samples) > 5 else self.avg_usage

            if recent_avg > older_avg * 1.1:
                self.trend_direction = "increasing"
            elif recent_avg < older_avg * 0.9:
                self.trend_direction = "decreasing"
            else:
                # Check for volatility
                std_dev = statistics.stdev(self.usage_samples[-10:]) if len(self.usage_samples) >= 10 else 0
                cv = std_dev / self.avg_usage if self.avg_usage > 0 else 0

                if cv > 0.3:  # High coefficient of variation
                    self.trend_direction = "volatile"
                else:
                    self.trend_direction = "stable"

    def predict_future_usage(self, hours_ahead: int = 1) -> Optional[float]:
        """Predict future resource usage based on trends.

        Fits a least-squares line through the last (up to) 10 samples and
        extrapolates it ``hours_ahead`` sample steps past the latest
        sample; one sample step is treated as one hour.

        Args:
            hours_ahead: Number of steps beyond the latest sample.

        Returns:
            The predicted usage (never negative), or ``None`` when fewer
            than 5 samples are available.
        """
        if len(self.usage_samples) < 5:
            return None

        # Simple linear trend prediction
        recent_samples = self.usage_samples[-10:]
        n = len(recent_samples)
        xs = range(n)

        # Calculate linear regression (simplified)
        sum_x = sum(xs)
        sum_y = sum(recent_samples)
        sum_xy = sum(x * y for x, y in zip(xs, recent_samples))
        sum_x2 = sum(x ** 2 for x in xs)

        denominator = n * sum_x2 - sum_x ** 2
        if denominator != 0:
            slope = (n * sum_xy - sum_x * sum_y) / denominator
            intercept = (sum_y - slope * sum_x) / n

            # Predict future value. The latest sample sits at x = n - 1, so
            # ``hours_ahead`` steps later is x = (n - 1) + hours_ahead.
            future_x = (n - 1) + hours_ahead
            predicted = slope * future_x + intercept

            return max(0.0, predicted)

        return self.avg_usage

    def calculate_time_to_capacity(self) -> Optional[int]:
        """Calculate time until resource reaches capacity.

        Returns:
            Seconds until ``total_capacity`` is reached at the recent
            growth rate, ``0`` when usage already meets the capacity, or
            ``None`` when the capacity is unknown, the trend is not
            "increasing", or no positive growth can be measured.
        """
        if not self.total_capacity or self.trend_direction != "increasing":
            return None

        available_capacity = self.total_capacity - self.current_usage
        if available_capacity <= 0:
            return 0

        # Estimate growth rate from recent samples
        if len(self.usage_samples) >= 5:
            recent_growth = self.usage_samples[-1] - self.usage_samples[-5]
            if recent_growth > 0:
                # Estimate hours to capacity based on current growth rate.
                # Five samples span four intervals (assuming samples are hourly).
                growth_per_hour = recent_growth / 4
                hours_to_capacity = available_capacity / growth_per_hour
                return int(hours_to_capacity * 3600)  # Convert to seconds

        return None

    def check_thresholds(self) -> List[Dict[str, Any]]:
        """Check if usage exceeds configured thresholds.

        Returns:
            One alert dict per breached threshold (warning first, then
            critical); empty when nothing is breached or no thresholds
            are configured.
        """
        alerts = []

        if self.warning_threshold and self.current_usage > self.warning_threshold:
            alerts.append({
                "level": "warning",
                "message": f"{self.resource_type} usage ({self.current_usage}) exceeds warning threshold ({self.warning_threshold})",
                "threshold": self.warning_threshold,
                "current_value": self.current_usage
            })

        if self.critical_threshold and self.current_usage > self.critical_threshold:
            alerts.append({
                "level": "critical",
                "message": f"{self.resource_type} usage ({self.current_usage}) exceeds critical threshold ({self.critical_threshold})",
                "threshold": self.critical_threshold,
                "current_value": self.current_usage
            })

        return alerts
|
492
|
+
|
493
|
+
@dataclass
class ServiceStatus:
    """
    Health and status snapshot for a single service.

    Holds availability, performance, health-check, dependency and incident
    data for one service, together with helpers that derive scores and
    reports from those fields.
    """
    service_id: str
    service_name: str
    service_type: str
    status: str = HealthStatus.UNKNOWN
    timestamp: datetime = None  # filled with the current UTC time in __post_init__

    # Deployment details
    version: Optional[str] = None
    endpoint_url: Optional[str] = None
    health_check_url: Optional[str] = None
    last_deployment: Optional[datetime] = None

    # Availability figures
    uptime_seconds: int = 0
    downtime_seconds: int = 0
    availability_percent: float = 100.0
    mttr_seconds: Optional[int] = None  # Mean Time To Repair
    mtbf_seconds: Optional[int] = None  # Mean Time Between Failures

    # Performance figures
    response_time_ms: Optional[float] = None
    throughput_rps: Optional[float] = None
    error_rate_percent: float = 0.0
    success_rate_percent: float = 100.0

    # Health-check bookkeeping
    last_health_check: Optional[datetime] = None
    health_check_success: bool = True
    consecutive_failures: int = 0
    consecutive_successes: int = 0
    health_check_interval_seconds: int = 30

    # Dependency tracking
    dependencies: List[str] = field(default_factory=list)
    dependency_statuses: Dict[str, str] = field(default_factory=dict)
    critical_dependencies: List[str] = field(default_factory=list)

    # Incident tracking
    active_incidents: List[Dict[str, Any]] = field(default_factory=list)
    incidents_24h: int = 0
    last_incident: Optional[datetime] = None

    # Operational configuration
    auto_restart_enabled: bool = False
    restart_count_24h: int = 0
    circuit_breaker_status: str = "closed"  # one of: closed, open, half_open

    # Descriptive metadata
    environment: str = "production"
    region: Optional[str] = None
    owner_team: Optional[str] = None
    tags: Dict[str, str] = field(default_factory=dict)

    def __post_init__(self):
        # A record created without a timestamp is stamped with the current
        # UTC time; a missing last health check defaults to that timestamp.
        if self.timestamp is None:
            self.timestamp = datetime.now(timezone.utc)
        if self.last_health_check is None:
            self.last_health_check = self.timestamp

    @property
    def is_healthy(self) -> bool:
        """True when status is HEALTHY and the last health check succeeded"""
        if not (self.status == HealthStatus.HEALTHY):
            return False
        return self.health_check_success

    @property
    def is_available(self) -> bool:
        """True for any status other than OFFLINE"""
        offline = HealthStatus.OFFLINE
        return self.status != offline

    @property
    def needs_attention(self) -> bool:
        """True when the service is critical/offline, failing checks, or has open incidents"""
        if self.status in (HealthStatus.CRITICAL, HealthStatus.OFFLINE):
            return True
        if self.consecutive_failures >= 3:
            return True
        return len(self.active_incidents) > 0

    @property
    def uptime_percent_24h(self) -> float:
        """Uptime as a percentage of one day, with uptime capped at 24 hours"""
        day_seconds = 24 * 60 * 60
        capped_uptime = min(self.uptime_seconds, day_seconds)
        return (capped_uptime / day_seconds) * 100

    @property
    def service_level_indicator(self) -> Dict[str, float]:
        """Service Level Indicators (SLI); missing performance values are reported as 0"""
        latency = self.response_time_ms or 0
        throughput = self.throughput_rps or 0
        return {
            "availability": self.availability_percent,
            "success_rate": self.success_rate_percent,
            "response_time_ms": latency,
            "throughput_rps": throughput
        }

    @property
    def reliability_score(self) -> float:
        """Overall reliability score clamped to the range 0-100"""
        # Start from 100 and scale by availability, then by success rate.
        score = 100.0 * (self.availability_percent / 100) * (self.success_rate_percent / 100)

        # Failed health checks cost 10 points each, at most 50.
        failures = self.consecutive_failures
        if failures > 0:
            score -= min(50, failures * 10)

        # Incidents in the last 24h cost 5 points each, at most 30.
        incidents = self.incidents_24h
        if incidents > 0:
            score -= min(30, incidents * 5)

        return max(0.0, min(100.0, score))

    def update_status(self, new_status: str, reason: Optional[str] = None):
        """Set a new status, refresh the timestamp and record offline transitions"""
        previous_status = self.status
        self.status = new_status
        self.timestamp = datetime.now(timezone.utc)

        # Nothing more to do unless the status actually changed.
        if previous_status == new_status:
            return

        if new_status == HealthStatus.OFFLINE:
            # Going offline opens an incident only when a reason is given.
            if reason:
                self.add_incident(f"Service offline: {reason}", AlertSeverity.CRITICAL)
            return

        if previous_status == HealthStatus.OFFLINE:
            # Leaving the offline state closes every open incident.
            self.resolve_incidents("Service restored")

    def perform_health_check(self) -> bool:
        """Record a health check result and update counters and circuit breaker"""
        self.last_health_check = datetime.now(timezone.utc)

        # Placeholder for a real probe: the outcome is simulated from the
        # current status, failing only while the service is OFFLINE.
        check_passed = not (self.status == HealthStatus.OFFLINE)
        self.health_check_success = check_passed
        if check_passed:
            self.consecutive_successes += 1
            self.consecutive_failures = 0
        else:
            self.consecutive_failures += 1
            self.consecutive_successes = 0

        # Circuit breaker: open at 5+ failures, half_open at 3-4, else closed.
        failure_streak = self.consecutive_failures
        if failure_streak >= 5:
            breaker_state = "open"
        elif failure_streak >= 3:
            breaker_state = "half_open"
        else:
            breaker_state = "closed"
        self.circuit_breaker_status = breaker_state

        return self.health_check_success

    def add_incident(self, description: str, severity: str, incident_id: Optional[str] = None):
        """Open a new incident, generating an ID when none is provided"""
        if not incident_id:
            import uuid
            incident_id = f"inc_{uuid.uuid4().hex[:8]}"

        started_at = datetime.now(timezone.utc).isoformat()
        self.active_incidents.append({
            "incident_id": incident_id,
            "description": description,
            "severity": severity,
            "start_time": started_at,
            "status": "active"
        })

        self.incidents_24h += 1
        self.last_incident = datetime.now(timezone.utc)

    def resolve_incidents(self, resolution: str):
        """Mark every active incident as resolved and clear the active list"""
        for open_incident in self.active_incidents:
            open_incident.update({
                "status": "resolved",
                "end_time": datetime.now(timezone.utc).isoformat(),
                "resolution": resolution,
            })

        self.active_incidents = []

    def check_dependencies(self) -> bool:
        """Return True when every critical dependency is HEALTHY or DEGRADED"""
        acceptable = (HealthStatus.HEALTHY, HealthStatus.DEGRADED)
        all_ok = True

        for dep_name in self.critical_dependencies:
            # A dependency with no recorded status counts as UNKNOWN.
            dep_status = self.dependency_statuses.get(dep_name, HealthStatus.UNKNOWN)
            if dep_status in acceptable:
                continue

            all_ok = False

            # A HEALTHY service is downgraded once a critical dependency fails.
            if self.status == HealthStatus.HEALTHY:
                self.update_status(HealthStatus.DEGRADED, f"Critical dependency {dep_name} is {dep_status}")

        return all_ok

    def generate_status_report(self) -> Dict[str, Any]:
        """Assemble a nested dictionary summarising the service's state"""
        service_info = {
            "service_id": self.service_id,
            "service_name": self.service_name,
            "service_type": self.service_type,
            "version": self.version,
            "environment": self.environment
        }
        current_status = {
            "status": self.status,
            "health_check_success": self.health_check_success,
            "is_available": self.is_available,
            "needs_attention": self.needs_attention,
            "timestamp": self.timestamp.isoformat()
        }
        performance = {
            "response_time_ms": self.response_time_ms,
            "throughput_rps": self.throughput_rps,
            "error_rate_percent": self.error_rate_percent,
            "success_rate_percent": self.success_rate_percent
        }
        reliability = {
            "availability_percent": self.availability_percent,
            "uptime_percent_24h": self.uptime_percent_24h,
            "reliability_score": self.reliability_score,
            "mttr_seconds": self.mttr_seconds,
            "mtbf_seconds": self.mtbf_seconds
        }
        incidents = {
            "active_count": len(self.active_incidents),
            "incidents_24h": self.incidents_24h,
            "last_incident": self.last_incident.isoformat() if self.last_incident else None
        }
        health_checks = {
            "consecutive_failures": self.consecutive_failures,
            "consecutive_successes": self.consecutive_successes,
            "last_check": self.last_health_check.isoformat() if self.last_health_check else None
        }
        dependencies = {
            "total": len(self.dependencies),
            "critical": len(self.critical_dependencies),
            "statuses": self.dependency_statuses
        }

        return {
            "service_info": service_info,
            "current_status": current_status,
            "performance": performance,
            "reliability": reliability,
            "incidents": incidents,
            "health_checks": health_checks,
            "dependencies": dependencies
        }
|
744
|
+
|
745
|
+
# Utility functions for working with system models
|
746
|
+
|
747
|
+
def create_system_health(
    system_name: str,
    environment: str = "production"
) -> SystemHealth:
    """Factory function to create a system health record.

    Args:
        system_name: Name of the system; also embedded in the generated ID.
        environment: Environment label passed through to the record.

    Returns:
        A ``SystemHealth`` whose ``health_id`` has the form
        ``health_<system_name>_<YYYYMMDD_HHMMSS>_<8 hex chars>``.
    """
    import uuid

    # Stamp the ID in UTC, like the timezone-aware timestamps used elsewhere
    # in this module, so the ID does not depend on the host's local timezone.
    created_at = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    health_id = f"health_{system_name}_{created_at}_{uuid.uuid4().hex[:8]}"

    return SystemHealth(
        health_id=health_id,
        system_name=system_name,
        environment=environment
    )
|
761
|
+
|
762
|
+
def create_resource_usage(
    resource_type: str,
    current_usage: float,
    total_capacity: Optional[float] = None,
    host: Optional[str] = None,
    service: Optional[str] = None
) -> ResourceUsage:
    """Factory function to create a resource usage record.

    Args:
        resource_type: Kind of resource; also embedded in the generated ID.
        current_usage: Current usage value for the resource.
        total_capacity: Optional total capacity of the resource.
        host: Optional host the measurement belongs to.
        service: Optional service the measurement belongs to.

    Returns:
        A ``ResourceUsage`` whose ``usage_id`` has the form
        ``usage_<resource_type>_<YYYYMMDD_HHMMSS>_<8 hex chars>``.
    """
    import uuid

    # Stamp the ID in UTC, like the timezone-aware timestamps used elsewhere
    # in this module, so the ID does not depend on the host's local timezone.
    created_at = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    usage_id = f"usage_{resource_type}_{created_at}_{uuid.uuid4().hex[:8]}"

    return ResourceUsage(
        usage_id=usage_id,
        resource_type=resource_type,
        current_usage=current_usage,
        total_capacity=total_capacity,
        host=host,
        service=service
    )
|
782
|
+
|
783
|
+
def create_service_status(
    service_name: str,
    service_type: str,
    endpoint_url: Optional[str] = None,
    environment: str = "production"
) -> ServiceStatus:
    """Build a ServiceStatus record with a freshly generated service ID.

    The ID is ``svc_<service_name>_`` followed by the first eight hex
    characters of a random UUID; all other fields keep their defaults.
    """
    import uuid

    record_fields = {
        "service_id": f"svc_{service_name}_{uuid.uuid4().hex[:8]}",
        "service_name": service_name,
        "service_type": service_type,
        "endpoint_url": endpoint_url,
        "environment": environment,
    }
    return ServiceStatus(**record_fields)
|
801
|
+
|
802
|
+
def calculate_system_overview(
    health_records: List[SystemHealth],
    resource_records: List[ResourceUsage],
    service_records: List[ServiceStatus]
) -> Dict[str, Any]:
    """Summarise health, resource and service records into one overview dict.

    Each summary section is left as an empty dict when its input list is
    empty. The overall status is CRITICAL when fewer than 80% of systems
    are healthy, DEGRADED when fewer than 90% of services are available or
    any resource is at capacity, and HEALTHY otherwise. The ``alerts``
    list is always returned empty.
    """
    generated_at = datetime.now(timezone.utc).isoformat()

    # --- System health ---------------------------------------------------
    health_summary: Dict[str, Any] = {}
    if health_records:
        system_count = len(health_records)
        healthy_count = sum(1 for record in health_records if record.is_healthy)
        score_total = sum(record.health_score for record in health_records)
        health_summary = {
            "total_systems": system_count,
            "healthy_systems": healthy_count,
            "health_percentage": (healthy_count / system_count) * 100,
            "avg_health_score": score_total / system_count
        }

    # --- Resource utilisation --------------------------------------------
    resource_summary: Dict[str, Any] = {}
    if resource_records:
        def _mean_usage_percent(kind):
            # Average usage percent for one resource type; 0 when none match.
            matching = [rec for rec in resource_records if rec.resource_type == kind]
            if not matching:
                return 0
            return sum(rec.current_usage_percent or 0 for rec in matching) / len(matching)

        resource_summary = {
            "avg_cpu_usage": _mean_usage_percent("cpu"),
            "avg_memory_usage": _mean_usage_percent("memory"),
            "resources_at_capacity": sum(1 for rec in resource_records if rec.is_at_capacity),
            "resources_over_threshold": sum(1 for rec in resource_records if rec.is_over_soft_limit)
        }

    # --- Service availability --------------------------------------------
    availability_summary: Dict[str, Any] = {}
    if service_records:
        service_count = len(service_records)
        available_count = sum(1 for svc in service_records if svc.is_available)
        reliability_total = sum(svc.reliability_score for svc in service_records)
        availability_summary = {
            "total_services": service_count,
            "available_services": available_count,
            "availability_percentage": (available_count / service_count) * 100,
            "avg_reliability_score": reliability_total / service_count
        }

    # --- Overall status --------------------------------------------------
    # Missing sections fall back to defaults that never trigger a downgrade.
    overall_status = HealthStatus.HEALTHY
    if health_summary.get("health_percentage", 100) < 80:
        overall_status = HealthStatus.CRITICAL
    elif (availability_summary.get("availability_percentage", 100) < 90
          or resource_summary.get("resources_at_capacity", 0) > 0):
        overall_status = HealthStatus.DEGRADED

    return {
        "timestamp": generated_at,
        "system_health": health_summary,
        "resource_utilization": resource_summary,
        "service_availability": availability_summary,
        "overall_status": overall_status,
        "alerts": []
    }
|