isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199):
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +40 -17
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/storage/hf_storage.py +1 -1
  26. isa_model/core/types.py +1 -0
  27. isa_model/deployment/__init__.py +5 -48
  28. isa_model/deployment/core/__init__.py +2 -31
  29. isa_model/deployment/core/deployment_manager.py +1278 -370
  30. isa_model/deployment/local/__init__.py +31 -0
  31. isa_model/deployment/local/config.py +248 -0
  32. isa_model/deployment/local/gpu_gateway.py +607 -0
  33. isa_model/deployment/local/health_checker.py +428 -0
  34. isa_model/deployment/local/provider.py +586 -0
  35. isa_model/deployment/local/tensorrt_service.py +621 -0
  36. isa_model/deployment/local/transformers_service.py +644 -0
  37. isa_model/deployment/local/vllm_service.py +527 -0
  38. isa_model/deployment/modal/__init__.py +8 -0
  39. isa_model/deployment/modal/config.py +136 -0
  40. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  41. isa_model/deployment/modal/services/__init__.py +3 -0
  42. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  43. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  44. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  45. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  46. isa_model/deployment/modal/services/video/__init__.py +1 -0
  47. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  48. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  49. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  50. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  51. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  52. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  53. isa_model/deployment/storage/__init__.py +5 -0
  54. isa_model/deployment/storage/deployment_repository.py +824 -0
  55. isa_model/deployment/triton/__init__.py +10 -0
  56. isa_model/deployment/triton/config.py +196 -0
  57. isa_model/deployment/triton/configs/__init__.py +1 -0
  58. isa_model/deployment/triton/provider.py +512 -0
  59. isa_model/deployment/triton/scripts/__init__.py +1 -0
  60. isa_model/deployment/triton/templates/__init__.py +1 -0
  61. isa_model/inference/__init__.py +47 -1
  62. isa_model/inference/ai_factory.py +137 -10
  63. isa_model/inference/legacy_services/__init__.py +21 -0
  64. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  65. isa_model/inference/legacy_services/model_service.py +573 -0
  66. isa_model/inference/legacy_services/model_serving.py +717 -0
  67. isa_model/inference/legacy_services/model_training.py +561 -0
  68. isa_model/inference/models/__init__.py +21 -0
  69. isa_model/inference/models/inference_config.py +551 -0
  70. isa_model/inference/models/inference_record.py +675 -0
  71. isa_model/inference/models/performance_models.py +714 -0
  72. isa_model/inference/repositories/__init__.py +9 -0
  73. isa_model/inference/repositories/inference_repository.py +828 -0
  74. isa_model/inference/services/audio/base_stt_service.py +184 -11
  75. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  76. isa_model/inference/services/custom_model_manager.py +277 -0
  77. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  78. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  79. isa_model/inference/services/llm/__init__.py +10 -2
  80. isa_model/inference/services/llm/base_llm_service.py +335 -24
  81. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  82. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  83. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  84. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  85. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  86. isa_model/inference/services/llm/local_llm_service.py +747 -0
  87. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  88. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  89. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  90. isa_model/inference/services/vision/__init__.py +22 -1
  91. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  92. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  93. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  94. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  95. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  96. isa_model/serving/api/cache_manager.py +245 -0
  97. isa_model/serving/api/dependencies/__init__.py +1 -0
  98. isa_model/serving/api/dependencies/auth.py +194 -0
  99. isa_model/serving/api/dependencies/database.py +139 -0
  100. isa_model/serving/api/error_handlers.py +284 -0
  101. isa_model/serving/api/fastapi_server.py +172 -22
  102. isa_model/serving/api/middleware/auth.py +8 -2
  103. isa_model/serving/api/middleware/security.py +23 -33
  104. isa_model/serving/api/middleware/tenant_context.py +414 -0
  105. isa_model/serving/api/routes/analytics.py +4 -1
  106. isa_model/serving/api/routes/config.py +645 -0
  107. isa_model/serving/api/routes/deployment_billing.py +315 -0
  108. isa_model/serving/api/routes/deployments.py +138 -2
  109. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  110. isa_model/serving/api/routes/health.py +32 -12
  111. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  112. isa_model/serving/api/routes/local_deployments.py +448 -0
  113. isa_model/serving/api/routes/tenants.py +575 -0
  114. isa_model/serving/api/routes/unified.py +680 -18
  115. isa_model/serving/api/routes/webhooks.py +479 -0
  116. isa_model/serving/api/startup.py +68 -54
  117. isa_model/utils/gpu_utils.py +311 -0
  118. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
  119. isa_model-0.4.3.dist-info/RECORD +193 -0
  120. isa_model/core/storage/minio_storage.py +0 -0
  121. isa_model/deployment/cloud/__init__.py +0 -9
  122. isa_model/deployment/cloud/modal/__init__.py +0 -10
  123. isa_model/deployment/core/deployment_config.py +0 -356
  124. isa_model/deployment/core/isa_deployment_service.py +0 -401
  125. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  126. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  127. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  128. isa_model/deployment/runtime/deployed_service.py +0 -338
  129. isa_model/deployment/services/__init__.py +0 -9
  130. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  131. isa_model/deployment/services/model_service.py +0 -332
  132. isa_model/deployment/services/service_monitor.py +0 -356
  133. isa_model/deployment/services/service_registry.py +0 -527
  134. isa_model/eval/__init__.py +0 -92
  135. isa_model/eval/benchmarks/__init__.py +0 -27
  136. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  137. isa_model/eval/benchmarks.py +0 -701
  138. isa_model/eval/config/__init__.py +0 -10
  139. isa_model/eval/config/evaluation_config.py +0 -108
  140. isa_model/eval/evaluators/__init__.py +0 -24
  141. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  142. isa_model/eval/evaluators/base_evaluator.py +0 -503
  143. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  144. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  145. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  146. isa_model/eval/example_evaluation.py +0 -395
  147. isa_model/eval/factory.py +0 -798
  148. isa_model/eval/infrastructure/__init__.py +0 -24
  149. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  150. isa_model/eval/isa_benchmarks.py +0 -700
  151. isa_model/eval/isa_integration.py +0 -582
  152. isa_model/eval/metrics.py +0 -951
  153. isa_model/eval/tests/unit/test_basic.py +0 -396
  154. isa_model/serving/api/routes/evaluations.py +0 -579
  155. isa_model/training/__init__.py +0 -168
  156. isa_model/training/annotation/annotation_schema.py +0 -47
  157. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  158. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  159. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  160. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  161. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  162. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  163. isa_model/training/annotation/views/annotation_controller.py +0 -158
  164. isa_model/training/cloud/__init__.py +0 -22
  165. isa_model/training/cloud/job_orchestrator.py +0 -402
  166. isa_model/training/cloud/runpod_trainer.py +0 -454
  167. isa_model/training/cloud/storage_manager.py +0 -482
  168. isa_model/training/core/__init__.py +0 -26
  169. isa_model/training/core/config.py +0 -181
  170. isa_model/training/core/dataset.py +0 -222
  171. isa_model/training/core/trainer.py +0 -720
  172. isa_model/training/core/utils.py +0 -213
  173. isa_model/training/examples/intelligent_training_example.py +0 -281
  174. isa_model/training/factory.py +0 -424
  175. isa_model/training/intelligent/__init__.py +0 -25
  176. isa_model/training/intelligent/decision_engine.py +0 -643
  177. isa_model/training/intelligent/intelligent_factory.py +0 -888
  178. isa_model/training/intelligent/knowledge_base.py +0 -751
  179. isa_model/training/intelligent/resource_optimizer.py +0 -839
  180. isa_model/training/intelligent/task_classifier.py +0 -576
  181. isa_model/training/storage/__init__.py +0 -24
  182. isa_model/training/storage/core_integration.py +0 -439
  183. isa_model/training/storage/training_repository.py +0 -552
  184. isa_model/training/storage/training_storage.py +0 -628
  185. isa_model-0.4.0.dist-info/RECORD +0 -182
  186. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  187. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  188. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  189. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  190. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  191. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  192. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  193. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  194. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  195. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  196. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  197. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  198. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  199. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,857 @@
1
+ """
2
+ System Models
3
+
4
+ Data models for system health, resource usage, and service status monitoring,
5
+ following the ISA Model architecture pattern.
6
+ """
7
+
8
+ import logging
9
+ from datetime import datetime, timezone, timedelta
10
+ from typing import Dict, List, Optional, Any, Union
11
+ from dataclasses import dataclass, field
12
+ from enum import Enum
13
+ import statistics
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
class HealthStatus(str, Enum):
    """Health states used for systems, components and services.

    Members subclass ``str``, so each one compares equal to its plain
    string value (for example ``HealthStatus.HEALTHY == "healthy"``).
    """

    HEALTHY = "healthy"
    DEGRADED = "degraded"
    CRITICAL = "critical"
    OFFLINE = "offline"
    UNKNOWN = "unknown"
    MAINTENANCE = "maintenance"
25
+
26
class ServiceType(str, Enum):
    """Categories of service that can be monitored.

    Members subclass ``str``, so each one compares equal to its plain
    string value (for example ``ServiceType.API == "api"``).
    """

    API = "api"
    DATABASE = "database"
    CACHE = "cache"
    QUEUE = "queue"
    STORAGE = "storage"
    COMPUTE = "compute"
    MONITORING = "monitoring"
    EXTERNAL = "external"
36
+
37
class AlertSeverity(str, Enum):
    """Severity levels attached to alerts.

    Members subclass ``str``, so each one compares equal to its plain
    string value (for example ``AlertSeverity.CRITICAL == "critical"``).
    """

    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
    CRITICAL = "critical"
43
+
44
@dataclass
class SystemHealth:
    """
    System health monitoring record.

    Holds per-component status, performance and resource indicators and the
    list of active alerts for one system, and derives an overall status and
    a 0-100 health score from them.

    Statuses are compared against ``HealthStatus`` members and alert
    severities against ``AlertSeverity`` members. Both are ``str`` enums, so
    the equivalent plain strings (``"healthy"``, ``"critical"``, ...) compare
    equal and may be used interchangeably.
    """
    health_id: str
    system_name: str
    overall_status: str = HealthStatus.HEALTHY
    # Set to the current UTC time in __post_init__ when not supplied.
    timestamp: Optional[datetime] = None

    # Component health
    component_status: Dict[str, str] = field(default_factory=dict)
    component_metrics: Dict[str, Dict[str, float]] = field(default_factory=dict)
    failing_components: List[str] = field(default_factory=list)
    degraded_components: List[str] = field(default_factory=list)

    # Performance indicators
    response_time_ms: Optional[float] = None
    throughput_rps: Optional[float] = None
    error_rate_percent: Optional[float] = None
    availability_percent: Optional[float] = None
    uptime_seconds: Optional[int] = None

    # Resource utilization
    cpu_usage_percent: Optional[float] = None
    memory_usage_percent: Optional[float] = None
    disk_usage_percent: Optional[float] = None
    network_usage_mbps: Optional[float] = None

    # Health checks
    # last_health_check defaults to `timestamp` in __post_init__.
    last_health_check: Optional[datetime] = None
    health_check_interval_seconds: int = 60
    consecutive_failures: int = 0
    consecutive_successes: int = 0

    # Alerts and issues
    active_alerts: List[Dict[str, Any]] = field(default_factory=list)
    resolved_alerts_24h: int = 0
    critical_issues: List[str] = field(default_factory=list)

    # Metadata
    version: Optional[str] = None
    environment: str = "production"
    region: Optional[str] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Fill in the timestamp fields that were left unset."""
        if self.timestamp is None:
            self.timestamp = datetime.now(timezone.utc)
        if self.last_health_check is None:
            self.last_health_check = self.timestamp

    @property
    def is_healthy(self) -> bool:
        """Return True when the overall status is HEALTHY."""
        return self.overall_status == HealthStatus.HEALTHY

    @property
    def health_score(self) -> float:
        """Calculate the overall health score, clamped to 0-100.

        The score starts at 100 and is reduced, in this order, by:
        the share of non-healthy components, an error-rate penalty
        (5 points per percent, at most 50), a cap at the availability
        percentage, 2 points per percent of CPU and of memory usage above
        90, and 10 points per active critical alert.
        """
        score = 100.0

        # Component health impact: cap the score at the healthy share.
        total_components = len(self.component_status)
        if total_components > 0:
            healthy_components = sum(
                1 for status in self.component_status.values()
                if status == HealthStatus.HEALTHY
            )
            score = min(score, (healthy_components / total_components) * 100)

        # Performance impact
        if self.error_rate_percent is not None:
            score -= min(50, self.error_rate_percent * 5)

        if self.availability_percent is not None:
            score = min(score, self.availability_percent)

        # Resource pressure only counts once usage exceeds 90%.
        for usage_percent in (self.cpu_usage_percent, self.memory_usage_percent):
            if usage_percent is not None and usage_percent > 90:
                score -= (usage_percent - 90) * 2

        # Alert impact
        critical_alert_count = sum(
            1 for alert in self.active_alerts
            if alert.get('severity') == AlertSeverity.CRITICAL
        )
        score -= critical_alert_count * 10

        return max(0.0, min(100.0, score))

    @property
    def needs_attention(self) -> bool:
        """Return True when the system needs immediate attention.

        That is the case when the overall status is CRITICAL or DEGRADED,
        when any critical issue is recorded, or when any active alert has
        critical severity.
        """
        if self.overall_status in [HealthStatus.CRITICAL, HealthStatus.DEGRADED]:
            return True
        if len(self.critical_issues) > 0:
            return True
        return any(alert.get('severity') == AlertSeverity.CRITICAL
                   for alert in self.active_alerts)

    @property
    def time_since_last_check(self) -> int:
        """Return whole seconds elapsed since the last health check.

        Returns 0 when no health check time is recorded.
        NOTE(review): the subtraction uses an aware UTC "now", so a naive
        ``last_health_check`` would raise TypeError - confirm callers always
        supply timezone-aware values.
        """
        if self.last_health_check:
            elapsed = datetime.now(timezone.utc) - self.last_health_check
            return int(elapsed.total_seconds())
        return 0

    @staticmethod
    def _set_membership(names: List[str], name: str, present: bool) -> None:
        """Ensure ``name`` is in ``names`` exactly when ``present`` is True."""
        if present:
            if name not in names:
                names.append(name)
        elif name in names:
            names.remove(name)

    def update_component_status(self, component_name: str, status: str,
                                metrics: Optional[Dict[str, float]] = None):
        """Record a new status (and optionally metrics) for one component.

        Keeps ``failing_components`` (CRITICAL) and ``degraded_components``
        (DEGRADED) in sync, recomputes the overall status, and updates the
        consecutive failure/success counters.

        Args:
            component_name: Key of the component in ``component_status``.
            status: New status value for the component.
            metrics: Optional metrics stored for the component; a falsy
                value (``None`` or an empty dict) leaves existing metrics
                untouched.
        """
        old_status = self.component_status.get(component_name)
        self.component_status[component_name] = status

        if metrics:
            self.component_metrics[component_name] = metrics

        # A component is in at most one of the two lists; any status other
        # than CRITICAL/DEGRADED removes it from both.
        self._set_membership(self.failing_components, component_name,
                             status == HealthStatus.CRITICAL)
        self._set_membership(self.degraded_components, component_name,
                             status == HealthStatus.DEGRADED)

        # Update overall status
        self._calculate_overall_status()

        # The counters only move when the component's status actually
        # changed; repeated reports of the same status are ignored.
        if old_status != status:
            if status in [HealthStatus.CRITICAL, HealthStatus.DEGRADED]:
                self.consecutive_failures += 1
                self.consecutive_successes = 0
            else:
                self.consecutive_successes += 1
                self.consecutive_failures = 0

    def _calculate_overall_status(self):
        """Derive ``overall_status`` from the component statuses.

        UNKNOWN when no components are tracked, CRITICAL when any component
        is critical, HEALTHY when every component is healthy, and DEGRADED
        in every other case (including offline or unknown components).
        """
        if not self.component_status:
            self.overall_status = HealthStatus.UNKNOWN
            return

        # A list (not a set) is used on purpose: membership must go through
        # ``==`` so that plain strings and enum members are interchangeable.
        statuses = list(self.component_status.values())

        if HealthStatus.CRITICAL in statuses:
            self.overall_status = HealthStatus.CRITICAL
        elif all(status == HealthStatus.HEALTHY for status in statuses):
            self.overall_status = HealthStatus.HEALTHY
        else:
            self.overall_status = HealthStatus.DEGRADED

    def add_alert(self, alert_id: str, severity: str, message: str,
                  component: Optional[str] = None, **kwargs):
        """Add an active alert, or update the existing one with the same id.

        Args:
            alert_id: Identifier used to de-duplicate alerts.
            severity: Alert severity value.
            message: Human-readable description.
            component: Optional name of the affected component.
            **kwargs: Extra key/value pairs merged into the alert dict; they
                are applied last and therefore override the standard keys.
        """
        alert = {
            "alert_id": alert_id,
            "severity": severity,
            "message": message,
            "component": component,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            **kwargs
        }

        # Update in place if an alert with this id is already active.
        existing_alert = next(
            (a for a in self.active_alerts if a.get("alert_id") == alert_id),
            None,
        )
        if existing_alert:
            existing_alert.update(alert)
        else:
            self.active_alerts.append(alert)

    def resolve_alert(self, alert_id: str):
        """Remove the active alert with ``alert_id`` and count it as resolved.

        ``resolved_alerts_24h`` is only incremented when an alert was
        actually removed, so resolving an unknown id is a no-op.
        """
        remaining = [alert for alert in self.active_alerts
                     if alert.get("alert_id") != alert_id]
        was_active = len(remaining) != len(self.active_alerts)
        self.active_alerts = remaining
        if was_active:
            self.resolved_alerts_24h += 1

    def perform_health_check(self) -> Dict[str, Any]:
        """Stamp the health check time and return a summary snapshot.

        Returns:
            A dict with the overall status, health score, ISO timestamp of
            this check, component and alert counts, selected performance
            indicators and the ``needs_attention`` flag.
        """
        self.last_health_check = datetime.now(timezone.utc)

        healthy_count = sum(1 for s in self.component_status.values()
                            if s == HealthStatus.HEALTHY)
        critical_alerts = sum(1 for a in self.active_alerts
                              if a.get("severity") == AlertSeverity.CRITICAL)

        health_result = {
            "overall_status": self.overall_status,
            "health_score": self.health_score,
            "timestamp": self.last_health_check.isoformat(),
            "component_summary": {
                "total": len(self.component_status),
                "healthy": healthy_count,
                "degraded": len(self.degraded_components),
                "critical": len(self.failing_components)
            },
            "alerts": {
                "active": len(self.active_alerts),
                "critical": critical_alerts,
                "resolved_24h": self.resolved_alerts_24h
            },
            "performance": {
                "response_time_ms": self.response_time_ms,
                "error_rate_percent": self.error_rate_percent,
                "availability_percent": self.availability_percent
            },
            "needs_attention": self.needs_attention
        }

        return health_result
261
+
262
@dataclass
class ResourceUsage:
    """
    Resource usage monitoring record.

    Tracks the consumption of one resource (CPU, memory, disk, network,
    GPU, ...): the current value, running min/max/average and percentiles,
    capacity and limits, a coarse trend, and threshold checks.
    """
    usage_id: str
    resource_type: str  # cpu, memory, disk, network, gpu
    # Set to the current UTC time in __post_init__ when not supplied.
    timestamp: Optional[datetime] = None
    measurement_period_seconds: int = 60

    # Current usage
    current_usage: float = 0.0
    current_usage_percent: Optional[float] = None

    # Historical data
    min_usage: float = 0.0
    max_usage: float = 0.0
    avg_usage: float = 0.0
    p95_usage: Optional[float] = None
    p99_usage: Optional[float] = None

    # Capacity and limits
    total_capacity: Optional[float] = None
    allocated_capacity: Optional[float] = None
    reserved_capacity: Optional[float] = None
    soft_limit: Optional[float] = None
    hard_limit: Optional[float] = None

    # Usage patterns
    usage_samples: List[float] = field(default_factory=list)
    peak_hours: List[int] = field(default_factory=list)
    low_hours: List[int] = field(default_factory=list)

    # Trends and predictions
    trend_direction: str = "stable"  # increasing, decreasing, stable, volatile
    predicted_usage_1h: Optional[float] = None
    predicted_usage_24h: Optional[float] = None
    time_to_capacity: Optional[int] = None  # seconds until capacity reached

    # Alerts and thresholds
    warning_threshold: Optional[float] = None
    critical_threshold: Optional[float] = None
    threshold_breaches_24h: int = 0

    # Metadata
    host: Optional[str] = None
    service: Optional[str] = None
    tags: Dict[str, str] = field(default_factory=dict)

    def __post_init__(self):
        """Default the timestamp and derive the usage percentage."""
        if self.timestamp is None:
            self.timestamp = datetime.now(timezone.utc)

        # Calculate percentage if capacity is known; this overrides any
        # current_usage_percent passed to the constructor.
        if self.total_capacity and self.total_capacity > 0:
            self.current_usage_percent = (self.current_usage / self.total_capacity) * 100

    @property
    def is_at_capacity(self) -> bool:
        """Return True when usage is at or above 95% of total capacity.

        Returns False when no usage percentage is available.
        """
        if self.current_usage_percent is not None:
            return self.current_usage_percent >= 95
        return False

    @property
    def is_over_soft_limit(self) -> bool:
        """Return True when current usage exceeds the soft limit.

        A soft limit of ``None`` or 0 is treated as "no limit configured".
        """
        if self.soft_limit:
            return self.current_usage > self.soft_limit
        return False

    @property
    def is_over_hard_limit(self) -> bool:
        """Return True when current usage exceeds the hard limit.

        A hard limit of ``None`` or 0 is treated as "no limit configured".
        """
        if self.hard_limit:
            return self.current_usage > self.hard_limit
        return False

    @property
    def utilization_efficiency(self) -> float:
        """Return current usage as a percentage of allocated capacity.

        Capped at 100; 0.0 when no capacity is allocated.
        """
        if not self.allocated_capacity:
            return 0.0

        return min(100.0, (self.current_usage / self.allocated_capacity) * 100)

    @property
    def waste_percentage(self) -> float:
        """Return the unused share of allocated capacity as a percentage.

        0.0 when no capacity is allocated or usage exceeds the allocation.
        """
        if not self.allocated_capacity:
            return 0.0

        unused = max(0, self.allocated_capacity - self.current_usage)
        return (unused / self.allocated_capacity) * 100

    def add_usage_sample(self, usage_value: float, timestamp: Optional[datetime] = None):
        """Record a usage measurement and refresh the derived fields.

        Updates the current usage (and percentage when capacity is known),
        the running min/max, and - once at least 10 samples exist - the
        statistics and trend. When ``timestamp`` is given, its hour is
        recorded as a peak (> 1.5x average) or low (< 0.5x average) hour.

        Args:
            usage_value: The measured usage.
            timestamp: Optional time of the measurement, used only for
                peak/low hour tracking.
        """
        self.usage_samples.append(usage_value)

        # Update current values
        self.current_usage = usage_value
        if self.total_capacity and self.total_capacity > 0:
            self.current_usage_percent = (usage_value / self.total_capacity) * 100

        # Update min/max. The 0.0 default of min_usage means "not set yet"
        # only for the very first sample; afterwards a minimum of 0 is a
        # real observation and must not be overwritten by larger values.
        is_first_sample = len(self.usage_samples) == 1
        if usage_value < self.min_usage or (is_first_sample and self.min_usage == 0):
            self.min_usage = usage_value
        if usage_value > self.max_usage:
            self.max_usage = usage_value

        # Recalculate statistics if we have enough samples
        if len(self.usage_samples) >= 10:
            self._calculate_statistics()

        # Track peak/low hours. Skipped while no average is available
        # (avg_usage == 0), otherwise every positive sample would be
        # classified as a peak.
        if timestamp is not None and self.avg_usage > 0:
            hour = timestamp.hour
            if usage_value > self.avg_usage * 1.5:
                if hour not in self.peak_hours:
                    self.peak_hours.append(hour)
            elif usage_value < self.avg_usage * 0.5:
                if hour not in self.low_hours:
                    self.low_hours.append(hour)

    def _calculate_statistics(self):
        """Recompute average, p95/p99 and trend from ``usage_samples``.

        The trend compares the mean of the last 5 samples with the mean of
        all earlier samples: more than 10% higher is "increasing", more than
        10% lower is "decreasing"; otherwise it is "volatile" when the
        coefficient of variation of the last 10 samples exceeds 0.3 and
        "stable" if not.
        """
        samples = self.usage_samples
        if not samples:
            return

        self.avg_usage = statistics.mean(samples)

        count = len(samples)
        if count > 1:
            ordered = sorted(samples)
            self.p95_usage = ordered[int(0.95 * count)]
            # With 100 or fewer samples the maximum stands in for p99.
            self.p99_usage = ordered[int(0.99 * count)] if count > 100 else ordered[-1]

        # Analyze trend
        if count >= 5:
            recent_avg = statistics.mean(samples[-5:])
            older_avg = statistics.mean(samples[:-5]) if count > 5 else self.avg_usage

            if recent_avg > older_avg * 1.1:
                self.trend_direction = "increasing"
            elif recent_avg < older_avg * 0.9:
                self.trend_direction = "decreasing"
            else:
                # Check for volatility
                std_dev = statistics.stdev(samples[-10:]) if count >= 10 else 0
                cv = std_dev / self.avg_usage if self.avg_usage > 0 else 0

                # High coefficient of variation
                self.trend_direction = "volatile" if cv > 0.3 else "stable"

    def predict_future_usage(self, hours_ahead: int = 1) -> Optional[float]:
        """Predict future usage with a linear fit over the last 10 samples.

        Each sample is treated as one time step and ``hours_ahead`` counts
        steps past the most recent sample (i.e. samples are assumed to be
        hourly - TODO confirm against the sampling interval actually used).

        Args:
            hours_ahead: Number of steps beyond the latest sample.

        Returns:
            The predicted usage (never negative), or ``None`` when fewer
            than 5 samples are available.
        """
        if len(self.usage_samples) < 5:
            return None

        # Simple least-squares line through (index, sample) points.
        recent_samples = self.usage_samples[-10:]
        n = len(recent_samples)
        xs = range(n)

        sum_x = sum(xs)
        sum_y = sum(recent_samples)
        sum_xy = sum(x * y for x, y in zip(xs, recent_samples))
        sum_x2 = sum(x ** 2 for x in xs)

        denominator = n * sum_x2 - sum_x ** 2
        if denominator != 0:
            slope = (n * sum_xy - sum_x * sum_y) / denominator
            intercept = (sum_y - slope * sum_x) / n

            # The latest sample sits at index n - 1, so `hours_ahead` steps
            # into the future is index n - 1 + hours_ahead.
            future_x = (n - 1) + hours_ahead
            predicted = slope * future_x + intercept

            return max(0, predicted)

        return self.avg_usage

    def calculate_time_to_capacity(self) -> Optional[int]:
        """Estimate the seconds until usage reaches total capacity.

        Only computed when total capacity is known and the trend is
        "increasing". The growth rate is taken from the last 5 samples,
        which are assumed to be hourly.

        Returns:
            0 when capacity is already reached, the estimated number of
            seconds otherwise, or ``None`` when no estimate can be made.
        """
        if not self.total_capacity or self.trend_direction != "increasing":
            return None

        available_capacity = self.total_capacity - self.current_usage
        if available_capacity <= 0:
            return 0

        # Estimate growth rate from recent samples
        window = 5
        if len(self.usage_samples) >= window:
            recent_growth = self.usage_samples[-1] - self.usage_samples[-window]
            if recent_growth > 0:
                # `window` samples span `window - 1` intervals
                # (assuming samples are hourly).
                growth_per_hour = recent_growth / (window - 1)
                hours_to_capacity = available_capacity / growth_per_hour
                return int(hours_to_capacity * 3600)  # Convert to seconds

        return None

    def check_thresholds(self) -> List[Dict[str, Any]]:
        """Return alert dicts for every configured threshold that is exceeded.

        A threshold of ``None`` or 0 is treated as not configured. The
        warning entry (if any) precedes the critical entry.
        """
        alerts = []

        if self.warning_threshold and self.current_usage > self.warning_threshold:
            alerts.append({
                "level": "warning",
                "message": f"{self.resource_type} usage ({self.current_usage}) exceeds warning threshold ({self.warning_threshold})",
                "threshold": self.warning_threshold,
                "current_value": self.current_usage
            })

        if self.critical_threshold and self.current_usage > self.critical_threshold:
            alerts.append({
                "level": "critical",
                "message": f"{self.resource_type} usage ({self.current_usage}) exceeds critical threshold ({self.critical_threshold})",
                "threshold": self.critical_threshold,
                "current_value": self.current_usage
            })

        return alerts
492
+
493
@dataclass
class ServiceStatus:
    """
    Service status monitoring record

    Tracks the status and health of individual services
    including availability, performance, and dependencies.

    ``timestamp`` and ``last_health_check`` are filled in by
    ``__post_init__`` when they are not supplied by the caller.
    """
    service_id: str
    service_name: str
    service_type: str
    status: str = HealthStatus.UNKNOWN
    # Left as None by default; __post_init__ replaces it with the current UTC time.
    timestamp: Optional[datetime] = None

    # Service details
    version: Optional[str] = None
    endpoint_url: Optional[str] = None
    health_check_url: Optional[str] = None
    last_deployment: Optional[datetime] = None

    # Availability metrics
    uptime_seconds: int = 0
    downtime_seconds: int = 0
    availability_percent: float = 100.0
    mttr_seconds: Optional[int] = None  # Mean Time To Repair
    mtbf_seconds: Optional[int] = None  # Mean Time Between Failures

    # Performance metrics
    response_time_ms: Optional[float] = None
    throughput_rps: Optional[float] = None
    error_rate_percent: float = 0.0
    success_rate_percent: float = 100.0

    # Health check results
    # last_health_check defaults to ``timestamp`` in __post_init__.
    last_health_check: Optional[datetime] = None
    health_check_success: bool = True
    consecutive_failures: int = 0
    consecutive_successes: int = 0
    health_check_interval_seconds: int = 30

    # Dependencies
    dependencies: List[str] = field(default_factory=list)
    # Maps dependency name -> last known status; read by check_dependencies().
    dependency_statuses: Dict[str, str] = field(default_factory=dict)
    critical_dependencies: List[str] = field(default_factory=list)

    # Incidents and alerts
    active_incidents: List[Dict[str, Any]] = field(default_factory=list)
    incidents_24h: int = 0
    last_incident: Optional[datetime] = None

    # Configuration
    auto_restart_enabled: bool = False
    restart_count_24h: int = 0
    circuit_breaker_status: str = "closed"  # closed, open, half_open

    # Metadata
    environment: str = "production"
    region: Optional[str] = None
    owner_team: Optional[str] = None
    tags: Dict[str, str] = field(default_factory=dict)

    def __post_init__(self):
        """Default ``timestamp`` to now (UTC) and ``last_health_check`` to ``timestamp``."""
        if self.timestamp is None:
            self.timestamp = datetime.now(timezone.utc)
        if self.last_health_check is None:
            self.last_health_check = self.timestamp

    @property
    def is_healthy(self) -> bool:
        """Check if service is healthy (HEALTHY status and last health check passed)"""
        return self.status == HealthStatus.HEALTHY and self.health_check_success

    @property
    def is_available(self) -> bool:
        """Check if service is available (not offline)"""
        return self.status != HealthStatus.OFFLINE

    @property
    def needs_attention(self) -> bool:
        """
        Check if service needs immediate attention

        True when the status is CRITICAL or OFFLINE, when at least three
        consecutive health checks have failed, or when any incident is active.
        """
        return (self.status in [HealthStatus.CRITICAL, HealthStatus.OFFLINE] or
                self.consecutive_failures >= 3 or
                len(self.active_incidents) > 0)

    @property
    def uptime_percent_24h(self) -> float:
        """
        Calculate uptime percentage for last 24 hours

        ``uptime_seconds`` is capped at 24 hours before the ratio is taken,
        so the result never exceeds 100.
        """
        total_seconds_24h = 24 * 60 * 60
        uptime_24h = min(self.uptime_seconds, total_seconds_24h)
        return (uptime_24h / total_seconds_24h) * 100

    @property
    def service_level_indicator(self) -> Dict[str, float]:
        """
        Calculate Service Level Indicators (SLI)

        Unset (None) response time and throughput are reported as 0.
        """
        return {
            "availability": self.availability_percent,
            "success_rate": self.success_rate_percent,
            "response_time_ms": self.response_time_ms or 0,
            "throughput_rps": self.throughput_rps or 0
        }

    @property
    def reliability_score(self) -> float:
        """
        Calculate overall reliability score (0-100)

        Starts at 100, is scaled by the availability and success-rate
        percentages, then reduced by 10 per consecutive health-check failure
        (at most 50) and by 5 per incident counted in ``incidents_24h``
        (at most 30). The result is clamped to the range [0, 100].
        """
        score = 100.0

        # Availability impact
        score *= (self.availability_percent / 100)

        # Error rate impact
        score *= (self.success_rate_percent / 100)

        # Health check impact
        if self.consecutive_failures > 0:
            score -= min(50, self.consecutive_failures * 10)

        # Incident impact
        if self.incidents_24h > 0:
            score -= min(30, self.incidents_24h * 5)

        return max(0.0, min(100.0, score))

    def update_status(self, new_status: str, reason: Optional[str] = None):
        """
        Update service status

        Sets ``status`` and refreshes ``timestamp`` to the current UTC time.
        On a transition to OFFLINE a critical incident is opened, but only
        when ``reason`` is given. On a transition away from OFFLINE all
        active incidents are resolved.

        Args:
            new_status: Status value to store.
            reason: Optional explanation; used only in the description of the
                incident opened when the service goes offline.
        """
        old_status = self.status
        self.status = new_status
        self.timestamp = datetime.now(timezone.utc)

        # Track uptime/downtime
        if old_status != new_status:
            if new_status == HealthStatus.OFFLINE:
                # Service went down
                if reason:
                    self.add_incident(f"Service offline: {reason}", AlertSeverity.CRITICAL)
            elif old_status == HealthStatus.OFFLINE:
                # Service came back up
                self.resolve_incidents("Service restored")

    def perform_health_check(self) -> bool:
        """
        Perform health check and update status

        No real probe is made here: the check is simulated and fails exactly
        when the current status is OFFLINE. The failure/success streak
        counters and ``circuit_breaker_status`` are updated accordingly.

        Returns:
            The resulting value of ``health_check_success``.
        """
        self.last_health_check = datetime.now(timezone.utc)

        # This would contain actual health check logic
        # For now, simulate based on current status
        if self.status == HealthStatus.OFFLINE:
            self.health_check_success = False
            self.consecutive_failures += 1
            self.consecutive_successes = 0
        else:
            self.health_check_success = True
            self.consecutive_successes += 1
            self.consecutive_failures = 0

        # Update circuit breaker status
        if self.consecutive_failures >= 5:
            self.circuit_breaker_status = "open"
        elif self.consecutive_failures >= 3:
            self.circuit_breaker_status = "half_open"
        else:
            self.circuit_breaker_status = "closed"

        return self.health_check_success

    def add_incident(self, description: str, severity: str, incident_id: Optional[str] = None):
        """
        Add an active incident

        Appends the incident to ``active_incidents``, increments
        ``incidents_24h`` and sets ``last_incident``.

        Args:
            description: Human-readable description of the incident.
            severity: Severity value stored on the incident.
            incident_id: Optional identifier; a random ``inc_<8 hex chars>``
                ID is generated when omitted or empty.
        """
        if not incident_id:
            import uuid
            incident_id = f"inc_{uuid.uuid4().hex[:8]}"

        # Capture the time once so the incident's start_time and the
        # record's last_incident refer to exactly the same instant.
        now = datetime.now(timezone.utc)

        incident = {
            "incident_id": incident_id,
            "description": description,
            "severity": severity,
            "start_time": now.isoformat(),
            "status": "active"
        }

        self.active_incidents.append(incident)
        self.incidents_24h += 1
        self.last_incident = now

    def resolve_incidents(self, resolution: str):
        """
        Resolve all active incidents

        Each active incident dict is marked resolved in place (status,
        end time, resolution text) and ``active_incidents`` is then reset
        to an empty list.

        Args:
            resolution: Resolution text recorded on every incident.
        """
        # One timestamp for the whole batch: incidents resolved by the same
        # call share the same end_time.
        resolved_at = datetime.now(timezone.utc).isoformat()

        for incident in self.active_incidents:
            incident["status"] = "resolved"
            incident["end_time"] = resolved_at
            incident["resolution"] = resolution

        self.active_incidents = []

    def check_dependencies(self) -> bool:
        """
        Check status of service dependencies

        Only ``critical_dependencies`` are examined. A dependency with no
        entry in ``dependency_statuses`` is treated as UNKNOWN. If any
        critical dependency is neither HEALTHY nor DEGRADED and this service
        is currently HEALTHY, the service is downgraded to DEGRADED.

        Returns:
            True when every critical dependency is HEALTHY or DEGRADED.
        """
        all_healthy = True

        for dependency in self.critical_dependencies:
            status = self.dependency_statuses.get(dependency, HealthStatus.UNKNOWN)
            if status not in [HealthStatus.HEALTHY, HealthStatus.DEGRADED]:
                all_healthy = False

                # Update our status if critical dependency is down
                if self.status == HealthStatus.HEALTHY:
                    self.update_status(HealthStatus.DEGRADED, f"Critical dependency {dependency} is {status}")

        return all_healthy

    def generate_status_report(self) -> Dict[str, Any]:
        """
        Generate comprehensive status report

        Returns:
            A nested dict with the sections ``service_info``,
            ``current_status``, ``performance``, ``reliability``,
            ``incidents``, ``health_checks`` and ``dependencies``.
            Datetime values are rendered as ISO-8601 strings, or None when
            the underlying attribute is unset.
        """
        return {
            "service_info": {
                "service_id": self.service_id,
                "service_name": self.service_name,
                "service_type": self.service_type,
                "version": self.version,
                "environment": self.environment
            },
            "current_status": {
                "status": self.status,
                "health_check_success": self.health_check_success,
                "is_available": self.is_available,
                "needs_attention": self.needs_attention,
                # Guarded like the other datetime fields in this report so a
                # timestamp cleared after construction does not raise.
                "timestamp": self.timestamp.isoformat() if self.timestamp else None
            },
            "performance": {
                "response_time_ms": self.response_time_ms,
                "throughput_rps": self.throughput_rps,
                "error_rate_percent": self.error_rate_percent,
                "success_rate_percent": self.success_rate_percent
            },
            "reliability": {
                "availability_percent": self.availability_percent,
                "uptime_percent_24h": self.uptime_percent_24h,
                "reliability_score": self.reliability_score,
                "mttr_seconds": self.mttr_seconds,
                "mtbf_seconds": self.mtbf_seconds
            },
            "incidents": {
                "active_count": len(self.active_incidents),
                "incidents_24h": self.incidents_24h,
                "last_incident": self.last_incident.isoformat() if self.last_incident else None
            },
            "health_checks": {
                "consecutive_failures": self.consecutive_failures,
                "consecutive_successes": self.consecutive_successes,
                "last_check": self.last_health_check.isoformat() if self.last_health_check else None
            },
            "dependencies": {
                "total": len(self.dependencies),
                "critical": len(self.critical_dependencies),
                "statuses": self.dependency_statuses
            }
        }
744
+
745
+ # Utility functions for working with system models
746
+
747
def create_system_health(
    system_name: str,
    environment: str = "production"
) -> SystemHealth:
    """
    Factory function to create system health record

    The generated ``health_id`` has the form
    ``health_<system_name>_<YYYYmmdd_HHMMSS>_<8 hex chars>``.

    Args:
        system_name: Name of the system; embedded in the generated ID.
        environment: Environment label passed through to the record.

    Returns:
        A new ``SystemHealth`` with the generated ID; every other field
        keeps its default.
    """
    import uuid

    # Use UTC for the ID's time component, matching the timezone-aware UTC
    # timestamps used elsewhere in this module, so IDs do not depend on the
    # host's local time zone.
    created_at = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    health_id = f"health_{system_name}_{created_at}_{uuid.uuid4().hex[:8]}"

    return SystemHealth(
        health_id=health_id,
        system_name=system_name,
        environment=environment
    )
761
+
762
def create_resource_usage(
    resource_type: str,
    current_usage: float,
    total_capacity: Optional[float] = None,
    host: Optional[str] = None,
    service: Optional[str] = None
) -> ResourceUsage:
    """
    Factory function to create resource usage record

    The generated ``usage_id`` has the form
    ``usage_<resource_type>_<YYYYmmdd_HHMMSS>_<8 hex chars>``.

    Args:
        resource_type: Kind of resource being measured; embedded in the ID.
        current_usage: Current usage value for the resource.
        total_capacity: Optional total capacity of the resource.
        host: Optional host the measurement belongs to.
        service: Optional service the measurement belongs to.

    Returns:
        A new ``ResourceUsage`` populated with the given values and the
        generated ID.
    """
    import uuid

    # Use UTC for the ID's time component, matching the timezone-aware UTC
    # timestamps used elsewhere in this module, so IDs do not depend on the
    # host's local time zone.
    created_at = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    usage_id = f"usage_{resource_type}_{created_at}_{uuid.uuid4().hex[:8]}"

    return ResourceUsage(
        usage_id=usage_id,
        resource_type=resource_type,
        current_usage=current_usage,
        total_capacity=total_capacity,
        host=host,
        service=service
    )
782
+
783
def create_service_status(
    service_name: str,
    service_type: str,
    endpoint_url: Optional[str] = None,
    environment: str = "production"
) -> ServiceStatus:
    """
    Build a new ServiceStatus record with a generated identifier.

    The identifier has the form ``svc_<service_name>_<8 hex chars>``, where
    the hex part comes from a random UUID.

    Args:
        service_name: Name of the service; embedded in the generated ID.
        service_type: Type label stored on the record.
        endpoint_url: Optional endpoint URL stored on the record.
        environment: Environment label stored on the record.

    Returns:
        The newly constructed ``ServiceStatus``.
    """
    import uuid

    random_suffix = uuid.uuid4().hex[:8]
    generated_id = "svc_{}_{}".format(service_name, random_suffix)

    record = ServiceStatus(
        service_id=generated_id,
        service_name=service_name,
        service_type=service_type,
        endpoint_url=endpoint_url,
        environment=environment,
    )
    return record
801
+
802
def calculate_system_overview(
    health_records: List[SystemHealth],
    resource_records: List[ResourceUsage],
    service_records: List[ServiceStatus]
) -> Dict[str, Any]:
    """
    Summarise health, resource and service records into one overview dict.

    Each summary section is left as an empty dict when its input list is
    empty. The overall status is CRITICAL when fewer than 80% of systems are
    healthy; otherwise DEGRADED when fewer than 90% of services are available
    or any resource is at capacity; otherwise HEALTHY. A missing section
    counts as 100% (or zero resources at capacity) for this decision.

    Args:
        health_records: System health records to summarise.
        resource_records: Resource usage records to summarise.
        service_records: Service status records to summarise.

    Returns:
        Dict with the keys ``timestamp``, ``system_health``,
        ``resource_utilization``, ``service_availability``,
        ``overall_status`` and ``alerts`` (always an empty list here).
    """
    generated_at = datetime.now(timezone.utc).isoformat()

    # --- System health section ---
    health_summary = {}
    if health_records:
        system_count = len(health_records)
        healthy_count = len([rec for rec in health_records if rec.is_healthy])
        health_summary = {
            "total_systems": system_count,
            "healthy_systems": healthy_count,
            "health_percentage": (healthy_count / system_count) * 100,
            "avg_health_score": sum(rec.health_score for rec in health_records) / system_count,
        }

    # --- Resource utilization section ---
    resource_summary = {}
    if resource_records:
        def mean_usage_percent(records):
            # Average of current_usage_percent (None counted as 0); 0 for no records.
            if not records:
                return 0
            return sum(rec.current_usage_percent or 0 for rec in records) / len(records)

        cpu_only = [rec for rec in resource_records if rec.resource_type == "cpu"]
        memory_only = [rec for rec in resource_records if rec.resource_type == "memory"]

        resource_summary = {
            "avg_cpu_usage": mean_usage_percent(cpu_only),
            "avg_memory_usage": mean_usage_percent(memory_only),
            "resources_at_capacity": len([rec for rec in resource_records if rec.is_at_capacity]),
            "resources_over_threshold": len([rec for rec in resource_records if rec.is_over_soft_limit]),
        }

    # --- Service availability section ---
    service_summary = {}
    if service_records:
        service_count = len(service_records)
        available_count = len([svc for svc in service_records if svc.is_available])
        service_summary = {
            "total_services": service_count,
            "available_services": available_count,
            "availability_percentage": (available_count / service_count) * 100,
            "avg_reliability_score": sum(svc.reliability_score for svc in service_records) / service_count,
        }

    # --- Overall status: the first matching rule wins ---
    overall = HealthStatus.HEALTHY
    if health_summary.get("health_percentage", 100) < 80:
        overall = HealthStatus.CRITICAL
    elif (service_summary.get("availability_percentage", 100) < 90
          or resource_summary.get("resources_at_capacity", 0) > 0):
        overall = HealthStatus.DEGRADED

    return {
        "timestamp": generated_at,
        "system_health": health_summary,
        "resource_utilization": resource_summary,
        "service_availability": service_summary,
        "overall_status": overall,
        "alerts": [],
    }