isa-model 0.4.0__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +35 -80
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/types.py +1 -0
  26. isa_model/deployment/__init__.py +5 -48
  27. isa_model/deployment/core/__init__.py +2 -31
  28. isa_model/deployment/core/deployment_manager.py +1278 -370
  29. isa_model/deployment/modal/__init__.py +8 -0
  30. isa_model/deployment/modal/config.py +136 -0
  31. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  32. isa_model/deployment/modal/services/__init__.py +3 -0
  33. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  34. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  35. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  36. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  37. isa_model/deployment/modal/services/video/__init__.py +1 -0
  38. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  39. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  40. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  41. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  42. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  43. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  44. isa_model/deployment/storage/__init__.py +5 -0
  45. isa_model/deployment/storage/deployment_repository.py +824 -0
  46. isa_model/deployment/triton/__init__.py +10 -0
  47. isa_model/deployment/triton/config.py +196 -0
  48. isa_model/deployment/triton/configs/__init__.py +1 -0
  49. isa_model/deployment/triton/provider.py +512 -0
  50. isa_model/deployment/triton/scripts/__init__.py +1 -0
  51. isa_model/deployment/triton/templates/__init__.py +1 -0
  52. isa_model/inference/__init__.py +47 -1
  53. isa_model/inference/ai_factory.py +137 -10
  54. isa_model/inference/legacy_services/__init__.py +21 -0
  55. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  56. isa_model/inference/legacy_services/model_service.py +573 -0
  57. isa_model/inference/legacy_services/model_serving.py +717 -0
  58. isa_model/inference/legacy_services/model_training.py +561 -0
  59. isa_model/inference/models/__init__.py +21 -0
  60. isa_model/inference/models/inference_config.py +551 -0
  61. isa_model/inference/models/inference_record.py +675 -0
  62. isa_model/inference/models/performance_models.py +714 -0
  63. isa_model/inference/repositories/__init__.py +9 -0
  64. isa_model/inference/repositories/inference_repository.py +828 -0
  65. isa_model/inference/services/audio/base_stt_service.py +184 -11
  66. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  67. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  68. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  69. isa_model/inference/services/llm/__init__.py +10 -2
  70. isa_model/inference/services/llm/base_llm_service.py +335 -24
  71. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  72. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  73. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  74. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  75. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  76. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  77. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  78. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  79. isa_model/inference/services/vision/__init__.py +22 -1
  80. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  81. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  82. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  83. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  84. isa_model/serving/api/cache_manager.py +245 -0
  85. isa_model/serving/api/dependencies/__init__.py +1 -0
  86. isa_model/serving/api/dependencies/auth.py +194 -0
  87. isa_model/serving/api/dependencies/database.py +139 -0
  88. isa_model/serving/api/error_handlers.py +284 -0
  89. isa_model/serving/api/fastapi_server.py +172 -22
  90. isa_model/serving/api/middleware/auth.py +8 -2
  91. isa_model/serving/api/middleware/security.py +23 -33
  92. isa_model/serving/api/middleware/tenant_context.py +414 -0
  93. isa_model/serving/api/routes/analytics.py +4 -1
  94. isa_model/serving/api/routes/config.py +645 -0
  95. isa_model/serving/api/routes/deployment_billing.py +315 -0
  96. isa_model/serving/api/routes/deployments.py +138 -2
  97. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  98. isa_model/serving/api/routes/health.py +32 -12
  99. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  100. isa_model/serving/api/routes/local_deployments.py +448 -0
  101. isa_model/serving/api/routes/tenants.py +575 -0
  102. isa_model/serving/api/routes/unified.py +680 -18
  103. isa_model/serving/api/routes/webhooks.py +479 -0
  104. isa_model/serving/api/startup.py +68 -54
  105. isa_model/utils/gpu_utils.py +311 -0
  106. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/METADATA +71 -24
  107. isa_model-0.4.4.dist-info/RECORD +180 -0
  108. isa_model/core/security/secrets.py +0 -358
  109. isa_model/core/storage/hf_storage.py +0 -419
  110. isa_model/core/storage/minio_storage.py +0 -0
  111. isa_model/deployment/cloud/__init__.py +0 -9
  112. isa_model/deployment/cloud/modal/__init__.py +0 -10
  113. isa_model/deployment/core/deployment_config.py +0 -356
  114. isa_model/deployment/core/isa_deployment_service.py +0 -401
  115. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  116. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  117. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  118. isa_model/deployment/runtime/deployed_service.py +0 -338
  119. isa_model/deployment/services/__init__.py +0 -9
  120. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  121. isa_model/deployment/services/model_service.py +0 -332
  122. isa_model/deployment/services/service_monitor.py +0 -356
  123. isa_model/deployment/services/service_registry.py +0 -527
  124. isa_model/eval/__init__.py +0 -92
  125. isa_model/eval/benchmarks/__init__.py +0 -27
  126. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  127. isa_model/eval/benchmarks.py +0 -701
  128. isa_model/eval/config/__init__.py +0 -10
  129. isa_model/eval/config/evaluation_config.py +0 -108
  130. isa_model/eval/evaluators/__init__.py +0 -24
  131. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  132. isa_model/eval/evaluators/base_evaluator.py +0 -503
  133. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  134. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  135. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  136. isa_model/eval/example_evaluation.py +0 -395
  137. isa_model/eval/factory.py +0 -798
  138. isa_model/eval/infrastructure/__init__.py +0 -24
  139. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  140. isa_model/eval/isa_benchmarks.py +0 -700
  141. isa_model/eval/isa_integration.py +0 -582
  142. isa_model/eval/metrics.py +0 -951
  143. isa_model/eval/tests/unit/test_basic.py +0 -396
  144. isa_model/serving/api/routes/evaluations.py +0 -579
  145. isa_model/training/__init__.py +0 -168
  146. isa_model/training/annotation/annotation_schema.py +0 -47
  147. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  148. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  149. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  150. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  151. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  152. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  153. isa_model/training/annotation/views/annotation_controller.py +0 -158
  154. isa_model/training/cloud/__init__.py +0 -22
  155. isa_model/training/cloud/job_orchestrator.py +0 -402
  156. isa_model/training/cloud/runpod_trainer.py +0 -454
  157. isa_model/training/cloud/storage_manager.py +0 -482
  158. isa_model/training/core/__init__.py +0 -26
  159. isa_model/training/core/config.py +0 -181
  160. isa_model/training/core/dataset.py +0 -222
  161. isa_model/training/core/trainer.py +0 -720
  162. isa_model/training/core/utils.py +0 -213
  163. isa_model/training/examples/intelligent_training_example.py +0 -281
  164. isa_model/training/factory.py +0 -424
  165. isa_model/training/intelligent/__init__.py +0 -25
  166. isa_model/training/intelligent/decision_engine.py +0 -643
  167. isa_model/training/intelligent/intelligent_factory.py +0 -888
  168. isa_model/training/intelligent/knowledge_base.py +0 -751
  169. isa_model/training/intelligent/resource_optimizer.py +0 -839
  170. isa_model/training/intelligent/task_classifier.py +0 -576
  171. isa_model/training/storage/__init__.py +0 -24
  172. isa_model/training/storage/core_integration.py +0 -439
  173. isa_model/training/storage/training_repository.py +0 -552
  174. isa_model/training/storage/training_storage.py +0 -628
  175. isa_model-0.4.0.dist-info/RECORD +0 -182
  176. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  177. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  178. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  179. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  180. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  181. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  182. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  183. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  184. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  185. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  186. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  187. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  188. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
  189. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,857 @@
1
+ """
2
+ System Models
3
+
4
+ Data models for system health, resource usage, and service status monitoring,
5
+ following the ISA Model architecture pattern.
6
+ """
7
+
8
+ import logging
9
+ from datetime import datetime, timezone, timedelta
10
+ from typing import Dict, List, Optional, Any, Union
11
+ from dataclasses import dataclass, field
12
+ from enum import Enum
13
+ import statistics
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
class HealthStatus(str, Enum):
    """Health states used for systems, components, and services.

    Members also subclass ``str``, so each one compares equal to its plain
    string value (``HealthStatus.HEALTHY == "healthy"``).
    """

    HEALTHY = "healthy"
    DEGRADED = "degraded"
    CRITICAL = "critical"
    OFFLINE = "offline"
    UNKNOWN = "unknown"
    MAINTENANCE = "maintenance"
25
+
26
class ServiceType(str, Enum):
    """Categories of monitored services.

    Members also subclass ``str``, so each one compares equal to its plain
    string value (``ServiceType.API == "api"``).
    """

    API = "api"
    DATABASE = "database"
    CACHE = "cache"
    QUEUE = "queue"
    STORAGE = "storage"
    COMPUTE = "compute"
    MONITORING = "monitoring"
    EXTERNAL = "external"
36
+
37
class AlertSeverity(str, Enum):
    """Severity levels attached to alerts.

    Members also subclass ``str``, so each one compares equal to its plain
    string value (``AlertSeverity.CRITICAL == "critical"``).
    """

    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
    CRITICAL = "critical"
43
+
44
@dataclass
class SystemHealth:
    """
    System health monitoring record

    Tracks overall system health including component status,
    performance metrics, and alert information.

    Status and severity values are compared against ``HealthStatus`` and
    ``AlertSeverity`` members; because those enums subclass ``str``, plain
    strings such as ``"critical"`` are accepted as well.
    """
    health_id: str
    system_name: str
    overall_status: str = HealthStatus.HEALTHY
    timestamp: Optional[datetime] = None  # filled with the current UTC time in __post_init__

    # Component health
    component_status: Dict[str, str] = field(default_factory=dict)
    component_metrics: Dict[str, Dict[str, float]] = field(default_factory=dict)
    failing_components: List[str] = field(default_factory=list)   # components currently CRITICAL
    degraded_components: List[str] = field(default_factory=list)  # components currently DEGRADED

    # Performance indicators
    response_time_ms: Optional[float] = None
    throughput_rps: Optional[float] = None
    error_rate_percent: Optional[float] = None
    availability_percent: Optional[float] = None
    uptime_seconds: Optional[int] = None

    # Resource utilization
    cpu_usage_percent: Optional[float] = None
    memory_usage_percent: Optional[float] = None
    disk_usage_percent: Optional[float] = None
    network_usage_mbps: Optional[float] = None

    # Health checks
    last_health_check: Optional[datetime] = None  # defaults to ``timestamp`` in __post_init__
    health_check_interval_seconds: int = 60
    consecutive_failures: int = 0
    consecutive_successes: int = 0

    # Alerts and issues
    active_alerts: List[Dict[str, Any]] = field(default_factory=list)
    resolved_alerts_24h: int = 0
    critical_issues: List[str] = field(default_factory=list)

    # Metadata
    version: Optional[str] = None
    environment: str = "production"
    region: Optional[str] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Fill in ``timestamp`` (current UTC time) and ``last_health_check`` when omitted."""
        if self.timestamp is None:
            self.timestamp = datetime.now(timezone.utc)
        if self.last_health_check is None:
            self.last_health_check = self.timestamp

    @property
    def is_healthy(self) -> bool:
        """Check if system is healthy"""
        return self.overall_status == HealthStatus.HEALTHY

    @property
    def health_score(self) -> float:
        """Calculate overall health score (0-100).

        Starts at 100 and is reduced by: the share of non-healthy components,
        the error rate (5 points per percent, capped at 50), availability
        (used as an upper bound), CPU/memory usage above 90% (2 points per
        percent over), and 10 points per active critical alert. The result is
        clamped to the 0-100 range.
        """
        score = 100.0

        # Component health impact: the healthy fraction caps the score.
        total_components = len(self.component_status)
        if total_components > 0:
            healthy_components = sum(
                1 for status in self.component_status.values()
                if status == HealthStatus.HEALTHY
            )
            score = min(score, (healthy_components / total_components) * 100)

        # Performance impact
        if self.error_rate_percent is not None:
            score -= min(50, self.error_rate_percent * 5)  # Error rate penalty

        if self.availability_percent is not None:
            score = min(score, self.availability_percent)

        if self.cpu_usage_percent is not None and self.cpu_usage_percent > 90:
            score -= (self.cpu_usage_percent - 90) * 2

        if self.memory_usage_percent is not None and self.memory_usage_percent > 90:
            score -= (self.memory_usage_percent - 90) * 2

        # Alert impact
        critical_alert_count = sum(
            1 for alert in self.active_alerts
            if alert.get('severity') == AlertSeverity.CRITICAL
        )
        score -= critical_alert_count * 10

        return max(0.0, min(100.0, score))

    @property
    def needs_attention(self) -> bool:
        """Check if system needs immediate attention.

        True when the overall status is critical or degraded, when any
        critical issue is recorded, or when any active alert is critical.
        """
        if self.overall_status in (HealthStatus.CRITICAL, HealthStatus.DEGRADED):
            return True
        if self.critical_issues:
            return True
        return any(
            alert.get('severity') == AlertSeverity.CRITICAL
            for alert in self.active_alerts
        )

    @property
    def time_since_last_check(self) -> int:
        """Get seconds since last health check (0 when no check is recorded)."""
        if self.last_health_check:
            elapsed = datetime.now(timezone.utc) - self.last_health_check
            return int(elapsed.total_seconds())
        return 0

    def update_component_status(self, component_name: str, status: str,
                                metrics: Optional[Dict[str, float]] = None):
        """Update status for a specific component.

        Records the new status (and metrics, when a non-empty mapping is
        given), keeps ``failing_components`` / ``degraded_components`` in
        sync, recomputes ``overall_status``, and updates the consecutive
        failure/success counters when the component's status changed.

        Args:
            component_name: Key of the component in ``component_status``.
            status: New status value for the component.
            metrics: Optional metrics to store for the component.
        """
        old_status = self.component_status.get(component_name)
        self.component_status[component_name] = status

        if metrics:
            self.component_metrics[component_name] = metrics

        # A component lives in at most one of the two lists, chosen by status.
        if status == HealthStatus.CRITICAL:
            if component_name not in self.failing_components:
                self.failing_components.append(component_name)
            if component_name in self.degraded_components:
                self.degraded_components.remove(component_name)
        elif status == HealthStatus.DEGRADED:
            if component_name not in self.degraded_components:
                self.degraded_components.append(component_name)
            if component_name in self.failing_components:
                self.failing_components.remove(component_name)
        else:  # Healthy or other
            if component_name in self.failing_components:
                self.failing_components.remove(component_name)
            if component_name in self.degraded_components:
                self.degraded_components.remove(component_name)

        # Update overall status
        self._calculate_overall_status()

        # Track consecutive failures/successes (only on an actual status change).
        if old_status != status:
            # OFFLINE counts as a failure: _calculate_overall_status treats an
            # offline component as degrading the system, so it must not reset
            # the failure streak or be tallied as a success.
            failure_statuses = (
                HealthStatus.CRITICAL,
                HealthStatus.DEGRADED,
                HealthStatus.OFFLINE,
            )
            if status in failure_statuses:
                self.consecutive_failures += 1
                self.consecutive_successes = 0
            else:
                self.consecutive_successes += 1
                self.consecutive_failures = 0

    def _calculate_overall_status(self):
        """Calculate overall system status from component statuses.

        No components -> UNKNOWN; any CRITICAL -> CRITICAL; any DEGRADED or
        OFFLINE -> DEGRADED; all HEALTHY -> HEALTHY; anything else -> DEGRADED.
        """
        if not self.component_status:
            self.overall_status = HealthStatus.UNKNOWN
            return

        statuses = list(self.component_status.values())

        if HealthStatus.CRITICAL in statuses:
            self.overall_status = HealthStatus.CRITICAL
        elif HealthStatus.DEGRADED in statuses or HealthStatus.OFFLINE in statuses:
            self.overall_status = HealthStatus.DEGRADED
        elif all(status == HealthStatus.HEALTHY for status in statuses):
            self.overall_status = HealthStatus.HEALTHY
        else:
            self.overall_status = HealthStatus.DEGRADED

    def add_alert(self, alert_id: str, severity: str, message: str,
                  component: Optional[str] = None, **kwargs):
        """Add an active alert, or update the existing alert with the same ID.

        Args:
            alert_id: Identifier used to de-duplicate alerts.
            severity: Severity value stored on the alert.
            message: Human-readable alert text.
            component: Optional name of the affected component.
            **kwargs: Extra fields merged into the alert dict; a ``timestamp``
                keyword replaces the generated one.
        """
        alert = {
            "alert_id": alert_id,
            "severity": severity,
            "message": message,
            "component": component,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            **kwargs
        }

        # Check if alert already exists
        existing_alert = next(
            (a for a in self.active_alerts if a.get("alert_id") == alert_id),
            None,
        )
        if existing_alert:
            existing_alert.update(alert)
        else:
            self.active_alerts.append(alert)

    def resolve_alert(self, alert_id: str):
        """Resolve an active alert.

        Removes every active alert carrying ``alert_id``. The
        ``resolved_alerts_24h`` counter is incremented only when something was
        actually removed, so resolving an unknown or already-resolved ID does
        not inflate it.
        """
        remaining = [alert for alert in self.active_alerts
                     if alert.get("alert_id") != alert_id]
        was_active = len(remaining) != len(self.active_alerts)
        self.active_alerts = remaining
        if was_active:
            self.resolved_alerts_24h += 1

    def perform_health_check(self) -> Dict[str, Any]:
        """Perform comprehensive health check.

        Stamps ``last_health_check`` with the current UTC time and returns a
        summary dict with overall status, health score, component counts,
        alert counts, selected performance figures, and ``needs_attention``.
        """
        self.last_health_check = datetime.now(timezone.utc)

        health_result = {
            "overall_status": self.overall_status,
            "health_score": self.health_score,
            "timestamp": self.last_health_check.isoformat(),
            "component_summary": {
                "total": len(self.component_status),
                "healthy": sum(1 for s in self.component_status.values() if s == HealthStatus.HEALTHY),
                "degraded": len(self.degraded_components),
                "critical": len(self.failing_components)
            },
            "alerts": {
                "active": len(self.active_alerts),
                "critical": sum(1 for a in self.active_alerts if a.get("severity") == AlertSeverity.CRITICAL),
                "resolved_24h": self.resolved_alerts_24h
            },
            "performance": {
                "response_time_ms": self.response_time_ms,
                "error_rate_percent": self.error_rate_percent,
                "availability_percent": self.availability_percent
            },
            "needs_attention": self.needs_attention
        }

        return health_result
261
+
262
@dataclass
class ResourceUsage:
    """
    Resource usage monitoring record

    Tracks resource consumption across different dimensions
    including compute, memory, storage, and network resources.

    For the optional limit and threshold fields, ``None`` means "not
    configured"; an explicit ``0`` is a real limit.
    """
    usage_id: str
    resource_type: str  # cpu, memory, disk, network, gpu
    timestamp: Optional[datetime] = None  # filled with the current UTC time in __post_init__
    measurement_period_seconds: int = 60

    # Current usage
    current_usage: float = 0.0
    current_usage_percent: Optional[float] = None

    # Historical data
    min_usage: float = 0.0
    max_usage: float = 0.0
    avg_usage: float = 0.0
    p95_usage: Optional[float] = None
    p99_usage: Optional[float] = None

    # Capacity and limits
    total_capacity: Optional[float] = None
    allocated_capacity: Optional[float] = None
    reserved_capacity: Optional[float] = None
    soft_limit: Optional[float] = None
    hard_limit: Optional[float] = None

    # Usage patterns
    usage_samples: List[float] = field(default_factory=list)
    peak_hours: List[int] = field(default_factory=list)
    low_hours: List[int] = field(default_factory=list)

    # Trends and predictions
    trend_direction: str = "stable"  # increasing, decreasing, stable, volatile
    predicted_usage_1h: Optional[float] = None
    predicted_usage_24h: Optional[float] = None
    time_to_capacity: Optional[int] = None  # seconds until capacity reached

    # Alerts and thresholds
    warning_threshold: Optional[float] = None
    critical_threshold: Optional[float] = None
    threshold_breaches_24h: int = 0

    # Metadata
    host: Optional[str] = None
    service: Optional[str] = None
    tags: Dict[str, str] = field(default_factory=dict)

    def __post_init__(self):
        """Default ``timestamp`` to now (UTC) and derive ``current_usage_percent``."""
        if self.timestamp is None:
            self.timestamp = datetime.now(timezone.utc)

        # Calculate percentage if capacity is known
        if self.total_capacity is not None and self.total_capacity > 0:
            self.current_usage_percent = (self.current_usage / self.total_capacity) * 100

    @property
    def is_at_capacity(self) -> bool:
        """Check if resource is at or near capacity (>= 95% of total)."""
        if self.current_usage_percent is None:
            return False
        return self.current_usage_percent >= 95

    @property
    def is_over_soft_limit(self) -> bool:
        """Check if usage exceeds soft limit (False when no limit is set)."""
        if self.soft_limit is None:
            return False
        return self.current_usage > self.soft_limit

    @property
    def is_over_hard_limit(self) -> bool:
        """Check if usage exceeds hard limit (False when no limit is set)."""
        if self.hard_limit is None:
            return False
        return self.current_usage > self.hard_limit

    @property
    def utilization_efficiency(self) -> float:
        """Calculate utilization efficiency (0-100).

        Current usage as a percentage of allocated capacity, capped at 100;
        0.0 when no allocated capacity is known.
        """
        if not self.allocated_capacity:
            return 0.0

        return min(100.0, (self.current_usage / self.allocated_capacity) * 100)

    @property
    def waste_percentage(self) -> float:
        """Calculate resource waste percentage.

        Unused share of the allocated capacity; 0.0 when no allocated
        capacity is known or usage meets/exceeds the allocation.
        """
        if not self.allocated_capacity:
            return 0.0

        unused = max(0, self.allocated_capacity - self.current_usage)
        return (unused / self.allocated_capacity) * 100

    def add_usage_sample(self, usage_value: float, timestamp: Optional[datetime] = None):
        """Add a usage measurement sample.

        Appends the sample, refreshes the current/min/max values, recomputes
        statistics once at least 10 samples exist, and (when a timestamp is
        given and an average is available) tags the sample's hour as a peak or
        low hour.

        Args:
            usage_value: Measured usage, in the same unit as the capacities.
            timestamp: Optional time of the measurement; only its ``hour`` is used.
        """
        self.usage_samples.append(usage_value)

        # Update current values
        self.current_usage = usage_value
        if self.total_capacity is not None and self.total_capacity > 0:
            self.current_usage_percent = (usage_value / self.total_capacity) * 100

        # Update min/max
        if usage_value < self.min_usage:
            self.min_usage = usage_value
        elif self.min_usage == 0 and 0 not in self.usage_samples:
            # min_usage == 0 is the field default, i.e. "not initialised yet".
            # Only treat it that way while no recorded sample is really zero;
            # otherwise a genuine minimum of 0 would be overwritten by the
            # next, larger sample.
            self.min_usage = min(self.usage_samples)
        if usage_value > self.max_usage:
            self.max_usage = usage_value

        # Recalculate statistics if we have enough samples
        if len(self.usage_samples) >= 10:
            self._calculate_statistics()

        # Track peak hours. Skipped while no positive average is known:
        # comparing against avg_usage == 0 would tag every positive sample's
        # hour as a peak.
        if timestamp is not None and self.avg_usage > 0:
            hour = timestamp.hour
            if usage_value > self.avg_usage * 1.5:
                if hour not in self.peak_hours:
                    self.peak_hours.append(hour)
            elif usage_value < self.avg_usage * 0.5:
                if hour not in self.low_hours:
                    self.low_hours.append(hour)

    def _calculate_statistics(self):
        """Calculate statistical measures from usage samples.

        Updates ``avg_usage``, ``p95_usage``, ``p99_usage`` and
        ``trend_direction``. Does nothing when there are no samples.
        """
        if not self.usage_samples:
            return

        self.avg_usage = statistics.mean(self.usage_samples)

        if len(self.usage_samples) > 1:
            sorted_samples = sorted(self.usage_samples)
            n = len(sorted_samples)

            # int(0.95 * n) < n for every n >= 1, so the index is always valid.
            self.p95_usage = sorted_samples[int(0.95 * n)]
            self.p99_usage = sorted_samples[int(0.99 * n)] if n > 100 else sorted_samples[-1]

        # Analyze trend: compare the last five samples with everything before them.
        if len(self.usage_samples) >= 5:
            recent_avg = statistics.mean(self.usage_samples[-5:])
            if len(self.usage_samples) > 5:
                older_avg = statistics.mean(self.usage_samples[:-5])
            else:
                older_avg = self.avg_usage

            if recent_avg > older_avg * 1.1:
                self.trend_direction = "increasing"
            elif recent_avg < older_avg * 0.9:
                self.trend_direction = "decreasing"
            else:
                # Check for volatility
                if len(self.usage_samples) >= 10:
                    std_dev = statistics.stdev(self.usage_samples[-10:])
                else:
                    std_dev = 0
                cv = std_dev / self.avg_usage if self.avg_usage > 0 else 0

                if cv > 0.3:  # High coefficient of variation
                    self.trend_direction = "volatile"
                else:
                    self.trend_direction = "stable"

    def predict_future_usage(self, hours_ahead: int = 1) -> Optional[float]:
        """Predict future resource usage based on trends.

        Fits a least-squares line through the last (up to) 10 samples and
        extrapolates it ``hours_ahead`` sample steps past the most recent
        sample. One step is treated as one hour, the same assumption
        ``calculate_time_to_capacity`` makes.

        Args:
            hours_ahead: Number of steps beyond the latest sample to predict.

        Returns:
            The predicted usage (never negative), or ``None`` when fewer than
            5 samples are available.
        """
        if len(self.usage_samples) < 5:
            return None

        # Simple linear trend prediction over x = 0 .. n-1
        recent_samples = self.usage_samples[-10:]
        n = len(recent_samples)

        sum_x = sum(range(n))
        sum_y = sum(recent_samples)
        sum_xy = sum(x * y for x, y in enumerate(recent_samples))
        sum_x2 = sum(x ** 2 for x in range(n))

        denominator = n * sum_x2 - sum_x ** 2
        if denominator != 0:
            slope = (n * sum_xy - sum_x * sum_y) / denominator
            intercept = (sum_y - slope * sum_x) / n

            # The latest sample sits at x = n - 1, so `hours_ahead` steps
            # later is x = (n - 1) + hours_ahead (not n + hours_ahead).
            future_x = (n - 1) + hours_ahead
            predicted = slope * future_x + intercept

            return max(0, predicted)

        return self.avg_usage

    def calculate_time_to_capacity(self) -> Optional[int]:
        """Calculate time until resource reaches capacity.

        Returns:
            Seconds until ``total_capacity`` is reached at the recent growth
            rate, ``0`` when already at/over capacity, or ``None`` when the
            capacity is unknown, the trend is not "increasing", fewer than 5
            samples exist, or recent growth is not positive.
        """
        if not self.total_capacity or self.trend_direction != "increasing":
            return None

        available_capacity = self.total_capacity - self.current_usage
        if available_capacity <= 0:
            return 0

        # Estimate growth rate from recent samples
        window = 5
        if len(self.usage_samples) >= window:
            recent_growth = self.usage_samples[-1] - self.usage_samples[-window]
            if recent_growth > 0:
                # `window` samples span `window - 1` intervals.
                # Assuming samples are hourly.
                growth_per_hour = recent_growth / (window - 1)
                hours_to_capacity = available_capacity / growth_per_hour
                return int(hours_to_capacity * 3600)  # Convert to seconds

        return None

    def check_thresholds(self) -> List[Dict[str, Any]]:
        """Check if usage exceeds configured thresholds.

        Returns:
            A list with one alert dict per breached threshold (warning first,
            then critical); empty when nothing is breached or configured.
        """
        alerts = []

        if self.warning_threshold is not None and self.current_usage > self.warning_threshold:
            alerts.append({
                "level": "warning",
                "message": f"{self.resource_type} usage ({self.current_usage}) exceeds warning threshold ({self.warning_threshold})",
                "threshold": self.warning_threshold,
                "current_value": self.current_usage
            })

        if self.critical_threshold is not None and self.current_usage > self.critical_threshold:
            alerts.append({
                "level": "critical",
                "message": f"{self.resource_type} usage ({self.current_usage}) exceeds critical threshold ({self.critical_threshold})",
                "threshold": self.critical_threshold,
                "current_value": self.current_usage
            })

        return alerts
492
+
493
@dataclass
class ServiceStatus:
    """
    Service status monitoring record

    Tracks the status and health of individual services
    including availability, performance, and dependencies.

    Only ``service_id``, ``service_name`` and ``service_type`` are required.
    ``timestamp`` and ``last_health_check`` are filled in by ``__post_init__``
    when they are not supplied.
    """
    service_id: str
    service_name: str
    service_type: str
    status: str = HealthStatus.UNKNOWN
    # Defaults to None and is replaced with the current UTC time in __post_init__
    timestamp: Optional[datetime] = None

    # Service details
    version: Optional[str] = None
    endpoint_url: Optional[str] = None
    health_check_url: Optional[str] = None
    last_deployment: Optional[datetime] = None

    # Availability metrics
    uptime_seconds: int = 0
    downtime_seconds: int = 0
    availability_percent: float = 100.0
    mttr_seconds: Optional[int] = None  # Mean Time To Repair
    mtbf_seconds: Optional[int] = None  # Mean Time Between Failures

    # Performance metrics
    response_time_ms: Optional[float] = None
    throughput_rps: Optional[float] = None
    error_rate_percent: float = 0.0
    success_rate_percent: float = 100.0

    # Health check results
    last_health_check: Optional[datetime] = None
    health_check_success: bool = True
    consecutive_failures: int = 0
    consecutive_successes: int = 0
    health_check_interval_seconds: int = 30

    # Dependencies
    dependencies: List[str] = field(default_factory=list)
    dependency_statuses: Dict[str, str] = field(default_factory=dict)
    critical_dependencies: List[str] = field(default_factory=list)

    # Incidents and alerts
    active_incidents: List[Dict[str, Any]] = field(default_factory=list)
    incidents_24h: int = 0
    last_incident: Optional[datetime] = None

    # Configuration
    auto_restart_enabled: bool = False
    restart_count_24h: int = 0
    circuit_breaker_status: str = "closed"  # closed, open, half_open

    # Metadata
    environment: str = "production"
    region: Optional[str] = None
    owner_team: Optional[str] = None
    tags: Dict[str, str] = field(default_factory=dict)

    def __post_init__(self):
        """Fill in the timestamps that were not supplied by the caller."""
        if self.timestamp is None:
            self.timestamp = datetime.now(timezone.utc)
        if self.last_health_check is None:
            # A fresh record counts as "checked" at creation time
            self.last_health_check = self.timestamp

    @property
    def is_healthy(self) -> bool:
        """Check if service is healthy (HEALTHY status and last check passed)"""
        return self.status == HealthStatus.HEALTHY and self.health_check_success

    @property
    def is_available(self) -> bool:
        """Check if service is available (not offline)"""
        return self.status != HealthStatus.OFFLINE

    @property
    def needs_attention(self) -> bool:
        """
        Check if service needs immediate attention

        True when the status is CRITICAL or OFFLINE, when at least three
        health checks in a row have failed, or when any incident is active.
        """
        return (self.status in [HealthStatus.CRITICAL, HealthStatus.OFFLINE] or
                self.consecutive_failures >= 3 or
                len(self.active_incidents) > 0)

    @property
    def uptime_percent_24h(self) -> float:
        """
        Calculate uptime percentage for last 24 hours

        ``uptime_seconds`` is capped at one day, so the result never
        exceeds 100.
        """
        total_seconds_24h = 24 * 60 * 60
        uptime_24h = min(self.uptime_seconds, total_seconds_24h)
        return (uptime_24h / total_seconds_24h) * 100

    @property
    def service_level_indicator(self) -> Dict[str, float]:
        """
        Calculate Service Level Indicators (SLI)

        Unset response time and throughput are reported as 0.
        """
        return {
            "availability": self.availability_percent,
            "success_rate": self.success_rate_percent,
            "response_time_ms": self.response_time_ms or 0,
            "throughput_rps": self.throughput_rps or 0
        }

    @property
    def reliability_score(self) -> float:
        """
        Calculate overall reliability score (0-100)

        Starts at 100, scales by availability and success rate, then
        subtracts 10 per consecutive failure (at most 50) and 5 per incident
        in the last 24h (at most 30). The result is clamped to [0, 100].
        """
        score = 100.0

        # Availability impact
        score *= (self.availability_percent / 100)

        # Error rate impact
        score *= (self.success_rate_percent / 100)

        # Health check impact
        if self.consecutive_failures > 0:
            score -= min(50, self.consecutive_failures * 10)

        # Incident impact
        if self.incidents_24h > 0:
            score -= min(30, self.incidents_24h * 5)

        return max(0.0, min(100.0, score))

    def update_status(self, new_status: str, reason: Optional[str] = None):
        """
        Update service status

        Always refreshes ``timestamp``. On an actual status change:
        going OFFLINE with a ``reason`` opens a critical incident, and
        leaving OFFLINE resolves every active incident.

        Args:
            new_status: Status value to store.
            reason: Optional explanation; only used for the incident that is
                opened when the service goes offline.
        """
        old_status = self.status
        self.status = new_status
        self.timestamp = datetime.now(timezone.utc)

        if old_status == new_status:
            return

        if new_status == HealthStatus.OFFLINE:
            # Service went down; an incident is only recorded when a reason is given
            if reason:
                self.add_incident(f"Service offline: {reason}", AlertSeverity.CRITICAL)
        elif old_status == HealthStatus.OFFLINE:
            # Service came back up
            self.resolve_incidents("Service restored")

    def perform_health_check(self) -> bool:
        """
        Perform health check and update status

        Updates the failure/success streak counters and derives
        ``circuit_breaker_status`` from the failure streak.

        Returns:
            True when the check succeeded, False otherwise.
        """
        self.last_health_check = datetime.now(timezone.utc)

        # This would contain actual health check logic
        # For now, simulate based on current status
        if self.status == HealthStatus.OFFLINE:
            self.health_check_success = False
            self.consecutive_failures += 1
            self.consecutive_successes = 0
        else:
            self.health_check_success = True
            self.consecutive_successes += 1
            self.consecutive_failures = 0

        # Update circuit breaker status
        if self.consecutive_failures >= 5:
            self.circuit_breaker_status = "open"
        elif self.consecutive_failures >= 3:
            self.circuit_breaker_status = "half_open"
        else:
            self.circuit_breaker_status = "closed"

        return self.health_check_success

    def add_incident(self, description: str, severity: str, incident_id: Optional[str] = None):
        """
        Add an active incident

        Args:
            description: Human-readable description of the incident.
            severity: Severity value stored on the incident.
            incident_id: Identifier to use; when missing or empty an
                ``inc_<8 hex chars>`` ID is generated.
        """
        if not incident_id:
            import uuid
            incident_id = f"inc_{uuid.uuid4().hex[:8]}"

        # Read the clock once so start_time and last_incident agree exactly
        started_at = datetime.now(timezone.utc)

        incident = {
            "incident_id": incident_id,
            "description": description,
            "severity": severity,
            "start_time": started_at.isoformat(),
            "status": "active"
        }

        self.active_incidents.append(incident)
        self.incidents_24h += 1
        self.last_incident = started_at

    def resolve_incidents(self, resolution: str):
        """
        Resolve all active incidents

        Each incident dict is marked resolved in place (so callers holding a
        reference see the update) and the active list is then emptied.

        Args:
            resolution: Text stored on every resolved incident.
        """
        # One resolution event -> one shared end_time for all incidents
        resolved_at = datetime.now(timezone.utc).isoformat()

        for incident in self.active_incidents:
            incident["status"] = "resolved"
            incident["end_time"] = resolved_at
            incident["resolution"] = resolution

        self.active_incidents = []

    def check_dependencies(self) -> bool:
        """
        Check status of service dependencies

        Only ``critical_dependencies`` are inspected; a dependency with no
        entry in ``dependency_statuses`` is treated as UNKNOWN. If any of
        them is neither HEALTHY nor DEGRADED and this service is currently
        HEALTHY, the service is downgraded to DEGRADED.

        Returns:
            True when every critical dependency is HEALTHY or DEGRADED.
        """
        all_healthy = True

        for dependency in self.critical_dependencies:
            status = self.dependency_statuses.get(dependency, HealthStatus.UNKNOWN)
            if status not in [HealthStatus.HEALTHY, HealthStatus.DEGRADED]:
                all_healthy = False

                # Update our status if critical dependency is down
                if self.status == HealthStatus.HEALTHY:
                    self.update_status(HealthStatus.DEGRADED, f"Critical dependency {dependency} is {status}")

        return all_healthy

    def generate_status_report(self) -> Dict[str, Any]:
        """
        Generate comprehensive status report

        Returns:
            A nested dict with the sections ``service_info``,
            ``current_status``, ``performance``, ``reliability``,
            ``incidents``, ``health_checks`` and ``dependencies``. Datetime
            values are rendered as ISO-8601 strings, or None when unset.
        """
        return {
            "service_info": {
                "service_id": self.service_id,
                "service_name": self.service_name,
                "service_type": self.service_type,
                "version": self.version,
                "environment": self.environment
            },
            "current_status": {
                "status": self.status,
                "health_check_success": self.health_check_success,
                "is_available": self.is_available,
                "needs_attention": self.needs_attention,
                # Guarded like the other datetime fields in this report
                "timestamp": self.timestamp.isoformat() if self.timestamp else None
            },
            "performance": {
                "response_time_ms": self.response_time_ms,
                "throughput_rps": self.throughput_rps,
                "error_rate_percent": self.error_rate_percent,
                "success_rate_percent": self.success_rate_percent
            },
            "reliability": {
                "availability_percent": self.availability_percent,
                "uptime_percent_24h": self.uptime_percent_24h,
                "reliability_score": self.reliability_score,
                "mttr_seconds": self.mttr_seconds,
                "mtbf_seconds": self.mtbf_seconds
            },
            "incidents": {
                "active_count": len(self.active_incidents),
                "incidents_24h": self.incidents_24h,
                "last_incident": self.last_incident.isoformat() if self.last_incident else None
            },
            "health_checks": {
                "consecutive_failures": self.consecutive_failures,
                "consecutive_successes": self.consecutive_successes,
                "last_check": self.last_health_check.isoformat() if self.last_health_check else None
            },
            "dependencies": {
                "total": len(self.dependencies),
                "critical": len(self.critical_dependencies),
                # Copy so that editing the report cannot alter this record
                "statuses": dict(self.dependency_statuses)
            }
        }
744
+
745
+ # Utility functions for working with system models
746
+
747
def create_system_health(
    system_name: str,
    environment: str = "production"
) -> SystemHealth:
    """
    Factory function to create system health record

    The generated ID has the form
    ``health_<system_name>_<YYYYMMDD_HHMMSS>_<8 hex chars>``.

    Args:
        system_name: Name of the system; also embedded in the ID.
        environment: Environment label passed through to the record.

    Returns:
        A new SystemHealth with only the ID, name and environment set
        explicitly.
    """
    import uuid

    # Stamp the ID in UTC, like the other timestamps in this module, so the
    # embedded time does not depend on the host's local time zone.
    created_at = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    health_id = f"health_{system_name}_{created_at}_{uuid.uuid4().hex[:8]}"

    return SystemHealth(
        health_id=health_id,
        system_name=system_name,
        environment=environment
    )
761
+
762
def create_resource_usage(
    resource_type: str,
    current_usage: float,
    total_capacity: Optional[float] = None,
    host: Optional[str] = None,
    service: Optional[str] = None
) -> ResourceUsage:
    """
    Factory function to create resource usage record

    The generated ID has the form
    ``usage_<resource_type>_<YYYYMMDD_HHMMSS>_<8 hex chars>``.

    Args:
        resource_type: Kind of resource being measured; also embedded in
            the ID.
        current_usage: Current usage value, passed through unchanged.
        total_capacity: Optional capacity, passed through unchanged.
        host: Optional host the measurement belongs to.
        service: Optional service the measurement belongs to.

    Returns:
        A new ResourceUsage built from the given values.
    """
    import uuid

    # Stamp the ID in UTC, like the other timestamps in this module, so the
    # embedded time does not depend on the host's local time zone.
    created_at = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    usage_id = f"usage_{resource_type}_{created_at}_{uuid.uuid4().hex[:8]}"

    return ResourceUsage(
        usage_id=usage_id,
        resource_type=resource_type,
        current_usage=current_usage,
        total_capacity=total_capacity,
        host=host,
        service=service
    )
782
+
783
def create_service_status(
    service_name: str,
    service_type: str,
    endpoint_url: Optional[str] = None,
    environment: str = "production"
) -> ServiceStatus:
    """
    Build a ServiceStatus record with a freshly generated service ID.

    The ID has the form ``svc_<service_name>_<8 hex chars>``; every other
    ServiceStatus field keeps its default.

    Args:
        service_name: Name of the service; also embedded in the ID.
        service_type: Type label passed through to the record.
        endpoint_url: Optional endpoint URL passed through to the record.
        environment: Environment label passed through to the record.

    Returns:
        The newly constructed ServiceStatus.
    """
    import uuid

    random_suffix = uuid.uuid4().hex[:8]
    record_fields = {
        "service_id": f"svc_{service_name}_{random_suffix}",
        "service_name": service_name,
        "service_type": service_type,
        "endpoint_url": endpoint_url,
        "environment": environment,
    }
    return ServiceStatus(**record_fields)
801
+
802
def calculate_system_overview(
    health_records: List[SystemHealth],
    resource_records: List[ResourceUsage],
    service_records: List[ServiceStatus]
) -> Dict[str, Any]:
    """
    Summarise health, resource and service records into one overview dict.

    Each of the three summary sections stays an empty dict when its input
    list is empty. ``overall_status`` starts as HEALTHY and becomes:

    * CRITICAL when fewer than 80% of systems are healthy;
    * otherwise DEGRADED when fewer than 90% of services are available or
      at least one resource is at capacity.

    ``alerts`` is always returned as an empty list by this function.

    Args:
        health_records: System health records to aggregate.
        resource_records: Resource usage records to aggregate.
        service_records: Service status records to aggregate.

    Returns:
        Dict with the keys ``timestamp``, ``system_health``,
        ``resource_utilization``, ``service_availability``,
        ``overall_status`` and ``alerts``.
    """
    def _mean_usage_percent(records):
        # Mean of current_usage_percent; unset values count as 0 and an
        # empty selection yields 0.
        if not records:
            return 0
        return sum(rec.current_usage_percent or 0 for rec in records) / len(records)

    summary = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "system_health": {},
        "resource_utilization": {},
        "service_availability": {},
        "overall_status": HealthStatus.HEALTHY,
        "alerts": []
    }

    # System health summary
    if health_records:
        system_count = len(health_records)
        healthy_count = len([rec for rec in health_records if rec.is_healthy])
        summary["system_health"] = {
            "total_systems": system_count,
            "healthy_systems": healthy_count,
            "health_percentage": (healthy_count / system_count) * 100,
            "avg_health_score": sum(rec.health_score for rec in health_records) / system_count
        }

    # Resource utilization summary
    if resource_records:
        cpu_only = [rec for rec in resource_records if rec.resource_type == "cpu"]
        memory_only = [rec for rec in resource_records if rec.resource_type == "memory"]
        summary["resource_utilization"] = {
            "avg_cpu_usage": _mean_usage_percent(cpu_only),
            "avg_memory_usage": _mean_usage_percent(memory_only),
            "resources_at_capacity": len([rec for rec in resource_records if rec.is_at_capacity]),
            "resources_over_threshold": len([rec for rec in resource_records if rec.is_over_soft_limit])
        }

    # Service availability summary
    if service_records:
        service_count = len(service_records)
        available_count = len([rec for rec in service_records if rec.is_available])
        summary["service_availability"] = {
            "total_services": service_count,
            "available_services": available_count,
            "availability_percentage": (available_count / service_count) * 100,
            "avg_reliability_score": sum(rec.reliability_score for rec in service_records) / service_count
        }

    # Overall status determination; missing sections fall back to values
    # that leave the status untouched.
    health_pct = summary["system_health"].get("health_percentage", 100)
    availability_pct = summary["service_availability"].get("availability_percentage", 100)
    saturated_resources = summary["resource_utilization"].get("resources_at_capacity", 0)

    if health_pct < 80:
        summary["overall_status"] = HealthStatus.CRITICAL
    elif availability_pct < 90 or saturated_resources > 0:
        summary["overall_status"] = HealthStatus.DEGRADED

    return summary