isa-model 0.4.0__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +35 -80
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/types.py +1 -0
  26. isa_model/deployment/__init__.py +5 -48
  27. isa_model/deployment/core/__init__.py +2 -31
  28. isa_model/deployment/core/deployment_manager.py +1278 -370
  29. isa_model/deployment/modal/__init__.py +8 -0
  30. isa_model/deployment/modal/config.py +136 -0
  31. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  32. isa_model/deployment/modal/services/__init__.py +3 -0
  33. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  34. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  35. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  36. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  37. isa_model/deployment/modal/services/video/__init__.py +1 -0
  38. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  39. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  40. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  41. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  42. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  43. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  44. isa_model/deployment/storage/__init__.py +5 -0
  45. isa_model/deployment/storage/deployment_repository.py +824 -0
  46. isa_model/deployment/triton/__init__.py +10 -0
  47. isa_model/deployment/triton/config.py +196 -0
  48. isa_model/deployment/triton/configs/__init__.py +1 -0
  49. isa_model/deployment/triton/provider.py +512 -0
  50. isa_model/deployment/triton/scripts/__init__.py +1 -0
  51. isa_model/deployment/triton/templates/__init__.py +1 -0
  52. isa_model/inference/__init__.py +47 -1
  53. isa_model/inference/ai_factory.py +137 -10
  54. isa_model/inference/legacy_services/__init__.py +21 -0
  55. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  56. isa_model/inference/legacy_services/model_service.py +573 -0
  57. isa_model/inference/legacy_services/model_serving.py +717 -0
  58. isa_model/inference/legacy_services/model_training.py +561 -0
  59. isa_model/inference/models/__init__.py +21 -0
  60. isa_model/inference/models/inference_config.py +551 -0
  61. isa_model/inference/models/inference_record.py +675 -0
  62. isa_model/inference/models/performance_models.py +714 -0
  63. isa_model/inference/repositories/__init__.py +9 -0
  64. isa_model/inference/repositories/inference_repository.py +828 -0
  65. isa_model/inference/services/audio/base_stt_service.py +184 -11
  66. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  67. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  68. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  69. isa_model/inference/services/llm/__init__.py +10 -2
  70. isa_model/inference/services/llm/base_llm_service.py +335 -24
  71. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  72. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  73. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  74. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  75. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  76. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  77. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  78. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  79. isa_model/inference/services/vision/__init__.py +22 -1
  80. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  81. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  82. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  83. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  84. isa_model/serving/api/cache_manager.py +245 -0
  85. isa_model/serving/api/dependencies/__init__.py +1 -0
  86. isa_model/serving/api/dependencies/auth.py +194 -0
  87. isa_model/serving/api/dependencies/database.py +139 -0
  88. isa_model/serving/api/error_handlers.py +284 -0
  89. isa_model/serving/api/fastapi_server.py +172 -22
  90. isa_model/serving/api/middleware/auth.py +8 -2
  91. isa_model/serving/api/middleware/security.py +23 -33
  92. isa_model/serving/api/middleware/tenant_context.py +414 -0
  93. isa_model/serving/api/routes/analytics.py +4 -1
  94. isa_model/serving/api/routes/config.py +645 -0
  95. isa_model/serving/api/routes/deployment_billing.py +315 -0
  96. isa_model/serving/api/routes/deployments.py +138 -2
  97. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  98. isa_model/serving/api/routes/health.py +32 -12
  99. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  100. isa_model/serving/api/routes/local_deployments.py +448 -0
  101. isa_model/serving/api/routes/tenants.py +575 -0
  102. isa_model/serving/api/routes/unified.py +680 -18
  103. isa_model/serving/api/routes/webhooks.py +479 -0
  104. isa_model/serving/api/startup.py +68 -54
  105. isa_model/utils/gpu_utils.py +311 -0
  106. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/METADATA +71 -24
  107. isa_model-0.4.4.dist-info/RECORD +180 -0
  108. isa_model/core/security/secrets.py +0 -358
  109. isa_model/core/storage/hf_storage.py +0 -419
  110. isa_model/core/storage/minio_storage.py +0 -0
  111. isa_model/deployment/cloud/__init__.py +0 -9
  112. isa_model/deployment/cloud/modal/__init__.py +0 -10
  113. isa_model/deployment/core/deployment_config.py +0 -356
  114. isa_model/deployment/core/isa_deployment_service.py +0 -401
  115. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  116. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  117. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  118. isa_model/deployment/runtime/deployed_service.py +0 -338
  119. isa_model/deployment/services/__init__.py +0 -9
  120. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  121. isa_model/deployment/services/model_service.py +0 -332
  122. isa_model/deployment/services/service_monitor.py +0 -356
  123. isa_model/deployment/services/service_registry.py +0 -527
  124. isa_model/eval/__init__.py +0 -92
  125. isa_model/eval/benchmarks/__init__.py +0 -27
  126. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  127. isa_model/eval/benchmarks.py +0 -701
  128. isa_model/eval/config/__init__.py +0 -10
  129. isa_model/eval/config/evaluation_config.py +0 -108
  130. isa_model/eval/evaluators/__init__.py +0 -24
  131. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  132. isa_model/eval/evaluators/base_evaluator.py +0 -503
  133. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  134. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  135. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  136. isa_model/eval/example_evaluation.py +0 -395
  137. isa_model/eval/factory.py +0 -798
  138. isa_model/eval/infrastructure/__init__.py +0 -24
  139. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  140. isa_model/eval/isa_benchmarks.py +0 -700
  141. isa_model/eval/isa_integration.py +0 -582
  142. isa_model/eval/metrics.py +0 -951
  143. isa_model/eval/tests/unit/test_basic.py +0 -396
  144. isa_model/serving/api/routes/evaluations.py +0 -579
  145. isa_model/training/__init__.py +0 -168
  146. isa_model/training/annotation/annotation_schema.py +0 -47
  147. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  148. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  149. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  150. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  151. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  152. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  153. isa_model/training/annotation/views/annotation_controller.py +0 -158
  154. isa_model/training/cloud/__init__.py +0 -22
  155. isa_model/training/cloud/job_orchestrator.py +0 -402
  156. isa_model/training/cloud/runpod_trainer.py +0 -454
  157. isa_model/training/cloud/storage_manager.py +0 -482
  158. isa_model/training/core/__init__.py +0 -26
  159. isa_model/training/core/config.py +0 -181
  160. isa_model/training/core/dataset.py +0 -222
  161. isa_model/training/core/trainer.py +0 -720
  162. isa_model/training/core/utils.py +0 -213
  163. isa_model/training/examples/intelligent_training_example.py +0 -281
  164. isa_model/training/factory.py +0 -424
  165. isa_model/training/intelligent/__init__.py +0 -25
  166. isa_model/training/intelligent/decision_engine.py +0 -643
  167. isa_model/training/intelligent/intelligent_factory.py +0 -888
  168. isa_model/training/intelligent/knowledge_base.py +0 -751
  169. isa_model/training/intelligent/resource_optimizer.py +0 -839
  170. isa_model/training/intelligent/task_classifier.py +0 -576
  171. isa_model/training/storage/__init__.py +0 -24
  172. isa_model/training/storage/core_integration.py +0 -439
  173. isa_model/training/storage/training_repository.py +0 -552
  174. isa_model/training/storage/training_storage.py +0 -628
  175. isa_model-0.4.0.dist-info/RECORD +0 -182
  176. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  177. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  178. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  179. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  180. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  181. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  182. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  183. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  184. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  185. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  186. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  187. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  188. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
  189. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0
isa_model/inference/models/performance_models.py (new file, +714 -0)
@@ -0,0 +1,714 @@
+"""
+Performance Models
+
+Specialized models for tracking and analyzing inference performance metrics,
+latency profiles, and throughput characteristics.
+"""
+
+import logging
+from datetime import datetime, timezone, timedelta
+from typing import Dict, List, Optional, Any, Union, Tuple
+from dataclasses import dataclass, field
+from enum import Enum
+import statistics
+
+logger = logging.getLogger(__name__)
+
+class PerformanceTier(str, Enum):
+    """Performance tier enumeration"""
+    EXCELLENT = "excellent"
+    GOOD = "good"
+    AVERAGE = "average"
+    POOR = "poor"
+    CRITICAL = "critical"
+
+class LatencyCategory(str, Enum):
+    """Latency category enumeration"""
+    ULTRA_LOW = "ultra_low"    # < 100ms
+    LOW = "low"                # 100-500ms
+    MODERATE = "moderate"      # 500ms-2s
+    HIGH = "high"              # 2s-10s
+    VERY_HIGH = "very_high"    # > 10s
+
+class ThroughputUnit(str, Enum):
+    """Throughput measurement unit enumeration"""
+    REQUESTS_PER_SECOND = "rps"
+    TOKENS_PER_SECOND = "tps"
+    TOKENS_PER_MINUTE = "tpm"
+    REQUESTS_PER_MINUTE = "rpm"
+
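Because each of these enums mixes in `str`, members compare and serialize as plain strings, which is why properties further down can be annotated `-> str` yet return enum members. A quick sketch of that standard-library behavior:

```python
# str-mixin enum members behave as ordinary strings.
from enum import Enum

class LatencyCategory(str, Enum):
    ULTRA_LOW = "ultra_low"

assert LatencyCategory.ULTRA_LOW == "ultra_low"    # compares equal to its value
assert isinstance(LatencyCategory.ULTRA_LOW, str)  # accepted wherever a str is expected
```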
+@dataclass
+class PerformanceMetrics:
+    """
+    Comprehensive performance metrics for inference operations
+
+    Tracks detailed performance characteristics including latency, throughput,
+    resource utilization, and quality metrics.
+    """
+    metric_id: str
+    model_id: str
+    provider: str
+    service_type: str
+    measurement_period_start: datetime
+    measurement_period_end: datetime
+
+    # Request volume metrics
+    total_requests: int = 0
+    successful_requests: int = 0
+    failed_requests: int = 0
+    timeout_requests: int = 0
+
+    # Latency metrics (in milliseconds)
+    min_latency_ms: Optional[float] = None
+    max_latency_ms: Optional[float] = None
+    mean_latency_ms: Optional[float] = None
+    median_latency_ms: Optional[float] = None
+    p95_latency_ms: Optional[float] = None
+    p99_latency_ms: Optional[float] = None
+    p999_latency_ms: Optional[float] = None
+    latency_std_dev: Optional[float] = None
+
+    # Throughput metrics
+    requests_per_second: Optional[float] = None
+    tokens_per_second: Optional[float] = None
+    tokens_per_minute: Optional[float] = None
+    peak_rps: Optional[float] = None
+
+    # Token metrics
+    total_input_tokens: int = 0
+    total_output_tokens: int = 0
+    avg_input_tokens: Optional[float] = None
+    avg_output_tokens: Optional[float] = None
+    max_input_tokens: Optional[int] = None
+    max_output_tokens: Optional[int] = None
+
+    # Cost metrics
+    total_cost_usd: float = 0.0
+    cost_per_request: Optional[float] = None
+    cost_per_token: Optional[float] = None
+    cost_per_second: Optional[float] = None
+
+    # Quality metrics
+    success_rate: float = 0.0
+    error_rate: float = 0.0
+    timeout_rate: float = 0.0
+    retry_rate: float = 0.0
+    cache_hit_rate: float = 0.0
+
+    # Resource utilization (if available)
+    avg_cpu_usage: Optional[float] = None
+    avg_memory_usage: Optional[float] = None
+    avg_gpu_usage: Optional[float] = None
+    peak_memory_mb: Optional[float] = None
+
+    # Queue and concurrency metrics
+    avg_queue_time_ms: Optional[float] = None
+    max_queue_time_ms: Optional[float] = None
+    avg_concurrent_requests: Optional[float] = None
+    max_concurrent_requests: Optional[int] = None
+
+    created_at: Optional[datetime] = None  # defaulted in __post_init__
+
+    def __post_init__(self):
+        if self.created_at is None:
+            self.created_at = datetime.now(timezone.utc)
+
+        # Calculate derived metrics
+        self._calculate_derived_metrics()
+
+    def _calculate_derived_metrics(self):
+        """Calculate derived metrics from base measurements"""
+        if self.total_requests > 0:
+            self.success_rate = (self.successful_requests / self.total_requests) * 100
+            self.error_rate = (self.failed_requests / self.total_requests) * 100
+            self.timeout_rate = (self.timeout_requests / self.total_requests) * 100
+
+            if self.total_cost_usd > 0:
+                self.cost_per_request = self.total_cost_usd / self.total_requests
+
+        if self.successful_requests > 0:
+            self.avg_input_tokens = self.total_input_tokens / self.successful_requests
+            self.avg_output_tokens = self.total_output_tokens / self.successful_requests
+
+        # Calculate period-based metrics
+        period_seconds = (self.measurement_period_end - self.measurement_period_start).total_seconds()
+        if period_seconds > 0:
+            self.requests_per_second = self.total_requests / period_seconds
+
+            total_tokens = self.total_input_tokens + self.total_output_tokens
+            if total_tokens > 0:
+                self.tokens_per_second = total_tokens / period_seconds
+                self.tokens_per_minute = total_tokens / (period_seconds / 60)
+
+                if self.total_cost_usd > 0:
+                    self.cost_per_token = self.total_cost_usd / total_tokens
+                    self.cost_per_second = self.total_cost_usd / period_seconds
+
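To sanity-check the derived-metric arithmetic above, a hand-traced sketch with invented figures (95 of 100 requests succeed over a 50-second window, 20,000 combined tokens, $0.40 total cost):

```python
# Hand-trace of _calculate_derived_metrics with assumed inputs.
total_requests, successful_requests = 100, 95
period_seconds, total_tokens, total_cost_usd = 50.0, 20_000, 0.40

success_rate = successful_requests / total_requests * 100   # 95.0 (%)
requests_per_second = total_requests / period_seconds       # 2.0 rps
tokens_per_second = total_tokens / period_seconds           # 400.0 tps
tokens_per_minute = total_tokens / (period_seconds / 60)    # 24000.0 tpm
cost_per_request = total_cost_usd / total_requests          # $0.004
cost_per_token = total_cost_usd / total_tokens              # $0.00002
```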
+    @property
+    def measurement_duration_seconds(self) -> float:
+        """Get measurement period duration in seconds"""
+        return (self.measurement_period_end - self.measurement_period_start).total_seconds()
+
+    @property
+    def latency_category(self) -> str:
+        """Categorize average latency"""
+        if self.mean_latency_ms is None:
+            return "unknown"
+
+        if self.mean_latency_ms < 100:
+            return LatencyCategory.ULTRA_LOW
+        elif self.mean_latency_ms < 500:
+            return LatencyCategory.LOW
+        elif self.mean_latency_ms < 2000:
+            return LatencyCategory.MODERATE
+        elif self.mean_latency_ms < 10000:
+            return LatencyCategory.HIGH
+        else:
+            return LatencyCategory.VERY_HIGH
+
+    @property
+    def performance_tier(self) -> str:
+        """Calculate overall performance tier"""
+        score = 100.0
+
+        # Latency penalty
+        if self.mean_latency_ms:
+            if self.mean_latency_ms > 10000:
+                score -= 40
+            elif self.mean_latency_ms > 5000:
+                score -= 25
+            elif self.mean_latency_ms > 2000:
+                score -= 15
+            elif self.mean_latency_ms > 1000:
+                score -= 5
+
+        # Success rate impact
+        score *= (self.success_rate / 100)
+
+        # Timeout penalty
+        score -= self.timeout_rate * 2
+
+        if score >= 85:
+            return PerformanceTier.EXCELLENT
+        elif score >= 70:
+            return PerformanceTier.GOOD
+        elif score >= 50:
+            return PerformanceTier.AVERAGE
+        elif score >= 25:
+            return PerformanceTier.POOR
+        else:
+            return PerformanceTier.CRITICAL
+
+    @property
+    def efficiency_score(self) -> float:
+        """Calculate efficiency score (performance per cost)"""
+        if not self.cost_per_request or self.cost_per_request == 0:
+            return 0.0
+
+        # Higher score for better performance and lower cost
+        base_score = self.success_rate
+        latency_penalty = (self.mean_latency_ms or 1000) / 1000  # Normalize to seconds
+        cost_penalty = self.cost_per_request * 1000  # Scale up cost impact
+
+        return max(0, base_score / (latency_penalty * cost_penalty))
+
+    @property
+    def reliability_score(self) -> float:
+        """Calculate reliability score based on error rates"""
+        return max(0, 100 - self.error_rate - (self.timeout_rate * 1.5))
+
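To make the tier scoring concrete, here is a hand-trace with assumed inputs; a 2,500 ms mean latency loses 15 points, the 95% success rate then scales what remains, and a 1% timeout rate costs 2 more, landing in the "good" tier:

```python
# Hand-trace of the performance_tier property with assumed inputs.
score = 100.0
mean_latency_ms, success_rate, timeout_rate = 2_500.0, 95.0, 1.0

score -= 15                   # falls in the 2000-5000 ms bracket
score *= success_rate / 100   # 85.0 * 0.95 = 80.75
score -= timeout_rate * 2     # 80.75 - 2.0 = 78.75

assert 70 <= score < 85       # -> PerformanceTier.GOOD
```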
+    def add_request_measurement(self, latency_ms: float, success: bool, tokens_used: int = 0,
+                                cost: float = 0.0, cache_hit: bool = False):
+        """Add an individual request measurement to the aggregate metrics"""
+        self.total_requests += 1
+
+        if success:
+            self.successful_requests += 1
+            self.total_input_tokens += tokens_used  # Simplified - would split input/output
+            self.total_output_tokens += tokens_used
+        else:
+            self.failed_requests += 1
+
+        if cache_hit:
+            # Placeholder: cache_hit_rate is not updated yet
+            pass
+
+        self.total_cost_usd += cost
+
+        # Update latency statistics (simplified - would use proper streaming statistics)
+        if self.min_latency_ms is None or latency_ms < self.min_latency_ms:
+            self.min_latency_ms = latency_ms
+
+        if self.max_latency_ms is None or latency_ms > self.max_latency_ms:
+            self.max_latency_ms = latency_ms
+
+        # Recalculate derived metrics
+        self._calculate_derived_metrics()
+
+    def compare_to(self, other: 'PerformanceMetrics') -> Dict[str, Any]:
+        """Compare these metrics against a baseline set of metrics"""
+        comparison = {
+            "baseline_model": other.model_id,
+            "comparison_period": {
+                "our_period": f"{self.measurement_period_start} to {self.measurement_period_end}",
+                "baseline_period": f"{other.measurement_period_start} to {other.measurement_period_end}"
+            },
+            "improvements": {},
+            "regressions": {},
+            "summary": {}
+        }
+
+        # Compare key metrics
+        metrics_to_compare = [
+            ("mean_latency_ms", "lower_is_better"),
+            ("success_rate", "higher_is_better"),
+            ("requests_per_second", "higher_is_better"),
+            ("tokens_per_second", "higher_is_better"),
+            ("cost_per_request", "lower_is_better"),
+            ("cost_per_token", "lower_is_better"),
+            ("error_rate", "lower_is_better")
+        ]
+
+        for metric_name, direction in metrics_to_compare:
+            our_value = getattr(self, metric_name)
+            other_value = getattr(other, metric_name)
+
+            if our_value is not None and other_value is not None and other_value != 0:
+                change_percent = ((our_value - other_value) / other_value) * 100
+
+                is_improvement = (
+                    (direction == "higher_is_better" and change_percent > 0) or
+                    (direction == "lower_is_better" and change_percent < 0)
+                )
+
+                change_data = {
+                    "our_value": our_value,
+                    "baseline_value": other_value,
+                    "change_percent": round(change_percent, 2),
+                    "absolute_change": our_value - other_value
+                }
+
+                if abs(change_percent) > 5:  # Significant change threshold
+                    if is_improvement:
+                        comparison["improvements"][metric_name] = change_data
+                    else:
+                        comparison["regressions"][metric_name] = change_data
+
+        # Overall summary
+        comparison["summary"] = {
+            "overall_performance_change": self.performance_tier != other.performance_tier,
+            "our_tier": self.performance_tier,
+            "baseline_tier": other.performance_tier,
+            "improvements_count": len(comparison["improvements"]),
+            "regressions_count": len(comparison["regressions"])
+        }
+
+        return comparison
+
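A minimal usage sketch for the class above; model names and cost values are invented. Note that `add_request_measurement` only tracks min/max latency, so the mean-latency comparison is skipped until a mean is supplied by an upstream aggregation step:

```python
# Sketch: record a few requests in two windows, then diff them.
from datetime import datetime, timedelta, timezone

end = datetime.now(timezone.utc)
start = end - timedelta(minutes=5)

current = PerformanceMetrics(
    metric_id="m-current", model_id="example-model", provider="openai",
    service_type="llm", measurement_period_start=start, measurement_period_end=end,
)
current.add_request_measurement(latency_ms=420.0, success=True, tokens_used=512, cost=0.002)
current.add_request_measurement(latency_ms=1350.0, success=False)

baseline = PerformanceMetrics(
    metric_id="m-baseline", model_id="example-model", provider="openai",
    service_type="llm", measurement_period_start=start, measurement_period_end=end,
)
baseline.add_request_measurement(latency_ms=600.0, success=True, tokens_used=480, cost=0.002)

report = current.compare_to(baseline)
print(report["summary"]["our_tier"], report["summary"]["regressions_count"])
```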
+@dataclass
+class LatencyProfile:
+    """
+    Detailed latency profile analysis
+
+    Provides comprehensive latency analysis including distribution,
+    outliers, and temporal patterns.
+    """
+    profile_id: str
+    model_id: str
+    provider: str
+    measurement_start: datetime
+    measurement_end: datetime
+
+    # Distribution data
+    latency_samples: List[float] = field(default_factory=list)
+    sample_count: int = 0
+
+    # Statistical measures
+    min_latency: float = float('inf')
+    max_latency: float = 0.0
+    mean_latency: float = 0.0
+    median_latency: float = 0.0
+    mode_latency: Optional[float] = None
+    std_deviation: float = 0.0
+    variance: float = 0.0
+    skewness: Optional[float] = None
+    kurtosis: Optional[float] = None
+
+    # Percentiles
+    p10: float = 0.0
+    p25: float = 0.0
+    p50: float = 0.0
+    p75: float = 0.0
+    p90: float = 0.0
+    p95: float = 0.0
+    p99: float = 0.0
+    p999: float = 0.0
+
+    # Outlier analysis
+    outlier_threshold_multiplier: float = 3.0
+    outlier_count: int = 0
+    outlier_rate: float = 0.0
+    outliers: List[float] = field(default_factory=list)
+
+    # Temporal patterns
+    hourly_averages: Dict[int, float] = field(default_factory=dict)
+    daily_trends: Dict[str, float] = field(default_factory=dict)
+
+    def __post_init__(self):
+        if self.latency_samples:
+            self._calculate_statistics()
+
+    def _calculate_statistics(self):
+        """Calculate comprehensive latency statistics"""
+        if not self.latency_samples:
+            return
+
+        self.sample_count = len(self.latency_samples)
+        sorted_samples = sorted(self.latency_samples)
+
+        # Basic statistics
+        self.min_latency = min(self.latency_samples)
+        self.max_latency = max(self.latency_samples)
+        self.mean_latency = statistics.mean(self.latency_samples)
+        self.median_latency = statistics.median(self.latency_samples)
+
+        if self.sample_count > 1:
+            self.std_deviation = statistics.stdev(self.latency_samples)
+            self.variance = statistics.variance(self.latency_samples)
+
+        # Percentiles
+        n = len(sorted_samples)
+        self.p10 = sorted_samples[int(0.10 * n)]
+        self.p25 = sorted_samples[int(0.25 * n)]
+        self.p50 = sorted_samples[int(0.50 * n)]
+        self.p75 = sorted_samples[int(0.75 * n)]
+        self.p90 = sorted_samples[int(0.90 * n)]
+        self.p95 = sorted_samples[int(0.95 * n)]
+        self.p99 = sorted_samples[int(0.99 * n)] if n > 100 else sorted_samples[-1]
+        self.p999 = sorted_samples[int(0.999 * n)] if n > 1000 else sorted_samples[-1]
+
+        # Outlier detection using IQR method
+        iqr = self.p75 - self.p25
+        lower_bound = self.p25 - (self.outlier_threshold_multiplier * iqr)
+        upper_bound = self.p75 + (self.outlier_threshold_multiplier * iqr)
+
+        self.outliers = [x for x in self.latency_samples if x < lower_bound or x > upper_bound]
+        self.outlier_count = len(self.outliers)
+        self.outlier_rate = (self.outlier_count / self.sample_count) * 100
+
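For intuition on the fence arithmetic: with p25 = 200 ms, p75 = 600 ms, and the default multiplier of 3.0 (a deliberately wider fence than the textbook 1.5 × IQR), only samples below -1,000 ms or above 1,800 ms are flagged:

```python
# Hand-trace of the outlier fences with assumed percentiles.
p25, p75, multiplier = 200.0, 600.0, 3.0
iqr = p75 - p25                 # 400.0
lower = p25 - multiplier * iqr  # -1000.0 (no lower outliers possible for latencies)
upper = p75 + multiplier * iqr  # 1800.0
assert (lower, upper) == (-1000.0, 1800.0)
```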
+    @property
+    def distribution_type(self) -> str:
+        """Classify the latency distribution"""
+        if self.sample_count < 10:
+            return "insufficient_data"
+
+        # Simple heuristics for distribution classification
+        if abs(self.mean_latency - self.median_latency) < (0.1 * self.std_deviation):
+            return "normal"
+        elif self.mean_latency > self.median_latency:
+            return "right_skewed"
+        else:
+            return "left_skewed"
+
+    @property
+    def stability_score(self) -> float:
+        """Calculate latency stability score (0-100)"""
+        if self.mean_latency == 0:
+            return 100.0
+
+        # Lower coefficient of variation = higher stability
+        cv = self.std_deviation / self.mean_latency
+        base_score = max(0, 100 - (cv * 100))
+
+        # Penalty for outliers
+        outlier_penalty = min(20, self.outlier_rate)
+
+        return max(0, base_score - outlier_penalty)
+
+    @property
+    def consistency_category(self) -> str:
+        """Categorize latency consistency"""
+        stability = self.stability_score
+
+        if stability >= 90:
+            return "very_consistent"
+        elif stability >= 75:
+            return "consistent"
+        elif stability >= 60:
+            return "moderately_consistent"
+        elif stability >= 40:
+            return "inconsistent"
+        else:
+            return "very_inconsistent"
+
+    def add_latency_sample(self, latency_ms: float, timestamp: Optional[datetime] = None):
+        """Add a latency sample to the profile"""
+        self.latency_samples.append(latency_ms)
+
+        if timestamp:
+            # Track hourly patterns
+            hour = timestamp.hour
+            if hour in self.hourly_averages:
+                # Blend the new sample into the stored hourly value. No per-hour
+                # sample count is kept, so this is an equal-weight smoothing of
+                # old and new values rather than a true running mean.
+                self.hourly_averages[hour] = (self.hourly_averages[hour] + latency_ms) / 2
+            else:
+                self.hourly_averages[hour] = latency_ms
+
+        # Recalculate if we have enough samples
+        if len(self.latency_samples) % 100 == 0:  # Recalculate every 100 samples
+            self._calculate_statistics()
+
+    def get_latency_bands(self) -> Dict[str, Dict[str, Any]]:
+        """Get latency distribution in bands"""
+        if not self.latency_samples:
+            return {}
+
+        bands = {
+            "ultra_fast": {"range": "< 100ms", "count": 0, "percentage": 0},
+            "fast": {"range": "100-500ms", "count": 0, "percentage": 0},
+            "moderate": {"range": "500ms-2s", "count": 0, "percentage": 0},
+            "slow": {"range": "2s-10s", "count": 0, "percentage": 0},
+            "very_slow": {"range": "> 10s", "count": 0, "percentage": 0}
+        }
+
+        for latency in self.latency_samples:
+            if latency < 100:
+                bands["ultra_fast"]["count"] += 1
+            elif latency < 500:
+                bands["fast"]["count"] += 1
+            elif latency < 2000:
+                bands["moderate"]["count"] += 1
+            elif latency < 10000:
+                bands["slow"]["count"] += 1
+            else:
+                bands["very_slow"]["count"] += 1
+
+        # Calculate percentages
+        total = len(self.latency_samples)
+        for band in bands.values():
+            band["percentage"] = (band["count"] / total) * 100
+
+        return bands
+
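A small usage sketch for the profile; sample values are invented. Passing samples at construction time triggers the one-shot statistics pass in `__post_init__`, whereas samples fed through `add_latency_sample` are only recomputed on every 100th sample:

```python
# Sketch: build a LatencyProfile from pre-collected samples.
from datetime import datetime, timedelta, timezone

end = datetime.now(timezone.utc)
profile = LatencyProfile(
    profile_id="lp-1", model_id="example-model", provider="openai",
    measurement_start=end - timedelta(hours=1), measurement_end=end,
    latency_samples=[120.0, 140.0, 135.0, 128.0, 2400.0],  # one slow outlier
)

print(profile.p90)                                    # 2400.0 (crude index-based percentile)
print(profile.stability_score, profile.consistency_category)
print(profile.get_latency_bands()["fast"])            # 4 of 5 samples fall in 100-500ms
```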
+@dataclass
+class ThroughputProfile:
+    """
+    Throughput analysis and capacity planning
+
+    Analyzes request and token throughput patterns for capacity planning
+    and performance optimization.
+    """
+    profile_id: str
+    model_id: str
+    provider: str
+    measurement_start: datetime
+    measurement_end: datetime
+
+    # Request throughput
+    peak_requests_per_second: float = 0.0
+    avg_requests_per_second: float = 0.0
+    min_requests_per_second: float = 0.0
+
+    # Token throughput
+    peak_tokens_per_second: float = 0.0
+    avg_tokens_per_second: float = 0.0
+    min_tokens_per_second: float = 0.0
+
+    # Capacity metrics
+    max_concurrent_requests: int = 0
+    avg_concurrent_requests: float = 0.0
+    queue_overflow_events: int = 0
+    throttling_events: int = 0
+
+    # Temporal patterns
+    throughput_samples: List[Tuple[datetime, float, float]] = field(default_factory=list)  # (timestamp, rps, tps)
+    peak_hours: List[int] = field(default_factory=list)
+    low_hours: List[int] = field(default_factory=list)
+
+    # Efficiency metrics
+    tokens_per_request_ratio: float = 0.0
+    processing_efficiency: float = 0.0  # Actual vs theoretical max throughput
+
+    @property
+    def measurement_duration_hours(self) -> float:
+        """Get measurement duration in hours"""
+        return (self.measurement_end - self.measurement_start).total_seconds() / 3600
+
+    @property
+    def capacity_utilization(self) -> float:
+        """Calculate capacity utilization percentage"""
+        if self.peak_requests_per_second == 0:
+            return 0.0
+        return (self.avg_requests_per_second / self.peak_requests_per_second) * 100
+
+    @property
+    def throughput_consistency(self) -> str:
+        """Analyze throughput consistency"""
+        if not self.throughput_samples:
+            return "unknown"
+
+        rps_values = [sample[1] for sample in self.throughput_samples]
+        if len(rps_values) < 2:
+            return "unknown"  # statistics.stdev needs at least two samples
+
+        cv = statistics.stdev(rps_values) / statistics.mean(rps_values) if statistics.mean(rps_values) > 0 else 0
+
+        if cv < 0.1:
+            return "very_stable"
+        elif cv < 0.3:
+            return "stable"
+        elif cv < 0.5:
+            return "variable"
+        else:
+            return "highly_variable"
+
+    @property
+    def performance_headroom(self) -> float:
+        """Calculate available performance headroom"""
+        return max(0, self.peak_requests_per_second - self.avg_requests_per_second)
+
+    def add_throughput_sample(self, timestamp: datetime, requests_per_second: float,
+                              tokens_per_second: float, concurrent_requests: int = 0):
+        """Add a throughput measurement sample"""
+        self.throughput_samples.append((timestamp, requests_per_second, tokens_per_second))
+
+        # Update peak values
+        if requests_per_second > self.peak_requests_per_second:
+            self.peak_requests_per_second = requests_per_second
+
+        if tokens_per_second > self.peak_tokens_per_second:
+            self.peak_tokens_per_second = tokens_per_second
+
+        if concurrent_requests > self.max_concurrent_requests:
+            self.max_concurrent_requests = concurrent_requests
+
+        # Track peak hours. Note: avg_requests_per_second is not updated here; the
+        # caller is expected to keep it current for this classification to be meaningful.
+        hour = timestamp.hour
+        if requests_per_second > self.avg_requests_per_second * 1.5:
+            if hour not in self.peak_hours:
+                self.peak_hours.append(hour)
+        elif requests_per_second < self.avg_requests_per_second * 0.5:
+            if hour not in self.low_hours:
+                self.low_hours.append(hour)
+
+    def calculate_capacity_recommendations(self) -> Dict[str, Any]:
+        """Generate capacity planning recommendations"""
+        recommendations = {
+            "current_capacity": {
+                "peak_rps": self.peak_requests_per_second,
+                "avg_rps": self.avg_requests_per_second,
+                "utilization": self.capacity_utilization
+            },
+            "scaling_recommendations": [],
+            "optimization_opportunities": []
+        }
+
+        # Scaling recommendations
+        if self.capacity_utilization > 80:
+            recommendations["scaling_recommendations"].append({
+                "type": "scale_up",
+                "urgency": "high",
+                "reason": "High capacity utilization detected",
+                "suggested_increase": "50%"
+            })
+        elif self.capacity_utilization < 30:
+            recommendations["scaling_recommendations"].append({
+                "type": "scale_down",
+                "urgency": "low",
+                "reason": "Low capacity utilization, cost optimization opportunity",
+                "suggested_decrease": "25%"
+            })
+
+        # Optimization opportunities
+        if self.queue_overflow_events > 0:
+            recommendations["optimization_opportunities"].append({
+                "type": "queue_optimization",
+                "description": "Queue overflow events detected",
+                "suggestion": "Increase queue size or add load balancing"
+            })
+
+        if self.throughput_consistency == "highly_variable":
+            recommendations["optimization_opportunities"].append({
+                "type": "load_smoothing",
+                "description": "High throughput variability",
+                "suggestion": "Implement request smoothing or auto-scaling"
+            })
+
+        return recommendations
+
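A usage sketch with invented numbers; as noted in `add_throughput_sample`, `avg_requests_per_second` is not derived from the samples, so the caller sets it before asking for recommendations:

```python
# Sketch: feed throughput samples, then request capacity advice.
from datetime import datetime, timedelta, timezone

start = datetime.now(timezone.utc) - timedelta(hours=1)
tp = ThroughputProfile(
    profile_id="tp-1", model_id="example-model", provider="openai",
    measurement_start=start, measurement_end=datetime.now(timezone.utc),
)

for minute, rps in enumerate([8.0, 9.5, 12.0, 11.0]):
    tp.add_throughput_sample(start + timedelta(minutes=minute), rps,
                             tokens_per_second=rps * 400, concurrent_requests=int(rps * 2))

tp.avg_requests_per_second = 10.1            # maintained by the caller
print(tp.capacity_utilization)               # 10.1 / 12.0 * 100 ≈ 84.2
print(tp.calculate_capacity_recommendations()["scaling_recommendations"])  # scale_up advice
```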
+# Utility functions
+
+def create_performance_metrics(
+    model_id: str,
+    provider: str,
+    service_type: str,
+    period_start: datetime,
+    period_end: datetime
+) -> PerformanceMetrics:
+    """Factory function to create performance metrics"""
+    import uuid
+
+    metric_id = f"perf_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}"
+
+    return PerformanceMetrics(
+        metric_id=metric_id,
+        model_id=model_id,
+        provider=provider,
+        service_type=service_type,
+        measurement_period_start=period_start,
+        measurement_period_end=period_end
+    )
+
+def analyze_performance_trend(metrics_list: List[PerformanceMetrics]) -> Dict[str, Any]:
+    """Analyze performance trends across multiple measurement periods"""
+    if not metrics_list:
+        return {"status": "no_data"}
+
+    # Sort by measurement period
+    sorted_metrics = sorted(metrics_list, key=lambda x: x.measurement_period_start)
+
+    # Calculate trends
+    latencies = [m.mean_latency_ms for m in sorted_metrics if m.mean_latency_ms]
+    success_rates = [m.success_rate for m in sorted_metrics]
+    throughputs = [m.requests_per_second for m in sorted_metrics if m.requests_per_second]  # collected but not yet analyzed below
+
+    trends = {
+        "period_count": len(sorted_metrics),
+        "time_range": {
+            "start": sorted_metrics[0].measurement_period_start.isoformat(),
+            "end": sorted_metrics[-1].measurement_period_end.isoformat()
+        },
+        "performance_trend": "stable",
+        "key_changes": []
+    }
+
+    # Analyze latency trend
+    if len(latencies) > 1:
+        latency_change = ((latencies[-1] - latencies[0]) / latencies[0]) * 100
+        if abs(latency_change) > 10:
+            trends["key_changes"].append({
+                "metric": "latency",
+                "change_percent": round(latency_change, 2),
+                "direction": "increased" if latency_change > 0 else "decreased"
+            })
+
+    # Analyze success rate trend
+    if len(success_rates) > 1:
+        success_change = success_rates[-1] - success_rates[0]
+        if abs(success_change) > 5:
+            trends["key_changes"].append({
+                "metric": "success_rate",
+                "change_percent": round(success_change, 2),
+                "direction": "improved" if success_change > 0 else "degraded"
+            })
+
+    # Overall trend assessment
+    if len(trends["key_changes"]) > 2:
+        trends["performance_trend"] = "volatile"
+    elif any(change["metric"] == "latency" and change["direction"] == "increased" for change in trends["key_changes"]):
+        trends["performance_trend"] = "degrading"
+    elif any(change["metric"] == "success_rate" and change["direction"] == "improved" for change in trends["key_changes"]):
+        trends["performance_trend"] = "improving"
+
+    return trends
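Finally, a closing sketch tying the two utilities together: build one metrics object per window with the factory, then ask for the cross-window trend. All values are assumed, and `mean_latency_ms` is set directly here because it normally comes from an upstream aggregation step:

```python
# Sketch: trend analysis across two adjacent one-hour windows.
from datetime import datetime, timedelta, timezone

now = datetime.now(timezone.utc)
windows = []
for i, latency in enumerate([400.0, 900.0]):         # latency regresses by 125%
    m = create_performance_metrics(
        model_id="example-model", provider="openai", service_type="llm",
        period_start=now - timedelta(hours=2 - i), period_end=now - timedelta(hours=1 - i),
    )
    m.successful_requests = m.total_requests = 100
    m.mean_latency_ms = latency                      # normally set by aggregation
    m._calculate_derived_metrics()
    windows.append(m)

print(analyze_performance_trend(windows)["performance_trend"])  # "degrading"
```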