isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +40 -17
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/storage/hf_storage.py +1 -1
  26. isa_model/core/types.py +1 -0
  27. isa_model/deployment/__init__.py +5 -48
  28. isa_model/deployment/core/__init__.py +2 -31
  29. isa_model/deployment/core/deployment_manager.py +1278 -370
  30. isa_model/deployment/local/__init__.py +31 -0
  31. isa_model/deployment/local/config.py +248 -0
  32. isa_model/deployment/local/gpu_gateway.py +607 -0
  33. isa_model/deployment/local/health_checker.py +428 -0
  34. isa_model/deployment/local/provider.py +586 -0
  35. isa_model/deployment/local/tensorrt_service.py +621 -0
  36. isa_model/deployment/local/transformers_service.py +644 -0
  37. isa_model/deployment/local/vllm_service.py +527 -0
  38. isa_model/deployment/modal/__init__.py +8 -0
  39. isa_model/deployment/modal/config.py +136 -0
  40. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  41. isa_model/deployment/modal/services/__init__.py +3 -0
  42. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  43. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  44. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  45. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  46. isa_model/deployment/modal/services/video/__init__.py +1 -0
  47. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  48. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  49. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  50. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  51. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  52. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  53. isa_model/deployment/storage/__init__.py +5 -0
  54. isa_model/deployment/storage/deployment_repository.py +824 -0
  55. isa_model/deployment/triton/__init__.py +10 -0
  56. isa_model/deployment/triton/config.py +196 -0
  57. isa_model/deployment/triton/configs/__init__.py +1 -0
  58. isa_model/deployment/triton/provider.py +512 -0
  59. isa_model/deployment/triton/scripts/__init__.py +1 -0
  60. isa_model/deployment/triton/templates/__init__.py +1 -0
  61. isa_model/inference/__init__.py +47 -1
  62. isa_model/inference/ai_factory.py +137 -10
  63. isa_model/inference/legacy_services/__init__.py +21 -0
  64. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  65. isa_model/inference/legacy_services/model_service.py +573 -0
  66. isa_model/inference/legacy_services/model_serving.py +717 -0
  67. isa_model/inference/legacy_services/model_training.py +561 -0
  68. isa_model/inference/models/__init__.py +21 -0
  69. isa_model/inference/models/inference_config.py +551 -0
  70. isa_model/inference/models/inference_record.py +675 -0
  71. isa_model/inference/models/performance_models.py +714 -0
  72. isa_model/inference/repositories/__init__.py +9 -0
  73. isa_model/inference/repositories/inference_repository.py +828 -0
  74. isa_model/inference/services/audio/base_stt_service.py +184 -11
  75. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  76. isa_model/inference/services/custom_model_manager.py +277 -0
  77. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  78. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  79. isa_model/inference/services/llm/__init__.py +10 -2
  80. isa_model/inference/services/llm/base_llm_service.py +335 -24
  81. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  82. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  83. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  84. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  85. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  86. isa_model/inference/services/llm/local_llm_service.py +747 -0
  87. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  88. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  89. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  90. isa_model/inference/services/vision/__init__.py +22 -1
  91. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  92. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  93. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  94. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  95. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  96. isa_model/serving/api/cache_manager.py +245 -0
  97. isa_model/serving/api/dependencies/__init__.py +1 -0
  98. isa_model/serving/api/dependencies/auth.py +194 -0
  99. isa_model/serving/api/dependencies/database.py +139 -0
  100. isa_model/serving/api/error_handlers.py +284 -0
  101. isa_model/serving/api/fastapi_server.py +172 -22
  102. isa_model/serving/api/middleware/auth.py +8 -2
  103. isa_model/serving/api/middleware/security.py +23 -33
  104. isa_model/serving/api/middleware/tenant_context.py +414 -0
  105. isa_model/serving/api/routes/analytics.py +4 -1
  106. isa_model/serving/api/routes/config.py +645 -0
  107. isa_model/serving/api/routes/deployment_billing.py +315 -0
  108. isa_model/serving/api/routes/deployments.py +138 -2
  109. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  110. isa_model/serving/api/routes/health.py +32 -12
  111. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  112. isa_model/serving/api/routes/local_deployments.py +448 -0
  113. isa_model/serving/api/routes/tenants.py +575 -0
  114. isa_model/serving/api/routes/unified.py +680 -18
  115. isa_model/serving/api/routes/webhooks.py +479 -0
  116. isa_model/serving/api/startup.py +68 -54
  117. isa_model/utils/gpu_utils.py +311 -0
  118. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
  119. isa_model-0.4.3.dist-info/RECORD +193 -0
  120. isa_model/core/storage/minio_storage.py +0 -0
  121. isa_model/deployment/cloud/__init__.py +0 -9
  122. isa_model/deployment/cloud/modal/__init__.py +0 -10
  123. isa_model/deployment/core/deployment_config.py +0 -356
  124. isa_model/deployment/core/isa_deployment_service.py +0 -401
  125. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  126. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  127. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  128. isa_model/deployment/runtime/deployed_service.py +0 -338
  129. isa_model/deployment/services/__init__.py +0 -9
  130. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  131. isa_model/deployment/services/model_service.py +0 -332
  132. isa_model/deployment/services/service_monitor.py +0 -356
  133. isa_model/deployment/services/service_registry.py +0 -527
  134. isa_model/eval/__init__.py +0 -92
  135. isa_model/eval/benchmarks/__init__.py +0 -27
  136. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  137. isa_model/eval/benchmarks.py +0 -701
  138. isa_model/eval/config/__init__.py +0 -10
  139. isa_model/eval/config/evaluation_config.py +0 -108
  140. isa_model/eval/evaluators/__init__.py +0 -24
  141. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  142. isa_model/eval/evaluators/base_evaluator.py +0 -503
  143. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  144. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  145. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  146. isa_model/eval/example_evaluation.py +0 -395
  147. isa_model/eval/factory.py +0 -798
  148. isa_model/eval/infrastructure/__init__.py +0 -24
  149. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  150. isa_model/eval/isa_benchmarks.py +0 -700
  151. isa_model/eval/isa_integration.py +0 -582
  152. isa_model/eval/metrics.py +0 -951
  153. isa_model/eval/tests/unit/test_basic.py +0 -396
  154. isa_model/serving/api/routes/evaluations.py +0 -579
  155. isa_model/training/__init__.py +0 -168
  156. isa_model/training/annotation/annotation_schema.py +0 -47
  157. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  158. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  159. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  160. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  161. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  162. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  163. isa_model/training/annotation/views/annotation_controller.py +0 -158
  164. isa_model/training/cloud/__init__.py +0 -22
  165. isa_model/training/cloud/job_orchestrator.py +0 -402
  166. isa_model/training/cloud/runpod_trainer.py +0 -454
  167. isa_model/training/cloud/storage_manager.py +0 -482
  168. isa_model/training/core/__init__.py +0 -26
  169. isa_model/training/core/config.py +0 -181
  170. isa_model/training/core/dataset.py +0 -222
  171. isa_model/training/core/trainer.py +0 -720
  172. isa_model/training/core/utils.py +0 -213
  173. isa_model/training/examples/intelligent_training_example.py +0 -281
  174. isa_model/training/factory.py +0 -424
  175. isa_model/training/intelligent/__init__.py +0 -25
  176. isa_model/training/intelligent/decision_engine.py +0 -643
  177. isa_model/training/intelligent/intelligent_factory.py +0 -888
  178. isa_model/training/intelligent/knowledge_base.py +0 -751
  179. isa_model/training/intelligent/resource_optimizer.py +0 -839
  180. isa_model/training/intelligent/task_classifier.py +0 -576
  181. isa_model/training/storage/__init__.py +0 -24
  182. isa_model/training/storage/core_integration.py +0 -439
  183. isa_model/training/storage/training_repository.py +0 -552
  184. isa_model/training/storage/training_storage.py +0 -628
  185. isa_model-0.4.0.dist-info/RECORD +0 -182
  186. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  187. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  188. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  189. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  190. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  191. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  192. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  193. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  194. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  195. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  196. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  197. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  198. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  199. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/inference/models/performance_models.py
@@ -0,0 +1,714 @@
+"""
+Performance Models
+
+Specialized models for tracking and analyzing inference performance metrics,
+latency profiles, and throughput characteristics.
+"""
+
+import logging
+import statistics
+import uuid
+from dataclasses import dataclass, field
+from datetime import datetime, timezone, timedelta
+from enum import Enum
+from typing import Dict, List, Optional, Any, Union, Tuple
+
+logger = logging.getLogger(__name__)
+
+
+class PerformanceTier(str, Enum):
+    """Performance tier enumeration"""
+    EXCELLENT = "excellent"
+    GOOD = "good"
+    AVERAGE = "average"
+    POOR = "poor"
+    CRITICAL = "critical"
+
+
+class LatencyCategory(str, Enum):
+    """Latency category enumeration"""
+    ULTRA_LOW = "ultra_low"    # < 100ms
+    LOW = "low"                # 100-500ms
+    MODERATE = "moderate"      # 500ms-2s
+    HIGH = "high"              # 2s-10s
+    VERY_HIGH = "very_high"    # > 10s
+
+
+class ThroughputUnit(str, Enum):
+    """Throughput measurement unit enumeration"""
+    REQUESTS_PER_SECOND = "rps"
+    TOKENS_PER_SECOND = "tps"
+    TOKENS_PER_MINUTE = "tpm"
+    REQUESTS_PER_MINUTE = "rpm"
+
+
+@dataclass
+class PerformanceMetrics:
+    """
+    Comprehensive performance metrics for inference operations
+
+    Tracks detailed performance characteristics including latency, throughput,
+    resource utilization, and quality metrics.
+    """
+    metric_id: str
+    model_id: str
+    provider: str
+    service_type: str
+    measurement_period_start: datetime
+    measurement_period_end: datetime
+
+    # Request volume metrics
+    total_requests: int = 0
+    successful_requests: int = 0
+    failed_requests: int = 0
+    timeout_requests: int = 0
+
+    # Latency metrics (in milliseconds)
+    min_latency_ms: Optional[float] = None
+    max_latency_ms: Optional[float] = None
+    mean_latency_ms: Optional[float] = None
+    median_latency_ms: Optional[float] = None
+    p95_latency_ms: Optional[float] = None
+    p99_latency_ms: Optional[float] = None
+    p999_latency_ms: Optional[float] = None
+    latency_std_dev: Optional[float] = None
+
+    # Throughput metrics
+    requests_per_second: Optional[float] = None
+    tokens_per_second: Optional[float] = None
+    tokens_per_minute: Optional[float] = None
+    peak_rps: Optional[float] = None
+
+    # Token metrics
+    total_input_tokens: int = 0
+    total_output_tokens: int = 0
+    avg_input_tokens: Optional[float] = None
+    avg_output_tokens: Optional[float] = None
+    max_input_tokens: Optional[int] = None
+    max_output_tokens: Optional[int] = None
+
+    # Cost metrics
+    total_cost_usd: float = 0.0
+    cost_per_request: Optional[float] = None
+    cost_per_token: Optional[float] = None
+    cost_per_second: Optional[float] = None
+
+    # Quality metrics
+    success_rate: float = 0.0
+    error_rate: float = 0.0
+    timeout_rate: float = 0.0
+    retry_rate: float = 0.0
+    cache_hits: int = 0
+    cache_hit_rate: float = 0.0
+
+    # Resource utilization (if available)
+    avg_cpu_usage: Optional[float] = None
+    avg_memory_usage: Optional[float] = None
+    avg_gpu_usage: Optional[float] = None
+    peak_memory_mb: Optional[float] = None
+
+    # Queue and concurrency metrics
+    avg_queue_time_ms: Optional[float] = None
+    max_queue_time_ms: Optional[float] = None
+    avg_concurrent_requests: Optional[float] = None
+    max_concurrent_requests: Optional[int] = None
+
+    created_at: Optional[datetime] = None
+
+    def __post_init__(self):
+        if self.created_at is None:
+            self.created_at = datetime.now(timezone.utc)
+
+        # Calculate derived metrics
+        self._calculate_derived_metrics()
+
+    def _calculate_derived_metrics(self):
+        """Calculate derived metrics from base measurements"""
+        if self.total_requests > 0:
+            self.success_rate = (self.successful_requests / self.total_requests) * 100
+            self.error_rate = (self.failed_requests / self.total_requests) * 100
+            self.timeout_rate = (self.timeout_requests / self.total_requests) * 100
+            self.cache_hit_rate = (self.cache_hits / self.total_requests) * 100
+
+            if self.total_cost_usd > 0:
+                self.cost_per_request = self.total_cost_usd / self.total_requests
+
+        if self.successful_requests > 0:
+            self.avg_input_tokens = self.total_input_tokens / self.successful_requests
+            self.avg_output_tokens = self.total_output_tokens / self.successful_requests
+
+        # Calculate period-based metrics
+        period_seconds = (self.measurement_period_end - self.measurement_period_start).total_seconds()
+        if period_seconds > 0:
+            self.requests_per_second = self.total_requests / period_seconds
+
+            total_tokens = self.total_input_tokens + self.total_output_tokens
+            if total_tokens > 0:
+                self.tokens_per_second = total_tokens / period_seconds
+                self.tokens_per_minute = total_tokens / (period_seconds / 60)
+
+                if self.total_cost_usd > 0:
+                    self.cost_per_token = self.total_cost_usd / total_tokens
+                    self.cost_per_second = self.total_cost_usd / period_seconds
+
+    @property
+    def measurement_duration_seconds(self) -> float:
+        """Get measurement period duration in seconds"""
+        return (self.measurement_period_end - self.measurement_period_start).total_seconds()
+
+    @property
+    def latency_category(self) -> str:
+        """Categorize average latency"""
+        if self.mean_latency_ms is None:
+            return "unknown"
+
+        if self.mean_latency_ms < 100:
+            return LatencyCategory.ULTRA_LOW
+        elif self.mean_latency_ms < 500:
+            return LatencyCategory.LOW
+        elif self.mean_latency_ms < 2000:
+            return LatencyCategory.MODERATE
+        elif self.mean_latency_ms < 10000:
+            return LatencyCategory.HIGH
+        else:
+            return LatencyCategory.VERY_HIGH
+
+    @property
+    def performance_tier(self) -> str:
+        """Calculate overall performance tier"""
+        score = 100.0
+
+        # Latency penalty
+        if self.mean_latency_ms:
+            if self.mean_latency_ms > 10000:
+                score -= 40
+            elif self.mean_latency_ms > 5000:
+                score -= 25
+            elif self.mean_latency_ms > 2000:
+                score -= 15
+            elif self.mean_latency_ms > 1000:
+                score -= 5
+
+        # Success rate impact
+        score *= (self.success_rate / 100)
+
+        # Timeout penalty
+        score -= self.timeout_rate * 2
+
+        if score >= 85:
+            return PerformanceTier.EXCELLENT
+        elif score >= 70:
+            return PerformanceTier.GOOD
+        elif score >= 50:
+            return PerformanceTier.AVERAGE
+        elif score >= 25:
+            return PerformanceTier.POOR
+        else:
+            return PerformanceTier.CRITICAL
+
+    @property
+    def efficiency_score(self) -> float:
+        """Calculate efficiency score (performance per cost)"""
+        if not self.cost_per_request:
+            return 0.0
+
+        # Higher score for better performance and lower cost
+        base_score = self.success_rate
+        latency_penalty = (self.mean_latency_ms or 1000) / 1000  # normalize to seconds
+        cost_penalty = self.cost_per_request * 1000  # scale up cost impact
+
+        return max(0, base_score / (latency_penalty * cost_penalty))
+
+    @property
+    def reliability_score(self) -> float:
+        """Calculate reliability score based on error rates"""
+        return max(0, 100 - self.error_rate - (self.timeout_rate * 1.5))
+
+    def add_request_measurement(self, latency_ms: float, success: bool,
+                                input_tokens: int = 0, output_tokens: int = 0,
+                                cost: float = 0.0, cache_hit: bool = False):
+        """Add an individual request measurement to the aggregate metrics"""
+        self.total_requests += 1
+
+        if success:
+            self.successful_requests += 1
+            self.total_input_tokens += input_tokens
+            self.total_output_tokens += output_tokens
+        else:
+            self.failed_requests += 1
+
+        if cache_hit:
+            self.cache_hits += 1
+
+        self.total_cost_usd += cost
+
+        # Update latency statistics; min/max/mean can be maintained in a
+        # streaming fashion, while percentiles require the full sample set
+        if self.min_latency_ms is None or latency_ms < self.min_latency_ms:
+            self.min_latency_ms = latency_ms
+
+        if self.max_latency_ms is None or latency_ms > self.max_latency_ms:
+            self.max_latency_ms = latency_ms
+
+        previous_count = self.total_requests - 1
+        self.mean_latency_ms = (((self.mean_latency_ms or 0.0) * previous_count) + latency_ms) / self.total_requests
+
+        # Recalculate derived metrics
+        self._calculate_derived_metrics()
+
+    def compare_to(self, other: 'PerformanceMetrics') -> Dict[str, Any]:
+        """Compare these metrics against a baseline set of metrics"""
+        comparison = {
+            "baseline_model": other.model_id,
+            "comparison_period": {
+                "our_period": f"{self.measurement_period_start} to {self.measurement_period_end}",
+                "baseline_period": f"{other.measurement_period_start} to {other.measurement_period_end}"
+            },
+            "improvements": {},
+            "regressions": {},
+            "summary": {}
+        }
+
+        # Compare key metrics
+        metrics_to_compare = [
+            ("mean_latency_ms", "lower_is_better"),
+            ("success_rate", "higher_is_better"),
+            ("requests_per_second", "higher_is_better"),
+            ("tokens_per_second", "higher_is_better"),
+            ("cost_per_request", "lower_is_better"),
+            ("cost_per_token", "lower_is_better"),
+            ("error_rate", "lower_is_better")
+        ]
+
+        for metric_name, direction in metrics_to_compare:
+            our_value = getattr(self, metric_name)
+            other_value = getattr(other, metric_name)
+
+            if our_value is not None and other_value is not None and other_value != 0:
+                change_percent = ((our_value - other_value) / other_value) * 100
+
+                is_improvement = (
+                    (direction == "higher_is_better" and change_percent > 0) or
+                    (direction == "lower_is_better" and change_percent < 0)
+                )
+
+                change_data = {
+                    "our_value": our_value,
+                    "baseline_value": other_value,
+                    "change_percent": round(change_percent, 2),
+                    "absolute_change": our_value - other_value
+                }
+
+                if abs(change_percent) > 5:  # significant change threshold
+                    if is_improvement:
+                        comparison["improvements"][metric_name] = change_data
+                    else:
+                        comparison["regressions"][metric_name] = change_data
+
+        # Overall summary
+        comparison["summary"] = {
+            "overall_performance_change": self.performance_tier != other.performance_tier,
+            "our_tier": self.performance_tier,
+            "baseline_tier": other.performance_tier,
+            "improvements_count": len(comparison["improvements"]),
+            "regressions_count": len(comparison["regressions"])
+        }
+
+        return comparison
+
+
+@dataclass
+class LatencyProfile:
+    """
+    Detailed latency profile analysis
+
+    Provides comprehensive latency analysis including distribution,
+    outliers, and temporal patterns.
+    """
+    profile_id: str
+    model_id: str
+    provider: str
+    measurement_start: datetime
+    measurement_end: datetime
+
+    # Distribution data
+    latency_samples: List[float] = field(default_factory=list)
+    sample_count: int = 0
+
+    # Statistical measures
+    min_latency: float = float('inf')
+    max_latency: float = 0.0
+    mean_latency: float = 0.0
+    median_latency: float = 0.0
+    mode_latency: Optional[float] = None
+    std_deviation: float = 0.0
+    variance: float = 0.0
+    skewness: Optional[float] = None
+    kurtosis: Optional[float] = None
+
+    # Percentiles
+    p10: float = 0.0
+    p25: float = 0.0
+    p50: float = 0.0
+    p75: float = 0.0
+    p90: float = 0.0
+    p95: float = 0.0
+    p99: float = 0.0
+    p999: float = 0.0
+
+    # Outlier analysis
+    outlier_threshold_multiplier: float = 3.0
+    outlier_count: int = 0
+    outlier_rate: float = 0.0
+    outliers: List[float] = field(default_factory=list)
+
+    # Temporal patterns
+    hourly_averages: Dict[int, float] = field(default_factory=dict)
+    hourly_counts: Dict[int, int] = field(default_factory=dict)
+    daily_trends: Dict[str, float] = field(default_factory=dict)
+
+    def __post_init__(self):
+        if self.latency_samples:
+            self._calculate_statistics()
+
+    def _calculate_statistics(self):
+        """Calculate comprehensive latency statistics"""
+        if not self.latency_samples:
+            return
+
+        self.sample_count = len(self.latency_samples)
+        sorted_samples = sorted(self.latency_samples)
+
+        # Basic statistics
+        self.min_latency = min(self.latency_samples)
+        self.max_latency = max(self.latency_samples)
+        self.mean_latency = statistics.mean(self.latency_samples)
+        self.median_latency = statistics.median(self.latency_samples)
+
+        if self.sample_count > 1:
+            self.std_deviation = statistics.stdev(self.latency_samples)
+            self.variance = statistics.variance(self.latency_samples)
+
+        # Percentiles
+        n = len(sorted_samples)
+        self.p10 = sorted_samples[int(0.10 * n)]
+        self.p25 = sorted_samples[int(0.25 * n)]
+        self.p50 = sorted_samples[int(0.50 * n)]
+        self.p75 = sorted_samples[int(0.75 * n)]
+        self.p90 = sorted_samples[int(0.90 * n)]
+        self.p95 = sorted_samples[int(0.95 * n)]
+        self.p99 = sorted_samples[int(0.99 * n)] if n > 100 else sorted_samples[-1]
+        self.p999 = sorted_samples[int(0.999 * n)] if n > 1000 else sorted_samples[-1]
+
+        # Outlier detection using the IQR method
+        iqr = self.p75 - self.p25
+        lower_bound = self.p25 - (self.outlier_threshold_multiplier * iqr)
+        upper_bound = self.p75 + (self.outlier_threshold_multiplier * iqr)
+
+        self.outliers = [x for x in self.latency_samples if x < lower_bound or x > upper_bound]
+        self.outlier_count = len(self.outliers)
+        self.outlier_rate = (self.outlier_count / self.sample_count) * 100
+
+    @property
+    def distribution_type(self) -> str:
+        """Classify the latency distribution"""
+        if self.sample_count < 10:
+            return "insufficient_data"
+
+        # Simple heuristics for distribution classification
+        if abs(self.mean_latency - self.median_latency) <= (0.1 * self.std_deviation):
+            return "normal"
+        elif self.mean_latency > self.median_latency:
+            return "right_skewed"
+        else:
+            return "left_skewed"
+
+    @property
+    def stability_score(self) -> float:
+        """Calculate latency stability score (0-100)"""
+        if self.mean_latency == 0:
+            return 100.0
+
+        # Lower coefficient of variation = higher stability
+        cv = self.std_deviation / self.mean_latency
+        base_score = max(0, 100 - (cv * 100))
+
+        # Penalty for outliers
+        outlier_penalty = min(20, self.outlier_rate)
+
+        return max(0, base_score - outlier_penalty)
+
+    @property
+    def consistency_category(self) -> str:
+        """Categorize latency consistency"""
+        stability = self.stability_score
+
+        if stability >= 90:
+            return "very_consistent"
+        elif stability >= 75:
+            return "consistent"
+        elif stability >= 60:
+            return "moderately_consistent"
+        elif stability >= 40:
+            return "inconsistent"
+        else:
+            return "very_inconsistent"
+
+    def add_latency_sample(self, latency_ms: float, timestamp: Optional[datetime] = None):
+        """Add a latency sample to the profile"""
+        self.latency_samples.append(latency_ms)
+
+        if timestamp:
+            # Track hourly patterns with a per-hour running average
+            hour = timestamp.hour
+            count = self.hourly_counts.get(hour, 0)
+            current = self.hourly_averages.get(hour, 0.0)
+            self.hourly_averages[hour] = ((current * count) + latency_ms) / (count + 1)
+            self.hourly_counts[hour] = count + 1
+
+        # Recalculate every 100 samples to amortize the cost of full statistics
+        if len(self.latency_samples) % 100 == 0:
+            self._calculate_statistics()
+
+    def get_latency_bands(self) -> Dict[str, Dict[str, Any]]:
+        """Get latency distribution in bands"""
+        if not self.latency_samples:
+            return {}
+
+        bands = {
+            "ultra_fast": {"range": "< 100ms", "count": 0, "percentage": 0},
+            "fast": {"range": "100-500ms", "count": 0, "percentage": 0},
+            "moderate": {"range": "500ms-2s", "count": 0, "percentage": 0},
+            "slow": {"range": "2s-10s", "count": 0, "percentage": 0},
+            "very_slow": {"range": "> 10s", "count": 0, "percentage": 0}
+        }
+
+        for latency in self.latency_samples:
+            if latency < 100:
+                bands["ultra_fast"]["count"] += 1
+            elif latency < 500:
+                bands["fast"]["count"] += 1
+            elif latency < 2000:
+                bands["moderate"]["count"] += 1
+            elif latency < 10000:
+                bands["slow"]["count"] += 1
+            else:
+                bands["very_slow"]["count"] += 1
+
+        # Calculate percentages
+        total = len(self.latency_samples)
+        for band in bands.values():
+            band["percentage"] = (band["count"] / total) * 100
+
+        return bands
+
+
+@dataclass
+class ThroughputProfile:
+    """
+    Throughput analysis and capacity planning
+
+    Analyzes request and token throughput patterns for capacity planning
+    and performance optimization.
+    """
+    profile_id: str
+    model_id: str
+    provider: str
+    measurement_start: datetime
+    measurement_end: datetime
+
+    # Request throughput
+    peak_requests_per_second: float = 0.0
+    avg_requests_per_second: float = 0.0
+    min_requests_per_second: float = 0.0
+
+    # Token throughput
+    peak_tokens_per_second: float = 0.0
+    avg_tokens_per_second: float = 0.0
+    min_tokens_per_second: float = 0.0
+
+    # Capacity metrics
+    max_concurrent_requests: int = 0
+    avg_concurrent_requests: float = 0.0
+    queue_overflow_events: int = 0
+    throttling_events: int = 0
+
+    # Temporal patterns
+    throughput_samples: List[Tuple[datetime, float, float]] = field(default_factory=list)  # (timestamp, rps, tps)
+    peak_hours: List[int] = field(default_factory=list)
+    low_hours: List[int] = field(default_factory=list)
+
+    # Efficiency metrics
+    tokens_per_request_ratio: float = 0.0
+    processing_efficiency: float = 0.0  # actual vs. theoretical max throughput
+
+    @property
+    def measurement_duration_hours(self) -> float:
+        """Get measurement duration in hours"""
+        return (self.measurement_end - self.measurement_start).total_seconds() / 3600
+
+    @property
+    def capacity_utilization(self) -> float:
+        """Calculate capacity utilization percentage"""
+        if self.peak_requests_per_second == 0:
+            return 0.0
+        return (self.avg_requests_per_second / self.peak_requests_per_second) * 100
+
+    @property
+    def throughput_consistency(self) -> str:
+        """Analyze throughput consistency"""
+        rps_values = [sample[1] for sample in self.throughput_samples]
+        if len(rps_values) < 2:
+            return "unknown"
+
+        mean_rps = statistics.mean(rps_values)
+        cv = statistics.stdev(rps_values) / mean_rps if mean_rps > 0 else 0
+
+        if cv < 0.1:
+            return "very_stable"
+        elif cv < 0.3:
+            return "stable"
+        elif cv < 0.5:
+            return "variable"
+        else:
+            return "highly_variable"
+
+    @property
+    def performance_headroom(self) -> float:
+        """Calculate available performance headroom"""
+        return max(0, self.peak_requests_per_second - self.avg_requests_per_second)
+
+    def add_throughput_sample(self, timestamp: datetime, requests_per_second: float,
+                              tokens_per_second: float, concurrent_requests: int = 0):
+        """Add a throughput measurement sample"""
+        self.throughput_samples.append((timestamp, requests_per_second, tokens_per_second))
+
+        # Update peak values
+        if requests_per_second > self.peak_requests_per_second:
+            self.peak_requests_per_second = requests_per_second
+
+        if tokens_per_second > self.peak_tokens_per_second:
+            self.peak_tokens_per_second = tokens_per_second
+
+        if concurrent_requests > self.max_concurrent_requests:
+            self.max_concurrent_requests = concurrent_requests
+
+        # Keep running averages and minimums in sync with the samples
+        rps_values = [sample[1] for sample in self.throughput_samples]
+        tps_values = [sample[2] for sample in self.throughput_samples]
+        self.avg_requests_per_second = statistics.mean(rps_values)
+        self.avg_tokens_per_second = statistics.mean(tps_values)
+        self.min_requests_per_second = min(rps_values)
+        self.min_tokens_per_second = min(tps_values)
+
+        # Track hours that run well above or below the running average
+        hour = timestamp.hour
+        if requests_per_second > self.avg_requests_per_second * 1.5:
+            if hour not in self.peak_hours:
+                self.peak_hours.append(hour)
+        elif requests_per_second < self.avg_requests_per_second * 0.5:
+            if hour not in self.low_hours:
+                self.low_hours.append(hour)
+
+    def calculate_capacity_recommendations(self) -> Dict[str, Any]:
+        """Generate capacity planning recommendations"""
+        recommendations = {
+            "current_capacity": {
+                "peak_rps": self.peak_requests_per_second,
+                "avg_rps": self.avg_requests_per_second,
+                "utilization": self.capacity_utilization
+            },
+            "scaling_recommendations": [],
+            "optimization_opportunities": []
+        }
+
+        # Scaling recommendations
+        if self.capacity_utilization > 80:
+            recommendations["scaling_recommendations"].append({
+                "type": "scale_up",
+                "urgency": "high",
+                "reason": "High capacity utilization detected",
+                "suggested_increase": "50%"
+            })
+        elif self.capacity_utilization < 30:
+            recommendations["scaling_recommendations"].append({
+                "type": "scale_down",
+                "urgency": "low",
+                "reason": "Low capacity utilization, cost optimization opportunity",
+                "suggested_decrease": "25%"
+            })
+
+        # Optimization opportunities
+        if self.queue_overflow_events > 0:
+            recommendations["optimization_opportunities"].append({
+                "type": "queue_optimization",
+                "description": "Queue overflow events detected",
+                "suggestion": "Increase queue size or add load balancing"
+            })
+
+        if self.throughput_consistency == "highly_variable":
+            recommendations["optimization_opportunities"].append({
+                "type": "load_smoothing",
+                "description": "High throughput variability",
+                "suggestion": "Implement request smoothing or auto-scaling"
+            })
+
+        return recommendations
+
+
+# Utility functions
+
+def create_performance_metrics(
+    model_id: str,
+    provider: str,
+    service_type: str,
+    period_start: datetime,
+    period_end: datetime
+) -> PerformanceMetrics:
+    """Factory function to create performance metrics"""
+    metric_id = f"perf_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}"
+
+    return PerformanceMetrics(
+        metric_id=metric_id,
+        model_id=model_id,
+        provider=provider,
+        service_type=service_type,
+        measurement_period_start=period_start,
+        measurement_period_end=period_end
+    )
+
+
+def analyze_performance_trend(metrics_list: List[PerformanceMetrics]) -> Dict[str, Any]:
+    """Analyze performance trends across multiple measurement periods"""
+    if not metrics_list:
+        return {"status": "no_data"}
+
+    # Sort by measurement period
+    sorted_metrics = sorted(metrics_list, key=lambda x: x.measurement_period_start)
+
+    # Calculate trends
+    latencies = [m.mean_latency_ms for m in sorted_metrics if m.mean_latency_ms]
+    success_rates = [m.success_rate for m in sorted_metrics]
+    throughputs = [m.requests_per_second for m in sorted_metrics if m.requests_per_second]
+
+    trends = {
+        "period_count": len(sorted_metrics),
+        "time_range": {
+            "start": sorted_metrics[0].measurement_period_start.isoformat(),
+            "end": sorted_metrics[-1].measurement_period_end.isoformat()
+        },
+        "performance_trend": "stable",
+        "key_changes": []
+    }
+
+    # Analyze latency trend
+    if len(latencies) > 1:
+        latency_change = ((latencies[-1] - latencies[0]) / latencies[0]) * 100
+        if abs(latency_change) > 10:
+            trends["key_changes"].append({
+                "metric": "latency",
+                "change_percent": round(latency_change, 2),
+                "direction": "increased" if latency_change > 0 else "decreased"
+            })
+
+    # Analyze success rate trend
+    if len(success_rates) > 1:
+        success_change = success_rates[-1] - success_rates[0]
+        if abs(success_change) > 5:
+            trends["key_changes"].append({
+                "metric": "success_rate",
+                "change_percent": round(success_change, 2),
+                "direction": "improved" if success_change > 0 else "degraded"
+            })
+
+    # Overall trend assessment
+    if len(trends["key_changes"]) > 2:
+        trends["performance_trend"] = "volatile"
+    elif any(change["metric"] == "latency" and change["direction"] == "increased" for change in trends["key_changes"]):
+        trends["performance_trend"] = "degrading"
+    elif any(change["metric"] == "success_rate" and change["direction"] == "improved" for change in trends["key_changes"]):
+        trends["performance_trend"] = "improving"
+
+    return trends
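
For orientation, the sketch below shows how the new models compose. It is a hypothetical example (the IDs and numbers are invented), assuming the module is importable as isa_model.inference.models.performance_models, matching the path in the file list above.

    from datetime import datetime, timedelta, timezone

    from isa_model.inference.models.performance_models import (
        LatencyProfile,
        analyze_performance_trend,
        create_performance_metrics,
    )

    now = datetime.now(timezone.utc)
    metrics = create_performance_metrics(
        model_id="example-llm",      # hypothetical model id
        provider="openai",
        service_type="llm",
        period_start=now - timedelta(hours=1),
        period_end=now,
    )

    # Derived rates (success_rate, requests_per_second, ...) are recalculated
    # after every measurement.
    metrics.add_request_measurement(latency_ms=240.0, success=True,
                                    input_tokens=120, output_tokens=80, cost=0.0004)
    metrics.add_request_measurement(latency_ms=980.0, success=False)

    print(metrics.success_rate)      # 50.0
    print(metrics.latency_category)  # moderate (running mean is 610 ms)
    print(metrics.performance_tier)  # average (score halved by the 50% success rate)

    # Distribution analysis keeps the raw samples, so percentiles and outlier
    # detection stay exact.
    profile = LatencyProfile(
        profile_id="lp-001",
        model_id="example-llm",
        provider="openai",
        measurement_start=now - timedelta(hours=1),
        measurement_end=now,
        latency_samples=[240.0, 980.0, 310.0, 295.0],
    )
    print(profile.get_latency_bands())  # per-band counts and percentages

    # Trend analysis takes one PerformanceMetrics per measurement period.
    print(analyze_performance_trend([metrics]))

Note the division of labor: PerformanceMetrics keeps only streaming-friendly latency aggregates (min/max/running mean), while LatencyProfile retains the full sample list for percentile and outlier analysis.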