isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199) hide show
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +40 -17
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/storage/hf_storage.py +1 -1
  26. isa_model/core/types.py +1 -0
  27. isa_model/deployment/__init__.py +5 -48
  28. isa_model/deployment/core/__init__.py +2 -31
  29. isa_model/deployment/core/deployment_manager.py +1278 -370
  30. isa_model/deployment/local/__init__.py +31 -0
  31. isa_model/deployment/local/config.py +248 -0
  32. isa_model/deployment/local/gpu_gateway.py +607 -0
  33. isa_model/deployment/local/health_checker.py +428 -0
  34. isa_model/deployment/local/provider.py +586 -0
  35. isa_model/deployment/local/tensorrt_service.py +621 -0
  36. isa_model/deployment/local/transformers_service.py +644 -0
  37. isa_model/deployment/local/vllm_service.py +527 -0
  38. isa_model/deployment/modal/__init__.py +8 -0
  39. isa_model/deployment/modal/config.py +136 -0
  40. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  41. isa_model/deployment/modal/services/__init__.py +3 -0
  42. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  43. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  44. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  45. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  46. isa_model/deployment/modal/services/video/__init__.py +1 -0
  47. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  48. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  49. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  50. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  51. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  52. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  53. isa_model/deployment/storage/__init__.py +5 -0
  54. isa_model/deployment/storage/deployment_repository.py +824 -0
  55. isa_model/deployment/triton/__init__.py +10 -0
  56. isa_model/deployment/triton/config.py +196 -0
  57. isa_model/deployment/triton/configs/__init__.py +1 -0
  58. isa_model/deployment/triton/provider.py +512 -0
  59. isa_model/deployment/triton/scripts/__init__.py +1 -0
  60. isa_model/deployment/triton/templates/__init__.py +1 -0
  61. isa_model/inference/__init__.py +47 -1
  62. isa_model/inference/ai_factory.py +137 -10
  63. isa_model/inference/legacy_services/__init__.py +21 -0
  64. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  65. isa_model/inference/legacy_services/model_service.py +573 -0
  66. isa_model/inference/legacy_services/model_serving.py +717 -0
  67. isa_model/inference/legacy_services/model_training.py +561 -0
  68. isa_model/inference/models/__init__.py +21 -0
  69. isa_model/inference/models/inference_config.py +551 -0
  70. isa_model/inference/models/inference_record.py +675 -0
  71. isa_model/inference/models/performance_models.py +714 -0
  72. isa_model/inference/repositories/__init__.py +9 -0
  73. isa_model/inference/repositories/inference_repository.py +828 -0
  74. isa_model/inference/services/audio/base_stt_service.py +184 -11
  75. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  76. isa_model/inference/services/custom_model_manager.py +277 -0
  77. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  78. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  79. isa_model/inference/services/llm/__init__.py +10 -2
  80. isa_model/inference/services/llm/base_llm_service.py +335 -24
  81. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  82. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  83. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  84. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  85. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  86. isa_model/inference/services/llm/local_llm_service.py +747 -0
  87. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  88. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  89. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  90. isa_model/inference/services/vision/__init__.py +22 -1
  91. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  92. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  93. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  94. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  95. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  96. isa_model/serving/api/cache_manager.py +245 -0
  97. isa_model/serving/api/dependencies/__init__.py +1 -0
  98. isa_model/serving/api/dependencies/auth.py +194 -0
  99. isa_model/serving/api/dependencies/database.py +139 -0
  100. isa_model/serving/api/error_handlers.py +284 -0
  101. isa_model/serving/api/fastapi_server.py +172 -22
  102. isa_model/serving/api/middleware/auth.py +8 -2
  103. isa_model/serving/api/middleware/security.py +23 -33
  104. isa_model/serving/api/middleware/tenant_context.py +414 -0
  105. isa_model/serving/api/routes/analytics.py +4 -1
  106. isa_model/serving/api/routes/config.py +645 -0
  107. isa_model/serving/api/routes/deployment_billing.py +315 -0
  108. isa_model/serving/api/routes/deployments.py +138 -2
  109. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  110. isa_model/serving/api/routes/health.py +32 -12
  111. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  112. isa_model/serving/api/routes/local_deployments.py +448 -0
  113. isa_model/serving/api/routes/tenants.py +575 -0
  114. isa_model/serving/api/routes/unified.py +680 -18
  115. isa_model/serving/api/routes/webhooks.py +479 -0
  116. isa_model/serving/api/startup.py +68 -54
  117. isa_model/utils/gpu_utils.py +311 -0
  118. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
  119. isa_model-0.4.3.dist-info/RECORD +193 -0
  120. isa_model/core/storage/minio_storage.py +0 -0
  121. isa_model/deployment/cloud/__init__.py +0 -9
  122. isa_model/deployment/cloud/modal/__init__.py +0 -10
  123. isa_model/deployment/core/deployment_config.py +0 -356
  124. isa_model/deployment/core/isa_deployment_service.py +0 -401
  125. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  126. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  127. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  128. isa_model/deployment/runtime/deployed_service.py +0 -338
  129. isa_model/deployment/services/__init__.py +0 -9
  130. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  131. isa_model/deployment/services/model_service.py +0 -332
  132. isa_model/deployment/services/service_monitor.py +0 -356
  133. isa_model/deployment/services/service_registry.py +0 -527
  134. isa_model/eval/__init__.py +0 -92
  135. isa_model/eval/benchmarks/__init__.py +0 -27
  136. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  137. isa_model/eval/benchmarks.py +0 -701
  138. isa_model/eval/config/__init__.py +0 -10
  139. isa_model/eval/config/evaluation_config.py +0 -108
  140. isa_model/eval/evaluators/__init__.py +0 -24
  141. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  142. isa_model/eval/evaluators/base_evaluator.py +0 -503
  143. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  144. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  145. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  146. isa_model/eval/example_evaluation.py +0 -395
  147. isa_model/eval/factory.py +0 -798
  148. isa_model/eval/infrastructure/__init__.py +0 -24
  149. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  150. isa_model/eval/isa_benchmarks.py +0 -700
  151. isa_model/eval/isa_integration.py +0 -582
  152. isa_model/eval/metrics.py +0 -951
  153. isa_model/eval/tests/unit/test_basic.py +0 -396
  154. isa_model/serving/api/routes/evaluations.py +0 -579
  155. isa_model/training/__init__.py +0 -168
  156. isa_model/training/annotation/annotation_schema.py +0 -47
  157. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  158. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  159. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  160. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  161. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  162. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  163. isa_model/training/annotation/views/annotation_controller.py +0 -158
  164. isa_model/training/cloud/__init__.py +0 -22
  165. isa_model/training/cloud/job_orchestrator.py +0 -402
  166. isa_model/training/cloud/runpod_trainer.py +0 -454
  167. isa_model/training/cloud/storage_manager.py +0 -482
  168. isa_model/training/core/__init__.py +0 -26
  169. isa_model/training/core/config.py +0 -181
  170. isa_model/training/core/dataset.py +0 -222
  171. isa_model/training/core/trainer.py +0 -720
  172. isa_model/training/core/utils.py +0 -213
  173. isa_model/training/examples/intelligent_training_example.py +0 -281
  174. isa_model/training/factory.py +0 -424
  175. isa_model/training/intelligent/__init__.py +0 -25
  176. isa_model/training/intelligent/decision_engine.py +0 -643
  177. isa_model/training/intelligent/intelligent_factory.py +0 -888
  178. isa_model/training/intelligent/knowledge_base.py +0 -751
  179. isa_model/training/intelligent/resource_optimizer.py +0 -839
  180. isa_model/training/intelligent/task_classifier.py +0 -576
  181. isa_model/training/storage/__init__.py +0 -24
  182. isa_model/training/storage/core_integration.py +0 -439
  183. isa_model/training/storage/training_repository.py +0 -552
  184. isa_model/training/storage/training_storage.py +0 -628
  185. isa_model-0.4.0.dist-info/RECORD +0 -182
  186. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  187. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  188. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  189. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  190. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  191. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  192. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  193. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  194. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  195. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  196. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  197. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  198. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  199. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,675 @@
1
+ """
2
+ Inference Record Models
3
+
4
+ Core data models for inference requests, usage statistics, and model snapshots,
5
+ extracted from repository layer to follow the standard ISA Model architecture pattern.
6
+ """
7
+
8
+ import logging
9
+ import hashlib
10
+ from datetime import datetime, timezone, timedelta
11
+ from typing import Dict, List, Optional, Any, Union
12
+ from dataclasses import dataclass, field
13
+ from enum import Enum
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
class InferenceStatus(str, Enum):
    """States an inference request can be in.

    Every member is also a ``str``, so a member compares equal to its raw
    value (``InferenceStatus.PENDING == "pending"``).
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    TIMEOUT = "timeout"
    CANCELLED = "cancelled"
    QUEUED = "queued"
    RETRYING = "retrying"
27
+
28
class ServiceType(str, Enum):
    """Kinds of inference service a request can target.

    Every member is also a ``str`` and compares equal to its raw value.
    """

    LLM = "llm"
    VISION = "vision"
    EMBEDDING = "embedding"
    TTS = "tts"
    STT = "stt"
    IMAGE_GEN = "image_gen"
    AUDIO = "audio"
    RERANK = "rerank"
    OCR = "ocr"
    TRANSLATION = "translation"
    SUMMARIZATION = "summarization"
    CLASSIFICATION = "classification"
42
+
43
class ErrorCategory(str, Enum):
    """Coarse classification of why an inference request failed.

    Every member is also a ``str`` and compares equal to its raw value.
    """

    TIMEOUT = "timeout"
    RATE_LIMIT = "rate_limit"
    AUTHENTICATION = "authentication"
    VALIDATION = "validation"
    MODEL_ERROR = "model_error"
    NETWORK_ERROR = "network_error"
    SERVER_ERROR = "server_error"
    QUOTA_EXCEEDED = "quota_exceeded"
    UNKNOWN = "unknown"
54
+
55
@dataclass
class InferenceRequest:
    """
    Core inference request record

    Holds one inference request together with its input, output, timing,
    token/cost figures and caller identification. Status transitions go
    through ``update_status`` (or the ``complete_request`` / ``fail_request``
    / ``increment_retry`` wrappers), which also maintains the timestamps and
    the derived queue/execution durations.
    """
    request_id: str
    service_type: str
    model_id: str
    provider: str
    endpoint: str
    request_data: Dict[str, Any]
    status: str = InferenceStatus.PENDING
    created_at: Optional[datetime] = None  # defaults to "now" (UTC) in __post_init__
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    user_id: Optional[str] = None
    session_id: Optional[str] = None
    client_id: Optional[str] = None
    ip_address: Optional[str] = None
    user_agent: Optional[str] = None
    response_data: Optional[Dict[str, Any]] = None
    error_message: Optional[str] = None
    error_category: Optional[str] = None
    execution_time_ms: Optional[int] = None
    queue_time_ms: Optional[int] = None
    tokens_used: Optional[int] = None
    input_tokens: Optional[int] = None
    output_tokens: Optional[int] = None
    cost_usd: Optional[float] = None
    request_size_bytes: Optional[int] = None
    response_size_bytes: Optional[int] = None
    cache_hit: bool = False
    retry_count: int = 0
    priority: int = 5  # 1-10 scale
    request_hash: Optional[str] = None
    response_hash: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None  # replaced by {} in __post_init__ when None

    def __post_init__(self):
        """Fill in the creation time, the metadata dict and the request hash."""
        if self.created_at is None:
            self.created_at = datetime.now(timezone.utc)
        if self.metadata is None:
            self.metadata = {}

        # Generate request hash for deduplication; an empty payload is left
        # unhashed and a caller-supplied hash is never overwritten.
        if self.request_hash is None and self.request_data:
            self.request_hash = self._generate_content_hash(self.request_data)

    def _generate_content_hash(self, content: Any) -> str:
        """Return the first 16 hex characters of a SHA-256 digest of *content*.

        The content is serialised as key-sorted JSON so that equal dicts hash
        equally regardless of key order. Content that cannot be serialised
        as JSON is hashed via ``str(content)`` instead.
        """
        import json
        try:
            content_str = json.dumps(content, sort_keys=True, ensure_ascii=True)
        except Exception:
            # Deliberate best-effort fallback: hashing must never fail the request.
            content_str = str(content)
        return hashlib.sha256(content_str.encode()).hexdigest()[:16]

    def _apply_field_updates(self, updates: Dict[str, Any]) -> None:
        """Assign the entries of *updates* whose key names a dataclass field.

        Keys that are not fields are ignored. This covers unknown names and
        also read-only properties and methods, which a plain ``hasattr`` check
        would accept and then either raise on or silently overwrite.
        """
        from dataclasses import fields
        field_names = {f.name for f in fields(self)}
        for key, value in updates.items():
            if key in field_names:
                setattr(self, key, value)

    @property
    def is_active(self) -> bool:
        """Check if request is in a not-yet-finished state"""
        return self.status in [InferenceStatus.PENDING, InferenceStatus.PROCESSING,
                               InferenceStatus.QUEUED, InferenceStatus.RETRYING]

    @property
    def is_completed(self) -> bool:
        """Check if request reached a terminal state (successfully or not)"""
        return self.status in [InferenceStatus.COMPLETED, InferenceStatus.FAILED,
                               InferenceStatus.TIMEOUT, InferenceStatus.CANCELLED]

    @property
    def was_successful(self) -> bool:
        """Check if request completed successfully"""
        return self.status == InferenceStatus.COMPLETED

    @property
    def total_duration_ms(self) -> Optional[int]:
        """Milliseconds from creation to completion, or None if not completed"""
        if self.created_at and self.completed_at:
            return int((self.completed_at - self.created_at).total_seconds() * 1000)
        return None

    @property
    def total_tokens(self) -> Optional[int]:
        """Input + output tokens when both are known, otherwise ``tokens_used``"""
        if self.input_tokens is not None and self.output_tokens is not None:
            return self.input_tokens + self.output_tokens
        return self.tokens_used

    @property
    def cost_per_token(self) -> Optional[float]:
        """Cost in USD per token, or None when cost or token count is missing/zero"""
        total = self.total_tokens
        if self.cost_usd and total and total > 0:
            return self.cost_usd / total
        return None

    @property
    def throughput_tokens_per_second(self) -> Optional[float]:
        """Tokens per second of execution time, or None when either is missing/zero"""
        total = self.total_tokens
        if total and self.execution_time_ms and self.execution_time_ms > 0:
            return (total * 1000) / self.execution_time_ms
        return None

    def update_status(self, new_status: str, error_message: Optional[str] = None,
                      error_category: Optional[str] = None):
        """Update request status with timestamp tracking

        Args:
            new_status: Status to move to.
            error_message: Stored on the record when truthy.
            error_category: Stored on the record when truthy.
        """
        old_status = self.status
        self.status = new_status

        now = datetime.now(timezone.utc)

        if new_status == InferenceStatus.PROCESSING and old_status in [InferenceStatus.PENDING,
                                                                       InferenceStatus.QUEUED]:
            # First pickup from the queue: stamp the start and derive queue time.
            self.started_at = now
            if self.created_at:
                self.queue_time_ms = int((now - self.created_at).total_seconds() * 1000)

        elif self.is_completed:
            # Terminal state: keep an already-recorded completion time.
            if not self.completed_at:
                self.completed_at = now

            if self.started_at:
                self.execution_time_ms = int((self.completed_at - self.started_at).total_seconds() * 1000)

        if error_message:
            self.error_message = error_message
        if error_category:
            self.error_category = error_category

        logger.debug(f"Request {self.request_id} status: {old_status} -> {new_status}")

    def complete_request(self, response_data: Dict[str, Any], tokens_used: Optional[int] = None,
                         cost_usd: Optional[float] = None, **kwargs):
        """Mark request as completed with response data

        Args:
            response_data: Response payload; also hashed into ``response_hash``.
            tokens_used: Token count to record; ``None`` leaves the current value.
            cost_usd: Cost to record; ``None`` leaves the current value.
            **kwargs: Extra values assigned to dataclass fields of the same
                name; keys that are not fields are ignored.
        """
        self.response_data = response_data
        self.response_hash = self._generate_content_hash(response_data)

        # Compare against None so that a genuine 0 / 0.0 is still recorded.
        if tokens_used is not None:
            self.tokens_used = tokens_used
        if cost_usd is not None:
            self.cost_usd = cost_usd

        # Update any additional metrics
        self._apply_field_updates(kwargs)

        self.update_status(InferenceStatus.COMPLETED)

    def fail_request(self, error_message: str, error_category: str = ErrorCategory.UNKNOWN,
                     **kwargs):
        """Mark request as failed with error details

        Args:
            error_message: Description of the failure.
            error_category: Failure classification; defaults to ``ErrorCategory.UNKNOWN``.
            **kwargs: Extra values assigned to dataclass fields of the same
                name; keys that are not fields are ignored.
        """
        self.error_message = error_message
        self.error_category = error_category

        # Update any additional error metrics
        self._apply_field_updates(kwargs)

        self.update_status(InferenceStatus.FAILED, error_message, error_category)

    def increment_retry(self):
        """Increment retry count and move the request to the retrying status"""
        self.retry_count += 1
        self.update_status(InferenceStatus.RETRYING)

    def add_metadata(self, key: str, value: Any):
        """Add (or overwrite) a metadata entry"""
        self.metadata[key] = value

    def get_metadata(self, key: str, default: Any = None) -> Any:
        """Get a metadata entry, or *default* when the key is absent"""
        return self.metadata.get(key, default)
233
+
234
@dataclass
class UsageStatistics:
    """
    Aggregated usage statistics for analytics and billing

    Holds request, token and cost totals for one time period and service
    type (optionally narrowed to a model, provider, user or client). The
    rate, per-request and per-hour figures are derived from those totals by
    ``_calculate_derived_metrics``, which runs on construction and after
    every ``add_request_data`` call.
    """
    stat_id: str
    period_start: datetime
    period_end: datetime
    service_type: str
    model_id: Optional[str] = None
    provider: Optional[str] = None
    user_id: Optional[str] = None
    client_id: Optional[str] = None
    total_requests: int = 0
    successful_requests: int = 0
    failed_requests: int = 0
    timeout_requests: int = 0
    retry_requests: int = 0
    cache_hits: int = 0
    total_tokens: int = 0
    input_tokens: int = 0
    output_tokens: int = 0
    total_cost_usd: float = 0.0
    avg_response_time_ms: float = 0.0
    p50_response_time_ms: float = 0.0
    p95_response_time_ms: float = 0.0
    p99_response_time_ms: float = 0.0
    avg_queue_time_ms: float = 0.0
    requests_per_hour: float = 0.0
    tokens_per_hour: float = 0.0
    error_rate: float = 0.0
    timeout_rate: float = 0.0
    cache_hit_rate: float = 0.0
    avg_tokens_per_request: float = 0.0
    cost_per_token: float = 0.0
    cost_per_request: float = 0.0
    throughput_tokens_per_second: float = 0.0
    created_at: Optional[datetime] = None  # defaults to "now" (UTC) in __post_init__

    def __post_init__(self):
        """Stamp the creation time and compute the derived metrics."""
        if self.created_at is None:
            self.created_at = datetime.now(timezone.utc)

        # Calculate derived metrics
        self._calculate_derived_metrics()

    def _calculate_derived_metrics(self):
        """Calculate derived metrics from base counts

        A metric is only overwritten when its denominator is positive;
        otherwise its current value is left untouched.
        """
        # Rates are percentages of all requests
        if self.total_requests > 0:
            self.error_rate = (self.failed_requests / self.total_requests) * 100
            self.timeout_rate = (self.timeout_requests / self.total_requests) * 100
            self.cache_hit_rate = (self.cache_hits / self.total_requests) * 100
            self.cost_per_request = self.total_cost_usd / self.total_requests

        # Token metrics
        if self.total_tokens > 0:
            self.cost_per_token = self.total_cost_usd / self.total_tokens

        if self.successful_requests > 0:
            self.avg_tokens_per_request = self.total_tokens / self.successful_requests

        # Time-based metrics
        period_hours = (self.period_end - self.period_start).total_seconds() / 3600
        if period_hours > 0:
            self.requests_per_hour = self.total_requests / period_hours
            self.tokens_per_hour = self.total_tokens / period_hours

        # Throughput: average tokens per request over average response time
        if self.avg_response_time_ms > 0:
            self.throughput_tokens_per_second = (self.avg_tokens_per_request * 1000) / self.avg_response_time_ms

    @property
    def success_rate(self) -> float:
        """Calculate success rate percentage

        NOTE: defined as ``100 - error_rate``; timeouts are not subtracted here.
        """
        return 100.0 - self.error_rate

    @property
    def period_duration_hours(self) -> float:
        """Get period duration in hours"""
        return (self.period_end - self.period_start).total_seconds() / 3600

    @property
    def efficiency_score(self) -> float:
        """Calculate efficiency score (0-100) based on performance metrics"""
        score = 100.0

        # Penalty for high error rates
        score -= self.error_rate

        # Penalty for high timeout rates
        score -= self.timeout_rate * 2  # Timeouts are worse than regular errors

        # Bonus for cache hits
        score += self.cache_hit_rate * 0.1

        # Penalty for slow responses
        if self.avg_response_time_ms > 5000:  # 5+ seconds
            score -= 20
        elif self.avg_response_time_ms > 2000:  # 2+ seconds
            score -= 10
        elif self.avg_response_time_ms > 1000:  # 1+ seconds
            score -= 5

        return max(0.0, min(100.0, score))

    @property
    def performance_tier(self) -> str:
        """Get performance tier classification derived from ``efficiency_score``"""
        efficiency = self.efficiency_score

        if efficiency >= 90:
            return "excellent"
        elif efficiency >= 75:
            return "good"
        elif efficiency >= 60:
            return "average"
        elif efficiency >= 40:
            return "poor"
        else:
            return "critical"

    def add_request_data(self, request: 'InferenceRequest'):
        """Add data from an individual request to the statistics

        Tokens and cost are only accumulated for successful requests.
        Failed and timed-out requests increment their own counters, and any
        other status only increments ``total_requests``.
        """
        self.total_requests += 1

        if request.was_successful:
            self.successful_requests += 1

            if request.total_tokens:
                self.total_tokens += request.total_tokens
            if request.input_tokens:
                self.input_tokens += request.input_tokens
            if request.output_tokens:
                self.output_tokens += request.output_tokens
            if request.cost_usd:
                self.total_cost_usd += request.cost_usd

        elif request.status == InferenceStatus.FAILED:
            self.failed_requests += 1
        elif request.status == InferenceStatus.TIMEOUT:
            self.timeout_requests += 1

        if request.retry_count > 0:
            self.retry_requests += 1

        if request.cache_hit:
            self.cache_hits += 1

        # Recalculate derived metrics
        self._calculate_derived_metrics()

    def merge_with(self, other: 'UsageStatistics') -> 'UsageStatistics':
        """Merge this statistics with another to create combined stats

        Counts, tokens and cost are summed and the period spans both inputs.
        Average response and queue times are weighted by each side's request
        count. Percentile response times cannot be derived from two
        summaries and stay at their defaults.

        ``service_type`` becomes ``"combined"`` when the two sides differ.
        ``model_id``, ``provider``, ``user_id`` and ``client_id`` are kept
        only when both sides carry the same value, otherwise they are None.

        Returns:
            A new ``UsageStatistics``; neither input is modified.
        """
        def shared(mine, theirs):
            # Keep a dimension only if both sides agree on it.
            return mine if mine == theirs else None

        combined_requests = self.total_requests + other.total_requests

        # Calculate weighted averages for timing metrics
        avg_response = 0.0
        avg_queue = 0.0
        if combined_requests > 0:
            weight_self = self.total_requests / combined_requests
            weight_other = other.total_requests / combined_requests
            avg_response = (self.avg_response_time_ms * weight_self +
                            other.avg_response_time_ms * weight_other)
            avg_queue = (self.avg_queue_time_ms * weight_self +
                         other.avg_queue_time_ms * weight_other)

        # The averages go in through the constructor so that __post_init__
        # derives throughput from the merged response time, not from 0.0.
        return UsageStatistics(
            stat_id=f"merged_{self.stat_id}_{other.stat_id}",
            period_start=min(self.period_start, other.period_start),
            period_end=max(self.period_end, other.period_end),
            service_type="combined" if self.service_type != other.service_type else self.service_type,
            model_id=shared(self.model_id, other.model_id),
            provider=shared(self.provider, other.provider),
            user_id=shared(self.user_id, other.user_id),
            client_id=shared(self.client_id, other.client_id),
            total_requests=combined_requests,
            successful_requests=self.successful_requests + other.successful_requests,
            failed_requests=self.failed_requests + other.failed_requests,
            timeout_requests=self.timeout_requests + other.timeout_requests,
            retry_requests=self.retry_requests + other.retry_requests,
            cache_hits=self.cache_hits + other.cache_hits,
            total_tokens=self.total_tokens + other.total_tokens,
            input_tokens=self.input_tokens + other.input_tokens,
            output_tokens=self.output_tokens + other.output_tokens,
            total_cost_usd=self.total_cost_usd + other.total_cost_usd,
            avg_response_time_ms=avg_response,
            avg_queue_time_ms=avg_queue,
        )
421
+
422
+ @dataclass
423
+ class ModelUsageSnapshot:
424
+ """
425
+ Point-in-time usage snapshot for quick analytics
426
+
427
+ Provides a snapshot view of model usage at different time granularities
428
+ for real-time monitoring and dashboard displays.
429
+ """
430
+ snapshot_id: str
431
+ model_id: str
432
+ provider: str
433
+ snapshot_time: datetime
434
+ hourly_requests: int = 0
435
+ daily_requests: int = 0
436
+ weekly_requests: int = 0
437
+ monthly_requests: int = 0
438
+ total_tokens_hour: int = 0
439
+ total_tokens_day: int = 0
440
+ total_tokens_week: int = 0
441
+ total_tokens_month: int = 0
442
+ total_cost_hour: float = 0.0
443
+ total_cost_day: float = 0.0
444
+ total_cost_week: float = 0.0
445
+ total_cost_month: float = 0.0
446
+ avg_response_time_hour: float = 0.0
447
+ avg_response_time_day: float = 0.0
448
+ success_rate_hour: float = 100.0
449
+ success_rate_day: float = 100.0
450
+ cache_hit_rate_hour: float = 0.0
451
+ cache_hit_rate_day: float = 0.0
452
+ unique_users_hour: int = 0
453
+ unique_users_day: int = 0
454
+ peak_requests_per_minute: int = 0
455
+ current_queue_size: int = 0
456
+ last_used: Optional[datetime] = None
457
+ health_status: str = "healthy" # healthy, degraded, critical, offline
458
+
459
+ def __post_init__(self):
460
+ if self.snapshot_time is None:
461
+ self.snapshot_time = datetime.now(timezone.utc)
462
+
463
+ @property
464
+ def is_active(self) -> bool:
465
+ """Check if model has been used recently"""
466
+ if not self.last_used:
467
+ return False
468
+
469
+ time_since_use = datetime.now(timezone.utc) - self.last_used
470
+ return time_since_use.total_seconds() < 3600 # Active if used in last hour
471
+
472
+ @property
473
+ def utilization_trend(self) -> str:
474
+ """Analyze utilization trend"""
475
+ if self.weekly_requests == 0:
476
+ return "unused"
477
+
478
+ daily_avg = self.weekly_requests / 7
479
+ hourly_avg = self.daily_requests / 24
480
+
481
+ if self.hourly_requests > hourly_avg * 2:
482
+ return "surge"
483
+ elif self.hourly_requests > hourly_avg * 1.5:
484
+ return "high"
485
+ elif self.hourly_requests > hourly_avg * 0.8:
486
+ return "normal"
487
+ elif self.hourly_requests > hourly_avg * 0.3:
488
+ return "low"
489
+ else:
490
+ return "minimal"
491
+
492
+ @property
493
+ def cost_trend(self) -> str:
494
+ """Analyze cost trend"""
495
+ if self.total_cost_week == 0:
496
+ return "no_cost"
497
+
498
+ daily_avg = self.total_cost_week / 7
499
+ hourly_avg = self.total_cost_day / 24
500
+
501
+ if self.total_cost_hour > hourly_avg * 3:
502
+ return "expensive_spike"
503
+ elif self.total_cost_hour > hourly_avg * 1.5:
504
+ return "above_average"
505
+ elif self.total_cost_hour > hourly_avg * 0.8:
506
+ return "normal"
507
+ else:
508
+ return "below_average"
509
+
510
+ @property
511
+ def efficiency_metrics(self) -> Dict[str, float]:
512
+ """Get efficiency metrics"""
513
+ return {
514
+ "requests_per_dollar_hour": self.hourly_requests / max(self.total_cost_hour, 0.01),
515
+ "tokens_per_dollar_hour": self.total_tokens_hour / max(self.total_cost_hour, 0.01),
516
+ "requests_per_dollar_day": self.daily_requests / max(self.total_cost_day, 0.01),
517
+ "tokens_per_dollar_day": self.total_tokens_day / max(self.total_cost_day, 0.01),
518
+ "avg_cost_per_request_hour": self.total_cost_hour / max(self.hourly_requests, 1),
519
+ "avg_cost_per_request_day": self.total_cost_day / max(self.daily_requests, 1)
520
+ }
521
+
522
+ @property
523
def performance_score(self) -> float:
    """Compute a composite performance score clamped to the range 0-100.

    Starting from 100, the score is reduced by a latency penalty based on
    ``avg_response_time_day``, scaled by ``success_rate_day`` treated as a
    percentage, increased by one tenth of ``cache_hit_rate_day``, and finally
    adjusted for ``health_status``.
    """
    # Latency penalty tier, selected by the daily average response time.
    latency = self.avg_response_time_day
    if latency > 5000:
        penalty = 30
    elif latency > 2000:
        penalty = 15
    elif latency > 1000:
        penalty = 5
    else:
        penalty = 0
    result = 100.0 - penalty

    # Scale by the daily success rate (divided by 100, i.e. read as a percentage).
    result = result * (self.success_rate_day / 100)

    # Small bonus proportional to the daily cache hit rate.
    result += self.cache_hit_rate_day * 0.1

    # Health adjustment: offline zeroes the score, other bad states scale it down.
    status = self.health_status
    if status == "offline":
        result = 0
    elif status == "critical":
        result *= 0.5
    elif status == "degraded":
        result *= 0.8

    capped = min(100.0, result)
    return max(0.0, capped)
550
+
551
def update_health_status(self, new_status: str):
    """Set ``health_status`` to *new_status* and stamp ``snapshot_time`` with the current UTC time."""
    changed_at = datetime.now(timezone.utc)
    self.health_status = new_status
    self.snapshot_time = changed_at
555
+
556
def record_usage(self, requests: int = 1, tokens: int = 0, cost: float = 0.0,
                 response_time_ms: float = 0.0, success: bool = True, cache_hit: bool = False):
    """Add one batch of usage to every rolling counter on this snapshot.

    Args:
        requests: Number of requests in the batch.
        tokens: Tokens consumed by the batch.
        cost: Cost of the batch.
        response_time_ms: Response time attributed to each request in the batch.
        success: Accepted for interface compatibility; not read by this method.
        cache_hit: Accepted for interface compatibility; not read by this method.
    """
    # Each increment applies to the hour/day/week/month counter of its kind.
    increments = (
        (requests, ("hourly_requests", "daily_requests",
                    "weekly_requests", "monthly_requests")),
        (tokens, ("total_tokens_hour", "total_tokens_day",
                  "total_tokens_week", "total_tokens_month")),
        (cost, ("total_cost_hour", "total_cost_day",
                "total_cost_week", "total_cost_month")),
    )
    for amount, counter_names in increments:
        for counter_name in counter_names:
            setattr(self, counter_name, getattr(self, counter_name) + amount)

    # Request-weighted running mean (simplified; not a true moving average).
    # The counters already include this batch, so the previous weight is
    # (count - requests).
    for count_name, mean_name in (
        ("hourly_requests", "avg_response_time_hour"),
        ("daily_requests", "avg_response_time_day"),
    ):
        count = getattr(self, count_name)
        if count > 0:
            previous_total = getattr(self, mean_name) * (count - requests)
            setattr(self, mean_name,
                    (previous_total + (response_time_ms * requests)) / count)

    self.last_used = datetime.now(timezone.utc)
584
+
585
+ # Utility functions for working with inference models
586
+
587
def create_inference_request(
    service_type: str,
    model_id: str,
    provider: str,
    endpoint: str,
    request_data: Dict[str, Any],
    user_id: Optional[str] = None,
    **kwargs
) -> InferenceRequest:
    """Build an ``InferenceRequest`` with a freshly generated request ID.

    The ID has the form ``inf_<YYYYmmdd_HHMMSS>_<8 hex chars>``. The
    timestamp part is taken in UTC, matching the UTC timestamps used by the
    rest of this module, so IDs do not depend on the host's local time zone.

    Args:
        service_type: Passed through to ``InferenceRequest``.
        model_id: Passed through to ``InferenceRequest``.
        provider: Passed through to ``InferenceRequest``.
        endpoint: Passed through to ``InferenceRequest``.
        request_data: Passed through to ``InferenceRequest``.
        user_id: Optional user identifier, passed through.
        **kwargs: Any additional ``InferenceRequest`` fields.

    Returns:
        The constructed ``InferenceRequest``.
    """
    import uuid

    created_at = datetime.now(timezone.utc)
    request_id = f"inf_{created_at.strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}"

    return InferenceRequest(
        request_id=request_id,
        service_type=service_type,
        model_id=model_id,
        provider=provider,
        endpoint=endpoint,
        request_data=request_data,
        user_id=user_id,
        **kwargs
    )
611
+
612
def create_usage_statistics(
    period_start: datetime,
    period_end: datetime,
    service_type: str,
    model_id: Optional[str] = None,
    provider: Optional[str] = None,
    user_id: Optional[str] = None
) -> UsageStatistics:
    """Build a ``UsageStatistics`` record for the given period.

    The generated ``stat_id`` has the form ``stat_<YYYYmmdd_HH>_<6 hex chars>``,
    where the date-hour part is formatted from *period_start* and the suffix
    is random.
    """
    import uuid

    hour_bucket = period_start.strftime('%Y%m%d_%H')
    random_suffix = uuid.uuid4().hex[:6]

    return UsageStatistics(
        stat_id=f"stat_{hour_bucket}_{random_suffix}",
        period_start=period_start,
        period_end=period_end,
        service_type=service_type,
        model_id=model_id,
        provider=provider,
        user_id=user_id
    )
634
+
635
def create_model_snapshot(
    model_id: str,
    provider: str
) -> ModelUsageSnapshot:
    """Build a ``ModelUsageSnapshot`` for a model/provider pair.

    The ``snapshot_id`` has the form ``snap_<model_id>_<provider>_<YYYYmmdd_HH>``.
    It has no random component, so two calls within the same hour yield the
    same ID. The hour bucket is taken in UTC so that hosts in different time
    zones produce the same ID for the same instant.
    """
    hour_bucket = datetime.now(timezone.utc).strftime('%Y%m%d_%H')
    snapshot_id = f"snap_{model_id}_{provider}_{hour_bucket}"

    return ModelUsageSnapshot(
        snapshot_id=snapshot_id,
        model_id=model_id,
        provider=provider
    )
647
+
648
def calculate_usage_summary(requests: List[InferenceRequest]) -> Dict[str, Any]:
    """Aggregate counts, cost, tokens and timing over a list of requests.

    Args:
        requests: Requests to summarize. Each is read for ``was_successful``,
            ``status``, ``cost_usd``, ``total_tokens`` and ``execution_time_ms``.

    Returns:
        A dict with the keys ``total_requests``, ``successful_requests``,
        ``failed_requests``, ``timeout_requests``, ``success_rate``,
        ``total_cost_usd``, ``total_tokens``, ``avg_execution_time_ms``,
        ``cost_per_request`` and ``cost_per_token``. For empty input every
        value is zero, so callers can read any key without checking first.
    """
    if not requests:
        # Same key set as the populated result; everything zeroed.
        return {
            "total_requests": 0,
            "successful_requests": 0,
            "failed_requests": 0,
            "timeout_requests": 0,
            "success_rate": 0,
            "total_cost_usd": 0,
            "total_tokens": 0,
            "avg_execution_time_ms": 0,
            "cost_per_request": 0,
            "cost_per_token": 0
        }

    total_requests = len(requests)
    successful = sum(1 for r in requests if r.was_successful)
    failed = sum(1 for r in requests if r.status == InferenceStatus.FAILED)
    timeouts = sum(1 for r in requests if r.status == InferenceStatus.TIMEOUT)

    # Missing (None) cost/token values count as zero.
    total_cost = sum(r.cost_usd or 0 for r in requests)
    total_tokens = sum(r.total_tokens or 0 for r in requests)

    # NOTE(review): the truthiness filter also drops requests whose recorded
    # execution time is exactly 0 — confirm that is intended.
    execution_times = [r.execution_time_ms for r in requests if r.execution_time_ms]
    avg_execution_time = sum(execution_times) / len(execution_times) if execution_times else 0

    # total_requests is >= 1 here, so per-request ratios need no zero guard.
    return {
        "total_requests": total_requests,
        "successful_requests": successful,
        "failed_requests": failed,
        "timeout_requests": timeouts,
        "success_rate": (successful / total_requests) * 100,
        "total_cost_usd": round(total_cost, 4),
        "total_tokens": total_tokens,
        "avg_execution_time_ms": round(avg_execution_time, 2),
        "cost_per_request": round(total_cost / total_requests, 6),
        "cost_per_token": round(total_cost / total_tokens, 8) if total_tokens > 0 else 0
    }