isa-model 0.4.0__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/client.py +466 -43
- isa_model/core/cache/redis_cache.py +12 -3
- isa_model/core/config/config_manager.py +230 -3
- isa_model/core/config.py +90 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +21 -1
- isa_model/core/database/supabase_client.py +154 -19
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +27 -18
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_manager.py +35 -80
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +174 -18
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/services/intelligent_model_selector.py +399 -21
- isa_model/core/types.py +1 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -370
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +137 -10
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/openai_stt_service.py +22 -6
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +335 -24
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/ollama_llm_service.py +9 -2
- isa_model/inference/services/llm/openai_llm_service.py +33 -16
- isa_model/inference/services/llm/yyds_llm_service.py +8 -2
- isa_model/inference/services/vision/__init__.py +22 -1
- isa_model/inference/services/vision/helpers/image_utils.py +8 -5
- isa_model/inference/services/vision/isa_vision_service.py +65 -4
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +172 -22
- isa_model/serving/api/middleware/auth.py +8 -2
- isa_model/serving/api/middleware/security.py +23 -33
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +4 -1
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +138 -2
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +680 -18
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +68 -54
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/METADATA +71 -24
- isa_model-0.4.4.dist-info/RECORD +180 -0
- isa_model/core/security/secrets.py +0 -358
- isa_model/core/storage/hf_storage.py +0 -419
- isa_model/core/storage/minio_storage.py +0 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks/__init__.py +0 -27
- isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
- isa_model/eval/benchmarks.py +0 -701
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -24
- isa_model/eval/evaluators/audio_evaluator.py +0 -727
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/embedding_evaluator.py +0 -742
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/evaluators/vision_evaluator.py +0 -564
- isa_model/eval/example_evaluation.py +0 -395
- isa_model/eval/factory.py +0 -798
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/isa_benchmarks.py +0 -700
- isa_model/eval/isa_integration.py +0 -582
- isa_model/eval/metrics.py +0 -951
- isa_model/eval/tests/unit/test_basic.py +0 -396
- isa_model/serving/api/routes/evaluations.py +0 -579
- isa_model/training/__init__.py +0 -168
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -26
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/examples/intelligent_training_example.py +0 -281
- isa_model/training/factory.py +0 -424
- isa_model/training/intelligent/__init__.py +0 -25
- isa_model/training/intelligent/decision_engine.py +0 -643
- isa_model/training/intelligent/intelligent_factory.py +0 -888
- isa_model/training/intelligent/knowledge_base.py +0 -751
- isa_model/training/intelligent/resource_optimizer.py +0 -839
- isa_model/training/intelligent/task_classifier.py +0 -576
- isa_model/training/storage/__init__.py +0 -24
- isa_model/training/storage/core_integration.py +0 -439
- isa_model/training/storage/training_repository.py +0 -552
- isa_model/training/storage/training_storage.py +0 -628
- isa_model-0.4.0.dist-info/RECORD +0 -182
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0
isa_model/inference/models/performance_models.py (new file)
@@ -0,0 +1,714 @@
+"""
+Performance Models
+
+Specialized models for tracking and analyzing inference performance metrics,
+latency profiles, and throughput characteristics.
+"""
+
+import logging
+from datetime import datetime, timezone, timedelta
+from typing import Dict, List, Optional, Any, Union, Tuple
+from dataclasses import dataclass, field
+from enum import Enum
+import statistics
+
+logger = logging.getLogger(__name__)
+
+class PerformanceTier(str, Enum):
+    """Performance tier enumeration"""
+    EXCELLENT = "excellent"
+    GOOD = "good"
+    AVERAGE = "average"
+    POOR = "poor"
+    CRITICAL = "critical"
+
+class LatencyCategory(str, Enum):
+    """Latency category enumeration"""
+    ULTRA_LOW = "ultra_low"    # < 100ms
+    LOW = "low"                # 100-500ms
+    MODERATE = "moderate"      # 500ms-2s
+    HIGH = "high"              # 2s-10s
+    VERY_HIGH = "very_high"    # > 10s
+
+class ThroughputUnit(str, Enum):
+    """Throughput measurement unit enumeration"""
+    REQUESTS_PER_SECOND = "rps"
+    TOKENS_PER_SECOND = "tps"
+    TOKENS_PER_MINUTE = "tpm"
+    REQUESTS_PER_MINUTE = "rpm"
+
+@dataclass
+class PerformanceMetrics:
+    """
+    Comprehensive performance metrics for inference operations
+
+    Tracks detailed performance characteristics including latency, throughput,
+    resource utilization, and quality metrics.
+    """
+    metric_id: str
+    model_id: str
+    provider: str
+    service_type: str
+    measurement_period_start: datetime
+    measurement_period_end: datetime
+
+    # Request volume metrics
+    total_requests: int = 0
+    successful_requests: int = 0
+    failed_requests: int = 0
+    timeout_requests: int = 0
+
+    # Latency metrics (in milliseconds)
+    min_latency_ms: Optional[float] = None
+    max_latency_ms: Optional[float] = None
+    mean_latency_ms: Optional[float] = None
+    median_latency_ms: Optional[float] = None
+    p95_latency_ms: Optional[float] = None
+    p99_latency_ms: Optional[float] = None
+    p999_latency_ms: Optional[float] = None
+    latency_std_dev: Optional[float] = None
+
+    # Throughput metrics
+    requests_per_second: Optional[float] = None
+    tokens_per_second: Optional[float] = None
+    tokens_per_minute: Optional[float] = None
+    peak_rps: Optional[float] = None
+
+    # Token metrics
+    total_input_tokens: int = 0
+    total_output_tokens: int = 0
+    avg_input_tokens: Optional[float] = None
+    avg_output_tokens: Optional[float] = None
+    max_input_tokens: Optional[int] = None
+    max_output_tokens: Optional[int] = None
+
+    # Cost metrics
+    total_cost_usd: float = 0.0
+    cost_per_request: Optional[float] = None
+    cost_per_token: Optional[float] = None
+    cost_per_second: Optional[float] = None
+
+    # Quality metrics
+    success_rate: float = 0.0
+    error_rate: float = 0.0
+    timeout_rate: float = 0.0
+    retry_rate: float = 0.0
+    cache_hit_rate: float = 0.0
+
+    # Resource utilization (if available)
+    avg_cpu_usage: Optional[float] = None
+    avg_memory_usage: Optional[float] = None
+    avg_gpu_usage: Optional[float] = None
+    peak_memory_mb: Optional[float] = None
+
+    # Queue and concurrency metrics
+    avg_queue_time_ms: Optional[float] = None
+    max_queue_time_ms: Optional[float] = None
+    avg_concurrent_requests: Optional[float] = None
+    max_concurrent_requests: Optional[int] = None
+
+    created_at: datetime = None
+
+    def __post_init__(self):
+        if self.created_at is None:
+            self.created_at = datetime.now(timezone.utc)
+
+        # Calculate derived metrics
+        self._calculate_derived_metrics()
+
+    def _calculate_derived_metrics(self):
+        """Calculate derived metrics from base measurements"""
+        if self.total_requests > 0:
+            self.success_rate = (self.successful_requests / self.total_requests) * 100
+            self.error_rate = (self.failed_requests / self.total_requests) * 100
+            self.timeout_rate = (self.timeout_requests / self.total_requests) * 100
+
+            if self.total_cost_usd > 0:
+                self.cost_per_request = self.total_cost_usd / self.total_requests
+
+        if self.successful_requests > 0:
+            self.avg_input_tokens = self.total_input_tokens / self.successful_requests
+            self.avg_output_tokens = self.total_output_tokens / self.successful_requests
+
+        # Calculate period-based metrics
+        period_seconds = (self.measurement_period_end - self.measurement_period_start).total_seconds()
+        if period_seconds > 0:
+            self.requests_per_second = self.total_requests / period_seconds
+
+            total_tokens = self.total_input_tokens + self.total_output_tokens
+            if total_tokens > 0:
+                self.tokens_per_second = total_tokens / period_seconds
+                self.tokens_per_minute = total_tokens / (period_seconds / 60)
+
+                if self.total_cost_usd > 0:
+                    self.cost_per_token = self.total_cost_usd / total_tokens
+                    self.cost_per_second = self.total_cost_usd / period_seconds
+
+    @property
+    def measurement_duration_seconds(self) -> float:
+        """Get measurement period duration in seconds"""
+        return (self.measurement_period_end - self.measurement_period_start).total_seconds()
+
+    @property
+    def latency_category(self) -> str:
+        """Categorize average latency"""
+        if self.mean_latency_ms is None:
+            return "unknown"
+
+        if self.mean_latency_ms < 100:
+            return LatencyCategory.ULTRA_LOW
+        elif self.mean_latency_ms < 500:
+            return LatencyCategory.LOW
+        elif self.mean_latency_ms < 2000:
+            return LatencyCategory.MODERATE
+        elif self.mean_latency_ms < 10000:
+            return LatencyCategory.HIGH
+        else:
+            return LatencyCategory.VERY_HIGH
+
+    @property
+    def performance_tier(self) -> str:
+        """Calculate overall performance tier"""
+        score = 100.0
+
+        # Latency penalty
+        if self.mean_latency_ms:
+            if self.mean_latency_ms > 10000:
+                score -= 40
+            elif self.mean_latency_ms > 5000:
+                score -= 25
+            elif self.mean_latency_ms > 2000:
+                score -= 15
+            elif self.mean_latency_ms > 1000:
+                score -= 5
+
+        # Success rate impact
+        score *= (self.success_rate / 100)
+
+        # Timeout penalty
+        score -= self.timeout_rate * 2
+
+        if score >= 85:
+            return PerformanceTier.EXCELLENT
+        elif score >= 70:
+            return PerformanceTier.GOOD
+        elif score >= 50:
+            return PerformanceTier.AVERAGE
+        elif score >= 25:
+            return PerformanceTier.POOR
+        else:
+            return PerformanceTier.CRITICAL
+
+    @property
+    def efficiency_score(self) -> float:
+        """Calculate efficiency score (performance per cost)"""
+        if not self.cost_per_request or self.cost_per_request == 0:
+            return 0.0
+
+        # Higher score for better performance and lower cost
+        base_score = self.success_rate
+        latency_penalty = (self.mean_latency_ms or 1000) / 1000  # Normalize to seconds
+        cost_penalty = self.cost_per_request * 1000  # Scale up cost impact
+
+        return max(0, base_score / (latency_penalty * cost_penalty))
+
+    @property
+    def reliability_score(self) -> float:
+        """Calculate reliability score based on error rates"""
+        return max(0, 100 - self.error_rate - (self.timeout_rate * 1.5))
+
+    def add_request_measurement(self, latency_ms: float, success: bool, tokens_used: int = 0,
+                                cost: float = 0.0, cache_hit: bool = False):
+        """Add individual request measurement to aggregate metrics"""
+        self.total_requests += 1
+
+        if success:
+            self.successful_requests += 1
+            self.total_input_tokens += tokens_used  # Simplified - would split input/output
+            self.total_output_tokens += tokens_used
+        else:
+            self.failed_requests += 1
+
+        if cache_hit:
+            # Update cache hit rate calculation
+            pass
+
+        self.total_cost_usd += cost
+
+        # Update latency statistics (simplified - would use proper streaming statistics)
+        if self.min_latency_ms is None or latency_ms < self.min_latency_ms:
+            self.min_latency_ms = latency_ms
+
+        if self.max_latency_ms is None or latency_ms > self.max_latency_ms:
+            self.max_latency_ms = latency_ms
+
+        # Recalculate derived metrics
+        self._calculate_derived_metrics()
+
+    def compare_to(self, other: 'PerformanceMetrics') -> Dict[str, Any]:
+        """Compare this metrics to another set of metrics"""
+        comparison = {
+            "baseline_model": other.model_id,
+            "comparison_period": {
+                "our_period": f"{self.measurement_period_start} to {self.measurement_period_end}",
+                "baseline_period": f"{other.measurement_period_start} to {other.measurement_period_end}"
+            },
+            "improvements": {},
+            "regressions": {},
+            "summary": {}
+        }
+
+        # Compare key metrics
+        metrics_to_compare = [
+            ("mean_latency_ms", "lower_is_better"),
+            ("success_rate", "higher_is_better"),
+            ("requests_per_second", "higher_is_better"),
+            ("tokens_per_second", "higher_is_better"),
+            ("cost_per_request", "lower_is_better"),
+            ("cost_per_token", "lower_is_better"),
+            ("error_rate", "lower_is_better")
+        ]
+
+        for metric_name, direction in metrics_to_compare:
+            our_value = getattr(self, metric_name)
+            other_value = getattr(other, metric_name)
+
+            if our_value is not None and other_value is not None and other_value != 0:
+                change_percent = ((our_value - other_value) / other_value) * 100
+
+                is_improvement = (
+                    (direction == "higher_is_better" and change_percent > 0) or
+                    (direction == "lower_is_better" and change_percent < 0)
+                )
+
+                change_data = {
+                    "our_value": our_value,
+                    "baseline_value": other_value,
+                    "change_percent": round(change_percent, 2),
+                    "absolute_change": our_value - other_value
+                }
+
+                if abs(change_percent) > 5:  # Significant change threshold
+                    if is_improvement:
+                        comparison["improvements"][metric_name] = change_data
+                    else:
+                        comparison["regressions"][metric_name] = change_data
+
+        # Overall summary
+        comparison["summary"] = {
+            "overall_performance_change": self.performance_tier != other.performance_tier,
+            "our_tier": self.performance_tier,
+            "baseline_tier": other.performance_tier,
+            "improvements_count": len(comparison["improvements"]),
+            "regressions_count": len(comparison["regressions"])
+        }
+
+        return comparison
+
+@dataclass
+class LatencyProfile:
+    """
+    Detailed latency profile analysis
+
+    Provides comprehensive latency analysis including distribution,
+    outliers, and temporal patterns.
+    """
+    profile_id: str
+    model_id: str
+    provider: str
+    measurement_start: datetime
+    measurement_end: datetime
+
+    # Distribution data
+    latency_samples: List[float] = field(default_factory=list)
+    sample_count: int = 0
+
+    # Statistical measures
+    min_latency: float = float('inf')
+    max_latency: float = 0.0
+    mean_latency: float = 0.0
+    median_latency: float = 0.0
+    mode_latency: Optional[float] = None
+    std_deviation: float = 0.0
+    variance: float = 0.0
+    skewness: Optional[float] = None
+    kurtosis: Optional[float] = None
+
+    # Percentiles
+    p10: float = 0.0
+    p25: float = 0.0
+    p50: float = 0.0
+    p75: float = 0.0
+    p90: float = 0.0
+    p95: float = 0.0
+    p99: float = 0.0
+    p999: float = 0.0
+
+    # Outlier analysis
+    outlier_threshold_multiplier: float = 3.0
+    outlier_count: int = 0
+    outlier_rate: float = 0.0
+    outliers: List[float] = field(default_factory=list)
+
+    # Temporal patterns
+    hourly_averages: Dict[int, float] = field(default_factory=dict)
+    daily_trends: Dict[str, float] = field(default_factory=dict)
+
+    def __post_init__(self):
+        if self.latency_samples:
+            self._calculate_statistics()
+
+    def _calculate_statistics(self):
+        """Calculate comprehensive latency statistics"""
+        if not self.latency_samples:
+            return
+
+        self.sample_count = len(self.latency_samples)
+        sorted_samples = sorted(self.latency_samples)
+
+        # Basic statistics
+        self.min_latency = min(self.latency_samples)
+        self.max_latency = max(self.latency_samples)
+        self.mean_latency = statistics.mean(self.latency_samples)
+        self.median_latency = statistics.median(self.latency_samples)
+
+        if self.sample_count > 1:
+            self.std_deviation = statistics.stdev(self.latency_samples)
+            self.variance = statistics.variance(self.latency_samples)
+
+        # Percentiles
+        n = len(sorted_samples)
+        self.p10 = sorted_samples[int(0.10 * n)]
+        self.p25 = sorted_samples[int(0.25 * n)]
+        self.p50 = sorted_samples[int(0.50 * n)]
+        self.p75 = sorted_samples[int(0.75 * n)]
+        self.p90 = sorted_samples[int(0.90 * n)]
+        self.p95 = sorted_samples[int(0.95 * n)]
+        self.p99 = sorted_samples[int(0.99 * n)] if n > 100 else sorted_samples[-1]
+        self.p999 = sorted_samples[int(0.999 * n)] if n > 1000 else sorted_samples[-1]
+
+        # Outlier detection using IQR method
+        iqr = self.p75 - self.p25
+        lower_bound = self.p25 - (self.outlier_threshold_multiplier * iqr)
+        upper_bound = self.p75 + (self.outlier_threshold_multiplier * iqr)
+
+        self.outliers = [x for x in self.latency_samples if x < lower_bound or x > upper_bound]
+        self.outlier_count = len(self.outliers)
+        self.outlier_rate = (self.outlier_count / self.sample_count) * 100
+
+    @property
+    def distribution_type(self) -> str:
+        """Classify the latency distribution"""
+        if self.sample_count < 10:
+            return "insufficient_data"
+
+        # Simple heuristics for distribution classification
+        if abs(self.mean_latency - self.median_latency) < (0.1 * self.std_deviation):
+            return "normal"
+        elif self.mean_latency > self.median_latency:
+            return "right_skewed"
+        else:
+            return "left_skewed"
+
+    @property
+    def stability_score(self) -> float:
+        """Calculate latency stability score (0-100)"""
+        if self.mean_latency == 0:
+            return 100.0
+
+        # Lower coefficient of variation = higher stability
+        cv = self.std_deviation / self.mean_latency
+        base_score = max(0, 100 - (cv * 100))
+
+        # Penalty for outliers
+        outlier_penalty = min(20, self.outlier_rate)
+
+        return max(0, base_score - outlier_penalty)
+
+    @property
+    def consistency_category(self) -> str:
+        """Categorize latency consistency"""
+        stability = self.stability_score
+
+        if stability >= 90:
+            return "very_consistent"
+        elif stability >= 75:
+            return "consistent"
+        elif stability >= 60:
+            return "moderately_consistent"
+        elif stability >= 40:
+            return "inconsistent"
+        else:
+            return "very_inconsistent"
+
+    def add_latency_sample(self, latency_ms: float, timestamp: Optional[datetime] = None):
+        """Add a latency sample to the profile"""
+        self.latency_samples.append(latency_ms)
+
+        if timestamp:
+            # Track hourly patterns
+            hour = timestamp.hour
+            if hour in self.hourly_averages:
+                # Update running average
+                count = sum(1 for ts in self.hourly_averages if ts == hour)
+                self.hourly_averages[hour] = ((self.hourly_averages[hour] * count) + latency_ms) / (count + 1)
+            else:
+                self.hourly_averages[hour] = latency_ms
+
+        # Recalculate if we have enough samples
+        if len(self.latency_samples) % 100 == 0:  # Recalculate every 100 samples
+            self._calculate_statistics()
+
+    def get_latency_bands(self) -> Dict[str, Dict[str, Any]]:
+        """Get latency distribution in bands"""
+        if not self.latency_samples:
+            return {}
+
+        bands = {
+            "ultra_fast": {"range": "< 100ms", "count": 0, "percentage": 0},
+            "fast": {"range": "100-500ms", "count": 0, "percentage": 0},
+            "moderate": {"range": "500ms-2s", "count": 0, "percentage": 0},
+            "slow": {"range": "2s-10s", "count": 0, "percentage": 0},
+            "very_slow": {"range": "> 10s", "count": 0, "percentage": 0}
+        }
+
+        for latency in self.latency_samples:
+            if latency < 100:
+                bands["ultra_fast"]["count"] += 1
+            elif latency < 500:
+                bands["fast"]["count"] += 1
+            elif latency < 2000:
+                bands["moderate"]["count"] += 1
+            elif latency < 10000:
+                bands["slow"]["count"] += 1
+            else:
+                bands["very_slow"]["count"] += 1
+
+        # Calculate percentages
+        total = len(self.latency_samples)
+        for band in bands.values():
+            band["percentage"] = (band["count"] / total) * 100
+
+        return bands
+
+@dataclass
+class ThroughputProfile:
+    """
+    Throughput analysis and capacity planning
+
+    Analyzes request and token throughput patterns for capacity planning
+    and performance optimization.
+    """
+    profile_id: str
+    model_id: str
+    provider: str
+    measurement_start: datetime
+    measurement_end: datetime
+
+    # Request throughput
+    peak_requests_per_second: float = 0.0
+    avg_requests_per_second: float = 0.0
+    min_requests_per_second: float = 0.0
+
+    # Token throughput
+    peak_tokens_per_second: float = 0.0
+    avg_tokens_per_second: float = 0.0
+    min_tokens_per_second: float = 0.0
+
+    # Capacity metrics
+    max_concurrent_requests: int = 0
+    avg_concurrent_requests: float = 0.0
+    queue_overflow_events: int = 0
+    throttling_events: int = 0
+
+    # Temporal patterns
+    throughput_samples: List[Tuple[datetime, float, float]] = field(default_factory=list)  # (timestamp, rps, tps)
+    peak_hours: List[int] = field(default_factory=list)
+    low_hours: List[int] = field(default_factory=list)
+
+    # Efficiency metrics
+    tokens_per_request_ratio: float = 0.0
+    processing_efficiency: float = 0.0  # Actual vs theoretical max throughput
+
+    @property
+    def measurement_duration_hours(self) -> float:
+        """Get measurement duration in hours"""
+        return (self.measurement_end - self.measurement_start).total_seconds() / 3600
+
+    @property
+    def capacity_utilization(self) -> float:
+        """Calculate capacity utilization percentage"""
+        if self.peak_requests_per_second == 0:
+            return 0.0
+        return (self.avg_requests_per_second / self.peak_requests_per_second) * 100
+
+    @property
+    def throughput_consistency(self) -> str:
+        """Analyze throughput consistency"""
+        if not self.throughput_samples:
+            return "unknown"
+
+        rps_values = [sample[1] for sample in self.throughput_samples]
+        if not rps_values:
+            return "unknown"
+
+        cv = statistics.stdev(rps_values) / statistics.mean(rps_values) if statistics.mean(rps_values) > 0 else 0
+
+        if cv < 0.1:
+            return "very_stable"
+        elif cv < 0.3:
+            return "stable"
+        elif cv < 0.5:
+            return "variable"
+        else:
+            return "highly_variable"
+
+    @property
+    def performance_headroom(self) -> float:
+        """Calculate available performance headroom"""
+        return max(0, self.peak_requests_per_second - self.avg_requests_per_second)
+
+    def add_throughput_sample(self, timestamp: datetime, requests_per_second: float,
+                              tokens_per_second: float, concurrent_requests: int = 0):
+        """Add throughput measurement sample"""
+        self.throughput_samples.append((timestamp, requests_per_second, tokens_per_second))
+
+        # Update peak values
+        if requests_per_second > self.peak_requests_per_second:
+            self.peak_requests_per_second = requests_per_second
+
+        if tokens_per_second > self.peak_tokens_per_second:
+            self.peak_tokens_per_second = tokens_per_second
+
+        if concurrent_requests > self.max_concurrent_requests:
+            self.max_concurrent_requests = concurrent_requests
+
+        # Track peak hours
+        hour = timestamp.hour
+        if requests_per_second > self.avg_requests_per_second * 1.5:
+            if hour not in self.peak_hours:
+                self.peak_hours.append(hour)
+        elif requests_per_second < self.avg_requests_per_second * 0.5:
+            if hour not in self.low_hours:
+                self.low_hours.append(hour)
+
+    def calculate_capacity_recommendations(self) -> Dict[str, Any]:
+        """Generate capacity planning recommendations"""
+        recommendations = {
+            "current_capacity": {
+                "peak_rps": self.peak_requests_per_second,
+                "avg_rps": self.avg_requests_per_second,
+                "utilization": self.capacity_utilization
+            },
+            "scaling_recommendations": [],
+            "optimization_opportunities": []
+        }
+
+        # Scaling recommendations
+        if self.capacity_utilization > 80:
+            recommendations["scaling_recommendations"].append({
+                "type": "scale_up",
+                "urgency": "high",
+                "reason": "High capacity utilization detected",
+                "suggested_increase": "50%"
+            })
+        elif self.capacity_utilization < 30:
+            recommendations["scaling_recommendations"].append({
+                "type": "scale_down",
+                "urgency": "low",
+                "reason": "Low capacity utilization, cost optimization opportunity",
+                "suggested_decrease": "25%"
+            })
+
+        # Optimization opportunities
+        if self.queue_overflow_events > 0:
+            recommendations["optimization_opportunities"].append({
+                "type": "queue_optimization",
+                "description": "Queue overflow events detected",
+                "suggestion": "Increase queue size or add load balancing"
+            })
+
+        if self.throughput_consistency == "highly_variable":
+            recommendations["optimization_opportunities"].append({
+                "type": "load_smoothing",
+                "description": "High throughput variability",
+                "suggestion": "Implement request smoothing or auto-scaling"
+            })
+
+        return recommendations
+
+# Utility functions
+
+def create_performance_metrics(
+    model_id: str,
+    provider: str,
+    service_type: str,
+    period_start: datetime,
+    period_end: datetime
+) -> PerformanceMetrics:
+    """Factory function to create performance metrics"""
+    import uuid
+
+    metric_id = f"perf_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}"
+
+    return PerformanceMetrics(
+        metric_id=metric_id,
+        model_id=model_id,
+        provider=provider,
+        service_type=service_type,
+        measurement_period_start=period_start,
+        measurement_period_end=period_end
+    )
+
+def analyze_performance_trend(metrics_list: List[PerformanceMetrics]) -> Dict[str, Any]:
+    """Analyze performance trends across multiple measurement periods"""
+    if not metrics_list:
+        return {"status": "no_data"}
+
+    # Sort by measurement period
+    sorted_metrics = sorted(metrics_list, key=lambda x: x.measurement_period_start)
+
+    # Calculate trends
+    latencies = [m.mean_latency_ms for m in sorted_metrics if m.mean_latency_ms]
+    success_rates = [m.success_rate for m in sorted_metrics]
+    throughputs = [m.requests_per_second for m in sorted_metrics if m.requests_per_second]
+
+    trends = {
+        "period_count": len(sorted_metrics),
+        "time_range": {
+            "start": sorted_metrics[0].measurement_period_start.isoformat(),
+            "end": sorted_metrics[-1].measurement_period_end.isoformat()
+        },
+        "performance_trend": "stable",
+        "key_changes": []
+    }
+
+    # Analyze latency trend
+    if len(latencies) > 1:
+        latency_change = ((latencies[-1] - latencies[0]) / latencies[0]) * 100
+        if abs(latency_change) > 10:
+            trends["key_changes"].append({
+                "metric": "latency",
+                "change_percent": round(latency_change, 2),
+                "direction": "increased" if latency_change > 0 else "decreased"
+            })
+
+    # Analyze success rate trend
+    if len(success_rates) > 1:
+        success_change = success_rates[-1] - success_rates[0]
+        if abs(success_change) > 5:
+            trends["key_changes"].append({
+                "metric": "success_rate",
+                "change_percent": round(success_change, 2),
+                "direction": "improved" if success_change > 0 else "degraded"
+            })
+
+    # Overall trend assessment
+    if len(trends["key_changes"]) > 2:
+        trends["performance_trend"] = "volatile"
+    elif any(change["metric"] == "latency" and change["direction"] == "increased" for change in trends["key_changes"]):
+        trends["performance_trend"] = "degrading"
+    elif any(change["metric"] == "success_rate" and change["direction"] == "improved" for change in trends["key_changes"]):
+        trends["performance_trend"] = "improving"
+
+    return trends
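For orientation, here is a minimal usage sketch of the performance models added above. It is a sketch under stated assumptions, not documented usage: it assumes the module is importable as isa_model.inference.models.performance_models (matching the file path shown in this diff), and the model and profile identifiers are hypothetical placeholders.

from datetime import datetime, timedelta, timezone

# Assumed import path, inferred from the file location in this diff;
# "demo-model" and "lat_demo" are hypothetical identifiers.
from isa_model.inference.models.performance_models import (
    LatencyProfile,
    analyze_performance_trend,
    create_performance_metrics,
)

now = datetime.now(timezone.utc)

# Aggregate request-level measurements over a one-hour window.
metrics = create_performance_metrics(
    model_id="demo-model",
    provider="openai",
    service_type="llm",
    period_start=now - timedelta(hours=1),
    period_end=now,
)
metrics.add_request_measurement(latency_ms=230.0, success=True, tokens_used=512, cost=0.0021)
metrics.add_request_measurement(latency_ms=1840.0, success=False)
print(metrics.success_rate)      # 50.0 -- derived rates recalculate on every call
print(metrics.performance_tier)  # PerformanceTier.AVERAGE (score halved by the 50% success rate)

# Build a latency distribution profile from raw samples;
# __post_init__ computes the statistics immediately.
profile = LatencyProfile(
    profile_id="lat_demo",
    model_id="demo-model",
    provider="openai",
    measurement_start=now - timedelta(hours=1),
    measurement_end=now,
    latency_samples=[120.0, 140.0, 135.0, 900.0, 150.0, 130.0, 145.0, 138.0, 142.0, 128.0],
)
print(profile.consistency_category)                       # outlier-aware stability bucket
print(profile.get_latency_bands()["fast"]["percentage"])  # 90.0 (9 of 10 samples in 100-500ms)

# Trend analysis across measurement periods.
print(analyze_performance_trend([metrics])["performance_trend"])  # "stable"

One detail worth noting from the code itself: add_request_measurement() updates only min/max latency and re-runs _calculate_derived_metrics(), but never populates mean_latency_ms, so latency_category reports "unknown" until the caller sets the mean directly.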