isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/client.py +1166 -584
- isa_model/core/cache/redis_cache.py +410 -0
- isa_model/core/config/config_manager.py +282 -12
- isa_model/core/config.py +91 -1
- isa_model/core/database/__init__.py +1 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +297 -0
- isa_model/core/database/supabase_client.py +258 -0
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +46 -0
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_billing_tracker.py +60 -88
- isa_model/core/models/model_manager.py +66 -25
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +217 -55
- isa_model/core/models/model_statistics_tracker.py +234 -0
- isa_model/core/models/model_storage.py +0 -1
- isa_model/core/models/model_version_manager.py +959 -0
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/pricing_manager.py +2 -249
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/resilience/circuit_breaker.py +366 -0
- isa_model/core/security/secrets.py +358 -0
- isa_model/core/services/__init__.py +2 -4
- isa_model/core/services/intelligent_model_selector.py +479 -370
- isa_model/core/storage/hf_storage.py +2 -2
- isa_model/core/types.py +8 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -368
- isa_model/deployment/local/__init__.py +31 -0
- isa_model/deployment/local/config.py +248 -0
- isa_model/deployment/local/gpu_gateway.py +607 -0
- isa_model/deployment/local/health_checker.py +428 -0
- isa_model/deployment/local/provider.py +586 -0
- isa_model/deployment/local/tensorrt_service.py +621 -0
- isa_model/deployment/local/transformers_service.py +644 -0
- isa_model/deployment/local/vllm_service.py +527 -0
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/modal/deployer.py +894 -0
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
- isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
- isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
- isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
- isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
- isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
- isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +179 -16
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/__init__.py +21 -0
- isa_model/inference/services/audio/base_realtime_service.py +225 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/isa_tts_service.py +0 -0
- isa_model/inference/services/audio/openai_realtime_service.py +320 -124
- isa_model/inference/services/audio/openai_stt_service.py +53 -11
- isa_model/inference/services/base_service.py +17 -1
- isa_model/inference/services/custom_model_manager.py +277 -0
- isa_model/inference/services/embedding/__init__.py +13 -0
- isa_model/inference/services/embedding/base_embed_service.py +111 -8
- isa_model/inference/services/embedding/isa_embed_service.py +305 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/openai_embed_service.py +2 -4
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
- isa_model/inference/services/img/__init__.py +2 -2
- isa_model/inference/services/img/base_image_gen_service.py +24 -7
- isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
- isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
- isa_model/inference/services/img/services/replicate_flux.py +226 -0
- isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
- isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
- isa_model/inference/services/img/tests/test_img_client.py +297 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +361 -26
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/local_llm_service.py +747 -0
- isa_model/inference/services/llm/ollama_llm_service.py +11 -3
- isa_model/inference/services/llm/openai_llm_service.py +670 -56
- isa_model/inference/services/llm/yyds_llm_service.py +10 -3
- isa_model/inference/services/vision/__init__.py +27 -6
- isa_model/inference/services/vision/base_vision_service.py +118 -185
- isa_model/inference/services/vision/blip_vision_service.py +359 -0
- isa_model/inference/services/vision/helpers/image_utils.py +19 -10
- isa_model/inference/services/vision/isa_vision_service.py +634 -0
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +240 -18
- isa_model/serving/api/middleware/auth.py +317 -0
- isa_model/serving/api/middleware/security.py +268 -0
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +489 -0
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +475 -0
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/logs.py +430 -0
- isa_model/serving/api/routes/settings.py +582 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +992 -171
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +318 -0
- isa_model/serving/modal_proxy_server.py +249 -0
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
- isa_model-0.4.3.dist-info/RECORD +193 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
- isa_model/deployment/cloud/modal/register_models.py +0 -321
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks.py +0 -469
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -18
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/factory.py +0 -531
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/metrics.py +0 -798
- isa_model/inference/adapter/unified_api.py +0 -248
- isa_model/inference/services/helpers/stacked_config.py +0 -148
- isa_model/inference/services/img/flux_professional_service.py +0 -603
- isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/others/table_transformer_service.py +0 -61
- isa_model/inference/services/vision/doc_analysis_service.py +0 -640
- isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/vision/ui_analysis_service.py +0 -823
- isa_model/scripts/inference_tracker.py +0 -283
- isa_model/scripts/mlflow_manager.py +0 -379
- isa_model/scripts/model_registry.py +0 -465
- isa_model/scripts/register_models.py +0 -370
- isa_model/scripts/register_models_with_embeddings.py +0 -510
- isa_model/scripts/start_mlflow.py +0 -95
- isa_model/scripts/training_tracker.py +0 -257
- isa_model/training/__init__.py +0 -74
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -23
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/factory.py +0 -424
- isa_model-0.3.91.dist-info/RECORD +0 -138
- /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,675 @@
|
|
1
|
+
"""
|
2
|
+
Inference Record Models
|
3
|
+
|
4
|
+
Core data models for inference requests, usage statistics, and model snapshots,
|
5
|
+
extracted from repository layer to follow the standard ISA Model architecture pattern.
|
6
|
+
"""
|
7
|
+
|
8
|
+
import logging
|
9
|
+
import hashlib
|
10
|
+
from datetime import datetime, timezone, timedelta
|
11
|
+
from typing import Dict, List, Optional, Any, Union
|
12
|
+
from dataclasses import dataclass, field
|
13
|
+
from enum import Enum
|
14
|
+
|
15
|
+
logger = logging.getLogger(__name__)
|
16
|
+
|
17
|
+
class InferenceStatus(str, Enum):
    """
    Lifecycle states of an inference request.

    Every member is also a ``str``, so a member compares equal to its plain
    string value (``InferenceStatus.PENDING == "pending"``).
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    TIMEOUT = "timeout"
    CANCELLED = "cancelled"
    QUEUED = "queued"
    RETRYING = "retrying"
|
27
|
+
|
28
|
+
class ServiceType(str, Enum):
    """
    Kinds of inference service a request can target.

    Every member is also a ``str``, so a member compares equal to its plain
    string value (``ServiceType.LLM == "llm"``).
    """

    LLM = "llm"
    VISION = "vision"
    EMBEDDING = "embedding"
    TTS = "tts"
    STT = "stt"
    IMAGE_GEN = "image_gen"
    AUDIO = "audio"
    RERANK = "rerank"
    OCR = "ocr"
    TRANSLATION = "translation"
    SUMMARIZATION = "summarization"
    CLASSIFICATION = "classification"
|
42
|
+
|
43
|
+
class ErrorCategory(str, Enum):
    """
    Coarse classification of why an inference request failed.

    Every member is also a ``str``, so a member compares equal to its plain
    string value (``ErrorCategory.UNKNOWN == "unknown"``).
    """

    TIMEOUT = "timeout"
    RATE_LIMIT = "rate_limit"
    AUTHENTICATION = "authentication"
    VALIDATION = "validation"
    MODEL_ERROR = "model_error"
    NETWORK_ERROR = "network_error"
    SERVER_ERROR = "server_error"
    QUOTA_EXCEEDED = "quota_exceeded"
    UNKNOWN = "unknown"
|
54
|
+
|
55
|
+
@dataclass
class InferenceRequest:
    """
    Core inference request record

    Represents a single inference request with its input, output, performance metrics,
    and tracking information for analytics and billing purposes.

    ``update_status`` drives the lifecycle and maintains ``started_at``,
    ``completed_at``, ``queue_time_ms`` and ``execution_time_ms``. Timestamps
    generated by this class are timezone-aware (UTC).
    """
    # Identification and routing
    request_id: str
    service_type: str
    model_id: str
    provider: str
    endpoint: str
    request_data: Dict[str, Any]

    # Lifecycle
    status: str = InferenceStatus.PENDING
    created_at: Optional[datetime] = None  # filled with "now" (UTC) in __post_init__
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None

    # Caller information
    user_id: Optional[str] = None
    session_id: Optional[str] = None
    client_id: Optional[str] = None
    ip_address: Optional[str] = None
    user_agent: Optional[str] = None

    # Outcome
    response_data: Optional[Dict[str, Any]] = None
    error_message: Optional[str] = None
    error_category: Optional[str] = None

    # Performance, usage and billing
    execution_time_ms: Optional[int] = None
    queue_time_ms: Optional[int] = None
    tokens_used: Optional[int] = None
    input_tokens: Optional[int] = None
    output_tokens: Optional[int] = None
    cost_usd: Optional[float] = None
    request_size_bytes: Optional[int] = None
    response_size_bytes: Optional[int] = None
    cache_hit: bool = False
    retry_count: int = 0
    priority: int = 5  # 1-10 scale

    # Content fingerprints and free-form extras
    request_hash: Optional[str] = None
    response_hash: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None  # replaced by {} in __post_init__

    def __post_init__(self):
        """Fill in defaults that cannot be expressed as plain field defaults."""
        if self.created_at is None:
            self.created_at = datetime.now(timezone.utc)
        if self.metadata is None:
            self.metadata = {}

        # Generate request hash for deduplication
        if self.request_hash is None and self.request_data:
            self.request_hash = self._generate_content_hash(self.request_data)

    def _generate_content_hash(self, content: Any) -> str:
        """
        Return the first 16 hex characters of a SHA-256 digest of ``content``.

        The content is serialised as key-sorted JSON; if that fails, the digest
        is taken over ``str(content)`` instead.
        """
        import json
        try:
            content_str = json.dumps(content, sort_keys=True, ensure_ascii=True)
        except Exception:
            # Best effort on purpose: hashing runs inside __post_init__, so a
            # non-serialisable payload must not prevent creating the record.
            content_str = str(content)
        return hashlib.sha256(content_str.encode()).hexdigest()[:16]

    def _apply_extra_fields(self, extra: Dict[str, Any]) -> None:
        """
        Copy keyword overrides onto matching dataclass fields.

        Keys that are not dataclass fields are ignored. This covers unknown
        names as well as read-only properties and methods, which a plain
        ``hasattr``/``setattr`` pair would try (and fail) to overwrite.
        """
        from dataclasses import fields as dataclass_fields

        field_names = {f.name for f in dataclass_fields(self)}
        for key, value in extra.items():
            if key in field_names:
                setattr(self, key, value)

    @property
    def is_active(self) -> bool:
        """Check if request is in active processing state"""
        return self.status in (InferenceStatus.PENDING, InferenceStatus.PROCESSING,
                               InferenceStatus.QUEUED, InferenceStatus.RETRYING)

    @property
    def is_completed(self) -> bool:
        """Check if request is completed (successfully or not)"""
        return self.status in (InferenceStatus.COMPLETED, InferenceStatus.FAILED,
                               InferenceStatus.TIMEOUT, InferenceStatus.CANCELLED)

    @property
    def was_successful(self) -> bool:
        """Check if request completed successfully"""
        return self.status == InferenceStatus.COMPLETED

    @property
    def total_duration_ms(self) -> Optional[int]:
        """Calculate total request duration including queue time"""
        if self.created_at and self.completed_at:
            return int((self.completed_at - self.created_at).total_seconds() * 1000)
        return None

    @property
    def total_tokens(self) -> Optional[int]:
        """Get total tokens used (input + output), falling back to ``tokens_used``"""
        if self.input_tokens is not None and self.output_tokens is not None:
            return self.input_tokens + self.output_tokens
        return self.tokens_used

    @property
    def cost_per_token(self) -> Optional[float]:
        """
        Calculate cost per token.

        Returns None when the cost is unknown or no tokens were counted. A
        known cost of 0.0 yields 0.0 rather than None.
        """
        total = self.total_tokens
        if self.cost_usd is not None and total and total > 0:
            return self.cost_usd / total
        return None

    @property
    def throughput_tokens_per_second(self) -> Optional[float]:
        """Calculate token throughput, or None without tokens / execution time"""
        total = self.total_tokens
        if total and self.execution_time_ms and self.execution_time_ms > 0:
            return (total * 1000) / self.execution_time_ms
        return None

    def update_status(self, new_status: str, error_message: Optional[str] = None,
                      error_category: Optional[str] = None):
        """
        Update request status with timestamp tracking.

        Args:
            new_status: Status to switch to.
            error_message: Stored on the record when truthy.
            error_category: Stored on the record when truthy.

        Timestamp rules:
            * PENDING/QUEUED -> PROCESSING stamps ``started_at`` and computes
              ``queue_time_ms`` from ``created_at``.
            * RETRYING -> PROCESSING restarts ``started_at`` for the new
              attempt; ``queue_time_ms`` keeps its earlier value.
            * Switching to RETRYING clears ``completed_at`` and
              ``execution_time_ms`` so the next terminal status is stamped
              afresh instead of reusing the failed attempt's values.
            * A terminal status stamps ``completed_at`` (unless already set)
              and computes ``execution_time_ms`` when ``started_at`` is known.
        """
        old_status = self.status
        self.status = new_status

        now = datetime.now(timezone.utc)

        if new_status == InferenceStatus.PROCESSING:
            if old_status in (InferenceStatus.PENDING, InferenceStatus.QUEUED):
                self.started_at = now
                if self.created_at:
                    self.queue_time_ms = int((now - self.created_at).total_seconds() * 1000)
            elif old_status == InferenceStatus.RETRYING:
                self.started_at = now

        elif new_status == InferenceStatus.RETRYING:
            self.completed_at = None
            self.execution_time_ms = None

        elif new_status in (InferenceStatus.COMPLETED, InferenceStatus.FAILED,
                            InferenceStatus.TIMEOUT, InferenceStatus.CANCELLED):
            if not self.completed_at:
                self.completed_at = now

            if self.started_at:
                self.execution_time_ms = int((self.completed_at - self.started_at).total_seconds() * 1000)

        if error_message:
            self.error_message = error_message
        if error_category:
            self.error_category = error_category

        logger.debug(f"Request {self.request_id} status: {old_status} -> {new_status}")

    def complete_request(self, response_data: Dict[str, Any], tokens_used: Optional[int] = None,
                         cost_usd: Optional[float] = None, **kwargs):
        """
        Mark request as completed with response data.

        Args:
            response_data: Response payload; also hashed into ``response_hash``.
            tokens_used: Recorded when not None (0 is a valid value).
            cost_usd: Recorded when not None (0.0 is a valid value).
            **kwargs: Extra values copied onto dataclass fields of the same
                name; other keys are ignored.
        """
        self.response_data = response_data
        self.response_hash = self._generate_content_hash(response_data)

        # "is not None" so that an explicit zero is stored, not discarded.
        if tokens_used is not None:
            self.tokens_used = tokens_used
        if cost_usd is not None:
            self.cost_usd = cost_usd

        # Update any additional metrics
        self._apply_extra_fields(kwargs)

        self.update_status(InferenceStatus.COMPLETED)

    def fail_request(self, error_message: str, error_category: str = ErrorCategory.UNKNOWN,
                     **kwargs):
        """
        Mark request as failed with error details.

        Args:
            error_message: Human-readable failure description.
            error_category: Failure classification; defaults to UNKNOWN.
            **kwargs: Extra values copied onto dataclass fields of the same
                name; other keys are ignored.
        """
        self.error_message = error_message
        self.error_category = error_category

        # Update any additional error metrics
        self._apply_extra_fields(kwargs)

        self.update_status(InferenceStatus.FAILED, error_message, error_category)

    def increment_retry(self):
        """Increment retry count and reset to retrying status"""
        self.retry_count += 1
        self.update_status(InferenceStatus.RETRYING)

    def add_metadata(self, key: str, value: Any):
        """Add metadata entry"""
        self.metadata[key] = value

    def get_metadata(self, key: str, default: Any = None) -> Any:
        """Get metadata entry, or ``default`` when the key is absent"""
        return self.metadata.get(key, default)
|
233
|
+
|
234
|
+
@dataclass
class UsageStatistics:
    """
    Aggregated usage statistics for analytics and billing

    Contains summarized metrics for a specific time period, service type,
    model, or user for reporting and analysis purposes.

    Rate, cost and throughput fields are derived from the base counters by
    ``_calculate_derived_metrics``, which runs on construction and after
    every ``add_request_data`` call.
    """
    # Identity and scope
    stat_id: str
    period_start: datetime
    period_end: datetime
    service_type: str
    model_id: Optional[str] = None
    provider: Optional[str] = None
    user_id: Optional[str] = None
    client_id: Optional[str] = None

    # Base counters
    total_requests: int = 0
    successful_requests: int = 0
    failed_requests: int = 0
    timeout_requests: int = 0
    retry_requests: int = 0
    cache_hits: int = 0
    total_tokens: int = 0
    input_tokens: int = 0
    output_tokens: int = 0
    total_cost_usd: float = 0.0

    # Timing inputs (supplied by the caller, not derived here)
    avg_response_time_ms: float = 0.0
    p50_response_time_ms: float = 0.0
    p95_response_time_ms: float = 0.0
    p99_response_time_ms: float = 0.0
    avg_queue_time_ms: float = 0.0

    # Derived metrics (recomputed by _calculate_derived_metrics)
    requests_per_hour: float = 0.0
    tokens_per_hour: float = 0.0
    error_rate: float = 0.0
    timeout_rate: float = 0.0
    cache_hit_rate: float = 0.0
    avg_tokens_per_request: float = 0.0
    cost_per_token: float = 0.0
    cost_per_request: float = 0.0
    throughput_tokens_per_second: float = 0.0

    created_at: Optional[datetime] = None  # filled with "now" (UTC) in __post_init__

    def __post_init__(self):
        """Stamp the creation time when missing and compute derived metrics."""
        if self.created_at is None:
            self.created_at = datetime.now(timezone.utc)

        # Calculate derived metrics
        self._calculate_derived_metrics()

    def _calculate_derived_metrics(self):
        """
        Calculate derived metrics from base counts.

        Each metric is only updated when its denominator is positive;
        otherwise the previous value is left untouched.
        """
        # Rates (percentages) and cost per request
        if self.total_requests > 0:
            self.error_rate = (self.failed_requests / self.total_requests) * 100
            self.timeout_rate = (self.timeout_requests / self.total_requests) * 100
            self.cache_hit_rate = (self.cache_hits / self.total_requests) * 100
            self.cost_per_request = self.total_cost_usd / self.total_requests

        # Token metrics
        if self.total_tokens > 0:
            self.cost_per_token = self.total_cost_usd / self.total_tokens

            if self.successful_requests > 0:
                self.avg_tokens_per_request = self.total_tokens / self.successful_requests

        # Time-based metrics
        period_hours = self.period_duration_hours
        if period_hours > 0:
            self.requests_per_hour = self.total_requests / period_hours
            self.tokens_per_hour = self.total_tokens / period_hours

        # Throughput
        if self.avg_response_time_ms > 0:
            self.throughput_tokens_per_second = (self.avg_tokens_per_request * 1000) / self.avg_response_time_ms

    @property
    def success_rate(self) -> float:
        """Calculate success rate percentage as ``100 - error_rate``"""
        return 100.0 - self.error_rate

    @property
    def period_duration_hours(self) -> float:
        """Get period duration in hours"""
        return (self.period_end - self.period_start).total_seconds() / 3600

    @property
    def efficiency_score(self) -> float:
        """Calculate efficiency score (0-100) based on performance metrics"""
        score = 100.0

        # Penalty for high error rates
        score -= self.error_rate

        # Penalty for high timeout rates
        score -= self.timeout_rate * 2  # Timeouts are worse than regular errors

        # Bonus for cache hits
        score += self.cache_hit_rate * 0.1

        # Penalty for slow responses
        if self.avg_response_time_ms > 5000:  # 5+ seconds
            score -= 20
        elif self.avg_response_time_ms > 2000:  # 2+ seconds
            score -= 10
        elif self.avg_response_time_ms > 1000:  # 1+ seconds
            score -= 5

        # Clamp into the documented 0-100 range.
        return max(0.0, min(100.0, score))

    @property
    def performance_tier(self) -> str:
        """Get performance tier classification derived from ``efficiency_score``"""
        efficiency = self.efficiency_score

        if efficiency >= 90:
            return "excellent"
        if efficiency >= 75:
            return "good"
        if efficiency >= 60:
            return "average"
        if efficiency >= 40:
            return "poor"
        return "critical"

    def add_request_data(self, request: "InferenceRequest"):
        """
        Add data from an individual request to the statistics.

        Token and cost totals are only accumulated for successful requests.
        Derived metrics are recomputed afterwards.
        """
        self.total_requests += 1

        if request.was_successful:
            self.successful_requests += 1

            if request.total_tokens:
                self.total_tokens += request.total_tokens
            if request.input_tokens:
                self.input_tokens += request.input_tokens
            if request.output_tokens:
                self.output_tokens += request.output_tokens
            if request.cost_usd:
                self.total_cost_usd += request.cost_usd

        elif request.status == InferenceStatus.FAILED:
            self.failed_requests += 1
        elif request.status == InferenceStatus.TIMEOUT:
            self.timeout_requests += 1

        if request.retry_count > 0:
            self.retry_requests += 1

        if request.cache_hit:
            self.cache_hits += 1

        # Recalculate derived metrics
        self._calculate_derived_metrics()

    def merge_with(self, other: 'UsageStatistics') -> 'UsageStatistics':
        """
        Merge this statistics with another to create combined stats.

        Counters are summed and the period spans both inputs. The average
        response and queue times are combined as request-count-weighted means.
        Percentile fields cannot be combined from two summaries and are left
        at their defaults. ``service_type`` becomes ``"combined"`` when the
        two inputs differ.

        Returns:
            A new ``UsageStatistics``; neither input is modified.
        """
        combined_requests = self.total_requests + other.total_requests

        # Weighted averages are computed up front and handed to the
        # constructor so that __post_init__ derives throughput from them.
        # Assigning them afterwards would leave throughput at 0.0.
        avg_response_time_ms = 0.0
        avg_queue_time_ms = 0.0
        if combined_requests > 0:
            weight_self = self.total_requests / combined_requests
            weight_other = other.total_requests / combined_requests

            avg_response_time_ms = (self.avg_response_time_ms * weight_self +
                                    other.avg_response_time_ms * weight_other)
            avg_queue_time_ms = (self.avg_queue_time_ms * weight_self +
                                 other.avg_queue_time_ms * weight_other)

        return UsageStatistics(
            stat_id=f"merged_{self.stat_id}_{other.stat_id}",
            period_start=min(self.period_start, other.period_start),
            period_end=max(self.period_end, other.period_end),
            service_type="combined" if self.service_type != other.service_type else self.service_type,
            total_requests=combined_requests,
            successful_requests=self.successful_requests + other.successful_requests,
            failed_requests=self.failed_requests + other.failed_requests,
            timeout_requests=self.timeout_requests + other.timeout_requests,
            retry_requests=self.retry_requests + other.retry_requests,
            cache_hits=self.cache_hits + other.cache_hits,
            total_tokens=self.total_tokens + other.total_tokens,
            input_tokens=self.input_tokens + other.input_tokens,
            output_tokens=self.output_tokens + other.output_tokens,
            total_cost_usd=self.total_cost_usd + other.total_cost_usd,
            avg_response_time_ms=avg_response_time_ms,
            avg_queue_time_ms=avg_queue_time_ms,
        )
|
421
|
+
|
422
|
+
@dataclass
class ModelUsageSnapshot:
    """
    Point-in-time usage snapshot for quick analytics

    Provides a snapshot view of model usage at different time granularities
    for real-time monitoring and dashboard displays.

    ``snapshot_time`` may be omitted (or passed as ``None``); it is then set
    to the current UTC time in ``__post_init__``.
    """
    snapshot_id: str
    model_id: str
    provider: str
    # Optional so that the None fallback in __post_init__ is reachable.
    snapshot_time: Optional[datetime] = None

    # Request counters per window
    hourly_requests: int = 0
    daily_requests: int = 0
    weekly_requests: int = 0
    monthly_requests: int = 0

    # Token counters per window
    total_tokens_hour: int = 0
    total_tokens_day: int = 0
    total_tokens_week: int = 0
    total_tokens_month: int = 0

    # Accumulated cost per window
    total_cost_hour: float = 0.0
    total_cost_day: float = 0.0
    total_cost_week: float = 0.0
    total_cost_month: float = 0.0

    # Running mean response time in milliseconds (fed by record_usage)
    avg_response_time_hour: float = 0.0
    avg_response_time_day: float = 0.0

    # Success rate as a percentage (performance_score divides it by 100)
    success_rate_hour: float = 100.0
    success_rate_day: float = 100.0

    # Cache hit rate; record_usage maintains it as a percentage (0-100)
    cache_hit_rate_hour: float = 0.0
    cache_hit_rate_day: float = 0.0

    unique_users_hour: int = 0
    unique_users_day: int = 0
    peak_requests_per_minute: int = 0
    current_queue_size: int = 0
    last_used: Optional[datetime] = None
    health_status: str = "healthy"  # healthy, degraded, critical, offline

    def __post_init__(self):
        """Default ``snapshot_time`` to the current UTC time when not supplied."""
        if self.snapshot_time is None:
            self.snapshot_time = datetime.now(timezone.utc)

    @property
    def is_active(self) -> bool:
        """Check if model has been used recently (within the last hour)"""
        if not self.last_used:
            return False

        last_used = self.last_used
        if last_used.tzinfo is None:
            # Subtracting a naive datetime from an aware one raises TypeError.
            # Naive values are assumed to already be expressed in UTC.
            last_used = last_used.replace(tzinfo=timezone.utc)

        time_since_use = datetime.now(timezone.utc) - last_used
        return time_since_use.total_seconds() < 3600  # Active if used in last hour

    @property
    def utilization_trend(self) -> str:
        """
        Analyze utilization trend

        Compares the current hour's request count with the hourly average
        derived from ``daily_requests`` and returns one of "unused", "surge",
        "high", "normal", "low" or "minimal".
        """
        if self.weekly_requests == 0:
            return "unused"

        hourly_avg = self.daily_requests / 24

        if self.hourly_requests > hourly_avg * 2:
            return "surge"
        elif self.hourly_requests > hourly_avg * 1.5:
            return "high"
        elif self.hourly_requests > hourly_avg * 0.8:
            return "normal"
        elif self.hourly_requests > hourly_avg * 0.3:
            return "low"
        else:
            return "minimal"

    @property
    def cost_trend(self) -> str:
        """
        Analyze cost trend

        Compares the current hour's cost with the hourly average derived from
        ``total_cost_day`` and returns one of "no_cost", "expensive_spike",
        "above_average", "normal" or "below_average".
        """
        if self.total_cost_week == 0:
            return "no_cost"

        hourly_avg = self.total_cost_day / 24

        if self.total_cost_hour > hourly_avg * 3:
            return "expensive_spike"
        elif self.total_cost_hour > hourly_avg * 1.5:
            return "above_average"
        elif self.total_cost_hour > hourly_avg * 0.8:
            return "normal"
        else:
            return "below_average"

    @property
    def efficiency_metrics(self) -> Dict[str, float]:
        """
        Get efficiency metrics

        Denominators are clamped (cost to at least 0.01, request counts to at
        least 1) so that empty windows never divide by zero.
        """
        return {
            "requests_per_dollar_hour": self.hourly_requests / max(self.total_cost_hour, 0.01),
            "tokens_per_dollar_hour": self.total_tokens_hour / max(self.total_cost_hour, 0.01),
            "requests_per_dollar_day": self.daily_requests / max(self.total_cost_day, 0.01),
            "tokens_per_dollar_day": self.total_tokens_day / max(self.total_cost_day, 0.01),
            "avg_cost_per_request_hour": self.total_cost_hour / max(self.hourly_requests, 1),
            "avg_cost_per_request_day": self.total_cost_day / max(self.daily_requests, 1)
        }

    @property
    def performance_score(self) -> float:
        """
        Calculate overall performance score (0-100)

        Starts at 100, subtracts a penalty for slow daily response times,
        scales by the daily success rate, adds a small cache-hit bonus, applies
        the health-status multiplier and finally clamps to [0, 100].
        """
        score = 100.0

        # Response time penalty
        if self.avg_response_time_day > 5000:
            score -= 30
        elif self.avg_response_time_day > 2000:
            score -= 15
        elif self.avg_response_time_day > 1000:
            score -= 5

        # Success rate bonus/penalty
        score = score * (self.success_rate_day / 100)

        # Cache hit bonus
        score += self.cache_hit_rate_day * 0.1

        # Health status penalty
        if self.health_status == "critical":
            score *= 0.5
        elif self.health_status == "degraded":
            score *= 0.8
        elif self.health_status == "offline":
            score = 0

        return max(0.0, min(100.0, score))

    def update_health_status(self, new_status: str):
        """Update health status and refresh ``snapshot_time`` to the current UTC time"""
        self.health_status = new_status
        self.snapshot_time = datetime.now(timezone.utc)

    @staticmethod
    def _running_mean(current: float, prior_count: int, sample: float,
                      added: int, total: int) -> float:
        """Fold ``added`` observations of ``sample`` into a mean over ``prior_count`` items."""
        return ((current * prior_count) + (sample * added)) / total

    def record_usage(self, requests: int = 1, tokens: int = 0, cost: float = 0.0,
                     response_time_ms: float = 0.0, success: bool = True, cache_hit: bool = False):
        """
        Record usage activity

        Adds ``requests``, ``tokens`` and ``cost`` to every time window, folds
        ``response_time_ms`` into the hourly/daily running means, and updates
        the hourly/daily success and cache-hit percentages from ``success`` and
        ``cache_hit`` (each call counts as ``requests`` identical outcomes).
        ``last_used`` is set to the current UTC time.
        """
        self.hourly_requests += requests
        self.daily_requests += requests
        self.weekly_requests += requests
        self.monthly_requests += requests

        self.total_tokens_hour += tokens
        self.total_tokens_day += tokens
        self.total_tokens_week += tokens
        self.total_tokens_month += tokens

        self.total_cost_hour += cost
        self.total_cost_day += cost
        self.total_cost_week += cost
        self.total_cost_month += cost

        # Outcome of this batch expressed on the same 0-100 scale as the rate fields.
        success_pct = 100.0 if success else 0.0
        cache_pct = 100.0 if cache_hit else 0.0

        # Update averages (simplified - would use proper moving averages in production)
        if self.hourly_requests > 0:
            total = self.hourly_requests
            prior = total - requests
            self.avg_response_time_hour = self._running_mean(
                self.avg_response_time_hour, prior, response_time_ms, requests, total)
            self.success_rate_hour = self._running_mean(
                self.success_rate_hour, prior, success_pct, requests, total)
            self.cache_hit_rate_hour = self._running_mean(
                self.cache_hit_rate_hour, prior, cache_pct, requests, total)

        if self.daily_requests > 0:
            total = self.daily_requests
            prior = total - requests
            self.avg_response_time_day = self._running_mean(
                self.avg_response_time_day, prior, response_time_ms, requests, total)
            self.success_rate_day = self._running_mean(
                self.success_rate_day, prior, success_pct, requests, total)
            self.cache_hit_rate_day = self._running_mean(
                self.cache_hit_rate_day, prior, cache_pct, requests, total)

        self.last_used = datetime.now(timezone.utc)
|
584
|
+
|
585
|
+
# Utility functions for working with inference models
|
586
|
+
|
587
|
+
def create_inference_request(
    service_type: str,
    model_id: str,
    provider: str,
    endpoint: str,
    request_data: Dict[str, Any],
    user_id: Optional[str] = None,
    **kwargs
) -> InferenceRequest:
    """
    Build an InferenceRequest with a freshly generated request id.

    The id has the form ``inf_<YYYYmmdd_HHMMSS>_<8 hex chars>``: the current
    time from ``datetime.now()`` followed by the first eight hex digits of a
    random UUID4. Any extra keyword arguments are forwarded unchanged to the
    ``InferenceRequest`` constructor.
    """
    import uuid

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    random_suffix = uuid.uuid4().hex[:8]
    generated_id = f"inf_{timestamp}_{random_suffix}"

    new_request = InferenceRequest(
        request_id=generated_id,
        service_type=service_type,
        model_id=model_id,
        provider=provider,
        endpoint=endpoint,
        request_data=request_data,
        user_id=user_id,
        **kwargs
    )
    return new_request
|
611
|
+
|
612
|
+
def create_usage_statistics(
    period_start: datetime,
    period_end: datetime,
    service_type: str,
    model_id: Optional[str] = None,
    provider: Optional[str] = None,
    user_id: Optional[str] = None
) -> UsageStatistics:
    """
    Build a UsageStatistics record for the given period.

    The generated id has the form ``stat_<YYYYmmdd_HH>_<6 hex chars>``, where
    the date/hour part comes from ``period_start`` and the suffix is the first
    six hex digits of a random UUID4. All other arguments are passed straight
    through to the ``UsageStatistics`` constructor.
    """
    import uuid

    period_label = period_start.strftime('%Y%m%d_%H')
    random_suffix = uuid.uuid4().hex[:6]
    generated_id = f"stat_{period_label}_{random_suffix}"

    statistics = UsageStatistics(
        stat_id=generated_id,
        period_start=period_start,
        period_end=period_end,
        service_type=service_type,
        model_id=model_id,
        provider=provider,
        user_id=user_id
    )
    return statistics
|
634
|
+
|
635
|
+
def create_model_snapshot(
    model_id: str,
    provider: str
) -> ModelUsageSnapshot:
    """
    Factory function to create model usage snapshot

    The snapshot id has the form ``snap_<model_id>_<provider>_<YYYYmmdd_HH>``.
    The id's hour bucket and the snapshot's ``snapshot_time`` are taken from
    the same timezone-aware UTC instant so the two always agree.
    """
    # snapshot_time must be supplied explicitly: omitting it makes the
    # ModelUsageSnapshot constructor fail with a missing-argument TypeError.
    now = datetime.now(timezone.utc)
    snapshot_id = f"snap_{model_id}_{provider}_{now.strftime('%Y%m%d_%H')}"

    return ModelUsageSnapshot(
        snapshot_id=snapshot_id,
        model_id=model_id,
        provider=provider,
        snapshot_time=now
    )
|
647
|
+
|
648
|
+
def calculate_usage_summary(requests: List[InferenceRequest]) -> Dict[str, Any]:
    """
    Summarise a list of inference requests into aggregate counters.

    Returns ``{"total_requests": 0}`` for an empty list. Otherwise the result
    contains request counts by outcome, the success rate in percent, total
    cost and tokens (missing values count as zero), the mean execution time
    over requests with a truthy ``execution_time_ms``, and per-request /
    per-token cost.
    """
    if not requests:
        return {"total_requests": 0}

    request_count = len(requests)

    succeeded = 0
    failures = 0
    timed_out = 0
    cost_sum = 0
    token_sum = 0
    timing_sum = 0
    timing_count = 0

    for item in requests:
        if item.was_successful:
            succeeded += 1
        if item.status == InferenceStatus.FAILED:
            failures += 1
        if item.status == InferenceStatus.TIMEOUT:
            timed_out += 1

        cost_sum += item.cost_usd or 0
        token_sum += item.total_tokens or 0

        # Falsy timings (None or 0) are left out of the average.
        if item.execution_time_ms:
            timing_sum += item.execution_time_ms
            timing_count += 1

    if timing_count:
        mean_execution_time = timing_sum / timing_count
    else:
        mean_execution_time = 0

    if token_sum > 0:
        cost_per_token = round(cost_sum / token_sum, 8)
    else:
        cost_per_token = 0

    # request_count is at least 1 here (the empty case returned above),
    # so the per-request ratios need no zero guard.
    summary = {
        "total_requests": request_count,
        "successful_requests": succeeded,
        "failed_requests": failures,
        "timeout_requests": timed_out,
        "success_rate": (succeeded / request_count) * 100,
        "total_cost_usd": round(cost_sum, 4),
        "total_tokens": token_sum,
        "avg_execution_time_ms": round(mean_execution_time, 2),
        "cost_per_request": round(cost_sum / request_count, 6),
        "cost_per_token": cost_per_token
    }
    return summary
|