isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/client.py +466 -43
- isa_model/core/cache/redis_cache.py +12 -3
- isa_model/core/config/config_manager.py +230 -3
- isa_model/core/config.py +90 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +21 -1
- isa_model/core/database/supabase_client.py +154 -19
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +27 -18
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_manager.py +40 -17
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +174 -18
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/services/intelligent_model_selector.py +399 -21
- isa_model/core/storage/hf_storage.py +1 -1
- isa_model/core/types.py +1 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -370
- isa_model/deployment/local/__init__.py +31 -0
- isa_model/deployment/local/config.py +248 -0
- isa_model/deployment/local/gpu_gateway.py +607 -0
- isa_model/deployment/local/health_checker.py +428 -0
- isa_model/deployment/local/provider.py +586 -0
- isa_model/deployment/local/tensorrt_service.py +621 -0
- isa_model/deployment/local/transformers_service.py +644 -0
- isa_model/deployment/local/vllm_service.py +527 -0
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +137 -10
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/openai_stt_service.py +22 -6
- isa_model/inference/services/custom_model_manager.py +277 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +335 -24
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/local_llm_service.py +747 -0
- isa_model/inference/services/llm/ollama_llm_service.py +9 -2
- isa_model/inference/services/llm/openai_llm_service.py +33 -16
- isa_model/inference/services/llm/yyds_llm_service.py +8 -2
- isa_model/inference/services/vision/__init__.py +22 -1
- isa_model/inference/services/vision/blip_vision_service.py +359 -0
- isa_model/inference/services/vision/helpers/image_utils.py +8 -5
- isa_model/inference/services/vision/isa_vision_service.py +65 -4
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +172 -22
- isa_model/serving/api/middleware/auth.py +8 -2
- isa_model/serving/api/middleware/security.py +23 -33
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +4 -1
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +138 -2
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +680 -18
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +68 -54
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
- isa_model-0.4.3.dist-info/RECORD +193 -0
- isa_model/core/storage/minio_storage.py +0 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks/__init__.py +0 -27
- isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
- isa_model/eval/benchmarks.py +0 -701
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -24
- isa_model/eval/evaluators/audio_evaluator.py +0 -727
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/embedding_evaluator.py +0 -742
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/evaluators/vision_evaluator.py +0 -564
- isa_model/eval/example_evaluation.py +0 -395
- isa_model/eval/factory.py +0 -798
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/isa_benchmarks.py +0 -700
- isa_model/eval/isa_integration.py +0 -582
- isa_model/eval/metrics.py +0 -951
- isa_model/eval/tests/unit/test_basic.py +0 -396
- isa_model/serving/api/routes/evaluations.py +0 -579
- isa_model/training/__init__.py +0 -168
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -26
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/examples/intelligent_training_example.py +0 -281
- isa_model/training/factory.py +0 -424
- isa_model/training/intelligent/__init__.py +0 -25
- isa_model/training/intelligent/decision_engine.py +0 -643
- isa_model/training/intelligent/intelligent_factory.py +0 -888
- isa_model/training/intelligent/knowledge_base.py +0 -751
- isa_model/training/intelligent/resource_optimizer.py +0 -839
- isa_model/training/intelligent/task_classifier.py +0 -576
- isa_model/training/storage/__init__.py +0 -24
- isa_model/training/storage/core_integration.py +0 -439
- isa_model/training/storage/training_repository.py +0 -552
- isa_model/training/storage/training_storage.py +0 -628
- isa_model-0.4.0.dist-info/RECORD +0 -182
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
Expanded diff: isa_model/training/intelligent/resource_optimizer.py (839 lines removed)

```diff
@@ -1,839 +0,0 @@
-"""
-Resource Optimization System for Training
-
-This module provides intelligent resource selection and cost optimization:
-- GPU type selection based on model requirements
-- Cloud provider comparison and selection
-- Cost estimation and budget optimization
-- Performance prediction and time estimation
-- Resource availability monitoring
-
-Optimizes for cost, performance, and availability based on user constraints.
-"""
-
-import logging
-from typing import Dict, List, Optional, Any, Tuple
-from dataclasses import dataclass, field
-from datetime import datetime
-import json
-import os
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class GPUSpec:
-    """GPU specification and characteristics."""
-
-    name: str
-    memory_gb: int
-    compute_capability: float
-
-    # Performance characteristics
-    fp16_tflops: float
-    fp32_tflops: float
-    memory_bandwidth_gbps: float
-
-    # Cost (per hour in USD)
-    cost_per_hour: float = 0.0
-
-    # Availability
-    availability_score: float = 1.0  # 0.0 to 1.0
-
-    # Provider information
-    providers: List[str] = field(default_factory=list)
-
-    # Training characteristics
-    training_efficiency: float = 1.0  # Relative efficiency for training
-    power_efficiency: float = 1.0  # Performance per watt
-
-    # Metadata
-    is_recommended: bool = False
-    description: str = ""
-
-
-@dataclass
-class CloudProvider:
-    """Cloud provider specification."""
-
-    name: str
-    regions: List[str]
-
-    # Available GPU types
-    available_gpus: List[str]
-
-    # Pricing model
-    pricing_model: str = "hourly"  # "hourly", "spot", "reserved"
-
-    # Features
-    supports_spot_instances: bool = False
-    supports_auto_scaling: bool = False
-    supports_preemption: bool = False
-
-    # Performance characteristics
-    startup_time_minutes: float = 5.0
-    network_performance: str = "standard"  # "low", "standard", "high"
-
-    # Reliability
-    availability_score: float = 0.99
-
-    # Additional costs
-    storage_cost_per_gb_hour: float = 0.0
-    egress_cost_per_gb: float = 0.0
-
-    description: str = ""
-
-
-@dataclass
-class ResourceRecommendation:
-    """Resource optimization recommendation."""
-
-    # Selected resources
-    gpu: str
-    cloud_provider: str
-    region: str
-    instance_type: str
-
-    # Cost estimates
-    estimated_cost: float
-    cost_breakdown: Dict[str, float]
-
-    # Performance estimates
-    estimated_time: float  # hours
-    performance_score: float
-
-    # Configuration
-    recommended_batch_size: int
-    recommended_precision: str  # "fp16", "fp32", "bf16"
-
-    # Alternatives
-    alternatives: List[Dict[str, Any]]
-
-    # Reasoning
-    decision_factors: List[str]
-    confidence: float
-
-    # Metadata
-    created_at: datetime = field(default_factory=datetime.now)
-
-
-class ResourceOptimizer:
-    """
-    Intelligent resource optimization system.
-
-    This class analyzes training requirements and recommends optimal resources:
-    - GPU selection based on model size and requirements
-    - Cloud provider comparison for cost and performance
-    - Cost estimation and budget optimization
-    - Performance prediction and time estimation
-
-    Example:
-        ```python
-        optimizer = ResourceOptimizer()
-
-        recommendation = optimizer.optimize_resources(
-            model_name="google/gemma-2-7b-it",
-            training_config=config,
-            budget_limit=100.0,
-            time_limit=8
-        )
-
-        print(f"Recommended: {recommendation.gpu} on {recommendation.cloud_provider}")
-        print(f"Cost: ${recommendation.estimated_cost:.2f}")
-        ```
-    """
-
-    def __init__(self, data_dir: Optional[str] = None):
-        """
-        Initialize resource optimizer.
-
-        Args:
-            data_dir: Directory for storing resource data
-        """
-        self.data_dir = data_dir or os.path.join(os.getcwd(), "resource_data")
-        os.makedirs(self.data_dir, exist_ok=True)
-
-        # Initialize resource databases
-        self.gpus: Dict[str, GPUSpec] = {}
-        self.cloud_providers: Dict[str, CloudProvider] = {}
-        self.pricing_cache: Dict[str, Dict[str, float]] = {}
-
-        # Load resource data
-        self._load_resource_data()
-
-        # Initialize with defaults if empty
-        if not self.gpus:
-            self._initialize_default_resources()
-
-        logger.info(f"Resource optimizer initialized with {len(self.gpus)} GPUs and {len(self.cloud_providers)} providers")
-
-    def optimize_resources(
-        self,
-        model_name: str,
-        training_config: Any,
-        budget_limit: Optional[float] = None,
-        time_limit: Optional[int] = None,
-        preferences: Optional[Dict[str, Any]] = None
-    ) -> ResourceRecommendation:
-        """
-        Optimize resource selection for training requirements.
-
-        Args:
-            model_name: Name of the model to train
-            training_config: Training configuration
-            budget_limit: Maximum budget in USD
-            time_limit: Maximum time in hours
-            preferences: User preferences for GPU/cloud providers
-
-        Returns:
-            Optimal resource recommendation
-        """
-        preferences = preferences or {}
-
-        logger.info(f"Optimizing resources for {model_name}")
-
-        try:
-            # Step 1: Analyze model requirements
-            model_requirements = self._analyze_model_requirements(model_name, training_config)
-
-            # Step 2: Filter compatible GPUs
-            compatible_gpus = self._filter_compatible_gpus(model_requirements)
-
-            # Step 3: Estimate costs and performance for each option
-            gpu_options = []
-            total_evaluated = 0
-            total_filtered = 0
-
-            for gpu_name in compatible_gpus:
-                gpu_spec = self.gpus[gpu_name]
-
-                # Get best provider for this GPU
-                provider_options = self._get_provider_options(gpu_name, preferences)
-
-                for provider_name, provider_spec, region, instance_type in provider_options:
-                    total_evaluated += 1
-                    option = self._evaluate_option(
-                        gpu_spec, provider_spec, region, instance_type,
-                        model_requirements, budget_limit, time_limit
-                    )
-
-                    if option:
-                        gpu_options.append(option)
-                    else:
-                        total_filtered += 1
-
-            # Step 4: Rank options by overall score
-            if not gpu_options:
-                logger.warning(f"No compatible GPU options found. Evaluated {total_evaluated} options, {total_filtered} filtered by constraints.")
-                logger.warning(f"Budget limit: {budget_limit}, Time limit: {time_limit}")
-                raise ValueError("No compatible GPU options found")
-
-            gpu_options.sort(key=lambda x: x["score"], reverse=True)
-
-            # Step 5: Select best option
-            best_option = gpu_options[0]
-
-            # Step 6: Generate alternatives
-            alternatives = self._generate_alternatives(gpu_options[1:5])  # Top 5 alternatives
-
-            # Step 7: Create recommendation
-            recommendation = ResourceRecommendation(
-                gpu=best_option["gpu"],
-                cloud_provider=best_option["provider"],
-                region=best_option["region"],
-                instance_type=best_option["instance_type"],
-                estimated_cost=best_option["cost"],
-                cost_breakdown=best_option["cost_breakdown"],
-                estimated_time=best_option["time"],
-                performance_score=best_option["performance"],
-                recommended_batch_size=best_option["batch_size"],
-                recommended_precision=best_option["precision"],
-                alternatives=alternatives,
-                decision_factors=best_option["reasons"],
-                confidence=best_option["confidence"]
-            )
-
-            logger.info(f"Selected {recommendation.gpu} on {recommendation.cloud_provider} "
-                        f"(${recommendation.estimated_cost:.2f}, {recommendation.estimated_time:.1f}h)")
-
-            return recommendation
-
-        except Exception as e:
-            logger.error(f"Resource optimization failed: {e}")
-            raise
-
-    def _analyze_model_requirements(self, model_name: str, training_config: Any) -> Dict[str, Any]:
-        """Analyze model resource requirements."""
-        requirements = {
-            "min_memory_gb": 8,
-            "recommended_memory_gb": 16,
-            "compute_intensity": "medium",  # "low", "medium", "high"
-            "precision": "fp16",
-            "batch_size": getattr(training_config, 'batch_size', 4),
-            "sequence_length": 1024,
-            "model_size_gb": 4.0,
-            "training_type": getattr(training_config, 'training_type', 'sft')
-        }
-
-        # Estimate model size and requirements based on name
-        if "2b" in model_name.lower():
-            requirements.update({
-                "min_memory_gb": 6,  # Reduced for LoRA training
-                "recommended_memory_gb": 10,
-                "model_size_gb": 4.0,
-                "compute_intensity": "medium"
-            })
-        elif "4b" in model_name.lower():
-            requirements.update({
-                "min_memory_gb": 8,  # Reduced for LoRA training
-                "recommended_memory_gb": 12,
-                "model_size_gb": 8.0,
-                "compute_intensity": "medium"
-            })
-        elif "7b" in model_name.lower():
-            requirements.update({
-                "min_memory_gb": 12,  # Reduced for LoRA training
-                "recommended_memory_gb": 16,
-                "model_size_gb": 14.0,
-                "compute_intensity": "high"
-            })
-        elif "13b" in model_name.lower():
-            requirements.update({
-                "min_memory_gb": 20,  # Reduced for LoRA training
-                "recommended_memory_gb": 32,
-                "model_size_gb": 26.0,
-                "compute_intensity": "high"
-            })
-
-        # Adjust for LoRA training (most training uses LoRA)
-        if hasattr(training_config, 'lora_config') and training_config.lora_config and training_config.lora_config.use_lora:
-            requirements["min_memory_gb"] = int(requirements["min_memory_gb"] * 0.8)
-            requirements["recommended_memory_gb"] = int(requirements["recommended_memory_gb"] * 0.9)
-        else:
-            # Assume LoRA by default for most efficient training
-            requirements["min_memory_gb"] = int(requirements["min_memory_gb"] * 0.8)
-            requirements["recommended_memory_gb"] = int(requirements["recommended_memory_gb"] * 0.9)
-
-        # Adjust for batch size
-        batch_size = requirements["batch_size"]
-        if batch_size > 4:
-            requirements["min_memory_gb"] = int(requirements["min_memory_gb"] * (1 + (batch_size - 4) * 0.15))
-            requirements["recommended_memory_gb"] = int(requirements["recommended_memory_gb"] * (1 + (batch_size - 4) * 0.15))
-
-        return requirements
-
-    def _filter_compatible_gpus(self, requirements: Dict[str, Any]) -> List[str]:
-        """Filter GPUs that meet the requirements."""
-        compatible = []
-
-        min_memory = requirements["min_memory_gb"]
-
-        for gpu_name, gpu_spec in self.gpus.items():
-            if gpu_spec.memory_gb >= min_memory:
-                compatible.append(gpu_name)
-
-        return compatible
-
-    def _get_provider_options(self, gpu_name: str, preferences: Dict[str, Any]) -> List[Tuple[str, CloudProvider, str, str]]:
-        """Get provider options for a GPU."""
-        options = []
-        gpu_spec = self.gpus[gpu_name]
-
-        for provider_name in gpu_spec.providers:
-            if provider_name in self.cloud_providers:
-                provider_spec = self.cloud_providers[provider_name]
-
-                # Skip if not in user preferences
-                if preferences.get("cloud") and provider_name not in preferences["cloud"]:
-                    continue
-
-                # Get regions and instance types
-                for region in provider_spec.regions[:2]:  # Limit to top 2 regions
-                    instance_type = f"{gpu_name.lower().replace(' ', '-')}-instance"
-                    options.append((provider_name, provider_spec, region, instance_type))
-
-        return options
-
-    def _evaluate_option(
-        self,
-        gpu_spec: GPUSpec,
-        provider_spec: CloudProvider,
-        region: str,
-        instance_type: str,
-        requirements: Dict[str, Any],
-        budget_limit: Optional[float],
-        time_limit: Optional[int]
-    ) -> Optional[Dict[str, Any]]:
-        """Evaluate a specific resource option."""
-
-        # Estimate training time (more realistic for LoRA training)
-        base_time = 3.0  # Base training time in hours for LoRA
-        time_factor = 1.0 / gpu_spec.training_efficiency
-
-        # Adjust base time for model size
-        model_size_gb = requirements.get("model_size_gb", 8.0)
-        if model_size_gb > 20:  # 13B+ models
-            base_time = 6.0
-        elif model_size_gb > 12:  # 7B models
-            base_time = 4.0
-        elif model_size_gb > 6:  # 4B models
-            base_time = 3.0
-        else:  # 2B models
-            base_time = 2.0
-
-        # Adjust for compute intensity
-        if requirements["compute_intensity"] == "high":
-            time_factor *= 1.3
-        elif requirements["compute_intensity"] == "low":
-            time_factor *= 0.8
-
-        # Adjust for training type (LoRA is much faster)
-        if requirements.get("training_type") == "sft":
-            time_factor *= 0.7  # LoRA SFT is typically faster
-
-        estimated_time = base_time * time_factor
-
-        # Estimate costs
-        compute_cost = gpu_spec.cost_per_hour * estimated_time
-        storage_cost = provider_spec.storage_cost_per_gb_hour * 100 * estimated_time  # Assume 100GB storage
-
-        total_cost = compute_cost + storage_cost
-
-        # Check constraints
-        if budget_limit and total_cost > budget_limit:
-            return None
-
-        if time_limit and estimated_time > time_limit:
-            return None
-
-        # Calculate performance score
-        performance_score = self._calculate_performance_score(gpu_spec, requirements)
-
-        # Calculate cost efficiency
-        cost_efficiency = performance_score / total_cost if total_cost > 0 else 0
-
-        # Calculate overall score
-        score = self._calculate_overall_score(
-            performance_score, cost_efficiency, gpu_spec, provider_spec, requirements
-        )
-
-        # Determine optimal batch size and precision
-        batch_size = self._determine_optimal_batch_size(gpu_spec, requirements)
-        precision = self._determine_optimal_precision(gpu_spec, requirements)
-
-        # Generate reasons
-        reasons = self._generate_option_reasons(gpu_spec, provider_spec, total_cost, estimated_time)
-
-        return {
-            "gpu": gpu_spec.name,
-            "provider": provider_spec.name,
-            "region": region,
-            "instance_type": instance_type,
-            "cost": total_cost,
-            "cost_breakdown": {
-                "compute": compute_cost,
-                "storage": storage_cost
-            },
-            "time": estimated_time,
-            "performance": performance_score,
-            "batch_size": batch_size,
-            "precision": precision,
-            "score": score,
-            "reasons": reasons,
-            "confidence": min(1.0, score / 100.0)
-        }
-
-    def _calculate_performance_score(self, gpu_spec: GPUSpec, requirements: Dict[str, Any]) -> float:
-        """Calculate performance score for a GPU."""
-        score = 0.0
-
-        # Memory adequacy
-        memory_ratio = gpu_spec.memory_gb / requirements["recommended_memory_gb"]
-        if memory_ratio >= 1.0:
-            score += 30
-        else:
-            score += memory_ratio * 30
-
-        # Compute performance
-        if requirements["precision"] == "fp16":
-            compute_score = min(30, gpu_spec.fp16_tflops / 100 * 30)
-        else:
-            compute_score = min(30, gpu_spec.fp32_tflops / 50 * 30)
-        score += compute_score
-
-        # Training efficiency
-        score += gpu_spec.training_efficiency * 20
-
-        # Memory bandwidth
-        bandwidth_score = min(20, gpu_spec.memory_bandwidth_gbps / 1000 * 20)
-        score += bandwidth_score
-
-        return score
-
-    def _calculate_overall_score(
-        self,
-        performance_score: float,
-        cost_efficiency: float,
-        gpu_spec: GPUSpec,
-        provider_spec: CloudProvider,
-        requirements: Dict[str, Any]
-    ) -> float:
-        """Calculate overall option score."""
-        score = 0.0
-
-        # Performance weight (40%)
-        score += performance_score * 0.4
-
-        # Cost efficiency weight (30%)
-        score += cost_efficiency * 30 * 0.3
-
-        # Availability weight (15%)
-        score += gpu_spec.availability_score * provider_spec.availability_score * 15
-
-        # Recommendation bonus (10%)
-        if gpu_spec.is_recommended:
-            score += 10
-
-        # Provider reliability (5%)
-        score += provider_spec.availability_score * 5
-
-        return score
-
-    def _determine_optimal_batch_size(self, gpu_spec: GPUSpec, requirements: Dict[str, Any]) -> int:
-        """Determine optimal batch size for GPU."""
-        base_batch_size = requirements["batch_size"]
-
-        # Adjust based on GPU memory
-        if gpu_spec.memory_gb >= 40:
-            return min(base_batch_size * 4, 16)
-        elif gpu_spec.memory_gb >= 24:
-            return min(base_batch_size * 2, 8)
-        elif gpu_spec.memory_gb >= 16:
-            return base_batch_size
-        else:
-            return max(1, base_batch_size // 2)
-
-    def _determine_optimal_precision(self, gpu_spec: GPUSpec, requirements: Dict[str, Any]) -> str:
-        """Determine optimal precision for GPU."""
-        # Prefer fp16 for modern GPUs with good fp16 performance
-        if gpu_spec.fp16_tflops > gpu_spec.fp32_tflops * 1.5:
-            return "fp16"
-        else:
-            return "fp32"
-
-    def _generate_option_reasons(
-        self,
-        gpu_spec: GPUSpec,
-        provider_spec: CloudProvider,
-        cost: float,
-        time: float
-    ) -> List[str]:
-        """Generate reasons for selecting this option."""
-        reasons = []
-
-        reasons.append(f"{gpu_spec.name} provides {gpu_spec.memory_gb}GB memory")
-
-        if gpu_spec.is_recommended:
-            reasons.append("Recommended GPU for this model type")
-
-        if cost < 50:
-            reasons.append("Cost-effective option")
-        elif cost < 100:
-            reasons.append("Moderate cost option")
-
-        if time < 5:
-            reasons.append("Fast training time")
-        elif time < 12:
-            reasons.append("Reasonable training time")
-
-        if provider_spec.availability_score > 0.95:
-            reasons.append("High availability provider")
-
-        return reasons
-
-    def _generate_alternatives(self, options: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """Generate alternative recommendations."""
-        alternatives = []
-
-        for option in options:
-            alt = {
-                "gpu": option["gpu"],
-                "provider": option["provider"],
-                "cost": option["cost"],
-                "time": option["time"],
-                "performance": option["performance"],
-                "reason": f"Alternative option with different cost/performance tradeoff"
-            }
-            alternatives.append(alt)
-
-        return alternatives
-
-    def _initialize_default_resources(self) -> None:
-        """Initialize with default GPU and cloud provider data."""
-        self._add_default_gpus()
-        self._add_default_cloud_providers()
-        self._save_resource_data()
-
-        logger.info("Initialized resource optimizer with default data")
-
-    def _add_default_gpus(self) -> None:
-        """Add default GPU specifications."""
-        gpus = [
-            GPUSpec(
-                name="NVIDIA RTX A6000",
-                memory_gb=48,
-                compute_capability=8.6,
-                fp16_tflops=150,
-                fp32_tflops=38,
-                memory_bandwidth_gbps=768,
-                cost_per_hour=1.89,
-                availability_score=0.8,
-                providers=["runpod", "vast", "lambda"],
-                training_efficiency=1.0,
-                power_efficiency=0.9,
-                is_recommended=True,
-                description="High-memory professional GPU ideal for large models"
-            ),
-            GPUSpec(
-                name="NVIDIA RTX 4090",
-                memory_gb=24,
-                compute_capability=8.9,
-                fp16_tflops=165,
-                fp32_tflops=83,
-                memory_bandwidth_gbps=1008,
-                cost_per_hour=1.25,
-                availability_score=0.9,
-                providers=["runpod", "vast"],
-                training_efficiency=1.1,
-                power_efficiency=1.0,
-                is_recommended=True,
-                description="Latest consumer GPU with excellent performance"
-            ),
-            GPUSpec(
-                name="NVIDIA A100 40GB",
-                memory_gb=40,
-                compute_capability=8.0,
-                fp16_tflops=312,
-                fp32_tflops=19.5,
-                memory_bandwidth_gbps=1555,
-                cost_per_hour=2.95,
-                availability_score=0.7,
-                providers=["runpod", "aws", "gcp"],
-                training_efficiency=1.2,
-                power_efficiency=1.1,
-                is_recommended=True,
-                description="Data center GPU optimized for AI training"
-            ),
-            GPUSpec(
-                name="NVIDIA RTX 3090",
-                memory_gb=24,
-                compute_capability=8.6,
-                fp16_tflops=142,
-                fp32_tflops=35.6,
-                memory_bandwidth_gbps=936,
-                cost_per_hour=0.89,
-                availability_score=0.95,
-                providers=["runpod", "vast", "lambda"],
-                training_efficiency=0.9,
-                power_efficiency=0.8,
-                is_recommended=False,
-                description="Previous generation high-memory consumer GPU"
-            ),
-            GPUSpec(
-                name="NVIDIA RTX 4080",
-                memory_gb=16,
-                compute_capability=8.9,
-                fp16_tflops=120,
-                fp32_tflops=48.7,
-                memory_bandwidth_gbps=716,
-                cost_per_hour=0.95,
-                availability_score=0.85,
-                providers=["runpod", "vast"],
-                training_efficiency=1.0,
-                power_efficiency=1.0,
-                is_recommended=False,
-                description="Mid-range modern GPU for smaller models"
-            ),
-            GPUSpec(
-                name="NVIDIA RTX 3080",
-                memory_gb=10,
-                compute_capability=8.6,
-                fp16_tflops=119,
-                fp32_tflops=29.8,
-                memory_bandwidth_gbps=760,
-                cost_per_hour=0.55,
-                availability_score=0.9,
-                providers=["runpod", "vast", "lambda"],
-                training_efficiency=0.8,
-                power_efficiency=0.8,
-                is_recommended=False,
-                description="Budget-friendly option for small models"
-            )
-        ]
-
-        for gpu in gpus:
-            self.gpus[gpu.name] = gpu
-
-    def _add_default_cloud_providers(self) -> None:
-        """Add default cloud provider specifications."""
-        providers = [
-            CloudProvider(
-                name="runpod",
-                regions=["US-East", "US-West", "EU-West"],
-                available_gpus=["NVIDIA RTX A6000", "NVIDIA RTX 4090", "NVIDIA A100 40GB", "NVIDIA RTX 3090", "NVIDIA RTX 4080", "NVIDIA RTX 3080"],
-                pricing_model="hourly",
-                supports_spot_instances=True,
-                supports_auto_scaling=False,
-                supports_preemption=True,
-                startup_time_minutes=2.0,
-                network_performance="high",
-                availability_score=0.95,
-                storage_cost_per_gb_hour=0.0002,
-                egress_cost_per_gb=0.02,
-                description="Specialized GPU cloud for AI/ML workloads"
-            ),
-            CloudProvider(
-                name="vast",
-                regions=["Global"],
-                available_gpus=["NVIDIA RTX A6000", "NVIDIA RTX 4090", "NVIDIA RTX 3090", "NVIDIA RTX 4080", "NVIDIA RTX 3080"],
-                pricing_model="spot",
-                supports_spot_instances=True,
-                supports_auto_scaling=False,
-                supports_preemption=True,
-                startup_time_minutes=3.0,
-                network_performance="standard",
-                availability_score=0.85,
-                storage_cost_per_gb_hour=0.0001,
-                egress_cost_per_gb=0.01,
-                description="Decentralized GPU marketplace with competitive pricing"
-            ),
-            CloudProvider(
-                name="lambda",
-                regions=["US-East", "US-West"],
-                available_gpus=["NVIDIA RTX A6000", "NVIDIA RTX 3090", "NVIDIA RTX 3080"],
-                pricing_model="hourly",
-                supports_spot_instances=False,
-                supports_auto_scaling=True,
-                supports_preemption=False,
-                startup_time_minutes=1.0,
-                network_performance="high",
-                availability_score=0.98,
-                storage_cost_per_gb_hour=0.0003,
-                egress_cost_per_gb=0.05,
-                description="Premium GPU cloud with high reliability"
-            ),
-            CloudProvider(
-                name="aws",
-                regions=["us-east-1", "us-west-2", "eu-west-1"],
-                available_gpus=["NVIDIA A100 40GB"],
-                pricing_model="hourly",
-                supports_spot_instances=True,
-                supports_auto_scaling=True,
-                supports_preemption=True,
-                startup_time_minutes=5.0,
-                network_performance="high",
-                availability_score=0.99,
-                storage_cost_per_gb_hour=0.0005,
-                egress_cost_per_gb=0.09,
-                description="Enterprise cloud with comprehensive services"
-            ),
-            CloudProvider(
-                name="gcp",
-                regions=["us-central1", "us-east1", "europe-west1"],
-                available_gpus=["NVIDIA A100 40GB"],
-                pricing_model="hourly",
-                supports_spot_instances=True,
-                supports_auto_scaling=True,
-                supports_preemption=True,
-                startup_time_minutes=4.0,
-                network_performance="high",
-                availability_score=0.99,
-                storage_cost_per_gb_hour=0.0004,
-                egress_cost_per_gb=0.08,
-                description="Google's cloud platform with AI/ML focus"
-            )
-        ]
-
-        for provider in providers:
-            self.cloud_providers[provider.name] = provider
-
-    def _load_resource_data(self) -> None:
-        """Load resource data from disk."""
-        try:
-            self._load_gpus()
-            self._load_cloud_providers()
-        except Exception as e:
-            logger.warning(f"Failed to load resource data: {e}")
-
-    def _save_resource_data(self) -> None:
-        """Save resource data to disk."""
-        try:
-            self._save_gpus()
-            self._save_cloud_providers()
-        except Exception as e:
-            logger.error(f"Failed to save resource data: {e}")
-
-    def _load_gpus(self) -> None:
-        """Load GPU data from disk."""
-        gpus_file = os.path.join(self.data_dir, "gpus.json")
-        if os.path.exists(gpus_file):
-            with open(gpus_file, 'r') as f:
-                data = json.load(f)
-                for name, gpu_data in data.items():
-                    self.gpus[name] = GPUSpec(**gpu_data)
-
-    def _save_gpus(self) -> None:
-        """Save GPU data to disk."""
-        gpus_file = os.path.join(self.data_dir, "gpus.json")
-        with open(gpus_file, 'w') as f:
-            from dataclasses import asdict
-            data = {name: asdict(gpu) for name, gpu in self.gpus.items()}
-            json.dump(data, f, indent=2)
-
-    def _load_cloud_providers(self) -> None:
-        """Load cloud provider data from disk."""
-        providers_file = os.path.join(self.data_dir, "cloud_providers.json")
-        if os.path.exists(providers_file):
-            with open(providers_file, 'r') as f:
-                data = json.load(f)
-                for name, provider_data in data.items():
-                    self.cloud_providers[name] = CloudProvider(**provider_data)
-
-    def _save_cloud_providers(self) -> None:
-        """Save cloud provider data to disk."""
-        providers_file = os.path.join(self.data_dir, "cloud_providers.json")
-        with open(providers_file, 'w') as f:
-            from dataclasses import asdict
-            data = {name: asdict(provider) for name, provider in self.cloud_providers.items()}
-            json.dump(data, f, indent=2)
-
-    def get_available_gpus(self) -> List[str]:
-        """Get list of available GPU types."""
-        return list(self.gpus.keys())
-
-    def get_available_providers(self) -> List[str]:
-        """Get list of available cloud providers."""
-        return list(self.cloud_providers.keys())
-
-    def estimate_cost(self, gpu_name: str, provider_name: str, hours: float) -> float:
-        """Estimate cost for specific GPU and provider."""
-        if gpu_name in self.gpus and provider_name in self.cloud_providers:
-            gpu_spec = self.gpus[gpu_name]
-            provider_spec = self.cloud_providers[provider_name]
-
-            compute_cost = gpu_spec.cost_per_hour * hours
-            storage_cost = provider_spec.storage_cost_per_gb_hour * 100 * hours  # Assume 100GB
-
-            return compute_cost + storage_cost
-
-        return 0.0
-
-    def get_statistics(self) -> Dict[str, Any]:
-        """Get resource optimizer statistics."""
-        return {
-            "total_gpus": len(self.gpus),
-            "total_providers": len(self.cloud_providers),
-            "avg_gpu_memory": sum(gpu.memory_gb for gpu in self.gpus.values()) / len(self.gpus) if self.gpus else 0,
-            "avg_cost_per_hour": sum(gpu.cost_per_hour for gpu in self.gpus.values()) / len(self.gpus) if self.gpus else 0,
-            "recommended_gpus": len([gpu for gpu in self.gpus.values() if gpu.is_recommended])
-        }
```
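For readers tracking this removal, the snippet below is a minimal, hypothetical sketch of how the deleted optimizer was driven under isa-model 0.4.0, reconstructed from the class docstring above. The `SimpleNamespace` config is a stand-in for the project's real training config object (presumably defined in the removed isa_model/training/core/config.py, which is not shown here); the optimizer only reads attributes such as `batch_size`, `training_type`, and an optional `lora_config`.

```python
# Minimal usage sketch; assumes isa-model 0.4.0, where this module still existed.
from types import SimpleNamespace

from isa_model.training.intelligent.resource_optimizer import ResourceOptimizer

# Hypothetical stand-in for the removed training config; only the attributes
# read via getattr()/hasattr() in _analyze_model_requirements are provided.
config = SimpleNamespace(batch_size=4, training_type="sft", lora_config=None)

optimizer = ResourceOptimizer()  # loads or seeds GPU/provider data under ./resource_data
recommendation = optimizer.optimize_resources(
    model_name="google/gemma-2-7b-it",
    training_config=config,
    budget_limit=100.0,  # USD
    time_limit=8,        # hours
)

print(f"Recommended: {recommendation.gpu} on {recommendation.cloud_provider}")
print(f"Cost: ${recommendation.estimated_cost:.2f} over ~{recommendation.estimated_time:.1f}h")
for alt in recommendation.alternatives:
    print(f"Alternative: {alt['gpu']} on {alt['provider']} (${alt['cost']:.2f})")
```

No replacement API is introduced for this module in 0.4.3; the training package is removed wholesale in this release, as the file list above shows.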