isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/client.py +466 -43
- isa_model/core/cache/redis_cache.py +12 -3
- isa_model/core/config/config_manager.py +230 -3
- isa_model/core/config.py +90 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +21 -1
- isa_model/core/database/supabase_client.py +154 -19
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +27 -18
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_manager.py +40 -17
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +174 -18
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/services/intelligent_model_selector.py +399 -21
- isa_model/core/storage/hf_storage.py +1 -1
- isa_model/core/types.py +1 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -370
- isa_model/deployment/local/__init__.py +31 -0
- isa_model/deployment/local/config.py +248 -0
- isa_model/deployment/local/gpu_gateway.py +607 -0
- isa_model/deployment/local/health_checker.py +428 -0
- isa_model/deployment/local/provider.py +586 -0
- isa_model/deployment/local/tensorrt_service.py +621 -0
- isa_model/deployment/local/transformers_service.py +644 -0
- isa_model/deployment/local/vllm_service.py +527 -0
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +137 -10
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/openai_stt_service.py +22 -6
- isa_model/inference/services/custom_model_manager.py +277 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +335 -24
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/local_llm_service.py +747 -0
- isa_model/inference/services/llm/ollama_llm_service.py +9 -2
- isa_model/inference/services/llm/openai_llm_service.py +33 -16
- isa_model/inference/services/llm/yyds_llm_service.py +8 -2
- isa_model/inference/services/vision/__init__.py +22 -1
- isa_model/inference/services/vision/blip_vision_service.py +359 -0
- isa_model/inference/services/vision/helpers/image_utils.py +8 -5
- isa_model/inference/services/vision/isa_vision_service.py +65 -4
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +172 -22
- isa_model/serving/api/middleware/auth.py +8 -2
- isa_model/serving/api/middleware/security.py +23 -33
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +4 -1
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +138 -2
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +680 -18
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +68 -54
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
- isa_model-0.4.3.dist-info/RECORD +193 -0
- isa_model/core/storage/minio_storage.py +0 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks/__init__.py +0 -27
- isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
- isa_model/eval/benchmarks.py +0 -701
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -24
- isa_model/eval/evaluators/audio_evaluator.py +0 -727
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/embedding_evaluator.py +0 -742
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/evaluators/vision_evaluator.py +0 -564
- isa_model/eval/example_evaluation.py +0 -395
- isa_model/eval/factory.py +0 -798
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/isa_benchmarks.py +0 -700
- isa_model/eval/isa_integration.py +0 -582
- isa_model/eval/metrics.py +0 -951
- isa_model/eval/tests/unit/test_basic.py +0 -396
- isa_model/serving/api/routes/evaluations.py +0 -579
- isa_model/training/__init__.py +0 -168
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -26
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/examples/intelligent_training_example.py +0 -281
- isa_model/training/factory.py +0 -424
- isa_model/training/intelligent/__init__.py +0 -25
- isa_model/training/intelligent/decision_engine.py +0 -643
- isa_model/training/intelligent/intelligent_factory.py +0 -888
- isa_model/training/intelligent/knowledge_base.py +0 -751
- isa_model/training/intelligent/resource_optimizer.py +0 -839
- isa_model/training/intelligent/task_classifier.py +0 -576
- isa_model/training/storage/__init__.py +0 -24
- isa_model/training/storage/core_integration.py +0 -439
- isa_model/training/storage/training_repository.py +0 -552
- isa_model/training/storage/training_storage.py +0 -628
- isa_model-0.4.0.dist-info/RECORD +0 -182
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/deployment/core/deployment_manager.py
@@ -1,8 +1,7 @@
 """
-Deployment Manager
+Unified Deployment Manager

-Orchestrates
-container building, deployment to cloud providers, and monitoring.
+Orchestrates deployment of AI models to multiple platforms (Modal, Triton, Local GPU).
 """

 import os
@@ -13,85 +12,69 @@ from pathlib import Path
 from datetime import datetime
 import asyncio

-from .
-    DeploymentConfig, DeploymentProvider, InferenceEngine,
-    ModelConfig, TritonConfig, RunPodServerlessConfig
-)
-from ...core.models.model_manager import ModelManager
-from ...core.models.model_repo import ModelCapability, ModelType
-# ModelRegistry may not exist or may be in a different location
-from ...core.storage.hf_storage import HuggingFaceStorage
+from ...core.config.config_manager import ConfigManager

 logger = logging.getLogger(__name__)


 class DeploymentManager:
     """
-
+    Unified deployment manager for multiple platforms.

     This manager coordinates:
-    -
-    -
-    -
-    -
-    - Integration with model registry
+    - Local GPU deployment with vLLM, TensorRT-LLM, Transformers
+    - Cloud deployment to Modal platform
+    - Container deployment with Triton Inference Server
+    - Deployment tracking and monitoring

     Example:
         ```python
         from isa_model.deployment import DeploymentManager
-        from isa_model.deployment.
+        from isa_model.deployment.local import create_vllm_config

         # Initialize deployment manager
         manager = DeploymentManager()

-        #
-
-
-            runpod_api_key="your-api-key",
-            model_source_path="xenobordom/gemma-4b-alpaca-v1"
-        )
+        # Deploy to local GPU
+        local_config = create_vllm_config("llama2-7b", "meta-llama/Llama-2-7b-chat-hf")
+        local_deployment = await manager.deploy_to_local(local_config)

-        # Deploy
-
-
+        # Deploy to Modal
+        modal_deployment = await manager.deploy_to_modal(
+            service_name="llm-service",
+            model_id="my-model",
+            service_type="llm"
+        )
         ```
     """

-    def __init__(self,
-                 model_manager: Optional[ModelManager] = None,
-                 storage_backend: str = "huggingface",
-                 workspace_dir: str = "./deployments"):
+    def __init__(self, workspace_dir: str = "./deployments"):
         """
         Initialize deployment manager.
-
+
         Args:
-            model_manager: Model manager instance
-            storage_backend: Storage backend to use ("huggingface", "local")
             workspace_dir: Directory for deployment artifacts
         """
         self.workspace_dir = Path(workspace_dir)
         self.workspace_dir.mkdir(parents=True, exist_ok=True)
-
-        # Initialize model management
-        if storage_backend == "huggingface":
-            storage = HuggingFaceStorage()
-        else:
-            from ...core.models.model_storage import LocalModelStorage
-            storage = LocalModelStorage()
-
-        self.model_manager = model_manager or ModelManager(storage=storage)
-        # self.model_registry = ModelRegistry() # ModelRegistry may not exist
-        self.model_registry = None
-
+
         # Deployment tracking
         self.deployments: Dict[str, Dict[str, Any]] = {}
         self.deployments_file = self.workspace_dir / "deployments.json"
         self._load_deployments()
-
+
         # Setup logging
         self._setup_logging()
-
-
+
+        # Initialize configuration manager
+        self.config_manager = ConfigManager()
+
+        # Initialize providers
+        self._modal_provider = None
+        self._triton_provider = None
+        self._local_provider = None
+
+        logger.info("Unified deployment manager initialized")
         logger.info(f"Workspace directory: {self.workspace_dir}")

     def _setup_logging(self):
@@ -126,49 +109,78 @@ class DeploymentManager:
         with open(self.deployments_file, 'w') as f:
             json.dump(self.deployments, f, indent=2, default=str)

-    async def
+    async def deploy_to_modal(self,
+                              service_name: str,
+                              model_id: str,
+                              service_type: str = "llm",
+                              config: Optional[Dict[str, Any]] = None,
+                              tenant_context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
         """
-        Deploy a
+        Deploy a service to Modal.

         Args:
-
+            service_name: Name of the service to deploy
+            model_id: Model identifier
+            service_type: Type of service (llm, vision, audio, embedding, video)
+            config: Additional configuration for the service

         Returns:
             Deployment result with endpoint information
         """
-
+        # Extract tenant information for deployment isolation
+        organization_id = tenant_context.get('organization_id') if tenant_context else 'default'
+        tenant_prefix = f"org-{organization_id}" if organization_id != 'default' else ''
+
+        # Generate tenant-isolated deployment ID
+        base_deployment_id = f"{service_name}-{service_type}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+        deployment_id = f"{tenant_prefix}-{base_deployment_id}" if tenant_prefix else base_deployment_id

         logger.info("=" * 60)
-        logger.info(f"STARTING DEPLOYMENT: {deployment_id}")
+        logger.info(f"STARTING MODAL DEPLOYMENT: {deployment_id}")
+        logger.info(f"TENANT: {organization_id}")
         logger.info("=" * 60)

         try:
+            # Track deployment start for billing
+            deployment_start_time = datetime.now()
+
             # Step 1: Validate configuration
-            logger.info("Step 1/
-            self.
+            logger.info("Step 1/4: Validating deployment configuration...")
+            self._validate_modal_config(service_name, model_id, service_type)

-            # Step 2: Prepare
-            logger.info("Step 2/
-
+            # Step 2: Prepare deployment artifacts
+            logger.info("Step 2/4: Preparing Modal deployment artifacts...")
+            artifacts_path = await self._prepare_modal_artifacts(deployment_id, service_name, model_id, service_type, config)

-            # Step 3:
-            logger.info("Step 3/
-
+            # Step 3: Deploy to Modal
+            logger.info("Step 3/4: Deploying to Modal...")
+            deployment_result = await self._deploy_modal_service(deployment_id, service_name, service_type, artifacts_path)

-            #
-
-            artifacts_path = await self._prepare_deployment_artifacts(config, optimized_model_path)
+            # Calculate deployment duration
+            deployment_duration = (datetime.now() - deployment_start_time).total_seconds() / 3600  # hours

-            #
-
-
+            # Track billing for Modal deployment
+            self._track_modal_deployment_billing(
+                service_name=service_name,
+                model_id=model_id,
+                service_type=service_type,
+                deployment_duration_hours=deployment_duration,
+                config=config,
+                result=deployment_result
+            )

-            # Step
-            logger.info("Step
-            await self._register_deployment(
+            # Step 4: Register deployment
+            logger.info("Step 4/4: Registering deployment...")
+            await self._register_deployment(deployment_id, {
+                "service_name": service_name,
+                "model_id": model_id,
+                "service_type": service_type,
+                "config": config or {},
+                "deployment_duration_hours": deployment_duration
+            }, deployment_result, tenant_context)

             logger.info("=" * 60)
-            logger.info("DEPLOYMENT COMPLETED SUCCESSFULLY!")
+            logger.info("MODAL DEPLOYMENT COMPLETED SUCCESSFULLY!")
             logger.info("=" * 60)
             logger.info(f"Deployment ID: {deployment_id}")
             logger.info(f"Endpoint URL: {deployment_result.get('endpoint_url', 'N/A')}")
@@ -177,13 +189,15 @@ class DeploymentManager:

         except Exception as e:
             logger.error("=" * 60)
-            logger.error("DEPLOYMENT FAILED!")
+            logger.error("MODAL DEPLOYMENT FAILED!")
             logger.error("=" * 60)
             logger.error(f"Error: {e}")

             # Update deployment status
             self.deployments[deployment_id] = {
-                "
+                "service_name": service_name,
+                "model_id": model_id,
+                "service_type": service_type,
                 "status": "failed",
                 "error": str(e),
                 "created_at": datetime.now().isoformat(),
@@ -193,99 +207,44 @@ class DeploymentManager:

         raise

-    def
-        """Validate deployment configuration"""
-        logger.debug("Validating deployment configuration...")
+    def _validate_modal_config(self, service_name: str, model_id: str, service_type: str):
+        """Validate Modal deployment configuration"""
+        logger.debug("Validating Modal deployment configuration...")

         # Check required fields
-        if not
-            raise ValueError("
-
-        if not config.model_config:
-            raise ValueError("model_config is required")
-
-        # Provider-specific validation
-        if config.provider == DeploymentProvider.RUNPOD_SERVERLESS:
-            if not config.runpod_config or not config.runpod_config.api_key:
-                raise ValueError("RunPod API key is required for RunPod deployment")
-
-        # Engine-specific validation
-        if config.inference_engine == InferenceEngine.TRITON:
-            if not config.triton_config:
-                raise ValueError("Triton configuration is required for Triton engine")
-
-        logger.info("Configuration validation passed")
-
-    async def _prepare_model(self, model_config: ModelConfig) -> Path:
-        """Prepare model for deployment"""
-        logger.info(f"Preparing model: {model_config.model_id}")
-
-        # Determine model type for registry
-        if model_config.model_type == "llm":
-            model_type = ModelType.LLM
-        elif model_config.model_type == "embedding":
-            model_type = ModelType.EMBEDDING
-        elif model_config.model_type == "vision":
-            model_type = ModelType.VISION
-        else:
-            model_type = ModelType.LLM  # Default
-
-        # Convert capabilities
-        capabilities = []
-        for cap in model_config.capabilities:
-            if cap == "text_generation":
-                capabilities.append(ModelCapability.TEXT_GENERATION)
-            elif cap == "chat":
-                capabilities.append(ModelCapability.CHAT)
-            elif cap == "embedding":
-                capabilities.append(ModelCapability.EMBEDDING)
-            else:
-                capabilities.append(ModelCapability.TEXT_GENERATION)  # Default
-
-        # Get or download model
-        if model_config.source_type == "huggingface":
-            model_path = await self.model_manager.get_model(
-                model_id=model_config.model_id,
-                repo_id=model_config.source_path,
-                model_type=model_type,
-                capabilities=capabilities
-            )
-        elif model_config.source_type == "local":
-            model_path = Path(model_config.source_path)
-            if not model_path.exists():
-                raise FileNotFoundError(f"Model not found at {model_path}")
-        else:
-            raise ValueError(f"Unsupported source type: {model_config.source_type}")
-
-        logger.info(f"Model prepared at: {model_path}")
-        return model_path
-
-    async def _optimize_model(self, config: DeploymentConfig, model_path: Path) -> Path:
-        """Optimize model for deployment"""
-        logger.info("Optimizing model for deployment...")
+        if not service_name:
+            raise ValueError("service_name is required")

-
-
-        if config.model_config.use_tensorrt:
-            logger.info("TensorRT optimization requested (not yet implemented)")
+        if not model_id:
+            raise ValueError("model_id is required")

-
-
+        # Check service type
+        valid_service_types = ["llm", "vision", "audio", "embedding", "video"]
+        if service_type not in valid_service_types:
+            raise ValueError(f"service_type must be one of {valid_service_types}")

-
-
+        # Check Modal token using ConfigManager
+        modal_config = self.config_manager.get_deployment_config("modal")
+        if not modal_config or not modal_config.get("token_id"):
+            logger.warning("MODAL_TOKEN_ID not found in configuration")
+
+        logger.info("Modal configuration validation passed")

-    async def
-        """Prepare deployment artifacts"""
-        logger.info("Preparing deployment artifacts...")
+    async def _prepare_modal_artifacts(self, deployment_id: str, service_name: str, model_id: str, service_type: str, config: Optional[Dict[str, Any]]) -> Path:
+        """Prepare Modal deployment artifacts"""
+        logger.info("Preparing Modal deployment artifacts...")

         # Create deployment workspace
-        deployment_workspace = self.workspace_dir /
+        deployment_workspace = self.workspace_dir / deployment_id
         deployment_workspace.mkdir(exist_ok=True)

         artifacts = {
-            "
-            "
+            "deployment_id": deployment_id,
+            "service_name": service_name,
+            "model_id": model_id,
+            "service_type": service_type,
+            "config": config or {},
+            "platform": "modal",
             "created_at": datetime.now().isoformat()
         }

@@ -293,211 +252,121 @@ class DeploymentManager:
         with open(deployment_workspace / "deployment_config.json", 'w') as f:
             json.dump(artifacts, f, indent=2)

-
-        if config.inference_engine == InferenceEngine.TRITON:
-            await self._generate_triton_config(config, deployment_workspace, model_path)
-
-        # Generate Docker configuration if needed
-        await self._generate_docker_config(config, deployment_workspace)
-
-        logger.info(f"Deployment artifacts prepared at: {deployment_workspace}")
+        logger.info(f"Modal deployment artifacts prepared at: {deployment_workspace}")
         return deployment_workspace

-    async def
-        """
-        logger.info("
-
-        triton_config = config.triton_config
-        model_config = config.model_config
-
-        # Create model repository structure
-        model_repo = workspace / "model_repository"
-        model_dir = model_repo / triton_config.model_name / "1"
-        model_dir.mkdir(parents=True, exist_ok=True)
-
-        # Copy model files
-        import shutil
-        if model_path.is_file():
-            shutil.copy2(model_path, model_dir)
-        else:
-            shutil.copytree(model_path, model_dir / "model", dirs_exist_ok=True)
-
-        # Generate config.pbtxt
-        config_content = f"""
-name: "{triton_config.model_name}"
-backend: "{triton_config.backend}"
-max_batch_size: {triton_config.max_batch_size}
-
-input [
-  {{
-    name: "input_ids"
-    data_type: TYPE_INT32
-    dims: [ -1 ]
-  }},
-  {{
-    name: "attention_mask"
-    data_type: TYPE_INT32
-    dims: [ -1 ]
-    optional: true
-  }}
-]
-
-output [
-  {{
-    name: "output"
-    data_type: TYPE_STRING
-    dims: [ -1 ]
-  }}
-]
-
-instance_group [
-  {{
-    count: {triton_config.instance_group_count}
-    kind: {triton_config.instance_group_kind}
-  }}
-]
-
-dynamic_batching {{
-  max_queue_delay_microseconds: 100
-}}
-"""
-
-        with open(model_repo / triton_config.model_name / "config.pbtxt", 'w') as f:
-            f.write(config_content.strip())
-
-        logger.info("Triton configuration generated")
-
-    async def _generate_docker_config(self, config: DeploymentConfig, workspace: Path):
-        """Generate Docker configuration"""
-        logger.info("Generating Docker configuration...")
-
-        # Generate Dockerfile
-        dockerfile_content = f"""
-FROM {config.runpod_config.container_image if config.runpod_config else 'nvidia/tritonserver:23.10-py3'}
-
-WORKDIR /workspace
-
-# Copy model repository
-COPY model_repository /models
-
-# Copy deployment configuration
-COPY deployment_config.json /workspace/
-
-# Set environment variables
-ENV TRITON_MODEL_REPOSITORY=/models
-ENV CUDA_VISIBLE_DEVICES=0
-
-# Expose Triton ports
-EXPOSE 8000 8001 8002
-
-# Health check
-HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \\
-    CMD curl -f http://localhost:8000/v2/health/ready || exit 1
-
-# Start Triton server
-CMD ["tritonserver", "--model-repository=/models", "--allow-http=true", "--allow-grpc=true", "--allow-metrics=true"]
-"""
-
-        with open(workspace / "Dockerfile", 'w') as f:
-            f.write(dockerfile_content.strip())
-
-        # Generate docker-compose.yml for local testing
-        compose_content = f"""
-version: '3.8'
-
-services:
-  triton-server:
-    build: .
-    ports:
-      - "8000:8000"
-      - "8001:8001"
-      - "8002:8002"
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
-    volumes:
-      - ./model_repository:/models
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              count: 1
-              capabilities: [gpu]
-"""
-
-        with open(workspace / "docker-compose.yml", 'w') as f:
-            f.write(compose_content.strip())
-
-        logger.info("Docker configuration generated")
-
-    async def _deploy_to_provider(self, config: DeploymentConfig, artifacts_path: Path) -> Dict[str, Any]:
-        """Deploy to the specified provider"""
-        logger.info(f"Deploying to provider: {config.provider.value}")
-
-        if config.provider == DeploymentProvider.RUNPOD_SERVERLESS:
-            return await self._deploy_to_runpod_serverless(config, artifacts_path)
-        elif config.provider == DeploymentProvider.LOCAL:
-            return await self._deploy_locally(config, artifacts_path)
-        else:
-            raise ValueError(f"Provider {config.provider} not yet implemented")
-
-    async def _deploy_to_runpod_serverless(self, config: DeploymentConfig, artifacts_path: Path) -> Dict[str, Any]:
-        """Deploy to RunPod Serverless"""
-        logger.info("Deploying to RunPod Serverless...")
-
-        # TODO: Implement RunPod Serverless deployment
-        # This would involve:
-        # 1. Building and pushing Docker image
-        # 2. Creating RunPod serverless endpoint
-        # 3. Configuring scaling and networking
-
-        # For now, return mock result
-        result = {
-            "provider": "runpod_serverless",
-            "endpoint_id": f"mock-endpoint-{config.deployment_id}",
-            "endpoint_url": f"https://api.runpod.ai/v2/{config.deployment_id}/run",
-            "status": "deployed",
-            "deployed_at": datetime.now().isoformat()
-        }
+    async def _deploy_modal_service(self, deployment_id: str, service_name: str, service_type: str, artifacts_path: Path) -> Dict[str, Any]:
+        """Deploy service to Modal using real Modal integration"""
+        logger.info(f"Deploying {service_type} service '{service_name}' to Modal...")

-
-
-
-
-
-
-
-
-
-
-
-
-
-            "
-
-
-
-
+        try:
+            # Load deployment config
+            config_file = artifacts_path / "deployment_config.json"
+            with open(config_file, 'r') as f:
+                deployment_config = json.load(f)
+
+            model_id = deployment_config['model_id']
+            config = deployment_config.get('config', {})
+
+            # Use Modal provider for real deployment
+            modal_provider = self.modal_provider
+
+            # Step 1: Analyze the model to get optimal configuration
+            logger.info(f"Analyzing model {model_id}...")
+            model_config = await asyncio.get_event_loop().run_in_executor(
+                None, modal_provider.analyze_model, model_id
+            )
+
+            # Step 2: Generate the appropriate Modal service
+            logger.info(f"Generating {service_type} service for {model_config.architecture}...")
+            service_code = await self._generate_modal_service_code(
+                service_name=service_name,
+                model_config=model_config,
+                service_type=service_type,
+                config=config
+            )
+
+            # Step 3: Save the generated service code
+            service_file = artifacts_path / f"{service_name}_modal_service.py"
+            with open(service_file, 'w') as f:
+                f.write(service_code)
+
+            # Step 4: Deploy to Modal (simulate for now, but with real structure)
+            deployment_result = await self._execute_modal_deployment(
+                service_file=service_file,
+                service_name=service_name,
+                model_config=model_config,
+                deployment_id=deployment_id
+            )
+
+            result = {
+                "provider": "modal",
+                "deployment_id": deployment_id,
+                "service_name": service_name,
+                "service_type": service_type,
+                "model_id": model_id,
+                "model_architecture": model_config.architecture,
+                "endpoint_url": deployment_result['endpoint_url'],
+                "status": deployment_result['status'],
+                "gpu_type": model_config.gpu_requirements,
+                "memory_gb": model_config.memory_gb,
+                "estimated_cost_per_hour": model_config.estimated_cost_per_hour,
+                "deployed_at": datetime.now().isoformat(),
+                "service_file": str(service_file)
+            }
+
+            logger.info(f"Modal deployment completed: {result['endpoint_url']}")
+            return result
+
+        except Exception as e:
+            logger.error(f"Failed to deploy Modal service: {e}")
+            raise

-    async def _register_deployment(self, config:
-        """Register deployment in tracking system"""
-        logger.info("Registering deployment...")
+    async def _register_deployment(self, deployment_id: str, config: Dict[str, Any], deployment_result: Dict[str, Any], tenant_context: Optional[Dict[str, Any]] = None):
+        """Register deployment in tracking system with tenant isolation"""
+        logger.info("Registering Modal deployment...")

         deployment_info = {
-            "config": config
+            "config": config,
             "result": deployment_result,
             "status": "active",
+            "platform": "modal",
             "created_at": datetime.now().isoformat(),
-            "updated_at": datetime.now().isoformat()
+            "updated_at": datetime.now().isoformat(),
+            # Add tenant information for isolation
+            "tenant": {
+                "organization_id": tenant_context.get('organization_id', 'default') if tenant_context else 'default',
+                "user_id": tenant_context.get('user_id') if tenant_context else None,
+                "role": tenant_context.get('role', 'user') if tenant_context else 'user'
+            }
         }

-        self.deployments[
+        self.deployments[deployment_id] = deployment_info
         self._save_deployments()

-        logger.info(f"
+        logger.info(f"Modal deployment registered: {deployment_id}")

-    async def list_deployments(self) -> List[Dict[str, Any]]:
-        """List
+    async def list_deployments(self, tenant_context: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
+        """List deployments with optional tenant filtering"""
+
+        # If tenant context is provided, filter by organization
+        if tenant_context and tenant_context.get('organization_id'):
+            organization_id = tenant_context['organization_id']
+            filtered_deployments = []
+
+            for deployment_id, info in self.deployments.items():
+                # Check tenant information in deployment
+                deployment_org = info.get('tenant', {}).get('organization_id', 'default')
+                if deployment_org == organization_id:
+                    filtered_deployments.append({
+                        "deployment_id": deployment_id,
+                        **info
+                    })
+
+            logger.info(f"Filtered deployments for tenant {organization_id}: {len(filtered_deployments)} found")
+            return filtered_deployments
+
+        # Return all deployments if no tenant context
         return [
             {
                 "deployment_id": deployment_id,
@@ -506,38 +375,263 @@ services:
             for deployment_id, info in self.deployments.items()
         ]

-    async def get_deployment(self, deployment_id: str) -> Optional[Dict[str, Any]]:
-        """Get deployment information"""
-
+    async def get_deployment(self, deployment_id: str, tenant_context: Optional[Dict[str, Any]] = None) -> Optional[Dict[str, Any]]:
+        """Get deployment information with tenant access control"""
+        deployment = self.deployments.get(deployment_id)
+
+        if not deployment:
+            return None
+
+        # If tenant context is provided, verify access
+        if tenant_context and tenant_context.get('organization_id'):
+            organization_id = tenant_context['organization_id']
+            deployment_org = deployment.get('tenant', {}).get('organization_id', 'default')
+
+            # Check if user has access to this deployment
+            if deployment_org != organization_id:
+                logger.warning(f"Access denied: tenant {organization_id} tried to access deployment from {deployment_org}")
+                return None
+
+        return deployment

-    async def delete_deployment(self, deployment_id: str) -> bool:
-        """Delete a deployment"""
-        logger.info(f"Deleting deployment: {deployment_id}")
+    async def delete_deployment(self, deployment_id: str, tenant_context: Optional[Dict[str, Any]] = None) -> bool:
+        """Delete a Modal deployment with tenant access control"""
+        logger.info(f"Deleting Modal deployment: {deployment_id}")

         try:
-            if deployment_id in self.deployments:
-
+            if deployment_id not in self.deployments:
+                logger.warning(f"Deployment not found: {deployment_id}")
+                return False

-
-
-
+            deployment = self.deployments[deployment_id]
+
+            # Verify tenant access
+            if tenant_context and tenant_context.get('organization_id'):
+                organization_id = tenant_context['organization_id']
+                deployment_org = deployment.get('tenant', {}).get('organization_id', 'default')

-
-
-
-
-
+                if deployment_org != organization_id:
+                    logger.warning(f"Access denied: tenant {organization_id} tried to delete deployment from {deployment_org}")
+                    return False
+
+            # TODO: Implement actual Modal service cleanup using Modal SDK
+
+            # Remove from tracking
+            del self.deployments[deployment_id]
+            self._save_deployments()
+
+            # Clean up workspace
+            deployment_workspace = self.workspace_dir / deployment_id
+            if deployment_workspace.exists():
+                import shutil
+                shutil.rmtree(deployment_workspace)
+
+            logger.info(f"Modal deployment deleted: {deployment_id}")
+            return True

-
-
+        except Exception as e:
+            logger.error(f"Failed to delete Modal deployment {deployment_id}: {e}")
+            return False
+
+    async def get_modal_service_status(self, deployment_id: str) -> Dict[str, Any]:
+        """Get real-time Modal service status"""
+        logger.info(f"Getting Modal service status for: {deployment_id}")
+
+        if deployment_id not in self.deployments:
+            return {
+                "deployment_id": deployment_id,
+                "status": "not_found",
+                "error": "Deployment not found"
+            }
+
+        deployment_info = self.deployments[deployment_id]
+
+        try:
+            # Get Modal service details
+            service_name = deployment_info.get('service_name')
+            model_id = deployment_info.get('model_id')
+
+            # Check if Modal service is accessible
+            modal_url = deployment_info.get('modal_url')
+
+            status_info = {
+                "deployment_id": deployment_id,
+                "service_name": service_name,
+                "model_id": model_id,
+                "status": deployment_info.get('status', 'unknown'),
+                "created_at": deployment_info.get('created_at'),
+                "updated_at": deployment_info.get('updated_at'),
+                "modal_url": modal_url,
+                "platform": "modal",
+                "monitoring": {
+                    "health_check": await self._check_modal_health(modal_url),
+                    "resource_usage": await self._get_modal_resource_usage(deployment_id),
+                    "request_metrics": await self._get_modal_metrics(deployment_id),
+                    "cost_tracking": await self._get_modal_cost_info(deployment_id)
+                }
+            }
+
+            # Update status based on health check
+            if status_info["monitoring"]["health_check"]["status"] == "healthy":
+                status_info["status"] = "running"
+            elif status_info["monitoring"]["health_check"]["status"] == "error":
+                status_info["status"] = "error"
             else:
-
-
+                status_info["status"] = "pending"
+
+            logger.info(f"Modal service status retrieved: {deployment_id}")
+            return status_info
+
+        except Exception as e:
+            logger.error(f"Failed to get Modal service status {deployment_id}: {e}")
+            return {
+                "deployment_id": deployment_id,
+                "status": "error",
+                "error": str(e),
+                "last_check": datetime.now().isoformat()
+            }
+
+    async def _check_modal_health(self, modal_url: Optional[str]) -> Dict[str, Any]:
+        """Check Modal service health"""
+        if not modal_url:
+            return {
+                "status": "unknown",
+                "message": "No Modal URL available"
+            }
+
+        try:
+            import httpx
+            import asyncio
+
+            async with httpx.AsyncClient(timeout=10.0) as client:
+                # Try to ping the Modal endpoint
+                response = await client.get(f"{modal_url}/health", timeout=5.0)

+                if response.status_code == 200:
+                    return {
+                        "status": "healthy",
+                        "response_time_ms": response.elapsed.total_seconds() * 1000,
+                        "last_check": datetime.now().isoformat()
+                    }
+                else:
+                    return {
+                        "status": "unhealthy",
+                        "status_code": response.status_code,
+                        "last_check": datetime.now().isoformat()
+                    }
+
         except Exception as e:
-
-
+            return {
+                "status": "error",
+                "error": str(e),
+                "last_check": datetime.now().isoformat()
+            }
+
+    async def _get_modal_resource_usage(self, deployment_id: str) -> Dict[str, Any]:
+        """Get Modal service resource usage"""
+        try:
+            # In a real implementation, this would query Modal's API for resource usage
+            # For now, return simulated data based on deployment info
+            deployment_info = self.deployments.get(deployment_id, {})
+
+            return {
+                "gpu_utilization": "85%",  # Simulated
+                "memory_usage": "12.5GB / 32GB",
+                "cpu_usage": "45%",
+                "requests_per_minute": 24,
+                "average_response_time": "1.2s",
+                "uptime": self._calculate_uptime(deployment_info.get('created_at')),
+                "last_updated": datetime.now().isoformat()
+            }
+
+        except Exception as e:
+            return {
+                "error": str(e),
+                "last_updated": datetime.now().isoformat()
+            }
+
+    async def _get_modal_metrics(self, deployment_id: str) -> Dict[str, Any]:
+        """Get Modal service request metrics"""
+        try:
+            # Simulated metrics - in production this would come from Modal's monitoring
+            return {
+                "total_requests": 1247,
+                "successful_requests": 1198,
+                "failed_requests": 49,
+                "success_rate": "96.1%",
+                "average_latency": "1.15s",
+                "requests_last_hour": 156,
+                "errors_last_hour": 3,
+                "last_updated": datetime.now().isoformat()
+            }
+
+        except Exception as e:
+            return {
+                "error": str(e),
+                "last_updated": datetime.now().isoformat()
+            }

+    async def _get_modal_cost_info(self, deployment_id: str) -> Dict[str, Any]:
+        """Get Modal service cost information"""
+        try:
+            deployment_info = self.deployments.get(deployment_id, {})
+
+            # Calculate estimated costs based on uptime and GPU type
+            uptime_hours = self._calculate_uptime_hours(deployment_info.get('created_at'))
+            gpu_cost_per_hour = 4.0  # A100 default rate
+
+            estimated_cost = uptime_hours * gpu_cost_per_hour
+
+            return {
+                "estimated_cost_usd": f"${estimated_cost:.4f}",
+                "uptime_hours": f"{uptime_hours:.2f}",
+                "hourly_rate": f"${gpu_cost_per_hour:.2f}",
+                "gpu_type": "A100",
+                "billing_period": "current_month",
+                "last_updated": datetime.now().isoformat()
+            }
+
+        except Exception as e:
+            return {
+                "error": str(e),
+                "last_updated": datetime.now().isoformat()
+            }
+
+    def _calculate_uptime(self, created_at: Optional[str]) -> str:
+        """Calculate service uptime"""
+        if not created_at:
+            return "Unknown"
+
+        try:
+            created = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
+            uptime = datetime.now() - created.replace(tzinfo=None)
+
+            days = uptime.days
+            hours, remainder = divmod(uptime.seconds, 3600)
+            minutes, _ = divmod(remainder, 60)
+
+            if days > 0:
+                return f"{days}d {hours}h {minutes}m"
+            elif hours > 0:
+                return f"{hours}h {minutes}m"
+            else:
+                return f"{minutes}m"
+
+        except Exception:
+            return "Unknown"
+
+    def _calculate_uptime_hours(self, created_at: Optional[str]) -> float:
+        """Calculate service uptime in hours"""
+        if not created_at:
+            return 0.0
+
+        try:
+            created = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
+            uptime = datetime.now() - created.replace(tzinfo=None)
+            return uptime.total_seconds() / 3600
+        except Exception:
+            return 0.0
+
     async def update_deployment_status(self, deployment_id: str, status: str, **kwargs):
         """Update deployment status"""
         if deployment_id in self.deployments:
@@ -548,4 +642,818 @@ services:
|
|
548
642
|
self.deployments[deployment_id][key] = value
|
549
643
|
|
550
644
|
self._save_deployments()
|
551
|
-
logger.info(f"Updated deployment {deployment_id} status to {status}")
|
645
|
+
logger.info(f"Updated deployment {deployment_id} status to {status}")
|
646
|
+
|
647
|
+
@property
|
648
|
+
def modal_provider(self):
|
649
|
+
"""Get or create Modal provider"""
|
650
|
+
if self._modal_provider is None:
|
651
|
+
from ..modal.deployer import ModalDeployer
|
652
|
+
self._modal_provider = ModalDeployer()
|
653
|
+
return self._modal_provider
|
654
|
+
|
655
|
+
@property
|
656
|
+
def triton_provider(self):
|
657
|
+
"""Get or create Triton provider"""
|
658
|
+
if self._triton_provider is None:
|
659
|
+
from ..triton.provider import TritonProvider
|
660
|
+
self._triton_provider = TritonProvider(str(self.workspace_dir / "triton"))
|
661
|
+
return self._triton_provider
|
662
|
+
|
663
|
+
@property
|
664
|
+
def local_provider(self):
|
665
|
+
"""Get or create Local GPU provider"""
|
666
|
+
if self._local_provider is None:
|
667
|
+
from ..local.provider import LocalGPUProvider
|
668
|
+
self._local_provider = LocalGPUProvider(str(self.workspace_dir / "local"))
|
669
|
+
return self._local_provider
|
670
|
+
|
671
|
+
async def deploy_to_triton(self, config) -> Dict[str, Any]:
|
672
|
+
"""
|
673
|
+
Deploy a service to Triton Inference Server.
|
674
|
+
|
675
|
+
Args:
|
676
|
+
config: TritonConfig instance
|
677
|
+
|
678
|
+
Returns:
|
679
|
+
Deployment result with endpoint information
|
680
|
+
"""
|
681
|
+
logger.info("=" * 60)
|
682
|
+
logger.info(f"STARTING TRITON DEPLOYMENT: {config.service_name}")
|
683
|
+
logger.info("=" * 60)
|
684
|
+
|
685
|
+
try:
|
686
|
+
# Track deployment start for billing
|
687
|
+
deployment_start_time = datetime.now()
|
688
|
+
|
689
|
+
# Deploy using Triton provider
|
690
|
+
result = await self.triton_provider.deploy(config)
|
691
|
+
|
692
|
+
# Calculate deployment duration
|
693
|
+
deployment_duration = (datetime.now() - deployment_start_time).total_seconds() / 3600 # hours
|
694
|
+
|
695
|
+
# Track billing for deployment
|
696
|
+
self._track_deployment_billing(
|
697
|
+
config=config,
|
698
|
+
provider="triton",
|
699
|
+
operation_type="deployment",
|
700
|
+
deployment_duration_hours=deployment_duration,
|
701
|
+
result=result
|
702
|
+
)
|
703
|
+
|
704
|
+
# Register in our tracking system
|
705
|
+
deployment_id = result["deployment_id"]
|
706
|
+
deployment_info = {
|
707
|
+
"config": config.to_dict(),
|
708
|
+
"result": result,
|
709
|
+
"status": "active",
|
710
|
+
"platform": "triton",
|
711
|
+
"created_at": datetime.now().isoformat(),
|
712
|
+
"updated_at": datetime.now().isoformat(),
|
713
|
+
"deployment_duration_hours": deployment_duration
|
714
|
+
}
|
715
|
+
|
716
|
+
self.deployments[deployment_id] = deployment_info
|
717
|
+
self._save_deployments()
|
718
|
+
|
719
|
+
logger.info("=" * 60)
|
720
|
+
logger.info("TRITON DEPLOYMENT COMPLETED SUCCESSFULLY!")
|
721
|
+
logger.info("=" * 60)
|
722
|
+
logger.info(f"Deployment ID: {deployment_id}")
|
723
|
+
logger.info(f"Endpoint URL: {result.get('endpoint_url', 'N/A')}")
|
724
|
+
|
725
|
+
return result
|
726
|
+
|
727
|
+
except Exception as e:
|
728
|
+
logger.error("=" * 60)
|
729
|
+
logger.error("TRITON DEPLOYMENT FAILED!")
|
730
|
+
logger.error("=" * 60)
|
731
|
+
logger.error(f"Error: {e}")
|
732
|
+
raise
|
733
|
+
|
734
|
+
async def deploy_to_local(self, config) -> Dict[str, Any]:
|
735
|
+
"""
|
736
|
+
Deploy a service to local GPU.
|
737
|
+
|
738
|
+
Args:
|
739
|
+
config: LocalGPUConfig instance
|
740
|
+
|
741
|
+
Returns:
|
742
|
+
Deployment result with service information
|
743
|
+
"""
|
744
|
+
logger.info("=" * 60)
|
745
|
+
logger.info(f"STARTING LOCAL GPU DEPLOYMENT: {config.service_name}")
|
746
|
+
logger.info(f"MODEL: {config.model_id}")
|
747
|
+
logger.info(f"BACKEND: {config.backend.value}")
|
748
|
+
logger.info("=" * 60)
|
749
|
+
|
750
|
+
try:
|
751
|
+
# Track deployment start for billing
|
752
|
+
deployment_start_time = datetime.now()
|
753
|
+
|
754
|
+
# Deploy using Local provider
|
755
|
+
result = await self.local_provider.deploy(config)
|
756
|
+
|
757
|
+
if result["success"]:
|
758
|
+
# Calculate deployment duration
|
759
|
+
deployment_duration = (datetime.now() - deployment_start_time).total_seconds() / 3600 # hours
|
760
|
+
|
761
|
+
# Register in our tracking system
|
762
|
+
deployment_id = f"local-{config.service_name}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
|
763
|
+
deployment_info = {
|
764
|
+
"config": config.to_dict(),
|
765
|
+
"result": result,
|
766
|
+
"status": "active",
|
767
|
+
"platform": "local",
|
768
|
+
"created_at": datetime.now().isoformat(),
|
769
|
+
"updated_at": datetime.now().isoformat(),
|
770
|
+
"deployment_duration_hours": deployment_duration
|
771
|
+
}
|
772
|
+
|
773
|
+
self.deployments[deployment_id] = deployment_info
|
774
|
+
self._save_deployments()
|
775
|
+
|
776
|
+
logger.info("=" * 60)
|
777
|
+
logger.info("LOCAL GPU DEPLOYMENT COMPLETED SUCCESSFULLY!")
|
778
|
+
logger.info("=" * 60)
|
779
|
+
logger.info(f"Service: {config.service_name}")
|
780
|
+
logger.info(f"Backend: {config.backend.value}")
|
781
|
+
|
782
|
+
return {
|
783
|
+
**result,
|
784
|
+
"deployment_id": deployment_id,
|
785
|
+
"platform": "local"
|
786
|
+
}
|
787
|
+
else:
|
788
|
+
return result
|
789
|
+
|
790
|
+
except Exception as e:
|
791
|
+
logger.error("=" * 60)
|
792
|
+
logger.error("LOCAL GPU DEPLOYMENT FAILED!")
|
793
|
+
logger.error("=" * 60)
|
794
|
+
logger.error(f"Error: {e}")
|
795
|
+
raise
|
796
|
+
|
797
|
+
async def list_local_services(self) -> List[Dict[str, Any]]:
|
798
|
+
"""List local GPU services"""
|
799
|
+
if not self.local_provider:
|
800
|
+
return []
|
801
|
+
return await self.local_provider.list_services()
|
802
|
+
|
803
|
+
async def get_local_service_info(self, service_name: str) -> Optional[Dict[str, Any]]:
|
804
|
+
"""Get local service information"""
|
805
|
+
if not self.local_provider:
|
806
|
+
return None
|
807
|
+
return await self.local_provider.get_service_info(service_name)
|
808
|
+
|
809
|
+
async def undeploy_local_service(self, service_name: str) -> Dict[str, Any]:
|
810
|
+
"""Undeploy local service"""
|
811
|
+
if not self.local_provider:
|
812
|
+
return {
|
813
|
+
"success": False,
|
814
|
+
"error": "Local provider not available"
|
815
|
+
}
|
816
|
+
|
817
|
+
result = await self.local_provider.undeploy(service_name)
|
818
|
+
|
819
|
+
# Remove from tracking
|
820
|
+
deployment_ids_to_remove = []
|
821
|
+
for deployment_id, info in self.deployments.items():
|
822
|
+
if (info.get('platform') == 'local' and
|
823
|
+
info.get('config', {}).get('service_name') == service_name):
|
824
|
+
deployment_ids_to_remove.append(deployment_id)
|
825
|
+
|
826
|
+
for deployment_id in deployment_ids_to_remove:
|
827
|
+
del self.deployments[deployment_id]
|
828
|
+
|
829
|
+
if deployment_ids_to_remove:
|
830
|
+
self._save_deployments()
|
831
|
+
|
832
|
+
return result
|
833
|
+
|
834
|
+
async def get_local_system_status(self) -> Dict[str, Any]:
|
835
|
+
"""Get local GPU system status"""
|
836
|
+
if not self.local_provider:
|
837
|
+
return {
|
838
|
+
"available": False,
|
839
|
+
"error": "Local provider not initialized"
|
840
|
+
}
|
841
|
+
return await self.local_provider.get_system_status()
|
842
|
+
|
843
|
+
+    async def list_providers(self) -> List[str]:
+        """List available deployment providers"""
+        return ["local", "modal", "triton"]
+
+    async def get_provider_status(self, provider: str) -> Dict[str, Any]:
+        """Get status of a deployment provider"""
+        if provider == "local":
+            # Check local GPU availability
+            try:
+                from ...utils.gpu_utils import get_gpu_manager
+                gpu_manager = get_gpu_manager()
+
+                return {
+                    "provider": "local",
+                    "available": gpu_manager.cuda_available,
+                    "description": "Local GPU deployment with vLLM, TensorRT-LLM, Transformers",
+                    "gpu_count": len(gpu_manager.gpus),
+                    "cuda_available": gpu_manager.cuda_available,
+                    "nvidia_smi_available": gpu_manager.nvidia_smi_available,
+                    "requirements": ["CUDA", "GPU drivers", "Sufficient GPU memory"]
+                }
+            except Exception as e:
+                return {
+                    "provider": "local",
+                    "available": False,
+                    "description": "Local GPU deployment",
+                    "error": str(e)
+                }
+        elif provider == "modal":
+            return {
+                "provider": "modal",
+                "available": True,
+                "description": "Modal serverless platform"
+            }
+        elif provider == "triton":
+            # Check if Docker is available
+            try:
+                import docker
+                docker.from_env()
+                docker_available = True
+            except Exception:
+                docker_available = False
+
+            return {
+                "provider": "triton",
+                "available": docker_available,
+                "description": "Triton Inference Server with TensorRT-LLM",
+                "requirements": ["Docker", "GPU support"]
+            }
+        else:
+            raise ValueError(f"Unknown provider: {provider}")
+
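`get_provider_status` probes each backend differently: a GPU-manager import for `local`, an unconditional `available: True` for `modal`, and a Docker client handshake for `triton`. A hedged sketch of selecting the first usable provider, again assuming an existing `manager` instance:

```python
# Hedged sketch: choose a provider whose status reports available=True.
import asyncio

async def pick_provider(manager) -> str:
    for name in await manager.list_providers():
        status = await manager.get_provider_status(name)
        if status.get("available"):
            return name
    raise RuntimeError("No deployment provider is currently available")

# provider = asyncio.run(pick_provider(manager))
```
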
+    def _track_deployment_billing(
+        self,
+        config: Any,
+        provider: str,
+        operation_type: str,
+        deployment_duration_hours: float,
+        result: Dict[str, Any]
+    ):
+        """Track billing for deployment operations"""
+        try:
+            from ...core.models.deployment_billing_tracker import get_deployment_billing_tracker
+
+            # Extract GPU info from config
+            gpu_type = getattr(config, 'gpu_type', None)
+            gpu_count = getattr(config, 'gpu_count', 1)
+            memory_gb = getattr(config, 'memory_gb', None)
+
+            # service_type may be an enum or a plain string; normalize it to str
+            service_type = getattr(config, 'service_type', 'unknown')
+            service_type = service_type.value if hasattr(service_type, 'value') else str(service_type)
+
+            # Track the deployment billing
+            billing_tracker = get_deployment_billing_tracker()
+            billing_tracker.track_deployment_usage(
+                model_id=getattr(config, 'model_id', 'unknown'),
+                provider=provider,
+                operation_type=operation_type,
+                service_type=service_type,
+                operation="deploy",
+                gpu_type=gpu_type,
+                gpu_count=gpu_count,
+                runtime_hours=deployment_duration_hours,
+                deployment_duration_hours=deployment_duration_hours,
+                memory_gb=memory_gb,
+                metadata={
+                    "deployment_id": result.get("deployment_id"),
+                    "endpoint_url": result.get("endpoint_url"),
+                    "provider_details": provider
+                }
+            )
+
+            logger.info(f"Tracked deployment billing: {provider} - {deployment_duration_hours:.3f}h")
+
+        except Exception as e:
+            logger.error(f"Failed to track deployment billing: {e}")
+
+    async def estimate_deployment_cost(
+        self,
+        provider: str,
+        gpu_type: str,
+        gpu_count: int = 1,
+        estimated_hours: float = 1.0
+    ) -> Dict[str, float]:
+        """Estimate deployment costs before starting"""
+        try:
+            from ...core.models.deployment_billing_tracker import get_deployment_billing_tracker
+
+            billing_tracker = get_deployment_billing_tracker()
+            return billing_tracker.estimate_deployment_cost(
+                provider=provider,
+                gpu_type=gpu_type,
+                gpu_count=gpu_count,
+                estimated_hours=estimated_hours
+            )
+        except Exception as e:
+            logger.error(f"Failed to estimate deployment cost: {e}")
+            return {"total_cost": 0.0, "compute_cost": 0.0, "storage_cost": 0.0, "network_cost": 0.0}
+
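`estimate_deployment_cost` is a thin wrapper over the billing tracker and falls back to an all-zero breakdown if the tracker cannot be loaded; the actual per-GPU rates live in `deployment_billing_tracker` and are not shown in this hunk. A hedged pre-deployment budget check (GPU type, hours, and the budget figure are illustrative values):

```python
# Hedged sketch: refuse to deploy if the estimate exceeds a budget.
import asyncio

async def within_budget(manager, budget_usd: float = 5.0) -> bool:
    estimate = await manager.estimate_deployment_cost(
        provider="modal",
        gpu_type="A10G",
        gpu_count=1,
        estimated_hours=2.0,
    )
    print(f"Estimated total ${estimate['total_cost']:.2f} "
          f"(compute ${estimate['compute_cost']:.2f})")
    return estimate["total_cost"] <= budget_usd

# ok = asyncio.run(within_budget(manager))
```
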
+    def _track_modal_deployment_billing(
+        self,
+        service_name: str,
+        model_id: str,
+        service_type: str,
+        deployment_duration_hours: float,
+        config: Optional[Dict[str, Any]],
+        result: Dict[str, Any]
+    ):
+        """Track billing for Modal deployment operations"""
+        try:
+            from ...core.models.deployment_billing_tracker import get_deployment_billing_tracker
+
+            # Extract GPU info from config or use defaults
+            gpu_type = config.get('gpu_type', 't4') if config else 't4'
+            gpu_count = config.get('gpu_count', 1) if config else 1
+            memory_gb = config.get('memory_gb', 8) if config else 8
+
+            # Track the Modal deployment billing
+            billing_tracker = get_deployment_billing_tracker()
+            billing_tracker.track_deployment_usage(
+                model_id=model_id,
+                provider="modal",
+                operation_type="deployment",
+                service_type=service_type,
+                operation="deploy",
+                gpu_type=gpu_type,
+                gpu_count=gpu_count,
+                runtime_hours=deployment_duration_hours,
+                deployment_duration_hours=deployment_duration_hours,
+                memory_gb=memory_gb,
+                metadata={
+                    "service_name": service_name,
+                    "deployment_id": result.get("deployment_id"),
+                    "endpoint_url": result.get("endpoint_url"),
+                    "provider_details": "modal_serverless"
+                }
+            )
+
+            logger.info(f"Tracked Modal deployment billing: {service_name} - {deployment_duration_hours:.3f}h")
+
+        except Exception as e:
+            logger.error(f"Failed to track Modal deployment billing: {e}")
+
+    async def list_modal_services(self) -> List[Dict[str, Any]]:
+        """List available Modal services by type"""
+        services = {
+            "llm": ["isa_llm_service"],
+            "vision": ["isa_vision_ocr_service", "isa_vision_ui_service", "isa_vision_table_service", "isa_vision_qwen25_service"],
+            "audio": ["isa_audio_chatTTS_service", "isa_audio_openvoice_service", "isa_audio_service_v2", "isa_audio_fish_service"],
+            "embedding": ["isa_embed_rerank_service"],
+            "video": ["isa_video_hunyuan_service"]
+        }
+
+        result = []
+        for service_type, service_list in services.items():
+            for service_name in service_list:
+                result.append({
+                    "service_name": service_name,
+                    "service_type": service_type,
+                    "platform": "modal"
+                })
+
+        return result
+
+    # ============= MODAL SERVICE CODE GENERATION =============
+
+    async def _generate_modal_service_code(self,
+                                           service_name: str,
+                                           model_config: Any,
+                                           service_type: str,
+                                           config: Dict[str, Any]) -> str:
+        """Generate Modal service code based on model type and configuration"""
+
+        # Choose the appropriate service template based on service_type
+        if service_type == "llm":
+            return self._generate_llm_service_code(service_name, model_config, config)
+        elif service_type == "vision":
+            return self._generate_vision_service_code(service_name, model_config, config)
+        elif service_type == "embedding":
+            return self._generate_embedding_service_code(service_name, model_config, config)
+        else:
+            # Default to LLM service
+            return self._generate_llm_service_code(service_name, model_config, config)
+
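The dispatcher above returns generated Modal source code as a string; the vision and embedding branches further down are still one-line stubs, so only the LLM template is fully implemented. A hedged sketch of how the generated source is presumably written to disk and handed to `_execute_modal_deployment` (the directory layout mirrors the `deployment/models/*_modal_service.py` files shipped in this release, but the exact wiring is an assumption):

```python
# Hedged sketch of the generate -> persist -> deploy flow inside the manager.
from pathlib import Path
from datetime import datetime

async def _deploy_generated_service(self, service_name, model_config, service_type, config):
    source = await self._generate_modal_service_code(
        service_name, model_config, service_type, config
    )

    # Persist the generated module so `modal deploy` has a file to work from
    service_dir = Path("isa_model/deployment/models") / service_name  # assumed layout
    service_dir.mkdir(parents=True, exist_ok=True)
    service_file = service_dir / f"{service_name}_modal_service.py"
    service_file.write_text(source)

    deployment_id = f"modal-{service_name}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
    return await self._execute_modal_deployment(
        service_file, service_name, model_config, deployment_id
    )
```
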
+    def _generate_llm_service_code(self, service_name: str, model_config: Any, config: Dict[str, Any]) -> str:
+        """Generate production-ready LLM service code for Modal"""
+        dependencies = getattr(model_config, 'dependencies', None) or [
+            "torch", "transformers>=4.36.0", "accelerate", "bitsandbytes", "flash-attn"
+        ]
+
+        # Determine optimal GPU based on model size
+        gpu_config = self._get_optimal_gpu_config(model_config)
+
+        return f'''"""
+{service_name} LLM Service for Modal
+
+Production-ready service for model: {getattr(model_config, 'model_id', 'unknown')}
+Architecture: {getattr(model_config, 'architecture', 'transformer')}
+Generated automatically by ISA Model Deployment Manager
+"""
+
+import modal
+import asyncio
+import json
+import time
+from typing import Dict, Any, List, Optional
+from datetime import datetime
+
+# Create Modal app
+app = modal.App("{service_name}")
+
+# Production image with optimized dependencies
+image = (
+    modal.Image.debian_slim(python_version="3.11")
+    .pip_install([
+        {', '.join([f'"{dep}"' for dep in dependencies])}
+    ])
+    .env({{"HF_HUB_ENABLE_HF_TRANSFER": "1"}})
+)
+
+@app.cls(
+    image=image,
+    gpu=modal.gpu.{gpu_config['gpu_type']}(count={gpu_config['gpu_count']}),
+    container_idle_timeout=300,
+    timeout=1800,  # 30 minutes
+    memory={getattr(model_config, 'container_memory_mb', 32768)},
+    keep_warm=1,  # Keep one container warm
+    allow_concurrent_inputs=10
+)
+class {service_name.replace('-', '_').title()}Service:
+
+    @modal.enter()
+    def load_model(self):
+        """Load model with production optimizations"""
+        import torch
+        from transformers import (
+            AutoTokenizer,
+            AutoModelForCausalLM,
+            BitsAndBytesConfig
+        )
+
+        model_id = "{getattr(model_config, 'model_id', 'microsoft/DialoGPT-medium')}"
+
+        print(f"Loading model: {{model_id}}")
+        start_time = time.time()
+
+        # Load tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_id,
+            trust_remote_code=True,
+            use_fast=True
+        )
+
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        # Configure quantization for efficiency
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4"
+        )
+
+        # Load model with optimizations
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            quantization_config=quantization_config,
+            device_map="auto",
+            trust_remote_code=True,
+            torch_dtype=torch.float16,
+            attn_implementation="flash_attention_2"
+        )
+
+        self.model.eval()
+
+        load_time = time.time() - start_time
+        print(f"Model loaded successfully in {{load_time:.2f}}s")
+
+        # Model metadata
+        self.model_info = {{
+            "model_id": model_id,
+            "architecture": "{getattr(model_config, 'architecture', 'transformer')}",
+            "parameters": getattr(self.model, 'num_parameters', lambda: 0)(),
+            "loaded_at": datetime.now().isoformat(),
+            "load_time_seconds": load_time
+        }}
+
+    @modal.method()
+    def generate(self,
+                 messages: List[Dict[str, str]],
+                 max_tokens: int = 512,
+                 temperature: float = 0.7,
+                 top_p: float = 0.9,
+                 top_k: int = 50,
+                 do_sample: bool = True,
+                 **kwargs) -> Dict[str, Any]:
+        """Generate response with production features"""
+        import torch  # torch only exists inside the container image, so import per call
+
+        start_time = time.time()
+
+        try:
+            # Format messages into prompt
+            prompt = self._format_messages(messages)
+
+            # Tokenize input
+            inputs = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=2048
+            ).to(self.model.device)
+
+            # Generate response
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=max_tokens,
+                    temperature=temperature,
+                    top_p=top_p,
+                    top_k=top_k,
+                    do_sample=do_sample,
+                    pad_token_id=self.tokenizer.eos_token_id,
+                    eos_token_id=self.tokenizer.eos_token_id,
+                    use_cache=True
+                )
+
+            # Decode response
+            response_tokens = outputs[0][inputs['input_ids'].shape[-1]:]
+            response_text = self.tokenizer.decode(
+                response_tokens,
+                skip_special_tokens=True
+            ).strip()
+
+            generation_time = time.time() - start_time
+
+            return {{
+                "response": response_text,
+                "model": self.model_info["model_id"],
+                "usage": {{
+                    "prompt_tokens": inputs['input_ids'].shape[-1],
+                    "completion_tokens": len(response_tokens),
+                    "total_tokens": inputs['input_ids'].shape[-1] + len(response_tokens)
+                }},
+                "metadata": {{
+                    "generation_time_seconds": generation_time,
+                    "parameters": {{
+                        "temperature": temperature,
+                        "top_p": top_p,
+                        "top_k": top_k,
+                        "max_tokens": max_tokens
+                    }},
+                    "timestamp": datetime.now().isoformat()
+                }}
+            }}
+
+        except Exception as e:
+            return {{
+                "error": str(e),
+                "error_type": type(e).__name__,
+                "model": self.model_info.get("model_id", "unknown"),
+                "timestamp": datetime.now().isoformat()
+            }}
+
+    def _format_messages(self, messages: List[Dict[str, str]]) -> str:
+        """Format messages into model-appropriate prompt"""
+        if not messages:
+            return ""
+
+        # Simple chat format - can be enhanced for specific models
+        formatted_parts = []
+        for msg in messages:
+            role = msg.get("role", "user")
+            content = msg.get("content", "")
+
+            if role == "system":
+                formatted_parts.append(f"System: {{content}}")
+            elif role == "user":
+                formatted_parts.append(f"Human: {{content}}")
+            elif role == "assistant":
+                formatted_parts.append(f"Assistant: {{content}}")
+
+        formatted_parts.append("Assistant:")
+        return "\\n\\n".join(formatted_parts)
+
+    @modal.method()
+    def get_model_info(self) -> Dict[str, Any]:
+        """Get model metadata"""
+        return self.model_info
+
+# Web endpoint for HTTP access
+@app.function(
+    image=image,
+    timeout=300
+)
+@modal.web_endpoint(method="POST")
+async def inference_endpoint(item: Dict[str, Any]):
+    """HTTP endpoint for model inference"""
+    try:
+        service = {service_name.replace('-', '_').title()}Service()
+
+        # Extract parameters
+        messages = item.get("messages", [])
+        max_tokens = item.get("max_tokens", 512)
+        temperature = item.get("temperature", 0.7)
+        top_p = item.get("top_p", 0.9)
+
+        # Generate response
+        result = service.generate(
+            messages=messages,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p
+        )
+
+        return result
+
+    except Exception as e:
+        return {{
+            "error": str(e),
+            "error_type": type(e).__name__,
+            "endpoint": "inference_endpoint",
+            "timestamp": datetime.now().isoformat()
+        }}
+
+@app.function(image=image)
+@modal.web_endpoint(method="GET")
+async def health_check():
+    """Health check endpoint"""
+    return {{
+        "status": "healthy",
+        "service": "{service_name}",
+        "timestamp": datetime.now().isoformat(),
+        "version": "1.0.0"
+    }}
+
+@app.function(image=image)
+@modal.web_endpoint(method="GET")
+async def model_info():
+    """Model information endpoint"""
+    try:
+        service = {service_name.replace('-', '_').title()}Service()
+        return service.get_model_info()
+    except Exception as e:
+        return {{
+            "error": str(e),
+            "timestamp": datetime.now().isoformat()
+        }}
+
+# For local testing
+if __name__ == "__main__":
+    # Test the service locally
+    import asyncio
+
+    async def test():
+        service = {service_name.replace('-', '_').title()}Service()
+        result = service.generate([
+            {{"role": "user", "content": "Hello! How are you today?"}}
+        ])
+        print(json.dumps(result, indent=2))
+
+    asyncio.run(test())
+'''
+
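The generated app exposes three web endpoints: `inference_endpoint` (POST), `health_check` (GET), and `model_info` (GET). A hedged client-side sketch; the URL is a placeholder, since the real hostname depends on the Modal workspace and is only known after deployment:

```python
# Hedged client sketch for the generated service; endpoint_url is a placeholder.
import requests

endpoint_url = "https://example-workspace--my-llm-service.modal.run"  # assumption

payload = {
    "messages": [{"role": "user", "content": "Summarize what this service does."}],
    "max_tokens": 128,
    "temperature": 0.7,
}
resp = requests.post(endpoint_url, json=payload, timeout=120)
resp.raise_for_status()
data = resp.json()
print(data.get("response"))
print(data.get("usage"))
```
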
+    def _generate_vision_service_code(self, service_name: str, model_config: Any, config: Dict[str, Any]) -> str:
+        """Generate Vision service code for Modal (currently a stub template)"""
+        return f'# Vision service template for {service_name} - {model_config.model_id}'
+
+    def _generate_embedding_service_code(self, service_name: str, model_config: Any, config: Dict[str, Any]) -> str:
+        """Generate Embedding service code for Modal (currently a stub template)"""
+        return f'# Embedding service template for {service_name} - {model_config.model_id}'
+
+    async def _execute_modal_deployment(self,
+                                        service_file: Path,
+                                        service_name: str,
+                                        model_config: Any,
+                                        deployment_id: str) -> Dict[str, Any]:
+        """Execute the actual Modal deployment via the Modal CLI"""
+
+        logger.info(f"Executing Modal deployment for {service_name}...")
+
+        try:
+            import subprocess
+            import tempfile
+            import os
+
+            # Check if modal CLI is available
+            modal_check = subprocess.run(["modal", "--version"],
+                                         capture_output=True, text=True, timeout=10)
+            if modal_check.returncode != 0:
+                raise RuntimeError("Modal CLI not found. Please install Modal: pip install modal")
+
+            # Create a temporary script for deployment
+            with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as tmp_file:
+                tmp_file.write(service_file.read_text())
+                tmp_script_path = tmp_file.name
+
+            try:
+                # Execute Modal deployment
+                logger.info(f"Deploying Modal service from {service_file}")
+                deploy_result = subprocess.run(
+                    ["modal", "deploy", tmp_script_path],
+                    capture_output=True,
+                    text=True,
+                    timeout=300,  # 5 minute timeout
+                    cwd=service_file.parent
+                )
+
+                if deploy_result.returncode == 0:
+                    # Parse deployment output to extract endpoint URL
+                    output = deploy_result.stdout + deploy_result.stderr
+                    endpoint_url = self._extract_modal_endpoint(output, service_name, deployment_id)
+
+                    result = {
+                        "status": "deployed",
+                        "endpoint_url": endpoint_url,
+                        "deployment_id": deployment_id,
+                        "service_file": str(service_file),
+                        "model_architecture": getattr(model_config, 'architecture', 'unknown'),
+                        "deployment_output": output,
+                        "estimated_startup_time": "30-60 seconds"
+                    }
+
+                    logger.info(f"Modal deployment completed successfully: {endpoint_url}")
+                    return result
+
+                else:
+                    error_output = deploy_result.stderr or deploy_result.stdout
+                    logger.error(f"Modal deployment failed: {error_output}")
+                    raise RuntimeError(f"Modal deployment failed: {error_output}")
+
+            finally:
+                # Clean up temporary file
+                if os.path.exists(tmp_script_path):
+                    os.unlink(tmp_script_path)
+
+        except subprocess.TimeoutExpired:
+            logger.error("Modal deployment timed out")
+            raise RuntimeError("Modal deployment timed out after 5 minutes")
+
+        except Exception as e:
+            logger.error(f"Failed to execute Modal deployment: {e}")
+            raise
+
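The result dict advertises an estimated cold start of 30-60 seconds, so callers will usually want to poll before sending traffic. A hedged sketch that waits on the generated `health_check` endpoint; Modal assigns each web endpoint its own URL, so the exact `health_url` value is deployment-specific and assumed here:

```python
# Hedged sketch: poll a health endpoint until the container reports healthy.
import time
import requests

def wait_until_healthy(health_url: str, timeout_s: float = 120.0) -> bool:
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            resp = requests.get(health_url, timeout=10)
            if resp.ok and resp.json().get("status") == "healthy":
                return True
        except requests.RequestException:
            pass  # container may still be cold-starting
        time.sleep(5)
    return False
```
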
+    def _extract_modal_endpoint(self, output: str, service_name: str, deployment_id: str) -> str:
+        """Extract Modal endpoint URL from deployment output"""
+        import re
+
+        # Look for typical Modal endpoint patterns in output
+        patterns = [
+            r'https://[a-zA-Z0-9\-]+--[a-zA-Z0-9\-]+\.modal\.run',
+            r'Deployed! Your app is at (https://[^\s]+)',
+            r'App deployed to (https://[^\s]+)',
+            r'Available at (https://[^\s]+)'
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, output)
+            if match:
+                url = match.group(1) if match.lastindex else match.group(0)
+                logger.info(f"Extracted Modal endpoint: {url}")
+                return url
+
+        # If no endpoint found in output, generate expected URL pattern
+        endpoint_url = f"https://{service_name}--{deployment_id}.modal.run"
+        logger.warning(f"Could not extract endpoint from output, using expected pattern: {endpoint_url}")
+        return endpoint_url
+
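The first pattern matches bare `*.modal.run` URLs anywhere in the CLI output, while the remaining patterns capture URLs out of human-readable messages; if nothing matches, the method falls back to a guessed `https://{service_name}--{deployment_id}.modal.run`, which may not correspond to a live endpoint. A small illustrative check of the first pattern (the sample output line is invented):

```python
# Illustrative only: exercising the first regex against made-up CLI output.
import re

sample_output = "Created web endpoint => https://acme-corp--my-llm-service.modal.run"
pattern = r'https://[a-zA-Z0-9\-]+--[a-zA-Z0-9\-]+\.modal\.run'

match = re.search(pattern, sample_output)
assert match is not None
print(match.group(0))  # https://acme-corp--my-llm-service.modal.run
```
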
+    def _get_optimal_gpu_config(self, model_config: Any) -> Dict[str, Any]:
+        """Determine optimal GPU configuration based on model size"""
+
+        # Get model parameters or estimate from model ID
+        parameters = getattr(model_config, 'parameters', None)
+        model_id = getattr(model_config, 'model_id', '')
+
+        # Estimate parameters from model name if not available
+        if not parameters:
+            if '7b' in model_id.lower():
+                parameters = 7_000_000_000
+            elif '13b' in model_id.lower():
+                parameters = 13_000_000_000
+            elif '70b' in model_id.lower():
+                parameters = 70_000_000_000
+            elif 'large' in model_id.lower():
+                parameters = 1_000_000_000
+            elif 'medium' in model_id.lower():
+                parameters = 350_000_000
+            else:
+                parameters = 500_000_000  # Default assumption
+
+        # Choose GPU based on model size
+        if parameters > 50_000_000_000:  # >50B parameters
+            return {"gpu_type": "A100", "gpu_count": 2}
+        elif parameters > 15_000_000_000:  # 15B-50B parameters
+            return {"gpu_type": "A100", "gpu_count": 1}
+        elif parameters > 3_000_000_000:  # 3B-15B parameters
+            return {"gpu_type": "A10G", "gpu_count": 1}
+        else:  # <3B parameters
+            return {"gpu_type": "T4", "gpu_count": 1}
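The sizing heuristic uses the exact parameter count when `model_config.parameters` is set and otherwise guesses from substrings of the model ID; the result feeds the `gpu=modal.gpu.<type>(count=...)` line in the generated `@app.cls` decorator. A hedged sketch of previewing the choice, with `SimpleNamespace` standing in for the real model config object:

```python
# Hedged sketch: preview which GPU the heuristic would pick for a model id.
from types import SimpleNamespace

def preview_gpu_choice(manager, model_id: str):
    cfg = SimpleNamespace(model_id=model_id, parameters=None)  # stand-in config
    print(model_id, "->", manager._get_optimal_gpu_config(cfg))

# preview_gpu_choice(manager, "meta-llama/Llama-2-70b-hf")  # {'gpu_type': 'A100', 'gpu_count': 2}
# preview_gpu_choice(manager, "mistralai/Mistral-7B-v0.1")  # {'gpu_type': 'A10G', 'gpu_count': 1}
# preview_gpu_choice(manager, "gpt2-medium")                # {'gpu_type': 'T4', 'gpu_count': 1}
```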