isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +40 -17
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/storage/hf_storage.py +1 -1
  26. isa_model/core/types.py +1 -0
  27. isa_model/deployment/__init__.py +5 -48
  28. isa_model/deployment/core/__init__.py +2 -31
  29. isa_model/deployment/core/deployment_manager.py +1278 -370
  30. isa_model/deployment/local/__init__.py +31 -0
  31. isa_model/deployment/local/config.py +248 -0
  32. isa_model/deployment/local/gpu_gateway.py +607 -0
  33. isa_model/deployment/local/health_checker.py +428 -0
  34. isa_model/deployment/local/provider.py +586 -0
  35. isa_model/deployment/local/tensorrt_service.py +621 -0
  36. isa_model/deployment/local/transformers_service.py +644 -0
  37. isa_model/deployment/local/vllm_service.py +527 -0
  38. isa_model/deployment/modal/__init__.py +8 -0
  39. isa_model/deployment/modal/config.py +136 -0
  40. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  41. isa_model/deployment/modal/services/__init__.py +3 -0
  42. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  43. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  44. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  45. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  46. isa_model/deployment/modal/services/video/__init__.py +1 -0
  47. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  48. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  49. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  50. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  51. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  52. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  53. isa_model/deployment/storage/__init__.py +5 -0
  54. isa_model/deployment/storage/deployment_repository.py +824 -0
  55. isa_model/deployment/triton/__init__.py +10 -0
  56. isa_model/deployment/triton/config.py +196 -0
  57. isa_model/deployment/triton/configs/__init__.py +1 -0
  58. isa_model/deployment/triton/provider.py +512 -0
  59. isa_model/deployment/triton/scripts/__init__.py +1 -0
  60. isa_model/deployment/triton/templates/__init__.py +1 -0
  61. isa_model/inference/__init__.py +47 -1
  62. isa_model/inference/ai_factory.py +137 -10
  63. isa_model/inference/legacy_services/__init__.py +21 -0
  64. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  65. isa_model/inference/legacy_services/model_service.py +573 -0
  66. isa_model/inference/legacy_services/model_serving.py +717 -0
  67. isa_model/inference/legacy_services/model_training.py +561 -0
  68. isa_model/inference/models/__init__.py +21 -0
  69. isa_model/inference/models/inference_config.py +551 -0
  70. isa_model/inference/models/inference_record.py +675 -0
  71. isa_model/inference/models/performance_models.py +714 -0
  72. isa_model/inference/repositories/__init__.py +9 -0
  73. isa_model/inference/repositories/inference_repository.py +828 -0
  74. isa_model/inference/services/audio/base_stt_service.py +184 -11
  75. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  76. isa_model/inference/services/custom_model_manager.py +277 -0
  77. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  78. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  79. isa_model/inference/services/llm/__init__.py +10 -2
  80. isa_model/inference/services/llm/base_llm_service.py +335 -24
  81. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  82. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  83. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  84. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  85. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  86. isa_model/inference/services/llm/local_llm_service.py +747 -0
  87. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  88. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  89. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  90. isa_model/inference/services/vision/__init__.py +22 -1
  91. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  92. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  93. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  94. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  95. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  96. isa_model/serving/api/cache_manager.py +245 -0
  97. isa_model/serving/api/dependencies/__init__.py +1 -0
  98. isa_model/serving/api/dependencies/auth.py +194 -0
  99. isa_model/serving/api/dependencies/database.py +139 -0
  100. isa_model/serving/api/error_handlers.py +284 -0
  101. isa_model/serving/api/fastapi_server.py +172 -22
  102. isa_model/serving/api/middleware/auth.py +8 -2
  103. isa_model/serving/api/middleware/security.py +23 -33
  104. isa_model/serving/api/middleware/tenant_context.py +414 -0
  105. isa_model/serving/api/routes/analytics.py +4 -1
  106. isa_model/serving/api/routes/config.py +645 -0
  107. isa_model/serving/api/routes/deployment_billing.py +315 -0
  108. isa_model/serving/api/routes/deployments.py +138 -2
  109. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  110. isa_model/serving/api/routes/health.py +32 -12
  111. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  112. isa_model/serving/api/routes/local_deployments.py +448 -0
  113. isa_model/serving/api/routes/tenants.py +575 -0
  114. isa_model/serving/api/routes/unified.py +680 -18
  115. isa_model/serving/api/routes/webhooks.py +479 -0
  116. isa_model/serving/api/startup.py +68 -54
  117. isa_model/utils/gpu_utils.py +311 -0
  118. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
  119. isa_model-0.4.3.dist-info/RECORD +193 -0
  120. isa_model/core/storage/minio_storage.py +0 -0
  121. isa_model/deployment/cloud/__init__.py +0 -9
  122. isa_model/deployment/cloud/modal/__init__.py +0 -10
  123. isa_model/deployment/core/deployment_config.py +0 -356
  124. isa_model/deployment/core/isa_deployment_service.py +0 -401
  125. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  126. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  127. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  128. isa_model/deployment/runtime/deployed_service.py +0 -338
  129. isa_model/deployment/services/__init__.py +0 -9
  130. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  131. isa_model/deployment/services/model_service.py +0 -332
  132. isa_model/deployment/services/service_monitor.py +0 -356
  133. isa_model/deployment/services/service_registry.py +0 -527
  134. isa_model/eval/__init__.py +0 -92
  135. isa_model/eval/benchmarks/__init__.py +0 -27
  136. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  137. isa_model/eval/benchmarks.py +0 -701
  138. isa_model/eval/config/__init__.py +0 -10
  139. isa_model/eval/config/evaluation_config.py +0 -108
  140. isa_model/eval/evaluators/__init__.py +0 -24
  141. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  142. isa_model/eval/evaluators/base_evaluator.py +0 -503
  143. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  144. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  145. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  146. isa_model/eval/example_evaluation.py +0 -395
  147. isa_model/eval/factory.py +0 -798
  148. isa_model/eval/infrastructure/__init__.py +0 -24
  149. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  150. isa_model/eval/isa_benchmarks.py +0 -700
  151. isa_model/eval/isa_integration.py +0 -582
  152. isa_model/eval/metrics.py +0 -951
  153. isa_model/eval/tests/unit/test_basic.py +0 -396
  154. isa_model/serving/api/routes/evaluations.py +0 -579
  155. isa_model/training/__init__.py +0 -168
  156. isa_model/training/annotation/annotation_schema.py +0 -47
  157. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  158. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  159. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  160. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  161. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  162. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  163. isa_model/training/annotation/views/annotation_controller.py +0 -158
  164. isa_model/training/cloud/__init__.py +0 -22
  165. isa_model/training/cloud/job_orchestrator.py +0 -402
  166. isa_model/training/cloud/runpod_trainer.py +0 -454
  167. isa_model/training/cloud/storage_manager.py +0 -482
  168. isa_model/training/core/__init__.py +0 -26
  169. isa_model/training/core/config.py +0 -181
  170. isa_model/training/core/dataset.py +0 -222
  171. isa_model/training/core/trainer.py +0 -720
  172. isa_model/training/core/utils.py +0 -213
  173. isa_model/training/examples/intelligent_training_example.py +0 -281
  174. isa_model/training/factory.py +0 -424
  175. isa_model/training/intelligent/__init__.py +0 -25
  176. isa_model/training/intelligent/decision_engine.py +0 -643
  177. isa_model/training/intelligent/intelligent_factory.py +0 -888
  178. isa_model/training/intelligent/knowledge_base.py +0 -751
  179. isa_model/training/intelligent/resource_optimizer.py +0 -839
  180. isa_model/training/intelligent/task_classifier.py +0 -576
  181. isa_model/training/storage/__init__.py +0 -24
  182. isa_model/training/storage/core_integration.py +0 -439
  183. isa_model/training/storage/training_repository.py +0 -552
  184. isa_model/training/storage/training_storage.py +0 -628
  185. isa_model-0.4.0.dist-info/RECORD +0 -182
  186. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  187. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  188. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  189. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  190. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  191. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  192. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  193. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  194. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  195. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  196. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  197. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  198. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  199. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
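Two structural changes dominate the list: the Modal services move from `isa_model/deployment/cloud/modal` into a per-modality `isa_model/deployment/modal/services` tree (entries 40 and 186–197), and the old `eval/` and `training/` packages are removed outright. A minimal sketch of the import paths implied by the new layout — the class and module names are taken from the `deployment_manager.py` diff below, not verified against the published wheel:

```python
# Import paths under the assumed 0.4.3 layout, inferred from the renames
# above and the provider properties in the deployment_manager.py diff below.
from isa_model.deployment import DeploymentManager                # unified entry point
from isa_model.deployment.modal.deployer import ModalDeployer     # was deployment/services/auto_hf_modal_deployer.py
from isa_model.deployment.triton.provider import TritonProvider   # new in 0.4.3
from isa_model.deployment.local.provider import LocalGPUProvider  # new in 0.4.3
```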
@@ -1,8 +1,7 @@
 """
-Deployment Manager
+Unified Deployment Manager
 
-Orchestrates the complete deployment workflow including model preparation,
-container building, deployment to cloud providers, and monitoring.
+Orchestrates deployment of AI models to multiple platforms (Modal, Triton, Local GPU).
 """
 
 import os
@@ -13,85 +12,69 @@ from pathlib import Path
 from datetime import datetime
 import asyncio
 
-from .deployment_config import (
-    DeploymentConfig, DeploymentProvider, InferenceEngine,
-    ModelConfig, TritonConfig, RunPodServerlessConfig
-)
-from ...core.models.model_manager import ModelManager
-from ...core.models.model_repo import ModelCapability, ModelType
-# ModelRegistry may not exist or may be in a different location
-from ...core.storage.hf_storage import HuggingFaceStorage
+from ...core.config.config_manager import ConfigManager
 
 logger = logging.getLogger(__name__)
 
 
 class DeploymentManager:
     """
-    Manages the complete deployment lifecycle for AI models.
+    Unified deployment manager for multiple platforms.
 
     This manager coordinates:
-    - Model preparation and optimization
-    - Container building and configuration
-    - Deployment to cloud providers
-    - Health monitoring and scaling
-    - Integration with model registry
+    - Local GPU deployment with vLLM, TensorRT-LLM, Transformers
+    - Cloud deployment to Modal platform
+    - Container deployment with Triton Inference Server
+    - Deployment tracking and monitoring
 
     Example:
         ```python
         from isa_model.deployment import DeploymentManager
-        from isa_model.deployment.core import create_gemma_runpod_triton_config
+        from isa_model.deployment.local import create_vllm_config
 
         # Initialize deployment manager
         manager = DeploymentManager()
 
-        # Create deployment configuration
-        config = create_gemma_runpod_triton_config(
-            model_id="gemma-v1",
-            runpod_api_key="your-api-key",
-            model_source_path="xenobordom/gemma-4b-alpaca-v1"
-        )
+        # Deploy to local GPU
+        local_config = create_vllm_config("llama2-7b", "meta-llama/Llama-2-7b-chat-hf")
+        local_deployment = await manager.deploy_to_local(local_config)
 
-        # Deploy the model
-        deployment = await manager.deploy_model(config)
-        print(f"Model deployed: {deployment['endpoint_url']}")
+        # Deploy to Modal
+        modal_deployment = await manager.deploy_to_modal(
+            service_name="llm-service",
+            model_id="my-model",
+            service_type="llm"
+        )
         ```
     """
 
-    def __init__(self,
-                 model_manager: Optional[ModelManager] = None,
-                 storage_backend: str = "huggingface",
-                 workspace_dir: str = "./deployments"):
+    def __init__(self, workspace_dir: str = "./deployments"):
         """
         Initialize deployment manager.
-
+
         Args:
-            model_manager: Model manager instance
-            storage_backend: Storage backend to use ("huggingface", "local")
            workspace_dir: Directory for deployment artifacts
         """
         self.workspace_dir = Path(workspace_dir)
         self.workspace_dir.mkdir(parents=True, exist_ok=True)
-
-        # Initialize model management
-        if storage_backend == "huggingface":
-            storage = HuggingFaceStorage()
-        else:
-            from ...core.models.model_storage import LocalModelStorage
-            storage = LocalModelStorage()
-
-        self.model_manager = model_manager or ModelManager(storage=storage)
-        # self.model_registry = ModelRegistry() # ModelRegistry may not exist
-        self.model_registry = None
-
+
         # Deployment tracking
         self.deployments: Dict[str, Dict[str, Any]] = {}
         self.deployments_file = self.workspace_dir / "deployments.json"
         self._load_deployments()
-
+
         # Setup logging
         self._setup_logging()
-
-        logger.info(f"Deployment manager initialized with {storage_backend} storage")
+
+        # Initialize configuration manager
+        self.config_manager = ConfigManager()
+
+        # Initialize providers
+        self._modal_provider = None
+        self._triton_provider = None
+        self._local_provider = None
+
+        logger.info("Unified deployment manager initialized")
         logger.info(f"Workspace directory: {self.workspace_dir}")
 
     def _setup_logging(self):
@@ -126,49 +109,78 @@ class DeploymentManager:
         with open(self.deployments_file, 'w') as f:
             json.dump(self.deployments, f, indent=2, default=str)
 
-    async def deploy_model(self, config: DeploymentConfig) -> Dict[str, Any]:
+    async def deploy_to_modal(self,
+                              service_name: str,
+                              model_id: str,
+                              service_type: str = "llm",
+                              config: Optional[Dict[str, Any]] = None,
+                              tenant_context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
         """
-        Deploy a model using the specified configuration.
+        Deploy a service to Modal.
 
         Args:
-            config: Deployment configuration
+            service_name: Name of the service to deploy
+            model_id: Model identifier
+            service_type: Type of service (llm, vision, audio, embedding, video)
+            config: Additional configuration for the service
 
         Returns:
             Deployment result with endpoint information
         """
-        deployment_id = config.deployment_id
+        # Extract tenant information for deployment isolation
+        organization_id = tenant_context.get('organization_id') if tenant_context else 'default'
+        tenant_prefix = f"org-{organization_id}" if organization_id != 'default' else ''
+
+        # Generate tenant-isolated deployment ID
+        base_deployment_id = f"{service_name}-{service_type}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+        deployment_id = f"{tenant_prefix}-{base_deployment_id}" if tenant_prefix else base_deployment_id
 
         logger.info("=" * 60)
-        logger.info(f"STARTING DEPLOYMENT: {deployment_id}")
+        logger.info(f"STARTING MODAL DEPLOYMENT: {deployment_id}")
+        logger.info(f"TENANT: {organization_id}")
         logger.info("=" * 60)
 
         try:
+            # Track deployment start for billing
+            deployment_start_time = datetime.now()
+
             # Step 1: Validate configuration
-            logger.info("Step 1/6: Validating deployment configuration...")
-            self._validate_config(config)
+            logger.info("Step 1/4: Validating deployment configuration...")
+            self._validate_modal_config(service_name, model_id, service_type)
 
-            # Step 2: Prepare model
-            logger.info("Step 2/6: Preparing model...")
-            model_path = await self._prepare_model(config.model_config)
+            # Step 2: Prepare deployment artifacts
+            logger.info("Step 2/4: Preparing Modal deployment artifacts...")
+            artifacts_path = await self._prepare_modal_artifacts(deployment_id, service_name, model_id, service_type, config)
 
-            # Step 3: Optimize model (TensorRT conversion if needed)
-            logger.info("Step 3/6: Optimizing model...")
-            optimized_model_path = await self._optimize_model(config, model_path)
+            # Step 3: Deploy to Modal
+            logger.info("Step 3/4: Deploying to Modal...")
+            deployment_result = await self._deploy_modal_service(deployment_id, service_name, service_type, artifacts_path)
 
-            # Step 4: Prepare deployment artifacts
-            logger.info("Step 4/6: Preparing deployment artifacts...")
-            artifacts_path = await self._prepare_deployment_artifacts(config, optimized_model_path)
+            # Calculate deployment duration
+            deployment_duration = (datetime.now() - deployment_start_time).total_seconds() / 3600  # hours
 
-            # Step 5: Deploy to provider
-            logger.info("Step 5/6: Deploying to provider...")
-            deployment_result = await self._deploy_to_provider(config, artifacts_path)
+            # Track billing for Modal deployment
+            self._track_modal_deployment_billing(
+                service_name=service_name,
+                model_id=model_id,
+                service_type=service_type,
+                deployment_duration_hours=deployment_duration,
+                config=config,
+                result=deployment_result
+            )
 
-            # Step 6: Register deployment
-            logger.info("Step 6/6: Registering deployment...")
-            await self._register_deployment(config, deployment_result)
+            # Step 4: Register deployment
+            logger.info("Step 4/4: Registering deployment...")
+            await self._register_deployment(deployment_id, {
+                "service_name": service_name,
+                "model_id": model_id,
+                "service_type": service_type,
+                "config": config or {},
+                "deployment_duration_hours": deployment_duration
+            }, deployment_result, tenant_context)
 
             logger.info("=" * 60)
-            logger.info("DEPLOYMENT COMPLETED SUCCESSFULLY!")
+            logger.info("MODAL DEPLOYMENT COMPLETED SUCCESSFULLY!")
             logger.info("=" * 60)
             logger.info(f"Deployment ID: {deployment_id}")
             logger.info(f"Endpoint URL: {deployment_result.get('endpoint_url', 'N/A')}")
@@ -177,13 +189,15 @@
 
         except Exception as e:
             logger.error("=" * 60)
-            logger.error("DEPLOYMENT FAILED!")
+            logger.error("MODAL DEPLOYMENT FAILED!")
             logger.error("=" * 60)
             logger.error(f"Error: {e}")
 
             # Update deployment status
             self.deployments[deployment_id] = {
-                "config": config.to_dict(),
+                "service_name": service_name,
+                "model_id": model_id,
+                "service_type": service_type,
                 "status": "failed",
                 "error": str(e),
                 "created_at": datetime.now().isoformat(),
@@ -193,99 +207,44 @@
 
         raise
 
-    def _validate_config(self, config: DeploymentConfig):
-        """Validate deployment configuration"""
-        logger.debug("Validating deployment configuration...")
+    def _validate_modal_config(self, service_name: str, model_id: str, service_type: str):
+        """Validate Modal deployment configuration"""
+        logger.debug("Validating Modal deployment configuration...")
 
         # Check required fields
-        if not config.deployment_id:
-            raise ValueError("deployment_id is required")
-
-        if not config.model_config:
-            raise ValueError("model_config is required")
-
-        # Provider-specific validation
-        if config.provider == DeploymentProvider.RUNPOD_SERVERLESS:
-            if not config.runpod_config or not config.runpod_config.api_key:
-                raise ValueError("RunPod API key is required for RunPod deployment")
-
-        # Engine-specific validation
-        if config.inference_engine == InferenceEngine.TRITON:
-            if not config.triton_config:
-                raise ValueError("Triton configuration is required for Triton engine")
-
-        logger.info("Configuration validation passed")
-
-    async def _prepare_model(self, model_config: ModelConfig) -> Path:
-        """Prepare model for deployment"""
-        logger.info(f"Preparing model: {model_config.model_id}")
-
-        # Determine model type for registry
-        if model_config.model_type == "llm":
-            model_type = ModelType.LLM
-        elif model_config.model_type == "embedding":
-            model_type = ModelType.EMBEDDING
-        elif model_config.model_type == "vision":
-            model_type = ModelType.VISION
-        else:
-            model_type = ModelType.LLM  # Default
-
-        # Convert capabilities
-        capabilities = []
-        for cap in model_config.capabilities:
-            if cap == "text_generation":
-                capabilities.append(ModelCapability.TEXT_GENERATION)
-            elif cap == "chat":
-                capabilities.append(ModelCapability.CHAT)
-            elif cap == "embedding":
-                capabilities.append(ModelCapability.EMBEDDING)
-            else:
-                capabilities.append(ModelCapability.TEXT_GENERATION)  # Default
-
-        # Get or download model
-        if model_config.source_type == "huggingface":
-            model_path = await self.model_manager.get_model(
-                model_id=model_config.model_id,
-                repo_id=model_config.source_path,
-                model_type=model_type,
-                capabilities=capabilities
-            )
-        elif model_config.source_type == "local":
-            model_path = Path(model_config.source_path)
-            if not model_path.exists():
-                raise FileNotFoundError(f"Model not found at {model_path}")
-        else:
-            raise ValueError(f"Unsupported source type: {model_config.source_type}")
-
-        logger.info(f"Model prepared at: {model_path}")
-        return model_path
-
-    async def _optimize_model(self, config: DeploymentConfig, model_path: Path) -> Path:
-        """Optimize model for deployment"""
-        logger.info("Optimizing model for deployment...")
+        if not service_name:
+            raise ValueError("service_name is required")
 
-        # For now, return the original path
-        # TODO: Implement TensorRT optimization, quantization, etc.
-        if config.model_config.use_tensorrt:
-            logger.info("TensorRT optimization requested (not yet implemented)")
+        if not model_id:
+            raise ValueError("model_id is required")
 
-        if config.model_config.use_quantization:
-            logger.info(f"Quantization requested: {config.model_config.quantization_method}")
+        # Check service type
+        valid_service_types = ["llm", "vision", "audio", "embedding", "video"]
+        if service_type not in valid_service_types:
+            raise ValueError(f"service_type must be one of {valid_service_types}")
 
-        logger.info("Model optimization completed (pass-through for now)")
-        return model_path
+        # Check Modal token using ConfigManager
+        modal_config = self.config_manager.get_deployment_config("modal")
+        if not modal_config or not modal_config.get("token_id"):
+            logger.warning("MODAL_TOKEN_ID not found in configuration")
+
+        logger.info("Modal configuration validation passed")
 
-    async def _prepare_deployment_artifacts(self, config: DeploymentConfig, model_path: Path) -> Path:
-        """Prepare deployment artifacts"""
-        logger.info("Preparing deployment artifacts...")
+    async def _prepare_modal_artifacts(self, deployment_id: str, service_name: str, model_id: str, service_type: str, config: Optional[Dict[str, Any]]) -> Path:
+        """Prepare Modal deployment artifacts"""
+        logger.info("Preparing Modal deployment artifacts...")
 
         # Create deployment workspace
-        deployment_workspace = self.workspace_dir / config.deployment_id
+        deployment_workspace = self.workspace_dir / deployment_id
         deployment_workspace.mkdir(exist_ok=True)
 
         artifacts = {
-            "config": config.to_dict(),
-            "model_path": str(model_path),
+            "deployment_id": deployment_id,
+            "service_name": service_name,
+            "model_id": model_id,
+            "service_type": service_type,
+            "config": config or {},
+            "platform": "modal",
             "created_at": datetime.now().isoformat()
         }
 
@@ -293,211 +252,121 @@
         with open(deployment_workspace / "deployment_config.json", 'w') as f:
             json.dump(artifacts, f, indent=2)
 
-        # Generate Triton model configuration if needed
-        if config.inference_engine == InferenceEngine.TRITON:
-            await self._generate_triton_config(config, deployment_workspace, model_path)
-
-        # Generate Docker configuration if needed
-        await self._generate_docker_config(config, deployment_workspace)
-
-        logger.info(f"Deployment artifacts prepared at: {deployment_workspace}")
+        logger.info(f"Modal deployment artifacts prepared at: {deployment_workspace}")
         return deployment_workspace
 
-    async def _generate_triton_config(self, config: DeploymentConfig, workspace: Path, model_path: Path):
-        """Generate Triton model configuration"""
-        logger.info("Generating Triton model configuration...")
-
-        triton_config = config.triton_config
-        model_config = config.model_config
-
-        # Create model repository structure
-        model_repo = workspace / "model_repository"
-        model_dir = model_repo / triton_config.model_name / "1"
-        model_dir.mkdir(parents=True, exist_ok=True)
-
-        # Copy model files
-        import shutil
-        if model_path.is_file():
-            shutil.copy2(model_path, model_dir)
-        else:
-            shutil.copytree(model_path, model_dir / "model", dirs_exist_ok=True)
-
-        # Generate config.pbtxt
-        config_content = f"""
-name: "{triton_config.model_name}"
-backend: "{triton_config.backend}"
-max_batch_size: {triton_config.max_batch_size}
-
-input [
-  {{
-    name: "input_ids"
-    data_type: TYPE_INT32
-    dims: [ -1 ]
-  }},
-  {{
-    name: "attention_mask"
-    data_type: TYPE_INT32
-    dims: [ -1 ]
-    optional: true
-  }}
-]
-
-output [
-  {{
-    name: "output"
-    data_type: TYPE_STRING
-    dims: [ -1 ]
-  }}
-]
-
-instance_group [
-  {{
-    count: {triton_config.instance_group_count}
-    kind: {triton_config.instance_group_kind}
-  }}
-]
-
-dynamic_batching {{
-  max_queue_delay_microseconds: 100
-}}
-"""
-
-        with open(model_repo / triton_config.model_name / "config.pbtxt", 'w') as f:
-            f.write(config_content.strip())
-
-        logger.info("Triton configuration generated")
-
-    async def _generate_docker_config(self, config: DeploymentConfig, workspace: Path):
-        """Generate Docker configuration"""
-        logger.info("Generating Docker configuration...")
-
-        # Generate Dockerfile
-        dockerfile_content = f"""
-FROM {config.runpod_config.container_image if config.runpod_config else 'nvidia/tritonserver:23.10-py3'}
-
-WORKDIR /workspace
-
-# Copy model repository
-COPY model_repository /models
-
-# Copy deployment configuration
-COPY deployment_config.json /workspace/
-
-# Set environment variables
-ENV TRITON_MODEL_REPOSITORY=/models
-ENV CUDA_VISIBLE_DEVICES=0
-
-# Expose Triton ports
-EXPOSE 8000 8001 8002
-
-# Health check
-HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \\
-    CMD curl -f http://localhost:8000/v2/health/ready || exit 1
-
-# Start Triton server
-CMD ["tritonserver", "--model-repository=/models", "--allow-http=true", "--allow-grpc=true", "--allow-metrics=true"]
-"""
-
-        with open(workspace / "Dockerfile", 'w') as f:
-            f.write(dockerfile_content.strip())
-
-        # Generate docker-compose.yml for local testing
-        compose_content = f"""
-version: '3.8'
-
-services:
-  triton-server:
-    build: .
-    ports:
-      - "8000:8000"
-      - "8001:8001"
-      - "8002:8002"
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
-    volumes:
-      - ./model_repository:/models
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              count: 1
-              capabilities: [gpu]
-"""
-
-        with open(workspace / "docker-compose.yml", 'w') as f:
-            f.write(compose_content.strip())
-
-        logger.info("Docker configuration generated")
-
-    async def _deploy_to_provider(self, config: DeploymentConfig, artifacts_path: Path) -> Dict[str, Any]:
-        """Deploy to the specified provider"""
-        logger.info(f"Deploying to provider: {config.provider.value}")
-
-        if config.provider == DeploymentProvider.RUNPOD_SERVERLESS:
-            return await self._deploy_to_runpod_serverless(config, artifacts_path)
-        elif config.provider == DeploymentProvider.LOCAL:
-            return await self._deploy_locally(config, artifacts_path)
-        else:
-            raise ValueError(f"Provider {config.provider} not yet implemented")
-
-    async def _deploy_to_runpod_serverless(self, config: DeploymentConfig, artifacts_path: Path) -> Dict[str, Any]:
-        """Deploy to RunPod Serverless"""
-        logger.info("Deploying to RunPod Serverless...")
-
-        # TODO: Implement RunPod Serverless deployment
-        # This would involve:
-        # 1. Building and pushing Docker image
-        # 2. Creating RunPod serverless endpoint
-        # 3. Configuring scaling and networking
-
-        # For now, return mock result
-        result = {
-            "provider": "runpod_serverless",
-            "endpoint_id": f"mock-endpoint-{config.deployment_id}",
-            "endpoint_url": f"https://api.runpod.ai/v2/{config.deployment_id}/run",
-            "status": "deployed",
-            "deployed_at": datetime.now().isoformat()
-        }
+    async def _deploy_modal_service(self, deployment_id: str, service_name: str, service_type: str, artifacts_path: Path) -> Dict[str, Any]:
+        """Deploy service to Modal using real Modal integration"""
+        logger.info(f"Deploying {service_type} service '{service_name}' to Modal...")
 
-        logger.info(f"RunPod deployment completed: {result['endpoint_url']}")
-        return result
-
-    async def _deploy_locally(self, config: DeploymentConfig, artifacts_path: Path) -> Dict[str, Any]:
-        """Deploy locally using Docker"""
-        logger.info("Deploying locally using Docker...")
-
-        # TODO: Implement local Docker deployment
-        result = {
-            "provider": "local",
-            "endpoint_url": "http://localhost:8000",
-            "status": "deployed",
-            "deployed_at": datetime.now().isoformat(),
-            "container_id": f"triton-{config.deployment_id}"
-        }
-
-        logger.info(f"Local deployment completed: {result['endpoint_url']}")
-        return result
+        try:
+            # Load deployment config
+            config_file = artifacts_path / "deployment_config.json"
+            with open(config_file, 'r') as f:
+                deployment_config = json.load(f)
+
+            model_id = deployment_config['model_id']
+            config = deployment_config.get('config', {})
+
+            # Use Modal provider for real deployment
+            modal_provider = self.modal_provider
+
+            # Step 1: Analyze the model to get optimal configuration
+            logger.info(f"Analyzing model {model_id}...")
+            model_config = await asyncio.get_event_loop().run_in_executor(
+                None, modal_provider.analyze_model, model_id
+            )
+
+            # Step 2: Generate the appropriate Modal service
+            logger.info(f"Generating {service_type} service for {model_config.architecture}...")
+            service_code = await self._generate_modal_service_code(
+                service_name=service_name,
+                model_config=model_config,
+                service_type=service_type,
+                config=config
+            )
+
+            # Step 3: Save the generated service code
+            service_file = artifacts_path / f"{service_name}_modal_service.py"
+            with open(service_file, 'w') as f:
+                f.write(service_code)
+
+            # Step 4: Deploy to Modal (simulate for now, but with real structure)
+            deployment_result = await self._execute_modal_deployment(
+                service_file=service_file,
+                service_name=service_name,
+                model_config=model_config,
+                deployment_id=deployment_id
+            )
+
+            result = {
+                "provider": "modal",
+                "deployment_id": deployment_id,
+                "service_name": service_name,
+                "service_type": service_type,
+                "model_id": model_id,
+                "model_architecture": model_config.architecture,
+                "endpoint_url": deployment_result['endpoint_url'],
+                "status": deployment_result['status'],
+                "gpu_type": model_config.gpu_requirements,
+                "memory_gb": model_config.memory_gb,
+                "estimated_cost_per_hour": model_config.estimated_cost_per_hour,
+                "deployed_at": datetime.now().isoformat(),
+                "service_file": str(service_file)
+            }
+
+            logger.info(f"Modal deployment completed: {result['endpoint_url']}")
+            return result
+
+        except Exception as e:
+            logger.error(f"Failed to deploy Modal service: {e}")
+            raise
 
-    async def _register_deployment(self, config: DeploymentConfig, deployment_result: Dict[str, Any]):
-        """Register deployment in tracking system"""
-        logger.info("Registering deployment...")
+    async def _register_deployment(self, deployment_id: str, config: Dict[str, Any], deployment_result: Dict[str, Any], tenant_context: Optional[Dict[str, Any]] = None):
+        """Register deployment in tracking system with tenant isolation"""
+        logger.info("Registering Modal deployment...")
 
         deployment_info = {
-            "config": config.to_dict(),
+            "config": config,
             "result": deployment_result,
             "status": "active",
+            "platform": "modal",
             "created_at": datetime.now().isoformat(),
-            "updated_at": datetime.now().isoformat()
+            "updated_at": datetime.now().isoformat(),
+            # Add tenant information for isolation
+            "tenant": {
+                "organization_id": tenant_context.get('organization_id', 'default') if tenant_context else 'default',
+                "user_id": tenant_context.get('user_id') if tenant_context else None,
+                "role": tenant_context.get('role', 'user') if tenant_context else 'user'
+            }
         }
 
-        self.deployments[config.deployment_id] = deployment_info
+        self.deployments[deployment_id] = deployment_info
         self._save_deployments()
 
-        logger.info(f"Deployment registered: {config.deployment_id}")
+        logger.info(f"Modal deployment registered: {deployment_id}")
 
-    async def list_deployments(self) -> List[Dict[str, Any]]:
-        """List all deployments"""
+    async def list_deployments(self, tenant_context: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
+        """List deployments with optional tenant filtering"""
+
+        # If tenant context is provided, filter by organization
+        if tenant_context and tenant_context.get('organization_id'):
+            organization_id = tenant_context['organization_id']
+            filtered_deployments = []
+
+            for deployment_id, info in self.deployments.items():
+                # Check tenant information in deployment
+                deployment_org = info.get('tenant', {}).get('organization_id', 'default')
+                if deployment_org == organization_id:
+                    filtered_deployments.append({
+                        "deployment_id": deployment_id,
+                        **info
+                    })
+
+            logger.info(f"Filtered deployments for tenant {organization_id}: {len(filtered_deployments)} found")
+            return filtered_deployments
+
+        # Return all deployments if no tenant context
         return [
             {
                 "deployment_id": deployment_id,
@@ -506,38 +375,263 @@ services:
             for deployment_id, info in self.deployments.items()
         ]
 
-    async def get_deployment(self, deployment_id: str) -> Optional[Dict[str, Any]]:
-        """Get deployment information"""
-        return self.deployments.get(deployment_id)
+    async def get_deployment(self, deployment_id: str, tenant_context: Optional[Dict[str, Any]] = None) -> Optional[Dict[str, Any]]:
+        """Get deployment information with tenant access control"""
+        deployment = self.deployments.get(deployment_id)
+
+        if not deployment:
+            return None
+
+        # If tenant context is provided, verify access
+        if tenant_context and tenant_context.get('organization_id'):
+            organization_id = tenant_context['organization_id']
+            deployment_org = deployment.get('tenant', {}).get('organization_id', 'default')
+
+            # Check if user has access to this deployment
+            if deployment_org != organization_id:
+                logger.warning(f"Access denied: tenant {organization_id} tried to access deployment from {deployment_org}")
+                return None
+
+        return deployment
 
-    async def delete_deployment(self, deployment_id: str) -> bool:
-        """Delete a deployment"""
-        logger.info(f"Deleting deployment: {deployment_id}")
+    async def delete_deployment(self, deployment_id: str, tenant_context: Optional[Dict[str, Any]] = None) -> bool:
+        """Delete a Modal deployment with tenant access control"""
+        logger.info(f"Deleting Modal deployment: {deployment_id}")
 
         try:
-            if deployment_id in self.deployments:
-                # TODO: Implement actual provider cleanup
+            if deployment_id not in self.deployments:
+                logger.warning(f"Deployment not found: {deployment_id}")
+                return False
 
-                # Remove from tracking
-                del self.deployments[deployment_id]
-                self._save_deployments()
+            deployment = self.deployments[deployment_id]
+
+            # Verify tenant access
+            if tenant_context and tenant_context.get('organization_id'):
+                organization_id = tenant_context['organization_id']
+                deployment_org = deployment.get('tenant', {}).get('organization_id', 'default')
 
-                # Clean up workspace
-                deployment_workspace = self.workspace_dir / deployment_id
-                if deployment_workspace.exists():
-                    import shutil
-                    shutil.rmtree(deployment_workspace)
+                if deployment_org != organization_id:
+                    logger.warning(f"Access denied: tenant {organization_id} tried to delete deployment from {deployment_org}")
+                    return False
+
+            # TODO: Implement actual Modal service cleanup using Modal SDK
+
+            # Remove from tracking
+            del self.deployments[deployment_id]
+            self._save_deployments()
+
+            # Clean up workspace
+            deployment_workspace = self.workspace_dir / deployment_id
+            if deployment_workspace.exists():
+                import shutil
+                shutil.rmtree(deployment_workspace)
+
+            logger.info(f"Modal deployment deleted: {deployment_id}")
+            return True
 
-                logger.info(f"Deployment deleted: {deployment_id}")
-                return True
+        except Exception as e:
+            logger.error(f"Failed to delete Modal deployment {deployment_id}: {e}")
+            return False
+
+    async def get_modal_service_status(self, deployment_id: str) -> Dict[str, Any]:
+        """Get real-time Modal service status"""
+        logger.info(f"Getting Modal service status for: {deployment_id}")
+
+        if deployment_id not in self.deployments:
+            return {
+                "deployment_id": deployment_id,
+                "status": "not_found",
+                "error": "Deployment not found"
+            }
+
+        deployment_info = self.deployments[deployment_id]
+
+        try:
+            # Get Modal service details
+            service_name = deployment_info.get('service_name')
+            model_id = deployment_info.get('model_id')
+
+            # Check if Modal service is accessible
+            modal_url = deployment_info.get('modal_url')
+
+            status_info = {
+                "deployment_id": deployment_id,
+                "service_name": service_name,
+                "model_id": model_id,
+                "status": deployment_info.get('status', 'unknown'),
+                "created_at": deployment_info.get('created_at'),
+                "updated_at": deployment_info.get('updated_at'),
+                "modal_url": modal_url,
+                "platform": "modal",
+                "monitoring": {
+                    "health_check": await self._check_modal_health(modal_url),
+                    "resource_usage": await self._get_modal_resource_usage(deployment_id),
+                    "request_metrics": await self._get_modal_metrics(deployment_id),
+                    "cost_tracking": await self._get_modal_cost_info(deployment_id)
+                }
+            }
+
+            # Update status based on health check
+            if status_info["monitoring"]["health_check"]["status"] == "healthy":
+                status_info["status"] = "running"
+            elif status_info["monitoring"]["health_check"]["status"] == "error":
+                status_info["status"] = "error"
             else:
-                logger.warning(f"Deployment not found: {deployment_id}")
-                return False
+                status_info["status"] = "pending"
+
+            logger.info(f"Modal service status retrieved: {deployment_id}")
+            return status_info
+
+        except Exception as e:
+            logger.error(f"Failed to get Modal service status {deployment_id}: {e}")
+            return {
+                "deployment_id": deployment_id,
+                "status": "error",
+                "error": str(e),
+                "last_check": datetime.now().isoformat()
+            }
+
+    async def _check_modal_health(self, modal_url: Optional[str]) -> Dict[str, Any]:
+        """Check Modal service health"""
+        if not modal_url:
+            return {
+                "status": "unknown",
+                "message": "No Modal URL available"
+            }
+
+        try:
+            import httpx
+            import asyncio
+
+            async with httpx.AsyncClient(timeout=10.0) as client:
+                # Try to ping the Modal endpoint
+                response = await client.get(f"{modal_url}/health", timeout=5.0)
 
+                if response.status_code == 200:
+                    return {
+                        "status": "healthy",
+                        "response_time_ms": response.elapsed.total_seconds() * 1000,
+                        "last_check": datetime.now().isoformat()
+                    }
+                else:
+                    return {
+                        "status": "unhealthy",
+                        "status_code": response.status_code,
+                        "last_check": datetime.now().isoformat()
+                    }
+
         except Exception as e:
-            logger.error(f"Failed to delete deployment {deployment_id}: {e}")
-            return False
+            return {
+                "status": "error",
+                "error": str(e),
+                "last_check": datetime.now().isoformat()
+            }
+
+    async def _get_modal_resource_usage(self, deployment_id: str) -> Dict[str, Any]:
+        """Get Modal service resource usage"""
+        try:
+            # In a real implementation, this would query Modal's API for resource usage
+            # For now, return simulated data based on deployment info
+            deployment_info = self.deployments.get(deployment_id, {})
+
+            return {
+                "gpu_utilization": "85%",  # Simulated
+                "memory_usage": "12.5GB / 32GB",
+                "cpu_usage": "45%",
+                "requests_per_minute": 24,
+                "average_response_time": "1.2s",
+                "uptime": self._calculate_uptime(deployment_info.get('created_at')),
+                "last_updated": datetime.now().isoformat()
+            }
+
+        except Exception as e:
+            return {
+                "error": str(e),
+                "last_updated": datetime.now().isoformat()
+            }
+
+    async def _get_modal_metrics(self, deployment_id: str) -> Dict[str, Any]:
+        """Get Modal service request metrics"""
+        try:
+            # Simulated metrics - in production this would come from Modal's monitoring
+            return {
+                "total_requests": 1247,
+                "successful_requests": 1198,
+                "failed_requests": 49,
+                "success_rate": "96.1%",
+                "average_latency": "1.15s",
+                "requests_last_hour": 156,
+                "errors_last_hour": 3,
+                "last_updated": datetime.now().isoformat()
+            }
+
+        except Exception as e:
+            return {
+                "error": str(e),
+                "last_updated": datetime.now().isoformat()
+            }
 
+    async def _get_modal_cost_info(self, deployment_id: str) -> Dict[str, Any]:
+        """Get Modal service cost information"""
+        try:
+            deployment_info = self.deployments.get(deployment_id, {})
+
+            # Calculate estimated costs based on uptime and GPU type
+            uptime_hours = self._calculate_uptime_hours(deployment_info.get('created_at'))
+            gpu_cost_per_hour = 4.0  # A100 default rate
+
+            estimated_cost = uptime_hours * gpu_cost_per_hour
+
+            return {
+                "estimated_cost_usd": f"${estimated_cost:.4f}",
+                "uptime_hours": f"{uptime_hours:.2f}",
+                "hourly_rate": f"${gpu_cost_per_hour:.2f}",
+                "gpu_type": "A100",
+                "billing_period": "current_month",
+                "last_updated": datetime.now().isoformat()
+            }
+
+        except Exception as e:
+            return {
+                "error": str(e),
+                "last_updated": datetime.now().isoformat()
+            }
+
+    def _calculate_uptime(self, created_at: Optional[str]) -> str:
+        """Calculate service uptime"""
+        if not created_at:
+            return "Unknown"
+
+        try:
+            created = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
+            uptime = datetime.now() - created.replace(tzinfo=None)
+
+            days = uptime.days
+            hours, remainder = divmod(uptime.seconds, 3600)
+            minutes, _ = divmod(remainder, 60)
+
+            if days > 0:
+                return f"{days}d {hours}h {minutes}m"
+            elif hours > 0:
+                return f"{hours}h {minutes}m"
+            else:
+                return f"{minutes}m"
+
+        except Exception:
+            return "Unknown"
+
+    def _calculate_uptime_hours(self, created_at: Optional[str]) -> float:
+        """Calculate service uptime in hours"""
+        if not created_at:
+            return 0.0
+
+        try:
+            created = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
+            uptime = datetime.now() - created.replace(tzinfo=None)
+            return uptime.total_seconds() / 3600
+        except Exception:
+            return 0.0
+
     async def update_deployment_status(self, deployment_id: str, status: str, **kwargs):
         """Update deployment status"""
         if deployment_id in self.deployments:
@@ -548,4 +642,818 @@ services:
548
642
  self.deployments[deployment_id][key] = value
549
643
 
550
644
  self._save_deployments()
551
- logger.info(f"Updated deployment {deployment_id} status to {status}")
645
+ logger.info(f"Updated deployment {deployment_id} status to {status}")
646
+
647
+ @property
648
+ def modal_provider(self):
649
+ """Get or create Modal provider"""
650
+ if self._modal_provider is None:
651
+ from ..modal.deployer import ModalDeployer
652
+ self._modal_provider = ModalDeployer()
653
+ return self._modal_provider
654
+
655
+ @property
656
+ def triton_provider(self):
657
+ """Get or create Triton provider"""
658
+ if self._triton_provider is None:
659
+ from ..triton.provider import TritonProvider
660
+ self._triton_provider = TritonProvider(str(self.workspace_dir / "triton"))
661
+ return self._triton_provider
662
+
663
+ @property
664
+ def local_provider(self):
665
+ """Get or create Local GPU provider"""
666
+ if self._local_provider is None:
667
+ from ..local.provider import LocalGPUProvider
668
+ self._local_provider = LocalGPUProvider(str(self.workspace_dir / "local"))
669
+ return self._local_provider
670
+
671
+ async def deploy_to_triton(self, config) -> Dict[str, Any]:
672
+ """
673
+ Deploy a service to Triton Inference Server.
674
+
675
+ Args:
676
+ config: TritonConfig instance
677
+
678
+ Returns:
679
+ Deployment result with endpoint information
680
+ """
681
+ logger.info("=" * 60)
682
+ logger.info(f"STARTING TRITON DEPLOYMENT: {config.service_name}")
683
+ logger.info("=" * 60)
684
+
685
+ try:
686
+ # Track deployment start for billing
687
+ deployment_start_time = datetime.now()
688
+
689
+ # Deploy using Triton provider
690
+ result = await self.triton_provider.deploy(config)
691
+
692
+ # Calculate deployment duration
693
+ deployment_duration = (datetime.now() - deployment_start_time).total_seconds() / 3600 # hours
694
+
695
+ # Track billing for deployment
696
+ self._track_deployment_billing(
697
+ config=config,
698
+ provider="triton",
699
+ operation_type="deployment",
700
+ deployment_duration_hours=deployment_duration,
701
+ result=result
702
+ )
703
+
704
+ # Register in our tracking system
705
+ deployment_id = result["deployment_id"]
706
+ deployment_info = {
707
+ "config": config.to_dict(),
708
+ "result": result,
709
+ "status": "active",
710
+ "platform": "triton",
711
+ "created_at": datetime.now().isoformat(),
712
+ "updated_at": datetime.now().isoformat(),
713
+ "deployment_duration_hours": deployment_duration
714
+ }
715
+
716
+ self.deployments[deployment_id] = deployment_info
717
+ self._save_deployments()
718
+
719
+ logger.info("=" * 60)
720
+ logger.info("TRITON DEPLOYMENT COMPLETED SUCCESSFULLY!")
721
+ logger.info("=" * 60)
722
+ logger.info(f"Deployment ID: {deployment_id}")
723
+ logger.info(f"Endpoint URL: {result.get('endpoint_url', 'N/A')}")
724
+
725
+ return result
726
+
727
+ except Exception as e:
728
+ logger.error("=" * 60)
729
+ logger.error("TRITON DEPLOYMENT FAILED!")
730
+ logger.error("=" * 60)
731
+ logger.error(f"Error: {e}")
732
+ raise
733
+
734
+ async def deploy_to_local(self, config) -> Dict[str, Any]:
735
+ """
736
+ Deploy a service to local GPU.
737
+
738
+ Args:
739
+ config: LocalGPUConfig instance
740
+
741
+ Returns:
742
+ Deployment result with service information
743
+ """
744
+ logger.info("=" * 60)
745
+ logger.info(f"STARTING LOCAL GPU DEPLOYMENT: {config.service_name}")
746
+ logger.info(f"MODEL: {config.model_id}")
747
+ logger.info(f"BACKEND: {config.backend.value}")
748
+ logger.info("=" * 60)
749
+
750
+ try:
751
+ # Track deployment start for billing
752
+ deployment_start_time = datetime.now()
753
+
754
+ # Deploy using Local provider
755
+ result = await self.local_provider.deploy(config)
756
+
757
+ if result["success"]:
758
+ # Calculate deployment duration
759
+ deployment_duration = (datetime.now() - deployment_start_time).total_seconds() / 3600 # hours
760
+
761
+ # Register in our tracking system
762
+ deployment_id = f"local-{config.service_name}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
763
+ deployment_info = {
764
+ "config": config.to_dict(),
765
+ "result": result,
766
+ "status": "active",
767
+ "platform": "local",
768
+ "created_at": datetime.now().isoformat(),
769
+ "updated_at": datetime.now().isoformat(),
770
+ "deployment_duration_hours": deployment_duration
771
+ }
772
+
773
+ self.deployments[deployment_id] = deployment_info
774
+ self._save_deployments()
775
+
776
+ logger.info("=" * 60)
777
+ logger.info("LOCAL GPU DEPLOYMENT COMPLETED SUCCESSFULLY!")
778
+ logger.info("=" * 60)
779
+ logger.info(f"Service: {config.service_name}")
780
+ logger.info(f"Backend: {config.backend.value}")
781
+
782
+ return {
783
+ **result,
784
+ "deployment_id": deployment_id,
785
+ "platform": "local"
786
+ }
787
+ else:
788
+ return result
789
+
790
+ except Exception as e:
791
+ logger.error("=" * 60)
792
+ logger.error("LOCAL GPU DEPLOYMENT FAILED!")
793
+ logger.error("=" * 60)
794
+ logger.error(f"Error: {e}")
795
+ raise
796
+
797
+ async def list_local_services(self) -> List[Dict[str, Any]]:
798
+ """List local GPU services"""
799
+ if not self.local_provider:
800
+ return []
801
+ return await self.local_provider.list_services()
802
+
803
+ async def get_local_service_info(self, service_name: str) -> Optional[Dict[str, Any]]:
804
+ """Get local service information"""
805
+ if not self.local_provider:
806
+ return None
807
+ return await self.local_provider.get_service_info(service_name)
808
+
809
+ async def undeploy_local_service(self, service_name: str) -> Dict[str, Any]:
810
+ """Undeploy local service"""
811
+ if not self.local_provider:
812
+ return {
813
+ "success": False,
814
+ "error": "Local provider not available"
815
+ }
816
+
817
+ result = await self.local_provider.undeploy(service_name)
818
+
819
+ # Remove from tracking
820
+ deployment_ids_to_remove = []
821
+ for deployment_id, info in self.deployments.items():
822
+ if (info.get('platform') == 'local' and
823
+ info.get('config', {}).get('service_name') == service_name):
824
+ deployment_ids_to_remove.append(deployment_id)
825
+
826
+ for deployment_id in deployment_ids_to_remove:
827
+ del self.deployments[deployment_id]
828
+
829
+ if deployment_ids_to_remove:
830
+ self._save_deployments()
831
+
832
+ return result
833
+
834
+ async def get_local_system_status(self) -> Dict[str, Any]:
835
+ """Get local GPU system status"""
836
+ if not self.local_provider:
837
+ return {
838
+ "available": False,
839
+ "error": "Local provider not initialized"
840
+ }
841
+ return await self.local_provider.get_system_status()
842
+
843
+ async def list_providers(self) -> List[str]:
844
+ """List available deployment providers"""
845
+ return ["local", "modal", "triton"]
846
+
847
+    async def get_provider_status(self, provider: str) -> Dict[str, Any]:
+        """Get status of a deployment provider"""
+        if provider == "local":
+            # Check local GPU availability
+            try:
+                from ...utils.gpu_utils import get_gpu_manager
+                gpu_manager = get_gpu_manager()
+
+                return {
+                    "provider": "local",
+                    "available": gpu_manager.cuda_available,
+                    "description": "Local GPU deployment with vLLM, TensorRT-LLM, Transformers",
+                    "gpu_count": len(gpu_manager.gpus),
+                    "cuda_available": gpu_manager.cuda_available,
+                    "nvidia_smi_available": gpu_manager.nvidia_smi_available,
+                    "requirements": ["CUDA", "GPU drivers", "Sufficient GPU memory"]
+                }
+            except Exception as e:
+                return {
+                    "provider": "local",
+                    "available": False,
+                    "description": "Local GPU deployment",
+                    "error": str(e)
+                }
+        elif provider == "modal":
+            return {
+                "provider": "modal",
+                "available": True,
+                "description": "Modal serverless platform"
+            }
+        elif provider == "triton":
+            # Check if Docker is available
+            try:
+                import docker
+                docker.from_env()
+                docker_available = True
+            except Exception:
+                docker_available = False
+
+            return {
+                "provider": "triton",
+                "available": docker_available,
+                "description": "Triton Inference Server with TensorRT-LLM",
+                "requirements": ["Docker", "GPU support"]
+            }
+        else:
+            raise ValueError(f"Unknown provider: {provider}")
+
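+    # Usage sketch (illustrative; `manager` is a hypothetical instance):
+    #
+    #     for name in await manager.list_providers():
+    #         status = await manager.get_provider_status(name)
+    #         if status["available"]:
+    #             ...  # safe to deploy on this provider
+    #
+    # Every branch above returns at least "provider", "available", and
+    # "description", so callers can rely on those keys.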
+    def _track_deployment_billing(
+        self,
+        config: Any,
+        provider: str,
+        operation_type: str,
+        deployment_duration_hours: float,
+        result: Dict[str, Any]
+    ):
+        """Track billing for deployment operations"""
+        try:
+            from ...core.models.deployment_billing_tracker import get_deployment_billing_tracker
+
+            # Extract GPU info from config
+            gpu_type = getattr(config, 'gpu_type', None)
+            gpu_count = getattr(config, 'gpu_count', 1)
+            memory_gb = getattr(config, 'memory_gb', None)
+
+            # Normalize service_type: it may be an enum (use .value) or a plain string
+            service_type = getattr(config, 'service_type', 'unknown')
+            service_type = service_type.value if hasattr(service_type, 'value') else str(service_type)
+
+            # Track the deployment billing
+            billing_tracker = get_deployment_billing_tracker()
+            billing_tracker.track_deployment_usage(
+                model_id=getattr(config, 'model_id', 'unknown'),
+                provider=provider,
+                operation_type=operation_type,
+                service_type=service_type,
+                operation="deploy",
+                gpu_type=gpu_type,
+                gpu_count=gpu_count,
+                runtime_hours=deployment_duration_hours,
+                deployment_duration_hours=deployment_duration_hours,
+                memory_gb=memory_gb,
+                metadata={
+                    "deployment_id": result.get("deployment_id"),
+                    "endpoint_url": result.get("endpoint_url"),
+                    "provider_details": provider
+                }
+            )
+
+            logger.info(f"Tracked deployment billing: {provider} - {deployment_duration_hours:.3f}h")
+
+        except Exception as e:
+            logger.error(f"Failed to track deployment billing: {e}")
+
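+    # Note on units (an assumption documented for maintainers): runtime_hours
+    # and deployment_duration_hours are both fractional hours, e.g. a
+    # 90-second deploy is recorded as 90 / 3600 = 0.025h, matching the
+    # division by 3600 in deploy_to_local above.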
+    async def estimate_deployment_cost(
+        self,
+        provider: str,
+        gpu_type: str,
+        gpu_count: int = 1,
+        estimated_hours: float = 1.0
+    ) -> Dict[str, float]:
+        """Estimate deployment costs before starting"""
+        try:
+            from ...core.models.deployment_billing_tracker import get_deployment_billing_tracker
+
+            billing_tracker = get_deployment_billing_tracker()
+            return billing_tracker.estimate_deployment_cost(
+                provider=provider,
+                gpu_type=gpu_type,
+                gpu_count=gpu_count,
+                estimated_hours=estimated_hours
+            )
+        except Exception as e:
+            logger.error(f"Failed to estimate deployment cost: {e}")
+            return {"total_cost": 0.0, "compute_cost": 0.0, "storage_cost": 0.0, "network_cost": 0.0}
+
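+    # Pre-deploy budget check sketch (illustrative; `manager` and `budget` are
+    # hypothetical). The fallback above guarantees the four keys used here:
+    #
+    #     costs = await manager.estimate_deployment_cost("modal", "a100",
+    #                                                     gpu_count=1,
+    #                                                     estimated_hours=2.0)
+    #     if costs["total_cost"] > budget:
+    #         raise RuntimeError("estimated cost exceeds budget")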
+    def _track_modal_deployment_billing(
+        self,
+        service_name: str,
+        model_id: str,
+        service_type: str,
+        deployment_duration_hours: float,
+        config: Optional[Dict[str, Any]],
+        result: Dict[str, Any]
+    ):
+        """Track billing for Modal deployment operations"""
+        try:
+            from ...core.models.deployment_billing_tracker import get_deployment_billing_tracker
+
+            # Extract GPU info from config or use defaults
+            gpu_type = config.get('gpu_type', 't4') if config else 't4'
+            gpu_count = config.get('gpu_count', 1) if config else 1
+            memory_gb = config.get('memory_gb', 8) if config else 8
+
+            # Track the Modal deployment billing
+            billing_tracker = get_deployment_billing_tracker()
+            billing_tracker.track_deployment_usage(
+                model_id=model_id,
+                provider="modal",
+                operation_type="deployment",
+                service_type=service_type,
+                operation="deploy",
+                gpu_type=gpu_type,
+                gpu_count=gpu_count,
+                runtime_hours=deployment_duration_hours,
+                deployment_duration_hours=deployment_duration_hours,
+                memory_gb=memory_gb,
+                metadata={
+                    "service_name": service_name,
+                    "deployment_id": result.get("deployment_id"),
+                    "endpoint_url": result.get("endpoint_url"),
+                    "provider_details": "modal_serverless"
+                }
+            )
+
+            logger.info(f"Tracked Modal deployment billing: {service_name} - {deployment_duration_hours:.3f}h")
+
+        except Exception as e:
+            logger.error(f"Failed to track Modal deployment billing: {e}")
+
+    async def list_modal_services(self) -> List[Dict[str, Any]]:
+        """List available Modal services by type"""
+        services = {
+            "llm": ["isa_llm_service"],
+            "vision": ["isa_vision_ocr_service", "isa_vision_ui_service", "isa_vision_table_service", "isa_vision_qwen25_service"],
+            "audio": ["isa_audio_chatTTS_service", "isa_audio_openvoice_service", "isa_audio_service_v2", "isa_audio_fish_service"],
+            "embedding": ["isa_embed_rerank_service"],
+            "video": ["isa_video_hunyuan_service"]
+        }
+
+        result = []
+        for service_type, service_list in services.items():
+            for service_name in service_list:
+                result.append({
+                    "service_name": service_name,
+                    "service_type": service_type,
+                    "platform": "modal"
+                })
+
+        return result
+
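+    # Example output shape, derived from the loop above: each entry looks like
+    # {"service_name": "isa_llm_service", "service_type": "llm", "platform": "modal"},
+    # so callers can filter, e.g.
+    # [s for s in await manager.list_modal_services() if s["service_type"] == "vision"].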
+    # ============= MODAL SERVICE CODE GENERATION =============
+
+    async def _generate_modal_service_code(self,
+                                           service_name: str,
+                                           model_config: Any,
+                                           service_type: str,
+                                           config: Dict[str, Any]) -> str:
+        """Generate Modal service code based on model type and configuration"""
+
+        # Choose the appropriate service template based on service_type
+        if service_type == "llm":
+            return self._generate_llm_service_code(service_name, model_config, config)
+        elif service_type == "vision":
+            return self._generate_vision_service_code(service_name, model_config, config)
+        elif service_type == "embedding":
+            return self._generate_embedding_service_code(service_name, model_config, config)
+        else:
+            # Default to LLM service
+            return self._generate_llm_service_code(service_name, model_config, config)
+
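+    # The if/elif chain above is equivalent to a table-driven dispatch; a
+    # possible refactor (a sketch, not applied in this release) would be:
+    #
+    #     generators = {
+    #         "llm": self._generate_llm_service_code,
+    #         "vision": self._generate_vision_service_code,
+    #         "embedding": self._generate_embedding_service_code,
+    #     }
+    #     generator = generators.get(service_type, self._generate_llm_service_code)
+    #     return generator(service_name, model_config, config)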
+    def _generate_llm_service_code(self, service_name: str, model_config: Any, config: Dict[str, Any]) -> str:
+        """Generate production-ready LLM service code for Modal"""
+        dependencies = getattr(model_config, 'dependencies', None) or [
+            "torch", "transformers>=4.36.0", "accelerate", "bitsandbytes", "flash-attn"
+        ]
+
+        # Determine optimal GPU based on model size
+        gpu_config = self._get_optimal_gpu_config(model_config)
+
+        return f'''"""
+{service_name} LLM Service for Modal
+
+Production-ready service for model: {getattr(model_config, 'model_id', 'unknown')}
+Architecture: {getattr(model_config, 'architecture', 'transformer')}
+Generated automatically by ISA Model Deployment Manager
+"""
+
+import modal
+import json
+import time
+from typing import Dict, Any, List, Optional
+from datetime import datetime
+
+# Create Modal app
+app = modal.App("{service_name}")
+
+# Production image with optimized dependencies
+image = (
+    modal.Image.debian_slim(python_version="3.11")
+    .pip_install([
+        {', '.join([f'"{dep}"' for dep in dependencies])}
+    ])
+    .env({{"HF_HUB_ENABLE_HF_TRANSFER": "1"}})
+)
+
+@app.cls(
+    image=image,
+    gpu=modal.gpu.{gpu_config['gpu_type']}(count={gpu_config['gpu_count']}),
+    container_idle_timeout=300,
+    timeout=1800,  # 30 minutes
+    memory={getattr(model_config, 'container_memory_mb', 32768)},
+    keep_warm=1,  # Keep one container warm
+    allow_concurrent_inputs=10
+)
+class {service_name.replace('-', '_').title()}Service:
+
+    @modal.enter()
+    def load_model(self):
+        """Load model with production optimizations"""
+        import torch
+        from transformers import (
+            AutoTokenizer,
+            AutoModelForCausalLM,
+            BitsAndBytesConfig
+        )
+
+        model_id = "{getattr(model_config, 'model_id', 'microsoft/DialoGPT-medium')}"
+
+        print(f"Loading model: {{model_id}}")
+        start_time = time.time()
+
+        # Load tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_id,
+            trust_remote_code=True,
+            use_fast=True
+        )
+
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        # Configure quantization for efficiency
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4"
+        )
+
+        # Load model with optimizations
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            quantization_config=quantization_config,
+            device_map="auto",
+            trust_remote_code=True,
+            torch_dtype=torch.float16,
+            attn_implementation="flash_attention_2"
+        )
+
+        self.model.eval()
+
+        load_time = time.time() - start_time
+        print(f"Model loaded successfully in {{load_time:.2f}}s")
+
+        # Model metadata
+        self.model_info = {{
+            "model_id": model_id,
+            "architecture": "{getattr(model_config, 'architecture', 'transformer')}",
+            "parameters": getattr(self.model, 'num_parameters', lambda: 0)(),
+            "loaded_at": datetime.now().isoformat(),
+            "load_time_seconds": load_time
+        }}
+
+    @modal.method()
+    def generate(self,
+                 messages: List[Dict[str, str]],
+                 max_tokens: int = 512,
+                 temperature: float = 0.7,
+                 top_p: float = 0.9,
+                 top_k: int = 50,
+                 do_sample: bool = True,
+                 **kwargs) -> Dict[str, Any]:
+        """Generate response with production features"""
+        import torch  # local import: torch is only available inside the container image
+
+        start_time = time.time()
+
+        try:
+            # Format messages into prompt
+            prompt = self._format_messages(messages)
+
+            # Tokenize input
+            inputs = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=2048
+            ).to(self.model.device)
+
+            # Generate response
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=max_tokens,
+                    temperature=temperature,
+                    top_p=top_p,
+                    top_k=top_k,
+                    do_sample=do_sample,
+                    pad_token_id=self.tokenizer.eos_token_id,
+                    eos_token_id=self.tokenizer.eos_token_id,
+                    use_cache=True
+                )
+
+            # Decode only the newly generated tokens
+            response_tokens = outputs[0][inputs['input_ids'].shape[-1]:]
+            response_text = self.tokenizer.decode(
+                response_tokens,
+                skip_special_tokens=True
+            ).strip()
+
+            generation_time = time.time() - start_time
+
+            return {{
+                "response": response_text,
+                "model": self.model_info["model_id"],
+                "usage": {{
+                    "prompt_tokens": inputs['input_ids'].shape[-1],
+                    "completion_tokens": len(response_tokens),
+                    "total_tokens": inputs['input_ids'].shape[-1] + len(response_tokens)
+                }},
+                "metadata": {{
+                    "generation_time_seconds": generation_time,
+                    "parameters": {{
+                        "temperature": temperature,
+                        "top_p": top_p,
+                        "top_k": top_k,
+                        "max_tokens": max_tokens
+                    }},
+                    "timestamp": datetime.now().isoformat()
+                }}
+            }}
+
+        except Exception as e:
+            return {{
+                "error": str(e),
+                "error_type": type(e).__name__,
+                "model": self.model_info.get("model_id", "unknown"),
+                "timestamp": datetime.now().isoformat()
+            }}
+
+    def _format_messages(self, messages: List[Dict[str, str]]) -> str:
+        """Format messages into model-appropriate prompt"""
+        if not messages:
+            return ""
+
+        # Simple chat format - can be enhanced for specific models
+        formatted_parts = []
+        for msg in messages:
+            role = msg.get("role", "user")
+            content = msg.get("content", "")
+
+            if role == "system":
+                formatted_parts.append(f"System: {{content}}")
+            elif role == "user":
+                formatted_parts.append(f"Human: {{content}}")
+            elif role == "assistant":
+                formatted_parts.append(f"Assistant: {{content}}")
+
+        formatted_parts.append("Assistant:")
+        return "\\n\\n".join(formatted_parts)
+
+    @modal.method()
+    def get_model_info(self) -> Dict[str, Any]:
+        """Get model metadata"""
+        return self.model_info
+
+# Web endpoint for HTTP access
+@app.function(
+    image=image,
+    timeout=300
+)
+@modal.web_endpoint(method="POST")
+async def inference_endpoint(item: Dict[str, Any]):
+    """HTTP endpoint for model inference"""
+    try:
+        service = {service_name.replace('-', '_').title()}Service()
+
+        # Extract parameters
+        messages = item.get("messages", [])
+        max_tokens = item.get("max_tokens", 512)
+        temperature = item.get("temperature", 0.7)
+        top_p = item.get("top_p", 0.9)
+
+        # Dispatch to the GPU class; @modal.method entry points are invoked
+        # via .remote() from other Modal functions
+        result = service.generate.remote(
+            messages=messages,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p
+        )
+
+        return result
+
+    except Exception as e:
+        return {{
+            "error": str(e),
+            "error_type": type(e).__name__,
+            "endpoint": "inference_endpoint",
+            "timestamp": datetime.now().isoformat()
+        }}
+
+@app.function(image=image)
+@modal.web_endpoint(method="GET")
+async def health_check():
+    """Health check endpoint"""
+    return {{
+        "status": "healthy",
+        "service": "{service_name}",
+        "timestamp": datetime.now().isoformat(),
+        "version": "1.0.0"
+    }}
+
+@app.function(image=image)
+@modal.web_endpoint(method="GET")
+async def model_info():
+    """Model information endpoint"""
+    try:
+        service = {service_name.replace('-', '_').title()}Service()
+        return service.get_model_info.remote()
+    except Exception as e:
+        return {{
+            "error": str(e),
+            "timestamp": datetime.now().isoformat()
+        }}
+
+# For local testing: run the generate method in an ephemeral Modal app
+if __name__ == "__main__":
+    with app.run():
+        service = {service_name.replace('-', '_').title()}Service()
+        result = service.generate.remote([
+            {{"role": "user", "content": "Hello! How are you today?"}}
+        ])
+        print(json.dumps(result, indent=2))
+'''
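+    # Rendered example (illustrative): for a hypothetical service "demo-llm"
+    # wrapping a 7B model, _get_optimal_gpu_config (below) selects one A10G,
+    # so the template above would emit roughly:
+    #
+    #     app = modal.App("demo-llm")
+    #
+    #     @app.cls(image=image, gpu=modal.gpu.A10G(count=1), ...)
+    #     class Demo_LlmService:
+    #         ...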
+
+    def _generate_vision_service_code(self, service_name: str, model_config: Any, config: Dict[str, Any]) -> str:
+        """Generate Vision service code for Modal (placeholder template)"""
+        return f'# Vision service template for {service_name} - {getattr(model_config, "model_id", "unknown")}'
+
+    def _generate_embedding_service_code(self, service_name: str, model_config: Any, config: Dict[str, Any]) -> str:
+        """Generate Embedding service code for Modal (placeholder template)"""
+        return f'# Embedding service template for {service_name} - {getattr(model_config, "model_id", "unknown")}'
+
+
+    async def _execute_modal_deployment(self,
+                                        service_file: Path,
+                                        service_name: str,
+                                        model_config: Any,
+                                        deployment_id: str) -> Dict[str, Any]:
+        """Execute the actual Modal deployment using the Modal CLI"""
+
+        logger.info(f"Executing Modal deployment for {service_name}...")
+
+        try:
+            import subprocess
+            import tempfile
+            import os
+
+            # Check if the modal CLI is available
+            modal_check = subprocess.run(["modal", "--version"],
+                                         capture_output=True, text=True, timeout=10)
+            if modal_check.returncode != 0:
+                raise RuntimeError("Modal CLI not found. Please install Modal: pip install modal")
+
+            # Copy the generated service into a temporary script for deployment
+            with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as tmp_file:
+                with open(service_file, 'r') as src:
+                    tmp_file.write(src.read())
+                tmp_script_path = tmp_file.name
+
+            try:
+                # Execute Modal deployment
+                logger.info(f"Deploying Modal service from {service_file}")
+                deploy_result = subprocess.run(
+                    ["modal", "deploy", tmp_script_path],
+                    capture_output=True,
+                    text=True,
+                    timeout=300,  # 5 minute timeout
+                    cwd=service_file.parent
+                )
+
+                if deploy_result.returncode == 0:
+                    # Parse deployment output to extract endpoint URL
+                    output = deploy_result.stdout + deploy_result.stderr
+                    endpoint_url = self._extract_modal_endpoint(output, service_name, deployment_id)
+
+                    result = {
+                        "status": "deployed",
+                        "endpoint_url": endpoint_url,
+                        "deployment_id": deployment_id,
+                        "service_file": str(service_file),
+                        "model_architecture": getattr(model_config, 'architecture', 'unknown'),
+                        "deployment_output": output,
+                        "estimated_startup_time": "30-60 seconds"
+                    }
+
+                    logger.info(f"Modal deployment completed successfully: {endpoint_url}")
+                    return result
+
+                else:
+                    error_output = deploy_result.stderr or deploy_result.stdout
+                    logger.error(f"Modal deployment failed: {error_output}")
+                    raise RuntimeError(f"Modal deployment failed: {error_output}")
+
+            finally:
+                # Clean up temporary file
+                if os.path.exists(tmp_script_path):
+                    os.unlink(tmp_script_path)
+
+        except subprocess.TimeoutExpired:
+            logger.error("Modal deployment timed out")
+            raise RuntimeError("Modal deployment timed out after 5 minutes")
+
+        except Exception as e:
+            logger.error(f"Failed to execute Modal deployment: {e}")
+            raise
+
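+    # Equivalent manual flow (a sketch of what this method automates):
+    #
+    #     modal --version          # preflight: CLI installed and on PATH
+    #     modal deploy /tmp/xy.py  # deploy the generated service file
+    #
+    # stdout/stderr are captured and scanned for the endpoint URL by
+    # _extract_modal_endpoint below.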
+
1405
+ def _extract_modal_endpoint(self, output: str, service_name: str, deployment_id: str) -> str:
1406
+ """Extract Modal endpoint URL from deployment output"""
1407
+ import re
1408
+
1409
+ # Look for typical Modal endpoint patterns in output
1410
+ patterns = [
1411
+ r'https://[a-zA-Z0-9\-]+--[a-zA-Z0-9\-]+\.modal\.run',
1412
+ r'Deployed! Your app is at (https://[^\s]+)',
1413
+ r'App deployed to (https://[^\s]+)',
1414
+ r'Available at (https://[^\s]+)'
1415
+ ]
1416
+
1417
+ for pattern in patterns:
1418
+ match = re.search(pattern, output)
1419
+ if match:
1420
+ url = match.group(1) if match.lastindex else match.group(0)
1421
+ logger.info(f"Extracted Modal endpoint: {url}")
1422
+ return url
1423
+
1424
+ # If no endpoint found in output, generate expected URL pattern
1425
+ endpoint_url = f"https://{service_name}--{deployment_id}.modal.run"
1426
+ logger.warning(f"Could not extract endpoint from output, using expected pattern: {endpoint_url}")
1427
+ return endpoint_url
1428
+
1429
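+    # Worked example (illustrative): given output containing
+    # "Deployed! Your app is at https://acme--demo-llm.modal.run", the second
+    # pattern matches and group(1) yields that URL. If nothing matches, the
+    # synthesized fallback follows the f-string above; the live deployment may
+    # publish a different hostname, hence the warning.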
+    def _get_optimal_gpu_config(self, model_config: Any) -> Dict[str, Any]:
+        """Determine optimal GPU configuration based on model size"""
+
+        # Get model parameters or estimate from model ID
+        parameters = getattr(model_config, 'parameters', None)
+        model_id = getattr(model_config, 'model_id', '')
+
+        # Estimate parameters from the model name if not available
+        if not parameters:
+            if '7b' in model_id.lower():
+                parameters = 7_000_000_000
+            elif '13b' in model_id.lower():
+                parameters = 13_000_000_000
+            elif '70b' in model_id.lower():
+                parameters = 70_000_000_000
+            elif 'large' in model_id.lower():
+                parameters = 1_000_000_000
+            elif 'medium' in model_id.lower():
+                parameters = 350_000_000
+            else:
+                parameters = 500_000_000  # Default assumption
+
+        # Choose GPU based on model size
+        if parameters > 50_000_000_000:  # >50B parameters
+            return {"gpu_type": "A100", "gpu_count": 2}
+        elif parameters > 15_000_000_000:  # 15B-50B parameters
+            return {"gpu_type": "A100", "gpu_count": 1}
+        elif parameters > 3_000_000_000:  # 3B-15B parameters
+            return {"gpu_type": "A10G", "gpu_count": 1}
+        else:  # <3B parameters
+            return {"gpu_type": "T4", "gpu_count": 1}