isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. isa_model/client.py +1166 -584
  2. isa_model/core/cache/redis_cache.py +410 -0
  3. isa_model/core/config/config_manager.py +282 -12
  4. isa_model/core/config.py +91 -1
  5. isa_model/core/database/__init__.py +1 -0
  6. isa_model/core/database/direct_db_client.py +114 -0
  7. isa_model/core/database/migration_manager.py +563 -0
  8. isa_model/core/database/migrations.py +297 -0
  9. isa_model/core/database/supabase_client.py +258 -0
  10. isa_model/core/dependencies.py +316 -0
  11. isa_model/core/discovery/__init__.py +19 -0
  12. isa_model/core/discovery/consul_discovery.py +190 -0
  13. isa_model/core/logging/__init__.py +54 -0
  14. isa_model/core/logging/influx_logger.py +523 -0
  15. isa_model/core/logging/loki_logger.py +160 -0
  16. isa_model/core/models/__init__.py +46 -0
  17. isa_model/core/models/config_models.py +625 -0
  18. isa_model/core/models/deployment_billing_tracker.py +430 -0
  19. isa_model/core/models/model_billing_tracker.py +60 -88
  20. isa_model/core/models/model_manager.py +66 -25
  21. isa_model/core/models/model_metadata.py +690 -0
  22. isa_model/core/models/model_repo.py +217 -55
  23. isa_model/core/models/model_statistics_tracker.py +234 -0
  24. isa_model/core/models/model_storage.py +0 -1
  25. isa_model/core/models/model_version_manager.py +959 -0
  26. isa_model/core/models/system_models.py +857 -0
  27. isa_model/core/pricing_manager.py +2 -249
  28. isa_model/core/repositories/__init__.py +9 -0
  29. isa_model/core/repositories/config_repository.py +912 -0
  30. isa_model/core/resilience/circuit_breaker.py +366 -0
  31. isa_model/core/security/secrets.py +358 -0
  32. isa_model/core/services/__init__.py +2 -4
  33. isa_model/core/services/intelligent_model_selector.py +479 -370
  34. isa_model/core/storage/hf_storage.py +2 -2
  35. isa_model/core/types.py +8 -0
  36. isa_model/deployment/__init__.py +5 -48
  37. isa_model/deployment/core/__init__.py +2 -31
  38. isa_model/deployment/core/deployment_manager.py +1278 -368
  39. isa_model/deployment/local/__init__.py +31 -0
  40. isa_model/deployment/local/config.py +248 -0
  41. isa_model/deployment/local/gpu_gateway.py +607 -0
  42. isa_model/deployment/local/health_checker.py +428 -0
  43. isa_model/deployment/local/provider.py +586 -0
  44. isa_model/deployment/local/tensorrt_service.py +621 -0
  45. isa_model/deployment/local/transformers_service.py +644 -0
  46. isa_model/deployment/local/vllm_service.py +527 -0
  47. isa_model/deployment/modal/__init__.py +8 -0
  48. isa_model/deployment/modal/config.py +136 -0
  49. isa_model/deployment/modal/deployer.py +894 -0
  50. isa_model/deployment/modal/services/__init__.py +3 -0
  51. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  52. isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
  53. isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
  54. isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
  55. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  56. isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
  57. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  58. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  59. isa_model/deployment/modal/services/video/__init__.py +1 -0
  60. isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
  61. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  62. isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
  63. isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
  64. isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
  65. isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
  66. isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
  67. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  68. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  69. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  70. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  71. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  72. isa_model/deployment/storage/__init__.py +5 -0
  73. isa_model/deployment/storage/deployment_repository.py +824 -0
  74. isa_model/deployment/triton/__init__.py +10 -0
  75. isa_model/deployment/triton/config.py +196 -0
  76. isa_model/deployment/triton/configs/__init__.py +1 -0
  77. isa_model/deployment/triton/provider.py +512 -0
  78. isa_model/deployment/triton/scripts/__init__.py +1 -0
  79. isa_model/deployment/triton/templates/__init__.py +1 -0
  80. isa_model/inference/__init__.py +47 -1
  81. isa_model/inference/ai_factory.py +179 -16
  82. isa_model/inference/legacy_services/__init__.py +21 -0
  83. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  84. isa_model/inference/legacy_services/model_service.py +573 -0
  85. isa_model/inference/legacy_services/model_serving.py +717 -0
  86. isa_model/inference/legacy_services/model_training.py +561 -0
  87. isa_model/inference/models/__init__.py +21 -0
  88. isa_model/inference/models/inference_config.py +551 -0
  89. isa_model/inference/models/inference_record.py +675 -0
  90. isa_model/inference/models/performance_models.py +714 -0
  91. isa_model/inference/repositories/__init__.py +9 -0
  92. isa_model/inference/repositories/inference_repository.py +828 -0
  93. isa_model/inference/services/audio/__init__.py +21 -0
  94. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  95. isa_model/inference/services/audio/base_stt_service.py +184 -11
  96. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  97. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  98. isa_model/inference/services/audio/openai_stt_service.py +53 -11
  99. isa_model/inference/services/base_service.py +17 -1
  100. isa_model/inference/services/custom_model_manager.py +277 -0
  101. isa_model/inference/services/embedding/__init__.py +13 -0
  102. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  103. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  104. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  105. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  106. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  107. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  108. isa_model/inference/services/img/__init__.py +2 -2
  109. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  110. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  111. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  112. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  113. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  114. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  115. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  116. isa_model/inference/services/llm/__init__.py +10 -2
  117. isa_model/inference/services/llm/base_llm_service.py +361 -26
  118. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  119. isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
  120. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  121. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  122. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  123. isa_model/inference/services/llm/local_llm_service.py +747 -0
  124. isa_model/inference/services/llm/ollama_llm_service.py +11 -3
  125. isa_model/inference/services/llm/openai_llm_service.py +670 -56
  126. isa_model/inference/services/llm/yyds_llm_service.py +10 -3
  127. isa_model/inference/services/vision/__init__.py +27 -6
  128. isa_model/inference/services/vision/base_vision_service.py +118 -185
  129. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  130. isa_model/inference/services/vision/helpers/image_utils.py +19 -10
  131. isa_model/inference/services/vision/isa_vision_service.py +634 -0
  132. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  133. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  134. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  135. isa_model/serving/api/cache_manager.py +245 -0
  136. isa_model/serving/api/dependencies/__init__.py +1 -0
  137. isa_model/serving/api/dependencies/auth.py +194 -0
  138. isa_model/serving/api/dependencies/database.py +139 -0
  139. isa_model/serving/api/error_handlers.py +284 -0
  140. isa_model/serving/api/fastapi_server.py +240 -18
  141. isa_model/serving/api/middleware/auth.py +317 -0
  142. isa_model/serving/api/middleware/security.py +268 -0
  143. isa_model/serving/api/middleware/tenant_context.py +414 -0
  144. isa_model/serving/api/routes/analytics.py +489 -0
  145. isa_model/serving/api/routes/config.py +645 -0
  146. isa_model/serving/api/routes/deployment_billing.py +315 -0
  147. isa_model/serving/api/routes/deployments.py +475 -0
  148. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  149. isa_model/serving/api/routes/health.py +32 -12
  150. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  151. isa_model/serving/api/routes/local_deployments.py +448 -0
  152. isa_model/serving/api/routes/logs.py +430 -0
  153. isa_model/serving/api/routes/settings.py +582 -0
  154. isa_model/serving/api/routes/tenants.py +575 -0
  155. isa_model/serving/api/routes/unified.py +992 -171
  156. isa_model/serving/api/routes/webhooks.py +479 -0
  157. isa_model/serving/api/startup.py +318 -0
  158. isa_model/serving/modal_proxy_server.py +249 -0
  159. isa_model/utils/gpu_utils.py +311 -0
  160. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
  161. isa_model-0.4.3.dist-info/RECORD +193 -0
  162. isa_model/deployment/cloud/__init__.py +0 -9
  163. isa_model/deployment/cloud/modal/__init__.py +0 -10
  164. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  165. isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
  166. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
  167. isa_model/deployment/cloud/modal/register_models.py +0 -321
  168. isa_model/deployment/core/deployment_config.py +0 -356
  169. isa_model/deployment/core/isa_deployment_service.py +0 -401
  170. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  171. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  172. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  173. isa_model/deployment/runtime/deployed_service.py +0 -338
  174. isa_model/deployment/services/__init__.py +0 -9
  175. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  176. isa_model/deployment/services/model_service.py +0 -332
  177. isa_model/deployment/services/service_monitor.py +0 -356
  178. isa_model/deployment/services/service_registry.py +0 -527
  179. isa_model/eval/__init__.py +0 -92
  180. isa_model/eval/benchmarks.py +0 -469
  181. isa_model/eval/config/__init__.py +0 -10
  182. isa_model/eval/config/evaluation_config.py +0 -108
  183. isa_model/eval/evaluators/__init__.py +0 -18
  184. isa_model/eval/evaluators/base_evaluator.py +0 -503
  185. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  186. isa_model/eval/factory.py +0 -531
  187. isa_model/eval/infrastructure/__init__.py +0 -24
  188. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  189. isa_model/eval/metrics.py +0 -798
  190. isa_model/inference/adapter/unified_api.py +0 -248
  191. isa_model/inference/services/helpers/stacked_config.py +0 -148
  192. isa_model/inference/services/img/flux_professional_service.py +0 -603
  193. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  194. isa_model/inference/services/others/table_transformer_service.py +0 -61
  195. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  196. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  197. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  198. isa_model/scripts/inference_tracker.py +0 -283
  199. isa_model/scripts/mlflow_manager.py +0 -379
  200. isa_model/scripts/model_registry.py +0 -465
  201. isa_model/scripts/register_models.py +0 -370
  202. isa_model/scripts/register_models_with_embeddings.py +0 -510
  203. isa_model/scripts/start_mlflow.py +0 -95
  204. isa_model/scripts/training_tracker.py +0 -257
  205. isa_model/training/__init__.py +0 -74
  206. isa_model/training/annotation/annotation_schema.py +0 -47
  207. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  208. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  209. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  210. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  211. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  212. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  213. isa_model/training/annotation/views/annotation_controller.py +0 -158
  214. isa_model/training/cloud/__init__.py +0 -22
  215. isa_model/training/cloud/job_orchestrator.py +0 -402
  216. isa_model/training/cloud/runpod_trainer.py +0 -454
  217. isa_model/training/cloud/storage_manager.py +0 -482
  218. isa_model/training/core/__init__.py +0 -23
  219. isa_model/training/core/config.py +0 -181
  220. isa_model/training/core/dataset.py +0 -222
  221. isa_model/training/core/trainer.py +0 -720
  222. isa_model/training/core/utils.py +0 -213
  223. isa_model/training/factory.py +0 -424
  224. isa_model-0.3.91.dist-info/RECORD +0 -138
  225. /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
  226. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  227. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  228. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
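The largest single change below is the rewrite of isa_model/deployment/core/deployment_manager.py, which replaces the old deploy_model(config) entry point with platform-specific methods (deploy_to_modal, deploy_to_triton, deploy_to_local) and adds tenant-aware deployment tracking. A minimal usage sketch, assembled from the method signatures and docstrings visible in the diff below; the exact tenant_context keys are inferred from how _register_deployment reads them, so treat this as illustrative rather than the package's documented API:

```python
# Illustrative sketch only: signatures are taken from the 0.4.3 diff below;
# the tenant_context dict shape (organization_id / user_id / role) is
# inferred from _register_deployment and may differ in the released API.
import asyncio

from isa_model.deployment import DeploymentManager


async def main():
    manager = DeploymentManager(workspace_dir="./deployments")

    # Deploy a service to Modal. With a tenant_context, the deployment ID
    # is prefixed "org-<organization_id>-" for per-tenant isolation.
    result = await manager.deploy_to_modal(
        service_name="llm-service",
        model_id="my-model",
        service_type="llm",  # one of: llm, vision, audio, embedding, video
        tenant_context={"organization_id": "acme-corp", "user_id": "u-1", "role": "admin"},
    )
    print(result.get("endpoint_url"))

    # Listing with the same tenant_context returns only that org's deployments.
    mine = await manager.list_deployments(
        tenant_context={"organization_id": "acme-corp"}
    )
    print(f"{len(mine)} deployment(s) visible to acme-corp")


asyncio.run(main())
```

Tenant isolation here is prefix- and metadata-based: the deployment ID gains an org prefix and the stored record carries a "tenant" block, which list_deployments, get_deployment, and delete_deployment check before returning or mutating anything.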
@@ -1,8 +1,7 @@
 """
-Deployment Manager
+Unified Deployment Manager
 
-Orchestrates the complete deployment workflow including model preparation,
-container building, deployment to cloud providers, and monitoring.
+Orchestrates deployment of AI models to multiple platforms (Modal, Triton, Local GPU).
 """
 
 import os
@@ -13,83 +12,69 @@ from pathlib import Path
 from datetime import datetime
 import asyncio
 
-from .deployment_config import (
-    DeploymentConfig, DeploymentProvider, InferenceEngine,
-    ModelConfig, TritonConfig, RunPodServerlessConfig
-)
-from ...core.model_manager import ModelManager
-from ...core.model_registry import ModelRegistry, ModelType, ModelCapability
-from ...core.storage.hf_storage import HuggingFaceStorage
+from ...core.config.config_manager import ConfigManager
 
 logger = logging.getLogger(__name__)
 
 
 class DeploymentManager:
     """
-    Manages the complete deployment lifecycle for AI models.
+    Unified deployment manager for multiple platforms.
 
     This manager coordinates:
-    - Model preparation and optimization
-    - Container building and configuration
-    - Deployment to cloud providers
-    - Health monitoring and scaling
-    - Integration with model registry
+    - Local GPU deployment with vLLM, TensorRT-LLM, Transformers
+    - Cloud deployment to Modal platform
+    - Container deployment with Triton Inference Server
+    - Deployment tracking and monitoring
 
     Example:
         ```python
         from isa_model.deployment import DeploymentManager
-        from isa_model.deployment.core import create_gemma_runpod_triton_config
+        from isa_model.deployment.local import create_vllm_config
 
         # Initialize deployment manager
         manager = DeploymentManager()
 
-        # Create deployment configuration
-        config = create_gemma_runpod_triton_config(
-            model_id="gemma-v1",
-            runpod_api_key="your-api-key",
-            model_source_path="xenobordom/gemma-4b-alpaca-v1"
-        )
+        # Deploy to local GPU
+        local_config = create_vllm_config("llama2-7b", "meta-llama/Llama-2-7b-chat-hf")
+        local_deployment = await manager.deploy_to_local(local_config)
 
-        # Deploy the model
-        deployment = await manager.deploy_model(config)
-        print(f"Model deployed: {deployment['endpoint_url']}")
+        # Deploy to Modal
+        modal_deployment = await manager.deploy_to_modal(
+            service_name="llm-service",
+            model_id="my-model",
+            service_type="llm"
+        )
         ```
     """
 
-    def __init__(self,
-                 model_manager: Optional[ModelManager] = None,
-                 storage_backend: str = "huggingface",
-                 workspace_dir: str = "./deployments"):
+    def __init__(self, workspace_dir: str = "./deployments"):
         """
         Initialize deployment manager.
-
+
         Args:
-            model_manager: Model manager instance
-            storage_backend: Storage backend to use ("huggingface", "local")
             workspace_dir: Directory for deployment artifacts
         """
         self.workspace_dir = Path(workspace_dir)
         self.workspace_dir.mkdir(parents=True, exist_ok=True)
-
-        # Initialize model management
-        if storage_backend == "huggingface":
-            storage = HuggingFaceStorage()
-        else:
-            from ...core.model_storage import LocalModelStorage
-            storage = LocalModelStorage()
-
-        self.model_manager = model_manager or ModelManager(storage=storage)
-        self.model_registry = ModelRegistry()
-
+
         # Deployment tracking
         self.deployments: Dict[str, Dict[str, Any]] = {}
         self.deployments_file = self.workspace_dir / "deployments.json"
         self._load_deployments()
-
+
         # Setup logging
        self._setup_logging()
-
-        logger.info(f"Deployment manager initialized with {storage_backend} storage")
+
+        # Initialize configuration manager
+        self.config_manager = ConfigManager()
+
+        # Initialize providers
+        self._modal_provider = None
+        self._triton_provider = None
+        self._local_provider = None
+
+        logger.info("Unified deployment manager initialized")
         logger.info(f"Workspace directory: {self.workspace_dir}")
 
     def _setup_logging(self):
@@ -124,49 +109,78 @@ class DeploymentManager:
         with open(self.deployments_file, 'w') as f:
             json.dump(self.deployments, f, indent=2, default=str)
 
-    async def deploy_model(self, config: DeploymentConfig) -> Dict[str, Any]:
+    async def deploy_to_modal(self,
+                              service_name: str,
+                              model_id: str,
+                              service_type: str = "llm",
+                              config: Optional[Dict[str, Any]] = None,
+                              tenant_context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
         """
-        Deploy a model using the specified configuration.
+        Deploy a service to Modal.
 
         Args:
-            config: Deployment configuration
+            service_name: Name of the service to deploy
+            model_id: Model identifier
+            service_type: Type of service (llm, vision, audio, embedding, video)
+            config: Additional configuration for the service
 
         Returns:
             Deployment result with endpoint information
         """
-        deployment_id = config.deployment_id
+        # Extract tenant information for deployment isolation
+        organization_id = tenant_context.get('organization_id') if tenant_context else 'default'
+        tenant_prefix = f"org-{organization_id}" if organization_id != 'default' else ''
+
+        # Generate tenant-isolated deployment ID
+        base_deployment_id = f"{service_name}-{service_type}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+        deployment_id = f"{tenant_prefix}-{base_deployment_id}" if tenant_prefix else base_deployment_id
 
         logger.info("=" * 60)
-        logger.info(f"STARTING DEPLOYMENT: {deployment_id}")
+        logger.info(f"STARTING MODAL DEPLOYMENT: {deployment_id}")
+        logger.info(f"TENANT: {organization_id}")
         logger.info("=" * 60)
 
         try:
+            # Track deployment start for billing
+            deployment_start_time = datetime.now()
+
             # Step 1: Validate configuration
-            logger.info("Step 1/6: Validating deployment configuration...")
-            self._validate_config(config)
+            logger.info("Step 1/4: Validating deployment configuration...")
+            self._validate_modal_config(service_name, model_id, service_type)
 
-            # Step 2: Prepare model
-            logger.info("Step 2/6: Preparing model...")
-            model_path = await self._prepare_model(config.model_config)
+            # Step 2: Prepare deployment artifacts
+            logger.info("Step 2/4: Preparing Modal deployment artifacts...")
+            artifacts_path = await self._prepare_modal_artifacts(deployment_id, service_name, model_id, service_type, config)
 
-            # Step 3: Optimize model (TensorRT conversion if needed)
-            logger.info("Step 3/6: Optimizing model...")
-            optimized_model_path = await self._optimize_model(config, model_path)
+            # Step 3: Deploy to Modal
+            logger.info("Step 3/4: Deploying to Modal...")
+            deployment_result = await self._deploy_modal_service(deployment_id, service_name, service_type, artifacts_path)
 
-            # Step 4: Prepare deployment artifacts
-            logger.info("Step 4/6: Preparing deployment artifacts...")
-            artifacts_path = await self._prepare_deployment_artifacts(config, optimized_model_path)
+            # Calculate deployment duration
+            deployment_duration = (datetime.now() - deployment_start_time).total_seconds() / 3600  # hours
 
-            # Step 5: Deploy to provider
-            logger.info("Step 5/6: Deploying to provider...")
-            deployment_result = await self._deploy_to_provider(config, artifacts_path)
+            # Track billing for Modal deployment
+            self._track_modal_deployment_billing(
+                service_name=service_name,
+                model_id=model_id,
+                service_type=service_type,
+                deployment_duration_hours=deployment_duration,
+                config=config,
+                result=deployment_result
+            )
 
-            # Step 6: Register deployment
-            logger.info("Step 6/6: Registering deployment...")
-            await self._register_deployment(config, deployment_result)
+            # Step 4: Register deployment
+            logger.info("Step 4/4: Registering deployment...")
+            await self._register_deployment(deployment_id, {
+                "service_name": service_name,
+                "model_id": model_id,
+                "service_type": service_type,
+                "config": config or {},
+                "deployment_duration_hours": deployment_duration
+            }, deployment_result, tenant_context)
 
             logger.info("=" * 60)
-            logger.info("DEPLOYMENT COMPLETED SUCCESSFULLY!")
+            logger.info("MODAL DEPLOYMENT COMPLETED SUCCESSFULLY!")
             logger.info("=" * 60)
             logger.info(f"Deployment ID: {deployment_id}")
             logger.info(f"Endpoint URL: {deployment_result.get('endpoint_url', 'N/A')}")
@@ -175,13 +189,15 @@ class DeploymentManager:
 
         except Exception as e:
             logger.error("=" * 60)
-            logger.error("DEPLOYMENT FAILED!")
+            logger.error("MODAL DEPLOYMENT FAILED!")
             logger.error("=" * 60)
             logger.error(f"Error: {e}")
 
             # Update deployment status
             self.deployments[deployment_id] = {
-                "config": config.to_dict(),
+                "service_name": service_name,
+                "model_id": model_id,
+                "service_type": service_type,
                 "status": "failed",
                 "error": str(e),
                 "created_at": datetime.now().isoformat(),
@@ -191,99 +207,44 @@
 
             raise
 
-    def _validate_config(self, config: DeploymentConfig):
-        """Validate deployment configuration"""
-        logger.debug("Validating deployment configuration...")
+    def _validate_modal_config(self, service_name: str, model_id: str, service_type: str):
+        """Validate Modal deployment configuration"""
+        logger.debug("Validating Modal deployment configuration...")
 
         # Check required fields
-        if not config.deployment_id:
-            raise ValueError("deployment_id is required")
-
-        if not config.model_config:
-            raise ValueError("model_config is required")
-
-        # Provider-specific validation
-        if config.provider == DeploymentProvider.RUNPOD_SERVERLESS:
-            if not config.runpod_config or not config.runpod_config.api_key:
-                raise ValueError("RunPod API key is required for RunPod deployment")
-
-        # Engine-specific validation
-        if config.inference_engine == InferenceEngine.TRITON:
-            if not config.triton_config:
-                raise ValueError("Triton configuration is required for Triton engine")
-
-        logger.info("Configuration validation passed")
-
-    async def _prepare_model(self, model_config: ModelConfig) -> Path:
-        """Prepare model for deployment"""
-        logger.info(f"Preparing model: {model_config.model_id}")
-
-        # Determine model type for registry
-        if model_config.model_type == "llm":
-            model_type = ModelType.LLM
-        elif model_config.model_type == "embedding":
-            model_type = ModelType.EMBEDDING
-        elif model_config.model_type == "vision":
-            model_type = ModelType.VISION
-        else:
-            model_type = ModelType.LLM  # Default
-
-        # Convert capabilities
-        capabilities = []
-        for cap in model_config.capabilities:
-            if cap == "text_generation":
-                capabilities.append(ModelCapability.TEXT_GENERATION)
-            elif cap == "chat":
-                capabilities.append(ModelCapability.CHAT)
-            elif cap == "embedding":
-                capabilities.append(ModelCapability.EMBEDDING)
-            else:
-                capabilities.append(ModelCapability.TEXT_GENERATION)  # Default
-
-        # Get or download model
-        if model_config.source_type == "huggingface":
-            model_path = await self.model_manager.get_model(
-                model_id=model_config.model_id,
-                repo_id=model_config.source_path,
-                model_type=model_type,
-                capabilities=capabilities
-            )
-        elif model_config.source_type == "local":
-            model_path = Path(model_config.source_path)
-            if not model_path.exists():
-                raise FileNotFoundError(f"Model not found at {model_path}")
-        else:
-            raise ValueError(f"Unsupported source type: {model_config.source_type}")
-
-        logger.info(f"Model prepared at: {model_path}")
-        return model_path
-
-    async def _optimize_model(self, config: DeploymentConfig, model_path: Path) -> Path:
-        """Optimize model for deployment"""
-        logger.info("Optimizing model for deployment...")
+        if not service_name:
+            raise ValueError("service_name is required")
 
-        # For now, return the original path
-        # TODO: Implement TensorRT optimization, quantization, etc.
-        if config.model_config.use_tensorrt:
-            logger.info("TensorRT optimization requested (not yet implemented)")
+        if not model_id:
+            raise ValueError("model_id is required")
 
-        if config.model_config.use_quantization:
-            logger.info(f"Quantization requested: {config.model_config.quantization_method}")
+        # Check service type
+        valid_service_types = ["llm", "vision", "audio", "embedding", "video"]
+        if service_type not in valid_service_types:
+            raise ValueError(f"service_type must be one of {valid_service_types}")
 
-        logger.info("Model optimization completed (pass-through for now)")
-        return model_path
+        # Check Modal token using ConfigManager
+        modal_config = self.config_manager.get_deployment_config("modal")
+        if not modal_config or not modal_config.get("token_id"):
+            logger.warning("MODAL_TOKEN_ID not found in configuration")
+
+        logger.info("Modal configuration validation passed")
 
-    async def _prepare_deployment_artifacts(self, config: DeploymentConfig, model_path: Path) -> Path:
-        """Prepare deployment artifacts"""
-        logger.info("Preparing deployment artifacts...")
+    async def _prepare_modal_artifacts(self, deployment_id: str, service_name: str, model_id: str, service_type: str, config: Optional[Dict[str, Any]]) -> Path:
+        """Prepare Modal deployment artifacts"""
+        logger.info("Preparing Modal deployment artifacts...")
 
         # Create deployment workspace
-        deployment_workspace = self.workspace_dir / config.deployment_id
+        deployment_workspace = self.workspace_dir / deployment_id
         deployment_workspace.mkdir(exist_ok=True)
 
         artifacts = {
-            "config": config.to_dict(),
-            "model_path": str(model_path),
+            "deployment_id": deployment_id,
+            "service_name": service_name,
+            "model_id": model_id,
+            "service_type": service_type,
+            "config": config or {},
+            "platform": "modal",
             "created_at": datetime.now().isoformat()
         }
 
@@ -291,211 +252,121 @@ class DeploymentManager:
         with open(deployment_workspace / "deployment_config.json", 'w') as f:
             json.dump(artifacts, f, indent=2)
 
-        # Generate Triton model configuration if needed
-        if config.inference_engine == InferenceEngine.TRITON:
-            await self._generate_triton_config(config, deployment_workspace, model_path)
-
-        # Generate Docker configuration if needed
-        await self._generate_docker_config(config, deployment_workspace)
-
-        logger.info(f"Deployment artifacts prepared at: {deployment_workspace}")
+        logger.info(f"Modal deployment artifacts prepared at: {deployment_workspace}")
         return deployment_workspace
 
-    async def _generate_triton_config(self, config: DeploymentConfig, workspace: Path, model_path: Path):
-        """Generate Triton model configuration"""
-        logger.info("Generating Triton model configuration...")
-
-        triton_config = config.triton_config
-        model_config = config.model_config
-
-        # Create model repository structure
-        model_repo = workspace / "model_repository"
-        model_dir = model_repo / triton_config.model_name / "1"
-        model_dir.mkdir(parents=True, exist_ok=True)
-
-        # Copy model files
-        import shutil
-        if model_path.is_file():
-            shutil.copy2(model_path, model_dir)
-        else:
-            shutil.copytree(model_path, model_dir / "model", dirs_exist_ok=True)
-
-        # Generate config.pbtxt
-        config_content = f"""
-name: "{triton_config.model_name}"
-backend: "{triton_config.backend}"
-max_batch_size: {triton_config.max_batch_size}
-
-input [
-  {{
-    name: "input_ids"
-    data_type: TYPE_INT32
-    dims: [ -1 ]
-  }},
-  {{
-    name: "attention_mask"
-    data_type: TYPE_INT32
-    dims: [ -1 ]
-    optional: true
-  }}
-]
-
-output [
-  {{
-    name: "output"
-    data_type: TYPE_STRING
-    dims: [ -1 ]
-  }}
-]
-
-instance_group [
-  {{
-    count: {triton_config.instance_group_count}
-    kind: {triton_config.instance_group_kind}
-  }}
-]
-
-dynamic_batching {{
-  max_queue_delay_microseconds: 100
-}}
-"""
-
-        with open(model_repo / triton_config.model_name / "config.pbtxt", 'w') as f:
-            f.write(config_content.strip())
-
-        logger.info("Triton configuration generated")
-
-    async def _generate_docker_config(self, config: DeploymentConfig, workspace: Path):
-        """Generate Docker configuration"""
-        logger.info("Generating Docker configuration...")
-
-        # Generate Dockerfile
-        dockerfile_content = f"""
-FROM {config.runpod_config.container_image if config.runpod_config else 'nvidia/tritonserver:23.10-py3'}
-
-WORKDIR /workspace
-
-# Copy model repository
-COPY model_repository /models
-
-# Copy deployment configuration
-COPY deployment_config.json /workspace/
-
-# Set environment variables
-ENV TRITON_MODEL_REPOSITORY=/models
-ENV CUDA_VISIBLE_DEVICES=0
-
-# Expose Triton ports
-EXPOSE 8000 8001 8002
-
-# Health check
-HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \\
-    CMD curl -f http://localhost:8000/v2/health/ready || exit 1
-
-# Start Triton server
-CMD ["tritonserver", "--model-repository=/models", "--allow-http=true", "--allow-grpc=true", "--allow-metrics=true"]
-"""
-
-        with open(workspace / "Dockerfile", 'w') as f:
-            f.write(dockerfile_content.strip())
-
-        # Generate docker-compose.yml for local testing
-        compose_content = f"""
-version: '3.8'
-
-services:
-  triton-server:
-    build: .
-    ports:
-      - "8000:8000"
-      - "8001:8001"
-      - "8002:8002"
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
-    volumes:
-      - ./model_repository:/models
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              count: 1
-              capabilities: [gpu]
-"""
-
-        with open(workspace / "docker-compose.yml", 'w') as f:
-            f.write(compose_content.strip())
-
-        logger.info("Docker configuration generated")
-
-    async def _deploy_to_provider(self, config: DeploymentConfig, artifacts_path: Path) -> Dict[str, Any]:
-        """Deploy to the specified provider"""
-        logger.info(f"Deploying to provider: {config.provider.value}")
-
-        if config.provider == DeploymentProvider.RUNPOD_SERVERLESS:
-            return await self._deploy_to_runpod_serverless(config, artifacts_path)
-        elif config.provider == DeploymentProvider.LOCAL:
-            return await self._deploy_locally(config, artifacts_path)
-        else:
-            raise ValueError(f"Provider {config.provider} not yet implemented")
-
-    async def _deploy_to_runpod_serverless(self, config: DeploymentConfig, artifacts_path: Path) -> Dict[str, Any]:
-        """Deploy to RunPod Serverless"""
-        logger.info("Deploying to RunPod Serverless...")
-
-        # TODO: Implement RunPod Serverless deployment
-        # This would involve:
-        # 1. Building and pushing Docker image
-        # 2. Creating RunPod serverless endpoint
-        # 3. Configuring scaling and networking
-
-        # For now, return mock result
-        result = {
-            "provider": "runpod_serverless",
-            "endpoint_id": f"mock-endpoint-{config.deployment_id}",
-            "endpoint_url": f"https://api.runpod.ai/v2/{config.deployment_id}/run",
-            "status": "deployed",
-            "deployed_at": datetime.now().isoformat()
-        }
+    async def _deploy_modal_service(self, deployment_id: str, service_name: str, service_type: str, artifacts_path: Path) -> Dict[str, Any]:
+        """Deploy service to Modal using real Modal integration"""
+        logger.info(f"Deploying {service_type} service '{service_name}' to Modal...")
 
-        logger.info(f"RunPod deployment completed: {result['endpoint_url']}")
-        return result
-
-    async def _deploy_locally(self, config: DeploymentConfig, artifacts_path: Path) -> Dict[str, Any]:
-        """Deploy locally using Docker"""
-        logger.info("Deploying locally using Docker...")
-
-        # TODO: Implement local Docker deployment
-        result = {
-            "provider": "local",
-            "endpoint_url": "http://localhost:8000",
-            "status": "deployed",
-            "deployed_at": datetime.now().isoformat(),
-            "container_id": f"triton-{config.deployment_id}"
-        }
-
-        logger.info(f"Local deployment completed: {result['endpoint_url']}")
-        return result
+        try:
+            # Load deployment config
+            config_file = artifacts_path / "deployment_config.json"
+            with open(config_file, 'r') as f:
+                deployment_config = json.load(f)
+
+            model_id = deployment_config['model_id']
+            config = deployment_config.get('config', {})
+
+            # Use Modal provider for real deployment
+            modal_provider = self.modal_provider
+
+            # Step 1: Analyze the model to get optimal configuration
+            logger.info(f"Analyzing model {model_id}...")
+            model_config = await asyncio.get_event_loop().run_in_executor(
+                None, modal_provider.analyze_model, model_id
+            )
+
+            # Step 2: Generate the appropriate Modal service
+            logger.info(f"Generating {service_type} service for {model_config.architecture}...")
+            service_code = await self._generate_modal_service_code(
+                service_name=service_name,
+                model_config=model_config,
+                service_type=service_type,
+                config=config
+            )
+
+            # Step 3: Save the generated service code
+            service_file = artifacts_path / f"{service_name}_modal_service.py"
+            with open(service_file, 'w') as f:
+                f.write(service_code)
+
+            # Step 4: Deploy to Modal (simulate for now, but with real structure)
+            deployment_result = await self._execute_modal_deployment(
+                service_file=service_file,
+                service_name=service_name,
+                model_config=model_config,
+                deployment_id=deployment_id
+            )
+
+            result = {
+                "provider": "modal",
+                "deployment_id": deployment_id,
+                "service_name": service_name,
+                "service_type": service_type,
+                "model_id": model_id,
+                "model_architecture": model_config.architecture,
+                "endpoint_url": deployment_result['endpoint_url'],
+                "status": deployment_result['status'],
+                "gpu_type": model_config.gpu_requirements,
+                "memory_gb": model_config.memory_gb,
+                "estimated_cost_per_hour": model_config.estimated_cost_per_hour,
+                "deployed_at": datetime.now().isoformat(),
+                "service_file": str(service_file)
+            }
+
+            logger.info(f"Modal deployment completed: {result['endpoint_url']}")
+            return result
+
+        except Exception as e:
+            logger.error(f"Failed to deploy Modal service: {e}")
+            raise
 
-    async def _register_deployment(self, config: DeploymentConfig, deployment_result: Dict[str, Any]):
-        """Register deployment in tracking system"""
-        logger.info("Registering deployment...")
+    async def _register_deployment(self, deployment_id: str, config: Dict[str, Any], deployment_result: Dict[str, Any], tenant_context: Optional[Dict[str, Any]] = None):
+        """Register deployment in tracking system with tenant isolation"""
+        logger.info("Registering Modal deployment...")
 
         deployment_info = {
-            "config": config.to_dict(),
+            "config": config,
             "result": deployment_result,
             "status": "active",
+            "platform": "modal",
             "created_at": datetime.now().isoformat(),
-            "updated_at": datetime.now().isoformat()
+            "updated_at": datetime.now().isoformat(),
+            # Add tenant information for isolation
+            "tenant": {
+                "organization_id": tenant_context.get('organization_id', 'default') if tenant_context else 'default',
+                "user_id": tenant_context.get('user_id') if tenant_context else None,
+                "role": tenant_context.get('role', 'user') if tenant_context else 'user'
+            }
         }
 
-        self.deployments[config.deployment_id] = deployment_info
+        self.deployments[deployment_id] = deployment_info
         self._save_deployments()
 
-        logger.info(f"Deployment registered: {config.deployment_id}")
+        logger.info(f"Modal deployment registered: {deployment_id}")
 
-    async def list_deployments(self) -> List[Dict[str, Any]]:
-        """List all deployments"""
+    async def list_deployments(self, tenant_context: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
+        """List deployments with optional tenant filtering"""
+
+        # If tenant context is provided, filter by organization
+        if tenant_context and tenant_context.get('organization_id'):
+            organization_id = tenant_context['organization_id']
+            filtered_deployments = []
+
+            for deployment_id, info in self.deployments.items():
+                # Check tenant information in deployment
+                deployment_org = info.get('tenant', {}).get('organization_id', 'default')
+                if deployment_org == organization_id:
+                    filtered_deployments.append({
+                        "deployment_id": deployment_id,
+                        **info
+                    })
+
+            logger.info(f"Filtered deployments for tenant {organization_id}: {len(filtered_deployments)} found")
+            return filtered_deployments
+
+        # Return all deployments if no tenant context
         return [
             {
                 "deployment_id": deployment_id,
@@ -504,38 +375,263 @@ services:
             for deployment_id, info in self.deployments.items()
         ]
 
-    async def get_deployment(self, deployment_id: str) -> Optional[Dict[str, Any]]:
-        """Get deployment information"""
-        return self.deployments.get(deployment_id)
+    async def get_deployment(self, deployment_id: str, tenant_context: Optional[Dict[str, Any]] = None) -> Optional[Dict[str, Any]]:
+        """Get deployment information with tenant access control"""
+        deployment = self.deployments.get(deployment_id)
+
+        if not deployment:
+            return None
+
+        # If tenant context is provided, verify access
+        if tenant_context and tenant_context.get('organization_id'):
+            organization_id = tenant_context['organization_id']
+            deployment_org = deployment.get('tenant', {}).get('organization_id', 'default')
+
+            # Check if user has access to this deployment
+            if deployment_org != organization_id:
+                logger.warning(f"Access denied: tenant {organization_id} tried to access deployment from {deployment_org}")
+                return None
+
+        return deployment
 
-    async def delete_deployment(self, deployment_id: str) -> bool:
-        """Delete a deployment"""
-        logger.info(f"Deleting deployment: {deployment_id}")
+    async def delete_deployment(self, deployment_id: str, tenant_context: Optional[Dict[str, Any]] = None) -> bool:
+        """Delete a Modal deployment with tenant access control"""
+        logger.info(f"Deleting Modal deployment: {deployment_id}")
 
         try:
-            if deployment_id in self.deployments:
-                # TODO: Implement actual provider cleanup
+            if deployment_id not in self.deployments:
+                logger.warning(f"Deployment not found: {deployment_id}")
+                return False
 
-                # Remove from tracking
-                del self.deployments[deployment_id]
-                self._save_deployments()
+            deployment = self.deployments[deployment_id]
+
+            # Verify tenant access
+            if tenant_context and tenant_context.get('organization_id'):
+                organization_id = tenant_context['organization_id']
+                deployment_org = deployment.get('tenant', {}).get('organization_id', 'default')
 
-                # Clean up workspace
-                deployment_workspace = self.workspace_dir / deployment_id
-                if deployment_workspace.exists():
-                    import shutil
-                    shutil.rmtree(deployment_workspace)
+                if deployment_org != organization_id:
+                    logger.warning(f"Access denied: tenant {organization_id} tried to delete deployment from {deployment_org}")
+                    return False
+
+            # TODO: Implement actual Modal service cleanup using Modal SDK
+
+            # Remove from tracking
+            del self.deployments[deployment_id]
+            self._save_deployments()
+
+            # Clean up workspace
+            deployment_workspace = self.workspace_dir / deployment_id
+            if deployment_workspace.exists():
+                import shutil
+                shutil.rmtree(deployment_workspace)
+
+            logger.info(f"Modal deployment deleted: {deployment_id}")
+            return True
 
-                logger.info(f"Deployment deleted: {deployment_id}")
-                return True
+        except Exception as e:
+            logger.error(f"Failed to delete Modal deployment {deployment_id}: {e}")
+            return False
+
+    async def get_modal_service_status(self, deployment_id: str) -> Dict[str, Any]:
+        """Get real-time Modal service status"""
+        logger.info(f"Getting Modal service status for: {deployment_id}")
+
+        if deployment_id not in self.deployments:
+            return {
+                "deployment_id": deployment_id,
+                "status": "not_found",
+                "error": "Deployment not found"
+            }
+
+        deployment_info = self.deployments[deployment_id]
+
+        try:
+            # Get Modal service details
+            service_name = deployment_info.get('service_name')
+            model_id = deployment_info.get('model_id')
+
+            # Check if Modal service is accessible
+            modal_url = deployment_info.get('modal_url')
+
+            status_info = {
+                "deployment_id": deployment_id,
+                "service_name": service_name,
+                "model_id": model_id,
+                "status": deployment_info.get('status', 'unknown'),
+                "created_at": deployment_info.get('created_at'),
+                "updated_at": deployment_info.get('updated_at'),
+                "modal_url": modal_url,
+                "platform": "modal",
+                "monitoring": {
+                    "health_check": await self._check_modal_health(modal_url),
+                    "resource_usage": await self._get_modal_resource_usage(deployment_id),
+                    "request_metrics": await self._get_modal_metrics(deployment_id),
+                    "cost_tracking": await self._get_modal_cost_info(deployment_id)
+                }
+            }
+
+            # Update status based on health check
+            if status_info["monitoring"]["health_check"]["status"] == "healthy":
+                status_info["status"] = "running"
+            elif status_info["monitoring"]["health_check"]["status"] == "error":
+                status_info["status"] = "error"
             else:
-                logger.warning(f"Deployment not found: {deployment_id}")
-                return False
+                status_info["status"] = "pending"
+
+            logger.info(f"Modal service status retrieved: {deployment_id}")
+            return status_info
+
+        except Exception as e:
+            logger.error(f"Failed to get Modal service status {deployment_id}: {e}")
+            return {
+                "deployment_id": deployment_id,
+                "status": "error",
+                "error": str(e),
+                "last_check": datetime.now().isoformat()
+            }
+
+    async def _check_modal_health(self, modal_url: Optional[str]) -> Dict[str, Any]:
+        """Check Modal service health"""
+        if not modal_url:
+            return {
+                "status": "unknown",
+                "message": "No Modal URL available"
+            }
+
+        try:
+            import httpx
+            import asyncio
+
+            async with httpx.AsyncClient(timeout=10.0) as client:
+                # Try to ping the Modal endpoint
+                response = await client.get(f"{modal_url}/health", timeout=5.0)
 
+                if response.status_code == 200:
+                    return {
+                        "status": "healthy",
+                        "response_time_ms": response.elapsed.total_seconds() * 1000,
+                        "last_check": datetime.now().isoformat()
+                    }
+                else:
+                    return {
+                        "status": "unhealthy",
+                        "status_code": response.status_code,
+                        "last_check": datetime.now().isoformat()
+                    }
+
         except Exception as e:
-            logger.error(f"Failed to delete deployment {deployment_id}: {e}")
-            return False
+            return {
+                "status": "error",
+                "error": str(e),
+                "last_check": datetime.now().isoformat()
+            }
+
+    async def _get_modal_resource_usage(self, deployment_id: str) -> Dict[str, Any]:
+        """Get Modal service resource usage"""
+        try:
+            # In a real implementation, this would query Modal's API for resource usage
+            # For now, return simulated data based on deployment info
+            deployment_info = self.deployments.get(deployment_id, {})
+
+            return {
+                "gpu_utilization": "85%",  # Simulated
+                "memory_usage": "12.5GB / 32GB",
+                "cpu_usage": "45%",
+                "requests_per_minute": 24,
+                "average_response_time": "1.2s",
+                "uptime": self._calculate_uptime(deployment_info.get('created_at')),
+                "last_updated": datetime.now().isoformat()
+            }
+
+        except Exception as e:
+            return {
+                "error": str(e),
+                "last_updated": datetime.now().isoformat()
+            }
+
+    async def _get_modal_metrics(self, deployment_id: str) -> Dict[str, Any]:
+        """Get Modal service request metrics"""
+        try:
+            # Simulated metrics - in production this would come from Modal's monitoring
+            return {
+                "total_requests": 1247,
+                "successful_requests": 1198,
+                "failed_requests": 49,
+                "success_rate": "96.1%",
+                "average_latency": "1.15s",
+                "requests_last_hour": 156,
+                "errors_last_hour": 3,
+                "last_updated": datetime.now().isoformat()
+            }
+
+        except Exception as e:
+            return {
+                "error": str(e),
+                "last_updated": datetime.now().isoformat()
+            }
 
+    async def _get_modal_cost_info(self, deployment_id: str) -> Dict[str, Any]:
+        """Get Modal service cost information"""
+        try:
+            deployment_info = self.deployments.get(deployment_id, {})
+
+            # Calculate estimated costs based on uptime and GPU type
+            uptime_hours = self._calculate_uptime_hours(deployment_info.get('created_at'))
+            gpu_cost_per_hour = 4.0  # A100 default rate
+
+            estimated_cost = uptime_hours * gpu_cost_per_hour
+
+            return {
+                "estimated_cost_usd": f"${estimated_cost:.4f}",
+                "uptime_hours": f"{uptime_hours:.2f}",
+                "hourly_rate": f"${gpu_cost_per_hour:.2f}",
+                "gpu_type": "A100",
+                "billing_period": "current_month",
+                "last_updated": datetime.now().isoformat()
+            }
+
+        except Exception as e:
+            return {
+                "error": str(e),
+                "last_updated": datetime.now().isoformat()
+            }
+
+    def _calculate_uptime(self, created_at: Optional[str]) -> str:
+        """Calculate service uptime"""
+        if not created_at:
+            return "Unknown"
+
+        try:
+            created = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
+            uptime = datetime.now() - created.replace(tzinfo=None)
+
+            days = uptime.days
+            hours, remainder = divmod(uptime.seconds, 3600)
+            minutes, _ = divmod(remainder, 60)
+
+            if days > 0:
+                return f"{days}d {hours}h {minutes}m"
+            elif hours > 0:
+                return f"{hours}h {minutes}m"
+            else:
+                return f"{minutes}m"
+
+        except Exception:
+            return "Unknown"
+
+    def _calculate_uptime_hours(self, created_at: Optional[str]) -> float:
+        """Calculate service uptime in hours"""
+        if not created_at:
+            return 0.0
+
+        try:
+            created = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
+            uptime = datetime.now() - created.replace(tzinfo=None)
+            return uptime.total_seconds() / 3600
+        except Exception:
+            return 0.0
+
     async def update_deployment_status(self, deployment_id: str, status: str, **kwargs):
         """Update deployment status"""
         if deployment_id in self.deployments:
@@ -546,4 +642,818 @@ services:
546
642
  self.deployments[deployment_id][key] = value
547
643
 
548
644
  self._save_deployments()
549
- logger.info(f"Updated deployment {deployment_id} status to {status}")
645
+ logger.info(f"Updated deployment {deployment_id} status to {status}")
646
+
647
+ @property
648
+ def modal_provider(self):
649
+ """Get or create Modal provider"""
650
+ if self._modal_provider is None:
651
+ from ..modal.deployer import ModalDeployer
652
+ self._modal_provider = ModalDeployer()
653
+ return self._modal_provider
654
+
655
+ @property
656
+ def triton_provider(self):
657
+ """Get or create Triton provider"""
658
+ if self._triton_provider is None:
659
+ from ..triton.provider import TritonProvider
660
+ self._triton_provider = TritonProvider(str(self.workspace_dir / "triton"))
661
+ return self._triton_provider
662
+
663
+ @property
664
+ def local_provider(self):
665
+ """Get or create Local GPU provider"""
666
+ if self._local_provider is None:
667
+ from ..local.provider import LocalGPUProvider
668
+ self._local_provider = LocalGPUProvider(str(self.workspace_dir / "local"))
669
+ return self._local_provider
670
+
671
+ async def deploy_to_triton(self, config) -> Dict[str, Any]:
672
+ """
673
+ Deploy a service to Triton Inference Server.
674
+
675
+ Args:
676
+ config: TritonConfig instance
677
+
678
+ Returns:
679
+ Deployment result with endpoint information
680
+ """
681
+ logger.info("=" * 60)
682
+ logger.info(f"STARTING TRITON DEPLOYMENT: {config.service_name}")
683
+ logger.info("=" * 60)
684
+
685
+ try:
686
+ # Track deployment start for billing
687
+ deployment_start_time = datetime.now()
688
+
689
+ # Deploy using Triton provider
690
+ result = await self.triton_provider.deploy(config)
691
+
692
+ # Calculate deployment duration
693
+ deployment_duration = (datetime.now() - deployment_start_time).total_seconds() / 3600 # hours
694
+
695
+ # Track billing for deployment
696
+ self._track_deployment_billing(
697
+ config=config,
698
+ provider="triton",
699
+ operation_type="deployment",
700
+ deployment_duration_hours=deployment_duration,
701
+ result=result
702
+ )
703
+
704
+ # Register in our tracking system
705
+ deployment_id = result["deployment_id"]
706
+ deployment_info = {
707
+ "config": config.to_dict(),
708
+ "result": result,
709
+ "status": "active",
710
+ "platform": "triton",
711
+ "created_at": datetime.now().isoformat(),
712
+ "updated_at": datetime.now().isoformat(),
713
+ "deployment_duration_hours": deployment_duration
714
+ }
715
+
716
+ self.deployments[deployment_id] = deployment_info
717
+ self._save_deployments()
718
+
719
+ logger.info("=" * 60)
720
+ logger.info("TRITON DEPLOYMENT COMPLETED SUCCESSFULLY!")
721
+ logger.info("=" * 60)
722
+ logger.info(f"Deployment ID: {deployment_id}")
723
+ logger.info(f"Endpoint URL: {result.get('endpoint_url', 'N/A')}")
724
+
725
+ return result
726
+
727
+ except Exception as e:
728
+ logger.error("=" * 60)
729
+ logger.error("TRITON DEPLOYMENT FAILED!")
730
+ logger.error("=" * 60)
731
+ logger.error(f"Error: {e}")
732
+ raise
733
+
734
+ async def deploy_to_local(self, config) -> Dict[str, Any]:
735
+ """
736
+ Deploy a service to local GPU.
737
+
738
+ Args:
739
+ config: LocalGPUConfig instance
740
+
741
+ Returns:
742
+ Deployment result with service information
743
+ """
744
+ logger.info("=" * 60)
745
+ logger.info(f"STARTING LOCAL GPU DEPLOYMENT: {config.service_name}")
746
+ logger.info(f"MODEL: {config.model_id}")
747
+ logger.info(f"BACKEND: {config.backend.value}")
748
+ logger.info("=" * 60)
749
+
750
+ try:
751
+ # Track deployment start for billing
752
+ deployment_start_time = datetime.now()
753
+
754
+ # Deploy using Local provider
755
+ result = await self.local_provider.deploy(config)
756
+
757
+ if result["success"]:
758
+ # Calculate deployment duration
759
+ deployment_duration = (datetime.now() - deployment_start_time).total_seconds() / 3600 # hours
760
+
761
+ # Register in our tracking system
762
+ deployment_id = f"local-{config.service_name}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
763
+ deployment_info = {
764
+ "config": config.to_dict(),
765
+ "result": result,
766
+ "status": "active",
767
+ "platform": "local",
768
+ "created_at": datetime.now().isoformat(),
769
+ "updated_at": datetime.now().isoformat(),
770
+ "deployment_duration_hours": deployment_duration
771
+ }
772
+
773
+ self.deployments[deployment_id] = deployment_info
774
+ self._save_deployments()
775
+
776
+ logger.info("=" * 60)
777
+ logger.info("LOCAL GPU DEPLOYMENT COMPLETED SUCCESSFULLY!")
778
+ logger.info("=" * 60)
779
+ logger.info(f"Service: {config.service_name}")
780
+ logger.info(f"Backend: {config.backend.value}")
781
+
782
+ return {
783
+ **result,
784
+ "deployment_id": deployment_id,
785
+ "platform": "local"
786
+ }
787
+ else:
788
+ return result
789
+
790
+ except Exception as e:
791
+ logger.error("=" * 60)
792
+ logger.error("LOCAL GPU DEPLOYMENT FAILED!")
793
+ logger.error("=" * 60)
794
+ logger.error(f"Error: {e}")
795
+ raise
796
+
797
+     async def list_local_services(self) -> List[Dict[str, Any]]:
+         """List local GPU services"""
+         if not self.local_provider:
+             return []
+         return await self.local_provider.list_services()
+
+     async def get_local_service_info(self, service_name: str) -> Optional[Dict[str, Any]]:
+         """Get local service information"""
+         if not self.local_provider:
+             return None
+         return await self.local_provider.get_service_info(service_name)
+
+     async def undeploy_local_service(self, service_name: str) -> Dict[str, Any]:
+         """Undeploy local service"""
+         if not self.local_provider:
+             return {
+                 "success": False,
+                 "error": "Local provider not available"
+             }
+
+         result = await self.local_provider.undeploy(service_name)
+
+         # Remove from tracking
+         deployment_ids_to_remove = []
+         for deployment_id, info in self.deployments.items():
+             if (info.get('platform') == 'local' and
+                     info.get('config', {}).get('service_name') == service_name):
+                 deployment_ids_to_remove.append(deployment_id)
+
+         for deployment_id in deployment_ids_to_remove:
+             del self.deployments[deployment_id]
+
+         if deployment_ids_to_remove:
+             self._save_deployments()
+
+         return result
+
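+     # Illustrative teardown flow combining the list/undeploy methods above
+     # (hypothetical; assumes each listed entry carries a 'service_name' key,
+     # which this diff does not confirm):
+     #
+     #     for svc in await manager.list_local_services():
+     #         result = await manager.undeploy_local_service(svc["service_name"])
+     #         print(svc["service_name"],
+     #               "removed" if result.get("success") else result.get("error"))
+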
+     async def get_local_system_status(self) -> Dict[str, Any]:
+         """Get local GPU system status"""
+         if not self.local_provider:
+             return {
+                 "available": False,
+                 "error": "Local provider not initialized"
+             }
+         return await self.local_provider.get_system_status()
+
+     async def list_providers(self) -> List[str]:
+         """List available deployment providers"""
+         return ["local", "modal", "triton"]
+
+     async def get_provider_status(self, provider: str) -> Dict[str, Any]:
+         """Get status of a deployment provider"""
+         if provider == "local":
+             # Check local GPU availability
+             try:
+                 from ...utils.gpu_utils import get_gpu_manager
+                 gpu_manager = get_gpu_manager()
+
+                 return {
+                     "provider": "local",
+                     "available": gpu_manager.cuda_available,
+                     "description": "Local GPU deployment with vLLM, TensorRT-LLM, Transformers",
+                     "gpu_count": len(gpu_manager.gpus),
+                     "cuda_available": gpu_manager.cuda_available,
+                     "nvidia_smi_available": gpu_manager.nvidia_smi_available,
+                     "requirements": ["CUDA", "GPU drivers", "Sufficient GPU memory"]
+                 }
+             except Exception as e:
+                 return {
+                     "provider": "local",
+                     "available": False,
+                     "description": "Local GPU deployment",
+                     "error": str(e)
+                 }
+         elif provider == "modal":
+             return {
+                 "provider": "modal",
+                 "available": True,
+                 "description": "Modal serverless platform"
+             }
+         elif provider == "triton":
+             # Check if Docker is available
+             try:
+                 import docker
+                 docker.from_env()
+                 docker_available = True
+             except Exception:
+                 docker_available = False
+
+             return {
+                 "provider": "triton",
+                 "available": docker_available,
+                 "description": "Triton Inference Server with TensorRT-LLM",
+                 "requirements": ["Docker", "GPU support"]
+             }
+         else:
+             raise ValueError(f"Unknown provider: {provider}")
+
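+     # A small sketch of probing every provider before choosing a deployment
+     # target (hypothetical; assumes a manager instance named `manager`):
+     #
+     #     for name in await manager.list_providers():
+     #         status = await manager.get_provider_status(name)
+     #         print(f"{name}: available={status['available']}")
+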
+     def _track_deployment_billing(
+         self,
+         config: Any,
+         provider: str,
+         operation_type: str,
+         deployment_duration_hours: float,
+         result: Dict[str, Any]
+     ):
+         """Track billing for deployment operations"""
+         try:
+             from ...core.models.deployment_billing_tracker import get_deployment_billing_tracker
+
+             # Extract GPU info from config
+             gpu_type = getattr(config, 'gpu_type', None)
+             gpu_count = getattr(config, 'gpu_count', 1)
+             memory_gb = getattr(config, 'memory_gb', None)
+
+             # Normalize service_type, which may be an Enum or a plain string
+             service_type = getattr(config, 'service_type', 'unknown')
+             service_type = service_type.value if hasattr(service_type, 'value') else str(service_type)
+
+             # Track the deployment billing
+             billing_tracker = get_deployment_billing_tracker()
+             billing_tracker.track_deployment_usage(
+                 model_id=getattr(config, 'model_id', 'unknown'),
+                 provider=provider,
+                 operation_type=operation_type,
+                 service_type=service_type,
+                 operation="deploy",
+                 gpu_type=gpu_type,
+                 gpu_count=gpu_count,
+                 runtime_hours=deployment_duration_hours,
+                 deployment_duration_hours=deployment_duration_hours,
+                 memory_gb=memory_gb,
+                 metadata={
+                     "deployment_id": result.get("deployment_id"),
+                     "endpoint_url": result.get("endpoint_url"),
+                     "provider_details": provider
+                 }
+             )
+
+             logger.info(f"Tracked deployment billing: {provider} - {deployment_duration_hours:.3f}h")
+
+         except Exception as e:
+             logger.error(f"Failed to track deployment billing: {e}")
+
+     async def estimate_deployment_cost(
+         self,
+         provider: str,
+         gpu_type: str,
+         gpu_count: int = 1,
+         estimated_hours: float = 1.0
+     ) -> Dict[str, float]:
+         """Estimate deployment costs before starting"""
+         try:
+             from ...core.models.deployment_billing_tracker import get_deployment_billing_tracker
+
+             billing_tracker = get_deployment_billing_tracker()
+             return billing_tracker.estimate_deployment_cost(
+                 provider=provider,
+                 gpu_type=gpu_type,
+                 gpu_count=gpu_count,
+                 estimated_hours=estimated_hours
+             )
+         except Exception as e:
+             logger.error(f"Failed to estimate deployment cost: {e}")
+             return {"total_cost": 0.0, "compute_cost": 0.0, "storage_cost": 0.0, "network_cost": 0.0}
+
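+     # Hedged example: comparing a one-hour A100 estimate across providers.
+     # The "a100" gpu_type string is an assumed identifier; the actual keys
+     # depend on the deployment_billing_tracker's pricing tables:
+     #
+     #     for name in await manager.list_providers():
+     #         estimate = await manager.estimate_deployment_cost(
+     #             provider=name, gpu_type="a100", gpu_count=1, estimated_hours=1.0
+     #         )
+     #         print(name, estimate["total_cost"])
+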
+     def _track_modal_deployment_billing(
+         self,
+         service_name: str,
+         model_id: str,
+         service_type: str,
+         deployment_duration_hours: float,
+         config: Optional[Dict[str, Any]],
+         result: Dict[str, Any]
+     ):
+         """Track billing for Modal deployment operations"""
+         try:
+             from ...core.models.deployment_billing_tracker import get_deployment_billing_tracker
+
+             # Extract GPU info from config or use defaults
+             gpu_type = config.get('gpu_type', 't4') if config else 't4'
+             gpu_count = config.get('gpu_count', 1) if config else 1
+             memory_gb = config.get('memory_gb', 8) if config else 8
+
+             # Track the Modal deployment billing
+             billing_tracker = get_deployment_billing_tracker()
+             billing_tracker.track_deployment_usage(
+                 model_id=model_id,
+                 provider="modal",
+                 operation_type="deployment",
+                 service_type=service_type,
+                 operation="deploy",
+                 gpu_type=gpu_type,
+                 gpu_count=gpu_count,
+                 runtime_hours=deployment_duration_hours,
+                 deployment_duration_hours=deployment_duration_hours,
+                 memory_gb=memory_gb,
+                 metadata={
+                     "service_name": service_name,
+                     "deployment_id": result.get("deployment_id"),
+                     "endpoint_url": result.get("endpoint_url"),
+                     "provider_details": "modal_serverless"
+                 }
+             )
+
+             logger.info(f"Tracked Modal deployment billing: {service_name} - {deployment_duration_hours:.3f}h")
+
+         except Exception as e:
+             logger.error(f"Failed to track Modal deployment billing: {e}")
+
+     async def list_modal_services(self) -> List[Dict[str, Any]]:
+         """List available Modal services by type"""
+         services = {
+             "llm": ["isa_llm_service"],
+             "vision": ["isa_vision_ocr_service", "isa_vision_ui_service", "isa_vision_table_service", "isa_vision_qwen25_service"],
+             "audio": ["isa_audio_chatTTS_service", "isa_audio_openvoice_service", "isa_audio_service_v2", "isa_audio_fish_service"],
+             "embedding": ["isa_embed_rerank_service"],
+             "video": ["isa_video_hunyuan_service"]
+         }
+
+         result = []
+         for service_type, service_list in services.items():
+             for service_name in service_list:
+                 result.append({
+                     "service_name": service_name,
+                     "service_type": service_type,
+                     "platform": "modal"
+                 })
+
+         return result
+
+     # ============= MODAL SERVICE CODE GENERATION =============
+
+     async def _generate_modal_service_code(self,
+                                            service_name: str,
+                                            model_config: Any,
+                                            service_type: str,
+                                            config: Dict[str, Any]) -> str:
+         """Generate Modal service code based on model type and configuration"""
+
+         # Choose the appropriate service template based on service_type
+         if service_type == "llm":
+             return self._generate_llm_service_code(service_name, model_config, config)
+         elif service_type == "vision":
+             return self._generate_vision_service_code(service_name, model_config, config)
+         elif service_type == "embedding":
+             return self._generate_embedding_service_code(service_name, model_config, config)
+         else:
+             # Default to LLM service
+             return self._generate_llm_service_code(service_name, model_config, config)
+
+     def _generate_llm_service_code(self, service_name: str, model_config: Any, config: Dict[str, Any]) -> str:
+         """Generate production-ready LLM service code for Modal"""
+         dependencies = getattr(model_config, 'dependencies', None) or [
+             "torch", "transformers>=4.36.0", "accelerate", "bitsandbytes", "flash-attn"
+         ]
+
+         # Determine optimal GPU based on model size
+         gpu_config = self._get_optimal_gpu_config(model_config)
+
+         return f'''"""
+ {service_name} LLM Service for Modal
+
+ Production-ready service for model: {getattr(model_config, 'model_id', 'unknown')}
+ Architecture: {getattr(model_config, 'architecture', 'transformer')}
+ Generated automatically by ISA Model Deployment Manager
+ """
+
+ import modal
+ import asyncio
+ import json
+ import time
+ from typing import Dict, Any, List, Optional
+ from datetime import datetime
+
+ # Create Modal app
+ app = modal.App("{service_name}")
+
+ # Production image with optimized dependencies
+ image = (
+     modal.Image.debian_slim(python_version="3.11")
+     .pip_install([
+         {', '.join([f'"{dep}"' for dep in dependencies])}
+     ])
+     .env({{"HF_HUB_ENABLE_HF_TRANSFER": "1"}})
+ )
+
+ @app.cls(
+     image=image,
+     gpu=modal.gpu.{gpu_config['gpu_type']}(count={gpu_config['gpu_count']}),
+     container_idle_timeout=300,
+     timeout=1800,  # 30 minutes
+     memory={getattr(model_config, 'container_memory_mb', 32768)},
+     keep_warm=1,  # Keep one container warm
+     allow_concurrent_inputs=10
+ )
+ class {service_name.replace('-', '_').title()}Service:
+
+     @modal.enter()
+     def load_model(self):
+         """Load model with production optimizations"""
+         import torch
+         from transformers import (
+             AutoTokenizer,
+             AutoModelForCausalLM,
+             BitsAndBytesConfig
+         )
+
+         model_id = "{getattr(model_config, 'model_id', 'microsoft/DialoGPT-medium')}"
+
+         print(f"Loading model: {{model_id}}")
+         start_time = time.time()
+
+         # Load tokenizer
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             model_id,
+             trust_remote_code=True,
+             use_fast=True
+         )
+
+         if self.tokenizer.pad_token is None:
+             self.tokenizer.pad_token = self.tokenizer.eos_token
+
+         # Configure quantization for efficiency
+         quantization_config = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_compute_dtype=torch.float16,
+             bnb_4bit_use_double_quant=True,
+             bnb_4bit_quant_type="nf4"
+         )
+
+         # Load model with optimizations
+         self.model = AutoModelForCausalLM.from_pretrained(
+             model_id,
+             quantization_config=quantization_config,
+             device_map="auto",
+             trust_remote_code=True,
+             torch_dtype=torch.float16,
+             attn_implementation="flash_attention_2"
+         )
+
+         self.model.eval()
+
+         load_time = time.time() - start_time
+         print(f"Model loaded successfully in {{load_time:.2f}}s")
+
+         # Model metadata
+         self.model_info = {{
+             "model_id": model_id,
+             "architecture": "{getattr(model_config, 'architecture', 'transformer')}",
+             "parameters": getattr(self.model, 'num_parameters', lambda: 0)(),
+             "loaded_at": datetime.now().isoformat(),
+             "load_time_seconds": load_time
+         }}
+
+     @modal.method()
+     def generate(self,
+                  messages: List[Dict[str, str]],
+                  max_tokens: int = 512,
+                  temperature: float = 0.7,
+                  top_p: float = 0.9,
+                  top_k: int = 50,
+                  do_sample: bool = True,
+                  **kwargs) -> Dict[str, Any]:
+         """Generate response with production features"""
+         import torch  # needed here too: the import in load_model is method-local
+
+         start_time = time.time()
+
+         try:
+             # Format messages into prompt
+             prompt = self._format_messages(messages)
+
+             # Tokenize input
+             inputs = self.tokenizer(
+                 prompt,
+                 return_tensors="pt",
+                 padding=True,
+                 truncation=True,
+                 max_length=2048
+             ).to(self.model.device)
+
+             # Generate response
+             with torch.no_grad():
+                 outputs = self.model.generate(
+                     **inputs,
+                     max_new_tokens=max_tokens,
+                     temperature=temperature,
+                     top_p=top_p,
+                     top_k=top_k,
+                     do_sample=do_sample,
+                     pad_token_id=self.tokenizer.eos_token_id,
+                     eos_token_id=self.tokenizer.eos_token_id,
+                     use_cache=True
+                 )
+
+             # Decode response
+             response_tokens = outputs[0][inputs['input_ids'].shape[-1]:]
+             response_text = self.tokenizer.decode(
+                 response_tokens,
+                 skip_special_tokens=True
+             ).strip()
+
+             generation_time = time.time() - start_time
+
+             return {{
+                 "response": response_text,
+                 "model": self.model_info["model_id"],
+                 "usage": {{
+                     "prompt_tokens": inputs['input_ids'].shape[-1],
+                     "completion_tokens": len(response_tokens),
+                     "total_tokens": inputs['input_ids'].shape[-1] + len(response_tokens)
+                 }},
+                 "metadata": {{
+                     "generation_time_seconds": generation_time,
+                     "parameters": {{
+                         "temperature": temperature,
+                         "top_p": top_p,
+                         "top_k": top_k,
+                         "max_tokens": max_tokens
+                     }},
+                     "timestamp": datetime.now().isoformat()
+                 }}
+             }}
+
+         except Exception as e:
+             return {{
+                 "error": str(e),
+                 "error_type": type(e).__name__,
+                 "model": self.model_info.get("model_id", "unknown"),
+                 "timestamp": datetime.now().isoformat()
+             }}
+
+     def _format_messages(self, messages: List[Dict[str, str]]) -> str:
+         """Format messages into model-appropriate prompt"""
+         if not messages:
+             return ""
+
+         # Simple chat format - can be enhanced for specific models
+         formatted_parts = []
+         for msg in messages:
+             role = msg.get("role", "user")
+             content = msg.get("content", "")
+
+             if role == "system":
+                 formatted_parts.append(f"System: {{content}}")
+             elif role == "user":
+                 formatted_parts.append(f"Human: {{content}}")
+             elif role == "assistant":
+                 formatted_parts.append(f"Assistant: {{content}}")
+
+         formatted_parts.append("Assistant:")
+         return "\\n\\n".join(formatted_parts)
+
+     @modal.method()
+     def get_model_info(self) -> Dict[str, Any]:
+         """Get model metadata"""
+         return self.model_info
+
+ # Web endpoint for HTTP access
+ @app.function(
+     image=image,
+     timeout=300
+ )
+ @modal.web_endpoint(method="POST")
+ async def inference_endpoint(item: Dict[str, Any]):
+     """HTTP endpoint for model inference"""
+     try:
+         service = {service_name.replace('-', '_').title()}Service()
+
+         # Extract parameters
+         messages = item.get("messages", [])
+         max_tokens = item.get("max_tokens", 512)
+         temperature = item.get("temperature", 0.7)
+         top_p = item.get("top_p", 0.9)
+
+         # Generate response (Modal methods must be invoked via .remote())
+         result = service.generate.remote(
+             messages=messages,
+             max_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p
+         )
+
+         return result
+
+     except Exception as e:
+         return {{
+             "error": str(e),
+             "error_type": type(e).__name__,
+             "endpoint": "inference_endpoint",
+             "timestamp": datetime.now().isoformat()
+         }}
+
+ @app.function(image=image)
+ @modal.web_endpoint(method="GET")
+ async def health_check():
+     """Health check endpoint"""
+     return {{
+         "status": "healthy",
+         "service": "{service_name}",
+         "timestamp": datetime.now().isoformat(),
+         "version": "1.0.0"
+     }}
+
+ @app.function(image=image)
+ @modal.web_endpoint(method="GET")
+ async def model_info():
+     """Model information endpoint"""
+     try:
+         service = {service_name.replace('-', '_').title()}Service()
+         return service.get_model_info.remote()
+     except Exception as e:
+         return {{
+             "error": str(e),
+             "timestamp": datetime.now().isoformat()
+         }}
+
+ # For local testing
+ if __name__ == "__main__":
+     # Test the service locally; .local() runs the method in-process
+     service = {service_name.replace('-', '_').title()}Service()
+     result = service.generate.local([
+         {{"role": "user", "content": "Hello! How are you today?"}}
+     ])
+     print(json.dumps(result, indent=2))
+ '''
+
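+     # Sketch of how the generated source is intended to flow through the
+     # manager (hypothetical call sequence and deployment_id; see
+     # _execute_modal_deployment below for the actual deploy step):
+     #
+     #     code = await manager._generate_modal_service_code(
+     #         "demo-llm", model_config, "llm", {})
+     #     service_file = Path("/tmp/demo_llm_service.py")
+     #     service_file.write_text(code)
+     #     result = await manager._execute_modal_deployment(
+     #         service_file, "demo-llm", model_config, "demo-llm-20250101")
+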
+     def _generate_vision_service_code(self, service_name: str, model_config: Any, config: Dict[str, Any]) -> str:
+         """Generate Vision service code for Modal"""
+         return f'# Vision service template for {service_name} - {model_config.model_id}'
+
+     def _generate_embedding_service_code(self, service_name: str, model_config: Any, config: Dict[str, Any]) -> str:
+         """Generate Embedding service code for Modal"""
+         return f'# Embedding service template for {service_name} - {model_config.model_id}'
+
+     async def _execute_modal_deployment(self,
+                                         service_file: Path,
+                                         service_name: str,
+                                         model_config: Any,
+                                         deployment_id: str) -> Dict[str, Any]:
+         """Execute the actual Modal deployment using the Modal CLI"""
+
+         logger.info(f"Executing Modal deployment for {service_name}...")
+
+         try:
+             import subprocess
+             import tempfile
+             import os
+
+             # Check if the modal CLI is available
+             modal_check = subprocess.run(["modal", "--version"],
+                                          capture_output=True, text=True, timeout=10)
+             if modal_check.returncode != 0:
+                 raise RuntimeError("Modal CLI not found. Please install Modal: pip install modal")
+
+             # Copy the service code into a temporary script for deployment
+             with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as tmp_file:
+                 with open(service_file, 'r') as src:
+                     tmp_file.write(src.read())
+                 tmp_script_path = tmp_file.name
+
+             try:
+                 # Execute Modal deployment
+                 logger.info(f"Deploying Modal service from {service_file}")
+                 deploy_result = subprocess.run(
+                     ["modal", "deploy", tmp_script_path],
+                     capture_output=True,
+                     text=True,
+                     timeout=300,  # 5 minute timeout
+                     cwd=service_file.parent
+                 )
+
+                 if deploy_result.returncode == 0:
+                     # Parse deployment output to extract endpoint URL
+                     output = deploy_result.stdout + deploy_result.stderr
+                     endpoint_url = self._extract_modal_endpoint(output, service_name, deployment_id)
+
+                     result = {
+                         "status": "deployed",
+                         "endpoint_url": endpoint_url,
+                         "deployment_id": deployment_id,
+                         "service_file": str(service_file),
+                         "model_architecture": getattr(model_config, 'architecture', 'unknown'),
+                         "deployment_output": output,
+                         "estimated_startup_time": "30-60 seconds"
+                     }
+
+                     logger.info(f"Modal deployment completed successfully: {endpoint_url}")
+                     return result
+
+                 else:
+                     error_output = deploy_result.stderr or deploy_result.stdout
+                     logger.error(f"Modal deployment failed: {error_output}")
+                     raise RuntimeError(f"Modal deployment failed: {error_output}")
+
+             finally:
+                 # Clean up temporary file
+                 if os.path.exists(tmp_script_path):
+                     os.unlink(tmp_script_path)
+
+         except subprocess.TimeoutExpired:
+             logger.error("Modal deployment timed out")
+             raise RuntimeError("Modal deployment timed out after 5 minutes")
+
+         except Exception as e:
+             logger.error(f"Failed to execute Modal deployment: {e}")
+             raise
+
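+     # What the subprocess call above amounts to, if run by hand (the script
+     # path is illustrative):
+     #
+     #     $ modal deploy /tmp/tmpabc123.py
+     #
+     # `modal deploy` pushes the app defined in the script and prints the
+     # deployed web endpoint URLs, which _extract_modal_endpoint then parses.
+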
+     def _extract_modal_endpoint(self, output: str, service_name: str, deployment_id: str) -> str:
+         """Extract Modal endpoint URL from deployment output"""
+         import re
+
+         # Look for typical Modal endpoint patterns in output
+         patterns = [
+             r'https://[a-zA-Z0-9\-]+--[a-zA-Z0-9\-]+\.modal\.run',
+             r'Deployed! Your app is at (https://[^\s]+)',
+             r'App deployed to (https://[^\s]+)',
+             r'Available at (https://[^\s]+)'
+         ]
+
+         for pattern in patterns:
+             match = re.search(pattern, output)
+             if match:
+                 url = match.group(1) if match.lastindex else match.group(0)
+                 logger.info(f"Extracted Modal endpoint: {url}")
+                 return url
+
+         # If no endpoint found in output, generate expected URL pattern
+         endpoint_url = f"https://{service_name}--{deployment_id}.modal.run"
+         logger.warning(f"Could not extract endpoint from output, using expected pattern: {endpoint_url}")
+         return endpoint_url
+
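+     # Doctest-style illustration of the extraction and its fallback
+     # (hypothetical output strings; behavior follows from the code above):
+     #
+     #     >>> mgr._extract_modal_endpoint(
+     #     ...     "Deployed! Your app is at https://org--app.modal.run", "svc", "dep-1")
+     #     'https://org--app.modal.run'
+     #     >>> mgr._extract_modal_endpoint("no url here", "svc", "dep-1")
+     #     'https://svc--dep-1.modal.run'
+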
+     def _get_optimal_gpu_config(self, model_config: Any) -> Dict[str, Any]:
+         """Determine optimal GPU configuration based on model size"""
+
+         # Get model parameters or estimate from model ID
+         parameters = getattr(model_config, 'parameters', None)
+         model_id = getattr(model_config, 'model_id', '')
+
+         # Estimate parameters from model name if not available
+         if not parameters:
+             if '7b' in model_id.lower():
+                 parameters = 7_000_000_000
+             elif '13b' in model_id.lower():
+                 parameters = 13_000_000_000
+             elif '70b' in model_id.lower():
+                 parameters = 70_000_000_000
+             elif 'large' in model_id.lower():
+                 parameters = 1_000_000_000
+             elif 'medium' in model_id.lower():
+                 parameters = 350_000_000
+             else:
+                 parameters = 500_000_000  # Default assumption
+
+         # Choose GPU based on model size
+         if parameters > 50_000_000_000:  # >50B parameters
+             return {"gpu_type": "A100", "gpu_count": 2}
+         elif parameters > 15_000_000_000:  # 15B-50B parameters
+             return {"gpu_type": "A100", "gpu_count": 1}
+         elif parameters > 3_000_000_000:  # 3B-15B parameters
+             return {"gpu_type": "A10G", "gpu_count": 1}
+         else:  # <3B parameters
+             return {"gpu_type": "T4", "gpu_count": 1}
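+
+     # Illustrative mapping produced by the heuristic above (model IDs hypothetical):
+     #
+     #     "mistral-7b-instruct" -> {"gpu_type": "A10G", "gpu_count": 1}  # 7B  falls in the 3B-15B tier
+     #     "llama-2-13b-chat"    -> {"gpu_type": "A10G", "gpu_count": 1}  # 13B falls in the 3B-15B tier
+     #     "llama-2-70b"         -> {"gpu_type": "A100", "gpu_count": 2}  # 70B falls in the >50B tier
+     #     "bert-large-uncased"  -> {"gpu_type": "T4", "gpu_count": 1}    # ~1B falls in the <3B tier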