isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +40 -17
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/storage/hf_storage.py +1 -1
  26. isa_model/core/types.py +1 -0
  27. isa_model/deployment/__init__.py +5 -48
  28. isa_model/deployment/core/__init__.py +2 -31
  29. isa_model/deployment/core/deployment_manager.py +1278 -370
  30. isa_model/deployment/local/__init__.py +31 -0
  31. isa_model/deployment/local/config.py +248 -0
  32. isa_model/deployment/local/gpu_gateway.py +607 -0
  33. isa_model/deployment/local/health_checker.py +428 -0
  34. isa_model/deployment/local/provider.py +586 -0
  35. isa_model/deployment/local/tensorrt_service.py +621 -0
  36. isa_model/deployment/local/transformers_service.py +644 -0
  37. isa_model/deployment/local/vllm_service.py +527 -0
  38. isa_model/deployment/modal/__init__.py +8 -0
  39. isa_model/deployment/modal/config.py +136 -0
  40. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  41. isa_model/deployment/modal/services/__init__.py +3 -0
  42. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  43. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  44. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  45. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  46. isa_model/deployment/modal/services/video/__init__.py +1 -0
  47. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  48. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  49. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  50. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  51. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  52. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  53. isa_model/deployment/storage/__init__.py +5 -0
  54. isa_model/deployment/storage/deployment_repository.py +824 -0
  55. isa_model/deployment/triton/__init__.py +10 -0
  56. isa_model/deployment/triton/config.py +196 -0
  57. isa_model/deployment/triton/configs/__init__.py +1 -0
  58. isa_model/deployment/triton/provider.py +512 -0
  59. isa_model/deployment/triton/scripts/__init__.py +1 -0
  60. isa_model/deployment/triton/templates/__init__.py +1 -0
  61. isa_model/inference/__init__.py +47 -1
  62. isa_model/inference/ai_factory.py +137 -10
  63. isa_model/inference/legacy_services/__init__.py +21 -0
  64. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  65. isa_model/inference/legacy_services/model_service.py +573 -0
  66. isa_model/inference/legacy_services/model_serving.py +717 -0
  67. isa_model/inference/legacy_services/model_training.py +561 -0
  68. isa_model/inference/models/__init__.py +21 -0
  69. isa_model/inference/models/inference_config.py +551 -0
  70. isa_model/inference/models/inference_record.py +675 -0
  71. isa_model/inference/models/performance_models.py +714 -0
  72. isa_model/inference/repositories/__init__.py +9 -0
  73. isa_model/inference/repositories/inference_repository.py +828 -0
  74. isa_model/inference/services/audio/base_stt_service.py +184 -11
  75. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  76. isa_model/inference/services/custom_model_manager.py +277 -0
  77. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  78. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  79. isa_model/inference/services/llm/__init__.py +10 -2
  80. isa_model/inference/services/llm/base_llm_service.py +335 -24
  81. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  82. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  83. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  84. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  85. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  86. isa_model/inference/services/llm/local_llm_service.py +747 -0
  87. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  88. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  89. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  90. isa_model/inference/services/vision/__init__.py +22 -1
  91. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  92. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  93. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  94. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  95. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  96. isa_model/serving/api/cache_manager.py +245 -0
  97. isa_model/serving/api/dependencies/__init__.py +1 -0
  98. isa_model/serving/api/dependencies/auth.py +194 -0
  99. isa_model/serving/api/dependencies/database.py +139 -0
  100. isa_model/serving/api/error_handlers.py +284 -0
  101. isa_model/serving/api/fastapi_server.py +172 -22
  102. isa_model/serving/api/middleware/auth.py +8 -2
  103. isa_model/serving/api/middleware/security.py +23 -33
  104. isa_model/serving/api/middleware/tenant_context.py +414 -0
  105. isa_model/serving/api/routes/analytics.py +4 -1
  106. isa_model/serving/api/routes/config.py +645 -0
  107. isa_model/serving/api/routes/deployment_billing.py +315 -0
  108. isa_model/serving/api/routes/deployments.py +138 -2
  109. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  110. isa_model/serving/api/routes/health.py +32 -12
  111. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  112. isa_model/serving/api/routes/local_deployments.py +448 -0
  113. isa_model/serving/api/routes/tenants.py +575 -0
  114. isa_model/serving/api/routes/unified.py +680 -18
  115. isa_model/serving/api/routes/webhooks.py +479 -0
  116. isa_model/serving/api/startup.py +68 -54
  117. isa_model/utils/gpu_utils.py +311 -0
  118. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
  119. isa_model-0.4.3.dist-info/RECORD +193 -0
  120. isa_model/core/storage/minio_storage.py +0 -0
  121. isa_model/deployment/cloud/__init__.py +0 -9
  122. isa_model/deployment/cloud/modal/__init__.py +0 -10
  123. isa_model/deployment/core/deployment_config.py +0 -356
  124. isa_model/deployment/core/isa_deployment_service.py +0 -401
  125. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  126. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  127. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  128. isa_model/deployment/runtime/deployed_service.py +0 -338
  129. isa_model/deployment/services/__init__.py +0 -9
  130. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  131. isa_model/deployment/services/model_service.py +0 -332
  132. isa_model/deployment/services/service_monitor.py +0 -356
  133. isa_model/deployment/services/service_registry.py +0 -527
  134. isa_model/eval/__init__.py +0 -92
  135. isa_model/eval/benchmarks/__init__.py +0 -27
  136. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  137. isa_model/eval/benchmarks.py +0 -701
  138. isa_model/eval/config/__init__.py +0 -10
  139. isa_model/eval/config/evaluation_config.py +0 -108
  140. isa_model/eval/evaluators/__init__.py +0 -24
  141. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  142. isa_model/eval/evaluators/base_evaluator.py +0 -503
  143. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  144. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  145. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  146. isa_model/eval/example_evaluation.py +0 -395
  147. isa_model/eval/factory.py +0 -798
  148. isa_model/eval/infrastructure/__init__.py +0 -24
  149. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  150. isa_model/eval/isa_benchmarks.py +0 -700
  151. isa_model/eval/isa_integration.py +0 -582
  152. isa_model/eval/metrics.py +0 -951
  153. isa_model/eval/tests/unit/test_basic.py +0 -396
  154. isa_model/serving/api/routes/evaluations.py +0 -579
  155. isa_model/training/__init__.py +0 -168
  156. isa_model/training/annotation/annotation_schema.py +0 -47
  157. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  158. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  159. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  160. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  161. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  162. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  163. isa_model/training/annotation/views/annotation_controller.py +0 -158
  164. isa_model/training/cloud/__init__.py +0 -22
  165. isa_model/training/cloud/job_orchestrator.py +0 -402
  166. isa_model/training/cloud/runpod_trainer.py +0 -454
  167. isa_model/training/cloud/storage_manager.py +0 -482
  168. isa_model/training/core/__init__.py +0 -26
  169. isa_model/training/core/config.py +0 -181
  170. isa_model/training/core/dataset.py +0 -222
  171. isa_model/training/core/trainer.py +0 -720
  172. isa_model/training/core/utils.py +0 -213
  173. isa_model/training/examples/intelligent_training_example.py +0 -281
  174. isa_model/training/factory.py +0 -424
  175. isa_model/training/intelligent/__init__.py +0 -25
  176. isa_model/training/intelligent/decision_engine.py +0 -643
  177. isa_model/training/intelligent/intelligent_factory.py +0 -888
  178. isa_model/training/intelligent/knowledge_base.py +0 -751
  179. isa_model/training/intelligent/resource_optimizer.py +0 -839
  180. isa_model/training/intelligent/task_classifier.py +0 -576
  181. isa_model/training/storage/__init__.py +0 -24
  182. isa_model/training/storage/core_integration.py +0 -439
  183. isa_model/training/storage/training_repository.py +0 -552
  184. isa_model/training/storage/training_storage.py +0 -628
  185. isa_model-0.4.0.dist-info/RECORD +0 -182
  186. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  187. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  188. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  189. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  190. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  191. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  192. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  193. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  194. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  195. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  196. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  197. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  198. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  199. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/deployment/local/vllm_service.py
@@ -0,0 +1,527 @@
+"""
+vLLM local inference service
+
+High-performance local model serving using vLLM.
+"""
+
+import os
+import json
+import asyncio
+import logging
+import subprocess
+import signal
+from typing import Dict, List, Optional, Any, Union, AsyncGenerator
+from pathlib import Path
+from datetime import datetime
+import httpx
+import time
+
+from .config import LocalGPUConfig, LocalServiceType, LocalBackend
+from ...utils.gpu_utils import get_gpu_manager, GPUInfo
+
+logger = logging.getLogger(__name__)
+
+
+class VLLMService:
+    """vLLM local inference service manager"""
+
+    def __init__(self, config: LocalGPUConfig):
+        """
+        Initialize vLLM service.
+
+        Args:
+            config: Local GPU configuration for vLLM
+        """
+        if config.backend != LocalBackend.VLLM:
+            raise ValueError("Config must use VLLM backend")
+
+        self.config = config
+        self.gpu_manager = get_gpu_manager()
+        self.process: Optional[subprocess.Popen] = None
+        self.service_url = f"http://{config.host}:{config.port}"
+        self.is_running = False
+        self.startup_time: Optional[datetime] = None
+
+        # Service info
+        self.service_info = {
+            "service_name": config.service_name,
+            "model_id": config.model_id,
+            "backend": "vllm",
+            "status": "stopped",
+            "url": self.service_url
+        }
+
+    async def start(self) -> Dict[str, Any]:
+        """
+        Start vLLM inference server.
+
+        Returns:
+            Service startup result
+        """
+        if self.is_running:
+            return {
+                "success": False,
+                "error": "Service already running",
+                "service_info": self.service_info
+            }
+
+        try:
+            logger.info(f"Starting vLLM service: {self.config.service_name}")
+
+            # Check GPU availability
+            gpu_check = await self._check_gpu_requirements()
+            if not gpu_check["compatible"]:
+                return {
+                    "success": False,
+                    "error": f"GPU requirements not met: {', '.join(gpu_check['warnings'])}",
+                    "gpu_check": gpu_check
+                }
+
+            # Prepare vLLM command
+            cmd = self._build_vllm_command()
+            logger.info(f"vLLM command: {' '.join(cmd)}")
+
+            # Start vLLM process
+            self.startup_time = datetime.now()
+            self.process = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                env=self._get_environment()
+            )
+
+            # Wait for service to be ready
+            startup_result = await self._wait_for_startup()
+
+            if startup_result["success"]:
+                self.is_running = True
+                self.service_info.update({
+                    "status": "running",
+                    "pid": self.process.pid,
+                    "started_at": self.startup_time.isoformat(),
+                    "model_info": await self._get_model_info()
+                })
+
+                logger.info(f"vLLM service started successfully: {self.service_url}")
+                return {
+                    "success": True,
+                    "service_info": self.service_info,
+                    "startup_time_seconds": startup_result["startup_time"],
+                    "gpu_info": gpu_check["selected_gpu"]
+                }
+            else:
+                await self.stop()
+                return {
+                    "success": False,
+                    "error": startup_result["error"],
+                    "logs": startup_result.get("logs", [])
+                }
+
+        except Exception as e:
+            logger.error(f"Failed to start vLLM service: {e}")
+            await self.stop()
+            return {
+                "success": False,
+                "error": str(e)
+            }
+
+    async def stop(self) -> Dict[str, Any]:
+        """
+        Stop vLLM inference server.
+
+        Returns:
+            Service shutdown result
+        """
+        if not self.is_running:
+            return {
+                "success": True,
+                "message": "Service was not running"
+            }
+
+        try:
+            logger.info(f"Stopping vLLM service: {self.config.service_name}")
+
+            if self.process:
+                # Graceful shutdown
+                self.process.terminate()
+
+                # Wait for graceful shutdown
+                try:
+                    self.process.wait(timeout=10)
+                except subprocess.TimeoutExpired:
+                    # Force kill if graceful shutdown fails
+                    logger.warning("Graceful shutdown timed out, force killing process")
+                    self.process.kill()
+                    self.process.wait(timeout=5)
+
+                self.process = None
+
+            self.is_running = False
+            self.service_info.update({
+                "status": "stopped",
+                "pid": None,
+                "stopped_at": datetime.now().isoformat()
+            })
+
+            logger.info(f"vLLM service stopped: {self.config.service_name}")
+            return {
+                "success": True,
+                "service_info": self.service_info
+            }
+
+        except Exception as e:
+            logger.error(f"Failed to stop vLLM service: {e}")
+            return {
+                "success": False,
+                "error": str(e)
+            }
+
+    async def restart(self) -> Dict[str, Any]:
+        """Restart vLLM service"""
+        stop_result = await self.stop()
+        if not stop_result["success"]:
+            return stop_result
+
+        # Wait a moment before restart
+        await asyncio.sleep(2)
+
+        return await self.start()
+
+    async def health_check(self) -> Dict[str, Any]:
+        """Check service health"""
+        if not self.is_running:
+            return {
+                "healthy": False,
+                "status": "stopped"
+            }
+
+        try:
+            async with httpx.AsyncClient(timeout=5.0) as client:
+                response = await client.get(f"{self.service_url}/health")
+
+                if response.status_code == 200:
+                    return {
+                        "healthy": True,
+                        "status": "running",
+                        "response_time_ms": response.elapsed.total_seconds() * 1000,
+                        "service_info": self.service_info
+                    }
+                else:
+                    return {
+                        "healthy": False,
+                        "status": "unhealthy",
+                        "status_code": response.status_code
+                    }
+
+        except Exception as e:
+            return {
+                "healthy": False,
+                "status": "error",
+                "error": str(e)
+            }
+
+    async def generate(self, prompt: str, **kwargs) -> Dict[str, Any]:
+        """Generate text using vLLM service"""
+        if not self.is_running:
+            return {
+                "success": False,
+                "error": "Service not running"
+            }
+
+        try:
+            request_data = {
+                "prompt": prompt,
+                "max_tokens": kwargs.get("max_tokens", 512),
+                "temperature": kwargs.get("temperature", 0.7),
+                "top_p": kwargs.get("top_p", 0.9),
+                "stream": kwargs.get("stream", False)
+            }
+
+            async with httpx.AsyncClient(timeout=60.0) as client:
+                response = await client.post(
+                    f"{self.service_url}/generate",
+                    json=request_data
+                )
+
+                if response.status_code == 200:
+                    return {
+                        "success": True,
+                        **response.json()
+                    }
+                else:
+                    return {
+                        "success": False,
+                        "error": f"API error: {response.status_code}",
+                        "response": response.text
+                    }
+
+        except Exception as e:
+            return {
+                "success": False,
+                "error": str(e)
+            }
+
+    async def chat_completions(self, messages: List[Dict[str, str]], **kwargs) -> Dict[str, Any]:
+        """OpenAI-compatible chat completions endpoint"""
+        if not self.is_running:
+            return {
+                "success": False,
+                "error": "Service not running"
+            }
+
+        try:
+            request_data = {
+                "model": self.config.served_model_name or self.config.model_id,
+                "messages": messages,
+                "max_tokens": kwargs.get("max_tokens", 512),
+                "temperature": kwargs.get("temperature", 0.7),
+                "top_p": kwargs.get("top_p", 0.9),
+                "stream": kwargs.get("stream", False)
+            }
+
+            async with httpx.AsyncClient(timeout=60.0) as client:
+                response = await client.post(
+                    f"{self.service_url}/v1/chat/completions",
+                    json=request_data,
+                    headers={"Authorization": f"Bearer {self.config.api_key}"} if self.config.api_key else {}
+                )
+
+                if response.status_code == 200:
+                    return {
+                        "success": True,
+                        **response.json()
+                    }
+                else:
+                    return {
+                        "success": False,
+                        "error": f"API error: {response.status_code}",
+                        "response": response.text
+                    }
+
+        except Exception as e:
+            return {
+                "success": False,
+                "error": str(e)
+            }
+
+    def _build_vllm_command(self) -> List[str]:
+        """Build vLLM server command"""
+        cmd = ["python", "-m", "vllm.entrypoints.openai.api_server"]
+
+        # Basic model configuration
+        cmd.extend(["--model", self.config.model_id])
+        cmd.extend(["--host", self.config.host])
+        cmd.extend(["--port", str(self.config.port)])
+
+        # Model configuration
+        if self.config.served_model_name:
+            cmd.extend(["--served-model-name", self.config.served_model_name])
+
+        cmd.extend(["--max-model-len", str(self.config.max_model_len)])
+        cmd.extend(["--max-num-seqs", str(self.config.max_num_seqs)])
+
+        # GPU configuration
+        if self.config.gpu_id is not None:
+            cmd.extend(["--tensor-parallel-size", str(self.config.tensor_parallel_size)])
+
+        cmd.extend(["--gpu-memory-utilization", str(self.config.gpu_memory_utilization)])
+        cmd.extend(["--swap-space", str(self.config.swap_space)])
+
+        # Performance settings
+        if self.config.enable_chunked_prefill:
+            cmd.append("--enable-chunked-prefill")
+
+        if self.config.enable_prefix_caching:
+            cmd.append("--enable-prefix-caching")
+
+        # Precision and quantization
+        if self.config.model_precision == "float16":
+            cmd.extend(["--dtype", "float16"])
+        elif self.config.model_precision == "bfloat16":
+            cmd.extend(["--dtype", "bfloat16"])
+
+        if self.config.quantization:
+            cmd.extend(["--quantization", self.config.quantization])
+        if self.config.quantization_param_path:
+            cmd.extend(["--quantization-param-path", self.config.quantization_param_path])
+
+        # Trust remote code
+        if self.config.trust_remote_code:
+            cmd.append("--trust-remote-code")
+
+        # Model revisions
+        if self.config.revision:
+            cmd.extend(["--revision", self.config.revision])
+        if self.config.tokenizer_revision:
+            cmd.extend(["--tokenizer-revision", self.config.tokenizer_revision])
+
+        # Additional vLLM arguments
+        for key, value in self.config.vllm_args.items():
+            if isinstance(value, bool):
+                if value:
+                    cmd.append(f"--{key}")
+            else:
+                cmd.extend([f"--{key}", str(value)])
+
+        return cmd
+
+    def _get_environment(self) -> Dict[str, str]:
+        """Get environment variables for vLLM"""
+        env = os.environ.copy()
+
+        # CUDA configuration
+        if self.config.gpu_id is not None:
+            env["CUDA_VISIBLE_DEVICES"] = str(self.config.gpu_id)
+
+        # Cache directories
+        if self.config.model_cache_dir:
+            env["TRANSFORMERS_CACHE"] = self.config.model_cache_dir
+            env["HF_HOME"] = self.config.model_cache_dir
+
+        if self.config.download_dir:
+            env["HF_HUB_CACHE"] = self.config.download_dir
+
+        # Performance optimizations
+        env["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
+        env["OMP_NUM_THREADS"] = "8"
+
+        return env
+
+    async def _check_gpu_requirements(self) -> Dict[str, Any]:
+        """Check GPU requirements for the model"""
+        self.gpu_manager.refresh()
+
+        if not self.gpu_manager.cuda_available:
+            return {
+                "compatible": False,
+                "warnings": ["CUDA not available"],
+                "selected_gpu": None
+            }
+
+        # Estimate memory requirements
+        estimated_memory = self.gpu_manager.estimate_model_memory(
+            self.config.model_id,
+            self.config.model_precision
+        )
+
+        # Find suitable GPU
+        if self.config.gpu_id is not None:
+            selected_gpu = self.gpu_manager.get_gpu_info(self.config.gpu_id)
+            if not selected_gpu:
+                return {
+                    "compatible": False,
+                    "warnings": [f"Specified GPU {self.config.gpu_id} not found"],
+                    "selected_gpu": None
+                }
+        else:
+            selected_gpu = self.gpu_manager.get_best_gpu(estimated_memory)
+            if selected_gpu:
+                self.config.gpu_id = selected_gpu.gpu_id
+
+        if not selected_gpu:
+            return {
+                "compatible": False,
+                "warnings": [
+                    f"No suitable GPU found. Required: {estimated_memory}MB, "
+                    f"Available: {max(gpu.memory_free for gpu in self.gpu_manager.gpus) if self.gpu_manager.gpus else 0}MB"
+                ],
+                "selected_gpu": None
+            }
+
+        warnings = []
+
+        # Check memory requirements
+        required_memory = int(estimated_memory * self.config.gpu_memory_utilization)
+        if selected_gpu.memory_free < required_memory:
+            warnings.append(f"GPU memory may be insufficient: {selected_gpu.memory_free}MB available, {required_memory}MB required")
+
+        # Check utilization
+        if selected_gpu.utilization > 80:
+            warnings.append(f"GPU utilization is high: {selected_gpu.utilization}%")
+
+        return {
+            "compatible": True,
+            "warnings": warnings,
+            "selected_gpu": {
+                "gpu_id": selected_gpu.gpu_id,
+                "name": selected_gpu.name,
+                "memory_total": selected_gpu.memory_total,
+                "memory_free": selected_gpu.memory_free,
+                "utilization": selected_gpu.utilization,
+                "estimated_memory_required": estimated_memory
+            }
+        }
+
+    async def _wait_for_startup(self, timeout: int = 300) -> Dict[str, Any]:
+        """Wait for vLLM service to start"""
+        start_time = time.time()
+        logs = []
+
+        while time.time() - start_time < timeout:
+            # Check if process is still running
+            if self.process and self.process.poll() is not None:
+                # Process died
+                stdout, stderr = self.process.communicate()
+                return {
+                    "success": False,
+                    "error": "vLLM process died during startup",
+                    "logs": logs + [stdout, stderr]
+                }
+
+            # Try to connect to service
+            try:
+                async with httpx.AsyncClient(timeout=2.0) as client:
+                    response = await client.get(f"{self.service_url}/health")
+                    if response.status_code == 200:
+                        startup_time = time.time() - start_time
+                        return {
+                            "success": True,
+                            "startup_time": startup_time
+                        }
+            except:
+                pass
+
+            # Collect logs
+            if self.process:
+                try:
+                    # Non-blocking read of logs
+                    import select
+                    if hasattr(select, 'select'):
+                        ready, _, _ = select.select([self.process.stdout], [], [], 0.1)
+                        if ready:
+                            line = self.process.stdout.readline()
+                            if line:
+                                logs.append(line.strip())
+                                logger.debug(f"vLLM: {line.strip()}")
+                except:
+                    pass
+
+            await asyncio.sleep(2)
+
+        return {
+            "success": False,
+            "error": f"Startup timeout after {timeout} seconds",
+            "logs": logs
+        }
+
+    async def _get_model_info(self) -> Optional[Dict[str, Any]]:
+        """Get model information from vLLM service"""
+        try:
+            async with httpx.AsyncClient(timeout=5.0) as client:
+                response = await client.get(f"{self.service_url}/v1/models")
+                if response.status_code == 200:
+                    return response.json()
+        except:
+            pass
+        return None
+
+    def get_service_info(self) -> Dict[str, Any]:
+        """Get current service information"""
+        return {
+            **self.service_info,
+            "config": self.config.to_dict(),
+            "process_id": self.process.pid if self.process else None,
+            "is_running": self.is_running,
+            "startup_time": self.startup_time.isoformat() if self.startup_time else None
+        }
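
A minimal usage sketch of the new local vLLM service. The LocalGPUConfig constructor arguments below are assumptions inferred from the attributes VLLMService reads (the actual field names and defaults live in isa_model/deployment/local/config.py), and the model id is only an example:

import asyncio
from isa_model.deployment.local.config import LocalBackend, LocalGPUConfig
from isa_model.deployment.local.vllm_service import VLLMService

async def main():
    # Hypothetical configuration values; only backend, host, port, service_name
    # and model_id are known to be used by VLLMService.
    config = LocalGPUConfig(
        service_name="local-llm",
        model_id="Qwen/Qwen2.5-7B-Instruct",
        backend=LocalBackend.VLLM,
        host="127.0.0.1",
        port=8001,
    )
    service = VLLMService(config)
    started = await service.start()   # selects a GPU and launches the OpenAI-compatible server
    if started["success"]:
        reply = await service.chat_completions(
            [{"role": "user", "content": "Hello"}], max_tokens=64
        )
        print(reply)
    await service.stop()

asyncio.run(main())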
isa_model/deployment/modal/__init__.py
@@ -0,0 +1,8 @@
+"""
+Modal deployment services and utilities
+"""
+
+from .deployer import ModalDeployer
+from .config import ModalConfig, ModalServiceType, create_llm_config, create_vision_config, create_audio_config, create_embedding_config
+
+__all__ = ["ModalDeployer", "ModalConfig", "ModalServiceType", "create_llm_config", "create_vision_config", "create_audio_config", "create_embedding_config"]
isa_model/deployment/modal/config.py
@@ -0,0 +1,136 @@
+"""
+Modal deployment configuration
+
+Simplified configuration for Modal-specific deployments.
+"""
+
+from dataclasses import dataclass, field
+from typing import Dict, Any, Optional
+from enum import Enum
+
+
+class ModalServiceType(Enum):
+    """Modal service types"""
+    LLM = "llm"
+    VISION = "vision"
+    AUDIO = "audio"
+    EMBEDDING = "embedding"
+    VIDEO = "video"
+
+
+@dataclass
+class ModalConfig:
+    """Configuration for Modal deployment"""
+
+    # Service identification
+    service_name: str
+    service_type: ModalServiceType
+    model_id: str
+
+    # Modal-specific settings
+    image_tag: str = "latest"
+    cpu_cores: int = 2
+    memory_gb: int = 8
+    gpu_type: Optional[str] = None  # e.g., "A10G", "T4", "A100"
+    timeout_seconds: int = 300
+
+    # Scaling configuration
+    min_instances: int = 0
+    max_instances: int = 10
+    concurrency_limit: int = 1
+
+    # Environment variables
+    environment: Dict[str, str] = field(default_factory=dict)
+
+    # Service-specific configuration
+    service_config: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for serialization"""
+        return {
+            "service_name": self.service_name,
+            "service_type": self.service_type.value,
+            "model_id": self.model_id,
+            "image_tag": self.image_tag,
+            "cpu_cores": self.cpu_cores,
+            "memory_gb": self.memory_gb,
+            "gpu_type": self.gpu_type,
+            "timeout_seconds": self.timeout_seconds,
+            "min_instances": self.min_instances,
+            "max_instances": self.max_instances,
+            "concurrency_limit": self.concurrency_limit,
+            "environment": self.environment,
+            "service_config": self.service_config
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ModalConfig":
+        """Create from dictionary"""
+        return cls(
+            service_name=data["service_name"],
+            service_type=ModalServiceType(data["service_type"]),
+            model_id=data["model_id"],
+            image_tag=data.get("image_tag", "latest"),
+            cpu_cores=data.get("cpu_cores", 2),
+            memory_gb=data.get("memory_gb", 8),
+            gpu_type=data.get("gpu_type"),
+            timeout_seconds=data.get("timeout_seconds", 300),
+            min_instances=data.get("min_instances", 0),
+            max_instances=data.get("max_instances", 10),
+            concurrency_limit=data.get("concurrency_limit", 1),
+            environment=data.get("environment", {}),
+            service_config=data.get("service_config", {})
+        )
+
+
+# Predefined configurations for common service types
+def create_llm_config(service_name: str, model_id: str, gpu_type: str = "A10G") -> ModalConfig:
+    """Create configuration for LLM service"""
+    return ModalConfig(
+        service_name=service_name,
+        service_type=ModalServiceType.LLM,
+        model_id=model_id,
+        gpu_type=gpu_type,
+        memory_gb=16,
+        timeout_seconds=600,
+        max_instances=5
+    )
+
+
+def create_vision_config(service_name: str, model_id: str, gpu_type: str = "T4") -> ModalConfig:
+    """Create configuration for vision service"""
+    return ModalConfig(
+        service_name=service_name,
+        service_type=ModalServiceType.VISION,
+        model_id=model_id,
+        gpu_type=gpu_type,
+        memory_gb=12,
+        timeout_seconds=300,
+        max_instances=10
+    )
+
+
+def create_audio_config(service_name: str, model_id: str, gpu_type: str = "T4") -> ModalConfig:
+    """Create configuration for audio service"""
+    return ModalConfig(
+        service_name=service_name,
+        service_type=ModalServiceType.AUDIO,
+        model_id=model_id,
+        gpu_type=gpu_type,
+        memory_gb=8,
+        timeout_seconds=300,
+        max_instances=8
+    )
+
+
+def create_embedding_config(service_name: str, model_id: str, gpu_type: str = "T4") -> ModalConfig:
+    """Create configuration for embedding service"""
+    return ModalConfig(
+        service_name=service_name,
+        service_type=ModalServiceType.EMBEDDING,
+        model_id=model_id,
+        gpu_type=gpu_type,
+        memory_gb=6,
+        timeout_seconds=120,
+        max_instances=15
+    )
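
For reference, a short sketch of how the new Modal config helpers compose; every name used here is defined in the config.py hunk above, the module path follows from the file location, and the model id is only an example:

from isa_model.deployment.modal.config import ModalConfig, ModalServiceType, create_llm_config

cfg = create_llm_config("demo-llm", "meta-llama/Llama-3.1-8B-Instruct", gpu_type="A10G")
payload = cfg.to_dict()                     # plain dict, suitable for persisting a deployment record
restored = ModalConfig.from_dict(payload)   # round-trips back to an equivalent ModalConfig
assert restored.service_type is ModalServiceType.LLM and restored.memory_gb == 16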
isa_model/deployment/modal/deployer.py (renamed from isa_model/deployment/services/auto_hf_modal_deployer.py)
@@ -34,7 +34,7 @@ class ModelConfig:
     max_tokens: int = 2048
     estimated_cost_per_hour: float = 0.0
 
-class HuggingFaceModalDeployer:
+class ModalDeployer:
     """
     Service to automatically deploy HuggingFace models to Modal
     """
isa_model/deployment/modal/services/__init__.py
@@ -0,0 +1,3 @@
+"""
+Modal service implementations organized by capability
+"""
isa_model/deployment/modal/services/audio/__init__.py
@@ -0,0 +1 @@
+"""Audio services for Modal deployment"""
isa_model/deployment/modal/services/embedding/__init__.py
@@ -0,0 +1 @@
+"""Embedding services for Modal deployment"""
isa_model/deployment/modal/services/llm/__init__.py
@@ -0,0 +1 @@
+"""LLM services for Modal deployment"""