isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. isa_model/client.py +1166 -584
  2. isa_model/core/cache/redis_cache.py +410 -0
  3. isa_model/core/config/config_manager.py +282 -12
  4. isa_model/core/config.py +91 -1
  5. isa_model/core/database/__init__.py +1 -0
  6. isa_model/core/database/direct_db_client.py +114 -0
  7. isa_model/core/database/migration_manager.py +563 -0
  8. isa_model/core/database/migrations.py +297 -0
  9. isa_model/core/database/supabase_client.py +258 -0
  10. isa_model/core/dependencies.py +316 -0
  11. isa_model/core/discovery/__init__.py +19 -0
  12. isa_model/core/discovery/consul_discovery.py +190 -0
  13. isa_model/core/logging/__init__.py +54 -0
  14. isa_model/core/logging/influx_logger.py +523 -0
  15. isa_model/core/logging/loki_logger.py +160 -0
  16. isa_model/core/models/__init__.py +46 -0
  17. isa_model/core/models/config_models.py +625 -0
  18. isa_model/core/models/deployment_billing_tracker.py +430 -0
  19. isa_model/core/models/model_billing_tracker.py +60 -88
  20. isa_model/core/models/model_manager.py +66 -25
  21. isa_model/core/models/model_metadata.py +690 -0
  22. isa_model/core/models/model_repo.py +217 -55
  23. isa_model/core/models/model_statistics_tracker.py +234 -0
  24. isa_model/core/models/model_storage.py +0 -1
  25. isa_model/core/models/model_version_manager.py +959 -0
  26. isa_model/core/models/system_models.py +857 -0
  27. isa_model/core/pricing_manager.py +2 -249
  28. isa_model/core/repositories/__init__.py +9 -0
  29. isa_model/core/repositories/config_repository.py +912 -0
  30. isa_model/core/resilience/circuit_breaker.py +366 -0
  31. isa_model/core/security/secrets.py +358 -0
  32. isa_model/core/services/__init__.py +2 -4
  33. isa_model/core/services/intelligent_model_selector.py +479 -370
  34. isa_model/core/storage/hf_storage.py +2 -2
  35. isa_model/core/types.py +8 -0
  36. isa_model/deployment/__init__.py +5 -48
  37. isa_model/deployment/core/__init__.py +2 -31
  38. isa_model/deployment/core/deployment_manager.py +1278 -368
  39. isa_model/deployment/local/__init__.py +31 -0
  40. isa_model/deployment/local/config.py +248 -0
  41. isa_model/deployment/local/gpu_gateway.py +607 -0
  42. isa_model/deployment/local/health_checker.py +428 -0
  43. isa_model/deployment/local/provider.py +586 -0
  44. isa_model/deployment/local/tensorrt_service.py +621 -0
  45. isa_model/deployment/local/transformers_service.py +644 -0
  46. isa_model/deployment/local/vllm_service.py +527 -0
  47. isa_model/deployment/modal/__init__.py +8 -0
  48. isa_model/deployment/modal/config.py +136 -0
  49. isa_model/deployment/modal/deployer.py +894 -0
  50. isa_model/deployment/modal/services/__init__.py +3 -0
  51. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  52. isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
  53. isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
  54. isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
  55. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  56. isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
  57. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  58. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  59. isa_model/deployment/modal/services/video/__init__.py +1 -0
  60. isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
  61. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  62. isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
  63. isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
  64. isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
  65. isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
  66. isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
  67. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  68. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  69. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  70. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  71. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  72. isa_model/deployment/storage/__init__.py +5 -0
  73. isa_model/deployment/storage/deployment_repository.py +824 -0
  74. isa_model/deployment/triton/__init__.py +10 -0
  75. isa_model/deployment/triton/config.py +196 -0
  76. isa_model/deployment/triton/configs/__init__.py +1 -0
  77. isa_model/deployment/triton/provider.py +512 -0
  78. isa_model/deployment/triton/scripts/__init__.py +1 -0
  79. isa_model/deployment/triton/templates/__init__.py +1 -0
  80. isa_model/inference/__init__.py +47 -1
  81. isa_model/inference/ai_factory.py +179 -16
  82. isa_model/inference/legacy_services/__init__.py +21 -0
  83. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  84. isa_model/inference/legacy_services/model_service.py +573 -0
  85. isa_model/inference/legacy_services/model_serving.py +717 -0
  86. isa_model/inference/legacy_services/model_training.py +561 -0
  87. isa_model/inference/models/__init__.py +21 -0
  88. isa_model/inference/models/inference_config.py +551 -0
  89. isa_model/inference/models/inference_record.py +675 -0
  90. isa_model/inference/models/performance_models.py +714 -0
  91. isa_model/inference/repositories/__init__.py +9 -0
  92. isa_model/inference/repositories/inference_repository.py +828 -0
  93. isa_model/inference/services/audio/__init__.py +21 -0
  94. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  95. isa_model/inference/services/audio/base_stt_service.py +184 -11
  96. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  97. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  98. isa_model/inference/services/audio/openai_stt_service.py +53 -11
  99. isa_model/inference/services/base_service.py +17 -1
  100. isa_model/inference/services/custom_model_manager.py +277 -0
  101. isa_model/inference/services/embedding/__init__.py +13 -0
  102. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  103. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  104. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  105. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  106. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  107. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  108. isa_model/inference/services/img/__init__.py +2 -2
  109. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  110. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  111. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  112. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  113. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  114. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  115. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  116. isa_model/inference/services/llm/__init__.py +10 -2
  117. isa_model/inference/services/llm/base_llm_service.py +361 -26
  118. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  119. isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
  120. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  121. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  122. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  123. isa_model/inference/services/llm/local_llm_service.py +747 -0
  124. isa_model/inference/services/llm/ollama_llm_service.py +11 -3
  125. isa_model/inference/services/llm/openai_llm_service.py +670 -56
  126. isa_model/inference/services/llm/yyds_llm_service.py +10 -3
  127. isa_model/inference/services/vision/__init__.py +27 -6
  128. isa_model/inference/services/vision/base_vision_service.py +118 -185
  129. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  130. isa_model/inference/services/vision/helpers/image_utils.py +19 -10
  131. isa_model/inference/services/vision/isa_vision_service.py +634 -0
  132. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  133. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  134. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  135. isa_model/serving/api/cache_manager.py +245 -0
  136. isa_model/serving/api/dependencies/__init__.py +1 -0
  137. isa_model/serving/api/dependencies/auth.py +194 -0
  138. isa_model/serving/api/dependencies/database.py +139 -0
  139. isa_model/serving/api/error_handlers.py +284 -0
  140. isa_model/serving/api/fastapi_server.py +240 -18
  141. isa_model/serving/api/middleware/auth.py +317 -0
  142. isa_model/serving/api/middleware/security.py +268 -0
  143. isa_model/serving/api/middleware/tenant_context.py +414 -0
  144. isa_model/serving/api/routes/analytics.py +489 -0
  145. isa_model/serving/api/routes/config.py +645 -0
  146. isa_model/serving/api/routes/deployment_billing.py +315 -0
  147. isa_model/serving/api/routes/deployments.py +475 -0
  148. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  149. isa_model/serving/api/routes/health.py +32 -12
  150. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  151. isa_model/serving/api/routes/local_deployments.py +448 -0
  152. isa_model/serving/api/routes/logs.py +430 -0
  153. isa_model/serving/api/routes/settings.py +582 -0
  154. isa_model/serving/api/routes/tenants.py +575 -0
  155. isa_model/serving/api/routes/unified.py +992 -171
  156. isa_model/serving/api/routes/webhooks.py +479 -0
  157. isa_model/serving/api/startup.py +318 -0
  158. isa_model/serving/modal_proxy_server.py +249 -0
  159. isa_model/utils/gpu_utils.py +311 -0
  160. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
  161. isa_model-0.4.3.dist-info/RECORD +193 -0
  162. isa_model/deployment/cloud/__init__.py +0 -9
  163. isa_model/deployment/cloud/modal/__init__.py +0 -10
  164. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  165. isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
  166. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
  167. isa_model/deployment/cloud/modal/register_models.py +0 -321
  168. isa_model/deployment/core/deployment_config.py +0 -356
  169. isa_model/deployment/core/isa_deployment_service.py +0 -401
  170. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  171. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  172. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  173. isa_model/deployment/runtime/deployed_service.py +0 -338
  174. isa_model/deployment/services/__init__.py +0 -9
  175. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  176. isa_model/deployment/services/model_service.py +0 -332
  177. isa_model/deployment/services/service_monitor.py +0 -356
  178. isa_model/deployment/services/service_registry.py +0 -527
  179. isa_model/eval/__init__.py +0 -92
  180. isa_model/eval/benchmarks.py +0 -469
  181. isa_model/eval/config/__init__.py +0 -10
  182. isa_model/eval/config/evaluation_config.py +0 -108
  183. isa_model/eval/evaluators/__init__.py +0 -18
  184. isa_model/eval/evaluators/base_evaluator.py +0 -503
  185. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  186. isa_model/eval/factory.py +0 -531
  187. isa_model/eval/infrastructure/__init__.py +0 -24
  188. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  189. isa_model/eval/metrics.py +0 -798
  190. isa_model/inference/adapter/unified_api.py +0 -248
  191. isa_model/inference/services/helpers/stacked_config.py +0 -148
  192. isa_model/inference/services/img/flux_professional_service.py +0 -603
  193. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  194. isa_model/inference/services/others/table_transformer_service.py +0 -61
  195. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  196. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  197. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  198. isa_model/scripts/inference_tracker.py +0 -283
  199. isa_model/scripts/mlflow_manager.py +0 -379
  200. isa_model/scripts/model_registry.py +0 -465
  201. isa_model/scripts/register_models.py +0 -370
  202. isa_model/scripts/register_models_with_embeddings.py +0 -510
  203. isa_model/scripts/start_mlflow.py +0 -95
  204. isa_model/scripts/training_tracker.py +0 -257
  205. isa_model/training/__init__.py +0 -74
  206. isa_model/training/annotation/annotation_schema.py +0 -47
  207. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  208. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  209. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  210. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  211. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  212. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  213. isa_model/training/annotation/views/annotation_controller.py +0 -158
  214. isa_model/training/cloud/__init__.py +0 -22
  215. isa_model/training/cloud/job_orchestrator.py +0 -402
  216. isa_model/training/cloud/runpod_trainer.py +0 -454
  217. isa_model/training/cloud/storage_manager.py +0 -482
  218. isa_model/training/core/__init__.py +0 -23
  219. isa_model/training/core/config.py +0 -181
  220. isa_model/training/core/dataset.py +0 -222
  221. isa_model/training/core/trainer.py +0 -720
  222. isa_model/training/core/utils.py +0 -213
  223. isa_model/training/factory.py +0 -424
  224. isa_model-0.3.91.dist-info/RECORD +0 -138
  225. /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
  226. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  227. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  228. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/deployment/local/vllm_service.py
@@ -0,0 +1,527 @@
+ """
+ vLLM local inference service
+
+ High-performance local model serving using vLLM.
+ """
+
+ import os
+ import json
+ import asyncio
+ import logging
+ import subprocess
+ import signal
+ from typing import Dict, List, Optional, Any, Union, AsyncGenerator
+ from pathlib import Path
+ from datetime import datetime
+ import httpx
+ import time
+
+ from .config import LocalGPUConfig, LocalServiceType, LocalBackend
+ from ...utils.gpu_utils import get_gpu_manager, GPUInfo
+
+ logger = logging.getLogger(__name__)
+
+
+ class VLLMService:
+     """vLLM local inference service manager"""
+
+     def __init__(self, config: LocalGPUConfig):
+         """
+         Initialize vLLM service.
+
+         Args:
+             config: Local GPU configuration for vLLM
+         """
+         if config.backend != LocalBackend.VLLM:
+             raise ValueError("Config must use VLLM backend")
+
+         self.config = config
+         self.gpu_manager = get_gpu_manager()
+         self.process: Optional[subprocess.Popen] = None
+         self.service_url = f"http://{config.host}:{config.port}"
+         self.is_running = False
+         self.startup_time: Optional[datetime] = None
+
+         # Service info
+         self.service_info = {
+             "service_name": config.service_name,
+             "model_id": config.model_id,
+             "backend": "vllm",
+             "status": "stopped",
+             "url": self.service_url
+         }
+
+     async def start(self) -> Dict[str, Any]:
+         """
+         Start vLLM inference server.
+
+         Returns:
+             Service startup result
+         """
+         if self.is_running:
+             return {
+                 "success": False,
+                 "error": "Service already running",
+                 "service_info": self.service_info
+             }
+
+         try:
+             logger.info(f"Starting vLLM service: {self.config.service_name}")
+
+             # Check GPU availability
+             gpu_check = await self._check_gpu_requirements()
+             if not gpu_check["compatible"]:
+                 return {
+                     "success": False,
+                     "error": f"GPU requirements not met: {', '.join(gpu_check['warnings'])}",
+                     "gpu_check": gpu_check
+                 }
+
+             # Prepare vLLM command
+             cmd = self._build_vllm_command()
+             logger.info(f"vLLM command: {' '.join(cmd)}")
+
+             # Start vLLM process
+             self.startup_time = datetime.now()
+             self.process = subprocess.Popen(
+                 cmd,
+                 stdout=subprocess.PIPE,
+                 stderr=subprocess.PIPE,
+                 text=True,
+                 env=self._get_environment()
+             )
+
+             # Wait for service to be ready
+             startup_result = await self._wait_for_startup()
+
+             if startup_result["success"]:
+                 self.is_running = True
+                 self.service_info.update({
+                     "status": "running",
+                     "pid": self.process.pid,
+                     "started_at": self.startup_time.isoformat(),
+                     "model_info": await self._get_model_info()
+                 })
+
+                 logger.info(f"vLLM service started successfully: {self.service_url}")
+                 return {
+                     "success": True,
+                     "service_info": self.service_info,
+                     "startup_time_seconds": startup_result["startup_time"],
+                     "gpu_info": gpu_check["selected_gpu"]
+                 }
+             else:
+                 await self.stop()
+                 return {
+                     "success": False,
+                     "error": startup_result["error"],
+                     "logs": startup_result.get("logs", [])
+                 }
+
+         except Exception as e:
+             logger.error(f"Failed to start vLLM service: {e}")
+             await self.stop()
+             return {
+                 "success": False,
+                 "error": str(e)
+             }
+
+     async def stop(self) -> Dict[str, Any]:
+         """
+         Stop vLLM inference server.
+
+         Returns:
+             Service shutdown result
+         """
+         if not self.is_running:
+             return {
+                 "success": True,
+                 "message": "Service was not running"
+             }
+
+         try:
+             logger.info(f"Stopping vLLM service: {self.config.service_name}")
+
+             if self.process:
+                 # Graceful shutdown
+                 self.process.terminate()
+
+                 # Wait for graceful shutdown
+                 try:
+                     self.process.wait(timeout=10)
+                 except subprocess.TimeoutExpired:
+                     # Force kill if graceful shutdown fails
+                     logger.warning("Graceful shutdown timed out, force killing process")
+                     self.process.kill()
+                     self.process.wait(timeout=5)
+
+                 self.process = None
+
+             self.is_running = False
+             self.service_info.update({
+                 "status": "stopped",
+                 "pid": None,
+                 "stopped_at": datetime.now().isoformat()
+             })
+
+             logger.info(f"vLLM service stopped: {self.config.service_name}")
+             return {
+                 "success": True,
+                 "service_info": self.service_info
+             }
+
+         except Exception as e:
+             logger.error(f"Failed to stop vLLM service: {e}")
+             return {
+                 "success": False,
+                 "error": str(e)
+             }
+
+     async def restart(self) -> Dict[str, Any]:
+         """Restart vLLM service"""
+         stop_result = await self.stop()
+         if not stop_result["success"]:
+             return stop_result
+
+         # Wait a moment before restart
+         await asyncio.sleep(2)
+
+         return await self.start()
+
+     async def health_check(self) -> Dict[str, Any]:
+         """Check service health"""
+         if not self.is_running:
+             return {
+                 "healthy": False,
+                 "status": "stopped"
+             }
+
+         try:
+             async with httpx.AsyncClient(timeout=5.0) as client:
+                 response = await client.get(f"{self.service_url}/health")
+
+                 if response.status_code == 200:
+                     return {
+                         "healthy": True,
+                         "status": "running",
+                         "response_time_ms": response.elapsed.total_seconds() * 1000,
+                         "service_info": self.service_info
+                     }
+                 else:
+                     return {
+                         "healthy": False,
+                         "status": "unhealthy",
+                         "status_code": response.status_code
+                     }
+
+         except Exception as e:
+             return {
+                 "healthy": False,
+                 "status": "error",
+                 "error": str(e)
+             }
+
+     async def generate(self, prompt: str, **kwargs) -> Dict[str, Any]:
+         """Generate text using vLLM service"""
+         if not self.is_running:
+             return {
+                 "success": False,
+                 "error": "Service not running"
+             }
+
+         try:
+             request_data = {
+                 "prompt": prompt,
+                 "max_tokens": kwargs.get("max_tokens", 512),
+                 "temperature": kwargs.get("temperature", 0.7),
+                 "top_p": kwargs.get("top_p", 0.9),
+                 "stream": kwargs.get("stream", False)
+             }
+
+             async with httpx.AsyncClient(timeout=60.0) as client:
+                 response = await client.post(
+                     f"{self.service_url}/generate",
+                     json=request_data
+                 )
+
+                 if response.status_code == 200:
+                     return {
+                         "success": True,
+                         **response.json()
+                     }
+                 else:
+                     return {
+                         "success": False,
+                         "error": f"API error: {response.status_code}",
+                         "response": response.text
+                     }
+
+         except Exception as e:
+             return {
+                 "success": False,
+                 "error": str(e)
+             }
+
+     async def chat_completions(self, messages: List[Dict[str, str]], **kwargs) -> Dict[str, Any]:
+         """OpenAI-compatible chat completions endpoint"""
+         if not self.is_running:
+             return {
+                 "success": False,
+                 "error": "Service not running"
+             }
+
+         try:
+             request_data = {
+                 "model": self.config.served_model_name or self.config.model_id,
+                 "messages": messages,
+                 "max_tokens": kwargs.get("max_tokens", 512),
+                 "temperature": kwargs.get("temperature", 0.7),
+                 "top_p": kwargs.get("top_p", 0.9),
+                 "stream": kwargs.get("stream", False)
+             }
+
+             async with httpx.AsyncClient(timeout=60.0) as client:
+                 response = await client.post(
+                     f"{self.service_url}/v1/chat/completions",
+                     json=request_data,
+                     headers={"Authorization": f"Bearer {self.config.api_key}"} if self.config.api_key else {}
+                 )
+
+                 if response.status_code == 200:
+                     return {
+                         "success": True,
+                         **response.json()
+                     }
+                 else:
+                     return {
+                         "success": False,
+                         "error": f"API error: {response.status_code}",
+                         "response": response.text
+                     }
+
+         except Exception as e:
+             return {
+                 "success": False,
+                 "error": str(e)
+             }
+
+     def _build_vllm_command(self) -> List[str]:
+         """Build vLLM server command"""
+         cmd = ["python", "-m", "vllm.entrypoints.openai.api_server"]
+
+         # Basic model configuration
+         cmd.extend(["--model", self.config.model_id])
+         cmd.extend(["--host", self.config.host])
+         cmd.extend(["--port", str(self.config.port)])
+
+         # Model configuration
+         if self.config.served_model_name:
+             cmd.extend(["--served-model-name", self.config.served_model_name])
+
+         cmd.extend(["--max-model-len", str(self.config.max_model_len)])
+         cmd.extend(["--max-num-seqs", str(self.config.max_num_seqs)])
+
+         # GPU configuration
+         if self.config.gpu_id is not None:
+             cmd.extend(["--tensor-parallel-size", str(self.config.tensor_parallel_size)])
+
+         cmd.extend(["--gpu-memory-utilization", str(self.config.gpu_memory_utilization)])
+         cmd.extend(["--swap-space", str(self.config.swap_space)])
+
+         # Performance settings
+         if self.config.enable_chunked_prefill:
+             cmd.append("--enable-chunked-prefill")
+
+         if self.config.enable_prefix_caching:
+             cmd.append("--enable-prefix-caching")
+
+         # Precision and quantization
+         if self.config.model_precision == "float16":
+             cmd.extend(["--dtype", "float16"])
+         elif self.config.model_precision == "bfloat16":
+             cmd.extend(["--dtype", "bfloat16"])
+
+         if self.config.quantization:
+             cmd.extend(["--quantization", self.config.quantization])
+             if self.config.quantization_param_path:
+                 cmd.extend(["--quantization-param-path", self.config.quantization_param_path])
+
+         # Trust remote code
+         if self.config.trust_remote_code:
+             cmd.append("--trust-remote-code")
+
+         # Model revisions
+         if self.config.revision:
+             cmd.extend(["--revision", self.config.revision])
+         if self.config.tokenizer_revision:
+             cmd.extend(["--tokenizer-revision", self.config.tokenizer_revision])
+
+         # Additional vLLM arguments
+         for key, value in self.config.vllm_args.items():
+             if isinstance(value, bool):
+                 if value:
+                     cmd.append(f"--{key}")
+             else:
+                 cmd.extend([f"--{key}", str(value)])
+
+         return cmd
+
+     def _get_environment(self) -> Dict[str, str]:
+         """Get environment variables for vLLM"""
+         env = os.environ.copy()
+
+         # CUDA configuration
+         if self.config.gpu_id is not None:
+             env["CUDA_VISIBLE_DEVICES"] = str(self.config.gpu_id)
+
+         # Cache directories
+         if self.config.model_cache_dir:
+             env["TRANSFORMERS_CACHE"] = self.config.model_cache_dir
+             env["HF_HOME"] = self.config.model_cache_dir
+
+         if self.config.download_dir:
+             env["HF_HUB_CACHE"] = self.config.download_dir
+
+         # Performance optimizations
+         env["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
+         env["OMP_NUM_THREADS"] = "8"
+
+         return env
+
+     async def _check_gpu_requirements(self) -> Dict[str, Any]:
+         """Check GPU requirements for the model"""
+         self.gpu_manager.refresh()
+
+         if not self.gpu_manager.cuda_available:
+             return {
+                 "compatible": False,
+                 "warnings": ["CUDA not available"],
+                 "selected_gpu": None
+             }
+
+         # Estimate memory requirements
+         estimated_memory = self.gpu_manager.estimate_model_memory(
+             self.config.model_id,
+             self.config.model_precision
+         )
+
+         # Find suitable GPU
+         if self.config.gpu_id is not None:
+             selected_gpu = self.gpu_manager.get_gpu_info(self.config.gpu_id)
+             if not selected_gpu:
+                 return {
+                     "compatible": False,
+                     "warnings": [f"Specified GPU {self.config.gpu_id} not found"],
+                     "selected_gpu": None
+                 }
+         else:
+             selected_gpu = self.gpu_manager.get_best_gpu(estimated_memory)
+             if selected_gpu:
+                 self.config.gpu_id = selected_gpu.gpu_id
+
+         if not selected_gpu:
+             return {
+                 "compatible": False,
+                 "warnings": [
+                     f"No suitable GPU found. Required: {estimated_memory}MB, "
+                     f"Available: {max(gpu.memory_free for gpu in self.gpu_manager.gpus) if self.gpu_manager.gpus else 0}MB"
+                 ],
+                 "selected_gpu": None
+             }
+
+         warnings = []
+
+         # Check memory requirements
+         required_memory = int(estimated_memory * self.config.gpu_memory_utilization)
+         if selected_gpu.memory_free < required_memory:
+             warnings.append(f"GPU memory may be insufficient: {selected_gpu.memory_free}MB available, {required_memory}MB required")
+
+         # Check utilization
+         if selected_gpu.utilization > 80:
+             warnings.append(f"GPU utilization is high: {selected_gpu.utilization}%")
+
+         return {
+             "compatible": True,
+             "warnings": warnings,
+             "selected_gpu": {
+                 "gpu_id": selected_gpu.gpu_id,
+                 "name": selected_gpu.name,
+                 "memory_total": selected_gpu.memory_total,
+                 "memory_free": selected_gpu.memory_free,
+                 "utilization": selected_gpu.utilization,
+                 "estimated_memory_required": estimated_memory
+             }
+         }
+
+     async def _wait_for_startup(self, timeout: int = 300) -> Dict[str, Any]:
+         """Wait for vLLM service to start"""
+         start_time = time.time()
+         logs = []
+
+         while time.time() - start_time < timeout:
+             # Check if process is still running
+             if self.process and self.process.poll() is not None:
+                 # Process died
+                 stdout, stderr = self.process.communicate()
+                 return {
+                     "success": False,
+                     "error": "vLLM process died during startup",
+                     "logs": logs + [stdout, stderr]
+                 }
+
+             # Try to connect to service
+             try:
+                 async with httpx.AsyncClient(timeout=2.0) as client:
+                     response = await client.get(f"{self.service_url}/health")
+                     if response.status_code == 200:
+                         startup_time = time.time() - start_time
+                         return {
+                             "success": True,
+                             "startup_time": startup_time
+                         }
+             except:
+                 pass
+
+             # Collect logs
+             if self.process:
+                 try:
+                     # Non-blocking read of logs
+                     import select
+                     if hasattr(select, 'select'):
+                         ready, _, _ = select.select([self.process.stdout], [], [], 0.1)
+                         if ready:
+                             line = self.process.stdout.readline()
+                             if line:
+                                 logs.append(line.strip())
+                                 logger.debug(f"vLLM: {line.strip()}")
+                 except:
+                     pass
+
+             await asyncio.sleep(2)
+
+         return {
+             "success": False,
+             "error": f"Startup timeout after {timeout} seconds",
+             "logs": logs
+         }
+
+     async def _get_model_info(self) -> Optional[Dict[str, Any]]:
+         """Get model information from vLLM service"""
+         try:
+             async with httpx.AsyncClient(timeout=5.0) as client:
+                 response = await client.get(f"{self.service_url}/v1/models")
+                 if response.status_code == 200:
+                     return response.json()
+         except:
+             pass
+         return None
+
+     def get_service_info(self) -> Dict[str, Any]:
+         """Get current service information"""
+         return {
+             **self.service_info,
+             "config": self.config.to_dict(),
+             "process_id": self.process.pid if self.process else None,
+             "is_running": self.is_running,
+             "startup_time": self.startup_time.isoformat() if self.startup_time else None
+         }
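
A quick usage sketch for the service above (illustrative, not part of the diff): it assumes LocalGPUConfig, defined in isa_model/deployment/local/config.py and not shown here, accepts these fields as keyword arguments, which is inferred only from the attributes VLLMService reads on self.config.

import asyncio

from isa_model.deployment.local.config import LocalGPUConfig, LocalBackend
from isa_model.deployment.local.vllm_service import VLLMService

async def main() -> None:
    # Field names inferred from VLLMService; values are hypothetical.
    config = LocalGPUConfig(
        service_name="qwen-local",
        model_id="Qwen/Qwen2.5-7B-Instruct",
        backend=LocalBackend.VLLM,
        host="127.0.0.1",
        port=8000,
    )
    service = VLLMService(config)

    result = await service.start()  # selects a GPU, spawns vLLM, polls /health
    if result["success"]:
        reply = await service.chat_completions(
            [{"role": "user", "content": "Hello"}]
        )
        print(reply)
        await service.stop()  # terminate(), then kill() after a 10 s grace period
    else:
        print("startup failed:", result["error"])

asyncio.run(main())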
isa_model/deployment/modal/__init__.py
@@ -0,0 +1,8 @@
+ """
+ Modal deployment services and utilities
+ """
+
+ from .deployer import ModalDeployer
+ from .config import ModalConfig, ModalServiceType, create_llm_config, create_vision_config, create_audio_config, create_embedding_config
+
+ __all__ = ["ModalDeployer", "ModalConfig", "ModalServiceType", "create_llm_config", "create_vision_config", "create_audio_config", "create_embedding_config"]
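
This package supersedes the deleted isa_model/deployment/cloud/modal tree listed in the files-changed summary above, so callers import from the new location. A one-line sketch of the new path, with names taken from the __all__ above:

# The isa_model.deployment.cloud.modal package is removed in 0.4.3;
# Modal helpers now live under isa_model.deployment.modal.
from isa_model.deployment.modal import ModalDeployer, ModalConfig, create_llm_config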
isa_model/deployment/modal/config.py
@@ -0,0 +1,136 @@
+ """
+ Modal deployment configuration
+
+ Simplified configuration for Modal-specific deployments.
+ """
+
+ from dataclasses import dataclass, field
+ from typing import Dict, Any, Optional
+ from enum import Enum
+
+
+ class ModalServiceType(Enum):
+     """Modal service types"""
+     LLM = "llm"
+     VISION = "vision"
+     AUDIO = "audio"
+     EMBEDDING = "embedding"
+     VIDEO = "video"
+
+
+ @dataclass
+ class ModalConfig:
+     """Configuration for Modal deployment"""
+
+     # Service identification
+     service_name: str
+     service_type: ModalServiceType
+     model_id: str
+
+     # Modal-specific settings
+     image_tag: str = "latest"
+     cpu_cores: int = 2
+     memory_gb: int = 8
+     gpu_type: Optional[str] = None  # e.g., "A10G", "T4", "A100"
+     timeout_seconds: int = 300
+
+     # Scaling configuration
+     min_instances: int = 0
+     max_instances: int = 10
+     concurrency_limit: int = 1
+
+     # Environment variables
+     environment: Dict[str, str] = field(default_factory=dict)
+
+     # Service-specific configuration
+     service_config: Dict[str, Any] = field(default_factory=dict)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary for serialization"""
+         return {
+             "service_name": self.service_name,
+             "service_type": self.service_type.value,
+             "model_id": self.model_id,
+             "image_tag": self.image_tag,
+             "cpu_cores": self.cpu_cores,
+             "memory_gb": self.memory_gb,
+             "gpu_type": self.gpu_type,
+             "timeout_seconds": self.timeout_seconds,
+             "min_instances": self.min_instances,
+             "max_instances": self.max_instances,
+             "concurrency_limit": self.concurrency_limit,
+             "environment": self.environment,
+             "service_config": self.service_config
+         }
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> "ModalConfig":
+         """Create from dictionary"""
+         return cls(
+             service_name=data["service_name"],
+             service_type=ModalServiceType(data["service_type"]),
+             model_id=data["model_id"],
+             image_tag=data.get("image_tag", "latest"),
+             cpu_cores=data.get("cpu_cores", 2),
+             memory_gb=data.get("memory_gb", 8),
+             gpu_type=data.get("gpu_type"),
+             timeout_seconds=data.get("timeout_seconds", 300),
+             min_instances=data.get("min_instances", 0),
+             max_instances=data.get("max_instances", 10),
+             concurrency_limit=data.get("concurrency_limit", 1),
+             environment=data.get("environment", {}),
+             service_config=data.get("service_config", {})
+         )
+
+
+ # Predefined configurations for common service types
+ def create_llm_config(service_name: str, model_id: str, gpu_type: str = "A10G") -> ModalConfig:
+     """Create configuration for LLM service"""
+     return ModalConfig(
+         service_name=service_name,
+         service_type=ModalServiceType.LLM,
+         model_id=model_id,
+         gpu_type=gpu_type,
+         memory_gb=16,
+         timeout_seconds=600,
+         max_instances=5
+     )
+
+
+ def create_vision_config(service_name: str, model_id: str, gpu_type: str = "T4") -> ModalConfig:
+     """Create configuration for vision service"""
+     return ModalConfig(
+         service_name=service_name,
+         service_type=ModalServiceType.VISION,
+         model_id=model_id,
+         gpu_type=gpu_type,
+         memory_gb=12,
+         timeout_seconds=300,
+         max_instances=10
+     )
+
+
+ def create_audio_config(service_name: str, model_id: str, gpu_type: str = "T4") -> ModalConfig:
+     """Create configuration for audio service"""
+     return ModalConfig(
+         service_name=service_name,
+         service_type=ModalServiceType.AUDIO,
+         model_id=model_id,
+         gpu_type=gpu_type,
+         memory_gb=8,
+         timeout_seconds=300,
+         max_instances=8
+     )
+
+
+ def create_embedding_config(service_name: str, model_id: str, gpu_type: str = "T4") -> ModalConfig:
+     """Create configuration for embedding service"""
+     return ModalConfig(
+         service_name=service_name,
+         service_type=ModalServiceType.EMBEDDING,
+         model_id=model_id,
+         gpu_type=gpu_type,
+         memory_gb=6,
+         timeout_seconds=120,
+         max_instances=15
+     )
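
For illustration, a short sketch (assumed usage, not from the diff) of how a preset composes with the serialization round trip defined above; the model_id is a placeholder:

# Build an LLM preset, then round-trip it through the dict form.
cfg = create_llm_config("demo-llm", "meta-llama/Llama-3.1-8B-Instruct", gpu_type="A100")

data = cfg.to_dict()
assert data["service_type"] == "llm"   # Enum is serialized via .value
assert data["memory_gb"] == 16         # LLM preset overrides the dataclass default of 8

restored = ModalConfig.from_dict(data)
assert restored.service_type is ModalServiceType.LLM
assert restored == cfg                 # dataclass equality holds after the round trip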