isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. isa_model/client.py +1166 -584
  2. isa_model/core/cache/redis_cache.py +410 -0
  3. isa_model/core/config/config_manager.py +282 -12
  4. isa_model/core/config.py +91 -1
  5. isa_model/core/database/__init__.py +1 -0
  6. isa_model/core/database/direct_db_client.py +114 -0
  7. isa_model/core/database/migration_manager.py +563 -0
  8. isa_model/core/database/migrations.py +297 -0
  9. isa_model/core/database/supabase_client.py +258 -0
  10. isa_model/core/dependencies.py +316 -0
  11. isa_model/core/discovery/__init__.py +19 -0
  12. isa_model/core/discovery/consul_discovery.py +190 -0
  13. isa_model/core/logging/__init__.py +54 -0
  14. isa_model/core/logging/influx_logger.py +523 -0
  15. isa_model/core/logging/loki_logger.py +160 -0
  16. isa_model/core/models/__init__.py +46 -0
  17. isa_model/core/models/config_models.py +625 -0
  18. isa_model/core/models/deployment_billing_tracker.py +430 -0
  19. isa_model/core/models/model_billing_tracker.py +60 -88
  20. isa_model/core/models/model_manager.py +66 -25
  21. isa_model/core/models/model_metadata.py +690 -0
  22. isa_model/core/models/model_repo.py +217 -55
  23. isa_model/core/models/model_statistics_tracker.py +234 -0
  24. isa_model/core/models/model_storage.py +0 -1
  25. isa_model/core/models/model_version_manager.py +959 -0
  26. isa_model/core/models/system_models.py +857 -0
  27. isa_model/core/pricing_manager.py +2 -249
  28. isa_model/core/repositories/__init__.py +9 -0
  29. isa_model/core/repositories/config_repository.py +912 -0
  30. isa_model/core/resilience/circuit_breaker.py +366 -0
  31. isa_model/core/security/secrets.py +358 -0
  32. isa_model/core/services/__init__.py +2 -4
  33. isa_model/core/services/intelligent_model_selector.py +479 -370
  34. isa_model/core/storage/hf_storage.py +2 -2
  35. isa_model/core/types.py +8 -0
  36. isa_model/deployment/__init__.py +5 -48
  37. isa_model/deployment/core/__init__.py +2 -31
  38. isa_model/deployment/core/deployment_manager.py +1278 -368
  39. isa_model/deployment/local/__init__.py +31 -0
  40. isa_model/deployment/local/config.py +248 -0
  41. isa_model/deployment/local/gpu_gateway.py +607 -0
  42. isa_model/deployment/local/health_checker.py +428 -0
  43. isa_model/deployment/local/provider.py +586 -0
  44. isa_model/deployment/local/tensorrt_service.py +621 -0
  45. isa_model/deployment/local/transformers_service.py +644 -0
  46. isa_model/deployment/local/vllm_service.py +527 -0
  47. isa_model/deployment/modal/__init__.py +8 -0
  48. isa_model/deployment/modal/config.py +136 -0
  49. isa_model/deployment/modal/deployer.py +894 -0
  50. isa_model/deployment/modal/services/__init__.py +3 -0
  51. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  52. isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
  53. isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
  54. isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
  55. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  56. isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
  57. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  58. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  59. isa_model/deployment/modal/services/video/__init__.py +1 -0
  60. isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
  61. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  62. isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
  63. isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
  64. isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
  65. isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
  66. isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
  67. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  68. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  69. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  70. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  71. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  72. isa_model/deployment/storage/__init__.py +5 -0
  73. isa_model/deployment/storage/deployment_repository.py +824 -0
  74. isa_model/deployment/triton/__init__.py +10 -0
  75. isa_model/deployment/triton/config.py +196 -0
  76. isa_model/deployment/triton/configs/__init__.py +1 -0
  77. isa_model/deployment/triton/provider.py +512 -0
  78. isa_model/deployment/triton/scripts/__init__.py +1 -0
  79. isa_model/deployment/triton/templates/__init__.py +1 -0
  80. isa_model/inference/__init__.py +47 -1
  81. isa_model/inference/ai_factory.py +179 -16
  82. isa_model/inference/legacy_services/__init__.py +21 -0
  83. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  84. isa_model/inference/legacy_services/model_service.py +573 -0
  85. isa_model/inference/legacy_services/model_serving.py +717 -0
  86. isa_model/inference/legacy_services/model_training.py +561 -0
  87. isa_model/inference/models/__init__.py +21 -0
  88. isa_model/inference/models/inference_config.py +551 -0
  89. isa_model/inference/models/inference_record.py +675 -0
  90. isa_model/inference/models/performance_models.py +714 -0
  91. isa_model/inference/repositories/__init__.py +9 -0
  92. isa_model/inference/repositories/inference_repository.py +828 -0
  93. isa_model/inference/services/audio/__init__.py +21 -0
  94. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  95. isa_model/inference/services/audio/base_stt_service.py +184 -11
  96. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  97. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  98. isa_model/inference/services/audio/openai_stt_service.py +53 -11
  99. isa_model/inference/services/base_service.py +17 -1
  100. isa_model/inference/services/custom_model_manager.py +277 -0
  101. isa_model/inference/services/embedding/__init__.py +13 -0
  102. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  103. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  104. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  105. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  106. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  107. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  108. isa_model/inference/services/img/__init__.py +2 -2
  109. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  110. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  111. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  112. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  113. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  114. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  115. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  116. isa_model/inference/services/llm/__init__.py +10 -2
  117. isa_model/inference/services/llm/base_llm_service.py +361 -26
  118. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  119. isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
  120. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  121. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  122. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  123. isa_model/inference/services/llm/local_llm_service.py +747 -0
  124. isa_model/inference/services/llm/ollama_llm_service.py +11 -3
  125. isa_model/inference/services/llm/openai_llm_service.py +670 -56
  126. isa_model/inference/services/llm/yyds_llm_service.py +10 -3
  127. isa_model/inference/services/vision/__init__.py +27 -6
  128. isa_model/inference/services/vision/base_vision_service.py +118 -185
  129. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  130. isa_model/inference/services/vision/helpers/image_utils.py +19 -10
  131. isa_model/inference/services/vision/isa_vision_service.py +634 -0
  132. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  133. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  134. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  135. isa_model/serving/api/cache_manager.py +245 -0
  136. isa_model/serving/api/dependencies/__init__.py +1 -0
  137. isa_model/serving/api/dependencies/auth.py +194 -0
  138. isa_model/serving/api/dependencies/database.py +139 -0
  139. isa_model/serving/api/error_handlers.py +284 -0
  140. isa_model/serving/api/fastapi_server.py +240 -18
  141. isa_model/serving/api/middleware/auth.py +317 -0
  142. isa_model/serving/api/middleware/security.py +268 -0
  143. isa_model/serving/api/middleware/tenant_context.py +414 -0
  144. isa_model/serving/api/routes/analytics.py +489 -0
  145. isa_model/serving/api/routes/config.py +645 -0
  146. isa_model/serving/api/routes/deployment_billing.py +315 -0
  147. isa_model/serving/api/routes/deployments.py +475 -0
  148. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  149. isa_model/serving/api/routes/health.py +32 -12
  150. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  151. isa_model/serving/api/routes/local_deployments.py +448 -0
  152. isa_model/serving/api/routes/logs.py +430 -0
  153. isa_model/serving/api/routes/settings.py +582 -0
  154. isa_model/serving/api/routes/tenants.py +575 -0
  155. isa_model/serving/api/routes/unified.py +992 -171
  156. isa_model/serving/api/routes/webhooks.py +479 -0
  157. isa_model/serving/api/startup.py +318 -0
  158. isa_model/serving/modal_proxy_server.py +249 -0
  159. isa_model/utils/gpu_utils.py +311 -0
  160. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
  161. isa_model-0.4.3.dist-info/RECORD +193 -0
  162. isa_model/deployment/cloud/__init__.py +0 -9
  163. isa_model/deployment/cloud/modal/__init__.py +0 -10
  164. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  165. isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
  166. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
  167. isa_model/deployment/cloud/modal/register_models.py +0 -321
  168. isa_model/deployment/core/deployment_config.py +0 -356
  169. isa_model/deployment/core/isa_deployment_service.py +0 -401
  170. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  171. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  172. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  173. isa_model/deployment/runtime/deployed_service.py +0 -338
  174. isa_model/deployment/services/__init__.py +0 -9
  175. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  176. isa_model/deployment/services/model_service.py +0 -332
  177. isa_model/deployment/services/service_monitor.py +0 -356
  178. isa_model/deployment/services/service_registry.py +0 -527
  179. isa_model/eval/__init__.py +0 -92
  180. isa_model/eval/benchmarks.py +0 -469
  181. isa_model/eval/config/__init__.py +0 -10
  182. isa_model/eval/config/evaluation_config.py +0 -108
  183. isa_model/eval/evaluators/__init__.py +0 -18
  184. isa_model/eval/evaluators/base_evaluator.py +0 -503
  185. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  186. isa_model/eval/factory.py +0 -531
  187. isa_model/eval/infrastructure/__init__.py +0 -24
  188. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  189. isa_model/eval/metrics.py +0 -798
  190. isa_model/inference/adapter/unified_api.py +0 -248
  191. isa_model/inference/services/helpers/stacked_config.py +0 -148
  192. isa_model/inference/services/img/flux_professional_service.py +0 -603
  193. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  194. isa_model/inference/services/others/table_transformer_service.py +0 -61
  195. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  196. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  197. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  198. isa_model/scripts/inference_tracker.py +0 -283
  199. isa_model/scripts/mlflow_manager.py +0 -379
  200. isa_model/scripts/model_registry.py +0 -465
  201. isa_model/scripts/register_models.py +0 -370
  202. isa_model/scripts/register_models_with_embeddings.py +0 -510
  203. isa_model/scripts/start_mlflow.py +0 -95
  204. isa_model/scripts/training_tracker.py +0 -257
  205. isa_model/training/__init__.py +0 -74
  206. isa_model/training/annotation/annotation_schema.py +0 -47
  207. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  208. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  209. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  210. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  211. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  212. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  213. isa_model/training/annotation/views/annotation_controller.py +0 -158
  214. isa_model/training/cloud/__init__.py +0 -22
  215. isa_model/training/cloud/job_orchestrator.py +0 -402
  216. isa_model/training/cloud/runpod_trainer.py +0 -454
  217. isa_model/training/cloud/storage_manager.py +0 -482
  218. isa_model/training/core/__init__.py +0 -23
  219. isa_model/training/core/config.py +0 -181
  220. isa_model/training/core/dataset.py +0 -222
  221. isa_model/training/core/trainer.py +0 -720
  222. isa_model/training/core/utils.py +0 -213
  223. isa_model/training/factory.py +0 -424
  224. isa_model-0.3.91.dist-info/RECORD +0 -138
  225. /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
  226. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  227. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  228. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/serving/api/routes/local_deployments.py
@@ -0,0 +1,448 @@
+"""
+Local GPU deployments API routes
+
+Endpoints for managing local GPU model deployments.
+"""
+
+import logging
+from typing import Dict, List, Optional, Any
+from fastapi import APIRouter, HTTPException, Depends, BackgroundTasks
+from pydantic import BaseModel, Field
+
+from ....deployment.core.deployment_manager import DeploymentManager
+from ....deployment.local.config import (
+    LocalGPUConfig, LocalServiceType, LocalBackend,
+    create_vllm_config, create_tensorrt_config, create_transformers_config
+)
+from ...middleware.auth import get_current_user
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(prefix="/api/v1/local", tags=["local-deployments"])
+
+
+# Request/Response Models
+class LocalDeployRequest(BaseModel):
+    """Local deployment request"""
+    service_name: str = Field(..., description="Unique service name")
+    model_id: str = Field(..., description="HuggingFace model ID")
+    backend: str = Field("transformers", description="Inference backend (vllm, tensorrt_llm, transformers)")
+    service_type: str = Field("llm", description="Service type (llm, vision, audio, embedding)")
+
+    # Model configuration
+    model_precision: str = Field("float16", description="Model precision")
+    max_model_len: int = Field(2048, description="Maximum sequence length")
+    max_batch_size: int = Field(8, description="Maximum batch size")
+
+    # GPU settings
+    gpu_id: Optional[int] = Field(None, description="Specific GPU ID to use")
+    gpu_memory_utilization: float = Field(0.9, description="GPU memory utilization fraction")
+
+    # Performance settings
+    tensor_parallel_size: int = Field(1, description="Tensor parallel size")
+    enable_chunked_prefill: bool = Field(True, description="Enable chunked prefill")
+    enable_prefix_caching: bool = Field(True, description="Enable prefix caching")
+
+    # Quantization
+    quantization: Optional[str] = Field(None, description="Quantization method (int8, int4, awq, gptq)")
+
+    # Advanced settings
+    trust_remote_code: bool = Field(False, description="Trust remote code in model")
+    revision: Optional[str] = Field(None, description="Model revision")
+
+    # Backend-specific settings
+    vllm_args: Dict[str, Any] = Field(default_factory=dict, description="Additional vLLM arguments")
+    tensorrt_args: Dict[str, Any] = Field(default_factory=dict, description="Additional TensorRT arguments")
+    transformers_args: Dict[str, Any] = Field(default_factory=dict, description="Additional Transformers arguments")
+
+
+class LocalServiceInfo(BaseModel):
+    """Local service information"""
+    service_name: str
+    model_id: str
+    backend: str
+    service_type: str
+    status: str
+    healthy: bool
+    response_time_ms: Optional[float] = None
+    error_count: int = 0
+    uptime_seconds: Optional[float] = None
+    deployed_at: Optional[str] = None
+
+
+class GenerateRequest(BaseModel):
+    """Text generation request"""
+    prompt: str = Field(..., description="Input prompt")
+    max_tokens: int = Field(512, description="Maximum tokens to generate")
+    temperature: float = Field(0.7, description="Sampling temperature")
+    top_p: float = Field(0.9, description="Top-p sampling")
+    top_k: int = Field(50, description="Top-k sampling")
+    stream: bool = Field(False, description="Stream response")
+
+
+class ChatCompletionRequest(BaseModel):
+    """Chat completion request"""
+    messages: List[Dict[str, str]] = Field(..., description="Chat messages")
+    max_tokens: int = Field(512, description="Maximum tokens to generate")
+    temperature: float = Field(0.7, description="Sampling temperature")
+    top_p: float = Field(0.9, description="Top-p sampling")
+    stream: bool = Field(False, description="Stream response")
+
+
+# Dependency injection
+async def get_deployment_manager() -> DeploymentManager:
+    """Get deployment manager instance"""
+    return DeploymentManager()
+
+
+@router.get("/status", summary="Get local GPU system status")
+async def get_local_status(
+    manager: DeploymentManager = Depends(get_deployment_manager)
+):
+    """Get overall local GPU system status including available resources"""
+    try:
+        status = await manager.get_local_system_status()
+        return {"success": True, "status": status}
+    except Exception as e:
+        logger.error(f"Failed to get local status: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/deploy", summary="Deploy model to local GPU")
+async def deploy_local_service(
+    request: LocalDeployRequest,
+    background_tasks: BackgroundTasks,
+    manager: DeploymentManager = Depends(get_deployment_manager),
+    current_user: Optional[Dict] = Depends(get_current_user)
+):
+    """Deploy a model service to local GPU"""
+    try:
+        # Convert request to configuration
+        config = LocalGPUConfig(
+            service_name=request.service_name,
+            service_type=LocalServiceType(request.service_type),
+            model_id=request.model_id,
+            backend=LocalBackend(request.backend),
+            model_precision=request.model_precision,
+            max_model_len=request.max_model_len,
+            max_batch_size=request.max_batch_size,
+            gpu_id=request.gpu_id,
+            gpu_memory_utilization=request.gpu_memory_utilization,
+            tensor_parallel_size=request.tensor_parallel_size,
+            enable_chunked_prefill=request.enable_chunked_prefill,
+            enable_prefix_caching=request.enable_prefix_caching,
+            quantization=request.quantization,
+            trust_remote_code=request.trust_remote_code,
+            revision=request.revision,
+            vllm_args=request.vllm_args,
+            tensorrt_args=request.tensorrt_args,
+            transformers_args=request.transformers_args
+        )
+
+        # Deploy service
+        result = await manager.deploy_to_local(config)
+
+        if result["success"]:
+            return {
+                "success": True,
+                "message": f"Service {request.service_name} deployed successfully",
+                "deployment": result
+            }
+        else:
+            raise HTTPException(status_code=400, detail=result.get("error", "Deployment failed"))
+
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=f"Invalid configuration: {e}")
+    except Exception as e:
+        logger.error(f"Local deployment failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/services", summary="List local GPU services")
+async def list_local_services(
+    manager: DeploymentManager = Depends(get_deployment_manager)
+) -> Dict[str, Any]:
+    """List all deployed local GPU services"""
+    try:
+        services = await manager.list_local_services()
+        return {
+            "success": True,
+            "services": services,
+            "count": len(services)
+        }
+    except Exception as e:
+        logger.error(f"Failed to list local services: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/services/{service_name}", summary="Get local service information")
+async def get_local_service(
+    service_name: str,
+    manager: DeploymentManager = Depends(get_deployment_manager)
+):
+    """Get detailed information about a specific local service"""
+    try:
+        service_info = await manager.get_local_service_info(service_name)
+
+        if service_info is None:
+            raise HTTPException(status_code=404, detail=f"Service {service_name} not found")
+
+        return {
+            "success": True,
+            "service": service_info
+        }
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Failed to get service info for {service_name}: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.delete("/services/{service_name}", summary="Undeploy local service")
+async def undeploy_local_service(
+    service_name: str,
+    manager: DeploymentManager = Depends(get_deployment_manager),
+    current_user: Optional[Dict] = Depends(get_current_user)
+):
+    """Stop and remove a deployed local service"""
+    try:
+        result = await manager.undeploy_local_service(service_name)
+
+        if result["success"]:
+            return {
+                "success": True,
+                "message": f"Service {service_name} undeployed successfully"
+            }
+        else:
+            raise HTTPException(status_code=400, detail=result.get("error", "Undeploy failed"))
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Failed to undeploy service {service_name}: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/services/{service_name}/generate", summary="Generate text using local service")
+async def generate_text(
+    service_name: str,
+    request: GenerateRequest,
+    manager: DeploymentManager = Depends(get_deployment_manager)
+):
+    """Generate text using a deployed local service"""
+    try:
+        # Get the local provider and call generate_text
+        local_provider = manager.local_provider
+
+        result = await local_provider.generate_text(
+            service_name=service_name,
+            prompt=request.prompt,
+            max_tokens=request.max_tokens,
+            temperature=request.temperature,
+            top_p=request.top_p,
+            top_k=request.top_k,
+            stream=request.stream
+        )
+
+        if result["success"]:
+            return result
+        else:
+            raise HTTPException(status_code=400, detail=result.get("error", "Generation failed"))
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Text generation failed for {service_name}: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/services/{service_name}/chat/completions", summary="Chat completion using local service")
+async def chat_completion(
+    service_name: str,
+    request: ChatCompletionRequest,
+    manager: DeploymentManager = Depends(get_deployment_manager)
+):
+    """Generate chat completion using a deployed local service"""
+    try:
+        # Get the local provider and call chat_completion
+        local_provider = manager.local_provider
+
+        result = await local_provider.chat_completion(
+            service_name=service_name,
+            messages=request.messages,
+            max_tokens=request.max_tokens,
+            temperature=request.temperature,
+            top_p=request.top_p,
+            stream=request.stream
+        )
+
+        if result["success"]:
+            return result
+        else:
+            raise HTTPException(status_code=400, detail=result.get("error", "Chat completion failed"))
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Chat completion failed for {service_name}: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/backends", summary="List available local backends")
+async def list_backends():
+    """List available local inference backends"""
+    backends = []
+
+    # Check backend availability
+    try:
+        import vllm
+        backends.append({
+            "name": "vllm",
+            "description": "High-performance LLM inference server",
+            "available": True,
+            "features": ["high_throughput", "dynamic_batching", "prefix_caching"]
+        })
+    except ImportError:
+        backends.append({
+            "name": "vllm",
+            "description": "High-performance LLM inference server",
+            "available": False,
+            "install_command": "pip install vllm"
+        })
+
+    try:
+        import tensorrt_llm
+        backends.append({
+            "name": "tensorrt_llm",
+            "description": "NVIDIA TensorRT-LLM for maximum optimization",
+            "available": True,
+            "features": ["maximum_performance", "tensorrt_optimization", "cuda_acceleration"]
+        })
+    except ImportError:
+        backends.append({
+            "name": "tensorrt_llm",
+            "description": "NVIDIA TensorRT-LLM for maximum optimization",
+            "available": False,
+            "install_command": "pip install tensorrt-llm"
+        })
+
+    try:
+        import transformers
+        backends.append({
+            "name": "transformers",
+            "description": "HuggingFace Transformers for universal compatibility",
+            "available": True,
+            "features": ["universal_compatibility", "all_model_types", "quantization_support"]
+        })
+    except ImportError:
+        backends.append({
+            "name": "transformers",
+            "description": "HuggingFace Transformers for universal compatibility",
+            "available": False,
+            "install_command": "pip install transformers"
+        })
+
+    return {
+        "success": True,
+        "backends": backends
+    }
+
+
+@router.get("/gpu-info", summary="Get GPU information")
+async def get_gpu_info():
+    """Get detailed information about available GPUs"""
+    try:
+        from ....utils.gpu_utils import get_gpu_manager
+
+        gpu_manager = get_gpu_manager()
+        system_info = gpu_manager.get_system_info()
+
+        return {
+            "success": True,
+            "gpu_info": system_info
+        }
+    except Exception as e:
+        logger.error(f"Failed to get GPU info: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/estimate-memory", summary="Estimate model memory requirements")
+async def estimate_memory(
+    model_id: str,
+    precision: str = "float16"
+):
+    """Estimate memory requirements for a model"""
+    try:
+        from ....utils.gpu_utils import estimate_model_memory
+
+        memory_mb = estimate_model_memory(model_id, precision)
+        memory_gb = memory_mb / 1024
+
+        return {
+            "success": True,
+            "model_id": model_id,
+            "precision": precision,
+            "estimated_memory_mb": memory_mb,
+            "estimated_memory_gb": round(memory_gb, 2)
+        }
+    except Exception as e:
+        logger.error(f"Failed to estimate memory for {model_id}: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/presets", summary="Get deployment configuration presets")
+async def get_deployment_presets():
+    """Get predefined deployment configuration presets"""
+    presets = {
+        "vllm_small": {
+            "name": "vLLM - Small Model",
+            "description": "Optimized for models up to 7B parameters",
+            "backend": "vllm",
+            "max_model_len": 2048,
+            "max_batch_size": 16,
+            "gpu_memory_utilization": 0.9,
+            "enable_chunked_prefill": True,
+            "enable_prefix_caching": True
+        },
+        "vllm_large": {
+            "name": "vLLM - Large Model",
+            "description": "Optimized for models 13B+ parameters",
+            "backend": "vllm",
+            "max_model_len": 4096,
+            "max_batch_size": 8,
+            "gpu_memory_utilization": 0.95,
+            "tensor_parallel_size": 2,
+            "enable_chunked_prefill": True,
+            "enable_prefix_caching": True
+        },
+        "tensorrt_performance": {
+            "name": "TensorRT-LLM - Maximum Performance",
+            "description": "Maximum optimization with TensorRT",
+            "backend": "tensorrt_llm",
+            "model_precision": "float16",
+            "max_batch_size": 16,
+            "tensorrt_args": {
+                "enable_kv_cache_reuse": True,
+                "use_gpt_attention_plugin": True,
+                "remove_input_padding": True
+            }
+        },
+        "transformers_compatible": {
+            "name": "Transformers - Universal",
+            "description": "Maximum compatibility with all models",
+            "backend": "transformers",
+            "model_precision": "float16",
+            "max_batch_size": 4,
+            "gpu_memory_utilization": 0.8,
+            "transformers_args": {
+                "device_map": "auto",
+                "torch_dtype": "auto",
+                "low_cpu_mem_usage": True
+            }
+        }
+    }
+
+    return {
+        "success": True,
+        "presets": presets
+    }
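
For orientation, the following is a minimal client sketch exercising the routes this hunk adds. It assumes an isa-model API server is already running at http://localhost:8000 with the optional auth middleware permitting anonymous access; the host, port, and the demo-gpt2/gpt2 names are illustrative, while the paths and payload fields come from the route definitions above.

import requests

BASE = "http://localhost:8000"  # assumed host/port for a running isa-model server

# 1. Check which inference backends are importable on the server.
backends = requests.get(f"{BASE}/api/v1/local/backends").json()["backends"]
print([b["name"] for b in backends if b["available"]])

# 2. Estimate memory before deploying. estimate_memory declares plain
#    str parameters, so FastAPI reads them from the query string.
est = requests.post(
    f"{BASE}/api/v1/local/estimate-memory",
    params={"model_id": "gpt2", "precision": "float16"},
).json()
print(f"~{est['estimated_memory_gb']} GB estimated")

# 3. Deploy a small model on the transformers backend (JSON body
#    matching LocalDeployRequest; unspecified fields use its defaults).
deploy = requests.post(
    f"{BASE}/api/v1/local/deploy",
    json={
        "service_name": "demo-gpt2",  # illustrative service name
        "model_id": "gpt2",
        "backend": "transformers",
        "service_type": "llm",
        "max_batch_size": 4,
    },
)
deploy.raise_for_status()

# 4. Generate text from the deployed service, then tear it down.
gen = requests.post(
    f"{BASE}/api/v1/local/services/demo-gpt2/generate",
    json={"prompt": "Hello, world", "max_tokens": 64},
)
print(gen.json())
requests.delete(f"{BASE}/api/v1/local/services/demo-gpt2")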