isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +40 -17
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/storage/hf_storage.py +1 -1
  26. isa_model/core/types.py +1 -0
  27. isa_model/deployment/__init__.py +5 -48
  28. isa_model/deployment/core/__init__.py +2 -31
  29. isa_model/deployment/core/deployment_manager.py +1278 -370
  30. isa_model/deployment/local/__init__.py +31 -0
  31. isa_model/deployment/local/config.py +248 -0
  32. isa_model/deployment/local/gpu_gateway.py +607 -0
  33. isa_model/deployment/local/health_checker.py +428 -0
  34. isa_model/deployment/local/provider.py +586 -0
  35. isa_model/deployment/local/tensorrt_service.py +621 -0
  36. isa_model/deployment/local/transformers_service.py +644 -0
  37. isa_model/deployment/local/vllm_service.py +527 -0
  38. isa_model/deployment/modal/__init__.py +8 -0
  39. isa_model/deployment/modal/config.py +136 -0
  40. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  41. isa_model/deployment/modal/services/__init__.py +3 -0
  42. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  43. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  44. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  45. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  46. isa_model/deployment/modal/services/video/__init__.py +1 -0
  47. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  48. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  49. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  50. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  51. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  52. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  53. isa_model/deployment/storage/__init__.py +5 -0
  54. isa_model/deployment/storage/deployment_repository.py +824 -0
  55. isa_model/deployment/triton/__init__.py +10 -0
  56. isa_model/deployment/triton/config.py +196 -0
  57. isa_model/deployment/triton/configs/__init__.py +1 -0
  58. isa_model/deployment/triton/provider.py +512 -0
  59. isa_model/deployment/triton/scripts/__init__.py +1 -0
  60. isa_model/deployment/triton/templates/__init__.py +1 -0
  61. isa_model/inference/__init__.py +47 -1
  62. isa_model/inference/ai_factory.py +137 -10
  63. isa_model/inference/legacy_services/__init__.py +21 -0
  64. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  65. isa_model/inference/legacy_services/model_service.py +573 -0
  66. isa_model/inference/legacy_services/model_serving.py +717 -0
  67. isa_model/inference/legacy_services/model_training.py +561 -0
  68. isa_model/inference/models/__init__.py +21 -0
  69. isa_model/inference/models/inference_config.py +551 -0
  70. isa_model/inference/models/inference_record.py +675 -0
  71. isa_model/inference/models/performance_models.py +714 -0
  72. isa_model/inference/repositories/__init__.py +9 -0
  73. isa_model/inference/repositories/inference_repository.py +828 -0
  74. isa_model/inference/services/audio/base_stt_service.py +184 -11
  75. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  76. isa_model/inference/services/custom_model_manager.py +277 -0
  77. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  78. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  79. isa_model/inference/services/llm/__init__.py +10 -2
  80. isa_model/inference/services/llm/base_llm_service.py +335 -24
  81. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  82. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  83. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  84. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  85. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  86. isa_model/inference/services/llm/local_llm_service.py +747 -0
  87. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  88. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  89. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  90. isa_model/inference/services/vision/__init__.py +22 -1
  91. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  92. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  93. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  94. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  95. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  96. isa_model/serving/api/cache_manager.py +245 -0
  97. isa_model/serving/api/dependencies/__init__.py +1 -0
  98. isa_model/serving/api/dependencies/auth.py +194 -0
  99. isa_model/serving/api/dependencies/database.py +139 -0
  100. isa_model/serving/api/error_handlers.py +284 -0
  101. isa_model/serving/api/fastapi_server.py +172 -22
  102. isa_model/serving/api/middleware/auth.py +8 -2
  103. isa_model/serving/api/middleware/security.py +23 -33
  104. isa_model/serving/api/middleware/tenant_context.py +414 -0
  105. isa_model/serving/api/routes/analytics.py +4 -1
  106. isa_model/serving/api/routes/config.py +645 -0
  107. isa_model/serving/api/routes/deployment_billing.py +315 -0
  108. isa_model/serving/api/routes/deployments.py +138 -2
  109. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  110. isa_model/serving/api/routes/health.py +32 -12
  111. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  112. isa_model/serving/api/routes/local_deployments.py +448 -0
  113. isa_model/serving/api/routes/tenants.py +575 -0
  114. isa_model/serving/api/routes/unified.py +680 -18
  115. isa_model/serving/api/routes/webhooks.py +479 -0
  116. isa_model/serving/api/startup.py +68 -54
  117. isa_model/utils/gpu_utils.py +311 -0
  118. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
  119. isa_model-0.4.3.dist-info/RECORD +193 -0
  120. isa_model/core/storage/minio_storage.py +0 -0
  121. isa_model/deployment/cloud/__init__.py +0 -9
  122. isa_model/deployment/cloud/modal/__init__.py +0 -10
  123. isa_model/deployment/core/deployment_config.py +0 -356
  124. isa_model/deployment/core/isa_deployment_service.py +0 -401
  125. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  126. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  127. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  128. isa_model/deployment/runtime/deployed_service.py +0 -338
  129. isa_model/deployment/services/__init__.py +0 -9
  130. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  131. isa_model/deployment/services/model_service.py +0 -332
  132. isa_model/deployment/services/service_monitor.py +0 -356
  133. isa_model/deployment/services/service_registry.py +0 -527
  134. isa_model/eval/__init__.py +0 -92
  135. isa_model/eval/benchmarks/__init__.py +0 -27
  136. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  137. isa_model/eval/benchmarks.py +0 -701
  138. isa_model/eval/config/__init__.py +0 -10
  139. isa_model/eval/config/evaluation_config.py +0 -108
  140. isa_model/eval/evaluators/__init__.py +0 -24
  141. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  142. isa_model/eval/evaluators/base_evaluator.py +0 -503
  143. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  144. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  145. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  146. isa_model/eval/example_evaluation.py +0 -395
  147. isa_model/eval/factory.py +0 -798
  148. isa_model/eval/infrastructure/__init__.py +0 -24
  149. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  150. isa_model/eval/isa_benchmarks.py +0 -700
  151. isa_model/eval/isa_integration.py +0 -582
  152. isa_model/eval/metrics.py +0 -951
  153. isa_model/eval/tests/unit/test_basic.py +0 -396
  154. isa_model/serving/api/routes/evaluations.py +0 -579
  155. isa_model/training/__init__.py +0 -168
  156. isa_model/training/annotation/annotation_schema.py +0 -47
  157. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  158. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  159. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  160. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  161. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  162. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  163. isa_model/training/annotation/views/annotation_controller.py +0 -158
  164. isa_model/training/cloud/__init__.py +0 -22
  165. isa_model/training/cloud/job_orchestrator.py +0 -402
  166. isa_model/training/cloud/runpod_trainer.py +0 -454
  167. isa_model/training/cloud/storage_manager.py +0 -482
  168. isa_model/training/core/__init__.py +0 -26
  169. isa_model/training/core/config.py +0 -181
  170. isa_model/training/core/dataset.py +0 -222
  171. isa_model/training/core/trainer.py +0 -720
  172. isa_model/training/core/utils.py +0 -213
  173. isa_model/training/examples/intelligent_training_example.py +0 -281
  174. isa_model/training/factory.py +0 -424
  175. isa_model/training/intelligent/__init__.py +0 -25
  176. isa_model/training/intelligent/decision_engine.py +0 -643
  177. isa_model/training/intelligent/intelligent_factory.py +0 -888
  178. isa_model/training/intelligent/knowledge_base.py +0 -751
  179. isa_model/training/intelligent/resource_optimizer.py +0 -839
  180. isa_model/training/intelligent/task_classifier.py +0 -576
  181. isa_model/training/storage/__init__.py +0 -24
  182. isa_model/training/storage/core_integration.py +0 -439
  183. isa_model/training/storage/training_repository.py +0 -552
  184. isa_model/training/storage/training_storage.py +0 -628
  185. isa_model-0.4.0.dist-info/RECORD +0 -182
  186. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  187. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  188. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  189. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  190. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  191. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  192. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  193. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  194. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  195. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  196. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  197. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  198. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  199. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/serving/api/routes/local_deployments.py
@@ -0,0 +1,448 @@
+ """
+ Local GPU deployments API routes
+ 
+ Endpoints for managing local GPU model deployments.
+ """
+ 
+ import logging
+ from typing import Dict, List, Optional, Any
+ from fastapi import APIRouter, HTTPException, Depends, BackgroundTasks
+ from pydantic import BaseModel, Field
+ 
+ from ....deployment.core.deployment_manager import DeploymentManager
+ from ....deployment.local.config import (
+     LocalGPUConfig, LocalServiceType, LocalBackend,
+     create_vllm_config, create_tensorrt_config, create_transformers_config
+ )
+ from ...middleware.auth import get_current_user
+ 
+ logger = logging.getLogger(__name__)
+ 
+ router = APIRouter(prefix="/api/v1/local", tags=["local-deployments"])
+ 
+ 
+ # Request/Response Models
+ class LocalDeployRequest(BaseModel):
+     """Local deployment request"""
+     service_name: str = Field(..., description="Unique service name")
+     model_id: str = Field(..., description="HuggingFace model ID")
+     backend: str = Field("transformers", description="Inference backend (vllm, tensorrt_llm, transformers)")
+     service_type: str = Field("llm", description="Service type (llm, vision, audio, embedding)")
+ 
+     # Model configuration
+     model_precision: str = Field("float16", description="Model precision")
+     max_model_len: int = Field(2048, description="Maximum sequence length")
+     max_batch_size: int = Field(8, description="Maximum batch size")
+ 
+     # GPU settings
+     gpu_id: Optional[int] = Field(None, description="Specific GPU ID to use")
+     gpu_memory_utilization: float = Field(0.9, description="GPU memory utilization fraction")
+ 
+     # Performance settings
+     tensor_parallel_size: int = Field(1, description="Tensor parallel size")
+     enable_chunked_prefill: bool = Field(True, description="Enable chunked prefill")
+     enable_prefix_caching: bool = Field(True, description="Enable prefix caching")
+ 
+     # Quantization
+     quantization: Optional[str] = Field(None, description="Quantization method (int8, int4, awq, gptq)")
+ 
+     # Advanced settings
+     trust_remote_code: bool = Field(False, description="Trust remote code in model")
+     revision: Optional[str] = Field(None, description="Model revision")
+ 
+     # Backend-specific settings
+     vllm_args: Dict[str, Any] = Field(default_factory=dict, description="Additional vLLM arguments")
+     tensorrt_args: Dict[str, Any] = Field(default_factory=dict, description="Additional TensorRT arguments")
+     transformers_args: Dict[str, Any] = Field(default_factory=dict, description="Additional Transformers arguments")
+ 
+ 
+ class LocalServiceInfo(BaseModel):
+     """Local service information"""
+     service_name: str
+     model_id: str
+     backend: str
+     service_type: str
+     status: str
+     healthy: bool
+     response_time_ms: Optional[float] = None
+     error_count: int = 0
+     uptime_seconds: Optional[float] = None
+     deployed_at: Optional[str] = None
+ 
+ 
+ class GenerateRequest(BaseModel):
+     """Text generation request"""
+     prompt: str = Field(..., description="Input prompt")
+     max_tokens: int = Field(512, description="Maximum tokens to generate")
+     temperature: float = Field(0.7, description="Sampling temperature")
+     top_p: float = Field(0.9, description="Top-p sampling")
+     top_k: int = Field(50, description="Top-k sampling")
+     stream: bool = Field(False, description="Stream response")
+ 
+ 
+ class ChatCompletionRequest(BaseModel):
+     """Chat completion request"""
+     messages: List[Dict[str, str]] = Field(..., description="Chat messages")
+     max_tokens: int = Field(512, description="Maximum tokens to generate")
+     temperature: float = Field(0.7, description="Sampling temperature")
+     top_p: float = Field(0.9, description="Top-p sampling")
+     stream: bool = Field(False, description="Stream response")
+ 
+ 
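The request models above define the whole deployment surface. For orientation, a minimal client-side sketch (not part of the diff) of a LocalDeployRequest payload; the field names and defaults come from the model, while the service name and model ID are hypothetical:

# Hypothetical payload for POST /api/v1/local/deploy (illustrative only).
payload = {
    "service_name": "qwen2-7b-local",        # must be unique per deployment
    "model_id": "Qwen/Qwen2-7B-Instruct",    # any HuggingFace model ID
    "backend": "vllm",                       # vllm | tensorrt_llm | transformers
    "service_type": "llm",
    "max_model_len": 2048,
    "gpu_memory_utilization": 0.9,
    "quantization": "awq",                   # optional: int8, int4, awq, gptq
}

Fields left unset fall back to the Field defaults shown above.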
+ # Dependency injection
+ async def get_deployment_manager() -> DeploymentManager:
+     """Get deployment manager instance"""
+     return DeploymentManager()
+ 
+ 
+ @router.get("/status", summary="Get local GPU system status")
+ async def get_local_status(
+     manager: DeploymentManager = Depends(get_deployment_manager)
+ ):
+     """Get overall local GPU system status including available resources"""
+     try:
+         status = await manager.get_local_system_status()
+         return {"success": True, "status": status}
+     except Exception as e:
+         logger.error(f"Failed to get local status: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
+ 
+ 
+ @router.post("/deploy", summary="Deploy model to local GPU")
+ async def deploy_local_service(
+     request: LocalDeployRequest,
+     background_tasks: BackgroundTasks,
+     manager: DeploymentManager = Depends(get_deployment_manager),
+     current_user: Optional[Dict] = Depends(get_current_user)
+ ):
+     """Deploy a model service to local GPU"""
+     try:
+         # Convert request to configuration
+         config = LocalGPUConfig(
+             service_name=request.service_name,
+             service_type=LocalServiceType(request.service_type),
+             model_id=request.model_id,
+             backend=LocalBackend(request.backend),
+             model_precision=request.model_precision,
+             max_model_len=request.max_model_len,
+             max_batch_size=request.max_batch_size,
+             gpu_id=request.gpu_id,
+             gpu_memory_utilization=request.gpu_memory_utilization,
+             tensor_parallel_size=request.tensor_parallel_size,
+             enable_chunked_prefill=request.enable_chunked_prefill,
+             enable_prefix_caching=request.enable_prefix_caching,
+             quantization=request.quantization,
+             trust_remote_code=request.trust_remote_code,
+             revision=request.revision,
+             vllm_args=request.vllm_args,
+             tensorrt_args=request.tensorrt_args,
+             transformers_args=request.transformers_args
+         )
+ 
+         # Deploy service
+         result = await manager.deploy_to_local(config)
+ 
+         if result["success"]:
+             return {
+                 "success": True,
+                 "message": f"Service {request.service_name} deployed successfully",
+                 "deployment": result
+             }
+         else:
+             raise HTTPException(status_code=400, detail=result.get("error", "Deployment failed"))
+ 
+     except ValueError as e:
+         raise HTTPException(status_code=400, detail=f"Invalid configuration: {e}")
+     except Exception as e:
+         logger.error(f"Local deployment failed: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
+ 
+ 
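A sketch of the corresponding client call (not part of the diff), assuming the FastAPI app is served at http://localhost:8000 and the auth dependency admits the request:

import requests  # any HTTP client works; requests is assumed here

resp = requests.post("http://localhost:8000/api/v1/local/deploy", json=payload)
if resp.ok:
    print(resp.json()["message"])  # "Service qwen2-7b-local deployed successfully"
else:
    print(resp.status_code, resp.json()["detail"])  # 400 on bad config, 500 otherwise

Note that an unrecognized backend or service_type makes the LocalBackend()/LocalServiceType() enum constructors raise ValueError, which the handler surfaces as a 400 "Invalid configuration" response.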
+ @router.get("/services", summary="List local GPU services")
+ async def list_local_services(
+     manager: DeploymentManager = Depends(get_deployment_manager)
+ ) -> Dict[str, Any]:
+     """List all deployed local GPU services"""
+     try:
+         services = await manager.list_local_services()
+         return {
+             "success": True,
+             "services": services,
+             "count": len(services)
+         }
+     except Exception as e:
+         logger.error(f"Failed to list local services: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
+ 
+ 
+ @router.get("/services/{service_name}", summary="Get local service information")
+ async def get_local_service(
+     service_name: str,
+     manager: DeploymentManager = Depends(get_deployment_manager)
+ ):
+     """Get detailed information about a specific local service"""
+     try:
+         service_info = await manager.get_local_service_info(service_name)
+ 
+         if service_info is None:
+             raise HTTPException(status_code=404, detail=f"Service {service_name} not found")
+ 
+         return {
+             "success": True,
+             "service": service_info
+         }
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(f"Failed to get service info for {service_name}: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
+ 
+ 
+ @router.delete("/services/{service_name}", summary="Undeploy local service")
+ async def undeploy_local_service(
+     service_name: str,
+     manager: DeploymentManager = Depends(get_deployment_manager),
+     current_user: Optional[Dict] = Depends(get_current_user)
+ ):
+     """Stop and remove a deployed local service"""
+     try:
+         result = await manager.undeploy_local_service(service_name)
+ 
+         if result["success"]:
+             return {
+                 "success": True,
+                 "message": f"Service {service_name} undeployed successfully"
+             }
+         else:
+             raise HTTPException(status_code=400, detail=result.get("error", "Undeploy failed"))
+ 
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(f"Failed to undeploy service {service_name}: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
+ 
+ 
+ @router.post("/services/{service_name}/generate", summary="Generate text using local service")
+ async def generate_text(
+     service_name: str,
+     request: GenerateRequest,
+     manager: DeploymentManager = Depends(get_deployment_manager)
+ ):
+     """Generate text using a deployed local service"""
+     try:
+         # Get the local provider and call generate_text
+         local_provider = manager.local_provider
+ 
+         result = await local_provider.generate_text(
+             service_name=service_name,
+             prompt=request.prompt,
+             max_tokens=request.max_tokens,
+             temperature=request.temperature,
+             top_p=request.top_p,
+             top_k=request.top_k,
+             stream=request.stream
+         )
+ 
+         if result["success"]:
+             return result
+         else:
+             raise HTTPException(status_code=400, detail=result.get("error", "Generation failed"))
+ 
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(f"Text generation failed for {service_name}: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
+ 
+ 
+ @router.post("/services/{service_name}/chat/completions", summary="Chat completion using local service")
+ async def chat_completion(
+     service_name: str,
+     request: ChatCompletionRequest,
+     manager: DeploymentManager = Depends(get_deployment_manager)
+ ):
+     """Generate chat completion using a deployed local service"""
+     try:
+         # Get the local provider and call chat_completion
+         local_provider = manager.local_provider
+ 
+         result = await local_provider.chat_completion(
+             service_name=service_name,
+             messages=request.messages,
+             max_tokens=request.max_tokens,
+             temperature=request.temperature,
+             top_p=request.top_p,
+             stream=request.stream
+         )
+ 
+         if result["success"]:
+             return result
+         else:
+             raise HTTPException(status_code=400, detail=result.get("error", "Chat completion failed"))
+ 
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(f"Chat completion failed for {service_name}: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
+ 
+ 
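Both inference routes proxy to manager.local_provider and mirror the GenerateRequest and ChatCompletionRequest models. A usage sketch (not part of the diff; host and service name assumed as before):

gen = requests.post(
    "http://localhost:8000/api/v1/local/services/qwen2-7b-local/generate",
    json={"prompt": "Explain tensor parallelism in one sentence.", "max_tokens": 64},
)
chat = requests.post(
    "http://localhost:8000/api/v1/local/services/qwen2-7b-local/chat/completions",
    json={"messages": [{"role": "user", "content": "Hello!"}], "temperature": 0.2},
)
# On success each endpoint returns the provider's result dict unmodified.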
+ @router.get("/backends", summary="List available local backends")
+ async def list_backends():
+     """List available local inference backends"""
+     backends = []
+ 
+     # Check backend availability
+     try:
+         import vllm
+         backends.append({
+             "name": "vllm",
+             "description": "High-performance LLM inference server",
+             "available": True,
+             "features": ["high_throughput", "dynamic_batching", "prefix_caching"]
+         })
+     except ImportError:
+         backends.append({
+             "name": "vllm",
+             "description": "High-performance LLM inference server",
+             "available": False,
+             "install_command": "pip install vllm"
+         })
+ 
+     try:
+         import tensorrt_llm
+         backends.append({
+             "name": "tensorrt_llm",
+             "description": "NVIDIA TensorRT-LLM for maximum optimization",
+             "available": True,
+             "features": ["maximum_performance", "tensorrt_optimization", "cuda_acceleration"]
+         })
+     except ImportError:
+         backends.append({
+             "name": "tensorrt_llm",
+             "description": "NVIDIA TensorRT-LLM for maximum optimization",
+             "available": False,
+             "install_command": "pip install tensorrt-llm"
+         })
+ 
+     try:
+         import transformers
+         backends.append({
+             "name": "transformers",
+             "description": "HuggingFace Transformers for universal compatibility",
+             "available": True,
+             "features": ["universal_compatibility", "all_model_types", "quantization_support"]
+         })
+     except ImportError:
+         backends.append({
+             "name": "transformers",
+             "description": "HuggingFace Transformers for universal compatibility",
+             "available": False,
+             "install_command": "pip install transformers"
+         })
+ 
+     return {
+         "success": True,
+         "backends": backends
+     }
+ 
+ 
+ @router.get("/gpu-info", summary="Get GPU information")
+ async def get_gpu_info():
+     """Get detailed information about available GPUs"""
+     try:
+         from ....utils.gpu_utils import get_gpu_manager
+ 
+         gpu_manager = get_gpu_manager()
+         system_info = gpu_manager.get_system_info()
+ 
+         return {
+             "success": True,
+             "gpu_info": system_info
+         }
+     except Exception as e:
+         logger.error(f"Failed to get GPU info: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
+ 
+ 
+ @router.post("/estimate-memory", summary="Estimate model memory requirements")
+ async def estimate_memory(
+     model_id: str,
+     precision: str = "float16"
+ ):
+     """Estimate memory requirements for a model"""
+     try:
+         from ....utils.gpu_utils import estimate_model_memory
+ 
+         memory_mb = estimate_model_memory(model_id, precision)
+         memory_gb = memory_mb / 1024
+ 
+         return {
+             "success": True,
+             "model_id": model_id,
+             "precision": precision,
+             "estimated_memory_mb": memory_mb,
+             "estimated_memory_gb": round(memory_gb, 2)
+         }
+     except Exception as e:
+         logger.error(f"Failed to estimate memory for {model_id}: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
+ 
+ 
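Because estimate_memory declares bare scalar parameters rather than a Pydantic body, FastAPI reads model_id and precision from the query string even though the route is a POST. A sketch (not part of the diff; model ID is illustrative):

resp = requests.post(
    "http://localhost:8000/api/v1/local/estimate-memory",
    params={"model_id": "meta-llama/Llama-2-7b-hf", "precision": "float16"},
)
print(resp.json()["estimated_memory_gb"])  # MB from estimate_model_memory, divided by 1024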
+ @router.get("/presets", summary="Get deployment configuration presets")
+ async def get_deployment_presets():
+     """Get predefined deployment configuration presets"""
+     presets = {
+         "vllm_small": {
+             "name": "vLLM - Small Model",
+             "description": "Optimized for models up to 7B parameters",
+             "backend": "vllm",
+             "max_model_len": 2048,
+             "max_batch_size": 16,
+             "gpu_memory_utilization": 0.9,
+             "enable_chunked_prefill": True,
+             "enable_prefix_caching": True
+         },
+         "vllm_large": {
+             "name": "vLLM - Large Model",
+             "description": "Optimized for models 13B+ parameters",
+             "backend": "vllm",
+             "max_model_len": 4096,
+             "max_batch_size": 8,
+             "gpu_memory_utilization": 0.95,
+             "tensor_parallel_size": 2,
+             "enable_chunked_prefill": True,
+             "enable_prefix_caching": True
+         },
+         "tensorrt_performance": {
+             "name": "TensorRT-LLM - Maximum Performance",
+             "description": "Maximum optimization with TensorRT",
+             "backend": "tensorrt_llm",
+             "model_precision": "float16",
+             "max_batch_size": 16,
+             "tensorrt_args": {
+                 "enable_kv_cache_reuse": True,
+                 "use_gpt_attention_plugin": True,
+                 "remove_input_padding": True
+             }
+         },
+         "transformers_compatible": {
+             "name": "Transformers - Universal",
+             "description": "Maximum compatibility with all models",
+             "backend": "transformers",
+             "model_precision": "float16",
+             "max_batch_size": 4,
+             "gpu_memory_utilization": 0.8,
+             "transformers_args": {
+                 "device_map": "auto",
+                 "torch_dtype": "auto",
+                 "low_cpu_mem_usage": True
+             }
+         }
+     }
+ 
+     return {
+         "success": True,
+         "presets": presets
+     }
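Finally, a closing sketch (illustrative only, not part of the diff) that ties the pieces together: fetch a preset, drop its display-only keys, and merge the rest into a deploy request. The preset dicts reuse LocalDeployRequest field names, so they splat straight into the payload; the service name and model ID below are hypothetical:

presets = requests.get("http://localhost:8000/api/v1/local/presets").json()["presets"]
preset = presets["vllm_small"]

deploy_req = {
    "service_name": "llama-3-8b",                        # hypothetical
    "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",   # hypothetical
    # "name" and "description" are labels, not LocalDeployRequest fields.
    **{k: v for k, v in preset.items() if k not in ("name", "description")},
}
requests.post("http://localhost:8000/api/v1/local/deploy", json=deploy_req)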