isa-model 0.4.0__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +35 -80
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/types.py +1 -0
  26. isa_model/deployment/__init__.py +5 -48
  27. isa_model/deployment/core/__init__.py +2 -31
  28. isa_model/deployment/core/deployment_manager.py +1278 -370
  29. isa_model/deployment/modal/__init__.py +8 -0
  30. isa_model/deployment/modal/config.py +136 -0
  31. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  32. isa_model/deployment/modal/services/__init__.py +3 -0
  33. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  34. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  35. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  36. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  37. isa_model/deployment/modal/services/video/__init__.py +1 -0
  38. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  39. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  40. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  41. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  42. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  43. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  44. isa_model/deployment/storage/__init__.py +5 -0
  45. isa_model/deployment/storage/deployment_repository.py +824 -0
  46. isa_model/deployment/triton/__init__.py +10 -0
  47. isa_model/deployment/triton/config.py +196 -0
  48. isa_model/deployment/triton/configs/__init__.py +1 -0
  49. isa_model/deployment/triton/provider.py +512 -0
  50. isa_model/deployment/triton/scripts/__init__.py +1 -0
  51. isa_model/deployment/triton/templates/__init__.py +1 -0
  52. isa_model/inference/__init__.py +47 -1
  53. isa_model/inference/ai_factory.py +137 -10
  54. isa_model/inference/legacy_services/__init__.py +21 -0
  55. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  56. isa_model/inference/legacy_services/model_service.py +573 -0
  57. isa_model/inference/legacy_services/model_serving.py +717 -0
  58. isa_model/inference/legacy_services/model_training.py +561 -0
  59. isa_model/inference/models/__init__.py +21 -0
  60. isa_model/inference/models/inference_config.py +551 -0
  61. isa_model/inference/models/inference_record.py +675 -0
  62. isa_model/inference/models/performance_models.py +714 -0
  63. isa_model/inference/repositories/__init__.py +9 -0
  64. isa_model/inference/repositories/inference_repository.py +828 -0
  65. isa_model/inference/services/audio/base_stt_service.py +184 -11
  66. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  67. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  68. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  69. isa_model/inference/services/llm/__init__.py +10 -2
  70. isa_model/inference/services/llm/base_llm_service.py +335 -24
  71. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  72. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  73. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  74. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  75. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  76. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  77. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  78. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  79. isa_model/inference/services/vision/__init__.py +22 -1
  80. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  81. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  82. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  83. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  84. isa_model/serving/api/cache_manager.py +245 -0
  85. isa_model/serving/api/dependencies/__init__.py +1 -0
  86. isa_model/serving/api/dependencies/auth.py +194 -0
  87. isa_model/serving/api/dependencies/database.py +139 -0
  88. isa_model/serving/api/error_handlers.py +284 -0
  89. isa_model/serving/api/fastapi_server.py +172 -22
  90. isa_model/serving/api/middleware/auth.py +8 -2
  91. isa_model/serving/api/middleware/security.py +23 -33
  92. isa_model/serving/api/middleware/tenant_context.py +414 -0
  93. isa_model/serving/api/routes/analytics.py +4 -1
  94. isa_model/serving/api/routes/config.py +645 -0
  95. isa_model/serving/api/routes/deployment_billing.py +315 -0
  96. isa_model/serving/api/routes/deployments.py +138 -2
  97. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  98. isa_model/serving/api/routes/health.py +32 -12
  99. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  100. isa_model/serving/api/routes/local_deployments.py +448 -0
  101. isa_model/serving/api/routes/tenants.py +575 -0
  102. isa_model/serving/api/routes/unified.py +680 -18
  103. isa_model/serving/api/routes/webhooks.py +479 -0
  104. isa_model/serving/api/startup.py +68 -54
  105. isa_model/utils/gpu_utils.py +311 -0
  106. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/METADATA +71 -24
  107. isa_model-0.4.4.dist-info/RECORD +180 -0
  108. isa_model/core/security/secrets.py +0 -358
  109. isa_model/core/storage/hf_storage.py +0 -419
  110. isa_model/core/storage/minio_storage.py +0 -0
  111. isa_model/deployment/cloud/__init__.py +0 -9
  112. isa_model/deployment/cloud/modal/__init__.py +0 -10
  113. isa_model/deployment/core/deployment_config.py +0 -356
  114. isa_model/deployment/core/isa_deployment_service.py +0 -401
  115. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  116. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  117. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  118. isa_model/deployment/runtime/deployed_service.py +0 -338
  119. isa_model/deployment/services/__init__.py +0 -9
  120. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  121. isa_model/deployment/services/model_service.py +0 -332
  122. isa_model/deployment/services/service_monitor.py +0 -356
  123. isa_model/deployment/services/service_registry.py +0 -527
  124. isa_model/eval/__init__.py +0 -92
  125. isa_model/eval/benchmarks/__init__.py +0 -27
  126. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  127. isa_model/eval/benchmarks.py +0 -701
  128. isa_model/eval/config/__init__.py +0 -10
  129. isa_model/eval/config/evaluation_config.py +0 -108
  130. isa_model/eval/evaluators/__init__.py +0 -24
  131. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  132. isa_model/eval/evaluators/base_evaluator.py +0 -503
  133. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  134. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  135. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  136. isa_model/eval/example_evaluation.py +0 -395
  137. isa_model/eval/factory.py +0 -798
  138. isa_model/eval/infrastructure/__init__.py +0 -24
  139. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  140. isa_model/eval/isa_benchmarks.py +0 -700
  141. isa_model/eval/isa_integration.py +0 -582
  142. isa_model/eval/metrics.py +0 -951
  143. isa_model/eval/tests/unit/test_basic.py +0 -396
  144. isa_model/serving/api/routes/evaluations.py +0 -579
  145. isa_model/training/__init__.py +0 -168
  146. isa_model/training/annotation/annotation_schema.py +0 -47
  147. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  148. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  149. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  150. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  151. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  152. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  153. isa_model/training/annotation/views/annotation_controller.py +0 -158
  154. isa_model/training/cloud/__init__.py +0 -22
  155. isa_model/training/cloud/job_orchestrator.py +0 -402
  156. isa_model/training/cloud/runpod_trainer.py +0 -454
  157. isa_model/training/cloud/storage_manager.py +0 -482
  158. isa_model/training/core/__init__.py +0 -26
  159. isa_model/training/core/config.py +0 -181
  160. isa_model/training/core/dataset.py +0 -222
  161. isa_model/training/core/trainer.py +0 -720
  162. isa_model/training/core/utils.py +0 -213
  163. isa_model/training/examples/intelligent_training_example.py +0 -281
  164. isa_model/training/factory.py +0 -424
  165. isa_model/training/intelligent/__init__.py +0 -25
  166. isa_model/training/intelligent/decision_engine.py +0 -643
  167. isa_model/training/intelligent/intelligent_factory.py +0 -888
  168. isa_model/training/intelligent/knowledge_base.py +0 -751
  169. isa_model/training/intelligent/resource_optimizer.py +0 -839
  170. isa_model/training/intelligent/task_classifier.py +0 -576
  171. isa_model/training/storage/__init__.py +0 -24
  172. isa_model/training/storage/core_integration.py +0 -439
  173. isa_model/training/storage/training_repository.py +0 -552
  174. isa_model/training/storage/training_storage.py +0 -628
  175. isa_model-0.4.0.dist-info/RECORD +0 -182
  176. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  177. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  178. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  179. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  180. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  181. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  182. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  183. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  184. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  185. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  186. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  187. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  188. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
  189. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0
isa_model/serving/api/routes/local_deployments.py
@@ -0,0 +1,448 @@
+"""
+Local GPU deployments API routes
+
+Endpoints for managing local GPU model deployments.
+"""
+
+import logging
+from typing import Dict, List, Optional, Any
+from fastapi import APIRouter, HTTPException, Depends, BackgroundTasks
+from pydantic import BaseModel, Field
+
+from ....deployment.core.deployment_manager import DeploymentManager
+from ....deployment.local.config import (
+    LocalGPUConfig, LocalServiceType, LocalBackend,
+    create_vllm_config, create_tensorrt_config, create_transformers_config
+)
+from ...middleware.auth import get_current_user
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(prefix="/api/v1/local", tags=["local-deployments"])
+
+
+# Request/Response Models
+class LocalDeployRequest(BaseModel):
+    """Local deployment request"""
+    service_name: str = Field(..., description="Unique service name")
+    model_id: str = Field(..., description="HuggingFace model ID")
+    backend: str = Field("transformers", description="Inference backend (vllm, tensorrt_llm, transformers)")
+    service_type: str = Field("llm", description="Service type (llm, vision, audio, embedding)")
+
+    # Model configuration
+    model_precision: str = Field("float16", description="Model precision")
+    max_model_len: int = Field(2048, description="Maximum sequence length")
+    max_batch_size: int = Field(8, description="Maximum batch size")
+
+    # GPU settings
+    gpu_id: Optional[int] = Field(None, description="Specific GPU ID to use")
+    gpu_memory_utilization: float = Field(0.9, description="GPU memory utilization fraction")
+
+    # Performance settings
+    tensor_parallel_size: int = Field(1, description="Tensor parallel size")
+    enable_chunked_prefill: bool = Field(True, description="Enable chunked prefill")
+    enable_prefix_caching: bool = Field(True, description="Enable prefix caching")
+
+    # Quantization
+    quantization: Optional[str] = Field(None, description="Quantization method (int8, int4, awq, gptq)")
+
+    # Advanced settings
+    trust_remote_code: bool = Field(False, description="Trust remote code in model")
+    revision: Optional[str] = Field(None, description="Model revision")
+
+    # Backend-specific settings
+    vllm_args: Dict[str, Any] = Field(default_factory=dict, description="Additional vLLM arguments")
+    tensorrt_args: Dict[str, Any] = Field(default_factory=dict, description="Additional TensorRT arguments")
+    transformers_args: Dict[str, Any] = Field(default_factory=dict, description="Additional Transformers arguments")
+
+
+class LocalServiceInfo(BaseModel):
+    """Local service information"""
+    service_name: str
+    model_id: str
+    backend: str
+    service_type: str
+    status: str
+    healthy: bool
+    response_time_ms: Optional[float] = None
+    error_count: int = 0
+    uptime_seconds: Optional[float] = None
+    deployed_at: Optional[str] = None
+
+
+class GenerateRequest(BaseModel):
+    """Text generation request"""
+    prompt: str = Field(..., description="Input prompt")
+    max_tokens: int = Field(512, description="Maximum tokens to generate")
+    temperature: float = Field(0.7, description="Sampling temperature")
+    top_p: float = Field(0.9, description="Top-p sampling")
+    top_k: int = Field(50, description="Top-k sampling")
+    stream: bool = Field(False, description="Stream response")
+
+
+class ChatCompletionRequest(BaseModel):
+    """Chat completion request"""
+    messages: List[Dict[str, str]] = Field(..., description="Chat messages")
+    max_tokens: int = Field(512, description="Maximum tokens to generate")
+    temperature: float = Field(0.7, description="Sampling temperature")
+    top_p: float = Field(0.9, description="Top-p sampling")
+    stream: bool = Field(False, description="Stream response")
+
+
+# Dependency injection
+async def get_deployment_manager() -> DeploymentManager:
+    """Get deployment manager instance"""
+    return DeploymentManager()
+
+
+@router.get("/status", summary="Get local GPU system status")
+async def get_local_status(
+    manager: DeploymentManager = Depends(get_deployment_manager)
+):
+    """Get overall local GPU system status including available resources"""
+    try:
+        status = await manager.get_local_system_status()
+        return {"success": True, "status": status}
+    except Exception as e:
+        logger.error(f"Failed to get local status: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/deploy", summary="Deploy model to local GPU")
+async def deploy_local_service(
+    request: LocalDeployRequest,
+    background_tasks: BackgroundTasks,
+    manager: DeploymentManager = Depends(get_deployment_manager),
+    current_user: Optional[Dict] = Depends(get_current_user)
+):
+    """Deploy a model service to local GPU"""
+    try:
+        # Convert request to configuration
+        config = LocalGPUConfig(
+            service_name=request.service_name,
+            service_type=LocalServiceType(request.service_type),
+            model_id=request.model_id,
+            backend=LocalBackend(request.backend),
+            model_precision=request.model_precision,
+            max_model_len=request.max_model_len,
+            max_batch_size=request.max_batch_size,
+            gpu_id=request.gpu_id,
+            gpu_memory_utilization=request.gpu_memory_utilization,
+            tensor_parallel_size=request.tensor_parallel_size,
+            enable_chunked_prefill=request.enable_chunked_prefill,
+            enable_prefix_caching=request.enable_prefix_caching,
+            quantization=request.quantization,
+            trust_remote_code=request.trust_remote_code,
+            revision=request.revision,
+            vllm_args=request.vllm_args,
+            tensorrt_args=request.tensorrt_args,
+            transformers_args=request.transformers_args
+        )
+
+        # Deploy service
+        result = await manager.deploy_to_local(config)
+
+        if result["success"]:
+            return {
+                "success": True,
+                "message": f"Service {request.service_name} deployed successfully",
+                "deployment": result
+            }
+        else:
+            raise HTTPException(status_code=400, detail=result.get("error", "Deployment failed"))
+
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=f"Invalid configuration: {e}")
+    except Exception as e:
+        logger.error(f"Local deployment failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/services", summary="List local GPU services")
+async def list_local_services(
+    manager: DeploymentManager = Depends(get_deployment_manager)
+) -> Dict[str, Any]:
+    """List all deployed local GPU services"""
+    try:
+        services = await manager.list_local_services()
+        return {
+            "success": True,
+            "services": services,
+            "count": len(services)
+        }
+    except Exception as e:
+        logger.error(f"Failed to list local services: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/services/{service_name}", summary="Get local service information")
+async def get_local_service(
+    service_name: str,
+    manager: DeploymentManager = Depends(get_deployment_manager)
+):
+    """Get detailed information about a specific local service"""
+    try:
+        service_info = await manager.get_local_service_info(service_name)
+
+        if service_info is None:
+            raise HTTPException(status_code=404, detail=f"Service {service_name} not found")
+
+        return {
+            "success": True,
+            "service": service_info
+        }
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Failed to get service info for {service_name}: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.delete("/services/{service_name}", summary="Undeploy local service")
+async def undeploy_local_service(
+    service_name: str,
+    manager: DeploymentManager = Depends(get_deployment_manager),
+    current_user: Optional[Dict] = Depends(get_current_user)
+):
+    """Stop and remove a deployed local service"""
+    try:
+        result = await manager.undeploy_local_service(service_name)
+
+        if result["success"]:
+            return {
+                "success": True,
+                "message": f"Service {service_name} undeployed successfully"
+            }
+        else:
+            raise HTTPException(status_code=400, detail=result.get("error", "Undeploy failed"))
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Failed to undeploy service {service_name}: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/services/{service_name}/generate", summary="Generate text using local service")
+async def generate_text(
+    service_name: str,
+    request: GenerateRequest,
+    manager: DeploymentManager = Depends(get_deployment_manager)
+):
+    """Generate text using a deployed local service"""
+    try:
+        # Get the local provider and call generate_text
+        local_provider = manager.local_provider
+
+        result = await local_provider.generate_text(
+            service_name=service_name,
+            prompt=request.prompt,
+            max_tokens=request.max_tokens,
+            temperature=request.temperature,
+            top_p=request.top_p,
+            top_k=request.top_k,
+            stream=request.stream
+        )
+
+        if result["success"]:
+            return result
+        else:
+            raise HTTPException(status_code=400, detail=result.get("error", "Generation failed"))
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Text generation failed for {service_name}: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/services/{service_name}/chat/completions", summary="Chat completion using local service")
+async def chat_completion(
+    service_name: str,
+    request: ChatCompletionRequest,
+    manager: DeploymentManager = Depends(get_deployment_manager)
+):
+    """Generate chat completion using a deployed local service"""
+    try:
+        # Get the local provider and call chat_completion
+        local_provider = manager.local_provider
+
+        result = await local_provider.chat_completion(
+            service_name=service_name,
+            messages=request.messages,
+            max_tokens=request.max_tokens,
+            temperature=request.temperature,
+            top_p=request.top_p,
+            stream=request.stream
+        )
+
+        if result["success"]:
+            return result
+        else:
+            raise HTTPException(status_code=400, detail=result.get("error", "Chat completion failed"))
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Chat completion failed for {service_name}: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/backends", summary="List available local backends")
+async def list_backends():
+    """List available local inference backends"""
+    backends = []
+
+    # Check backend availability
+    try:
+        import vllm
+        backends.append({
+            "name": "vllm",
+            "description": "High-performance LLM inference server",
+            "available": True,
+            "features": ["high_throughput", "dynamic_batching", "prefix_caching"]
+        })
+    except ImportError:
+        backends.append({
+            "name": "vllm",
+            "description": "High-performance LLM inference server",
+            "available": False,
+            "install_command": "pip install vllm"
+        })
+
+    try:
+        import tensorrt_llm
+        backends.append({
+            "name": "tensorrt_llm",
+            "description": "NVIDIA TensorRT-LLM for maximum optimization",
+            "available": True,
+            "features": ["maximum_performance", "tensorrt_optimization", "cuda_acceleration"]
+        })
+    except ImportError:
+        backends.append({
+            "name": "tensorrt_llm",
+            "description": "NVIDIA TensorRT-LLM for maximum optimization",
+            "available": False,
+            "install_command": "pip install tensorrt-llm"
+        })
+
+    try:
+        import transformers
+        backends.append({
+            "name": "transformers",
+            "description": "HuggingFace Transformers for universal compatibility",
+            "available": True,
+            "features": ["universal_compatibility", "all_model_types", "quantization_support"]
+        })
+    except ImportError:
+        backends.append({
+            "name": "transformers",
+            "description": "HuggingFace Transformers for universal compatibility",
+            "available": False,
+            "install_command": "pip install transformers"
+        })
+
+    return {
+        "success": True,
+        "backends": backends
+    }
+
+
+@router.get("/gpu-info", summary="Get GPU information")
+async def get_gpu_info():
+    """Get detailed information about available GPUs"""
+    try:
+        from ....utils.gpu_utils import get_gpu_manager
+
+        gpu_manager = get_gpu_manager()
+        system_info = gpu_manager.get_system_info()
+
+        return {
+            "success": True,
+            "gpu_info": system_info
+        }
+    except Exception as e:
+        logger.error(f"Failed to get GPU info: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/estimate-memory", summary="Estimate model memory requirements")
+async def estimate_memory(
+    model_id: str,
+    precision: str = "float16"
+):
+    """Estimate memory requirements for a model"""
+    try:
+        from ....utils.gpu_utils import estimate_model_memory
+
+        memory_mb = estimate_model_memory(model_id, precision)
+        memory_gb = memory_mb / 1024
+
+        return {
+            "success": True,
+            "model_id": model_id,
+            "precision": precision,
+            "estimated_memory_mb": memory_mb,
+            "estimated_memory_gb": round(memory_gb, 2)
+        }
+    except Exception as e:
+        logger.error(f"Failed to estimate memory for {model_id}: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/presets", summary="Get deployment configuration presets")
+async def get_deployment_presets():
+    """Get predefined deployment configuration presets"""
+    presets = {
+        "vllm_small": {
+            "name": "vLLM - Small Model",
+            "description": "Optimized for models up to 7B parameters",
+            "backend": "vllm",
+            "max_model_len": 2048,
+            "max_batch_size": 16,
+            "gpu_memory_utilization": 0.9,
+            "enable_chunked_prefill": True,
+            "enable_prefix_caching": True
+        },
+        "vllm_large": {
+            "name": "vLLM - Large Model",
+            "description": "Optimized for models 13B+ parameters",
+            "backend": "vllm",
+            "max_model_len": 4096,
+            "max_batch_size": 8,
+            "gpu_memory_utilization": 0.95,
+            "tensor_parallel_size": 2,
+            "enable_chunked_prefill": True,
+            "enable_prefix_caching": True
+        },
+        "tensorrt_performance": {
+            "name": "TensorRT-LLM - Maximum Performance",
+            "description": "Maximum optimization with TensorRT",
+            "backend": "tensorrt_llm",
+            "model_precision": "float16",
+            "max_batch_size": 16,
+            "tensorrt_args": {
+                "enable_kv_cache_reuse": True,
+                "use_gpt_attention_plugin": True,
+                "remove_input_padding": True
+            }
+        },
+        "transformers_compatible": {
+            "name": "Transformers - Universal",
+            "description": "Maximum compatibility with all models",
+            "backend": "transformers",
+            "model_precision": "float16",
+            "max_batch_size": 4,
+            "gpu_memory_utilization": 0.8,
+            "transformers_args": {
+                "device_map": "auto",
+                "torch_dtype": "auto",
+                "low_cpu_mem_usage": True
+            }
+        }
+    }
+
+    return {
+        "success": True,
+        "presets": presets
+    }