isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- isa_model/client.py +1166 -584
- isa_model/core/cache/redis_cache.py +410 -0
- isa_model/core/config/config_manager.py +282 -12
- isa_model/core/config.py +91 -1
- isa_model/core/database/__init__.py +1 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +297 -0
- isa_model/core/database/supabase_client.py +258 -0
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +46 -0
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_billing_tracker.py +60 -88
- isa_model/core/models/model_manager.py +66 -25
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +217 -55
- isa_model/core/models/model_statistics_tracker.py +234 -0
- isa_model/core/models/model_storage.py +0 -1
- isa_model/core/models/model_version_manager.py +959 -0
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/pricing_manager.py +2 -249
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/resilience/circuit_breaker.py +366 -0
- isa_model/core/security/secrets.py +358 -0
- isa_model/core/services/__init__.py +2 -4
- isa_model/core/services/intelligent_model_selector.py +479 -370
- isa_model/core/storage/hf_storage.py +2 -2
- isa_model/core/types.py +8 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -368
- isa_model/deployment/local/__init__.py +31 -0
- isa_model/deployment/local/config.py +248 -0
- isa_model/deployment/local/gpu_gateway.py +607 -0
- isa_model/deployment/local/health_checker.py +428 -0
- isa_model/deployment/local/provider.py +586 -0
- isa_model/deployment/local/tensorrt_service.py +621 -0
- isa_model/deployment/local/transformers_service.py +644 -0
- isa_model/deployment/local/vllm_service.py +527 -0
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/modal/deployer.py +894 -0
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
- isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
- isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
- isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
- isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
- isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
- isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +179 -16
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/__init__.py +21 -0
- isa_model/inference/services/audio/base_realtime_service.py +225 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/isa_tts_service.py +0 -0
- isa_model/inference/services/audio/openai_realtime_service.py +320 -124
- isa_model/inference/services/audio/openai_stt_service.py +53 -11
- isa_model/inference/services/base_service.py +17 -1
- isa_model/inference/services/custom_model_manager.py +277 -0
- isa_model/inference/services/embedding/__init__.py +13 -0
- isa_model/inference/services/embedding/base_embed_service.py +111 -8
- isa_model/inference/services/embedding/isa_embed_service.py +305 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/openai_embed_service.py +2 -4
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
- isa_model/inference/services/img/__init__.py +2 -2
- isa_model/inference/services/img/base_image_gen_service.py +24 -7
- isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
- isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
- isa_model/inference/services/img/services/replicate_flux.py +226 -0
- isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
- isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
- isa_model/inference/services/img/tests/test_img_client.py +297 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +361 -26
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/local_llm_service.py +747 -0
- isa_model/inference/services/llm/ollama_llm_service.py +11 -3
- isa_model/inference/services/llm/openai_llm_service.py +670 -56
- isa_model/inference/services/llm/yyds_llm_service.py +10 -3
- isa_model/inference/services/vision/__init__.py +27 -6
- isa_model/inference/services/vision/base_vision_service.py +118 -185
- isa_model/inference/services/vision/blip_vision_service.py +359 -0
- isa_model/inference/services/vision/helpers/image_utils.py +19 -10
- isa_model/inference/services/vision/isa_vision_service.py +634 -0
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +240 -18
- isa_model/serving/api/middleware/auth.py +317 -0
- isa_model/serving/api/middleware/security.py +268 -0
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +489 -0
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +475 -0
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/logs.py +430 -0
- isa_model/serving/api/routes/settings.py +582 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +992 -171
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +318 -0
- isa_model/serving/modal_proxy_server.py +249 -0
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
- isa_model-0.4.3.dist-info/RECORD +193 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
- isa_model/deployment/cloud/modal/register_models.py +0 -321
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks.py +0 -469
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -18
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/factory.py +0 -531
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/metrics.py +0 -798
- isa_model/inference/adapter/unified_api.py +0 -248
- isa_model/inference/services/helpers/stacked_config.py +0 -148
- isa_model/inference/services/img/flux_professional_service.py +0 -603
- isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/others/table_transformer_service.py +0 -61
- isa_model/inference/services/vision/doc_analysis_service.py +0 -640
- isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/vision/ui_analysis_service.py +0 -823
- isa_model/scripts/inference_tracker.py +0 -283
- isa_model/scripts/mlflow_manager.py +0 -379
- isa_model/scripts/model_registry.py +0 -465
- isa_model/scripts/register_models.py +0 -370
- isa_model/scripts/register_models_with_embeddings.py +0 -510
- isa_model/scripts/start_mlflow.py +0 -95
- isa_model/scripts/training_tracker.py +0 -257
- isa_model/training/__init__.py +0 -74
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -23
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/factory.py +0 -424
- isa_model-0.3.91.dist-info/RECORD +0 -138
- /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
```diff
--- /dev/null
+++ isa_model/serving/api/routes/local_deployments.py
@@ -0,0 +1,448 @@
+"""
+Local GPU deployments API routes
+
+Endpoints for managing local GPU model deployments.
+"""
+
+import logging
+from typing import Dict, List, Optional, Any
+from fastapi import APIRouter, HTTPException, Depends, BackgroundTasks
+from pydantic import BaseModel, Field
+
+from ....deployment.core.deployment_manager import DeploymentManager
+from ....deployment.local.config import (
+    LocalGPUConfig, LocalServiceType, LocalBackend,
+    create_vllm_config, create_tensorrt_config, create_transformers_config
+)
+from ...middleware.auth import get_current_user
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(prefix="/api/v1/local", tags=["local-deployments"])
+
+
+# Request/Response Models
+class LocalDeployRequest(BaseModel):
+    """Local deployment request"""
+    service_name: str = Field(..., description="Unique service name")
+    model_id: str = Field(..., description="HuggingFace model ID")
+    backend: str = Field("transformers", description="Inference backend (vllm, tensorrt_llm, transformers)")
+    service_type: str = Field("llm", description="Service type (llm, vision, audio, embedding)")
+
+    # Model configuration
+    model_precision: str = Field("float16", description="Model precision")
+    max_model_len: int = Field(2048, description="Maximum sequence length")
+    max_batch_size: int = Field(8, description="Maximum batch size")
+
+    # GPU settings
+    gpu_id: Optional[int] = Field(None, description="Specific GPU ID to use")
+    gpu_memory_utilization: float = Field(0.9, description="GPU memory utilization fraction")
+
+    # Performance settings
+    tensor_parallel_size: int = Field(1, description="Tensor parallel size")
+    enable_chunked_prefill: bool = Field(True, description="Enable chunked prefill")
+    enable_prefix_caching: bool = Field(True, description="Enable prefix caching")
+
+    # Quantization
+    quantization: Optional[str] = Field(None, description="Quantization method (int8, int4, awq, gptq)")
+
+    # Advanced settings
+    trust_remote_code: bool = Field(False, description="Trust remote code in model")
+    revision: Optional[str] = Field(None, description="Model revision")
+
+    # Backend-specific settings
+    vllm_args: Dict[str, Any] = Field(default_factory=dict, description="Additional vLLM arguments")
+    tensorrt_args: Dict[str, Any] = Field(default_factory=dict, description="Additional TensorRT arguments")
+    transformers_args: Dict[str, Any] = Field(default_factory=dict, description="Additional Transformers arguments")
+
+
+class LocalServiceInfo(BaseModel):
+    """Local service information"""
+    service_name: str
+    model_id: str
+    backend: str
+    service_type: str
+    status: str
+    healthy: bool
+    response_time_ms: Optional[float] = None
+    error_count: int = 0
+    uptime_seconds: Optional[float] = None
+    deployed_at: Optional[str] = None
+
+
+class GenerateRequest(BaseModel):
+    """Text generation request"""
+    prompt: str = Field(..., description="Input prompt")
+    max_tokens: int = Field(512, description="Maximum tokens to generate")
+    temperature: float = Field(0.7, description="Sampling temperature")
+    top_p: float = Field(0.9, description="Top-p sampling")
+    top_k: int = Field(50, description="Top-k sampling")
+    stream: bool = Field(False, description="Stream response")
+
+
+class ChatCompletionRequest(BaseModel):
+    """Chat completion request"""
+    messages: List[Dict[str, str]] = Field(..., description="Chat messages")
+    max_tokens: int = Field(512, description="Maximum tokens to generate")
+    temperature: float = Field(0.7, description="Sampling temperature")
+    top_p: float = Field(0.9, description="Top-p sampling")
+    stream: bool = Field(False, description="Stream response")
+
+
+# Dependency injection
+async def get_deployment_manager() -> DeploymentManager:
+    """Get deployment manager instance"""
+    return DeploymentManager()
+
+
+@router.get("/status", summary="Get local GPU system status")
+async def get_local_status(
+    manager: DeploymentManager = Depends(get_deployment_manager)
+):
+    """Get overall local GPU system status including available resources"""
+    try:
+        status = await manager.get_local_system_status()
+        return {"success": True, "status": status}
+    except Exception as e:
+        logger.error(f"Failed to get local status: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/deploy", summary="Deploy model to local GPU")
+async def deploy_local_service(
+    request: LocalDeployRequest,
+    background_tasks: BackgroundTasks,
+    manager: DeploymentManager = Depends(get_deployment_manager),
+    current_user: Optional[Dict] = Depends(get_current_user)
+):
+    """Deploy a model service to local GPU"""
+    try:
+        # Convert request to configuration
+        config = LocalGPUConfig(
+            service_name=request.service_name,
+            service_type=LocalServiceType(request.service_type),
+            model_id=request.model_id,
+            backend=LocalBackend(request.backend),
+            model_precision=request.model_precision,
+            max_model_len=request.max_model_len,
+            max_batch_size=request.max_batch_size,
+            gpu_id=request.gpu_id,
+            gpu_memory_utilization=request.gpu_memory_utilization,
+            tensor_parallel_size=request.tensor_parallel_size,
+            enable_chunked_prefill=request.enable_chunked_prefill,
+            enable_prefix_caching=request.enable_prefix_caching,
+            quantization=request.quantization,
+            trust_remote_code=request.trust_remote_code,
+            revision=request.revision,
+            vllm_args=request.vllm_args,
+            tensorrt_args=request.tensorrt_args,
+            transformers_args=request.transformers_args
+        )
+
+        # Deploy service
+        result = await manager.deploy_to_local(config)
+
+        if result["success"]:
+            return {
+                "success": True,
+                "message": f"Service {request.service_name} deployed successfully",
+                "deployment": result
+            }
+        else:
+            raise HTTPException(status_code=400, detail=result.get("error", "Deployment failed"))
+
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=f"Invalid configuration: {e}")
+    except Exception as e:
+        logger.error(f"Local deployment failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/services", summary="List local GPU services")
+async def list_local_services(
+    manager: DeploymentManager = Depends(get_deployment_manager)
+) -> Dict[str, Any]:
+    """List all deployed local GPU services"""
+    try:
+        services = await manager.list_local_services()
+        return {
+            "success": True,
+            "services": services,
+            "count": len(services)
+        }
+    except Exception as e:
+        logger.error(f"Failed to list local services: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/services/{service_name}", summary="Get local service information")
+async def get_local_service(
+    service_name: str,
+    manager: DeploymentManager = Depends(get_deployment_manager)
+):
+    """Get detailed information about a specific local service"""
+    try:
+        service_info = await manager.get_local_service_info(service_name)
+
+        if service_info is None:
+            raise HTTPException(status_code=404, detail=f"Service {service_name} not found")
+
+        return {
+            "success": True,
+            "service": service_info
+        }
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Failed to get service info for {service_name}: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.delete("/services/{service_name}", summary="Undeploy local service")
+async def undeploy_local_service(
+    service_name: str,
+    manager: DeploymentManager = Depends(get_deployment_manager),
+    current_user: Optional[Dict] = Depends(get_current_user)
+):
+    """Stop and remove a deployed local service"""
+    try:
+        result = await manager.undeploy_local_service(service_name)
+
+        if result["success"]:
+            return {
+                "success": True,
+                "message": f"Service {service_name} undeployed successfully"
+            }
+        else:
+            raise HTTPException(status_code=400, detail=result.get("error", "Undeploy failed"))
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Failed to undeploy service {service_name}: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/services/{service_name}/generate", summary="Generate text using local service")
+async def generate_text(
+    service_name: str,
+    request: GenerateRequest,
+    manager: DeploymentManager = Depends(get_deployment_manager)
+):
+    """Generate text using a deployed local service"""
+    try:
+        # Get the local provider and call generate_text
+        local_provider = manager.local_provider
+
+        result = await local_provider.generate_text(
+            service_name=service_name,
+            prompt=request.prompt,
+            max_tokens=request.max_tokens,
+            temperature=request.temperature,
+            top_p=request.top_p,
+            top_k=request.top_k,
+            stream=request.stream
+        )
+
+        if result["success"]:
+            return result
+        else:
+            raise HTTPException(status_code=400, detail=result.get("error", "Generation failed"))
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Text generation failed for {service_name}: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/services/{service_name}/chat/completions", summary="Chat completion using local service")
+async def chat_completion(
+    service_name: str,
+    request: ChatCompletionRequest,
+    manager: DeploymentManager = Depends(get_deployment_manager)
+):
+    """Generate chat completion using a deployed local service"""
+    try:
+        # Get the local provider and call chat_completion
+        local_provider = manager.local_provider
+
+        result = await local_provider.chat_completion(
+            service_name=service_name,
+            messages=request.messages,
+            max_tokens=request.max_tokens,
+            temperature=request.temperature,
+            top_p=request.top_p,
+            stream=request.stream
+        )
+
+        if result["success"]:
+            return result
+        else:
+            raise HTTPException(status_code=400, detail=result.get("error", "Chat completion failed"))
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Chat completion failed for {service_name}: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/backends", summary="List available local backends")
+async def list_backends():
+    """List available local inference backends"""
+    backends = []
+
+    # Check backend availability
+    try:
+        import vllm
+        backends.append({
+            "name": "vllm",
+            "description": "High-performance LLM inference server",
+            "available": True,
+            "features": ["high_throughput", "dynamic_batching", "prefix_caching"]
+        })
+    except ImportError:
+        backends.append({
+            "name": "vllm",
+            "description": "High-performance LLM inference server",
+            "available": False,
+            "install_command": "pip install vllm"
+        })
+
+    try:
+        import tensorrt_llm
+        backends.append({
+            "name": "tensorrt_llm",
+            "description": "NVIDIA TensorRT-LLM for maximum optimization",
+            "available": True,
+            "features": ["maximum_performance", "tensorrt_optimization", "cuda_acceleration"]
+        })
+    except ImportError:
+        backends.append({
+            "name": "tensorrt_llm",
+            "description": "NVIDIA TensorRT-LLM for maximum optimization",
+            "available": False,
+            "install_command": "pip install tensorrt-llm"
+        })
+
+    try:
+        import transformers
+        backends.append({
+            "name": "transformers",
+            "description": "HuggingFace Transformers for universal compatibility",
+            "available": True,
+            "features": ["universal_compatibility", "all_model_types", "quantization_support"]
+        })
+    except ImportError:
+        backends.append({
+            "name": "transformers",
+            "description": "HuggingFace Transformers for universal compatibility",
+            "available": False,
+            "install_command": "pip install transformers"
+        })
+
+    return {
+        "success": True,
+        "backends": backends
+    }
+
+
+@router.get("/gpu-info", summary="Get GPU information")
+async def get_gpu_info():
+    """Get detailed information about available GPUs"""
+    try:
+        from ....utils.gpu_utils import get_gpu_manager
+
+        gpu_manager = get_gpu_manager()
+        system_info = gpu_manager.get_system_info()
+
+        return {
+            "success": True,
+            "gpu_info": system_info
+        }
+    except Exception as e:
+        logger.error(f"Failed to get GPU info: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/estimate-memory", summary="Estimate model memory requirements")
+async def estimate_memory(
+    model_id: str,
+    precision: str = "float16"
+):
+    """Estimate memory requirements for a model"""
+    try:
+        from ....utils.gpu_utils import estimate_model_memory
+
+        memory_mb = estimate_model_memory(model_id, precision)
+        memory_gb = memory_mb / 1024
+
+        return {
+            "success": True,
+            "model_id": model_id,
+            "precision": precision,
+            "estimated_memory_mb": memory_mb,
+            "estimated_memory_gb": round(memory_gb, 2)
+        }
+    except Exception as e:
+        logger.error(f"Failed to estimate memory for {model_id}: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/presets", summary="Get deployment configuration presets")
+async def get_deployment_presets():
+    """Get predefined deployment configuration presets"""
+    presets = {
+        "vllm_small": {
+            "name": "vLLM - Small Model",
+            "description": "Optimized for models up to 7B parameters",
+            "backend": "vllm",
+            "max_model_len": 2048,
+            "max_batch_size": 16,
+            "gpu_memory_utilization": 0.9,
+            "enable_chunked_prefill": True,
+            "enable_prefix_caching": True
+        },
+        "vllm_large": {
+            "name": "vLLM - Large Model",
+            "description": "Optimized for models 13B+ parameters",
+            "backend": "vllm",
+            "max_model_len": 4096,
+            "max_batch_size": 8,
+            "gpu_memory_utilization": 0.95,
+            "tensor_parallel_size": 2,
+            "enable_chunked_prefill": True,
+            "enable_prefix_caching": True
+        },
+        "tensorrt_performance": {
+            "name": "TensorRT-LLM - Maximum Performance",
+            "description": "Maximum optimization with TensorRT",
+            "backend": "tensorrt_llm",
+            "model_precision": "float16",
+            "max_batch_size": 16,
+            "tensorrt_args": {
+                "enable_kv_cache_reuse": True,
+                "use_gpt_attention_plugin": True,
+                "remove_input_padding": True
+            }
+        },
+        "transformers_compatible": {
+            "name": "Transformers - Universal",
+            "description": "Maximum compatibility with all models",
+            "backend": "transformers",
+            "model_precision": "float16",
+            "max_batch_size": 4,
+            "gpu_memory_utilization": 0.8,
+            "transformers_args": {
+                "device_map": "auto",
+                "torch_dtype": "auto",
+                "low_cpu_mem_usage": True
+            }
+        }
+    }
+
+    return {
+        "success": True,
+        "presets": presets
+    }
```