isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/client.py +466 -43
- isa_model/core/cache/redis_cache.py +12 -3
- isa_model/core/config/config_manager.py +230 -3
- isa_model/core/config.py +90 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +21 -1
- isa_model/core/database/supabase_client.py +154 -19
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +27 -18
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_manager.py +40 -17
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +174 -18
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/services/intelligent_model_selector.py +399 -21
- isa_model/core/storage/hf_storage.py +1 -1
- isa_model/core/types.py +1 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -370
- isa_model/deployment/local/__init__.py +31 -0
- isa_model/deployment/local/config.py +248 -0
- isa_model/deployment/local/gpu_gateway.py +607 -0
- isa_model/deployment/local/health_checker.py +428 -0
- isa_model/deployment/local/provider.py +586 -0
- isa_model/deployment/local/tensorrt_service.py +621 -0
- isa_model/deployment/local/transformers_service.py +644 -0
- isa_model/deployment/local/vllm_service.py +527 -0
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +137 -10
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/openai_stt_service.py +22 -6
- isa_model/inference/services/custom_model_manager.py +277 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +335 -24
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/local_llm_service.py +747 -0
- isa_model/inference/services/llm/ollama_llm_service.py +9 -2
- isa_model/inference/services/llm/openai_llm_service.py +33 -16
- isa_model/inference/services/llm/yyds_llm_service.py +8 -2
- isa_model/inference/services/vision/__init__.py +22 -1
- isa_model/inference/services/vision/blip_vision_service.py +359 -0
- isa_model/inference/services/vision/helpers/image_utils.py +8 -5
- isa_model/inference/services/vision/isa_vision_service.py +65 -4
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +172 -22
- isa_model/serving/api/middleware/auth.py +8 -2
- isa_model/serving/api/middleware/security.py +23 -33
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +4 -1
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +138 -2
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +680 -18
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +68 -54
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
- isa_model-0.4.3.dist-info/RECORD +193 -0
- isa_model/core/storage/minio_storage.py +0 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks/__init__.py +0 -27
- isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
- isa_model/eval/benchmarks.py +0 -701
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -24
- isa_model/eval/evaluators/audio_evaluator.py +0 -727
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/embedding_evaluator.py +0 -742
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/evaluators/vision_evaluator.py +0 -564
- isa_model/eval/example_evaluation.py +0 -395
- isa_model/eval/factory.py +0 -798
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/isa_benchmarks.py +0 -700
- isa_model/eval/isa_integration.py +0 -582
- isa_model/eval/metrics.py +0 -951
- isa_model/eval/tests/unit/test_basic.py +0 -396
- isa_model/serving/api/routes/evaluations.py +0 -579
- isa_model/training/__init__.py +0 -168
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -26
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/examples/intelligent_training_example.py +0 -281
- isa_model/training/factory.py +0 -424
- isa_model/training/intelligent/__init__.py +0 -25
- isa_model/training/intelligent/decision_engine.py +0 -643
- isa_model/training/intelligent/intelligent_factory.py +0 -888
- isa_model/training/intelligent/knowledge_base.py +0 -751
- isa_model/training/intelligent/resource_optimizer.py +0 -839
- isa_model/training/intelligent/task_classifier.py +0 -576
- isa_model/training/storage/__init__.py +0 -24
- isa_model/training/storage/core_integration.py +0 -439
- isa_model/training/storage/training_repository.py +0 -552
- isa_model/training/storage/training_storage.py +0 -628
- isa_model-0.4.0.dist-info/RECORD +0 -182
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/deployment/local/gpu_gateway.py (new file)
@@ -0,0 +1,607 @@
"""
GPU Gateway for PaaS/SaaS Platform
Unified gateway and management service for local GPU resources
"""

import asyncio
import logging
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
from enum import Enum
import aiohttp
from aiohttp import web
import time
import json
from pathlib import Path

from .provider import LocalGPUProvider
from .config import LocalGPUConfig, LocalServiceType, LocalBackend
from .health_checker import LocalHealthChecker
from ..core.base_deployment import DeploymentResult


logger = logging.getLogger(__name__)


class GPUPoolStatus(Enum):
    """GPU resource pool status"""
    AVAILABLE = "available"
    BUSY = "busy"
    MAINTENANCE = "maintenance"
    ERROR = "error"


@dataclass
class GPUNode:
    """GPU node information"""
    node_id: str
    hostname: str
    gpu_count: int
    gpu_memory_total: int  # MB
    gpu_memory_free: int  # MB
    status: GPUPoolStatus = GPUPoolStatus.AVAILABLE
    current_models: List[str] = field(default_factory=list)
    max_concurrent_requests: int = 10
    current_requests: int = 0
    last_heartbeat: float = field(default_factory=time.time)


@dataclass
class TenantConfig:
    """Tenant configuration"""
    tenant_id: str
    gpu_quota: int  # number of GPUs allocated
    memory_quota: int  # allocated GPU memory in MB
    priority: int = 1  # priority, 1-10
    allowed_models: List[str] = field(default_factory=list)
    rate_limit: int = 100  # requests per minute


class GPUGateway:
    """
    GPU gateway service - local GPU resource management for the PaaS platform

    Responsibilities:
    1. GPU resource pool management
    2. Multi-tenant resource isolation
    3. Load balancing and routing
    4. Service discovery and health checks
    5. Communication with the cloud API
    """

    def __init__(self,
                 gateway_port: int = 8888,
                 cloud_api_url: Optional[str] = None,
                 workspace_dir: str = "./gpu_gateway"):
        self.gateway_port = gateway_port
        self.cloud_api_url = cloud_api_url
        self.workspace_dir = Path(workspace_dir)
        self.workspace_dir.mkdir(exist_ok=True)

        # GPU resource management
        self.gpu_nodes: Dict[str, GPUNode] = {}
        self.gpu_providers: Dict[str, LocalGPUProvider] = {}
        self.health_checkers: Dict[str, LocalHealthChecker] = {}

        # Tenant management
        self.tenants: Dict[str, TenantConfig] = {}

        # Request routing and load balancing
        self.request_queue = asyncio.Queue()
        self.model_routing: Dict[str, List[str]] = {}  # model_id -> [node_ids]

        # Monitoring data
        self.metrics = {
            "total_requests": 0,
            "successful_requests": 0,
            "failed_requests": 0,
            "average_latency": 0.0,
            "gpu_utilization": {}
        }

        # Run flag
        self._running = False
        self._background_tasks = []

    async def start(self):
        """Start the GPU gateway service"""
        logger.info("🚀 Starting GPU Gateway...")

        # Discover local GPU nodes
        await self._discover_gpu_nodes()

        # Start background tasks
        self._running = True
        self._background_tasks = [
            asyncio.create_task(self._heartbeat_monitor()),
            asyncio.create_task(self._resource_balancer()),
            asyncio.create_task(self._metrics_collector()),
            asyncio.create_task(self._cloud_sync()) if self.cloud_api_url else None
        ]
        self._background_tasks = [t for t in self._background_tasks if t]

        # Start the HTTP server
        app = self._create_app()
        runner = web.AppRunner(app)
        await runner.setup()
        site = web.TCPSite(runner, '0.0.0.0', self.gateway_port)
        await site.start()

        logger.info(f"✅ GPU Gateway started on port {self.gateway_port}")
        return runner

    async def stop(self):
        """Stop the gateway service"""
        self._running = False

        # Cancel background tasks
        for task in self._background_tasks:
            task.cancel()

        # Shut down all GPU services
        for provider in self.gpu_providers.values():
            await provider.cleanup()

        logger.info("🛑 GPU Gateway stopped")

    async def _discover_gpu_nodes(self):
        """Discover local GPU nodes"""
        logger.info("🔍 Discovering local GPU nodes...")

        try:
            # Detect local GPUs
            from ...utils.gpu_utils import GPUManager
            gpu_manager = GPUManager()

            if gpu_manager.is_cuda_available():
                gpu_info = gpu_manager.get_gpu_info()

                for i, gpu in enumerate(gpu_info):
                    node_id = f"local-gpu-{i}"
                    node = GPUNode(
                        node_id=node_id,
                        hostname="localhost",
                        gpu_count=1,
                        gpu_memory_total=gpu.get('memory_total_mb', 0),
                        gpu_memory_free=gpu.get('memory_free_mb', 0)
                    )

                    self.gpu_nodes[node_id] = node

                    # Create a GPU provider
                    provider = LocalGPUProvider(
                        workspace_dir=self.workspace_dir / node_id,
                        gpu_id=i
                    )
                    self.gpu_providers[node_id] = provider

                    # Create a health checker
                    health_checker = LocalHealthChecker()
                    self.health_checkers[node_id] = health_checker

                    logger.info(f"✅ Discovered GPU node: {node_id}")

            else:
                logger.warning("⚠️ No CUDA GPUs detected")

        except Exception as e:
            logger.error(f"❌ GPU discovery failed: {e}")

    def register_tenant(self, tenant_config: TenantConfig):
        """Register a tenant"""
        self.tenants[tenant_config.tenant_id] = tenant_config
        logger.info(f"👤 Registered tenant: {tenant_config.tenant_id}")

    async def deploy_model(self,
                           tenant_id: str,
                           model_id: str,
                           config: LocalGPUConfig,
                           preferred_node: Optional[str] = None) -> DeploymentResult:
        """Deploy a model for a tenant"""

        # Validate the tenant
        if tenant_id not in self.tenants:
            return DeploymentResult(
                success=False,
                error=f"Unknown tenant: {tenant_id}"
            )

        tenant = self.tenants[tenant_id]

        # Check model permissions
        if tenant.allowed_models and model_id not in tenant.allowed_models:
            return DeploymentResult(
                success=False,
                error=f"Model {model_id} not allowed for tenant {tenant_id}"
            )

        # Select a GPU node
        node_id = preferred_node or await self._select_best_node(config, tenant)
        if not node_id:
            return DeploymentResult(
                success=False,
                error="No available GPU nodes"
            )

        # Deploy the model
        try:
            provider = self.gpu_providers[node_id]
            result = await provider.deploy_model(config)

            if result.success:
                # Update node state
                node = self.gpu_nodes[node_id]
                node.current_models.append(model_id)

                # Update the routing table
                if model_id not in self.model_routing:
                    self.model_routing[model_id] = []
                self.model_routing[model_id].append(node_id)

                logger.info(f"✅ Deployed {model_id} for tenant {tenant_id} on {node_id}")

            return result

        except Exception as e:
            logger.error(f"❌ Deployment failed: {e}")
            return DeploymentResult(
                success=False,
                error=str(e)
            )

    async def inference_request(self,
                                tenant_id: str,
                                model_id: str,
                                request_data: Dict[str, Any]) -> Dict[str, Any]:
        """Handle an inference request"""

        # Validate the tenant
        if tenant_id not in self.tenants:
            return {"error": f"Unknown tenant: {tenant_id}"}

        # Check rate limiting
        tenant = self.tenants[tenant_id]
        # TODO: implement rate-limiting logic

        # Select a serving node
        node_id = await self._route_request(model_id, tenant)
        if not node_id:
            return {"error": "No available nodes for model"}

        # Run inference
        try:
            start_time = time.time()
            provider = self.gpu_providers[node_id]

            # Dispatch to the right provider method based on request type
            if "messages" in request_data:
                # Chat completion
                result = await provider.chat_completion(
                    model_id=model_id,
                    **request_data
                )
            else:
                # Text completion
                result = await provider.text_completion(
                    model_id=model_id,
                    **request_data
                )

            # Update metrics
            latency = time.time() - start_time
            await self._update_metrics(tenant_id, latency, True)

            return result

        except Exception as e:
            await self._update_metrics(tenant_id, 0, False)
            logger.error(f"❌ Inference failed: {e}")
            return {"error": str(e)}
        finally:
            # Release the load slot acquired in _route_request, whether the
            # request succeeded or failed (otherwise the counter leaks)
            node = self.gpu_nodes[node_id]
            node.current_requests = max(0, node.current_requests - 1)

    async def _select_best_node(self,
                                config: LocalGPUConfig,
                                tenant: TenantConfig) -> Optional[str]:
        """Select the best GPU node"""
        available_nodes = []

        for node_id, node in self.gpu_nodes.items():
            if (node.status == GPUPoolStatus.AVAILABLE and
                node.gpu_memory_free > 2000):  # at least 2 GB free

                # Score the node (more free memory and a lighter current load score higher)
                score = (
                    node.gpu_memory_free * 0.6 +  # memory weight
                    (node.max_concurrent_requests - node.current_requests) * 0.3 +  # load weight
                    tenant.priority * 0.1  # tenant-priority weight
                )

                available_nodes.append((node_id, score))

        if available_nodes:
            # Pick the highest-scoring node
            available_nodes.sort(key=lambda x: x[1], reverse=True)
            return available_nodes[0][0]

        return None

    async def _route_request(self, model_id: str, tenant: TenantConfig) -> Optional[str]:
        """Route an inference request to a suitable node"""
        if model_id not in self.model_routing:
            return None

        # Among the nodes serving this model, pick the least loaded
        candidate_nodes = self.model_routing[model_id]
        best_node = None
        min_load = float('inf')

        for node_id in candidate_nodes:
            node = self.gpu_nodes[node_id]
            if (node.status == GPUPoolStatus.AVAILABLE and
                node.current_requests < node.max_concurrent_requests):

                # Load adjusted for tenant priority
                load = node.current_requests / node.max_concurrent_requests
                adjusted_load = load / max(tenant.priority, 1)

                if adjusted_load < min_load:
                    min_load = adjusted_load
                    best_node = node_id

        if best_node:
            # Increment the node's load counter
            self.gpu_nodes[best_node].current_requests += 1

        return best_node

    async def _heartbeat_monitor(self):
        """Monitor GPU node heartbeats"""
        while self._running:
            try:
                current_time = time.time()

                for node_id, node in self.gpu_nodes.items():
                    # Check for heartbeat timeout
                    if current_time - node.last_heartbeat > 30:  # 30-second timeout
                        logger.warning(f"⚠️ Node {node_id} heartbeat timeout")
                        node.status = GPUPoolStatus.ERROR

                    # Refresh GPU status
                    if node_id in self.health_checkers:
                        health = await self.health_checkers[node_id].check_service_health("gpu-status")
                        if health.get("success"):
                            node.status = GPUPoolStatus.AVAILABLE
                            node.gpu_memory_free = health.get("gpu_memory_free", 0)
                            node.last_heartbeat = current_time

                await asyncio.sleep(10)  # check every 10 seconds

            except Exception as e:
                logger.error(f"❌ Heartbeat monitor error: {e}")
                await asyncio.sleep(5)

    async def _resource_balancer(self):
        """Resource balancer"""
        while self._running:
            try:
                # Inspect resource usage
                for node_id, node in self.gpu_nodes.items():
                    utilization = node.current_requests / node.max_concurrent_requests

                    # If a node is overloaded, consider migrating some services
                    if utilization > 0.9:
                        logger.info(f"🔄 Node {node_id} is overloaded ({utilization:.1%})")
                        # TODO: implement load-migration logic

                await asyncio.sleep(30)  # rebalance every 30 seconds

            except Exception as e:
                logger.error(f"❌ Resource balancer error: {e}")
                await asyncio.sleep(10)

    async def _metrics_collector(self):
        """Metrics collector"""
        while self._running:
            try:
                # Collect GPU utilization
                for node_id, node in self.gpu_nodes.items():
                    utilization = node.current_requests / node.max_concurrent_requests
                    self.metrics["gpu_utilization"][node_id] = utilization

                await asyncio.sleep(5)  # collect every 5 seconds

            except Exception as e:
                logger.error(f"❌ Metrics collector error: {e}")
                await asyncio.sleep(10)

    async def _cloud_sync(self):
        """Sync with the cloud API"""
        if not self.cloud_api_url:
            return

        while self._running:
            try:
                # Report GPU status to the cloud
                status_data = {
                    "gateway_id": "local-gpu-gateway",
                    "nodes": [
                        {
                            "node_id": node.node_id,
                            "status": node.status.value,
                            "gpu_count": node.gpu_count,
                            "memory_free": node.gpu_memory_free,
                            "current_models": node.current_models,
                            "current_load": node.current_requests
                        }
                        for node in self.gpu_nodes.values()
                    ],
                    "metrics": self.metrics
                }

                async with aiohttp.ClientSession() as session:
                    async with session.post(
                        f"{self.cloud_api_url}/api/gpu-gateway/status",
                        json=status_data
                    ) as response:
                        if response.status == 200:
                            logger.debug("✅ Status synced to cloud")
                        else:
                            logger.warning(f"⚠️ Cloud sync failed: {response.status}")

                await asyncio.sleep(60)  # sync once per minute

            except Exception as e:
                logger.error(f"❌ Cloud sync error: {e}")
                await asyncio.sleep(30)

    async def _update_metrics(self, tenant_id: str, latency: float, success: bool):
        """Update metrics"""
        self.metrics["total_requests"] += 1

        if success:
            self.metrics["successful_requests"] += 1
            # Update the running average latency
            current_avg = self.metrics["average_latency"]
            total_successful = self.metrics["successful_requests"]
            self.metrics["average_latency"] = (
                (current_avg * (total_successful - 1) + latency) / total_successful
            )
        else:
            self.metrics["failed_requests"] += 1

    def _create_app(self) -> web.Application:
        """Create the HTTP API application"""
        app = web.Application()

        # API routes
        app.router.add_post('/deploy', self._handle_deploy)
        app.router.add_post('/inference', self._handle_inference)
        app.router.add_get('/status', self._handle_status)
        app.router.add_get('/metrics', self._handle_metrics)
        app.router.add_post('/tenants', self._handle_register_tenant)

        return app

    async def _handle_deploy(self, request: web.Request) -> web.Response:
        """Handle a deployment request"""
        try:
            data = await request.json()

            config = LocalGPUConfig(
                service_name=data["service_name"],
                service_type=LocalServiceType(data["service_type"]),
                model_id=data["model_id"],
                backend=LocalBackend(data.get("backend", "transformers"))
            )

            result = await self.deploy_model(
                tenant_id=data["tenant_id"],
                model_id=data["model_id"],
                config=config,
                preferred_node=data.get("preferred_node")
            )

            return web.json_response({
                "success": result.success,
                "error": result.error,
                "service_name": result.service_name,
                "service_info": result.service_info
            })

        except Exception as e:
            return web.json_response({
                "success": False,
                "error": str(e)
            }, status=400)

    async def _handle_inference(self, request: web.Request) -> web.Response:
        """Handle an inference request"""
        try:
            data = await request.json()

            result = await self.inference_request(
                tenant_id=data["tenant_id"],
                model_id=data["model_id"],
                request_data=data["request"]
            )

            return web.json_response(result)

        except Exception as e:
            return web.json_response({
                "error": str(e)
            }, status=400)

    async def _handle_status(self, request: web.Request) -> web.Response:
        """Return gateway status"""
        status = {
            "gateway_status": "running" if self._running else "stopped",
            "total_nodes": len(self.gpu_nodes),
            "healthy_nodes": sum(
                1 for node in self.gpu_nodes.values()
                if node.status == GPUPoolStatus.AVAILABLE
            ),
            "total_tenants": len(self.tenants),
            "deployed_models": len(self.model_routing),
            "nodes": [
                {
                    "node_id": node.node_id,
                    "status": node.status.value,
                    "gpu_memory_free": node.gpu_memory_free,
                    "current_requests": node.current_requests,
                    "models": node.current_models
                }
                for node in self.gpu_nodes.values()
            ]
        }

        return web.json_response(status)

    async def _handle_metrics(self, request: web.Request) -> web.Response:
        """Return metrics data"""
        return web.json_response(self.metrics)

    async def _handle_register_tenant(self, request: web.Request) -> web.Response:
        """Register a tenant"""
        try:
            data = await request.json()

            tenant_config = TenantConfig(
                tenant_id=data["tenant_id"],
                gpu_quota=data.get("gpu_quota", 1),
                memory_quota=data.get("memory_quota", 8192),
                priority=data.get("priority", 1),
                allowed_models=data.get("allowed_models", []),
                rate_limit=data.get("rate_limit", 100)
            )

            self.register_tenant(tenant_config)

            return web.json_response({
                "success": True,
                "tenant_id": tenant_config.tenant_id
            })

        except Exception as e:
            return web.json_response({
                "success": False,
                "error": str(e)
            }, status=400)


# Convenience function
async def create_gpu_gateway(gateway_port: int = 8888,
                             cloud_api_url: Optional[str] = None,
                             workspace_dir: str = "./gpu_gateway") -> GPUGateway:
    """Create and start a GPU gateway"""
    gateway = GPUGateway(
        gateway_port=gateway_port,
        cloud_api_url=cloud_api_url,
        workspace_dir=workspace_dir
    )

    await gateway.start()
    return gateway
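
For orientation, here is a minimal, hypothetical driver for the new gateway module. The import path, `create_gpu_gateway`, the HTTP routes, and the JSON field names all come from the file above; the `service_type` value `"llm"`, the `gpt2` model ID, and the tenant ID are illustrative assumptions, and the sketch presumes the sibling `provider`, `config`, and `health_checker` modules (also added in this release) are importable and that a CUDA GPU is present.

# Hypothetical end-to-end sketch; routes and payload keys mirror
# _create_app/_handle_* in gpu_gateway.py. service_type="llm" and
# model_id="gpt2" are illustrative assumptions, not values confirmed
# by this diff.
import asyncio
import aiohttp

from isa_model.deployment.local.gpu_gateway import create_gpu_gateway

GATEWAY = "http://localhost:8888"  # default gateway_port

async def main():
    # Start the gateway in-process; it discovers local CUDA GPUs and
    # serves the HTTP API on gateway_port.
    gateway = await create_gpu_gateway(gateway_port=8888)

    async with aiohttp.ClientSession() as session:
        # Register a tenant (POST /tenants -> _handle_register_tenant)
        await session.post(f"{GATEWAY}/tenants", json={
            "tenant_id": "tenant-a",
            "gpu_quota": 1,
            "memory_quota": 8192,
            "priority": 5,
        })

        # Deploy a model for the tenant (POST /deploy -> _handle_deploy)
        await session.post(f"{GATEWAY}/deploy", json={
            "tenant_id": "tenant-a",
            "service_name": "tenant-a-gpt2",
            "service_type": "llm",      # assumed LocalServiceType value
            "model_id": "gpt2",
            "backend": "transformers",  # the handler's default backend
        })

        # Chat-style inference: a "messages" key routes the request to
        # provider.chat_completion inside inference_request
        resp = await session.post(f"{GATEWAY}/inference", json={
            "tenant_id": "tenant-a",
            "model_id": "gpt2",
            "request": {"messages": [{"role": "user", "content": "Hello"}]},
        })
        print(await resp.json())

    await gateway.stop()

asyncio.run(main())

One detail worth noting from the code above: `_update_metrics` keeps no per-request history; it maintains the running average latency with the standard incremental form avg_n = ((n-1) * avg_{n-1} + latency) / n over the successful-request count.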