isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. isa_model/client.py +1166 -584
  2. isa_model/core/cache/redis_cache.py +410 -0
  3. isa_model/core/config/config_manager.py +282 -12
  4. isa_model/core/config.py +91 -1
  5. isa_model/core/database/__init__.py +1 -0
  6. isa_model/core/database/direct_db_client.py +114 -0
  7. isa_model/core/database/migration_manager.py +563 -0
  8. isa_model/core/database/migrations.py +297 -0
  9. isa_model/core/database/supabase_client.py +258 -0
  10. isa_model/core/dependencies.py +316 -0
  11. isa_model/core/discovery/__init__.py +19 -0
  12. isa_model/core/discovery/consul_discovery.py +190 -0
  13. isa_model/core/logging/__init__.py +54 -0
  14. isa_model/core/logging/influx_logger.py +523 -0
  15. isa_model/core/logging/loki_logger.py +160 -0
  16. isa_model/core/models/__init__.py +46 -0
  17. isa_model/core/models/config_models.py +625 -0
  18. isa_model/core/models/deployment_billing_tracker.py +430 -0
  19. isa_model/core/models/model_billing_tracker.py +60 -88
  20. isa_model/core/models/model_manager.py +66 -25
  21. isa_model/core/models/model_metadata.py +690 -0
  22. isa_model/core/models/model_repo.py +217 -55
  23. isa_model/core/models/model_statistics_tracker.py +234 -0
  24. isa_model/core/models/model_storage.py +0 -1
  25. isa_model/core/models/model_version_manager.py +959 -0
  26. isa_model/core/models/system_models.py +857 -0
  27. isa_model/core/pricing_manager.py +2 -249
  28. isa_model/core/repositories/__init__.py +9 -0
  29. isa_model/core/repositories/config_repository.py +912 -0
  30. isa_model/core/resilience/circuit_breaker.py +366 -0
  31. isa_model/core/security/secrets.py +358 -0
  32. isa_model/core/services/__init__.py +2 -4
  33. isa_model/core/services/intelligent_model_selector.py +479 -370
  34. isa_model/core/storage/hf_storage.py +2 -2
  35. isa_model/core/types.py +8 -0
  36. isa_model/deployment/__init__.py +5 -48
  37. isa_model/deployment/core/__init__.py +2 -31
  38. isa_model/deployment/core/deployment_manager.py +1278 -368
  39. isa_model/deployment/local/__init__.py +31 -0
  40. isa_model/deployment/local/config.py +248 -0
  41. isa_model/deployment/local/gpu_gateway.py +607 -0
  42. isa_model/deployment/local/health_checker.py +428 -0
  43. isa_model/deployment/local/provider.py +586 -0
  44. isa_model/deployment/local/tensorrt_service.py +621 -0
  45. isa_model/deployment/local/transformers_service.py +644 -0
  46. isa_model/deployment/local/vllm_service.py +527 -0
  47. isa_model/deployment/modal/__init__.py +8 -0
  48. isa_model/deployment/modal/config.py +136 -0
  49. isa_model/deployment/modal/deployer.py +894 -0
  50. isa_model/deployment/modal/services/__init__.py +3 -0
  51. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  52. isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
  53. isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
  54. isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
  55. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  56. isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
  57. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  58. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  59. isa_model/deployment/modal/services/video/__init__.py +1 -0
  60. isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
  61. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  62. isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
  63. isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
  64. isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
  65. isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
  66. isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
  67. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  68. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  69. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  70. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  71. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  72. isa_model/deployment/storage/__init__.py +5 -0
  73. isa_model/deployment/storage/deployment_repository.py +824 -0
  74. isa_model/deployment/triton/__init__.py +10 -0
  75. isa_model/deployment/triton/config.py +196 -0
  76. isa_model/deployment/triton/configs/__init__.py +1 -0
  77. isa_model/deployment/triton/provider.py +512 -0
  78. isa_model/deployment/triton/scripts/__init__.py +1 -0
  79. isa_model/deployment/triton/templates/__init__.py +1 -0
  80. isa_model/inference/__init__.py +47 -1
  81. isa_model/inference/ai_factory.py +179 -16
  82. isa_model/inference/legacy_services/__init__.py +21 -0
  83. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  84. isa_model/inference/legacy_services/model_service.py +573 -0
  85. isa_model/inference/legacy_services/model_serving.py +717 -0
  86. isa_model/inference/legacy_services/model_training.py +561 -0
  87. isa_model/inference/models/__init__.py +21 -0
  88. isa_model/inference/models/inference_config.py +551 -0
  89. isa_model/inference/models/inference_record.py +675 -0
  90. isa_model/inference/models/performance_models.py +714 -0
  91. isa_model/inference/repositories/__init__.py +9 -0
  92. isa_model/inference/repositories/inference_repository.py +828 -0
  93. isa_model/inference/services/audio/__init__.py +21 -0
  94. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  95. isa_model/inference/services/audio/base_stt_service.py +184 -11
  96. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  97. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  98. isa_model/inference/services/audio/openai_stt_service.py +53 -11
  99. isa_model/inference/services/base_service.py +17 -1
  100. isa_model/inference/services/custom_model_manager.py +277 -0
  101. isa_model/inference/services/embedding/__init__.py +13 -0
  102. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  103. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  104. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  105. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  106. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  107. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  108. isa_model/inference/services/img/__init__.py +2 -2
  109. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  110. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  111. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  112. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  113. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  114. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  115. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  116. isa_model/inference/services/llm/__init__.py +10 -2
  117. isa_model/inference/services/llm/base_llm_service.py +361 -26
  118. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  119. isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
  120. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  121. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  122. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  123. isa_model/inference/services/llm/local_llm_service.py +747 -0
  124. isa_model/inference/services/llm/ollama_llm_service.py +11 -3
  125. isa_model/inference/services/llm/openai_llm_service.py +670 -56
  126. isa_model/inference/services/llm/yyds_llm_service.py +10 -3
  127. isa_model/inference/services/vision/__init__.py +27 -6
  128. isa_model/inference/services/vision/base_vision_service.py +118 -185
  129. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  130. isa_model/inference/services/vision/helpers/image_utils.py +19 -10
  131. isa_model/inference/services/vision/isa_vision_service.py +634 -0
  132. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  133. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  134. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  135. isa_model/serving/api/cache_manager.py +245 -0
  136. isa_model/serving/api/dependencies/__init__.py +1 -0
  137. isa_model/serving/api/dependencies/auth.py +194 -0
  138. isa_model/serving/api/dependencies/database.py +139 -0
  139. isa_model/serving/api/error_handlers.py +284 -0
  140. isa_model/serving/api/fastapi_server.py +240 -18
  141. isa_model/serving/api/middleware/auth.py +317 -0
  142. isa_model/serving/api/middleware/security.py +268 -0
  143. isa_model/serving/api/middleware/tenant_context.py +414 -0
  144. isa_model/serving/api/routes/analytics.py +489 -0
  145. isa_model/serving/api/routes/config.py +645 -0
  146. isa_model/serving/api/routes/deployment_billing.py +315 -0
  147. isa_model/serving/api/routes/deployments.py +475 -0
  148. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  149. isa_model/serving/api/routes/health.py +32 -12
  150. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  151. isa_model/serving/api/routes/local_deployments.py +448 -0
  152. isa_model/serving/api/routes/logs.py +430 -0
  153. isa_model/serving/api/routes/settings.py +582 -0
  154. isa_model/serving/api/routes/tenants.py +575 -0
  155. isa_model/serving/api/routes/unified.py +992 -171
  156. isa_model/serving/api/routes/webhooks.py +479 -0
  157. isa_model/serving/api/startup.py +318 -0
  158. isa_model/serving/modal_proxy_server.py +249 -0
  159. isa_model/utils/gpu_utils.py +311 -0
  160. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
  161. isa_model-0.4.3.dist-info/RECORD +193 -0
  162. isa_model/deployment/cloud/__init__.py +0 -9
  163. isa_model/deployment/cloud/modal/__init__.py +0 -10
  164. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  165. isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
  166. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
  167. isa_model/deployment/cloud/modal/register_models.py +0 -321
  168. isa_model/deployment/core/deployment_config.py +0 -356
  169. isa_model/deployment/core/isa_deployment_service.py +0 -401
  170. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  171. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  172. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  173. isa_model/deployment/runtime/deployed_service.py +0 -338
  174. isa_model/deployment/services/__init__.py +0 -9
  175. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  176. isa_model/deployment/services/model_service.py +0 -332
  177. isa_model/deployment/services/service_monitor.py +0 -356
  178. isa_model/deployment/services/service_registry.py +0 -527
  179. isa_model/eval/__init__.py +0 -92
  180. isa_model/eval/benchmarks.py +0 -469
  181. isa_model/eval/config/__init__.py +0 -10
  182. isa_model/eval/config/evaluation_config.py +0 -108
  183. isa_model/eval/evaluators/__init__.py +0 -18
  184. isa_model/eval/evaluators/base_evaluator.py +0 -503
  185. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  186. isa_model/eval/factory.py +0 -531
  187. isa_model/eval/infrastructure/__init__.py +0 -24
  188. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  189. isa_model/eval/metrics.py +0 -798
  190. isa_model/inference/adapter/unified_api.py +0 -248
  191. isa_model/inference/services/helpers/stacked_config.py +0 -148
  192. isa_model/inference/services/img/flux_professional_service.py +0 -603
  193. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  194. isa_model/inference/services/others/table_transformer_service.py +0 -61
  195. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  196. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  197. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  198. isa_model/scripts/inference_tracker.py +0 -283
  199. isa_model/scripts/mlflow_manager.py +0 -379
  200. isa_model/scripts/model_registry.py +0 -465
  201. isa_model/scripts/register_models.py +0 -370
  202. isa_model/scripts/register_models_with_embeddings.py +0 -510
  203. isa_model/scripts/start_mlflow.py +0 -95
  204. isa_model/scripts/training_tracker.py +0 -257
  205. isa_model/training/__init__.py +0 -74
  206. isa_model/training/annotation/annotation_schema.py +0 -47
  207. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  208. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  209. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  210. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  211. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  212. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  213. isa_model/training/annotation/views/annotation_controller.py +0 -158
  214. isa_model/training/cloud/__init__.py +0 -22
  215. isa_model/training/cloud/job_orchestrator.py +0 -402
  216. isa_model/training/cloud/runpod_trainer.py +0 -454
  217. isa_model/training/cloud/storage_manager.py +0 -482
  218. isa_model/training/core/__init__.py +0 -23
  219. isa_model/training/core/config.py +0 -181
  220. isa_model/training/core/dataset.py +0 -222
  221. isa_model/training/core/trainer.py +0 -720
  222. isa_model/training/core/utils.py +0 -213
  223. isa_model/training/factory.py +0 -424
  224. isa_model-0.3.91.dist-info/RECORD +0 -138
  225. /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
  226. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  227. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  228. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,607 @@
1
+ """
2
+ GPU Gateway for PaaS/SaaS Platform
3
+ Unified gateway and management service for local GPU resources
4
+ """
5
+
6
+ import asyncio
7
+ import logging
8
+ from typing import Dict, List, Optional, Any
9
+ from dataclasses import dataclass, field
10
+ from enum import Enum
11
+ import aiohttp
12
+ from aiohttp import web
13
+ import time
14
+ import json
15
+ from pathlib import Path
16
+
17
+ from .provider import LocalGPUProvider
18
+ from .config import LocalGPUConfig, LocalServiceType, LocalBackend
19
+ from .health_checker import LocalHealthChecker
20
+ from ..core.base_deployment import DeploymentResult
21
+
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
class GPUPoolStatus(Enum):
    """State of a node in the GPU resource pool."""

    # Node is eligible to be picked by the scheduling/routing code.
    AVAILABLE = "available"
    # Node is occupied.
    BUSY = "busy"
    # Node has been taken out of rotation for maintenance.
    MAINTENANCE = "maintenance"
    # Node is considered faulty (e.g. set after a heartbeat timeout).
    ERROR = "error"
32
+
33
+
34
@dataclass
class GPUNode:
    """Bookkeeping record for one GPU node managed by the gateway."""

    # Identifier used as the key in the gateway's node/provider tables.
    node_id: str
    hostname: str
    gpu_count: int
    # Total GPU memory, in MB.
    gpu_memory_total: int
    # Currently free GPU memory, in MB.
    gpu_memory_free: int
    status: GPUPoolStatus = GPUPoolStatus.AVAILABLE
    # Model ids deployed on this node; each instance gets its own list.
    current_models: List[str] = field(default_factory=list)
    # Upper bound on simultaneously routed requests.
    max_concurrent_requests: int = 10
    # Number of requests currently routed to this node.
    current_requests: int = 0
    # Unix timestamp of the last heartbeat; defaults to creation time.
    last_heartbeat: float = field(default_factory=time.time)
47
+
48
+
49
@dataclass
class TenantConfig:
    """Per-tenant settings registered with the gateway."""

    tenant_id: str
    # Number of GPUs allotted to the tenant.
    gpu_quota: int
    # GPU memory allotted to the tenant, in MB.
    memory_quota: int
    # Priority on a 1-10 scale.
    priority: int = 1
    # Model ids the tenant may use; an empty list places no restriction.
    allowed_models: List[str] = field(default_factory=list)
    # Allowed requests per minute.
    rate_limit: int = 100
58
+
59
+
60
class GPUGateway:
    """
    GPU gateway service: local GPU resource management for the PaaS platform.

    Responsibilities:
    1. GPU resource-pool management
    2. Multi-tenant resource isolation
    3. Load balancing and request routing
    4. Service discovery and health checking
    5. Communication with the cloud API
    """

    def __init__(self,
                 gateway_port: int = 8888,
                 cloud_api_url: Optional[str] = None,
                 workspace_dir: str = "./gpu_gateway"):
        """
        Set up empty gateway state and make sure the workspace directory exists.

        Args:
            gateway_port: TCP port the HTTP API listens on once started.
            cloud_api_url: Base URL of the cloud API; when falsy, no cloud
                sync task is started.
            workspace_dir: Directory under which per-node workspaces live.
        """
        self.gateway_port = gateway_port
        self.cloud_api_url = cloud_api_url
        self.workspace_dir = Path(workspace_dir)
        # parents=True so that a nested workspace path does not fail when
        # its parent directories have not been created yet.
        self.workspace_dir.mkdir(parents=True, exist_ok=True)

        # GPU resource management, all keyed by node id
        self.gpu_nodes: Dict[str, GPUNode] = {}
        self.gpu_providers: Dict[str, LocalGPUProvider] = {}
        self.health_checkers: Dict[str, LocalHealthChecker] = {}

        # Tenant management
        self.tenants: Dict[str, TenantConfig] = {}

        # Request routing and load balancing
        self.request_queue = asyncio.Queue()
        self.model_routing: Dict[str, List[str]] = {}  # model_id -> [node_ids]

        # Monitoring data
        self.metrics = {
            "total_requests": 0,
            "successful_requests": 0,
            "failed_requests": 0,
            "average_latency": 0.0,
            "gpu_utilization": {}
        }

        # Lifecycle state
        self._running = False
        self._background_tasks = []
        # aiohttp runner created by start(); kept so stop() can release the port.
        self._runner = None

    @staticmethod
    def _node_utilization(node) -> float:
        """Return current_requests / max_concurrent_requests, or 0.0 when the node has no capacity."""
        capacity = node.max_concurrent_requests
        if capacity <= 0:
            # Avoid ZeroDivisionError for a node configured with no capacity.
            return 0.0
        return node.current_requests / capacity

    async def start(self):
        """
        Start the gateway: discover GPUs, launch background tasks, serve HTTP.

        Returns:
            The aiohttp ``web.AppRunner`` serving the API.

        Raises:
            Exception: whatever the HTTP server setup raises (for example
                when the port is already in use); the gateway is stopped
                again before the exception propagates.
        """
        logger.info("🚀 Starting GPU Gateway...")

        # Discover local GPU nodes
        await self._discover_gpu_nodes()

        # Launch background tasks
        self._running = True
        tasks = [
            asyncio.create_task(self._heartbeat_monitor()),
            asyncio.create_task(self._resource_balancer()),
            asyncio.create_task(self._metrics_collector()),
        ]
        if self.cloud_api_url:
            tasks.append(asyncio.create_task(self._cloud_sync()))
        self._background_tasks = tasks

        # Start the HTTP service
        try:
            app = self._create_app()
            runner = web.AppRunner(app)
            await runner.setup()
            self._runner = runner
            site = web.TCPSite(runner, '0.0.0.0', self.gateway_port)
            await site.start()
        except Exception:
            # Do not leave background tasks running behind a gateway that
            # never managed to bind its port.
            await self.stop()
            raise

        logger.info(f"✅ GPU Gateway started on port {self.gateway_port}")
        return runner

    async def stop(self):
        """Stop background tasks, shut down the HTTP server and clean up all providers."""
        self._running = False

        # Cancel background tasks and wait until they have actually finished.
        tasks, self._background_tasks = self._background_tasks, []
        for task in tasks:
            task.cancel()
        if tasks:
            await asyncio.gather(*tasks, return_exceptions=True)

        # Release the listening socket held by the aiohttp runner.
        if self._runner is not None:
            runner, self._runner = self._runner, None
            await runner.cleanup()

        # Stop all GPU services; one failing provider must not prevent the
        # remaining ones from being cleaned up.
        for node_id, provider in self.gpu_providers.items():
            try:
                await provider.cleanup()
            except Exception as e:
                logger.error(f"❌ Cleanup failed for {node_id}: {e}")

        logger.info("🛑 GPU Gateway stopped")

    async def _discover_gpu_nodes(self):
        """
        Register one node, provider and health checker per detected local GPU.

        Any exception raised during discovery is logged and swallowed, which
        leaves the gateway with whatever nodes were registered up to then.
        """
        logger.info("🔍 Discovering local GPU nodes...")

        try:
            # Detect local GPUs
            from ...utils.gpu_utils import GPUManager
            gpu_manager = GPUManager()

            if not gpu_manager.is_cuda_available():
                logger.warning("⚠️ No CUDA GPUs detected")
                return

            # NOTE(review): assumes get_gpu_info() yields one mapping per GPU
            # with 'memory_total_mb' / 'memory_free_mb' keys — confirm in gpu_utils.
            for i, gpu in enumerate(gpu_manager.get_gpu_info()):
                node_id = f"local-gpu-{i}"
                self.gpu_nodes[node_id] = GPUNode(
                    node_id=node_id,
                    hostname="localhost",
                    gpu_count=1,
                    gpu_memory_total=gpu.get('memory_total_mb', 0),
                    gpu_memory_free=gpu.get('memory_free_mb', 0)
                )

                # Create the GPU provider for this node
                self.gpu_providers[node_id] = LocalGPUProvider(
                    workspace_dir=self.workspace_dir / node_id,
                    gpu_id=i
                )

                # Create the health checker for this node
                self.health_checkers[node_id] = LocalHealthChecker()

                logger.info(f"✅ Discovered GPU node: {node_id}")

        except Exception as e:
            logger.error(f"❌ GPU discovery failed: {e}")

    def register_tenant(self, tenant_config: TenantConfig):
        """Register a tenant, replacing any existing entry with the same tenant_id."""
        self.tenants[tenant_config.tenant_id] = tenant_config
        logger.info(f"👤 Registered tenant: {tenant_config.tenant_id}")

    async def deploy_model(self,
                           tenant_id: str,
                           model_id: str,
                           config: LocalGPUConfig,
                           preferred_node: Optional[str] = None) -> DeploymentResult:
        """
        Deploy a model on behalf of a tenant.

        Args:
            tenant_id: Id of a previously registered tenant.
            model_id: Model to deploy; must be in the tenant's allow-list
                when that list is non-empty.
            config: Deployment configuration handed to the node's provider.
            preferred_node: Node id to deploy on; when omitted the
                best-scoring available node is chosen.

        Returns:
            The provider's DeploymentResult, or a failed DeploymentResult
            describing why the deployment was rejected or raised.
        """
        # Validate the tenant
        if tenant_id not in self.tenants:
            return DeploymentResult(
                success=False,
                error=f"Unknown tenant: {tenant_id}"
            )

        tenant = self.tenants[tenant_id]

        # Check model permission
        if tenant.allowed_models and model_id not in tenant.allowed_models:
            return DeploymentResult(
                success=False,
                error=f"Model {model_id} not allowed for tenant {tenant_id}"
            )

        # Choose the GPU node
        node_id = preferred_node or await self._select_best_node(config, tenant)
        if not node_id:
            return DeploymentResult(
                success=False,
                error="No available GPU nodes"
            )

        # A caller-supplied node id may not exist; report that clearly
        # instead of surfacing a bare KeyError message.
        if node_id not in self.gpu_providers or node_id not in self.gpu_nodes:
            return DeploymentResult(
                success=False,
                error=f"Unknown GPU node: {node_id}"
            )

        # Deploy the model
        try:
            result = await self.gpu_providers[node_id].deploy_model(config)

            if result.success:
                # Update node state (no duplicate entries on re-deploy)
                node = self.gpu_nodes[node_id]
                if model_id not in node.current_models:
                    node.current_models.append(model_id)

                # Update the routing table (no duplicate entries on re-deploy)
                routes = self.model_routing.setdefault(model_id, [])
                if node_id not in routes:
                    routes.append(node_id)

                logger.info(f"✅ Deployed {model_id} for tenant {tenant_id} on {node_id}")

            return result

        except Exception as e:
            logger.error(f"❌ Deployment failed: {e}")
            return DeploymentResult(
                success=False,
                error=str(e)
            )

    async def inference_request(self,
                                tenant_id: str,
                                model_id: str,
                                request_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Route an inference request to a node serving the model and run it.

        Args:
            tenant_id: Id of a previously registered tenant.
            model_id: Model to run the request against.
            request_data: Keyword arguments forwarded to the provider. A
                "messages" key selects chat completion; otherwise text
                completion is used.

        Returns:
            The provider's result, or ``{"error": <message>}`` on failure.
        """
        # Validate the tenant
        if tenant_id not in self.tenants:
            return {"error": f"Unknown tenant: {tenant_id}"}

        tenant = self.tenants[tenant_id]
        # TODO: implement rate limiting

        # Pick the serving node; this also reserves a request slot on it.
        node_id = await self._route_request(model_id, tenant)
        if not node_id:
            return {"error": "No available nodes for model"}

        # Run the inference
        try:
            start_time = time.time()
            provider = self.gpu_providers[node_id]

            # Dispatch on the request shape
            if "messages" in request_data:
                # Chat completion
                result = await provider.chat_completion(
                    model_id=model_id,
                    **request_data
                )
            else:
                # Text completion
                result = await provider.text_completion(
                    model_id=model_id,
                    **request_data
                )

            # Update metrics
            latency = time.time() - start_time
            await self._update_metrics(tenant_id, latency, True)

            return result

        except Exception as e:
            await self._update_metrics(tenant_id, 0, False)
            logger.error(f"❌ Inference failed: {e}")
            return {"error": str(e)}

        finally:
            # Always release the slot reserved by _route_request; otherwise
            # failed requests would make the node look permanently busy.
            node = self.gpu_nodes.get(node_id)
            if node is not None:
                node.current_requests = max(0, node.current_requests - 1)

    async def _select_best_node(self,
                                config: LocalGPUConfig,
                                tenant: TenantConfig) -> Optional[str]:
        """
        Pick the highest-scoring AVAILABLE node with more than 2000 MB free.

        The score favours free memory (weight 0.6), then spare request
        capacity (0.3), then tenant priority (0.1). ``config`` is currently
        not consulted. Returns None when no node qualifies.
        """
        best_id = None
        best_score = None

        for node_id, node in self.gpu_nodes.items():
            if node.status != GPUPoolStatus.AVAILABLE:
                continue
            if node.gpu_memory_free <= 2000:  # require more than ~2GB free
                continue

            score = (
                node.gpu_memory_free * 0.6 +  # memory weight
                (node.max_concurrent_requests - node.current_requests) * 0.3 +  # load weight
                tenant.priority * 0.1  # tenant priority weight
            )

            # Strict comparison keeps the earliest node on ties, matching a
            # stable descending sort.
            if best_score is None or score > best_score:
                best_id, best_score = node_id, score

        return best_id

    async def _route_request(self, model_id: str, tenant: TenantConfig) -> Optional[str]:
        """
        Choose the least-loaded AVAILABLE node serving ``model_id`` and reserve a slot on it.

        Returns the node id, or None when the model is not deployed or every
        candidate is unavailable or at capacity. The caller is responsible
        for releasing the reserved slot.
        """
        candidate_nodes = self.model_routing.get(model_id)
        if candidate_nodes is None:
            return None

        best_node = None
        min_load = float('inf')

        for node_id in candidate_nodes:
            node = self.gpu_nodes.get(node_id)
            if node is None:
                # Stale routing entry; nothing to route to.
                continue
            if node.status != GPUPoolStatus.AVAILABLE:
                continue
            if node.current_requests >= node.max_concurrent_requests:
                continue

            # Load adjusted by tenant priority
            load = node.current_requests / node.max_concurrent_requests
            adjusted_load = load / max(tenant.priority, 1)

            if adjusted_load < min_load:
                min_load = adjusted_load
                best_node = node_id

        if best_node:
            # Reserve a request slot on the chosen node
            self.gpu_nodes[best_node].current_requests += 1

        return best_node

    async def _heartbeat_monitor(self):
        """Every 10 seconds, flag nodes whose heartbeat is stale and refresh node state from health checks."""
        while self._running:
            try:
                current_time = time.time()

                # Iterate over a snapshot because the loop body awaits.
                for node_id, node in list(self.gpu_nodes.items()):
                    # Heartbeat timeout: 30 seconds
                    if current_time - node.last_heartbeat > 30:
                        logger.warning(f"⚠️ Node {node_id} heartbeat timeout")
                        node.status = GPUPoolStatus.ERROR

                    # Refresh GPU state
                    if node_id in self.health_checkers:
                        # NOTE(review): assumes the health result is a mapping
                        # with "success" and "gpu_memory_free" keys — confirm
                        # against LocalHealthChecker.
                        health = await self.health_checkers[node_id].check_service_health("gpu-status")
                        if health.get("success"):
                            node.status = GPUPoolStatus.AVAILABLE
                            node.gpu_memory_free = health.get("gpu_memory_free", 0)
                            node.last_heartbeat = current_time

                await asyncio.sleep(10)  # check every 10 seconds

            except Exception as e:
                logger.error(f"❌ Heartbeat monitor error: {e}")
                await asyncio.sleep(5)

    async def _resource_balancer(self):
        """Every 30 seconds, log nodes whose utilization exceeds 90%."""
        while self._running:
            try:
                # Inspect resource usage
                for node_id, node in self.gpu_nodes.items():
                    utilization = self._node_utilization(node)

                    # An overloaded node is a candidate for migrating services away
                    if utilization > 0.9:
                        logger.info(f"🔄 Node {node_id} is overloaded ({utilization:.1%})")
                        # TODO: implement load migration

                await asyncio.sleep(30)  # rebalance every 30 seconds

            except Exception as e:
                logger.error(f"❌ Resource balancer error: {e}")
                await asyncio.sleep(10)

    async def _metrics_collector(self):
        """Every 5 seconds, record each node's utilization in ``self.metrics``."""
        while self._running:
            try:
                # Collect GPU utilization
                for node_id, node in self.gpu_nodes.items():
                    self.metrics["gpu_utilization"][node_id] = self._node_utilization(node)

                await asyncio.sleep(5)  # collect every 5 seconds

            except Exception as e:
                logger.error(f"❌ Metrics collector error: {e}")
                await asyncio.sleep(10)

    async def _cloud_sync(self):
        """Once a minute, POST node status and metrics to the cloud API (no-op without ``cloud_api_url``)."""
        if not self.cloud_api_url:
            return

        while self._running:
            try:
                # Report GPU status to the cloud
                status_data = {
                    "gateway_id": "local-gpu-gateway",
                    "nodes": [
                        {
                            "node_id": node.node_id,
                            "status": node.status.value,
                            "gpu_count": node.gpu_count,
                            "memory_free": node.gpu_memory_free,
                            "current_models": node.current_models,
                            "current_load": node.current_requests
                        }
                        for node in self.gpu_nodes.values()
                    ],
                    "metrics": self.metrics
                }

                async with aiohttp.ClientSession() as session:
                    async with session.post(
                        f"{self.cloud_api_url}/api/gpu-gateway/status",
                        json=status_data
                    ) as response:
                        if response.status == 200:
                            logger.debug("✅ Status synced to cloud")
                        else:
                            logger.warning(f"⚠️ Cloud sync failed: {response.status}")

                await asyncio.sleep(60)  # sync once a minute

            except Exception as e:
                logger.error(f"❌ Cloud sync error: {e}")
                await asyncio.sleep(30)

    async def _update_metrics(self, tenant_id: str, latency: float, success: bool):
        """
        Count one request and, on success, fold ``latency`` into the running average.

        ``tenant_id`` is accepted but not currently used; metrics are global.
        """
        self.metrics["total_requests"] += 1

        if not success:
            self.metrics["failed_requests"] += 1
            return

        self.metrics["successful_requests"] += 1
        # Incremental mean over successful requests only
        current_avg = self.metrics["average_latency"]
        total_successful = self.metrics["successful_requests"]
        self.metrics["average_latency"] = (
            (current_avg * (total_successful - 1) + latency) / total_successful
        )

    def _create_app(self) -> web.Application:
        """Build the aiohttp application exposing the gateway's HTTP API."""
        app = web.Application()

        # API routes
        app.router.add_post('/deploy', self._handle_deploy)
        app.router.add_post('/inference', self._handle_inference)
        app.router.add_get('/status', self._handle_status)
        app.router.add_get('/metrics', self._handle_metrics)
        app.router.add_post('/tenants', self._handle_register_tenant)

        return app

    async def _handle_deploy(self, request: web.Request) -> web.Response:
        """Handle POST /deploy; malformed input or any raised error yields a 400 JSON response."""
        try:
            data = await request.json()

            config = LocalGPUConfig(
                service_name=data["service_name"],
                service_type=LocalServiceType(data["service_type"]),
                model_id=data["model_id"],
                backend=LocalBackend(data.get("backend", "transformers"))
            )

            result = await self.deploy_model(
                tenant_id=data["tenant_id"],
                model_id=data["model_id"],
                config=config,
                preferred_node=data.get("preferred_node")
            )

            return web.json_response({
                "success": result.success,
                "error": result.error,
                "service_name": result.service_name,
                "service_info": result.service_info
            })

        except Exception as e:
            return web.json_response({
                "success": False,
                "error": str(e)
            }, status=400)

    async def _handle_inference(self, request: web.Request) -> web.Response:
        """Handle POST /inference; malformed input or any raised error yields a 400 JSON response."""
        try:
            data = await request.json()

            result = await self.inference_request(
                tenant_id=data["tenant_id"],
                model_id=data["model_id"],
                request_data=data["request"]
            )

            return web.json_response(result)

        except Exception as e:
            return web.json_response({
                "error": str(e)
            }, status=400)

    async def _handle_status(self, request: web.Request) -> web.Response:
        """Handle GET /status: summarize gateway, node, tenant and model state."""
        nodes = list(self.gpu_nodes.values())
        status = {
            "gateway_status": "running" if self._running else "stopped",
            "total_nodes": len(nodes),
            "healthy_nodes": sum(
                1 for node in nodes
                if node.status == GPUPoolStatus.AVAILABLE
            ),
            "total_tenants": len(self.tenants),
            "deployed_models": len(self.model_routing),
            "nodes": [
                {
                    "node_id": node.node_id,
                    "status": node.status.value,
                    "gpu_memory_free": node.gpu_memory_free,
                    "current_requests": node.current_requests,
                    "models": node.current_models
                }
                for node in nodes
            ]
        }

        return web.json_response(status)

    async def _handle_metrics(self, request: web.Request) -> web.Response:
        """Handle GET /metrics: return the collected metrics as JSON."""
        return web.json_response(self.metrics)

    async def _handle_register_tenant(self, request: web.Request) -> web.Response:
        """Handle POST /tenants; malformed input or any raised error yields a 400 JSON response."""
        try:
            data = await request.json()

            tenant_config = TenantConfig(
                tenant_id=data["tenant_id"],
                gpu_quota=data.get("gpu_quota", 1),
                memory_quota=data.get("memory_quota", 8192),
                priority=data.get("priority", 1),
                allowed_models=data.get("allowed_models", []),
                rate_limit=data.get("rate_limit", 100)
            )

            self.register_tenant(tenant_config)

            return web.json_response({
                "success": True,
                "tenant_id": tenant_config.tenant_id
            })

        except Exception as e:
            return web.json_response({
                "success": False,
                "error": str(e)
            }, status=400)
593
+
594
+
595
+ # Convenience function
596
async def create_gpu_gateway(gateway_port: int = 8888,
                             cloud_api_url: Optional[str] = None,
                             workspace_dir: str = "./gpu_gateway") -> GPUGateway:
    """
    Construct a GPUGateway with the given settings, start it, and return it.

    Args:
        gateway_port: Port passed through to the gateway.
        cloud_api_url: Cloud API base URL passed through to the gateway.
        workspace_dir: Workspace directory passed through to the gateway.

    Returns:
        The gateway instance, after ``start()`` has completed.
    """
    settings = {
        "gateway_port": gateway_port,
        "cloud_api_url": cloud_api_url,
        "workspace_dir": workspace_dir,
    }
    instance = GPUGateway(**settings)
    await instance.start()
    return instance