isa-model 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- isa_model/core/config.py +3 -3
- isa_model/core/logging/__init__.py +14 -13
- isa_model/core/models/model_manager.py +1 -69
- isa_model/core/models/model_storage.py +4 -2
- {isa_model-0.4.3.dist-info → isa_model-0.4.5.dist-info}/METADATA +6 -1
- {isa_model-0.4.3.dist-info → isa_model-0.4.5.dist-info}/RECORD +8 -22
- isa_model/core/logging/influx_logger.py +0 -523
- isa_model/core/security/secrets.py +0 -358
- isa_model/core/storage/hf_storage.py +0 -419
- isa_model/deployment/local/__init__.py +0 -31
- isa_model/deployment/local/config.py +0 -248
- isa_model/deployment/local/gpu_gateway.py +0 -607
- isa_model/deployment/local/health_checker.py +0 -428
- isa_model/deployment/local/provider.py +0 -586
- isa_model/deployment/local/tensorrt_service.py +0 -621
- isa_model/deployment/local/transformers_service.py +0 -644
- isa_model/deployment/local/vllm_service.py +0 -527
- isa_model/inference/services/custom_model_manager.py +0 -277
- isa_model/inference/services/llm/local_llm_service.py +0 -747
- isa_model/inference/services/vision/blip_vision_service.py +0 -359
- {isa_model-0.4.3.dist-info → isa_model-0.4.5.dist-info}/WHEEL +0 -0
- {isa_model-0.4.3.dist-info → isa_model-0.4.5.dist-info}/top_level.txt +0 -0
--- a/isa_model/deployment/local/vllm_service.py
+++ /dev/null
@@ -1,527 +0,0 @@
-"""
-vLLM local inference service
-
-High-performance local model serving using vLLM.
-"""
-
-import os
-import json
-import asyncio
-import logging
-import subprocess
-import signal
-from typing import Dict, List, Optional, Any, Union, AsyncGenerator
-from pathlib import Path
-from datetime import datetime
-import httpx
-import time
-
-from .config import LocalGPUConfig, LocalServiceType, LocalBackend
-from ...utils.gpu_utils import get_gpu_manager, GPUInfo
-
-logger = logging.getLogger(__name__)
-
-
-class VLLMService:
-    """vLLM local inference service manager"""
-
-    def __init__(self, config: LocalGPUConfig):
-        """
-        Initialize vLLM service.
-
-        Args:
-            config: Local GPU configuration for vLLM
-        """
-        if config.backend != LocalBackend.VLLM:
-            raise ValueError("Config must use VLLM backend")
-
-        self.config = config
-        self.gpu_manager = get_gpu_manager()
-        self.process: Optional[subprocess.Popen] = None
-        self.service_url = f"http://{config.host}:{config.port}"
-        self.is_running = False
-        self.startup_time: Optional[datetime] = None
-
-        # Service info
-        self.service_info = {
-            "service_name": config.service_name,
-            "model_id": config.model_id,
-            "backend": "vllm",
-            "status": "stopped",
-            "url": self.service_url
-        }
-
-    async def start(self) -> Dict[str, Any]:
-        """
-        Start vLLM inference server.
-
-        Returns:
-            Service startup result
-        """
-        if self.is_running:
-            return {
-                "success": False,
-                "error": "Service already running",
-                "service_info": self.service_info
-            }
-
-        try:
-            logger.info(f"Starting vLLM service: {self.config.service_name}")
-
-            # Check GPU availability
-            gpu_check = await self._check_gpu_requirements()
-            if not gpu_check["compatible"]:
-                return {
-                    "success": False,
-                    "error": f"GPU requirements not met: {', '.join(gpu_check['warnings'])}",
-                    "gpu_check": gpu_check
-                }
-
-            # Prepare vLLM command
-            cmd = self._build_vllm_command()
-            logger.info(f"vLLM command: {' '.join(cmd)}")
-
-            # Start vLLM process
-            self.startup_time = datetime.now()
-            self.process = subprocess.Popen(
-                cmd,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                text=True,
-                env=self._get_environment()
-            )
-
-            # Wait for service to be ready
-            startup_result = await self._wait_for_startup()
-
-            if startup_result["success"]:
-                self.is_running = True
-                self.service_info.update({
-                    "status": "running",
-                    "pid": self.process.pid,
-                    "started_at": self.startup_time.isoformat(),
-                    "model_info": await self._get_model_info()
-                })
-
-                logger.info(f"vLLM service started successfully: {self.service_url}")
-                return {
-                    "success": True,
-                    "service_info": self.service_info,
-                    "startup_time_seconds": startup_result["startup_time"],
-                    "gpu_info": gpu_check["selected_gpu"]
-                }
-            else:
-                await self.stop()
-                return {
-                    "success": False,
-                    "error": startup_result["error"],
-                    "logs": startup_result.get("logs", [])
-                }
-
-        except Exception as e:
-            logger.error(f"Failed to start vLLM service: {e}")
-            await self.stop()
-            return {
-                "success": False,
-                "error": str(e)
-            }
-
-    async def stop(self) -> Dict[str, Any]:
-        """
-        Stop vLLM inference server.
-
-        Returns:
-            Service shutdown result
-        """
-        if not self.is_running:
-            return {
-                "success": True,
-                "message": "Service was not running"
-            }
-
-        try:
-            logger.info(f"Stopping vLLM service: {self.config.service_name}")
-
-            if self.process:
-                # Graceful shutdown
-                self.process.terminate()
-
-                # Wait for graceful shutdown
-                try:
-                    self.process.wait(timeout=10)
-                except subprocess.TimeoutExpired:
-                    # Force kill if graceful shutdown fails
-                    logger.warning("Graceful shutdown timed out, force killing process")
-                    self.process.kill()
-                    self.process.wait(timeout=5)
-
-                self.process = None
-
-            self.is_running = False
-            self.service_info.update({
-                "status": "stopped",
-                "pid": None,
-                "stopped_at": datetime.now().isoformat()
-            })
-
-            logger.info(f"vLLM service stopped: {self.config.service_name}")
-            return {
-                "success": True,
-                "service_info": self.service_info
-            }
-
-        except Exception as e:
-            logger.error(f"Failed to stop vLLM service: {e}")
-            return {
-                "success": False,
-                "error": str(e)
-            }
-
-    async def restart(self) -> Dict[str, Any]:
-        """Restart vLLM service"""
-        stop_result = await self.stop()
-        if not stop_result["success"]:
-            return stop_result
-
-        # Wait a moment before restart
-        await asyncio.sleep(2)
-
-        return await self.start()
-
-    async def health_check(self) -> Dict[str, Any]:
-        """Check service health"""
-        if not self.is_running:
-            return {
-                "healthy": False,
-                "status": "stopped"
-            }
-
-        try:
-            async with httpx.AsyncClient(timeout=5.0) as client:
-                response = await client.get(f"{self.service_url}/health")
-
-                if response.status_code == 200:
-                    return {
-                        "healthy": True,
-                        "status": "running",
-                        "response_time_ms": response.elapsed.total_seconds() * 1000,
-                        "service_info": self.service_info
-                    }
-                else:
-                    return {
-                        "healthy": False,
-                        "status": "unhealthy",
-                        "status_code": response.status_code
-                    }
-
-        except Exception as e:
-            return {
-                "healthy": False,
-                "status": "error",
-                "error": str(e)
-            }
-
-    async def generate(self, prompt: str, **kwargs) -> Dict[str, Any]:
-        """Generate text using vLLM service"""
-        if not self.is_running:
-            return {
-                "success": False,
-                "error": "Service not running"
-            }
-
-        try:
-            request_data = {
-                "prompt": prompt,
-                "max_tokens": kwargs.get("max_tokens", 512),
-                "temperature": kwargs.get("temperature", 0.7),
-                "top_p": kwargs.get("top_p", 0.9),
-                "stream": kwargs.get("stream", False)
-            }
-
-            async with httpx.AsyncClient(timeout=60.0) as client:
-                response = await client.post(
-                    f"{self.service_url}/generate",
-                    json=request_data
-                )
-
-                if response.status_code == 200:
-                    return {
-                        "success": True,
-                        **response.json()
-                    }
-                else:
-                    return {
-                        "success": False,
-                        "error": f"API error: {response.status_code}",
-                        "response": response.text
-                    }
-
-        except Exception as e:
-            return {
-                "success": False,
-                "error": str(e)
-            }
-
-    async def chat_completions(self, messages: List[Dict[str, str]], **kwargs) -> Dict[str, Any]:
-        """OpenAI-compatible chat completions endpoint"""
-        if not self.is_running:
-            return {
-                "success": False,
-                "error": "Service not running"
-            }
-
-        try:
-            request_data = {
-                "model": self.config.served_model_name or self.config.model_id,
-                "messages": messages,
-                "max_tokens": kwargs.get("max_tokens", 512),
-                "temperature": kwargs.get("temperature", 0.7),
-                "top_p": kwargs.get("top_p", 0.9),
-                "stream": kwargs.get("stream", False)
-            }
-
-            async with httpx.AsyncClient(timeout=60.0) as client:
-                response = await client.post(
-                    f"{self.service_url}/v1/chat/completions",
-                    json=request_data,
-                    headers={"Authorization": f"Bearer {self.config.api_key}"} if self.config.api_key else {}
-                )
-
-                if response.status_code == 200:
-                    return {
-                        "success": True,
-                        **response.json()
-                    }
-                else:
-                    return {
-                        "success": False,
-                        "error": f"API error: {response.status_code}",
-                        "response": response.text
-                    }
-
-        except Exception as e:
-            return {
-                "success": False,
-                "error": str(e)
-            }
-
-    def _build_vllm_command(self) -> List[str]:
-        """Build vLLM server command"""
-        cmd = ["python", "-m", "vllm.entrypoints.openai.api_server"]
-
-        # Basic model configuration
-        cmd.extend(["--model", self.config.model_id])
-        cmd.extend(["--host", self.config.host])
-        cmd.extend(["--port", str(self.config.port)])
-
-        # Model configuration
-        if self.config.served_model_name:
-            cmd.extend(["--served-model-name", self.config.served_model_name])
-
-        cmd.extend(["--max-model-len", str(self.config.max_model_len)])
-        cmd.extend(["--max-num-seqs", str(self.config.max_num_seqs)])
-
-        # GPU configuration
-        if self.config.gpu_id is not None:
-            cmd.extend(["--tensor-parallel-size", str(self.config.tensor_parallel_size)])
-
-        cmd.extend(["--gpu-memory-utilization", str(self.config.gpu_memory_utilization)])
-        cmd.extend(["--swap-space", str(self.config.swap_space)])
-
-        # Performance settings
-        if self.config.enable_chunked_prefill:
-            cmd.append("--enable-chunked-prefill")
-
-        if self.config.enable_prefix_caching:
-            cmd.append("--enable-prefix-caching")
-
-        # Precision and quantization
-        if self.config.model_precision == "float16":
-            cmd.extend(["--dtype", "float16"])
-        elif self.config.model_precision == "bfloat16":
-            cmd.extend(["--dtype", "bfloat16"])
-
-        if self.config.quantization:
-            cmd.extend(["--quantization", self.config.quantization])
-        if self.config.quantization_param_path:
-            cmd.extend(["--quantization-param-path", self.config.quantization_param_path])
-
-        # Trust remote code
-        if self.config.trust_remote_code:
-            cmd.append("--trust-remote-code")
-
-        # Model revisions
-        if self.config.revision:
-            cmd.extend(["--revision", self.config.revision])
-        if self.config.tokenizer_revision:
-            cmd.extend(["--tokenizer-revision", self.config.tokenizer_revision])
-
-        # Additional vLLM arguments
-        for key, value in self.config.vllm_args.items():
-            if isinstance(value, bool):
-                if value:
-                    cmd.append(f"--{key}")
-            else:
-                cmd.extend([f"--{key}", str(value)])
-
-        return cmd
-
-    def _get_environment(self) -> Dict[str, str]:
-        """Get environment variables for vLLM"""
-        env = os.environ.copy()
-
-        # CUDA configuration
-        if self.config.gpu_id is not None:
-            env["CUDA_VISIBLE_DEVICES"] = str(self.config.gpu_id)
-
-        # Cache directories
-        if self.config.model_cache_dir:
-            env["TRANSFORMERS_CACHE"] = self.config.model_cache_dir
-            env["HF_HOME"] = self.config.model_cache_dir
-
-        if self.config.download_dir:
-            env["HF_HUB_CACHE"] = self.config.download_dir
-
-        # Performance optimizations
-        env["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
-        env["OMP_NUM_THREADS"] = "8"
-
-        return env
-
-    async def _check_gpu_requirements(self) -> Dict[str, Any]:
-        """Check GPU requirements for the model"""
-        self.gpu_manager.refresh()
-
-        if not self.gpu_manager.cuda_available:
-            return {
-                "compatible": False,
-                "warnings": ["CUDA not available"],
-                "selected_gpu": None
-            }
-
-        # Estimate memory requirements
-        estimated_memory = self.gpu_manager.estimate_model_memory(
-            self.config.model_id,
-            self.config.model_precision
-        )
-
-        # Find suitable GPU
-        if self.config.gpu_id is not None:
-            selected_gpu = self.gpu_manager.get_gpu_info(self.config.gpu_id)
-            if not selected_gpu:
-                return {
-                    "compatible": False,
-                    "warnings": [f"Specified GPU {self.config.gpu_id} not found"],
-                    "selected_gpu": None
-                }
-        else:
-            selected_gpu = self.gpu_manager.get_best_gpu(estimated_memory)
-            if selected_gpu:
-                self.config.gpu_id = selected_gpu.gpu_id
-
-        if not selected_gpu:
-            return {
-                "compatible": False,
-                "warnings": [
-                    f"No suitable GPU found. Required: {estimated_memory}MB, "
-                    f"Available: {max(gpu.memory_free for gpu in self.gpu_manager.gpus) if self.gpu_manager.gpus else 0}MB"
-                ],
-                "selected_gpu": None
-            }
-
-        warnings = []
-
-        # Check memory requirements
-        required_memory = int(estimated_memory * self.config.gpu_memory_utilization)
-        if selected_gpu.memory_free < required_memory:
-            warnings.append(f"GPU memory may be insufficient: {selected_gpu.memory_free}MB available, {required_memory}MB required")
-
-        # Check utilization
-        if selected_gpu.utilization > 80:
-            warnings.append(f"GPU utilization is high: {selected_gpu.utilization}%")
-
-        return {
-            "compatible": True,
-            "warnings": warnings,
-            "selected_gpu": {
-                "gpu_id": selected_gpu.gpu_id,
-                "name": selected_gpu.name,
-                "memory_total": selected_gpu.memory_total,
-                "memory_free": selected_gpu.memory_free,
-                "utilization": selected_gpu.utilization,
-                "estimated_memory_required": estimated_memory
-            }
-        }
-
-    async def _wait_for_startup(self, timeout: int = 300) -> Dict[str, Any]:
-        """Wait for vLLM service to start"""
-        start_time = time.time()
-        logs = []
-
-        while time.time() - start_time < timeout:
-            # Check if process is still running
-            if self.process and self.process.poll() is not None:
-                # Process died
-                stdout, stderr = self.process.communicate()
-                return {
-                    "success": False,
-                    "error": "vLLM process died during startup",
-                    "logs": logs + [stdout, stderr]
-                }
-
-            # Try to connect to service
-            try:
-                async with httpx.AsyncClient(timeout=2.0) as client:
-                    response = await client.get(f"{self.service_url}/health")
-                    if response.status_code == 200:
-                        startup_time = time.time() - start_time
-                        return {
-                            "success": True,
-                            "startup_time": startup_time
-                        }
-            except:
-                pass
-
-            # Collect logs
-            if self.process:
-                try:
-                    # Non-blocking read of logs
-                    import select
-                    if hasattr(select, 'select'):
-                        ready, _, _ = select.select([self.process.stdout], [], [], 0.1)
-                        if ready:
-                            line = self.process.stdout.readline()
-                            if line:
-                                logs.append(line.strip())
-                                logger.debug(f"vLLM: {line.strip()}")
-                except:
-                    pass
-
-            await asyncio.sleep(2)
-
-        return {
-            "success": False,
-            "error": f"Startup timeout after {timeout} seconds",
-            "logs": logs
-        }
-
-    async def _get_model_info(self) -> Optional[Dict[str, Any]]:
-        """Get model information from vLLM service"""
-        try:
-            async with httpx.AsyncClient(timeout=5.0) as client:
-                response = await client.get(f"{self.service_url}/v1/models")
-                if response.status_code == 200:
-                    return response.json()
-        except:
-            pass
-        return None
-
-    def get_service_info(self) -> Dict[str, Any]:
-        """Get current service information"""
-        return {
-            **self.service_info,
-            "config": self.config.to_dict(),
-            "process_id": self.process.pid if self.process else None,
-            "is_running": self.is_running,
-            "startup_time": self.startup_time.isoformat() if self.startup_time else None
-        }
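For reference, the deleted VLLMService might have been driven as in the minimal sketch below. Because the whole isa_model.deployment.local package is removed in 0.4.5, this only applies to 0.4.3 and earlier. It is a reconstruction from the removed code above, not a documented API: the LocalGPUConfig constructor fields (service_name, model_id, backend, host, port) are assumed from the attributes the service reads, and the model id is a placeholder.

import asyncio

from isa_model.deployment.local.config import LocalBackend, LocalGPUConfig
from isa_model.deployment.local.vllm_service import VLLMService


async def main() -> None:
    # Constructor fields are inferred from attribute access in the deleted code;
    # the actual LocalGPUConfig signature in 0.4.3 may have differed.
    config = LocalGPUConfig(
        service_name="local-llm",
        model_id="Qwen/Qwen2.5-7B-Instruct",  # placeholder model id
        backend=LocalBackend.VLLM,
        host="0.0.0.0",
        port=8000,
    )
    service = VLLMService(config)

    started = await service.start()  # builds the vLLM CLI command and polls /health
    if started["success"]:
        reply = await service.chat_completions(
            [{"role": "user", "content": "Hello"}],
            max_tokens=64,
        )
        print(reply)
    await service.stop()  # SIGTERM, then SIGKILL after a 10 s grace period


if __name__ == "__main__":
    asyncio.run(main())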