isa-model 0.4.3__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/core/config.py +3 -3
- isa_model/core/models/model_manager.py +1 -69
- {isa_model-0.4.3.dist-info → isa_model-0.4.4.dist-info}/METADATA +6 -1
- {isa_model-0.4.3.dist-info → isa_model-0.4.4.dist-info}/RECORD +6 -19
- isa_model/core/security/secrets.py +0 -358
- isa_model/core/storage/hf_storage.py +0 -419
- isa_model/deployment/local/__init__.py +0 -31
- isa_model/deployment/local/config.py +0 -248
- isa_model/deployment/local/gpu_gateway.py +0 -607
- isa_model/deployment/local/health_checker.py +0 -428
- isa_model/deployment/local/provider.py +0 -586
- isa_model/deployment/local/tensorrt_service.py +0 -621
- isa_model/deployment/local/transformers_service.py +0 -644
- isa_model/deployment/local/vllm_service.py +0 -527
- isa_model/inference/services/custom_model_manager.py +0 -277
- isa_model/inference/services/llm/local_llm_service.py +0 -747
- isa_model/inference/services/vision/blip_vision_service.py +0 -359
- {isa_model-0.4.3.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
- {isa_model-0.4.3.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0
isa_model/deployment/local/tensorrt_service.py
@@ -1,621 +0,0 @@
-"""
-TensorRT-LLM local inference service
-
-Direct TensorRT-LLM deployment without containers for maximum performance.
-"""
-
-import os
-import json
-import asyncio
-import logging
-import subprocess
-import tempfile
-import shutil
-from typing import Dict, List, Optional, Any, Union
-from pathlib import Path
-from datetime import datetime
-import time
-
-from .config import LocalGPUConfig, LocalServiceType, LocalBackend
-from ...utils.gpu_utils import get_gpu_manager
-
-logger = logging.getLogger(__name__)
-
-
-class TensorRTLLMService:
-    """TensorRT-LLM local inference service manager"""
-
-    def __init__(self, config: LocalGPUConfig):
-        """
-        Initialize TensorRT-LLM service.
-
-        Args:
-            config: Local GPU configuration for TensorRT-LLM
-        """
-        if config.backend != LocalBackend.TENSORRT_LLM:
-            raise ValueError("Config must use TENSORRT_LLM backend")
-
-        self.config = config
-        self.gpu_manager = get_gpu_manager()
-        self.workspace_dir: Optional[Path] = None
-        self.engine_path: Optional[Path] = None
-        self.model_loaded = False
-        self.startup_time: Optional[datetime] = None
-
-        # TensorRT-LLM imports (lazy loaded)
-        self.tensorrt_llm = None
-        self.runtime_mapping = None
-        self.generation_session = None
-
-        # Service info
-        self.service_info = {
-            "service_name": config.service_name,
-            "model_id": config.model_id,
-            "backend": "tensorrt_llm",
-            "status": "stopped",
-            "engine_path": None
-        }
-
-    async def build_engine(self) -> Dict[str, Any]:
-        """
-        Build TensorRT engine from HuggingFace model.
-
-        Returns:
-            Engine build result
-        """
-        try:
-            logger.info(f"Building TensorRT engine for {self.config.model_id}")
-
-            # Check GPU requirements
-            gpu_check = await self._check_gpu_requirements()
-            if not gpu_check["compatible"]:
-                return {
-                    "success": False,
-                    "error": f"GPU requirements not met: {', '.join(gpu_check['warnings'])}",
-                    "gpu_check": gpu_check
-                }
-
-            # Create workspace
-            self.workspace_dir = Path(tempfile.mkdtemp(prefix=f"tensorrt_{self.config.service_name}_"))
-            logger.info(f"TensorRT workspace: {self.workspace_dir}")
-
-            # Download HuggingFace model
-            hf_model_path = await self._download_hf_model()
-
-            # Convert to TensorRT engine
-            engine_build_result = await self._build_tensorrt_engine(hf_model_path)
-
-            if engine_build_result["success"]:
-                self.engine_path = engine_build_result["engine_path"]
-                self.service_info.update({
-                    "engine_path": str(self.engine_path),
-                    "build_time": engine_build_result["build_time"],
-                    "status": "engine_built"
-                })
-
-                logger.info(f"TensorRT engine built successfully: {self.engine_path}")
-                return {
-                    "success": True,
-                    "engine_path": str(self.engine_path),
-                    "build_time": engine_build_result["build_time"],
-                    "workspace": str(self.workspace_dir),
-                    "gpu_info": gpu_check["selected_gpu"]
-                }
-            else:
-                return engine_build_result
-
-        except Exception as e:
-            logger.error(f"Failed to build TensorRT engine: {e}")
-            return {
-                "success": False,
-                "error": str(e)
-            }
-
-    async def load_model(self) -> Dict[str, Any]:
-        """
-        Load TensorRT engine for inference.
-
-        Returns:
-            Model loading result
-        """
-        if self.model_loaded:
-            return {
-                "success": True,
-                "message": "Model already loaded"
-            }
-
-        if not self.engine_path or not self.engine_path.exists():
-            return {
-                "success": False,
-                "error": "TensorRT engine not found. Build engine first."
-            }
-
-        try:
-            logger.info(f"Loading TensorRT engine: {self.engine_path}")
-            self.startup_time = datetime.now()
-
-            # Import TensorRT-LLM (lazy loading)
-            await self._import_tensorrt_llm()
-
-            # Load the engine
-            load_result = await self._load_tensorrt_engine()
-
-            if load_result["success"]:
-                self.model_loaded = True
-                self.service_info.update({
-                    "status": "running",
-                    "loaded_at": self.startup_time.isoformat(),
-                    "load_time": load_result["load_time"]
-                })
-
-                logger.info(f"TensorRT model loaded successfully")
-                return {
-                    "success": True,
-                    "service_info": self.service_info,
-                    "load_time": load_result["load_time"]
-                }
-            else:
-                return load_result
-
-        except Exception as e:
-            logger.error(f"Failed to load TensorRT model: {e}")
-            return {
-                "success": False,
-                "error": str(e)
-            }
-
-    async def unload_model(self) -> Dict[str, Any]:
-        """Unload TensorRT model"""
-        try:
-            if self.generation_session:
-                del self.generation_session
-                self.generation_session = None
-
-            self.model_loaded = False
-            self.service_info.update({
-                "status": "stopped",
-                "unloaded_at": datetime.now().isoformat()
-            })
-
-            # Free GPU memory
-            if self.tensorrt_llm:
-                import torch
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
-
-            logger.info("TensorRT model unloaded")
-            return {
-                "success": True,
-                "service_info": self.service_info
-            }
-
-        except Exception as e:
-            logger.error(f"Failed to unload TensorRT model: {e}")
-            return {
-                "success": False,
-                "error": str(e)
-            }
-
-    async def generate(self, prompt: str, **kwargs) -> Dict[str, Any]:
-        """Generate text using TensorRT-LLM"""
-        if not self.model_loaded:
-            return {
-                "success": False,
-                "error": "Model not loaded"
-            }
-
-        try:
-            start_time = time.time()
-
-            # Prepare generation parameters
-            max_tokens = kwargs.get("max_tokens", 512)
-            temperature = kwargs.get("temperature", 0.7)
-            top_p = kwargs.get("top_p", 0.9)
-            top_k = kwargs.get("top_k", 50)
-
-            # Tokenize input
-            input_ids = await self._tokenize_input(prompt)
-
-            # Generate with TensorRT-LLM
-            output_ids = await self._generate_tensorrt(
-                input_ids=input_ids,
-                max_tokens=max_tokens,
-                temperature=temperature,
-                top_p=top_p,
-                top_k=top_k
-            )
-
-            # Decode output
-            generated_text = await self._decode_output(output_ids, len(input_ids[0]))
-
-            generation_time = time.time() - start_time
-
-            return {
-                "success": True,
-                "text": generated_text,
-                "model": self.config.model_id,
-                "generation_time": generation_time,
-                "input_tokens": len(input_ids[0]),
-                "output_tokens": len(output_ids[0]) - len(input_ids[0]),
-                "total_tokens": len(output_ids[0])
-            }
-
-        except Exception as e:
-            logger.error(f"TensorRT generation failed: {e}")
-            return {
-                "success": False,
-                "error": str(e)
-            }
-
-    async def health_check(self) -> Dict[str, Any]:
-        """Check service health"""
-        return {
-            "healthy": self.model_loaded,
-            "status": "running" if self.model_loaded else "stopped",
-            "service_info": self.service_info,
-            "engine_exists": self.engine_path.exists() if self.engine_path else False
-        }
-
-    async def cleanup(self) -> Dict[str, Any]:
-        """Clean up workspace and temporary files"""
-        try:
-            # Unload model first
-            await self.unload_model()
-
-            # Clean up workspace
-            if self.workspace_dir and self.workspace_dir.exists():
-                shutil.rmtree(self.workspace_dir)
-                logger.info(f"Cleaned up workspace: {self.workspace_dir}")
-
-            return {
-                "success": True,
-                "message": "Cleanup completed"
-            }
-
-        except Exception as e:
-            logger.error(f"Cleanup failed: {e}")
-            return {
-                "success": False,
-                "error": str(e)
-            }
-
-    async def _download_hf_model(self) -> Path:
-        """Download HuggingFace model"""
-        hf_model_path = self.workspace_dir / "hf_model"
-
-        try:
-            from huggingface_hub import snapshot_download
-
-            logger.info(f"Downloading HF model: {self.config.model_id}")
-            snapshot_download(
-                repo_id=self.config.model_id,
-                local_dir=str(hf_model_path),
-                local_dir_use_symlinks=False,
-                revision=self.config.revision
-            )
-
-            logger.info(f"Model downloaded to: {hf_model_path}")
-            return hf_model_path
-
-        except Exception as e:
-            logger.error(f"Failed to download model: {e}")
-            raise
-
-    async def _build_tensorrt_engine(self, hf_model_path: Path) -> Dict[str, Any]:
-        """Build TensorRT engine using trtllm-build"""
-        try:
-            engine_output_path = self.workspace_dir / "engines"
-            engine_output_path.mkdir(exist_ok=True)
-
-            logger.info("Building TensorRT engine...")
-            start_time = time.time()
-
-            # Prepare build command
-            build_cmd = [
-                "trtllm-build",
-                "--checkpoint_dir", str(hf_model_path),
-                "--output_dir", str(engine_output_path),
-                "--gemm_plugin", self.config.model_precision,
-                "--gpt_attention_plugin", self.config.model_precision,
-                "--max_batch_size", str(self.config.max_batch_size),
-                "--max_seq_len", str(self.config.max_model_len),
-            ]
-
-            # Add TensorRT-specific arguments
-            tensorrt_args = self.config.tensorrt_args
-            for key, value in tensorrt_args.items():
-                if isinstance(value, bool):
-                    if value:
-                        build_cmd.append(f"--{key}")
-                else:
-                    build_cmd.extend([f"--{key}", str(value)])
-
-            # Set environment
-            env = os.environ.copy()
-            if self.config.gpu_id is not None:
-                env["CUDA_VISIBLE_DEVICES"] = str(self.config.gpu_id)
-
-            # Run build command
-            logger.info(f"TensorRT build command: {' '.join(build_cmd)}")
-
-            process = await asyncio.create_subprocess_exec(
-                *build_cmd,
-                stdout=asyncio.subprocess.PIPE,
-                stderr=asyncio.subprocess.PIPE,
-                env=env
-            )
-
-            stdout, stderr = await process.communicate()
-
-            if process.returncode == 0:
-                build_time = time.time() - start_time
-
-                # Find the built engine
-                engine_files = list(engine_output_path.glob("*.engine"))
-                if engine_files:
-                    engine_path = engine_files[0]
-                else:
-                    # Look for rank_0.engine or similar patterns
-                    rank_engines = list(engine_output_path.glob("*rank_0*.engine"))
-                    if rank_engines:
-                        engine_path = rank_engines[0]
-                    else:
-                        engine_path = engine_output_path / "model.engine"
-
-                logger.info(f"TensorRT engine built in {build_time:.2f}s: {engine_path}")
-
-                return {
-                    "success": True,
-                    "engine_path": engine_path,
-                    "build_time": build_time,
-                    "stdout": stdout.decode() if stdout else "",
-                    "stderr": stderr.decode() if stderr else ""
-                }
-            else:
-                error_msg = stderr.decode() if stderr else "Unknown build error"
-                logger.error(f"TensorRT build failed: {error_msg}")
-
-                return {
-                    "success": False,
-                    "error": f"TensorRT build failed: {error_msg}",
-                    "stdout": stdout.decode() if stdout else "",
-                    "stderr": stderr.decode() if stderr else ""
-                }
-
-        except Exception as e:
-            logger.error(f"TensorRT build error: {e}")
-            return {
-                "success": False,
-                "error": str(e)
-            }
-
-    async def _import_tensorrt_llm(self):
-        """Import TensorRT-LLM modules"""
-        try:
-            import tensorrt_llm
-            from tensorrt_llm.runtime import ModelConfig, SamplingConfig
-            from tensorrt_llm.runtime.generation import GenerationSession
-
-            self.tensorrt_llm = tensorrt_llm
-            self.ModelConfig = ModelConfig
-            self.SamplingConfig = SamplingConfig
-            self.GenerationSession = GenerationSession
-
-            logger.info("TensorRT-LLM modules imported successfully")
-
-        except ImportError as e:
-            raise ImportError(f"TensorRT-LLM not installed: {e}")
-
-    async def _load_tensorrt_engine(self) -> Dict[str, Any]:
-        """Load TensorRT engine for inference"""
-        try:
-            start_time = time.time()
-
-            # Configure runtime mapping
-            from tensorrt_llm.runtime import ModelConfig
-            from tensorrt_llm.runtime.generation import GenerationSession
-
-            # Load model configuration
-            config_path = self.engine_path.parent / "config.json"
-            if config_path.exists():
-                with open(config_path, 'r') as f:
-                    model_config_dict = json.load(f)
-                model_config = ModelConfig.from_dict(model_config_dict)
-            else:
-                # Create default config
-                model_config = ModelConfig(
-                    max_batch_size=self.config.max_batch_size,
-                    max_input_len=self.config.max_model_len // 2,
-                    max_output_len=self.config.max_model_len // 2,
-                    max_beam_width=1,
-                    vocab_size=50000,  # Default, will be updated from tokenizer
-                    num_heads=32,
-                    num_kv_heads=32,
-                    hidden_size=4096,
-                    gpt_attention_plugin=True,
-                    remove_input_padding=True
-                )
-
-            # Create generation session
-            self.generation_session = GenerationSession(
-                model_config=model_config,
-                engine_dir=str(self.engine_path.parent),
-                runtime_mapping=None  # Single GPU
-            )
-
-            load_time = time.time() - start_time
-
-            return {
-                "success": True,
-                "load_time": load_time
-            }
-
-        except Exception as e:
-            logger.error(f"Failed to load TensorRT engine: {e}")
-            return {
-                "success": False,
-                "error": str(e)
-            }
-
-    async def _tokenize_input(self, text: str) -> List[List[int]]:
-        """Tokenize input text"""
-        try:
-            from transformers import AutoTokenizer
-
-            # Load tokenizer
-            if not hasattr(self, '_tokenizer'):
-                self._tokenizer = AutoTokenizer.from_pretrained(
-                    self.config.model_id,
-                    revision=self.config.tokenizer_revision,
-                    trust_remote_code=self.config.trust_remote_code
-                )
-
-            # Tokenize
-            encoded = self._tokenizer.encode(text, return_tensors="pt")
-            return [encoded[0].tolist()]
-
-        except Exception as e:
-            logger.error(f"Tokenization failed: {e}")
-            raise
-
-    async def _generate_tensorrt(self, input_ids: List[List[int]],
-                                 max_tokens: int, temperature: float,
-                                 top_p: float, top_k: int) -> List[List[int]]:
-        """Generate using TensorRT-LLM"""
-        try:
-            import torch
-            from tensorrt_llm.runtime import SamplingConfig
-
-            # Prepare inputs
-            batch_size = len(input_ids)
-            input_lengths = [len(seq) for seq in input_ids]
-            max_input_length = max(input_lengths)
-
-            # Pad sequences
-            padded_ids = []
-            for seq in input_ids:
-                padded = seq + [0] * (max_input_length - len(seq))
-                padded_ids.append(padded)
-
-            input_ids_tensor = torch.tensor(padded_ids, dtype=torch.int32).cuda()
-            input_lengths_tensor = torch.tensor(input_lengths, dtype=torch.int32).cuda()
-
-            # Configure sampling
-            sampling_config = SamplingConfig(
-                end_id=self._tokenizer.eos_token_id,
-                pad_id=self._tokenizer.pad_token_id or self._tokenizer.eos_token_id,
-                temperature=temperature,
-                top_k=top_k,
-                top_p=top_p,
-                num_beams=1,
-                length_penalty=1.0
-            )
-
-            # Generate
-            output_ids = self.generation_session.decode(
-                input_ids=input_ids_tensor,
-                input_lengths=input_lengths_tensor,
-                sampling_config=sampling_config,
-                max_new_tokens=max_tokens
-            )
-
-            return output_ids.cpu().numpy().tolist()
-
-        except Exception as e:
-            logger.error(f"TensorRT generation failed: {e}")
-            raise
-
-    async def _decode_output(self, output_ids: List[List[int]], input_length: int) -> str:
-        """Decode generated tokens to text"""
-        try:
-            # Extract only the generated part
-            generated_ids = output_ids[0][input_length:]
-
-            # Decode
-            generated_text = self._tokenizer.decode(
-                generated_ids,
-                skip_special_tokens=True
-            )
-
-            return generated_text.strip()
-
-        except Exception as e:
-            logger.error(f"Decoding failed: {e}")
-            raise
-
-    async def _check_gpu_requirements(self) -> Dict[str, Any]:
-        """Check GPU requirements for TensorRT-LLM"""
-        self.gpu_manager.refresh()
-
-        if not self.gpu_manager.cuda_available:
-            return {
-                "compatible": False,
-                "warnings": ["CUDA not available"],
-                "selected_gpu": None
-            }
-
-        # TensorRT-LLM requires more memory due to engine optimization
-        estimated_memory = self.gpu_manager.estimate_model_memory(
-            self.config.model_id,
-            self.config.model_precision
-        )
-        # Add 50% overhead for TensorRT optimizations
-        estimated_memory = int(estimated_memory * 1.5)
-
-        # Find suitable GPU
-        if self.config.gpu_id is not None:
-            selected_gpu = self.gpu_manager.get_gpu_info(self.config.gpu_id)
-            if not selected_gpu:
-                return {
-                    "compatible": False,
-                    "warnings": [f"Specified GPU {self.config.gpu_id} not found"],
-                    "selected_gpu": None
-                }
-        else:
-            selected_gpu = self.gpu_manager.get_best_gpu(estimated_memory)
-            if selected_gpu:
-                self.config.gpu_id = selected_gpu.gpu_id
-
-        if not selected_gpu:
-            return {
-                "compatible": False,
-                "warnings": [
-                    f"No suitable GPU found. Required: {estimated_memory}MB (TensorRT overhead included), "
-                    f"Available: {max(gpu.memory_free for gpu in self.gpu_manager.gpus) if self.gpu_manager.gpus else 0}MB"
-                ],
-                "selected_gpu": None
-            }
-
-        warnings = []
-
-        # Check memory requirements (TensorRT needs more memory)
-        if selected_gpu.memory_free < estimated_memory:
-            warnings.append(f"GPU memory may be insufficient for TensorRT: {selected_gpu.memory_free}MB available, {estimated_memory}MB required")
-
-        # Check compute capability for TensorRT
-        if selected_gpu.name and "RTX" not in selected_gpu.name and "Tesla" not in selected_gpu.name:
-            warnings.append("TensorRT-LLM works best with RTX/Tesla GPUs")
-
-        return {
-            "compatible": True,
-            "warnings": warnings,
-            "selected_gpu": {
-                "gpu_id": selected_gpu.gpu_id,
-                "name": selected_gpu.name,
-                "memory_total": selected_gpu.memory_total,
-                "memory_free": selected_gpu.memory_free,
-                "utilization": selected_gpu.utilization,
-                "estimated_memory_required": estimated_memory
-            }
-        }
-
-    def get_service_info(self) -> Dict[str, Any]:
-        """Get current service information"""
-        return {
-            **self.service_info,
-            "config": self.config.to_dict(),
-            "workspace_dir": str(self.workspace_dir) if self.workspace_dir else None,
-            "engine_path": str(self.engine_path) if self.engine_path else None,
-            "model_loaded": self.model_loaded,
-            "startup_time": self.startup_time.isoformat() if self.startup_time else None
-        }