isa-model 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,621 +0,0 @@
- """
- TensorRT-LLM local inference service
-
- Direct TensorRT-LLM deployment without containers for maximum performance.
- """
-
- import os
- import json
- import asyncio
- import logging
- import subprocess
- import tempfile
- import shutil
- from typing import Dict, List, Optional, Any, Union
- from pathlib import Path
- from datetime import datetime
- import time
-
- from .config import LocalGPUConfig, LocalServiceType, LocalBackend
- from ...utils.gpu_utils import get_gpu_manager
-
- logger = logging.getLogger(__name__)
-
-
- class TensorRTLLMService:
-     """TensorRT-LLM local inference service manager"""
-
-     def __init__(self, config: LocalGPUConfig):
-         """
-         Initialize TensorRT-LLM service.
-
-         Args:
-             config: Local GPU configuration for TensorRT-LLM
-         """
-         if config.backend != LocalBackend.TENSORRT_LLM:
-             raise ValueError("Config must use TENSORRT_LLM backend")
-
-         self.config = config
-         self.gpu_manager = get_gpu_manager()
-         self.workspace_dir: Optional[Path] = None
-         self.engine_path: Optional[Path] = None
-         self.model_loaded = False
-         self.startup_time: Optional[datetime] = None
-
-         # TensorRT-LLM imports (lazy loaded)
-         self.tensorrt_llm = None
-         self.runtime_mapping = None
-         self.generation_session = None
-
-         # Service info
-         self.service_info = {
-             "service_name": config.service_name,
-             "model_id": config.model_id,
-             "backend": "tensorrt_llm",
-             "status": "stopped",
-             "engine_path": None
-         }
-
-     async def build_engine(self) -> Dict[str, Any]:
-         """
-         Build TensorRT engine from HuggingFace model.
-
-         Returns:
-             Engine build result
-         """
-         try:
-             logger.info(f"Building TensorRT engine for {self.config.model_id}")
-
-             # Check GPU requirements
-             gpu_check = await self._check_gpu_requirements()
-             if not gpu_check["compatible"]:
-                 return {
-                     "success": False,
-                     "error": f"GPU requirements not met: {', '.join(gpu_check['warnings'])}",
-                     "gpu_check": gpu_check
-                 }
-
-             # Create workspace
-             self.workspace_dir = Path(tempfile.mkdtemp(prefix=f"tensorrt_{self.config.service_name}_"))
-             logger.info(f"TensorRT workspace: {self.workspace_dir}")
-
-             # Download HuggingFace model
-             hf_model_path = await self._download_hf_model()
-
-             # Convert to TensorRT engine
-             engine_build_result = await self._build_tensorrt_engine(hf_model_path)
-
-             if engine_build_result["success"]:
-                 self.engine_path = engine_build_result["engine_path"]
-                 self.service_info.update({
-                     "engine_path": str(self.engine_path),
-                     "build_time": engine_build_result["build_time"],
-                     "status": "engine_built"
-                 })
-
-                 logger.info(f"TensorRT engine built successfully: {self.engine_path}")
-                 return {
-                     "success": True,
-                     "engine_path": str(self.engine_path),
-                     "build_time": engine_build_result["build_time"],
-                     "workspace": str(self.workspace_dir),
-                     "gpu_info": gpu_check["selected_gpu"]
-                 }
-             else:
-                 return engine_build_result
-
-         except Exception as e:
-             logger.error(f"Failed to build TensorRT engine: {e}")
-             return {
-                 "success": False,
-                 "error": str(e)
-             }
-
-     async def load_model(self) -> Dict[str, Any]:
-         """
-         Load TensorRT engine for inference.
-
-         Returns:
-             Model loading result
-         """
-         if self.model_loaded:
-             return {
-                 "success": True,
-                 "message": "Model already loaded"
-             }
-
-         if not self.engine_path or not self.engine_path.exists():
-             return {
-                 "success": False,
-                 "error": "TensorRT engine not found. Build engine first."
-             }
-
-         try:
-             logger.info(f"Loading TensorRT engine: {self.engine_path}")
-             self.startup_time = datetime.now()
-
-             # Import TensorRT-LLM (lazy loading)
-             await self._import_tensorrt_llm()
-
-             # Load the engine
-             load_result = await self._load_tensorrt_engine()
-
-             if load_result["success"]:
-                 self.model_loaded = True
-                 self.service_info.update({
-                     "status": "running",
-                     "loaded_at": self.startup_time.isoformat(),
-                     "load_time": load_result["load_time"]
-                 })
-
-                 logger.info(f"TensorRT model loaded successfully")
-                 return {
-                     "success": True,
-                     "service_info": self.service_info,
-                     "load_time": load_result["load_time"]
-                 }
-             else:
-                 return load_result
-
-         except Exception as e:
-             logger.error(f"Failed to load TensorRT model: {e}")
-             return {
-                 "success": False,
-                 "error": str(e)
-             }
-
-     async def unload_model(self) -> Dict[str, Any]:
-         """Unload TensorRT model"""
-         try:
-             if self.generation_session:
-                 del self.generation_session
-                 self.generation_session = None
-
-             self.model_loaded = False
-             self.service_info.update({
-                 "status": "stopped",
-                 "unloaded_at": datetime.now().isoformat()
-             })
-
-             # Free GPU memory
-             if self.tensorrt_llm:
-                 import torch
-                 if torch.cuda.is_available():
-                     torch.cuda.empty_cache()
-
-             logger.info("TensorRT model unloaded")
-             return {
-                 "success": True,
-                 "service_info": self.service_info
-             }
-
-         except Exception as e:
-             logger.error(f"Failed to unload TensorRT model: {e}")
-             return {
-                 "success": False,
-                 "error": str(e)
-             }
-
-     async def generate(self, prompt: str, **kwargs) -> Dict[str, Any]:
-         """Generate text using TensorRT-LLM"""
-         if not self.model_loaded:
-             return {
-                 "success": False,
-                 "error": "Model not loaded"
-             }
-
-         try:
-             start_time = time.time()
-
-             # Prepare generation parameters
-             max_tokens = kwargs.get("max_tokens", 512)
-             temperature = kwargs.get("temperature", 0.7)
-             top_p = kwargs.get("top_p", 0.9)
-             top_k = kwargs.get("top_k", 50)
-
-             # Tokenize input
-             input_ids = await self._tokenize_input(prompt)
-
-             # Generate with TensorRT-LLM
-             output_ids = await self._generate_tensorrt(
-                 input_ids=input_ids,
-                 max_tokens=max_tokens,
-                 temperature=temperature,
-                 top_p=top_p,
-                 top_k=top_k
-             )
-
-             # Decode output
-             generated_text = await self._decode_output(output_ids, len(input_ids[0]))
-
-             generation_time = time.time() - start_time
-
-             return {
-                 "success": True,
-                 "text": generated_text,
-                 "model": self.config.model_id,
-                 "generation_time": generation_time,
-                 "input_tokens": len(input_ids[0]),
-                 "output_tokens": len(output_ids[0]) - len(input_ids[0]),
-                 "total_tokens": len(output_ids[0])
-             }
-
-         except Exception as e:
-             logger.error(f"TensorRT generation failed: {e}")
-             return {
-                 "success": False,
-                 "error": str(e)
-             }
-
-     async def health_check(self) -> Dict[str, Any]:
-         """Check service health"""
-         return {
-             "healthy": self.model_loaded,
-             "status": "running" if self.model_loaded else "stopped",
-             "service_info": self.service_info,
-             "engine_exists": self.engine_path.exists() if self.engine_path else False
-         }
-
-     async def cleanup(self) -> Dict[str, Any]:
-         """Clean up workspace and temporary files"""
-         try:
-             # Unload model first
-             await self.unload_model()
-
-             # Clean up workspace
-             if self.workspace_dir and self.workspace_dir.exists():
-                 shutil.rmtree(self.workspace_dir)
-                 logger.info(f"Cleaned up workspace: {self.workspace_dir}")
-
-             return {
-                 "success": True,
-                 "message": "Cleanup completed"
-             }
-
-         except Exception as e:
-             logger.error(f"Cleanup failed: {e}")
-             return {
-                 "success": False,
-                 "error": str(e)
-             }
-
-     async def _download_hf_model(self) -> Path:
-         """Download HuggingFace model"""
-         hf_model_path = self.workspace_dir / "hf_model"
-
-         try:
-             from huggingface_hub import snapshot_download
-
-             logger.info(f"Downloading HF model: {self.config.model_id}")
-             snapshot_download(
-                 repo_id=self.config.model_id,
-                 local_dir=str(hf_model_path),
-                 local_dir_use_symlinks=False,
-                 revision=self.config.revision
-             )
-
-             logger.info(f"Model downloaded to: {hf_model_path}")
-             return hf_model_path
-
-         except Exception as e:
-             logger.error(f"Failed to download model: {e}")
-             raise
-
-     async def _build_tensorrt_engine(self, hf_model_path: Path) -> Dict[str, Any]:
-         """Build TensorRT engine using trtllm-build"""
-         try:
-             engine_output_path = self.workspace_dir / "engines"
-             engine_output_path.mkdir(exist_ok=True)
-
-             logger.info("Building TensorRT engine...")
-             start_time = time.time()
-
-             # Prepare build command
-             build_cmd = [
-                 "trtllm-build",
-                 "--checkpoint_dir", str(hf_model_path),
-                 "--output_dir", str(engine_output_path),
-                 "--gemm_plugin", self.config.model_precision,
-                 "--gpt_attention_plugin", self.config.model_precision,
-                 "--max_batch_size", str(self.config.max_batch_size),
-                 "--max_seq_len", str(self.config.max_model_len),
-             ]
-
-             # Add TensorRT-specific arguments
-             tensorrt_args = self.config.tensorrt_args
-             for key, value in tensorrt_args.items():
-                 if isinstance(value, bool):
-                     if value:
-                         build_cmd.append(f"--{key}")
-                 else:
-                     build_cmd.extend([f"--{key}", str(value)])
-
-             # Set environment
-             env = os.environ.copy()
-             if self.config.gpu_id is not None:
-                 env["CUDA_VISIBLE_DEVICES"] = str(self.config.gpu_id)
-
-             # Run build command
-             logger.info(f"TensorRT build command: {' '.join(build_cmd)}")
-
-             process = await asyncio.create_subprocess_exec(
-                 *build_cmd,
-                 stdout=asyncio.subprocess.PIPE,
-                 stderr=asyncio.subprocess.PIPE,
-                 env=env
-             )
-
-             stdout, stderr = await process.communicate()
-
-             if process.returncode == 0:
-                 build_time = time.time() - start_time
-
-                 # Find the built engine
-                 engine_files = list(engine_output_path.glob("*.engine"))
-                 if engine_files:
-                     engine_path = engine_files[0]
-                 else:
-                     # Look for rank_0.engine or similar patterns
-                     rank_engines = list(engine_output_path.glob("*rank_0*.engine"))
-                     if rank_engines:
-                         engine_path = rank_engines[0]
-                     else:
-                         engine_path = engine_output_path / "model.engine"
-
-                 logger.info(f"TensorRT engine built in {build_time:.2f}s: {engine_path}")
-
-                 return {
-                     "success": True,
-                     "engine_path": engine_path,
-                     "build_time": build_time,
-                     "stdout": stdout.decode() if stdout else "",
-                     "stderr": stderr.decode() if stderr else ""
-                 }
-             else:
-                 error_msg = stderr.decode() if stderr else "Unknown build error"
-                 logger.error(f"TensorRT build failed: {error_msg}")
-
-                 return {
-                     "success": False,
-                     "error": f"TensorRT build failed: {error_msg}",
-                     "stdout": stdout.decode() if stdout else "",
-                     "stderr": stderr.decode() if stderr else ""
-                 }
-
-         except Exception as e:
-             logger.error(f"TensorRT build error: {e}")
-             return {
-                 "success": False,
-                 "error": str(e)
-             }
-
-     async def _import_tensorrt_llm(self):
-         """Import TensorRT-LLM modules"""
-         try:
-             import tensorrt_llm
-             from tensorrt_llm.runtime import ModelConfig, SamplingConfig
-             from tensorrt_llm.runtime.generation import GenerationSession
-
-             self.tensorrt_llm = tensorrt_llm
-             self.ModelConfig = ModelConfig
-             self.SamplingConfig = SamplingConfig
-             self.GenerationSession = GenerationSession
-
-             logger.info("TensorRT-LLM modules imported successfully")
-
-         except ImportError as e:
-             raise ImportError(f"TensorRT-LLM not installed: {e}")
-
-     async def _load_tensorrt_engine(self) -> Dict[str, Any]:
-         """Load TensorRT engine for inference"""
-         try:
-             start_time = time.time()
-
-             # Configure runtime mapping
-             from tensorrt_llm.runtime import ModelConfig
-             from tensorrt_llm.runtime.generation import GenerationSession
-
-             # Load model configuration
-             config_path = self.engine_path.parent / "config.json"
-             if config_path.exists():
-                 with open(config_path, 'r') as f:
-                     model_config_dict = json.load(f)
-                 model_config = ModelConfig.from_dict(model_config_dict)
-             else:
-                 # Create default config
-                 model_config = ModelConfig(
-                     max_batch_size=self.config.max_batch_size,
-                     max_input_len=self.config.max_model_len // 2,
-                     max_output_len=self.config.max_model_len // 2,
-                     max_beam_width=1,
-                     vocab_size=50000,  # Default, will be updated from tokenizer
-                     num_heads=32,
-                     num_kv_heads=32,
-                     hidden_size=4096,
-                     gpt_attention_plugin=True,
-                     remove_input_padding=True
-                 )
-
-             # Create generation session
-             self.generation_session = GenerationSession(
-                 model_config=model_config,
-                 engine_dir=str(self.engine_path.parent),
-                 runtime_mapping=None  # Single GPU
-             )
-
-             load_time = time.time() - start_time
-
-             return {
-                 "success": True,
-                 "load_time": load_time
-             }
-
-         except Exception as e:
-             logger.error(f"Failed to load TensorRT engine: {e}")
-             return {
-                 "success": False,
-                 "error": str(e)
-             }
-
-     async def _tokenize_input(self, text: str) -> List[List[int]]:
-         """Tokenize input text"""
-         try:
-             from transformers import AutoTokenizer
-
-             # Load tokenizer
-             if not hasattr(self, '_tokenizer'):
-                 self._tokenizer = AutoTokenizer.from_pretrained(
-                     self.config.model_id,
-                     revision=self.config.tokenizer_revision,
-                     trust_remote_code=self.config.trust_remote_code
-                 )
-
-             # Tokenize
-             encoded = self._tokenizer.encode(text, return_tensors="pt")
-             return [encoded[0].tolist()]
-
-         except Exception as e:
-             logger.error(f"Tokenization failed: {e}")
-             raise
-
-     async def _generate_tensorrt(self, input_ids: List[List[int]],
-                                  max_tokens: int, temperature: float,
-                                  top_p: float, top_k: int) -> List[List[int]]:
-         """Generate using TensorRT-LLM"""
-         try:
-             import torch
-             from tensorrt_llm.runtime import SamplingConfig
-
-             # Prepare inputs
-             batch_size = len(input_ids)
-             input_lengths = [len(seq) for seq in input_ids]
-             max_input_length = max(input_lengths)
-
-             # Pad sequences
-             padded_ids = []
-             for seq in input_ids:
-                 padded = seq + [0] * (max_input_length - len(seq))
-                 padded_ids.append(padded)
-
-             input_ids_tensor = torch.tensor(padded_ids, dtype=torch.int32).cuda()
-             input_lengths_tensor = torch.tensor(input_lengths, dtype=torch.int32).cuda()
-
-             # Configure sampling
-             sampling_config = SamplingConfig(
-                 end_id=self._tokenizer.eos_token_id,
-                 pad_id=self._tokenizer.pad_token_id or self._tokenizer.eos_token_id,
-                 temperature=temperature,
-                 top_k=top_k,
-                 top_p=top_p,
-                 num_beams=1,
-                 length_penalty=1.0
-             )
-
-             # Generate
-             output_ids = self.generation_session.decode(
-                 input_ids=input_ids_tensor,
-                 input_lengths=input_lengths_tensor,
-                 sampling_config=sampling_config,
-                 max_new_tokens=max_tokens
-             )
-
-             return output_ids.cpu().numpy().tolist()
-
-         except Exception as e:
-             logger.error(f"TensorRT generation failed: {e}")
-             raise
-
-     async def _decode_output(self, output_ids: List[List[int]], input_length: int) -> str:
-         """Decode generated tokens to text"""
-         try:
-             # Extract only the generated part
-             generated_ids = output_ids[0][input_length:]
-
-             # Decode
-             generated_text = self._tokenizer.decode(
-                 generated_ids,
-                 skip_special_tokens=True
-             )
-
-             return generated_text.strip()
-
-         except Exception as e:
-             logger.error(f"Decoding failed: {e}")
-             raise
-
-     async def _check_gpu_requirements(self) -> Dict[str, Any]:
-         """Check GPU requirements for TensorRT-LLM"""
-         self.gpu_manager.refresh()
-
-         if not self.gpu_manager.cuda_available:
-             return {
-                 "compatible": False,
-                 "warnings": ["CUDA not available"],
-                 "selected_gpu": None
-             }
-
-         # TensorRT-LLM requires more memory due to engine optimization
-         estimated_memory = self.gpu_manager.estimate_model_memory(
-             self.config.model_id,
-             self.config.model_precision
-         )
-         # Add 50% overhead for TensorRT optimizations
-         estimated_memory = int(estimated_memory * 1.5)
-
-         # Find suitable GPU
-         if self.config.gpu_id is not None:
-             selected_gpu = self.gpu_manager.get_gpu_info(self.config.gpu_id)
-             if not selected_gpu:
-                 return {
-                     "compatible": False,
-                     "warnings": [f"Specified GPU {self.config.gpu_id} not found"],
-                     "selected_gpu": None
-                 }
-         else:
-             selected_gpu = self.gpu_manager.get_best_gpu(estimated_memory)
-             if selected_gpu:
-                 self.config.gpu_id = selected_gpu.gpu_id
-
-         if not selected_gpu:
-             return {
-                 "compatible": False,
-                 "warnings": [
-                     f"No suitable GPU found. Required: {estimated_memory}MB (TensorRT overhead included), "
-                     f"Available: {max(gpu.memory_free for gpu in self.gpu_manager.gpus) if self.gpu_manager.gpus else 0}MB"
-                 ],
-                 "selected_gpu": None
-             }
-
-         warnings = []
-
-         # Check memory requirements (TensorRT needs more memory)
-         if selected_gpu.memory_free < estimated_memory:
-             warnings.append(f"GPU memory may be insufficient for TensorRT: {selected_gpu.memory_free}MB available, {estimated_memory}MB required")
-
-         # Check compute capability for TensorRT
-         if selected_gpu.name and "RTX" not in selected_gpu.name and "Tesla" not in selected_gpu.name:
-             warnings.append("TensorRT-LLM works best with RTX/Tesla GPUs")
-
-         return {
-             "compatible": True,
-             "warnings": warnings,
-             "selected_gpu": {
-                 "gpu_id": selected_gpu.gpu_id,
-                 "name": selected_gpu.name,
-                 "memory_total": selected_gpu.memory_total,
-                 "memory_free": selected_gpu.memory_free,
-                 "utilization": selected_gpu.utilization,
-                 "estimated_memory_required": estimated_memory
-             }
-         }
-
-     def get_service_info(self) -> Dict[str, Any]:
-         """Get current service information"""
-         return {
-             **self.service_info,
-             "config": self.config.to_dict(),
-             "workspace_dir": str(self.workspace_dir) if self.workspace_dir else None,
-             "engine_path": str(self.engine_path) if self.engine_path else None,
-             "model_loaded": self.model_loaded,
-             "startup_time": self.startup_time.isoformat() if self.startup_time else None
-         }