isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/client.py +1166 -584
- isa_model/core/cache/redis_cache.py +410 -0
- isa_model/core/config/config_manager.py +282 -12
- isa_model/core/config.py +91 -1
- isa_model/core/database/__init__.py +1 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +297 -0
- isa_model/core/database/supabase_client.py +258 -0
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +46 -0
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_billing_tracker.py +60 -88
- isa_model/core/models/model_manager.py +66 -25
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +217 -55
- isa_model/core/models/model_statistics_tracker.py +234 -0
- isa_model/core/models/model_storage.py +0 -1
- isa_model/core/models/model_version_manager.py +959 -0
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/pricing_manager.py +2 -249
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/resilience/circuit_breaker.py +366 -0
- isa_model/core/security/secrets.py +358 -0
- isa_model/core/services/__init__.py +2 -4
- isa_model/core/services/intelligent_model_selector.py +479 -370
- isa_model/core/storage/hf_storage.py +2 -2
- isa_model/core/types.py +8 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -368
- isa_model/deployment/local/__init__.py +31 -0
- isa_model/deployment/local/config.py +248 -0
- isa_model/deployment/local/gpu_gateway.py +607 -0
- isa_model/deployment/local/health_checker.py +428 -0
- isa_model/deployment/local/provider.py +586 -0
- isa_model/deployment/local/tensorrt_service.py +621 -0
- isa_model/deployment/local/transformers_service.py +644 -0
- isa_model/deployment/local/vllm_service.py +527 -0
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/modal/deployer.py +894 -0
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
- isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
- isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
- isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
- isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
- isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
- isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +179 -16
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/__init__.py +21 -0
- isa_model/inference/services/audio/base_realtime_service.py +225 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/isa_tts_service.py +0 -0
- isa_model/inference/services/audio/openai_realtime_service.py +320 -124
- isa_model/inference/services/audio/openai_stt_service.py +53 -11
- isa_model/inference/services/base_service.py +17 -1
- isa_model/inference/services/custom_model_manager.py +277 -0
- isa_model/inference/services/embedding/__init__.py +13 -0
- isa_model/inference/services/embedding/base_embed_service.py +111 -8
- isa_model/inference/services/embedding/isa_embed_service.py +305 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/openai_embed_service.py +2 -4
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
- isa_model/inference/services/img/__init__.py +2 -2
- isa_model/inference/services/img/base_image_gen_service.py +24 -7
- isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
- isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
- isa_model/inference/services/img/services/replicate_flux.py +226 -0
- isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
- isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
- isa_model/inference/services/img/tests/test_img_client.py +297 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +361 -26
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/local_llm_service.py +747 -0
- isa_model/inference/services/llm/ollama_llm_service.py +11 -3
- isa_model/inference/services/llm/openai_llm_service.py +670 -56
- isa_model/inference/services/llm/yyds_llm_service.py +10 -3
- isa_model/inference/services/vision/__init__.py +27 -6
- isa_model/inference/services/vision/base_vision_service.py +118 -185
- isa_model/inference/services/vision/blip_vision_service.py +359 -0
- isa_model/inference/services/vision/helpers/image_utils.py +19 -10
- isa_model/inference/services/vision/isa_vision_service.py +634 -0
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +240 -18
- isa_model/serving/api/middleware/auth.py +317 -0
- isa_model/serving/api/middleware/security.py +268 -0
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +489 -0
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +475 -0
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/logs.py +430 -0
- isa_model/serving/api/routes/settings.py +582 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +992 -171
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +318 -0
- isa_model/serving/modal_proxy_server.py +249 -0
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
- isa_model-0.4.3.dist-info/RECORD +193 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
- isa_model/deployment/cloud/modal/register_models.py +0 -321
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks.py +0 -469
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -18
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/factory.py +0 -531
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/metrics.py +0 -798
- isa_model/inference/adapter/unified_api.py +0 -248
- isa_model/inference/services/helpers/stacked_config.py +0 -148
- isa_model/inference/services/img/flux_professional_service.py +0 -603
- isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/others/table_transformer_service.py +0 -61
- isa_model/inference/services/vision/doc_analysis_service.py +0 -640
- isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/vision/ui_analysis_service.py +0 -823
- isa_model/scripts/inference_tracker.py +0 -283
- isa_model/scripts/mlflow_manager.py +0 -379
- isa_model/scripts/model_registry.py +0 -465
- isa_model/scripts/register_models.py +0 -370
- isa_model/scripts/register_models_with_embeddings.py +0 -510
- isa_model/scripts/start_mlflow.py +0 -95
- isa_model/scripts/training_tracker.py +0 -257
- isa_model/training/__init__.py +0 -74
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -23
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/factory.py +0 -424
- isa_model-0.3.91.dist-info/RECORD +0 -138
- /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/deployment/triton/provider.py ADDED

```diff
@@ -0,0 +1,512 @@
+"""
+Triton deployment provider
+
+Handles deployment of models to Triton Inference Server with TensorRT-LLM optimization.
+"""
+
+import os
+import json
+import logging
+import subprocess
+import tempfile
+from typing import Dict, List, Optional, Any
+from pathlib import Path
+from datetime import datetime
+import asyncio
+import docker
+
+from .config import TritonConfig, TritonServiceType, TritonBackend
+
+logger = logging.getLogger(__name__)
+
+
+class TritonProvider:
+    """
+    Provider for deploying models to Triton Inference Server with TensorRT-LLM.
+
+    This provider handles:
+    - Model conversion to TensorRT engines
+    - Triton model configuration generation
+    - Docker container deployment
+    - Health monitoring and scaling
+    """
+
+    def __init__(self, workspace_dir: str = "./triton_deployments"):
+        """
+        Initialize Triton provider.
+
+        Args:
+            workspace_dir: Directory for deployment artifacts
+        """
+        self.workspace_dir = Path(workspace_dir)
+        self.workspace_dir.mkdir(parents=True, exist_ok=True)
+
+        # Initialize Docker client
+        try:
+            self.docker_client = docker.from_env()
+        except Exception as e:
+            logger.warning(f"Docker client initialization failed: {e}")
+            self.docker_client = None
+
+        # Deployment tracking
+        self.deployments: Dict[str, Dict[str, Any]] = {}
+
+        logger.info("Triton provider initialized")
+        logger.info(f"Workspace directory: {self.workspace_dir}")
+
+    async def deploy(self, config: TritonConfig) -> Dict[str, Any]:
+        """
+        Deploy a model to Triton Inference Server.
+
+        Args:
+            config: Triton deployment configuration
+
+        Returns:
+            Deployment result with endpoint information
+        """
+        deployment_id = f"{config.service_name}-triton-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+
+        logger.info("=" * 60)
+        logger.info(f"STARTING TRITON DEPLOYMENT: {deployment_id}")
+        logger.info("=" * 60)
+
+        try:
+            # Step 1: Prepare workspace
+            logger.info("Step 1/6: Preparing deployment workspace...")
+            workspace = await self._prepare_workspace(deployment_id, config)
+
+            # Step 2: Download HF model
+            logger.info("Step 2/6: Downloading HuggingFace model...")
+            hf_model_path = await self._download_hf_model(config, workspace)
+
+            # Step 3: Convert to TensorRT engine (if needed)
+            if config.use_tensorrt and config.service_type == TritonServiceType.LLM:
+                logger.info("Step 3/6: Converting model to TensorRT engine...")
+                engine_path = await self._build_tensorrt_engine(config, workspace, hf_model_path)
+            else:
+                logger.info("Step 3/6: Skipping TensorRT conversion...")
+                engine_path = hf_model_path
+
+            # Step 4: Generate Triton model configuration
+            logger.info("Step 4/6: Generating Triton model configuration...")
+            await self._generate_triton_config(config, workspace, engine_path)
+
+            # Step 5: Deploy container
+            logger.info("Step 5/6: Deploying Triton container...")
+            container_info = await self._deploy_container(config, workspace)
+
+            # Step 6: Verify deployment
+            logger.info("Step 6/6: Verifying deployment...")
+            endpoint_url = await self._verify_deployment(config, container_info)
+
+            result = {
+                "provider": "triton",
+                "deployment_id": deployment_id,
+                "service_name": config.service_name,
+                "service_type": config.service_type.value,
+                "endpoint_url": endpoint_url,
+                "container_id": container_info.get("container_id"),
+                "status": "deployed",
+                "deployed_at": datetime.now().isoformat()
+            }
+
+            # Register deployment
+            self.deployments[deployment_id] = {
+                "config": config.to_dict(),
+                "result": result,
+                "workspace": str(workspace)
+            }
+
+            logger.info("=" * 60)
+            logger.info("TRITON DEPLOYMENT COMPLETED SUCCESSFULLY!")
+            logger.info("=" * 60)
+            logger.info(f"Deployment ID: {deployment_id}")
+            logger.info(f"Endpoint URL: {endpoint_url}")
+
+            return result
+
+        except Exception as e:
+            logger.error("=" * 60)
+            logger.error("TRITON DEPLOYMENT FAILED!")
+            logger.error("=" * 60)
+            logger.error(f"Error: {e}")
+            raise
+
+    async def _prepare_workspace(self, deployment_id: str, config: TritonConfig) -> Path:
+        """Prepare deployment workspace"""
+        workspace = self.workspace_dir / deployment_id
+        workspace.mkdir(exist_ok=True)
+
+        # Create required directories
+        (workspace / "hf_model").mkdir(exist_ok=True)
+        (workspace / "engines").mkdir(exist_ok=True)
+        (workspace / "model_repository" / config.model_name / config.model_version).mkdir(parents=True, exist_ok=True)
+
+        # Save deployment config
+        with open(workspace / "deployment_config.json", 'w') as f:
+            json.dump(config.to_dict(), f, indent=2)
+
+        logger.info(f"Workspace prepared at: {workspace}")
+        return workspace
+
+    async def _download_hf_model(self, config: TritonConfig, workspace: Path) -> Path:
+        """Download HuggingFace model"""
+        hf_model_path = workspace / "hf_model"
+
+        # Use git clone or huggingface_hub to download
+        try:
+            from huggingface_hub import snapshot_download
+
+            logger.info(f"Downloading model: {config.model_id}")
+            snapshot_download(
+                repo_id=config.model_id,
+                local_dir=str(hf_model_path),
+                local_dir_use_symlinks=False
+            )
+
+            logger.info(f"Model downloaded to: {hf_model_path}")
+            return hf_model_path
+
+        except Exception as e:
+            logger.error(f"Failed to download model: {e}")
+            raise
+
+    async def _build_tensorrt_engine(self, config: TritonConfig, workspace: Path, hf_model_path: Path) -> Path:
+        """Build TensorRT engine using Docker"""
+        engine_output_path = workspace / "engines"
+
+        logger.info("Building TensorRT engine using Docker...")
+
+        # Prepare build command
+        build_options = config.build_options
+        build_cmd_parts = [
+            "trtllm-build",
+            f"--checkpoint_dir /workspace/hf_model",
+            f"--output_dir /workspace/engines",
+        ]
+
+        # Add build options
+        for key, value in build_options.items():
+            if isinstance(value, bool):
+                if value:
+                    build_cmd_parts.append(f"--{key}")
+            else:
+                build_cmd_parts.append(f"--{key} {value}")
+
+        build_cmd = " && ".join([
+            "set -e",
+            "echo '>>> Building TensorRT engine...'",
+            " ".join(build_cmd_parts),
+            "echo '>>> TensorRT engine build completed!'"
+        ])
+
+        # Run Docker container for building
+        if self.docker_client:
+            try:
+                logger.info("Starting TensorRT build container...")
+
+                container = self.docker_client.containers.run(
+                    config.build_container_image,
+                    command=f"bash -c \"{build_cmd}\"",
+                    volumes={
+                        str(hf_model_path): {"bind": "/workspace/hf_model", "mode": "ro"},
+                        str(engine_output_path): {"bind": "/workspace/engines", "mode": "rw"}
+                    },
+                    device_requests=[
+                        docker.types.DeviceRequest(count=-1, capabilities=[["gpu"]])
+                    ],
+                    remove=True,
+                    detach=False
+                )
+
+                logger.info("TensorRT engine build completed")
+
+            except Exception as e:
+                logger.error(f"TensorRT build failed: {e}")
+                raise
+        else:
+            # Fallback to subprocess if Docker client unavailable
+            logger.warning("Docker client unavailable, using subprocess...")
+            # Implementation would depend on having docker command available
+            raise RuntimeError("Docker client required for TensorRT build")
+
+        return engine_output_path
```
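For orientation, the loop above flattens `config.build_options` into `trtllm-build` flags, emitting booleans as bare switches. A small sketch of that assembly for an assumed options dict (the option names are illustrative, not defaults taken from `TritonConfig`):

```python
# Mirrors the command assembly in _build_tensorrt_engine for an assumed dict.
build_options = {"gemm_plugin": "float16", "max_batch_size": 8, "paged_kv_cache": True}

build_cmd_parts = [
    "trtllm-build",
    "--checkpoint_dir /workspace/hf_model",
    "--output_dir /workspace/engines",
]
for key, value in build_options.items():
    if isinstance(value, bool):
        if value:
            build_cmd_parts.append(f"--{key}")      # booleans become bare flags
    else:
        build_cmd_parts.append(f"--{key} {value}")  # everything else is key + value

print(" ".join(build_cmd_parts))
# trtllm-build --checkpoint_dir /workspace/hf_model --output_dir /workspace/engines
#   --gemm_plugin float16 --max_batch_size 8 --paged_kv_cache   (single line, wrapped here)
```

The diff of `provider.py` continues below.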
```diff
+
+    async def _generate_triton_config(self, config: TritonConfig, workspace: Path, model_path: Path):
+        """Generate Triton model configuration"""
+        model_repo_path = workspace / "model_repository" / config.model_name
+
+        # Generate config.pbtxt
+        if config.backend == TritonBackend.TENSORRT_LLM:
+            config_content = self._generate_tensorrt_llm_config(config)
+        elif config.backend == TritonBackend.PYTHON:
+            config_content = self._generate_python_backend_config(config)
+        else:
+            raise ValueError(f"Unsupported backend: {config.backend}")
+
+        # Write config file
+        with open(model_repo_path / "config.pbtxt", 'w') as f:
+            f.write(config_content)
+
+        # Copy model files to model repository
+        model_version_path = model_repo_path / config.model_version
+        if config.use_tensorrt:
+            # Copy engine files
+            import shutil
+            if (model_path / "model.engine").exists():
+                shutil.copy2(model_path / "model.engine", model_version_path)
+            else:
+                # Copy all engine files
+                for engine_file in model_path.glob("*.engine"):
+                    shutil.copy2(engine_file, model_version_path)
+        else:
+            # Copy HF model files
+            import shutil
+            shutil.copytree(model_path, model_version_path / "model", dirs_exist_ok=True)
+
+        logger.info(f"Triton configuration generated at: {model_repo_path}")
+
+    def _generate_tensorrt_llm_config(self, config: TritonConfig) -> str:
+        """Generate TensorRT-LLM backend configuration"""
+        return f'''name: "{config.model_name}"
+backend: "tensorrtllm"
+max_batch_size: {config.max_batch_size}
+
+{"decoupled: true" if config.enable_streaming else ""}
+
+input [
+  {{
+    name: "text_input"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  }},
+  {{
+    name: "max_new_tokens"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    optional: true
+  }},
+  {{
+    name: "stream"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
+  }},
+  {{
+    name: "temperature"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    optional: true
+  }},
+  {{
+    name: "top_p"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    optional: true
+  }}
+]
+
+output [
+  {{
+    name: "text_output"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  }}
+]
+
+instance_group [
+  {{
+    count: {config.instance_group_count}
+    kind: {config.instance_group_kind}
+  }}
+]
+
+parameters {{
+  key: "model_type"
+  value: {{ string_value: "{"inflight_batching_llm" if config.use_inflight_batching else "llm"}" }}
+}}
+
+parameters {{
+  key: "max_tokens_in_paged_kv_cache"
+  value: {{ string_value: "{config.max_sequence_length * config.max_batch_size}" }}
+}}'''
+
+    def _generate_python_backend_config(self, config: TritonConfig) -> str:
+        """Generate Python backend configuration"""
+        return f'''name: "{config.model_name}"
+backend: "python"
+max_batch_size: {config.max_batch_size}
+
+input [
+  {{
+    name: "input"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  }}
+]
+
+output [
+  {{
+    name: "output"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+  }}
+]
+
+instance_group [
+  {{
+    count: {config.instance_group_count}
+    kind: {config.instance_group_kind}
+  }}
+]'''
+
+    async def _deploy_container(self, config: TritonConfig, workspace: Path) -> Dict[str, Any]:
+        """Deploy Triton container"""
+        if not self.docker_client:
+            raise RuntimeError("Docker client required for container deployment")
+
+        # Generate docker-compose.yml
+        await self._generate_docker_compose(config, workspace)
+
+        # Deploy using docker-compose
+        compose_file = workspace / "docker-compose.yml"
+
+        try:
+            # Run docker-compose up
+            cmd = f"cd {workspace} && docker-compose up -d"
+            result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+
+            if result.returncode != 0:
+                raise RuntimeError(f"Docker compose failed: {result.stderr}")
+
+            logger.info("Triton container deployed successfully")
+
+            return {
+                "container_id": f"triton-{config.service_name}",
+                "compose_file": str(compose_file)
+            }
+
+        except Exception as e:
+            logger.error(f"Container deployment failed: {e}")
+            raise
+
+    async def _generate_docker_compose(self, config: TritonConfig, workspace: Path):
+        """Generate docker-compose.yml for Triton deployment"""
+        compose_content = f'''version: '3.8'
+
+services:
+  triton-{config.service_name}:
+    image: {config.container_image}
+    ports:
+      - "{config.http_port}:{config.http_port}"
+      - "{config.grpc_port}:{config.grpc_port}"
+      - "{config.metrics_port}:{config.metrics_port}"
+    volumes:
+      - ./model_repository:/models
+    environment:
+      - CUDA_VISIBLE_DEVICES=0
+{self._format_env_vars(config.environment)}
+    command: >
+      tritonserver
+      --model-repository=/models
+      --allow-http=true
+      --allow-grpc=true
+      --allow-metrics=true
+      --http-port={config.http_port}
+      --grpc-port={config.grpc_port}
+      --metrics-port={config.metrics_port}
+      --log-verbose=1
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: {config.gpu_count}
+              capabilities: [gpu]
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:{config.http_port}/v2/health/ready"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 60s
+'''
+
+        with open(workspace / "docker-compose.yml", 'w') as f:
+            f.write(compose_content)
+
+        logger.info("Docker compose configuration generated")
+
+    def _format_env_vars(self, env_vars: Dict[str, str]) -> str:
+        """Format environment variables for docker-compose"""
+        if not env_vars:
+            return ""
+
+        formatted = []
+        for key, value in env_vars.items():
+            formatted.append(f"      - {key}={value}")
+
+        return "\n" + "\n".join(formatted)
```
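Given the tensor names and dtypes declared in `_generate_tensorrt_llm_config` above, a deployed model should be reachable through Triton's standard KServe-v2 HTTP inference endpoint. A minimal request sketch, assuming a model named `demo-llm` on HTTP port 8000 and a leading batch dimension of 1:

```python
import requests

# Tensor names and dtypes come from the config.pbtxt generated above; the
# model name, port, and batch shape are assumptions for illustration.
payload = {
    "inputs": [
        {"name": "text_input", "datatype": "BYTES", "shape": [1, 1], "data": ["Hello, Triton!"]},
        {"name": "max_new_tokens", "datatype": "UINT32", "shape": [1, 1], "data": [64]},
        {"name": "temperature", "datatype": "FP32", "shape": [1, 1], "data": [0.7]},
    ]
}
resp = requests.post("http://localhost:8000/v2/models/demo-llm/infer", json=payload, timeout=30)
resp.raise_for_status()
print(resp.json()["outputs"][0]["data"])  # the text_output tensor
```

The diff of `provider.py` continues below.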
```diff
+
+    async def _verify_deployment(self, config: TritonConfig, container_info: Dict[str, Any]) -> str:
+        """Verify deployment is healthy"""
+        import time
+        import requests
+
+        endpoint_url = f"http://localhost:{config.http_port}"
+        health_url = f"{endpoint_url}/v2/health/ready"
+
+        # Wait for service to be ready
+        max_retries = 30
+        for i in range(max_retries):
+            try:
+                response = requests.get(health_url, timeout=5)
+                if response.status_code == 200:
+                    logger.info("Triton service is healthy and ready")
+                    return endpoint_url
+            except Exception:
+                pass
+
+            if i < max_retries - 1:
+                logger.info(f"Waiting for Triton service... ({i+1}/{max_retries})")
+                time.sleep(10)
+
+        raise RuntimeError("Triton service failed to become ready")
+
+    async def list_deployments(self) -> List[Dict[str, Any]]:
+        """List all Triton deployments"""
+        return [
+            {
+                "deployment_id": deployment_id,
+                **info
+            }
+            for deployment_id, info in self.deployments.items()
+        ]
+
+    async def delete_deployment(self, deployment_id: str) -> bool:
+        """Delete a Triton deployment"""
+        if deployment_id not in self.deployments:
+            return False
+
+        try:
+            deployment_info = self.deployments[deployment_id]
+            workspace = Path(deployment_info["workspace"])
+
+            # Stop docker-compose services
+            if (workspace / "docker-compose.yml").exists():
+                cmd = f"cd {workspace} && docker-compose down"
+                subprocess.run(cmd, shell=True, capture_output=True)
+
+            # Clean up workspace
+            import shutil
+            if workspace.exists():
+                shutil.rmtree(workspace)
+
+            # Remove from tracking
+            del self.deployments[deployment_id]
+
+            logger.info(f"Triton deployment deleted: {deployment_id}")
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to delete Triton deployment {deployment_id}: {e}")
+            return False
```
isa_model/deployment/triton/scripts/__init__.py ADDED

```diff
@@ -0,0 +1 @@
+"""Triton deployment scripts"""
```
isa_model/deployment/triton/templates/__init__.py ADDED

```diff
@@ -0,0 +1 @@
+"""Triton deployment templates"""
```
isa_model/inference/__init__.py CHANGED

```diff
@@ -8,4 +8,50 @@ This module provides the main inference components for the IsA Model system.
 from .ai_factory import AIFactory
 from .base import ModelType, Capability, RoutingStrategy
 
-
+# Import legacy model services (migrated from isA_MCP)
+try:
+    from .legacy_services import (
+        ModelTrainingService,
+        TrainingConfig,
+        TrainingResult,
+        ModelEvaluationService,
+        EvaluationResult,
+        ModelServingService,
+        ServingResult,
+        ModelService,
+        ModelConfig,
+        ModelResult
+    )
+    LEGACY_SERVICES_AVAILABLE = True
+except ImportError:
+    LEGACY_SERVICES_AVAILABLE = False
+    ModelTrainingService = None
+    TrainingConfig = None
+    TrainingResult = None
+    ModelEvaluationService = None
+    EvaluationResult = None
+    ModelServingService = None
+    ServingResult = None
+    ModelService = None
+    ModelConfig = None
+    ModelResult = None
+
+__all__ = [
+    "AIFactory",
+    "ModelType",
+    "Capability",
+    "RoutingStrategy",
+
+    # Legacy model services (migrated from isA_MCP)
+    'ModelTrainingService',
+    'TrainingConfig',
+    'TrainingResult',
+    'ModelEvaluationService',
+    'EvaluationResult',
+    'ModelServingService',
+    'ServingResult',
+    'ModelService',
+    'ModelConfig',
+    'ModelResult',
+    'LEGACY_SERVICES_AVAILABLE'
+]
```
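Because a failed import degrades every legacy symbol to `None` instead of raising at import time, callers are expected to gate on the new `LEGACY_SERVICES_AVAILABLE` flag. A sketch of the intended pattern (the service constructor signature is assumed):

```python
from isa_model.inference import LEGACY_SERVICES_AVAILABLE, ModelTrainingService

if LEGACY_SERVICES_AVAILABLE:
    training = ModelTrainingService()  # constructor arguments assumed
else:
    # The names are importable but bound to None, so unguarded use fails
    # with a TypeError at call time rather than an ImportError.
    raise RuntimeError("legacy model services are not available in this build")
```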