isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
- isa_model/client.py +1166 -584
- isa_model/core/cache/redis_cache.py +410 -0
- isa_model/core/config/config_manager.py +282 -12
- isa_model/core/config.py +91 -1
- isa_model/core/database/__init__.py +1 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +297 -0
- isa_model/core/database/supabase_client.py +258 -0
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +46 -0
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_billing_tracker.py +60 -88
- isa_model/core/models/model_manager.py +66 -25
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +217 -55
- isa_model/core/models/model_statistics_tracker.py +234 -0
- isa_model/core/models/model_storage.py +0 -1
- isa_model/core/models/model_version_manager.py +959 -0
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/pricing_manager.py +2 -249
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/resilience/circuit_breaker.py +366 -0
- isa_model/core/security/secrets.py +358 -0
- isa_model/core/services/__init__.py +2 -4
- isa_model/core/services/intelligent_model_selector.py +479 -370
- isa_model/core/storage/hf_storage.py +2 -2
- isa_model/core/types.py +8 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -368
- isa_model/deployment/local/__init__.py +31 -0
- isa_model/deployment/local/config.py +248 -0
- isa_model/deployment/local/gpu_gateway.py +607 -0
- isa_model/deployment/local/health_checker.py +428 -0
- isa_model/deployment/local/provider.py +586 -0
- isa_model/deployment/local/tensorrt_service.py +621 -0
- isa_model/deployment/local/transformers_service.py +644 -0
- isa_model/deployment/local/vllm_service.py +527 -0
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/modal/deployer.py +894 -0
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
- isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
- isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
- isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
- isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
- isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
- isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +179 -16
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/__init__.py +21 -0
- isa_model/inference/services/audio/base_realtime_service.py +225 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/isa_tts_service.py +0 -0
- isa_model/inference/services/audio/openai_realtime_service.py +320 -124
- isa_model/inference/services/audio/openai_stt_service.py +53 -11
- isa_model/inference/services/base_service.py +17 -1
- isa_model/inference/services/custom_model_manager.py +277 -0
- isa_model/inference/services/embedding/__init__.py +13 -0
- isa_model/inference/services/embedding/base_embed_service.py +111 -8
- isa_model/inference/services/embedding/isa_embed_service.py +305 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/openai_embed_service.py +2 -4
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
- isa_model/inference/services/img/__init__.py +2 -2
- isa_model/inference/services/img/base_image_gen_service.py +24 -7
- isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
- isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
- isa_model/inference/services/img/services/replicate_flux.py +226 -0
- isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
- isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
- isa_model/inference/services/img/tests/test_img_client.py +297 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +361 -26
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/local_llm_service.py +747 -0
- isa_model/inference/services/llm/ollama_llm_service.py +11 -3
- isa_model/inference/services/llm/openai_llm_service.py +670 -56
- isa_model/inference/services/llm/yyds_llm_service.py +10 -3
- isa_model/inference/services/vision/__init__.py +27 -6
- isa_model/inference/services/vision/base_vision_service.py +118 -185
- isa_model/inference/services/vision/blip_vision_service.py +359 -0
- isa_model/inference/services/vision/helpers/image_utils.py +19 -10
- isa_model/inference/services/vision/isa_vision_service.py +634 -0
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +240 -18
- isa_model/serving/api/middleware/auth.py +317 -0
- isa_model/serving/api/middleware/security.py +268 -0
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +489 -0
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +475 -0
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/logs.py +430 -0
- isa_model/serving/api/routes/settings.py +582 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +992 -171
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +318 -0
- isa_model/serving/modal_proxy_server.py +249 -0
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
- isa_model-0.4.3.dist-info/RECORD +193 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
- isa_model/deployment/cloud/modal/register_models.py +0 -321
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks.py +0 -469
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -18
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/factory.py +0 -531
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/metrics.py +0 -798
- isa_model/inference/adapter/unified_api.py +0 -248
- isa_model/inference/services/helpers/stacked_config.py +0 -148
- isa_model/inference/services/img/flux_professional_service.py +0 -603
- isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/others/table_transformer_service.py +0 -61
- isa_model/inference/services/vision/doc_analysis_service.py +0 -640
- isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/vision/ui_analysis_service.py +0 -823
- isa_model/scripts/inference_tracker.py +0 -283
- isa_model/scripts/mlflow_manager.py +0 -379
- isa_model/scripts/model_registry.py +0 -465
- isa_model/scripts/register_models.py +0 -370
- isa_model/scripts/register_models_with_embeddings.py +0 -510
- isa_model/scripts/start_mlflow.py +0 -95
- isa_model/scripts/training_tracker.py +0 -257
- isa_model/training/__init__.py +0 -74
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -23
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/factory.py +0 -424
- isa_model-0.3.91.dist-info/RECORD +0 -138
- /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
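
The `+added -removed` counts in the listing above come from a plain file-by-file comparison of the two wheels. For readers who want to reproduce a listing like this locally, here is a minimal sketch; the wheel filenames are hypothetical local paths (pip's standard naming), it assumes both wheels were fetched first (e.g. `pip download isa-model==0.3.91 --no-deps` and `pip download isa-model==0.4.3 --no-deps`), and it only compares `.py` members, so counts for metadata files such as `RECORD` will not appear:

```python
import difflib
import zipfile

# Hypothetical local paths to the two downloaded wheels.
OLD_WHEEL = "isa_model-0.3.91-py3-none-any.whl"
NEW_WHEEL = "isa_model-0.4.3-py3-none-any.whl"

def wheel_sources(path: str) -> dict[str, str]:
    """Read every .py member of a wheel (a zip archive) into memory."""
    with zipfile.ZipFile(path) as zf:
        return {
            name: zf.read(name).decode("utf-8", errors="replace")
            for name in zf.namelist()
            if name.endswith(".py")
        }

old, new = wheel_sources(OLD_WHEEL), wheel_sources(NEW_WHEEL)
for name in sorted(old.keys() | new.keys()):
    a = old.get(name, "").splitlines()
    b = new.get(name, "").splitlines()
    diff = list(difflib.unified_diff(a, b, lineterm=""))
    if diff:
        # diff[2:] skips the ---/+++ header lines, leaving hunk bodies.
        added = sum(1 for line in diff[2:] if line.startswith("+"))
        removed = sum(1 for line in diff[2:] if line.startswith("-"))
        print(f"{name} +{added} -{removed}")
```

The full hunks for the deleted files follow.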
--- a/isa_model/deployment/core/isa_deployment_service.py
+++ /dev/null
@@ -1,401 +0,0 @@
-"""
-ISA Model Deployment Service
-
-Complete deployment pipeline that:
-1. Downloads fine-tuned models from HuggingFace storage
-2. Quantizes models using open-source TensorRT-LLM
-3. Builds optimized engines
-4. Deploys as custom container service on RunPod
-"""
-
-import os
-import json
-import logging
-import asyncio
-from typing import Dict, Any, Optional, List
-from pathlib import Path
-import shutil
-from datetime import datetime
-
-logger = logging.getLogger(__name__)
-
-
-class ISADeploymentService:
-    """
-    Complete deployment service for ISA Model SDK.
-
-    Example:
-        ```python
-        from isa_model.deployment.core import ISADeploymentService
-
-        service = ISADeploymentService()
-
-        # Complete deployment pipeline
-        deployment = await service.deploy_finetuned_model(
-            model_id="gemma-4b-alpaca-v1",
-            quantization="int8"
-        )
-        ```
-    """
-
-    def __init__(self,
-                 work_dir: str = "./isa_deployment_work",
-                 hf_username: str = "xenobordom"):
-        """Initialize ISA deployment service."""
-        self.work_dir = Path(work_dir)
-        self.work_dir.mkdir(parents=True, exist_ok=True)
-        self.hf_username = hf_username
-
-        # Create subdirectories
-        (self.work_dir / "models").mkdir(exist_ok=True)
-        (self.work_dir / "containers").mkdir(exist_ok=True)
-        (self.work_dir / "deployments").mkdir(exist_ok=True)
-
-        logger.info(f"ISA Deployment Service initialized with work_dir: {self.work_dir}")
-
-    async def deploy_finetuned_model(self,
-                                     model_id: str,
-                                     quantization: str = "int8",
-                                     container_registry: str = "docker.io") -> Dict[str, Any]:
-        """Complete deployment pipeline for fine-tuned models."""
-        deployment_id = f"{model_id}-{quantization}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
-        logger.info(f"Starting deployment pipeline: {deployment_id}")
-
-        deployment_info = {
-            "deployment_id": deployment_id,
-            "model_id": model_id,
-            "quantization": quantization,
-            "status": "starting",
-            "steps": []
-        }
-
-        try:
-            # Step 1: Download model
-            model_path = await self._download_finetuned_model(model_id)
-            deployment_info["steps"].append({
-                "step": 1,
-                "name": "download_model",
-                "status": "completed",
-                "model_path": str(model_path)
-            })
-
-            # Step 2: Build container
-            container_image = await self._build_deployment_container(
-                model_id=model_id,
-                model_path=model_path,
-                quantization=quantization,
-                container_registry=container_registry
-            )
-            deployment_info["steps"].append({
-                "step": 2,
-                "name": "build_container",
-                "status": "completed",
-                "container_image": container_image
-            })
-
-            deployment_info["status"] = "completed"
-            deployment_info["completed_at"] = datetime.now().isoformat()
-
-            # Save configuration
-            config_file = self.work_dir / "deployments" / f"{deployment_id}.json"
-            with open(config_file, 'w') as f:
-                json.dump(deployment_info, f, indent=2)
-
-            logger.info(f"✅ Deployment completed: {deployment_id}")
-            return deployment_info
-
-        except Exception as e:
-            deployment_info["status"] = "failed"
-            deployment_info["error"] = str(e)
-            logger.error(f"❌ Deployment failed: {e}")
-            raise
-
-    async def _download_finetuned_model(self, model_id: str) -> Path:
-        """Download fine-tuned model from HuggingFace storage."""
-        from ...core.storage.hf_storage import HuggingFaceStorage
-
-        logger.info(f"Downloading model {model_id}...")
-
-        storage = HuggingFaceStorage(username=self.hf_username)
-        model_path = await storage.load_model(model_id)
-
-        if not model_path:
-            raise ValueError(f"Failed to download model {model_id}")
-
-        # Copy to work directory
-        local_model_path = self.work_dir / "models" / model_id
-        if local_model_path.exists():
-            shutil.rmtree(local_model_path)
-
-        shutil.copytree(model_path, local_model_path)
-        logger.info(f"Model downloaded to: {local_model_path}")
-
-        return local_model_path
-
-    async def _build_deployment_container(self,
-                                          model_id: str,
-                                          model_path: Path,
-                                          quantization: str,
-                                          container_registry: str) -> str:
-        """Build custom deployment container."""
-        container_name = f"isa-model-{model_id}"
-        container_tag = f"{container_registry}/{container_name}:latest"
-
-        logger.info(f"Building container: {container_tag}")
-
-        container_dir = self.work_dir / "containers" / model_id
-        container_dir.mkdir(parents=True, exist_ok=True)
-
-        # Create Dockerfile
-        dockerfile_content = self._create_deployment_dockerfile(quantization)
-        with open(container_dir / "Dockerfile", 'w') as f:
-            f.write(dockerfile_content)
-
-        # Copy model files
-        model_dst = container_dir / "hf_model"
-        if model_dst.exists():
-            shutil.rmtree(model_dst)
-        shutil.copytree(model_path, model_dst)
-
-        # Create server.py
-        server_content = self._create_server_py()
-        with open(container_dir / "server.py", 'w') as f:
-            f.write(server_content)
-
-        # Build container
-        process = await asyncio.create_subprocess_exec(
-            "docker", "build", "-t", container_tag, str(container_dir),
-            stdout=asyncio.subprocess.PIPE,
-            stderr=asyncio.subprocess.PIPE
-        )
-
-        stdout, stderr = await process.communicate()
-
-        if process.returncode != 0:
-            raise RuntimeError(f"Container build failed: {stderr.decode()}")
-
-        logger.info(f"Container built: {container_tag}")
-        return container_tag
-
-    def _create_deployment_dockerfile(self, quantization: str) -> str:
-        """Create Dockerfile for deployment."""
-        return f'''# ISA Model Deployment Container
-FROM nvcr.io/nvidia/pytorch:24.05-py3
-
-# Install dependencies
-RUN apt-get update && apt-get install -y git-lfs curl && rm -rf /var/lib/apt/lists/*
-
-# Install Python packages
-RUN pip install fastapi uvicorn transformers torch
-
-# Clone TensorRT-LLM for quantization and inference
-RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git /opt/TensorRT-LLM
-WORKDIR /opt/TensorRT-LLM
-RUN pip install -r requirements.txt
-
-# Set up application
-WORKDIR /app
-COPY hf_model/ /app/hf_model/
-COPY server.py /app/server.py
-
-# Environment variables
-ENV QUANTIZATION={quantization}
-ENV MODEL_PATH=/app/hf_model
-ENV PYTHONPATH=/opt/TensorRT-LLM:$PYTHONPATH
-
-# Expose port
-EXPOSE 8000
-
-# Health check
-HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \\
-    CMD curl -f http://localhost:8000/health || exit 1
-
-# Start server
-CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
-'''
-
-    def _create_server_py(self) -> str:
-        """Create FastAPI server."""
-        return '''"""
-ISA Model Deployment Server
-"""
-
-import os
-import logging
-import asyncio
-from pathlib import Path
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
-from contextlib import asynccontextmanager
-from transformers import AutoTokenizer, AutoModelForCausalLM
-import torch
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-# Global variables
-MODEL_PATH = os.getenv("MODEL_PATH", "/app/hf_model")
-QUANTIZATION = os.getenv("QUANTIZATION", "int8")
-
-model = None
-tokenizer = None
-
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    """FastAPI lifespan events."""
-    global model, tokenizer
-
-    logger.info("Starting ISA Model Deployment Service...")
-    logger.info(f"Loading model from: {MODEL_PATH}")
-    logger.info(f"Quantization: {QUANTIZATION}")
-
-    try:
-        # Load tokenizer
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
-
-        # Load model with appropriate settings
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_PATH,
-            torch_dtype=torch.float16,
-            device_map="auto",
-            trust_remote_code=True
-        )
-
-        logger.info("🚀 Model loaded successfully!")
-
-    except Exception as e:
-        logger.error(f"Failed to load model: {e}")
-        raise
-
-    yield
-
-    logger.info("Shutting down...")
-    model = None
-    tokenizer = None
-
-app = FastAPI(
-    title="ISA Model Deployment Service",
-    description="Quantized model inference service",
-    version="1.0.0",
-    lifespan=lifespan
-)
-
-class GenerateRequest(BaseModel):
-    prompt: str
-    max_new_tokens: int = 256
-    temperature: float = 0.7
-    top_p: float = 0.9
-
-class GenerateResponse(BaseModel):
-    text: str
-    quantization: str
-    backend: str
-
-@app.post("/generate", response_model=GenerateResponse)
-async def generate(request: GenerateRequest):
-    """Generate text."""
-    if model is None or tokenizer is None:
-        raise HTTPException(status_code=503, detail="Model not loaded")
-
-    try:
-        # Tokenize input
-        inputs = tokenizer(request.prompt, return_tensors="pt").to(model.device)
-
-        # Generate response
-        with torch.no_grad():
-            outputs = model.generate(
-                **inputs,
-                max_new_tokens=request.max_new_tokens,
-                temperature=request.temperature,
-                top_p=request.top_p,
-                do_sample=True,
-                eos_token_id=tokenizer.eos_token_id,
-                pad_token_id=tokenizer.pad_token_id,
-            )
-
-        # Decode response
-        generated_text = tokenizer.decode(
-            outputs[0][len(inputs.input_ids[0]):],
-            skip_special_tokens=True
-        )
-
-        return GenerateResponse(
-            text=generated_text,
-            quantization=QUANTIZATION,
-            backend="Transformers"
-        )
-
-    except Exception as e:
-        logger.error(f"Generation failed: {e}")
-        raise HTTPException(status_code=500, detail=str(e))
-
-@app.get("/health")
-async def health_check():
-    """Health check."""
-    return {
-        "status": "healthy" if (model is not None and tokenizer is not None) else "loading",
-        "quantization": QUANTIZATION,
-        "backend": "Transformers"
-    }
-
-@app.get("/info")
-async def model_info():
-    """Model information."""
-    return {
-        "model_path": MODEL_PATH,
-        "quantization": QUANTIZATION,
-        "framework": "ISA Model SDK",
-        "backend": "Transformers"
-    }
-'''
-
-    def get_deployment_instructions(self, deployment_info: Dict[str, Any]) -> str:
-        """Generate deployment instructions."""
-        container_image = None
-
-        for step in deployment_info.get("steps", []):
-            if step["name"] == "build_container":
-                container_image = step.get("container_image")
-
-        return f'''# ISA Model Deployment Instructions
-
-## Deployment ID: {deployment_info['deployment_id']}
-## Model: {deployment_info['model_id']}
-## Quantization: {deployment_info['quantization']}
-
-### Container Image
-```
-{container_image or 'Not built yet'}
-```
-
-### RunPod Configuration
-- **Container Image**: {container_image}
-- **GPU Type**: NVIDIA RTX A6000
-- **Container Disk**: 30GB
-- **Ports**: 8000 (HTTP API)
-
-### Testing the Deployment
-```python
-import requests
-
-# Health check
-response = requests.get("http://your-endpoint/health")
-print(response.json())
-
-# Generate text
-payload = {{
-    "prompt": "What is machine learning?",
-    "max_new_tokens": 100,
-    "temperature": 0.7
-}}
-
-response = requests.post("http://your-endpoint/generate", json=payload)
-print(response.json())
-```
-
-### Features
-- ✅ Automatic model download from HuggingFace
-- ✅ {deployment_info['quantization'].upper()} quantization for efficiency
-- ✅ FastAPI REST interface
-- ✅ Health monitoring
-'''
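
For context on what this removal takes away, the removed `ISADeploymentService` was driven roughly as follows; this is a sketch assembled from the class docstring and method signatures above, not from any surviving package documentation:

```python
import asyncio

# Import path as it existed in 0.3.91; the module is removed in 0.4.3.
from isa_model.deployment.core import ISADeploymentService

async def main() -> None:
    service = ISADeploymentService(work_dir="./isa_deployment_work")

    # Step 1 downloads the fine-tuned model from HuggingFace storage;
    # step 2 writes a Dockerfile plus server.py and runs `docker build`.
    deployment = await service.deploy_finetuned_model(
        model_id="gemma-4b-alpaca-v1",
        quantization="int8",
        container_registry="docker.io",
    )

    # Render the RunPod setup and testing instructions for the built image.
    print(service.get_deployment_instructions(deployment))

asyncio.run(main())
```

Note that `deploy_finetuned_model` is a coroutine, hence the `asyncio.run` wrapper.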
--- a/isa_model/deployment/gpu_int8_ds8/app/server.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import os
-from fastapi import FastAPI
-from pydantic import BaseModel
-from contextlib import asynccontextmanager
-from pathlib import Path
-from threading import Thread
-from transformers import AutoTokenizer
-from tensorrt_llm.runtime import ModelRunner
-
-# --- Global variables ---
-ENGINE_PATH = "/app/built_engine/deepseek_engine"
-TOKENIZER_PATH = "/app/hf_model"  # we need the tokenizer from the original HF model
-runner = None
-tokenizer = None
-
-# --- FastAPI lifespan events ---
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    global runner, tokenizer
-    print("--- Loading model engine and tokenizer... ---")
-    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
-    runner = ModelRunner.from_dir(engine_dir=ENGINE_PATH, rank=0, stream=True)
-    print("--- ✅ Model loaded, service is ready ---")
-    yield
-    print("--- Cleaning up resources... ---")
-    runner = None
-    tokenizer = None
-
-app = FastAPI(lifespan=lifespan)
-
-# --- API request/response models ---
-class GenerateRequest(BaseModel):
-    prompt: str
-    max_new_tokens: int = 256
-    temperature: float = 0.7
-
-class GenerateResponse(BaseModel):
-    text: str
-
-# --- API endpoints ---
-@app.post("/generate", response_model=GenerateResponse)
-async def generate(request: GenerateRequest):
-    print(f"Received request: {request.prompt}")
-
-    # Prepare the input
-    input_ids = tokenizer.encode(request.prompt, return_tensors="pt").to("cuda")
-
-    # Run inference
-    output_ids = runner.generate(
-        input_ids,
-        max_new_tokens=request.max_new_tokens,
-        temperature=request.temperature,
-        eos_token_id=tokenizer.eos_token_id,
-        pad_token_id=tokenizer.pad_token_id,
-    )
-
-    # Clean up and decode the output
-    # output_ids[0] has shape [beam_width, seq_length]
-    generated_text = tokenizer.decode(output_ids[0, 0, len(input_ids[0]):], skip_special_tokens=True)
-
-    print(f"Generated response: {generated_text}")
-    return GenerateResponse(text=generated_text)
-
-@app.get("/health")
-async def health_check():
-    return {"status": "ok" if runner is not None else "loading"}
--- a/isa_model/deployment/gpu_int8_ds8/scripts/test_client.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import requests
-import json
-
-# --- Configuration ---
-TRITON_SERVER_URL = "http://localhost:8000"
-MODEL_NAME = "deepseek_trtllm"
-PROMPT = "Tell me a joke about artificial intelligence."
-MAX_TOKENS = 256
-STREAM = False
-# ----------------------------------------------------
-
-def main():
-    """Send a request to the Triton server and print the result."""
-    url = f"{TRITON_SERVER_URL}/v2/models/{MODEL_NAME}/generate"
-    payload = {
-        "text_input": PROMPT,
-        "max_new_tokens": MAX_TOKENS,
-        "temperature": 0.7,
-        "stream": STREAM
-    }
-    print(f"Sending request to: {url}")
-    print(f"Payload: {json.dumps(payload, indent=2, ensure_ascii=False)}")
-    print("-" * 30)
-
-    try:
-        response = requests.post(url, json=payload, headers={"Accept": "application/json"})
-        response.raise_for_status()
-        response_data = response.json()
-        generated_text = response_data.get('text_output', 'Error: "text_output" key not found.')
-
-        print("✅ Request successful!")
-        print("-" * 30)
-        print("Prompt:", PROMPT)
-        print("\nGenerated Text:", generated_text)
-
-    except requests.exceptions.RequestException as e:
-        print(f"❌ Error making request to Triton server: {e}")
-        if e.response is not None:  # a 4xx/5xx Response is falsy, so compare against None
-            print(f"Response Status Code: {e.response.status_code}")
-            print(f"Response Body: {e.response.text}")
-
-if __name__ == '__main__':
-    main()
--- a/isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import requests
-import json
-
-PROMPT = "Tell me a joke about artificial intelligence."
-API_URL = "http://localhost:8000/generate"
-
-def main():
-    payload = {
-        "prompt": PROMPT,
-        "max_new_tokens": 100
-    }
-
-    print(f"Sending request to: {API_URL}")
-    print(f"Payload: {json.dumps(payload, ensure_ascii=False)}")
-    print("-" * 30)
-
-    try:
-        response = requests.post(API_URL, json=payload)
-        response.raise_for_status()
-
-        response_data = response.json()
-        generated_text = response_data.get('text')
-
-        print("✅ Request successful!")
-        print("-" * 30)
-        print("Prompt:", PROMPT)
-        print("\nGenerated Text:", generated_text)
-
-    except requests.exceptions.RequestException as e:
-        print(f"❌ Error making request: {e}")
-        if e.response is not None:  # a 4xx/5xx Response is falsy, so compare against None
-            print(f"Response Body: {e.response.text}")
-
-if __name__ == '__main__':
-    main()