isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff compares the contents of two publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- isa_model/client.py +1166 -584
- isa_model/core/cache/redis_cache.py +410 -0
- isa_model/core/config/config_manager.py +282 -12
- isa_model/core/config.py +91 -1
- isa_model/core/database/__init__.py +1 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +297 -0
- isa_model/core/database/supabase_client.py +258 -0
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +46 -0
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_billing_tracker.py +60 -88
- isa_model/core/models/model_manager.py +66 -25
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +217 -55
- isa_model/core/models/model_statistics_tracker.py +234 -0
- isa_model/core/models/model_storage.py +0 -1
- isa_model/core/models/model_version_manager.py +959 -0
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/pricing_manager.py +2 -249
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/resilience/circuit_breaker.py +366 -0
- isa_model/core/security/secrets.py +358 -0
- isa_model/core/services/__init__.py +2 -4
- isa_model/core/services/intelligent_model_selector.py +479 -370
- isa_model/core/storage/hf_storage.py +2 -2
- isa_model/core/types.py +8 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -368
- isa_model/deployment/local/__init__.py +31 -0
- isa_model/deployment/local/config.py +248 -0
- isa_model/deployment/local/gpu_gateway.py +607 -0
- isa_model/deployment/local/health_checker.py +428 -0
- isa_model/deployment/local/provider.py +586 -0
- isa_model/deployment/local/tensorrt_service.py +621 -0
- isa_model/deployment/local/transformers_service.py +644 -0
- isa_model/deployment/local/vllm_service.py +527 -0
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/modal/deployer.py +894 -0
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
- isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
- isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
- isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
- isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
- isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
- isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +179 -16
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/__init__.py +21 -0
- isa_model/inference/services/audio/base_realtime_service.py +225 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/isa_tts_service.py +0 -0
- isa_model/inference/services/audio/openai_realtime_service.py +320 -124
- isa_model/inference/services/audio/openai_stt_service.py +53 -11
- isa_model/inference/services/base_service.py +17 -1
- isa_model/inference/services/custom_model_manager.py +277 -0
- isa_model/inference/services/embedding/__init__.py +13 -0
- isa_model/inference/services/embedding/base_embed_service.py +111 -8
- isa_model/inference/services/embedding/isa_embed_service.py +305 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/openai_embed_service.py +2 -4
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
- isa_model/inference/services/img/__init__.py +2 -2
- isa_model/inference/services/img/base_image_gen_service.py +24 -7
- isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
- isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
- isa_model/inference/services/img/services/replicate_flux.py +226 -0
- isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
- isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
- isa_model/inference/services/img/tests/test_img_client.py +297 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +361 -26
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/local_llm_service.py +747 -0
- isa_model/inference/services/llm/ollama_llm_service.py +11 -3
- isa_model/inference/services/llm/openai_llm_service.py +670 -56
- isa_model/inference/services/llm/yyds_llm_service.py +10 -3
- isa_model/inference/services/vision/__init__.py +27 -6
- isa_model/inference/services/vision/base_vision_service.py +118 -185
- isa_model/inference/services/vision/blip_vision_service.py +359 -0
- isa_model/inference/services/vision/helpers/image_utils.py +19 -10
- isa_model/inference/services/vision/isa_vision_service.py +634 -0
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +240 -18
- isa_model/serving/api/middleware/auth.py +317 -0
- isa_model/serving/api/middleware/security.py +268 -0
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +489 -0
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +475 -0
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/logs.py +430 -0
- isa_model/serving/api/routes/settings.py +582 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +992 -171
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +318 -0
- isa_model/serving/modal_proxy_server.py +249 -0
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
- isa_model-0.4.3.dist-info/RECORD +193 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
- isa_model/deployment/cloud/modal/register_models.py +0 -321
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks.py +0 -469
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -18
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/factory.py +0 -531
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/metrics.py +0 -798
- isa_model/inference/adapter/unified_api.py +0 -248
- isa_model/inference/services/helpers/stacked_config.py +0 -148
- isa_model/inference/services/img/flux_professional_service.py +0 -603
- isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/others/table_transformer_service.py +0 -61
- isa_model/inference/services/vision/doc_analysis_service.py +0 -640
- isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/vision/ui_analysis_service.py +0 -823
- isa_model/scripts/inference_tracker.py +0 -283
- isa_model/scripts/mlflow_manager.py +0 -379
- isa_model/scripts/model_registry.py +0 -465
- isa_model/scripts/register_models.py +0 -370
- isa_model/scripts/register_models_with_embeddings.py +0 -510
- isa_model/scripts/start_mlflow.py +0 -95
- isa_model/scripts/training_tracker.py +0 -257
- isa_model/training/__init__.py +0 -74
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -23
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/factory.py +0 -424
- isa_model-0.3.91.dist-info/RECORD +0 -138
- /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/deployment/local/transformers_service.py (new file)

@@ -0,0 +1,644 @@

```python
"""
HuggingFace Transformers local inference service

Direct model loading and inference using HuggingFace Transformers.
"""

import os
import json
import asyncio
import logging
import threading
from typing import Dict, List, Optional, Any, Union
from datetime import datetime
import time
import torch

from .config import LocalGPUConfig, LocalServiceType, LocalBackend
from ...utils.gpu_utils import get_gpu_manager

logger = logging.getLogger(__name__)


class TransformersService:
    """HuggingFace Transformers local inference service"""

    def __init__(self, config: LocalGPUConfig):
        """
        Initialize Transformers service.

        Args:
            config: Local GPU configuration for Transformers
        """
        if config.backend != LocalBackend.TRANSFORMERS:
            raise ValueError("Config must use TRANSFORMERS backend")

        self.config = config
        self.gpu_manager = get_gpu_manager()
        self.model = None
        self.tokenizer = None
        self.processor = None  # For multimodal models
        self.model_loaded = False
        self.startup_time: Optional[datetime] = None
        self.device = None

        # Thread safety for inference
        self._inference_lock = threading.Lock()

        # Service info
        self.service_info = {
            "service_name": config.service_name,
            "model_id": config.model_id,
            "backend": "transformers",
            "status": "stopped",
            "device": None
        }

    async def load_model(self) -> Dict[str, Any]:
        """
        Load HuggingFace model for inference.

        Returns:
            Model loading result
        """
        if self.model_loaded:
            return {
                "success": True,
                "message": "Model already loaded",
                "service_info": self.service_info
            }

        try:
            logger.info(f"Loading Transformers model: {self.config.model_id}")
            self.startup_time = datetime.now()

            # Check GPU requirements
            gpu_check = await self._check_gpu_requirements()
            if not gpu_check["compatible"]:
                return {
                    "success": False,
                    "error": f"GPU requirements not met: {', '.join(gpu_check['warnings'])}",
                    "gpu_check": gpu_check
                }

            # Set device
            self.device = await self._setup_device()

            # Load model components
            load_result = await self._load_model_components()

            if load_result["success"]:
                self.model_loaded = True
                self.service_info.update({
                    "status": "running",
                    "device": str(self.device),
                    "loaded_at": self.startup_time.isoformat(),
                    "load_time": load_result["load_time"],
                    "model_info": load_result["model_info"]
                })

                logger.info(f"Transformers model loaded successfully on {self.device}")
                return {
                    "success": True,
                    "service_info": self.service_info,
                    "load_time": load_result["load_time"],
                    "gpu_info": gpu_check.get("selected_gpu")
                }
            else:
                return load_result

        except Exception as e:
            logger.error(f"Failed to load Transformers model: {e}")
            return {
                "success": False,
                "error": str(e)
            }

    async def unload_model(self) -> Dict[str, Any]:
        """Unload model and free GPU memory"""
        try:
            if self.model:
                del self.model
                self.model = None

            if self.tokenizer:
                del self.tokenizer
                self.tokenizer = None

            if self.processor:
                del self.processor
                self.processor = None

            self.model_loaded = False
            self.service_info.update({
                "status": "stopped",
                "device": None,
                "unloaded_at": datetime.now().isoformat()
            })

            # Free GPU memory
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            logger.info("Transformers model unloaded")
            return {
                "success": True,
                "service_info": self.service_info
            }

        except Exception as e:
            logger.error(f"Failed to unload model: {e}")
            return {
                "success": False,
                "error": str(e)
            }

    async def generate_text(self, prompt: str, **kwargs) -> Dict[str, Any]:
        """Generate text using the loaded model"""
        if not self.model_loaded:
            return {
                "success": False,
                "error": "Model not loaded"
            }

        try:
            with self._inference_lock:
                start_time = time.time()

                # Prepare generation parameters
                max_tokens = kwargs.get("max_tokens", 512)
                temperature = kwargs.get("temperature", 0.7)
                top_p = kwargs.get("top_p", 0.9)
                top_k = kwargs.get("top_k", 50)
                do_sample = kwargs.get("do_sample", True)

                # Tokenize input
                inputs = self.tokenizer(
                    prompt,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=self.config.max_model_len // 2
                ).to(self.device)

                # Generate
                with torch.no_grad():
                    outputs = self.model.generate(
                        **inputs,
                        max_new_tokens=max_tokens,
                        temperature=temperature,
                        top_p=top_p,
                        top_k=top_k,
                        do_sample=do_sample,
                        pad_token_id=self.tokenizer.eos_token_id,
                        eos_token_id=self.tokenizer.eos_token_id,
                        use_cache=True
                    )

                # Decode output
                input_length = inputs['input_ids'].shape[-1]
                generated_tokens = outputs[0][input_length:]
                generated_text = self.tokenizer.decode(
                    generated_tokens,
                    skip_special_tokens=True
                ).strip()

                generation_time = time.time() - start_time

                return {
                    "success": True,
                    "text": generated_text,
                    "model": self.config.model_id,
                    "generation_time": generation_time,
                    "input_tokens": input_length,
                    "output_tokens": len(generated_tokens),
                    "total_tokens": len(outputs[0])
                }

        except Exception as e:
            logger.error(f"Text generation failed: {e}")
            return {
                "success": False,
                "error": str(e)
            }

    async def chat_completion(self, messages: List[Dict[str, str]], **kwargs) -> Dict[str, Any]:
        """Generate chat completion response"""
        # Convert messages to prompt
        prompt = await self._format_chat_messages(messages)

        # Generate response
        result = await self.generate_text(prompt, **kwargs)

        if result["success"]:
            # Format as chat completion
            return {
                "success": True,
                "choices": [{
                    "message": {
                        "role": "assistant",
                        "content": result["text"]
                    },
                    "finish_reason": "stop"
                }],
                "usage": {
                    "prompt_tokens": result["input_tokens"],
                    "completion_tokens": result["output_tokens"],
                    "total_tokens": result["total_tokens"]
                },
                "model": result["model"],
                "generation_time": result["generation_time"]
            }
        else:
            return result

    async def analyze_image(self, image_data: bytes, prompt: str = "Describe this image.", **kwargs) -> Dict[str, Any]:
        """Analyze image using vision model"""
        if self.config.service_type != LocalServiceType.VISION:
            return {
                "success": False,
                "error": "Service not configured for vision tasks"
            }

        if not self.processor:
            return {
                "success": False,
                "error": "Vision processor not loaded"
            }

        try:
            with self._inference_lock:
                start_time = time.time()

                from PIL import Image
                import io

                # Load image
                image = Image.open(io.BytesIO(image_data)).convert('RGB')

                # Process inputs
                inputs = self.processor(
                    text=prompt,
                    images=image,
                    return_tensors="pt"
                ).to(self.device)

                # Generate
                max_tokens = kwargs.get("max_tokens", 512)
                with torch.no_grad():
                    outputs = self.model.generate(
                        **inputs,
                        max_new_tokens=max_tokens,
                        do_sample=True,
                        temperature=kwargs.get("temperature", 0.7)
                    )

                # Decode
                response = self.processor.decode(outputs[0], skip_special_tokens=True)

                # Clean up response (remove prompt)
                if prompt in response:
                    response = response.replace(prompt, "").strip()

                generation_time = time.time() - start_time

                return {
                    "success": True,
                    "text": response,
                    "model": self.config.model_id,
                    "generation_time": generation_time
                }

        except Exception as e:
            logger.error(f"Image analysis failed: {e}")
            return {
                "success": False,
                "error": str(e)
            }

    async def embed_text(self, texts: Union[str, List[str]], **kwargs) -> Dict[str, Any]:
        """Generate text embeddings"""
        if self.config.service_type != LocalServiceType.EMBEDDING:
            return {
                "success": False,
                "error": "Service not configured for embedding tasks"
            }

        try:
            with self._inference_lock:
                start_time = time.time()

                # Ensure texts is a list
                if isinstance(texts, str):
                    texts = [texts]

                # Tokenize
                inputs = self.tokenizer(
                    texts,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=self.config.max_model_len
                ).to(self.device)

                # Generate embeddings
                with torch.no_grad():
                    outputs = self.model(**inputs)

                # Use different pooling strategies based on model
                if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
                    embeddings = outputs.pooler_output
                else:
                    # Mean pooling
                    embeddings = outputs.last_hidden_state.mean(dim=1)

                # Normalize embeddings
                embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

                generation_time = time.time() - start_time

                return {
                    "success": True,
                    "embeddings": embeddings.cpu().numpy().tolist(),
                    "model": self.config.model_id,
                    "generation_time": generation_time,
                    "embedding_dimension": embeddings.shape[-1]
                }

        except Exception as e:
            logger.error(f"Text embedding failed: {e}")
            return {
                "success": False,
                "error": str(e)
            }

    async def health_check(self) -> Dict[str, Any]:
        """Check service health"""
        return {
            "healthy": self.model_loaded,
            "status": "running" if self.model_loaded else "stopped",
            "service_info": self.service_info,
            "device": str(self.device) if self.device else None,
            "model_loaded": self.model_loaded
        }

    async def _load_model_components(self) -> Dict[str, Any]:
        """Load model, tokenizer, and processor"""
        try:
            start_time = time.time()

            from transformers import (
                AutoTokenizer, AutoModel, AutoModelForCausalLM,
                AutoProcessor, AutoModelForVision2Seq,
                BitsAndBytesConfig
            )

            # Load tokenizer
            logger.info("Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.config.model_id,
                revision=self.config.tokenizer_revision,
                trust_remote_code=self.config.trust_remote_code,
                use_fast=True
            )

            # Set pad token if missing
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            # Load processor for multimodal models
            if self.config.service_type in [LocalServiceType.VISION, LocalServiceType.AUDIO]:
                try:
                    logger.info("Loading processor...")
                    self.processor = AutoProcessor.from_pretrained(
                        self.config.model_id,
                        revision=self.config.revision,
                        trust_remote_code=self.config.trust_remote_code
                    )
                except Exception as e:
                    logger.warning(f"Failed to load processor: {e}")

            # Configure quantization
            quantization_config = None
            if self.config.quantization:
                if self.config.quantization in ["4bit", "int4"]:
                    quantization_config = BitsAndBytesConfig(
                        load_in_4bit=True,
                        bnb_4bit_compute_dtype=torch.float16,
                        bnb_4bit_use_double_quant=True,
                        bnb_4bit_quant_type="nf4"
                    )
                elif self.config.quantization in ["8bit", "int8"]:
                    quantization_config = BitsAndBytesConfig(
                        load_in_8bit=True
                    )

            # Determine model class based on service type
            if self.config.service_type == LocalServiceType.EMBEDDING:
                model_class = AutoModel
            elif self.config.service_type == LocalServiceType.VISION:
                model_class = AutoModelForVision2Seq
            else:
                model_class = AutoModelForCausalLM

            # Configure model loading arguments
            model_kwargs = {
                "revision": self.config.revision,
                "trust_remote_code": self.config.trust_remote_code,
                "torch_dtype": self._get_torch_dtype(),
                "device_map": "auto" if self.config.enable_gpu else "cpu",
                "low_cpu_mem_usage": True,
                **self.config.transformers_args
            }

            if quantization_config:
                model_kwargs["quantization_config"] = quantization_config

            # Load model
            logger.info(f"Loading model with {model_class.__name__}...")
            self.model = model_class.from_pretrained(
                self.config.model_id,
                **model_kwargs
            )

            # Move to device if not using device_map
            if not self.config.transformers_args.get("device_map"):
                self.model.to(self.device)

            self.model.eval()

            # Try to compile model for faster inference
            if hasattr(torch, 'compile') and self.config.enable_gpu:
                try:
                    self.model = torch.compile(self.model, mode="reduce-overhead")
                    logger.info("Model compiled for faster inference")
                except Exception as e:
                    logger.warning(f"Model compilation failed: {e}")

            load_time = time.time() - start_time

            # Get model info
            model_info = {
                "model_id": self.config.model_id,
                "model_type": self.config.service_type.value,
                "torch_dtype": str(self.model.dtype) if hasattr(self.model, 'dtype') else None,
                "device": str(next(self.model.parameters()).device) if hasattr(self.model, 'parameters') else None,
                "quantization": self.config.quantization,
                "parameters": self._count_parameters()
            }

            logger.info(f"Model loaded successfully in {load_time:.2f}s")

            return {
                "success": True,
                "load_time": load_time,
                "model_info": model_info
            }

        except Exception as e:
            logger.error(f"Failed to load model components: {e}")
            return {
                "success": False,
                "error": str(e)
            }

    def _count_parameters(self) -> Optional[int]:
        """Count model parameters"""
        try:
            if hasattr(self.model, 'num_parameters'):
                return self.model.num_parameters()
            else:
                return sum(p.numel() for p in self.model.parameters())
        except:
            return None

    def _get_torch_dtype(self) -> torch.dtype:
        """Get appropriate torch dtype"""
        precision_map = {
            "float32": torch.float32,
            "float16": torch.float16,
            "bfloat16": torch.bfloat16,
            "int8": torch.int8
        }
        return precision_map.get(self.config.model_precision, torch.float16)

    async def _setup_device(self) -> torch.device:
        """Setup compute device"""
        if not self.config.enable_gpu or not torch.cuda.is_available():
            return torch.device("cpu")

        if self.config.gpu_id is not None:
            device = torch.device(f"cuda:{self.config.gpu_id}")
        else:
            device = torch.device("cuda")

        # Set memory fraction
        if torch.cuda.is_available():
            torch.cuda.set_per_process_memory_fraction(
                self.config.gpu_memory_fraction,
                device.index if device.index is not None else 0
            )

        return device

    async def _format_chat_messages(self, messages: List[Dict[str, str]]) -> str:
        """Format chat messages into a prompt"""
        formatted_parts = []

        for message in messages:
            role = message.get("role", "user")
            content = message.get("content", "")

            if role == "system":
                formatted_parts.append(f"System: {content}")
            elif role == "user":
                formatted_parts.append(f"Human: {content}")
            elif role == "assistant":
                formatted_parts.append(f"Assistant: {content}")

        formatted_parts.append("Assistant:")
        return "\n\n".join(formatted_parts)

    async def _check_gpu_requirements(self) -> Dict[str, Any]:
        """Check GPU requirements"""
        if not self.config.enable_gpu:
            return {
                "compatible": True,
                "warnings": ["Using CPU inference"],
                "selected_gpu": None
            }

        self.gpu_manager.refresh()

        if not self.gpu_manager.cuda_available:
            return {
                "compatible": True,  # Can fallback to CPU
                "warnings": ["CUDA not available, falling back to CPU"],
                "selected_gpu": None
            }

        # Estimate memory requirements
        estimated_memory = self.gpu_manager.estimate_model_memory(
            self.config.model_id,
            self.config.model_precision
        )

        # Apply quantization reduction
        if self.config.quantization == "int8":
            estimated_memory = int(estimated_memory * 0.5)
        elif self.config.quantization == "int4":
            estimated_memory = int(estimated_memory * 0.25)

        # Find suitable GPU
        if self.config.gpu_id is not None:
            selected_gpu = self.gpu_manager.get_gpu_info(self.config.gpu_id)
            if not selected_gpu:
                return {
                    "compatible": True,
                    "warnings": [f"Specified GPU {self.config.gpu_id} not found, falling back to CPU"],
                    "selected_gpu": None
                }
        else:
            selected_gpu = self.gpu_manager.get_best_gpu(estimated_memory)
            if selected_gpu:
                self.config.gpu_id = selected_gpu.gpu_id

        if not selected_gpu:
            return {
                "compatible": True,
                "warnings": [
                    f"No suitable GPU found (Required: {estimated_memory}MB), falling back to CPU"
                ],
                "selected_gpu": None
            }

        warnings = []

        # Check memory requirements
        required_memory = int(estimated_memory * self.config.gpu_memory_fraction)
        if selected_gpu.memory_free < required_memory:
            warnings.append(f"GPU memory may be tight: {selected_gpu.memory_free}MB available, {required_memory}MB required")

        return {
            "compatible": True,
            "warnings": warnings,
            "selected_gpu": {
                "gpu_id": selected_gpu.gpu_id,
                "name": selected_gpu.name,
                "memory_total": selected_gpu.memory_total,
                "memory_free": selected_gpu.memory_free,
                "utilization": selected_gpu.utilization,
                "estimated_memory_required": estimated_memory
            }
        }

    def get_service_info(self) -> Dict[str, Any]:
        """Get current service information"""
        return {
            **self.service_info,
            "config": self.config.to_dict(),
            "device": str(self.device) if self.device else None,
            "model_loaded": self.model_loaded,
            "startup_time": self.startup_time.isoformat() if self.startup_time else None,
            "parameters": self._count_parameters() if self.model_loaded else None
        }
```
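For context, here is a minimal usage sketch of the new local Transformers service. It is not part of the diff: the `LocalGPUConfig` constructor fields and the `LocalServiceType.LLM` member are assumptions inferred from the attribute accesses in the file above, so the real API may differ.

```python
# Hypothetical usage sketch only. LocalGPUConfig field names (service_name,
# model_id, backend, service_type) and LocalServiceType.LLM are assumed from
# the diff above and may not match the actual constructor signature.
import asyncio

from isa_model.deployment.local.config import LocalBackend, LocalGPUConfig, LocalServiceType
from isa_model.deployment.local.transformers_service import TransformersService


async def main() -> None:
    config = LocalGPUConfig(
        service_name="local-gpt2",
        model_id="gpt2",
        backend=LocalBackend.TRANSFORMERS,   # required by TransformersService.__init__
        service_type=LocalServiceType.LLM,   # assumed enum member for text generation
    )
    service = TransformersService(config)

    # Load the model (falls back to CPU when no suitable GPU is found)
    loaded = await service.load_model()
    if not loaded["success"]:
        raise RuntimeError(loaded.get("error", "model load failed"))

    # Single-turn request through the chat-completion wrapper
    reply = await service.chat_completion(
        [{"role": "user", "content": "Say hello in one sentence."}],
        max_tokens=32,
    )
    if reply["success"]:
        print(reply["choices"][0]["message"]["content"])

    await service.unload_model()


if __name__ == "__main__":
    asyncio.run(main())
```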