isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff compares publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.
- isa_model/client.py +1166 -584
- isa_model/core/cache/redis_cache.py +410 -0
- isa_model/core/config/config_manager.py +282 -12
- isa_model/core/config.py +91 -1
- isa_model/core/database/__init__.py +1 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +297 -0
- isa_model/core/database/supabase_client.py +258 -0
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +46 -0
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_billing_tracker.py +60 -88
- isa_model/core/models/model_manager.py +66 -25
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +217 -55
- isa_model/core/models/model_statistics_tracker.py +234 -0
- isa_model/core/models/model_storage.py +0 -1
- isa_model/core/models/model_version_manager.py +959 -0
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/pricing_manager.py +2 -249
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/resilience/circuit_breaker.py +366 -0
- isa_model/core/security/secrets.py +358 -0
- isa_model/core/services/__init__.py +2 -4
- isa_model/core/services/intelligent_model_selector.py +479 -370
- isa_model/core/storage/hf_storage.py +2 -2
- isa_model/core/types.py +8 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -368
- isa_model/deployment/local/__init__.py +31 -0
- isa_model/deployment/local/config.py +248 -0
- isa_model/deployment/local/gpu_gateway.py +607 -0
- isa_model/deployment/local/health_checker.py +428 -0
- isa_model/deployment/local/provider.py +586 -0
- isa_model/deployment/local/tensorrt_service.py +621 -0
- isa_model/deployment/local/transformers_service.py +644 -0
- isa_model/deployment/local/vllm_service.py +527 -0
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/modal/deployer.py +894 -0
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
- isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
- isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
- isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
- isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
- isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
- isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +179 -16
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/__init__.py +21 -0
- isa_model/inference/services/audio/base_realtime_service.py +225 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/isa_tts_service.py +0 -0
- isa_model/inference/services/audio/openai_realtime_service.py +320 -124
- isa_model/inference/services/audio/openai_stt_service.py +53 -11
- isa_model/inference/services/base_service.py +17 -1
- isa_model/inference/services/custom_model_manager.py +277 -0
- isa_model/inference/services/embedding/__init__.py +13 -0
- isa_model/inference/services/embedding/base_embed_service.py +111 -8
- isa_model/inference/services/embedding/isa_embed_service.py +305 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/openai_embed_service.py +2 -4
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
- isa_model/inference/services/img/__init__.py +2 -2
- isa_model/inference/services/img/base_image_gen_service.py +24 -7
- isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
- isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
- isa_model/inference/services/img/services/replicate_flux.py +226 -0
- isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
- isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
- isa_model/inference/services/img/tests/test_img_client.py +297 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +361 -26
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/local_llm_service.py +747 -0
- isa_model/inference/services/llm/ollama_llm_service.py +11 -3
- isa_model/inference/services/llm/openai_llm_service.py +670 -56
- isa_model/inference/services/llm/yyds_llm_service.py +10 -3
- isa_model/inference/services/vision/__init__.py +27 -6
- isa_model/inference/services/vision/base_vision_service.py +118 -185
- isa_model/inference/services/vision/blip_vision_service.py +359 -0
- isa_model/inference/services/vision/helpers/image_utils.py +19 -10
- isa_model/inference/services/vision/isa_vision_service.py +634 -0
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +240 -18
- isa_model/serving/api/middleware/auth.py +317 -0
- isa_model/serving/api/middleware/security.py +268 -0
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +489 -0
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +475 -0
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/logs.py +430 -0
- isa_model/serving/api/routes/settings.py +582 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +992 -171
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +318 -0
- isa_model/serving/modal_proxy_server.py +249 -0
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
- isa_model-0.4.3.dist-info/RECORD +193 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
- isa_model/deployment/cloud/modal/register_models.py +0 -321
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks.py +0 -469
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -18
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/factory.py +0 -531
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/metrics.py +0 -798
- isa_model/inference/adapter/unified_api.py +0 -248
- isa_model/inference/services/helpers/stacked_config.py +0 -148
- isa_model/inference/services/img/flux_professional_service.py +0 -603
- isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/others/table_transformer_service.py +0 -61
- isa_model/inference/services/vision/doc_analysis_service.py +0 -640
- isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/vision/ui_analysis_service.py +0 -823
- isa_model/scripts/inference_tracker.py +0 -283
- isa_model/scripts/mlflow_manager.py +0 -379
- isa_model/scripts/model_registry.py +0 -465
- isa_model/scripts/register_models.py +0 -370
- isa_model/scripts/register_models_with_embeddings.py +0 -510
- isa_model/scripts/start_mlflow.py +0 -95
- isa_model/scripts/training_tracker.py +0 -257
- isa_model/training/__init__.py +0 -74
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -23
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/factory.py +0 -424
- isa_model-0.3.91.dist-info/RECORD +0 -138
- /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
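
The largest single addition in 0.4.3 is the local GPU inference stack; the 747-line hunk below is the new `isa_model/inference/services/llm/local_llm_service.py`, which wraps the `isa_model.deployment.local` provider (`LocalGPUProvider`) behind one service class. As a rough orientation before reading the diff, a minimal call pattern looks like the following sketch, assembled from the module's own docstring and `__main__` test; it assumes a CUDA-capable GPU and the optional extras (`pip install 'isa-model[local]'`) are installed.

```python
import asyncio
from isa_model.inference.services.llm.local_llm_service import create_local_llm_service

async def main():
    # Deploy the listed model on the local GPU with the Transformers backend,
    # then run a chat completion against it.
    service = await create_local_llm_service(
        models_to_deploy=["microsoft/DialoGPT-medium"],
        backend="transformers",
    )
    health = await service.health_check()
    if health.get("success"):
        reply = await service.chat([{"role": "user", "content": "Hello!"}])
        print(reply.get("text"))

asyncio.run(main())
```

If the optional deployment modules are missing, the constructor only logs a warning and the service runs in a degraded mode; if PyTorch/Transformers themselves are missing, it raises `ImportError` with the install command to run.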
@@ -0,0 +1,747 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Local LLM Service - Direct local GPU inference service
+Provides high-performance local model inference using vLLM, TensorRT-LLM, and Transformers
+"""
+
+import logging
+import asyncio
+from typing import Dict, Any, Optional, List, Union
+from pathlib import Path
+
+from isa_model.inference.services.base_service import BaseService
+from isa_model.core.models.model_manager import ModelManager
+from isa_model.core.config import ConfigManager
+from isa_model.core.dependencies import DependencyChecker, is_torch_available, is_transformers_available
+
+# Conditional imports for local deployment
+try:
+    from isa_model.deployment.local import (
+        LocalGPUProvider, LocalGPUConfig, LocalServiceType, LocalBackend,
+        create_vllm_config, create_tensorrt_config, create_transformers_config
+    )
+    LOCAL_DEPLOYMENT_AVAILABLE = True
+except ImportError:
+    LOCAL_DEPLOYMENT_AVAILABLE = False
+    LocalGPUProvider = None
+    LocalGPUConfig = None
+    LocalServiceType = None
+    LocalBackend = None
+
+# Conditional import for GPU utilities
+try:
+    from isa_model.utils.gpu_utils import get_gpu_manager
+    GPU_UTILS_AVAILABLE = True
+except ImportError:
+    GPU_UTILS_AVAILABLE = False
+    get_gpu_manager = None
+
+logger = logging.getLogger(__name__)
+
+
+class LocalLLMService(BaseService):
+    """
+    Local LLM Service - Direct local GPU inference
+
+    Features:
+    - Multiple inference backends (vLLM, TensorRT-LLM, Transformers)
+    - Automatic GPU resource management
+    - Model deployment and lifecycle management
+    - High-performance local inference
+    - No cloud dependency
+
+    Example:
+        ```python
+        service = LocalLLMService()
+
+        # Deploy a model
+        await service.deploy_model("meta-llama/Llama-2-7b-chat-hf", backend="vllm")
+
+        # Generate text
+        result = await service.complete("Hello, how are you?")
+        print(result['text'])
+        ```
+    """
+
+    def __init__(
+        self,
+        provider_name: str = "local",
+        model_name: str = None,
+        model_manager: ModelManager = None,
+        config_manager: ConfigManager = None,
+        workspace_dir: str = "./local_llm_services",
+        auto_deploy_models: List[str] = None,
+        preferred_backend: str = "transformers",
+        **kwargs
+    ):
+        # Check dependencies based on preferred backend
+        if preferred_backend == "transformers":
+            if not is_torch_available() or not is_transformers_available():
+                install_cmd = DependencyChecker.get_install_command(group="local_llm")
+                raise ImportError(
+                    f"Local LLM inference requires PyTorch and Transformers.\n"
+                    f"Install with: {install_cmd}"
+                )
+        elif preferred_backend == "vllm":
+            available, missing = DependencyChecker.check_group("vllm")
+            if not available:
+                install_cmd = DependencyChecker.get_install_command(group="vllm")
+                raise ImportError(
+                    f"vLLM backend requires additional dependencies: {', '.join(missing)}.\n"
+                    f"Install with: {install_cmd}"
+                )
+
+        # Check if local deployment is available
+        if not LOCAL_DEPLOYMENT_AVAILABLE:
+            logger.warning(
+                "Local deployment modules are not available. "
+                "Some features may be limited. "
+                "Install with: pip install 'isa-model[local]'"
+            )
+
+        # Initialize base service
+        self.provider_name = provider_name
+        self.model_name = model_name or "local-llm"
+        self.workspace_dir = Path(workspace_dir)
+        self.preferred_backend = preferred_backend
+        self.auto_deploy_models = auto_deploy_models or []
+
+        # Initialize local GPU provider if available
+        try:
+            if LOCAL_DEPLOYMENT_AVAILABLE and GPU_UTILS_AVAILABLE:
+                self.local_provider = LocalGPUProvider(str(self.workspace_dir))
+                self.gpu_manager = get_gpu_manager()
+                self.gpu_available = self.gpu_manager.cuda_available
+                logger.info("✅ Local GPU provider initialized")
+            else:
+                logger.warning("⚠️ Local GPU provider not available - CPU inference only")
+                self.local_provider = None
+                self.gpu_manager = None
+                self.gpu_available = False
+        except Exception as e:
+            logger.error(f"❌ Failed to initialize local GPU provider: {e}")
+            self.local_provider = None
+            self.gpu_manager = None
+            self.gpu_available = False
+
+        # Service state
+        self.deployed_models: Dict[str, str] = {}  # model_id -> service_name
+        self.default_service: Optional[str] = None
+        self.request_count = 0
+
+        # Configuration
+        self.config_manager = config_manager or ConfigManager()
+        self.local_config = self.config_manager.get_local_gpu_config()
+
+        logger.info(f"Local LLM Service initialized (GPU Available: {self.gpu_available})")
+
+    async def initialize(self):
+        """Initialize the service and auto-deploy models if configured"""
+        if not self.gpu_available:
+            logger.warning("⚠️ No GPU available, local inference will be limited")
+            return
+
+        # Auto-deploy models if specified
+        for model_id in self.auto_deploy_models:
+            try:
+                logger.info(f"🚀 Auto-deploying model: {model_id}")
+                result = await self.deploy_model(model_id, backend=self.preferred_backend)
+                if result.get("success"):
+                    logger.info(f"✅ Auto-deployed: {model_id}")
+                else:
+                    logger.warning(f"❌ Auto-deploy failed for {model_id}: {result.get('error')}")
+            except Exception as e:
+                logger.error(f"❌ Auto-deploy error for {model_id}: {e}")
+
+    async def deploy_model(
+        self,
+        model_id: str,
+        backend: str = None,
+        service_name: str = None,
+        **config_kwargs
+    ) -> Dict[str, Any]:
+        """
+        Deploy a model to local GPU
+
+        Args:
+            model_id: HuggingFace model ID
+            backend: Inference backend (vllm, tensorrt_llm, transformers)
+            service_name: Custom service name
+            **config_kwargs: Additional configuration parameters
+
+        Returns:
+            Deployment result
+        """
+        if not self.local_provider:
+            return {
+                "success": False,
+                "error": "Local GPU provider not available"
+            }
+
+        try:
+            # Generate service name
+            if not service_name:
+                service_name = f"local-{model_id.replace('/', '-').replace('_', '-')}"
+
+            # Select backend
+            backend = backend or self.preferred_backend
+            backend_enum = LocalBackend(backend)
+
+            # Create configuration based on backend
+            if backend_enum == LocalBackend.VLLM:
+                config = create_vllm_config(
+                    service_name=service_name,
+                    model_id=model_id,
+                    **config_kwargs
+                )
+            elif backend_enum == LocalBackend.TENSORRT_LLM:
+                config = create_tensorrt_config(
+                    service_name=service_name,
+                    model_id=model_id,
+                    **config_kwargs
+                )
+            else:  # Transformers
+                config = create_transformers_config(
+                    service_name=service_name,
+                    model_id=model_id,
+                    **config_kwargs
+                )
+
+            logger.info(f"🚀 Deploying {model_id} with {backend} backend...")
+
+            # Deploy the model
+            result = await self.local_provider.deploy(config)
+
+            if result.get("success"):
+                # Track deployed model
+                self.deployed_models[model_id] = service_name
+
+                # Set as default if first model
+                if not self.default_service:
+                    self.default_service = service_name
+
+                logger.info(f"✅ Model deployed successfully: {model_id} -> {service_name}")
+
+                return {
+                    "success": True,
+                    "model_id": model_id,
+                    "service_name": service_name,
+                    "backend": backend,
+                    "deployment_info": result
+                }
+            else:
+                logger.error(f"❌ Deployment failed for {model_id}: {result.get('error')}")
+                return result
+
+        except Exception as e:
+            logger.error(f"❌ Deploy model error: {e}")
+            return {
+                "success": False,
+                "error": str(e)
+            }
+
+    async def undeploy_model(self, model_id: str) -> Dict[str, Any]:
+        """
+        Undeploy a model from local GPU
+
+        Args:
+            model_id: Model ID to undeploy
+
+        Returns:
+            Undeploy result
+        """
+        if model_id not in self.deployed_models:
+            return {
+                "success": False,
+                "error": f"Model {model_id} not deployed"
+            }
+
+        try:
+            service_name = self.deployed_models[model_id]
+
+            # Undeploy from local provider
+            result = await self.local_provider.undeploy(service_name)
+
+            if result.get("success"):
+                # Remove from tracking
+                del self.deployed_models[model_id]
+
+                # Update default service if needed
+                if self.default_service == service_name:
+                    self.default_service = next(iter(self.deployed_models.values()), None)
+
+                logger.info(f"✅ Model undeployed: {model_id}")
+
+            return result
+
+        except Exception as e:
+            logger.error(f"❌ Undeploy error: {e}")
+            return {
+                "success": False,
+                "error": str(e)
+            }
+
+    async def complete(
+        self,
+        prompt: str,
+        model_id: str = None,
+        max_tokens: int = 512,
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+        top_k: int = 50,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Generate text completion using local model
+
+        Args:
+            prompt: Input text prompt
+            model_id: Specific model to use (optional)
+            max_tokens: Maximum tokens to generate
+            temperature: Sampling temperature
+            top_p: Top-p sampling
+            top_k: Top-k sampling
+            **kwargs: Additional generation parameters
+
+        Returns:
+            Generated text result
+        """
+        if not self.local_provider:
+            return {
+                "success": False,
+                "error": "Local GPU provider not available",
+                "provider": "local",
+                "service": "local-llm"
+            }
+
+        try:
+            # Select service to use
+            service_name = None
+            if model_id and model_id in self.deployed_models:
+                service_name = self.deployed_models[model_id]
+            elif self.default_service:
+                service_name = self.default_service
+            else:
+                return {
+                    "success": False,
+                    "error": "No models deployed locally",
+                    "provider": "local",
+                    "service": "local-llm"
+                }
+
+            logger.info(f"🔄 Generating text with service: {service_name}")
+
+            # Generate text
+            result = await self.local_provider.generate_text(
+                service_name=service_name,
+                prompt=prompt,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                **kwargs
+            )
+
+            if result.get("success"):
+                self.request_count += 1
+
+                # Format response
+                return {
+                    "success": True,
+                    "text": result.get("text", ""),
+                    "generated_text": result.get("text", ""),
+                    "full_text": prompt + " " + result.get("text", ""),
+                    "prompt": prompt,
+                    "model_id": model_id or "local-default",
+                    "provider": "local",
+                    "service": "local-llm",
+                    "backend": result.get("backend", "unknown"),
+                    "generation_config": {
+                        "max_tokens": max_tokens,
+                        "temperature": temperature,
+                        "top_p": top_p,
+                        "top_k": top_k,
+                        **kwargs
+                    },
+                    "metadata": {
+                        "processing_time": result.get("generation_time", 0),
+                        "service_name": service_name,
+                        "input_tokens": result.get("input_tokens", 0),
+                        "output_tokens": result.get("output_tokens", 0),
+                        "total_tokens": result.get("total_tokens", 0),
+                        "gpu_accelerated": True,
+                        "local_inference": True
+                    }
+                }
+            else:
+                return {
+                    "success": False,
+                    "error": result.get("error", "Local inference failed"),
+                    "provider": "local",
+                    "service": "local-llm",
+                    "details": result
+                }
+
+        except Exception as e:
+            logger.error(f"❌ Local completion failed: {e}")
+            return {
+                "success": False,
+                "error": str(e),
+                "provider": "local",
+                "service": "local-llm"
+            }
+
+    async def chat(
+        self,
+        messages: List[Dict[str, str]],
+        model_id: str = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Chat completion using local model
+
+        Args:
+            messages: List of chat messages
+            model_id: Specific model to use (optional)
+            **kwargs: Additional generation parameters
+
+        Returns:
+            Chat completion result
+        """
+        if not self.local_provider:
+            return {
+                "success": False,
+                "error": "Local GPU provider not available",
+                "provider": "local",
+                "service": "local-llm"
+            }
+
+        try:
+            # Select service to use
+            service_name = None
+            if model_id and model_id in self.deployed_models:
+                service_name = self.deployed_models[model_id]
+            elif self.default_service:
+                service_name = self.default_service
+            else:
+                return {
+                    "success": False,
+                    "error": "No models deployed locally",
+                    "provider": "local",
+                    "service": "local-llm"
+                }
+
+            logger.info(f"💬 Chat completion with service: {service_name}")
+
+            # Generate chat completion
+            result = await self.local_provider.chat_completion(
+                service_name=service_name,
+                messages=messages,
+                **kwargs
+            )
+
+            if result.get("success"):
+                self.request_count += 1
+
+                # Format response
+                response_content = ""
+                if "choices" in result and result["choices"]:
+                    response_content = result["choices"][0].get("message", {}).get("content", "")
+                elif "text" in result:
+                    response_content = result["text"]
+
+                return {
+                    "success": True,
+                    "text": response_content,
+                    "content": response_content,
+                    "role": "assistant",
+                    "messages": messages,
+                    "response": {
+                        "role": "assistant",
+                        "content": response_content
+                    },
+                    "model_id": model_id or "local-default",
+                    "provider": "local",
+                    "service": "local-llm",
+                    "metadata": {
+                        "processing_time": result.get("generation_time", 0),
+                        "service_name": service_name,
+                        "usage": result.get("usage", {}),
+                        "gpu_accelerated": True,
+                        "local_inference": True
+                    }
+                }
+            else:
+                return {
+                    "success": False,
+                    "error": result.get("error", "Local chat completion failed"),
+                    "provider": "local",
+                    "service": "local-llm",
+                    "details": result
+                }
+
+        except Exception as e:
+            logger.error(f"❌ Local chat completion failed: {e}")
+            return {
+                "success": False,
+                "error": str(e),
+                "provider": "local",
+                "service": "local-llm"
+            }
+
+    async def get_model_info(self, model_id: str = None) -> Dict[str, Any]:
+        """Get information about deployed models"""
+        try:
+            if not self.local_provider:
+                return {
+                    "success": False,
+                    "error": "Local GPU provider not available"
+                }
+
+            if model_id and model_id in self.deployed_models:
+                # Get info for specific model
+                service_name = self.deployed_models[model_id]
+                service_info = await self.local_provider.get_service_info(service_name)
+
+                return {
+                    "success": True,
+                    "model_id": model_id,
+                    "service_name": service_name,
+                    "provider": "local",
+                    "service": "local-llm",
+                    "service_info": service_info
+                }
+            else:
+                # Get info for all deployed models
+                all_services = await self.local_provider.list_services()
+
+                return {
+                    "success": True,
+                    "provider": "local",
+                    "service": "local-llm",
+                    "deployed_models": self.deployed_models,
+                    "default_service": self.default_service,
+                    "services": all_services,
+                    "gpu_status": self.gpu_manager.get_system_info() if self.gpu_manager else None
+                }
+
+        except Exception as e:
+            logger.error(f"❌ Get model info failed: {e}")
+            return {
+                "success": False,
+                "error": str(e)
+            }
+
+    async def health_check(self) -> Dict[str, Any]:
+        """Check local LLM service health"""
+        try:
+            if not self.local_provider:
+                return {
+                    "success": False,
+                    "status": "error",
+                    "provider": "local",
+                    "service": "local-llm",
+                    "error": "Local GPU provider not available"
+                }
+
+            # Get system status
+            system_status = await self.local_provider.get_system_status()
+
+            # Check deployed services
+            services = await self.local_provider.list_services()
+            healthy_services = [s for s in services if s.get("healthy", False)]
+
+            return {
+                "success": True,
+                "status": "healthy" if len(healthy_services) > 0 else "no_services",
+                "provider": "local",
+                "service": "local-llm",
+                "deployed_models": len(self.deployed_models),
+                "healthy_services": len(healthy_services),
+                "total_services": len(services),
+                "gpu_available": self.gpu_available,
+                "system_status": system_status,
+                "usage_stats": {
+                    "total_requests": self.request_count,
+                    "deployed_models": list(self.deployed_models.keys()),
+                    "default_service": self.default_service
+                }
+            }
+
+        except Exception as e:
+            logger.error(f"❌ Health check failed: {e}")
+            return {
+                "success": False,
+                "status": "error",
+                "provider": "local",
+                "service": "local-llm",
+                "error": str(e)
+            }
+
+    def get_supported_tasks(self) -> List[str]:
+        """Get supported task list"""
+        return [
+            "generate",   # Text generation
+            "chat",       # Chat completion
+            "complete",   # Text completion
+            "deploy",     # Model deployment
+            "undeploy"    # Model undeployment
+        ]
+
+    def get_supported_models(self) -> List[str]:
+        """Get supported model types"""
+        return [
+            "llama",      # Llama models
+            "mistral",    # Mistral models
+            "qwen",       # Qwen models
+            "gpt2",       # GPT-2 models
+            "dialogpt",   # DialoGPT models
+            "custom"      # Custom trained models
+        ]
+
+    def get_supported_backends(self) -> List[str]:
+        """Get supported inference backends"""
+        return ["vllm", "tensorrt_llm", "transformers"]
+
+    async def invoke(self, input_data: str, task: str = "chat", **kwargs) -> Dict[str, Any]:
+        """
+        Unified invoke method for compatibility with ISA Model client interface
+        """
+        try:
+            if task in ["chat", "generate", "complete"]:
+                if task == "chat":
+                    if isinstance(input_data, str):
+                        messages = [{"role": "user", "content": input_data}]
+                    elif isinstance(input_data, list):
+                        messages = input_data
+                    else:
+                        messages = [{"role": "user", "content": str(input_data)}]
+
+                    result = await self.chat(messages, **kwargs)
+                else:
+                    result = await self.complete(input_data, **kwargs)
+
+                # Convert to unified format
+                if result.get("success"):
+                    response_text = result.get("text", "") or result.get("content", "")
+
+                    return {
+                        "success": True,
+                        "result": {
+                            "content": response_text,
+                            "tool_calls": [],
+                            "response_metadata": result.get("metadata", {})
+                        },
+                        "error": None,
+                        "metadata": {
+                            "model_used": result.get("model_id", self.model_name),
+                            "provider": self.provider_name,
+                            "task": task,
+                            "service_type": "text",
+                            "processing_time": result.get("metadata", {}).get("processing_time", 0),
+                            "local_inference": True,
+                            "gpu_accelerated": True
+                        }
+                    }
+                else:
+                    return {
+                        "success": False,
+                        "result": None,
+                        "error": result.get("error", "Unknown error"),
+                        "metadata": {
+                            "model_used": self.model_name,
+                            "provider": self.provider_name,
+                            "task": task,
+                            "service_type": "text",
+                            "local_inference": True
+                        }
+                    }
+            else:
+                return {
+                    "success": False,
+                    "result": None,
+                    "error": f"Unsupported task: {task}. Supported tasks: {self.get_supported_tasks()}",
+                    "metadata": {
+                        "model_used": self.model_name,
+                        "provider": self.provider_name,
+                        "task": task,
+                        "service_type": "text"
+                    }
+                }
+
+        except Exception as e:
+            logger.error(f"❌ Local LLM invoke failed: {e}")
+            return {
+                "success": False,
+                "result": None,
+                "error": str(e),
+                "metadata": {
+                    "model_used": self.model_name,
+                    "provider": self.provider_name,
+                    "task": task,
+                    "service_type": "text",
+                    "local_inference": True
+                }
+            }
+
+
+# Convenience function for quick setup
+async def create_local_llm_service(
+    models_to_deploy: List[str] = None,
+    backend: str = "transformers",
+    workspace_dir: str = "./local_llm_services"
+) -> LocalLLMService:
+    """
+    Convenience function to create and initialize a local LLM service
+
+    Args:
+        models_to_deploy: List of model IDs to auto-deploy
+        backend: Preferred inference backend
+        workspace_dir: Working directory for services
+
+    Returns:
+        Initialized LocalLLMService instance
+    """
+    service = LocalLLMService(
+        auto_deploy_models=models_to_deploy or [],
+        preferred_backend=backend,
+        workspace_dir=workspace_dir
+    )
+
+    await service.initialize()
+    return service
+
+
+# Example usage and testing
+if __name__ == "__main__":
+    async def test_local_llm_service():
+        """Test the local LLM service"""
+
+        # Create service
+        service = await create_local_llm_service(
+            models_to_deploy=["microsoft/DialoGPT-medium"],
+            backend="transformers"
+        )
+
+        # Check health
+        health = await service.health_check()
+        print(f"Health: {health}")
+
+        # Generate text
+        if health.get("success"):
+            result = await service.complete(
+                prompt="Hello, how are you today?",
+                max_tokens=50
+            )
+            print(f"Generation result: {result}")
+
+            # Chat completion
+            chat_result = await service.chat([
+                {"role": "user", "content": "What is artificial intelligence?"}
+            ])
+            print(f"Chat result: {chat_result}")
+
+    # Run test
+    asyncio.run(test_local_llm_service())