isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +40 -17
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/storage/hf_storage.py +1 -1
  26. isa_model/core/types.py +1 -0
  27. isa_model/deployment/__init__.py +5 -48
  28. isa_model/deployment/core/__init__.py +2 -31
  29. isa_model/deployment/core/deployment_manager.py +1278 -370
  30. isa_model/deployment/local/__init__.py +31 -0
  31. isa_model/deployment/local/config.py +248 -0
  32. isa_model/deployment/local/gpu_gateway.py +607 -0
  33. isa_model/deployment/local/health_checker.py +428 -0
  34. isa_model/deployment/local/provider.py +586 -0
  35. isa_model/deployment/local/tensorrt_service.py +621 -0
  36. isa_model/deployment/local/transformers_service.py +644 -0
  37. isa_model/deployment/local/vllm_service.py +527 -0
  38. isa_model/deployment/modal/__init__.py +8 -0
  39. isa_model/deployment/modal/config.py +136 -0
  40. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  41. isa_model/deployment/modal/services/__init__.py +3 -0
  42. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  43. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  44. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  45. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  46. isa_model/deployment/modal/services/video/__init__.py +1 -0
  47. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  48. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  49. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  50. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  51. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  52. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  53. isa_model/deployment/storage/__init__.py +5 -0
  54. isa_model/deployment/storage/deployment_repository.py +824 -0
  55. isa_model/deployment/triton/__init__.py +10 -0
  56. isa_model/deployment/triton/config.py +196 -0
  57. isa_model/deployment/triton/configs/__init__.py +1 -0
  58. isa_model/deployment/triton/provider.py +512 -0
  59. isa_model/deployment/triton/scripts/__init__.py +1 -0
  60. isa_model/deployment/triton/templates/__init__.py +1 -0
  61. isa_model/inference/__init__.py +47 -1
  62. isa_model/inference/ai_factory.py +137 -10
  63. isa_model/inference/legacy_services/__init__.py +21 -0
  64. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  65. isa_model/inference/legacy_services/model_service.py +573 -0
  66. isa_model/inference/legacy_services/model_serving.py +717 -0
  67. isa_model/inference/legacy_services/model_training.py +561 -0
  68. isa_model/inference/models/__init__.py +21 -0
  69. isa_model/inference/models/inference_config.py +551 -0
  70. isa_model/inference/models/inference_record.py +675 -0
  71. isa_model/inference/models/performance_models.py +714 -0
  72. isa_model/inference/repositories/__init__.py +9 -0
  73. isa_model/inference/repositories/inference_repository.py +828 -0
  74. isa_model/inference/services/audio/base_stt_service.py +184 -11
  75. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  76. isa_model/inference/services/custom_model_manager.py +277 -0
  77. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  78. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  79. isa_model/inference/services/llm/__init__.py +10 -2
  80. isa_model/inference/services/llm/base_llm_service.py +335 -24
  81. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  82. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  83. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  84. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  85. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  86. isa_model/inference/services/llm/local_llm_service.py +747 -0
  87. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  88. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  89. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  90. isa_model/inference/services/vision/__init__.py +22 -1
  91. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  92. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  93. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  94. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  95. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  96. isa_model/serving/api/cache_manager.py +245 -0
  97. isa_model/serving/api/dependencies/__init__.py +1 -0
  98. isa_model/serving/api/dependencies/auth.py +194 -0
  99. isa_model/serving/api/dependencies/database.py +139 -0
  100. isa_model/serving/api/error_handlers.py +284 -0
  101. isa_model/serving/api/fastapi_server.py +172 -22
  102. isa_model/serving/api/middleware/auth.py +8 -2
  103. isa_model/serving/api/middleware/security.py +23 -33
  104. isa_model/serving/api/middleware/tenant_context.py +414 -0
  105. isa_model/serving/api/routes/analytics.py +4 -1
  106. isa_model/serving/api/routes/config.py +645 -0
  107. isa_model/serving/api/routes/deployment_billing.py +315 -0
  108. isa_model/serving/api/routes/deployments.py +138 -2
  109. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  110. isa_model/serving/api/routes/health.py +32 -12
  111. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  112. isa_model/serving/api/routes/local_deployments.py +448 -0
  113. isa_model/serving/api/routes/tenants.py +575 -0
  114. isa_model/serving/api/routes/unified.py +680 -18
  115. isa_model/serving/api/routes/webhooks.py +479 -0
  116. isa_model/serving/api/startup.py +68 -54
  117. isa_model/utils/gpu_utils.py +311 -0
  118. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
  119. isa_model-0.4.3.dist-info/RECORD +193 -0
  120. isa_model/core/storage/minio_storage.py +0 -0
  121. isa_model/deployment/cloud/__init__.py +0 -9
  122. isa_model/deployment/cloud/modal/__init__.py +0 -10
  123. isa_model/deployment/core/deployment_config.py +0 -356
  124. isa_model/deployment/core/isa_deployment_service.py +0 -401
  125. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  126. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  127. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  128. isa_model/deployment/runtime/deployed_service.py +0 -338
  129. isa_model/deployment/services/__init__.py +0 -9
  130. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  131. isa_model/deployment/services/model_service.py +0 -332
  132. isa_model/deployment/services/service_monitor.py +0 -356
  133. isa_model/deployment/services/service_registry.py +0 -527
  134. isa_model/eval/__init__.py +0 -92
  135. isa_model/eval/benchmarks/__init__.py +0 -27
  136. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  137. isa_model/eval/benchmarks.py +0 -701
  138. isa_model/eval/config/__init__.py +0 -10
  139. isa_model/eval/config/evaluation_config.py +0 -108
  140. isa_model/eval/evaluators/__init__.py +0 -24
  141. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  142. isa_model/eval/evaluators/base_evaluator.py +0 -503
  143. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  144. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  145. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  146. isa_model/eval/example_evaluation.py +0 -395
  147. isa_model/eval/factory.py +0 -798
  148. isa_model/eval/infrastructure/__init__.py +0 -24
  149. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  150. isa_model/eval/isa_benchmarks.py +0 -700
  151. isa_model/eval/isa_integration.py +0 -582
  152. isa_model/eval/metrics.py +0 -951
  153. isa_model/eval/tests/unit/test_basic.py +0 -396
  154. isa_model/serving/api/routes/evaluations.py +0 -579
  155. isa_model/training/__init__.py +0 -168
  156. isa_model/training/annotation/annotation_schema.py +0 -47
  157. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  158. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  159. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  160. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  161. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  162. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  163. isa_model/training/annotation/views/annotation_controller.py +0 -158
  164. isa_model/training/cloud/__init__.py +0 -22
  165. isa_model/training/cloud/job_orchestrator.py +0 -402
  166. isa_model/training/cloud/runpod_trainer.py +0 -454
  167. isa_model/training/cloud/storage_manager.py +0 -482
  168. isa_model/training/core/__init__.py +0 -26
  169. isa_model/training/core/config.py +0 -181
  170. isa_model/training/core/dataset.py +0 -222
  171. isa_model/training/core/trainer.py +0 -720
  172. isa_model/training/core/utils.py +0 -213
  173. isa_model/training/examples/intelligent_training_example.py +0 -281
  174. isa_model/training/factory.py +0 -424
  175. isa_model/training/intelligent/__init__.py +0 -25
  176. isa_model/training/intelligent/decision_engine.py +0 -643
  177. isa_model/training/intelligent/intelligent_factory.py +0 -888
  178. isa_model/training/intelligent/knowledge_base.py +0 -751
  179. isa_model/training/intelligent/resource_optimizer.py +0 -839
  180. isa_model/training/intelligent/task_classifier.py +0 -576
  181. isa_model/training/storage/__init__.py +0 -24
  182. isa_model/training/storage/core_integration.py +0 -439
  183. isa_model/training/storage/training_repository.py +0 -552
  184. isa_model/training/storage/training_storage.py +0 -628
  185. isa_model-0.4.0.dist-info/RECORD +0 -182
  186. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  187. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  188. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  189. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  190. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  191. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  192. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  193. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  194. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  195. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  196. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  197. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  198. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  199. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/inference/services/llm/local_llm_service.py
@@ -0,0 +1,747 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Local LLM Service - Direct local GPU inference service
+Provides high-performance local model inference using vLLM, TensorRT-LLM, and Transformers
+"""
+
+import logging
+import asyncio
+from typing import Dict, Any, Optional, List, Union
+from pathlib import Path
+
+from isa_model.inference.services.base_service import BaseService
+from isa_model.core.models.model_manager import ModelManager
+from isa_model.core.config import ConfigManager
+from isa_model.core.dependencies import DependencyChecker, is_torch_available, is_transformers_available
+
+# Conditional imports for local deployment
+try:
+    from isa_model.deployment.local import (
+        LocalGPUProvider, LocalGPUConfig, LocalServiceType, LocalBackend,
+        create_vllm_config, create_tensorrt_config, create_transformers_config
+    )
+    LOCAL_DEPLOYMENT_AVAILABLE = True
+except ImportError:
+    LOCAL_DEPLOYMENT_AVAILABLE = False
+    LocalGPUProvider = None
+    LocalGPUConfig = None
+    LocalServiceType = None
+    LocalBackend = None
+
+# Conditional import for GPU utilities
+try:
+    from isa_model.utils.gpu_utils import get_gpu_manager
+    GPU_UTILS_AVAILABLE = True
+except ImportError:
+    GPU_UTILS_AVAILABLE = False
+    get_gpu_manager = None
+
+logger = logging.getLogger(__name__)
+
+
+class LocalLLMService(BaseService):
+    """
+    Local LLM Service - Direct local GPU inference
+
+    Features:
+    - Multiple inference backends (vLLM, TensorRT-LLM, Transformers)
+    - Automatic GPU resource management
+    - Model deployment and lifecycle management
+    - High-performance local inference
+    - No cloud dependency
+
+    Example:
+        ```python
+        service = LocalLLMService()
+
+        # Deploy a model
+        await service.deploy_model("meta-llama/Llama-2-7b-chat-hf", backend="vllm")
+
+        # Generate text
+        result = await service.complete("Hello, how are you?")
+        print(result['text'])
+        ```
+    """
+
+    def __init__(
+        self,
+        provider_name: str = "local",
+        model_name: str = None,
+        model_manager: ModelManager = None,
+        config_manager: ConfigManager = None,
+        workspace_dir: str = "./local_llm_services",
+        auto_deploy_models: List[str] = None,
+        preferred_backend: str = "transformers",
+        **kwargs
+    ):
+        # Check dependencies based on preferred backend
+        if preferred_backend == "transformers":
+            if not is_torch_available() or not is_transformers_available():
+                install_cmd = DependencyChecker.get_install_command(group="local_llm")
+                raise ImportError(
+                    f"Local LLM inference requires PyTorch and Transformers.\n"
+                    f"Install with: {install_cmd}"
+                )
+        elif preferred_backend == "vllm":
+            available, missing = DependencyChecker.check_group("vllm")
+            if not available:
+                install_cmd = DependencyChecker.get_install_command(group="vllm")
+                raise ImportError(
+                    f"vLLM backend requires additional dependencies: {', '.join(missing)}.\n"
+                    f"Install with: {install_cmd}"
+                )
+
+        # Check if local deployment is available
+        if not LOCAL_DEPLOYMENT_AVAILABLE:
+            logger.warning(
+                "Local deployment modules are not available. "
+                "Some features may be limited. "
+                "Install with: pip install 'isa-model[local]'"
+            )
+
+        # Initialize base service
+        self.provider_name = provider_name
+        self.model_name = model_name or "local-llm"
+        self.workspace_dir = Path(workspace_dir)
+        self.preferred_backend = preferred_backend
+        self.auto_deploy_models = auto_deploy_models or []
+
+        # Initialize local GPU provider if available
+        try:
+            if LOCAL_DEPLOYMENT_AVAILABLE and GPU_UTILS_AVAILABLE:
+                self.local_provider = LocalGPUProvider(str(self.workspace_dir))
+                self.gpu_manager = get_gpu_manager()
+                self.gpu_available = self.gpu_manager.cuda_available
+                logger.info("✅ Local GPU provider initialized")
+            else:
+                logger.warning("⚠️ Local GPU provider not available - CPU inference only")
+                self.local_provider = None
+                self.gpu_manager = None
+                self.gpu_available = False
+        except Exception as e:
+            logger.error(f"❌ Failed to initialize local GPU provider: {e}")
+            self.local_provider = None
+            self.gpu_manager = None
+            self.gpu_available = False
+
+        # Service state
+        self.deployed_models: Dict[str, str] = {}  # model_id -> service_name
+        self.default_service: Optional[str] = None
+        self.request_count = 0
+
+        # Configuration
+        self.config_manager = config_manager or ConfigManager()
+        self.local_config = self.config_manager.get_local_gpu_config()
+
+        logger.info(f"Local LLM Service initialized (GPU Available: {self.gpu_available})")
+
+    async def initialize(self):
+        """Initialize the service and auto-deploy models if configured"""
+        if not self.gpu_available:
+            logger.warning("⚠️ No GPU available, local inference will be limited")
+            return
+
+        # Auto-deploy models if specified
+        for model_id in self.auto_deploy_models:
+            try:
+                logger.info(f"🚀 Auto-deploying model: {model_id}")
+                result = await self.deploy_model(model_id, backend=self.preferred_backend)
+                if result.get("success"):
+                    logger.info(f"✅ Auto-deployed: {model_id}")
+                else:
+                    logger.warning(f"❌ Auto-deploy failed for {model_id}: {result.get('error')}")
+            except Exception as e:
+                logger.error(f"❌ Auto-deploy error for {model_id}: {e}")
+
+    async def deploy_model(
+        self,
+        model_id: str,
+        backend: str = None,
+        service_name: str = None,
+        **config_kwargs
+    ) -> Dict[str, Any]:
+        """
+        Deploy a model to local GPU
+
+        Args:
+            model_id: HuggingFace model ID
+            backend: Inference backend (vllm, tensorrt_llm, transformers)
+            service_name: Custom service name
+            **config_kwargs: Additional configuration parameters
+
+        Returns:
+            Deployment result
+        """
+        if not self.local_provider:
+            return {
+                "success": False,
+                "error": "Local GPU provider not available"
+            }
+
+        try:
+            # Generate service name
+            if not service_name:
+                service_name = f"local-{model_id.replace('/', '-').replace('_', '-')}"
+
+            # Select backend
+            backend = backend or self.preferred_backend
+            backend_enum = LocalBackend(backend)
+
+            # Create configuration based on backend
+            if backend_enum == LocalBackend.VLLM:
+                config = create_vllm_config(
+                    service_name=service_name,
+                    model_id=model_id,
+                    **config_kwargs
+                )
+            elif backend_enum == LocalBackend.TENSORRT_LLM:
+                config = create_tensorrt_config(
+                    service_name=service_name,
+                    model_id=model_id,
+                    **config_kwargs
+                )
+            else:  # Transformers
+                config = create_transformers_config(
+                    service_name=service_name,
+                    model_id=model_id,
+                    **config_kwargs
+                )
+
+            logger.info(f"🚀 Deploying {model_id} with {backend} backend...")
+
+            # Deploy the model
+            result = await self.local_provider.deploy(config)
+
+            if result.get("success"):
+                # Track deployed model
+                self.deployed_models[model_id] = service_name
+
+                # Set as default if first model
+                if not self.default_service:
+                    self.default_service = service_name
+
+                logger.info(f"✅ Model deployed successfully: {model_id} -> {service_name}")
+
+                return {
+                    "success": True,
+                    "model_id": model_id,
+                    "service_name": service_name,
+                    "backend": backend,
+                    "deployment_info": result
+                }
+            else:
+                logger.error(f"❌ Deployment failed for {model_id}: {result.get('error')}")
+                return result
+
+        except Exception as e:
+            logger.error(f"❌ Deploy model error: {e}")
+            return {
+                "success": False,
+                "error": str(e)
+            }
+
+    async def undeploy_model(self, model_id: str) -> Dict[str, Any]:
+        """
+        Undeploy a model from local GPU
+
+        Args:
+            model_id: Model ID to undeploy
+
+        Returns:
+            Undeploy result
+        """
+        if model_id not in self.deployed_models:
+            return {
+                "success": False,
+                "error": f"Model {model_id} not deployed"
+            }
+
+        try:
+            service_name = self.deployed_models[model_id]
+
+            # Undeploy from local provider
+            result = await self.local_provider.undeploy(service_name)
+
+            if result.get("success"):
+                # Remove from tracking
+                del self.deployed_models[model_id]
+
+                # Update default service if needed
+                if self.default_service == service_name:
+                    self.default_service = next(iter(self.deployed_models.values()), None)
+
+                logger.info(f"✅ Model undeployed: {model_id}")
+
+            return result
+
+        except Exception as e:
+            logger.error(f"❌ Undeploy error: {e}")
+            return {
+                "success": False,
+                "error": str(e)
+            }
+
+    async def complete(
+        self,
+        prompt: str,
+        model_id: str = None,
+        max_tokens: int = 512,
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+        top_k: int = 50,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Generate text completion using local model
+
+        Args:
+            prompt: Input text prompt
+            model_id: Specific model to use (optional)
+            max_tokens: Maximum tokens to generate
+            temperature: Sampling temperature
+            top_p: Top-p sampling
+            top_k: Top-k sampling
+            **kwargs: Additional generation parameters
+
+        Returns:
+            Generated text result
+        """
+        if not self.local_provider:
+            return {
+                "success": False,
+                "error": "Local GPU provider not available",
+                "provider": "local",
+                "service": "local-llm"
+            }
+
+        try:
+            # Select service to use
+            service_name = None
+            if model_id and model_id in self.deployed_models:
+                service_name = self.deployed_models[model_id]
+            elif self.default_service:
+                service_name = self.default_service
+            else:
+                return {
+                    "success": False,
+                    "error": "No models deployed locally",
+                    "provider": "local",
+                    "service": "local-llm"
+                }
+
+            logger.info(f"🔄 Generating text with service: {service_name}")
+
+            # Generate text
+            result = await self.local_provider.generate_text(
+                service_name=service_name,
+                prompt=prompt,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                **kwargs
+            )
+
+            if result.get("success"):
+                self.request_count += 1
+
+                # Format response
+                return {
+                    "success": True,
+                    "text": result.get("text", ""),
+                    "generated_text": result.get("text", ""),
+                    "full_text": prompt + " " + result.get("text", ""),
+                    "prompt": prompt,
+                    "model_id": model_id or "local-default",
+                    "provider": "local",
+                    "service": "local-llm",
+                    "backend": result.get("backend", "unknown"),
+                    "generation_config": {
+                        "max_tokens": max_tokens,
+                        "temperature": temperature,
+                        "top_p": top_p,
+                        "top_k": top_k,
+                        **kwargs
+                    },
+                    "metadata": {
+                        "processing_time": result.get("generation_time", 0),
+                        "service_name": service_name,
+                        "input_tokens": result.get("input_tokens", 0),
+                        "output_tokens": result.get("output_tokens", 0),
+                        "total_tokens": result.get("total_tokens", 0),
+                        "gpu_accelerated": True,
+                        "local_inference": True
+                    }
+                }
+            else:
+                return {
+                    "success": False,
+                    "error": result.get("error", "Local inference failed"),
+                    "provider": "local",
+                    "service": "local-llm",
+                    "details": result
+                }
+
+        except Exception as e:
+            logger.error(f"❌ Local completion failed: {e}")
+            return {
+                "success": False,
+                "error": str(e),
+                "provider": "local",
+                "service": "local-llm"
+            }
+
+    async def chat(
+        self,
+        messages: List[Dict[str, str]],
+        model_id: str = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Chat completion using local model
+
+        Args:
+            messages: List of chat messages
+            model_id: Specific model to use (optional)
+            **kwargs: Additional generation parameters
+
+        Returns:
+            Chat completion result
+        """
+        if not self.local_provider:
+            return {
+                "success": False,
+                "error": "Local GPU provider not available",
+                "provider": "local",
+                "service": "local-llm"
+            }
+
+        try:
+            # Select service to use
+            service_name = None
+            if model_id and model_id in self.deployed_models:
+                service_name = self.deployed_models[model_id]
+            elif self.default_service:
+                service_name = self.default_service
+            else:
+                return {
+                    "success": False,
+                    "error": "No models deployed locally",
+                    "provider": "local",
+                    "service": "local-llm"
+                }
+
+            logger.info(f"💬 Chat completion with service: {service_name}")
+
+            # Generate chat completion
+            result = await self.local_provider.chat_completion(
+                service_name=service_name,
+                messages=messages,
+                **kwargs
+            )
+
+            if result.get("success"):
+                self.request_count += 1
+
+                # Format response
+                response_content = ""
+                if "choices" in result and result["choices"]:
+                    response_content = result["choices"][0].get("message", {}).get("content", "")
+                elif "text" in result:
+                    response_content = result["text"]
+
+                return {
+                    "success": True,
+                    "text": response_content,
+                    "content": response_content,
+                    "role": "assistant",
+                    "messages": messages,
+                    "response": {
+                        "role": "assistant",
+                        "content": response_content
+                    },
+                    "model_id": model_id or "local-default",
+                    "provider": "local",
+                    "service": "local-llm",
+                    "metadata": {
+                        "processing_time": result.get("generation_time", 0),
+                        "service_name": service_name,
+                        "usage": result.get("usage", {}),
+                        "gpu_accelerated": True,
+                        "local_inference": True
+                    }
+                }
+            else:
+                return {
+                    "success": False,
+                    "error": result.get("error", "Local chat completion failed"),
+                    "provider": "local",
+                    "service": "local-llm",
+                    "details": result
+                }
+
+        except Exception as e:
+            logger.error(f"❌ Local chat completion failed: {e}")
+            return {
+                "success": False,
+                "error": str(e),
+                "provider": "local",
+                "service": "local-llm"
+            }
+
+    async def get_model_info(self, model_id: str = None) -> Dict[str, Any]:
+        """Get information about deployed models"""
+        try:
+            if not self.local_provider:
+                return {
+                    "success": False,
+                    "error": "Local GPU provider not available"
+                }
+
+            if model_id and model_id in self.deployed_models:
+                # Get info for specific model
+                service_name = self.deployed_models[model_id]
+                service_info = await self.local_provider.get_service_info(service_name)
+
+                return {
+                    "success": True,
+                    "model_id": model_id,
+                    "service_name": service_name,
+                    "provider": "local",
+                    "service": "local-llm",
+                    "service_info": service_info
+                }
+            else:
+                # Get info for all deployed models
+                all_services = await self.local_provider.list_services()
+
+                return {
+                    "success": True,
+                    "provider": "local",
+                    "service": "local-llm",
+                    "deployed_models": self.deployed_models,
+                    "default_service": self.default_service,
+                    "services": all_services,
+                    "gpu_status": self.gpu_manager.get_system_info() if self.gpu_manager else None
+                }
+
+        except Exception as e:
+            logger.error(f"❌ Get model info failed: {e}")
+            return {
+                "success": False,
+                "error": str(e)
+            }
+
+    async def health_check(self) -> Dict[str, Any]:
+        """Check local LLM service health"""
+        try:
+            if not self.local_provider:
+                return {
+                    "success": False,
+                    "status": "error",
+                    "provider": "local",
+                    "service": "local-llm",
+                    "error": "Local GPU provider not available"
+                }
+
+            # Get system status
+            system_status = await self.local_provider.get_system_status()
+
+            # Check deployed services
+            services = await self.local_provider.list_services()
+            healthy_services = [s for s in services if s.get("healthy", False)]
+
+            return {
+                "success": True,
+                "status": "healthy" if len(healthy_services) > 0 else "no_services",
+                "provider": "local",
+                "service": "local-llm",
+                "deployed_models": len(self.deployed_models),
+                "healthy_services": len(healthy_services),
+                "total_services": len(services),
+                "gpu_available": self.gpu_available,
+                "system_status": system_status,
+                "usage_stats": {
+                    "total_requests": self.request_count,
+                    "deployed_models": list(self.deployed_models.keys()),
+                    "default_service": self.default_service
+                }
+            }
+
+        except Exception as e:
+            logger.error(f"❌ Health check failed: {e}")
+            return {
+                "success": False,
+                "status": "error",
+                "provider": "local",
+                "service": "local-llm",
+                "error": str(e)
+            }
+
+    def get_supported_tasks(self) -> List[str]:
+        """Get supported task list"""
+        return [
+            "generate",   # Text generation
+            "chat",       # Chat completion
+            "complete",   # Text completion
+            "deploy",     # Model deployment
+            "undeploy"    # Model undeployment
+        ]
+
+    def get_supported_models(self) -> List[str]:
+        """Get supported model types"""
+        return [
+            "llama",      # Llama models
+            "mistral",    # Mistral models
+            "qwen",       # Qwen models
+            "gpt2",       # GPT-2 models
+            "dialogpt",   # DialoGPT models
+            "custom"      # Custom trained models
+        ]
+
+    def get_supported_backends(self) -> List[str]:
+        """Get supported inference backends"""
+        return ["vllm", "tensorrt_llm", "transformers"]
+
+    async def invoke(self, input_data: str, task: str = "chat", **kwargs) -> Dict[str, Any]:
+        """
+        Unified invoke method for compatibility with ISA Model client interface
+        """
+        try:
+            if task in ["chat", "generate", "complete"]:
+                if task == "chat":
+                    if isinstance(input_data, str):
+                        messages = [{"role": "user", "content": input_data}]
+                    elif isinstance(input_data, list):
+                        messages = input_data
+                    else:
+                        messages = [{"role": "user", "content": str(input_data)}]
+
+                    result = await self.chat(messages, **kwargs)
+                else:
+                    result = await self.complete(input_data, **kwargs)
+
+                # Convert to unified format
+                if result.get("success"):
+                    response_text = result.get("text", "") or result.get("content", "")
+
+                    return {
+                        "success": True,
+                        "result": {
+                            "content": response_text,
+                            "tool_calls": [],
+                            "response_metadata": result.get("metadata", {})
+                        },
+                        "error": None,
+                        "metadata": {
+                            "model_used": result.get("model_id", self.model_name),
+                            "provider": self.provider_name,
+                            "task": task,
+                            "service_type": "text",
+                            "processing_time": result.get("metadata", {}).get("processing_time", 0),
+                            "local_inference": True,
+                            "gpu_accelerated": True
+                        }
+                    }
+                else:
+                    return {
+                        "success": False,
+                        "result": None,
+                        "error": result.get("error", "Unknown error"),
+                        "metadata": {
+                            "model_used": self.model_name,
+                            "provider": self.provider_name,
+                            "task": task,
+                            "service_type": "text",
+                            "local_inference": True
+                        }
+                    }
+            else:
+                return {
+                    "success": False,
+                    "result": None,
+                    "error": f"Unsupported task: {task}. Supported tasks: {self.get_supported_tasks()}",
+                    "metadata": {
+                        "model_used": self.model_name,
+                        "provider": self.provider_name,
+                        "task": task,
+                        "service_type": "text"
+                    }
+                }
+
+        except Exception as e:
+            logger.error(f"❌ Local LLM invoke failed: {e}")
+            return {
+                "success": False,
+                "result": None,
+                "error": str(e),
+                "metadata": {
+                    "model_used": self.model_name,
+                    "provider": self.provider_name,
+                    "task": task,
+                    "service_type": "text",
+                    "local_inference": True
+                }
+            }
+
+
+# Convenience function for quick setup
+async def create_local_llm_service(
+    models_to_deploy: List[str] = None,
+    backend: str = "transformers",
+    workspace_dir: str = "./local_llm_services"
+) -> LocalLLMService:
+    """
+    Convenience function to create and initialize a local LLM service
+
+    Args:
+        models_to_deploy: List of model IDs to auto-deploy
+        backend: Preferred inference backend
+        workspace_dir: Working directory for services
+
+    Returns:
+        Initialized LocalLLMService instance
+    """
+    service = LocalLLMService(
+        auto_deploy_models=models_to_deploy or [],
+        preferred_backend=backend,
+        workspace_dir=workspace_dir
+    )
+
+    await service.initialize()
+    return service
+
+
+# Example usage and testing
+if __name__ == "__main__":
+    async def test_local_llm_service():
+        """Test the local LLM service"""
+
+        # Create service
+        service = await create_local_llm_service(
+            models_to_deploy=["microsoft/DialoGPT-medium"],
+            backend="transformers"
+        )
+
+        # Check health
+        health = await service.health_check()
+        print(f"Health: {health}")
+
+        # Generate text
+        if health.get("success"):
+            result = await service.complete(
+                prompt="Hello, how are you today?",
+                max_tokens=50
+            )
+            print(f"Generation result: {result}")

+            # Chat completion
+            chat_result = await service.chat([
+                {"role": "user", "content": "What is artificial intelligence?"}
+            ])
+            print(f"Chat result: {chat_result}")
+
+    # Run test
+    asyncio.run(test_local_llm_service())
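
For reference, a minimal usage sketch of the new unified `invoke` entry point added in this file (this snippet is not part of the packaged module; the model ID is the one used in the file's own test, and forwarding `max_tokens` through `**kwargs` is an assumption about what the local backend accepts):

```python
import asyncio

# Import path mirrors the file added in this diff
from isa_model.inference.services.llm.local_llm_service import create_local_llm_service


async def main():
    # Deploy a small illustrative model on the default Transformers backend
    service = await create_local_llm_service(
        models_to_deploy=["microsoft/DialoGPT-medium"],
        backend="transformers",
    )

    # invoke() wraps chat()/complete() and returns the unified client shape:
    # {"success": ..., "result": {"content": ...}, "error": ..., "metadata": ...}
    response = await service.invoke(
        "What is artificial intelligence?",
        task="chat",
        max_tokens=64,  # assumed to be accepted by the local backend via **kwargs
    )
    if response["success"]:
        print(response["result"]["content"])
    else:
        print("Error:", response["error"])


asyncio.run(main())
```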