isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199)
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +40 -17
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/storage/hf_storage.py +1 -1
  26. isa_model/core/types.py +1 -0
  27. isa_model/deployment/__init__.py +5 -48
  28. isa_model/deployment/core/__init__.py +2 -31
  29. isa_model/deployment/core/deployment_manager.py +1278 -370
  30. isa_model/deployment/local/__init__.py +31 -0
  31. isa_model/deployment/local/config.py +248 -0
  32. isa_model/deployment/local/gpu_gateway.py +607 -0
  33. isa_model/deployment/local/health_checker.py +428 -0
  34. isa_model/deployment/local/provider.py +586 -0
  35. isa_model/deployment/local/tensorrt_service.py +621 -0
  36. isa_model/deployment/local/transformers_service.py +644 -0
  37. isa_model/deployment/local/vllm_service.py +527 -0
  38. isa_model/deployment/modal/__init__.py +8 -0
  39. isa_model/deployment/modal/config.py +136 -0
  40. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  41. isa_model/deployment/modal/services/__init__.py +3 -0
  42. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  43. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  44. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  45. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  46. isa_model/deployment/modal/services/video/__init__.py +1 -0
  47. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  48. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  49. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  50. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  51. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  52. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  53. isa_model/deployment/storage/__init__.py +5 -0
  54. isa_model/deployment/storage/deployment_repository.py +824 -0
  55. isa_model/deployment/triton/__init__.py +10 -0
  56. isa_model/deployment/triton/config.py +196 -0
  57. isa_model/deployment/triton/configs/__init__.py +1 -0
  58. isa_model/deployment/triton/provider.py +512 -0
  59. isa_model/deployment/triton/scripts/__init__.py +1 -0
  60. isa_model/deployment/triton/templates/__init__.py +1 -0
  61. isa_model/inference/__init__.py +47 -1
  62. isa_model/inference/ai_factory.py +137 -10
  63. isa_model/inference/legacy_services/__init__.py +21 -0
  64. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  65. isa_model/inference/legacy_services/model_service.py +573 -0
  66. isa_model/inference/legacy_services/model_serving.py +717 -0
  67. isa_model/inference/legacy_services/model_training.py +561 -0
  68. isa_model/inference/models/__init__.py +21 -0
  69. isa_model/inference/models/inference_config.py +551 -0
  70. isa_model/inference/models/inference_record.py +675 -0
  71. isa_model/inference/models/performance_models.py +714 -0
  72. isa_model/inference/repositories/__init__.py +9 -0
  73. isa_model/inference/repositories/inference_repository.py +828 -0
  74. isa_model/inference/services/audio/base_stt_service.py +184 -11
  75. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  76. isa_model/inference/services/custom_model_manager.py +277 -0
  77. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  78. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  79. isa_model/inference/services/llm/__init__.py +10 -2
  80. isa_model/inference/services/llm/base_llm_service.py +335 -24
  81. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  82. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  83. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  84. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  85. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  86. isa_model/inference/services/llm/local_llm_service.py +747 -0
  87. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  88. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  89. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  90. isa_model/inference/services/vision/__init__.py +22 -1
  91. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  92. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  93. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  94. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  95. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  96. isa_model/serving/api/cache_manager.py +245 -0
  97. isa_model/serving/api/dependencies/__init__.py +1 -0
  98. isa_model/serving/api/dependencies/auth.py +194 -0
  99. isa_model/serving/api/dependencies/database.py +139 -0
  100. isa_model/serving/api/error_handlers.py +284 -0
  101. isa_model/serving/api/fastapi_server.py +172 -22
  102. isa_model/serving/api/middleware/auth.py +8 -2
  103. isa_model/serving/api/middleware/security.py +23 -33
  104. isa_model/serving/api/middleware/tenant_context.py +414 -0
  105. isa_model/serving/api/routes/analytics.py +4 -1
  106. isa_model/serving/api/routes/config.py +645 -0
  107. isa_model/serving/api/routes/deployment_billing.py +315 -0
  108. isa_model/serving/api/routes/deployments.py +138 -2
  109. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  110. isa_model/serving/api/routes/health.py +32 -12
  111. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  112. isa_model/serving/api/routes/local_deployments.py +448 -0
  113. isa_model/serving/api/routes/tenants.py +575 -0
  114. isa_model/serving/api/routes/unified.py +680 -18
  115. isa_model/serving/api/routes/webhooks.py +479 -0
  116. isa_model/serving/api/startup.py +68 -54
  117. isa_model/utils/gpu_utils.py +311 -0
  118. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
  119. isa_model-0.4.3.dist-info/RECORD +193 -0
  120. isa_model/core/storage/minio_storage.py +0 -0
  121. isa_model/deployment/cloud/__init__.py +0 -9
  122. isa_model/deployment/cloud/modal/__init__.py +0 -10
  123. isa_model/deployment/core/deployment_config.py +0 -356
  124. isa_model/deployment/core/isa_deployment_service.py +0 -401
  125. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  126. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  127. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  128. isa_model/deployment/runtime/deployed_service.py +0 -338
  129. isa_model/deployment/services/__init__.py +0 -9
  130. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  131. isa_model/deployment/services/model_service.py +0 -332
  132. isa_model/deployment/services/service_monitor.py +0 -356
  133. isa_model/deployment/services/service_registry.py +0 -527
  134. isa_model/eval/__init__.py +0 -92
  135. isa_model/eval/benchmarks/__init__.py +0 -27
  136. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  137. isa_model/eval/benchmarks.py +0 -701
  138. isa_model/eval/config/__init__.py +0 -10
  139. isa_model/eval/config/evaluation_config.py +0 -108
  140. isa_model/eval/evaluators/__init__.py +0 -24
  141. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  142. isa_model/eval/evaluators/base_evaluator.py +0 -503
  143. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  144. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  145. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  146. isa_model/eval/example_evaluation.py +0 -395
  147. isa_model/eval/factory.py +0 -798
  148. isa_model/eval/infrastructure/__init__.py +0 -24
  149. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  150. isa_model/eval/isa_benchmarks.py +0 -700
  151. isa_model/eval/isa_integration.py +0 -582
  152. isa_model/eval/metrics.py +0 -951
  153. isa_model/eval/tests/unit/test_basic.py +0 -396
  154. isa_model/serving/api/routes/evaluations.py +0 -579
  155. isa_model/training/__init__.py +0 -168
  156. isa_model/training/annotation/annotation_schema.py +0 -47
  157. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  158. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  159. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  160. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  161. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  162. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  163. isa_model/training/annotation/views/annotation_controller.py +0 -158
  164. isa_model/training/cloud/__init__.py +0 -22
  165. isa_model/training/cloud/job_orchestrator.py +0 -402
  166. isa_model/training/cloud/runpod_trainer.py +0 -454
  167. isa_model/training/cloud/storage_manager.py +0 -482
  168. isa_model/training/core/__init__.py +0 -26
  169. isa_model/training/core/config.py +0 -181
  170. isa_model/training/core/dataset.py +0 -222
  171. isa_model/training/core/trainer.py +0 -720
  172. isa_model/training/core/utils.py +0 -213
  173. isa_model/training/examples/intelligent_training_example.py +0 -281
  174. isa_model/training/factory.py +0 -424
  175. isa_model/training/intelligent/__init__.py +0 -25
  176. isa_model/training/intelligent/decision_engine.py +0 -643
  177. isa_model/training/intelligent/intelligent_factory.py +0 -888
  178. isa_model/training/intelligent/knowledge_base.py +0 -751
  179. isa_model/training/intelligent/resource_optimizer.py +0 -839
  180. isa_model/training/intelligent/task_classifier.py +0 -576
  181. isa_model/training/storage/__init__.py +0 -24
  182. isa_model/training/storage/core_integration.py +0 -439
  183. isa_model/training/storage/training_repository.py +0 -552
  184. isa_model/training/storage/training_storage.py +0 -628
  185. isa_model-0.4.0.dist-info/RECORD +0 -182
  186. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  187. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  188. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  189. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  190. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  191. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  192. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  193. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  194. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  195. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  196. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  197. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  198. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  199. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
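Entries 186-197 above move the Modal service modules from isa_model/deployment/cloud/modal/ into per-modality packages under isa_model/deployment/modal/services/. Downstream code that imports these modules by path must switch to the new locations; a minimal sketch (the old and new module paths are taken from the rename list above, while importing them directly is an assumption about downstream usage):

# isa-model 0.4.0 (old layout):
from isa_model.deployment.cloud.modal import isa_vision_ocr_service

# isa-model 0.4.3 (new layout, per renames 186-197 above):
from isa_model.deployment.modal.services.vision import isa_vision_ocr_service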
--- /dev/null
+++ b/isa_model/deployment/local/transformers_service.py
@@ -0,0 +1,644 @@
+ """
+ HuggingFace Transformers local inference service
+
+ Direct model loading and inference using HuggingFace Transformers.
+ """
+
+ import os
+ import json
+ import asyncio
+ import logging
+ import threading
+ from typing import Dict, List, Optional, Any, Union
+ from datetime import datetime
+ import time
+ import torch
+
+ from .config import LocalGPUConfig, LocalServiceType, LocalBackend
+ from ...utils.gpu_utils import get_gpu_manager
+
+ logger = logging.getLogger(__name__)
+
+
+ class TransformersService:
+     """HuggingFace Transformers local inference service"""
+
+     def __init__(self, config: LocalGPUConfig):
+         """
+         Initialize Transformers service.
+
+         Args:
+             config: Local GPU configuration for Transformers
+         """
+         if config.backend != LocalBackend.TRANSFORMERS:
+             raise ValueError("Config must use TRANSFORMERS backend")
+
+         self.config = config
+         self.gpu_manager = get_gpu_manager()
+         self.model = None
+         self.tokenizer = None
+         self.processor = None  # For multimodal models
+         self.model_loaded = False
+         self.startup_time: Optional[datetime] = None
+         self.device = None
+
+         # Thread safety for inference
+         self._inference_lock = threading.Lock()
+
+         # Service info
+         self.service_info = {
+             "service_name": config.service_name,
+             "model_id": config.model_id,
+             "backend": "transformers",
+             "status": "stopped",
+             "device": None
+         }
+
+     async def load_model(self) -> Dict[str, Any]:
+         """
+         Load HuggingFace model for inference.
+
+         Returns:
+             Model loading result
+         """
+         if self.model_loaded:
+             return {
+                 "success": True,
+                 "message": "Model already loaded",
+                 "service_info": self.service_info
+             }
+
+         try:
+             logger.info(f"Loading Transformers model: {self.config.model_id}")
+             self.startup_time = datetime.now()
+
+             # Check GPU requirements
+             gpu_check = await self._check_gpu_requirements()
+             if not gpu_check["compatible"]:
+                 return {
+                     "success": False,
+                     "error": f"GPU requirements not met: {', '.join(gpu_check['warnings'])}",
+                     "gpu_check": gpu_check
+                 }
+
+             # Set device
+             self.device = await self._setup_device()
+
+             # Load model components
+             load_result = await self._load_model_components()
+
+             if load_result["success"]:
+                 self.model_loaded = True
+                 self.service_info.update({
+                     "status": "running",
+                     "device": str(self.device),
+                     "loaded_at": self.startup_time.isoformat(),
+                     "load_time": load_result["load_time"],
+                     "model_info": load_result["model_info"]
+                 })
+
+                 logger.info(f"Transformers model loaded successfully on {self.device}")
+                 return {
+                     "success": True,
+                     "service_info": self.service_info,
+                     "load_time": load_result["load_time"],
+                     "gpu_info": gpu_check.get("selected_gpu")
+                 }
+             else:
+                 return load_result
+
+         except Exception as e:
+             logger.error(f"Failed to load Transformers model: {e}")
+             return {
+                 "success": False,
+                 "error": str(e)
+             }
+
+     async def unload_model(self) -> Dict[str, Any]:
+         """Unload model and free GPU memory"""
+         try:
+             if self.model:
+                 del self.model
+                 self.model = None
+
+             if self.tokenizer:
+                 del self.tokenizer
+                 self.tokenizer = None
+
+             if self.processor:
+                 del self.processor
+                 self.processor = None
+
+             self.model_loaded = False
+             self.service_info.update({
+                 "status": "stopped",
+                 "device": None,
+                 "unloaded_at": datetime.now().isoformat()
+             })
+
+             # Free GPU memory
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+
+             logger.info("Transformers model unloaded")
+             return {
+                 "success": True,
+                 "service_info": self.service_info
+             }
+
+         except Exception as e:
+             logger.error(f"Failed to unload model: {e}")
+             return {
+                 "success": False,
+                 "error": str(e)
+             }
+
+     async def generate_text(self, prompt: str, **kwargs) -> Dict[str, Any]:
+         """Generate text using the loaded model"""
+         if not self.model_loaded:
+             return {
+                 "success": False,
+                 "error": "Model not loaded"
+             }
+
+         try:
+             with self._inference_lock:
+                 start_time = time.time()
+
+                 # Prepare generation parameters
+                 max_tokens = kwargs.get("max_tokens", 512)
+                 temperature = kwargs.get("temperature", 0.7)
+                 top_p = kwargs.get("top_p", 0.9)
+                 top_k = kwargs.get("top_k", 50)
+                 do_sample = kwargs.get("do_sample", True)
+
+                 # Tokenize input
+                 inputs = self.tokenizer(
+                     prompt,
+                     return_tensors="pt",
+                     padding=True,
+                     truncation=True,
+                     max_length=self.config.max_model_len // 2
+                 ).to(self.device)
+
+                 # Generate
+                 with torch.no_grad():
+                     outputs = self.model.generate(
+                         **inputs,
+                         max_new_tokens=max_tokens,
+                         temperature=temperature,
+                         top_p=top_p,
+                         top_k=top_k,
+                         do_sample=do_sample,
+                         pad_token_id=self.tokenizer.eos_token_id,
+                         eos_token_id=self.tokenizer.eos_token_id,
+                         use_cache=True
+                     )
+
+                 # Decode output
+                 input_length = inputs['input_ids'].shape[-1]
+                 generated_tokens = outputs[0][input_length:]
+                 generated_text = self.tokenizer.decode(
+                     generated_tokens,
+                     skip_special_tokens=True
+                 ).strip()
+
+                 generation_time = time.time() - start_time
+
+                 return {
+                     "success": True,
+                     "text": generated_text,
+                     "model": self.config.model_id,
+                     "generation_time": generation_time,
+                     "input_tokens": input_length,
+                     "output_tokens": len(generated_tokens),
+                     "total_tokens": len(outputs[0])
+                 }
+
+         except Exception as e:
+             logger.error(f"Text generation failed: {e}")
+             return {
+                 "success": False,
+                 "error": str(e)
+             }
+
+     async def chat_completion(self, messages: List[Dict[str, str]], **kwargs) -> Dict[str, Any]:
+         """Generate chat completion response"""
+         # Convert messages to prompt
+         prompt = await self._format_chat_messages(messages)
+
+         # Generate response
+         result = await self.generate_text(prompt, **kwargs)
+
+         if result["success"]:
+             # Format as chat completion
+             return {
+                 "success": True,
+                 "choices": [{
+                     "message": {
+                         "role": "assistant",
+                         "content": result["text"]
+                     },
+                     "finish_reason": "stop"
+                 }],
+                 "usage": {
+                     "prompt_tokens": result["input_tokens"],
+                     "completion_tokens": result["output_tokens"],
+                     "total_tokens": result["total_tokens"]
+                 },
+                 "model": result["model"],
+                 "generation_time": result["generation_time"]
+             }
+         else:
+             return result
+
+     async def analyze_image(self, image_data: bytes, prompt: str = "Describe this image.", **kwargs) -> Dict[str, Any]:
+         """Analyze image using vision model"""
+         if self.config.service_type != LocalServiceType.VISION:
+             return {
+                 "success": False,
+                 "error": "Service not configured for vision tasks"
+             }
+
+         if not self.processor:
+             return {
+                 "success": False,
+                 "error": "Vision processor not loaded"
+             }
+
+         try:
+             with self._inference_lock:
+                 start_time = time.time()
+
+                 from PIL import Image
+                 import io
+
+                 # Load image
+                 image = Image.open(io.BytesIO(image_data)).convert('RGB')
+
+                 # Process inputs
+                 inputs = self.processor(
+                     text=prompt,
+                     images=image,
+                     return_tensors="pt"
+                 ).to(self.device)
+
+                 # Generate
+                 max_tokens = kwargs.get("max_tokens", 512)
+                 with torch.no_grad():
+                     outputs = self.model.generate(
+                         **inputs,
+                         max_new_tokens=max_tokens,
+                         do_sample=True,
+                         temperature=kwargs.get("temperature", 0.7)
+                     )
+
+                 # Decode
+                 response = self.processor.decode(outputs[0], skip_special_tokens=True)
+
+                 # Clean up response (remove prompt)
+                 if prompt in response:
+                     response = response.replace(prompt, "").strip()
+
+                 generation_time = time.time() - start_time
+
+                 return {
+                     "success": True,
+                     "text": response,
+                     "model": self.config.model_id,
+                     "generation_time": generation_time
+                 }
+
+         except Exception as e:
+             logger.error(f"Image analysis failed: {e}")
+             return {
+                 "success": False,
+                 "error": str(e)
+             }
+
+     async def embed_text(self, texts: Union[str, List[str]], **kwargs) -> Dict[str, Any]:
+         """Generate text embeddings"""
+         if self.config.service_type != LocalServiceType.EMBEDDING:
+             return {
+                 "success": False,
+                 "error": "Service not configured for embedding tasks"
+             }
+
+         try:
+             with self._inference_lock:
+                 start_time = time.time()
+
+                 # Ensure texts is a list
+                 if isinstance(texts, str):
+                     texts = [texts]
+
+                 # Tokenize
+                 inputs = self.tokenizer(
+                     texts,
+                     return_tensors="pt",
+                     padding=True,
+                     truncation=True,
+                     max_length=self.config.max_model_len
+                 ).to(self.device)
+
+                 # Generate embeddings
+                 with torch.no_grad():
+                     outputs = self.model(**inputs)
+
+                 # Use different pooling strategies based on model
+                 if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
+                     embeddings = outputs.pooler_output
+                 else:
+                     # Mean pooling
+                     embeddings = outputs.last_hidden_state.mean(dim=1)
+
+                 # Normalize embeddings
+                 embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
+
+                 generation_time = time.time() - start_time
+
+                 return {
+                     "success": True,
+                     "embeddings": embeddings.cpu().numpy().tolist(),
+                     "model": self.config.model_id,
+                     "generation_time": generation_time,
+                     "embedding_dimension": embeddings.shape[-1]
+                 }
+
+         except Exception as e:
+             logger.error(f"Text embedding failed: {e}")
+             return {
+                 "success": False,
+                 "error": str(e)
+             }
+
+     async def health_check(self) -> Dict[str, Any]:
+         """Check service health"""
+         return {
+             "healthy": self.model_loaded,
+             "status": "running" if self.model_loaded else "stopped",
+             "service_info": self.service_info,
+             "device": str(self.device) if self.device else None,
+             "model_loaded": self.model_loaded
+         }
+
+     async def _load_model_components(self) -> Dict[str, Any]:
+         """Load model, tokenizer, and processor"""
+         try:
+             start_time = time.time()
+
+             from transformers import (
+                 AutoTokenizer, AutoModel, AutoModelForCausalLM,
+                 AutoProcessor, AutoModelForVision2Seq,
+                 BitsAndBytesConfig
+             )
+
+             # Load tokenizer
+             logger.info("Loading tokenizer...")
+             self.tokenizer = AutoTokenizer.from_pretrained(
+                 self.config.model_id,
+                 revision=self.config.tokenizer_revision,
+                 trust_remote_code=self.config.trust_remote_code,
+                 use_fast=True
+             )
+
+             # Set pad token if missing
+             if self.tokenizer.pad_token is None:
+                 self.tokenizer.pad_token = self.tokenizer.eos_token
+
+             # Load processor for multimodal models
+             if self.config.service_type in [LocalServiceType.VISION, LocalServiceType.AUDIO]:
+                 try:
+                     logger.info("Loading processor...")
+                     self.processor = AutoProcessor.from_pretrained(
+                         self.config.model_id,
+                         revision=self.config.revision,
+                         trust_remote_code=self.config.trust_remote_code
+                     )
+                 except Exception as e:
+                     logger.warning(f"Failed to load processor: {e}")
+
+             # Configure quantization
+             quantization_config = None
+             if self.config.quantization:
+                 if self.config.quantization in ["4bit", "int4"]:
+                     quantization_config = BitsAndBytesConfig(
+                         load_in_4bit=True,
+                         bnb_4bit_compute_dtype=torch.float16,
+                         bnb_4bit_use_double_quant=True,
+                         bnb_4bit_quant_type="nf4"
+                     )
+                 elif self.config.quantization in ["8bit", "int8"]:
+                     quantization_config = BitsAndBytesConfig(
+                         load_in_8bit=True
+                     )
+
+             # Determine model class based on service type
+             if self.config.service_type == LocalServiceType.EMBEDDING:
+                 model_class = AutoModel
+             elif self.config.service_type == LocalServiceType.VISION:
+                 model_class = AutoModelForVision2Seq
+             else:
+                 model_class = AutoModelForCausalLM
+
+             # Configure model loading arguments
+             model_kwargs = {
+                 "revision": self.config.revision,
+                 "trust_remote_code": self.config.trust_remote_code,
+                 "torch_dtype": self._get_torch_dtype(),
+                 "device_map": "auto" if self.config.enable_gpu else "cpu",
+                 "low_cpu_mem_usage": True,
+                 **self.config.transformers_args
+             }
+
+             if quantization_config:
+                 model_kwargs["quantization_config"] = quantization_config
+
+             # Load model
+             logger.info(f"Loading model with {model_class.__name__}...")
+             self.model = model_class.from_pretrained(
+                 self.config.model_id,
+                 **model_kwargs
+             )
+
+             # Move to device if not using device_map
+             if not self.config.transformers_args.get("device_map"):
+                 self.model.to(self.device)
+
+             self.model.eval()
+
+             # Try to compile model for faster inference
+             if hasattr(torch, 'compile') and self.config.enable_gpu:
+                 try:
+                     self.model = torch.compile(self.model, mode="reduce-overhead")
+                     logger.info("Model compiled for faster inference")
+                 except Exception as e:
+                     logger.warning(f"Model compilation failed: {e}")
+
+             load_time = time.time() - start_time
+
+             # Get model info
+             model_info = {
+                 "model_id": self.config.model_id,
+                 "model_type": self.config.service_type.value,
+                 "torch_dtype": str(self.model.dtype) if hasattr(self.model, 'dtype') else None,
+                 "device": str(next(self.model.parameters()).device) if hasattr(self.model, 'parameters') else None,
+                 "quantization": self.config.quantization,
+                 "parameters": self._count_parameters()
+             }
+
+             logger.info(f"Model loaded successfully in {load_time:.2f}s")
+
+             return {
+                 "success": True,
+                 "load_time": load_time,
+                 "model_info": model_info
+             }
+
+         except Exception as e:
+             logger.error(f"Failed to load model components: {e}")
+             return {
+                 "success": False,
+                 "error": str(e)
+             }
+
+     def _count_parameters(self) -> Optional[int]:
+         """Count model parameters"""
+         try:
+             if hasattr(self.model, 'num_parameters'):
+                 return self.model.num_parameters()
+             else:
+                 return sum(p.numel() for p in self.model.parameters())
+         except:
+             return None
+
+     def _get_torch_dtype(self) -> torch.dtype:
+         """Get appropriate torch dtype"""
+         precision_map = {
+             "float32": torch.float32,
+             "float16": torch.float16,
+             "bfloat16": torch.bfloat16,
+             "int8": torch.int8
+         }
+         return precision_map.get(self.config.model_precision, torch.float16)
+
+     async def _setup_device(self) -> torch.device:
+         """Setup compute device"""
+         if not self.config.enable_gpu or not torch.cuda.is_available():
+             return torch.device("cpu")
+
+         if self.config.gpu_id is not None:
+             device = torch.device(f"cuda:{self.config.gpu_id}")
+         else:
+             device = torch.device("cuda")
+
+         # Set memory fraction
+         if torch.cuda.is_available():
+             torch.cuda.set_per_process_memory_fraction(
+                 self.config.gpu_memory_fraction,
+                 device.index if device.index is not None else 0
+             )
+
+         return device
+
+     async def _format_chat_messages(self, messages: List[Dict[str, str]]) -> str:
+         """Format chat messages into a prompt"""
+         formatted_parts = []
+
+         for message in messages:
+             role = message.get("role", "user")
+             content = message.get("content", "")
+
+             if role == "system":
+                 formatted_parts.append(f"System: {content}")
+             elif role == "user":
+                 formatted_parts.append(f"Human: {content}")
+             elif role == "assistant":
+                 formatted_parts.append(f"Assistant: {content}")
+
+         formatted_parts.append("Assistant:")
+         return "\n\n".join(formatted_parts)
+
+     async def _check_gpu_requirements(self) -> Dict[str, Any]:
+         """Check GPU requirements"""
+         if not self.config.enable_gpu:
+             return {
+                 "compatible": True,
+                 "warnings": ["Using CPU inference"],
+                 "selected_gpu": None
+             }
+
+         self.gpu_manager.refresh()
+
+         if not self.gpu_manager.cuda_available:
+             return {
+                 "compatible": True,  # Can fallback to CPU
+                 "warnings": ["CUDA not available, falling back to CPU"],
+                 "selected_gpu": None
+             }
+
+         # Estimate memory requirements
+         estimated_memory = self.gpu_manager.estimate_model_memory(
+             self.config.model_id,
+             self.config.model_precision
+         )
+
+         # Apply quantization reduction
+         if self.config.quantization == "int8":
+             estimated_memory = int(estimated_memory * 0.5)
+         elif self.config.quantization == "int4":
+             estimated_memory = int(estimated_memory * 0.25)
+
+         # Find suitable GPU
+         if self.config.gpu_id is not None:
+             selected_gpu = self.gpu_manager.get_gpu_info(self.config.gpu_id)
+             if not selected_gpu:
+                 return {
+                     "compatible": True,
+                     "warnings": [f"Specified GPU {self.config.gpu_id} not found, falling back to CPU"],
+                     "selected_gpu": None
+                 }
+         else:
+             selected_gpu = self.gpu_manager.get_best_gpu(estimated_memory)
+             if selected_gpu:
+                 self.config.gpu_id = selected_gpu.gpu_id
+
+         if not selected_gpu:
+             return {
+                 "compatible": True,
+                 "warnings": [
+                     f"No suitable GPU found (Required: {estimated_memory}MB), falling back to CPU"
+                 ],
+                 "selected_gpu": None
+             }
+
+         warnings = []
+
+         # Check memory requirements
+         required_memory = int(estimated_memory * self.config.gpu_memory_fraction)
+         if selected_gpu.memory_free < required_memory:
+             warnings.append(f"GPU memory may be tight: {selected_gpu.memory_free}MB available, {required_memory}MB required")
+
+         return {
+             "compatible": True,
+             "warnings": warnings,
+             "selected_gpu": {
+                 "gpu_id": selected_gpu.gpu_id,
+                 "name": selected_gpu.name,
+                 "memory_total": selected_gpu.memory_total,
+                 "memory_free": selected_gpu.memory_free,
+                 "utilization": selected_gpu.utilization,
+                 "estimated_memory_required": estimated_memory
+             }
+         }
+
+     def get_service_info(self) -> Dict[str, Any]:
+         """Get current service information"""
+         return {
+             **self.service_info,
+             "config": self.config.to_dict(),
+             "device": str(self.device) if self.device else None,
+             "model_loaded": self.model_loaded,
+             "startup_time": self.startup_time.isoformat() if self.startup_time else None,
+             "parameters": self._count_parameters() if self.model_loaded else None
+         }
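
For context, a minimal usage sketch of the TransformersService added above. The module paths come from the file list (entries 35 and 36); the LocalGPUConfig constructor fields, the LocalServiceType.LLM member, and the max_model_len default are assumptions, since this diff only shows how the config attributes are read, not how the config is built.

import asyncio

from isa_model.deployment.local.config import (
    LocalBackend,
    LocalGPUConfig,
    LocalServiceType,
)
from isa_model.deployment.local.transformers_service import TransformersService


async def main() -> None:
    # Assumed constructor fields: the diff reads config.service_name,
    # config.model_id, config.backend, and config.max_model_len as attributes.
    config = LocalGPUConfig(
        service_name="demo-llm",
        model_id="gpt2",
        service_type=LocalServiceType.LLM,  # assumed enum member
        backend=LocalBackend.TRANSFORMERS,  # required by TransformersService.__init__
        max_model_len=1024,                 # used by generate_text for truncation
    )
    service = TransformersService(config)

    load_result = await service.load_model()
    if not load_result["success"]:
        raise RuntimeError(load_result["error"])

    # chat_completion formats messages into a plain prompt and wraps the
    # generate_text result in an OpenAI-style response dict.
    completion = await service.chat_completion(
        [{"role": "user", "content": "Say hello in one sentence."}],
        max_tokens=64,
    )
    print(completion["choices"][0]["message"]["content"])

    await service.unload_model()


asyncio.run(main())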