isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199) hide show
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +40 -17
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/storage/hf_storage.py +1 -1
  26. isa_model/core/types.py +1 -0
  27. isa_model/deployment/__init__.py +5 -48
  28. isa_model/deployment/core/__init__.py +2 -31
  29. isa_model/deployment/core/deployment_manager.py +1278 -370
  30. isa_model/deployment/local/__init__.py +31 -0
  31. isa_model/deployment/local/config.py +248 -0
  32. isa_model/deployment/local/gpu_gateway.py +607 -0
  33. isa_model/deployment/local/health_checker.py +428 -0
  34. isa_model/deployment/local/provider.py +586 -0
  35. isa_model/deployment/local/tensorrt_service.py +621 -0
  36. isa_model/deployment/local/transformers_service.py +644 -0
  37. isa_model/deployment/local/vllm_service.py +527 -0
  38. isa_model/deployment/modal/__init__.py +8 -0
  39. isa_model/deployment/modal/config.py +136 -0
  40. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  41. isa_model/deployment/modal/services/__init__.py +3 -0
  42. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  43. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  44. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  45. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  46. isa_model/deployment/modal/services/video/__init__.py +1 -0
  47. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  48. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  49. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  50. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  51. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  52. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  53. isa_model/deployment/storage/__init__.py +5 -0
  54. isa_model/deployment/storage/deployment_repository.py +824 -0
  55. isa_model/deployment/triton/__init__.py +10 -0
  56. isa_model/deployment/triton/config.py +196 -0
  57. isa_model/deployment/triton/configs/__init__.py +1 -0
  58. isa_model/deployment/triton/provider.py +512 -0
  59. isa_model/deployment/triton/scripts/__init__.py +1 -0
  60. isa_model/deployment/triton/templates/__init__.py +1 -0
  61. isa_model/inference/__init__.py +47 -1
  62. isa_model/inference/ai_factory.py +137 -10
  63. isa_model/inference/legacy_services/__init__.py +21 -0
  64. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  65. isa_model/inference/legacy_services/model_service.py +573 -0
  66. isa_model/inference/legacy_services/model_serving.py +717 -0
  67. isa_model/inference/legacy_services/model_training.py +561 -0
  68. isa_model/inference/models/__init__.py +21 -0
  69. isa_model/inference/models/inference_config.py +551 -0
  70. isa_model/inference/models/inference_record.py +675 -0
  71. isa_model/inference/models/performance_models.py +714 -0
  72. isa_model/inference/repositories/__init__.py +9 -0
  73. isa_model/inference/repositories/inference_repository.py +828 -0
  74. isa_model/inference/services/audio/base_stt_service.py +184 -11
  75. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  76. isa_model/inference/services/custom_model_manager.py +277 -0
  77. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  78. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  79. isa_model/inference/services/llm/__init__.py +10 -2
  80. isa_model/inference/services/llm/base_llm_service.py +335 -24
  81. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  82. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  83. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  84. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  85. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  86. isa_model/inference/services/llm/local_llm_service.py +747 -0
  87. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  88. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  89. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  90. isa_model/inference/services/vision/__init__.py +22 -1
  91. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  92. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  93. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  94. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  95. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  96. isa_model/serving/api/cache_manager.py +245 -0
  97. isa_model/serving/api/dependencies/__init__.py +1 -0
  98. isa_model/serving/api/dependencies/auth.py +194 -0
  99. isa_model/serving/api/dependencies/database.py +139 -0
  100. isa_model/serving/api/error_handlers.py +284 -0
  101. isa_model/serving/api/fastapi_server.py +172 -22
  102. isa_model/serving/api/middleware/auth.py +8 -2
  103. isa_model/serving/api/middleware/security.py +23 -33
  104. isa_model/serving/api/middleware/tenant_context.py +414 -0
  105. isa_model/serving/api/routes/analytics.py +4 -1
  106. isa_model/serving/api/routes/config.py +645 -0
  107. isa_model/serving/api/routes/deployment_billing.py +315 -0
  108. isa_model/serving/api/routes/deployments.py +138 -2
  109. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  110. isa_model/serving/api/routes/health.py +32 -12
  111. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  112. isa_model/serving/api/routes/local_deployments.py +448 -0
  113. isa_model/serving/api/routes/tenants.py +575 -0
  114. isa_model/serving/api/routes/unified.py +680 -18
  115. isa_model/serving/api/routes/webhooks.py +479 -0
  116. isa_model/serving/api/startup.py +68 -54
  117. isa_model/utils/gpu_utils.py +311 -0
  118. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
  119. isa_model-0.4.3.dist-info/RECORD +193 -0
  120. isa_model/core/storage/minio_storage.py +0 -0
  121. isa_model/deployment/cloud/__init__.py +0 -9
  122. isa_model/deployment/cloud/modal/__init__.py +0 -10
  123. isa_model/deployment/core/deployment_config.py +0 -356
  124. isa_model/deployment/core/isa_deployment_service.py +0 -401
  125. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  126. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  127. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  128. isa_model/deployment/runtime/deployed_service.py +0 -338
  129. isa_model/deployment/services/__init__.py +0 -9
  130. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  131. isa_model/deployment/services/model_service.py +0 -332
  132. isa_model/deployment/services/service_monitor.py +0 -356
  133. isa_model/deployment/services/service_registry.py +0 -527
  134. isa_model/eval/__init__.py +0 -92
  135. isa_model/eval/benchmarks/__init__.py +0 -27
  136. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  137. isa_model/eval/benchmarks.py +0 -701
  138. isa_model/eval/config/__init__.py +0 -10
  139. isa_model/eval/config/evaluation_config.py +0 -108
  140. isa_model/eval/evaluators/__init__.py +0 -24
  141. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  142. isa_model/eval/evaluators/base_evaluator.py +0 -503
  143. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  144. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  145. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  146. isa_model/eval/example_evaluation.py +0 -395
  147. isa_model/eval/factory.py +0 -798
  148. isa_model/eval/infrastructure/__init__.py +0 -24
  149. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  150. isa_model/eval/isa_benchmarks.py +0 -700
  151. isa_model/eval/isa_integration.py +0 -582
  152. isa_model/eval/metrics.py +0 -951
  153. isa_model/eval/tests/unit/test_basic.py +0 -396
  154. isa_model/serving/api/routes/evaluations.py +0 -579
  155. isa_model/training/__init__.py +0 -168
  156. isa_model/training/annotation/annotation_schema.py +0 -47
  157. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  158. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  159. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  160. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  161. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  162. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  163. isa_model/training/annotation/views/annotation_controller.py +0 -158
  164. isa_model/training/cloud/__init__.py +0 -22
  165. isa_model/training/cloud/job_orchestrator.py +0 -402
  166. isa_model/training/cloud/runpod_trainer.py +0 -454
  167. isa_model/training/cloud/storage_manager.py +0 -482
  168. isa_model/training/core/__init__.py +0 -26
  169. isa_model/training/core/config.py +0 -181
  170. isa_model/training/core/dataset.py +0 -222
  171. isa_model/training/core/trainer.py +0 -720
  172. isa_model/training/core/utils.py +0 -213
  173. isa_model/training/examples/intelligent_training_example.py +0 -281
  174. isa_model/training/factory.py +0 -424
  175. isa_model/training/intelligent/__init__.py +0 -25
  176. isa_model/training/intelligent/decision_engine.py +0 -643
  177. isa_model/training/intelligent/intelligent_factory.py +0 -888
  178. isa_model/training/intelligent/knowledge_base.py +0 -751
  179. isa_model/training/intelligent/resource_optimizer.py +0 -839
  180. isa_model/training/intelligent/task_classifier.py +0 -576
  181. isa_model/training/storage/__init__.py +0 -24
  182. isa_model/training/storage/core_integration.py +0 -439
  183. isa_model/training/storage/training_repository.py +0 -552
  184. isa_model/training/storage/training_storage.py +0 -628
  185. isa_model-0.4.0.dist-info/RECORD +0 -182
  186. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  187. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  188. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  189. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  190. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  191. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  192. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  193. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  194. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  195. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  196. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  197. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  198. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  199. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,31 @@
"""
Local GPU deployment module

This module provides local GPU model deployment capabilities including:
- Direct GPU resource management
- vLLM integration for high-performance inference
- TensorRT-LLM native deployment (non-containerized)
- HuggingFace Transformers direct deployment
- Local service monitoring and health checks
"""

# Re-export the package's public names from its submodules so callers can
# import them directly from this package.
from .provider import LocalGPUProvider
from .config import (
    LocalGPUConfig, LocalServiceType, LocalBackend,
    create_vllm_config, create_tensorrt_config, create_transformers_config,
    create_vision_config, create_embedding_config
)
from .health_checker import LocalHealthChecker

# Public API of the package: exactly the names imported above.
__all__ = [
    'LocalGPUProvider',
    'LocalGPUConfig',
    'LocalServiceType',
    'LocalBackend',
    'LocalHealthChecker',
    'create_vllm_config',
    'create_tensorrt_config',
    'create_transformers_config',
    'create_vision_config',
    'create_embedding_config'
]
@@ -0,0 +1,248 @@
1
+ """
2
+ Local GPU deployment configuration
3
+
4
+ Configuration classes for local GPU model deployment.
5
+ """
6
+
7
+ from dataclasses import dataclass, field
8
+ from typing import Dict, Any, Optional, List
9
+ from enum import Enum
10
+ from pathlib import Path
11
+
12
+
class LocalServiceType(Enum):
    """Local service types.

    The string value of each member is what ``LocalGPUConfig.to_dict``
    writes under ``"service_type"`` and what ``LocalGPUConfig.from_dict``
    converts back into a member.
    """
    LLM = "llm"
    VISION = "vision"
    AUDIO = "audio"
    EMBEDDING = "embedding"
    IMAGE_GENERATION = "image_generation"
20
+
21
+
class LocalBackend(Enum):
    """Local inference backends.

    The string value of each member is what ``LocalGPUConfig.to_dict``
    writes under ``"backend"`` and what ``LocalGPUConfig.from_dict``
    converts back into a member.
    """
    VLLM = "vllm"
    TENSORRT_LLM = "tensorrt_llm"
    TRANSFORMERS = "transformers"
    # NOTE: the serialized value is "onnxruntime", not "onnx".
    ONNX = "onnxruntime"
    OPENVINO = "openvino"
29
+
30
+
@dataclass
class LocalGPUConfig:
    """Configuration for local GPU model deployment.

    Only ``service_name``, ``service_type`` and ``model_id`` are required;
    every other field has a default.  Instances round-trip through
    :meth:`to_dict` and :meth:`from_dict`, with the two enum fields stored
    as their string values.
    """

    # Service identification
    service_name: str
    service_type: LocalServiceType
    model_id: str
    backend: LocalBackend = LocalBackend.TRANSFORMERS

    # GPU configuration
    gpu_id: Optional[int] = None  # None = auto-select best GPU
    gpu_memory_fraction: float = 0.9  # Fraction of GPU memory to use
    enable_gpu: bool = True

    # Model configuration
    model_precision: str = "float16"  # float32, float16, int8, int4
    max_model_len: int = 2048
    max_batch_size: int = 8

    # Performance settings
    enable_chunked_prefill: bool = True
    max_num_seqs: int = 256
    tensor_parallel_size: int = 1
    pipeline_parallel_size: int = 1

    # Memory optimization
    enable_prefix_caching: bool = True
    gpu_memory_utilization: float = 0.9
    swap_space: int = 4  # GB
    cpu_offload: bool = False

    # Quantization settings
    quantization: Optional[str] = None  # awq, gptq, squeezellm, etc.
    quantization_param_path: Optional[str] = None

    # Serving configuration
    host: str = "127.0.0.1"
    port: int = 8000
    api_key: Optional[str] = None
    served_model_name: Optional[str] = None

    # Advanced settings
    trust_remote_code: bool = False
    revision: Optional[str] = None
    tokenizer_revision: Optional[str] = None

    # Specific backend configurations
    vllm_args: Dict[str, Any] = field(default_factory=dict)
    tensorrt_args: Dict[str, Any] = field(default_factory=dict)
    transformers_args: Dict[str, Any] = field(default_factory=dict)

    # Environment and paths
    model_cache_dir: Optional[str] = None
    download_dir: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary for serialization.

        Returns:
            A new dict with one entry per dataclass field, in field
            declaration order.  Enum members are replaced by their
            ``.value``; dict-valued fields are shallow-copied so that
            mutating the result does not mutate this config.
        """
        # Function-scope import: the module only imports ``dataclass``/``field``.
        from dataclasses import fields

        payload: Dict[str, Any] = {}
        for spec in fields(self):
            value = getattr(self, spec.name)
            if isinstance(value, Enum):
                value = value.value
            elif isinstance(value, dict):
                # Hand out a copy, never the live backend-argument dict.
                value = dict(value)
            payload[spec.name] = value
        return payload

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "LocalGPUConfig":
        """Create a config from a dictionary such as :meth:`to_dict` produces.

        Keys that are absent fall back to the dataclass defaults, so the
        defaults are defined in exactly one place.  Keys that do not name a
        field are ignored.

        Args:
            data: Mapping of field names to values.  ``service_type`` and
                ``backend`` may be given as enum values (strings) or as
                enum members.

        Returns:
            A new ``LocalGPUConfig``.  Dict-valued fields are shallow copies
            of the input, and a ``None`` given for such a field is
            normalised to an empty dict.

        Raises:
            KeyError: If ``service_name``, ``service_type`` or ``model_id``
                is missing.
            ValueError: If ``service_type`` or ``backend`` is not a valid
                value of its enum.
        """
        # Function-scope import: the module only imports ``dataclass``/``field``.
        from dataclasses import MISSING, fields

        enum_types = {"service_type": LocalServiceType, "backend": LocalBackend}
        kwargs: Dict[str, Any] = {}
        for spec in fields(cls):
            if not spec.init:
                continue
            has_default = (
                spec.default is not MISSING
                or spec.default_factory is not MISSING
            )
            if has_default and spec.name not in data:
                continue  # let the dataclass default apply
            value = data[spec.name]  # KeyError for a missing required field
            if spec.name in enum_types:
                value = enum_types[spec.name](value)
            elif spec.default_factory is dict:
                # Copy so the config never aliases the caller's dict.
                value = dict(value) if value is not None else {}
            kwargs[spec.name] = value
        return cls(**kwargs)
161
+
162
+
163
+ # Predefined configurations for common use cases
def create_vllm_config(service_name: str, model_id: str,
                       max_model_len: int = 2048,
                       tensor_parallel_size: int = 1) -> LocalGPUConfig:
    """Build an LLM config that targets the vLLM backend.

    Chunked prefill and prefix caching are switched on, GPU memory
    utilization is set to 0.9 and the precision to float16; all remaining
    fields keep the ``LocalGPUConfig`` defaults.
    """
    settings: Dict[str, Any] = {
        "service_name": service_name,
        "service_type": LocalServiceType.LLM,
        "model_id": model_id,
        "backend": LocalBackend.VLLM,
        "max_model_len": max_model_len,
        "tensor_parallel_size": tensor_parallel_size,
        "enable_chunked_prefill": True,
        "enable_prefix_caching": True,
        "gpu_memory_utilization": 0.9,
        "model_precision": "float16",
    }
    return LocalGPUConfig(**settings)
180
+
181
+
def create_tensorrt_config(service_name: str, model_id: str,
                           max_batch_size: int = 8,
                           precision: str = "float16") -> LocalGPUConfig:
    """Build an LLM config that targets the TensorRT-LLM backend.

    ``precision`` is stored as ``model_precision``, tensor parallelism is
    fixed at 1, and a fixed set of TensorRT flags is placed in
    ``tensorrt_args``.
    """
    engine_flags: Dict[str, Any] = {
        "enable_kv_cache_reuse": True,
        "remove_input_padding": True,
        "use_gpt_attention_plugin": True,
    }
    return LocalGPUConfig(
        service_name,
        LocalServiceType.LLM,
        model_id,
        backend=LocalBackend.TENSORRT_LLM,
        max_batch_size=max_batch_size,
        model_precision=precision,
        tensor_parallel_size=1,
        tensorrt_args=engine_flags,
    )
200
+
201
+
def create_transformers_config(service_name: str, model_id: str,
                               precision: str = "float16",
                               quantization: Optional[str] = None) -> LocalGPUConfig:
    """Build an LLM config that targets the HuggingFace Transformers backend.

    ``precision`` is stored as ``model_precision`` and ``quantization`` is
    passed through unchanged.  The batch size is fixed at 4 (lower for
    memory efficiency) and a fixed set of loader options is placed in
    ``transformers_args``.
    """
    loader_options: Dict[str, Any] = {
        "device_map": "auto",
        "torch_dtype": "auto",
        "low_cpu_mem_usage": True,
    }
    return LocalGPUConfig(
        service_name,
        LocalServiceType.LLM,
        model_id,
        backend=LocalBackend.TRANSFORMERS,
        model_precision=precision,
        quantization=quantization,
        max_batch_size=4,
        transformers_args=loader_options,
    )
220
+
221
+
def create_vision_config(service_name: str, model_id: str,
                         backend: LocalBackend = LocalBackend.TRANSFORMERS) -> LocalGPUConfig:
    """Build a config for a vision model on the given backend.

    Uses a batch size of 16, float16 precision and a GPU memory
    utilization of 0.8 (lower for vision models).
    """
    settings: Dict[str, Any] = {
        "service_name": service_name,
        "service_type": LocalServiceType.VISION,
        "model_id": model_id,
        "backend": backend,
        "max_batch_size": 16,
        "model_precision": "float16",
        "gpu_memory_utilization": 0.8,
    }
    return LocalGPUConfig(**settings)
234
+
235
+
def create_embedding_config(service_name: str, model_id: str,
                            max_batch_size: int = 32) -> LocalGPUConfig:
    """Build a config for an embedding model on the Transformers backend.

    Uses float16 precision, a GPU memory utilization of 0.7 (lower memory
    usage for embeddings) and leaves CPU offload disabled.
    """
    settings: Dict[str, Any] = {
        "service_name": service_name,
        "service_type": LocalServiceType.EMBEDDING,
        "model_id": model_id,
        "backend": LocalBackend.TRANSFORMERS,
        "max_batch_size": max_batch_size,
        "model_precision": "float16",
        "gpu_memory_utilization": 0.7,
        "cpu_offload": False,
    }
    return LocalGPUConfig(**settings)