isa-model 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,419 +0,0 @@
-"""
-HuggingFace Hub Storage Implementation
-
-Provides storage capabilities using HuggingFace Hub as the backend.
-Supports uploading trained models, managing versions, and metadata.
-"""
-
-import os
-import json
-import logging
-from typing import Optional, Dict, Any, List
-from pathlib import Path
-import tempfile
-import shutil
-from datetime import datetime
-
-try:
-    from huggingface_hub import HfApi, create_repo, upload_folder, snapshot_download
-    from huggingface_hub.errors import HfHubHTTPError
-    HF_HUB_AVAILABLE = True
-except ImportError:
-    HF_HUB_AVAILABLE = False
-
-from ..models.model_storage import ModelStorage
-
-logger = logging.getLogger(__name__)
-
-
-class HuggingFaceStorage(ModelStorage):
-    """
-    HuggingFace Hub storage implementation for model management.
-
-    This storage backend uploads models to HuggingFace Hub and manages
-    them using the repository system. Perfect for sharing trained models
-    and maintaining versions.
-
-    Example:
-        ```python
-        from isa_model.core.storage import HuggingFaceStorage
-
-        storage = HuggingFaceStorage(
-            username="xenobordom",
-            token=os.getenv("HF_TOKEN")  # Set in environment
-        )
-
-        # Save a trained model to HuggingFace Hub
-        await storage.save_model(
-            model_id="gemma-4b-alpaca-v1",
-            model_path="./trained_models/gemma-4b",
-            metadata={
-                "base_model": "google/gemma-2-4b-it",
-                "dataset": "tatsu-lab/alpaca",
-                "training_method": "LoRA + Unsloth"
-            }
-        )
-        ```
-    """
-
-    def __init__(self,
-                 username: str = "xenobordom",
-                 token: Optional[str] = None,
-                 private: bool = False,
-                 local_cache_dir: str = "./models/hf_cache"):
-        """
-        Initialize HuggingFace storage.
-
-        Args:
-            username: HuggingFace username (default: xenobordom)
-            token: HuggingFace API token (from env if not provided)
-            private: Whether to create private repositories
-            local_cache_dir: Local cache directory for downloaded models
-        """
-        if not HF_HUB_AVAILABLE:
-            raise ImportError("huggingface_hub is required. Install with: pip install huggingface_hub")
-
-        self.username = username
-        self.token = token or os.getenv("HF_TOKEN")
-        self.private = private
-        self.local_cache_dir = Path(local_cache_dir)
-        self.local_cache_dir.mkdir(parents=True, exist_ok=True)
-
-        if not self.token:
-            raise ValueError("HuggingFace token is required. Set HF_TOKEN environment variable or pass token parameter.")
-
-        # Initialize HF API
-        self.api = HfApi(token=self.token)
-
-        # Local metadata storage
-        self.metadata_file = self.local_cache_dir / "hf_models_metadata.json"
-        self._load_metadata()
-
-        logger.info(f"HuggingFace storage initialized for user: {self.username}")
-        logger.info(f"Local cache directory: {self.local_cache_dir}")
-
-    def _load_metadata(self):
-        """Load local metadata cache"""
-        if self.metadata_file.exists():
-            with open(self.metadata_file, 'r') as f:
-                self.metadata = json.load(f)
-        else:
-            self.metadata = {}
-            self._save_metadata()
-
-    def _save_metadata(self):
-        """Save local metadata cache"""
-        with open(self.metadata_file, 'w') as f:
-            json.dump(self.metadata, f, indent=2)
-
-    def _get_repo_id(self, model_id: str) -> str:
-        """Get full repository ID for a model"""
-        return f"{self.username}/{model_id}"
-
-    async def save_model(self, model_id: str, model_path: str, metadata: Dict[str, Any]) -> bool:
-        """
-        Save model to HuggingFace Hub.
-
-        Args:
-            model_id: Unique identifier for the model (will be repo name)
-            model_path: Local path to model files
-            metadata: Model metadata to include
-
-        Returns:
-            True if successful, False otherwise
-        """
-        try:
-            repo_id = self._get_repo_id(model_id)
-            source_path = Path(model_path)
-
-            logger.info(f"Uploading model {model_id} to HuggingFace Hub: {repo_id}")
-
-            # Create repository if it doesn't exist
-            try:
-                create_repo(
-                    repo_id=repo_id,
-                    token=self.token,
-                    private=self.private,
-                    exist_ok=True
-                )
-                logger.info(f"Repository created/verified: {repo_id}")
-            except Exception as e:
-                logger.warning(f"Repository creation warning: {e}")
-
-            # Prepare metadata for README
-            readme_content = self._generate_model_card(model_id, metadata)
-
-            # Create temporary directory for upload preparation
-            with tempfile.TemporaryDirectory() as temp_dir:
-                temp_path = Path(temp_dir)
-
-                # Copy model files
-                if source_path.is_file():
-                    shutil.copy2(source_path, temp_path / source_path.name)
-                else:
-                    # Copy entire directory
-                    for item in source_path.rglob("*"):
-                        if item.is_file():
-                            relative_path = item.relative_to(source_path)
-                            dest_path = temp_path / relative_path
-                            dest_path.parent.mkdir(parents=True, exist_ok=True)
-                            shutil.copy2(item, dest_path)
-
-                # Add README.md
-                with open(temp_path / "README.md", 'w') as f:
-                    f.write(readme_content)
-
-                # Add metadata.json
-                enhanced_metadata = {
-                    **metadata,
-                    "model_id": model_id,
-                    "repo_id": repo_id,
-                    "uploaded_at": datetime.now().isoformat(),
-                    "uploaded_by": self.username,
-                    "storage_backend": "huggingface_hub"
-                }
-
-                with open(temp_path / "metadata.json", 'w') as f:
-                    json.dump(enhanced_metadata, f, indent=2)
-
-                # Upload to HuggingFace Hub
-                upload_folder(
-                    folder_path=str(temp_path),
-                    repo_id=repo_id,
-                    token=self.token,
-                    commit_message=f"Upload {model_id} - {metadata.get('description', 'Model upload')}"
-                )
-
-            # Update local metadata
-            self.metadata[model_id] = {
-                **enhanced_metadata,
-                "local_cache_path": str(self.local_cache_dir / model_id),
-                "repo_url": f"https://huggingface.co/{repo_id}"
-            }
-            self._save_metadata()
-
-            logger.info(f"Model {model_id} uploaded successfully to {repo_id}")
-            return True
-
-        except Exception as e:
-            logger.error(f"Failed to save model {model_id} to HuggingFace Hub: {e}")
-            return False
-
-    async def load_model(self, model_id: str) -> Optional[Path]:
-        """
-        Load model from HuggingFace Hub.
-
-        Args:
-            model_id: Model identifier
-
-        Returns:
-            Path to local model files
-        """
-        try:
-            repo_id = self._get_repo_id(model_id)
-            local_path = self.local_cache_dir / model_id
-
-            # Check if already cached
-            if local_path.exists() and model_id in self.metadata:
-                logger.info(f"Using cached model {model_id}")
-                return local_path
-
-            logger.info(f"Downloading model {model_id} from HuggingFace Hub: {repo_id}")
-
-            # Download from HuggingFace Hub
-            snapshot_download(
-                repo_id=repo_id,
-                local_dir=str(local_path),
-                token=self.token,
-                local_dir_use_symlinks=False
-            )
-
-            # Load metadata if available
-            metadata_file = local_path / "metadata.json"
-            if metadata_file.exists():
-                with open(metadata_file, 'r') as f:
-                    metadata = json.load(f)
-
-                self.metadata[model_id] = {
-                    **metadata,
-                    "local_cache_path": str(local_path),
-                    "last_downloaded": datetime.now().isoformat()
-                }
-                self._save_metadata()
-
-            logger.info(f"Model {model_id} downloaded successfully")
-            return local_path
-
-        except HfHubHTTPError as e:
-            if e.response.status_code == 404:
-                logger.error(f"Model {model_id} not found on HuggingFace Hub")
-            else:
-                logger.error(f"Failed to load model {model_id}: {e}")
-            return None
-        except Exception as e:
-            logger.error(f"Failed to load model {model_id}: {e}")
-            return None
-
-    async def delete_model(self, model_id: str) -> bool:
-        """
-        Delete model from HuggingFace Hub and local cache.
-
-        Args:
-            model_id: Model identifier
-
-        Returns:
-            True if successful, False otherwise
-        """
-        try:
-            repo_id = self._get_repo_id(model_id)
-
-            # Delete from HuggingFace Hub
-            try:
-                self.api.delete_repo(repo_id=repo_id, token=self.token)
-                logger.info(f"Deleted repository {repo_id} from HuggingFace Hub")
-            except Exception as e:
-                logger.warning(f"Failed to delete repository {repo_id}: {e}")
-
-            # Delete local cache
-            local_path = self.local_cache_dir / model_id
-            if local_path.exists():
-                shutil.rmtree(local_path)
-                logger.info(f"Deleted local cache for {model_id}")
-
-            # Remove from metadata
-            if model_id in self.metadata:
-                del self.metadata[model_id]
-                self._save_metadata()
-
-            return True
-
-        except Exception as e:
-            logger.error(f"Failed to delete model {model_id}: {e}")
-            return False
-
-    async def get_metadata(self, model_id: str) -> Optional[Dict[str, Any]]:
-        """Get model metadata"""
-        return self.metadata.get(model_id)
-
-    async def list_models(self) -> Dict[str, Dict[str, Any]]:
-        """List all models managed by this storage"""
-        return self.metadata.copy()
-
-    def _generate_model_card(self, model_id: str, metadata: Dict[str, Any]) -> str:
-        """Generate a model card for HuggingFace Hub"""
-        base_model = metadata.get("base_model", "Unknown")
-        dataset = metadata.get("dataset", "Unknown")
-        training_method = metadata.get("training_method", "Unknown")
-        description = metadata.get("description", f"Fine-tuned {base_model}")
-
-        model_card = f"""---
-license: apache-2.0
-base_model: {base_model}
-tags:
-- generated_from_trainer
-- isa-model
-- {training_method.lower().replace(' ', '-')}
-datasets:
-- {dataset}
-language:
-- en
-pipeline_tag: text-generation
----
-
-# {model_id}
-
-{description}
-
-## Model Details
-
-- **Base Model**: {base_model}
-- **Training Dataset**: {dataset}
-- **Training Method**: {training_method}
-- **Uploaded by**: {self.username}
-- **Framework**: ISA Model SDK
-
-## Training Details
-
-"""
-
-        # Add training configuration if available
-        if "config" in metadata:
-            config = metadata["config"]
-            model_card += f"""
-### Training Configuration
-
-- **Epochs**: {config.get('num_epochs', 'Unknown')}
-- **Batch Size**: {config.get('batch_size', 'Unknown')}
-- **Learning Rate**: {config.get('learning_rate', 'Unknown')}
-- **LoRA**: {'Yes' if config.get('use_lora', False) else 'No'}
-"""
-
-            if config.get('use_lora', False):
-                lora_config = config.get('lora_config', {})
-                model_card += f"""
-### LoRA Configuration
-
-- **LoRA Rank**: {lora_config.get('lora_rank', 'Unknown')}
-- **LoRA Alpha**: {lora_config.get('lora_alpha', 'Unknown')}
-- **LoRA Dropout**: {lora_config.get('lora_dropout', 'Unknown')}
-"""
-
-        model_card += f"""
-
-## Usage
-
-```python
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-tokenizer = AutoTokenizer.from_pretrained("{self._get_repo_id(model_id)}")
-model = AutoModelForCausalLM.from_pretrained("{self._get_repo_id(model_id)}")
-
-# For inference
-inputs = tokenizer("Your prompt here", return_tensors="pt")
-outputs = model.generate(**inputs, max_length=100)
-response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-print(response)
-```
-
-## ISA Model SDK
-
-This model was trained using the [ISA Model SDK](https://github.com/your-repo/isA_Model),
-a comprehensive framework for training and deploying AI models.
-
-"""
-
-        return model_card
-
-    def get_public_url(self, model_id: str) -> str:
-        """Get public URL for a model on HuggingFace Hub"""
-        return f"https://huggingface.co/{self._get_repo_id(model_id)}"
-
-    async def update_model_metadata(self, model_id: str, new_metadata: Dict[str, Any]) -> bool:
-        """Update model metadata on HuggingFace Hub"""
-        try:
-            if model_id not in self.metadata:
-                logger.error(f"Model {model_id} not found in metadata")
-                return False
-
-            # Update local metadata
-            self.metadata[model_id].update(new_metadata)
-            self._save_metadata()
-
-            # Update README on HuggingFace Hub
-            repo_id = self._get_repo_id(model_id)
-            readme_content = self._generate_model_card(model_id, self.metadata[model_id])
-
-            self.api.upload_file(
-                path_or_fileobj=readme_content.encode('utf-8'),
-                path_in_repo="README.md",
-                repo_id=repo_id,
-                token=self.token,
-                commit_message=f"Update metadata for {model_id}"
-            )
-
-            logger.info(f"Updated metadata for model {model_id}")
-            return True
-
-        except Exception as e:
-            logger.error(f"Failed to update metadata for {model_id}: {e}")
-            return False
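
For orientation, here is a minimal sketch of how a caller might have used the removed `HuggingFaceStorage` class, based only on the docstring and method signatures in the hunk above; the import path follows the class docstring and the model ID is illustrative, not a repository guaranteed to exist.

```python
import asyncio
import os

# Import path taken from the class docstring above; removed along with this module.
from isa_model.core.storage import HuggingFaceStorage


async def main():
    storage = HuggingFaceStorage(username="xenobordom", token=os.getenv("HF_TOKEN"))

    # load_model() downloads the repo (or reuses the local cache) and returns a Path, or None on failure.
    local_path = await storage.load_model("gemma-4b-alpaca-v1")  # illustrative model_id
    if local_path is not None:
        print(f"Model files cached at: {local_path}")
        print(storage.get_public_url("gemma-4b-alpaca-v1"))


asyncio.run(main())
```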
@@ -1,31 +0,0 @@
-"""
-Local GPU deployment module
-
-This module provides local GPU model deployment capabilities including:
-- Direct GPU resource management
-- vLLM integration for high-performance inference
-- TensorRT-LLM native deployment (non-containerized)
-- HuggingFace Transformers direct deployment
-- Local service monitoring and health checks
-"""
-
-from .provider import LocalGPUProvider
-from .config import (
-    LocalGPUConfig, LocalServiceType, LocalBackend,
-    create_vllm_config, create_tensorrt_config, create_transformers_config,
-    create_vision_config, create_embedding_config
-)
-from .health_checker import LocalHealthChecker
-
-__all__ = [
-    'LocalGPUProvider',
-    'LocalGPUConfig',
-    'LocalServiceType',
-    'LocalBackend',
-    'LocalHealthChecker',
-    'create_vllm_config',
-    'create_tensorrt_config',
-    'create_transformers_config',
-    'create_vision_config',
-    'create_embedding_config'
-]
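
The removed `__init__` only re-exported the provider, config, and health-checker symbols. A hedged sketch of consuming that public surface is shown below; the fully qualified package path is a guess for illustration, since the diff shows only the relative imports (`.provider`, `.config`, `.health_checker`).

```python
# Hypothetical import path; the diff above does not show the module's location in the package.
from isa_model.deployment.local_gpu import (
    LocalBackend,
    LocalServiceType,
    create_vllm_config,
)

# Build a vLLM-backed LLM service config via the re-exported factory.
cfg = create_vllm_config("demo-llm", "org/some-model")  # placeholder model_id
assert cfg.backend is LocalBackend.VLLM
assert cfg.service_type is LocalServiceType.LLM
```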
@@ -1,248 +0,0 @@
-"""
-Local GPU deployment configuration
-
-Configuration classes for local GPU model deployment.
-"""
-
-from dataclasses import dataclass, field
-from typing import Dict, Any, Optional, List
-from enum import Enum
-from pathlib import Path
-
-
-class LocalServiceType(Enum):
-    """Local service types"""
-    LLM = "llm"
-    VISION = "vision"
-    AUDIO = "audio"
-    EMBEDDING = "embedding"
-    IMAGE_GENERATION = "image_generation"
-
-
-class LocalBackend(Enum):
-    """Local inference backends"""
-    VLLM = "vllm"
-    TENSORRT_LLM = "tensorrt_llm"
-    TRANSFORMERS = "transformers"
-    ONNX = "onnxruntime"
-    OPENVINO = "openvino"
-
-
-@dataclass
-class LocalGPUConfig:
-    """Configuration for local GPU model deployment"""
-
-    # Service identification
-    service_name: str
-    service_type: LocalServiceType
-    model_id: str
-    backend: LocalBackend = LocalBackend.TRANSFORMERS
-
-    # GPU configuration
-    gpu_id: Optional[int] = None  # None = auto-select best GPU
-    gpu_memory_fraction: float = 0.9  # Fraction of GPU memory to use
-    enable_gpu: bool = True
-
-    # Model configuration
-    model_precision: str = "float16"  # float32, float16, int8, int4
-    max_model_len: int = 2048
-    max_batch_size: int = 8
-
-    # Performance settings
-    enable_chunked_prefill: bool = True
-    max_num_seqs: int = 256
-    tensor_parallel_size: int = 1
-    pipeline_parallel_size: int = 1
-
-    # Memory optimization
-    enable_prefix_caching: bool = True
-    gpu_memory_utilization: float = 0.9
-    swap_space: int = 4  # GB
-    cpu_offload: bool = False
-
-    # Quantization settings
-    quantization: Optional[str] = None  # awq, gptq, squeezellm, etc.
-    quantization_param_path: Optional[str] = None
-
-    # Serving configuration
-    host: str = "127.0.0.1"
-    port: int = 8000
-    api_key: Optional[str] = None
-    served_model_name: Optional[str] = None
-
-    # Advanced settings
-    trust_remote_code: bool = False
-    revision: Optional[str] = None
-    tokenizer_revision: Optional[str] = None
-
-    # Specific backend configurations
-    vllm_args: Dict[str, Any] = field(default_factory=dict)
-    tensorrt_args: Dict[str, Any] = field(default_factory=dict)
-    transformers_args: Dict[str, Any] = field(default_factory=dict)
-
-    # Environment and paths
-    model_cache_dir: Optional[str] = None
-    download_dir: Optional[str] = None
-
-    def to_dict(self) -> Dict[str, Any]:
-        """Convert to dictionary for serialization"""
-        return {
-            "service_name": self.service_name,
-            "service_type": self.service_type.value,
-            "model_id": self.model_id,
-            "backend": self.backend.value,
-            "gpu_id": self.gpu_id,
-            "gpu_memory_fraction": self.gpu_memory_fraction,
-            "enable_gpu": self.enable_gpu,
-            "model_precision": self.model_precision,
-            "max_model_len": self.max_model_len,
-            "max_batch_size": self.max_batch_size,
-            "enable_chunked_prefill": self.enable_chunked_prefill,
-            "max_num_seqs": self.max_num_seqs,
-            "tensor_parallel_size": self.tensor_parallel_size,
-            "pipeline_parallel_size": self.pipeline_parallel_size,
-            "enable_prefix_caching": self.enable_prefix_caching,
-            "gpu_memory_utilization": self.gpu_memory_utilization,
-            "swap_space": self.swap_space,
-            "cpu_offload": self.cpu_offload,
-            "quantization": self.quantization,
-            "quantization_param_path": self.quantization_param_path,
-            "host": self.host,
-            "port": self.port,
-            "api_key": self.api_key,
-            "served_model_name": self.served_model_name,
-            "trust_remote_code": self.trust_remote_code,
-            "revision": self.revision,
-            "tokenizer_revision": self.tokenizer_revision,
-            "vllm_args": self.vllm_args,
-            "tensorrt_args": self.tensorrt_args,
-            "transformers_args": self.transformers_args,
-            "model_cache_dir": self.model_cache_dir,
-            "download_dir": self.download_dir
-        }
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> "LocalGPUConfig":
-        """Create from dictionary"""
-        return cls(
-            service_name=data["service_name"],
-            service_type=LocalServiceType(data["service_type"]),
-            model_id=data["model_id"],
-            backend=LocalBackend(data.get("backend", "transformers")),
-            gpu_id=data.get("gpu_id"),
-            gpu_memory_fraction=data.get("gpu_memory_fraction", 0.9),
-            enable_gpu=data.get("enable_gpu", True),
-            model_precision=data.get("model_precision", "float16"),
-            max_model_len=data.get("max_model_len", 2048),
-            max_batch_size=data.get("max_batch_size", 8),
-            enable_chunked_prefill=data.get("enable_chunked_prefill", True),
-            max_num_seqs=data.get("max_num_seqs", 256),
-            tensor_parallel_size=data.get("tensor_parallel_size", 1),
-            pipeline_parallel_size=data.get("pipeline_parallel_size", 1),
-            enable_prefix_caching=data.get("enable_prefix_caching", True),
-            gpu_memory_utilization=data.get("gpu_memory_utilization", 0.9),
-            swap_space=data.get("swap_space", 4),
-            cpu_offload=data.get("cpu_offload", False),
-            quantization=data.get("quantization"),
-            quantization_param_path=data.get("quantization_param_path"),
-            host=data.get("host", "127.0.0.1"),
-            port=data.get("port", 8000),
-            api_key=data.get("api_key"),
-            served_model_name=data.get("served_model_name"),
-            trust_remote_code=data.get("trust_remote_code", False),
-            revision=data.get("revision"),
-            tokenizer_revision=data.get("tokenizer_revision"),
-            vllm_args=data.get("vllm_args", {}),
-            tensorrt_args=data.get("tensorrt_args", {}),
-            transformers_args=data.get("transformers_args", {}),
-            model_cache_dir=data.get("model_cache_dir"),
-            download_dir=data.get("download_dir")
-        )
-
-
-# Predefined configurations for common use cases
-def create_vllm_config(service_name: str, model_id: str,
-                       max_model_len: int = 2048,
-                       tensor_parallel_size: int = 1) -> LocalGPUConfig:
-    """Create optimized vLLM configuration"""
-    return LocalGPUConfig(
-        service_name=service_name,
-        service_type=LocalServiceType.LLM,
-        model_id=model_id,
-        backend=LocalBackend.VLLM,
-        max_model_len=max_model_len,
-        tensor_parallel_size=tensor_parallel_size,
-        enable_chunked_prefill=True,
-        enable_prefix_caching=True,
-        gpu_memory_utilization=0.9,
-        model_precision="float16"
-    )
-
-
-def create_tensorrt_config(service_name: str, model_id: str,
-                           max_batch_size: int = 8,
-                           precision: str = "float16") -> LocalGPUConfig:
-    """Create TensorRT-LLM configuration"""
-    return LocalGPUConfig(
-        service_name=service_name,
-        service_type=LocalServiceType.LLM,
-        model_id=model_id,
-        backend=LocalBackend.TENSORRT_LLM,
-        max_batch_size=max_batch_size,
-        model_precision=precision,
-        tensor_parallel_size=1,
-        tensorrt_args={
-            "enable_kv_cache_reuse": True,
-            "remove_input_padding": True,
-            "use_gpt_attention_plugin": True
-        }
-    )
-
-
-def create_transformers_config(service_name: str, model_id: str,
-                               precision: str = "float16",
-                               quantization: Optional[str] = None) -> LocalGPUConfig:
-    """Create HuggingFace Transformers configuration"""
-    return LocalGPUConfig(
-        service_name=service_name,
-        service_type=LocalServiceType.LLM,
-        model_id=model_id,
-        backend=LocalBackend.TRANSFORMERS,
-        model_precision=precision,
-        quantization=quantization,
-        max_batch_size=4,  # Lower for memory efficiency
-        transformers_args={
-            "device_map": "auto",
-            "torch_dtype": "auto",
-            "low_cpu_mem_usage": True
-        }
-    )
-
-
-def create_vision_config(service_name: str, model_id: str,
-                         backend: LocalBackend = LocalBackend.TRANSFORMERS) -> LocalGPUConfig:
-    """Create vision model configuration"""
-    return LocalGPUConfig(
-        service_name=service_name,
-        service_type=LocalServiceType.VISION,
-        model_id=model_id,
-        backend=backend,
-        max_batch_size=16,
-        model_precision="float16",
-        gpu_memory_utilization=0.8  # Lower for vision models
-    )
-
-
-def create_embedding_config(service_name: str, model_id: str,
-                            max_batch_size: int = 32) -> LocalGPUConfig:
-    """Create embedding model configuration"""
-    return LocalGPUConfig(
-        service_name=service_name,
-        service_type=LocalServiceType.EMBEDDING,
-        model_id=model_id,
-        backend=LocalBackend.TRANSFORMERS,
-        max_batch_size=max_batch_size,
-        model_precision="float16",
-        gpu_memory_utilization=0.7,  # Lower memory usage for embeddings
-        cpu_offload=False
-    )
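
To show how the removed dataclass and its factory helpers fit together, here is a small sketch of the `to_dict()`/`from_dict()` round trip, using only the definitions from the hunk above; the model ID and quantization value are illustrative.

```python
# Assumes LocalGPUConfig, LocalBackend, and create_transformers_config
# from the removed config module above are in scope.
cfg = create_transformers_config(
    service_name="local-llm",
    model_id="org/some-model",  # placeholder model_id
    quantization="awq",         # one of the values named in the quantization field comment
)

payload = cfg.to_dict()                      # plain dict, e.g. for JSON persistence
restored = LocalGPUConfig.from_dict(payload)

assert restored.backend is LocalBackend.TRANSFORMERS
assert restored.quantization == "awq"
assert restored.max_batch_size == 4          # default applied by create_transformers_config
```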