isa-model 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/core/config.py +3 -3
- isa_model/core/logging/__init__.py +14 -13
- isa_model/core/models/model_manager.py +1 -69
- isa_model/core/models/model_storage.py +4 -2
- {isa_model-0.4.3.dist-info → isa_model-0.4.5.dist-info}/METADATA +6 -1
- {isa_model-0.4.3.dist-info → isa_model-0.4.5.dist-info}/RECORD +8 -22
- isa_model/core/logging/influx_logger.py +0 -523
- isa_model/core/security/secrets.py +0 -358
- isa_model/core/storage/hf_storage.py +0 -419
- isa_model/deployment/local/__init__.py +0 -31
- isa_model/deployment/local/config.py +0 -248
- isa_model/deployment/local/gpu_gateway.py +0 -607
- isa_model/deployment/local/health_checker.py +0 -428
- isa_model/deployment/local/provider.py +0 -586
- isa_model/deployment/local/tensorrt_service.py +0 -621
- isa_model/deployment/local/transformers_service.py +0 -644
- isa_model/deployment/local/vllm_service.py +0 -527
- isa_model/inference/services/custom_model_manager.py +0 -277
- isa_model/inference/services/llm/local_llm_service.py +0 -747
- isa_model/inference/services/vision/blip_vision_service.py +0 -359
- {isa_model-0.4.3.dist-info → isa_model-0.4.5.dist-info}/WHEEL +0 -0
- {isa_model-0.4.3.dist-info → isa_model-0.4.5.dist-info}/top_level.txt +0 -0
isa_model/core/storage/hf_storage.py (file removed)
@@ -1,419 +0,0 @@
"""
HuggingFace Hub Storage Implementation

Provides storage capabilities using HuggingFace Hub as the backend.
Supports uploading trained models, managing versions, and metadata.
"""

import os
import json
import logging
from typing import Optional, Dict, Any, List
from pathlib import Path
import tempfile
import shutil
from datetime import datetime

try:
    from huggingface_hub import HfApi, create_repo, upload_folder, snapshot_download
    from huggingface_hub.errors import HfHubHTTPError
    HF_HUB_AVAILABLE = True
except ImportError:
    HF_HUB_AVAILABLE = False

from ..models.model_storage import ModelStorage

logger = logging.getLogger(__name__)


class HuggingFaceStorage(ModelStorage):
    """
    HuggingFace Hub storage implementation for model management.

    This storage backend uploads models to HuggingFace Hub and manages
    them using the repository system. Perfect for sharing trained models
    and maintaining versions.

    Example:
        ```python
        from isa_model.core.storage import HuggingFaceStorage

        storage = HuggingFaceStorage(
            username="xenobordom",
            token=os.getenv("HF_TOKEN")  # Set in environment
        )

        # Save a trained model to HuggingFace Hub
        await storage.save_model(
            model_id="gemma-4b-alpaca-v1",
            model_path="./trained_models/gemma-4b",
            metadata={
                "base_model": "google/gemma-2-4b-it",
                "dataset": "tatsu-lab/alpaca",
                "training_method": "LoRA + Unsloth"
            }
        )
        ```
    """

    def __init__(self,
                 username: str = "xenobordom",
                 token: Optional[str] = None,
                 private: bool = False,
                 local_cache_dir: str = "./models/hf_cache"):
        """
        Initialize HuggingFace storage.

        Args:
            username: HuggingFace username (default: xenobordom)
            token: HuggingFace API token (from env if not provided)
            private: Whether to create private repositories
            local_cache_dir: Local cache directory for downloaded models
        """
        if not HF_HUB_AVAILABLE:
            raise ImportError("huggingface_hub is required. Install with: pip install huggingface_hub")

        self.username = username
        self.token = token or os.getenv("HF_TOKEN")
        self.private = private
        self.local_cache_dir = Path(local_cache_dir)
        self.local_cache_dir.mkdir(parents=True, exist_ok=True)

        if not self.token:
            raise ValueError("HuggingFace token is required. Set HF_TOKEN environment variable or pass token parameter.")

        # Initialize HF API
        self.api = HfApi(token=self.token)

        # Local metadata storage
        self.metadata_file = self.local_cache_dir / "hf_models_metadata.json"
        self._load_metadata()

        logger.info(f"HuggingFace storage initialized for user: {self.username}")
        logger.info(f"Local cache directory: {self.local_cache_dir}")

    def _load_metadata(self):
        """Load local metadata cache"""
        if self.metadata_file.exists():
            with open(self.metadata_file, 'r') as f:
                self.metadata = json.load(f)
        else:
            self.metadata = {}
            self._save_metadata()

    def _save_metadata(self):
        """Save local metadata cache"""
        with open(self.metadata_file, 'w') as f:
            json.dump(self.metadata, f, indent=2)

    def _get_repo_id(self, model_id: str) -> str:
        """Get full repository ID for a model"""
        return f"{self.username}/{model_id}"

    async def save_model(self, model_id: str, model_path: str, metadata: Dict[str, Any]) -> bool:
        """
        Save model to HuggingFace Hub.

        Args:
            model_id: Unique identifier for the model (will be repo name)
            model_path: Local path to model files
            metadata: Model metadata to include

        Returns:
            True if successful, False otherwise
        """
        try:
            repo_id = self._get_repo_id(model_id)
            source_path = Path(model_path)

            logger.info(f"Uploading model {model_id} to HuggingFace Hub: {repo_id}")

            # Create repository if it doesn't exist
            try:
                create_repo(
                    repo_id=repo_id,
                    token=self.token,
                    private=self.private,
                    exist_ok=True
                )
                logger.info(f"Repository created/verified: {repo_id}")
            except Exception as e:
                logger.warning(f"Repository creation warning: {e}")

            # Prepare metadata for README
            readme_content = self._generate_model_card(model_id, metadata)

            # Create temporary directory for upload preparation
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)

                # Copy model files
                if source_path.is_file():
                    shutil.copy2(source_path, temp_path / source_path.name)
                else:
                    # Copy entire directory
                    for item in source_path.rglob("*"):
                        if item.is_file():
                            relative_path = item.relative_to(source_path)
                            dest_path = temp_path / relative_path
                            dest_path.parent.mkdir(parents=True, exist_ok=True)
                            shutil.copy2(item, dest_path)

                # Add README.md
                with open(temp_path / "README.md", 'w') as f:
                    f.write(readme_content)

                # Add metadata.json
                enhanced_metadata = {
                    **metadata,
                    "model_id": model_id,
                    "repo_id": repo_id,
                    "uploaded_at": datetime.now().isoformat(),
                    "uploaded_by": self.username,
                    "storage_backend": "huggingface_hub"
                }

                with open(temp_path / "metadata.json", 'w') as f:
                    json.dump(enhanced_metadata, f, indent=2)

                # Upload to HuggingFace Hub
                upload_folder(
                    folder_path=str(temp_path),
                    repo_id=repo_id,
                    token=self.token,
                    commit_message=f"Upload {model_id} - {metadata.get('description', 'Model upload')}"
                )

            # Update local metadata
            self.metadata[model_id] = {
                **enhanced_metadata,
                "local_cache_path": str(self.local_cache_dir / model_id),
                "repo_url": f"https://huggingface.co/{repo_id}"
            }
            self._save_metadata()

            logger.info(f"Model {model_id} uploaded successfully to {repo_id}")
            return True

        except Exception as e:
            logger.error(f"Failed to save model {model_id} to HuggingFace Hub: {e}")
            return False

    async def load_model(self, model_id: str) -> Optional[Path]:
        """
        Load model from HuggingFace Hub.

        Args:
            model_id: Model identifier

        Returns:
            Path to local model files
        """
        try:
            repo_id = self._get_repo_id(model_id)
            local_path = self.local_cache_dir / model_id

            # Check if already cached
            if local_path.exists() and model_id in self.metadata:
                logger.info(f"Using cached model {model_id}")
                return local_path

            logger.info(f"Downloading model {model_id} from HuggingFace Hub: {repo_id}")

            # Download from HuggingFace Hub
            snapshot_download(
                repo_id=repo_id,
                local_dir=str(local_path),
                token=self.token,
                local_dir_use_symlinks=False
            )

            # Load metadata if available
            metadata_file = local_path / "metadata.json"
            if metadata_file.exists():
                with open(metadata_file, 'r') as f:
                    metadata = json.load(f)

                self.metadata[model_id] = {
                    **metadata,
                    "local_cache_path": str(local_path),
                    "last_downloaded": datetime.now().isoformat()
                }
                self._save_metadata()

            logger.info(f"Model {model_id} downloaded successfully")
            return local_path

        except HfHubHTTPError as e:
            if e.response.status_code == 404:
                logger.error(f"Model {model_id} not found on HuggingFace Hub")
            else:
                logger.error(f"Failed to load model {model_id}: {e}")
            return None
        except Exception as e:
            logger.error(f"Failed to load model {model_id}: {e}")
            return None

    async def delete_model(self, model_id: str) -> bool:
        """
        Delete model from HuggingFace Hub and local cache.

        Args:
            model_id: Model identifier

        Returns:
            True if successful, False otherwise
        """
        try:
            repo_id = self._get_repo_id(model_id)

            # Delete from HuggingFace Hub
            try:
                self.api.delete_repo(repo_id=repo_id, token=self.token)
                logger.info(f"Deleted repository {repo_id} from HuggingFace Hub")
            except Exception as e:
                logger.warning(f"Failed to delete repository {repo_id}: {e}")

            # Delete local cache
            local_path = self.local_cache_dir / model_id
            if local_path.exists():
                shutil.rmtree(local_path)
                logger.info(f"Deleted local cache for {model_id}")

            # Remove from metadata
            if model_id in self.metadata:
                del self.metadata[model_id]
                self._save_metadata()

            return True

        except Exception as e:
            logger.error(f"Failed to delete model {model_id}: {e}")
            return False

    async def get_metadata(self, model_id: str) -> Optional[Dict[str, Any]]:
        """Get model metadata"""
        return self.metadata.get(model_id)

    async def list_models(self) -> Dict[str, Dict[str, Any]]:
        """List all models managed by this storage"""
        return self.metadata.copy()

    def _generate_model_card(self, model_id: str, metadata: Dict[str, Any]) -> str:
        """Generate a model card for HuggingFace Hub"""
        base_model = metadata.get("base_model", "Unknown")
        dataset = metadata.get("dataset", "Unknown")
        training_method = metadata.get("training_method", "Unknown")
        description = metadata.get("description", f"Fine-tuned {base_model}")

        model_card = f"""---
license: apache-2.0
base_model: {base_model}
tags:
- generated_from_trainer
- isa-model
- {training_method.lower().replace(' ', '-')}
datasets:
- {dataset}
language:
- en
pipeline_tag: text-generation
---

# {model_id}

{description}

## Model Details

- **Base Model**: {base_model}
- **Training Dataset**: {dataset}
- **Training Method**: {training_method}
- **Uploaded by**: {self.username}
- **Framework**: ISA Model SDK

## Training Details

"""

        # Add training configuration if available
        if "config" in metadata:
            config = metadata["config"]
            model_card += f"""
### Training Configuration

- **Epochs**: {config.get('num_epochs', 'Unknown')}
- **Batch Size**: {config.get('batch_size', 'Unknown')}
- **Learning Rate**: {config.get('learning_rate', 'Unknown')}
- **LoRA**: {'Yes' if config.get('use_lora', False) else 'No'}
"""

            if config.get('use_lora', False):
                lora_config = config.get('lora_config', {})
                model_card += f"""
### LoRA Configuration

- **LoRA Rank**: {lora_config.get('lora_rank', 'Unknown')}
- **LoRA Alpha**: {lora_config.get('lora_alpha', 'Unknown')}
- **LoRA Dropout**: {lora_config.get('lora_dropout', 'Unknown')}
"""

        model_card += f"""

## Usage

```python
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("{self._get_repo_id(model_id)}")
model = AutoModelForCausalLM.from_pretrained("{self._get_repo_id(model_id)}")

# For inference
inputs = tokenizer("Your prompt here", return_tensors="pt")
outputs = model.generate(**inputs, max_length=100)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
```

## ISA Model SDK

This model was trained using the [ISA Model SDK](https://github.com/your-repo/isA_Model),
a comprehensive framework for training and deploying AI models.

"""

        return model_card

    def get_public_url(self, model_id: str) -> str:
        """Get public URL for a model on HuggingFace Hub"""
        return f"https://huggingface.co/{self._get_repo_id(model_id)}"

    async def update_model_metadata(self, model_id: str, new_metadata: Dict[str, Any]) -> bool:
        """Update model metadata on HuggingFace Hub"""
        try:
            if model_id not in self.metadata:
                logger.error(f"Model {model_id} not found in metadata")
                return False

            # Update local metadata
            self.metadata[model_id].update(new_metadata)
            self._save_metadata()

            # Update README on HuggingFace Hub
            repo_id = self._get_repo_id(model_id)
            readme_content = self._generate_model_card(model_id, self.metadata[model_id])

            self.api.upload_file(
                path_or_fileobj=readme_content.encode('utf-8'),
                path_in_repo="README.md",
                repo_id=repo_id,
                token=self.token,
                commit_message=f"Update metadata for {model_id}"
            )

            logger.info(f"Updated metadata for model {model_id}")
            return True

        except Exception as e:
            logger.error(f"Failed to update metadata for {model_id}: {e}")
            return False
isa_model/deployment/local/__init__.py (file removed)
@@ -1,31 +0,0 @@
"""
Local GPU deployment module

This module provides local GPU model deployment capabilities including:
- Direct GPU resource management
- vLLM integration for high-performance inference
- TensorRT-LLM native deployment (non-containerized)
- HuggingFace Transformers direct deployment
- Local service monitoring and health checks
"""

from .provider import LocalGPUProvider
from .config import (
    LocalGPUConfig, LocalServiceType, LocalBackend,
    create_vllm_config, create_tensorrt_config, create_transformers_config,
    create_vision_config, create_embedding_config
)
from .health_checker import LocalHealthChecker

__all__ = [
    'LocalGPUProvider',
    'LocalGPUConfig',
    'LocalServiceType',
    'LocalBackend',
    'LocalHealthChecker',
    'create_vllm_config',
    'create_tensorrt_config',
    'create_transformers_config',
    'create_vision_config',
    'create_embedding_config'
]
isa_model/deployment/local/config.py (file removed)
@@ -1,248 +0,0 @@
"""
Local GPU deployment configuration

Configuration classes for local GPU model deployment.
"""

from dataclasses import dataclass, field
from typing import Dict, Any, Optional, List
from enum import Enum
from pathlib import Path


class LocalServiceType(Enum):
    """Local service types"""
    LLM = "llm"
    VISION = "vision"
    AUDIO = "audio"
    EMBEDDING = "embedding"
    IMAGE_GENERATION = "image_generation"


class LocalBackend(Enum):
    """Local inference backends"""
    VLLM = "vllm"
    TENSORRT_LLM = "tensorrt_llm"
    TRANSFORMERS = "transformers"
    ONNX = "onnxruntime"
    OPENVINO = "openvino"


@dataclass
class LocalGPUConfig:
    """Configuration for local GPU model deployment"""

    # Service identification
    service_name: str
    service_type: LocalServiceType
    model_id: str
    backend: LocalBackend = LocalBackend.TRANSFORMERS

    # GPU configuration
    gpu_id: Optional[int] = None  # None = auto-select best GPU
    gpu_memory_fraction: float = 0.9  # Fraction of GPU memory to use
    enable_gpu: bool = True

    # Model configuration
    model_precision: str = "float16"  # float32, float16, int8, int4
    max_model_len: int = 2048
    max_batch_size: int = 8

    # Performance settings
    enable_chunked_prefill: bool = True
    max_num_seqs: int = 256
    tensor_parallel_size: int = 1
    pipeline_parallel_size: int = 1

    # Memory optimization
    enable_prefix_caching: bool = True
    gpu_memory_utilization: float = 0.9
    swap_space: int = 4  # GB
    cpu_offload: bool = False

    # Quantization settings
    quantization: Optional[str] = None  # awq, gptq, squeezellm, etc.
    quantization_param_path: Optional[str] = None

    # Serving configuration
    host: str = "127.0.0.1"
    port: int = 8000
    api_key: Optional[str] = None
    served_model_name: Optional[str] = None

    # Advanced settings
    trust_remote_code: bool = False
    revision: Optional[str] = None
    tokenizer_revision: Optional[str] = None

    # Specific backend configurations
    vllm_args: Dict[str, Any] = field(default_factory=dict)
    tensorrt_args: Dict[str, Any] = field(default_factory=dict)
    transformers_args: Dict[str, Any] = field(default_factory=dict)

    # Environment and paths
    model_cache_dir: Optional[str] = None
    download_dir: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization"""
        return {
            "service_name": self.service_name,
            "service_type": self.service_type.value,
            "model_id": self.model_id,
            "backend": self.backend.value,
            "gpu_id": self.gpu_id,
            "gpu_memory_fraction": self.gpu_memory_fraction,
            "enable_gpu": self.enable_gpu,
            "model_precision": self.model_precision,
            "max_model_len": self.max_model_len,
            "max_batch_size": self.max_batch_size,
            "enable_chunked_prefill": self.enable_chunked_prefill,
            "max_num_seqs": self.max_num_seqs,
            "tensor_parallel_size": self.tensor_parallel_size,
            "pipeline_parallel_size": self.pipeline_parallel_size,
            "enable_prefix_caching": self.enable_prefix_caching,
            "gpu_memory_utilization": self.gpu_memory_utilization,
            "swap_space": self.swap_space,
            "cpu_offload": self.cpu_offload,
            "quantization": self.quantization,
            "quantization_param_path": self.quantization_param_path,
            "host": self.host,
            "port": self.port,
            "api_key": self.api_key,
            "served_model_name": self.served_model_name,
            "trust_remote_code": self.trust_remote_code,
            "revision": self.revision,
            "tokenizer_revision": self.tokenizer_revision,
            "vllm_args": self.vllm_args,
            "tensorrt_args": self.tensorrt_args,
            "transformers_args": self.transformers_args,
            "model_cache_dir": self.model_cache_dir,
            "download_dir": self.download_dir
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "LocalGPUConfig":
        """Create from dictionary"""
        return cls(
            service_name=data["service_name"],
            service_type=LocalServiceType(data["service_type"]),
            model_id=data["model_id"],
            backend=LocalBackend(data.get("backend", "transformers")),
            gpu_id=data.get("gpu_id"),
            gpu_memory_fraction=data.get("gpu_memory_fraction", 0.9),
            enable_gpu=data.get("enable_gpu", True),
            model_precision=data.get("model_precision", "float16"),
            max_model_len=data.get("max_model_len", 2048),
            max_batch_size=data.get("max_batch_size", 8),
            enable_chunked_prefill=data.get("enable_chunked_prefill", True),
            max_num_seqs=data.get("max_num_seqs", 256),
            tensor_parallel_size=data.get("tensor_parallel_size", 1),
            pipeline_parallel_size=data.get("pipeline_parallel_size", 1),
            enable_prefix_caching=data.get("enable_prefix_caching", True),
            gpu_memory_utilization=data.get("gpu_memory_utilization", 0.9),
            swap_space=data.get("swap_space", 4),
            cpu_offload=data.get("cpu_offload", False),
            quantization=data.get("quantization"),
            quantization_param_path=data.get("quantization_param_path"),
            host=data.get("host", "127.0.0.1"),
            port=data.get("port", 8000),
            api_key=data.get("api_key"),
            served_model_name=data.get("served_model_name"),
            trust_remote_code=data.get("trust_remote_code", False),
            revision=data.get("revision"),
            tokenizer_revision=data.get("tokenizer_revision"),
            vllm_args=data.get("vllm_args", {}),
            tensorrt_args=data.get("tensorrt_args", {}),
            transformers_args=data.get("transformers_args", {}),
            model_cache_dir=data.get("model_cache_dir"),
            download_dir=data.get("download_dir")
        )


# Predefined configurations for common use cases
def create_vllm_config(service_name: str, model_id: str,
                       max_model_len: int = 2048,
                       tensor_parallel_size: int = 1) -> LocalGPUConfig:
    """Create optimized vLLM configuration"""
    return LocalGPUConfig(
        service_name=service_name,
        service_type=LocalServiceType.LLM,
        model_id=model_id,
        backend=LocalBackend.VLLM,
        max_model_len=max_model_len,
        tensor_parallel_size=tensor_parallel_size,
        enable_chunked_prefill=True,
        enable_prefix_caching=True,
        gpu_memory_utilization=0.9,
        model_precision="float16"
    )


def create_tensorrt_config(service_name: str, model_id: str,
                           max_batch_size: int = 8,
                           precision: str = "float16") -> LocalGPUConfig:
    """Create TensorRT-LLM configuration"""
    return LocalGPUConfig(
        service_name=service_name,
        service_type=LocalServiceType.LLM,
        model_id=model_id,
        backend=LocalBackend.TENSORRT_LLM,
        max_batch_size=max_batch_size,
        model_precision=precision,
        tensor_parallel_size=1,
        tensorrt_args={
            "enable_kv_cache_reuse": True,
            "remove_input_padding": True,
            "use_gpt_attention_plugin": True
        }
    )


def create_transformers_config(service_name: str, model_id: str,
                               precision: str = "float16",
                               quantization: Optional[str] = None) -> LocalGPUConfig:
    """Create HuggingFace Transformers configuration"""
    return LocalGPUConfig(
        service_name=service_name,
        service_type=LocalServiceType.LLM,
        model_id=model_id,
        backend=LocalBackend.TRANSFORMERS,
        model_precision=precision,
        quantization=quantization,
        max_batch_size=4,  # Lower for memory efficiency
        transformers_args={
            "device_map": "auto",
            "torch_dtype": "auto",
            "low_cpu_mem_usage": True
        }
    )


def create_vision_config(service_name: str, model_id: str,
                         backend: LocalBackend = LocalBackend.TRANSFORMERS) -> LocalGPUConfig:
    """Create vision model configuration"""
    return LocalGPUConfig(
        service_name=service_name,
        service_type=LocalServiceType.VISION,
        model_id=model_id,
        backend=backend,
        max_batch_size=16,
        model_precision="float16",
        gpu_memory_utilization=0.8  # Lower for vision models
    )


def create_embedding_config(service_name: str, model_id: str,
                            max_batch_size: int = 32) -> LocalGPUConfig:
    """Create embedding model configuration"""
    return LocalGPUConfig(
        service_name=service_name,
        service_type=LocalServiceType.EMBEDDING,
        model_id=model_id,
        backend=LocalBackend.TRANSFORMERS,
        max_batch_size=max_batch_size,
        model_precision="float16",
        gpu_memory_utilization=0.7,  # Lower memory usage for embeddings
        cpu_offload=False
    )