isa-model 0.3.9__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/__init__.py +1 -1
- isa_model/client.py +732 -565
- isa_model/core/cache/redis_cache.py +401 -0
- isa_model/core/config/config_manager.py +53 -10
- isa_model/core/config.py +1 -1
- isa_model/core/database/__init__.py +1 -0
- isa_model/core/database/migrations.py +277 -0
- isa_model/core/database/supabase_client.py +123 -0
- isa_model/core/models/__init__.py +37 -0
- isa_model/core/models/model_billing_tracker.py +60 -88
- isa_model/core/models/model_manager.py +36 -18
- isa_model/core/models/model_repo.py +44 -38
- isa_model/core/models/model_statistics_tracker.py +234 -0
- isa_model/core/models/model_storage.py +0 -1
- isa_model/core/models/model_version_manager.py +959 -0
- isa_model/core/pricing_manager.py +2 -249
- isa_model/core/resilience/circuit_breaker.py +366 -0
- isa_model/core/security/secrets.py +358 -0
- isa_model/core/services/__init__.py +2 -4
- isa_model/core/services/intelligent_model_selector.py +101 -370
- isa_model/core/storage/hf_storage.py +1 -1
- isa_model/core/types.py +7 -0
- isa_model/deployment/cloud/modal/isa_audio_chatTTS_service.py +520 -0
- isa_model/deployment/cloud/modal/isa_audio_fish_service.py +0 -0
- isa_model/deployment/cloud/modal/isa_audio_openvoice_service.py +758 -0
- isa_model/deployment/cloud/modal/isa_audio_service_v2.py +1044 -0
- isa_model/deployment/cloud/modal/isa_embed_rerank_service.py +296 -0
- isa_model/deployment/cloud/modal/isa_video_hunyuan_service.py +423 -0
- isa_model/deployment/cloud/modal/isa_vision_ocr_service.py +519 -0
- isa_model/deployment/cloud/modal/isa_vision_qwen25_service.py +709 -0
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +467 -323
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +607 -180
- isa_model/deployment/cloud/modal/isa_vision_ui_service_optimized.py +660 -0
- isa_model/deployment/core/deployment_manager.py +6 -4
- isa_model/deployment/services/auto_hf_modal_deployer.py +894 -0
- isa_model/eval/benchmarks/__init__.py +27 -0
- isa_model/eval/benchmarks/multimodal_datasets.py +460 -0
- isa_model/eval/benchmarks.py +244 -12
- isa_model/eval/evaluators/__init__.py +8 -2
- isa_model/eval/evaluators/audio_evaluator.py +727 -0
- isa_model/eval/evaluators/embedding_evaluator.py +742 -0
- isa_model/eval/evaluators/vision_evaluator.py +564 -0
- isa_model/eval/example_evaluation.py +395 -0
- isa_model/eval/factory.py +272 -5
- isa_model/eval/isa_benchmarks.py +700 -0
- isa_model/eval/isa_integration.py +582 -0
- isa_model/eval/metrics.py +159 -6
- isa_model/eval/tests/unit/test_basic.py +396 -0
- isa_model/inference/ai_factory.py +44 -8
- isa_model/inference/services/audio/__init__.py +21 -0
- isa_model/inference/services/audio/base_realtime_service.py +225 -0
- isa_model/inference/services/audio/isa_tts_service.py +0 -0
- isa_model/inference/services/audio/openai_realtime_service.py +320 -124
- isa_model/inference/services/audio/openai_stt_service.py +32 -6
- isa_model/inference/services/base_service.py +17 -1
- isa_model/inference/services/embedding/__init__.py +13 -0
- isa_model/inference/services/embedding/base_embed_service.py +111 -8
- isa_model/inference/services/embedding/isa_embed_service.py +305 -0
- isa_model/inference/services/embedding/openai_embed_service.py +2 -4
- isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
- isa_model/inference/services/img/__init__.py +2 -2
- isa_model/inference/services/img/base_image_gen_service.py +24 -7
- isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
- isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
- isa_model/inference/services/img/services/replicate_flux.py +226 -0
- isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
- isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
- isa_model/inference/services/img/tests/test_img_client.py +297 -0
- isa_model/inference/services/llm/base_llm_service.py +30 -6
- isa_model/inference/services/llm/helpers/llm_adapter.py +63 -9
- isa_model/inference/services/llm/ollama_llm_service.py +2 -1
- isa_model/inference/services/llm/openai_llm_service.py +652 -55
- isa_model/inference/services/llm/yyds_llm_service.py +2 -1
- isa_model/inference/services/vision/__init__.py +5 -5
- isa_model/inference/services/vision/base_vision_service.py +118 -185
- isa_model/inference/services/vision/helpers/image_utils.py +11 -5
- isa_model/inference/services/vision/isa_vision_service.py +573 -0
- isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
- isa_model/serving/api/fastapi_server.py +88 -16
- isa_model/serving/api/middleware/auth.py +311 -0
- isa_model/serving/api/middleware/security.py +278 -0
- isa_model/serving/api/routes/analytics.py +486 -0
- isa_model/serving/api/routes/deployments.py +339 -0
- isa_model/serving/api/routes/evaluations.py +579 -0
- isa_model/serving/api/routes/logs.py +430 -0
- isa_model/serving/api/routes/settings.py +582 -0
- isa_model/serving/api/routes/unified.py +324 -165
- isa_model/serving/api/startup.py +304 -0
- isa_model/serving/modal_proxy_server.py +249 -0
- isa_model/training/__init__.py +100 -6
- isa_model/training/core/__init__.py +4 -1
- isa_model/training/examples/intelligent_training_example.py +281 -0
- isa_model/training/intelligent/__init__.py +25 -0
- isa_model/training/intelligent/decision_engine.py +643 -0
- isa_model/training/intelligent/intelligent_factory.py +888 -0
- isa_model/training/intelligent/knowledge_base.py +751 -0
- isa_model/training/intelligent/resource_optimizer.py +839 -0
- isa_model/training/intelligent/task_classifier.py +576 -0
- isa_model/training/storage/__init__.py +24 -0
- isa_model/training/storage/core_integration.py +439 -0
- isa_model/training/storage/training_repository.py +552 -0
- isa_model/training/storage/training_storage.py +628 -0
- {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/METADATA +13 -1
- isa_model-0.4.0.dist-info/RECORD +182 -0
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
- isa_model/deployment/cloud/modal/register_models.py +0 -321
- isa_model/inference/adapter/unified_api.py +0 -248
- isa_model/inference/services/helpers/stacked_config.py +0 -148
- isa_model/inference/services/img/flux_professional_service.py +0 -603
- isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/others/table_transformer_service.py +0 -61
- isa_model/inference/services/vision/doc_analysis_service.py +0 -640
- isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/vision/ui_analysis_service.py +0 -823
- isa_model/scripts/inference_tracker.py +0 -283
- isa_model/scripts/mlflow_manager.py +0 -379
- isa_model/scripts/model_registry.py +0 -465
- isa_model/scripts/register_models.py +0 -370
- isa_model/scripts/register_models_with_embeddings.py +0 -510
- isa_model/scripts/start_mlflow.py +0 -95
- isa_model/scripts/training_tracker.py +0 -257
- isa_model-0.3.9.dist-info/RECORD +0 -138
- {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/WHEEL +0 -0
- {isa_model-0.3.9.dist-info → isa_model-0.4.0.dist-info}/top_level.txt +0 -0
isa_model/deployment/services/auto_hf_modal_deployer.py (new file)
@@ -0,0 +1,894 @@
+"""
+Automated HuggingFace to Modal Deployment Service
+
+This service automatically generates and deploys HuggingFace models to Modal
+with optimized configurations based on model type and architecture.
+"""
+
+import os
+import json
+import time
+import requests
+import tempfile
+from typing import Dict, List, Optional, Any, Union
+from pathlib import Path
+from dataclasses import dataclass
+from huggingface_hub import HfApi, model_info
+import logging
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class ModelConfig:
+    """Configuration for a HuggingFace model deployment"""
+    model_id: str
+    model_type: str  # text, vision, audio, image, embedding
+    architecture: str
+    parameters: str
+    gpu_requirements: str
+    memory_gb: int
+    container_memory_mb: int
+    python_version: str = "3.10"
+    dependencies: List[str] = None
+    capabilities: List[str] = None
+    max_tokens: int = 2048
+    estimated_cost_per_hour: float = 0.0
+
+class HuggingFaceModalDeployer:
+    """
+    Service to automatically deploy HuggingFace models to Modal
+    """
+
+    def __init__(self):
+        self.hf_api = HfApi()
+        self.supported_architectures = {
+            # Text/LLM models
+            'llama': {'type': 'text', 'gpu': 'A100', 'memory': 32768, 'cost': 4.0},
+            'mistral': {'type': 'text', 'gpu': 'A100', 'memory': 24576, 'cost': 4.0},
+            'qwen': {'type': 'text', 'gpu': 'A100', 'memory': 24576, 'cost': 4.0},
+            'gemma': {'type': 'text', 'gpu': 'A10G', 'memory': 16384, 'cost': 1.2},
+            'phi': {'type': 'text', 'gpu': 'A10G', 'memory': 16384, 'cost': 1.2},
+            'gpt': {'type': 'text', 'gpu': 'A100', 'memory': 32768, 'cost': 4.0},
+
+            # Vision models
+            'clip': {'type': 'vision', 'gpu': 'A10G', 'memory': 16384, 'cost': 1.2},
+            'blip': {'type': 'vision', 'gpu': 'A10G', 'memory': 16384, 'cost': 1.2},
+            'qwen2_vl': {'type': 'vision', 'gpu': 'A100', 'memory': 32768, 'cost': 4.0},
+            'llava': {'type': 'vision', 'gpu': 'A100', 'memory': 24576, 'cost': 4.0},
+            'fuyu': {'type': 'vision', 'gpu': 'A100', 'memory': 32768, 'cost': 4.0},
+
+            # Audio models
+            'whisper': {'type': 'audio', 'gpu': 'A10G', 'memory': 8192, 'cost': 1.2},
+            'wav2vec2': {'type': 'audio', 'gpu': 'A10G', 'memory': 8192, 'cost': 1.2},
+            'musicgen': {'type': 'audio', 'gpu': 'A100', 'memory': 16384, 'cost': 4.0},
+            'bark': {'type': 'audio', 'gpu': 'A100', 'memory': 16384, 'cost': 4.0},
+
+            # Image generation models
+            'stable-diffusion': {'type': 'image', 'gpu': 'A100', 'memory': 16384, 'cost': 4.0},
+            'flux': {'type': 'image', 'gpu': 'A100', 'memory': 24576, 'cost': 4.0},
+            'dall-e': {'type': 'image', 'gpu': 'A100', 'memory': 16384, 'cost': 4.0},
+
+            # Embedding models
+            'sentence-transformers': {'type': 'embedding', 'gpu': 'A10G', 'memory': 8192, 'cost': 1.2},
+            'e5': {'type': 'embedding', 'gpu': 'A10G', 'memory': 8192, 'cost': 1.2},
+            'bge': {'type': 'embedding', 'gpu': 'A10G', 'memory': 8192, 'cost': 1.2},
+        }
+
+    def analyze_model(self, model_id: str) -> ModelConfig:
+        """
+        Analyze a HuggingFace model and determine deployment configuration
+
+        Args:
+            model_id: HuggingFace model ID (e.g., "microsoft/DialoGPT-medium")
+
+        Returns:
+            ModelConfig with deployment settings
+        """
+        try:
+            # Get model information from HuggingFace
+            info = model_info(model_id)
+
+            # Extract model details
+            architecture = self._detect_architecture(model_id, info)
+            model_type = self._determine_model_type(model_id, info, architecture)
+            parameters = self._estimate_parameters(info)
+
+            # Get deployment requirements based on architecture
+            requirements = self.supported_architectures.get(
+                architecture.lower(),
+                {'type': 'text', 'gpu': 'A10G', 'memory': 16384, 'cost': 1.2}
+            )
+
+            # Generate capabilities based on model type and tags
+            capabilities = self._generate_capabilities(model_type, info)
+
+            # Generate dependencies based on model type
+            dependencies = self._generate_dependencies(model_type, architecture, info)
+
+            return ModelConfig(
+                model_id=model_id,
+                model_type=model_type,
+                architecture=architecture,
+                parameters=parameters,
+                gpu_requirements=requirements['gpu'],
+                memory_gb=requirements['memory'] // 1024,
+                container_memory_mb=requirements['memory'],
+                dependencies=dependencies,
+                capabilities=capabilities,
+                estimated_cost_per_hour=requirements['cost']
+            )
+
+        except Exception as e:
+            logger.error(f"Error analyzing model {model_id}: {e}")
+            raise
+
+    def _detect_architecture(self, model_id: str, info) -> str:
+        """Detect model architecture from model ID and metadata"""
+        model_id_lower = model_id.lower()
+
+        # Check for specific architectures in model ID
+        for arch in self.supported_architectures.keys():
+            if arch.replace('_', '-') in model_id_lower or arch.replace('-', '_') in model_id_lower:
+                return arch
+
+        # Check model tags and config
+        if hasattr(info, 'tags'):
+            for tag in info.tags:
+                tag_lower = tag.lower()
+                for arch in self.supported_architectures.keys():
+                    if arch in tag_lower:
+                        return arch
+
+        # Check config architectures
+        if hasattr(info, 'config') and info.config:
+            config_str = str(info.config).lower()
+            for arch in self.supported_architectures.keys():
+                if arch in config_str:
+                    return arch
+
+        # Default fallback
+        return 'transformers'
+
+    def _determine_model_type(self, model_id: str, info, architecture: str) -> str:
+        """Determine the primary model type"""
+        model_id_lower = model_id.lower()
+
+        # Check for specific model types in ID
+        if any(x in model_id_lower for x in ['vision', 'clip', 'blip', 'llava', 'qwen2-vl', 'fuyu']):
+            return 'vision'
+        elif any(x in model_id_lower for x in ['whisper', 'wav2vec', 'audio', 'speech', 'tts', 'stt']):
+            return 'audio'
+        elif any(x in model_id_lower for x in ['stable-diffusion', 'sd-', 'flux', 'dall-e', 'imagen']):
+            return 'image'
+        elif any(x in model_id_lower for x in ['embed', 'sentence-transformer', 'e5-', 'bge-']):
+            return 'embedding'
+
+        # Check tags
+        if hasattr(info, 'tags'):
+            for tag in info.tags:
+                tag_lower = tag.lower()
+                if tag_lower in ['computer-vision', 'image-classification', 'object-detection']:
+                    return 'vision'
+                elif tag_lower in ['automatic-speech-recognition', 'text-to-speech', 'audio']:
+                    return 'audio'
+                elif tag_lower in ['text-to-image', 'image-generation']:
+                    return 'image'
+                elif tag_lower in ['sentence-similarity', 'feature-extraction']:
+                    return 'embedding'
+
+        # Use architecture mapping
+        if architecture in self.supported_architectures:
+            return self.supported_architectures[architecture]['type']
+
+        return 'text'  # Default
+
+    def _estimate_parameters(self, info) -> str:
+        """Estimate model parameters from model info"""
+        if hasattr(info, 'config') and info.config:
+            config = info.config
+            if isinstance(config, dict):
+                # Try different parameter estimation methods
+                if 'num_parameters' in config:
+                    params = config['num_parameters']
+                elif 'd_model' in config and 'n_layer' in config:
+                    # Transformer estimation
+                    d_model = config.get('d_model', 768)
+                    n_layer = config.get('n_layer', 12)
+                    vocab_size = config.get('vocab_size', 50000)
+                    params = (d_model * d_model * 4 * n_layer) + (vocab_size * d_model)
+                else:
+                    return 'Unknown'
+
+                # Format parameters
+                if params > 1e9:
+                    return f"{params/1e9:.1f}B"
+                elif params > 1e6:
+                    return f"{params/1e6:.0f}M"
+                else:
+                    return f"{params/1e3:.0f}K"
+
+        return 'Unknown'
+
+    def _generate_capabilities(self, model_type: str, info) -> List[str]:
+        """Generate capabilities list based on model type"""
+        base_capabilities = {
+            'text': ['text_generation', 'chat', 'completion'],
+            'vision': ['image_analysis', 'image_understanding', 'visual_question_answering'],
+            'audio': ['speech_recognition', 'audio_processing'],
+            'image': ['image_generation', 'text_to_image'],
+            'embedding': ['text_embedding', 'similarity_search', 'semantic_search']
+        }
+
+        capabilities = base_capabilities.get(model_type, ['general_ai'])
+
+        # Add specific capabilities based on tags
+        if hasattr(info, 'tags'):
+            for tag in info.tags:
+                if tag == 'conversational':
+                    capabilities.append('chat')
+                elif tag == 'question-answering':
+                    capabilities.append('question_answering')
+                elif tag == 'summarization':
+                    capabilities.append('text_summarization')
+                elif tag == 'translation':
+                    capabilities.append('translation')
+
+        return list(set(capabilities))
+
+    def _generate_dependencies(self, model_type: str, architecture: str, info) -> List[str]:
+        """Generate Python dependencies based on model type and architecture"""
+        base_deps = [
+            "torch>=2.0.0",
+            "transformers>=4.35.0",
+            "accelerate>=0.24.0",
+            "numpy>=1.24.0",
+            "requests>=2.31.0",
+            "httpx>=0.26.0",
+            "pydantic>=2.0.0",
+        ]
+
+        type_deps = {
+            'vision': [
+                "Pillow>=10.0.0",
+                "opencv-python>=4.8.0",
+                "torchvision>=0.15.0",
+            ],
+            'audio': [
+                "librosa>=0.10.0",
+                "soundfile>=0.12.0",
+                "torchaudio>=2.0.0",
+            ],
+            'image': [
+                "diffusers>=0.21.0",
+                "Pillow>=10.0.0",
+                "controlnet-aux>=0.3.0",
+            ],
+            'embedding': [
+                "sentence-transformers>=2.2.0",
+                "faiss-cpu>=1.7.0",
+            ]
+        }
+
+        arch_deps = {
+            'whisper': ["openai-whisper>=20231117"],
+            'stable-diffusion': ["diffusers>=0.21.0", "controlnet-aux>=0.3.0"],
+            'qwen2_vl': ["qwen-vl-utils", "av", "decord"],
+            'llava': ["llava>=1.1.0"],
+        }
+
+        deps = base_deps.copy()
+        deps.extend(type_deps.get(model_type, []))
+        deps.extend(arch_deps.get(architecture, []))
+
+        return list(set(deps))
+
+    def generate_modal_service(self, config: ModelConfig) -> str:
+        """
+        Generate Modal deployment code for a HuggingFace model
+
+        Args:
+            config: Model configuration
+
+        Returns:
+            Generated Python code for Modal deployment
+        """
+        service_name = config.model_id.replace('/', '_').replace('-', '_').lower()
+
+        template = f'''"""
+{config.model_id} Modal Service
+
+Automatically generated deployment for {config.model_id}
+- Model Type: {config.model_type}
+- Architecture: {config.architecture}
+- Parameters: {config.parameters}
+- Capabilities: {', '.join(config.capabilities)}
+"""
+
+import modal
+import time
+import json
+import os
+import logging
+import base64
+import tempfile
+from typing import Dict, List, Optional, Any, Union
+from pathlib import Path
+
+# Define Modal application
+app = modal.App("isa-{service_name}")
+
+# Define Modal container image
+image = (
+    modal.Image.debian_slim(python_version="{config.python_version}")
+    .pip_install([
+{self._format_dependencies(config.dependencies)}
+    ])
+    .apt_install([
+        "ffmpeg",
+        "libsm6",
+        "libxext6",
+        "libxrender-dev",
+        "libglib2.0-0",
+        "libgl1-mesa-glx",
+        "git-lfs"
+    ])
+    .env({{
+        "TRANSFORMERS_CACHE": "/models",
+        "TORCH_HOME": "/models/torch",
+        "HF_HOME": "/models",
+        "CUDA_VISIBLE_DEVICES": "0",
+        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512"
+    }})
+)
+
+# Model Service
+@app.cls(
+    gpu="{config.gpu_requirements}",
+    image=image,
+    memory={config.container_memory_mb},
+    timeout=1800,
+    scaledown_window=300,
+    min_containers=0,
+    max_containers=5,
+)
+class {service_name.title().replace('_', '')}Service:
+    """
+    {config.model_id} Service
+
+    Model: {config.model_id}
+    Architecture: {config.architecture}
+    Parameters: {config.parameters}
+    Capabilities: {', '.join(config.capabilities)}
+    """
+
+    @modal.enter()
+    def load_model(self):
+        """Load {config.model_id} model and dependencies"""
+        print("Loading {config.model_id}...")
+        start_time = time.time()
+
+        self.model = None
+        self.tokenizer = None
+        self.processor = None
+        self.logger = logging.getLogger(__name__)
+        self.request_count = 0
+        self.total_processing_time = 0.0
+
+        try:
+            import torch
+            from transformers import AutoModel, AutoTokenizer, AutoProcessor
+
+            model_name = "{config.model_id}"
+
+            print(f"Loading model: {{model_name}}")
+
+            # Load tokenizer/processor
+            try:
+                self.processor = AutoProcessor.from_pretrained(model_name)
+                print("✅ Processor loaded")
+            except:
+                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+                print("✅ Tokenizer loaded")
+
+            # Load model with optimizations
+            self.model = AutoModel.from_pretrained(
+                model_name,
+                torch_dtype=torch.float16,
+                device_map="auto",
+                low_cpu_mem_usage=True,
+                use_cache=True
+            )
+
+            self.model.eval()
+
+            # Try to compile model for faster inference
+            try:
+                self.model = torch.compile(self.model, mode="reduce-overhead")
+                print("✅ Model compiled for faster inference")
+            except Exception as e:
+                print(f"⚠️ Model compilation failed: {{e}}")
+
+            load_time = time.time() - start_time
+            print(f"{config.model_id} loaded successfully in {{load_time:.2f}}s")
+
+            self.models_loaded = True
+
+        except Exception as e:
+            print(f"Model loading failed: {{e}}")
+            import traceback
+            traceback.print_exc()
+            self.models_loaded = False
+
+{self._generate_inference_methods(config)}
+
+    @modal.method()
+    def health_check(self) -> Dict[str, Any]:
+        """Health check endpoint"""
+        return {{
+            'status': 'healthy',
+            'service': 'isa-{service_name}',
+            'provider': 'ISA',
+            'models_loaded': self.models_loaded,
+            'model': '{config.model_id}',
+            'architecture': '{config.architecture}',
+            'timestamp': time.time(),
+            'gpu': '{config.gpu_requirements}',
+            'memory_usage': '{config.memory_gb}GB',
+            'request_count': self.request_count,
+            'capabilities': {config.capabilities}
+        }}
+
+# Deployment functions
+@app.function()
+def deploy_info():
+    """Deployment information"""
+    return {{
+        'service': 'isa-{service_name}',
+        'version': '1.0.0',
+        'description': 'ISA {config.model_id} service',
+        'model': '{config.model_id}',
+        'architecture': '{config.architecture}',
+        'gpu': '{config.gpu_requirements}',
+        'capabilities': {config.capabilities},
+        'deployment_time': time.time()
+    }}
+
+if __name__ == "__main__":
+    print("ISA {config.model_id} Service - Modal Deployment")
+    print("Deploy with: modal deploy {service_name}_service.py")
+    print()
+    print("Model: {config.model_id}")
+    print("Architecture: {config.architecture}")
+    print("Parameters: {config.parameters}")
+    print("GPU: {config.gpu_requirements}")
+    print("Capabilities: {', '.join(config.capabilities)}")
+'''
+
+        return template
+
+    def _format_dependencies(self, dependencies: List[str]) -> str:
+        """Format dependencies for template"""
+        formatted = []
+        for dep in dependencies:
+            formatted.append(f' "{dep}",')
+        return '\n'.join(formatted)
+
+    def _generate_inference_methods(self, config: ModelConfig) -> str:
+        """Generate inference methods based on model type"""
+        methods = {
+            'text': self._text_generation_method,
+            'vision': self._vision_analysis_method,
+            'audio': self._audio_processing_method,
+            'image': self._image_generation_method,
+            'embedding': self._embedding_method
+        }
+
+        return methods.get(config.model_type, self._generic_inference_method)(config)
+
+    def _text_generation_method(self, config: ModelConfig) -> str:
+        return '''
+    @modal.method()
+    def generate_text(
+        self,
+        prompt: str,
+        max_tokens: int = 512,
+        temperature: float = 0.7,
+        top_p: float = 0.9
+    ) -> Dict[str, Any]:
+        """Generate text using the model"""
+        start_time = time.time()
+        self.request_count += 1
+
+        try:
+            if not self.models_loaded or not self.model:
+                raise RuntimeError("Model not loaded")
+
+            # Tokenize input
+            inputs = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                padding=True,
+                truncation=True
+            ).to("cuda")
+
+            # Generate response
+            import torch
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=max_tokens,
+                    temperature=temperature,
+                    top_p=top_p,
+                    do_sample=True,
+                    pad_token_id=self.tokenizer.eos_token_id
+                )
+
+            # Decode response
+            response = self.tokenizer.decode(
+                outputs[0][inputs.input_ids.shape[1]:],
+                skip_special_tokens=True
+            )
+
+            processing_time = time.time() - start_time
+
+            return {
+                'success': True,
+                'text': response,
+                'processing_time': processing_time,
+                'model': self.model.config.name_or_path
+            }
+
+        except Exception as e:
+            return {
+                'success': False,
+                'error': str(e),
+                'processing_time': time.time() - start_time
+            }'''
+
+    def _vision_analysis_method(self, config: ModelConfig) -> str:
+        return '''
+    @modal.method()
+    def analyze_image(
+        self,
+        image_b64: str,
+        prompt: str = "Describe this image.",
+        max_tokens: int = 512
+    ) -> Dict[str, Any]:
+        """Analyze image using the model"""
+        start_time = time.time()
+        self.request_count += 1
+
+        try:
+            if not self.models_loaded or not self.model:
+                raise RuntimeError("Model not loaded")
+
+            # Decode image
+            image_data = base64.b64decode(image_b64)
+
+            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
+                tmp_file.write(image_data)
+                tmp_file.flush()
+
+            from PIL import Image
+            image = Image.open(tmp_file.name)
+
+            # Process inputs
+            if self.processor:
+                inputs = self.processor(text=prompt, images=image, return_tensors="pt")
+            else:
+                # Fallback for models without processor
+                inputs = self.tokenizer(prompt, return_tensors="pt")
+
+            inputs = inputs.to("cuda")
+
+            # Generate response
+            import torch
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=max_tokens,
+                    do_sample=True
+                )
+
+            # Decode response
+            if self.processor:
+                response = self.processor.decode(outputs[0], skip_special_tokens=True)
+            else:
+                response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+            os.unlink(tmp_file.name)
+
+            processing_time = time.time() - start_time
+
+            return {
+                'success': True,
+                'text': response,
+                'processing_time': processing_time,
+                'model': self.model.config.name_or_path
+            }
+
+        except Exception as e:
+            return {
+                'success': False,
+                'error': str(e),
+                'processing_time': time.time() - start_time
+            }'''
+
+    def _audio_processing_method(self, config: ModelConfig) -> str:
+        return '''
+    @modal.method()
+    def process_audio(
+        self,
+        audio_b64: str,
+        task: str = "transcribe"
+    ) -> Dict[str, Any]:
+        """Process audio using the model"""
+        start_time = time.time()
+        self.request_count += 1
+
+        try:
+            if not self.models_loaded or not self.model:
+                raise RuntimeError("Model not loaded")
+
+            # Decode audio
+            audio_data = base64.b64decode(audio_b64)
+
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+                tmp_file.write(audio_data)
+                tmp_file.flush()
+
+            # Process audio
+            if self.processor:
+                inputs = self.processor(tmp_file.name, return_tensors="pt")
+            else:
+                import librosa
+                audio, sr = librosa.load(tmp_file.name)
+                inputs = self.tokenizer(audio, return_tensors="pt")
+
+            inputs = inputs.to("cuda")
+
+            # Generate response
+            import torch
+            with torch.no_grad():
+                outputs = self.model.generate(**inputs)
+
+            # Decode response
+            if self.processor:
+                response = self.processor.decode(outputs[0], skip_special_tokens=True)
+            else:
+                response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+            os.unlink(tmp_file.name)
+
+            processing_time = time.time() - start_time
+
+            return {
+                'success': True,
+                'text': response,
+                'processing_time': processing_time,
+                'model': self.model.config.name_or_path
+            }
+
+        except Exception as e:
+            return {
+                'success': False,
+                'error': str(e),
+                'processing_time': time.time() - start_time
+            }'''
+
+    def _image_generation_method(self, config: ModelConfig) -> str:
+        return '''
+    @modal.method()
+    def generate_image(
+        self,
+        prompt: str,
+        width: int = 512,
+        height: int = 512,
+        num_inference_steps: int = 20
+    ) -> Dict[str, Any]:
+        """Generate image using the model"""
+        start_time = time.time()
+        self.request_count += 1
+
+        try:
+            if not self.models_loaded or not self.model:
+                raise RuntimeError("Model not loaded")
+
+            # Generate image
+            image = self.model(
+                prompt=prompt,
+                width=width,
+                height=height,
+                num_inference_steps=num_inference_steps
+            ).images[0]
+
+            # Convert to base64
+            import io
+            buffer = io.BytesIO()
+            image.save(buffer, format="PNG")
+            image_b64 = base64.b64encode(buffer.getvalue()).decode()
+
+            processing_time = time.time() - start_time
+
+            return {
+                'success': True,
+                'image': image_b64,
+                'processing_time': processing_time,
+                'model': self.model.config.name_or_path
+            }
+
+        except Exception as e:
+            return {
+                'success': False,
+                'error': str(e),
+                'processing_time': time.time() - start_time
+            }'''
+
+    def _embedding_method(self, config: ModelConfig) -> str:
+        return '''
+    @modal.method()
+    def embed_text(
+        self,
+        text: Union[str, List[str]]
+    ) -> Dict[str, Any]:
+        """Generate embeddings for text"""
+        start_time = time.time()
+        self.request_count += 1
+
+        try:
+            if not self.models_loaded or not self.model:
+                raise RuntimeError("Model not loaded")
+
+            # Generate embeddings
+            if hasattr(self.model, 'encode'):
+                embeddings = self.model.encode(text)
+            else:
+                inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+                inputs = inputs.to("cuda")
+
+                import torch
+                with torch.no_grad():
+                    outputs = self.model(**inputs)
+                    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
+
+            processing_time = time.time() - start_time
+
+            return {
+                'success': True,
+                'embeddings': embeddings.tolist(),
+                'processing_time': processing_time,
+                'model': self.model.config.name_or_path
+            }
+
+        except Exception as e:
+            return {
+                'success': False,
+                'error': str(e),
+                'processing_time': time.time() - start_time
+            }'''
+
+    def _generic_inference_method(self, config: ModelConfig) -> str:
+        return '''
+    @modal.method()
+    def inference(
+        self,
+        input_data: str,
+        task: str = "generate",
+        **kwargs
+    ) -> Dict[str, Any]:
+        """Generic inference method"""
+        start_time = time.time()
+        self.request_count += 1
+
+        try:
+            if not self.models_loaded or not self.model:
+                raise RuntimeError("Model not loaded")
+
+            # Process input
+            if self.processor:
+                inputs = self.processor(input_data, return_tensors="pt")
+            else:
+                inputs = self.tokenizer(input_data, return_tensors="pt")
+
+            inputs = inputs.to("cuda")
+
+            # Generate response
+            import torch
+            with torch.no_grad():
+                outputs = self.model.generate(**inputs, **kwargs)
+
+            # Decode response
+            if self.processor:
+                response = self.processor.decode(outputs[0], skip_special_tokens=True)
+            else:
+                response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+            processing_time = time.time() - start_time
+
+            return {
+                'success': True,
+                'output': response,
+                'processing_time': processing_time,
+                'model': self.model.config.name_or_path
+            }
+
+        except Exception as e:
+            return {
+                'success': False,
+                'error': str(e),
+                'processing_time': time.time() - start_time
+            }'''
+
+    def deploy_model(self, model_id: str, deploy: bool = False) -> Dict[str, Any]:
+        """
+        Analyze and optionally deploy a HuggingFace model to Modal
+
+        Args:
+            model_id: HuggingFace model ID
+            deploy: Whether to actually deploy to Modal
+
+        Returns:
+            Deployment result with service code
+        """
+        try:
+            # Analyze model
+            config = self.analyze_model(model_id)
+
+            # Generate Modal service code
+            service_code = self.generate_modal_service(config)
+
+            # Save service code to file
+            service_name = model_id.replace('/', '_').replace('-', '_').lower()
+            output_dir = Path("/Users/xenodennis/Documents/Fun/isA_Model/isa_model/deployment/cloud/modal")
+            output_file = output_dir / f"auto_{service_name}_service.py"
+
+            with open(output_file, 'w') as f:
+                f.write(service_code)
+
+            result = {
+                'success': True,
+                'model_id': model_id,
+                'config': config.__dict__,
+                'service_file': str(output_file),
+                'service_code': service_code,
+                'estimated_cost_per_hour': config.estimated_cost_per_hour,
+                'deployment_command': f"modal deploy {output_file}",
+                'deployed': False
+            }
+
+            # Optional: Actually deploy to Modal
+            if deploy:
+                try:
+                    import subprocess
+                    deployment_result = subprocess.run(
+                        ['modal', 'deploy', str(output_file)],
+                        capture_output=True,
+                        text=True,
+                        timeout=300
+                    )
+
+                    if deployment_result.returncode == 0:
+                        result['deployed'] = True
+                        result['deployment_output'] = deployment_result.stdout
+                    else:
+                        result['deployment_error'] = deployment_result.stderr
+
+                except Exception as e:
+                    result['deployment_error'] = str(e)
+
+            return result
+
+        except Exception as e:
+            return {
+                'success': False,
+                'error': str(e),
+                'model_id': model_id
+            }
+
+# Example usage
+if __name__ == "__main__":
+    deployer = HuggingFaceModalDeployer()
+
+    # Example: Deploy a text model
+    result = deployer.deploy_model("microsoft/DialoGPT-medium")
+    print(json.dumps(result, indent=2, default=str))