isa-model 0.3.91__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/client.py +732 -573
- isa_model/core/cache/redis_cache.py +401 -0
- isa_model/core/config/config_manager.py +53 -10
- isa_model/core/config.py +1 -1
- isa_model/core/database/__init__.py +1 -0
- isa_model/core/database/migrations.py +277 -0
- isa_model/core/database/supabase_client.py +123 -0
- isa_model/core/models/__init__.py +37 -0
- isa_model/core/models/model_billing_tracker.py +60 -88
- isa_model/core/models/model_manager.py +36 -18
- isa_model/core/models/model_repo.py +44 -38
- isa_model/core/models/model_statistics_tracker.py +234 -0
- isa_model/core/models/model_storage.py +0 -1
- isa_model/core/models/model_version_manager.py +959 -0
- isa_model/core/pricing_manager.py +2 -249
- isa_model/core/resilience/circuit_breaker.py +366 -0
- isa_model/core/security/secrets.py +358 -0
- isa_model/core/services/__init__.py +2 -4
- isa_model/core/services/intelligent_model_selector.py +101 -370
- isa_model/core/storage/hf_storage.py +1 -1
- isa_model/core/types.py +7 -0
- isa_model/deployment/cloud/modal/isa_audio_chatTTS_service.py +520 -0
- isa_model/deployment/cloud/modal/isa_audio_fish_service.py +0 -0
- isa_model/deployment/cloud/modal/isa_audio_openvoice_service.py +758 -0
- isa_model/deployment/cloud/modal/isa_audio_service_v2.py +1044 -0
- isa_model/deployment/cloud/modal/isa_embed_rerank_service.py +296 -0
- isa_model/deployment/cloud/modal/isa_video_hunyuan_service.py +423 -0
- isa_model/deployment/cloud/modal/isa_vision_ocr_service.py +519 -0
- isa_model/deployment/cloud/modal/isa_vision_qwen25_service.py +709 -0
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +467 -323
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +607 -180
- isa_model/deployment/cloud/modal/isa_vision_ui_service_optimized.py +660 -0
- isa_model/deployment/core/deployment_manager.py +6 -4
- isa_model/deployment/services/auto_hf_modal_deployer.py +894 -0
- isa_model/eval/benchmarks/__init__.py +27 -0
- isa_model/eval/benchmarks/multimodal_datasets.py +460 -0
- isa_model/eval/benchmarks.py +244 -12
- isa_model/eval/evaluators/__init__.py +8 -2
- isa_model/eval/evaluators/audio_evaluator.py +727 -0
- isa_model/eval/evaluators/embedding_evaluator.py +742 -0
- isa_model/eval/evaluators/vision_evaluator.py +564 -0
- isa_model/eval/example_evaluation.py +395 -0
- isa_model/eval/factory.py +272 -5
- isa_model/eval/isa_benchmarks.py +700 -0
- isa_model/eval/isa_integration.py +582 -0
- isa_model/eval/metrics.py +159 -6
- isa_model/eval/tests/unit/test_basic.py +396 -0
- isa_model/inference/ai_factory.py +44 -8
- isa_model/inference/services/audio/__init__.py +21 -0
- isa_model/inference/services/audio/base_realtime_service.py +225 -0
- isa_model/inference/services/audio/isa_tts_service.py +0 -0
- isa_model/inference/services/audio/openai_realtime_service.py +320 -124
- isa_model/inference/services/audio/openai_stt_service.py +32 -6
- isa_model/inference/services/base_service.py +17 -1
- isa_model/inference/services/embedding/__init__.py +13 -0
- isa_model/inference/services/embedding/base_embed_service.py +111 -8
- isa_model/inference/services/embedding/isa_embed_service.py +305 -0
- isa_model/inference/services/embedding/openai_embed_service.py +2 -4
- isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
- isa_model/inference/services/img/__init__.py +2 -2
- isa_model/inference/services/img/base_image_gen_service.py +24 -7
- isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
- isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
- isa_model/inference/services/img/services/replicate_flux.py +226 -0
- isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
- isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
- isa_model/inference/services/img/tests/test_img_client.py +297 -0
- isa_model/inference/services/llm/base_llm_service.py +30 -6
- isa_model/inference/services/llm/helpers/llm_adapter.py +63 -9
- isa_model/inference/services/llm/ollama_llm_service.py +2 -1
- isa_model/inference/services/llm/openai_llm_service.py +652 -55
- isa_model/inference/services/llm/yyds_llm_service.py +2 -1
- isa_model/inference/services/vision/__init__.py +5 -5
- isa_model/inference/services/vision/base_vision_service.py +118 -185
- isa_model/inference/services/vision/helpers/image_utils.py +11 -5
- isa_model/inference/services/vision/isa_vision_service.py +573 -0
- isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
- isa_model/serving/api/fastapi_server.py +88 -16
- isa_model/serving/api/middleware/auth.py +311 -0
- isa_model/serving/api/middleware/security.py +278 -0
- isa_model/serving/api/routes/analytics.py +486 -0
- isa_model/serving/api/routes/deployments.py +339 -0
- isa_model/serving/api/routes/evaluations.py +579 -0
- isa_model/serving/api/routes/logs.py +430 -0
- isa_model/serving/api/routes/settings.py +582 -0
- isa_model/serving/api/routes/unified.py +324 -165
- isa_model/serving/api/startup.py +304 -0
- isa_model/serving/modal_proxy_server.py +249 -0
- isa_model/training/__init__.py +100 -6
- isa_model/training/core/__init__.py +4 -1
- isa_model/training/examples/intelligent_training_example.py +281 -0
- isa_model/training/intelligent/__init__.py +25 -0
- isa_model/training/intelligent/decision_engine.py +643 -0
- isa_model/training/intelligent/intelligent_factory.py +888 -0
- isa_model/training/intelligent/knowledge_base.py +751 -0
- isa_model/training/intelligent/resource_optimizer.py +839 -0
- isa_model/training/intelligent/task_classifier.py +576 -0
- isa_model/training/storage/__init__.py +24 -0
- isa_model/training/storage/core_integration.py +439 -0
- isa_model/training/storage/training_repository.py +552 -0
- isa_model/training/storage/training_storage.py +628 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.0.dist-info}/METADATA +13 -1
- isa_model-0.4.0.dist-info/RECORD +182 -0
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
- isa_model/deployment/cloud/modal/register_models.py +0 -321
- isa_model/inference/adapter/unified_api.py +0 -248
- isa_model/inference/services/helpers/stacked_config.py +0 -148
- isa_model/inference/services/img/flux_professional_service.py +0 -603
- isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/others/table_transformer_service.py +0 -61
- isa_model/inference/services/vision/doc_analysis_service.py +0 -640
- isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
- isa_model/inference/services/vision/ui_analysis_service.py +0 -823
- isa_model/scripts/inference_tracker.py +0 -283
- isa_model/scripts/mlflow_manager.py +0 -379
- isa_model/scripts/model_registry.py +0 -465
- isa_model/scripts/register_models.py +0 -370
- isa_model/scripts/register_models_with_embeddings.py +0 -510
- isa_model/scripts/start_mlflow.py +0 -95
- isa_model/scripts/training_tracker.py +0 -257
- isa_model-0.3.91.dist-info/RECORD +0 -138
- {isa_model-0.3.91.dist-info → isa_model-0.4.0.dist-info}/WHEEL +0 -0
- {isa_model-0.3.91.dist-info → isa_model-0.4.0.dist-info}/top_level.txt +0 -0
isa_model/deployment/cloud/modal/isa_vision_qwen25_service.py
@@ -0,0 +1,709 @@
"""
ISA Qwen2.5-VL Service

Multimodal vision-language service using Qwen2.5-VL 7B
- Image understanding and analysis
- Video understanding and analysis
- Vision-language reasoning
- High-quality visual content interpretation
"""

import modal
import time
import json
import os
import logging
import base64
import tempfile
from typing import Dict, List, Optional, Any, Union
from pathlib import Path

# Define Modal application
app = modal.App("isa-vision-qwen2.5")

# Define Modal container image with Qwen2.5-VL dependencies
image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install([
        "packaging",  # Required dependency
        "torch>=2.0.0",
        "torchvision>=0.15.0",
        "transformers>=4.37.0",
        "accelerate>=0.26.0",
        "Pillow>=10.0.0",
        "opencv-python>=4.8.0",
        "numpy>=1.24.0",
        "requests>=2.31.0",
        "httpx>=0.26.0",
        "pydantic>=2.0.0",
        "python-dotenv>=1.0.0",
        "qwen-vl-utils",  # Qwen VL utilities
        "av",             # Video processing
        "decord",         # Video decoding
        "imageio>=2.31.0",
        "imageio-ffmpeg>=0.4.8",
        "tiktoken>=0.5.0",
        "sentencepiece>=0.1.99",
        "protobuf>=3.20.0",
        # "flash-attn>=2.0.0",  # Optional - removed for easier deployment
    ])
    .apt_install([
        "ffmpeg",
        "libsm6",
        "libxext6",
        "libxrender-dev",
        "libglib2.0-0",
        "libgl1-mesa-glx",
        "git-lfs"
    ])
    .env({
        "TRANSFORMERS_CACHE": "/models",
        "TORCH_HOME": "/models/torch",
        "HF_HOME": "/models",
        "CUDA_VISIBLE_DEVICES": "0",
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512"
    })
)

# Qwen2.5-VL Service - Optimized for performance
@app.cls(
    gpu="A100",            # Use A100 for better performance (40GB)
    image=image,
    memory=32768,          # 32GB RAM for faster processing
    timeout=1800,          # 30 minutes
    scaledown_window=300,  # 5 minutes idle timeout (longer for model warmup)
    min_containers=1,      # Keep 1 container warm
    max_containers=5,      # Limit for cost control
    # secrets=[modal.Secret.from_name("huggingface-secret")],  # Optional HF token
)
class ISAVisionQwen25Service:
    """
    ISA Qwen2.5-VL Service

    Multimodal vision-language model (7B parameters):
    - Model: Qwen/Qwen2.5-VL-7B-Instruct
    - Architecture: Vision Transformer + Language Model
    - Capabilities: Image understanding, Video understanding, VL reasoning
    - Performance: SOTA multimodal understanding
    """

    @modal.enter()
    def load_models(self):
        """Load Qwen2.5-VL model and dependencies"""
        print("Loading Qwen2.5-VL (7B parameters)...")
        start_time = time.time()

        # Initialize instance variables
        self.model = None
        self.processor = None
        self.logger = logging.getLogger(__name__)
        self.request_count = 0
        self.total_processing_time = 0.0

        try:
            import torch
            from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
            from qwen_vl_utils import process_vision_info

            # Store the function as instance variable for later use
            self.process_vision_info = process_vision_info

            model_name = "Qwen/Qwen2.5-VL-7B-Instruct"

            print(f"Loading Qwen2.5-VL model: {model_name}")

            # Load model with optimizations for speed
            self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                model_name,
                torch_dtype=torch.float16,   # Use float16 for speed
                device_map="auto",
                attn_implementation="sdpa",  # Use SDPA for better performance
                low_cpu_mem_usage=True,      # Reduce CPU memory usage
                use_cache=True               # Enable KV cache
            )

            # Load processor for image/video processing
            self.processor = AutoProcessor.from_pretrained(
                model_name,
                use_fast=True  # Use fast tokenizer for speed
            )

            # Set model to evaluation mode and optimize for inference
            self.model.eval()

            # Compile model for faster inference (PyTorch 2.0+)
            try:
                self.model = torch.compile(self.model, mode="reduce-overhead")
                print("✅ Model compiled for faster inference")
            except Exception as e:
                print(f"⚠️ Model compilation failed: {e}")

            # Tie input/output embedding weights (no-op if already tied)
            self.model.tie_weights()

            load_time = time.time() - start_time
            print(f"Qwen2.5-VL loaded successfully in {load_time:.2f}s")

            # Model loading status
            self.models_loaded = True

        except Exception as e:
            print(f"Model loading failed: {e}")
            import traceback
            traceback.print_exc()
            self.models_loaded = False

    @modal.method()
    def analyze_image(
        self,
        image_b64: str,
        prompt: str = "Describe this image in detail.",
        max_tokens: int = 1000,
        temperature: float = 0.7,
        top_p: float = 0.9
    ) -> Dict[str, Any]:
        """
        Analyze image using Qwen2.5-VL

        Args:
            image_b64: Base64 encoded image
            prompt: Question or instruction about the image
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature
            top_p: Top-p sampling parameter

        Returns:
            Image analysis results
        """
        start_time = time.time()
        self.request_count += 1

        try:
            # Validate model loading status
            if not self.models_loaded or not self.model:
                raise RuntimeError("Qwen2.5-VL model not loaded")

            # Decode base64 image
            image_data = base64.b64decode(image_b64)

            # Save to temporary file
            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
                tmp_file.write(image_data)
                tmp_file.flush()

            # Prepare messages for the model
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "image": tmp_file.name,
                        },
                        {"type": "text", "text": prompt},
                    ],
                }
            ]

            # Process the conversation
            text = self.processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )

            # Process vision info
            image_inputs, video_inputs = self.process_vision_info(messages)

            # Prepare inputs
            inputs = self.processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to("cuda")

            # Generate response with optimized parameters
            import torch
            with torch.no_grad():
                generated_ids = self.model.generate(
                    **inputs,
                    max_new_tokens=min(max_tokens, 200),  # Limit max tokens for speed
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=True,
                    pad_token_id=self.processor.tokenizer.eos_token_id,
                    use_cache=True,      # Enable KV cache
                    num_beams=1,         # Use greedy decoding for speed
                    early_stopping=True  # Stop early when possible
                )

            # Extract generated tokens (remove input tokens)
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]

            # Decode response
            response_text = self.processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )[0]

            # Clean up temp file
            os.unlink(tmp_file.name)

            processing_time = time.time() - start_time
            self.total_processing_time += processing_time

            # Calculate cost (A100 GPU: ~$4.00/hour)
            gpu_cost = (processing_time / 3600) * 4.00

            result = {
                'success': True,
                'service': 'isa-vision-qwen2.5',
                'operation': 'image_analysis',
                'provider': 'ISA',
                'text': response_text,
                'prompt': prompt,
                'model': 'Qwen2.5-VL-7B-Instruct',
                'architecture': 'Vision Transformer + Language Model',
                'modality': 'image',
                'parameters': {
                    'max_tokens': max_tokens,
                    'temperature': temperature,
                    'top_p': top_p
                },
                'processing_time': processing_time,
                'billing': {
                    'request_id': f"img_{self.request_count}_{int(time.time())}",
                    'gpu_seconds': processing_time,
                    'estimated_cost_usd': round(gpu_cost, 4),
                    'gpu_type': 'A100'
                },
                'model_info': {
                    'model_name': 'Qwen2.5-VL-7B-Instruct',
                    'provider': 'ISA',
                    'architecture': 'Multimodal Vision-Language',
                    'parameters': '7B',
                    'gpu': 'A100',
                    'capabilities': ['image_understanding', 'vision_language_reasoning'],
                    'container_id': os.environ.get('MODAL_TASK_ID', 'unknown')
                }
            }

            # Output JSON results
            print("=== JSON_RESULT_START ===")
            print(json.dumps(result, default=str))
            print("=== JSON_RESULT_END ===")

            return result

        except Exception as e:
            processing_time = time.time() - start_time
            error_result = {
                'success': False,
                'service': 'isa-vision-qwen2.5',
                'operation': 'image_analysis',
                'provider': 'ISA',
                'error': str(e),
                'processing_time': processing_time,
                'billing': {
                    'request_id': f"img_{self.request_count}_{int(time.time())}",
                    'gpu_seconds': processing_time,
                    'estimated_cost_usd': round((processing_time / 3600) * 4.00, 4),  # A100 rate
                    'gpu_type': 'A100'
                }
            }

            print("=== JSON_RESULT_START ===")
            print(json.dumps(error_result, default=str))
            print("=== JSON_RESULT_END ===")

            return error_result

    @modal.method()
    def analyze_video(
        self,
        video_b64: str,
        prompt: str = "Describe what happens in this video.",
        max_tokens: int = 1000,
        temperature: float = 0.7,
        top_p: float = 0.9,
        max_frames: int = 8
    ) -> Dict[str, Any]:
        """
        Analyze video using Qwen2.5-VL

        Args:
            video_b64: Base64 encoded video
            prompt: Question or instruction about the video
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature
            top_p: Top-p sampling parameter
            max_frames: Maximum frames to sample from video

        Returns:
            Video analysis results
        """
        start_time = time.time()
        self.request_count += 1

        try:
            # Validate model loading status
            if not self.models_loaded or not self.model:
                raise RuntimeError("Qwen2.5-VL model not loaded")

            # Decode base64 video
            video_data = base64.b64decode(video_b64)

            # Save to temporary file
            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
                tmp_file.write(video_data)
                tmp_file.flush()

            # Prepare messages for the model
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "video",
                            "video": tmp_file.name,
                            "max_pixels": 360 * 420,
                            "fps": 1.0,  # Frame sampling is fps-based; max_frames is accepted but not forwarded
                        },
                        {"type": "text", "text": prompt},
                    ],
                }
            ]

            # Process the conversation
            text = self.processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )

            # Process vision info
            image_inputs, video_inputs = self.process_vision_info(messages)

            # Prepare inputs
            inputs = self.processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to("cuda")

            # Generate response with optimized parameters
            import torch
            with torch.no_grad():
                generated_ids = self.model.generate(
                    **inputs,
                    max_new_tokens=min(max_tokens, 200),  # Limit max tokens for speed
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=True,
                    pad_token_id=self.processor.tokenizer.eos_token_id,
                    use_cache=True,      # Enable KV cache
                    num_beams=1,         # Use greedy decoding for speed
                    early_stopping=True  # Stop early when possible
                )

            # Extract generated tokens (remove input tokens)
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]

            # Decode response
            response_text = self.processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )[0]

            # Clean up temp file
            os.unlink(tmp_file.name)

            processing_time = time.time() - start_time
            self.total_processing_time += processing_time

            # Calculate cost (A100 GPU: ~$4.00/hour)
            gpu_cost = (processing_time / 3600) * 4.00

            result = {
                'success': True,
                'service': 'isa-vision-qwen2.5',
                'operation': 'video_analysis',
                'provider': 'ISA',
                'text': response_text,
                'prompt': prompt,
                'model': 'Qwen2.5-VL-7B-Instruct',
                'architecture': 'Vision Transformer + Language Model',
                'modality': 'video',
                'parameters': {
                    'max_tokens': max_tokens,
                    'temperature': temperature,
                    'top_p': top_p,
                    'max_frames': max_frames
                },
                'processing_time': processing_time,
                'billing': {
                    'request_id': f"vid_{self.request_count}_{int(time.time())}",
                    'gpu_seconds': processing_time,
                    'estimated_cost_usd': round(gpu_cost, 4),
                    'gpu_type': 'A100'
                },
                'model_info': {
                    'model_name': 'Qwen2.5-VL-7B-Instruct',
                    'provider': 'ISA',
                    'architecture': 'Multimodal Vision-Language',
                    'parameters': '7B',
                    'gpu': 'A100',
                    'capabilities': ['video_understanding', 'temporal_reasoning', 'vision_language_reasoning'],
                    'container_id': os.environ.get('MODAL_TASK_ID', 'unknown')
                }
            }

            # Output JSON results
            print("=== JSON_RESULT_START ===")
            print(json.dumps(result, default=str))
            print("=== JSON_RESULT_END ===")

            return result

        except Exception as e:
            processing_time = time.time() - start_time
            error_result = {
                'success': False,
                'service': 'isa-vision-qwen2.5',
                'operation': 'video_analysis',
                'provider': 'ISA',
                'error': str(e),
                'processing_time': processing_time,
                'billing': {
                    'request_id': f"vid_{self.request_count}_{int(time.time())}",
                    'gpu_seconds': processing_time,
                    'estimated_cost_usd': round((processing_time / 3600) * 4.00, 4),  # A100 rate
                    'gpu_type': 'A100'
                }
            }

            print("=== JSON_RESULT_START ===")
            print(json.dumps(error_result, default=str))
            print("=== JSON_RESULT_END ===")

            return error_result

    @modal.method()
    def multimodal_chat(
        self,
        messages: List[Dict[str, Any]],
        max_tokens: int = 1000,
        temperature: float = 0.7,
        top_p: float = 0.9
    ) -> Dict[str, Any]:
        """
        Multimodal chat with images/videos

        Args:
            messages: List of chat messages with images/videos
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature
            top_p: Top-p sampling parameter

        Returns:
            Chat response
        """
        start_time = time.time()
        self.request_count += 1

        try:
            # Validate model loading status
            if not self.models_loaded or not self.model:
                raise RuntimeError("Qwen2.5-VL model not loaded")

            # Process the conversation
            text = self.processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )

            # Process vision info
            image_inputs, video_inputs = self.process_vision_info(messages)

            # Prepare inputs
            inputs = self.processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to("cuda")

            # Generate response
            import torch
            with torch.no_grad():
                generated_ids = self.model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=True,
                    pad_token_id=self.processor.tokenizer.eos_token_id
                )

            # Extract generated tokens (remove input tokens)
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]

            # Decode response
            response_text = self.processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )[0]

            processing_time = time.time() - start_time
            self.total_processing_time += processing_time

            # Calculate cost (A100 GPU: ~$4.00/hour)
            gpu_cost = (processing_time / 3600) * 4.00

            result = {
                'success': True,
                'service': 'isa-vision-qwen2.5',
                'operation': 'multimodal_chat',
                'provider': 'ISA',
                'text': response_text,
                'model': 'Qwen2.5-VL-7B-Instruct',
                'architecture': 'Vision Transformer + Language Model',
                'modality': 'multimodal',
                'parameters': {
                    'max_tokens': max_tokens,
                    'temperature': temperature,
                    'top_p': top_p
                },
                'processing_time': processing_time,
                'billing': {
                    'request_id': f"chat_{self.request_count}_{int(time.time())}",
                    'gpu_seconds': processing_time,
                    'estimated_cost_usd': round(gpu_cost, 4),
                    'gpu_type': 'A100'
                },
                'model_info': {
                    'model_name': 'Qwen2.5-VL-7B-Instruct',
                    'provider': 'ISA',
                    'architecture': 'Multimodal Vision-Language',
                    'parameters': '7B',
                    'gpu': 'A100',
                    'capabilities': ['image_understanding', 'video_understanding', 'multimodal_chat'],
                    'container_id': os.environ.get('MODAL_TASK_ID', 'unknown')
                }
            }

            # Output JSON results
            print("=== JSON_RESULT_START ===")
            print(json.dumps(result, default=str))
            print("=== JSON_RESULT_END ===")

            return result

        except Exception as e:
            processing_time = time.time() - start_time
            error_result = {
                'success': False,
                'service': 'isa-vision-qwen2.5',
                'operation': 'multimodal_chat',
                'provider': 'ISA',
                'error': str(e),
                'processing_time': processing_time,
                'billing': {
                    'request_id': f"chat_{self.request_count}_{int(time.time())}",
                    'gpu_seconds': processing_time,
                    'estimated_cost_usd': round((processing_time / 3600) * 4.00, 4),  # A100 rate
                    'gpu_type': 'A100'
                }
            }

            print("=== JSON_RESULT_START ===")
            print(json.dumps(error_result, default=str))
            print("=== JSON_RESULT_END ===")

            return error_result

    @modal.method()
    def health_check(self) -> Dict[str, Any]:
        """Health check endpoint"""
        return {
            'status': 'healthy',
            'service': 'isa-vision-qwen2.5',
            'provider': 'ISA',
            'models_loaded': self.models_loaded,
            'model': 'Qwen2.5-VL-7B-Instruct',
            'architecture': 'Vision Transformer + Language Model',
            'timestamp': time.time(),
            'gpu': 'A100',
            'memory_usage': '32GB',
            'request_count': self.request_count,
            'capabilities': ['image_understanding', 'video_understanding', 'multimodal_chat']
        }

# Deployment functions
@app.function()
def deploy_info():
    """Deployment information"""
    return {
        'service': 'isa-vision-qwen2.5',
        'version': '1.0.0',
        'description': 'ISA Qwen2.5-VL service - 7B multimodal vision-language model',
        'model': 'Qwen2.5-VL-7B-Instruct',
        'architecture': 'Vision Transformer + Language Model',
        'gpu': 'A100',
        'capabilities': ['image_understanding', 'video_understanding'],
        'deployment_time': time.time()
    }

@app.function()
def register_service():
    """Register service to model repository"""
    try:
        from isa_model.core.models.model_repo import ModelRepository

        repo = ModelRepository()

        # Register multimodal vision service
        repo.register_model({
            'model_id': 'isa-qwen2.5-vl-service',
            'model_type': 'vision',
            'provider': 'isa',
            'endpoint': 'https://isa-vision-qwen2.5.modal.run',
            'capabilities': ['image_understanding', 'video_understanding', 'multimodal_chat', 'vision_language_reasoning'],
            'pricing': {'gpu_type': 'A100', 'cost_per_hour': 4.00},
            'metadata': {
                'model': 'Qwen2.5-VL-7B-Instruct',
                'architecture': 'Vision Transformer + Language Model',
                'parameters': '7B',
                'modalities': ['image', 'video', 'text'],
                'max_tokens': 1000,
                'supported_formats': ['jpg', 'png', 'gif', 'mp4', 'avi']
            }
        })

        print("Qwen2.5-VL service registered successfully")
        return {'status': 'registered'}

    except Exception as e:
        print(f"Service registration failed: {e}")
        return {'status': 'failed', 'error': str(e)}

if __name__ == "__main__":
    print("ISA Qwen2.5-VL Service - Modal Deployment")
    print("Deploy with: modal deploy isa_vision_qwen25_service.py")
    print()
    print("Model: Qwen2.5-VL-7B-Instruct")
    print("Architecture: Vision Transformer + Language Model")
    print("Capabilities: Image & Video Understanding")
    print("GPU: A100 (40GB)")
    print()
    print("Usage:")
    print("# Image analysis")
    print("service.analyze_image(image_b64, 'What do you see in this image?')")
    print("# Video analysis")
    print("service.analyze_video(video_b64, 'Describe what happens in this video')")