isa-model 0.3.91__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. isa_model/client.py +732 -573
  2. isa_model/core/cache/redis_cache.py +401 -0
  3. isa_model/core/config/config_manager.py +53 -10
  4. isa_model/core/config.py +1 -1
  5. isa_model/core/database/__init__.py +1 -0
  6. isa_model/core/database/migrations.py +277 -0
  7. isa_model/core/database/supabase_client.py +123 -0
  8. isa_model/core/models/__init__.py +37 -0
  9. isa_model/core/models/model_billing_tracker.py +60 -88
  10. isa_model/core/models/model_manager.py +36 -18
  11. isa_model/core/models/model_repo.py +44 -38
  12. isa_model/core/models/model_statistics_tracker.py +234 -0
  13. isa_model/core/models/model_storage.py +0 -1
  14. isa_model/core/models/model_version_manager.py +959 -0
  15. isa_model/core/pricing_manager.py +2 -249
  16. isa_model/core/resilience/circuit_breaker.py +366 -0
  17. isa_model/core/security/secrets.py +358 -0
  18. isa_model/core/services/__init__.py +2 -4
  19. isa_model/core/services/intelligent_model_selector.py +101 -370
  20. isa_model/core/storage/hf_storage.py +1 -1
  21. isa_model/core/types.py +7 -0
  22. isa_model/deployment/cloud/modal/isa_audio_chatTTS_service.py +520 -0
  23. isa_model/deployment/cloud/modal/isa_audio_fish_service.py +0 -0
  24. isa_model/deployment/cloud/modal/isa_audio_openvoice_service.py +758 -0
  25. isa_model/deployment/cloud/modal/isa_audio_service_v2.py +1044 -0
  26. isa_model/deployment/cloud/modal/isa_embed_rerank_service.py +296 -0
  27. isa_model/deployment/cloud/modal/isa_video_hunyuan_service.py +423 -0
  28. isa_model/deployment/cloud/modal/isa_vision_ocr_service.py +519 -0
  29. isa_model/deployment/cloud/modal/isa_vision_qwen25_service.py +709 -0
  30. isa_model/deployment/cloud/modal/isa_vision_table_service.py +467 -323
  31. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +607 -180
  32. isa_model/deployment/cloud/modal/isa_vision_ui_service_optimized.py +660 -0
  33. isa_model/deployment/core/deployment_manager.py +6 -4
  34. isa_model/deployment/services/auto_hf_modal_deployer.py +894 -0
  35. isa_model/eval/benchmarks/__init__.py +27 -0
  36. isa_model/eval/benchmarks/multimodal_datasets.py +460 -0
  37. isa_model/eval/benchmarks.py +244 -12
  38. isa_model/eval/evaluators/__init__.py +8 -2
  39. isa_model/eval/evaluators/audio_evaluator.py +727 -0
  40. isa_model/eval/evaluators/embedding_evaluator.py +742 -0
  41. isa_model/eval/evaluators/vision_evaluator.py +564 -0
  42. isa_model/eval/example_evaluation.py +395 -0
  43. isa_model/eval/factory.py +272 -5
  44. isa_model/eval/isa_benchmarks.py +700 -0
  45. isa_model/eval/isa_integration.py +582 -0
  46. isa_model/eval/metrics.py +159 -6
  47. isa_model/eval/tests/unit/test_basic.py +396 -0
  48. isa_model/inference/ai_factory.py +44 -8
  49. isa_model/inference/services/audio/__init__.py +21 -0
  50. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  51. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  52. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  53. isa_model/inference/services/audio/openai_stt_service.py +32 -6
  54. isa_model/inference/services/base_service.py +17 -1
  55. isa_model/inference/services/embedding/__init__.py +13 -0
  56. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  57. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  58. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  59. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  60. isa_model/inference/services/img/__init__.py +2 -2
  61. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  62. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  63. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  64. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  65. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  66. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  67. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  68. isa_model/inference/services/llm/base_llm_service.py +30 -6
  69. isa_model/inference/services/llm/helpers/llm_adapter.py +63 -9
  70. isa_model/inference/services/llm/ollama_llm_service.py +2 -1
  71. isa_model/inference/services/llm/openai_llm_service.py +652 -55
  72. isa_model/inference/services/llm/yyds_llm_service.py +2 -1
  73. isa_model/inference/services/vision/__init__.py +5 -5
  74. isa_model/inference/services/vision/base_vision_service.py +118 -185
  75. isa_model/inference/services/vision/helpers/image_utils.py +11 -5
  76. isa_model/inference/services/vision/isa_vision_service.py +573 -0
  77. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  78. isa_model/serving/api/fastapi_server.py +88 -16
  79. isa_model/serving/api/middleware/auth.py +311 -0
  80. isa_model/serving/api/middleware/security.py +278 -0
  81. isa_model/serving/api/routes/analytics.py +486 -0
  82. isa_model/serving/api/routes/deployments.py +339 -0
  83. isa_model/serving/api/routes/evaluations.py +579 -0
  84. isa_model/serving/api/routes/logs.py +430 -0
  85. isa_model/serving/api/routes/settings.py +582 -0
  86. isa_model/serving/api/routes/unified.py +324 -165
  87. isa_model/serving/api/startup.py +304 -0
  88. isa_model/serving/modal_proxy_server.py +249 -0
  89. isa_model/training/__init__.py +100 -6
  90. isa_model/training/core/__init__.py +4 -1
  91. isa_model/training/examples/intelligent_training_example.py +281 -0
  92. isa_model/training/intelligent/__init__.py +25 -0
  93. isa_model/training/intelligent/decision_engine.py +643 -0
  94. isa_model/training/intelligent/intelligent_factory.py +888 -0
  95. isa_model/training/intelligent/knowledge_base.py +751 -0
  96. isa_model/training/intelligent/resource_optimizer.py +839 -0
  97. isa_model/training/intelligent/task_classifier.py +576 -0
  98. isa_model/training/storage/__init__.py +24 -0
  99. isa_model/training/storage/core_integration.py +439 -0
  100. isa_model/training/storage/training_repository.py +552 -0
  101. isa_model/training/storage/training_storage.py +628 -0
  102. {isa_model-0.3.91.dist-info → isa_model-0.4.0.dist-info}/METADATA +13 -1
  103. isa_model-0.4.0.dist-info/RECORD +182 -0
  104. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  105. isa_model/deployment/cloud/modal/register_models.py +0 -321
  106. isa_model/inference/adapter/unified_api.py +0 -248
  107. isa_model/inference/services/helpers/stacked_config.py +0 -148
  108. isa_model/inference/services/img/flux_professional_service.py +0 -603
  109. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  110. isa_model/inference/services/others/table_transformer_service.py +0 -61
  111. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  112. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  113. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  114. isa_model/scripts/inference_tracker.py +0 -283
  115. isa_model/scripts/mlflow_manager.py +0 -379
  116. isa_model/scripts/model_registry.py +0 -465
  117. isa_model/scripts/register_models.py +0 -370
  118. isa_model/scripts/register_models_with_embeddings.py +0 -510
  119. isa_model/scripts/start_mlflow.py +0 -95
  120. isa_model/scripts/training_tracker.py +0 -257
  121. isa_model-0.3.91.dist-info/RECORD +0 -138
  122. {isa_model-0.3.91.dist-info → isa_model-0.4.0.dist-info}/WHEEL +0 -0
  123. {isa_model-0.3.91.dist-info → isa_model-0.4.0.dist-info}/top_level.txt +0 -0
isa_model/deployment/cloud/modal/isa_vision_qwen25_service.py
@@ -0,0 +1,709 @@
+ """
+ ISA Qwen2.5-VL Service
+
+ Multimodal vision-language service using Qwen2.5-VL 7B
+ - Image understanding and analysis
+ - Video understanding and analysis
+ - Vision-language reasoning
+ - High-quality visual content interpretation
+ """
+
+ import modal
+ import time
+ import json
+ import os
+ import logging
+ import base64
+ import tempfile
+ from typing import Dict, List, Optional, Any, Union
+ from pathlib import Path
+
+ # Define Modal application
+ app = modal.App("isa-vision-qwen2.5")
+
+ # Define Modal container image with Qwen2.5-VL dependencies
+ image = (
+     modal.Image.debian_slim(python_version="3.10")
+     .pip_install([
+         "packaging",  # Required dependency
+         "torch>=2.0.0",
+         "torchvision>=0.15.0",
+         "transformers>=4.49.0",  # Qwen2_5_VL* classes require transformers >= 4.49
+         "accelerate>=0.26.0",
+         "Pillow>=10.0.0",
+         "opencv-python>=4.8.0",
+         "numpy>=1.24.0",
+         "requests>=2.31.0",
+         "httpx>=0.26.0",
+         "pydantic>=2.0.0",
+         "python-dotenv>=1.0.0",
+         "qwen-vl-utils",  # Qwen VL utilities
+         "av",  # Video processing
+         "decord",  # Video decoding
+         "imageio>=2.31.0",
+         "imageio-ffmpeg>=0.4.8",
+         "tiktoken>=0.5.0",
+         "sentencepiece>=0.1.99",
+         "protobuf>=3.20.0",
+         # "flash-attn>=2.0.0",  # Optional - removed for easier deployment
+     ])
+     .apt_install([
+         "ffmpeg",
+         "libsm6",
+         "libxext6",
+         "libxrender-dev",
+         "libglib2.0-0",
+         "libgl1-mesa-glx",
+         "git-lfs"
+     ])
+     .env({
+         "TRANSFORMERS_CACHE": "/models",
+         "TORCH_HOME": "/models/torch",
+         "HF_HOME": "/models",
+         "CUDA_VISIBLE_DEVICES": "0",
+         "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512"
+     })
+ )
+
+ # Qwen2.5-VL Service - Optimized for performance
+ @app.cls(
+     gpu="A100",  # Use A100 for better performance (40GB)
+     image=image,
+     memory=32768,  # 32GB RAM for faster processing
+     timeout=1800,  # 30 minutes
+     scaledown_window=300,  # 5 minutes idle timeout (longer for model warmup)
+     min_containers=1,  # Keep 1 container warm
+     max_containers=5,  # Limit for cost control
+     # secrets=[modal.Secret.from_name("huggingface-secret")],  # Optional HF token
+ )
+ class ISAVisionQwen25Service:
+     """
+     ISA Qwen2.5-VL Service
+
+     Multimodal vision-language model (7B parameters):
+     - Model: Qwen/Qwen2.5-VL-7B-Instruct
+     - Architecture: Vision Transformer + Language Model
+     - Capabilities: Image understanding, Video understanding, VL reasoning
+     - Performance: SOTA multimodal understanding
+     """
+
+     @modal.enter()
+     def load_models(self):
+         """Load Qwen2.5-VL model and dependencies"""
+         print("Loading Qwen2.5-VL (7B parameters)...")
+         start_time = time.time()
+
+         # Initialize instance variables
+         self.model = None
+         self.processor = None
+         self.logger = logging.getLogger(__name__)
+         self.request_count = 0
+         self.total_processing_time = 0.0
+
+         try:
+             import torch
+             from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+             from qwen_vl_utils import process_vision_info
+
+             # Store the function as an instance attribute for later use
+             self.process_vision_info = process_vision_info
+
+             model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
+
+             print(f"Loading Qwen2.5-VL model: {model_name}")
+
+             # Load model with optimizations for speed
+             self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                 model_name,
+                 torch_dtype=torch.float16,  # Use float16 for speed
+                 device_map="auto",
+                 attn_implementation="sdpa",  # Use SDPA for better performance
+                 low_cpu_mem_usage=True,  # Reduce CPU memory usage
+                 use_cache=True  # Enable KV cache
+             )
+
+             # Load processor for image/video processing
+             self.processor = AutoProcessor.from_pretrained(
+                 model_name,
+                 use_fast=True  # Use fast tokenizer for speed
+             )
+
+             # Set model to evaluation mode and optimize for inference
+             self.model.eval()
+
+             # Compile model for faster inference (PyTorch 2.0+)
+             try:
+                 self.model = torch.compile(self.model, mode="reduce-overhead")
+                 print("✅ Model compiled for faster inference")
+             except Exception as e:
+                 print(f"⚠️ Model compilation failed: {e}")
+
+             # Tie input/output embedding weights (no-op if already tied)
+             self.model.tie_weights()
+
+             load_time = time.time() - start_time
+             print(f"Qwen2.5-VL loaded successfully in {load_time:.2f}s")
+
+             # Model loading status
+             self.models_loaded = True
+
+         except Exception as e:
+             print(f"Model loading failed: {e}")
+             import traceback
+             traceback.print_exc()
+             self.models_loaded = False
+
+     @modal.method()
+     def analyze_image(
+         self,
+         image_b64: str,
+         prompt: str = "Describe this image in detail.",
+         max_tokens: int = 1000,
+         temperature: float = 0.7,
+         top_p: float = 0.9
+     ) -> Dict[str, Any]:
+         """
+         Analyze image using Qwen2.5-VL
+
+         Args:
+             image_b64: Base64 encoded image
+             prompt: Question or instruction about the image
+             max_tokens: Maximum tokens to generate (capped at 200 below for latency)
+             temperature: Sampling temperature
+             top_p: Top-p sampling parameter
+
+         Returns:
+             Image analysis results
+         """
+         start_time = time.time()
+         self.request_count += 1
+
+         try:
+             # Validate model loading status
+             if not self.models_loaded or not self.model:
+                 raise RuntimeError("Qwen2.5-VL model not loaded")
+
+             # Decode base64 image
+             image_data = base64.b64decode(image_b64)
+
+             # Save to temporary file
+             with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
+                 tmp_file.write(image_data)
+                 tmp_file.flush()
+
+             # Prepare messages for the model
+             messages = [
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "image",
+                             "image": tmp_file.name,
+                         },
+                         {"type": "text", "text": prompt},
+                     ],
+                 }
+             ]
+
+             # Process the conversation
+             text = self.processor.apply_chat_template(
+                 messages, tokenize=False, add_generation_prompt=True
+             )
+
+             # Process vision info
+             image_inputs, video_inputs = self.process_vision_info(messages)
+
+             # Prepare inputs
+             inputs = self.processor(
+                 text=[text],
+                 images=image_inputs,
+                 videos=video_inputs,
+                 padding=True,
+                 return_tensors="pt",
+             )
+             inputs = inputs.to("cuda")
+
+             # Generate response with optimized parameters
+             import torch
+             with torch.no_grad():
+                 generated_ids = self.model.generate(
+                     **inputs,
+                     max_new_tokens=min(max_tokens, 200),  # Limit max tokens for speed
+                     temperature=temperature,
+                     top_p=top_p,
+                     do_sample=True,
+                     pad_token_id=self.processor.tokenizer.eos_token_id,
+                     use_cache=True,  # Enable KV cache
+                     num_beams=1,  # Single-beam sampling (no beam search) for speed
+                     early_stopping=True  # Only relevant for beam search; ignored here
+                 )
+
+             # Extract generated tokens (remove input tokens)
+             generated_ids_trimmed = [
+                 out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+             ]
+
+             # Decode response
+             response_text = self.processor.batch_decode(
+                 generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+             )[0]
+
+             # Clean up temp file
+             os.unlink(tmp_file.name)
+
+             processing_time = time.time() - start_time
+             self.total_processing_time += processing_time
+
+             # Calculate cost (A100 GPU: ~$4.00/hour)
+             gpu_cost = (processing_time / 3600) * 4.00
+
+             result = {
+                 'success': True,
+                 'service': 'isa-vision-qwen2.5',
+                 'operation': 'image_analysis',
+                 'provider': 'ISA',
+                 'text': response_text,
+                 'prompt': prompt,
+                 'model': 'Qwen2.5-VL-7B-Instruct',
+                 'architecture': 'Vision Transformer + Language Model',
+                 'modality': 'image',
+                 'parameters': {
+                     'max_tokens': max_tokens,
+                     'temperature': temperature,
+                     'top_p': top_p
+                 },
+                 'processing_time': processing_time,
+                 'billing': {
+                     'request_id': f"img_{self.request_count}_{int(time.time())}",
+                     'gpu_seconds': processing_time,
+                     'estimated_cost_usd': round(gpu_cost, 4),
+                     'gpu_type': 'A100'
+                 },
+                 'model_info': {
+                     'model_name': 'Qwen2.5-VL-7B-Instruct',
+                     'provider': 'ISA',
+                     'architecture': 'Multimodal Vision-Language',
+                     'parameters': '7B',
+                     'gpu': 'A100',
+                     'capabilities': ['image_understanding', 'vision_language_reasoning'],
+                     'container_id': os.environ.get('MODAL_TASK_ID', 'unknown')
+                 }
+             }
+
+             # Output JSON results
+             print("=== JSON_RESULT_START ===")
+             print(json.dumps(result, default=str))
+             print("=== JSON_RESULT_END ===")
+
+             return result
+
+         except Exception as e:
+             processing_time = time.time() - start_time
+             error_result = {
+                 'success': False,
+                 'service': 'isa-vision-qwen2.5',
+                 'operation': 'image_analysis',
+                 'provider': 'ISA',
+                 'error': str(e),
+                 'processing_time': processing_time,
+                 'billing': {
+                     'request_id': f"img_{self.request_count}_{int(time.time())}",
+                     'gpu_seconds': processing_time,
+                     'estimated_cost_usd': round((processing_time / 3600) * 4.00, 4),  # A100 rate
+                     'gpu_type': 'A100'
+                 }
+             }
+
+             print("=== JSON_RESULT_START ===")
+             print(json.dumps(error_result, default=str))
+             print("=== JSON_RESULT_END ===")
+
+             return error_result
+
+     @modal.method()
+     def analyze_video(
+         self,
+         video_b64: str,
+         prompt: str = "Describe what happens in this video.",
+         max_tokens: int = 1000,
+         temperature: float = 0.7,
+         top_p: float = 0.9,
+         max_frames: int = 8
+     ) -> Dict[str, Any]:
+         """
+         Analyze video using Qwen2.5-VL
+
+         Args:
+             video_b64: Base64 encoded video
+             prompt: Question or instruction about the video
+             max_tokens: Maximum tokens to generate (capped at 200 below for latency)
+             temperature: Sampling temperature
+             top_p: Top-p sampling parameter
+             max_frames: Maximum frames to sample from video
+
+         Returns:
+             Video analysis results
+         """
+         start_time = time.time()
+         self.request_count += 1
+
+         try:
+             # Validate model loading status
+             if not self.models_loaded or not self.model:
+                 raise RuntimeError("Qwen2.5-VL model not loaded")
+
+             # Decode base64 video
+             video_data = base64.b64decode(video_b64)
+
+             # Save to temporary file
+             with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
+                 tmp_file.write(video_data)
+                 tmp_file.flush()
+
+             # Prepare messages for the model
+             # NOTE: frame sampling is driven by `fps`; `max_frames` is echoed in the
+             # response parameters but not enforced here
+             messages = [
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "video",
+                             "video": tmp_file.name,
+                             "max_pixels": 360 * 420,
+                             "fps": 1.0,
+                         },
+                         {"type": "text", "text": prompt},
+                     ],
+                 }
+             ]
+
+             # Process the conversation
+             text = self.processor.apply_chat_template(
+                 messages, tokenize=False, add_generation_prompt=True
+             )
+
+             # Process vision info
+             image_inputs, video_inputs = self.process_vision_info(messages)
+
+             # Prepare inputs
+             inputs = self.processor(
+                 text=[text],
+                 images=image_inputs,
+                 videos=video_inputs,
+                 padding=True,
+                 return_tensors="pt",
+             )
+             inputs = inputs.to("cuda")
+
+             # Generate response with optimized parameters
+             import torch
+             with torch.no_grad():
+                 generated_ids = self.model.generate(
+                     **inputs,
+                     max_new_tokens=min(max_tokens, 200),  # Limit max tokens for speed
+                     temperature=temperature,
+                     top_p=top_p,
+                     do_sample=True,
+                     pad_token_id=self.processor.tokenizer.eos_token_id,
+                     use_cache=True,  # Enable KV cache
+                     num_beams=1,  # Single-beam sampling (no beam search) for speed
+                     early_stopping=True  # Only relevant for beam search; ignored here
+                 )
+
+             # Extract generated tokens (remove input tokens)
+             generated_ids_trimmed = [
+                 out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+             ]
+
+             # Decode response
+             response_text = self.processor.batch_decode(
+                 generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+             )[0]
+
+             # Clean up temp file
+             os.unlink(tmp_file.name)
+
+             processing_time = time.time() - start_time
+             self.total_processing_time += processing_time
+
+             # Calculate cost (A100 GPU: ~$4.00/hour)
+             gpu_cost = (processing_time / 3600) * 4.00
+
+             result = {
+                 'success': True,
+                 'service': 'isa-vision-qwen2.5',
+                 'operation': 'video_analysis',
+                 'provider': 'ISA',
+                 'text': response_text,
+                 'prompt': prompt,
+                 'model': 'Qwen2.5-VL-7B-Instruct',
+                 'architecture': 'Vision Transformer + Language Model',
+                 'modality': 'video',
+                 'parameters': {
+                     'max_tokens': max_tokens,
+                     'temperature': temperature,
+                     'top_p': top_p,
+                     'max_frames': max_frames
+                 },
+                 'processing_time': processing_time,
+                 'billing': {
+                     'request_id': f"vid_{self.request_count}_{int(time.time())}",
+                     'gpu_seconds': processing_time,
+                     'estimated_cost_usd': round(gpu_cost, 4),
+                     'gpu_type': 'A100'
+                 },
+                 'model_info': {
+                     'model_name': 'Qwen2.5-VL-7B-Instruct',
+                     'provider': 'ISA',
+                     'architecture': 'Multimodal Vision-Language',
+                     'parameters': '7B',
+                     'gpu': 'A100',
+                     'capabilities': ['video_understanding', 'temporal_reasoning', 'vision_language_reasoning'],
+                     'container_id': os.environ.get('MODAL_TASK_ID', 'unknown')
+                 }
+             }
+
+             # Output JSON results
+             print("=== JSON_RESULT_START ===")
+             print(json.dumps(result, default=str))
+             print("=== JSON_RESULT_END ===")
+
+             return result
+
+         except Exception as e:
+             processing_time = time.time() - start_time
+             error_result = {
+                 'success': False,
+                 'service': 'isa-vision-qwen2.5',
+                 'operation': 'video_analysis',
+                 'provider': 'ISA',
+                 'error': str(e),
+                 'processing_time': processing_time,
+                 'billing': {
+                     'request_id': f"vid_{self.request_count}_{int(time.time())}",
+                     'gpu_seconds': processing_time,
+                     'estimated_cost_usd': round((processing_time / 3600) * 4.00, 4),  # A100 rate
+                     'gpu_type': 'A100'
+                 }
+             }
+
+             print("=== JSON_RESULT_START ===")
+             print(json.dumps(error_result, default=str))
+             print("=== JSON_RESULT_END ===")
+
+             return error_result
+
+     @modal.method()
+     def multimodal_chat(
+         self,
+         messages: List[Dict[str, Any]],
+         max_tokens: int = 1000,
+         temperature: float = 0.7,
+         top_p: float = 0.9
+     ) -> Dict[str, Any]:
+         """
+         Multimodal chat with images/videos
+
+         Args:
+             messages: List of chat messages with images/videos
+             max_tokens: Maximum tokens to generate
+             temperature: Sampling temperature
+             top_p: Top-p sampling parameter
+
+         Returns:
+             Chat response
+         """
+         start_time = time.time()
+         self.request_count += 1
+
+         try:
+             # Validate model loading status
+             if not self.models_loaded or not self.model:
+                 raise RuntimeError("Qwen2.5-VL model not loaded")
+
+             # Process the conversation
+             text = self.processor.apply_chat_template(
+                 messages, tokenize=False, add_generation_prompt=True
+             )
+
+             # Process vision info
+             image_inputs, video_inputs = self.process_vision_info(messages)
+
+             # Prepare inputs
+             inputs = self.processor(
+                 text=[text],
+                 images=image_inputs,
+                 videos=video_inputs,
+                 padding=True,
+                 return_tensors="pt",
+             )
+             inputs = inputs.to("cuda")
+
+             # Generate response
+             import torch
+             with torch.no_grad():
+                 generated_ids = self.model.generate(
+                     **inputs,
+                     max_new_tokens=max_tokens,
+                     temperature=temperature,
+                     top_p=top_p,
+                     do_sample=True,
+                     pad_token_id=self.processor.tokenizer.eos_token_id
+                 )
+
+             # Extract generated tokens (remove input tokens)
+             generated_ids_trimmed = [
+                 out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+             ]
+
+             # Decode response
+             response_text = self.processor.batch_decode(
+                 generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+             )[0]
+
+             processing_time = time.time() - start_time
+             self.total_processing_time += processing_time
+
+             # Calculate cost (A100 GPU: ~$4.00/hour)
+             gpu_cost = (processing_time / 3600) * 4.00
+
+             result = {
+                 'success': True,
+                 'service': 'isa-vision-qwen2.5',
+                 'operation': 'multimodal_chat',
+                 'provider': 'ISA',
+                 'text': response_text,
+                 'model': 'Qwen2.5-VL-7B-Instruct',
+                 'architecture': 'Vision Transformer + Language Model',
+                 'modality': 'multimodal',
+                 'parameters': {
+                     'max_tokens': max_tokens,
+                     'temperature': temperature,
+                     'top_p': top_p
+                 },
+                 'processing_time': processing_time,
+                 'billing': {
+                     'request_id': f"chat_{self.request_count}_{int(time.time())}",
+                     'gpu_seconds': processing_time,
+                     'estimated_cost_usd': round(gpu_cost, 4),
+                     'gpu_type': 'A100'
+                 },
+                 'model_info': {
+                     'model_name': 'Qwen2.5-VL-7B-Instruct',
+                     'provider': 'ISA',
+                     'architecture': 'Multimodal Vision-Language',
+                     'parameters': '7B',
+                     'gpu': 'A100',
+                     'capabilities': ['image_understanding', 'video_understanding', 'multimodal_chat'],
+                     'container_id': os.environ.get('MODAL_TASK_ID', 'unknown')
+                 }
+             }
+
+             # Output JSON results
+             print("=== JSON_RESULT_START ===")
+             print(json.dumps(result, default=str))
+             print("=== JSON_RESULT_END ===")
+
+             return result
+
+         except Exception as e:
+             processing_time = time.time() - start_time
+             error_result = {
+                 'success': False,
+                 'service': 'isa-vision-qwen2.5',
+                 'operation': 'multimodal_chat',
+                 'provider': 'ISA',
+                 'error': str(e),
+                 'processing_time': processing_time,
+                 'billing': {
+                     'request_id': f"chat_{self.request_count}_{int(time.time())}",
+                     'gpu_seconds': processing_time,
+                     'estimated_cost_usd': round((processing_time / 3600) * 4.00, 4),  # A100 rate
+                     'gpu_type': 'A100'
+                 }
+             }
+
+             print("=== JSON_RESULT_START ===")
+             print(json.dumps(error_result, default=str))
+             print("=== JSON_RESULT_END ===")
+
+             return error_result
+
+     @modal.method()
+     def health_check(self) -> Dict[str, Any]:
+         """Health check endpoint"""
+         return {
+             'status': 'healthy',
+             'service': 'isa-vision-qwen2.5',
+             'provider': 'ISA',
+             'models_loaded': self.models_loaded,
+             'model': 'Qwen2.5-VL-7B-Instruct',
+             'architecture': 'Vision Transformer + Language Model',
+             'timestamp': time.time(),
+             'gpu': 'A100',
+             'memory_usage': '32GB',
+             'request_count': self.request_count,
+             'capabilities': ['image_understanding', 'video_understanding', 'multimodal_chat']
+         }
+
+ # Deployment functions
+ @app.function()
+ def deploy_info():
+     """Deployment information"""
+     return {
+         'service': 'isa-vision-qwen2.5',
+         'version': '1.0.0',
+         'description': 'ISA Qwen2.5-VL service - 7B multimodal vision-language model',
+         'model': 'Qwen2.5-VL-7B-Instruct',
+         'architecture': 'Vision Transformer + Language Model',
+         'gpu': 'A100',  # Matches the A100 configured on the service class
+         'capabilities': ['image_understanding', 'video_understanding'],
+         'deployment_time': time.time()
+     }
+
+ @app.function()
+ def register_service():
+     """Register service to model repository"""
+     try:
+         from isa_model.core.models.model_repo import ModelRepository
+
+         repo = ModelRepository()
+
+         # Register multimodal vision service
+         repo.register_model({
+             'model_id': 'isa-qwen2.5-vl-service',
+             'model_type': 'vision',
+             'provider': 'isa',
+             'endpoint': 'https://isa-vision-qwen2.5.modal.run',
+             'capabilities': ['image_understanding', 'video_understanding', 'multimodal_chat', 'vision_language_reasoning'],
+             'pricing': {'gpu_type': 'A100', 'cost_per_hour': 4.00},  # Matches the A100 configured above
+             'metadata': {
+                 'model': 'Qwen2.5-VL-7B-Instruct',
+                 'architecture': 'Vision Transformer + Language Model',
+                 'parameters': '7B',
+                 'modalities': ['image', 'video', 'text'],
+                 'max_tokens': 1000,
+                 'supported_formats': ['jpg', 'png', 'gif', 'mp4', 'avi']
+             }
+         })
+
+         print("Qwen2.5-VL service registered successfully")
+         return {'status': 'registered'}
+
+     except Exception as e:
+         print(f"Service registration failed: {e}")
+         return {'status': 'failed', 'error': str(e)}
+
+ if __name__ == "__main__":
+     print("ISA Qwen2.5-VL Service - Modal Deployment")
+     print("Deploy with: modal deploy isa_vision_qwen25_service.py")
+     print()
+     print("Model: Qwen2.5-VL-7B-Instruct")
+     print("Architecture: Vision Transformer + Language Model")
+     print("Capabilities: Image & Video Understanding")
+     print("GPU: A100 (40GB)")
+     print()
+     print("Usage:")
+     print("# Image analysis")
+     print("service.analyze_image(image_b64, 'What do you see in this image?')")
+     print("# Video analysis")
+     print("service.analyze_video(video_b64, 'Describe what happens in this video')")