isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (199) hide show
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +40 -17
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/storage/hf_storage.py +1 -1
  26. isa_model/core/types.py +1 -0
  27. isa_model/deployment/__init__.py +5 -48
  28. isa_model/deployment/core/__init__.py +2 -31
  29. isa_model/deployment/core/deployment_manager.py +1278 -370
  30. isa_model/deployment/local/__init__.py +31 -0
  31. isa_model/deployment/local/config.py +248 -0
  32. isa_model/deployment/local/gpu_gateway.py +607 -0
  33. isa_model/deployment/local/health_checker.py +428 -0
  34. isa_model/deployment/local/provider.py +586 -0
  35. isa_model/deployment/local/tensorrt_service.py +621 -0
  36. isa_model/deployment/local/transformers_service.py +644 -0
  37. isa_model/deployment/local/vllm_service.py +527 -0
  38. isa_model/deployment/modal/__init__.py +8 -0
  39. isa_model/deployment/modal/config.py +136 -0
  40. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  41. isa_model/deployment/modal/services/__init__.py +3 -0
  42. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  43. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  44. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  45. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  46. isa_model/deployment/modal/services/video/__init__.py +1 -0
  47. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  48. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  49. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  50. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  51. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  52. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  53. isa_model/deployment/storage/__init__.py +5 -0
  54. isa_model/deployment/storage/deployment_repository.py +824 -0
  55. isa_model/deployment/triton/__init__.py +10 -0
  56. isa_model/deployment/triton/config.py +196 -0
  57. isa_model/deployment/triton/configs/__init__.py +1 -0
  58. isa_model/deployment/triton/provider.py +512 -0
  59. isa_model/deployment/triton/scripts/__init__.py +1 -0
  60. isa_model/deployment/triton/templates/__init__.py +1 -0
  61. isa_model/inference/__init__.py +47 -1
  62. isa_model/inference/ai_factory.py +137 -10
  63. isa_model/inference/legacy_services/__init__.py +21 -0
  64. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  65. isa_model/inference/legacy_services/model_service.py +573 -0
  66. isa_model/inference/legacy_services/model_serving.py +717 -0
  67. isa_model/inference/legacy_services/model_training.py +561 -0
  68. isa_model/inference/models/__init__.py +21 -0
  69. isa_model/inference/models/inference_config.py +551 -0
  70. isa_model/inference/models/inference_record.py +675 -0
  71. isa_model/inference/models/performance_models.py +714 -0
  72. isa_model/inference/repositories/__init__.py +9 -0
  73. isa_model/inference/repositories/inference_repository.py +828 -0
  74. isa_model/inference/services/audio/base_stt_service.py +184 -11
  75. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  76. isa_model/inference/services/custom_model_manager.py +277 -0
  77. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  78. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  79. isa_model/inference/services/llm/__init__.py +10 -2
  80. isa_model/inference/services/llm/base_llm_service.py +335 -24
  81. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  82. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  83. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  84. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  85. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  86. isa_model/inference/services/llm/local_llm_service.py +747 -0
  87. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  88. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  89. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  90. isa_model/inference/services/vision/__init__.py +22 -1
  91. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  92. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  93. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  94. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  95. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  96. isa_model/serving/api/cache_manager.py +245 -0
  97. isa_model/serving/api/dependencies/__init__.py +1 -0
  98. isa_model/serving/api/dependencies/auth.py +194 -0
  99. isa_model/serving/api/dependencies/database.py +139 -0
  100. isa_model/serving/api/error_handlers.py +284 -0
  101. isa_model/serving/api/fastapi_server.py +172 -22
  102. isa_model/serving/api/middleware/auth.py +8 -2
  103. isa_model/serving/api/middleware/security.py +23 -33
  104. isa_model/serving/api/middleware/tenant_context.py +414 -0
  105. isa_model/serving/api/routes/analytics.py +4 -1
  106. isa_model/serving/api/routes/config.py +645 -0
  107. isa_model/serving/api/routes/deployment_billing.py +315 -0
  108. isa_model/serving/api/routes/deployments.py +138 -2
  109. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  110. isa_model/serving/api/routes/health.py +32 -12
  111. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  112. isa_model/serving/api/routes/local_deployments.py +448 -0
  113. isa_model/serving/api/routes/tenants.py +575 -0
  114. isa_model/serving/api/routes/unified.py +680 -18
  115. isa_model/serving/api/routes/webhooks.py +479 -0
  116. isa_model/serving/api/startup.py +68 -54
  117. isa_model/utils/gpu_utils.py +311 -0
  118. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
  119. isa_model-0.4.3.dist-info/RECORD +193 -0
  120. isa_model/core/storage/minio_storage.py +0 -0
  121. isa_model/deployment/cloud/__init__.py +0 -9
  122. isa_model/deployment/cloud/modal/__init__.py +0 -10
  123. isa_model/deployment/core/deployment_config.py +0 -356
  124. isa_model/deployment/core/isa_deployment_service.py +0 -401
  125. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  126. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  127. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  128. isa_model/deployment/runtime/deployed_service.py +0 -338
  129. isa_model/deployment/services/__init__.py +0 -9
  130. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  131. isa_model/deployment/services/model_service.py +0 -332
  132. isa_model/deployment/services/service_monitor.py +0 -356
  133. isa_model/deployment/services/service_registry.py +0 -527
  134. isa_model/eval/__init__.py +0 -92
  135. isa_model/eval/benchmarks/__init__.py +0 -27
  136. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  137. isa_model/eval/benchmarks.py +0 -701
  138. isa_model/eval/config/__init__.py +0 -10
  139. isa_model/eval/config/evaluation_config.py +0 -108
  140. isa_model/eval/evaluators/__init__.py +0 -24
  141. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  142. isa_model/eval/evaluators/base_evaluator.py +0 -503
  143. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  144. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  145. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  146. isa_model/eval/example_evaluation.py +0 -395
  147. isa_model/eval/factory.py +0 -798
  148. isa_model/eval/infrastructure/__init__.py +0 -24
  149. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  150. isa_model/eval/isa_benchmarks.py +0 -700
  151. isa_model/eval/isa_integration.py +0 -582
  152. isa_model/eval/metrics.py +0 -951
  153. isa_model/eval/tests/unit/test_basic.py +0 -396
  154. isa_model/serving/api/routes/evaluations.py +0 -579
  155. isa_model/training/__init__.py +0 -168
  156. isa_model/training/annotation/annotation_schema.py +0 -47
  157. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  158. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  159. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  160. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  161. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  162. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  163. isa_model/training/annotation/views/annotation_controller.py +0 -158
  164. isa_model/training/cloud/__init__.py +0 -22
  165. isa_model/training/cloud/job_orchestrator.py +0 -402
  166. isa_model/training/cloud/runpod_trainer.py +0 -454
  167. isa_model/training/cloud/storage_manager.py +0 -482
  168. isa_model/training/core/__init__.py +0 -26
  169. isa_model/training/core/config.py +0 -181
  170. isa_model/training/core/dataset.py +0 -222
  171. isa_model/training/core/trainer.py +0 -720
  172. isa_model/training/core/utils.py +0 -213
  173. isa_model/training/examples/intelligent_training_example.py +0 -281
  174. isa_model/training/factory.py +0 -424
  175. isa_model/training/intelligent/__init__.py +0 -25
  176. isa_model/training/intelligent/decision_engine.py +0 -643
  177. isa_model/training/intelligent/intelligent_factory.py +0 -888
  178. isa_model/training/intelligent/knowledge_base.py +0 -751
  179. isa_model/training/intelligent/resource_optimizer.py +0 -839
  180. isa_model/training/intelligent/task_classifier.py +0 -576
  181. isa_model/training/storage/__init__.py +0 -24
  182. isa_model/training/storage/core_integration.py +0 -439
  183. isa_model/training/storage/training_repository.py +0 -552
  184. isa_model/training/storage/training_storage.py +0 -628
  185. isa_model-0.4.0.dist-info/RECORD +0 -182
  186. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  187. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  188. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  189. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  190. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  191. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  192. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  193. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  194. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  195. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  196. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  197. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  198. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  199. {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
@@ -1,395 +0,0 @@
1
- """
2
- Example evaluation script demonstrating the ISA Model evaluation framework.
3
-
4
- Shows how to:
5
- 1. Evaluate standard benchmarks (MMLU, HellaSwag, etc.)
6
- 2. Test ISA custom services
7
- 3. Run multimodal evaluations
8
- 4. Perform comprehensive service benchmarking
9
- """
10
-
11
- import asyncio
12
- import logging
13
- import json
14
- from pathlib import Path
15
- from typing import Dict, Any, Optional
16
-
17
- # Import evaluation components
18
- from .benchmarks import create_mmlu_benchmark, create_gsm8k_benchmark
19
- from .benchmarks.multimodal_datasets import create_vqa_dataset, create_coco_captions_dataset
20
- from .evaluators import LLMEvaluator, VisionEvaluator, AudioEvaluator, EmbeddingEvaluator
21
- from .isa_integration import ISAModelInterface
22
- from .isa_benchmarks import run_isa_service_benchmark
23
- from .factory import EvaluationFactory
24
-
25
- # Setup logging
26
- logging.basicConfig(level=logging.INFO)
27
- logger = logging.getLogger(__name__)
28
-
29
-
30
- async def run_standard_llm_evaluation():
31
- """Example: Run standard LLM evaluation on MMLU and GSM8K."""
32
- logger.info("🚀 Running Standard LLM Evaluation")
33
-
34
- # Create evaluator
35
- evaluator = LLMEvaluator(config={
36
- "max_concurrent_requests": 5,
37
- "batch_size": 10
38
- })
39
-
40
- # Create ISA model interface
41
- model_interface = ISAModelInterface()
42
-
43
- # Test MMLU
44
- logger.info("📚 Testing MMLU benchmark")
45
- mmlu_benchmark = create_mmlu_benchmark(subjects=["anatomy", "astronomy", "business_ethics"])
46
- mmlu_data = mmlu_benchmark.load_data(max_samples=20)
47
-
48
- mmlu_result = await evaluator.evaluate(
49
- model_interface=model_interface,
50
- dataset=mmlu_data,
51
- dataset_name="MMLU",
52
- model_name="gpt-4.1-nano"
53
- )
54
-
55
- logger.info(f"MMLU Results: {mmlu_result.get_summary()}")
56
-
57
- # Test GSM8K
58
- logger.info("🧮 Testing GSM8K benchmark")
59
- gsm8k_benchmark = create_gsm8k_benchmark()
60
- gsm8k_data = gsm8k_benchmark.load_data(max_samples=10)
61
-
62
- gsm8k_result = await evaluator.evaluate(
63
- model_interface=model_interface,
64
- dataset=gsm8k_data,
65
- dataset_name="GSM8K",
66
- model_name="gpt-4.1-nano"
67
- )
68
-
69
- logger.info(f"GSM8K Results: {gsm8k_result.get_summary()}")
70
-
71
- return {
72
- "mmlu": mmlu_result.to_dict(),
73
- "gsm8k": gsm8k_result.to_dict()
74
- }
75
-
76
-
77
- async def run_vision_evaluation():
78
- """Example: Run vision evaluation with VQA and image captioning."""
79
- logger.info("👁️ Running Vision Evaluation")
80
-
81
- # Create vision evaluator
82
- evaluator = VisionEvaluator(config={
83
- "task_type": "vqa",
84
- "max_image_size": (1024, 1024)
85
- })
86
-
87
- # Create ISA model interface
88
- model_interface = ISAModelInterface()
89
-
90
- # Test VQA
91
- logger.info("❓ Testing VQA dataset")
92
- vqa_dataset = create_vqa_dataset()
93
- vqa_data = vqa_dataset.load_data(max_samples=10, use_real_data=False) # Use placeholder for demo
94
-
95
- vqa_result = await evaluator.evaluate(
96
- model_interface=model_interface,
97
- dataset=vqa_data,
98
- dataset_name="VQA_v2",
99
- model_name="gpt-4.1-mini"
100
- )
101
-
102
- logger.info(f"VQA Results: {vqa_result.get_summary()}")
103
-
104
- # Test Image Captioning
105
- logger.info("🖼️ Testing Image Captioning")
106
- caption_evaluator = VisionEvaluator(config={"task_type": "caption"})
107
-
108
- coco_dataset = create_coco_captions_dataset()
109
- caption_data = coco_dataset.load_data(max_samples=5, use_real_data=False)
110
-
111
- caption_result = await caption_evaluator.evaluate(
112
- model_interface=model_interface,
113
- dataset=caption_data,
114
- dataset_name="COCO_Captions",
115
- model_name="gpt-4.1-mini"
116
- )
117
-
118
- logger.info(f"Caption Results: {caption_result.get_summary()}")
119
-
120
- return {
121
- "vqa": vqa_result.to_dict(),
122
- "captioning": caption_result.to_dict()
123
- }
124
-
125
-
126
- async def run_audio_evaluation():
127
- """Example: Run audio evaluation for STT and emotion recognition."""
128
- logger.info("🎵 Running Audio Evaluation")
129
-
130
- # STT Evaluation
131
- stt_evaluator = AudioEvaluator(config={
132
- "task_type": "stt",
133
- "normalize_text": True,
134
- "case_sensitive": False
135
- })
136
-
137
- model_interface = ISAModelInterface()
138
-
139
- # Create mock STT dataset
140
- stt_data = [
141
- {
142
- "audio": "mock_audio_1.wav",
143
- "expected_output": "The quick brown fox jumps over the lazy dog",
144
- "task_type": "stt",
145
- "id": "stt_test_1"
146
- },
147
- {
148
- "audio": "mock_audio_2.wav",
149
- "expected_output": "Machine learning is transforming artificial intelligence",
150
- "task_type": "stt",
151
- "id": "stt_test_2"
152
- }
153
- ]
154
-
155
- stt_result = await stt_evaluator.evaluate(
156
- model_interface=model_interface,
157
- dataset=stt_data,
158
- dataset_name="LibriSpeech_Test",
159
- model_name="isa_audio_sota_service"
160
- )
161
-
162
- logger.info(f"STT Results: {stt_result.get_summary()}")
163
-
164
- # Emotion Recognition Evaluation
165
- emotion_evaluator = AudioEvaluator(config={"task_type": "emotion"})
166
-
167
- emotion_data = [
168
- {
169
- "audio": "mock_emotion_1.wav",
170
- "expected_output": "happy",
171
- "task_type": "emotion",
172
- "id": "emotion_test_1"
173
- },
174
- {
175
- "audio": "mock_emotion_2.wav",
176
- "expected_output": "sad",
177
- "task_type": "emotion",
178
- "id": "emotion_test_2"
179
- }
180
- ]
181
-
182
- emotion_result = await emotion_evaluator.evaluate(
183
- model_interface=model_interface,
184
- dataset=emotion_data,
185
- dataset_name="Emotion_Test",
186
- model_name="isa_audio_sota_service"
187
- )
188
-
189
- logger.info(f"Emotion Results: {emotion_result.get_summary()}")
190
-
191
- return {
192
- "stt": stt_result.to_dict(),
193
- "emotion": emotion_result.to_dict()
194
- }
195
-
196
-
197
- async def run_embedding_evaluation():
198
- """Example: Run embedding evaluation for similarity and retrieval."""
199
- logger.info("🔍 Running Embedding Evaluation")
200
-
201
- # Similarity Evaluation
202
- similarity_evaluator = EmbeddingEvaluator(config={
203
- "task_type": "similarity",
204
- "similarity_metric": "cosine"
205
- })
206
-
207
- model_interface = ISAModelInterface()
208
-
209
- # Create similarity dataset
210
- similarity_data = [
211
- {
212
- "text1": "The cat is sleeping on the couch",
213
- "text2": "A feline is resting on the sofa",
214
- "expected_output": 0.8, # High similarity
215
- "task_type": "similarity",
216
- "id": "sim_test_1"
217
- },
218
- {
219
- "text1": "I love pizza",
220
- "text2": "The weather is sunny today",
221
- "expected_output": 0.1, # Low similarity
222
- "task_type": "similarity",
223
- "id": "sim_test_2"
224
- }
225
- ]
226
-
227
- similarity_result = await similarity_evaluator.evaluate(
228
- model_interface=model_interface,
229
- dataset=similarity_data,
230
- dataset_name="Similarity_Test",
231
- model_name="text-embedding-3-small"
232
- )
233
-
234
- logger.info(f"Similarity Results: {similarity_result.get_summary()}")
235
-
236
- # Retrieval Evaluation
237
- retrieval_evaluator = EmbeddingEvaluator(config={
238
- "task_type": "retrieval",
239
- "k_values": [1, 3, 5]
240
- })
241
-
242
- retrieval_data = [
243
- {
244
- "query": "machine learning algorithms",
245
- "documents": [
246
- "Neural networks are a type of machine learning algorithm",
247
- "The weather is nice today",
248
- "Deep learning uses artificial neural networks",
249
- "I like to cook pasta"
250
- ],
251
- "expected_output": [1, 0, 1, 0], # Relevance labels
252
- "task_type": "retrieval",
253
- "id": "retrieval_test_1"
254
- }
255
- ]
256
-
257
- retrieval_result = await retrieval_evaluator.evaluate(
258
- model_interface=model_interface,
259
- dataset=retrieval_data,
260
- dataset_name="Retrieval_Test",
261
- model_name="text-embedding-3-small"
262
- )
263
-
264
- logger.info(f"Retrieval Results: {retrieval_result.get_summary()}")
265
-
266
- return {
267
- "similarity": similarity_result.to_dict(),
268
- "retrieval": retrieval_result.to_dict()
269
- }
270
-
271
-
272
- async def run_isa_service_benchmark_example():
273
- """Example: Run comprehensive ISA service benchmarking."""
274
- logger.info("⚡ Running ISA Service Benchmark")
275
-
276
- benchmark_config = {
277
- "test_duration_seconds": 30, # Short test for demo
278
- "max_concurrent_requests": 5,
279
- "warmup_requests": 3,
280
- "services_to_test": [
281
- "isa_ocr_service",
282
- "isa_audio_sota_service",
283
- "isa_embedding_reranking_service"
284
- ]
285
- }
286
-
287
- benchmark_results = await run_isa_service_benchmark(benchmark_config)
288
-
289
- logger.info("📊 ISA Service Benchmark Summary:")
290
- summary = benchmark_results.get("summary", {})
291
- logger.info(f"Services tested: {summary.get('total_services_tested', 0)}")
292
- logger.info(f"Successful services: {summary.get('successful_services', 0)}")
293
-
294
- # Log performance highlights
295
- comparative = benchmark_results.get("comparative_analysis", {})
296
- recommendations = comparative.get("recommendations", [])
297
- for rec in recommendations:
298
- logger.info(f"💡 {rec}")
299
-
300
- return benchmark_results
301
-
302
-
303
- async def run_factory_evaluation():
304
- """Example: Use EvaluationFactory for simplified multi-model comparison."""
305
- logger.info("🏭 Running Factory-based Multi-Model Evaluation")
306
-
307
- factory = EvaluationFactory()
308
-
309
- # Define models to compare
310
- models = [
311
- {"name": "gpt-4.1-nano", "provider": "openai"},
312
- {"name": "llama3.2:3b-instruct-fp16", "provider": "ollama"},
313
- {"name": "claude-sonnet-4-20250514", "provider": "yyds"}
314
- ]
315
-
316
- # Create simple test dataset
317
- test_data = [
318
- {
319
- "input": "What is 2+2?",
320
- "output": "4",
321
- "id": "math_test_1"
322
- },
323
- {
324
- "input": "Name the capital of France.",
325
- "output": "Paris",
326
- "id": "geography_test_1"
327
- }
328
- ]
329
-
330
- # Run comparison
331
- comparison_results = await factory.compare_models(
332
- models=models,
333
- dataset=test_data,
334
- evaluator_type="llm",
335
- metrics=["accuracy", "f1_score", "latency"]
336
- )
337
-
338
- logger.info("📈 Model Comparison Results:")
339
- for model_name, results in comparison_results.items():
340
- metrics = results.get("metrics", {})
341
- logger.info(f"{model_name}: Accuracy={metrics.get('accuracy', 0):.3f}, "
342
- f"F1={metrics.get('f1_score', 0):.3f}")
343
-
344
- return comparison_results
345
-
346
-
347
- async def save_results(results: Dict[str, Any], output_file: str = "evaluation_results.json"):
348
- """Save evaluation results to file."""
349
- output_path = Path(output_file)
350
-
351
- with open(output_path, 'w', encoding='utf-8') as f:
352
- json.dump(results, f, indent=2, ensure_ascii=False, default=str)
353
-
354
- logger.info(f"💾 Results saved to {output_path}")
355
-
356
-
357
- async def main():
358
- """Run comprehensive evaluation examples."""
359
- logger.info("🔬 Starting ISA Model Evaluation Framework Demo")
360
-
361
- results = {}
362
-
363
- try:
364
- # Run all evaluation examples
365
- results["llm_evaluation"] = await run_standard_llm_evaluation()
366
- results["vision_evaluation"] = await run_vision_evaluation()
367
- results["audio_evaluation"] = await run_audio_evaluation()
368
- results["embedding_evaluation"] = await run_embedding_evaluation()
369
- results["isa_benchmarks"] = await run_isa_service_benchmark_example()
370
- results["factory_comparison"] = await run_factory_evaluation()
371
-
372
- # Save results
373
- await save_results(results)
374
-
375
- logger.info("✅ All evaluations completed successfully!")
376
-
377
- # Print summary
378
- logger.info("\n📋 Evaluation Summary:")
379
- logger.info(f"- LLM evaluations: {len(results['llm_evaluation'])} benchmarks")
380
- logger.info(f"- Vision evaluations: {len(results['vision_evaluation'])} tasks")
381
- logger.info(f"- Audio evaluations: {len(results['audio_evaluation'])} tasks")
382
- logger.info(f"- Embedding evaluations: {len(results['embedding_evaluation'])} tasks")
383
- logger.info(f"- ISA service benchmarks: {results['isa_benchmarks']['summary']['total_services_tested']} services")
384
- logger.info(f"- Model comparisons: {len(results['factory_comparison'])} models")
385
-
386
- except Exception as e:
387
- logger.error(f"❌ Evaluation failed: {e}")
388
- raise
389
-
390
- return results
391
-
392
-
393
- if __name__ == "__main__":
394
- # Run the evaluation demo
395
- asyncio.run(main())