isa-model 0.4.0__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. isa_model/client.py +466 -43
  2. isa_model/core/cache/redis_cache.py +12 -3
  3. isa_model/core/config/config_manager.py +230 -3
  4. isa_model/core/config.py +90 -0
  5. isa_model/core/database/direct_db_client.py +114 -0
  6. isa_model/core/database/migration_manager.py +563 -0
  7. isa_model/core/database/migrations.py +21 -1
  8. isa_model/core/database/supabase_client.py +154 -19
  9. isa_model/core/dependencies.py +316 -0
  10. isa_model/core/discovery/__init__.py +19 -0
  11. isa_model/core/discovery/consul_discovery.py +190 -0
  12. isa_model/core/logging/__init__.py +54 -0
  13. isa_model/core/logging/influx_logger.py +523 -0
  14. isa_model/core/logging/loki_logger.py +160 -0
  15. isa_model/core/models/__init__.py +27 -18
  16. isa_model/core/models/config_models.py +625 -0
  17. isa_model/core/models/deployment_billing_tracker.py +430 -0
  18. isa_model/core/models/model_manager.py +35 -80
  19. isa_model/core/models/model_metadata.py +690 -0
  20. isa_model/core/models/model_repo.py +174 -18
  21. isa_model/core/models/system_models.py +857 -0
  22. isa_model/core/repositories/__init__.py +9 -0
  23. isa_model/core/repositories/config_repository.py +912 -0
  24. isa_model/core/services/intelligent_model_selector.py +399 -21
  25. isa_model/core/types.py +1 -0
  26. isa_model/deployment/__init__.py +5 -48
  27. isa_model/deployment/core/__init__.py +2 -31
  28. isa_model/deployment/core/deployment_manager.py +1278 -370
  29. isa_model/deployment/modal/__init__.py +8 -0
  30. isa_model/deployment/modal/config.py +136 -0
  31. isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
  32. isa_model/deployment/modal/services/__init__.py +3 -0
  33. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  34. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  35. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  36. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  37. isa_model/deployment/modal/services/video/__init__.py +1 -0
  38. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  39. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  40. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  41. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  42. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  43. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  44. isa_model/deployment/storage/__init__.py +5 -0
  45. isa_model/deployment/storage/deployment_repository.py +824 -0
  46. isa_model/deployment/triton/__init__.py +10 -0
  47. isa_model/deployment/triton/config.py +196 -0
  48. isa_model/deployment/triton/configs/__init__.py +1 -0
  49. isa_model/deployment/triton/provider.py +512 -0
  50. isa_model/deployment/triton/scripts/__init__.py +1 -0
  51. isa_model/deployment/triton/templates/__init__.py +1 -0
  52. isa_model/inference/__init__.py +47 -1
  53. isa_model/inference/ai_factory.py +137 -10
  54. isa_model/inference/legacy_services/__init__.py +21 -0
  55. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  56. isa_model/inference/legacy_services/model_service.py +573 -0
  57. isa_model/inference/legacy_services/model_serving.py +717 -0
  58. isa_model/inference/legacy_services/model_training.py +561 -0
  59. isa_model/inference/models/__init__.py +21 -0
  60. isa_model/inference/models/inference_config.py +551 -0
  61. isa_model/inference/models/inference_record.py +675 -0
  62. isa_model/inference/models/performance_models.py +714 -0
  63. isa_model/inference/repositories/__init__.py +9 -0
  64. isa_model/inference/repositories/inference_repository.py +828 -0
  65. isa_model/inference/services/audio/base_stt_service.py +184 -11
  66. isa_model/inference/services/audio/openai_stt_service.py +22 -6
  67. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  68. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  69. isa_model/inference/services/llm/__init__.py +10 -2
  70. isa_model/inference/services/llm/base_llm_service.py +335 -24
  71. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  72. isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
  73. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  74. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  75. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  76. isa_model/inference/services/llm/ollama_llm_service.py +9 -2
  77. isa_model/inference/services/llm/openai_llm_service.py +33 -16
  78. isa_model/inference/services/llm/yyds_llm_service.py +8 -2
  79. isa_model/inference/services/vision/__init__.py +22 -1
  80. isa_model/inference/services/vision/helpers/image_utils.py +8 -5
  81. isa_model/inference/services/vision/isa_vision_service.py +65 -4
  82. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  83. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  84. isa_model/serving/api/cache_manager.py +245 -0
  85. isa_model/serving/api/dependencies/__init__.py +1 -0
  86. isa_model/serving/api/dependencies/auth.py +194 -0
  87. isa_model/serving/api/dependencies/database.py +139 -0
  88. isa_model/serving/api/error_handlers.py +284 -0
  89. isa_model/serving/api/fastapi_server.py +172 -22
  90. isa_model/serving/api/middleware/auth.py +8 -2
  91. isa_model/serving/api/middleware/security.py +23 -33
  92. isa_model/serving/api/middleware/tenant_context.py +414 -0
  93. isa_model/serving/api/routes/analytics.py +4 -1
  94. isa_model/serving/api/routes/config.py +645 -0
  95. isa_model/serving/api/routes/deployment_billing.py +315 -0
  96. isa_model/serving/api/routes/deployments.py +138 -2
  97. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  98. isa_model/serving/api/routes/health.py +32 -12
  99. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  100. isa_model/serving/api/routes/local_deployments.py +448 -0
  101. isa_model/serving/api/routes/tenants.py +575 -0
  102. isa_model/serving/api/routes/unified.py +680 -18
  103. isa_model/serving/api/routes/webhooks.py +479 -0
  104. isa_model/serving/api/startup.py +68 -54
  105. isa_model/utils/gpu_utils.py +311 -0
  106. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/METADATA +71 -24
  107. isa_model-0.4.4.dist-info/RECORD +180 -0
  108. isa_model/core/security/secrets.py +0 -358
  109. isa_model/core/storage/hf_storage.py +0 -419
  110. isa_model/core/storage/minio_storage.py +0 -0
  111. isa_model/deployment/cloud/__init__.py +0 -9
  112. isa_model/deployment/cloud/modal/__init__.py +0 -10
  113. isa_model/deployment/core/deployment_config.py +0 -356
  114. isa_model/deployment/core/isa_deployment_service.py +0 -401
  115. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  116. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  117. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  118. isa_model/deployment/runtime/deployed_service.py +0 -338
  119. isa_model/deployment/services/__init__.py +0 -9
  120. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  121. isa_model/deployment/services/model_service.py +0 -332
  122. isa_model/deployment/services/service_monitor.py +0 -356
  123. isa_model/deployment/services/service_registry.py +0 -527
  124. isa_model/eval/__init__.py +0 -92
  125. isa_model/eval/benchmarks/__init__.py +0 -27
  126. isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
  127. isa_model/eval/benchmarks.py +0 -701
  128. isa_model/eval/config/__init__.py +0 -10
  129. isa_model/eval/config/evaluation_config.py +0 -108
  130. isa_model/eval/evaluators/__init__.py +0 -24
  131. isa_model/eval/evaluators/audio_evaluator.py +0 -727
  132. isa_model/eval/evaluators/base_evaluator.py +0 -503
  133. isa_model/eval/evaluators/embedding_evaluator.py +0 -742
  134. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  135. isa_model/eval/evaluators/vision_evaluator.py +0 -564
  136. isa_model/eval/example_evaluation.py +0 -395
  137. isa_model/eval/factory.py +0 -798
  138. isa_model/eval/infrastructure/__init__.py +0 -24
  139. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  140. isa_model/eval/isa_benchmarks.py +0 -700
  141. isa_model/eval/isa_integration.py +0 -582
  142. isa_model/eval/metrics.py +0 -951
  143. isa_model/eval/tests/unit/test_basic.py +0 -396
  144. isa_model/serving/api/routes/evaluations.py +0 -579
  145. isa_model/training/__init__.py +0 -168
  146. isa_model/training/annotation/annotation_schema.py +0 -47
  147. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  148. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  149. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  150. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  151. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  152. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  153. isa_model/training/annotation/views/annotation_controller.py +0 -158
  154. isa_model/training/cloud/__init__.py +0 -22
  155. isa_model/training/cloud/job_orchestrator.py +0 -402
  156. isa_model/training/cloud/runpod_trainer.py +0 -454
  157. isa_model/training/cloud/storage_manager.py +0 -482
  158. isa_model/training/core/__init__.py +0 -26
  159. isa_model/training/core/config.py +0 -181
  160. isa_model/training/core/dataset.py +0 -222
  161. isa_model/training/core/trainer.py +0 -720
  162. isa_model/training/core/utils.py +0 -213
  163. isa_model/training/examples/intelligent_training_example.py +0 -281
  164. isa_model/training/factory.py +0 -424
  165. isa_model/training/intelligent/__init__.py +0 -25
  166. isa_model/training/intelligent/decision_engine.py +0 -643
  167. isa_model/training/intelligent/intelligent_factory.py +0 -888
  168. isa_model/training/intelligent/knowledge_base.py +0 -751
  169. isa_model/training/intelligent/resource_optimizer.py +0 -839
  170. isa_model/training/intelligent/task_classifier.py +0 -576
  171. isa_model/training/storage/__init__.py +0 -24
  172. isa_model/training/storage/core_integration.py +0 -439
  173. isa_model/training/storage/training_repository.py +0 -552
  174. isa_model/training/storage/training_storage.py +0 -628
  175. isa_model-0.4.0.dist-info/RECORD +0 -182
  176. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
  177. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
  178. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
  179. /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
  180. /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
  181. /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
  182. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
  183. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
  184. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
  185. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
  186. /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
  187. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  188. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
  189. {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0
@@ -1,395 +0,0 @@
1
- """
2
- Example evaluation script demonstrating the ISA Model evaluation framework.
3
-
4
- Shows how to:
5
- 1. Evaluate standard benchmarks (MMLU, HellaSwag, etc.)
6
- 2. Test ISA custom services
7
- 3. Run multimodal evaluations
8
- 4. Perform comprehensive service benchmarking
9
- """
10
-
11
- import asyncio
12
- import logging
13
- import json
14
- from pathlib import Path
15
- from typing import Dict, Any, Optional
16
-
17
- # Import evaluation components
18
- from .benchmarks import create_mmlu_benchmark, create_gsm8k_benchmark
19
- from .benchmarks.multimodal_datasets import create_vqa_dataset, create_coco_captions_dataset
20
- from .evaluators import LLMEvaluator, VisionEvaluator, AudioEvaluator, EmbeddingEvaluator
21
- from .isa_integration import ISAModelInterface
22
- from .isa_benchmarks import run_isa_service_benchmark
23
- from .factory import EvaluationFactory
24
-
25
- # Setup logging
26
- logging.basicConfig(level=logging.INFO)
27
- logger = logging.getLogger(__name__)
28
-
29
-
30
- async def run_standard_llm_evaluation():
31
- """Example: Run standard LLM evaluation on MMLU and GSM8K."""
32
- logger.info("🚀 Running Standard LLM Evaluation")
33
-
34
- # Create evaluator
35
- evaluator = LLMEvaluator(config={
36
- "max_concurrent_requests": 5,
37
- "batch_size": 10
38
- })
39
-
40
- # Create ISA model interface
41
- model_interface = ISAModelInterface()
42
-
43
- # Test MMLU
44
- logger.info("📚 Testing MMLU benchmark")
45
- mmlu_benchmark = create_mmlu_benchmark(subjects=["anatomy", "astronomy", "business_ethics"])
46
- mmlu_data = mmlu_benchmark.load_data(max_samples=20)
47
-
48
- mmlu_result = await evaluator.evaluate(
49
- model_interface=model_interface,
50
- dataset=mmlu_data,
51
- dataset_name="MMLU",
52
- model_name="gpt-4.1-nano"
53
- )
54
-
55
- logger.info(f"MMLU Results: {mmlu_result.get_summary()}")
56
-
57
- # Test GSM8K
58
- logger.info("🧮 Testing GSM8K benchmark")
59
- gsm8k_benchmark = create_gsm8k_benchmark()
60
- gsm8k_data = gsm8k_benchmark.load_data(max_samples=10)
61
-
62
- gsm8k_result = await evaluator.evaluate(
63
- model_interface=model_interface,
64
- dataset=gsm8k_data,
65
- dataset_name="GSM8K",
66
- model_name="gpt-4.1-nano"
67
- )
68
-
69
- logger.info(f"GSM8K Results: {gsm8k_result.get_summary()}")
70
-
71
- return {
72
- "mmlu": mmlu_result.to_dict(),
73
- "gsm8k": gsm8k_result.to_dict()
74
- }
75
-
76
-
77
- async def run_vision_evaluation():
78
- """Example: Run vision evaluation with VQA and image captioning."""
79
- logger.info("👁️ Running Vision Evaluation")
80
-
81
- # Create vision evaluator
82
- evaluator = VisionEvaluator(config={
83
- "task_type": "vqa",
84
- "max_image_size": (1024, 1024)
85
- })
86
-
87
- # Create ISA model interface
88
- model_interface = ISAModelInterface()
89
-
90
- # Test VQA
91
- logger.info("❓ Testing VQA dataset")
92
- vqa_dataset = create_vqa_dataset()
93
- vqa_data = vqa_dataset.load_data(max_samples=10, use_real_data=False) # Use placeholder for demo
94
-
95
- vqa_result = await evaluator.evaluate(
96
- model_interface=model_interface,
97
- dataset=vqa_data,
98
- dataset_name="VQA_v2",
99
- model_name="gpt-4.1-mini"
100
- )
101
-
102
- logger.info(f"VQA Results: {vqa_result.get_summary()}")
103
-
104
- # Test Image Captioning
105
- logger.info("🖼️ Testing Image Captioning")
106
- caption_evaluator = VisionEvaluator(config={"task_type": "caption"})
107
-
108
- coco_dataset = create_coco_captions_dataset()
109
- caption_data = coco_dataset.load_data(max_samples=5, use_real_data=False)
110
-
111
- caption_result = await caption_evaluator.evaluate(
112
- model_interface=model_interface,
113
- dataset=caption_data,
114
- dataset_name="COCO_Captions",
115
- model_name="gpt-4.1-mini"
116
- )
117
-
118
- logger.info(f"Caption Results: {caption_result.get_summary()}")
119
-
120
- return {
121
- "vqa": vqa_result.to_dict(),
122
- "captioning": caption_result.to_dict()
123
- }
124
-
125
-
126
- async def run_audio_evaluation():
127
- """Example: Run audio evaluation for STT and emotion recognition."""
128
- logger.info("🎵 Running Audio Evaluation")
129
-
130
- # STT Evaluation
131
- stt_evaluator = AudioEvaluator(config={
132
- "task_type": "stt",
133
- "normalize_text": True,
134
- "case_sensitive": False
135
- })
136
-
137
- model_interface = ISAModelInterface()
138
-
139
- # Create mock STT dataset
140
- stt_data = [
141
- {
142
- "audio": "mock_audio_1.wav",
143
- "expected_output": "The quick brown fox jumps over the lazy dog",
144
- "task_type": "stt",
145
- "id": "stt_test_1"
146
- },
147
- {
148
- "audio": "mock_audio_2.wav",
149
- "expected_output": "Machine learning is transforming artificial intelligence",
150
- "task_type": "stt",
151
- "id": "stt_test_2"
152
- }
153
- ]
154
-
155
- stt_result = await stt_evaluator.evaluate(
156
- model_interface=model_interface,
157
- dataset=stt_data,
158
- dataset_name="LibriSpeech_Test",
159
- model_name="isa_audio_sota_service"
160
- )
161
-
162
- logger.info(f"STT Results: {stt_result.get_summary()}")
163
-
164
- # Emotion Recognition Evaluation
165
- emotion_evaluator = AudioEvaluator(config={"task_type": "emotion"})
166
-
167
- emotion_data = [
168
- {
169
- "audio": "mock_emotion_1.wav",
170
- "expected_output": "happy",
171
- "task_type": "emotion",
172
- "id": "emotion_test_1"
173
- },
174
- {
175
- "audio": "mock_emotion_2.wav",
176
- "expected_output": "sad",
177
- "task_type": "emotion",
178
- "id": "emotion_test_2"
179
- }
180
- ]
181
-
182
- emotion_result = await emotion_evaluator.evaluate(
183
- model_interface=model_interface,
184
- dataset=emotion_data,
185
- dataset_name="Emotion_Test",
186
- model_name="isa_audio_sota_service"
187
- )
188
-
189
- logger.info(f"Emotion Results: {emotion_result.get_summary()}")
190
-
191
- return {
192
- "stt": stt_result.to_dict(),
193
- "emotion": emotion_result.to_dict()
194
- }
195
-
196
-
197
- async def run_embedding_evaluation():
198
- """Example: Run embedding evaluation for similarity and retrieval."""
199
- logger.info("🔍 Running Embedding Evaluation")
200
-
201
- # Similarity Evaluation
202
- similarity_evaluator = EmbeddingEvaluator(config={
203
- "task_type": "similarity",
204
- "similarity_metric": "cosine"
205
- })
206
-
207
- model_interface = ISAModelInterface()
208
-
209
- # Create similarity dataset
210
- similarity_data = [
211
- {
212
- "text1": "The cat is sleeping on the couch",
213
- "text2": "A feline is resting on the sofa",
214
- "expected_output": 0.8, # High similarity
215
- "task_type": "similarity",
216
- "id": "sim_test_1"
217
- },
218
- {
219
- "text1": "I love pizza",
220
- "text2": "The weather is sunny today",
221
- "expected_output": 0.1, # Low similarity
222
- "task_type": "similarity",
223
- "id": "sim_test_2"
224
- }
225
- ]
226
-
227
- similarity_result = await similarity_evaluator.evaluate(
228
- model_interface=model_interface,
229
- dataset=similarity_data,
230
- dataset_name="Similarity_Test",
231
- model_name="text-embedding-3-small"
232
- )
233
-
234
- logger.info(f"Similarity Results: {similarity_result.get_summary()}")
235
-
236
- # Retrieval Evaluation
237
- retrieval_evaluator = EmbeddingEvaluator(config={
238
- "task_type": "retrieval",
239
- "k_values": [1, 3, 5]
240
- })
241
-
242
- retrieval_data = [
243
- {
244
- "query": "machine learning algorithms",
245
- "documents": [
246
- "Neural networks are a type of machine learning algorithm",
247
- "The weather is nice today",
248
- "Deep learning uses artificial neural networks",
249
- "I like to cook pasta"
250
- ],
251
- "expected_output": [1, 0, 1, 0], # Relevance labels
252
- "task_type": "retrieval",
253
- "id": "retrieval_test_1"
254
- }
255
- ]
256
-
257
- retrieval_result = await retrieval_evaluator.evaluate(
258
- model_interface=model_interface,
259
- dataset=retrieval_data,
260
- dataset_name="Retrieval_Test",
261
- model_name="text-embedding-3-small"
262
- )
263
-
264
- logger.info(f"Retrieval Results: {retrieval_result.get_summary()}")
265
-
266
- return {
267
- "similarity": similarity_result.to_dict(),
268
- "retrieval": retrieval_result.to_dict()
269
- }
270
-
271
-
272
- async def run_isa_service_benchmark_example():
273
- """Example: Run comprehensive ISA service benchmarking."""
274
- logger.info("⚡ Running ISA Service Benchmark")
275
-
276
- benchmark_config = {
277
- "test_duration_seconds": 30, # Short test for demo
278
- "max_concurrent_requests": 5,
279
- "warmup_requests": 3,
280
- "services_to_test": [
281
- "isa_ocr_service",
282
- "isa_audio_sota_service",
283
- "isa_embedding_reranking_service"
284
- ]
285
- }
286
-
287
- benchmark_results = await run_isa_service_benchmark(benchmark_config)
288
-
289
- logger.info("📊 ISA Service Benchmark Summary:")
290
- summary = benchmark_results.get("summary", {})
291
- logger.info(f"Services tested: {summary.get('total_services_tested', 0)}")
292
- logger.info(f"Successful services: {summary.get('successful_services', 0)}")
293
-
294
- # Log performance highlights
295
- comparative = benchmark_results.get("comparative_analysis", {})
296
- recommendations = comparative.get("recommendations", [])
297
- for rec in recommendations:
298
- logger.info(f"💡 {rec}")
299
-
300
- return benchmark_results
301
-
302
-
303
- async def run_factory_evaluation():
304
- """Example: Use EvaluationFactory for simplified multi-model comparison."""
305
- logger.info("🏭 Running Factory-based Multi-Model Evaluation")
306
-
307
- factory = EvaluationFactory()
308
-
309
- # Define models to compare
310
- models = [
311
- {"name": "gpt-4.1-nano", "provider": "openai"},
312
- {"name": "llama3.2:3b-instruct-fp16", "provider": "ollama"},
313
- {"name": "claude-sonnet-4-20250514", "provider": "yyds"}
314
- ]
315
-
316
- # Create simple test dataset
317
- test_data = [
318
- {
319
- "input": "What is 2+2?",
320
- "output": "4",
321
- "id": "math_test_1"
322
- },
323
- {
324
- "input": "Name the capital of France.",
325
- "output": "Paris",
326
- "id": "geography_test_1"
327
- }
328
- ]
329
-
330
- # Run comparison
331
- comparison_results = await factory.compare_models(
332
- models=models,
333
- dataset=test_data,
334
- evaluator_type="llm",
335
- metrics=["accuracy", "f1_score", "latency"]
336
- )
337
-
338
- logger.info("📈 Model Comparison Results:")
339
- for model_name, results in comparison_results.items():
340
- metrics = results.get("metrics", {})
341
- logger.info(f"{model_name}: Accuracy={metrics.get('accuracy', 0):.3f}, "
342
- f"F1={metrics.get('f1_score', 0):.3f}")
343
-
344
- return comparison_results
345
-
346
-
347
- async def save_results(results: Dict[str, Any], output_file: str = "evaluation_results.json"):
348
- """Save evaluation results to file."""
349
- output_path = Path(output_file)
350
-
351
- with open(output_path, 'w', encoding='utf-8') as f:
352
- json.dump(results, f, indent=2, ensure_ascii=False, default=str)
353
-
354
- logger.info(f"💾 Results saved to {output_path}")
355
-
356
-
357
- async def main():
358
- """Run comprehensive evaluation examples."""
359
- logger.info("🔬 Starting ISA Model Evaluation Framework Demo")
360
-
361
- results = {}
362
-
363
- try:
364
- # Run all evaluation examples
365
- results["llm_evaluation"] = await run_standard_llm_evaluation()
366
- results["vision_evaluation"] = await run_vision_evaluation()
367
- results["audio_evaluation"] = await run_audio_evaluation()
368
- results["embedding_evaluation"] = await run_embedding_evaluation()
369
- results["isa_benchmarks"] = await run_isa_service_benchmark_example()
370
- results["factory_comparison"] = await run_factory_evaluation()
371
-
372
- # Save results
373
- await save_results(results)
374
-
375
- logger.info("✅ All evaluations completed successfully!")
376
-
377
- # Print summary
378
- logger.info("\n📋 Evaluation Summary:")
379
- logger.info(f"- LLM evaluations: {len(results['llm_evaluation'])} benchmarks")
380
- logger.info(f"- Vision evaluations: {len(results['vision_evaluation'])} tasks")
381
- logger.info(f"- Audio evaluations: {len(results['audio_evaluation'])} tasks")
382
- logger.info(f"- Embedding evaluations: {len(results['embedding_evaluation'])} tasks")
383
- logger.info(f"- ISA service benchmarks: {results['isa_benchmarks']['summary']['total_services_tested']} services")
384
- logger.info(f"- Model comparisons: {len(results['factory_comparison'])} models")
385
-
386
- except Exception as e:
387
- logger.error(f"❌ Evaluation failed: {e}")
388
- raise
389
-
390
- return results
391
-
392
-
393
- if __name__ == "__main__":
394
- # Run the evaluation demo
395
- asyncio.run(main())