isa-model 0.4.0__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/client.py +466 -43
- isa_model/core/cache/redis_cache.py +12 -3
- isa_model/core/config/config_manager.py +230 -3
- isa_model/core/config.py +90 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +21 -1
- isa_model/core/database/supabase_client.py +154 -19
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +27 -18
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_manager.py +35 -80
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +174 -18
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/services/intelligent_model_selector.py +399 -21
- isa_model/core/types.py +1 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -370
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +137 -10
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/openai_stt_service.py +22 -6
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +335 -24
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/ollama_llm_service.py +9 -2
- isa_model/inference/services/llm/openai_llm_service.py +33 -16
- isa_model/inference/services/llm/yyds_llm_service.py +8 -2
- isa_model/inference/services/vision/__init__.py +22 -1
- isa_model/inference/services/vision/helpers/image_utils.py +8 -5
- isa_model/inference/services/vision/isa_vision_service.py +65 -4
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +172 -22
- isa_model/serving/api/middleware/auth.py +8 -2
- isa_model/serving/api/middleware/security.py +23 -33
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +4 -1
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +138 -2
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +680 -18
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +68 -54
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/METADATA +71 -24
- isa_model-0.4.4.dist-info/RECORD +180 -0
- isa_model/core/security/secrets.py +0 -358
- isa_model/core/storage/hf_storage.py +0 -419
- isa_model/core/storage/minio_storage.py +0 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks/__init__.py +0 -27
- isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
- isa_model/eval/benchmarks.py +0 -701
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -24
- isa_model/eval/evaluators/audio_evaluator.py +0 -727
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/embedding_evaluator.py +0 -742
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/evaluators/vision_evaluator.py +0 -564
- isa_model/eval/example_evaluation.py +0 -395
- isa_model/eval/factory.py +0 -798
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/isa_benchmarks.py +0 -700
- isa_model/eval/isa_integration.py +0 -582
- isa_model/eval/metrics.py +0 -951
- isa_model/eval/tests/unit/test_basic.py +0 -396
- isa_model/serving/api/routes/evaluations.py +0 -579
- isa_model/training/__init__.py +0 -168
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -26
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/examples/intelligent_training_example.py +0 -281
- isa_model/training/factory.py +0 -424
- isa_model/training/intelligent/__init__.py +0 -25
- isa_model/training/intelligent/decision_engine.py +0 -643
- isa_model/training/intelligent/intelligent_factory.py +0 -888
- isa_model/training/intelligent/knowledge_base.py +0 -751
- isa_model/training/intelligent/resource_optimizer.py +0 -839
- isa_model/training/intelligent/task_classifier.py +0 -576
- isa_model/training/storage/__init__.py +0 -24
- isa_model/training/storage/core_integration.py +0 -439
- isa_model/training/storage/training_repository.py +0 -552
- isa_model/training/storage/training_storage.py +0 -628
- isa_model-0.4.0.dist-info/RECORD +0 -182
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/WHEEL +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.4.dist-info}/top_level.txt +0 -0
@@ -1,395 +0,0 @@
-"""
-Example evaluation script demonstrating the ISA Model evaluation framework.
-
-Shows how to:
-1. Evaluate standard benchmarks (MMLU, HellaSwag, etc.)
-2. Test ISA custom services
-3. Run multimodal evaluations
-4. Perform comprehensive service benchmarking
-"""
-
-import asyncio
-import logging
-import json
-from pathlib import Path
-from typing import Dict, Any, Optional
-
-# Import evaluation components
-from .benchmarks import create_mmlu_benchmark, create_gsm8k_benchmark
-from .benchmarks.multimodal_datasets import create_vqa_dataset, create_coco_captions_dataset
-from .evaluators import LLMEvaluator, VisionEvaluator, AudioEvaluator, EmbeddingEvaluator
-from .isa_integration import ISAModelInterface
-from .isa_benchmarks import run_isa_service_benchmark
-from .factory import EvaluationFactory
-
-# Setup logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-async def run_standard_llm_evaluation():
-    """Example: Run standard LLM evaluation on MMLU and GSM8K."""
-    logger.info("🚀 Running Standard LLM Evaluation")
-
-    # Create evaluator
-    evaluator = LLMEvaluator(config={
-        "max_concurrent_requests": 5,
-        "batch_size": 10
-    })
-
-    # Create ISA model interface
-    model_interface = ISAModelInterface()
-
-    # Test MMLU
-    logger.info("📚 Testing MMLU benchmark")
-    mmlu_benchmark = create_mmlu_benchmark(subjects=["anatomy", "astronomy", "business_ethics"])
-    mmlu_data = mmlu_benchmark.load_data(max_samples=20)
-
-    mmlu_result = await evaluator.evaluate(
-        model_interface=model_interface,
-        dataset=mmlu_data,
-        dataset_name="MMLU",
-        model_name="gpt-4.1-nano"
-    )
-
-    logger.info(f"MMLU Results: {mmlu_result.get_summary()}")
-
-    # Test GSM8K
-    logger.info("🧮 Testing GSM8K benchmark")
-    gsm8k_benchmark = create_gsm8k_benchmark()
-    gsm8k_data = gsm8k_benchmark.load_data(max_samples=10)
-
-    gsm8k_result = await evaluator.evaluate(
-        model_interface=model_interface,
-        dataset=gsm8k_data,
-        dataset_name="GSM8K",
-        model_name="gpt-4.1-nano"
-    )
-
-    logger.info(f"GSM8K Results: {gsm8k_result.get_summary()}")
-
-    return {
-        "mmlu": mmlu_result.to_dict(),
-        "gsm8k": gsm8k_result.to_dict()
-    }
-
-
-async def run_vision_evaluation():
-    """Example: Run vision evaluation with VQA and image captioning."""
-    logger.info("👁️ Running Vision Evaluation")
-
-    # Create vision evaluator
-    evaluator = VisionEvaluator(config={
-        "task_type": "vqa",
-        "max_image_size": (1024, 1024)
-    })
-
-    # Create ISA model interface
-    model_interface = ISAModelInterface()
-
-    # Test VQA
-    logger.info("❓ Testing VQA dataset")
-    vqa_dataset = create_vqa_dataset()
-    vqa_data = vqa_dataset.load_data(max_samples=10, use_real_data=False)  # Use placeholder for demo
-
-    vqa_result = await evaluator.evaluate(
-        model_interface=model_interface,
-        dataset=vqa_data,
-        dataset_name="VQA_v2",
-        model_name="gpt-4.1-mini"
-    )
-
-    logger.info(f"VQA Results: {vqa_result.get_summary()}")
-
-    # Test Image Captioning
-    logger.info("🖼️ Testing Image Captioning")
-    caption_evaluator = VisionEvaluator(config={"task_type": "caption"})
-
-    coco_dataset = create_coco_captions_dataset()
-    caption_data = coco_dataset.load_data(max_samples=5, use_real_data=False)
-
-    caption_result = await caption_evaluator.evaluate(
-        model_interface=model_interface,
-        dataset=caption_data,
-        dataset_name="COCO_Captions",
-        model_name="gpt-4.1-mini"
-    )
-
-    logger.info(f"Caption Results: {caption_result.get_summary()}")
-
-    return {
-        "vqa": vqa_result.to_dict(),
-        "captioning": caption_result.to_dict()
-    }
-
-
-async def run_audio_evaluation():
-    """Example: Run audio evaluation for STT and emotion recognition."""
-    logger.info("🎵 Running Audio Evaluation")
-
-    # STT Evaluation
-    stt_evaluator = AudioEvaluator(config={
-        "task_type": "stt",
-        "normalize_text": True,
-        "case_sensitive": False
-    })
-
-    model_interface = ISAModelInterface()
-
-    # Create mock STT dataset
-    stt_data = [
-        {
-            "audio": "mock_audio_1.wav",
-            "expected_output": "The quick brown fox jumps over the lazy dog",
-            "task_type": "stt",
-            "id": "stt_test_1"
-        },
-        {
-            "audio": "mock_audio_2.wav",
-            "expected_output": "Machine learning is transforming artificial intelligence",
-            "task_type": "stt",
-            "id": "stt_test_2"
-        }
-    ]
-
-    stt_result = await stt_evaluator.evaluate(
-        model_interface=model_interface,
-        dataset=stt_data,
-        dataset_name="LibriSpeech_Test",
-        model_name="isa_audio_sota_service"
-    )
-
-    logger.info(f"STT Results: {stt_result.get_summary()}")
-
-    # Emotion Recognition Evaluation
-    emotion_evaluator = AudioEvaluator(config={"task_type": "emotion"})
-
-    emotion_data = [
-        {
-            "audio": "mock_emotion_1.wav",
-            "expected_output": "happy",
-            "task_type": "emotion",
-            "id": "emotion_test_1"
-        },
-        {
-            "audio": "mock_emotion_2.wav",
-            "expected_output": "sad",
-            "task_type": "emotion",
-            "id": "emotion_test_2"
-        }
-    ]
-
-    emotion_result = await emotion_evaluator.evaluate(
-        model_interface=model_interface,
-        dataset=emotion_data,
-        dataset_name="Emotion_Test",
-        model_name="isa_audio_sota_service"
-    )
-
-    logger.info(f"Emotion Results: {emotion_result.get_summary()}")
-
-    return {
-        "stt": stt_result.to_dict(),
-        "emotion": emotion_result.to_dict()
-    }
-
-
-async def run_embedding_evaluation():
-    """Example: Run embedding evaluation for similarity and retrieval."""
-    logger.info("🔍 Running Embedding Evaluation")
-
-    # Similarity Evaluation
-    similarity_evaluator = EmbeddingEvaluator(config={
-        "task_type": "similarity",
-        "similarity_metric": "cosine"
-    })
-
-    model_interface = ISAModelInterface()
-
-    # Create similarity dataset
-    similarity_data = [
-        {
-            "text1": "The cat is sleeping on the couch",
-            "text2": "A feline is resting on the sofa",
-            "expected_output": 0.8,  # High similarity
-            "task_type": "similarity",
-            "id": "sim_test_1"
-        },
-        {
-            "text1": "I love pizza",
-            "text2": "The weather is sunny today",
-            "expected_output": 0.1,  # Low similarity
-            "task_type": "similarity",
-            "id": "sim_test_2"
-        }
-    ]
-
-    similarity_result = await similarity_evaluator.evaluate(
-        model_interface=model_interface,
-        dataset=similarity_data,
-        dataset_name="Similarity_Test",
-        model_name="text-embedding-3-small"
-    )
-
-    logger.info(f"Similarity Results: {similarity_result.get_summary()}")
-
-    # Retrieval Evaluation
-    retrieval_evaluator = EmbeddingEvaluator(config={
-        "task_type": "retrieval",
-        "k_values": [1, 3, 5]
-    })
-
-    retrieval_data = [
-        {
-            "query": "machine learning algorithms",
-            "documents": [
-                "Neural networks are a type of machine learning algorithm",
-                "The weather is nice today",
-                "Deep learning uses artificial neural networks",
-                "I like to cook pasta"
-            ],
-            "expected_output": [1, 0, 1, 0],  # Relevance labels
-            "task_type": "retrieval",
-            "id": "retrieval_test_1"
-        }
-    ]
-
-    retrieval_result = await retrieval_evaluator.evaluate(
-        model_interface=model_interface,
-        dataset=retrieval_data,
-        dataset_name="Retrieval_Test",
-        model_name="text-embedding-3-small"
-    )
-
-    logger.info(f"Retrieval Results: {retrieval_result.get_summary()}")
-
-    return {
-        "similarity": similarity_result.to_dict(),
-        "retrieval": retrieval_result.to_dict()
-    }
-
-
-async def run_isa_service_benchmark_example():
-    """Example: Run comprehensive ISA service benchmarking."""
-    logger.info("⚡ Running ISA Service Benchmark")
-
-    benchmark_config = {
-        "test_duration_seconds": 30,  # Short test for demo
-        "max_concurrent_requests": 5,
-        "warmup_requests": 3,
-        "services_to_test": [
-            "isa_ocr_service",
-            "isa_audio_sota_service",
-            "isa_embedding_reranking_service"
-        ]
-    }
-
-    benchmark_results = await run_isa_service_benchmark(benchmark_config)
-
-    logger.info("📊 ISA Service Benchmark Summary:")
-    summary = benchmark_results.get("summary", {})
-    logger.info(f"Services tested: {summary.get('total_services_tested', 0)}")
-    logger.info(f"Successful services: {summary.get('successful_services', 0)}")
-
-    # Log performance highlights
-    comparative = benchmark_results.get("comparative_analysis", {})
-    recommendations = comparative.get("recommendations", [])
-    for rec in recommendations:
-        logger.info(f"💡 {rec}")
-
-    return benchmark_results
-
-
-async def run_factory_evaluation():
-    """Example: Use EvaluationFactory for simplified multi-model comparison."""
-    logger.info("🏭 Running Factory-based Multi-Model Evaluation")
-
-    factory = EvaluationFactory()
-
-    # Define models to compare
-    models = [
-        {"name": "gpt-4.1-nano", "provider": "openai"},
-        {"name": "llama3.2:3b-instruct-fp16", "provider": "ollama"},
-        {"name": "claude-sonnet-4-20250514", "provider": "yyds"}
-    ]
-
-    # Create simple test dataset
-    test_data = [
-        {
-            "input": "What is 2+2?",
-            "output": "4",
-            "id": "math_test_1"
-        },
-        {
-            "input": "Name the capital of France.",
-            "output": "Paris",
-            "id": "geography_test_1"
-        }
-    ]
-
-    # Run comparison
-    comparison_results = await factory.compare_models(
-        models=models,
-        dataset=test_data,
-        evaluator_type="llm",
-        metrics=["accuracy", "f1_score", "latency"]
-    )
-
-    logger.info("📈 Model Comparison Results:")
-    for model_name, results in comparison_results.items():
-        metrics = results.get("metrics", {})
-        logger.info(f"{model_name}: Accuracy={metrics.get('accuracy', 0):.3f}, "
-                    f"F1={metrics.get('f1_score', 0):.3f}")
-
-    return comparison_results
-
-
-async def save_results(results: Dict[str, Any], output_file: str = "evaluation_results.json"):
-    """Save evaluation results to file."""
-    output_path = Path(output_file)
-
-    with open(output_path, 'w', encoding='utf-8') as f:
-        json.dump(results, f, indent=2, ensure_ascii=False, default=str)
-
-    logger.info(f"💾 Results saved to {output_path}")
-
-
-async def main():
-    """Run comprehensive evaluation examples."""
-    logger.info("🔬 Starting ISA Model Evaluation Framework Demo")
-
-    results = {}
-
-    try:
-        # Run all evaluation examples
-        results["llm_evaluation"] = await run_standard_llm_evaluation()
-        results["vision_evaluation"] = await run_vision_evaluation()
-        results["audio_evaluation"] = await run_audio_evaluation()
-        results["embedding_evaluation"] = await run_embedding_evaluation()
-        results["isa_benchmarks"] = await run_isa_service_benchmark_example()
-        results["factory_comparison"] = await run_factory_evaluation()
-
-        # Save results
-        await save_results(results)
-
-        logger.info("✅ All evaluations completed successfully!")
-
-        # Print summary
-        logger.info("\n📋 Evaluation Summary:")
-        logger.info(f"- LLM evaluations: {len(results['llm_evaluation'])} benchmarks")
-        logger.info(f"- Vision evaluations: {len(results['vision_evaluation'])} tasks")
-        logger.info(f"- Audio evaluations: {len(results['audio_evaluation'])} tasks")
-        logger.info(f"- Embedding evaluations: {len(results['embedding_evaluation'])} tasks")
-        logger.info(f"- ISA service benchmarks: {results['isa_benchmarks']['summary']['total_services_tested']} services")
-        logger.info(f"- Model comparisons: {len(results['factory_comparison'])} models")
-
-    except Exception as e:
-        logger.error(f"❌ Evaluation failed: {e}")
-        raise
-
-    return results
-
-
-if __name__ == "__main__":
-    # Run the evaluation demo
-    asyncio.run(main())