isa-model 0.4.0__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/client.py +466 -43
- isa_model/core/cache/redis_cache.py +12 -3
- isa_model/core/config/config_manager.py +230 -3
- isa_model/core/config.py +90 -0
- isa_model/core/database/direct_db_client.py +114 -0
- isa_model/core/database/migration_manager.py +563 -0
- isa_model/core/database/migrations.py +21 -1
- isa_model/core/database/supabase_client.py +154 -19
- isa_model/core/dependencies.py +316 -0
- isa_model/core/discovery/__init__.py +19 -0
- isa_model/core/discovery/consul_discovery.py +190 -0
- isa_model/core/logging/__init__.py +54 -0
- isa_model/core/logging/influx_logger.py +523 -0
- isa_model/core/logging/loki_logger.py +160 -0
- isa_model/core/models/__init__.py +27 -18
- isa_model/core/models/config_models.py +625 -0
- isa_model/core/models/deployment_billing_tracker.py +430 -0
- isa_model/core/models/model_manager.py +40 -17
- isa_model/core/models/model_metadata.py +690 -0
- isa_model/core/models/model_repo.py +174 -18
- isa_model/core/models/system_models.py +857 -0
- isa_model/core/repositories/__init__.py +9 -0
- isa_model/core/repositories/config_repository.py +912 -0
- isa_model/core/services/intelligent_model_selector.py +399 -21
- isa_model/core/storage/hf_storage.py +1 -1
- isa_model/core/types.py +1 -0
- isa_model/deployment/__init__.py +5 -48
- isa_model/deployment/core/__init__.py +2 -31
- isa_model/deployment/core/deployment_manager.py +1278 -370
- isa_model/deployment/local/__init__.py +31 -0
- isa_model/deployment/local/config.py +248 -0
- isa_model/deployment/local/gpu_gateway.py +607 -0
- isa_model/deployment/local/health_checker.py +428 -0
- isa_model/deployment/local/provider.py +586 -0
- isa_model/deployment/local/tensorrt_service.py +621 -0
- isa_model/deployment/local/transformers_service.py +644 -0
- isa_model/deployment/local/vllm_service.py +527 -0
- isa_model/deployment/modal/__init__.py +8 -0
- isa_model/deployment/modal/config.py +136 -0
- isa_model/deployment/{services/auto_hf_modal_deployer.py → modal/deployer.py} +1 -1
- isa_model/deployment/modal/services/__init__.py +3 -0
- isa_model/deployment/modal/services/audio/__init__.py +1 -0
- isa_model/deployment/modal/services/embedding/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/__init__.py +1 -0
- isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
- isa_model/deployment/modal/services/video/__init__.py +1 -0
- isa_model/deployment/modal/services/vision/__init__.py +1 -0
- isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
- isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
- isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
- isa_model/deployment/storage/__init__.py +5 -0
- isa_model/deployment/storage/deployment_repository.py +824 -0
- isa_model/deployment/triton/__init__.py +10 -0
- isa_model/deployment/triton/config.py +196 -0
- isa_model/deployment/triton/configs/__init__.py +1 -0
- isa_model/deployment/triton/provider.py +512 -0
- isa_model/deployment/triton/scripts/__init__.py +1 -0
- isa_model/deployment/triton/templates/__init__.py +1 -0
- isa_model/inference/__init__.py +47 -1
- isa_model/inference/ai_factory.py +137 -10
- isa_model/inference/legacy_services/__init__.py +21 -0
- isa_model/inference/legacy_services/model_evaluation.py +637 -0
- isa_model/inference/legacy_services/model_service.py +573 -0
- isa_model/inference/legacy_services/model_serving.py +717 -0
- isa_model/inference/legacy_services/model_training.py +561 -0
- isa_model/inference/models/__init__.py +21 -0
- isa_model/inference/models/inference_config.py +551 -0
- isa_model/inference/models/inference_record.py +675 -0
- isa_model/inference/models/performance_models.py +714 -0
- isa_model/inference/repositories/__init__.py +9 -0
- isa_model/inference/repositories/inference_repository.py +828 -0
- isa_model/inference/services/audio/base_stt_service.py +184 -11
- isa_model/inference/services/audio/openai_stt_service.py +22 -6
- isa_model/inference/services/custom_model_manager.py +277 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
- isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
- isa_model/inference/services/llm/__init__.py +10 -2
- isa_model/inference/services/llm/base_llm_service.py +335 -24
- isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
- isa_model/inference/services/llm/helpers/llm_adapter.py +9 -4
- isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
- isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
- isa_model/inference/services/llm/local_llm_service.py +747 -0
- isa_model/inference/services/llm/ollama_llm_service.py +9 -2
- isa_model/inference/services/llm/openai_llm_service.py +33 -16
- isa_model/inference/services/llm/yyds_llm_service.py +8 -2
- isa_model/inference/services/vision/__init__.py +22 -1
- isa_model/inference/services/vision/blip_vision_service.py +359 -0
- isa_model/inference/services/vision/helpers/image_utils.py +8 -5
- isa_model/inference/services/vision/isa_vision_service.py +65 -4
- isa_model/inference/services/vision/openai_vision_service.py +19 -10
- isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
- isa_model/serving/api/cache_manager.py +245 -0
- isa_model/serving/api/dependencies/__init__.py +1 -0
- isa_model/serving/api/dependencies/auth.py +194 -0
- isa_model/serving/api/dependencies/database.py +139 -0
- isa_model/serving/api/error_handlers.py +284 -0
- isa_model/serving/api/fastapi_server.py +172 -22
- isa_model/serving/api/middleware/auth.py +8 -2
- isa_model/serving/api/middleware/security.py +23 -33
- isa_model/serving/api/middleware/tenant_context.py +414 -0
- isa_model/serving/api/routes/analytics.py +4 -1
- isa_model/serving/api/routes/config.py +645 -0
- isa_model/serving/api/routes/deployment_billing.py +315 -0
- isa_model/serving/api/routes/deployments.py +138 -2
- isa_model/serving/api/routes/gpu_gateway.py +440 -0
- isa_model/serving/api/routes/health.py +32 -12
- isa_model/serving/api/routes/inference_monitoring.py +486 -0
- isa_model/serving/api/routes/local_deployments.py +448 -0
- isa_model/serving/api/routes/tenants.py +575 -0
- isa_model/serving/api/routes/unified.py +680 -18
- isa_model/serving/api/routes/webhooks.py +479 -0
- isa_model/serving/api/startup.py +68 -54
- isa_model/utils/gpu_utils.py +311 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/METADATA +66 -24
- isa_model-0.4.3.dist-info/RECORD +193 -0
- isa_model/core/storage/minio_storage.py +0 -0
- isa_model/deployment/cloud/__init__.py +0 -9
- isa_model/deployment/cloud/modal/__init__.py +0 -10
- isa_model/deployment/core/deployment_config.py +0 -356
- isa_model/deployment/core/isa_deployment_service.py +0 -401
- isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
- isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
- isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
- isa_model/deployment/runtime/deployed_service.py +0 -338
- isa_model/deployment/services/__init__.py +0 -9
- isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
- isa_model/deployment/services/model_service.py +0 -332
- isa_model/deployment/services/service_monitor.py +0 -356
- isa_model/deployment/services/service_registry.py +0 -527
- isa_model/eval/__init__.py +0 -92
- isa_model/eval/benchmarks/__init__.py +0 -27
- isa_model/eval/benchmarks/multimodal_datasets.py +0 -460
- isa_model/eval/benchmarks.py +0 -701
- isa_model/eval/config/__init__.py +0 -10
- isa_model/eval/config/evaluation_config.py +0 -108
- isa_model/eval/evaluators/__init__.py +0 -24
- isa_model/eval/evaluators/audio_evaluator.py +0 -727
- isa_model/eval/evaluators/base_evaluator.py +0 -503
- isa_model/eval/evaluators/embedding_evaluator.py +0 -742
- isa_model/eval/evaluators/llm_evaluator.py +0 -472
- isa_model/eval/evaluators/vision_evaluator.py +0 -564
- isa_model/eval/example_evaluation.py +0 -395
- isa_model/eval/factory.py +0 -798
- isa_model/eval/infrastructure/__init__.py +0 -24
- isa_model/eval/infrastructure/experiment_tracker.py +0 -466
- isa_model/eval/isa_benchmarks.py +0 -700
- isa_model/eval/isa_integration.py +0 -582
- isa_model/eval/metrics.py +0 -951
- isa_model/eval/tests/unit/test_basic.py +0 -396
- isa_model/serving/api/routes/evaluations.py +0 -579
- isa_model/training/__init__.py +0 -168
- isa_model/training/annotation/annotation_schema.py +0 -47
- isa_model/training/annotation/processors/annotation_processor.py +0 -126
- isa_model/training/annotation/storage/dataset_manager.py +0 -131
- isa_model/training/annotation/storage/dataset_schema.py +0 -44
- isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
- isa_model/training/annotation/tests/test_minio copy.py +0 -113
- isa_model/training/annotation/tests/test_minio_upload.py +0 -43
- isa_model/training/annotation/views/annotation_controller.py +0 -158
- isa_model/training/cloud/__init__.py +0 -22
- isa_model/training/cloud/job_orchestrator.py +0 -402
- isa_model/training/cloud/runpod_trainer.py +0 -454
- isa_model/training/cloud/storage_manager.py +0 -482
- isa_model/training/core/__init__.py +0 -26
- isa_model/training/core/config.py +0 -181
- isa_model/training/core/dataset.py +0 -222
- isa_model/training/core/trainer.py +0 -720
- isa_model/training/core/utils.py +0 -213
- isa_model/training/examples/intelligent_training_example.py +0 -281
- isa_model/training/factory.py +0 -424
- isa_model/training/intelligent/__init__.py +0 -25
- isa_model/training/intelligent/decision_engine.py +0 -643
- isa_model/training/intelligent/intelligent_factory.py +0 -888
- isa_model/training/intelligent/knowledge_base.py +0 -751
- isa_model/training/intelligent/resource_optimizer.py +0 -839
- isa_model/training/intelligent/task_classifier.py +0 -576
- isa_model/training/storage/__init__.py +0 -24
- isa_model/training/storage/core_integration.py +0 -439
- isa_model/training/storage/training_repository.py +0 -552
- isa_model/training/storage/training_storage.py +0 -628
- isa_model-0.4.0.dist-info/RECORD +0 -182
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_chatTTS_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_fish_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_openvoice_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/audio}/isa_audio_service_v2.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/embedding}/isa_embed_rerank_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/video}/isa_video_hunyuan_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ocr_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_qwen25_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_table_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service.py +0 -0
- /isa_model/deployment/{cloud/modal → modal/services/vision}/isa_vision_ui_service_optimized.py +0 -0
- /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
- {isa_model-0.4.0.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/eval/factory.py
DELETED
@@ -1,798 +0,0 @@
"""
Enterprise-Grade Evaluation Factory for ISA Model Framework

Implements industry best practices for AI model evaluation at scale:
- Async evaluation with concurrency control
- Comprehensive experiment tracking (W&B, MLflow)
- Distributed evaluation support
- Production-ready monitoring and alerting
- Cost tracking and optimization
- Reproducible evaluation pipelines
"""

import asyncio
import logging
from typing import Optional, Dict, Any, List, Union, Callable
from pathlib import Path
import json

from .evaluators import LLMEvaluator, VisionEvaluator, AudioEvaluator, EmbeddingEvaluator, EvaluationResult
from .isa_integration import ISAModelInterface
try:
    from .infrastructure import ExperimentTracker, create_experiment_tracker
    EXPERIMENT_TRACKING_AVAILABLE = True
except ImportError:
    EXPERIMENT_TRACKING_AVAILABLE = False
    logger.warning("Experiment tracking not available")

try:
    from .config import EvaluationConfig
    CONFIG_AVAILABLE = True
except ImportError:
    CONFIG_AVAILABLE = False
    # Create a simple config class
    class EvaluationConfig:
        def __init__(self):
            self.batch_size = 16
            self.output_dir = "./evaluation_results"
            self.default_temperature = 0.7
            self.default_max_tokens = 512
            self.max_concurrent_evaluations = 3

        def to_dict(self):
            return {
                "batch_size": self.batch_size,
                "output_dir": self.output_dir,
                "default_temperature": self.default_temperature,
                "default_max_tokens": self.default_max_tokens,
                "max_concurrent_evaluations": self.max_concurrent_evaluations
            }

        @classmethod
        def from_dict(cls, config_dict):
            config = cls()
            for key, value in config_dict.items():
                if hasattr(config, key):
                    setattr(config, key, value)
            return config

logger = logging.getLogger(__name__)


class EvaluationFactory:
    """
    Enterprise-grade evaluation factory implementing MLOps best practices.

    Features:
    - Multi-modal evaluation support (LLM, Vision, Multimodal)
    - Async evaluation with smart concurrency management
    - Comprehensive experiment tracking and visualization
    - Cost optimization and resource monitoring
    - Distributed evaluation across multiple GPUs/nodes
    - Production-ready error handling and retry logic
    - Automated result storage and comparison

    Example usage:
    ```python
    from isa_model.eval import EvaluationFactory

    # Initialize with experiment tracking
    factory = EvaluationFactory(
        experiment_tracking={
            "type": "wandb",
            "project": "model-evaluation",
            "entity": "my-team"
        }
    )

    # Evaluate LLM on dataset
    result = await factory.evaluate_llm(
        model_name="gpt-4.1-mini",
        provider="openai",
        dataset_path="path/to/evaluation_data.json",
        metrics=["accuracy", "f1_score", "bleu_score"],
        save_results=True
    )

    # Run benchmark evaluation
    benchmark_result = await factory.run_benchmark(
        model_name="claude-sonnet-4",
        provider="yyds",
        benchmark_name="mmlu",
        subjects=["math", "physics", "chemistry"]
    )

    # Compare multiple models
    comparison = await factory.compare_models(
        models=[
            {"name": "gpt-4.1-mini", "provider": "openai"},
            {"name": "claude-sonnet-4", "provider": "yyds"}
        ],
        dataset_path="comparison_dataset.json"
    )
    ```
    """

    def __init__(self,
                 config: Optional[Union[Dict[str, Any], EvaluationConfig]] = None,
                 experiment_tracking: Optional[Dict[str, Any]] = None,
                 output_dir: Optional[str] = None):
        """
        Initialize the enterprise evaluation factory.

        Args:
            config: Evaluation configuration (dict or EvaluationConfig object)
            experiment_tracking: Experiment tracking configuration
            output_dir: Output directory for results
        """
        # Initialize configuration
        if isinstance(config, dict):
            self.config = EvaluationConfig.from_dict(config)
        elif isinstance(config, EvaluationConfig):
            self.config = config
        else:
            self.config = EvaluationConfig()

        # Override output directory if provided
        if output_dir:
            self.config.output_dir = output_dir

        # Initialize experiment tracker
        self.experiment_tracker = None
        if experiment_tracking and EXPERIMENT_TRACKING_AVAILABLE:
            try:
                self.experiment_tracker = create_experiment_tracker(**experiment_tracking)
                logger.info(f"Initialized experiment tracking: {experiment_tracking['type']}")
            except Exception as e:
                logger.warning(f"Failed to initialize experiment tracking: {e}")

        # Initialize ISA Model interface
        self.isa_interface = ISAModelInterface()

        # Initialize evaluators
        self.llm_evaluator = LLMEvaluator(
            config=self.config.to_dict(),
            experiment_tracker=self.experiment_tracker
        )

        self.vision_evaluator = VisionEvaluator(
            config=self.config.to_dict(),
            experiment_tracker=self.experiment_tracker
        )

        self.audio_evaluator = AudioEvaluator(
            config=self.config.to_dict(),
            experiment_tracker=self.experiment_tracker
        )

        self.embedding_evaluator = EmbeddingEvaluator(
            config=self.config.to_dict(),
            experiment_tracker=self.experiment_tracker
        )

        # State tracking
        self._active_evaluations: Dict[str, asyncio.Task] = {}

        logger.info(f"EvaluationFactory initialized with output dir: {self.config.output_dir}")

    async def evaluate_llm(self,
                           model_name: str,
                           provider: str = "openai",
                           dataset_path: Optional[str] = None,
                           dataset: Optional[List[Dict[str, Any]]] = None,
                           metrics: Optional[List[str]] = None,
                           batch_size: Optional[int] = None,
                           save_results: bool = True,
                           experiment_name: Optional[str] = None,
                           progress_callback: Optional[Callable] = None) -> EvaluationResult:
        """
        Evaluate LLM with comprehensive metrics and tracking.

        Args:
            model_name: Name of the model to evaluate
            provider: Model provider (openai, yyds, ollama, etc.)
            dataset_path: Path to evaluation dataset JSON file
            dataset: Direct dataset input (alternative to dataset_path)
            metrics: List of metrics to compute
            batch_size: Batch size for evaluation
            save_results: Whether to save results to disk
            experiment_name: Custom experiment name
            progress_callback: Optional progress callback function

        Returns:
            Comprehensive evaluation results
        """
        # Load dataset
        if dataset is None:
            if dataset_path is None:
                raise ValueError("Either dataset_path or dataset must be provided")
            dataset = self._load_dataset(dataset_path)

        # Configure LLM evaluator
        llm_config = {
            "provider": provider,
            "model_name": model_name,
            "batch_size": batch_size or self.config.batch_size,
            "temperature": self.config.default_temperature,
            "max_tokens": self.config.default_max_tokens
        }

        self.llm_evaluator.config.update(llm_config)

        # Generate experiment name
        dataset_name = Path(dataset_path).stem if dataset_path else "custom_dataset"
        experiment_name = experiment_name or f"llm_eval_{model_name}_{dataset_name}"

        # Run evaluation
        result = await self.llm_evaluator.evaluate(
            model_interface=self.isa_interface,
            dataset=dataset,
            dataset_name=dataset_name,
            model_name=f"{provider}:{model_name}",
            batch_size=batch_size,
            progress_callback=progress_callback
        )

        # Save results if requested
        if save_results:
            await self._save_results(result, experiment_name)

        return result

    async def run_benchmark(self,
                            model_name: str,
                            provider: str,
                            benchmark_name: str,
                            subjects: Optional[List[str]] = None,
                            max_samples: Optional[int] = None,
                            few_shot: bool = True,
                            num_shots: int = 5,
                            save_results: bool = True,
                            experiment_name: Optional[str] = None) -> EvaluationResult:
        """
        Run standardized benchmark evaluation.

        Args:
            model_name: Name of the model to evaluate
            provider: Model provider
            benchmark_name: Name of benchmark (mmlu, hellaswag, arc, gsm8k, etc.)
            subjects: List of subjects to evaluate (for MMLU)
            max_samples: Maximum number of samples to evaluate
            few_shot: Whether to use few-shot examples
            num_shots: Number of few-shot examples
            save_results: Whether to save results
            experiment_name: Custom experiment name

        Returns:
            Benchmark evaluation results
        """
        # Load benchmark dataset
        benchmark_dataset = await self._load_benchmark(
            benchmark_name,
            subjects=subjects,
            max_samples=max_samples,
            few_shot=few_shot,
            num_shots=num_shots
        )

        # Configure for benchmark evaluation
        benchmark_config = {
            "provider": provider,
            "model_name": model_name,
            "temperature": 0.0,  # Deterministic for benchmarks
            "max_tokens": 50,  # Short answers for most benchmarks
            "task_type": "benchmark",
            "benchmark_name": benchmark_name
        }

        self.llm_evaluator.config.update(benchmark_config)

        # Generate experiment name
        experiment_name = experiment_name or f"benchmark_{benchmark_name}_{model_name}"

        # Run evaluation
        result = await self.llm_evaluator.evaluate(
            model_interface=None,
            dataset=benchmark_dataset,
            dataset_name=benchmark_name,
            model_name=f"{provider}:{model_name}",
            batch_size=self.config.batch_size
        )

        # Add benchmark-specific metadata
        result.config.update({
            "benchmark_name": benchmark_name,
            "subjects": subjects,
            "few_shot": few_shot,
            "num_shots": num_shots
        })

        # Save results if requested
        if save_results:
            await self._save_results(result, experiment_name)

        return result

    async def compare_models(self,
                             models: List[Dict[str, str]],
                             dataset_path: Optional[str] = None,
                             dataset: Optional[List[Dict[str, Any]]] = None,
                             benchmark_name: Optional[str] = None,
                             metrics: Optional[List[str]] = None,
                             save_results: bool = True,
                             experiment_name: Optional[str] = None) -> Dict[str, EvaluationResult]:
        """
        Compare multiple models on the same evaluation task.

        Args:
            models: List of model configs [{"name": "gpt-4", "provider": "openai"}, ...]
            dataset_path: Path to evaluation dataset
            dataset: Direct dataset input
            benchmark_name: Benchmark name (alternative to dataset)
            metrics: Metrics to compute
            save_results: Whether to save comparison results
            experiment_name: Custom experiment name

        Returns:
            Dictionary mapping model names to evaluation results
        """
        results = {}

        # Run evaluations concurrently (with concurrency limits)
        semaphore = asyncio.Semaphore(self.config.max_concurrent_evaluations)

        async def evaluate_single_model(model_config: Dict[str, str]) -> tuple:
            async with semaphore:
                model_name = model_config["name"]
                provider = model_config["provider"]

                if benchmark_name:
                    result = await self.run_benchmark(
                        model_name=model_name,
                        provider=provider,
                        benchmark_name=benchmark_name,
                        save_results=False  # Save comparison results together
                    )
                else:
                    result = await self.evaluate_llm(
                        model_name=model_name,
                        provider=provider,
                        dataset_path=dataset_path,
                        dataset=dataset,
                        metrics=metrics,
                        save_results=False
                    )

                return f"{provider}:{model_name}", result

        # Execute all evaluations
        tasks = [evaluate_single_model(model) for model in models]
        evaluation_results = await asyncio.gather(*tasks)

        # Collect results
        for model_id, result in evaluation_results:
            results[model_id] = result

        # Generate comparison report
        comparison_report = self._generate_comparison_report(results)

        # Save results if requested
        if save_results:
            experiment_name = experiment_name or f"model_comparison_{len(models)}_models"
            await self._save_comparison_results(results, comparison_report, experiment_name)

        return results

    async def evaluate_vision(self,
                              dataset: List[Dict[str, Any]],
                              task_type: str = "ocr",
                              model_name: str = "gpt-4.1-mini",
                              save_results: bool = True,
                              experiment_name: Optional[str] = None) -> EvaluationResult:
        """
        Evaluate vision model on image tasks.

        Args:
            dataset: Vision dataset with images and expected outputs
            task_type: Vision task type (ocr, table, ui, vqa, caption)
            model_name: Vision model name
            save_results: Whether to save results
            experiment_name: Custom experiment name

        Returns:
            Vision evaluation results
        """
        # Configure vision evaluator
        self.vision_evaluator.config.update({
            "task_type": task_type,
            "model_name": model_name
        })

        experiment_name = experiment_name or f"vision_{task_type}_{model_name}"

        result = await self.vision_evaluator.evaluate(
            model_interface=self.isa_interface,
            dataset=dataset,
            dataset_name=f"vision_{task_type}",
            model_name=model_name
        )

        if save_results:
            await self._save_results(result, experiment_name)

        return result

    async def evaluate_audio(self,
                             dataset: List[Dict[str, Any]],
                             task_type: str = "stt",
                             model_name: str = "isa_audio_sota_service",
                             save_results: bool = True,
                             experiment_name: Optional[str] = None) -> EvaluationResult:
        """
        Evaluate audio model on speech tasks.

        Args:
            dataset: Audio dataset with audio files and expected outputs
            task_type: Audio task type (stt, emotion, diarization)
            model_name: Audio model name
            save_results: Whether to save results
            experiment_name: Custom experiment name

        Returns:
            Audio evaluation results
        """
        # Configure audio evaluator
        self.audio_evaluator.config.update({
            "task_type": task_type,
            "model_name": model_name
        })

        experiment_name = experiment_name or f"audio_{task_type}_{model_name}"

        result = await self.audio_evaluator.evaluate(
            model_interface=self.isa_interface,
            dataset=dataset,
            dataset_name=f"audio_{task_type}",
            model_name=model_name
        )

        if save_results:
            await self._save_results(result, experiment_name)

        return result

    async def evaluate_embedding(self,
                                 dataset: List[Dict[str, Any]],
                                 task_type: str = "similarity",
                                 model_name: str = "text-embedding-3-small",
                                 save_results: bool = True,
                                 experiment_name: Optional[str] = None) -> EvaluationResult:
        """
        Evaluate embedding model on semantic tasks.

        Args:
            dataset: Embedding dataset with text and expected outputs
            task_type: Embedding task type (similarity, retrieval, reranking)
            model_name: Embedding model name
            save_results: Whether to save results
            experiment_name: Custom experiment name

        Returns:
            Embedding evaluation results
        """
        # Configure embedding evaluator
        self.embedding_evaluator.config.update({
            "task_type": task_type,
            "model_name": model_name
        })

        experiment_name = experiment_name or f"embedding_{task_type}_{model_name}"

        result = await self.embedding_evaluator.evaluate(
            model_interface=self.isa_interface,
            dataset=dataset,
            dataset_name=f"embedding_{task_type}",
            model_name=model_name
        )

        if save_results:
            await self._save_results(result, experiment_name)

        return result

    async def compare_models(self,
                             models: List[Dict[str, str]],
                             dataset_path: Optional[str] = None,
                             dataset: Optional[List[Dict[str, Any]]] = None,
                             evaluator_type: str = "llm",
                             benchmark_name: Optional[str] = None,
                             metrics: Optional[List[str]] = None,
                             save_results: bool = True,
                             experiment_name: Optional[str] = None) -> Dict[str, EvaluationResult]:
        """
        Compare multiple models on the same evaluation task.

        Args:
            models: List of model configs [{"name": "gpt-4", "provider": "openai"}, ...]
            dataset_path: Path to evaluation dataset
            dataset: Direct dataset input
            evaluator_type: Type of evaluator (llm, vision, audio, embedding)
            benchmark_name: Benchmark name (alternative to dataset)
            metrics: Metrics to compute
            save_results: Whether to save comparison results
            experiment_name: Custom experiment name

        Returns:
            Dictionary mapping model names to evaluation results
        """
        results = {}

        # Load dataset if needed
        if dataset is None and dataset_path:
            dataset = self._load_dataset(dataset_path)

        # Run evaluations concurrently (with concurrency limits)
        semaphore = asyncio.Semaphore(self.config.max_concurrent_evaluations)

        async def evaluate_single_model(model_config: Dict[str, str]) -> tuple:
            async with semaphore:
                model_name = model_config["name"]
                provider = model_config.get("provider", "openai")

                if evaluator_type == "llm":
                    if benchmark_name:
                        result = await self.run_benchmark(
                            model_name=model_name,
                            provider=provider,
                            benchmark_name=benchmark_name,
                            save_results=False
                        )
                    else:
                        result = await self.evaluate_llm(
                            model_name=model_name,
                            provider=provider,
                            dataset=dataset,
                            metrics=metrics,
                            save_results=False
                        )
                elif evaluator_type == "vision":
                    result = await self.evaluate_vision(
                        dataset=dataset,
                        model_name=model_name,
                        save_results=False
                    )
                elif evaluator_type == "audio":
                    result = await self.evaluate_audio(
                        dataset=dataset,
                        model_name=model_name,
                        save_results=False
                    )
                elif evaluator_type == "embedding":
                    result = await self.evaluate_embedding(
                        dataset=dataset,
                        model_name=model_name,
                        save_results=False
                    )
                else:
                    raise ValueError(f"Unknown evaluator type: {evaluator_type}")

                return f"{provider}:{model_name}", result

        # Execute all evaluations
        tasks = [evaluate_single_model(model) for model in models]
        evaluation_results = await asyncio.gather(*tasks)

        # Collect results
        for model_id, result in evaluation_results:
            results[model_id] = result

        # Generate comparison report
        comparison_report = self._generate_comparison_report(results)

        # Save results if requested
        if save_results:
            experiment_name = experiment_name or f"model_comparison_{evaluator_type}_{len(models)}_models"
            await self._save_comparison_results(results, comparison_report, experiment_name)

        return results

    def _load_dataset(self, dataset_path: str) -> List[Dict[str, Any]]:
        """Load dataset from file."""
        with open(dataset_path, 'r', encoding='utf-8') as f:
            if dataset_path.endswith('.json'):
                dataset = json.load(f)
            elif dataset_path.endswith('.jsonl'):
                dataset = [json.loads(line) for line in f]
            else:
                raise ValueError(f"Unsupported dataset format: {dataset_path}")

        logger.info(f"Loaded dataset with {len(dataset)} samples from {dataset_path}")
        return dataset

    async def _load_benchmark(self,
                              benchmark_name: str,
                              subjects: Optional[List[str]] = None,
                              max_samples: Optional[int] = None,
                              few_shot: bool = True,
                              num_shots: int = 5) -> List[Dict[str, Any]]:
        """Load benchmark dataset."""
        # This would integrate with the benchmark loaders
        # For now, return a placeholder
        logger.warning(f"Benchmark {benchmark_name} loading not yet implemented")

        # Placeholder benchmark data
        return [
            {
                "id": f"sample_{i}",
                "prompt": f"Sample question {i} for {benchmark_name}",
                "reference": "A",
                "choices": ["A", "B", "C", "D"] if benchmark_name != "gsm8k" else None
            }
            for i in range(min(max_samples or 10, 10))
        ]

    async def _save_results(self, result: EvaluationResult, experiment_name: str) -> None:
        """Save evaluation results to disk."""
        # Create output directory
        output_dir = Path(self.config.output_dir) / experiment_name
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save main results
        results_path = output_dir / "results.json"
        result.save_to_file(results_path)

        # Save detailed predictions if available
        if result.sample_results:
            predictions_path = output_dir / "predictions.json"
            with open(predictions_path, 'w', encoding='utf-8') as f:
                json.dump(result.sample_results, f, indent=2, ensure_ascii=False)

        # Save summary
        summary_path = output_dir / "summary.json"
        with open(summary_path, 'w', encoding='utf-8') as f:
            json.dump(result.get_summary(), f, indent=2, ensure_ascii=False)

        logger.info(f"Saved evaluation results to {output_dir}")

    async def _save_comparison_results(self,
                                       results: Dict[str, EvaluationResult],
                                       comparison_report: Dict[str, Any],
                                       experiment_name: str) -> None:
        """Save model comparison results."""
        output_dir = Path(self.config.output_dir) / experiment_name
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save individual results
        for model_id, result in results.items():
            model_dir = output_dir / model_id.replace(":", "_")
            model_dir.mkdir(exist_ok=True)
            result.save_to_file(model_dir / "results.json")

        # Save comparison report
        comparison_path = output_dir / "comparison_report.json"
        with open(comparison_path, 'w', encoding='utf-8') as f:
            json.dump(comparison_report, f, indent=2, ensure_ascii=False)

        logger.info(f"Saved comparison results to {output_dir}")

    def _generate_comparison_report(self, results: Dict[str, EvaluationResult]) -> Dict[str, Any]:
        """Generate comparison report from multiple model results."""
        report = {
            "models_compared": list(results.keys()),
            "comparison_timestamp": results[list(results.keys())[0]].timestamp,
            "metric_comparison": {},
            "rankings": {},
            "best_model_per_metric": {}
        }

        # Extract all metrics
        all_metrics = set()
        for result in results.values():
            all_metrics.update(result.metrics.keys())

        # Compare each metric
        for metric in all_metrics:
            metric_values = {}
            for model_id, result in results.items():
                if metric in result.metrics:
                    metric_values[model_id] = result.metrics[metric]

            if metric_values:
                # Determine if higher is better
                higher_is_better = metric not in ["perplexity", "loss", "error_rate"]

                # Find best model
                best_model = max(metric_values.items(), key=lambda x: x[1]) if higher_is_better else min(metric_values.items(), key=lambda x: x[1])

                # Create ranking
                sorted_models = sorted(metric_values.items(), key=lambda x: x[1], reverse=higher_is_better)

                report["metric_comparison"][metric] = metric_values
                report["rankings"][metric] = [{"model": model, "value": value} for model, value in sorted_models]
                report["best_model_per_metric"][metric] = {"model": best_model[0], "value": best_model[1]}

        return report

    def get_configuration(self) -> Dict[str, Any]:
        """Get current factory configuration."""
        return self.config.to_dict()

    def get_active_evaluations(self) -> List[str]:
        """Get list of currently running evaluations."""
        return list(self._active_evaluations.keys())

    async def stop_evaluation(self, evaluation_id: str) -> bool:
        """Stop a running evaluation."""
        if evaluation_id in self._active_evaluations:
            task = self._active_evaluations[evaluation_id]
            task.cancel()
            del self._active_evaluations[evaluation_id]
            logger.info(f"Stopped evaluation: {evaluation_id}")
            return True
        return False

    async def cleanup(self) -> None:
        """Cleanup resources and stop all running evaluations."""
        # Cancel all active evaluations
        for evaluation_id in list(self._active_evaluations.keys()):
            await self.stop_evaluation(evaluation_id)

        # Close experiment tracker
        if self.experiment_tracker and self.experiment_tracker.is_running:
            await self.experiment_tracker.end_run()

        logger.info("EvaluationFactory cleanup completed")


# Convenience functions for quick evaluation
async def evaluate_llm_quick(model_name: str,
                             provider: str,
                             dataset_path: str,
                             metrics: Optional[List[str]] = None) -> EvaluationResult:
    """
    Quick LLM evaluation function.

    Args:
        model_name: Name of the model
        provider: Model provider
        dataset_path: Path to dataset
        metrics: Metrics to compute

    Returns:
        Evaluation results
    """
    factory = EvaluationFactory()
    try:
        return await factory.evaluate_llm(
            model_name=model_name,
            provider=provider,
            dataset_path=dataset_path,
            metrics=metrics
        )
    finally:
        await factory.cleanup()


async def run_benchmark_quick(model_name: str,
                              provider: str,
                              benchmark_name: str) -> EvaluationResult:
    """
    Quick benchmark evaluation function.

    Args:
        model_name: Name of the model
        provider: Model provider
        benchmark_name: Benchmark name

    Returns:
        Benchmark results
    """
    factory = EvaluationFactory()
    try:
        return await factory.run_benchmark(
            model_name=model_name,
            provider=provider,
            benchmark_name=benchmark_name
        )
    finally:
        await factory.cleanup()