isa-model 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isa_model/__init__.py +30 -1
- isa_model/client.py +770 -0
- isa_model/core/config/__init__.py +16 -0
- isa_model/core/config/config_manager.py +514 -0
- isa_model/core/config.py +426 -0
- isa_model/core/models/model_billing_tracker.py +476 -0
- isa_model/core/models/model_manager.py +399 -0
- isa_model/core/{storage/supabase_storage.py → models/model_repo.py} +72 -73
- isa_model/core/pricing_manager.py +426 -0
- isa_model/core/services/__init__.py +19 -0
- isa_model/core/services/intelligent_model_selector.py +547 -0
- isa_model/core/types.py +291 -0
- isa_model/deployment/__init__.py +2 -0
- isa_model/deployment/cloud/modal/isa_vision_doc_service.py +157 -3
- isa_model/deployment/cloud/modal/isa_vision_table_service.py +532 -0
- isa_model/deployment/cloud/modal/isa_vision_ui_service.py +104 -3
- isa_model/deployment/cloud/modal/register_models.py +321 -0
- isa_model/deployment/runtime/deployed_service.py +338 -0
- isa_model/deployment/services/__init__.py +9 -0
- isa_model/deployment/services/auto_deploy_vision_service.py +537 -0
- isa_model/deployment/services/model_service.py +332 -0
- isa_model/deployment/services/service_monitor.py +356 -0
- isa_model/deployment/services/service_registry.py +527 -0
- isa_model/eval/__init__.py +80 -44
- isa_model/eval/config/__init__.py +10 -0
- isa_model/eval/config/evaluation_config.py +108 -0
- isa_model/eval/evaluators/__init__.py +18 -0
- isa_model/eval/evaluators/base_evaluator.py +503 -0
- isa_model/eval/evaluators/llm_evaluator.py +472 -0
- isa_model/eval/factory.py +417 -709
- isa_model/eval/infrastructure/__init__.py +24 -0
- isa_model/eval/infrastructure/experiment_tracker.py +466 -0
- isa_model/eval/metrics.py +191 -21
- isa_model/inference/ai_factory.py +181 -605
- isa_model/inference/services/audio/base_stt_service.py +65 -1
- isa_model/inference/services/audio/base_tts_service.py +75 -1
- isa_model/inference/services/audio/openai_stt_service.py +189 -151
- isa_model/inference/services/audio/openai_tts_service.py +12 -10
- isa_model/inference/services/audio/replicate_tts_service.py +61 -56
- isa_model/inference/services/base_service.py +55 -17
- isa_model/inference/services/embedding/base_embed_service.py +65 -1
- isa_model/inference/services/embedding/ollama_embed_service.py +103 -43
- isa_model/inference/services/embedding/openai_embed_service.py +8 -10
- isa_model/inference/services/helpers/stacked_config.py +148 -0
- isa_model/inference/services/img/__init__.py +18 -0
- isa_model/inference/services/{vision → img}/base_image_gen_service.py +80 -1
- isa_model/inference/services/{stacked → img}/flux_professional_service.py +25 -1
- isa_model/inference/services/{stacked → img/helpers}/base_stacked_service.py +40 -35
- isa_model/inference/services/{vision → img}/replicate_image_gen_service.py +44 -31
- isa_model/inference/services/llm/__init__.py +3 -3
- isa_model/inference/services/llm/base_llm_service.py +492 -40
- isa_model/inference/services/llm/helpers/llm_prompts.py +258 -0
- isa_model/inference/services/llm/helpers/llm_utils.py +280 -0
- isa_model/inference/services/llm/ollama_llm_service.py +51 -17
- isa_model/inference/services/llm/openai_llm_service.py +70 -19
- isa_model/inference/services/llm/yyds_llm_service.py +24 -23
- isa_model/inference/services/vision/__init__.py +38 -4
- isa_model/inference/services/vision/base_vision_service.py +218 -117
- isa_model/inference/services/vision/{isA_vision_service.py → disabled/isA_vision_service.py} +98 -0
- isa_model/inference/services/{stacked → vision}/doc_analysis_service.py +1 -1
- isa_model/inference/services/vision/helpers/base_stacked_service.py +274 -0
- isa_model/inference/services/vision/helpers/image_utils.py +272 -3
- isa_model/inference/services/vision/helpers/vision_prompts.py +297 -0
- isa_model/inference/services/vision/openai_vision_service.py +104 -307
- isa_model/inference/services/vision/replicate_vision_service.py +140 -325
- isa_model/inference/services/{stacked → vision}/ui_analysis_service.py +2 -498
- isa_model/scripts/register_models.py +370 -0
- isa_model/scripts/register_models_with_embeddings.py +510 -0
- isa_model/serving/api/fastapi_server.py +6 -1
- isa_model/serving/api/routes/unified.py +202 -0
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/METADATA +4 -1
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/RECORD +77 -53
- isa_model/config/__init__.py +0 -9
- isa_model/config/config_manager.py +0 -213
- isa_model/core/model_manager.py +0 -213
- isa_model/core/model_registry.py +0 -375
- isa_model/core/vision_models_init.py +0 -116
- isa_model/inference/billing_tracker.py +0 -406
- isa_model/inference/services/llm/triton_llm_service.py +0 -481
- isa_model/inference/services/stacked/__init__.py +0 -26
- isa_model/inference/services/stacked/config.py +0 -426
- isa_model/inference/services/vision/ollama_vision_service.py +0 -194
- /isa_model/core/{model_storage.py → models/model_storage.py} +0 -0
- /isa_model/inference/services/{vision → embedding}/helpers/text_splitter.py +0 -0
- /isa_model/inference/services/llm/{llm_adapter.py → helpers/llm_adapter.py} +0 -0
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/WHEEL +0 -0
- {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/top_level.txt +0 -0
isa_model/eval/factory.py
CHANGED
@@ -1,823 +1,531 @@

The 0.3.5 version (823 lines) was a synchronous factory with optional `wandb`/`mlflow` imports, `_start_experiment`/`_log_metrics`/`_end_experiment` helpers, and `model_path`-based `evaluate_llm`, `evaluate_generation_quality`, `run_benchmark`, `evaluate_image_model`, multimodal, and report-generation methods. It is replaced by the 531-line async implementation:

````python
"""
Enterprise-Grade Evaluation Factory for ISA Model Framework

Implements industry best practices for AI model evaluation at scale:
- Async evaluation with concurrency control
- Comprehensive experiment tracking (W&B, MLflow)
- Distributed evaluation support
- Production-ready monitoring and alerting
- Cost tracking and optimization
- Reproducible evaluation pipelines
"""

import asyncio
import logging
from typing import Optional, Dict, Any, List, Union, Callable
from pathlib import Path
import json

from .evaluators import LLMEvaluator, VisionEvaluator, MultimodalEvaluator, EvaluationResult
from .infrastructure import ExperimentTracker, create_experiment_tracker
from .config import EvaluationConfig

logger = logging.getLogger(__name__)


class EvaluationFactory:
    """
    Enterprise-grade evaluation factory implementing MLOps best practices.

    Features:
    - Multi-modal evaluation support (LLM, Vision, Multimodal)
    - Async evaluation with smart concurrency management
    - Comprehensive experiment tracking and visualization
    - Cost optimization and resource monitoring
    - Distributed evaluation across multiple GPUs/nodes
    - Production-ready error handling and retry logic
    - Automated result storage and comparison

    Example usage:
    ```python
    from isa_model.eval import EvaluationFactory

    # Initialize with experiment tracking
    factory = EvaluationFactory(
        experiment_tracking={
            "type": "wandb",
            "project": "model-evaluation",
            "entity": "my-team"
        }
    )

    # Evaluate LLM on dataset
    result = await factory.evaluate_llm(
        model_name="gpt-4.1-mini",
        provider="openai",
        dataset_path="path/to/evaluation_data.json",
        metrics=["accuracy", "f1_score", "bleu_score"],
        save_results=True
    )

    # Run benchmark evaluation
    benchmark_result = await factory.run_benchmark(
        model_name="claude-sonnet-4",
        provider="yyds",
        benchmark_name="mmlu",
        subjects=["math", "physics", "chemistry"]
    )

    # Compare multiple models
    comparison = await factory.compare_models(
        models=[
            {"name": "gpt-4.1-mini", "provider": "openai"},
            {"name": "claude-sonnet-4", "provider": "yyds"}
        ],
        dataset_path="comparison_dataset.json"
    )
    ```
    """

    def __init__(self,
                 config: Optional[Union[Dict[str, Any], EvaluationConfig]] = None,
                 experiment_tracking: Optional[Dict[str, Any]] = None,
                 output_dir: Optional[str] = None):
        """
        Initialize the enterprise evaluation factory.

        Args:
            config: Evaluation configuration (dict or EvaluationConfig object)
            experiment_tracking: Experiment tracking configuration
            output_dir: Output directory for results
        """
        # Initialize configuration
        if isinstance(config, dict):
            self.config = EvaluationConfig.from_dict(config)
        elif isinstance(config, EvaluationConfig):
            self.config = config
        else:
            self.config = EvaluationConfig()

        # Override output directory if provided
        if output_dir:
            self.config.output_dir = output_dir

        # Initialize experiment tracker
        self.experiment_tracker = None
        if experiment_tracking:
            try:
                self.experiment_tracker = create_experiment_tracker(**experiment_tracking)
                logger.info(f"Initialized experiment tracking: {experiment_tracking['type']}")
            except Exception as e:
                logger.warning(f"Failed to initialize experiment tracking: {e}")

        # Initialize evaluators
        self.llm_evaluator = LLMEvaluator(
            config=self.config.to_dict(),
            experiment_tracker=self.experiment_tracker
        )

        # State tracking
        self._active_evaluations: Dict[str, asyncio.Task] = {}

        logger.info(f"EvaluationFactory initialized with output dir: {self.config.output_dir}")

    async def evaluate_llm(self,
                           model_name: str,
                           provider: str = "openai",
                           dataset_path: Optional[str] = None,
                           dataset: Optional[List[Dict[str, Any]]] = None,
                           metrics: Optional[List[str]] = None,
                           batch_size: Optional[int] = None,
                           save_results: bool = True,
                           experiment_name: Optional[str] = None,
                           progress_callback: Optional[Callable] = None) -> EvaluationResult:
        """
        Evaluate LLM with comprehensive metrics and tracking.

        Args:
            model_name: Name of the model to evaluate
            provider: Model provider (openai, yyds, ollama, etc.)
            dataset_path: Path to evaluation dataset JSON file
            dataset: Direct dataset input (alternative to dataset_path)
            metrics: List of metrics to compute
            batch_size: Batch size for evaluation
            save_results: Whether to save results to disk
            experiment_name: Custom experiment name
            progress_callback: Optional progress callback function

        Returns:
            Comprehensive evaluation results
        """
        # Load dataset
        if dataset is None:
            if dataset_path is None:
                raise ValueError("Either dataset_path or dataset must be provided")
            dataset = self._load_dataset(dataset_path)

        # Configure LLM evaluator
        llm_config = {
            "provider": provider,
            "model_name": model_name,
            "batch_size": batch_size or self.config.batch_size,
            "temperature": self.config.default_temperature,
            "max_tokens": self.config.default_max_tokens
        }

        self.llm_evaluator.config.update(llm_config)

        # Generate experiment name
        dataset_name = Path(dataset_path).stem if dataset_path else "custom_dataset"
        experiment_name = experiment_name or f"llm_eval_{model_name}_{dataset_name}"

        # Run evaluation
        result = await self.llm_evaluator.evaluate(
            model_interface=None,  # Will use AI factory
            dataset=dataset,
            dataset_name=dataset_name,
            model_name=f"{provider}:{model_name}",
            batch_size=batch_size,
            progress_callback=progress_callback
        )

        # Save results if requested
        if save_results:
            await self._save_results(result, experiment_name)

        return result

    async def run_benchmark(self,
                            model_name: str,
                            provider: str,
                            benchmark_name: str,
                            subjects: Optional[List[str]] = None,
                            max_samples: Optional[int] = None,
                            few_shot: bool = True,
                            num_shots: int = 5,
                            save_results: bool = True,
                            experiment_name: Optional[str] = None) -> EvaluationResult:
        """
        Run standardized benchmark evaluation.

        Args:
            model_name: Name of the model to evaluate
            provider: Model provider
            benchmark_name: Name of benchmark (mmlu, hellaswag, arc, gsm8k, etc.)
            subjects: List of subjects to evaluate (for MMLU)
            max_samples: Maximum number of samples to evaluate
            few_shot: Whether to use few-shot examples
            num_shots: Number of few-shot examples
            save_results: Whether to save results
            experiment_name: Custom experiment name

        Returns:
            Benchmark evaluation results
        """
        # Load benchmark dataset
        benchmark_dataset = await self._load_benchmark(
            benchmark_name,
            subjects=subjects,
            max_samples=max_samples,
            few_shot=few_shot,
            num_shots=num_shots
        )

        # Configure for benchmark evaluation
        benchmark_config = {
            "provider": provider,
            "model_name": model_name,
            "temperature": 0.0,  # Deterministic for benchmarks
            "max_tokens": 50,  # Short answers for most benchmarks
            "task_type": "benchmark",
            "benchmark_name": benchmark_name
        }

        self.llm_evaluator.config.update(benchmark_config)

        # Generate experiment name
        experiment_name = experiment_name or f"benchmark_{benchmark_name}_{model_name}"

        # Run evaluation
        result = await self.llm_evaluator.evaluate(
            model_interface=None,
            dataset=benchmark_dataset,
            dataset_name=benchmark_name,
            model_name=f"{provider}:{model_name}",
            batch_size=self.config.batch_size
        )

        # Add benchmark-specific metadata
        result.config.update({
            "benchmark_name": benchmark_name,
            "subjects": subjects,
            "few_shot": few_shot,
            "num_shots": num_shots
        })

        # Save results if requested
        if save_results:
            await self._save_results(result, experiment_name)

        return result

    async def compare_models(self,
                             models: List[Dict[str, str]],
                             dataset_path: Optional[str] = None,
                             dataset: Optional[List[Dict[str, Any]]] = None,
                             benchmark_name: Optional[str] = None,
                             metrics: Optional[List[str]] = None,
                             save_results: bool = True,
                             experiment_name: Optional[str] = None) -> Dict[str, EvaluationResult]:
        """
        Compare multiple models on the same evaluation task.

        Args:
            models: List of model configs [{"name": "gpt-4", "provider": "openai"}, ...]
            dataset_path: Path to evaluation dataset
            dataset: Direct dataset input
            benchmark_name: Benchmark name (alternative to dataset)
            metrics: Metrics to compute
            save_results: Whether to save comparison results
            experiment_name: Custom experiment name

        Returns:
            Dictionary mapping model names to evaluation results
        """
        results = {}

        # Run evaluations concurrently (with concurrency limits)
        semaphore = asyncio.Semaphore(self.config.max_concurrent_evaluations)

        async def evaluate_single_model(model_config: Dict[str, str]) -> tuple:
            async with semaphore:
                model_name = model_config["name"]
                provider = model_config["provider"]

                if benchmark_name:
                    result = await self.run_benchmark(
                        model_name=model_name,
                        provider=provider,
                        benchmark_name=benchmark_name,
                        save_results=False  # Save comparison results together
                    )
                else:
                    result = await self.evaluate_llm(
                        model_name=model_name,
                        provider=provider,
                        dataset_path=dataset_path,
                        dataset=dataset,
                        metrics=metrics,
                        save_results=False
                    )

                return f"{provider}:{model_name}", result

        # Execute all evaluations
        tasks = [evaluate_single_model(model) for model in models]
        evaluation_results = await asyncio.gather(*tasks)

        # Collect results
        for model_id, result in evaluation_results:
            results[model_id] = result

        # Generate comparison report
        comparison_report = self._generate_comparison_report(results)

        # Save results if requested
        if save_results:
            experiment_name = experiment_name or f"model_comparison_{len(models)}_models"
            await self._save_comparison_results(results, comparison_report, experiment_name)

        return results

    def _load_dataset(self, dataset_path: str) -> List[Dict[str, Any]]:
        """Load dataset from file."""
        with open(dataset_path, 'r', encoding='utf-8') as f:
            if dataset_path.endswith('.json'):
                dataset = json.load(f)
            elif dataset_path.endswith('.jsonl'):
                dataset = [json.loads(line) for line in f]
            else:
                raise ValueError(f"Unsupported dataset format: {dataset_path}")

        logger.info(f"Loaded dataset with {len(dataset)} samples from {dataset_path}")
        return dataset

    async def _load_benchmark(self,
                              benchmark_name: str,
                              subjects: Optional[List[str]] = None,
                              max_samples: Optional[int] = None,
                              few_shot: bool = True,
                              num_shots: int = 5) -> List[Dict[str, Any]]:
        """Load benchmark dataset."""
        # This would integrate with the benchmark loaders
        # For now, return a placeholder
        logger.warning(f"Benchmark {benchmark_name} loading not yet implemented")

        # Placeholder benchmark data
        return [
            {
                "id": f"sample_{i}",
                "prompt": f"Sample question {i} for {benchmark_name}",
                "reference": "A",
                "choices": ["A", "B", "C", "D"] if benchmark_name != "gsm8k" else None
            }
            for i in range(min(max_samples or 10, 10))
        ]

    async def _save_results(self, result: EvaluationResult, experiment_name: str) -> None:
        """Save evaluation results to disk."""
        # Create output directory
        output_dir = Path(self.config.output_dir) / experiment_name
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save main results
        results_path = output_dir / "results.json"
        result.save_to_file(results_path)

        # Save detailed predictions if available
        if result.sample_results:
            predictions_path = output_dir / "predictions.json"
            with open(predictions_path, 'w', encoding='utf-8') as f:
                json.dump(result.sample_results, f, indent=2, ensure_ascii=False)

        # Save summary
        summary_path = output_dir / "summary.json"
        with open(summary_path, 'w', encoding='utf-8') as f:
            json.dump(result.get_summary(), f, indent=2, ensure_ascii=False)

        logger.info(f"Saved evaluation results to {output_dir}")

    async def _save_comparison_results(self,
                                       results: Dict[str, EvaluationResult],
                                       comparison_report: Dict[str, Any],
                                       experiment_name: str) -> None:
        """Save model comparison results."""
        output_dir = Path(self.config.output_dir) / experiment_name
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save individual results
        for model_id, result in results.items():
            model_dir = output_dir / model_id.replace(":", "_")
            model_dir.mkdir(exist_ok=True)
            result.save_to_file(model_dir / "results.json")

        # Save comparison report
        comparison_path = output_dir / "comparison_report.json"
        with open(comparison_path, 'w', encoding='utf-8') as f:
            json.dump(comparison_report, f, indent=2, ensure_ascii=False)

        logger.info(f"Saved comparison results to {output_dir}")

    def _generate_comparison_report(self, results: Dict[str, EvaluationResult]) -> Dict[str, Any]:
        """Generate comparison report from multiple model results."""
        report = {
            "models_compared": list(results.keys()),
            "comparison_timestamp": results[list(results.keys())[0]].timestamp,
            "metric_comparison": {},
            "rankings": {},
            "best_model_per_metric": {}
        }

        # Extract all metrics
        all_metrics = set()
        for result in results.values():
            all_metrics.update(result.metrics.keys())

        # Compare each metric
        for metric in all_metrics:
            metric_values = {}
            for model_id, result in results.items():
                if metric in result.metrics:
                    metric_values[model_id] = result.metrics[metric]

            if metric_values:
                # Determine if higher is better
                higher_is_better = metric not in ["perplexity", "loss", "error_rate"]

                # Find best model
                best_model = max(metric_values.items(), key=lambda x: x[1]) if higher_is_better else min(metric_values.items(), key=lambda x: x[1])

                # Create ranking
                sorted_models = sorted(metric_values.items(), key=lambda x: x[1], reverse=higher_is_better)

                report["metric_comparison"][metric] = metric_values
                report["rankings"][metric] = [{"model": model, "value": value} for model, value in sorted_models]
                report["best_model_per_metric"][metric] = {"model": best_model[0], "value": best_model[1]}

        return report

    def get_configuration(self) -> Dict[str, Any]:
        """Get current factory configuration."""
        return self.config.to_dict()

    def get_active_evaluations(self) -> List[str]:
        """Get list of currently running evaluations."""
        return list(self._active_evaluations.keys())

    async def stop_evaluation(self, evaluation_id: str) -> bool:
        """Stop a running evaluation."""
        if evaluation_id in self._active_evaluations:
            task = self._active_evaluations[evaluation_id]
            task.cancel()
            del self._active_evaluations[evaluation_id]
            logger.info(f"Stopped evaluation: {evaluation_id}")
            return True
        return False

    async def cleanup(self) -> None:
        """Cleanup resources and stop all running evaluations."""
        # Cancel all active evaluations
        for evaluation_id in list(self._active_evaluations.keys()):
            await self.stop_evaluation(evaluation_id)

        # Close experiment tracker
        if self.experiment_tracker and self.experiment_tracker.is_running:
            await self.experiment_tracker.end_run()

        logger.info("EvaluationFactory cleanup completed")


# Convenience functions for quick evaluation
async def evaluate_llm_quick(model_name: str,
                             provider: str,
                             dataset_path: str,
                             metrics: Optional[List[str]] = None) -> EvaluationResult:
    """
    Quick LLM evaluation function.

    Args:
        model_name: Name of the model
        provider: Model provider
        dataset_path: Path to dataset
        metrics: Metrics to compute

    Returns:
        Evaluation results
    """
    factory = EvaluationFactory()
    try:
        return await factory.evaluate_llm(
            model_name=model_name,
            provider=provider,
            dataset_path=dataset_path,
            metrics=metrics
        )
    finally:
        await factory.cleanup()


async def run_benchmark_quick(model_name: str,
                              provider: str,
                              benchmark_name: str) -> EvaluationResult:
    """
    Quick benchmark evaluation function.

    Args:
        model_name: Name of the model
        provider: Model provider
        benchmark_name: Benchmark name

    Returns:
        Benchmark results
    """
    factory = EvaluationFactory()
    try:
        return await factory.run_benchmark(
            model_name=model_name,
            provider=provider,
            benchmark_name=benchmark_name
        )
    finally:
        await factory.cleanup()
````