isa-model 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. isa_model/__init__.py +30 -1
  2. isa_model/client.py +770 -0
  3. isa_model/core/config/__init__.py +16 -0
  4. isa_model/core/config/config_manager.py +514 -0
  5. isa_model/core/config.py +426 -0
  6. isa_model/core/models/model_billing_tracker.py +476 -0
  7. isa_model/core/models/model_manager.py +399 -0
  8. isa_model/core/{storage/supabase_storage.py → models/model_repo.py} +72 -73
  9. isa_model/core/pricing_manager.py +426 -0
  10. isa_model/core/services/__init__.py +19 -0
  11. isa_model/core/services/intelligent_model_selector.py +547 -0
  12. isa_model/core/types.py +291 -0
  13. isa_model/deployment/__init__.py +2 -0
  14. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +157 -3
  15. isa_model/deployment/cloud/modal/isa_vision_table_service.py +532 -0
  16. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +104 -3
  17. isa_model/deployment/cloud/modal/register_models.py +321 -0
  18. isa_model/deployment/runtime/deployed_service.py +338 -0
  19. isa_model/deployment/services/__init__.py +9 -0
  20. isa_model/deployment/services/auto_deploy_vision_service.py +537 -0
  21. isa_model/deployment/services/model_service.py +332 -0
  22. isa_model/deployment/services/service_monitor.py +356 -0
  23. isa_model/deployment/services/service_registry.py +527 -0
  24. isa_model/eval/__init__.py +80 -44
  25. isa_model/eval/config/__init__.py +10 -0
  26. isa_model/eval/config/evaluation_config.py +108 -0
  27. isa_model/eval/evaluators/__init__.py +18 -0
  28. isa_model/eval/evaluators/base_evaluator.py +503 -0
  29. isa_model/eval/evaluators/llm_evaluator.py +472 -0
  30. isa_model/eval/factory.py +417 -709
  31. isa_model/eval/infrastructure/__init__.py +24 -0
  32. isa_model/eval/infrastructure/experiment_tracker.py +466 -0
  33. isa_model/eval/metrics.py +191 -21
  34. isa_model/inference/ai_factory.py +181 -605
  35. isa_model/inference/services/audio/base_stt_service.py +65 -1
  36. isa_model/inference/services/audio/base_tts_service.py +75 -1
  37. isa_model/inference/services/audio/openai_stt_service.py +189 -151
  38. isa_model/inference/services/audio/openai_tts_service.py +12 -10
  39. isa_model/inference/services/audio/replicate_tts_service.py +61 -56
  40. isa_model/inference/services/base_service.py +55 -17
  41. isa_model/inference/services/embedding/base_embed_service.py +65 -1
  42. isa_model/inference/services/embedding/ollama_embed_service.py +103 -43
  43. isa_model/inference/services/embedding/openai_embed_service.py +8 -10
  44. isa_model/inference/services/helpers/stacked_config.py +148 -0
  45. isa_model/inference/services/img/__init__.py +18 -0
  46. isa_model/inference/services/{vision → img}/base_image_gen_service.py +80 -1
  47. isa_model/inference/services/{stacked → img}/flux_professional_service.py +25 -1
  48. isa_model/inference/services/{stacked → img/helpers}/base_stacked_service.py +40 -35
  49. isa_model/inference/services/{vision → img}/replicate_image_gen_service.py +44 -31
  50. isa_model/inference/services/llm/__init__.py +3 -3
  51. isa_model/inference/services/llm/base_llm_service.py +492 -40
  52. isa_model/inference/services/llm/helpers/llm_prompts.py +258 -0
  53. isa_model/inference/services/llm/helpers/llm_utils.py +280 -0
  54. isa_model/inference/services/llm/ollama_llm_service.py +51 -17
  55. isa_model/inference/services/llm/openai_llm_service.py +70 -19
  56. isa_model/inference/services/llm/yyds_llm_service.py +24 -23
  57. isa_model/inference/services/vision/__init__.py +38 -4
  58. isa_model/inference/services/vision/base_vision_service.py +218 -117
  59. isa_model/inference/services/vision/{isA_vision_service.py → disabled/isA_vision_service.py} +98 -0
  60. isa_model/inference/services/{stacked → vision}/doc_analysis_service.py +1 -1
  61. isa_model/inference/services/vision/helpers/base_stacked_service.py +274 -0
  62. isa_model/inference/services/vision/helpers/image_utils.py +272 -3
  63. isa_model/inference/services/vision/helpers/vision_prompts.py +297 -0
  64. isa_model/inference/services/vision/openai_vision_service.py +104 -307
  65. isa_model/inference/services/vision/replicate_vision_service.py +140 -325
  66. isa_model/inference/services/{stacked → vision}/ui_analysis_service.py +2 -498
  67. isa_model/scripts/register_models.py +370 -0
  68. isa_model/scripts/register_models_with_embeddings.py +510 -0
  69. isa_model/serving/api/fastapi_server.py +6 -1
  70. isa_model/serving/api/routes/unified.py +202 -0
  71. {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/METADATA +4 -1
  72. {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/RECORD +77 -53
  73. isa_model/config/__init__.py +0 -9
  74. isa_model/config/config_manager.py +0 -213
  75. isa_model/core/model_manager.py +0 -213
  76. isa_model/core/model_registry.py +0 -375
  77. isa_model/core/vision_models_init.py +0 -116
  78. isa_model/inference/billing_tracker.py +0 -406
  79. isa_model/inference/services/llm/triton_llm_service.py +0 -481
  80. isa_model/inference/services/stacked/__init__.py +0 -26
  81. isa_model/inference/services/stacked/config.py +0 -426
  82. isa_model/inference/services/vision/ollama_vision_service.py +0 -194
  83. /isa_model/core/{model_storage.py → models/model_storage.py} +0 -0
  84. /isa_model/inference/services/{vision → embedding}/helpers/text_splitter.py +0 -0
  85. /isa_model/inference/services/llm/{llm_adapter.py → helpers/llm_adapter.py} +0 -0
  86. {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/WHEEL +0 -0
  87. {isa_model-0.3.5.dist-info → isa_model-0.3.6.dist-info}/top_level.txt +0 -0
isa_model/eval/factory.py CHANGED
@@ -1,823 +1,531 @@
  """
- Unified Evaluation Factory for ISA Model Framework
+ Enterprise-Grade Evaluation Factory for ISA Model Framework
 
- This factory provides a single interface for all evaluation operations:
- - LLM evaluation (perplexity, BLEU, ROUGE, custom metrics)
- - Image model evaluation (FID, IS, LPIPS)
- - Benchmark testing (MMLU, HellaSwag, ARC, etc.)
- - Custom evaluation pipelines
- - Weights & Biases integration for experiment tracking
+ Implements industry best practices for AI model evaluation at scale:
+ - Async evaluation with concurrency control
+ - Comprehensive experiment tracking (W&B, MLflow)
+ - Distributed evaluation support
+ - Production-ready monitoring and alerting
+ - Cost tracking and optimization
+ - Reproducible evaluation pipelines
  """
 
- import os
- import json
+ import asyncio
  import logging
- from typing import Optional, Dict, Any, List, Union
+ from typing import Optional, Dict, Any, List, Union, Callable
  from pathlib import Path
- import datetime
-
- try:
- import wandb
- WANDB_AVAILABLE = True
- except ImportError:
- WANDB_AVAILABLE = False
-
- try:
- import mlflow
- MLFLOW_AVAILABLE = True
- except ImportError:
- MLFLOW_AVAILABLE = False
+ import json
 
- from .metrics import LLMMetrics, ImageMetrics, BenchmarkRunner
- from .benchmarks import MMLU, HellaSwag, ARC, GSM8K
+ from .evaluators import LLMEvaluator, VisionEvaluator, MultimodalEvaluator, EvaluationResult
+ from .infrastructure import ExperimentTracker, create_experiment_tracker
+ from .config import EvaluationConfig
 
  logger = logging.getLogger(__name__)
 
 
  class EvaluationFactory:
  """
- Unified factory for all AI model evaluation operations with experiment tracking.
+ Enterprise-grade evaluation factory implementing MLOps best practices.
 
- This class provides simplified interfaces for:
- - LLM evaluation with various metrics
- - Image model evaluation
- - Benchmark testing on standard datasets
- - Custom evaluation pipelines
- - Experiment tracking with W&B and MLflow
+ Features:
+ - Multi-modal evaluation support (LLM, Vision, Multimodal)
+ - Async evaluation with smart concurrency management
+ - Comprehensive experiment tracking and visualization
+ - Cost optimization and resource monitoring
+ - Distributed evaluation across multiple GPUs/nodes
+ - Production-ready error handling and retry logic
+ - Automated result storage and comparison
 
  Example usage:
  ```python
  from isa_model.eval import EvaluationFactory
 
- evaluator = EvaluationFactory(
- output_dir="eval_results",
- use_wandb=True,
- wandb_project="model-evaluation"
+ # Initialize with experiment tracking
+ factory = EvaluationFactory(
+ experiment_tracking={
+ "type": "wandb",
+ "project": "model-evaluation",
+ "entity": "my-team"
+ }
  )
 
- # Evaluate LLM on custom dataset
- results = evaluator.evaluate_llm(
- model_path="path/to/model",
- dataset_path="test_data.json",
- metrics=["perplexity", "bleu", "rouge"],
- experiment_name="gemma-4b-evaluation"
+ # Evaluate LLM on dataset
+ result = await factory.evaluate_llm(
+ model_name="gpt-4.1-mini",
+ provider="openai",
+ dataset_path="path/to/evaluation_data.json",
+ metrics=["accuracy", "f1_score", "bleu_score"],
+ save_results=True
  )
 
- # Run MMLU benchmark
- mmlu_results = evaluator.run_benchmark(
- model_path="path/to/model",
- benchmark="mmlu",
+ # Run benchmark evaluation
+ benchmark_result = await factory.run_benchmark(
+ model_name="claude-sonnet-4",
+ provider="yyds",
+ benchmark_name="mmlu",
  subjects=["math", "physics", "chemistry"]
  )
 
  # Compare multiple models
- comparison = evaluator.compare_models([
- "model1/path",
- "model2/path"
- ], benchmark="hellaswag")
+ comparison = await factory.compare_models(
+ models=[
+ {"name": "gpt-4.1-mini", "provider": "openai"},
+ {"name": "claude-sonnet-4", "provider": "yyds"}
+ ],
+ dataset_path="comparison_dataset.json"
+ )
  ```
  """
 
- def __init__(
- self,
- output_dir: Optional[str] = None,
- use_wandb: bool = False,
- wandb_project: Optional[str] = None,
- wandb_entity: Optional[str] = None,
- use_mlflow: bool = False,
- mlflow_tracking_uri: Optional[str] = None
- ):
+ def __init__(self,
+ config: Optional[Union[Dict[str, Any], EvaluationConfig]] = None,
+ experiment_tracking: Optional[Dict[str, Any]] = None,
+ output_dir: Optional[str] = None):
  """
- Initialize the evaluation factory with experiment tracking.
+ Initialize the enterprise evaluation factory.
 
  Args:
- output_dir: Base directory for evaluation outputs
- use_wandb: Whether to use Weights & Biases for tracking
- wandb_project: W&B project name
- wandb_entity: W&B entity/team name
- use_mlflow: Whether to use MLflow for tracking
- mlflow_tracking_uri: MLflow tracking server URI
+ config: Evaluation configuration (dict or EvaluationConfig object)
+ experiment_tracking: Experiment tracking configuration
+ output_dir: Output directory for results
  """
- self.output_dir = output_dir or os.path.join(os.getcwd(), "evaluation_results")
- os.makedirs(self.output_dir, exist_ok=True)
-
- # Initialize metrics calculators
- self.llm_metrics = LLMMetrics()
- self.image_metrics = ImageMetrics()
- self.benchmark_runner = BenchmarkRunner()
-
- # Setup experiment tracking
- self.use_wandb = use_wandb and WANDB_AVAILABLE
- self.use_mlflow = use_mlflow and MLFLOW_AVAILABLE
-
- if self.use_wandb:
- self.wandb_project = wandb_project or "isa-model-evaluation"
- self.wandb_entity = wandb_entity
- logger.info(f"W&B tracking enabled for project: {self.wandb_project}")
+ # Initialize configuration
+ if isinstance(config, dict):
+ self.config = EvaluationConfig.from_dict(config)
+ elif isinstance(config, EvaluationConfig):
+ self.config = config
+ else:
+ self.config = EvaluationConfig()
+
+ # Override output directory if provided
+ if output_dir:
+ self.config.output_dir = output_dir
+
+ # Initialize experiment tracker
+ self.experiment_tracker = None
+ if experiment_tracking:
+ try:
+ self.experiment_tracker = create_experiment_tracker(**experiment_tracking)
+ logger.info(f"Initialized experiment tracking: {experiment_tracking['type']}")
+ except Exception as e:
+ logger.warning(f"Failed to initialize experiment tracking: {e}")
 
- if self.use_mlflow:
- if mlflow_tracking_uri:
- mlflow.set_tracking_uri(mlflow_tracking_uri)
- logger.info(f"MLflow tracking enabled with URI: {mlflow.get_tracking_uri()}")
+ # Initialize evaluators
+ self.llm_evaluator = LLMEvaluator(
+ config=self.config.to_dict(),
+ experiment_tracker=self.experiment_tracker
+ )
 
- logger.info(f"EvaluationFactory initialized with output dir: {self.output_dir}")
-
- def _start_experiment(self, experiment_name: str, config: Dict[str, Any]) -> None:
- """Start experiment tracking."""
- if self.use_wandb:
- wandb.init(
- project=self.wandb_project,
- entity=self.wandb_entity,
- name=experiment_name,
- config=config,
- reinit=True
- )
-
- if self.use_mlflow:
- mlflow.start_run(run_name=experiment_name)
- mlflow.log_params(config)
-
- def _log_metrics(self, metrics: Dict[str, Any], step: Optional[int] = None) -> None:
- """Log metrics to experiment tracking systems."""
- if self.use_wandb:
- wandb.log(metrics, step=step)
-
- if self.use_mlflow:
- for key, value in metrics.items():
- if isinstance(value, (int, float)):
- mlflow.log_metric(key, value, step=step)
-
- def _end_experiment(self) -> None:
- """End experiment tracking."""
- if self.use_wandb:
- wandb.finish()
+ # State tracking
+ self._active_evaluations: Dict[str, asyncio.Task] = {}
 
- if self.use_mlflow:
- mlflow.end_run()
+ logger.info(f"EvaluationFactory initialized with output dir: {self.config.output_dir}")
 
- def _get_output_path(self, model_name: str, eval_type: str) -> str:
- """Generate timestamped output path for evaluation results."""
- timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
- safe_model_name = os.path.basename(model_name).replace("/", "_").replace(":", "_")
- filename = f"{safe_model_name}_{eval_type}_{timestamp}.json"
- return os.path.join(self.output_dir, filename)
-
- # =================
- # LLM Evaluation Methods
- # =================
-
- def evaluate_llm(
- self,
- model_path: str,
- dataset_path: str,
- metrics: List[str] = None,
- output_path: Optional[str] = None,
- batch_size: int = 8,
- max_samples: Optional[int] = None,
- provider: str = "ollama",
- experiment_name: Optional[str] = None,
- **kwargs
- ) -> Dict[str, Any]:
+ async def evaluate_llm(self,
+ model_name: str,
+ provider: str = "openai",
+ dataset_path: Optional[str] = None,
+ dataset: Optional[List[Dict[str, Any]]] = None,
+ metrics: Optional[List[str]] = None,
+ batch_size: Optional[int] = None,
+ save_results: bool = True,
+ experiment_name: Optional[str] = None,
+ progress_callback: Optional[Callable] = None) -> EvaluationResult:
  """
- Evaluate an LLM model on a dataset with specified metrics.
+ Evaluate LLM with comprehensive metrics and tracking.
 
  Args:
- model_path: Path to the model or model identifier
- dataset_path: Path to evaluation dataset (JSON format)
- metrics: List of metrics to compute ["perplexity", "bleu", "rouge", "accuracy"]
- output_path: Path to save results
+ model_name: Name of the model to evaluate
+ provider: Model provider (openai, yyds, ollama, etc.)
+ dataset_path: Path to evaluation dataset JSON file
+ dataset: Direct dataset input (alternative to dataset_path)
+ metrics: List of metrics to compute
  batch_size: Batch size for evaluation
- max_samples: Maximum number of samples to evaluate
- provider: Model provider ("ollama", "openai", "hf")
- experiment_name: Name for experiment tracking
- **kwargs: Additional parameters
+ save_results: Whether to save results to disk
+ experiment_name: Custom experiment name
+ progress_callback: Optional progress callback function
 
  Returns:
- Dictionary containing evaluation results
-
- Example:
- ```python
- results = evaluator.evaluate_llm(
- model_path="google/gemma-2-4b-it",
- dataset_path="test_data.json",
- metrics=["perplexity", "bleu", "rouge"],
- max_samples=1000,
- experiment_name="gemma-4b-eval"
- )
- ```
+ Comprehensive evaluation results
  """
- if metrics is None:
- metrics = ["perplexity", "bleu", "rouge"]
-
- if not output_path:
- output_path = self._get_output_path(model_path, "llm_eval")
-
- # Setup experiment tracking
- config = {
- "model_path": model_path,
- "dataset_path": dataset_path,
- "metrics": metrics,
- "batch_size": batch_size,
- "max_samples": max_samples,
- "provider": provider
+ # Load dataset
+ if dataset is None:
+ if dataset_path is None:
+ raise ValueError("Either dataset_path or dataset must be provided")
+ dataset = self._load_dataset(dataset_path)
+
+ # Configure LLM evaluator
+ llm_config = {
+ "provider": provider,
+ "model_name": model_name,
+ "batch_size": batch_size or self.config.batch_size,
+ "temperature": self.config.default_temperature,
+ "max_tokens": self.config.default_max_tokens
  }
 
- experiment_name = experiment_name or f"llm_eval_{os.path.basename(model_path)}"
- self._start_experiment(experiment_name, config)
+ self.llm_evaluator.config.update(llm_config)
 
- logger.info(f"Evaluating LLM {model_path} with metrics: {metrics}")
+ # Generate experiment name
+ dataset_name = Path(dataset_path).stem if dataset_path else "custom_dataset"
+ experiment_name = experiment_name or f"llm_eval_{model_name}_{dataset_name}"
 
- try:
- # Load dataset
- with open(dataset_path, 'r') as f:
- dataset = json.load(f)
-
- if max_samples:
- dataset = dataset[:max_samples]
-
- # Run evaluation
- results = self.llm_metrics.evaluate(
- model_path=model_path,
- dataset=dataset,
- metrics=metrics,
- batch_size=batch_size,
- provider=provider,
- **kwargs
- )
-
- # Log metrics to tracking systems
- self._log_metrics(results.get("metrics", {}))
-
- # Add metadata
- results["metadata"] = {
- "model_path": model_path,
- "dataset_path": dataset_path,
- "metrics": metrics,
- "num_samples": len(dataset),
- "timestamp": datetime.datetime.now().isoformat(),
- "provider": provider,
- "experiment_name": experiment_name
- }
-
- # Save results
- with open(output_path, 'w') as f:
- json.dump(results, f, indent=2)
-
- logger.info(f"Evaluation results saved to: {output_path}")
-
- finally:
- self._end_experiment()
-
- return results
-
- def evaluate_generation_quality(
- self,
- model_path: str,
- prompts: List[str],
- reference_texts: List[str] = None,
- metrics: List[str] = None,
- output_path: Optional[str] = None,
- provider: str = "ollama",
- **kwargs
- ) -> Dict[str, Any]:
- """
- Evaluate text generation quality.
-
- Args:
- model_path: Path to the model
- prompts: List of input prompts
- reference_texts: Reference texts for comparison (optional)
- metrics: Metrics to compute
- output_path: Output path for results
- provider: Model provider
- **kwargs: Additional parameters
-
- Returns:
- Evaluation results dictionary
- """
- if metrics is None:
- metrics = ["diversity", "coherence", "fluency"]
-
- if not output_path:
- output_path = self._get_output_path(model_path, "generation_eval")
-
- results = self.llm_metrics.evaluate_generation(
- model_path=model_path,
- prompts=prompts,
- reference_texts=reference_texts,
- metrics=metrics,
- provider=provider,
- **kwargs
+ # Run evaluation
+ result = await self.llm_evaluator.evaluate(
+ model_interface=None, # Will use AI factory
+ dataset=dataset,
+ dataset_name=dataset_name,
+ model_name=f"{provider}:{model_name}",
+ batch_size=batch_size,
+ progress_callback=progress_callback
  )
 
- # Save results
- with open(output_path, 'w') as f:
- json.dump(results, f, indent=2)
+ # Save results if requested
+ if save_results:
+ await self._save_results(result, experiment_name)
 
- return results
+ return result
 
- # =================
- # Benchmark Testing Methods
- # =================
-
- def run_benchmark(
- self,
- model_path: str,
- benchmark: str,
- output_path: Optional[str] = None,
- num_shots: int = 0,
- max_samples: Optional[int] = None,
- provider: str = "ollama",
- experiment_name: Optional[str] = None,
- **kwargs
- ) -> Dict[str, Any]:
+ async def run_benchmark(self,
+ model_name: str,
+ provider: str,
+ benchmark_name: str,
+ subjects: Optional[List[str]] = None,
+ max_samples: Optional[int] = None,
+ few_shot: bool = True,
+ num_shots: int = 5,
+ save_results: bool = True,
+ experiment_name: Optional[str] = None) -> EvaluationResult:
  """
- Run a specific benchmark on a model with experiment tracking.
+ Run standardized benchmark evaluation.
 
  Args:
- model_path: Path to the model
- benchmark: Benchmark name ("mmlu", "hellaswag", "arc", "gsm8k")
- output_path: Path to save results
- num_shots: Number of few-shot examples
- max_samples: Maximum samples to evaluate
+ model_name: Name of the model to evaluate
  provider: Model provider
- experiment_name: Name for experiment tracking
- **kwargs: Additional parameters
+ benchmark_name: Name of benchmark (mmlu, hellaswag, arc, gsm8k, etc.)
+ subjects: List of subjects to evaluate (for MMLU)
+ max_samples: Maximum number of samples to evaluate
+ few_shot: Whether to use few-shot examples
+ num_shots: Number of few-shot examples
+ save_results: Whether to save results
+ experiment_name: Custom experiment name
 
  Returns:
- Benchmark results dictionary
+ Benchmark evaluation results
  """
- if not output_path:
- output_path = self._get_output_path(model_path, f"{benchmark}_benchmark")
-
- # Setup experiment tracking
- config = {
- "model_path": model_path,
- "benchmark": benchmark,
- "num_shots": num_shots,
- "max_samples": max_samples,
- "provider": provider
- }
-
- experiment_name = experiment_name or f"{benchmark}_{os.path.basename(model_path)}"
- self._start_experiment(experiment_name, config)
-
- logger.info(f"Running {benchmark.upper()} benchmark on {model_path}")
-
- try:
- # Initialize benchmark
- benchmark_map = {
- "mmlu": MMLU(),
- "hellaswag": HellaSwag(),
- "arc": ARC(),
- "gsm8k": GSM8K()
- }
-
- if benchmark.lower() not in benchmark_map:
- raise ValueError(f"Benchmark '{benchmark}' not supported. Available: {list(benchmark_map.keys())}")
-
- benchmark_instance = benchmark_map[benchmark.lower()]
-
- # Run benchmark
- results = self.benchmark_runner.run_benchmark(
- model_path=model_path,
- benchmark=benchmark_instance,
- num_shots=num_shots,
- max_samples=max_samples,
- provider=provider,
- **kwargs
- )
-
- # Log metrics to tracking systems
- self._log_metrics(results.get("metrics", {}))
-
- # Add metadata
- results["metadata"] = {
- "model_path": model_path,
- "benchmark": benchmark,
- "num_shots": num_shots,
- "max_samples": max_samples,
- "timestamp": datetime.datetime.now().isoformat(),
- "provider": provider,
- "experiment_name": experiment_name
- }
-
- # Save results
- with open(output_path, 'w') as f:
- json.dump(results, f, indent=2)
-
- logger.info(f"Benchmark results saved to: {output_path}")
-
- finally:
- self._end_experiment()
+ # Load benchmark dataset
+ benchmark_dataset = await self._load_benchmark(
+ benchmark_name,
+ subjects=subjects,
+ max_samples=max_samples,
+ few_shot=few_shot,
+ num_shots=num_shots
+ )
 
- return results
-
- def run_multiple_benchmarks(
- self,
- model_path: str,
- benchmarks: List[str] = None,
- output_dir: Optional[str] = None,
- **kwargs
- ) -> Dict[str, Any]:
- """
- Run multiple benchmarks on a model.
+ # Configure for benchmark evaluation
+ benchmark_config = {
+ "provider": provider,
+ "model_name": model_name,
+ "temperature": 0.0, # Deterministic for benchmarks
+ "max_tokens": 50, # Short answers for most benchmarks
+ "task_type": "benchmark",
+ "benchmark_name": benchmark_name
+ }
 
- Args:
- model_path: Path to the model
- benchmarks: List of benchmark names
- output_dir: Directory to save results
- **kwargs: Additional parameters
-
- Returns:
- Combined results dictionary
- """
- if benchmarks is None:
- benchmarks = ["mmlu", "hellaswag", "arc"]
+ self.llm_evaluator.config.update(benchmark_config)
 
- if not output_dir:
- output_dir = os.path.join(self.output_dir, "multi_benchmark")
- os.makedirs(output_dir, exist_ok=True)
+ # Generate experiment name
+ experiment_name = experiment_name or f"benchmark_{benchmark_name}_{model_name}"
 
- all_results = {}
+ # Run evaluation
+ result = await self.llm_evaluator.evaluate(
+ model_interface=None,
+ dataset=benchmark_dataset,
+ dataset_name=benchmark_name,
+ model_name=f"{provider}:{model_name}",
+ batch_size=self.config.batch_size
+ )
 
- for benchmark in benchmarks:
- try:
- output_path = os.path.join(output_dir, f"{benchmark}_results.json")
- results = self.run_benchmark(
- model_path=model_path,
- benchmark=benchmark,
- output_path=output_path,
- **kwargs
- )
- all_results[benchmark] = results
- except Exception as e:
- logger.error(f"Failed to run benchmark {benchmark}: {e}")
- all_results[benchmark] = {"error": str(e)}
+ # Add benchmark-specific metadata
+ result.config.update({
+ "benchmark_name": benchmark_name,
+ "subjects": subjects,
+ "few_shot": few_shot,
+ "num_shots": num_shots
+ })
 
- # Save combined results
- combined_path = os.path.join(output_dir, "combined_results.json")
- with open(combined_path, 'w') as f:
- json.dump(all_results, f, indent=2)
+ # Save results if requested
+ if save_results:
+ await self._save_results(result, experiment_name)
 
- return all_results
+ return result
 
- # =================
- # Model Comparison Methods
- # =================
-
- def compare_models(
- self,
- model_paths: List[str],
- dataset_path: Optional[str] = None,
- benchmark: Optional[str] = None,
- metrics: List[str] = None,
- output_path: Optional[str] = None,
- experiment_name: Optional[str] = None,
- **kwargs
- ) -> Dict[str, Any]:
+ async def compare_models(self,
+ models: List[Dict[str, str]],
+ dataset_path: Optional[str] = None,
+ dataset: Optional[List[Dict[str, Any]]] = None,
+ benchmark_name: Optional[str] = None,
+ metrics: Optional[List[str]] = None,
+ save_results: bool = True,
+ experiment_name: Optional[str] = None) -> Dict[str, EvaluationResult]:
  """
  Compare multiple models on the same evaluation task.
 
  Args:
- model_paths: List of model paths to compare
- dataset_path: Dataset for evaluation (if not using benchmark)
- benchmark: Benchmark name (if not using custom dataset)
+ models: List of model configs [{"name": "gpt-4", "provider": "openai"}, ...]
+ dataset_path: Path to evaluation dataset
+ dataset: Direct dataset input
+ benchmark_name: Benchmark name (alternative to dataset)
  metrics: Metrics to compute
- output_path: Path to save comparison results
- experiment_name: Name for experiment tracking
- **kwargs: Additional parameters
+ save_results: Whether to save comparison results
+ experiment_name: Custom experiment name
 
  Returns:
- Comparison results dictionary
+ Dictionary mapping model names to evaluation results
  """
- if not dataset_path and not benchmark:
- raise ValueError("Either dataset_path or benchmark must be provided")
-
- if not output_path:
- timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
- output_path = os.path.join(self.output_dir, f"model_comparison_{timestamp}.json")
-
- # Setup experiment tracking
- config = {
- "model_paths": model_paths,
- "dataset_path": dataset_path,
- "benchmark": benchmark,
- "metrics": metrics
- }
+ results = {}
 
- experiment_name = experiment_name or f"model_comparison_{len(model_paths)}_models"
- self._start_experiment(experiment_name, config)
+ # Run evaluations concurrently (with concurrency limits)
+ semaphore = asyncio.Semaphore(self.config.max_concurrent_evaluations)
 
- logger.info(f"Comparing {len(model_paths)} models")
-
- try:
- results = {"models": {}, "comparison": {}}
-
- # Evaluate each model
- for i, model_path in enumerate(model_paths):
- logger.info(f"Evaluating model {i+1}/{len(model_paths)}: {model_path}")
+ async def evaluate_single_model(model_config: Dict[str, str]) -> tuple:
+ async with semaphore:
+ model_name = model_config["name"]
+ provider = model_config["provider"]
 
- if benchmark:
- model_results = self.run_benchmark(
- model_path=model_path,
- benchmark=benchmark,
- experiment_name=None, # Don't start new experiment
- **kwargs
+ if benchmark_name:
+ result = await self.run_benchmark(
+ model_name=model_name,
+ provider=provider,
+ benchmark_name=benchmark_name,
+ save_results=False # Save comparison results together
  )
  else:
- model_results = self.evaluate_llm(
- model_path=model_path,
+ result = await self.evaluate_llm(
+ model_name=model_name,
+ provider=provider,
  dataset_path=dataset_path,
+ dataset=dataset,
  metrics=metrics,
- experiment_name=None, # Don't start new experiment
- **kwargs
+ save_results=False
  )
 
- results["models"][model_path] = model_results
-
- # Log individual model metrics
- model_metrics = model_results.get("metrics", {})
- for metric_name, value in model_metrics.items():
- self._log_metrics({f"{os.path.basename(model_path)}_{metric_name}": value})
-
- # Generate comparison summary
- results["comparison"] = self._generate_comparison_summary(results["models"])
-
- # Add metadata
- results["metadata"] = {
- "model_paths": model_paths,
- "dataset_path": dataset_path,
- "benchmark": benchmark,
- "metrics": metrics,
- "timestamp": datetime.datetime.now().isoformat(),
- "experiment_name": experiment_name
- }
-
- # Save results
- with open(output_path, 'w') as f:
- json.dump(results, f, indent=2)
-
- logger.info(f"Comparison results saved to: {output_path}")
-
- finally:
- self._end_experiment()
+ return f"{provider}:{model_name}", result
+
+ # Execute all evaluations
+ tasks = [evaluate_single_model(model) for model in models]
+ evaluation_results = await asyncio.gather(*tasks)
+
+ # Collect results
+ for model_id, result in evaluation_results:
+ results[model_id] = result
+
+ # Generate comparison report
+ comparison_report = self._generate_comparison_report(results)
+
+ # Save results if requested
+ if save_results:
+ experiment_name = experiment_name or f"model_comparison_{len(models)}_models"
+ await self._save_comparison_results(results, comparison_report, experiment_name)
 
  return results
 
- def _generate_comparison_summary(self, results: Dict[str, Any]) -> Dict[str, Any]:
- """Generate comparison summary from multiple model results."""
- summary = {
- "best_model": {},
+ def _load_dataset(self, dataset_path: str) -> List[Dict[str, Any]]:
+ """Load dataset from file."""
+ with open(dataset_path, 'r', encoding='utf-8') as f:
+ if dataset_path.endswith('.json'):
+ dataset = json.load(f)
+ elif dataset_path.endswith('.jsonl'):
+ dataset = [json.loads(line) for line in f]
+ else:
+ raise ValueError(f"Unsupported dataset format: {dataset_path}")
+
+ logger.info(f"Loaded dataset with {len(dataset)} samples from {dataset_path}")
+ return dataset
+
+ async def _load_benchmark(self,
+ benchmark_name: str,
+ subjects: Optional[List[str]] = None,
+ max_samples: Optional[int] = None,
+ few_shot: bool = True,
+ num_shots: int = 5) -> List[Dict[str, Any]]:
+ """Load benchmark dataset."""
+ # This would integrate with the benchmark loaders
+ # For now, return a placeholder
+ logger.warning(f"Benchmark {benchmark_name} loading not yet implemented")
+
+ # Placeholder benchmark data
+ return [
+ {
+ "id": f"sample_{i}",
+ "prompt": f"Sample question {i} for {benchmark_name}",
+ "reference": "A",
+ "choices": ["A", "B", "C", "D"] if benchmark_name != "gsm8k" else None
+ }
+ for i in range(min(max_samples or 10, 10))
+ ]
+
+ async def _save_results(self, result: EvaluationResult, experiment_name: str) -> None:
+ """Save evaluation results to disk."""
+ # Create output directory
+ output_dir = Path(self.config.output_dir) / experiment_name
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ # Save main results
+ results_path = output_dir / "results.json"
+ result.save_to_file(results_path)
+
+ # Save detailed predictions if available
+ if result.sample_results:
+ predictions_path = output_dir / "predictions.json"
+ with open(predictions_path, 'w', encoding='utf-8') as f:
+ json.dump(result.sample_results, f, indent=2, ensure_ascii=False)
+
+ # Save summary
+ summary_path = output_dir / "summary.json"
+ with open(summary_path, 'w', encoding='utf-8') as f:
+ json.dump(result.get_summary(), f, indent=2, ensure_ascii=False)
+
+ logger.info(f"Saved evaluation results to {output_dir}")
+
+ async def _save_comparison_results(self,
+ results: Dict[str, EvaluationResult],
+ comparison_report: Dict[str, Any],
+ experiment_name: str) -> None:
+ """Save model comparison results."""
+ output_dir = Path(self.config.output_dir) / experiment_name
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ # Save individual results
+ for model_id, result in results.items():
+ model_dir = output_dir / model_id.replace(":", "_")
+ model_dir.mkdir(exist_ok=True)
+ result.save_to_file(model_dir / "results.json")
+
+ # Save comparison report
+ comparison_path = output_dir / "comparison_report.json"
+ with open(comparison_path, 'w', encoding='utf-8') as f:
+ json.dump(comparison_report, f, indent=2, ensure_ascii=False)
+
+ logger.info(f"Saved comparison results to {output_dir}")
+
+ def _generate_comparison_report(self, results: Dict[str, EvaluationResult]) -> Dict[str, Any]:
+ """Generate comparison report from multiple model results."""
+ report = {
+ "models_compared": list(results.keys()),
+ "comparison_timestamp": results[list(results.keys())[0]].timestamp,
+ "metric_comparison": {},
  "rankings": {},
- "metric_comparisons": {}
+ "best_model_per_metric": {}
  }
 
- # Extract all metrics across models
+ # Extract all metrics
  all_metrics = set()
- for model_results in results.values():
- if "metrics" in model_results:
- all_metrics.update(model_results["metrics"].keys())
+ for result in results.values():
+ all_metrics.update(result.metrics.keys())
 
  # Compare each metric
  for metric in all_metrics:
  metric_values = {}
- for model_path, model_results in results.items():
- if "metrics" in model_results and metric in model_results["metrics"]:
- metric_values[model_path] = model_results["metrics"][metric]
+ for model_id, result in results.items():
+ if metric in result.metrics:
+ metric_values[model_id] = result.metrics[metric]
 
  if metric_values:
- # Determine if higher is better (most metrics, higher is better)
+ # Determine if higher is better
  higher_is_better = metric not in ["perplexity", "loss", "error_rate"]
 
+ # Find best model
  best_model = max(metric_values.items(), key=lambda x: x[1]) if higher_is_better else min(metric_values.items(), key=lambda x: x[1])
- summary["best_model"][metric] = {
- "model": best_model[0],
- "value": best_model[1]
- }
 
  # Create ranking
  sorted_models = sorted(metric_values.items(), key=lambda x: x[1], reverse=higher_is_better)
- summary["rankings"][metric] = [{"model": model, "value": value} for model, value in sorted_models]
 
- summary["metric_comparisons"][metric] = metric_values
-
- return summary
-
- # =================
- # Image Model Evaluation Methods
- # =================
-
- def evaluate_image_model(
- self,
- model_path: str,
- test_images_dir: str,
- reference_images_dir: Optional[str] = None,
- metrics: List[str] = None,
- output_path: Optional[str] = None,
- **kwargs
- ) -> Dict[str, Any]:
- """
- Evaluate image generation model.
-
- Args:
- model_path: Path to the image model
- test_images_dir: Directory with test images
- reference_images_dir: Directory with reference images
- metrics: Metrics to compute ["fid", "is", "lpips"]
- output_path: Output path for results
- **kwargs: Additional parameters
-
- Returns:
- Image evaluation results
- """
- if metrics is None:
- metrics = ["fid", "is"]
-
- if not output_path:
- output_path = self._get_output_path(model_path, "image_eval")
-
- results = self.image_metrics.evaluate(
- model_path=model_path,
- test_images_dir=test_images_dir,
- reference_images_dir=reference_images_dir,
- metrics=metrics,
- **kwargs
- )
-
- # Save results
- with open(output_path, 'w') as f:
- json.dump(results, f, indent=2)
+ report["metric_comparison"][metric] = metric_values
+ report["rankings"][metric] = [{"model": model, "value": value} for model, value in sorted_models]
+ report["best_model_per_metric"][metric] = {"model": best_model[0], "value": best_model[1]}
 
- return results
+ return report
 
- # =================
- # Utility Methods
- # =================
+ def get_configuration(self) -> Dict[str, Any]:
+ """Get current factory configuration."""
+ return self.config.to_dict()
 
- def load_results(self, results_path: str) -> Dict[str, Any]:
- """Load evaluation results from file."""
- with open(results_path, 'r') as f:
- return json.load(f)
+ def get_active_evaluations(self) -> List[str]:
+ """Get list of currently running evaluations."""
+ return list(self._active_evaluations.keys())
 
- def list_evaluation_results(self) -> List[Dict[str, Any]]:
- """List all evaluation results in the output directory."""
- results = []
-
- if os.path.exists(self.output_dir):
- for filename in os.listdir(self.output_dir):
- if filename.endswith('.json'):
- filepath = os.path.join(self.output_dir, filename)
- try:
- with open(filepath, 'r') as f:
- data = json.load(f)
- results.append({
- "filename": filename,
- "path": filepath,
- "metadata": data.get("metadata", {}),
- "created": datetime.datetime.fromtimestamp(
- os.path.getctime(filepath)
- ).isoformat()
- })
- except Exception as e:
- logger.warning(f"Failed to load {filename}: {e}")
-
- return sorted(results, key=lambda x: x["created"], reverse=True)
+ async def stop_evaluation(self, evaluation_id: str) -> bool:
+ """Stop a running evaluation."""
+ if evaluation_id in self._active_evaluations:
+ task = self._active_evaluations[evaluation_id]
+ task.cancel()
+ del self._active_evaluations[evaluation_id]
+ logger.info(f"Stopped evaluation: {evaluation_id}")
+ return True
+ return False
 
- def generate_report(
- self,
- results_paths: List[str],
- output_path: Optional[str] = None,
- format: str = "json"
- ) -> str:
- """
- Generate evaluation report from multiple results.
-
- Args:
- results_paths: List of result file paths
- output_path: Output path for report
- format: Report format ("json", "html", "markdown")
-
- Returns:
- Path to generated report
- """
- if not output_path:
- timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
- output_path = os.path.join(self.output_dir, f"evaluation_report_{timestamp}.{format}")
-
- # Load all results
- all_results = []
- for path in results_paths:
- try:
- results = self.load_results(path)
- all_results.append(results)
- except Exception as e:
- logger.warning(f"Failed to load results from {path}: {e}")
-
- # Generate report based on format
- if format == "json":
- report_data = {
- "report_generated": datetime.datetime.now().isoformat(),
- "num_evaluations": len(all_results),
- "results": all_results
- }
-
- with open(output_path, 'w') as f:
- json.dump(report_data, f, indent=2)
+ async def cleanup(self) -> None:
+ """Cleanup resources and stop all running evaluations."""
+ # Cancel all active evaluations
+ for evaluation_id in list(self._active_evaluations.keys()):
+ await self.stop_evaluation(evaluation_id)
 
- # TODO: Implement HTML and Markdown report generation
+ # Close experiment tracker
+ if self.experiment_tracker and self.experiment_tracker.is_running:
+ await self.experiment_tracker.end_run()
 
- logger.info(f"Evaluation report generated: {output_path}")
- return output_path
+ logger.info("EvaluationFactory cleanup completed")
 
- def evaluate_multimodal_model(
- self,
- model_path: str,
- text_dataset_path: Optional[str] = None,
- image_dataset_path: Optional[str] = None,
- audio_dataset_path: Optional[str] = None,
- metrics: List[str] = None,
- experiment_name: Optional[str] = None,
- **kwargs
- ) -> Dict[str, Any]:
- """
- Evaluate multimodal models across different modalities.
-
- Args:
- model_path: Path to the multimodal model
- text_dataset_path: Path to text evaluation dataset
- image_dataset_path: Path to image evaluation dataset
- audio_dataset_path: Path to audio evaluation dataset
- metrics: Metrics to compute for each modality
- experiment_name: Name for experiment tracking
- **kwargs: Additional parameters
-
- Returns:
- Multimodal evaluation results
- """
- config = {
- "model_path": model_path,
- "text_dataset_path": text_dataset_path,
- "image_dataset_path": image_dataset_path,
- "audio_dataset_path": audio_dataset_path,
- "metrics": metrics
- }
-
- experiment_name = experiment_name or f"multimodal_eval_{os.path.basename(model_path)}"
- self._start_experiment(experiment_name, config)
-
- logger.info(f"Evaluating multimodal model: {model_path}")
-
- try:
- results = {"modalities": {}}
-
- # Text evaluation
- if text_dataset_path:
- logger.info("Evaluating text modality...")
- text_results = self.evaluate_llm(
- model_path=model_path,
- dataset_path=text_dataset_path,
- metrics=metrics or ["perplexity", "bleu", "rouge"],
- experiment_name=None,
- **kwargs
- )
- results["modalities"]["text"] = text_results
- self._log_metrics({f"text_{k}": v for k, v in text_results.get("metrics", {}).items()})
-
- # Image evaluation
- if image_dataset_path:
- logger.info("Evaluating image modality...")
- image_results = self.evaluate_image_model(
- model_path=model_path,
- test_images_dir=image_dataset_path,
- metrics=metrics or ["fid", "is", "lpips"],
- experiment_name=None,
- **kwargs
- )
- results["modalities"]["image"] = image_results
- self._log_metrics({f"image_{k}": v for k, v in image_results.get("metrics", {}).items()})
-
- # Audio evaluation (placeholder for future implementation)
- if audio_dataset_path:
- logger.info("Audio evaluation not yet implemented")
- results["modalities"]["audio"] = {"status": "not_implemented"}
-
- # Add metadata
- results["metadata"] = {
- "model_path": model_path,
- "modalities_evaluated": list(results["modalities"].keys()),
- "timestamp": datetime.datetime.now().isoformat(),
- "experiment_name": experiment_name
- }
-
- # Save results
- output_path = self._get_output_path(model_path, "multimodal_eval")
- with open(output_path, 'w') as f:
- json.dump(results, f, indent=2)
-
- logger.info(f"Multimodal evaluation results saved to: {output_path}")
-
- finally:
- self._end_experiment()
+
+ # Convenience functions for quick evaluation
+ async def evaluate_llm_quick(model_name: str,
+ provider: str,
+ dataset_path: str,
+ metrics: Optional[List[str]] = None) -> EvaluationResult:
+ """
+ Quick LLM evaluation function.
+
+ Args:
+ model_name: Name of the model
+ provider: Model provider
+ dataset_path: Path to dataset
+ metrics: Metrics to compute
+
+ Returns:
+ Evaluation results
+ """
+ factory = EvaluationFactory()
+ try:
+ return await factory.evaluate_llm(
+ model_name=model_name,
+ provider=provider,
+ dataset_path=dataset_path,
+ metrics=metrics
+ )
+ finally:
+ await factory.cleanup()
+
+
+ async def run_benchmark_quick(model_name: str,
+ provider: str,
+ benchmark_name: str) -> EvaluationResult:
+ """
+ Quick benchmark evaluation function.
+
+ Args:
+ model_name: Name of the model
+ provider: Model provider
+ benchmark_name: Benchmark name
 
- return results
+ Returns:
+ Benchmark results
+ """
+ factory = EvaluationFactory()
+ try:
+ return await factory.run_benchmark(
+ model_name=model_name,
+ provider=provider,
+ benchmark_name=benchmark_name
+ )
+ finally:
+ await factory.cleanup()
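For orientation, here is a minimal usage sketch of the new async API introduced by this diff, assembled from the class docstring and the `evaluate_llm_quick` helper shown above. The dataset path, metric names, and tracking settings are illustrative placeholders, and the import from `isa_model.eval.factory` assumes the helpers are not re-exported elsewhere; treat it as a sketch rather than documented usage.

```python
import asyncio

from isa_model.eval.factory import EvaluationFactory, evaluate_llm_quick


async def main():
    # Factory with optional W&B experiment tracking, mirroring the class docstring.
    factory = EvaluationFactory(
        experiment_tracking={"type": "wandb", "project": "model-evaluation"}
    )
    try:
        result = await factory.evaluate_llm(
            model_name="gpt-4.1-mini",
            provider="openai",
            dataset_path="evaluation_data.json",  # illustrative dataset file
            metrics=["accuracy", "f1_score"],
            save_results=True,
        )
        print(result.get_summary())
    finally:
        # cleanup() cancels active evaluations and closes the experiment tracker.
        await factory.cleanup()

    # One-shot helper for ad-hoc runs; it builds and cleans up its own factory.
    quick = await evaluate_llm_quick(
        model_name="gpt-4.1-mini",
        provider="openai",
        dataset_path="evaluation_data.json",
    )
    print(quick.metrics)


if __name__ == "__main__":
    asyncio.run(main())
```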