isa-model 0.2.0__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff compares publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
- isa_model/__init__.py +1 -1
- isa_model/core/storage/hf_storage.py +419 -0
- isa_model/deployment/__init__.py +52 -0
- isa_model/deployment/core/__init__.py +34 -0
- isa_model/deployment/core/deployment_config.py +356 -0
- isa_model/deployment/core/deployment_manager.py +549 -0
- isa_model/deployment/core/isa_deployment_service.py +401 -0
- isa_model/eval/factory.py +381 -140
- isa_model/inference/ai_factory.py +142 -240
- isa_model/inference/providers/ml_provider.py +50 -0
- isa_model/inference/services/audio/openai_tts_service.py +104 -3
- isa_model/inference/services/embedding/base_embed_service.py +112 -0
- isa_model/inference/services/embedding/ollama_embed_service.py +28 -2
- isa_model/inference/services/llm/__init__.py +2 -0
- isa_model/inference/services/llm/base_llm_service.py +111 -1
- isa_model/inference/services/llm/ollama_llm_service.py +234 -26
- isa_model/inference/services/llm/openai_llm_service.py +243 -28
- isa_model/inference/services/llm/triton_llm_service.py +481 -0
- isa_model/inference/services/ml/base_ml_service.py +78 -0
- isa_model/inference/services/ml/sklearn_ml_service.py +140 -0
- isa_model/inference/services/vision/__init__.py +3 -3
- isa_model/inference/services/vision/base_image_gen_service.py +161 -0
- isa_model/inference/services/vision/base_vision_service.py +177 -0
- isa_model/inference/services/vision/ollama_vision_service.py +143 -17
- isa_model/inference/services/vision/replicate_image_gen_service.py +139 -7
- isa_model/training/__init__.py +62 -32
- isa_model/training/cloud/__init__.py +22 -0
- isa_model/training/cloud/job_orchestrator.py +402 -0
- isa_model/training/cloud/runpod_trainer.py +454 -0
- isa_model/training/cloud/storage_manager.py +482 -0
- isa_model/training/core/__init__.py +23 -0
- isa_model/training/core/config.py +181 -0
- isa_model/training/core/dataset.py +222 -0
- isa_model/training/core/trainer.py +720 -0
- isa_model/training/core/utils.py +213 -0
- isa_model/training/factory.py +229 -198
- isa_model-0.2.9.dist-info/METADATA +465 -0
- isa_model-0.2.9.dist-info/RECORD +86 -0
- isa_model/core/model_router.py +0 -226
- isa_model/core/model_version.py +0 -0
- isa_model/core/resource_manager.py +0 -202
- isa_model/deployment/gpu_fp16_ds8/models/deepseek_r1/1/model.py +0 -120
- isa_model/deployment/gpu_fp16_ds8/scripts/download_model.py +0 -18
- isa_model/training/engine/llama_factory/__init__.py +0 -39
- isa_model/training/engine/llama_factory/config.py +0 -115
- isa_model/training/engine/llama_factory/data_adapter.py +0 -284
- isa_model/training/engine/llama_factory/examples/__init__.py +0 -6
- isa_model/training/engine/llama_factory/examples/finetune_with_tracking.py +0 -185
- isa_model/training/engine/llama_factory/examples/rlhf_with_tracking.py +0 -163
- isa_model/training/engine/llama_factory/factory.py +0 -331
- isa_model/training/engine/llama_factory/rl.py +0 -254
- isa_model/training/engine/llama_factory/trainer.py +0 -171
- isa_model/training/image_model/configs/create_config.py +0 -37
- isa_model/training/image_model/configs/create_flux_config.py +0 -26
- isa_model/training/image_model/configs/create_lora_config.py +0 -21
- isa_model/training/image_model/prepare_massed_compute.py +0 -97
- isa_model/training/image_model/prepare_upload.py +0 -17
- isa_model/training/image_model/raw_data/create_captions.py +0 -16
- isa_model/training/image_model/raw_data/create_lora_captions.py +0 -20
- isa_model/training/image_model/raw_data/pre_processing.py +0 -200
- isa_model/training/image_model/train/train.py +0 -42
- isa_model/training/image_model/train/train_flux.py +0 -41
- isa_model/training/image_model/train/train_lora.py +0 -57
- isa_model/training/image_model/train_main.py +0 -25
- isa_model-0.2.0.dist-info/METADATA +0 -327
- isa_model-0.2.0.dist-info/RECORD +0 -92
- isa_model-0.2.0.dist-info/licenses/LICENSE +0 -21
- /isa_model/training/{llm_model/annotation → annotation}/annotation_schema.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/processors/annotation_processor.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/storage/dataset_manager.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/storage/dataset_schema.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/tests/test_annotation_flow.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/tests/test_minio copy.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/tests/test_minio_upload.py +0 -0
- /isa_model/training/{llm_model/annotation → annotation}/views/annotation_controller.py +0 -0
- {isa_model-0.2.0.dist-info → isa_model-0.2.9.dist-info}/WHEEL +0 -0
- {isa_model-0.2.0.dist-info → isa_model-0.2.9.dist-info}/top_level.txt +0 -0
isa_model/eval/factory.py
CHANGED
@@ -6,6 +6,7 @@ This factory provides a single interface for all evaluation operations:
 - Image model evaluation (FID, IS, LPIPS)
 - Benchmark testing (MMLU, HellaSwag, ARC, etc.)
 - Custom evaluation pipelines
+- Weights & Biases integration for experiment tracking
 """

 import os
@@ -15,6 +16,18 @@ from typing import Optional, Dict, Any, List, Union
 from pathlib import Path
 import datetime

+try:
+    import wandb
+    WANDB_AVAILABLE = True
+except ImportError:
+    WANDB_AVAILABLE = False
+
+try:
+    import mlflow
+    MLFLOW_AVAILABLE = True
+except ImportError:
+    MLFLOW_AVAILABLE = False
+
 from .metrics import LLMMetrics, ImageMetrics, BenchmarkRunner
 from .benchmarks import MMLU, HellaSwag, ARC, GSM8K

@@ -23,26 +36,31 @@ logger = logging.getLogger(__name__)

 class EvaluationFactory:
     """
-    Unified factory for all AI model evaluation operations.
+    Unified factory for all AI model evaluation operations with experiment tracking.

     This class provides simplified interfaces for:
     - LLM evaluation with various metrics
     - Image model evaluation
     - Benchmark testing on standard datasets
     - Custom evaluation pipelines
+    - Experiment tracking with W&B and MLflow

     Example usage:
     ```python
     from isa_model.eval import EvaluationFactory

-    evaluator = EvaluationFactory(
+    evaluator = EvaluationFactory(
+        output_dir="eval_results",
+        use_wandb=True,
+        wandb_project="model-evaluation"
+    )

     # Evaluate LLM on custom dataset
     results = evaluator.evaluate_llm(
         model_path="path/to/model",
         dataset_path="test_data.json",
         metrics=["perplexity", "bleu", "rouge"],
-
+        experiment_name="gemma-4b-evaluation"
     )

     # Run MMLU benchmark
@@ -60,12 +78,25 @@ class EvaluationFactory:
     ```
     """

-    def __init__(
+    def __init__(
+        self,
+        output_dir: Optional[str] = None,
+        use_wandb: bool = False,
+        wandb_project: Optional[str] = None,
+        wandb_entity: Optional[str] = None,
+        use_mlflow: bool = False,
+        mlflow_tracking_uri: Optional[str] = None
+    ):
         """
-        Initialize the evaluation factory.
+        Initialize the evaluation factory with experiment tracking.

         Args:
             output_dir: Base directory for evaluation outputs
+            use_wandb: Whether to use Weights & Biases for tracking
+            wandb_project: W&B project name
+            wandb_entity: W&B entity/team name
+            use_mlflow: Whether to use MLflow for tracking
+            mlflow_tracking_uri: MLflow tracking server URI
         """
         self.output_dir = output_dir or os.path.join(os.getcwd(), "evaluation_results")
         os.makedirs(self.output_dir, exist_ok=True)
@@ -75,8 +106,55 @@ class EvaluationFactory:
         self.image_metrics = ImageMetrics()
         self.benchmark_runner = BenchmarkRunner()

+        # Setup experiment tracking
+        self.use_wandb = use_wandb and WANDB_AVAILABLE
+        self.use_mlflow = use_mlflow and MLFLOW_AVAILABLE
+
+        if self.use_wandb:
+            self.wandb_project = wandb_project or "isa-model-evaluation"
+            self.wandb_entity = wandb_entity
+            logger.info(f"W&B tracking enabled for project: {self.wandb_project}")
+
+        if self.use_mlflow:
+            if mlflow_tracking_uri:
+                mlflow.set_tracking_uri(mlflow_tracking_uri)
+            logger.info(f"MLflow tracking enabled with URI: {mlflow.get_tracking_uri()}")
+
         logger.info(f"EvaluationFactory initialized with output dir: {self.output_dir}")

+    def _start_experiment(self, experiment_name: str, config: Dict[str, Any]) -> None:
+        """Start experiment tracking."""
+        if self.use_wandb:
+            wandb.init(
+                project=self.wandb_project,
+                entity=self.wandb_entity,
+                name=experiment_name,
+                config=config,
+                reinit=True
+            )
+
+        if self.use_mlflow:
+            mlflow.start_run(run_name=experiment_name)
+            mlflow.log_params(config)
+
+    def _log_metrics(self, metrics: Dict[str, Any], step: Optional[int] = None) -> None:
+        """Log metrics to experiment tracking systems."""
+        if self.use_wandb:
+            wandb.log(metrics, step=step)
+
+        if self.use_mlflow:
+            for key, value in metrics.items():
+                if isinstance(value, (int, float)):
+                    mlflow.log_metric(key, value, step=step)
+
+    def _end_experiment(self) -> None:
+        """End experiment tracking."""
+        if self.use_wandb:
+            wandb.finish()
+
+        if self.use_mlflow:
+            mlflow.end_run()
+
     def _get_output_path(self, model_name: str, eval_type: str) -> str:
         """Generate timestamped output path for evaluation results."""
         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -97,6 +175,7 @@ class EvaluationFactory:
         batch_size: int = 8,
         max_samples: Optional[int] = None,
         provider: str = "ollama",
+        experiment_name: Optional[str] = None,
         **kwargs
     ) -> Dict[str, Any]:
         """
@@ -110,6 +189,7 @@ class EvaluationFactory:
             batch_size: Batch size for evaluation
             max_samples: Maximum number of samples to evaluate
             provider: Model provider ("ollama", "openai", "hf")
+            experiment_name: Name for experiment tracking
             **kwargs: Additional parameters

         Returns:
@@ -121,7 +201,8 @@ class EvaluationFactory:
                 model_path="google/gemma-2-4b-it",
                 dataset_path="test_data.json",
                 metrics=["perplexity", "bleu", "rouge"],
-                max_samples=1000
+                max_samples=1000,
+                experiment_name="gemma-4b-eval"
             )
             ```
         """
@@ -131,40 +212,62 @@ class EvaluationFactory:
         if not output_path:
             output_path = self._get_output_path(model_path, "llm_eval")

-
-
-        # Load dataset
-        with open(dataset_path, 'r') as f:
-            dataset = json.load(f)
-
-        if max_samples:
-            dataset = dataset[:max_samples]
-
-        # Run evaluation
-        results = self.llm_metrics.evaluate(
-            model_path=model_path,
-            dataset=dataset,
-            metrics=metrics,
-            batch_size=batch_size,
-            provider=provider,
-            **kwargs
-        )
-
-        # Add metadata
-        results["metadata"] = {
+        # Setup experiment tracking
+        config = {
             "model_path": model_path,
             "dataset_path": dataset_path,
             "metrics": metrics,
-            "
-            "
+            "batch_size": batch_size,
+            "max_samples": max_samples,
             "provider": provider
         }

-
-
-
+        experiment_name = experiment_name or f"llm_eval_{os.path.basename(model_path)}"
+        self._start_experiment(experiment_name, config)
+
+        logger.info(f"Evaluating LLM {model_path} with metrics: {metrics}")
+
+        try:
+            # Load dataset
+            with open(dataset_path, 'r') as f:
+                dataset = json.load(f)
+
+            if max_samples:
+                dataset = dataset[:max_samples]
+
+            # Run evaluation
+            results = self.llm_metrics.evaluate(
+                model_path=model_path,
+                dataset=dataset,
+                metrics=metrics,
+                batch_size=batch_size,
+                provider=provider,
+                **kwargs
+            )
+
+            # Log metrics to tracking systems
+            self._log_metrics(results.get("metrics", {}))
+
+            # Add metadata
+            results["metadata"] = {
+                "model_path": model_path,
+                "dataset_path": dataset_path,
+                "metrics": metrics,
+                "num_samples": len(dataset),
+                "timestamp": datetime.datetime.now().isoformat(),
+                "provider": provider,
+                "experiment_name": experiment_name
+            }
+
+            # Save results
+            with open(output_path, 'w') as f:
+                json.dump(results, f, indent=2)
+
+            logger.info(f"Evaluation results saved to: {output_path}")
+
+        finally:
+            self._end_experiment()

-        logger.info(f"Evaluation results saved to: {output_path}")
         return results

     def evaluate_generation_quality(
@@ -225,76 +328,89 @@ class EvaluationFactory:
         num_shots: int = 0,
         max_samples: Optional[int] = None,
         provider: str = "ollama",
+        experiment_name: Optional[str] = None,
         **kwargs
     ) -> Dict[str, Any]:
         """
-        Run a
+        Run a specific benchmark on a model with experiment tracking.

         Args:
             model_path: Path to the model
             benchmark: Benchmark name ("mmlu", "hellaswag", "arc", "gsm8k")
-            output_path:
+            output_path: Path to save results
             num_shots: Number of few-shot examples
             max_samples: Maximum samples to evaluate
             provider: Model provider
+            experiment_name: Name for experiment tracking
             **kwargs: Additional parameters

         Returns:
             Benchmark results dictionary
-
-        Example:
-            ```python
-            mmlu_results = evaluator.run_benchmark(
-                model_path="google/gemma-2-4b-it",
-                benchmark="mmlu",
-                num_shots=5,
-                max_samples=1000
-            )
-            ```
         """
         if not output_path:
             output_path = self._get_output_path(model_path, f"{benchmark}_benchmark")

-
-
-        # Select benchmark
-        benchmark_map = {
-            "mmlu": MMLU(),
-            "hellaswag": HellaSwag(),
-            "arc": ARC(),
-            "gsm8k": GSM8K()
-        }
-
-        if benchmark.lower() not in benchmark_map:
-            raise ValueError(f"Unsupported benchmark: {benchmark}")
-
-        benchmark_instance = benchmark_map[benchmark.lower()]
-
-        # Run benchmark
-        results = self.benchmark_runner.run(
-            benchmark=benchmark_instance,
-            model_path=model_path,
-            num_shots=num_shots,
-            max_samples=max_samples,
-            provider=provider,
-            **kwargs
-        )
-
-        # Add metadata
-        results["metadata"] = {
+        # Setup experiment tracking
+        config = {
             "model_path": model_path,
             "benchmark": benchmark,
             "num_shots": num_shots,
             "max_samples": max_samples,
-            "timestamp": datetime.datetime.now().isoformat(),
             "provider": provider
         }

-
-
-
+        experiment_name = experiment_name or f"{benchmark}_{os.path.basename(model_path)}"
+        self._start_experiment(experiment_name, config)
+
+        logger.info(f"Running {benchmark.upper()} benchmark on {model_path}")
+
+        try:
+            # Initialize benchmark
+            benchmark_map = {
+                "mmlu": MMLU(),
+                "hellaswag": HellaSwag(),
+                "arc": ARC(),
+                "gsm8k": GSM8K()
+            }
+
+            if benchmark.lower() not in benchmark_map:
+                raise ValueError(f"Benchmark '{benchmark}' not supported. Available: {list(benchmark_map.keys())}")
+
+            benchmark_instance = benchmark_map[benchmark.lower()]
+
+            # Run benchmark
+            results = self.benchmark_runner.run_benchmark(
+                model_path=model_path,
+                benchmark=benchmark_instance,
+                num_shots=num_shots,
+                max_samples=max_samples,
+                provider=provider,
+                **kwargs
+            )
+
+            # Log metrics to tracking systems
+            self._log_metrics(results.get("metrics", {}))
+
+            # Add metadata
+            results["metadata"] = {
+                "model_path": model_path,
+                "benchmark": benchmark,
+                "num_shots": num_shots,
+                "max_samples": max_samples,
+                "timestamp": datetime.datetime.now().isoformat(),
+                "provider": provider,
+                "experiment_name": experiment_name
+            }
+
+            # Save results
+            with open(output_path, 'w') as f:
+                json.dump(results, f, indent=2)
+
+            logger.info(f"Benchmark results saved to: {output_path}")
+
+        finally:
+            self._end_experiment()

-        logger.info(f"Benchmark results saved to: {output_path}")
         return results

     def run_multiple_benchmarks(
@@ -357,101 +473,134 @@ class EvaluationFactory:
         benchmark: Optional[str] = None,
         metrics: List[str] = None,
         output_path: Optional[str] = None,
+        experiment_name: Optional[str] = None,
         **kwargs
     ) -> Dict[str, Any]:
         """
-        Compare multiple models on the same evaluation.
+        Compare multiple models on the same evaluation task.

         Args:
             model_paths: List of model paths to compare
-            dataset_path:
-            benchmark: Benchmark name
+            dataset_path: Dataset for evaluation (if not using benchmark)
+            benchmark: Benchmark name (if not using custom dataset)
             metrics: Metrics to compute
-            output_path:
+            output_path: Path to save comparison results
+            experiment_name: Name for experiment tracking
             **kwargs: Additional parameters

         Returns:
             Comparison results dictionary
         """
+        if not dataset_path and not benchmark:
+            raise ValueError("Either dataset_path or benchmark must be provided")
+
         if not output_path:
-
+            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+            output_path = os.path.join(self.output_dir, f"model_comparison_{timestamp}.json")

-
-
-            "
-            "
+        # Setup experiment tracking
+        config = {
+            "model_paths": model_paths,
+            "dataset_path": dataset_path,
+            "benchmark": benchmark,
+            "metrics": metrics
         }

-
-
-
-
+        experiment_name = experiment_name or f"model_comparison_{len(model_paths)}_models"
+        self._start_experiment(experiment_name, config)
+
+        logger.info(f"Comparing {len(model_paths)} models")
+
+        try:
+            results = {"models": {}, "comparison": {}}

-
-
-
-
+            # Evaluate each model
+            for i, model_path in enumerate(model_paths):
+                logger.info(f"Evaluating model {i+1}/{len(model_paths)}: {model_path}")
+
+                if benchmark:
+                    model_results = self.run_benchmark(
                         model_path=model_path,
-
-
+                        benchmark=benchmark,
+                        experiment_name=None,  # Don't start new experiment
                         **kwargs
                     )
-
-
-                results = self.run_benchmark(
+                else:
+                    model_results = self.evaluate_llm(
                         model_path=model_path,
-
+                        dataset_path=dataset_path,
+                        metrics=metrics,
+                        experiment_name=None,  # Don't start new experiment
                         **kwargs
                     )
-            else:
-                raise ValueError("Either dataset_path or benchmark must be provided")

-
+                results["models"][model_path] = model_results

-
-
-
-
-
-
-
-
-
-
-
-
+                # Log individual model metrics
+                model_metrics = model_results.get("metrics", {})
+                for metric_name, value in model_metrics.items():
+                    self._log_metrics({f"{os.path.basename(model_path)}_{metric_name}": value})
+
+            # Generate comparison summary
+            results["comparison"] = self._generate_comparison_summary(results["models"])
+
+            # Add metadata
+            results["metadata"] = {
+                "model_paths": model_paths,
+                "dataset_path": dataset_path,
+                "benchmark": benchmark,
+                "metrics": metrics,
+                "timestamp": datetime.datetime.now().isoformat(),
+                "experiment_name": experiment_name
+            }
+
+            # Save results
+            with open(output_path, 'w') as f:
+                json.dump(results, f, indent=2)
+
+            logger.info(f"Comparison results saved to: {output_path}")
+
+        finally:
+            self._end_experiment()

-        return
+        return results

     def _generate_comparison_summary(self, results: Dict[str, Any]) -> Dict[str, Any]:
-        """Generate summary
+        """Generate comparison summary from multiple model results."""
         summary = {
-            "
+            "best_model": {},
             "rankings": {},
-            "
+            "metric_comparisons": {}
         }

-        # Extract
-
-
-
+        # Extract all metrics across models
+        all_metrics = set()
+        for model_results in results.values():
+            if "metrics" in model_results:
+                all_metrics.update(model_results["metrics"].keys())
+
+        # Compare each metric
+        for metric in all_metrics:
+            metric_values = {}
+            for model_path, model_results in results.items():
+                if "metrics" in model_results and metric in model_results["metrics"]:
+                    metric_values[model_path] = model_results["metrics"][metric]
+
+            if metric_values:
+                # Determine if higher is better (most metrics, higher is better)
+                higher_is_better = metric not in ["perplexity", "loss", "error_rate"]

-
-
-
-
-
-
-
-
-
-
-
-                reverse=True
-            )
-            summary["rankings"] = {i+1: model for i, (model, score) in enumerate(ranked)}
-            summary["best_performing"]["model"] = ranked[0][0]
-            summary["best_performing"]["score"] = ranked[0][1]
+                best_model = max(metric_values.items(), key=lambda x: x[1]) if higher_is_better else min(metric_values.items(), key=lambda x: x[1])
+                summary["best_model"][metric] = {
+                    "model": best_model[0],
+                    "value": best_model[1]
+                }
+
+                # Create ranking
+                sorted_models = sorted(metric_values.items(), key=lambda x: x[1], reverse=higher_is_better)
+                summary["rankings"][metric] = [{"model": model, "value": value} for model, value in sorted_models]
+
+                summary["metric_comparisons"][metric] = metric_values

         return summary

@@ -579,4 +728,96 @@ class EvaluationFactory:
         # TODO: Implement HTML and Markdown report generation

         logger.info(f"Evaluation report generated: {output_path}")
-        return output_path
+        return output_path
+
+    def evaluate_multimodal_model(
+        self,
+        model_path: str,
+        text_dataset_path: Optional[str] = None,
+        image_dataset_path: Optional[str] = None,
+        audio_dataset_path: Optional[str] = None,
+        metrics: List[str] = None,
+        experiment_name: Optional[str] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Evaluate multimodal models across different modalities.
+
+        Args:
+            model_path: Path to the multimodal model
+            text_dataset_path: Path to text evaluation dataset
+            image_dataset_path: Path to image evaluation dataset
+            audio_dataset_path: Path to audio evaluation dataset
+            metrics: Metrics to compute for each modality
+            experiment_name: Name for experiment tracking
+            **kwargs: Additional parameters
+
+        Returns:
+            Multimodal evaluation results
+        """
+        config = {
+            "model_path": model_path,
+            "text_dataset_path": text_dataset_path,
+            "image_dataset_path": image_dataset_path,
+            "audio_dataset_path": audio_dataset_path,
+            "metrics": metrics
+        }
+
+        experiment_name = experiment_name or f"multimodal_eval_{os.path.basename(model_path)}"
+        self._start_experiment(experiment_name, config)
+
+        logger.info(f"Evaluating multimodal model: {model_path}")
+
+        try:
+            results = {"modalities": {}}
+
+            # Text evaluation
+            if text_dataset_path:
+                logger.info("Evaluating text modality...")
+                text_results = self.evaluate_llm(
+                    model_path=model_path,
+                    dataset_path=text_dataset_path,
+                    metrics=metrics or ["perplexity", "bleu", "rouge"],
+                    experiment_name=None,
+                    **kwargs
+                )
+                results["modalities"]["text"] = text_results
+                self._log_metrics({f"text_{k}": v for k, v in text_results.get("metrics", {}).items()})
+
+            # Image evaluation
+            if image_dataset_path:
+                logger.info("Evaluating image modality...")
+                image_results = self.evaluate_image_model(
+                    model_path=model_path,
+                    test_images_dir=image_dataset_path,
+                    metrics=metrics or ["fid", "is", "lpips"],
+                    experiment_name=None,
+                    **kwargs
+                )
+                results["modalities"]["image"] = image_results
+                self._log_metrics({f"image_{k}": v for k, v in image_results.get("metrics", {}).items()})
+
+            # Audio evaluation (placeholder for future implementation)
+            if audio_dataset_path:
+                logger.info("Audio evaluation not yet implemented")
+                results["modalities"]["audio"] = {"status": "not_implemented"}
+
+            # Add metadata
+            results["metadata"] = {
+                "model_path": model_path,
+                "modalities_evaluated": list(results["modalities"].keys()),
+                "timestamp": datetime.datetime.now().isoformat(),
+                "experiment_name": experiment_name
+            }
+
+            # Save results
+            output_path = self._get_output_path(model_path, "multimodal_eval")
+            with open(output_path, 'w') as f:
+                json.dump(results, f, indent=2)
+
+            logger.info(f"Multimodal evaluation results saved to: {output_path}")
+
+        finally:
+            self._end_experiment()
+
+        return results