isa-model 0.2.0__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. isa_model/__init__.py +1 -1
  2. isa_model/core/storage/hf_storage.py +419 -0
  3. isa_model/deployment/__init__.py +52 -0
  4. isa_model/deployment/core/__init__.py +34 -0
  5. isa_model/deployment/core/deployment_config.py +356 -0
  6. isa_model/deployment/core/deployment_manager.py +549 -0
  7. isa_model/deployment/core/isa_deployment_service.py +401 -0
  8. isa_model/eval/factory.py +381 -140
  9. isa_model/inference/ai_factory.py +142 -240
  10. isa_model/inference/providers/ml_provider.py +50 -0
  11. isa_model/inference/services/audio/openai_tts_service.py +104 -3
  12. isa_model/inference/services/embedding/base_embed_service.py +112 -0
  13. isa_model/inference/services/embedding/ollama_embed_service.py +28 -2
  14. isa_model/inference/services/llm/__init__.py +2 -0
  15. isa_model/inference/services/llm/base_llm_service.py +111 -1
  16. isa_model/inference/services/llm/ollama_llm_service.py +234 -26
  17. isa_model/inference/services/llm/openai_llm_service.py +225 -28
  18. isa_model/inference/services/llm/triton_llm_service.py +481 -0
  19. isa_model/inference/services/ml/base_ml_service.py +78 -0
  20. isa_model/inference/services/ml/sklearn_ml_service.py +140 -0
  21. isa_model/inference/services/vision/__init__.py +3 -3
  22. isa_model/inference/services/vision/base_image_gen_service.py +161 -0
  23. isa_model/inference/services/vision/base_vision_service.py +177 -0
  24. isa_model/inference/services/vision/ollama_vision_service.py +143 -17
  25. isa_model/inference/services/vision/replicate_image_gen_service.py +139 -7
  26. isa_model/training/__init__.py +62 -32
  27. isa_model/training/cloud/__init__.py +22 -0
  28. isa_model/training/cloud/job_orchestrator.py +402 -0
  29. isa_model/training/cloud/runpod_trainer.py +454 -0
  30. isa_model/training/cloud/storage_manager.py +482 -0
  31. isa_model/training/core/__init__.py +23 -0
  32. isa_model/training/core/config.py +181 -0
  33. isa_model/training/core/dataset.py +222 -0
  34. isa_model/training/core/trainer.py +720 -0
  35. isa_model/training/core/utils.py +213 -0
  36. isa_model/training/factory.py +229 -198
  37. isa_model-0.2.8.dist-info/METADATA +465 -0
  38. isa_model-0.2.8.dist-info/RECORD +86 -0
  39. isa_model/core/model_router.py +0 -226
  40. isa_model/core/model_version.py +0 -0
  41. isa_model/core/resource_manager.py +0 -202
  42. isa_model/deployment/gpu_fp16_ds8/models/deepseek_r1/1/model.py +0 -120
  43. isa_model/deployment/gpu_fp16_ds8/scripts/download_model.py +0 -18
  44. isa_model/training/engine/llama_factory/__init__.py +0 -39
  45. isa_model/training/engine/llama_factory/config.py +0 -115
  46. isa_model/training/engine/llama_factory/data_adapter.py +0 -284
  47. isa_model/training/engine/llama_factory/examples/__init__.py +0 -6
  48. isa_model/training/engine/llama_factory/examples/finetune_with_tracking.py +0 -185
  49. isa_model/training/engine/llama_factory/examples/rlhf_with_tracking.py +0 -163
  50. isa_model/training/engine/llama_factory/factory.py +0 -331
  51. isa_model/training/engine/llama_factory/rl.py +0 -254
  52. isa_model/training/engine/llama_factory/trainer.py +0 -171
  53. isa_model/training/image_model/configs/create_config.py +0 -37
  54. isa_model/training/image_model/configs/create_flux_config.py +0 -26
  55. isa_model/training/image_model/configs/create_lora_config.py +0 -21
  56. isa_model/training/image_model/prepare_massed_compute.py +0 -97
  57. isa_model/training/image_model/prepare_upload.py +0 -17
  58. isa_model/training/image_model/raw_data/create_captions.py +0 -16
  59. isa_model/training/image_model/raw_data/create_lora_captions.py +0 -20
  60. isa_model/training/image_model/raw_data/pre_processing.py +0 -200
  61. isa_model/training/image_model/train/train.py +0 -42
  62. isa_model/training/image_model/train/train_flux.py +0 -41
  63. isa_model/training/image_model/train/train_lora.py +0 -57
  64. isa_model/training/image_model/train_main.py +0 -25
  65. isa_model-0.2.0.dist-info/METADATA +0 -327
  66. isa_model-0.2.0.dist-info/RECORD +0 -92
  67. isa_model-0.2.0.dist-info/licenses/LICENSE +0 -21
  68. /isa_model/training/{llm_model/annotation → annotation}/annotation_schema.py +0 -0
  69. /isa_model/training/{llm_model/annotation → annotation}/processors/annotation_processor.py +0 -0
  70. /isa_model/training/{llm_model/annotation → annotation}/storage/dataset_manager.py +0 -0
  71. /isa_model/training/{llm_model/annotation → annotation}/storage/dataset_schema.py +0 -0
  72. /isa_model/training/{llm_model/annotation → annotation}/tests/test_annotation_flow.py +0 -0
  73. /isa_model/training/{llm_model/annotation → annotation}/tests/test_minio copy.py +0 -0
  74. /isa_model/training/{llm_model/annotation → annotation}/tests/test_minio_upload.py +0 -0
  75. /isa_model/training/{llm_model/annotation → annotation}/views/annotation_controller.py +0 -0
  76. {isa_model-0.2.0.dist-info → isa_model-0.2.8.dist-info}/WHEEL +0 -0
  77. {isa_model-0.2.0.dist-info → isa_model-0.2.8.dist-info}/top_level.txt +0 -0
isa_model/eval/factory.py CHANGED
@@ -6,6 +6,7 @@ This factory provides a single interface for all evaluation operations:
 - Image model evaluation (FID, IS, LPIPS)
 - Benchmark testing (MMLU, HellaSwag, ARC, etc.)
 - Custom evaluation pipelines
+- Weights & Biases integration for experiment tracking
 """
 
 import os
@@ -15,6 +16,18 @@ from typing import Optional, Dict, Any, List, Union
 from pathlib import Path
 import datetime
 
+try:
+    import wandb
+    WANDB_AVAILABLE = True
+except ImportError:
+    WANDB_AVAILABLE = False
+
+try:
+    import mlflow
+    MLFLOW_AVAILABLE = True
+except ImportError:
+    MLFLOW_AVAILABLE = False
+
 from .metrics import LLMMetrics, ImageMetrics, BenchmarkRunner
 from .benchmarks import MMLU, HellaSwag, ARC, GSM8K
 
@@ -23,26 +36,31 @@ logger = logging.getLogger(__name__)
 
 class EvaluationFactory:
     """
-    Unified factory for all AI model evaluation operations.
+    Unified factory for all AI model evaluation operations with experiment tracking.
 
     This class provides simplified interfaces for:
     - LLM evaluation with various metrics
     - Image model evaluation
    - Benchmark testing on standard datasets
     - Custom evaluation pipelines
+    - Experiment tracking with W&B and MLflow
 
     Example usage:
     ```python
     from isa_model.eval import EvaluationFactory
 
-    evaluator = EvaluationFactory()
+    evaluator = EvaluationFactory(
+        output_dir="eval_results",
+        use_wandb=True,
+        wandb_project="model-evaluation"
+    )
 
     # Evaluate LLM on custom dataset
     results = evaluator.evaluate_llm(
         model_path="path/to/model",
         dataset_path="test_data.json",
         metrics=["perplexity", "bleu", "rouge"],
-        output_dir="eval_results"
+        experiment_name="gemma-4b-evaluation"
     )
 
     # Run MMLU benchmark
@@ -60,12 +78,25 @@ class EvaluationFactory:
     ```
     """
 
-    def __init__(self, output_dir: Optional[str] = None):
+    def __init__(
+        self,
+        output_dir: Optional[str] = None,
+        use_wandb: bool = False,
+        wandb_project: Optional[str] = None,
+        wandb_entity: Optional[str] = None,
+        use_mlflow: bool = False,
+        mlflow_tracking_uri: Optional[str] = None
+    ):
         """
-        Initialize the evaluation factory.
+        Initialize the evaluation factory with experiment tracking.
 
         Args:
             output_dir: Base directory for evaluation outputs
+            use_wandb: Whether to use Weights & Biases for tracking
+            wandb_project: W&B project name
+            wandb_entity: W&B entity/team name
+            use_mlflow: Whether to use MLflow for tracking
+            mlflow_tracking_uri: MLflow tracking server URI
         """
         self.output_dir = output_dir or os.path.join(os.getcwd(), "evaluation_results")
         os.makedirs(self.output_dir, exist_ok=True)
@@ -75,8 +106,55 @@ class EvaluationFactory:
         self.image_metrics = ImageMetrics()
         self.benchmark_runner = BenchmarkRunner()
 
+        # Setup experiment tracking
+        self.use_wandb = use_wandb and WANDB_AVAILABLE
+        self.use_mlflow = use_mlflow and MLFLOW_AVAILABLE
+
+        if self.use_wandb:
+            self.wandb_project = wandb_project or "isa-model-evaluation"
+            self.wandb_entity = wandb_entity
+            logger.info(f"W&B tracking enabled for project: {self.wandb_project}")
+
+        if self.use_mlflow:
+            if mlflow_tracking_uri:
+                mlflow.set_tracking_uri(mlflow_tracking_uri)
+            logger.info(f"MLflow tracking enabled with URI: {mlflow.get_tracking_uri()}")
+
         logger.info(f"EvaluationFactory initialized with output dir: {self.output_dir}")
 
+    def _start_experiment(self, experiment_name: str, config: Dict[str, Any]) -> None:
+        """Start experiment tracking."""
+        if self.use_wandb:
+            wandb.init(
+                project=self.wandb_project,
+                entity=self.wandb_entity,
+                name=experiment_name,
+                config=config,
+                reinit=True
+            )
+
+        if self.use_mlflow:
+            mlflow.start_run(run_name=experiment_name)
+            mlflow.log_params(config)
+
+    def _log_metrics(self, metrics: Dict[str, Any], step: Optional[int] = None) -> None:
+        """Log metrics to experiment tracking systems."""
+        if self.use_wandb:
+            wandb.log(metrics, step=step)
+
+        if self.use_mlflow:
+            for key, value in metrics.items():
+                if isinstance(value, (int, float)):
+                    mlflow.log_metric(key, value, step=step)
+
+    def _end_experiment(self) -> None:
+        """End experiment tracking."""
+        if self.use_wandb:
+            wandb.finish()
+
+        if self.use_mlflow:
+            mlflow.end_run()
+
     def _get_output_path(self, model_name: str, eval_type: str) -> str:
         """Generate timestamped output path for evaluation results."""
         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -97,6 +175,7 @@ class EvaluationFactory:
         batch_size: int = 8,
         max_samples: Optional[int] = None,
         provider: str = "ollama",
+        experiment_name: Optional[str] = None,
         **kwargs
     ) -> Dict[str, Any]:
         """
@@ -110,6 +189,7 @@ class EvaluationFactory:
             batch_size: Batch size for evaluation
             max_samples: Maximum number of samples to evaluate
             provider: Model provider ("ollama", "openai", "hf")
+            experiment_name: Name for experiment tracking
             **kwargs: Additional parameters
 
         Returns:
@@ -121,7 +201,8 @@ class EvaluationFactory:
                 model_path="google/gemma-2-4b-it",
                 dataset_path="test_data.json",
                 metrics=["perplexity", "bleu", "rouge"],
-                max_samples=1000
+                max_samples=1000,
+                experiment_name="gemma-4b-eval"
             )
             ```
         """
@@ -131,40 +212,62 @@ class EvaluationFactory:
         if not output_path:
             output_path = self._get_output_path(model_path, "llm_eval")
 
-        logger.info(f"Evaluating LLM {model_path} with metrics: {metrics}")
-
-        # Load dataset
-        with open(dataset_path, 'r') as f:
-            dataset = json.load(f)
-
-        if max_samples:
-            dataset = dataset[:max_samples]
-
-        # Run evaluation
-        results = self.llm_metrics.evaluate(
-            model_path=model_path,
-            dataset=dataset,
-            metrics=metrics,
-            batch_size=batch_size,
-            provider=provider,
-            **kwargs
-        )
-
-        # Add metadata
-        results["metadata"] = {
+        # Setup experiment tracking
+        config = {
             "model_path": model_path,
             "dataset_path": dataset_path,
             "metrics": metrics,
-            "num_samples": len(dataset),
-            "timestamp": datetime.datetime.now().isoformat(),
+            "batch_size": batch_size,
+            "max_samples": max_samples,
             "provider": provider
         }
 
-        # Save results
-        with open(output_path, 'w') as f:
-            json.dump(results, f, indent=2)
+        experiment_name = experiment_name or f"llm_eval_{os.path.basename(model_path)}"
+        self._start_experiment(experiment_name, config)
+
+        logger.info(f"Evaluating LLM {model_path} with metrics: {metrics}")
+
+        try:
+            # Load dataset
+            with open(dataset_path, 'r') as f:
+                dataset = json.load(f)
+
+            if max_samples:
+                dataset = dataset[:max_samples]
+
+            # Run evaluation
+            results = self.llm_metrics.evaluate(
+                model_path=model_path,
+                dataset=dataset,
+                metrics=metrics,
+                batch_size=batch_size,
+                provider=provider,
+                **kwargs
+            )
+
+            # Log metrics to tracking systems
+            self._log_metrics(results.get("metrics", {}))
+
+            # Add metadata
+            results["metadata"] = {
+                "model_path": model_path,
+                "dataset_path": dataset_path,
+                "metrics": metrics,
+                "num_samples": len(dataset),
+                "timestamp": datetime.datetime.now().isoformat(),
+                "provider": provider,
+                "experiment_name": experiment_name
+            }
+
+            # Save results
+            with open(output_path, 'w') as f:
+                json.dump(results, f, indent=2)
+
+            logger.info(f"Evaluation results saved to: {output_path}")
+
+        finally:
+            self._end_experiment()
 
-        logger.info(f"Evaluation results saved to: {output_path}")
         return results
 
     def evaluate_generation_quality(
@@ -225,76 +328,89 @@ class EvaluationFactory:
         num_shots: int = 0,
         max_samples: Optional[int] = None,
         provider: str = "ollama",
+        experiment_name: Optional[str] = None,
         **kwargs
     ) -> Dict[str, Any]:
         """
-        Run a standard benchmark test.
+        Run a specific benchmark on a model with experiment tracking.
 
         Args:
             model_path: Path to the model
             benchmark: Benchmark name ("mmlu", "hellaswag", "arc", "gsm8k")
-            output_path: Output path for results
+            output_path: Path to save results
             num_shots: Number of few-shot examples
             max_samples: Maximum samples to evaluate
             provider: Model provider
+            experiment_name: Name for experiment tracking
             **kwargs: Additional parameters
 
         Returns:
             Benchmark results dictionary
-
-        Example:
-            ```python
-            mmlu_results = evaluator.run_benchmark(
-                model_path="google/gemma-2-4b-it",
-                benchmark="mmlu",
-                num_shots=5,
-                max_samples=1000
-            )
-            ```
         """
         if not output_path:
             output_path = self._get_output_path(model_path, f"{benchmark}_benchmark")
 
-        logger.info(f"Running {benchmark} benchmark on {model_path}")
-
-        # Select benchmark
-        benchmark_map = {
-            "mmlu": MMLU(),
-            "hellaswag": HellaSwag(),
-            "arc": ARC(),
-            "gsm8k": GSM8K()
-        }
-
-        if benchmark.lower() not in benchmark_map:
-            raise ValueError(f"Unsupported benchmark: {benchmark}")
-
-        benchmark_instance = benchmark_map[benchmark.lower()]
-
-        # Run benchmark
-        results = self.benchmark_runner.run(
-            benchmark=benchmark_instance,
-            model_path=model_path,
-            num_shots=num_shots,
-            max_samples=max_samples,
-            provider=provider,
-            **kwargs
-        )
-
-        # Add metadata
-        results["metadata"] = {
+        # Setup experiment tracking
+        config = {
             "model_path": model_path,
             "benchmark": benchmark,
             "num_shots": num_shots,
             "max_samples": max_samples,
-            "timestamp": datetime.datetime.now().isoformat(),
             "provider": provider
         }
 
-        # Save results
-        with open(output_path, 'w') as f:
-            json.dump(results, f, indent=2)
+        experiment_name = experiment_name or f"{benchmark}_{os.path.basename(model_path)}"
+        self._start_experiment(experiment_name, config)
+
+        logger.info(f"Running {benchmark.upper()} benchmark on {model_path}")
+
+        try:
+            # Initialize benchmark
+            benchmark_map = {
+                "mmlu": MMLU(),
+                "hellaswag": HellaSwag(),
+                "arc": ARC(),
+                "gsm8k": GSM8K()
+            }
+
+            if benchmark.lower() not in benchmark_map:
+                raise ValueError(f"Benchmark '{benchmark}' not supported. Available: {list(benchmark_map.keys())}")
+
+            benchmark_instance = benchmark_map[benchmark.lower()]
+
+            # Run benchmark
+            results = self.benchmark_runner.run_benchmark(
+                model_path=model_path,
+                benchmark=benchmark_instance,
+                num_shots=num_shots,
+                max_samples=max_samples,
+                provider=provider,
+                **kwargs
+            )
+
+            # Log metrics to tracking systems
+            self._log_metrics(results.get("metrics", {}))
+
+            # Add metadata
+            results["metadata"] = {
+                "model_path": model_path,
+                "benchmark": benchmark,
+                "num_shots": num_shots,
+                "max_samples": max_samples,
+                "timestamp": datetime.datetime.now().isoformat(),
+                "provider": provider,
+                "experiment_name": experiment_name
+            }
+
+            # Save results
+            with open(output_path, 'w') as f:
+                json.dump(results, f, indent=2)
+
+            logger.info(f"Benchmark results saved to: {output_path}")
+
+        finally:
+            self._end_experiment()
 
-        logger.info(f"Benchmark results saved to: {output_path}")
         return results
 
     def run_multiple_benchmarks(
@@ -357,101 +473,134 @@ class EvaluationFactory:
         benchmark: Optional[str] = None,
         metrics: List[str] = None,
         output_path: Optional[str] = None,
+        experiment_name: Optional[str] = None,
         **kwargs
     ) -> Dict[str, Any]:
         """
-        Compare multiple models on the same evaluation.
+        Compare multiple models on the same evaluation task.
 
         Args:
             model_paths: List of model paths to compare
-            dataset_path: Path to evaluation dataset
-            benchmark: Benchmark name for comparison
+            dataset_path: Dataset for evaluation (if not using benchmark)
+            benchmark: Benchmark name (if not using custom dataset)
             metrics: Metrics to compute
-            output_path: Output path for comparison results
+            output_path: Path to save comparison results
+            experiment_name: Name for experiment tracking
             **kwargs: Additional parameters
 
         Returns:
             Comparison results dictionary
         """
+        if not dataset_path and not benchmark:
+            raise ValueError("Either dataset_path or benchmark must be provided")
+
         if not output_path:
-            output_path = self._get_output_path("model_comparison", "comparison")
+            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+            output_path = os.path.join(self.output_dir, f"model_comparison_{timestamp}.json")
 
-        comparison_results = {
-            "models": model_paths,
-            "results": {},
-            "summary": {}
+        # Setup experiment tracking
+        config = {
+            "model_paths": model_paths,
+            "dataset_path": dataset_path,
+            "benchmark": benchmark,
+            "metrics": metrics
         }
 
-        # Run evaluation for each model
-        for model_path in model_paths:
-            model_name = os.path.basename(model_path)
-            logger.info(f"Evaluating model: {model_name}")
+        experiment_name = experiment_name or f"model_comparison_{len(model_paths)}_models"
+        self._start_experiment(experiment_name, config)
+
+        logger.info(f"Comparing {len(model_paths)} models")
+
+        try:
+            results = {"models": {}, "comparison": {}}
 
-            try:
-                if dataset_path:
-                    # Custom dataset evaluation
-                    results = self.evaluate_llm(
+            # Evaluate each model
+            for i, model_path in enumerate(model_paths):
+                logger.info(f"Evaluating model {i+1}/{len(model_paths)}: {model_path}")
+
+                if benchmark:
+                    model_results = self.run_benchmark(
                         model_path=model_path,
-                        dataset_path=dataset_path,
-                        metrics=metrics,
+                        benchmark=benchmark,
+                        experiment_name=None,  # Don't start new experiment
                         **kwargs
                     )
-                elif benchmark:
-                    # Benchmark evaluation
-                    results = self.run_benchmark(
+                else:
+                    model_results = self.evaluate_llm(
                         model_path=model_path,
-                        benchmark=benchmark,
+                        dataset_path=dataset_path,
+                        metrics=metrics,
+                        experiment_name=None,  # Don't start new experiment
                         **kwargs
                     )
-                else:
-                    raise ValueError("Either dataset_path or benchmark must be provided")
 
-                comparison_results["results"][model_name] = results
+                results["models"][model_path] = model_results
 
-            except Exception as e:
-                logger.error(f"Failed to evaluate {model_name}: {e}")
-                comparison_results["results"][model_name] = {"error": str(e)}
-
-        # Generate summary
-        comparison_results["summary"] = self._generate_comparison_summary(
-            comparison_results["results"]
-        )
-
-        # Save results
-        with open(output_path, 'w') as f:
-            json.dump(comparison_results, f, indent=2)
+                # Log individual model metrics
+                model_metrics = model_results.get("metrics", {})
+                for metric_name, value in model_metrics.items():
+                    self._log_metrics({f"{os.path.basename(model_path)}_{metric_name}": value})
+
+            # Generate comparison summary
+            results["comparison"] = self._generate_comparison_summary(results["models"])
+
+            # Add metadata
+            results["metadata"] = {
+                "model_paths": model_paths,
+                "dataset_path": dataset_path,
+                "benchmark": benchmark,
+                "metrics": metrics,
+                "timestamp": datetime.datetime.now().isoformat(),
+                "experiment_name": experiment_name
+            }
+
+            # Save results
+            with open(output_path, 'w') as f:
+                json.dump(results, f, indent=2)
+
+            logger.info(f"Comparison results saved to: {output_path}")
+
+        finally:
+            self._end_experiment()
 
-        return comparison_results
+        return results
 
     def _generate_comparison_summary(self, results: Dict[str, Any]) -> Dict[str, Any]:
-        """Generate summary statistics for model comparison."""
+        """Generate comparison summary from multiple model results."""
         summary = {
-            "best_performing": {},
+            "best_model": {},
             "rankings": {},
-            "average_scores": {}
+            "metric_comparisons": {}
         }
 
-        # Extract key metrics and find best performing models
-        for model_name, model_results in results.items():
-            if "error" in model_results:
-                continue
+        # Extract all metrics across models
+        all_metrics = set()
+        for model_results in results.values():
+            if "metrics" in model_results:
+                all_metrics.update(model_results["metrics"].keys())
+
+        # Compare each metric
+        for metric in all_metrics:
+            metric_values = {}
+            for model_path, model_results in results.items():
+                if "metrics" in model_results and metric in model_results["metrics"]:
+                    metric_values[model_path] = model_results["metrics"][metric]
+
+            if metric_values:
+                # Determine if higher is better (most metrics, higher is better)
+                higher_is_better = metric not in ["perplexity", "loss", "error_rate"]
 
-            # Extract main scores (this is simplified - would need more sophisticated logic)
-            if "accuracy" in model_results:
-                summary["average_scores"][model_name] = model_results["accuracy"]
-            elif "overall_score" in model_results:
-                summary["average_scores"][model_name] = model_results["overall_score"]
-
-        # Rank models by performance
-        if summary["average_scores"]:
-            ranked = sorted(
-                summary["average_scores"].items(),
-                key=lambda x: x[1],
-                reverse=True
-            )
-            summary["rankings"] = {i+1: model for i, (model, score) in enumerate(ranked)}
-            summary["best_performing"]["model"] = ranked[0][0]
-            summary["best_performing"]["score"] = ranked[0][1]
+                best_model = max(metric_values.items(), key=lambda x: x[1]) if higher_is_better else min(metric_values.items(), key=lambda x: x[1])
+                summary["best_model"][metric] = {
+                    "model": best_model[0],
+                    "value": best_model[1]
+                }
+
+                # Create ranking
+                sorted_models = sorted(metric_values.items(), key=lambda x: x[1], reverse=higher_is_better)
+                summary["rankings"][metric] = [{"model": model, "value": value} for model, value in sorted_models]
+
+                summary["metric_comparisons"][metric] = metric_values
 
         return summary
 
@@ -579,4 +728,96 @@ class EvaluationFactory:
         # TODO: Implement HTML and Markdown report generation
 
         logger.info(f"Evaluation report generated: {output_path}")
-        return output_path
+        return output_path
+
+    def evaluate_multimodal_model(
+        self,
+        model_path: str,
+        text_dataset_path: Optional[str] = None,
+        image_dataset_path: Optional[str] = None,
+        audio_dataset_path: Optional[str] = None,
+        metrics: List[str] = None,
+        experiment_name: Optional[str] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Evaluate multimodal models across different modalities.
+
+        Args:
+            model_path: Path to the multimodal model
+            text_dataset_path: Path to text evaluation dataset
+            image_dataset_path: Path to image evaluation dataset
+            audio_dataset_path: Path to audio evaluation dataset
+            metrics: Metrics to compute for each modality
+            experiment_name: Name for experiment tracking
+            **kwargs: Additional parameters
+
+        Returns:
+            Multimodal evaluation results
+        """
+        config = {
+            "model_path": model_path,
+            "text_dataset_path": text_dataset_path,
+            "image_dataset_path": image_dataset_path,
+            "audio_dataset_path": audio_dataset_path,
+            "metrics": metrics
+        }
+
+        experiment_name = experiment_name or f"multimodal_eval_{os.path.basename(model_path)}"
+        self._start_experiment(experiment_name, config)
+
+        logger.info(f"Evaluating multimodal model: {model_path}")
+
+        try:
+            results = {"modalities": {}}
+
+            # Text evaluation
+            if text_dataset_path:
+                logger.info("Evaluating text modality...")
+                text_results = self.evaluate_llm(
+                    model_path=model_path,
+                    dataset_path=text_dataset_path,
+                    metrics=metrics or ["perplexity", "bleu", "rouge"],
+                    experiment_name=None,
+                    **kwargs
+                )
+                results["modalities"]["text"] = text_results
+                self._log_metrics({f"text_{k}": v for k, v in text_results.get("metrics", {}).items()})
+
+            # Image evaluation
+            if image_dataset_path:
+                logger.info("Evaluating image modality...")
+                image_results = self.evaluate_image_model(
+                    model_path=model_path,
+                    test_images_dir=image_dataset_path,
+                    metrics=metrics or ["fid", "is", "lpips"],
+                    experiment_name=None,
+                    **kwargs
+                )
+                results["modalities"]["image"] = image_results
+                self._log_metrics({f"image_{k}": v for k, v in image_results.get("metrics", {}).items()})
+
+            # Audio evaluation (placeholder for future implementation)
+            if audio_dataset_path:
+                logger.info("Audio evaluation not yet implemented")
+                results["modalities"]["audio"] = {"status": "not_implemented"}
+
+            # Add metadata
+            results["metadata"] = {
+                "model_path": model_path,
+                "modalities_evaluated": list(results["modalities"].keys()),
+                "timestamp": datetime.datetime.now().isoformat(),
+                "experiment_name": experiment_name
+            }
+
+            # Save results
+            output_path = self._get_output_path(model_path, "multimodal_eval")
+            with open(output_path, 'w') as f:
+                json.dump(results, f, indent=2)
+
+            logger.info(f"Multimodal evaluation results saved to: {output_path}")
+
+        finally:
+            self._end_experiment()
+
+        return results
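
Taken together, this diff turns `EvaluationFactory` into a tracking-aware entry point: `wandb` and `mlflow` are optional imports, tracking is switched on per factory via constructor flags, and each evaluation call wraps its work in `_start_experiment` / `_end_experiment`. The sketch below is a minimal usage example assembled from the docstrings and signatures visible in this diff, not code shipped in the package; the model name, dataset file, and project name are placeholders, and W&B/MLflow logging only activates if the corresponding optional packages are installed.

```python
from isa_model.eval import EvaluationFactory

# Tracking flags are no-ops if wandb/mlflow are not importable (WANDB_AVAILABLE / MLFLOW_AVAILABLE guards).
evaluator = EvaluationFactory(
    output_dir="eval_results",
    use_wandb=True,
    wandb_project="model-evaluation",   # placeholder project name
)

# Custom-dataset evaluation; metrics are logged to the tracker and saved as timestamped JSON.
results = evaluator.evaluate_llm(
    model_path="google/gemma-2-4b-it",
    dataset_path="test_data.json",       # placeholder dataset file
    metrics=["perplexity", "bleu", "rouge"],
    max_samples=1000,
    experiment_name="gemma-4b-evaluation",
)

# Standard benchmark run; unsupported benchmark names raise ValueError.
mmlu_results = evaluator.run_benchmark(
    model_path="google/gemma-2-4b-it",
    benchmark="mmlu",
    num_shots=5,
    max_samples=1000,
    experiment_name="gemma-4b-mmlu",
)
```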