isa-model 0.2.0__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. isa_model/__init__.py +1 -1
  2. isa_model/core/storage/hf_storage.py +419 -0
  3. isa_model/deployment/__init__.py +52 -0
  4. isa_model/deployment/core/__init__.py +34 -0
  5. isa_model/deployment/core/deployment_config.py +356 -0
  6. isa_model/deployment/core/deployment_manager.py +549 -0
  7. isa_model/deployment/core/isa_deployment_service.py +401 -0
  8. isa_model/eval/factory.py +381 -140
  9. isa_model/inference/ai_factory.py +142 -240
  10. isa_model/inference/providers/ml_provider.py +50 -0
  11. isa_model/inference/services/audio/openai_tts_service.py +104 -3
  12. isa_model/inference/services/embedding/base_embed_service.py +112 -0
  13. isa_model/inference/services/embedding/ollama_embed_service.py +28 -2
  14. isa_model/inference/services/llm/__init__.py +2 -0
  15. isa_model/inference/services/llm/base_llm_service.py +111 -1
  16. isa_model/inference/services/llm/ollama_llm_service.py +234 -26
  17. isa_model/inference/services/llm/openai_llm_service.py +225 -28
  18. isa_model/inference/services/llm/triton_llm_service.py +481 -0
  19. isa_model/inference/services/ml/base_ml_service.py +78 -0
  20. isa_model/inference/services/ml/sklearn_ml_service.py +140 -0
  21. isa_model/inference/services/vision/__init__.py +3 -3
  22. isa_model/inference/services/vision/base_image_gen_service.py +161 -0
  23. isa_model/inference/services/vision/base_vision_service.py +177 -0
  24. isa_model/inference/services/vision/ollama_vision_service.py +143 -17
  25. isa_model/inference/services/vision/replicate_image_gen_service.py +139 -7
  26. isa_model/training/__init__.py +62 -32
  27. isa_model/training/cloud/__init__.py +22 -0
  28. isa_model/training/cloud/job_orchestrator.py +402 -0
  29. isa_model/training/cloud/runpod_trainer.py +454 -0
  30. isa_model/training/cloud/storage_manager.py +482 -0
  31. isa_model/training/core/__init__.py +23 -0
  32. isa_model/training/core/config.py +181 -0
  33. isa_model/training/core/dataset.py +222 -0
  34. isa_model/training/core/trainer.py +720 -0
  35. isa_model/training/core/utils.py +213 -0
  36. isa_model/training/factory.py +229 -198
  37. isa_model-0.2.8.dist-info/METADATA +465 -0
  38. isa_model-0.2.8.dist-info/RECORD +86 -0
  39. isa_model/core/model_router.py +0 -226
  40. isa_model/core/model_version.py +0 -0
  41. isa_model/core/resource_manager.py +0 -202
  42. isa_model/deployment/gpu_fp16_ds8/models/deepseek_r1/1/model.py +0 -120
  43. isa_model/deployment/gpu_fp16_ds8/scripts/download_model.py +0 -18
  44. isa_model/training/engine/llama_factory/__init__.py +0 -39
  45. isa_model/training/engine/llama_factory/config.py +0 -115
  46. isa_model/training/engine/llama_factory/data_adapter.py +0 -284
  47. isa_model/training/engine/llama_factory/examples/__init__.py +0 -6
  48. isa_model/training/engine/llama_factory/examples/finetune_with_tracking.py +0 -185
  49. isa_model/training/engine/llama_factory/examples/rlhf_with_tracking.py +0 -163
  50. isa_model/training/engine/llama_factory/factory.py +0 -331
  51. isa_model/training/engine/llama_factory/rl.py +0 -254
  52. isa_model/training/engine/llama_factory/trainer.py +0 -171
  53. isa_model/training/image_model/configs/create_config.py +0 -37
  54. isa_model/training/image_model/configs/create_flux_config.py +0 -26
  55. isa_model/training/image_model/configs/create_lora_config.py +0 -21
  56. isa_model/training/image_model/prepare_massed_compute.py +0 -97
  57. isa_model/training/image_model/prepare_upload.py +0 -17
  58. isa_model/training/image_model/raw_data/create_captions.py +0 -16
  59. isa_model/training/image_model/raw_data/create_lora_captions.py +0 -20
  60. isa_model/training/image_model/raw_data/pre_processing.py +0 -200
  61. isa_model/training/image_model/train/train.py +0 -42
  62. isa_model/training/image_model/train/train_flux.py +0 -41
  63. isa_model/training/image_model/train/train_lora.py +0 -57
  64. isa_model/training/image_model/train_main.py +0 -25
  65. isa_model-0.2.0.dist-info/METADATA +0 -327
  66. isa_model-0.2.0.dist-info/RECORD +0 -92
  67. isa_model-0.2.0.dist-info/licenses/LICENSE +0 -21
  68. /isa_model/training/{llm_model/annotation → annotation}/annotation_schema.py +0 -0
  69. /isa_model/training/{llm_model/annotation → annotation}/processors/annotation_processor.py +0 -0
  70. /isa_model/training/{llm_model/annotation → annotation}/storage/dataset_manager.py +0 -0
  71. /isa_model/training/{llm_model/annotation → annotation}/storage/dataset_schema.py +0 -0
  72. /isa_model/training/{llm_model/annotation → annotation}/tests/test_annotation_flow.py +0 -0
  73. /isa_model/training/{llm_model/annotation → annotation}/tests/test_minio copy.py +0 -0
  74. /isa_model/training/{llm_model/annotation → annotation}/tests/test_minio_upload.py +0 -0
  75. /isa_model/training/{llm_model/annotation → annotation}/views/annotation_controller.py +0 -0
  76. {isa_model-0.2.0.dist-info → isa_model-0.2.8.dist-info}/WHEEL +0 -0
  77. {isa_model-0.2.0.dist-info → isa_model-0.2.8.dist-info}/top_level.txt +0 -0
isa_model/eval/factory.py CHANGED
@@ -6,6 +6,7 @@ This factory provides a single interface for all evaluation operations:
 - Image model evaluation (FID, IS, LPIPS)
 - Benchmark testing (MMLU, HellaSwag, ARC, etc.)
 - Custom evaluation pipelines
+- Weights & Biases integration for experiment tracking
 """
 
 import os
@@ -15,6 +16,18 @@ from typing import Optional, Dict, Any, List, Union
 from pathlib import Path
 import datetime
 
+try:
+    import wandb
+    WANDB_AVAILABLE = True
+except ImportError:
+    WANDB_AVAILABLE = False
+
+try:
+    import mlflow
+    MLFLOW_AVAILABLE = True
+except ImportError:
+    MLFLOW_AVAILABLE = False
+
 from .metrics import LLMMetrics, ImageMetrics, BenchmarkRunner
 from .benchmarks import MMLU, HellaSwag, ARC, GSM8K
 
@@ -23,26 +36,31 @@ logger = logging.getLogger(__name__)
 
 class EvaluationFactory:
     """
-    Unified factory for all AI model evaluation operations.
+    Unified factory for all AI model evaluation operations with experiment tracking.
 
     This class provides simplified interfaces for:
     - LLM evaluation with various metrics
     - Image model evaluation
    - Benchmark testing on standard datasets
     - Custom evaluation pipelines
+    - Experiment tracking with W&B and MLflow
 
     Example usage:
     ```python
     from isa_model.eval import EvaluationFactory
 
-    evaluator = EvaluationFactory()
+    evaluator = EvaluationFactory(
+        output_dir="eval_results",
+        use_wandb=True,
+        wandb_project="model-evaluation"
+    )
 
     # Evaluate LLM on custom dataset
     results = evaluator.evaluate_llm(
         model_path="path/to/model",
         dataset_path="test_data.json",
         metrics=["perplexity", "bleu", "rouge"],
-        output_dir="eval_results"
+        experiment_name="gemma-4b-evaluation"
     )
 
     # Run MMLU benchmark
@@ -60,12 +78,25 @@ class EvaluationFactory:
     ```
     """
 
-    def __init__(self, output_dir: Optional[str] = None):
+    def __init__(
+        self,
+        output_dir: Optional[str] = None,
+        use_wandb: bool = False,
+        wandb_project: Optional[str] = None,
+        wandb_entity: Optional[str] = None,
+        use_mlflow: bool = False,
+        mlflow_tracking_uri: Optional[str] = None
+    ):
         """
-        Initialize the evaluation factory.
+        Initialize the evaluation factory with experiment tracking.
 
         Args:
             output_dir: Base directory for evaluation outputs
+            use_wandb: Whether to use Weights & Biases for tracking
+            wandb_project: W&B project name
+            wandb_entity: W&B entity/team name
+            use_mlflow: Whether to use MLflow for tracking
+            mlflow_tracking_uri: MLflow tracking server URI
         """
         self.output_dir = output_dir or os.path.join(os.getcwd(), "evaluation_results")
         os.makedirs(self.output_dir, exist_ok=True)
@@ -75,8 +106,55 @@ class EvaluationFactory:
         self.image_metrics = ImageMetrics()
         self.benchmark_runner = BenchmarkRunner()
 
+        # Setup experiment tracking
+        self.use_wandb = use_wandb and WANDB_AVAILABLE
+        self.use_mlflow = use_mlflow and MLFLOW_AVAILABLE
+
+        if self.use_wandb:
+            self.wandb_project = wandb_project or "isa-model-evaluation"
+            self.wandb_entity = wandb_entity
+            logger.info(f"W&B tracking enabled for project: {self.wandb_project}")
+
+        if self.use_mlflow:
+            if mlflow_tracking_uri:
+                mlflow.set_tracking_uri(mlflow_tracking_uri)
+            logger.info(f"MLflow tracking enabled with URI: {mlflow.get_tracking_uri()}")
+
         logger.info(f"EvaluationFactory initialized with output dir: {self.output_dir}")
 
+    def _start_experiment(self, experiment_name: str, config: Dict[str, Any]) -> None:
+        """Start experiment tracking."""
+        if self.use_wandb:
+            wandb.init(
+                project=self.wandb_project,
+                entity=self.wandb_entity,
+                name=experiment_name,
+                config=config,
+                reinit=True
+            )
+
+        if self.use_mlflow:
+            mlflow.start_run(run_name=experiment_name)
+            mlflow.log_params(config)
+
+    def _log_metrics(self, metrics: Dict[str, Any], step: Optional[int] = None) -> None:
+        """Log metrics to experiment tracking systems."""
+        if self.use_wandb:
+            wandb.log(metrics, step=step)
+
+        if self.use_mlflow:
+            for key, value in metrics.items():
+                if isinstance(value, (int, float)):
+                    mlflow.log_metric(key, value, step=step)
+
+    def _end_experiment(self) -> None:
+        """End experiment tracking."""
+        if self.use_wandb:
+            wandb.finish()
+
+        if self.use_mlflow:
+            mlflow.end_run()
+
     def _get_output_path(self, model_name: str, eval_type: str) -> str:
         """Generate timestamped output path for evaluation results."""
         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -97,6 +175,7 @@ class EvaluationFactory:
         batch_size: int = 8,
         max_samples: Optional[int] = None,
         provider: str = "ollama",
+        experiment_name: Optional[str] = None,
         **kwargs
     ) -> Dict[str, Any]:
         """
@@ -110,6 +189,7 @@ class EvaluationFactory:
             batch_size: Batch size for evaluation
             max_samples: Maximum number of samples to evaluate
             provider: Model provider ("ollama", "openai", "hf")
+            experiment_name: Name for experiment tracking
             **kwargs: Additional parameters
 
         Returns:
@@ -121,7 +201,8 @@ class EvaluationFactory:
                 model_path="google/gemma-2-4b-it",
                 dataset_path="test_data.json",
                 metrics=["perplexity", "bleu", "rouge"],
-                max_samples=1000
+                max_samples=1000,
+                experiment_name="gemma-4b-eval"
             )
             ```
         """
@@ -131,40 +212,62 @@ class EvaluationFactory:
         if not output_path:
             output_path = self._get_output_path(model_path, "llm_eval")
 
-        logger.info(f"Evaluating LLM {model_path} with metrics: {metrics}")
-
-        # Load dataset
-        with open(dataset_path, 'r') as f:
-            dataset = json.load(f)
-
-        if max_samples:
-            dataset = dataset[:max_samples]
-
-        # Run evaluation
-        results = self.llm_metrics.evaluate(
-            model_path=model_path,
-            dataset=dataset,
-            metrics=metrics,
-            batch_size=batch_size,
-            provider=provider,
-            **kwargs
-        )
-
-        # Add metadata
-        results["metadata"] = {
+        # Setup experiment tracking
+        config = {
             "model_path": model_path,
             "dataset_path": dataset_path,
             "metrics": metrics,
-            "num_samples": len(dataset),
-            "timestamp": datetime.datetime.now().isoformat(),
+            "batch_size": batch_size,
+            "max_samples": max_samples,
             "provider": provider
         }
 
-        # Save results
-        with open(output_path, 'w') as f:
-            json.dump(results, f, indent=2)
+        experiment_name = experiment_name or f"llm_eval_{os.path.basename(model_path)}"
+        self._start_experiment(experiment_name, config)
+
+        logger.info(f"Evaluating LLM {model_path} with metrics: {metrics}")
+
+        try:
+            # Load dataset
+            with open(dataset_path, 'r') as f:
+                dataset = json.load(f)
+
+            if max_samples:
+                dataset = dataset[:max_samples]
+
+            # Run evaluation
+            results = self.llm_metrics.evaluate(
+                model_path=model_path,
+                dataset=dataset,
+                metrics=metrics,
+                batch_size=batch_size,
+                provider=provider,
+                **kwargs
+            )
+
+            # Log metrics to tracking systems
+            self._log_metrics(results.get("metrics", {}))
+
+            # Add metadata
+            results["metadata"] = {
+                "model_path": model_path,
+                "dataset_path": dataset_path,
+                "metrics": metrics,
+                "num_samples": len(dataset),
+                "timestamp": datetime.datetime.now().isoformat(),
+                "provider": provider,
+                "experiment_name": experiment_name
+            }
+
+            # Save results
+            with open(output_path, 'w') as f:
+                json.dump(results, f, indent=2)
+
+            logger.info(f"Evaluation results saved to: {output_path}")
+
+        finally:
+            self._end_experiment()
 
-        logger.info(f"Evaluation results saved to: {output_path}")
         return results
 
     def evaluate_generation_quality(
@@ -225,76 +328,89 @@ class EvaluationFactory:
         num_shots: int = 0,
         max_samples: Optional[int] = None,
         provider: str = "ollama",
+        experiment_name: Optional[str] = None,
         **kwargs
     ) -> Dict[str, Any]:
         """
-        Run a standard benchmark test.
+        Run a specific benchmark on a model with experiment tracking.
 
         Args:
             model_path: Path to the model
             benchmark: Benchmark name ("mmlu", "hellaswag", "arc", "gsm8k")
-            output_path: Output path for results
+            output_path: Path to save results
             num_shots: Number of few-shot examples
             max_samples: Maximum samples to evaluate
             provider: Model provider
+            experiment_name: Name for experiment tracking
             **kwargs: Additional parameters
 
         Returns:
             Benchmark results dictionary
-
-        Example:
-            ```python
-            mmlu_results = evaluator.run_benchmark(
-                model_path="google/gemma-2-4b-it",
-                benchmark="mmlu",
-                num_shots=5,
-                max_samples=1000
-            )
-            ```
         """
         if not output_path:
             output_path = self._get_output_path(model_path, f"{benchmark}_benchmark")
 
-        logger.info(f"Running {benchmark} benchmark on {model_path}")
-
-        # Select benchmark
-        benchmark_map = {
-            "mmlu": MMLU(),
-            "hellaswag": HellaSwag(),
-            "arc": ARC(),
-            "gsm8k": GSM8K()
-        }
-
-        if benchmark.lower() not in benchmark_map:
-            raise ValueError(f"Unsupported benchmark: {benchmark}")
-
-        benchmark_instance = benchmark_map[benchmark.lower()]
-
-        # Run benchmark
-        results = self.benchmark_runner.run(
-            benchmark=benchmark_instance,
-            model_path=model_path,
-            num_shots=num_shots,
-            max_samples=max_samples,
-            provider=provider,
-            **kwargs
-        )
-
-        # Add metadata
-        results["metadata"] = {
+        # Setup experiment tracking
+        config = {
             "model_path": model_path,
             "benchmark": benchmark,
             "num_shots": num_shots,
             "max_samples": max_samples,
-            "timestamp": datetime.datetime.now().isoformat(),
             "provider": provider
         }
 
-        # Save results
-        with open(output_path, 'w') as f:
-            json.dump(results, f, indent=2)
+        experiment_name = experiment_name or f"{benchmark}_{os.path.basename(model_path)}"
+        self._start_experiment(experiment_name, config)
+
+        logger.info(f"Running {benchmark.upper()} benchmark on {model_path}")
+
+        try:
+            # Initialize benchmark
+            benchmark_map = {
+                "mmlu": MMLU(),
+                "hellaswag": HellaSwag(),
+                "arc": ARC(),
+                "gsm8k": GSM8K()
+            }
+
+            if benchmark.lower() not in benchmark_map:
+                raise ValueError(f"Benchmark '{benchmark}' not supported. Available: {list(benchmark_map.keys())}")
+
+            benchmark_instance = benchmark_map[benchmark.lower()]
+
+            # Run benchmark
+            results = self.benchmark_runner.run_benchmark(
+                model_path=model_path,
+                benchmark=benchmark_instance,
+                num_shots=num_shots,
+                max_samples=max_samples,
+                provider=provider,
+                **kwargs
+            )
+
+            # Log metrics to tracking systems
+            self._log_metrics(results.get("metrics", {}))
+
+            # Add metadata
+            results["metadata"] = {
+                "model_path": model_path,
+                "benchmark": benchmark,
+                "num_shots": num_shots,
+                "max_samples": max_samples,
+                "timestamp": datetime.datetime.now().isoformat(),
+                "provider": provider,
+                "experiment_name": experiment_name
+            }
+
+            # Save results
+            with open(output_path, 'w') as f:
+                json.dump(results, f, indent=2)
+
+            logger.info(f"Benchmark results saved to: {output_path}")
+
+        finally:
+            self._end_experiment()
 
-        logger.info(f"Benchmark results saved to: {output_path}")
         return results
 
     def run_multiple_benchmarks(
@@ -357,101 +473,134 @@ class EvaluationFactory:
         benchmark: Optional[str] = None,
         metrics: List[str] = None,
         output_path: Optional[str] = None,
+        experiment_name: Optional[str] = None,
         **kwargs
     ) -> Dict[str, Any]:
         """
-        Compare multiple models on the same evaluation.
+        Compare multiple models on the same evaluation task.
 
         Args:
             model_paths: List of model paths to compare
-            dataset_path: Path to evaluation dataset
-            benchmark: Benchmark name for comparison
+            dataset_path: Dataset for evaluation (if not using benchmark)
+            benchmark: Benchmark name (if not using custom dataset)
             metrics: Metrics to compute
-            output_path: Output path for comparison results
+            output_path: Path to save comparison results
+            experiment_name: Name for experiment tracking
             **kwargs: Additional parameters
 
         Returns:
             Comparison results dictionary
         """
+        if not dataset_path and not benchmark:
+            raise ValueError("Either dataset_path or benchmark must be provided")
+
         if not output_path:
-            output_path = self._get_output_path("model_comparison", "comparison")
+            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+            output_path = os.path.join(self.output_dir, f"model_comparison_{timestamp}.json")
 
-        comparison_results = {
-            "models": model_paths,
-            "results": {},
-            "summary": {}
+        # Setup experiment tracking
+        config = {
+            "model_paths": model_paths,
+            "dataset_path": dataset_path,
+            "benchmark": benchmark,
+            "metrics": metrics
         }
 
-        # Run evaluation for each model
-        for model_path in model_paths:
-            model_name = os.path.basename(model_path)
-            logger.info(f"Evaluating model: {model_name}")
+        experiment_name = experiment_name or f"model_comparison_{len(model_paths)}_models"
+        self._start_experiment(experiment_name, config)
+
+        logger.info(f"Comparing {len(model_paths)} models")
+
+        try:
+            results = {"models": {}, "comparison": {}}
 
-            try:
-                if dataset_path:
-                    # Custom dataset evaluation
-                    results = self.evaluate_llm(
+            # Evaluate each model
+            for i, model_path in enumerate(model_paths):
+                logger.info(f"Evaluating model {i+1}/{len(model_paths)}: {model_path}")
+
+                if benchmark:
+                    model_results = self.run_benchmark(
                         model_path=model_path,
-                        dataset_path=dataset_path,
-                        metrics=metrics,
+                        benchmark=benchmark,
+                        experiment_name=None,  # Don't start new experiment
                         **kwargs
                     )
-                elif benchmark:
-                    # Benchmark evaluation
-                    results = self.run_benchmark(
+                else:
+                    model_results = self.evaluate_llm(
                         model_path=model_path,
-                        benchmark=benchmark,
+                        dataset_path=dataset_path,
+                        metrics=metrics,
+                        experiment_name=None,  # Don't start new experiment
                         **kwargs
                     )
-                else:
-                    raise ValueError("Either dataset_path or benchmark must be provided")
 
-                comparison_results["results"][model_name] = results
+                results["models"][model_path] = model_results
 
-            except Exception as e:
-                logger.error(f"Failed to evaluate {model_name}: {e}")
-                comparison_results["results"][model_name] = {"error": str(e)}
-
-        # Generate summary
-        comparison_results["summary"] = self._generate_comparison_summary(
-            comparison_results["results"]
-        )
-
-        # Save results
-        with open(output_path, 'w') as f:
-            json.dump(comparison_results, f, indent=2)
+                # Log individual model metrics
+                model_metrics = model_results.get("metrics", {})
+                for metric_name, value in model_metrics.items():
+                    self._log_metrics({f"{os.path.basename(model_path)}_{metric_name}": value})
+
+            # Generate comparison summary
+            results["comparison"] = self._generate_comparison_summary(results["models"])
+
+            # Add metadata
+            results["metadata"] = {
+                "model_paths": model_paths,
+                "dataset_path": dataset_path,
+                "benchmark": benchmark,
+                "metrics": metrics,
+                "timestamp": datetime.datetime.now().isoformat(),
+                "experiment_name": experiment_name
+            }
+
+            # Save results
+            with open(output_path, 'w') as f:
+                json.dump(results, f, indent=2)
+
+            logger.info(f"Comparison results saved to: {output_path}")
+
+        finally:
+            self._end_experiment()
 
-        return comparison_results
+        return results
 
     def _generate_comparison_summary(self, results: Dict[str, Any]) -> Dict[str, Any]:
-        """Generate summary statistics for model comparison."""
+        """Generate comparison summary from multiple model results."""
         summary = {
-            "best_performing": {},
+            "best_model": {},
             "rankings": {},
-            "average_scores": {}
+            "metric_comparisons": {}
         }
 
-        # Extract key metrics and find best performing models
-        for model_name, model_results in results.items():
-            if "error" in model_results:
-                continue
+        # Extract all metrics across models
+        all_metrics = set()
+        for model_results in results.values():
+            if "metrics" in model_results:
+                all_metrics.update(model_results["metrics"].keys())
+
+        # Compare each metric
+        for metric in all_metrics:
+            metric_values = {}
+            for model_path, model_results in results.items():
+                if "metrics" in model_results and metric in model_results["metrics"]:
+                    metric_values[model_path] = model_results["metrics"][metric]
+
+            if metric_values:
+                # Determine if higher is better (most metrics, higher is better)
+                higher_is_better = metric not in ["perplexity", "loss", "error_rate"]
 
-            # Extract main scores (this is simplified - would need more sophisticated logic)
-            if "accuracy" in model_results:
-                summary["average_scores"][model_name] = model_results["accuracy"]
-            elif "overall_score" in model_results:
-                summary["average_scores"][model_name] = model_results["overall_score"]
-
-        # Rank models by performance
-        if summary["average_scores"]:
-            ranked = sorted(
-                summary["average_scores"].items(),
-                key=lambda x: x[1],
-                reverse=True
-            )
-            summary["rankings"] = {i+1: model for i, (model, score) in enumerate(ranked)}
-            summary["best_performing"]["model"] = ranked[0][0]
-            summary["best_performing"]["score"] = ranked[0][1]
+                best_model = max(metric_values.items(), key=lambda x: x[1]) if higher_is_better else min(metric_values.items(), key=lambda x: x[1])
+                summary["best_model"][metric] = {
+                    "model": best_model[0],
+                    "value": best_model[1]
+                }
+
+                # Create ranking
+                sorted_models = sorted(metric_values.items(), key=lambda x: x[1], reverse=higher_is_better)
+                summary["rankings"][metric] = [{"model": model, "value": value} for model, value in sorted_models]
+
+                summary["metric_comparisons"][metric] = metric_values
 
         return summary
 
@@ -579,4 +728,96 @@ class EvaluationFactory:
         # TODO: Implement HTML and Markdown report generation
 
         logger.info(f"Evaluation report generated: {output_path}")
-        return output_path
+        return output_path
+
+    def evaluate_multimodal_model(
+        self,
+        model_path: str,
+        text_dataset_path: Optional[str] = None,
+        image_dataset_path: Optional[str] = None,
+        audio_dataset_path: Optional[str] = None,
+        metrics: List[str] = None,
+        experiment_name: Optional[str] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Evaluate multimodal models across different modalities.
+
+        Args:
+            model_path: Path to the multimodal model
+            text_dataset_path: Path to text evaluation dataset
+            image_dataset_path: Path to image evaluation dataset
+            audio_dataset_path: Path to audio evaluation dataset
+            metrics: Metrics to compute for each modality
+            experiment_name: Name for experiment tracking
+            **kwargs: Additional parameters
+
+        Returns:
+            Multimodal evaluation results
+        """
+        config = {
+            "model_path": model_path,
+            "text_dataset_path": text_dataset_path,
+            "image_dataset_path": image_dataset_path,
+            "audio_dataset_path": audio_dataset_path,
+            "metrics": metrics
+        }
+
+        experiment_name = experiment_name or f"multimodal_eval_{os.path.basename(model_path)}"
+        self._start_experiment(experiment_name, config)
+
+        logger.info(f"Evaluating multimodal model: {model_path}")
+
+        try:
+            results = {"modalities": {}}
+
+            # Text evaluation
+            if text_dataset_path:
+                logger.info("Evaluating text modality...")
+                text_results = self.evaluate_llm(
+                    model_path=model_path,
+                    dataset_path=text_dataset_path,
+                    metrics=metrics or ["perplexity", "bleu", "rouge"],
+                    experiment_name=None,
+                    **kwargs
+                )
+                results["modalities"]["text"] = text_results
+                self._log_metrics({f"text_{k}": v for k, v in text_results.get("metrics", {}).items()})
+
+            # Image evaluation
+            if image_dataset_path:
+                logger.info("Evaluating image modality...")
+                image_results = self.evaluate_image_model(
+                    model_path=model_path,
+                    test_images_dir=image_dataset_path,
+                    metrics=metrics or ["fid", "is", "lpips"],
+                    experiment_name=None,
+                    **kwargs
+                )
+                results["modalities"]["image"] = image_results
+                self._log_metrics({f"image_{k}": v for k, v in image_results.get("metrics", {}).items()})
+
+            # Audio evaluation (placeholder for future implementation)
+            if audio_dataset_path:
+                logger.info("Audio evaluation not yet implemented")
+                results["modalities"]["audio"] = {"status": "not_implemented"}
+
+            # Add metadata
+            results["metadata"] = {
+                "model_path": model_path,
+                "modalities_evaluated": list(results["modalities"].keys()),
+                "timestamp": datetime.datetime.now().isoformat(),
+                "experiment_name": experiment_name
+            }
+
+            # Save results
+            output_path = self._get_output_path(model_path, "multimodal_eval")
+            with open(output_path, 'w') as f:
+                json.dump(results, f, indent=2)
+
+            logger.info(f"Multimodal evaluation results saved to: {output_path}")
+
+        finally:
+            self._end_experiment()
+
+        return results
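
Taken together, this diff turns `EvaluationFactory` into a tracking-aware entry point: `wandb` and `mlflow` are optional imports, tracking is switched on per factory via constructor flags, and each evaluation call wraps its work in `_start_experiment` / `_end_experiment`. The sketch below is a minimal usage example assembled from the docstrings and signatures visible in this diff, not code shipped in the package; the model name, dataset file, and project name are placeholders, and W&B/MLflow logging only activates if the corresponding optional packages are installed.

```python
from isa_model.eval import EvaluationFactory

# Tracking flags are no-ops if wandb/mlflow are not importable (WANDB_AVAILABLE / MLFLOW_AVAILABLE guards).
evaluator = EvaluationFactory(
    output_dir="eval_results",
    use_wandb=True,
    wandb_project="model-evaluation",   # placeholder project name
)

# Custom-dataset evaluation; metrics are logged to the tracker and saved as timestamped JSON.
results = evaluator.evaluate_llm(
    model_path="google/gemma-2-4b-it",
    dataset_path="test_data.json",       # placeholder dataset file
    metrics=["perplexity", "bleu", "rouge"],
    max_samples=1000,
    experiment_name="gemma-4b-evaluation",
)

# Standard benchmark run; unsupported benchmark names raise ValueError.
mmlu_results = evaluator.run_benchmark(
    model_path="google/gemma-2-4b-it",
    benchmark="mmlu",
    num_shots=5,
    max_samples=1000,
    experiment_name="gemma-4b-mmlu",
)
```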