PyPI - fusion-bench - Versions diffs - 0.2.9__py3-none-any.whl - Mend

fusion-bench 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (727) hide show

fusion_bench/method/adamerging/task_wise_adamerging.py ADDED Viewed

@@ -0,0 +1,174 @@
+import logging
+from abc import abstractmethod
+from typing import List, Mapping, Union  # noqa: F401
+import lightning as L
+import numpy as np
+import torch
+import torch.nn as nn
+from omegaconf import DictConfig
+from torch import Tensor
+from torch.utils.data import DataLoader
+from tqdm.autonotebook import tqdm
+from fusion_bench.compat.method import ModelFusionAlgorithm
+from fusion_bench.compat.modelpool import ModelPool
+from fusion_bench.models.wrappers.task_wise_fusion import (
+    TaskWiseMergedModel,
+    get_task_wise_weights,
+)
+log = logging.getLogger(__name__)
+def entropy_loss(logits: Tensor) -> Tensor:
+    """
+    Compute the entropy loss of a set of logits.
+    Args:
+        logits (Tensor): The logits to compute the entropy loss of.
+    Returns:
+        Tensor: The entropy loss of the logits.
+    """
+    probs = torch.softmax(logits, dim=-1)
+    return -torch.sum(probs * torch.log(probs + 1e-8), dim=-1).mean()
+class TaskWiseAdaMergingAlgorithm(ModelFusionAlgorithm):
+    _fabric: L.Fabric = None
+    def __init__(self, algorithm_config: DictConfig):
+        super().__init__(algorithm_config)
+        if self._fabric is None and torch.cuda.is_available():
+            self._fabric = L.Fabric(devices=self.config.get("devices", 1))
+            self._fabric.launch()
+    @torch.no_grad()
+    def construct_task_wise_merged_model(self, modelpool: ModelPool):
+        if self.config.weights is None:
+            task_wise_weight = get_task_wise_weights(
+                num_models=len(modelpool.model_names),
+                init_values=self.config.init_values,
+            )
+        else:
+            if isinstance(self.config.weights, str):
+                # self.config.weights is a path to a .np or .pt file
+                if self.config.weights.endswith(".pt"):
+                    task_wise_weight = torch.load(
+                        self.config.weights, map_location="cpu"
+                    ).detach_()
+                elif self.config.weights.endswith(".np"):
+                    task_wise_weight = torch.from_numpy(
+                        np.load(self.config.weights)
+                    ).detach_()
+                else:
+                    raise ValueError(f"Unsupported file format: {self.config.weights}")
+            else:
+                try:
+                    task_wise_weight = torch.tensor(
+                        list(self.config.weights), dtype=torch.float32
+                    )
+                except ValueError:
+                    raise ValueError(
+                        f"Unsupported weights format: {self.config.weights}"
+                    )
+        pretrained_model = modelpool.load_model("_pretrained_")
+        finetuned_models = [
+            modelpool.load_model(name) for name in modelpool.model_names
+        ]
+        module = TaskWiseMergedModel(
+            task_wise_weight=task_wise_weight,
+            pretrained_model=pretrained_model,
+            finetuned_models=finetuned_models,
+            clamp_weights=self.config.clamp_weights,
+            tie_weights=self.config.tie_weights,
+            strict=self.config.strict,
+        )
+        return module
+    def run(self, modelpool: ModelPool):
+        log.info("Fusing models using task-wise adaptive merging.")
+        self.modelpool = modelpool
+        module = self.construct_task_wise_merged_model(modelpool)
+        if self.config.weights is not None:
+            # skip the test-time adaptation
+            return module.merge_and_unload()
+        else:
+            module = self.test_time_adaptation(module)
+            if self.config.get("save_merging_weights", False):
+                torch.save(module.merge_weight, self.config.save_merging_weights)
+            return module.merge_and_unload()
+    def on_test_time_adaptation_start(self):
+        pass
+    @abstractmethod
+    def get_shuffled_test_loader_iter(self, task: str) -> DataLoader:
+        pass
+    @abstractmethod
+    def compute_logits(self, module: nn.Module, batch, task: str) -> Tensor:
+        """
+        Compute the logits for the given batch and task.
+        Args:
+            module (nn.Module): The model module.
+            batch (tuple): A batch of input data.
+            task (str): The name of the task.
+        Returns:
+            Tensor: The classification logits for the batch.
+        """
+        pass
+    def test_time_adaptation(self, module: TaskWiseMergedModel):
+        self.on_test_time_adaptation_start()
+        # configure optimizer
+        if self.config.optimizer == "adam":
+            optimizer = torch.optim.Adam([module.merge_weight], lr=self.config.lr)
+        else:
+            raise ValueError(f"Unsupported optimizer: {self.config.optimizer}")
+        if self._fabric is not None:
+            module, optimizer = self._fabric.setup(module, optimizer)
+        module.train()
+        module.merge_weights()
+        if self.config.get("fast_dev_run", False):
+            log.info("Running fast_dev_run, only one step")
+            pbar = tqdm(
+                range(1),
+                "AdaMerging Test-time adaptation",
+                dynamic_ncols=True,
+            )
+        else:
+            pbar = tqdm(
+                range(self.config.max_steps),
+                "AdaMerging Test-time adaptation",
+                dynamic_ncols=True,
+            )
+        for step_idx in pbar:
+            for task in self.modelpool.model_names:
+                batch = next(self.get_shuffled_test_loader_iter(task))
+                logits = self.compute_logits(module, batch, task)
+                assert (
+                    logits.dim() == 2
+                ), f"Expected logits to be 2D, got {logits.dim()}"
+                loss = entropy_loss(logits)
+                # .backward() accumulates when .zero_grad() wasn't called
+                # this can save memory
+                self._fabric.backward(loss, retain_graph=True)
+            optimizer.step()
+            optimizer.zero_grad()
+            module.merge_weights()
+        return module

fusion_bench/method/adamerging/utils.py ADDED Viewed

@@ -0,0 +1,15 @@
+import torch
+def get_memory_usage(desc):
+    """
+    obtain the current GPU memory usage
+    Returns:
+        str: A string containing the allocated and cached memory in MB.
+    """
+    allocated = torch.cuda.memory_allocated() / 1024**2  # 转换为 MB
+    cached = torch.cuda.memory_reserved() / 1024**2  # 转换为 MB
+    return (
+        f"{desc}\nAllocated Memory: {allocated:.2f} MB\nCached Memory: {cached:.2f} MB"
+    )

fusion_bench/method/analysis/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ from .task_vector_cos_similarity import TaskVectorCosSimilarity
2	+ from .task_vector_violin_plot import TaskVectorViolinPlot

fusion_bench/method/analysis/task_vector_cos_similarity.py ADDED Viewed

@@ -0,0 +1,172 @@
+import logging
+import os
+from typing import Dict, List, Optional, cast
+import numpy as np
+import pandas as pd
+import torch
+import torch.utils
+from numpy.typing import NDArray
+from torch import nn
+from tqdm.auto import tqdm
+from fusion_bench.method import BaseAlgorithm
+from fusion_bench.mixins import LightningFabricMixin
+from fusion_bench.modelpool import BaseModelPool
+from fusion_bench.utils.parameters import (
+    StateDictType,
+    state_dict_to_vector,
+    trainable_state_dict,
+)
+from fusion_bench.utils.state_dict_arithmetic import state_dict_sub
+log = logging.getLogger(__name__)
+class TaskVectorCosSimilarity(BaseAlgorithm, LightningFabricMixin):
+    """
+    This class is similar to the Dummy algorithm,
+    but it also print (or save) the cosine similarity matrix between the task vectors of the models in the model pool.
+    """
+    _config_mapping = BaseAlgorithm._config_mapping | {
+        "plot_heatmap": "plot_heatmap",
+        "_output_path": "output_path",
+    }
+    def __init__(
+        self,
+        plot_heatmap: bool,
+        trainable_only: bool = True,
+        max_points_per_model: Optional[int] = None,
+        output_path: Optional[str] = None,
+        **kwargs,
+    ):
+        self.plot_heatmap = plot_heatmap
+        self.trainable_only = trainable_only
+        self.max_points_per_model = max_points_per_model
+        self._output_path = output_path
+        super().__init__(**kwargs)
+    @property
+    def output_path(self):
+        if self._output_path is None:
+            return self.fabric.logger.log_dir
+        else:
+            return self._output_path
+    @torch.no_grad()
+    def run(self, modelpool: BaseModelPool):
+        pretrained_model = modelpool.load_pretrained_model()
+        task_vectors = []
+        for name, finetuned_model in tqdm(
+            modelpool.named_models(), total=len(modelpool)
+        ):
+            print(f"computing task vectors for {name}")
+            task_vectors.append(
+                self.get_task_vector(pretrained_model, finetuned_model).to(
+                    torch.float64
+                )
+            )
+        task_vectors = torch.stack(task_vectors, dim=0)
+        cos_sim_matrix = torch.zeros(
+            len(modelpool), len(modelpool), dtype=torch.float64
+        )
+        for i in range(len(modelpool)):
+            for j in range(i, len(modelpool)):
+                assert task_vectors[i].size() == task_vectors[j].size()
+                cos_sim_matrix[i, j] = torch.nn.functional.cosine_similarity(
+                    task_vectors[i], task_vectors[j], dim=0
+                )
+                cos_sim_matrix[j, i] = cos_sim_matrix[i, j]
+        # convert the matrix to a pandas DataFrame
+        cos_sim_df = pd.DataFrame(
+            cos_sim_matrix.numpy(),
+            index=modelpool.model_names,
+            columns=modelpool.model_names,
+        )
+        print(cos_sim_df)
+        if self.output_path is not None:
+            os.makedirs(self.output_path, exist_ok=True)
+            cos_sim_df.to_csv(
+                os.path.join(self.output_path, "task_vector_cos_similarity.csv")
+            )
+        if self.plot_heatmap:
+            self._plot_heatmap(cos_sim_df)
+        return pretrained_model
+    def _plot_heatmap(self, data: pd.DataFrame):
+        """
+        This function plots a heatmap of the provided data using seaborn.
+        Args:
+            data (pd.DataFrame): A pandas DataFrame containing the data to be plotted.
+            figsize (tuple): A tuple specifying the size of the figure. Default is (4, 3).
+        Returns:
+            None
+        """
+        import matplotlib.pyplot as plt
+        import seaborn as sns
+        # Create a heatmap using seaborn
+        plt.figure()
+        sns.heatmap(
+            data,
+            annot=True,
+            fmt=".2f",
+            cmap="GnBu",
+        )
+        # Add title and labels with increased font size
+        plt.title("Heatmap of Cos Similarities", fontsize=14)
+        # plt.xlabel("Task", fontsize=14)
+        # plt.ylabel("Task", fontsize=14)
+        plt.xticks(rotation=45)
+        plt.yticks(rotation=45)
+        # Show plot
+        plt.savefig(
+            os.path.join(self.output_path, "task_vector_cos_similarity.pdf"),
+            bbox_inches="tight",
+        )
+        plt.close()
+    def get_task_vector(
+        self, pretrained_model: nn.Module, finetuned_model: nn.Module
+    ) -> torch.Tensor:
+        task_vector = state_dict_sub(
+            self.get_state_dict(finetuned_model),
+            self.get_state_dict(pretrained_model),
+        )
+        task_vector = state_dict_to_vector(task_vector)
+        task_vector = task_vector.cpu().float().numpy()
+        # downsample if necessary
+        if (
+            self.max_points_per_model is not None
+            and self.max_points_per_model > 0
+            and task_vector.shape[0] > self.max_points_per_model
+        ):
+            log.info(
+                f"Downsampling task vectors to {self.max_points_per_model} points."
+            )
+            indices = np.random.choice(
+                task_vector.shape[0], self.max_points_per_model, replace=False
+            )
+            task_vector = task_vector[indices].copy()
+        task_vector = torch.from_numpy(task_vector)
+        return task_vector
+    def get_state_dict(self, model: nn.Module):
+        if self.trainable_only:
+            return trainable_state_dict(model)
+        else:
+            return model.state_dict()

fusion_bench/method/analysis/task_vector_violin_plot.py ADDED Viewed

@@ -0,0 +1,205 @@
+import logging
+import os
+from typing import Dict, List, Optional, cast
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+import numpy as np
+import seaborn as sns
+import torch
+from numpy.typing import NDArray
+from torch import nn
+from tqdm.auto import tqdm
+from fusion_bench import BaseAlgorithm, BaseModelPool
+from fusion_bench.mixins import LightningFabricMixin, SimpleProfilerMixin
+from fusion_bench.utils import timeit_context
+from fusion_bench.utils.parameters import (
+    StateDictType,
+    state_dict_to_vector,
+    trainable_state_dict,
+)
+from fusion_bench.utils.state_dict_arithmetic import state_dict_sub
+log = logging.getLogger(__name__)
+class TaskVectorViolinPlot(BaseAlgorithm, LightningFabricMixin, SimpleProfilerMixin):
+    R"""
+    Plot violin plots of task vectors as in:
+    [L.Shen, A.Tang, E.Yang et al. Efficient and Effective Weight-Ensembling Mixture of Experts for Multi-Task Model Merging](https://arxiv.org/abs/2410.21804)
+    """
+    # config_mapping is a mapping from the attributes to the key in the configuration files
+    _config_mapping = BaseAlgorithm._config_mapping | {
+        "trainable_only": "trainable_only",
+        "max_points_per_model": "max_points_per_model",
+        "fig_kwargs": "fig_kwargs",
+        "_output_path": "output_path",
+    }
+    def __init__(
+        self,
+        trainable_only: bool,
+        max_points_per_model: Optional[int] = 1000,
+        fig_kwawrgs=None,
+        output_path: Optional[str] = None,
+        **kwargs,
+    ):
+        R"""
+        This class creates violin plots to visualize task vectors, which represent the differences
+        between fine-tuned models and their pretrained base model.
+        Args:
+            trainable_only (bool): If True, only consider trainable parameters when computing
+                task vectors. If False, use all parameters.
+            fig_kwargs (dict, optional): Dictionary of keyword arguments to pass to
+                `matplotlib.pyplot.subplots`. Common options include:
+                - figsize: Tuple of (width, height) in inches
+                - dpi: Dots per inch
+                - facecolor: Figure background color
+                Defaults to None.
+            output_path (str, optional): Path where the violin plot will be saved. If None,
+                uses the fabric logger's log directory. Defaults to None.
+            kwargs: Additional keyword arguments passed to the parent class(es).
+        Example:
+            ```python
+            plotter = TaskVectorViolinPlot(
+                trainable_only=True,
+                fig_kwargs={'figsize': (10, 6), 'dpi': 300},
+                output_path='./plots'
+            )
+            plotter.run(modelpool)
+            ```
+        """
+        self.trainable_only = trainable_only
+        self.fig_kwargs = fig_kwawrgs
+        self.max_points_per_model = max_points_per_model
+        self._output_path = output_path
+        super().__init__(**kwargs)
+    @property
+    def output_path(self):
+        if self._output_path is None:
+            return self.fabric.logger.log_dir
+        else:
+            return self._output_path
+    def run(self, modelpool: BaseModelPool):
+        """Create violin plots of task vectors comparing different fine-tuned models against a pretrained model.
+        This method implements the visualization technique from the paper "Efficient and Effective
+        Weight-Ensembling Mixture of Experts for Multi-Task Model Merging". It:
+        1. Loads the pretrained model
+        2. Computes task vectors (differences between fine-tuned and pretrained models)
+        3. Creates violin plots showing the distribution of values in these task vectors
+        Args:
+            modelpool (BaseModelPool): Model pool containing the pretrained model and fine-tuned models
+        Returns:
+            pretrained_model (nn.Model): The plot is saved to the specified output path.
+        """
+        assert modelpool.has_pretrained
+        pretrained_model = modelpool.load_pretrained_model()
+        # Compute task vectors for each fine-tuned model
+        with torch.no_grad(), timeit_context("Computing task vectors"):
+            task_vectors: Dict[str, NDArray] = {}
+            for name, finetuned_model in tqdm(
+                modelpool.named_models(), total=len(modelpool)
+            ):
+                print(f"computing task vectors for {name}")
+                task_vectors[name] = self.get_task_vector(
+                    pretrained_model, finetuned_model
+                )
+        # === Create violin plot ===
+        fig, ax = plt.subplots(
+            1, 1, **self.fig_kwargs if self.fig_kwargs is not None else {}
+        )
+        fig = cast(plt.Figure, fig)
+        ax = cast(plt.Axes, ax)
+        # Prepare data for plotting
+        data = [values for values in task_vectors.values()]
+        labels = list(task_vectors.keys())
+        # Create violin plot using seaborn
+        with timeit_context("ploting"):
+            sns.violinplot(data=data, ax=ax)
+        # Customize plot
+        ax.set_xticklabels(labels, rotation=45, ha="right")
+        ax.set_ylabel("Task Vector Values")
+        ax.set_title("Distribution of Task Vector Values")
+        # Adjust layout to prevent label cutoff and save plot
+        plt.tight_layout()
+        os.makedirs(self.output_path, exist_ok=True)
+        output_file = f"{self.output_path}/task_vector_violin.pdf"
+        plt.savefig(output_file, bbox_inches="tight")
+        plt.close(fig)
+        # === Create violin plot (Abs values) ===
+        fig, ax = plt.subplots(
+            1, 1, **self.fig_kwargs if self.fig_kwargs is not None else {}
+        )
+        fig = cast(plt.Figure, fig)
+        ax = cast(plt.Axes, ax)
+        # Prepare data for plotting
+        data = [np.abs(values) for values in task_vectors.values()]
+        labels = list(task_vectors.keys())
+        # Create violin plot using seaborn
+        with timeit_context("ploting abs value plot"):
+            sns.violinplot(data=data, ax=ax)
+        # Customize plot
+        ax.set_xticklabels(labels, rotation=45, ha="right")
+        ax.set_ylabel("The Absolute Values")
+        ax.set_title("Distribution of Task Vector Absolute Values")
+        # Adjust layout to prevent label cutoff and save plot
+        plt.tight_layout()
+        os.makedirs(self.output_path, exist_ok=True)
+        output_file = f"{self.output_path}/task_vector_violin_abs.pdf"
+        plt.savefig(output_file, bbox_inches="tight")
+        plt.close(fig)
+        return pretrained_model
+    def get_task_vector(self, pretrained_model, finetuned_model):
+        task_vector = state_dict_sub(
+            self.get_state_dict(finetuned_model),
+            self.get_state_dict(pretrained_model),
+        )
+        task_vector = state_dict_to_vector(task_vector)
+        task_vector = task_vector.cpu().float().numpy()
+        # downsample if necessary
+        if (
+            self.max_points_per_model is not None
+            and self.max_points_per_model > 0
+            and task_vector.shape[0] > self.max_points_per_model
+        ):
+            log.info(
+                f"Downsampling task vectors to {self.max_points_per_model} points."
+            )
+            indices = np.random.choice(
+                task_vector.shape[0], self.max_points_per_model, replace=False
+            )
+            task_vector = task_vector[indices].copy()
+        return task_vector
+    def get_state_dict(self, model: nn.Module):
+        if self.trainable_only:
+            return trainable_state_dict(model)
+        else:
+            return model.state_dict()

fusion_bench/method/base_algorithm.py ADDED Viewed

@@ -0,0 +1,44 @@
+import logging
+from abc import abstractmethod
+from typing import Optional  # noqa: F401
+from fusion_bench.mixins import BaseYAMLSerializableModel
+from fusion_bench.modelpool import BaseModelPool
+__all__ = ["BaseAlgorithm", "BaseModelFusionAlgorithm"]
+log = logging.getLogger(__name__)
+class BaseAlgorithm(BaseYAMLSerializableModel):
+    """
+    Base class for model fusion algorithms.
+    This class provides a template for implementing model fusion algorithms.
+    Subclasses must implement the `run` method to define the fusion logic.
+    """
+    _program = None
+    @abstractmethod
+    def run(self, modelpool: BaseModelPool):
+        """
+        Fuse the models in the given model pool.
+        This method must be implemented by subclasses to define the fusion logic.
+        Examples:
+            >>> algorithm = SimpleAverageAlgorithm()
+            >>> modelpool = ModelPool()
+            >>> merged_model = algorithm.run(modelpool)
+        Args:
+            modelpool (BaseModelPool): The pool of models to fuse.
+        """
+        pass
+BaseModelFusionAlgorithm = BaseAlgorithm
+"""
+Alias for `BaseAlgorithm`.
+"""

fusion_bench/method/classification/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+# flake8: noqa F401
+from .clip_finetune import ImageClassificationFineTuningForCLIP
+from .continual_clip_finetune import ContinualImageClassificationFineTuningForCLIP