fusion-bench 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/__init__.py +20 -0
- fusion_bench/__main__.py +4 -0
- fusion_bench/compat/__init__.py +0 -0
- fusion_bench/compat/method/__init__.py +109 -0
- fusion_bench/compat/method/base_algorithm.py +58 -0
- fusion_bench/compat/modelpool/AutoModelForSeq2SeqLM.py +34 -0
- fusion_bench/compat/modelpool/__init__.py +116 -0
- fusion_bench/compat/modelpool/base_pool.py +328 -0
- fusion_bench/compat/modelpool/huggingface_clip_vision.py +178 -0
- fusion_bench/compat/taskpool/__init__.py +95 -0
- fusion_bench/compat/taskpool/base_pool.py +111 -0
- fusion_bench/compat/taskpool/clip_image_classification.py +210 -0
- fusion_bench/compat/taskpool/flan_t5_glue_text_generation.py +175 -0
- fusion_bench/constants/__init__.py +2 -0
- fusion_bench/constants/paths.py +18 -0
- fusion_bench/dataset/__init__.py +29 -0
- fusion_bench/dataset/arc_agi/__init__.py +6 -0
- fusion_bench/dataset/arc_agi/arc.py +308 -0
- fusion_bench/dataset/arc_agi/arc_agi.py +365 -0
- fusion_bench/dataset/arc_agi/augmenters.py +1036 -0
- fusion_bench/dataset/arc_agi/messagers.py +1355 -0
- fusion_bench/dataset/arc_agi/np_cache.py +168 -0
- fusion_bench/dataset/arc_agi/preprocess.py +298 -0
- fusion_bench/dataset/arc_agi/representers.py +1019 -0
- fusion_bench/dataset/clip_dataset.py +71 -0
- fusion_bench/dataset/fer2013.py +12 -0
- fusion_bench/dataset/gpt2_glue.py +300 -0
- fusion_bench/dataset/gsm8k.py +60 -0
- fusion_bench/dataset/image_dataset.py +55 -0
- fusion_bench/dataset/imdb.py +11 -0
- fusion_bench/dataset/llama/__init__.py +1 -0
- fusion_bench/dataset/llama/alpaca.py +232 -0
- fusion_bench/dataset/llama/collate.py +120 -0
- fusion_bench/dataset/llama/metamathqa.py +50 -0
- fusion_bench/dataset/llama/openai.py +160 -0
- fusion_bench/dataset/llama/preference_700k.py +70 -0
- fusion_bench/dataset/llama/sharegpt.py +141 -0
- fusion_bench/dataset/llama/squad.py +125 -0
- fusion_bench/dataset/llama/stanford_shp.py +90 -0
- fusion_bench/dataset/llama/ultrachat.py +58 -0
- fusion_bench/dataset/llama/utils/__init__.py +0 -0
- fusion_bench/dataset/llama/wikitext.py +89 -0
- fusion_bench/dataset/nyuv2.py +119 -0
- fusion_bench/method/__init__.py +177 -0
- fusion_bench/method/ada_svd/__init__.py +2 -0
- fusion_bench/method/ada_svd/clip_vision.py +319 -0
- fusion_bench/method/adamerging/__init__.py +6 -0
- fusion_bench/method/adamerging/clip_layer_wise_adamerging.py +46 -0
- fusion_bench/method/adamerging/clip_task_wise_adamerging.py +187 -0
- fusion_bench/method/adamerging/entropy_loss.py +25 -0
- fusion_bench/method/adamerging/flan_t5_layer_wise_adamerging.py +332 -0
- fusion_bench/method/adamerging/gpt2_layer_wise_adamerging.py +351 -0
- fusion_bench/method/adamerging/layer_wise_adamerging.py +252 -0
- fusion_bench/method/adamerging/llama_adamerging.py +335 -0
- fusion_bench/method/adamerging/min_norm_solvers.py +227 -0
- fusion_bench/method/adamerging/task_wise_adamerging.py +174 -0
- fusion_bench/method/adamerging/utils.py +15 -0
- fusion_bench/method/analysis/__init__.py +2 -0
- fusion_bench/method/analysis/task_vector_cos_similarity.py +172 -0
- fusion_bench/method/analysis/task_vector_violin_plot.py +205 -0
- fusion_bench/method/base_algorithm.py +44 -0
- fusion_bench/method/classification/__init__.py +3 -0
- fusion_bench/method/classification/clip_finetune.py +444 -0
- fusion_bench/method/classification/continual_clip_finetune.py +297 -0
- fusion_bench/method/concrete_subspace/__init__.py +6 -0
- fusion_bench/method/concrete_subspace/clip_concrete_adamerging.py +595 -0
- fusion_bench/method/concrete_subspace/clip_concrete_task_arithmetic.py +263 -0
- fusion_bench/method/dare/__init__.py +4 -0
- fusion_bench/method/dare/simple_average.py +31 -0
- fusion_bench/method/dare/task_arithmetic.py +82 -0
- fusion_bench/method/dare/ties_merging.py +100 -0
- fusion_bench/method/dare/utils.py +87 -0
- fusion_bench/method/dawe/__init__.py +2 -0
- fusion_bench/method/dawe/dawe_for_clip.py +274 -0
- fusion_bench/method/dawe/warppers/__init__.py +13 -0
- fusion_bench/method/dawe/warppers/dawe_model.py +256 -0
- fusion_bench/method/depth_upscaling/__init__.py +3 -0
- fusion_bench/method/depth_upscaling/depth_upscaling.py +89 -0
- fusion_bench/method/depth_upscaling/depth_upscaling_for_llama.py +57 -0
- fusion_bench/method/dummy.py +35 -0
- fusion_bench/method/ensemble.py +98 -0
- fusion_bench/method/fisher_merging/__init__.py +4 -0
- fusion_bench/method/fisher_merging/clip_fisher_merging.py +191 -0
- fusion_bench/method/fisher_merging/fisher_merging.py +484 -0
- fusion_bench/method/fisher_merging/gpt2_fisher_merging.py +193 -0
- fusion_bench/method/linear/__init__.py +6 -0
- fusion_bench/method/linear/expo.py +118 -0
- fusion_bench/method/linear/linear_interpolation.py +60 -0
- fusion_bench/method/linear/llama_expo.py +229 -0
- fusion_bench/method/linear/simple_average_for_llama.py +54 -0
- fusion_bench/method/linear/task_arithmetic_for_llama.py +57 -0
- fusion_bench/method/lm_finetune/__init__.py +3 -0
- fusion_bench/method/lm_finetune/bradley_terry_rm.py +432 -0
- fusion_bench/method/lm_finetune/causal_lm_pretrain.py +7 -0
- fusion_bench/method/lm_finetune/fullfinetune_sft.py +375 -0
- fusion_bench/method/lm_finetune/peftfinetune_sft.py +370 -0
- fusion_bench/method/mixture_of_experts/__init__.py +7 -0
- fusion_bench/method/mixture_of_experts/mixtral_merging.py +112 -0
- fusion_bench/method/mixture_of_experts/mixtral_upcycling.py +329 -0
- fusion_bench/method/model_recombination.py +121 -0
- fusion_bench/method/opcm/__init__.py +4 -0
- fusion_bench/method/opcm/opcm.py +277 -0
- fusion_bench/method/opcm/task_arithmetic.py +115 -0
- fusion_bench/method/opcm/ties_merging.py +156 -0
- fusion_bench/method/opcm/utils.py +73 -0
- fusion_bench/method/opcm/weight_average.py +120 -0
- fusion_bench/method/pruning/__init__.py +5 -0
- fusion_bench/method/pruning/llama_magnitude_prune.py +202 -0
- fusion_bench/method/pruning/llama_random_prune.py +143 -0
- fusion_bench/method/pruning/llama_wanda_prune.py +359 -0
- fusion_bench/method/pruning/magnitude_diff_pruning.py +180 -0
- fusion_bench/method/pruning/prune_utils.py +165 -0
- fusion_bench/method/pruning/wanda_utils/__init__.py +7 -0
- fusion_bench/method/pruning/wanda_utils/ablate.py +188 -0
- fusion_bench/method/pruning/wanda_utils/data.py +135 -0
- fusion_bench/method/pruning/wanda_utils/eval.py +245 -0
- fusion_bench/method/pruning/wanda_utils/layerwrapper.py +61 -0
- fusion_bench/method/pruning/wanda_utils/prune.py +581 -0
- fusion_bench/method/pruning/wanda_utils/prune_opt.py +539 -0
- fusion_bench/method/pruning/wanda_utils/sparsegpt.py +165 -0
- fusion_bench/method/pwe_moe/__init__.py +5 -0
- fusion_bench/method/pwe_moe/clip_pwe_moe.py +315 -0
- fusion_bench/method/pwe_moe/module.py +316 -0
- fusion_bench/method/pwe_moe/phn/__init__.py +2 -0
- fusion_bench/method/pwe_moe/phn/solvers.py +195 -0
- fusion_bench/method/pwe_moe/utils.py +43 -0
- fusion_bench/method/rankone_moe/__init__.py +3 -0
- fusion_bench/method/rankone_moe/clip_rankone_moe.py +160 -0
- fusion_bench/method/rankone_moe/rankone_moe.py +249 -0
- fusion_bench/method/regmean/__init__.py +4 -0
- fusion_bench/method/regmean/clip_regmean.py +131 -0
- fusion_bench/method/regmean/gpt2_regmean.py +147 -0
- fusion_bench/method/regmean/regmean.py +375 -0
- fusion_bench/method/simple_average.py +112 -0
- fusion_bench/method/slerp/__init__.py +2 -0
- fusion_bench/method/slerp/slerp.py +101 -0
- fusion_bench/method/slerp/slerp_utils.py +107 -0
- fusion_bench/method/smile_upscaling/__init__.py +3 -0
- fusion_bench/method/smile_upscaling/singular_projection_merging.py +198 -0
- fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py +331 -0
- fusion_bench/method/smile_upscaling/smile_upscaling.py +573 -0
- fusion_bench/method/sparse_we_moe/__init__.py +2 -0
- fusion_bench/method/sparse_we_moe/sparse_clip_we_moe.py +248 -0
- fusion_bench/method/sparse_we_moe/sparse_we_moe.py +301 -0
- fusion_bench/method/sparselo/__init__.py +2 -0
- fusion_bench/method/sparselo/sparselo.py +955 -0
- fusion_bench/method/surgery/__init__.py +1 -0
- fusion_bench/method/surgery/clip_layer_wise_adamerging_surgery.py +157 -0
- fusion_bench/method/tall_mask/__init__.py +0 -0
- fusion_bench/method/tall_mask/utils.py +234 -0
- fusion_bench/method/task_arithmetic/__init__.py +2 -0
- fusion_bench/method/task_arithmetic/task_arithmetic.py +151 -0
- fusion_bench/method/task_singular_vector/TSVC.py +16 -0
- fusion_bench/method/task_singular_vector/TSVM.py +63 -0
- fusion_bench/method/task_singular_vector/__init__.py +9 -0
- fusion_bench/method/task_singular_vector/utils/TSVC_utils.py +50 -0
- fusion_bench/method/task_singular_vector/utils/TSVM_utils.py +640 -0
- fusion_bench/method/task_singular_vector/utils/__init__.py +7 -0
- fusion_bench/method/ties_merging/__init__.py +2 -0
- fusion_bench/method/ties_merging/ties_merging.py +117 -0
- fusion_bench/method/ties_merging/ties_merging_utils.py +331 -0
- fusion_bench/method/trust_region/__init__.py +2 -0
- fusion_bench/method/trust_region/clip_task_arithmetic.py +205 -0
- fusion_bench/method/trust_region/utils.py +58 -0
- fusion_bench/method/we_moe/__init__.py +2 -0
- fusion_bench/method/we_moe/clip_we_moe.py +161 -0
- fusion_bench/method/we_moe/we_moe.py +247 -0
- fusion_bench/method/weighted_average/__init__.py +3 -0
- fusion_bench/method/weighted_average/llama.py +113 -0
- fusion_bench/method/weighted_average/weighted_average.py +102 -0
- fusion_bench/metrics/__init__.py +0 -0
- fusion_bench/metrics/continual_learning/backward_transfer.py +22 -0
- fusion_bench/metrics/nyuv2/__init__.py +11 -0
- fusion_bench/metrics/nyuv2/depth.py +45 -0
- fusion_bench/metrics/nyuv2/loss.py +31 -0
- fusion_bench/metrics/nyuv2/noise.py +16 -0
- fusion_bench/metrics/nyuv2/normal.py +48 -0
- fusion_bench/metrics/nyuv2/segmentation.py +43 -0
- fusion_bench/metrics/text_to_image_generation/__init__.py +9 -0
- fusion_bench/metrics/text_to_image_generation/aesthetic_scorer.py +123 -0
- fusion_bench/metrics/text_to_image_generation/compressibility.py +49 -0
- fusion_bench/metrics/text_to_image_generation/pickscore_scorer.py +95 -0
- fusion_bench/mixins/__init__.py +28 -0
- fusion_bench/mixins/clip_classification.py +252 -0
- fusion_bench/mixins/fabric_training.py +320 -0
- fusion_bench/mixins/lightning_fabric.py +174 -0
- fusion_bench/mixins/optim/__init__.py +0 -0
- fusion_bench/mixins/optim/adamw_with_warmup.py +42 -0
- fusion_bench/mixins/rich_live.py +21 -0
- fusion_bench/mixins/serialization.py +132 -0
- fusion_bench/mixins/simple_profiler.py +79 -0
- fusion_bench/modelpool/PeftModelForSeq2SeqLM.py +49 -0
- fusion_bench/modelpool/__init__.py +42 -0
- fusion_bench/modelpool/base_pool.py +268 -0
- fusion_bench/modelpool/causal_lm/__init__.py +2 -0
- fusion_bench/modelpool/causal_lm/causal_lm.py +139 -0
- fusion_bench/modelpool/clip_vision/__init__.py +1 -0
- fusion_bench/modelpool/clip_vision/modelpool.py +145 -0
- fusion_bench/modelpool/huggingface_automodel.py +20 -0
- fusion_bench/modelpool/huggingface_gpt2_classification.py +63 -0
- fusion_bench/modelpool/nyuv2_modelpool.py +40 -0
- fusion_bench/modelpool/seq2seq_lm/__init__.py +2 -0
- fusion_bench/modelpool/seq2seq_lm/modelpool.py +65 -0
- fusion_bench/modelpool/seq_classification_lm/__init__.py +2 -0
- fusion_bench/modelpool/seq_classification_lm/reward_model.py +15 -0
- fusion_bench/modelpool/seq_classification_lm/seq_classification_lm.py +98 -0
- fusion_bench/models/__init__.py +3 -0
- fusion_bench/models/chat_templates/__init__.py +1 -0
- fusion_bench/models/chat_templates/llama_3_Instruct.py +1 -0
- fusion_bench/models/chat_templates/load_tokenizer.py +43 -0
- fusion_bench/models/hf_clip.py +199 -0
- fusion_bench/models/linearized/__init__.py +0 -0
- fusion_bench/models/linearized/linearized_model_utils.py +91 -0
- fusion_bench/models/linearized/vision_model.py +122 -0
- fusion_bench/models/llama/__init__.py +16 -0
- fusion_bench/models/llama/model_utils/__init__.py +0 -0
- fusion_bench/models/llama/model_utils/embedding.py +87 -0
- fusion_bench/models/llama/model_utils/liger_kernel.py +86 -0
- fusion_bench/models/llama/model_utils/misc.py +112 -0
- fusion_bench/models/llama/model_utils/mod.py +52 -0
- fusion_bench/models/llama/model_utils/visual.py +241 -0
- fusion_bench/models/llama/patcher.py +78 -0
- fusion_bench/models/llama/tokenizer_loader.py +153 -0
- fusion_bench/models/masks/__init__.py +2 -0
- fusion_bench/models/masks/mask_model.py +160 -0
- fusion_bench/models/modeling_losparse_llama/__init__.py +4 -0
- fusion_bench/models/modeling_losparse_llama/configuration_losparse_llama.py +205 -0
- fusion_bench/models/modeling_losparse_llama/losparse_linear.py +67 -0
- fusion_bench/models/modeling_losparse_llama/modeling_losparse_llama.py +1825 -0
- fusion_bench/models/modeling_losparse_llama/register.py +8 -0
- fusion_bench/models/modeling_losparse_llama/utils.py +60 -0
- fusion_bench/models/modeling_smile_mistral/__init__.py +48 -0
- fusion_bench/models/modeling_smile_mistral/configuration_smile_mistral.py +21 -0
- fusion_bench/models/modeling_smile_mistral/modeling_smile_mistral.py +1034 -0
- fusion_bench/models/modeling_smile_mistral/register.py +8 -0
- fusion_bench/models/nyuv2/__init__.py +0 -0
- fusion_bench/models/nyuv2/aspp.py +82 -0
- fusion_bench/models/nyuv2/lightning_module.py +176 -0
- fusion_bench/models/nyuv2/resnet.py +405 -0
- fusion_bench/models/nyuv2/resnet_dilated.py +99 -0
- fusion_bench/models/parameter_dict.py +75 -0
- fusion_bench/models/rankone_moe.py +410 -0
- fusion_bench/models/separate_io.py +105 -0
- fusion_bench/models/smile_moe/__init__.py +0 -0
- fusion_bench/models/smile_moe/linear.py +256 -0
- fusion_bench/models/sparse_we_moe.py +459 -0
- fusion_bench/models/surgery/__init__.py +1 -0
- fusion_bench/models/surgery/surgerymodelwrapper.py +158 -0
- fusion_bench/models/utils.py +80 -0
- fusion_bench/models/we_moe.py +247 -0
- fusion_bench/models/wrappers/__init__.py +0 -0
- fusion_bench/models/wrappers/ensemble.py +183 -0
- fusion_bench/models/wrappers/layer_wise_fusion.py +336 -0
- fusion_bench/models/wrappers/task_wise_fusion.py +249 -0
- fusion_bench/optim/__init__.py +2 -0
- fusion_bench/optim/exception.py +47 -0
- fusion_bench/optim/lr_scheduler/__init__.py +1 -0
- fusion_bench/optim/lr_scheduler/linear_warmup.py +222 -0
- fusion_bench/optim/lr_scheduler/utils/__init__.py +1 -0
- fusion_bench/optim/lr_scheduler/utils/visualization.py +119 -0
- fusion_bench/optim/mezo.py +118 -0
- fusion_bench/programs/__init__.py +20 -0
- fusion_bench/programs/base_program.py +9 -0
- fusion_bench/programs/fabric_fusion_program.py +299 -0
- fusion_bench/scripts/__init__.py +0 -0
- fusion_bench/scripts/cli.py +43 -0
- fusion_bench/scripts/clip/__init__.py +0 -0
- fusion_bench/scripts/clip/convert_checkpoint.py +39 -0
- fusion_bench/scripts/imgui.py +218 -0
- fusion_bench/scripts/nyuv2_mtl_train.py +137 -0
- fusion_bench/scripts/webui.py +405 -0
- fusion_bench/taskpool/__init__.py +39 -0
- fusion_bench/taskpool/base_pool.py +35 -0
- fusion_bench/taskpool/clip_vision/__init__.py +4 -0
- fusion_bench/taskpool/clip_vision/clip_rankone_moe_taskpool.py +112 -0
- fusion_bench/taskpool/clip_vision/clip_sparse_wemoe_taskpool.py +120 -0
- fusion_bench/taskpool/clip_vision/taskpool.py +392 -0
- fusion_bench/taskpool/dummy.py +58 -0
- fusion_bench/taskpool/gpt2_text_classification.py +149 -0
- fusion_bench/taskpool/llama/__init__.py +1 -0
- fusion_bench/taskpool/llama/reward_model.py +157 -0
- fusion_bench/taskpool/llama/test_generation.py +185 -0
- fusion_bench/taskpool/nyuv2_taskpool.py +65 -0
- fusion_bench/tasks/__init__.py +2 -0
- fusion_bench/tasks/base_task.py +18 -0
- fusion_bench/tasks/classification.py +75 -0
- fusion_bench/tasks/clip_classification/__init__.py +183 -0
- fusion_bench/tasks/clip_classification/cifar10.py +33 -0
- fusion_bench/tasks/clip_classification/cifar100.py +146 -0
- fusion_bench/tasks/clip_classification/clip_dataset.py +1 -0
- fusion_bench/tasks/clip_classification/cub_200_2011.py +208 -0
- fusion_bench/tasks/clip_classification/dtd.py +60 -0
- fusion_bench/tasks/clip_classification/emnist_letters.py +31 -0
- fusion_bench/tasks/clip_classification/emnist_mnist.py +5 -0
- fusion_bench/tasks/clip_classification/eurosat.py +18 -0
- fusion_bench/tasks/clip_classification/fashion_mnist.py +18 -0
- fusion_bench/tasks/clip_classification/fer2013.py +18 -0
- fusion_bench/tasks/clip_classification/flower102.py +106 -0
- fusion_bench/tasks/clip_classification/food101.py +105 -0
- fusion_bench/tasks/clip_classification/gtsrb.py +51 -0
- fusion_bench/tasks/clip_classification/imagenet.py +2103 -0
- fusion_bench/tasks/clip_classification/kmnist.py +17 -0
- fusion_bench/tasks/clip_classification/mnist.py +5 -0
- fusion_bench/tasks/clip_classification/mongo_leaf_disease.py +19 -0
- fusion_bench/tasks/clip_classification/oxford_iiit_pet.py +41 -0
- fusion_bench/tasks/clip_classification/pcam.py +5 -0
- fusion_bench/tasks/clip_classification/rendered_sst2.py +3 -0
- fusion_bench/tasks/clip_classification/resisc45.py +68 -0
- fusion_bench/tasks/clip_classification/stanford_cars.py +209 -0
- fusion_bench/tasks/clip_classification/stl10.py +17 -0
- fusion_bench/tasks/clip_classification/sun397.py +404 -0
- fusion_bench/tasks/clip_classification/svhn.py +5 -0
- fusion_bench/tasks/clip_classification/tiny_imagenet.py +208 -0
- fusion_bench/tasks/flan_t5_text_generation/__init__.py +0 -0
- fusion_bench/tasks/flan_t5_text_generation/datasets_preprocess.py +71 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_evaluation.py +132 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_load_dataset.py +64 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py +379 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_prompt_templates.py +52 -0
- fusion_bench/utils/__init__.py +14 -0
- fusion_bench/utils/auto.py +31 -0
- fusion_bench/utils/cache_utils.py +58 -0
- fusion_bench/utils/data.py +165 -0
- fusion_bench/utils/devices.py +231 -0
- fusion_bench/utils/dict.py +43 -0
- fusion_bench/utils/dtype.py +146 -0
- fusion_bench/utils/expr.py +90 -0
- fusion_bench/utils/fabric.py +17 -0
- fusion_bench/utils/functools.py +37 -0
- fusion_bench/utils/hydra_utils.py +28 -0
- fusion_bench/utils/instantiate.py +450 -0
- fusion_bench/utils/json.py +93 -0
- fusion_bench/utils/lazy_imports.py +74 -0
- fusion_bench/utils/misc.py +18 -0
- fusion_bench/utils/packages.py +84 -0
- fusion_bench/utils/parameters.py +323 -0
- fusion_bench/utils/path.py +22 -0
- fusion_bench/utils/plot/__init__.py +0 -0
- fusion_bench/utils/plot/color_data.py +1726 -0
- fusion_bench/utils/plot/token.py +52 -0
- fusion_bench/utils/plot/token_notebook.py +127 -0
- fusion_bench/utils/pylogger.py +55 -0
- fusion_bench/utils/rich_utils.py +201 -0
- fusion_bench/utils/set.py +8 -0
- fusion_bench/utils/state_dict_arithmetic.py +297 -0
- fusion_bench/utils/strenum/__init__.py +326 -0
- fusion_bench/utils/strenum/_name_mangler.py +127 -0
- fusion_bench/utils/strenum/_version.py +556 -0
- fusion_bench/utils/tensorboard.py +51 -0
- fusion_bench/utils/timer.py +49 -0
- fusion_bench/utils/type.py +34 -0
- fusion_bench-0.2.9.dist-info/LICENSE +21 -0
- fusion_bench-0.2.9.dist-info/METADATA +258 -0
- fusion_bench-0.2.9.dist-info/RECORD +727 -0
- fusion_bench-0.2.9.dist-info/WHEEL +5 -0
- fusion_bench-0.2.9.dist-info/entry_points.txt +3 -0
- fusion_bench-0.2.9.dist-info/top_level.txt +1 -0
- fusion_bench_config/README.md +12 -0
- fusion_bench_config/clip-vit-base-patch32_robustness_corrupted.yaml +23 -0
- fusion_bench_config/dataset/image_classification/README.md +6 -0
- fusion_bench_config/dataset/image_classification/test/TALL14.yaml +20 -0
- fusion_bench_config/dataset/image_classification/test/TALL20.yaml +28 -0
- fusion_bench_config/dataset/image_classification/test/cifar10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/cifar100.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/cub-200-2011.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/dtd.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/emnist_letters.yaml +5 -0
- fusion_bench_config/dataset/image_classification/test/emnist_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/eurosat.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/fashion_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/fer2013.yaml +3 -0
- fusion_bench_config/dataset/image_classification/test/food101.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/gtsrb.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/kmnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/mango-leaf-disease.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/oxford-iiit-pet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/oxford_flowers102.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/pcam.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/rendered-sst2.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/resisc45.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/stanford-cars.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/stl10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/sun397.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/svhn.yaml +6 -0
- fusion_bench_config/dataset/image_classification/test/the_eight_tasks.yaml +9 -0
- fusion_bench_config/dataset/image_classification/test/tiny-imagenet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/TALL14.yaml +20 -0
- fusion_bench_config/dataset/image_classification/train/TALL20.yaml +28 -0
- fusion_bench_config/dataset/image_classification/train/cifar10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/cifar100.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/cub-200-2011.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/dtd.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/emnist_letters.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/emnist_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/eurosat.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/fashion_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/fer2013.yaml +3 -0
- fusion_bench_config/dataset/image_classification/train/food101.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/gtsrb.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/kmnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/mango-leaf-disease.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/oxford-iiit-pet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/oxford_flowers102.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/pcam.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/rendered-sst2.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/resisc45.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/stanford-cars.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/stl10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/sun397.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/svhn.yaml +6 -0
- fusion_bench_config/dataset/image_classification/train/the_eight_tasks.yaml +9 -0
- fusion_bench_config/dataset/image_classification/train/tiny-imagenet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/val/dtd.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/eurosat.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/gtsrb.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/mnist.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/resisc45.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/stanford-cars.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/sun397.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/svhn.yaml +12 -0
- fusion_bench_config/dataset/image_classification/val/the_eight_tasks.yaml +9 -0
- fusion_bench_config/dataset/llm_sft/alpaca_cleaned.yaml +6 -0
- fusion_bench_config/dataset/llm_sft/ultrachat_200k.yaml +3 -0
- fusion_bench_config/dataset/question_answering/search_qa.yaml +6 -0
- fusion_bench_config/dataset/question_answering/test/search_qa.yaml +7 -0
- fusion_bench_config/dataset/question_answering/train/MetaMathQA.yaml +4 -0
- fusion_bench_config/dataset/question_answering/train/search_qa.yaml +7 -0
- fusion_bench_config/dataset/question_answering/val/search_qa.yaml +7 -0
- fusion_bench_config/dataset/summarization/test/xsum.yaml +4 -0
- fusion_bench_config/dataset/summarization/train/xsum.yaml +4 -0
- fusion_bench_config/dataset/summarization/val/xsum.yaml +4 -0
- fusion_bench_config/dataset/summarization/xsum.yaml +3 -0
- fusion_bench_config/dataset/text_generation/test/gsm-hard.yaml +4 -0
- fusion_bench_config/dataset/text_generation/test/gsm8k.yaml +5 -0
- fusion_bench_config/dataset/text_generation/test/gsm8k_question_label.yaml +3 -0
- fusion_bench_config/dataset/text_generation/train/CodeAlpaca-20k.yaml +4 -0
- fusion_bench_config/dataset/text_generation/train/gsm8k.yaml +5 -0
- fusion_bench_config/dataset/text_generation/train/gsm8k_question_label.yaml +3 -0
- fusion_bench_config/fabric/auto.yaml +16 -0
- fusion_bench_config/fabric/llama_ddp.yaml +18 -0
- fusion_bench_config/fabric/llama_fsdp.yaml +16 -0
- fusion_bench_config/fabric/llama_peft_fsdp.yaml +16 -0
- fusion_bench_config/fabric/loggers/csv_logger.yaml +11 -0
- fusion_bench_config/fabric/loggers/tensorboard_logger.yaml +11 -0
- fusion_bench_config/fabric/loggers/wandb_logger.yaml +2 -0
- fusion_bench_config/fabric/strategy/deepspeed.yaml +10 -0
- fusion_bench_config/fabric/strategy/llama_fsdp.yaml +8 -0
- fusion_bench_config/fabric/strategy/llama_peft_fsdp.yaml +9 -0
- fusion_bench_config/fabric_model_fusion.yaml +20 -0
- fusion_bench_config/hydra/default.yaml +8 -0
- fusion_bench_config/hydra/help/fusion_bench_help.yaml +47 -0
- fusion_bench_config/hydra/job_logging/rich_logging.yaml +20 -0
- fusion_bench_config/llama_full_finetune.yaml +19 -0
- fusion_bench_config/llama_magnitude_pruning.yaml +16 -0
- fusion_bench_config/llama_model_fusion.yaml +17 -0
- fusion_bench_config/method/ada_svd/clip_vision.yaml +9 -0
- fusion_bench_config/method/adamerging/clip.yaml +23 -0
- fusion_bench_config/method/adamerging/layer_wise_flan_t5.yaml +23 -0
- fusion_bench_config/method/adamerging/layer_wise_gpt2.yaml +23 -0
- fusion_bench_config/method/adamerging/llama_sft.yaml +33 -0
- fusion_bench_config/method/adamerging.yaml +23 -0
- fusion_bench_config/method/analysis/task_vector_cos_similarity.yaml +6 -0
- fusion_bench_config/method/analysis/task_vector_violin_plot.yaml +6 -0
- fusion_bench_config/method/classification/clip_continual_finetune.yaml +28 -0
- fusion_bench_config/method/classification/clip_finetune.yaml +26 -0
- fusion_bench_config/method/clip_finetune.yaml +26 -0
- fusion_bench_config/method/concrete_subspace/clip_concrete_layer_wise_adamerging.yaml +27 -0
- fusion_bench_config/method/concrete_subspace/clip_concrete_task_arithmetic.yaml +25 -0
- fusion_bench_config/method/concrete_subspace/clip_concrete_task_wise_adamerging.yaml +27 -0
- fusion_bench_config/method/dare/simple_average.yaml +5 -0
- fusion_bench_config/method/dare/task_arithmetic.yaml +6 -0
- fusion_bench_config/method/dare/ties_merging.yaml +15 -0
- fusion_bench_config/method/dawe/dawe_for_clip.yaml +32 -0
- fusion_bench_config/method/depth_upscaling.yaml +5 -0
- fusion_bench_config/method/dummy.yaml +1 -0
- fusion_bench_config/method/ensemble/max_model_predictor.yaml +1 -0
- fusion_bench_config/method/ensemble/simple_ensemble.yaml +2 -0
- fusion_bench_config/method/ensemble/weighted_ensemble.yaml +6 -0
- fusion_bench_config/method/fisher_merging/clip_fisher_merging.yaml +13 -0
- fusion_bench_config/method/fisher_merging/fisher_merging.yaml +9 -0
- fusion_bench_config/method/fisher_merging/gpt2_fisher_merging.yaml +12 -0
- fusion_bench_config/method/linear/expo.yaml +8 -0
- fusion_bench_config/method/linear/linear_interpolation.yaml +3 -0
- fusion_bench_config/method/linear/llama_expo.yaml +19 -0
- fusion_bench_config/method/linear/llama_expo_with_dare.yaml +19 -0
- fusion_bench_config/method/linear/simple_average_for_llama.yaml +5 -0
- fusion_bench_config/method/linear/task_arithmetic_for_llama.yaml +4 -0
- fusion_bench_config/method/linear/weighted_average.yaml +6 -0
- fusion_bench_config/method/linear/weighted_average_for_llama.yaml +12 -0
- fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml +47 -0
- fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml +47 -0
- fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml +63 -0
- fusion_bench_config/method/mixtral_moe_merging.yaml +4 -0
- fusion_bench_config/method/mixtral_moe_upscaling.yaml +7 -0
- fusion_bench_config/method/model_recombination.yaml +4 -0
- fusion_bench_config/method/opcm/opcm.yaml +12 -0
- fusion_bench_config/method/opcm/task_arithmetic.yaml +12 -0
- fusion_bench_config/method/opcm/ties_merging.yaml +18 -0
- fusion_bench_config/method/opcm/weight_average.yaml +10 -0
- fusion_bench_config/method/pruning/llama_magnitude_pruning.yaml +14 -0
- fusion_bench_config/method/pruning/llama_random_pruning.yaml +9 -0
- fusion_bench_config/method/pruning/llama_wanda_pruning.yaml +16 -0
- fusion_bench_config/method/pruning/magnitude_diff_pruning.yaml +5 -0
- fusion_bench_config/method/pwe_moe_ls_for_clip.yaml +22 -0
- fusion_bench_config/method/rankone_moe/rankone_moe.yaml +26 -0
- fusion_bench_config/method/regmean/clip_regmean.yaml +11 -0
- fusion_bench_config/method/regmean/gpt2_regmean.yaml +12 -0
- fusion_bench_config/method/regmean/regmean.yaml +4 -0
- fusion_bench_config/method/simple_average.yaml +1 -0
- fusion_bench_config/method/slerp/slerp.yaml +6 -0
- fusion_bench_config/method/smile_upscaling/singular_projection_merging.yaml +8 -0
- fusion_bench_config/method/smile_upscaling/smile_mistral_upscaling.yaml +10 -0
- fusion_bench_config/method/smile_upscaling/smile_upscaling.yaml +14 -0
- fusion_bench_config/method/sparselo_pruning/llama_iterative_sparselo.yaml +20 -0
- fusion_bench_config/method/sparselo_pruning/llama_pcp_sparselo.yaml +20 -0
- fusion_bench_config/method/sparselo_pruning/llama_sparselo.yaml +19 -0
- fusion_bench_config/method/surgery/adamerging_surgery.yaml +27 -0
- fusion_bench_config/method/task_arithmetic.yaml +2 -0
- fusion_bench_config/method/task_singular_vector/TaskSingularVectorMerging.yaml +2 -0
- fusion_bench_config/method/ties_merging.yaml +8 -0
- fusion_bench_config/method/trust_region/clip_task_arithmetic.yaml +7 -0
- fusion_bench_config/method/wemoe/sparse_weight_ensembling_moe.yaml +39 -0
- fusion_bench_config/method/wemoe/weight_ensembling_moe.yaml +20 -0
- fusion_bench_config/model/clip-vit/README.md +38 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_TALL14.yaml +22 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_TALL20.yaml +29 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_cifar10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_cifar100.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_dtd.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_eight_tasks.yaml +10 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_emnist_letters.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_eurosat.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_fashion_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_fer2013.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_food101.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_gtsrb.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_kmnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_oxford-iiit-pet.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_oxford_flowers102.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_pcam.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_rendered-sst2.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_resisc45.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_stanford-cars.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_stl10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_sun397.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_svhn.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL14.yaml +22 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL20.yaml +29 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_cifar10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_cifar100.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_dtd.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_eight_tasks.yaml +11 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_emnist_letters.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_eurosat.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_fashion_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_fer2013.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_food101.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_gtsrb.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_kmnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_oxford-iiit-pet.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_oxford_flowers102.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_pcam.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_rendered-sst2.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_resisc45.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_stanford-cars.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_stl10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_sun397.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_svhn.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_TALL14.yaml +22 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_TALL20.yaml +29 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_cifar10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_cifar100.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_dtd.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_eight_tasks.yaml +10 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_emnist_letters.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_eurosat.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_fashion_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_fer2013.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_food101.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_gtsrb.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_kmnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford-iiit-pet.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford_flowers102.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_pcam.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_rendered-sst2.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_resisc45.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stanford-cars.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stl10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_sun397.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_svhn.yaml +1 -0
- fusion_bench_config/model/clip-vit/download_TALL20_models.sh +6 -0
- fusion_bench_config/model/clip-vit/generate_vit_model_config.sh +23 -0
- fusion_bench_config/model/flan-t5/flan-t5-base.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-cola.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-cola_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mnli.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mrpc.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mrpc_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qnli.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qqp.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qqp_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-rte.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-rte_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-sst2.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-sst2_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-stsb.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-stsb_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-cola_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-mnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-mrpc_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-qnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-qqp_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-rte_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-sst2_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-stsb_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/generate_flan-t5.sh +38 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/_template.yaml +12 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8_lora.yaml +53 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL14.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL14_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL20.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL20_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_individual.yaml +19 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_individual_lora.yaml +14 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8.yaml +5 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8_control_task.yaml +24 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8_model_only.yaml +3 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL14.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL14_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL20.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL20_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_generalization_exp1.yaml +24 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_generalization_exp2.yaml +24 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_individual.yaml +13 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_mtl.yaml +5 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_robustness_clean.yaml +18 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_robustness_corrupted.yaml +29 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_finetuned.yaml +5 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_task_projection.yaml +15 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_svhn_and_mnist.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_two_tasks_control_task.yaml +18 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL14.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL14_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL20.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL20_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_individual.yaml +19 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_codealpaca.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_for_causallm.yaml +20 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_metamathqa.yaml +19 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_ultrachat.yaml +18 -0
- fusion_bench_config/modelpool/CausalLMPool/simle_mixtral_exp_v4.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/single_llama_model.yaml +17 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/_template.yaml +8 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue.yaml +13 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_lora16.yaml +41 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_lora16_tta.yaml +68 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_individual.yaml +7 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-large_glue_lora16.yaml +45 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/llama_preference700k.yaml +23 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/single_reward_model.yaml +14 -0
- fusion_bench_config/modelpool/automodelpool.yaml +12 -0
- fusion_bench_config/modelpool/gpt-2_glue.yaml +64 -0
- fusion_bench_config/modelpool/mixtral_moe_merging.yaml +14 -0
- fusion_bench_config/modelpool/mixtral_moe_upscaling.yaml +6 -0
- fusion_bench_config/modelpool/nyuv2_modelpool.yaml +26 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v1.yaml +9 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v2.yaml +9 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v3.yaml +9 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v4.yaml +13 -0
- fusion_bench_config/nyuv2_config.yaml +17 -0
- fusion_bench_config/nyuv2_mtl_train.yaml +32 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/_template.yaml +31 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-base-patch32_robustness_corrupted.yaml +27 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8.yaml +11 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_B16.yaml +31 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_L14.yaml +12 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_val.yaml +12 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_with_control_task.yaml +12 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TALL14.yaml +19 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TALL20.yaml +26 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_cifar10.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_cifar100.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_dtd.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_emnist_letters.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_eurosat.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_fashion_mnist.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_fer2013.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_food101.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_gtsrb.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_kmnist.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_mnist.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford-iiit-pet.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford_flowers102.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford_flowers102_val.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_pcam.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_rendered-sst2.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_resisc45.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_stanford-cars.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_stl10.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_sun397.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_svhn.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_rankone_wemoe_clip-vit-classification_TA8.yaml +18 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_sparse_wemoe_clip-vit-classification_TA8.yaml +18 -0
- fusion_bench_config/taskpool/clip-vit-base-patch32_robustness_clean.yaml +24 -0
- fusion_bench_config/taskpool/clip-vit-base-patch32_robustness_corrupted.yaml +27 -0
- fusion_bench_config/taskpool/clip-vit-base-patch32_svhn_and_mnist.yaml +22 -0
- fusion_bench_config/taskpool/dummy.yaml +2 -0
- fusion_bench_config/taskpool/flan-t5_glue_text_generation.yaml +44 -0
- fusion_bench_config/taskpool/gpt-2_glue.yaml +39 -0
- fusion_bench_config/taskpool/nyuv2_taskpool.yaml +9 -0
- fusion_bench_config/taskpool/reward_model_evaluation.yaml +18 -0
@@ -0,0 +1,58 @@

```python
import copy
from collections import OrderedDict

from torch import nn

# Model conversion utils


def state_dict_to_vector(state_dict, remove_keys=[]):
    """
    Convert a state dictionary to a vector.

    Args:
        state_dict (dict): The state dictionary to convert.
        remove_keys (list, optional): List of keys to remove from the state dictionary. Defaults to [].

    Returns:
        torch.Tensor: The converted vector.
    """
    shared_state_dict = copy.deepcopy(state_dict)
    for key in remove_keys:
        if key in shared_state_dict:
            del shared_state_dict[key]
    sorted_shared_state_dict = OrderedDict(sorted(shared_state_dict.items()))
    return nn.utils.parameters_to_vector(
        [value.reshape(-1) for key, value in sorted_shared_state_dict.items()]
    )


def vector_to_state_dict(vector, state_dict, remove_keys=[]):
    """
    Convert a vector to a state dictionary.

    Args:
        vector (torch.Tensor): The vector to convert.
        state_dict (dict): The reference state dictionary to define the order of the vector.
        remove_keys (list, optional): List of keys to remove from the reference state dictionary. Defaults to [].

    Returns:
        dict: The converted state dictionary.
    """
    # create a reference dict to define the order of the vector
    reference_dict = copy.deepcopy(state_dict)
    for key in remove_keys:
        if key in reference_dict:
            del reference_dict[key]
    sorted_reference_dict = OrderedDict(sorted(reference_dict.items()))

    # create a shared state dict using the reference dict
    nn.utils.vector_to_parameters(vector, sorted_reference_dict.values())

    # add back the encoder and decoder embedding weights.
    if "transformer.shared.weight" in sorted_reference_dict:
        for key in remove_keys:
            sorted_reference_dict[key] = sorted_reference_dict[
                "transformer.shared.weight"
            ]
    return sorted_reference_dict
```
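These two helpers round-trip a model's parameters through a single flat vector, the representation that most merging methods in this package operate on. A minimal sketch of the round trip, assuming the two functions above are in scope (the toy `nn.Linear` module is illustrative, not from the package):

```python
import torch
from torch import nn

# Toy module for illustration; any nn.Module state dict works the same way.
model = nn.Linear(4, 2)
sd = model.state_dict()

vec = state_dict_to_vector(sd)          # flatten all parameters, keys sorted
vec = vec * 0.5                         # e.g. scale the whole parameter set at once
new_sd = vector_to_state_dict(vec, sd)  # unflatten, using sd to define the layout

model.load_state_dict(new_sd)
```

Both functions sort the keys before flattening, so the layout is deterministic as long as the reference state dict has the same keys and shapes on both sides of the round trip.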
@@ -0,0 +1,161 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
from copy import deepcopy
|
|
5
|
+
|
|
6
|
+
import torch
|
|
7
|
+
from torch import Tensor
|
|
8
|
+
from torch.utils.data import DataLoader
|
|
9
|
+
from transformers import CLIPModel, CLIPProcessor
|
|
10
|
+
from transformers.models.clip.modeling_clip import CLIPEncoder
|
|
11
|
+
|
|
12
|
+
from fusion_bench.dataset import CLIPDataset
|
|
13
|
+
from fusion_bench.method.task_arithmetic.task_arithmetic import task_arithmetic_merge
|
|
14
|
+
from fusion_bench.mixins import CLIPClassificationMixin
|
|
15
|
+
from fusion_bench.modelpool import CLIPVisionModelPool
|
|
16
|
+
from fusion_bench.models.hf_clip import HFCLIPClassifier
|
|
17
|
+
from fusion_bench.models.we_moe import WeightEnsemblingMoE
|
|
18
|
+
from fusion_bench.tasks.clip_classification import get_classnames_and_templates
|
|
19
|
+
from fusion_bench.utils import timeit_context
|
|
20
|
+
from fusion_bench.utils.data import InfiniteDataLoader
|
|
21
|
+
|
|
22
|
+
from .we_moe import WeightEnsemblingMoEAlgorithm
|
|
23
|
+
|
|
24
|
+
log = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class CLIPWeightEnsemblingMoEAlgorithm(
|
|
28
|
+
WeightEnsemblingMoEAlgorithm,
|
|
29
|
+
CLIPClassificationMixin,
|
|
30
|
+
):
|
|
31
|
+
"""
|
|
32
|
+
CLIPWeightEnsemblingMoEAlgorithm is a class that implements the WeightEnsemblingMoEAlgorithm
|
|
33
|
+
for CLIP models. It extends the WeightEnsemblingMoEAlgorithm and CLIPClassificationMixin classes.
|
|
34
|
+
|
|
35
|
+
Attributes:
|
|
36
|
+
modelpool (CLIPVisionModelPool): The model pool containing the CLIP models.
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
modelpool: CLIPVisionModelPool = None
|
|
40
|
+
|
|
41
|
+
def load_checkpoint(self, model, checkpoint):
|
|
42
|
+
"""
|
|
43
|
+
Load the checkpoint file.
|
|
44
|
+
|
|
45
|
+
Args:
|
|
46
|
+
model: The model to load the checkpoint into.
|
|
47
|
+
checkpoint: The path to the checkpoint file.
|
|
48
|
+
"""
|
|
49
|
+
state = {"model": model}
|
|
50
|
+
self._fabric.load(checkpoint, state)
|
|
51
|
+
|
|
52
|
+
def save_checkpoint(self, model, checkpoint):
|
|
53
|
+
"""
|
|
54
|
+
Save the checkpoint file.
|
|
55
|
+
|
|
56
|
+
Args:
|
|
57
|
+
model: The model to save the checkpoint from.
|
|
58
|
+
checkpoint: The path to the checkpoint file.
|
|
59
|
+
"""
|
|
60
|
+
self._fabric.save(checkpoint, {"model": model})
|
|
61
|
+
|
|
62
|
+
def construct_moe_model(self) -> WeightEnsemblingMoE:
|
|
63
|
+
"""
|
|
64
|
+
Construct the Mixture of Experts (MoE) model using the models in the model pool.
|
|
65
|
+
|
|
66
|
+
Returns:
|
|
67
|
+
WeightEnsemblingMoE: The constructed MoE model.
|
|
68
|
+
"""
|
|
69
|
+
base_model = self.modelpool.load_model("_pretrained_")
|
|
70
|
+
expert_models = [
|
|
71
|
+
self.modelpool.load_model(m) for m in self.modelpool.model_names
|
|
72
|
+
]
|
|
73
|
+
|
|
74
|
+
# Merge the models using task arithmetic
|
|
75
|
+
moe_model = task_arithmetic_merge(
|
|
76
|
+
# This function modifies the model in place, so we need to pass a deepcopy
|
|
77
|
+
deepcopy(base_model),
|
|
78
|
+
expert_models,
|
|
79
|
+
scaling_factor=self.config.init_lambda,
|
|
80
|
+
).requires_grad_(False)
|
|
81
|
+
|
|
82
|
+
# Up-scale MLP modules
|
|
83
|
+
base_encoder: CLIPEncoder = base_model.vision_model.encoder
|
|
84
|
+
moe_encoder: CLIPEncoder = moe_model.vision_model.encoder
|
|
85
|
+
expert_encoders = [m.vision_model.encoder for m in expert_models]
|
|
86
|
+
|
|
87
|
+
num_layers = len(base_encoder.layers)
|
|
88
|
+
for layer_idx in range(num_layers):
|
|
89
|
+
base_mlp = base_encoder.layers[layer_idx].mlp
|
|
90
|
+
expert_mlps = [e.layers[layer_idx].mlp for e in expert_encoders]
|
|
91
|
+
|
|
92
|
+
moe_encoder.layers[layer_idx].mlp = WeightEnsemblingMoE(
|
|
93
|
+
hidden_size=base_encoder.config.hidden_size,
|
|
94
|
+
base_model=base_mlp,
|
|
95
|
+
expert_models=expert_mlps,
|
|
96
|
+
init_lambda=self.config.init_lambda,
|
|
97
|
+
batch_first=True, # For open_clip models this is False
|
|
98
|
+
router_hidden_layers=self.config.router_hidden_layers,
|
|
99
|
+
batch_reduce=self.config.batch_reduce,
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
return moe_model
|
|
103
|
+
|
|
104
|
+
@functools.cache
|
|
105
|
+
def get_shuffled_test_loader_iter(self, tta_dataset: str):
|
|
106
|
+
"""
|
|
107
|
+
Get an iterator for the shuffled test data loader.
|
|
108
|
+
|
|
109
|
+
Args:
|
|
110
|
+
tta_dataset (str): The name of the test-time adaptation dataset.
|
|
111
|
+
|
|
112
|
+
Returns:
|
|
113
|
+
Iterator: An iterator for the shuffled test data loader.
|
|
114
|
+
"""
|
|
115
|
+
dataset = self.modelpool.load_test_dataset(tta_dataset)
|
|
116
|
+
dataset = CLIPDataset(dataset, processor=self.clip_processor)
|
|
117
|
+
log.info("get_shuffled_test_loader_iter")
|
|
118
|
+
loader = DataLoader(
|
|
119
|
+
dataset,
|
|
120
|
+
batch_size=self.config.batch_size,
|
|
121
|
+
shuffle=True,
|
|
122
|
+
num_workers=self.config.num_workers,
|
|
123
|
+
pin_memory=True,
|
|
124
|
+
)
|
|
125
|
+
loader = self.fabric.setup_dataloaders(loader)
|
|
126
|
+
return iter(InfiniteDataLoader(loader))
|
|
127
|
+
    def on_test_time_adaptation_start(self):
        """
        Load the CLIP processor and construct the zero-shot classification head for each task.
        """
        self.setup_zero_shot_classification_head()

    def compute_logits(self, module, batch, task) -> Tensor:
        """
        Compute the logits for the given batch and task.

        Args:
            module: The model module.
            batch: The input batch.
            task: The task name.

        Returns:
            Tensor: The computed logits.
        """
        images, _ = batch
        text_embeds = self.zeroshot_weights[task]

        image_embeds = module(images)[1]
        image_embeds = self.visual_projection(image_embeds)

        # Normalize embeddings
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)

        # Cosine similarity
        logits_per_text = (
            torch.matmul(text_embeds, image_embeds.t()) * self.logit_scale_exp
        )
        logits_per_image = logits_per_text.t()

        return logits_per_image
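`compute_logits` reproduces CLIP's zero-shot classification: with both embedding sets L2-normalized (in the library the text embeddings are prepared by `setup_zero_shot_classification_head`; here both sides are normalized explicitly), the matrix product is a cosine similarity, scaled by the exponentiated logit scale. A self-contained sketch with random tensors standing in for the real encoders; shapes are illustrative:

```python
import torch

batch_size, embed_dim, num_classes = 4, 512, 10

# Stand-ins for the real encoder outputs.
image_embeds = torch.randn(batch_size, embed_dim)
text_embeds = torch.randn(num_classes, embed_dim)
logit_scale_exp = torch.tensor(100.0)  # exp(logit_scale); ~100 for trained CLIP

# L2-normalize both sides so the dot product is a cosine similarity.
image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale_exp
logits_per_image = logits_per_text.t()
assert logits_per_image.shape == (batch_size, num_classes)
```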
@@ -0,0 +1,247 @@
import logging
from abc import abstractmethod
from typing import cast  # noqa: F401

import lightning as L
import lightning.fabric.wrappers
import torch
from lightning.pytorch.profilers import SimpleProfiler
from omegaconf import DictConfig
from torch import Tensor
from torch.utils.data import DataLoader
from tqdm.autonotebook import tqdm

from fusion_bench.compat.method.base_algorithm import ModelFusionAlgorithm
from fusion_bench.compat.modelpool import ModelPool
from fusion_bench.models.we_moe import WeightEnsemblingMoE
from fusion_bench.utils import timeit_context
from fusion_bench.utils.parameters import print_parameters

log = logging.getLogger(__name__)


def entropy_loss(logits: Tensor) -> Tensor:
    """
    Compute the entropy loss of a set of logits.

    Args:
        logits (Tensor): The logits to compute the entropy loss of.

    Returns:
        Tensor: The entropy loss of the logits.
    """
    probs = torch.softmax(logits, dim=-1)
    return -torch.sum(probs * torch.log(probs + 1e-8), dim=-1).mean()
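Entropy minimization is the test-time adaptation objective used below: confident (low-entropy) predictions are rewarded on unlabeled test data. As a quick sanity check of the formula, uniform logits over C classes give entropy log C, while near-one-hot logits give near-zero entropy:

```python
import math
import torch

def entropy_loss(logits):
    probs = torch.softmax(logits, dim=-1)
    return -torch.sum(probs * torch.log(probs + 1e-8), dim=-1).mean()

uniform = torch.zeros(2, 10)  # uniform distribution over 10 classes
confident = torch.tensor([[20.0] + [0.0] * 9])  # mass concentrated on one class

print(entropy_loss(uniform))    # ~log(10) = 2.302, the maximum entropy
print(entropy_loss(confident))  # ~0, the minimum entropy
assert abs(entropy_loss(uniform).item() - math.log(10)) < 1e-3
```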

class WeightEnsemblingMoEAlgorithm(ModelFusionAlgorithm):
    """
    Algorithm for fusing models using Weight Ensembling Mixture of Experts (MoE).

    This class provides methods for constructing the MoE model, performing test-time adaptation,
    and running the fusion process.

    Attributes:
        _fabric (L.Fabric): The fabric for distributed training.
        modelpool (ModelPool): The pool of models to be fused.
        profiler (SimpleProfiler): The profiler for measuring performance.
    """

    _fabric: L.Fabric = None
    modelpool: ModelPool = None

    def __init__(self, algorithm_config: DictConfig):
        """
        Initialize the WeightEnsemblingMoEAlgorithm with the given configuration.

        Args:
            algorithm_config (DictConfig): The configuration for the algorithm.
        """
        super().__init__(algorithm_config)

        if self._fabric is None and torch.cuda.is_available():
            self._fabric = L.Fabric(
                devices=self.config.get("devices", 1),
            )
            self._fabric.launch()
        elif self._fabric is None:
            log.warning("No CUDA device available.")
        self.profiler = SimpleProfiler(
            self.config.get("cache_dir", "outputs"), "we_moe_profiler.txt"
        )
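The constructor relies on the standard Lightning Fabric workflow for device placement and backward passes; for reference, the minimal Fabric pattern the class assumes (toy model, illustrative only):

```python
import lightning as L
import torch
import torch.nn as nn

fabric = L.Fabric(devices=1)  # accelerator is auto-selected
fabric.launch()

model = nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
model, optimizer = fabric.setup(model, optimizer)  # moves to device, wraps for DDP

x = torch.randn(4, 8, device=fabric.device)
loss = model(x).sum()
fabric.backward(loss)  # used in place of loss.backward() under Fabric
optimizer.step()
```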

    @abstractmethod
    def load_checkpoint(self, model, checkpoint):
        """
        Load the checkpoint file.

        Args:
            model: The model to load the checkpoint into.
            checkpoint: The checkpoint file to load.
        """
        pass

    @abstractmethod
    def save_checkpoint(self, model, checkpoint):
        """
        Save the checkpoint file.

        Args:
            model: The model to save the checkpoint from.
            checkpoint: The checkpoint file to save.
        """
        pass

    @abstractmethod
    def construct_moe_model(self) -> WeightEnsemblingMoE:
        """
        Construct the Mixture of Experts model using the models in the model pool.

        Returns:
            WeightEnsemblingMoE: The constructed MoE model.
        """
        pass

    def on_test_time_adaptation_start(self):
        """
        Hook method called at the start of test-time adaptation.
        """
        pass

    @abstractmethod
    def get_shuffled_test_loader_iter(self, task: str) -> DataLoader:
        """
        Get an iterator for the shuffled test data loader for a specific task.

        Args:
            task (str): The task for which to get the test data loader.

        Returns:
            DataLoader: The shuffled test data loader iterator.
        """
        pass

    @abstractmethod
    def compute_logits(self, module, batch, task) -> Tensor:
        """
        Compute the logits for a given batch and task.

        Args:
            module: The model module to use for computing logits.
            batch: The batch of data.
            task: The task for which to compute logits.

        Returns:
            Tensor: The computed logits.
        """
        pass

    def test_time_adaptation(self, module: WeightEnsemblingMoE):
        """
        Perform test-time adaptation for the given module.

        Args:
            module (WeightEnsemblingMoE): The MoE module to adapt.

        Returns:
            WeightEnsemblingMoE: The adapted MoE module.
        """
        self.on_test_time_adaptation_start()

        # configure optimizer
        if self.config.optimizer == "adam":
            optimizer = torch.optim.Adam(
                [p for p in module.parameters() if p.requires_grad], lr=self.config.lr
            )
        else:
            raise ValueError(f"Unsupported optimizer: {self.config.optimizer}")

        if self._fabric is not None:
            module, optimizer = self._fabric.setup(module, optimizer)

        module.train()

        if self.config.get("fast_dev_run", False):
            log.info("Running fast_dev_run, only one step")
            pbar = tqdm(
                range(1),
                "Test-time adaptation",
                dynamic_ncols=True,
            )
        else:
            pbar = tqdm(
                range(self.config.max_steps),
                "Test-time adaptation",
                dynamic_ncols=True,
            )
        for step_idx in pbar:
            if self.config.use_grad_accumulate:
                for task in self.modelpool.model_names:
                    with self.profiler.profile("data time"):
                        batch = next(self.get_shuffled_test_loader_iter(task))
                    with self.profiler.profile("forward pass"):
                        logits = self.compute_logits(module, batch, task)
                        assert (
                            logits.dim() == 2
                        ), f"Expected logits to be 2D, got {logits.dim()}"
                        loss = entropy_loss(logits)
                    # Gradients accumulate across tasks because .zero_grad() is only
                    # called after the optimizer step; backpropagating per task frees
                    # each task's activations early and saves memory.
                    with self.profiler.profile("backward pass"):
                        self._fabric.backward(loss, retain_graph=True)
            else:
                loss = 0
                for task in self.modelpool.model_names:
                    with self.profiler.profile("data time"):
                        batch = next(self.get_shuffled_test_loader_iter(task))
                    with self.profiler.profile("forward pass"):
                        logits = self.compute_logits(module, batch, task)
                        assert (
                            logits.dim() == 2
                        ), f"Expected logits to be 2D, got {logits.dim()}"
                        loss = loss + entropy_loss(logits)
                with self.profiler.profile("backward pass"):
                    self._fabric.backward(loss, retain_graph=True)

            with self.profiler.profile("optimizer step"):
                optimizer.step()
                optimizer.zero_grad()

        return module
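Because `construct_moe_model` freezes the merged backbone with `.requires_grad_(False)`, the Adam optimizer above only updates the routers. A self-contained toy version of the same entropy-minimization loop; all names here are illustrative, not fusion_bench API:

```python
import torch
import torch.nn as nn

torch.manual_seed(0)

# Toy stand-ins: a frozen backbone and a small trainable "router".
backbone = nn.Linear(16, 16).requires_grad_(False)
router = nn.Linear(16, 10)  # only these weights are adapted

def entropy(logits):
    p = torch.softmax(logits, dim=-1)
    return -(p * torch.log(p + 1e-8)).sum(dim=-1).mean()

optimizer = torch.optim.Adam(router.parameters(), lr=1e-3)
for _ in range(100):
    x = torch.randn(32, 16)  # unlabeled test batch
    logits = router(backbone(x))
    loss = entropy(logits)  # minimize prediction entropy
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
# The router grows more confident on the test distribution, so the objective decreases.
```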
    def run(self, modelpool: ModelPool):
        """
        Run the WeightEnsemblingMoEAlgorithm to fuse models using Weight Ensembling Mixture of Experts.

        Args:
            modelpool (ModelPool): The pool of models to be fused.

        Returns:
            WeightEnsemblingMoE: The fused MoE model.
        """
        log.info("Fusing models using WeightEnsembling Mixture of Experts modules.")
        self.modelpool = modelpool

        with timeit_context("upscaling models to a weight-ensembling MoE model"):
            moe_model = self.construct_moe_model()
            print_parameters(moe_model)

        if self.config.get("checkpoint", False):
            log.info(
                f"load checkpoint from {self.config.checkpoint}, test-time adaptation will be skipped."
            )
            self.load_checkpoint(moe_model, self.config.checkpoint)
        else:
            with self.profiler.profile("test-time adaptation"):
                moe_model = self.test_time_adaptation(moe_model)
        if self.config.get("save_checkpoint", False):
            log.info(f"save checkpoint to {self.config.save_checkpoint}")
            self.save_checkpoint(moe_model, self.config.save_checkpoint)

        if lightning.fabric.wrappers.is_wrapped(moe_model):
            moe_model = lightning.fabric.wrappers._unwrap_objects(moe_model)

        # enable sample-wise adaptation
        moe_model.batch_reduce = False
        print(self.profiler.summary())
        return moe_model
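The configuration keys read throughout this class and its CLIP subclass arrive via the `DictConfig`. A hypothetical config covering every key referenced above; the values are illustrative, not the package's shipped defaults:

```python
from omegaconf import DictConfig, OmegaConf

# Hypothetical algorithm config; key names are taken from the code above,
# values are illustrative only.
algorithm_config: DictConfig = OmegaConf.create(
    {
        "devices": 1,
        "cache_dir": "outputs",
        "init_lambda": 0.3,  # task-arithmetic scaling factor
        "router_hidden_layers": 2,  # depth of each routing MLP
        "batch_reduce": True,  # average routing weights over the batch
        "batch_size": 16,
        "num_workers": 4,
        "optimizer": "adam",
        "lr": 1e-3,
        "max_steps": 1000,
        "use_grad_accumulate": True,
        "fast_dev_run": False,
        "checkpoint": False,  # or a path, in which case adaptation is skipped
        "save_checkpoint": False,  # or a path to write the adapted model to
    }
)
```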
@@ -0,0 +1,113 @@
import logging
from typing import List, Mapping, Union  # noqa: F401

import numpy as np
import torch
from typing_extensions import override

from fusion_bench.method import BaseAlgorithm
from fusion_bench.modelpool import CausalLMPool
from fusion_bench.utils import timeit_context
from fusion_bench.utils.state_dict_arithmetic import state_dict_add, state_dict_mul
from fusion_bench.utils.type import StateDictType

log = logging.getLogger(__name__)


class WeightedAverageForLLama(BaseAlgorithm):
    """
    A class to perform weighted averaging of Llama/Mistral models.
    """

    _config_mapping = BaseAlgorithm._config_mapping | {
        "normalize": "normalize",
        "weights": "weights",
        "backbone_only": "backbone_only",
        "merged_model_save_path": "merged_model_save_path",
        "save_tokenizer": "save_tokenizer",
        "push_to_hub": "push_to_hub",
    }

    def __init__(
        self,
        normalize: bool,
        weights: List[float],
        backbone_only: bool,
        merged_model_save_path: str,
        save_tokenizer: bool,
        push_to_hub: bool,
        **kwargs,
    ):
        """
        Initialize the WeightedAverageForLLama class with the given parameters.

        Args:
            normalize (bool): Whether to normalize the weights so they sum to 1.
            weights (List[float]): The weights for averaging the models.
            backbone_only (bool): Whether to merge only the backbone of the models.
            merged_model_save_path (str): The path to save the merged model.
            save_tokenizer (bool): Whether to save the tokenizer.
            push_to_hub (bool): Whether to push the model to the hub.
        """
        self.normalize = normalize
        self.weights = weights
        self.backbone_only = backbone_only
        self.merged_model_save_path = merged_model_save_path
        self.save_tokenizer = save_tokenizer
        self.push_to_hub = push_to_hub
        super().__init__(**kwargs)

    @override
    @torch.no_grad()
    def run(self, modelpool: CausalLMPool):
        """
        Executes the weighted averaging of models in the provided model pool.

        Args:
            modelpool (CausalLMPool): The pool of models to be averaged.

        Returns:
            base_model: The base model after merging the state dictionaries of the models in the pool.

        Raises:
            ValueError: If the number of weights does not match the number of models in the pool.
        """
        if modelpool.has_pretrained:
            base_model = modelpool.load_model("_pretrained_")
        else:
            base_model = modelpool.load_model(modelpool.model_names[0])

        weights = self.weights
        if len(weights) != len(modelpool.model_names):
            raise ValueError(
                "Number of weights must match the number of models, "
                f"but got {len(weights)} weights and {len(modelpool.model_names)} models. "
                f"weights: {weights}, models: {modelpool.model_names}"
            )
        if self.normalize:
            weights = np.asarray(weights)
            weights = weights / np.sum(weights)

        merged_state_dict: StateDictType = None
        for model_name, weight in zip(modelpool.model_names, weights):
            model = modelpool.load_model(model_name, backbone_only=self.backbone_only)
            sd = state_dict_mul(model.state_dict(), weight)
            if merged_state_dict is None:
                merged_state_dict = sd
            else:
                merged_state_dict = state_dict_add(merged_state_dict, sd)

        # When merging only the backbone, the merged state dict lacks the head
        # weights, so a non-strict load is required.
        base_model.load_state_dict(merged_state_dict, strict=not self.backbone_only)
        if self.merged_model_save_path is not None:
            with timeit_context(
                f"Saving the merged model to {self.merged_model_save_path}"
            ):
                modelpool.save_model(
                    base_model,
                    path=self.merged_model_save_path,
                    save_tokenizer=self.save_tokenizer,
                    push_to_hub=self.push_to_hub,
                )
        return base_model
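`state_dict_mul` and `state_dict_add` apply a scalar and an elementwise sum across every tensor in a state dict, so the loop above accumulates the weighted average one model at a time, keeping only one expert in memory. A minimal sketch of the same arithmetic with hypothetical helpers, not the library's implementation:

```python
from typing import Dict

import torch

StateDict = Dict[str, torch.Tensor]

def state_dict_mul_sketch(sd: StateDict, w: float) -> StateDict:
    """Scale every tensor in a state dict by a scalar."""
    return {k: w * v for k, v in sd.items()}

def state_dict_add_sketch(a: StateDict, b: StateDict) -> StateDict:
    """Elementwise sum of two state dicts with identical keys."""
    return {k: a[k] + b[k] for k in a}

# Weighted average of two toy "models".
theta_1 = {"w": torch.ones(2, 2), "b": torch.zeros(2)}
theta_2 = {"w": 3 * torch.ones(2, 2), "b": torch.ones(2)}
merged = state_dict_add_sketch(
    state_dict_mul_sketch(theta_1, 0.5), state_dict_mul_sketch(theta_2, 0.5)
)
assert torch.allclose(merged["w"], 2 * torch.ones(2, 2))
```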
@@ -0,0 +1,102 @@
R"""
Examples:

The following command merges eight CLIP-ViT models using a weighted average approach.
Because `method.normalize` is set to true, the weights are normalized to sum to 1, making this equivalent to a simple average.

```bash
fusion_bench \
    method=linear/weighted_average \
    method.normalize=true \
    method.weights=[0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3] \
    modelpool=... \
    taskpool=...
```
"""

import logging
from typing import List, Mapping, Optional, Union  # noqa: F401

import numpy as np
import torch
from typing_extensions import override

from fusion_bench.method import BaseAlgorithm
from fusion_bench.mixins import SimpleProfilerMixin
from fusion_bench.modelpool import BaseModelPool
from fusion_bench.utils.state_dict_arithmetic import state_dict_add, state_dict_mul
from fusion_bench.utils.type import StateDictType

log = logging.getLogger(__name__)


class WeightedAverageAlgorithm(BaseAlgorithm, SimpleProfilerMixin):

    _config_mapping = BaseAlgorithm._config_mapping | {
        "normalize": "normalize",
        "weights": "weights",
    }

    def __init__(
        self,
        normalize: bool,
        weights: List[float],
        verbose: bool = True,
        **kwargs,
    ):
        self.normalize = normalize
        self.weights = weights
        self.verbose = verbose
        log.disabled = not self.verbose
        super().__init__(**kwargs)

    @override
    @torch.no_grad()
    def run(self, modelpool: BaseModelPool):
        """
        Fuses the models in the model pool using a weighted average approach.

        Args:
            modelpool (BaseModelPool): The pool of models to be fused.

        Raises:
            ValueError: If the number of weights does not match the number of models in the model pool.

        Returns:
            torch.nn.Module: The resulting model after fusion.
        """
        if not isinstance(modelpool, BaseModelPool):
            modelpool = BaseModelPool(modelpool)

        log.info("Fusing models using weighted average.")
        weights = np.asarray(self.weights)
        if len(weights) != len(modelpool.model_names):
            raise ValueError(
                "Number of weights must match the number of models, "
                f"but got {len(weights)} weights and {len(modelpool.model_names)} models. "
                f"weights: {weights}, models: {modelpool.model_names}"
            )
        if self.normalize:
            weights = weights / np.sum(weights)
        if self.verbose:
            print(f"weights: {weights}, normalized: {self.normalize}")

        sd: Optional[StateDictType] = None
        forward_model = None

        for model_name, weight in zip(modelpool.model_names, weights):
            with self.profile("load_model"):
                model = modelpool.load_model(model_name)
            with self.profile("merge weights"):
                if sd is None:
                    sd = state_dict_mul(model.state_dict(keep_vars=True), weight)
                    forward_model = model
                else:
                    sd = state_dict_add(
                        sd, state_dict_mul(model.state_dict(keep_vars=True), weight)
                    )

        forward_model.load_state_dict(sd)
        if self.verbose:
            self.print_profile_summary()
        return forward_model
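As the module docstring notes, any constant weight vector collapses to a simple average once normalized; a quick check:

```python
import numpy as np

weights = np.asarray([0.3] * 8)
normalized = weights / np.sum(weights)
print(normalized)  # [0.125 0.125 ...], identical to a uniform 1/8 average
assert np.allclose(normalized, np.full(8, 1 / 8))
```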
File without changes