PyPI - fusion-bench - Versions diffs - 0.2.9__py3-none-any.whl - Mend

fusion-bench 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (727) hide show

fusion_bench/method/sparse_we_moe/sparse_clip_we_moe.py ADDED Viewed

@@ -0,0 +1,248 @@
+import functools
+import logging
+from copy import deepcopy
+from typing import List, Tuple
+import torch
+from torch import Tensor, nn
+from torch.utils.data import DataLoader
+from tqdm.auto import tqdm
+from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
+from transformers.models.clip.modeling_clip import CLIPEncoder, CLIPEncoderLayer
+from fusion_bench.dataset import CLIPDataset
+from fusion_bench.method.task_arithmetic import task_arithmetic_merge
+from fusion_bench.mixins import CLIPClassificationMixin
+from fusion_bench.modelpool import CLIPVisionModelPool
+from fusion_bench.models.sparse_we_moe import (
+    SparseWeightEnsemblingMoE,
+    SparseWeightEnsemblingMoE_ShardGate,
+    construct_weight_ensembling_gate,
+)
+from fusion_bench.utils.data import InfiniteDataLoader
+from .sparse_we_moe import SparseWeightEnsemblingMoEAlgorithm
+log = logging.getLogger(__name__)
+class SparseCLIPWeightEnsemblingMoEAlgorithm(
+    SparseWeightEnsemblingMoEAlgorithm,
+    CLIPClassificationMixin,
+):
+    modelpool: CLIPVisionModelPool = None
+    def load_checkpoint(self, model, checkpoint):
+        """
+        Restore `model` from a checkpoint file through Fabric.
+        Args:
+            model: The object whose state is loaded; it is registered under the "model" key.
+            checkpoint: The checkpoint location handed to `self._fabric.load`.
+        """
+        # the state mapping tells Fabric which object receives the "model" entry
+        self._fabric.load(checkpoint, {"model": model})
+    def save_checkpoint(self, model, checkpoint):
+        """
+        Write `model` to a checkpoint file through Fabric.
+        Args:
+            model: The object to store; it is saved under the "model" key.
+            checkpoint: The checkpoint location handed to `self._fabric.save`.
+        """
+        state = {"model": model}
+        self._fabric.save(checkpoint, state)
+    def construct_moe_model(self) -> SparseWeightEnsemblingMoE:
+        """
+        Build the weight-ensembling MoE model from the models in the model pool.
+        The experts are first merged into a copy of the pretrained model with task
+        arithmetic; the merged parameters get `requires_grad` disabled. Afterwards
+        the MLP of every encoder layer of the merged model is replaced by a
+        `SparseWeightEnsemblingMoE` built from the pretrained MLP and the expert
+        MLPs of the same layer.
+        Returns:
+            The merged model with its encoder MLPs replaced by MoE modules.
+        """
+        config = self.config
+        pool = self.modelpool
+        pretrained = pool.load_model("_pretrained_")
+        experts = [pool.load_model(name) for name in pool.model_names]
+        # task_arithmetic_merge modifies its first argument in place, so merge into a copy
+        merged = task_arithmetic_merge(
+            deepcopy(pretrained),
+            experts,
+            scaling_factor=config.init_lambda,
+        ).requires_grad_(False)
+        pretrained_encoder: CLIPEncoder = pretrained.vision_model.encoder
+        merged_encoder: CLIPEncoder = merged.vision_model.encoder
+        expert_encoders = [expert.vision_model.encoder for expert in experts]
+        depth = len(pretrained_encoder.layers)
+        # up-scale the MLP of each layer into a mixture of experts
+        for idx, pretrained_layer in enumerate(pretrained_encoder.layers):
+            layer_experts = [encoder.layers[idx].mlp for encoder in expert_encoders]
+            merged_encoder.layers[idx].mlp = SparseWeightEnsemblingMoE(
+                hidden_size=pretrained_encoder.config.hidden_size,
+                base_model=pretrained_layer.mlp,
+                expert_models=layer_experts,
+                init_lambda=config.init_lambda,
+                batch_first=True,  # for open_clip models this is False
+                router_hidden_layers=config.router_hidden_layers,
+                batch_reduce=config.batch_reduce,
+                num_layers=depth,
+                layer_idx=idx,
+                tv_prune_ratio=config.tv_prune_ratio,
+            )
+        return merged
+    def construct_moe_model_sharedgate(self) -> SparseWeightEnsemblingMoE_ShardGate:
+        """
+        Construct the Mixture of Experts model using the models in the model pool with a shared gate.
+        The experts are merged into a copy of the pretrained model with task
+        arithmetic (merged parameters get `requires_grad` disabled), one gate is
+        built with `construct_weight_ensembling_gate`, and the MLP of every encoder
+        layer of the merged model is replaced by a
+        `SparseWeightEnsemblingMoE_ShardGate` that receives that same gate object.
+        Returns:
+            The merged model with its encoder MLPs replaced by shared-gate MoE modules.
+        """
+        base_model = self.modelpool.load_model("_pretrained_")
+        expert_models = [
+            self.modelpool.load_model(m) for m in self.modelpool.model_names
+        ]
+        # merge the models using task arithmetic
+        moe_model = task_arithmetic_merge(
+            # this function modifies the model in place, so we need to pass a deepcopy
+            deepcopy(base_model),
+            expert_models,
+            scaling_factor=self.config.init_lambda,
+        ).requires_grad_(False)
+        # up-scale MLP modules
+        base_encoder: CLIPEncoder = base_model.vision_model.encoder
+        moe_encoder: CLIPEncoder = moe_model.vision_model.encoder
+        expert_encoders = [m.vision_model.encoder for m in expert_models]
+        # shared gate: its input is widened by the position encoding when one is used
+        shared_gate = construct_weight_ensembling_gate(
+            hidden_size=(
+                base_encoder.config.hidden_size + self.config.position_encoding_dim
+                if self.config.position_encoding
+                else base_encoder.config.hidden_size
+            ),
+            num_experts=len(expert_models),
+            init_lambda=self.config.init_lambda,
+            num_hidden_layers=self.config.router_hidden_layers,
+        )
+        num_layers = len(base_encoder.layers)
+        for layer_idx in range(num_layers):
+            base_mlp = base_encoder.layers[layer_idx].mlp
+            expert_mlps = [e.layers[layer_idx].mlp for e in expert_encoders]
+            moe_encoder.layers[layer_idx].mlp = SparseWeightEnsemblingMoE_ShardGate(
+                hidden_size=base_encoder.config.hidden_size,
+                base_model=base_mlp,
+                expert_models=expert_mlps,
+                init_lambda=self.config.init_lambda,
+                batch_first=True,  # for open_clip models this is False
+                router_hidden_layers=self.config.router_hidden_layers,
+                batch_reduce=self.config.batch_reduce,
+                num_layers=num_layers,
+                layer_idx=layer_idx,
+                tv_prune_ratio=self.config.tv_prune_ratio,
+                # every layer is handed the same gate instance
+                sharedgate=shared_gate,
+                position_encoding=self.config.position_encoding,
+                position_encoding_dim=self.config.position_encoding_dim,
+            )
+        return moe_model
+    def get_shuffled_test_loader_iter(self, tta_dataset: str):
+        """
+        Get an iterator for the shuffled test data loader of a task.
+        The iterator is created on the first request for `tta_dataset` and the
+        same object is returned on later requests, so repeated `next(...)` calls
+        keep advancing one stream instead of rebuilding the loader.
+        Args:
+            tta_dataset (str): Name of the test dataset to load from the model pool.
+        Returns:
+            The iterator obtained from an `InfiniteDataLoader` wrapping the shuffled loader.
+        """
+        # Cache on the instance rather than with `functools.cache`: a cache attached
+        # to the method is shared by the class, keys on `self` (which must then be
+        # hashable) and keeps every instance and its loaders alive.
+        cached_iters = self.__dict__.setdefault("_shuffled_test_loader_iters", {})
+        try:
+            return cached_iters[tta_dataset]
+        except KeyError:
+            pass
+        log.info("get_shuffled_test_loader_iter")
+        dataset = self.modelpool.load_test_dataset(tta_dataset)
+        dataset = CLIPDataset(dataset, processor=self.clip_processor)
+        loader = DataLoader(
+            dataset,
+            batch_size=self.config.batch_size,
+            shuffle=True,
+            num_workers=self.config.num_workers,
+            pin_memory=True,
+        )
+        if self._fabric is not None:
+            loader = self._fabric.setup_dataloaders(loader)
+        loader_iter = iter(InfiniteDataLoader(loader))
+        # store only after successful construction, so a failure is retried next call
+        cached_iters[tta_dataset] = loader_iter
+        return loader_iter
+    def on_test_time_adaptation_start(self):
+        """
+        Prepare for test-time adaptation by setting up the zero-shot classification head.
+        All work is delegated to `setup_zero_shot_classification_head`.
+        NOTE(review): presumably that call provides the `clip_processor`,
+        `zeroshot_weights`, `visual_projection` and `logit_scale_exp` attributes
+        read elsewhere in this class - confirm against the mixin.
+        """
+        self.setup_zero_shot_classification_head()
+    def compute_logits(
+        self, module: CLIPVisionModel, batch: Tuple[Tensor, Tensor], task: str
+    ) -> Tensor:
+        """
+        Compute the per-image classification logits of a batch for one task.
+        Args:
+            module (CLIPVisionModel): The vision model applied to the images.
+            batch (Tuple[Tensor, Tensor]): A two-element batch; only the first element (the images) is used.
+            task (str): Key selecting the text embeddings in `self.zeroshot_weights`.
+        Returns:
+            Tensor: The scaled image/text similarity matrix, transposed so that images come first.
+        """
+        images, _unused = batch
+        class_embeds = self.zeroshot_weights[task]
+        # element 1 of the vision model output is passed through the projection layer
+        features = self.visual_projection(module(images)[1])
+        # scale every image embedding to unit L2 norm
+        features = features / features.norm(p=2, dim=-1, keepdim=True)
+        # similarity of each text embedding with each image, times the logit scale
+        text_vs_image = torch.matmul(class_embeds, features.t()) * self.logit_scale_exp
+        return text_vs_image.t()

fusion_bench/method/sparse_we_moe/sparse_we_moe.py ADDED Viewed

@@ -0,0 +1,301 @@
+import logging
+from abc import abstractmethod
+from typing import cast
+import lightning as L
+import lightning.fabric.wrappers
+import torch
+from lightning.pytorch.profilers import SimpleProfiler
+from omegaconf import DictConfig
+from torch import Tensor, nn
+from torch.utils.data import DataLoader
+from tqdm.autonotebook import tqdm
+from fusion_bench.compat.method import ModelFusionAlgorithm
+from fusion_bench.modelpool import BaseModelPool
+from fusion_bench.models.sparse_we_moe import (
+    SparseWeightEnsemblingMoE,
+    SparseWeightEnsemblingMoE_ShardGate,
+    _magnitude_prune,
+    _module_magnitude_prune,
+)
+from fusion_bench.utils import timeit_context
+from fusion_bench.utils.parameters import print_parameters
+log = logging.getLogger(__name__)
+def entropy_loss(logits: Tensor) -> Tensor:
+    """
+    Mean entropy of the softmax distributions defined by `logits`.
+    Args:
+        logits (Tensor): Logits; the softmax and the entropy sum run over the last dimension.
+    Returns:
+        Tensor: The entropy averaged over all remaining dimensions.
+    """
+    probabilities = logits.softmax(dim=-1)
+    # the small constant keeps the logarithm finite when a probability is zero
+    log_probabilities = (probabilities + 1e-8).log()
+    summed = (probabilities * log_probabilities).sum(dim=-1)
+    return -summed.mean()
+class SparseWeightEnsemblingMoEAlgorithm(ModelFusionAlgorithm):
+    _fabric: L.Fabric = None
+    modelpool: BaseModelPool = None
+    def __init__(self, algorithm_config: DictConfig):
+        """
+        Set up the algorithm and its profiler.
+        Args:
+            algorithm_config (DictConfig): The configuration for the algorithm; the
+                optional "cache_dir" entry (default "outputs") is passed to the profiler.
+        """
+        super().__init__(algorithm_config)
+        profiler_dir = self.config.get("cache_dir", "outputs")
+        self.profiler = SimpleProfiler(profiler_dir, "we_moe_profiler.txt")
+    @abstractmethod
+    def load_checkpoint(self, model, checkpoint):
+        """
+        Load the checkpoint file into the given model.
+        Marked abstract: the body here does nothing and subclasses are expected to
+        override it. `run` calls it with `self.config.checkpoint` when that
+        option is set, instead of running test-time adaptation.
+        Args:
+            model (nn.Module): The model to load the checkpoint into.
+            checkpoint (str): The path to the checkpoint file.
+        """
+        pass
+    @abstractmethod
+    def save_checkpoint(self, model, checkpoint):
+        """
+        Save the given model to a checkpoint file.
+        Marked abstract: the body here does nothing and subclasses are expected to
+        override it. `run` calls it after test-time adaptation with
+        `self.config.save_checkpoint` when that option is set.
+        Args:
+            model (nn.Module): The model to save the checkpoint from.
+            checkpoint (str): The path to the checkpoint file.
+        """
+        pass
+    @abstractmethod
+    def construct_moe_model(self) -> SparseWeightEnsemblingMoE:
+        """
+        Construct the Mixture of Experts model using the models in the model pool.
+        Marked abstract: subclasses are expected to override it. `run` uses this
+        builder when `self.config.shared_gate` is falsy.
+        Returns:
+            SparseWeightEnsemblingMoE: The constructed Mixture of Experts model.
+        """
+        pass
+    @abstractmethod
+    def construct_moe_model_sharedgate(self) -> SparseWeightEnsemblingMoE_ShardGate:
+        """
+        Construct the Mixture of Experts model with a shared gate using the models in the model pool.
+        Marked abstract: subclasses are expected to override it. `run` uses this
+        builder when `self.config.shared_gate` is truthy.
+        Returns:
+            SparseWeightEnsemblingMoE_ShardGate: The constructed Mixture of Experts model with shared gate.
+        """
+        pass
+    def on_test_time_adaptation_start(self):
+        """
+        Hook that is called at the start of test-time adaptation.
+        `test_time_adaptation` invokes it first, before the optimizer is
+        configured. The default implementation does nothing; subclasses may
+        override it to prepare whatever `compute_logits` needs.
+        """
+        pass
+    @abstractmethod
+    def get_shuffled_test_loader_iter(self, task: str) -> DataLoader:
+        """
+        Get an iterator for the shuffled test DataLoader for a specific task.
+        Marked abstract: subclasses are expected to override it.
+        `test_time_adaptation` calls this once per task on every step and applies
+        `next(...)` to the result, so the returned object must be an iterator
+        (despite the `DataLoader` return annotation).
+        NOTE(review): implementations presumably need to hand back the same
+        iterator for the same task on every call, otherwise each step would
+        rebuild the loader - confirm against the subclasses.
+        Args:
+            task (str): The task for which to get the DataLoader iterator.
+        Returns:
+            DataLoader: The DataLoader iterator for the specified task.
+        """
+        pass
+    @abstractmethod
+    def compute_logits(self, module, batch, task) -> Tensor:
+        """
+        Compute the logits for a given batch and task.
+        Marked abstract: subclasses are expected to override it.
+        `test_time_adaptation` asserts that the returned tensor is 2-dimensional
+        and feeds it to `entropy_loss`, which applies softmax over the last dimension.
+        Args:
+            module (nn.Module): The model module.
+            batch (Any): The input batch.
+            task (str): The task for which to compute the logits.
+        Returns:
+            Tensor: The computed logits.
+        """
+        pass
+    def dynamic_prune(self, module, prune_ratio):
+        """
+        Prune the trainable parameters of a module in place.
+        Each parameter with `requires_grad` set has its `.data` replaced by the
+        result of `_magnitude_prune`; parameters without it are left untouched.
+        Args:
+            module (nn.Module): The module to prune.
+            prune_ratio (float): The ratio passed on to `_magnitude_prune`.
+        """
+        trainable = (weight for weight in module.parameters() if weight.requires_grad)
+        for weight in trainable:
+            weight.data = _magnitude_prune(weight, prune_ratio)
+    def l1_regularization(self, module, l1_lambda):
+        """
+        Compute the L1 penalty over the trainable parameters of a module.
+        Args:
+            module (nn.Module): The module whose parameters with `requires_grad` set are penalised.
+            l1_lambda (float): The L1 regularization coefficient.
+        Returns:
+            Tensor: `l1_lambda` times the summed absolute parameter values. When the
+            module has no trainable parameter the result is the plain number `l1_lambda * 0`.
+        """
+        # start from the integer 0, exactly like the built-in `sum`
+        total = 0
+        for weight in module.parameters():
+            if not weight.requires_grad:
+                continue
+            total = total + weight.abs().sum()
+        return l1_lambda * total
+    def test_time_adaptation(self, module: SparseWeightEnsemblingMoE):
+        """
+        Perform test-time adaptation for the given module.
+        On every step one batch per task in `self.modelpool.model_names` is drawn,
+        the entropy of the logits is used as the loss, and the trainable
+        parameters are updated with Adam. With `config.use_grad_accumulate` the
+        backward pass runs per task and gradients accumulate; otherwise the task
+        losses are summed and a single backward pass is run.
+        Args:
+            module (SparseWeightEnsemblingMoE): The module to adapt.
+        Returns:
+            SparseWeightEnsemblingMoE: The adapted module (the Fabric-wrapped
+            module when `self._fabric` is set).
+        Raises:
+            ValueError: If `config.optimizer` is not "adam".
+        """
+        self.on_test_time_adaptation_start()
+        # configure optimizer
+        if self.config.optimizer != "adam":
+            raise ValueError(f"Unsupported optimizer: {self.config.optimizer}")
+        optimizer = torch.optim.Adam(
+            [p for p in module.parameters() if p.requires_grad], lr=self.config.lr
+        )
+        fabric = self._fabric
+        if fabric is not None:
+            module, optimizer = fabric.setup(module, optimizer)
+        def run_backward(loss):
+            # Honour the same `_fabric is None` guard as `setup` above, so the
+            # method also works without Fabric.
+            # NOTE(review): `retain_graph=True` is kept from the original code; it
+            # looks unnecessary unless graphs are shared across forward passes - confirm.
+            if fabric is not None:
+                fabric.backward(loss, retain_graph=True)
+            else:
+                loss.backward(retain_graph=True)
+        module.train()
+        fast_dev_run = self.config.get("fast_dev_run", False)
+        if fast_dev_run:
+            log.info("Running fast_dev_run, only one step")
+        # `max_steps` is only read when not in fast_dev_run mode
+        num_steps = 1 if fast_dev_run else self.config.max_steps
+        pbar = tqdm(
+            range(num_steps),
+            "Test-time adaptation",
+            dynamic_ncols=True,
+        )
+        for _ in pbar:
+            accumulate = self.config.use_grad_accumulate
+            total_loss = 0
+            for task in self.modelpool.model_names:
+                with self.profiler.profile("data time"):
+                    batch = next(self.get_shuffled_test_loader_iter(task))
+                with self.profiler.profile("forward pass"):
+                    logits = self.compute_logits(module, batch, task)
+                    assert (
+                        logits.dim() == 2
+                    ), f"Expected logits to be 2D, got {logits.dim()}"
+                    loss = entropy_loss(logits)
+                if accumulate:
+                    # .backward() accumulates when .zero_grad() wasn't called
+                    # this can save memory
+                    with self.profiler.profile("backward pass"):
+                        run_backward(loss)
+                else:
+                    total_loss = total_loss + loss
+            if not accumulate:
+                with self.profiler.profile("backward pass"):
+                    run_backward(total_loss)
+            with self.profiler.profile("optimizer step"):
+                optimizer.step()
+                optimizer.zero_grad()
+        return module
+    def construct_post_spare_gate_model(self, moe_model, gate_prune_ratio):
+        """
+        Replace the gate of every encoder layer by its magnitude-pruned version.
+        Args:
+            moe_model (SparseWeightEnsemblingMoE): The Mixture of Experts model; its
+                `vision_model.encoder.layers[i].mlp.gate` modules are replaced in place.
+            gate_prune_ratio (float): The ratio of parameters to prune in the gate.
+        Returns:
+            SparseWeightEnsemblingMoE: The same `moe_model` object, now with pruned gates.
+        """
+        encoder_layers = moe_model.vision_model.encoder.layers
+        for idx in range(len(encoder_layers)):
+            moe_mlp = encoder_layers[idx].mlp
+            # the layer index is forwarded to the pruning helper as its third argument
+            moe_mlp.gate = _module_magnitude_prune(moe_mlp.gate, gate_prune_ratio, idx)
+        return moe_model
+    def run(self, modelpool: BaseModelPool):
+        """
+        Fuse the models of the pool into a weight-ensembling MoE model.
+        The MoE model is built (with or without a shared gate), then either
+        restored from `config.checkpoint` or tuned by test-time adaptation,
+        optionally saved, optionally gate-pruned, and finally switched to
+        sample-wise routing by clearing `batch_reduce`.
+        Args:
+            modelpool (BaseModelPool): The model pool to use for the algorithm.
+        Returns:
+            SparseWeightEnsemblingMoE: The final Mixture of Experts model.
+        """
+        log.info("Fusing models using WeightEnsembling Mixture of Experts modules.")
+        self.modelpool = modelpool
+        with timeit_context("upscaling models to a weight-ensembling MoE model"):
+            build_model = (
+                self.construct_moe_model_sharedgate
+                if self.config.shared_gate
+                else self.construct_moe_model
+            )
+            moe_model = build_model()
+            print_parameters(moe_model)
+        if not self.config.get("checkpoint", False):
+            with self.profiler.profile("test-time adaptation"):
+                moe_model = self.test_time_adaptation(moe_model)
+            if self.config.get("save_checkpoint", False):
+                log.info(f"save checkpoint to {self.config.save_checkpoint}")
+                self.save_checkpoint(moe_model, self.config.save_checkpoint)
+            # strip the Fabric wrapper that test-time adaptation may have added
+            if lightning.fabric.wrappers.is_wrapped(moe_model):
+                moe_model = lightning.fabric.wrappers._unwrap_objects(moe_model)
+        else:
+            log.info(
+                f"load checkpoint from {self.config.checkpoint}, test-time adaptation will be skipped."
+            )
+            self.load_checkpoint(moe_model, self.config.checkpoint)
+        #  (post) sparse gate model
+        if self.config.post_sparse_gate:
+            moe_model = self.construct_post_spare_gate_model(
+                moe_model, self.config.gate_prune_ratio
+            )
+        # enable sample-wise adaptation
+        moe_model.batch_reduce = False
+        print(self.profiler.summary())
+        return moe_model

fusion_bench/method/sparselo/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ # flake8: noqa F401
2	+ from .sparselo import IterativeSparseLoForLlama, PCPSparseLoForLlama, SparseLoForLlama