fusion-bench 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. fusion_bench/method/__init__.py +11 -0
  2. fusion_bench/method/adamerging/flan_t5_layer_wise_adamerging.py +1 -1
  3. fusion_bench/method/adamerging/gpt2_layer_wise_adamerging.py +1 -1
  4. fusion_bench/method/base_algorithm.py +1 -0
  5. fusion_bench/method/dawe/dawe_for_clip.py +1 -1
  6. fusion_bench/method/depth_upscaling/depth_upscaling_for_llama.py +3 -2
  7. fusion_bench/method/expert_sparsity/__init__.py +10 -0
  8. fusion_bench/method/expert_sparsity/mixtral/__init__.py +23 -0
  9. fusion_bench/method/expert_sparsity/mixtral/dynamic_skipping.py +175 -0
  10. fusion_bench/method/expert_sparsity/mixtral/layer_wise_pruning.py +159 -0
  11. fusion_bench/method/expert_sparsity/mixtral/progressive_pruning.py +173 -0
  12. fusion_bench/method/expert_sparsity/utils/calibration_data.py +153 -0
  13. fusion_bench/method/gossip/flan_t5_layer_wise_gossip.py +1 -1
  14. fusion_bench/method/knots/__init__.py +0 -0
  15. fusion_bench/method/knots/knots_utils.py +23 -0
  16. fusion_bench/method/pwe_moe/module.py +2 -7
  17. fusion_bench/method/simple_average.py +3 -2
  18. fusion_bench/method/task_singular_vector/TSVM.py +238 -25
  19. fusion_bench/method/task_singular_vector/utils/TSVM_utils.py +52 -20
  20. fusion_bench/method/task_singular_vector/utils/__init__.py +1 -0
  21. fusion_bench/method/task_singular_vector/utils/task_singular_interference.py +41 -0
  22. fusion_bench/mixins/hydra_config.py +1 -1
  23. fusion_bench/mixins/lightning_fabric.py +25 -1
  24. fusion_bench/mixins/serialization.py +18 -2
  25. fusion_bench/modelpool/base_pool.py +1 -0
  26. fusion_bench/modelpool/causal_lm/causal_lm.py +8 -5
  27. fusion_bench/modelpool/clip_vision/modelpool.py +21 -13
  28. fusion_bench/models/__init__.py +1 -0
  29. fusion_bench/models/expert_sparsity/__init__.py +0 -0
  30. fusion_bench/models/expert_sparsity/mixtral/__init__.py +15 -0
  31. fusion_bench/models/expert_sparsity/mixtral/dataset.py +40 -0
  32. fusion_bench/models/expert_sparsity/mixtral/modeling_mixtral.py +207 -0
  33. fusion_bench/models/expert_sparsity/mixtral/wrapper.py +268 -0
  34. fusion_bench/models/parameter_dict.py +6 -1
  35. fusion_bench/programs/fabric_fusion_program.py +21 -13
  36. fusion_bench/taskpool/base_pool.py +1 -0
  37. fusion_bench/taskpool/dummy.py +6 -4
  38. fusion_bench/utils/__init__.py +4 -3
  39. fusion_bench/utils/dtype.py +2 -1
  40. fusion_bench/utils/fabric.py +11 -4
  41. fusion_bench/utils/{instantiate.py → instantiate_utils.py} +3 -0
  42. fusion_bench/utils/lazy_state_dict.py +80 -10
  43. fusion_bench/utils/pylogger.py +30 -0
  44. {fusion_bench-0.2.16.dist-info → fusion_bench-0.2.18.dist-info}/METADATA +3 -1
  45. {fusion_bench-0.2.16.dist-info → fusion_bench-0.2.18.dist-info}/RECORD +59 -38
  46. {fusion_bench-0.2.16.dist-info → fusion_bench-0.2.18.dist-info}/WHEEL +1 -1
  47. fusion_bench_config/fabric/loggers/mlflow_logger.yaml +2 -0
  48. fusion_bench_config/fabric_model_fusion.yaml +2 -2
  49. fusion_bench_config/method/expert_sparsity/README.md +6 -0
  50. fusion_bench_config/method/expert_sparsity/mixtral.yaml +17 -0
  51. fusion_bench_config/method/task_singular_vector/TaskSingularVectorMerging.yaml +2 -1
  52. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_cars_and_dtd.yaml +16 -0
  53. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_sun397_and_cars.yaml +16 -0
  54. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_sun397_and_dtd.yaml +16 -0
  55. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_sun397_cars_and_dtd.yaml +19 -0
  56. fusion_bench_config/taskpool/LMEvalHarnessTaskPool/lm_eval.yaml +0 -1
  57. {fusion_bench-0.2.16.dist-info → fusion_bench-0.2.18.dist-info}/entry_points.txt +0 -0
  58. {fusion_bench-0.2.16.dist-info → fusion_bench-0.2.18.dist-info}/licenses/LICENSE +0 -0
  59. {fusion_bench-0.2.16.dist-info → fusion_bench-0.2.18.dist-info}/top_level.txt +0 -0

fusion_bench/method/task_singular_vector/utils/task_singular_interference.py
@@ -0,0 +1,41 @@
+ from typing import List
+
+ import torch
+
+
+ def compute_task_singular_interference(weight_differences: List[torch.Tensor]) -> float:
+     R"""
+     Compute the singular interference of a list of weight differences $\{W_i - W_0\}_{i=1}^T$,
+     where $W_0$ is the pre-trained model weight, $W_i$ is the weight of the i-th fine-tuned model
+     and $T$ is the number of fine-tuned models.
+
+     Args:
+         weight_differences (List[torch.Tensor]): A list of weight differences $\{W_i - W_0\}_{i=1}^T$.
+
+     Returns:
+         float: The singular interference of the list of weight differences.
+     """
+     device = weight_differences[0].device
+     dtype = weight_differences[0].dtype
+
+     U = []
+     S = []
+     V = []
+     for delta_w in weight_differences:
+         u, s, vh = torch.linalg.svd(delta_w, full_matrices=False)
+         U.append(u)
+         S.append(s)
+         V.append(vh.t())
+     U = torch.cat(U, dim=0)
+     S = torch.cat(S, dim=0)
+     V = torch.cat(V, dim=0)
+
+     singular_task_interference = torch.linalg.multi_dot(
+         (
+             U.t() @ U - torch.eye(U.shape[1], device=device, dtype=dtype),
+             torch.diag(S),
+             V.t() @ V - torch.eye(V.shape[1], device=device, dtype=dtype),
+         )
+     )
+     singular_task_interference = torch.linalg.norm(singular_task_interference, ord="1")
+     return singular_task_interference
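
For reference, the quantity computed by the new helper (reading directly off the code above) can be written as

    \mathrm{interference} = \left\| \left(U^\top U - I\right)\, \operatorname{diag}(S)\, \left(V^\top V - I\right) \right\|_{1}

where $U$, $S$ and $V$ collect the left singular vectors, singular values and right singular vectors of the task vectors $\Delta W_i = W_i - W_0$, obtained via `torch.linalg.svd`.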

fusion_bench/mixins/hydra_config.py
@@ -9,7 +9,7 @@ from hydra import compose, initialize
  from omegaconf import DictConfig, OmegaConf

  from fusion_bench.utils import import_object, instantiate
- from fusion_bench.utils.instantiate import set_print_function_call
+ from fusion_bench.utils.instantiate_utils import set_print_function_call

  log = logging.getLogger(__name__)


fusion_bench/mixins/lightning_fabric.py
@@ -11,7 +11,7 @@ from lightning.fabric.utilities.rank_zero import rank_zero_only
  from omegaconf import DictConfig, OmegaConf

  from fusion_bench.utils import import_object
- from fusion_bench.utils.instantiate import instantiate
+ from fusion_bench.utils.instantiate_utils import instantiate

  if TYPE_CHECKING:
      import lightning.fabric.loggers.tensorboard
@@ -172,3 +172,27 @@ class LightningFabricMixin:
              return True
          else:
              return False
+
+     def log(self, name: str, value: Any, step: Optional[int] = None):
+         """
+         Logs the metric to the fabric's logger.
+         """
+         self.fabric.log(name, value, step=step)
+
+     def log_dict(self, metrics: dict, step: Optional[int] = None):
+         """
+         Logs the metrics to the fabric's logger.
+         """
+         self.fabric.log_dict(metrics, step=step)
+
+     def log_optimizer_lr(
+         self,
+         optimizer: torch.optim.Optimizer,
+         step: Optional[int] = None,
+         name_template: str = "train/lr_group_{0}",
+     ):
+         """
+         Logs the learning rate of the optimizer to the fabric's logger.
+         """
+         for i, param_group in enumerate(optimizer.param_groups):
+             self.fabric.log(name_template.format(i), param_group["lr"], step=step)
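
These helpers are thin wrappers around `fabric.log` and `fabric.log_dict`. A hedged sketch of how an algorithm built on the mixin might call them from a training loop; the class name, method name, metric names and the `loss`/`optimizer` objects are illustrative, not part of the diff:

    # Hedged sketch: using the new logging helpers from a class that mixes in
    # LightningFabricMixin. Names below are placeholders for illustration.
    import torch

    from fusion_bench.mixins.lightning_fabric import LightningFabricMixin


    class MyFabricAlgorithm(LightningFabricMixin):
        def on_train_step_end(self, loss: torch.Tensor, optimizer: torch.optim.Optimizer, step: int):
            # a single scalar metric
            self.log("train/loss", loss.item(), step=step)
            # several metrics at once
            self.log_dict({"train/loss": loss.item()}, step=step)
            # one entry per parameter group: train/lr_group_0, train/lr_group_1, ...
            self.log_optimizer_lr(optimizer, step=step)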

fusion_bench/mixins/serialization.py
@@ -4,13 +4,14 @@ from typing import Dict, Optional, Union

  from omegaconf import OmegaConf

- from fusion_bench.utils import instantiate
+ from fusion_bench.utils import import_object, instantiate

  log = logging.getLogger(__name__)


  class YAMLSerializationMixin:
      _recursive_: bool = False
+     _config_key: Optional[str] = None
      _config_mapping: Dict[str, str] = {
          "_recursive_": "_recursive_",
      }
@@ -99,7 +100,22 @@ class YAMLSerializationMixin:
              BaseModelPool: The loaded model pool.
          """
          config = OmegaConf.load(path)
-         return instantiate(config, _recursive_=cls._recursive_)
+         if cls._config_key is not None and cls._config_key in config:
+             config = config[cls._config_key]
+         target_cls = import_object(config["_target_"])
+         if target_cls != cls:
+             log.warning(
+                 f"The class {target_cls.__name__} is not the same as the class {cls.__name__}. "
+                 f"Instantiating the class {target_cls.__name__} instead."
+             )
+         return instantiate(
+             config,
+             _recursive_=(
+                 cls._recursive_
+                 if config.get("_recursive_") is None
+                 else config.get("_recursive_")
+             ),
+         )

      def to_config(self):
          """

fusion_bench/modelpool/base_pool.py
@@ -29,6 +29,7 @@ class BaseModelPool(BaseYAMLSerializableModel, HydraConfigMixin):
      """

      _program = None
+     _config_key = "modelpool"
      _models: Union[DictConfig, Dict[str, nn.Module]]
      _config_mapping = BaseYAMLSerializableModel._config_mapping | {
          "_models": "models",

fusion_bench/modelpool/causal_lm/causal_lm.py
@@ -141,6 +141,7 @@ class CausalLMPool(BaseModelPool):
          model_dtype: Optional[str] = None,
          save_tokenizer: bool = False,
          tokenizer_kwargs=None,
+         tokenizer: Optional[PreTrainedTokenizer] = None,
          **kwargs,
      ):
          """
@@ -154,11 +155,13 @@
              **kwargs: Additional keyword arguments passed to the `save_pretrained` method.
          """
          path = os.path.expanduser(path)
-         if save_tokenizer:
-             if tokenizer_kwargs is None:
-                 tokenizer_kwargs = {}
-             # load the tokenizer
-             tokenizer = self.load_tokenizer(**tokenizer_kwargs)
+         # NOTE: if tokenizer is provided, it will be saved regardless of `save_tokenizer`
+         if save_tokenizer or tokenizer is not None:
+             if tokenizer is None:
+                 if tokenizer_kwargs is None:
+                     tokenizer_kwargs = {}
+                 # load the tokenizer
+                 tokenizer = self.load_tokenizer(**tokenizer_kwargs)
          tokenizer.save_pretrained(
              path,
              push_to_hub=push_to_hub,
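
In practice this means a caller that already holds a tokenizer in memory can pass it to the save call directly, and it is written out even when `save_tokenizer` is left at its default. A hedged sketch, assuming the enclosing method in the hunk is the pool's `save_model` entry point (its name and full signature are not visible in the diff); paths and model names are placeholders:

    # Hedged sketch: saving a model together with an in-memory tokenizer.
    # `save_model`, the config path and the model name are illustrative assumptions;
    # only the `tokenizer=` behaviour comes from the diff above.
    from transformers import AutoTokenizer

    from fusion_bench.modelpool.causal_lm.causal_lm import CausalLMPool

    pool = CausalLMPool.from_yaml("path/to/causal_lm_pool.yaml")
    model = pool.load_model("expert_a")
    tokenizer = AutoTokenizer.from_pretrained("some-org/base-model")

    # When `tokenizer` is passed, it is saved regardless of `save_tokenizer`,
    # and `load_tokenizer(**tokenizer_kwargs)` is skipped.
    pool.save_model(model, "outputs/merged-model", tokenizer=tokenizer)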

fusion_bench/modelpool/clip_vision/modelpool.py
@@ -3,6 +3,7 @@ from copy import deepcopy
  from typing import Optional, Union

  from datasets import load_dataset
+ from lightning.fabric.utilities import rank_zero_only
  from omegaconf import DictConfig, open_dict
  from torch import nn
  from torch.utils.data import Dataset
@@ -40,7 +41,8 @@ class CLIPVisionModelPool(BaseModelPool):
      def load_processor(self, *args, **kwargs) -> CLIPProcessor:
          assert self._processor is not None, "Processor is not defined in the config"
          if isinstance(self._processor, str):
-             log.info(f"Loading `transformers.CLIPProcessor`: {self._processor}")
+             if rank_zero_only.rank == 0:
+                 log.info(f"Loading `transformers.CLIPProcessor`: {self._processor}")
              processor = CLIPProcessor.from_pretrained(self._processor)
          else:
              processor = instantiate(self._processor, *args, **kwargs)
@@ -50,7 +52,8 @@
          model_config = self._models[model_name]

          if isinstance(model_config, str):
-             log.info(f"Loading `transformers.CLIPModel`: {model_config}")
+             if rank_zero_only.rank == 0:
+                 log.info(f"Loading `transformers.CLIPModel`: {model_config}")
              clip_model = CLIPModel.from_pretrained(model_config, *args, **kwargs)
              return clip_model
          else:
@@ -102,10 +105,12 @@
      ):
          model = self._models[model_name_or_config]
          if isinstance(model, str):
-             log.info(f"Loading `transformers.CLIPVisionModel`: {model}")
+             if rank_zero_only.rank == 0:
+                 log.info(f"Loading `transformers.CLIPVisionModel`: {model}")
              return CLIPVisionModel.from_pretrained(model, *args, **kwargs)
          if isinstance(model, nn.Module):
-             log.info(f"Returning existing model: {model}")
+             if rank_zero_only.rank == 0:
+                 log.info(f"Returning existing model: {model}")
              return model

          # If the model is not a string, we use the default load_model method
@@ -114,9 +119,10 @@
      def load_train_dataset(self, dataset_name: str, *args, **kwargs):
          dataset_config = self._train_datasets[dataset_name]
          if isinstance(dataset_config, str):
-             log.info(
-                 f"Loading train dataset using `datasets.load_dataset`: {dataset_config}"
-             )
+             if rank_zero_only.rank == 0:
+                 log.info(
+                     f"Loading train dataset using `datasets.load_dataset`: {dataset_config}"
+                 )
              dataset = load_dataset(dataset_config, split="train")
          else:
              dataset = super().load_train_dataset(dataset_name, *args, **kwargs)
@@ -125,9 +131,10 @@
      def load_val_dataset(self, dataset_name: str, *args, **kwargs):
          dataset_config = self._val_datasets[dataset_name]
          if isinstance(dataset_config, str):
-             log.info(
-                 f"Loading validation dataset using `datasets.load_dataset`: {dataset_config}"
-             )
+             if rank_zero_only.rank == 0:
+                 log.info(
+                     f"Loading validation dataset using `datasets.load_dataset`: {dataset_config}"
+                 )
              dataset = load_dataset(dataset_config, split="validation")
          else:
              dataset = super().load_val_dataset(dataset_name, *args, **kwargs)
@@ -136,9 +143,10 @@
      def load_test_dataset(self, dataset_name: str, *args, **kwargs):
          dataset_config = self._test_datasets[dataset_name]
          if isinstance(dataset_config, str):
-             log.info(
-                 f"Loading test dataset using `datasets.load_dataset`: {dataset_config}"
-             )
+             if rank_zero_only.rank == 0:
+                 log.info(
+                     f"Loading test dataset using `datasets.load_dataset`: {dataset_config}"
+                 )
              dataset = load_dataset(dataset_config, split="test")
          else:
              dataset = super().load_test_dataset(dataset_name, *args, **kwargs)
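
All of the hunks above apply the same change: each `log.info` call is wrapped in a rank-zero guard so that multi-process Fabric launches print every "Loading ..." message only once. A minimal standalone sketch of the pattern outside fusion_bench; the function and its argument are illustrative:

    # Minimal sketch of the rank-zero logging guard used in the hunks above.
    import logging

    from lightning.fabric.utilities import rank_zero_only

    log = logging.getLogger(__name__)


    def load_something(name: str):
        # Only the process with global rank 0 logs; other ranks stay silent,
        # which avoids duplicated log lines under multi-process launches.
        if rank_zero_only.rank == 0:
            log.info(f"Loading: {name}")
        # ... the actual loading still happens on every rank ...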

fusion_bench/models/__init__.py
@@ -1,3 +1,4 @@
  # flake8: noqa F401
  from . import separate_io, utils
  from .parameter_dict import ParameterDictModel
+ from fusion_bench.utils import LazyStateDict

fusion_bench/models/expert_sparsity/__init__.py
File without changes

fusion_bench/models/expert_sparsity/mixtral/__init__.py
@@ -0,0 +1,15 @@
+ R"""
+ Copy from https://github.com/Lucky-Lance/Expert_Sparsity/tree/main/model
+
+ Original repo: https://github.com/Lucky-Lance/Expert_Sparsity
+
+ Reference:
+     Not All Experts are Equal: Efficient Expert Pruning and Skipping for Mixture-of-Experts Large Language Models.
+     ACL 2024.
+     http://arxiv.org/abs/2402.14800
+ """
+
+ from .wrapper import (
+     PrunableMixtralSparseMoeBlockWrapper,
+     DynamicSkippingMixtralSparseMoeBlockWrapper,
+ )

fusion_bench/models/expert_sparsity/mixtral/dataset.py
@@ -0,0 +1,40 @@
+ import torch
+
+
+ class CacheDataset(torch.utils.data.Dataset):
+     def __init__(self):
+         self.alphas = []  # logits
+         self.Xs = []  # input hidden states
+         self.Zs = []  # output hidden states
+         self.prepared = False
+
+     def __len__(self):
+         if not self.prepared:
+             self.prepare_for_loader()
+         return len(self.alphas)
+
+     def __getitem__(self, index):
+         if not self.prepared:
+             self.prepare_for_loader()
+         if isinstance(index, list):
+             return [(self.alphas[idx], self.Xs[idx], self.Zs[idx]) for idx in index]
+         elif isinstance(index, int):
+             return self.alphas[index], self.Xs[index], self.Zs[index]
+
+     def append(self, alpha=None, X=None, Z=None):
+         if alpha is not None:
+             self.alphas.append(alpha.detach().to("cpu", non_blocking=True))
+         if X is not None:
+             self.Xs.append(X.detach().to("cpu", non_blocking=True))
+         if Z is not None:
+             self.Zs.append(Z.detach().to("cpu", non_blocking=True))
+         self.prepared = False
+
+     def prepare_for_loader(self):
+         if self.prepared:
+             return
+         self.prepared = True
+         self.alphas = torch.concat(self.alphas)
+         self.Xs = torch.concat(self.Xs)
+         self.Zs = torch.concat(self.Zs)
+         assert len(self.Xs) == len(self.Zs)
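
`CacheDataset` buffers (router logits, input hidden states, output hidden states) triples on the CPU while calibration data flows through a MoE block, then concatenates the buffers lazily on first access so the result can be read back through a `DataLoader`. A hedged sketch of that flow with placeholder shapes:

    # Hedged sketch: filling a CacheDataset and reading it back via a DataLoader.
    # Shapes and the number of "batches" are placeholders; in fusion_bench the
    # triples would come from hooks on a Mixtral sparse-MoE block during calibration.
    import torch
    from torch.utils.data import DataLoader

    from fusion_bench.models.expert_sparsity.mixtral.dataset import CacheDataset

    cache = CacheDataset()
    for _ in range(4):
        alpha = torch.randn(8, 8)   # router logits for 8 tokens, 8 experts
        X = torch.randn(8, 32)      # block inputs (hidden states)
        Z = torch.randn(8, 32)      # block outputs (hidden states)
        cache.append(alpha=alpha, X=X, Z=Z)

    # __len__/__getitem__ trigger prepare_for_loader(), which torch.concat's the
    # buffered lists into single tensors along the token dimension.
    loader = DataLoader(cache, batch_size=16, shuffle=False)
    for alphas, xs, zs in loader:
        print(alphas.shape, xs.shape, zs.shape)  # (16, 8), (16, 32), (16, 32)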

fusion_bench/models/expert_sparsity/mixtral/modeling_mixtral.py
@@ -0,0 +1,207 @@
+ import warnings
+ from typing import Optional, Tuple
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+ from transformers.models.mixtral.modeling_mixtral import (
+     MixtralBlockSparseTop2MLP,
+     MixtralConfig,
+     MixtralRMSNorm,
+     MixtralSparseMoeBlock,
+ )
+
+
+ class DynamicSkippingMixtralSparseMoeBlock(nn.Module):
+     """
+     This implementation is
+     strictly equivalent to standard MoE with full capacity (no
+     dropped tokens). It's faster since it formulates MoE operations
+     in terms of block-sparse operations to accomodate imbalanced
+     assignments of tokens to experts, whereas standard MoE either
+     (1) drop tokens at the cost of reduced performance or (2) set
+     capacity factor to number of experts and thus waste computation
+     and memory on padding.
+     """
+
+     def __init__(self, config, beta):
+         super().__init__()
+         self.hidden_dim = config.hidden_size
+         self.ffn_dim = config.intermediate_size
+         self.num_experts = config.num_local_experts
+         self.top_k = config.num_experts_per_tok
+
+         # gating
+         self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
+
+         self.experts = nn.ModuleList(
+             [MixtralBlockSparseTop2MLP(config) for _ in range(self.num_experts)]
+         )
+
+         self.beta = beta
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         """ """
+         batch_size, sequence_length, hidden_dim = hidden_states.shape
+         hidden_states = hidden_states.view(-1, hidden_dim)
+         # router_logits: (batch * sequence_length, n_experts)
+         router_logits = self.gate(hidden_states)
+
+         routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+         routing_weights, selected_experts = torch.topk(
+             routing_weights, self.top_k, dim=-1
+         )
+
+         onlytop1_mask = (
+             routing_weights[:, 1] < self.beta * routing_weights[:, 0]
+         )  # bz x seqlen
+
+         # routing_weights[tokens_top1, 1].fill_(0)
+         routing_weights[onlytop1_mask, 1] = 0
+         routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+         # we cast back to the input dtype
+         routing_weights = routing_weights.to(hidden_states.dtype)
+
+         final_hidden_states = torch.zeros(
+             (batch_size * sequence_length, hidden_dim),
+             dtype=hidden_states.dtype,
+             device=hidden_states.device,
+         )
+
+         # One hot encode the selected experts to create an expert mask
+         # this will be used to easily index which expert is going to be sollicitated
+         expert_mask = torch.nn.functional.one_hot(
+             selected_experts, num_classes=self.num_experts
+         )
+         # ipdb.set_trace()
+         # expert_mask[tokens_top1, 1, :].fill_(0)
+         expert_mask[onlytop1_mask, 1, :] = 0
+         expert_mask = expert_mask.permute(2, 1, 0)
+
+         # Loop over all available experts in the model and perform the computation on each expert
+         for expert_idx in range(self.num_experts):
+             expert_layer = self.experts[expert_idx]
+             idx, top_x = torch.where(expert_mask[expert_idx])
+
+             if top_x.shape[0] == 0:
+                 continue
+
+             # in torch it is faster to index using lists than torch tensors
+             top_x_list = top_x.tolist()
+             idx_list = idx.tolist()
+
+             # Index the correct hidden states and compute the expert hidden state for
+             # the current expert. We need to make sure to multiply the output hidden
+             # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
+             current_state = hidden_states[None, top_x_list].reshape(-1, hidden_dim)
+             current_hidden_states = expert_layer(
+                 current_state, routing_weights[top_x_list, idx_list, None]
+             )
+
+             # However `index_add_` only support torch tensors for indexing so we'll use
+             # the `top_x` tensor here.
+             final_hidden_states.index_add_(
+                 0, top_x, current_hidden_states.to(hidden_states.dtype)
+             )
+         final_hidden_states = final_hidden_states.reshape(
+             batch_size, sequence_length, hidden_dim
+         )
+         return final_hidden_states, router_logits
+
+
+ class MixtralDecoderLayer(nn.Module):
+     def __init__(self, config: MixtralConfig, layer_idx: int):
+         super().__init__()
+         self.hidden_size = config.hidden_size
+
+         self.self_attn = ALL_ATTENTION_FUNCTIONS[config._attn_implementation](
+             config, layer_idx
+         )
+         if hasattr(config, "betas"):
+             assert (
+                 isinstance(config.betas, dict)
+                 and len(config.betas) == config.num_hidden_layers
+             )
+             self.block_sparse_moe = DynamicSkippingMixtralSparseMoeBlock(
+                 config, config.betas[str(layer_idx)]
+             )
+             warnings.warn(
+                 f"Using online drop: {layer_idx}, {config.betas[str(layer_idx)]}, {type(self.block_sparse_moe)}"
+             )
+         else:
+             self.block_sparse_moe = MixtralSparseMoeBlock(config)
+         self.input_layernorm = MixtralRMSNorm(
+             config.hidden_size, eps=config.rms_norm_eps
+         )
+         self.post_attention_layernorm = MixtralRMSNorm(
+             config.hidden_size, eps=config.rms_norm_eps
+         )
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_value: Optional[Tuple[torch.Tensor]] = None,
+         output_attentions: Optional[bool] = False,
+         output_router_logits: Optional[bool] = False,
+         use_cache: Optional[bool] = False,
+         **kwargs,
+     ) -> Tuple[
+         torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
+     ]:
+         if "padding_mask" in kwargs:
+             warnings.warn(
+                 "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+             )
+         """
+         Args:
+             hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+             attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                 `(batch, sequence_length)` where padding elements are indicated by 0.
+             past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+             output_attentions (`bool`, *optional*):
+                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                 returned tensors for more detail.
+             output_router_logits (`bool`, *optional*):
+                 Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
+                 should not be returned during inference.
+             use_cache (`bool`, *optional*):
+                 If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                 (see `past_key_values`).
+         """
+
+         residual = hidden_states
+
+         hidden_states = self.input_layernorm(hidden_states)
+
+         # Self Attention
+         hidden_states, self_attn_weights, present_key_value = self.self_attn(
+             hidden_states=hidden_states,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_value=past_key_value,
+             output_attentions=output_attentions,
+             use_cache=use_cache,
+         )
+         hidden_states = residual + hidden_states
+
+         # Fully Connected
+         residual = hidden_states
+         hidden_states = self.post_attention_layernorm(hidden_states)
+         hidden_states, router_logits = self.block_sparse_moe(hidden_states)
+         hidden_states = residual + hidden_states
+
+         outputs = (hidden_states,)
+
+         if output_attentions:
+             outputs += (self_attn_weights,)
+
+         if use_cache:
+             outputs += (present_key_value,)
+
+         if output_router_logits:
+             outputs += (router_logits,)
+
+         return outputs
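
The difference from the stock `MixtralSparseMoeBlock` is the `beta` threshold in `forward`: after top-2 routing, a token whose second routing weight falls below `beta` times its top weight keeps only its top-1 expert; the second weight is zeroed and the remaining weights are renormalised. A small numeric sketch of just that skipping rule, with made-up routing weights:

    # Hedged sketch of the dynamic-skipping rule in isolation (not the full MoE
    # forward pass). The routing weights and the beta value are made-up numbers.
    import torch

    beta = 0.5
    # Top-2 routing weights per token, already sorted: column 0 is the best expert.
    routing_weights = torch.tensor(
        [
            [0.60, 0.40],  # 0.40 >= 0.5 * 0.60 -> keep both experts
            [0.80, 0.20],  # 0.20 <  0.5 * 0.80 -> skip the second expert
        ]
    )

    onlytop1_mask = routing_weights[:, 1] < beta * routing_weights[:, 0]
    routing_weights[onlytop1_mask, 1] = 0
    routing_weights /= routing_weights.sum(dim=-1, keepdim=True)

    print(onlytop1_mask)    # tensor([False,  True])
    print(routing_weights)  # tensor([[0.6000, 0.4000], [1.0000, 0.0000]])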