fusion-bench 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. fusion_bench/__init__.py +6 -0
  2. fusion_bench/constants/banner.py +12 -0
  3. fusion_bench/method/__init__.py +11 -0
  4. fusion_bench/method/expert_sparsity/__init__.py +10 -0
  5. fusion_bench/method/expert_sparsity/mixtral/__init__.py +23 -0
  6. fusion_bench/method/expert_sparsity/mixtral/dynamic_skipping.py +175 -0
  7. fusion_bench/method/expert_sparsity/mixtral/layer_wise_pruning.py +159 -0
  8. fusion_bench/method/expert_sparsity/mixtral/progressive_pruning.py +173 -0
  9. fusion_bench/method/expert_sparsity/utils/calibration_data.py +153 -0
  10. fusion_bench/method/knots/__init__.py +0 -0
  11. fusion_bench/method/knots/knots_utils.py +23 -0
  12. fusion_bench/method/linear/simple_average_for_llama.py +17 -3
  13. fusion_bench/method/simple_average.py +10 -0
  14. fusion_bench/method/task_singular_vector/utils/__init__.py +1 -0
  15. fusion_bench/method/task_singular_vector/utils/task_singular_interference.py +41 -0
  16. fusion_bench/modelpool/causal_lm/causal_lm.py +45 -11
  17. fusion_bench/models/__init__.py +1 -0
  18. fusion_bench/models/expert_sparsity/__init__.py +0 -0
  19. fusion_bench/models/expert_sparsity/mixtral/__init__.py +15 -0
  20. fusion_bench/models/expert_sparsity/mixtral/dataset.py +40 -0
  21. fusion_bench/models/expert_sparsity/mixtral/modeling_mixtral.py +207 -0
  22. fusion_bench/models/expert_sparsity/mixtral/wrapper.py +268 -0
  23. fusion_bench/programs/fabric_fusion_program.py +12 -8
  24. fusion_bench/tasks/clip_classification/imagenet.py +1008 -2004
  25. fusion_bench/utils/__init__.py +3 -2
  26. fusion_bench/utils/dtype.py +2 -1
  27. fusion_bench/utils/fabric.py +11 -4
  28. fusion_bench/utils/lazy_state_dict.py +155 -13
  29. fusion_bench/utils/misc.py +19 -1
  30. fusion_bench/utils/pylogger.py +2 -0
  31. {fusion_bench-0.2.17.dist-info → fusion_bench-0.2.19.dist-info}/METADATA +1 -1
  32. {fusion_bench-0.2.17.dist-info → fusion_bench-0.2.19.dist-info}/RECORD +40 -21
  33. fusion_bench_config/fabric/loggers/mlflow_logger.yaml +2 -0
  34. fusion_bench_config/method/expert_sparsity/README.md +6 -0
  35. fusion_bench_config/method/expert_sparsity/mixtral.yaml +17 -0
  36. fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-1.5B_math_and_coder.yaml +11 -0
  37. {fusion_bench-0.2.17.dist-info → fusion_bench-0.2.19.dist-info}/WHEEL +0 -0
  38. {fusion_bench-0.2.17.dist-info → fusion_bench-0.2.19.dist-info}/entry_points.txt +0 -0
  39. {fusion_bench-0.2.17.dist-info → fusion_bench-0.2.19.dist-info}/licenses/LICENSE +0 -0
  40. {fusion_bench-0.2.17.dist-info → fusion_bench-0.2.19.dist-info}/top_level.txt +0 -0
fusion_bench/method/expert_sparsity/utils/calibration_data.py
@@ -0,0 +1,153 @@
+ """
+ This module contains the code for loading the calibration data.
+
+ - C4
+ - Math
+ """
+
+ import itertools
+ import logging
+ import os
+
+ import torch
+ import transformers
+ from datasets import load_dataset
+ from transformers import PreTrainedTokenizer, default_data_collator
+ from transformers.testing_utils import CaptureLogger
+ from huggingface_hub import hf_hub_download
+
+ logger = logging.getLogger(__name__)
+
+
+ DATASETS = {
+     # C4: Please download first part of the C4 training data `c4-train.00000-of-01024.json` from [allenai/c4](https://huggingface.co/datasets/allenai/c4/blob/main/en/c4-train.00000-of-01024.json.gz).
+     "c4": lambda: load_dataset(
+         "json",
+         data_files={
+             "train": hf_hub_download(
+                 "allenai/c4",
+                 filename="en/c4-train.00000-of-01024.json.gz",
+                 repo_type="dataset",
+             )
+         },
+     ),
+     # MATH: You can use our pre-built calibration set in `./data/math_pretrain_style.json`. To reproduce our construction, please download the training set of [MATH](https://github.com/hendrycks/math) and use our [script](data/math_calib_construction.py).
+     # NOTE: I have uploaded the math_pretrain_style.json to my huggingface repo:
+     # https://huggingface.co/datasets/tanganke/math_pretrain_style/tree/main.
+     "math": lambda: load_dataset(
+         "json",
+         data_files={
+             "train": hf_hub_download(
+                 "tanganke/math_pretrain_style",
+                 filename="math_pretrain_style.json",
+                 repo_type="dataset",
+             )
+         },
+     ),
+ }
+
+
+ def build_calib_loader(
+     dataset: str,
+     tokenizer: PreTrainedTokenizer,
+     max_block_size: int,
+     n_blocks_for_stat: int,
+     batch_size: int,
+     num_workers: int,
+     seed: int = 42,
+ ):
+     # dataset can be a string or a dataset object.
+     # If it is a string, it can be the name of the dataset in DATASETS or the path to the dataset (a json file).
+     if isinstance(dataset, str):
+         if dataset in DATASETS:
+             all_set = DATASETS[dataset]()
+         else:
+             assert os.path.exists(dataset), f"Dataset {dataset} not found."
+             all_set = load_dataset("json", data_files={"train": dataset})
+     else:
+         assert dataset is not None, "Dataset is not provided."
+         all_set = dataset
+
+     block_size = tokenizer.model_max_length
+     if block_size > max_block_size:
+         logger.info(
+             "The chosen tokenizer supports a `model_max_length` that is longer than the default `max_block_size` value"
+             f" of {max_block_size}. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can"
+             " override this default with `--max_block_size xxx`."
+         )
+         block_size = max_block_size
+
+     if n_blocks_for_stat > 0:  # Random choose `n_blocks_for_stat` blocks
+         calib_set = (
+             all_set["train"]
+             .shuffle(seed=seed)
+             .select(range(min(n_blocks_for_stat * 16, len(all_set["train"]))))
+         )
+     else:  # Use the whole set
+         logger.warning("n_blocks_for_stat <= 0, using the whole dataset.")
+         calib_set = all_set["train"].shuffle(seed=seed)
+
+     logger.info(f"Calibration dataset: {calib_set}")
+     text_column_name = (
+         "text" if "text" in calib_set.features else list(calib_set.features)[0]
+     )
+
+     tok_logger = transformers.utils.logging.get_logger(
+         "transformers.tokenization_utils_base"
+     )
+
+     def tokenize_function(examples):
+         with CaptureLogger(tok_logger) as cl:
+             output = tokenizer(examples[text_column_name])
+         # clm input could be much much longer than block_size
+         if "Token indices sequence length is longer than the" in cl.out:
+             tok_logger.warning(
+                 "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
+                 " before being passed to the model."
+             )
+         return output
+
+     tokenized_calib_set = calib_set.map(
+         tokenize_function,
+         batched=True,
+         remove_columns=list(calib_set.features),
+     )
+
+     def group_texts(examples):
+         # Concatenate all texts.
+         concatenated_examples = {
+             k: list(itertools.chain(*examples[k])) for k in examples.keys()
+         }
+         total_length = len(concatenated_examples[list(examples.keys())[0]])
+         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+         # customize this part to your needs.
+         if total_length >= block_size:
+             total_length = (total_length // block_size) * block_size
+         # Split by chunks of max_len.
+         result = {
+             k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+             for k, t in concatenated_examples.items()
+         }
+         result["labels"] = result["input_ids"].copy()
+         return result
+
+     lm_calib_set = tokenized_calib_set.map(
+         group_texts,
+         batched=True,
+     )
+
+     if n_blocks_for_stat > 0:
+         assert len(lm_calib_set) > n_blocks_for_stat
+         lm_calib_set = lm_calib_set.select(range(n_blocks_for_stat))
+
+     calib_loader = torch.utils.data.DataLoader(
+         lm_calib_set,
+         batch_size=batch_size,
+         num_workers=num_workers,
+         pin_memory=True,
+         drop_last=False,
+         shuffle=False,
+         collate_fn=default_data_collator,
+     )
+
+     return calib_loader
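The new `build_calib_loader` returns a DataLoader of fixed-length token blocks used to collect expert statistics. Below is a minimal usage sketch, not part of the package: the module path is inferred from the file list above, and the tokenizer name, block size, and batch settings are placeholders.

from transformers import AutoTokenizer

from fusion_bench.method.expert_sparsity.utils.calibration_data import (
    build_calib_loader,
)

# Placeholder tokenizer; any causal LM tokenizer is handled the same way here.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-v0.1")

calib_loader = build_calib_loader(
    dataset="math",         # a key of DATASETS ("c4" or "math"), a json path, or a dataset object
    tokenizer=tokenizer,
    max_block_size=2048,    # blocks use min(tokenizer.model_max_length, max_block_size)
    n_blocks_for_stat=128,  # number of fixed-length blocks kept for statistics
    batch_size=4,
    num_workers=2,
)

for batch in calib_loader:
    # default_data_collator yields dicts with "input_ids", "attention_mask", and "labels"
    print(batch["input_ids"].shape)  # (batch_size, block_size)
    break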
fusion_bench/method/knots/__init__.py (file without changes)
fusion_bench/method/knots/knots_utils.py
@@ -0,0 +1,23 @@
+ import torch
+
+
+ def subspace_alignment(
+     delta_weights: list[torch.Tensor],
+     svd_dtype: torch.dtype | None = torch.float64,
+     eps: float = 1e-4,
+ ):
+     """
+     Reference: Model merging with SVD to tie the Knots. http://arxiv.org/abs/2410.19735
+     """
+     if svd_dtype is None:
+         svd_dtype = delta_weights[0].dtype
+     original_dtype = delta_weights[0].dtype
+     output_dim, input_dim = delta_weights[0].size()
+     concat_task_vector = torch.cat(delta_weights, dim=1)
+     U, S, Vh = torch.linalg.svd(concat_task_vector.to(svd_dtype), full_matrices=False)
+     # Keep only supported basis components
+     U = U[:, S > eps].to(original_dtype)
+     Vh = Vh[S > eps].to(original_dtype)
+     S = S[S > eps].to(original_dtype)
+     Vhs = torch.split(Vh, input_dim, dim=1)
+     return U, S, Vhs
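For context, `subspace_alignment` concatenates the per-task weight deltas along the column dimension, takes a joint SVD, and drops directions whose singular values fall below `eps`. A minimal calling sketch, with made-up layer shapes:

import torch

from fusion_bench.method.knots.knots_utils import subspace_alignment

# Two hypothetical task vectors (W_i - W_0) for the same 768 x 768 linear layer.
delta_weights = [torch.randn(768, 768) for _ in range(2)]

U, S, Vhs = subspace_alignment(delta_weights, svd_dtype=torch.float64, eps=1e-4)

# U is the shared left basis (768 x k), S holds the k retained singular values,
# and Vhs contains one (k x 768) right-factor block per task vector.
print(U.shape, S.shape, len(Vhs), Vhs[0].shape)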
fusion_bench/method/linear/simple_average_for_llama.py
@@ -1,4 +1,5 @@
- from typing import Optional
+ from copy import deepcopy
+ from typing import TYPE_CHECKING, Optional

  from typing_extensions import override

@@ -6,6 +7,11 @@ from fusion_bench import timeit_context
  from fusion_bench.method.base_algorithm import BaseAlgorithm
  from fusion_bench.method.simple_average import SimpleAverageAlgorithm
  from fusion_bench.modelpool import CausalLMBackbonePool, CausalLMPool
+ from fusion_bench.utils.pylogger import getRankZeroLogger
+ from omegaconf import flag_override
+ from fusion_bench.utils import instantiate
+
+ log = getRankZeroLogger(__name__)


  class SimpleAverageForLlama(BaseAlgorithm):
@@ -40,12 +46,20 @@ class SimpleAverageForLlama(BaseAlgorithm):

          if self.merge_backbone:
              assert modelpool.has_pretrained
-             backbone_modelpool = CausalLMBackbonePool(**modelpool.config)
+             log.info(
+                 "Merging backbone of the model pool, use CausalLMBackbonePool instead of CausalLMPool."
+             )
+             modelpool_config = deepcopy(modelpool.config)
+             with flag_override(modelpool_config, "allow_objects", True):
+                 modelpool_config._target_ = (
+                     "fusion_bench.modelpool.causal_lm.CausalLMBackbonePool"
+                 )
+             backbone_modelpool = instantiate(modelpool_config)
              model = modelpool.load_model("_pretrained_")
              backbone_model = SimpleAverageAlgorithm().run(backbone_modelpool)
              model.model.layers = backbone_model
          else:
-             model = SimpleAverageAlgorithm().run()
+             model = SimpleAverageAlgorithm().run(modelpool=modelpool)

          if self.model_save_path is not None:
              with timeit_context(f"Saving the model to {self.model_save_path}"):
fusion_bench/method/simple_average.py
@@ -8,6 +8,7 @@ from torch import nn
  from fusion_bench.method.base_algorithm import BaseAlgorithm
  from fusion_bench.mixins.simple_profiler import SimpleProfilerMixin
  from fusion_bench.modelpool import BaseModelPool
+ from fusion_bench.utils import LazyStateDict
  from fusion_bench.utils.state_dict_arithmetic import (
      state_dict_add,
      state_dict_avg,
@@ -104,6 +105,15 @@ class SimpleAverageAlgorithm(
          # Divide the accumulated state dictionary by the number of models to get the average
          sd = state_dict_div(sd, len(modelpool.model_names))

+         if isinstance(forward_model, LazyStateDict):
+             # if the model is a LazyStateDict, convert it to an empty module
+             forward_model = forward_model.meta_module.to_empty(
+                 device=(
+                     "cpu"
+                     if forward_model._torch_dtype is None
+                     else forward_model._torch_dtype
+                 )
+             )
          forward_model.load_state_dict(sd)
          # print profile report and log the merged models
          self.print_profile_summary()
fusion_bench/method/task_singular_vector/utils/__init__.py
@@ -5,3 +5,4 @@ from fusion_bench.method.ties_merging.ties_merging_utils import (
  from fusion_bench.utils import state_dict_to_vector, vector_to_state_dict

  from . import TSVC_utils, TSVM_utils
+ from .task_singular_interference import compute_task_singular_interference
fusion_bench/method/task_singular_vector/utils/task_singular_interference.py
@@ -0,0 +1,41 @@
+ from typing import List
+
+ import torch
+
+
+ def compute_task_singular_interference(weight_differences: List[torch.Tensor]) -> float:
+     R"""
+     Compute the singular interference of a list of weight differences $\{W_i - W_0\}_{i=1}^T$,
+     where $W_0$ is the pre-trained model weight, $W_i$ is the weight of the i-th fine-tuned model
+     and $T$ is the number of fine-tuned models.
+
+     Args:
+         weight_differences (List[torch.Tensor]): A list of weight differences $\{W_i - W_0\}_{i=1}^T$.
+
+     Returns:
+         float: The singular interference of the list of weight differences.
+     """
+     device = weight_differences[0].device
+     dtype = weight_differences[0].dtype
+
+     U = []
+     S = []
+     V = []
+     for delta_w in weight_differences:
+         u, s, vh = torch.linalg.svd(delta_w, full_matrices=False)
+         U.append(u)
+         S.append(s)
+         V.append(vh.t())
+     # Concatenate the per-task singular factors along the column dimension
+     U = torch.cat(U, dim=1)
+     S = torch.cat(S, dim=0)
+     V = torch.cat(V, dim=1)
+
+     singular_task_interference = torch.linalg.multi_dot(
+         (
+             U.t() @ U - torch.eye(U.shape[1], device=device, dtype=dtype),
+             torch.diag(S),
+             V.t() @ V - torch.eye(V.shape[1], device=device, dtype=dtype),
+         )
+     )
+     singular_task_interference = torch.linalg.norm(singular_task_interference, ord=1)
+     return singular_task_interference
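A small sketch of calling the new helper through its re-export in `fusion_bench.method.task_singular_vector.utils` (see the preceding `__init__.py` hunk); the layer shape and deltas below are placeholders, not package data.

import torch

from fusion_bench.method.task_singular_vector.utils import (
    compute_task_singular_interference,
)

# Placeholder weight differences W_i - W_0 for three fine-tuned variants of one layer.
deltas = [0.01 * torch.randn(512, 512) for _ in range(3)]

# Larger values indicate stronger overlap between the per-task singular subspaces.
interference = compute_task_singular_interference(deltas)
print(float(interference))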
fusion_bench/modelpool/causal_lm/causal_lm.py
@@ -22,6 +22,8 @@ from typing_extensions import override
  from fusion_bench.modelpool import BaseModelPool
  from fusion_bench.utils import instantiate
  from fusion_bench.utils.dtype import parse_dtype
+ from fusion_bench.utils.lazy_state_dict import LazyStateDict
+ from fusion_bench.utils.packages import import_object

  log = logging.getLogger(__name__)

@@ -30,6 +32,7 @@ class CausalLMPool(BaseModelPool):
      _config_mapping = BaseModelPool._config_mapping | {
          "_tokenizer": "tokenizer",
          "_model_kwargs": "model_kwargs",
+         "load_lazy": "load_lazy",
      }

      def __init__(
@@ -38,6 +41,7 @@
          *,
          tokenizer: Optional[DictConfig],
          model_kwargs: Optional[DictConfig] = None,
+         load_lazy: bool = False,
          **kwargs,
      ):
          super().__init__(models, **kwargs)
@@ -51,6 +55,7 @@
              self._model_kwargs.torch_dtype = parse_dtype(
                  self._model_kwargs.torch_dtype
              )
+         self.load_lazy = load_lazy

      @override
      def load_model(
@@ -88,21 +93,41 @@
          model_kwargs.update(kwargs)

          if isinstance(model_name_or_config, str):
+             # If model_name_or_config is a string, it is the name or the path of the model
              log.info(f"Loading model: {model_name_or_config}", stacklevel=2)
              if model_name_or_config in self._models.keys():
                  model_config = self._models[model_name_or_config]
                  if isinstance(model_config, str):
                      # model_config is a string
-                     model = AutoModelForCausalLM.from_pretrained(
-                         model_config,
-                         *args,
-                         **model_kwargs,
-                     )
+                     if not self.load_lazy:
+                         model = AutoModelForCausalLM.from_pretrained(
+                             model_config,
+                             *args,
+                             **model_kwargs,
+                         )
+                     else:
+                         # model_config is a string, but we want to use LazyStateDict
+                         model = LazyStateDict(
+                             checkpoint=model_config,
+                             meta_module_class=AutoModelForCausalLM,
+                             *args,
+                             **model_kwargs,
+                         )
              return model
          elif isinstance(model_name_or_config, (DictConfig, Dict)):
              model_config = model_name_or_config

-             model = instantiate(model_config, *args, **model_kwargs)
+             if not self.load_lazy:
+                 model = instantiate(model_config, *args, **model_kwargs)
+             else:
+                 meta_module_class = model_config.pop("_target_")
+                 checkpoint = model_config.pop("pretrained_model_name_or_path")
+                 model = LazyStateDict(
+                     checkpoint=checkpoint,
+                     meta_module_class=meta_module_class,
+                     *args,
+                     **model_kwargs,
+                 )
              return model

      def load_tokenizer(self, *args, **kwargs) -> PreTrainedTokenizer:
@@ -141,6 +166,7 @@
          model_dtype: Optional[str] = None,
          save_tokenizer: bool = False,
          tokenizer_kwargs=None,
+         tokenizer: Optional[PreTrainedTokenizer] = None,
          **kwargs,
      ):
          """
@@ -154,11 +180,13 @@
              **kwargs: Additional keyword arguments passed to the `save_pretrained` method.
          """
          path = os.path.expanduser(path)
-         if save_tokenizer:
-             if tokenizer_kwargs is None:
-                 tokenizer_kwargs = {}
-             # load the tokenizer
-             tokenizer = self.load_tokenizer(**tokenizer_kwargs)
+         # NOTE: if tokenizer is provided, it will be saved regardless of `save_tokenizer`
+         if save_tokenizer or tokenizer is not None:
+             if tokenizer is None:
+                 if tokenizer_kwargs is None:
+                     tokenizer_kwargs = {}
+                 # load the tokenizer
+                 tokenizer = self.load_tokenizer(**tokenizer_kwargs)
              tokenizer.save_pretrained(
                  path,
                  push_to_hub=push_to_hub,
@@ -176,6 +204,12 @@ class CausalLMBackbonePool(CausalLMPool):
      def load_model(
          self, model_name_or_config: str | DictConfig, *args, **kwargs
      ) -> Module:
+         if self.load_lazy:
+             log.warning(
+                 "CausalLMBackbonePool does not support lazy loading. "
+                 "Falling back to normal loading."
+             )
+             self.load_lazy = False
          model: AutoModelForCausalLM = super().load_model(
              model_name_or_config, *args, **kwargs
          )
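Taken together, the `load_lazy` changes let `CausalLMPool.load_model` hand back a `LazyStateDict` wrapper instead of materializing the full model up front. The sketch below is only an illustration: the model names, paths, and tokenizer config are placeholders, and the constructor arguments follow the type hints visible in the diff rather than documented usage.

from omegaconf import DictConfig

from fusion_bench.modelpool import CausalLMPool

# Hypothetical pool; model names/paths and the tokenizer entry are placeholders.
pool = CausalLMPool(
    models=DictConfig(
        {
            "_pretrained_": "Qwen/Qwen2.5-1.5B",
            "expert_math": "path/to/math-finetuned-model",
        }
    ),
    tokenizer=DictConfig(
        {
            "_target_": "transformers.AutoTokenizer.from_pretrained",
            "pretrained_model_name_or_path": "Qwen/Qwen2.5-1.5B",
        }
    ),
    load_lazy=True,
)

# With load_lazy=True, a string model entry is wrapped in LazyStateDict
# (weights resolved on demand) instead of calling
# AutoModelForCausalLM.from_pretrained immediately.
lazy_model = pool.load_model("expert_math")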
fusion_bench/models/__init__.py
@@ -1,3 +1,4 @@
  # flake8: noqa F401
  from . import separate_io, utils
  from .parameter_dict import ParameterDictModel
+ from fusion_bench.utils import LazyStateDict
fusion_bench/models/expert_sparsity/__init__.py (file without changes)
fusion_bench/models/expert_sparsity/mixtral/__init__.py
@@ -0,0 +1,15 @@
+ R"""
+ Copy from https://github.com/Lucky-Lance/Expert_Sparsity/tree/main/model
+
+ Original repo: https://github.com/Lucky-Lance/Expert_Sparsity
+
+ Reference:
+ Not All Experts are Equal: Efficient Expert Pruning and Skipping for Mixture-of-Experts Large Language Models.
+ ACL 2024.
+ http://arxiv.org/abs/2402.14800
+ """
+
+ from .wrapper import (
+     PrunableMixtralSparseMoeBlockWrapper,
+     DynamicSkippingMixtralSparseMoeBlockWrapper,
+ )
fusion_bench/models/expert_sparsity/mixtral/dataset.py
@@ -0,0 +1,40 @@
+ import torch
+
+
+ class CacheDataset(torch.utils.data.Dataset):
+     def __init__(self):
+         self.alphas = []  # logits
+         self.Xs = []  # input hidden states
+         self.Zs = []  # output hidden states
+         self.prepared = False
+
+     def __len__(self):
+         if not self.prepared:
+             self.prepare_for_loader()
+         return len(self.alphas)
+
+     def __getitem__(self, index):
+         if not self.prepared:
+             self.prepare_for_loader()
+         if isinstance(index, list):
+             return [(self.alphas[idx], self.Xs[idx], self.Zs[idx]) for idx in index]
+         elif isinstance(index, int):
+             return self.alphas[index], self.Xs[index], self.Zs[index]
+
+     def append(self, alpha=None, X=None, Z=None):
+         if alpha is not None:
+             self.alphas.append(alpha.detach().to("cpu", non_blocking=True))
+         if X is not None:
+             self.Xs.append(X.detach().to("cpu", non_blocking=True))
+         if Z is not None:
+             self.Zs.append(Z.detach().to("cpu", non_blocking=True))
+         self.prepared = False
+
+     def prepare_for_loader(self):
+         if self.prepared:
+             return
+         self.prepared = True
+         self.alphas = torch.concat(self.alphas)
+         self.Xs = torch.concat(self.Xs)
+         self.Zs = torch.concat(self.Zs)
+         assert len(self.Xs) == len(self.Zs)
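`CacheDataset` simply buffers router logits and MoE inputs/outputs on CPU and concatenates them on first access. A toy sketch with dummy tensors (all shapes below are illustrative only):

import torch

from fusion_bench.models.expert_sparsity.mixtral.dataset import CacheDataset

cache = CacheDataset()

# Two dummy calibration batches: router logits (alpha), MoE inputs (X) and outputs (Z).
for _ in range(2):
    cache.append(
        alpha=torch.randn(4, 8),   # e.g. (tokens, num_experts)
        X=torch.randn(4, 32),      # e.g. (tokens, hidden_dim)
        Z=torch.randn(4, 32),
    )

print(len(cache))        # 8: batches are concatenated along the first dimension
alpha, x, z = cache[0]   # one (logits, input, output) triple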
fusion_bench/models/expert_sparsity/mixtral/modeling_mixtral.py
@@ -0,0 +1,207 @@
+ import warnings
+ from typing import Optional, Tuple
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+ from transformers.models.mixtral.modeling_mixtral import (
+     MixtralBlockSparseTop2MLP,
+     MixtralConfig,
+     MixtralRMSNorm,
+     MixtralSparseMoeBlock,
+ )
+
+
+ class DynamicSkippingMixtralSparseMoeBlock(nn.Module):
+     """
+     This implementation is
+     strictly equivalent to standard MoE with full capacity (no
+     dropped tokens). It's faster since it formulates MoE operations
+     in terms of block-sparse operations to accomodate imbalanced
+     assignments of tokens to experts, whereas standard MoE either
+     (1) drop tokens at the cost of reduced performance or (2) set
+     capacity factor to number of experts and thus waste computation
+     and memory on padding.
+     """
+
+     def __init__(self, config, beta):
+         super().__init__()
+         self.hidden_dim = config.hidden_size
+         self.ffn_dim = config.intermediate_size
+         self.num_experts = config.num_local_experts
+         self.top_k = config.num_experts_per_tok
+
+         # gating
+         self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
+
+         self.experts = nn.ModuleList(
+             [MixtralBlockSparseTop2MLP(config) for _ in range(self.num_experts)]
+         )
+
+         self.beta = beta
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         """ """
+         batch_size, sequence_length, hidden_dim = hidden_states.shape
+         hidden_states = hidden_states.view(-1, hidden_dim)
+         # router_logits: (batch * sequence_length, n_experts)
+         router_logits = self.gate(hidden_states)
+
+         routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+         routing_weights, selected_experts = torch.topk(
+             routing_weights, self.top_k, dim=-1
+         )
+
+         onlytop1_mask = (
+             routing_weights[:, 1] < self.beta * routing_weights[:, 0]
+         )  # bz x seqlen
+
+         # routing_weights[tokens_top1, 1].fill_(0)
+         routing_weights[onlytop1_mask, 1] = 0
+         routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+         # we cast back to the input dtype
+         routing_weights = routing_weights.to(hidden_states.dtype)
+
+         final_hidden_states = torch.zeros(
+             (batch_size * sequence_length, hidden_dim),
+             dtype=hidden_states.dtype,
+             device=hidden_states.device,
+         )
+
+         # One hot encode the selected experts to create an expert mask
+         # this will be used to easily index which expert is going to be sollicitated
+         expert_mask = torch.nn.functional.one_hot(
+             selected_experts, num_classes=self.num_experts
+         )
+         # ipdb.set_trace()
+         # expert_mask[tokens_top1, 1, :].fill_(0)
+         expert_mask[onlytop1_mask, 1, :] = 0
+         expert_mask = expert_mask.permute(2, 1, 0)
+
+         # Loop over all available experts in the model and perform the computation on each expert
+         for expert_idx in range(self.num_experts):
+             expert_layer = self.experts[expert_idx]
+             idx, top_x = torch.where(expert_mask[expert_idx])
+
+             if top_x.shape[0] == 0:
+                 continue
+
+             # in torch it is faster to index using lists than torch tensors
+             top_x_list = top_x.tolist()
+             idx_list = idx.tolist()
+
+             # Index the correct hidden states and compute the expert hidden state for
+             # the current expert. We need to make sure to multiply the output hidden
+             # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
+             current_state = hidden_states[None, top_x_list].reshape(-1, hidden_dim)
+             current_hidden_states = expert_layer(
+                 current_state, routing_weights[top_x_list, idx_list, None]
+             )
+
+             # However `index_add_` only support torch tensors for indexing so we'll use
+             # the `top_x` tensor here.
+             final_hidden_states.index_add_(
+                 0, top_x, current_hidden_states.to(hidden_states.dtype)
+             )
+         final_hidden_states = final_hidden_states.reshape(
+             batch_size, sequence_length, hidden_dim
+         )
+         return final_hidden_states, router_logits
+
+
+ class MixtralDecoderLayer(nn.Module):
+     def __init__(self, config: MixtralConfig, layer_idx: int):
+         super().__init__()
+         self.hidden_size = config.hidden_size
+
+         self.self_attn = ALL_ATTENTION_FUNCTIONS[config._attn_implementation](
+             config, layer_idx
+         )
+         if hasattr(config, "betas"):
+             assert (
+                 isinstance(config.betas, dict)
+                 and len(config.betas) == config.num_hidden_layers
+             )
+             self.block_sparse_moe = DynamicSkippingMixtralSparseMoeBlock(
+                 config, config.betas[str(layer_idx)]
+             )
+             warnings.warn(
+                 f"Using online drop: {layer_idx}, {config.betas[str(layer_idx)]}, {type(self.block_sparse_moe)}"
+             )
+         else:
+             self.block_sparse_moe = MixtralSparseMoeBlock(config)
+         self.input_layernorm = MixtralRMSNorm(
+             config.hidden_size, eps=config.rms_norm_eps
+         )
+         self.post_attention_layernorm = MixtralRMSNorm(
+             config.hidden_size, eps=config.rms_norm_eps
+         )
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_value: Optional[Tuple[torch.Tensor]] = None,
+         output_attentions: Optional[bool] = False,
+         output_router_logits: Optional[bool] = False,
+         use_cache: Optional[bool] = False,
+         **kwargs,
+     ) -> Tuple[
+         torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
+     ]:
+         if "padding_mask" in kwargs:
+             warnings.warn(
+                 "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+             )
+         """
+         Args:
+             hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+             attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                 `(batch, sequence_length)` where padding elements are indicated by 0.
+             past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+             output_attentions (`bool`, *optional*):
+                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                 returned tensors for more detail.
+             output_router_logits (`bool`, *optional*):
+                 Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
+                 should not be returned during inference.
+             use_cache (`bool`, *optional*):
+                 If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                 (see `past_key_values`).
+         """
+
+         residual = hidden_states
+
+         hidden_states = self.input_layernorm(hidden_states)
+
+         # Self Attention
+         hidden_states, self_attn_weights, present_key_value = self.self_attn(
+             hidden_states=hidden_states,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_value=past_key_value,
+             output_attentions=output_attentions,
+             use_cache=use_cache,
+         )
+         hidden_states = residual + hidden_states
+
+         # Fully Connected
+         residual = hidden_states
+         hidden_states = self.post_attention_layernorm(hidden_states)
+         hidden_states, router_logits = self.block_sparse_moe(hidden_states)
+         hidden_states = residual + hidden_states
+
+         outputs = (hidden_states,)
+
+         if output_attentions:
+             outputs += (self_attn_weights,)
+
+         if use_cache:
+             outputs += (present_key_value,)
+
+         if output_router_logits:
+             outputs += (router_logits,)
+
+         return outputs
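The core of `DynamicSkippingMixtralSparseMoeBlock.forward` is the per-token skipping rule: the second expert is dropped whenever its routing weight falls below `beta` times the top-1 weight. A standalone toy illustration of that rule (random logits, arbitrary `beta`; not code from the package):

import torch
import torch.nn.functional as F

# Toy router logits for 4 tokens over 8 experts; values are arbitrary.
router_logits = torch.randn(4, 8)
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
routing_weights, selected_experts = torch.topk(routing_weights, k=2, dim=-1)

beta = 0.5
# A token falls back to its top-1 expert when the second weight is below beta * top-1 weight.
onlytop1_mask = routing_weights[:, 1] < beta * routing_weights[:, 0]
routing_weights[onlytop1_mask, 1] = 0
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)

print(onlytop1_mask)     # tokens that skip their second expert
print(routing_weights)   # renormalized per-token expert weights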