fusion-bench 0.2.12__py3-none-any.whl → 0.2.14__py3-none-any.whl
- fusion_bench/compat/method/__init__.py +2 -0
- fusion_bench/compat/taskpool/flan_t5_glue_text_generation.py +4 -1
- fusion_bench/constants/clip_vision.py +22 -0
- fusion_bench/dataset/clip_dataset.py +10 -2
- fusion_bench/dataset/fer2013.py +1 -0
- fusion_bench/dataset/gsm8k.py +2 -2
- fusion_bench/method/__init__.py +10 -0
- fusion_bench/method/ada_svd/clip_vision.py +4 -1
- fusion_bench/method/adamerging/clip_task_wise_adamerging.py +1 -29
- fusion_bench/method/fisher_merging/fisher_merging.py +29 -17
- fusion_bench/method/gossip/__init__.py +3 -0
- fusion_bench/method/gossip/clip_layer_wise_gossip.py +43 -0
- fusion_bench/method/gossip/clip_task_wise_gossip.py +190 -0
- fusion_bench/method/gossip/entropy_loss.py +25 -0
- fusion_bench/method/gossip/flan_t5_layer_wise_gossip.py +388 -0
- fusion_bench/method/gossip/layer_wise_gossip.py +434 -0
- fusion_bench/method/gossip/min_norm_solvers.py +227 -0
- fusion_bench/method/gossip/task_wise_gossip.py +265 -0
- fusion_bench/method/gossip/utils.py +74 -0
- fusion_bench/method/isotropic_merging/__init__.py +1 -1
- fusion_bench/method/opcm/opcm.py +16 -7
- fusion_bench/method/pwe_moe/module.py +1 -1
- fusion_bench/method/pwe_moe/openclip_pwe_moe.py +476 -0
- fusion_bench/method/regmean/regmean.py +25 -17
- fusion_bench/method/smile_upscaling/__init__.py +1 -1
- fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py +46 -145
- fusion_bench/method/smile_upscaling/smile_qwen2_upscaling.py +229 -0
- fusion_bench/method/smile_upscaling/smile_upscaling.py +19 -346
- fusion_bench/method/surgery/clip_layer_wise_adamerging_surgery.py +7 -0
- fusion_bench/method/task_arithmetic/task_arithmetic.py +8 -6
- fusion_bench/method/ties_merging/ties_merging.py +36 -31
- fusion_bench/method/we_moe/we_moe.py +14 -15
- fusion_bench/mixins/__init__.py +6 -3
- fusion_bench/mixins/hydra_config.py +49 -0
- fusion_bench/mixins/openclip_classification.py +11 -0
- fusion_bench/mixins/simple_profiler.py +4 -2
- fusion_bench/modelpool/__init__.py +3 -1
- fusion_bench/modelpool/base_pool.py +2 -2
- fusion_bench/modelpool/openclip_vision/__init__.py +1 -0
- fusion_bench/modelpool/openclip_vision/modelpool.py +255 -0
- fusion_bench/models/modeling_smile_mistral/modeling_smile_mistral.py +2 -203
- fusion_bench/models/modeling_smile_qwen2/__init__.py +8 -0
- fusion_bench/models/modeling_smile_qwen2/configuration_smile_qwen2.py +21 -0
- fusion_bench/models/modeling_smile_qwen2/modeling_smile_qwen2.py +922 -0
- fusion_bench/models/modeling_smile_qwen2/register.py +11 -0
- fusion_bench/models/open_clip/__init__.py +6 -0
- fusion_bench/models/open_clip/modeling.py +176 -0
- fusion_bench/models/open_clip/utils.py +311 -0
- fusion_bench/models/open_clip/variables_and_paths.py +56 -0
- fusion_bench/models/parameter_dict.py +54 -13
- fusion_bench/models/rankone_moe.py +2 -88
- fusion_bench/models/smile_moe/linear_from_hf_config.py +373 -0
- fusion_bench/models/smile_moe/{linear.py → linear_from_module.py} +103 -33
- fusion_bench/models/smile_moe/utils/__init__.py +24 -0
- fusion_bench/models/smile_moe/utils/svd_utils.py +46 -0
- fusion_bench/scripts/nyuv2_mtl_train.py +1 -1
- fusion_bench/taskpool/__init__.py +7 -3
- fusion_bench/taskpool/clip_vision/__init__.py +1 -0
- fusion_bench/taskpool/clip_vision/clip_rankone_moe_taskpool.py +2 -30
- fusion_bench/taskpool/clip_vision/clip_smile_taskpool.py +102 -0
- fusion_bench/taskpool/clip_vision/clip_sparse_wemoe_taskpool.py +2 -30
- fusion_bench/taskpool/clip_vision/taskpool.py +1 -2
- fusion_bench/taskpool/clip_vision/utils/__init__.py +0 -0
- fusion_bench/taskpool/clip_vision/utils/routing_analysis_utils.py +65 -0
- fusion_bench/taskpool/gpt2_text_classification.py +30 -1
- fusion_bench/taskpool/lm_eval_harness/__init__.py +3 -0
- fusion_bench/taskpool/lm_eval_harness/taskpool.py +87 -0
- fusion_bench/taskpool/openclip_vision/__init__.py +1 -0
- fusion_bench/taskpool/openclip_vision/openclip_taskpool.py +196 -0
- fusion_bench/utils/data.py +12 -0
- fusion_bench/utils/devices.py +14 -0
- fusion_bench/utils/instantiate.py +12 -0
- fusion_bench/utils/misc.py +9 -2
- fusion_bench/utils/packages.py +14 -0
- fusion_bench/utils/parameters.py +1 -1
- fusion_bench/utils/tensorboard.py +1 -1
- {fusion_bench-0.2.12.dist-info → fusion_bench-0.2.14.dist-info}/METADATA +22 -2
- {fusion_bench-0.2.12.dist-info → fusion_bench-0.2.14.dist-info}/RECORD +209 -157
- {fusion_bench-0.2.12.dist-info → fusion_bench-0.2.14.dist-info}/WHEEL +1 -1
- fusion_bench_config/clip-vit-base-patch32_robustness_corrupted.yaml +1 -2
- fusion_bench_config/dataset/image_classification/test/TALL20.yaml +0 -1
- fusion_bench_config/dataset/image_classification/test/emnist_letters.yaml +0 -1
- fusion_bench_config/dataset/image_classification/test/fashion_mnist.yaml +1 -1
- fusion_bench_config/dataset/image_classification/train/TALL20.yaml +0 -1
- fusion_bench_config/dataset/image_classification/train/fashion_mnist.yaml +1 -1
- fusion_bench_config/fabric/auto.yaml +0 -1
- fusion_bench_config/fabric/llama_ddp.yaml +0 -1
- fusion_bench_config/fabric/llama_fsdp.yaml +0 -1
- fusion_bench_config/fabric/llama_peft_fsdp.yaml +0 -1
- fusion_bench_config/fabric/strategy/deepspeed.yaml +0 -1
- fusion_bench_config/fabric/strategy/llama_peft_fsdp.yaml +0 -1
- fusion_bench_config/fabric_model_fusion.yaml +0 -1
- fusion_bench_config/llama_full_finetune.yaml +0 -2
- fusion_bench_config/llama_model_fusion.yaml +0 -2
- fusion_bench_config/method/ada_svd/clip_vision.yaml +0 -1
- fusion_bench_config/method/adamerging/layer_wise_flan_t5.yaml +0 -5
- fusion_bench_config/method/adamerging/layer_wise_gpt2.yaml +0 -5
- fusion_bench_config/method/adamerging/llama_sft.yaml +0 -2
- fusion_bench_config/method/adamerging.yaml +2 -2
- fusion_bench_config/method/analysis/task_vector_cos_similarity.yaml +0 -1
- fusion_bench_config/method/analysis/task_vector_violin_plot.yaml +0 -1
- fusion_bench_config/method/classification/clip_continual_finetune.yaml +0 -1
- fusion_bench_config/method/concrete_subspace/clip_concrete_layer_wise_adamerging.yaml +0 -1
- fusion_bench_config/method/concrete_subspace/clip_concrete_task_wise_adamerging.yaml +0 -1
- fusion_bench_config/method/concrete_subspace/clip_post_defense_AWM.yaml +1 -12
- fusion_bench_config/method/concrete_subspace/clip_post_defense_SAU.yaml +1 -12
- fusion_bench_config/method/concrete_subspace/clip_safe_concrete_layer_wise_adamerging.yaml +1 -10
- fusion_bench_config/method/concrete_subspace/clip_safe_concrete_task_arithmetic.yaml +1 -14
- fusion_bench_config/method/dare/simple_average.yaml +0 -1
- fusion_bench_config/method/dare/task_arithmetic.yaml +0 -1
- fusion_bench_config/method/dare/ties_merging.yaml +0 -2
- fusion_bench_config/method/dawe/dawe_for_clip.yaml +0 -3
- fusion_bench_config/method/doge_ta/doge_ta.yaml +1 -1
- fusion_bench_config/method/ensemble/max_model_predictor.yaml +1 -1
- fusion_bench_config/method/ensemble/simple_ensemble.yaml +0 -1
- fusion_bench_config/method/ensemble/weighted_ensemble.yaml +0 -1
- fusion_bench_config/method/gossip/layer_wise_clip.yaml +30 -0
- fusion_bench_config/method/gossip/layer_wise_flan_t5.yaml +25 -0
- fusion_bench_config/method/isotropic_merging/iso_c.yaml +0 -1
- fusion_bench_config/method/isotropic_merging/iso_cts.yaml +0 -1
- fusion_bench_config/method/linear/linear_interpolation.yaml +0 -1
- fusion_bench_config/method/linear/llama_expo.yaml +0 -3
- fusion_bench_config/method/linear/llama_expo_with_dare.yaml +0 -5
- fusion_bench_config/method/linear/weighted_average.yaml +0 -1
- fusion_bench_config/method/linear/weighted_average_for_llama.yaml +0 -1
- fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml +0 -4
- fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml +0 -4
- fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml +0 -6
- fusion_bench_config/method/mixtral_moe_upscaling.yaml +1 -2
- fusion_bench_config/method/model_recombination.yaml +0 -1
- fusion_bench_config/method/opcm/opcm.yaml +0 -1
- fusion_bench_config/method/opcm/task_arithmetic.yaml +0 -2
- fusion_bench_config/method/opcm/ties_merging.yaml +0 -2
- fusion_bench_config/method/opcm/weight_average.yaml +0 -1
- fusion_bench_config/method/pwe_moe/epo_for_openclip.yaml +30 -0
- fusion_bench_config/method/pwe_moe/ls_for_openclip.yaml +30 -0
- fusion_bench_config/method/{pwe_moe_ls_for_clip.yaml → pwe_moe/pwe_moe_ls_for_clip.yaml} +7 -6
- fusion_bench_config/method/rankone_moe/rankone_moe.yaml +1 -3
- fusion_bench_config/method/regmean/gpt2_regmean.yaml +0 -1
- fusion_bench_config/method/slerp/slerp.yaml +0 -2
- fusion_bench_config/method/smile_upscaling/smile_mistral_upscaling.yaml +5 -2
- fusion_bench_config/method/smile_upscaling/smile_qwen2_upscaling.yaml +13 -0
- fusion_bench_config/method/sparselo_pruning/llama_iterative_sparselo.yaml +1 -1
- fusion_bench_config/method/sparselo_pruning/llama_pcp_sparselo.yaml +1 -1
- fusion_bench_config/method/sparselo_pruning/llama_sparselo.yaml +1 -1
- fusion_bench_config/method/surgery/adamerging_surgery.yaml +1 -2
- fusion_bench_config/method/task_arithmetic.yaml +1 -1
- fusion_bench_config/method/task_singular_vector/TaskSingularVectorMerging.yaml +0 -1
- fusion_bench_config/method/ties_merging.yaml +1 -1
- fusion_bench_config/method/trust_region/clip_task_arithmetic.yaml +0 -1
- fusion_bench_config/method/wemoe/sparse_weight_ensembling_moe.yaml +0 -8
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_cifar10.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford-iiit-pet.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford_flowers102.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_pcam.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_rendered-sst2.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_resisc45.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stanford-cars.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stl10.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_sun397.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_svhn.yaml +1 -1
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8_lora.yaml +0 -3
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_individual.yaml +0 -3
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_individual_lora.yaml +0 -3
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8_control_task.yaml +0 -3
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_individual.yaml +0 -3
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_task_projection.yaml +0 -3
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_two_tasks_control_task.yaml +0 -4
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_individual.yaml +0 -3
- fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml +0 -4
- fusion_bench_config/modelpool/CausalLMPool/llama_codealpaca.yaml +0 -4
- fusion_bench_config/modelpool/CausalLMPool/llama_for_causallm.yaml +0 -1
- fusion_bench_config/modelpool/CausalLMPool/llama_metamathqa.yaml +0 -4
- fusion_bench_config/modelpool/CausalLMPool/llama_ultrachat.yaml +0 -4
- fusion_bench_config/modelpool/CausalLMPool/qwen2_math_1.5B_and_R1.yaml +17 -0
- fusion_bench_config/modelpool/CausalLMPool/simle_mixtral_exp_v4.yaml +0 -1
- fusion_bench_config/modelpool/CausalLMPool/single_llama_model.yaml +0 -3
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/README.md +90 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-B-16_TA8.yaml +27 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-B-32_TA8.yaml +45 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-B-32_TA_cars_dtd.yaml +23 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-B-32_TA_sun397_cars.yaml +23 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-B-32_TA_sun397_dtd.yaml +23 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-B-32_individual.yaml +7 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-L-14_TA8.yaml +26 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue.yaml +0 -1
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_lora16.yaml +0 -2
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_lora16_tta.yaml +0 -2
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_tta.yaml +1 -3
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_individual.yaml +0 -1
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-large_glue_lora16.yaml +0 -3
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/llama_preference700k.yaml +0 -4
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/single_reward_model.yaml +0 -3
- fusion_bench_config/modelpool/gpt-2_glue.yaml +0 -3
- fusion_bench_config/nyuv2_config.yaml +0 -2
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/_template.yaml +0 -3
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_B16.yaml +0 -2
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_rankone_wemoe_clip-vit-classification_TA8.yaml +0 -2
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_sparse_wemoe_clip-vit-classification_TA8.yaml +0 -2
- fusion_bench_config/taskpool/LMEvalHarnessTaskPool/lm_eval.yaml +12 -0
- fusion_bench_config/taskpool/OpenCLIPVisionModelTaskPool/ViT-B-16_TA8.yaml +24 -0
- fusion_bench_config/taskpool/OpenCLIPVisionModelTaskPool/ViT-B-32_TA8.yaml +24 -0
- fusion_bench_config/taskpool/OpenCLIPVisionModelTaskPool/ViT-L-14_TA8.yaml +24 -0
- fusion_bench_config/taskpool/gpt-2_glue.yaml +0 -1
- fusion_bench_config/taskpool/reward_model_evaluation.yaml +0 -4
- {fusion_bench-0.2.12.dist-info → fusion_bench-0.2.14.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.12.dist-info → fusion_bench-0.2.14.dist-info}/licenses/LICENSE +0 -0
- {fusion_bench-0.2.12.dist-info → fusion_bench-0.2.14.dist-info}/top_level.txt +0 -0
@@ -8,64 +8,13 @@ from torch import Tensor, nn
 from torch.func import functional_call
 from torch.nn import functional as F
 
+from fusion_bench.models.smile_moe.utils import _is_all_zeros, svd
+from fusion_bench.models.utils import del_attr, get_attr, set_attr
 from fusion_bench.utils.type import StateDictType
 
 log = logging.getLogger(__name__)
 
 
-def join_list(list_of_list: List[List]):
-    ans = []
-    for l in list_of_list:
-        ans.extend(l)
-    return ans
-
-
-def del_attr(obj, names: List[str]):
-    """
-    Deletes an attribute from an object recursively.
-
-    Args:
-        obj (object): Object to delete attribute from.
-        names (list): List of attribute names to delete recursively.
-    """
-    if len(names) == 1:
-        delattr(obj, names[0])
-    else:
-        del_attr(getattr(obj, names[0]), names[1:])
-
-
-def set_attr(obj, names: List[str], val):
-    """
-    Sets an attribute of an object recursively.
-
-    Args:
-        obj (object): Object to set attribute of.
-        names (list): List of attribute names to set recursively.
-        val (object): Value to set the attribute to.
-    """
-    if len(names) == 1:
-        setattr(obj, names[0], val)
-    else:
-        set_attr(getattr(obj, names[0]), names[1:], val)
-
-
-def get_attr(obj, names: List[str]):
-    """
-    Gets an attribute of an object recursively.
-
-    Args:
-        obj (object): Object to get attribute of.
-        names (list): List of attribute names to get recursively.
-
-    Returns:
-        object: The attribute of the object.
-    """
-    if len(names) == 1:
-        return getattr(obj, names[0])
-    else:
-        return get_attr(getattr(obj, names[0]), names[1:])
-
-
 class Depth_0_Gate(nn.Module):
     def __init__(self, num_experts: int):
         super().__init__()
@@ -132,41 +81,6 @@ class ExpertNotTrainedError(Exception):
     pass
 
 
-def _is_all_zeros(tensor: Tensor | List[Tensor]) -> bool:
-    """
-    Check if a tensor or a list of tensors are all zeros.
-    """
-    if isinstance(tensor, Tensor):
-        return torch.allclose(tensor, torch.zeros_like(tensor))
-    else:
-        return all(_is_all_zeros(t) for t in tensor)
-
-
-def _svd(w: Tensor, full_matrices=True) -> Tuple[Tensor, Tensor, Tensor]:
-    """
-    Perform Singular Value Decomposition (SVD) on a tensor.
-    """
-    u, s, vh = torch.linalg.svd(
-        w, full_matrices=full_matrices, driver="gesvd" if w.is_cuda else None
-    )
-    v = vh.T
-    return u, s, v
-
-
-def svd(
-    w: Tensor, full_matrices=True, accelerator=None
-) -> Tuple[Tensor, Tensor, Tensor]:
-    """
-    Perform SVD on a tensor, optionally using a specified accelerator.
-    """
-    if accelerator is None:
-        return _svd(w, full_matrices=full_matrices)
-    original_device = w.device
-    w = w.to(accelerator)
-    u, s, v = _svd(w)
-    return u.to(original_device), s.to(original_device), v.to(original_device)
-
-
 def fun_joint_svd(
     w_list: List[Tensor], accelerator=None
 ) -> Tuple[Tensor, Tensor, Tensor]:
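The removed helpers above are replaced by the shared imports added at the top of this hunk. A minimal sketch of the relocated helpers in use, assuming `fusion_bench.models.smile_moe.utils` keeps the same signatures as the local copies deleted here:

```python
# Sketch only: assumes fusion_bench.models.smile_moe.utils exposes `svd` and
# `_is_all_zeros` with the same behavior as the functions removed above.
import torch
from fusion_bench.models.smile_moe.utils import _is_all_zeros, svd

w = torch.randn(64, 32)
u, s, v = svd(w, full_matrices=True)  # u: (64, 64), s: (32,), v: (32, 32)
# the thin factors reconstruct w up to numerical error
print(torch.allclose(w, u[:, :32] @ torch.diag(s) @ v.T, atol=1e-5))  # True

print(_is_all_zeros(torch.zeros(4, 4)))           # True
print(_is_all_zeros([w, torch.zeros_like(w)]))    # False (checks every tensor)
```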
@@ -0,0 +1,373 @@
+from typing import List, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+
+from fusion_bench.utils.state_dict_arithmetic import state_dict_sub
+
+from .utils import _is_all_zeros
+
+
+class ExpertNotTrainedError(Exception):
+    pass
+
+
+def _svd(w: Tensor, full_matrices=False) -> Tuple[Tensor, Tensor, Tensor]:
+    """
+    Perform Singular Value Decomposition (SVD) on a tensor.
+
+    Args:
+        w (Tensor): The input tensor.
+        full_matrices (bool, optional): Whether to compute the full-sized U and V matrices. Defaults to False.
+
+    Returns:
+        Tuple[Tensor, Tensor, Tensor]: The U, S, and V matrices from SVD.
+    """
+    dtype = w.dtype
+    if w.dtype != torch.float32 or w.dtype != torch.float64:
+        w = w.float()
+
+    u, s, vh = torch.linalg.svd(
+        w,
+        full_matrices=full_matrices,
+        # driver="gesvd" if w.is_cuda else None
+    )
+    v = vh.T
+
+    u = u.to(dtype=dtype)
+    s = s.to(dtype=dtype)
+    v = v.to(dtype=dtype)
+    return u, s, v
+
+
+def svd(
+    w: Tensor, full_matrices=True, accelerator=None
+) -> Tuple[Tensor, Tensor, Tensor]:
+    """
+    Perform SVD on a tensor with optional acceleration.
+    This is different from `.utils.svd` in that it handles tensors with precision other than float32 or float64.
+
+    Args:
+        w (Tensor): The input tensor.
+        full_matrices (bool, optional): Whether to compute the full-sized U and V matrices. Defaults to True.
+        accelerator (optional): The device to perform the computation on. Defaults to None.
+
+    Returns:
+        Tuple[Tensor, Tensor, Tensor]: The U, S, and V matrices from SVD.
+    """
+    if accelerator is None:
+        return _svd(w, full_matrices=full_matrices)
+    original_device = w.device
+    w = w.to(accelerator)
+    u, s, v = _svd(w)
+    return u, s, v
+
+
+class SmileMoEConfig:
+    """
+    Example PretrainedConfig for SmileMoE.
+
+    Args:
+        num_experts_per_tok: Number of experts per token.
+        rank_of_router: Rank of the router.
+        rank_of_expert: Rank of the expert.
+        num_local_experts: Number of local experts.
+    """
+
+    num_experts_per_tok: int
+    rank_of_router: int
+    rank_of_expert: int
+    num_local_experts: int
+
+
+class SmileGate(nn.Module):
+    __constants__ = ["in_features", "num_experts", "k"]
+    in_features: int
+    num_experts: int
+    k: int
+    weight: nn.Parameter
+
+    def __init__(
+        self,
+        in_features: int,
+        num_experts: int,
+        k: int,
+        device=None,
+        dtype=None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.input_features = in_features
+        self.num_experts = num_experts
+        self.k = k
+
+        self.weight = nn.Parameter(
+            torch.empty(num_experts * k, in_features, **factory_kwargs)
+        )
+
+    def forward(self, x: Tensor):
+        batch_size = x.size(0)
+        if self.num_experts == 1:
+            return torch.ones(batch_size, 1, device=x.device, dtype=x.dtype)
+
+        routing_weights = F.linear(x, self.weight).view(
+            batch_size, self.num_experts, self.k
+        )
+        routing_weights = routing_weights.norm(p=2, dim=2)
+        return routing_weights
+
+
+class SmileLinearExpert(nn.Module):
+    __constants__ = ["in_features", "out_features", "k"]
+    in_features: int
+    out_features: int
+    k: int
+
+    def __init__(
+        self,
+        in_features,
+        out_features,
+        k: int,
+        bias: bool,
+        device=None,
+        dtype=None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.k = k
+        if k > 0:
+            # check k < in_features and out_features
+            if k > in_features:
+                raise ValueError(
+                    f"k ({k}) must not be greater than in_features ({in_features})"
+                )
+            if k > out_features:
+                raise ValueError(
+                    f"k ({k}) must not be greater than out_features ({out_features})"
+                )
+
+        self.u = nn.Parameter(torch.empty(out_features, k, **factory_kwargs))
+        self.svh = nn.Parameter(torch.empty(k, in_features, **factory_kwargs))
+
+        if bias:
+            self.bias = nn.Parameter(torch.empty(out_features, **factory_kwargs))
+        else:
+            self.register_parameter("bias", None)
+
+    def forward(self, x):
+        x = F.linear(x, self.svh)
+        x = F.linear(x, self.u, self.bias)
+        return x
+
+
+class SmileLinear(nn.Module):
+    __constants__ = [
+        "in_features",
+        "out_features",
+        "num_local_experts",
+        "num_experts_per_tok",
+        "rank_of_expert",
+        "rank_of_router",
+    ]
+
+    in_features: int
+    out_features: int
+    num_local_experts: int
+    num_experts_per_tok: int
+    rank_of_expert: int
+    rank_of_router: int
+
+    @torch.no_grad()
+    def __init__(
+        self,
+        config: SmileMoEConfig,
+        in_features,
+        out_features,
+        bias: bool,
+        device=None,
+        dtype=None,
+    ):
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.num_local_experts = config.num_local_experts
+        self.num_experts_per_tok = config.num_experts_per_tok
+        self.rank_of_expert = config.rank_of_expert
+        self.rank_of_router = config.rank_of_router
+        self.in_features = in_features
+        self.out_features = out_features
+
+        # construct the gate network
+        self.gate = SmileGate(
+            in_features=in_features,
+            num_experts=self.num_local_experts,
+            k=self.rank_of_router,
+            **factory_kwargs,
+        )
+
+        # the shared linear
+        self.shared_linear = nn.Linear(
+            in_features, out_features, bias=bias, **factory_kwargs
+        )
+
+        # construct experts
+        if self.rank_of_expert > 0:
+            self.experts = nn.ModuleList(
+                [
+                    SmileLinearExpert(
+                        in_features=in_features,
+                        out_features=out_features,
+                        bias=bias,
+                        k=self.rank_of_expert,
+                        **factory_kwargs,
+                    )
+                    for _ in range(self.num_local_experts)
+                ]
+            )
+        else:
+            self.experts = nn.ModuleList(
+                [
+                    nn.Linear(in_features, out_features, bias=bias, **factory_kwargs)
+                    for _ in range(self.num_local_experts)
+                ]
+            )
+
+    def forward(self, hidden_states: Tensor):
+        pretrained_out = self.shared_linear(hidden_states)
+
+        input_shape = hidden_states.size()
+        hidden_states = hidden_states.view(-1, self.in_features)
+
+        router_logits = self.gate(hidden_states)
+        routing_weights = F.softmax(router_logits, dim=1)
+        # sample the expert according to the routing weights
+        routing_weights, selected_experts = torch.topk(
+            routing_weights, self.num_experts_per_tok, dim=-1
+        )
+        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+
+        final_hidden_states = torch.zeros(
+            (hidden_states.size(0), self.out_features),
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+
+        # One hot encode the selected experts to create an expert mask
+        # this will be used to easily index which expert is going to be sollicitated
+        expert_mask = torch.nn.functional.one_hot(
+            selected_experts, num_classes=self.num_local_experts
+        ).permute(2, 1, 0)
+
+        # Loop over all available experts in the model and perform the computation on each expert
+        for expert_idx in range(self.num_local_experts):
+            expert_layer = self.experts[expert_idx]
+            idx, top_x = torch.where(expert_mask[expert_idx])
+
+            # Index the correct hidden states and compute the expert hidden state for
+            # the current expert. We need to make sure to multiply the output hidden
+            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
+            current_state = hidden_states[None, top_x].reshape(-1, self.in_features)
+            if current_state.numel() == 0:
+                continue
+            current_hidden_states = (
+                expert_layer(current_state) * routing_weights[top_x, idx, None]
+            )
+
+            # However `index_add_` only support torch tensors for indexing so we'll use
+            # the `top_x` tensor here.
+            final_hidden_states.index_add_(
+                0, top_x, current_hidden_states.to(hidden_states.dtype)
+            )
+        final_hidden_states = final_hidden_states.reshape(
+            *input_shape[:-1], self.out_features
+        )
+        final_hidden_states = pretrained_out + final_hidden_states
+        return final_hidden_states
+
+    @property
+    def weight(self):
+        """
+        Mimic linear layer. Bacause in some cases, user might indicate the device (or dtype of parameters) of the linear layer using `linear_layer.weight.device`
+        """
+        return self.shared_linear.weight
+
+    @property
+    def bias(self):
+        return self.shared_linear.bias
+
+    def __repr__(self):
+        return (
+            f"SingularMoELinear("
+            f"in_features={self.shared_linear.in_features}, "
+            f"out_features={self.shared_linear.out_features}, "
+            f"num_local_experts={self.num_local_experts}, "
+            f"num_experts_per_tok={self.num_experts_per_tok}, "
+            f"rank_of_router={self.rank_of_router}, "
+            f"rank_of_expert={self.rank_of_expert}"
+            f")"
+        )
+
+
+@torch.no_grad()
+def upscale_to_smile_linear(
+    base: nn.Linear, experts: List[nn.Linear], target: SmileLinear, accelerator=None
+):
+    """
+    Upscale a base linear layer to a SmileLinear layer using expert models.
+
+    Args:
+        base (nn.Linear): The base linear layer.
+        experts (List[nn.Linear]): A list of expert linear layers.
+        target (SmileLinear): The target SmileLinear layer.
+        accelerator (optional): The device to perform the computation on. Defaults to None.
+
+    Returns:
+        SmileLinear: The upscaled SmileLinear layer.
+    """
+    w = base.weight
+    w_ft_list = [e.weight for e in experts]
+    dw_list = [w_ft - w for w_ft in w_ft_list]
+
+    if _is_all_zeros(dw_list):
+        raise ExpertNotTrainedError("Expert models are not trained")
+
+    rank_of_router = target.rank_of_router
+    rank_of_expert = target.rank_of_expert
+    num_local_experts = target.num_local_experts
+    svd_list = [svd(dw, accelerator=accelerator) for dw in dw_list]
+
+    # gate
+    gate_weight = []
+    for u, s, v in svd_list:
+        gate_weight.append(v[:, :rank_of_router].T)
+    gate_weight = (
+        torch.stack(gate_weight, dim=0)
+        .reshape(num_local_experts * rank_of_router, -1)
+        .contiguous()
+    )
+
+    target.gate.load_state_dict({"weight": gate_weight})
+
+    # shared linear
+    target.shared_linear.load_state_dict(base.state_dict())
+
+    # experts
+    if rank_of_expert > 0:
+        for expert_idx, target_expert in enumerate(target.experts):
+            u, s, v = svd_list[expert_idx]
+            u = u[:, :rank_of_expert]
+            s = s[:rank_of_expert]
+            v = v[:, :rank_of_expert]
+            state_dict = {"u": u, "svh": (s * v).T}
+            if experts[expert_idx].bias is not None:
+                state_dict["bias"] = experts[expert_idx].bias.data
+            target_expert.load_state_dict(state_dict)
+    else:
+        for expert_idx, target_expert in enumerate(target.experts):
+            target_expert.load_state_dict(
+                state_dict_sub(experts[expert_idx].state_dict(), base.state_dict())
+            )
+
+    return target
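The new module builds a SMILE MoE linear layer around a shared base linear, low-rank experts taken from the SVD of each expert's weight difference, and a gate that routes on the leading right-singular directions. Below is a minimal usage sketch; the `DemoConfig` class is a hypothetical stand-in for a `SmileMoEConfig`-like object, and the import path is inferred from the `linear_from_hf_config.py` entry in the file list above:

```python
# Minimal sketch, not the library's documented API: DemoConfig only carries the
# four fields SmileLinear reads, and the module path is an assumption.
import torch
from torch import nn
from fusion_bench.models.smile_moe.linear_from_hf_config import (
    SmileLinear,
    upscale_to_smile_linear,
)

class DemoConfig:
    num_local_experts = 2     # number of experts
    num_experts_per_tok = 1   # top-k routing
    rank_of_router = 4        # singular vectors kept for the gate
    rank_of_expert = 8        # rank of each low-rank expert

base = nn.Linear(16, 16)                          # pretrained layer
experts = [nn.Linear(16, 16) for _ in range(2)]   # stand-ins for fine-tuned layers

target = SmileLinear(DemoConfig(), in_features=16, out_features=16, bias=True)
target = upscale_to_smile_linear(base, experts, target)

out = target(torch.randn(5, 16))  # shape: (5, 16)
```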
@@ -1,10 +1,12 @@
 import logging
-from typing import Dict, List, Tuple  # noqa: F401
+from typing import Dict, List, Optional, Tuple, Union  # noqa: F401
 
 import torch
 import torch.nn.functional as F
 from torch import Tensor, nn
 
+from .utils import _is_all_zeros, svd
+
 log = logging.getLogger(__name__)
 
 
@@ -12,50 +14,42 @@ class ExpertNotTrainedError(Exception):
     pass
 
 
-def _is_all_zeros(tensor: Tensor | List[Tensor]) -> bool:
-    if isinstance(tensor, Tensor):
-        return torch.allclose(tensor, torch.zeros_like(tensor))
-    else:
-        return all(_is_all_zeros(t) for t in tensor)
-
-
-def _svd(w: Tensor, full_matrices=True) -> Tuple[Tensor, Tensor, Tensor]:
-    u, s, vh = torch.linalg.svd(
-        w, full_matrices=full_matrices, driver="gesvd" if w.is_cuda else None
-    )
-    v = vh.T
-    return u, s, v
-
-
-def svd(
-    w: Tensor, full_matrices=True, accelerator=None
-) -> Tuple[Tensor, Tensor, Tensor]:
-    if accelerator is None:
-        return _svd(w, full_matrices=full_matrices)
-    original_device = w.device
-    w = w.to(accelerator)
-    u, s, v = _svd(w)
-    return u.to(original_device), s.to(original_device), v.to(original_device)
-
-
 class SmileGate(nn.Module):
+    __constants__ = ["in_features", "num_experts", "k"]
+    in_features: int
+    num_experts: int
+    k: int
+    weight: nn.Parameter
+
     def __init__(
         self,
         input_features: int,
         w_diff_list: List[Tensor],
         k: int,
-
+        svd_cache: List[
+            Tuple[Tensor, Tensor, Tensor]
+        ] = None,  # cached `svd_cache`, pass it to avoid recomputing
        upscaling_accelerator=None,
     ):
+        R"""
+        This constructs weights through SVD decomposition.
+
+        Args:
+            input_features: The dimension of input features.
+            w_diff_list: The list of weight matrices to be decomposed.
+            k: The number of singular values to keep.
+            svd_cache: The cached SVD decomposition results. If not provided, the SVD decomposition will be computed on the fly.
+            upscaling_accelerator: The accelerator to use for SVD decomposition.
+        """
         super().__init__()
         self.input_features = input_features
         self.num_experts = len(w_diff_list)
         weights = []
         for i, w_diff in enumerate(w_diff_list):
-            if
+            if svd_cache is None:
                 u, s, v = svd(w_diff, accelerator=upscaling_accelerator)
             else:
-                u, s, v =
+                u, s, v = svd_cache[i]
             u = u[:, :k]
             s = s[:k]
             v = v[:, :k]
@@ -86,8 +80,38 @@ class SmileGate(nn.Module):
 
 
 class SmileCompressedLinear(nn.Module):
-
+    """
+    This module is used to compress a linear layer using SVD decomposition.
+    """
+
+    __constants__ = ["in_features", "out_features", "k"]
+    in_features: int
+    out_features: int
+    k: int
+
+    u: nn.Parameter
+    svh: nn.Parameter
+    bias: Optional[nn.Parameter]
+
+    def __init__(
+        self,
+        model: nn.Linear,
+        k: int,
+        svd_cache: Optional[Tuple[Tensor, Tensor, Tensor]] = None,
+    ):
+        """
+        Initialize the SmileCompressedLinear module.
+
+        Args:
+            model (nn.Linear): The linear model to compress.
+            k (int): The number of singular values to keep.
+            svd_cache (Tuple[Tensor, Tensor, Tensor]): Cached SVD results.
+        """
         super().__init__()
+        self.in_features = model.in_features
+        self.out_features = model.out_features
+        self.k = k
+
         if svd_cache is None:
             u, s, v = svd(model.weight)
         else:
@@ -106,12 +130,36 @@ class SmileCompressedLinear(nn.Module):
             self.register_parameter("bias", None)
 
     def forward(self, x):
+        """
+        Forward pass of the SmileCompressedLinear module.
+
+        Args:
+            x (Tensor): The input tensor.
+
+        Returns:
+            Tensor: The output tensor.
+        """
         x = F.linear(x, self.svh)
         x = F.linear(x, self.u, self.bias)
         return x
 
 
 class SmileMoELinear(nn.Module):
+    __constants__ = [
+        "in_features",
+        "out_features",
+        "num_experts",
+        "top_k",
+        "gate_k",
+        "k",
+    ]
+    in_features: int
+    out_features: int
+    num_experts: int
+    top_k: int
+    gate_k: int
+    k: int
+
     @torch.no_grad()
     def __init__(
         self,
@@ -124,6 +172,19 @@ class SmileMoELinear(nn.Module):
         upscaling_accelerator=None,
         routing_use_diff=True,
     ):
+        """
+        Initialize the SmileMoELinear module.
+
+        Args:
+            pretrained_model (nn.Linear): The pretrained linear model.
+            finetuned_models (List[nn.Linear]): A list of fine-tuned linear models.
+            gate_k (int): The number of singular values to keep for the gate.
+            k (int): The number of singular values to keep for the experts.
+            top_k (int): The number of top experts to select.
+            full_matrices (bool): Whether to compute the full-sized U and V matrices.
+            upscaling_accelerator (str): The device to perform the computation on.
+            routing_use_diff (bool): Whether to use weight differences for routing.
+        """
         super().__init__()
         self.num_experts = len(finetuned_models)
         self.top_k = top_k
@@ -149,7 +210,7 @@ class SmileMoELinear(nn.Module):
                 input_features=self.in_features,
                 w_diff_list=w_diff_list,
                 k=gate_k,
-
+                svd_cache=svd_cache_list,
                 upscaling_accelerator=upscaling_accelerator,
             )
         else:
@@ -157,7 +218,7 @@ class SmileMoELinear(nn.Module):
                 input_features=self.in_features,
                 w_diff_list=[m.weight for m in finetuned_models],
                 k=gate_k,
-
+                svd_cache=None,
                 upscaling_accelerator=upscaling_accelerator,
             )
 
@@ -181,6 +242,15 @@ class SmileMoELinear(nn.Module):
         self.pretrained_model = pretrained_model
 
     def forward(self, hidden_states: Tensor):
+        """
+        Forward pass of the SmileMoELinear module.
+
+        Args:
+            hidden_states (Tensor): The input tensor.
+
+        Returns:
+            Tensor: The output tensor.
+        """
         pretrained_out = self.pretrained_model(hidden_states)
 
         input_shape = hidden_states.size()
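The substantive change in this last file is the completed `svd_cache` plumbing: the SVD of each expert's weight difference can be computed once and handed to `SmileGate` instead of being recomputed inside it. A rough sketch of that path, assuming the renamed module is importable as `fusion_bench.models.smile_moe.linear_from_module` and that `SmileGate`'s forward (not shown in this hunk) still returns one routing logit per expert:

```python
# Sketch only: precompute the SVD of each weight difference once and reuse it
# via the `svd_cache` argument added in this diff; the import path is assumed.
import torch
from fusion_bench.models.smile_moe.linear_from_module import SmileGate, svd

w_diff_list = [torch.randn(16, 16) for _ in range(2)]    # per-expert weight deltas
svd_cache_list = [svd(dw) for dw in w_diff_list]          # reusable (u, s, v) tuples

gate = SmileGate(
    input_features=16,
    w_diff_list=w_diff_list,
    k=4,                       # singular vectors kept per expert
    svd_cache=svd_cache_list,  # skips recomputing the SVD inside the gate
)
routing_logits = gate(torch.randn(5, 16))  # one routing logit per expert (assumed)
```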