PyPI - fusion-bench - Versions diffs - 0.2.9__py3-none-any.whl - Mend

fusion-bench 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (727) hide show

fusion_bench/models/smile_moe/linear.py ADDED Viewed

@@ -0,0 +1,256 @@
+import logging
+from typing import Dict, List, Tuple  # noqa: F401
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+log = logging.getLogger(__name__)
+class ExpertNotTrainedError(Exception):
+    pass
+def _is_all_zeros(tensor: Tensor | List[Tensor]) -> bool:
+    if isinstance(tensor, Tensor):
+        return torch.allclose(tensor, torch.zeros_like(tensor))
+    else:
+        return all(_is_all_zeros(t) for t in tensor)
+def _svd(w: Tensor, full_matrices=True) -> Tuple[Tensor, Tensor, Tensor]:
+    u, s, vh = torch.linalg.svd(
+        w, full_matrices=full_matrices, driver="gesvd" if w.is_cuda else None
+    )
+    v = vh.T
+    return u, s, v
+def svd(
+    w: Tensor, full_matrices=True, accelerator=None
+) -> Tuple[Tensor, Tensor, Tensor]:
+    if accelerator is None:
+        return _svd(w, full_matrices=full_matrices)
+    original_device = w.device
+    w = w.to(accelerator)
+    u, s, v = _svd(w)
+    return u.to(original_device), s.to(original_device), v.to(original_device)
+class SmileGate(nn.Module):
+    def __init__(
+        self,
+        input_features: int,
+        w_diff_list: List[Tensor],
+        k: int,
+        svd_list=None,  # cached `svd_list`, pass it to avoid recomputing
+        upscaling_accelerator=None,
+    ):
+        super().__init__()
+        self.input_features = input_features
+        self.num_experts = len(w_diff_list)
+        weights = []
+        for i, w_diff in enumerate(w_diff_list):
+            if svd_list is None:
+                u, s, v = svd(w_diff, accelerator=upscaling_accelerator)
+            else:
+                u, s, v = svd_list[i]
+            u = u[:, :k]
+            s = s[:k]
+            v = v[:, :k]
+            # weights.append((s * v).T)
+            weights.append(v.T)
+        self.k = s.size(0)  # k is the actual k after truncation
+        weights = (
+            torch.stack(weights, dim=0)
+            .reshape(self.num_experts * self.k, -1)
+            .contiguous()
+        )
+        self.weights = nn.Parameter(
+            weights
+        )  # weights should be a tensor of shape (num_experts * k, n)
+    def forward(self, x: Tensor):
+        batch_size = x.size(0)
+        if self.num_experts == 1:
+            return torch.ones(batch_size, 1, device=x.device, dtype=x.dtype)
+        routing_weights = F.linear(x, self.weights).view(
+            batch_size, self.num_experts, self.k
+        )
+        routing_weights = routing_weights.norm(p=2, dim=2)
+        return routing_weights
+class SmileCompressedLinear(nn.Module):
+    def __init__(self, model: nn.Linear, k: int, svd_cache=None):
+        super().__init__()
+        if svd_cache is None:
+            u, s, v = svd(model.weight)
+        else:
+            u, s, v = svd_cache
+        if k > 0:
+            u = u[:, :k]
+            s = s[:k]
+            v = v[:, :k]
+        self.u = nn.Parameter(u)
+        self.svh = nn.Parameter((s * v).T)
+        if model.bias is not None:
+            self.bias = nn.Parameter(model.bias.data, requires_grad=True)
+        else:
+            self.register_parameter("bias", None)
+    def forward(self, x):
+        x = F.linear(x, self.svh)
+        x = F.linear(x, self.u, self.bias)
+        return x
+class SmileMoELinear(nn.Module):
+    """
+    Mixture-of-experts stand-in for a linear layer.
+    The output is the pretrained layer's output plus a routed, weighted sum of expert outputs, where each expert is built from the difference between a fine-tuned layer and the pretrained layer.
+    """
+    @torch.no_grad()
+    def __init__(
+        self,
+        pretrained_model: nn.Linear,
+        finetuned_models: List[nn.Linear],
+        gate_k: int,
+        k: int,
+        top_k: int = 1,
+        full_matrices=True,
+        upscaling_accelerator=None,
+        routing_use_diff=True,
+    ):
+        """
+        Build the gate and the experts from a pretrained layer and its fine-tuned variants.
+        Args:
+            pretrained_model (nn.Linear): The shared layer; kept as `self.pretrained_model`.
+            finetuned_models (List[nn.Linear]): One layer per expert. Their `weight.data` is overwritten in place with the difference to the pretrained weight.
+            gate_k (int): `k` passed to `SmileGate`.
+            k (int): Rank passed to `SmileCompressedLinear` for each expert. If `k <= 0` the (modified) fine-tuned layers are used as experts directly.
+            top_k (int): Number of experts selected per input row in `forward`. Defaults to 1.
+            full_matrices (bool): Forwarded to `svd` when building the SVD cache.
+            upscaling_accelerator: Device forwarded to `svd` and `SmileGate`.
+            routing_use_diff (bool): If True the gate is built from the weight differences (reusing the SVD cache); otherwise from the fine-tuned weights themselves.
+        Raises:
+            ExpertNotTrainedError: If all weight differences are (numerically) zero.
+        """
+        super().__init__()
+        self.num_experts = len(finetuned_models)
+        self.top_k = top_k
+        self.k = k
+        self.gate_k = gate_k
+        self.in_features = pretrained_model.in_features
+        self.out_features = pretrained_model.out_features
+        w_diff_list = [m.weight - pretrained_model.weight for m in finetuned_models]
+        if _is_all_zeros(w_diff_list):
+            # All fine-tuned models are identical to the pretrained model
+            raise ExpertNotTrainedError()
+        # `svd_cache_list` only exists when this condition holds; both later uses are guarded by
+        # `routing_use_diff` or `k > 0` respectively.
+        if routing_use_diff or k > 0:
+            svd_cache_list = [
+                svd(w, full_matrices=full_matrices, accelerator=upscaling_accelerator)
+                for w in w_diff_list
+            ]  # the svd cache list to avoid recomputing
+        # construct the gate network
+        if routing_use_diff:
+            self.gate = SmileGate(
+                input_features=self.in_features,
+                w_diff_list=w_diff_list,
+                k=gate_k,
+                svd_list=svd_cache_list,
+                upscaling_accelerator=upscaling_accelerator,
+            )
+        else:
+            # Route on the fine-tuned weights themselves; this must run before the weights are
+            # replaced by the differences below.
+            self.gate = SmileGate(
+                input_features=self.in_features,
+                w_diff_list=[m.weight for m in finetuned_models],
+                k=gate_k,
+                svd_list=None,
+                upscaling_accelerator=upscaling_accelerator,
+            )
+        # construct experts
+        # From here on each fine-tuned layer holds only its difference to the pretrained weight.
+        for m, w_diff in zip(finetuned_models, w_diff_list):
+            m.weight.data = w_diff
+        if k > 0:
+            experts = [
+                SmileCompressedLinear(m, k, svd_cache=svd_cache)
+                for m, svd_cache in zip(finetuned_models, svd_cache_list)
+            ]
+        else:
+            # if k is not positive, the fine-tuned layers (now holding the weight differences)
+            # are used as experts without compression
+            experts = finetuned_models
+        self.experts = nn.ModuleList(experts)
+        if pretrained_model.bias is not None:
+            # NOTE(review): assumes every expert has a bias whenever the pretrained layer has one;
+            # an expert with `bias is None` would fail here - confirm against callers.
+            for m in experts:
+                m.bias.data = m.bias.data - pretrained_model.bias
+        # assign the pretrained model (the shared part)
+        self.pretrained_model = pretrained_model
+    def forward(self, hidden_states: Tensor):
+        """
+        Return the pretrained output plus the routing-weighted outputs of the `top_k` selected experts.
+        The input is flattened to (-1, in_features) for routing and the result is reshaped back to the input's leading dimensions with `out_features` as the last dimension.
+        """
+        pretrained_out = self.pretrained_model(hidden_states)
+        input_shape = hidden_states.size()
+        # NOTE(review): `view` requires a memory layout compatible with flattening; a
+        # non-contiguous input would presumably raise here - confirm whether callers can pass one.
+        hidden_states = hidden_states.view(-1, self.in_features)
+        router_logits = self.gate(hidden_states)
+        routing_weights = F.softmax(router_logits, dim=1)
+        # sample the expert according to the routing weights
+        routing_weights, selected_experts = torch.topk(
+            routing_weights, self.top_k, dim=-1
+        )
+        # Renormalise so the weights of the selected experts sum to one per row.
+        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+        final_hidden_states = torch.zeros(
+            (hidden_states.size(0), self.out_features),
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+        # One hot encode the selected experts to create an expert mask
+        # this will be used to easily index which expert is going to be solicited
+        expert_mask = torch.nn.functional.one_hot(
+            selected_experts, num_classes=self.num_experts
+        ).permute(2, 1, 0)
+        # Loop over all available experts in the model and perform the computation on each expert
+        for expert_idx in range(self.num_experts):
+            expert_layer = self.experts[expert_idx]
+            # `idx` is the position within the top-k selection, `top_x` the row index of the input.
+            idx, top_x = torch.where(expert_mask[expert_idx])
+            # Index the correct hidden states and compute the expert hidden state for
+            # the current expert. We need to make sure to multiply the output hidden
+            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
+            current_state = hidden_states[None, top_x].reshape(-1, self.in_features)
+            if current_state.numel() == 0:
+                # No row was routed to this expert.
+                continue
+            current_hidden_states = (
+                expert_layer(current_state) * routing_weights[top_x, idx, None]
+            )
+            # However `index_add_` only support torch tensors for indexing so we'll use
+            # the `top_x` tensor here.
+            final_hidden_states.index_add_(
+                0, top_x, current_hidden_states.to(hidden_states.dtype)
+            )
+        final_hidden_states = final_hidden_states.reshape(
+            *input_shape[:-1], self.out_features
+        )
+        final_hidden_states = pretrained_out + final_hidden_states
+        return final_hidden_states
+    @property
+    def weight(self):
+        """
+        Mimic a linear layer. Because in some cases, the user might look up the device (or dtype of parameters) of the linear layer using `linear_layer.weight.device`
+        """
+        return self.pretrained_model.weight
+    @property
+    def bias(self):
+        """Bias of the pretrained layer, exposed for the same reason as `weight`."""
+        return self.pretrained_model.bias
+    def __repr__(self):
+        # NOTE(review): the printed name is "SingularMoELinear", which differs from the class name.
+        return (
+            f"SingularMoELinear("
+            f"in_features={self.pretrained_model.in_features}, "
+            f"out_features={self.pretrained_model.out_features}, "
+            f"num_experts={self.num_experts}, "
+            f"top_k={self.top_k}, "
+            f"gate_k={self.gate_k}, "
+            f"k={self.k}"
+            f")"
+        )

fusion_bench/models/sparse_we_moe.py ADDED Viewed

@@ -0,0 +1,459 @@
+import functools
+import logging
+from copy import deepcopy
+from typing import List, Optional
+import numpy as np
+import torch
+import torch.func
+from torch import Tensor, nn
+from torch.func import functional_call
+from torch.nn import functional as F
+from tqdm.auto import tqdm
+from fusion_bench.utils.state_dict_arithmetic import (
+    state_dict_sub,
+    state_dict_weighted_sum,
+)
+from fusion_bench.utils.type import StateDictType
+log = logging.getLogger(__name__)
+def join_list(list_of_list: List[List]):
+    ans = []
+    for l in list_of_list:
+        ans.extend(l)
+    return ans
+def del_attr(obj, names: List[str]):
+    """
+    Deletes an attribute from an object recursively.
+    Args:
+        obj (object): Object to delete attribute from.
+        names (list): List of attribute names to delete recursively.
+    """
+    if len(names) == 1:
+        delattr(obj, names[0])
+    else:
+        del_attr(getattr(obj, names[0]), names[1:])
+def set_attr(obj, names: List[str], val):
+    """
+    Sets an attribute of an object recursively.
+    Args:
+        obj (object): Object to set attribute of.
+        names (list): List of attribute names to set recursively.
+        val (object): Value to set the attribute to.
+    """
+    if len(names) == 1:
+        setattr(obj, names[0], val)
+    else:
+        set_attr(getattr(obj, names[0]), names[1:], val)
+def get_attr(obj, names: List[str]):
+    """
+    Gets an attribute of an object recursively.
+    Args:
+        obj (object): Object to get attribute of.
+        names (list): List of attribute names to get recursively.
+    Returns:
+        object: The attribute of the object.
+    """
+    if len(names) == 1:
+        return getattr(obj, names[0])
+    else:
+        return get_attr(getattr(obj, names[0]), names[1:])
+class Depth_0_Gate(nn.Module):
+    def __init__(self, num_experts: int):
+        super().__init__()
+        self.weight = nn.Parameter(torch.empty(num_experts), requires_grad=True)
+    def init_weight(self, init_lambda: float):
+        nn.init.constant_(self.weight, init_lambda)
+    def forward(self, *args, **kwargs) -> Tensor:
+        return self.weight
+class Depth_1_Gate(nn.Module):
+    def __init__(self, hidden_size: int, num_experts: int):
+        super().__init__()
+        self.fc = nn.Linear(hidden_size, num_experts, bias=True)
+    def init_weight(self, init_lambda: float):
+        nn.init.normal_(self.fc.weight, std=0.01)
+        nn.init.constant_(self.fc.bias, init_lambda)
+    def forward(self, hidden_states: Tensor) -> Tensor:
+        return self.fc(hidden_states)
+class Depth_2_Gate(nn.Module):
+    def __init__(self, hidden_size: int, num_experts: int):
+        super().__init__()
+        self.fc1 = nn.Linear(hidden_size, hidden_size, bias=True)
+        self.fc2 = nn.Linear(hidden_size, num_experts, bias=True)
+    def init_weight(self, init_lambda: float):
+        nn.init.normal_(self.fc1.weight, std=0.01)
+        nn.init.zeros_(self.fc1.bias)
+        nn.init.normal_(self.fc2.weight, std=0.01)
+        nn.init.constant_(self.fc2.bias, init_lambda)
+    def forward(self, hidden_states: Tensor) -> Tensor:
+        hidden_states = F.relu(self.fc1(hidden_states))
+        return self.fc2(hidden_states)
+def construct_weight_ensembling_gate(
+    hidden_size: int,
+    num_experts: int,
+    init_lambda: float,
+    num_hidden_layers: int = 2,
+):
+    if num_hidden_layers == 0:
+        gate = Depth_0_Gate(num_experts)
+    elif num_hidden_layers == 1:
+        gate = Depth_1_Gate(hidden_size, num_experts)
+    elif num_hidden_layers == 2:
+        gate = Depth_2_Gate(hidden_size, num_experts)
+    else:
+        raise ValueError(f"Unsupported number of hidden layers: {num_hidden_layers}")
+    gate.num_hidden_layers = num_hidden_layers
+    gate.init_weight(init_lambda)
+    return gate
+def positional_encoding(layer_idx, dim=8):
+    """
+    layer index encoding
+    """
+    pos = layer_idx
+    i = np.arange(dim // 2)
+    angle_rates = 1 / np.power(10000, (2 * i) / dim)
+    encoding = np.concatenate([np.sin(pos * angle_rates), np.cos(pos * angle_rates)])
+    return encoding
+def _magnitude_prune(weight: Tensor, prune_ratio: float) -> Tensor:
+    """
+    Prune the weights by setting values below a certain quantile to zero.
+    Args:
+        weight (Tensor): The weight tensor to be pruned.
+        prune_ratio (float): The ratio of weights to prune.
+    Returns:
+        Tensor: The pruned weight tensor.
+    """
+    weight_abs = weight.abs()
+    mask = weight_abs > weight_abs.quantile(prune_ratio)
+    weight = weight * mask
+    return weight
+def _module_magnitude_prune(
+    model: Tensor, prune_ratio: float, layer_idx: int
+) -> Tensor:
+    """
+    Prune a module.
+    """
+    for name, param in tqdm(
+        model.named_parameters(),
+        "Magnitude Pruning On {} Linear Layer".format(layer_idx),
+        total=len(tuple(model.named_parameters())),
+    ):
+        param.data = _magnitude_prune(param, prune_ratio)
+    return model
+class SparseWeightEnsemblingMoE(nn.Module):
+    """
+    Weight-ensembling mixture of experts: a gate produces one coefficient per expert, the (pruned) task vectors are added to the base model's weights scaled by those coefficients, and the base model is run with the merged weights.
+    """
+    # variable to store the merged state dict temporarily
+    _merged_state_dict: StateDictType = None
+    def __init__(
+        self,
+        hidden_size: int,
+        base_model: nn.Module,
+        expert_models: List[nn.Module],
+        init_lambda: float = 0.2,
+        batch_first: bool = False,
+        router_hidden_layers: int = 2,
+        batch_reduce: bool = False,
+        num_layers: int = -1,
+        layer_idx: int = -1,
+        tv_prune_ratio: float = 0,
+    ):
+        """
+        Initializes the SparseWeightEnsemblingMoE class.
+        Args:
+            hidden_size (int): The size of the hidden layer in the models.
+            base_model (nn.Module): The base model that will be used as a reference for the expert models. It is frozen (`requires_grad_(False)`) here.
+            expert_models (List[nn.Module]): A list of expert models that will be combined. They are modified in place: their parameters are turned into (pruned) task vectors.
+            init_lambda (float, optional): The initial lambda value for the weight ensembling gate. Defaults to 0.2.
+            batch_first (bool, optional): If True, the input tensors are expected to have the batch size as the first dimension. Defaults to False.
+            router_hidden_layers (int, optional): The number of hidden layers in the router. Defaults to 2.
+            batch_reduce (bool): If True, the batch dimension of routing weights is reduced. Defaults to False.
+            num_layers (int): Total number of layers. Not used in this constructor.
+            layer_idx (int): Index of the layer; passed to the pruning helper.
+            tv_prune_ratio (float): Prune ratio passed to the pruning helper for the task vectors.
+        """
+        super().__init__()
+        self.num_experts = len(expert_models)
+        self.hidden_size = hidden_size
+        self.batch_first = batch_first
+        self.batch_reduce = batch_reduce
+        self.gate = construct_weight_ensembling_gate(
+            hidden_size,
+            self.num_experts,
+            init_lambda=init_lambda,
+            num_hidden_layers=router_hidden_layers,
+        )
+        # compute the task vectors
+        for name, param in base_model.named_parameters():
+            if not param.requires_grad:
+                # Base parameters that do not require grad get no task vector: the attribute is removed from every expert.
+                for m in expert_models:
+                    del_attr(m, name.split("."))
+            else:
+                # Replace each expert parameter by its difference to the base parameter.
+                for m in expert_models:
+                    get_attr(m, name.split(".")).data = (
+                        get_attr(m, name.split(".")) - param
+                    )
+        # sparse task vectors
+        expert_models = [
+            _module_magnitude_prune(m, prune_ratio=tv_prune_ratio, layer_idx=layer_idx)
+            for m in expert_models
+        ]
+        # fix base model and expert models
+        self.base_model = base_model.requires_grad_(False)
+        for m in expert_models:
+            m.requires_grad_(False)
+        self.task_vectors = nn.ModuleList(expert_models)
+    @property
+    def forward_model(self):
+        """
+        Callable that runs the base model with the currently merged state dict.
+        The state dict is bound when the property is read, so `merge_weights` has to be called first.
+        """
+        return functools.partial(
+            functional_call,
+            self.base_model,
+            self._merged_state_dict,
+        )
+    def merge_weights(self, expert_weights):
+        """
+        Build the merged state dict: base weights plus each task vector scaled by its coefficient.
+        Args:
+            expert_weights: One coefficient per task vector, iterated in the order of `self.task_vectors`.
+        Returns:
+            The merged state dict, which is also stored in `self._merged_state_dict`.
+        """
+        state_dict = self.base_model.state_dict(keep_vars=True)
+        for weight, task_vector in zip(expert_weights, self.task_vectors):
+            for name, param in task_vector.named_parameters():
+                state_dict[name] = state_dict[name] + weight * param
+        self._merged_state_dict = state_dict
+        return state_dict
+    def forward(self, hidden_states: Tensor):
+        """
+        Compute gate coefficients, merge the weights accordingly and run the base model.
+        Depending on the gate depth and `batch_reduce`, the weights are merged once for the whole input or once per sample.
+        """
+        if self.gate.num_hidden_layers == 0:
+            # The depth-0 gate does not depend on the input.
+            gate_weights = self.gate()
+        else:
+            gate_weights = self.gate(hidden_states)
+            # Average the gate output over the sequence dimension.
+            if self.batch_first:
+                # the input is in the shape of (batch_size, seq_len, hidden_size)
+                gate_weights = gate_weights.mean(dim=1)
+            else:
+                # the input is in the shape of (seq_len, batch_size, hidden_size)
+                gate_weights = gate_weights.mean(dim=0)
+        if self.gate.num_hidden_layers == 0:
+            self.merge_weights(gate_weights)
+            output_hidden_states = self.forward_model(hidden_states)
+        elif self.batch_reduce:
+            # One set of coefficients for the whole batch.
+            gate_weights = gate_weights.mean(dim=0)
+            self.merge_weights(gate_weights)
+            output_hidden_states = self.forward_model(hidden_states)
+        else:
+            # Per-sample coefficients: merge and run the model once per sample, then concatenate along the batch dimension.
+            output_hidden_states = []
+            for sample_idx, weights in enumerate(gate_weights):
+                self.merge_weights(weights)
+                if self.batch_first:
+                    output_hidden_states.append(
+                        self.forward_model(hidden_states[sample_idx : sample_idx + 1])
+                    )
+                else:
+                    output_hidden_states.append(
+                        self.forward_model(
+                            hidden_states[:, sample_idx : sample_idx + 1]
+                        )
+                    )
+            if self.batch_first:
+                output_hidden_states = torch.cat(output_hidden_states, dim=0)
+            else:
+                output_hidden_states = torch.cat(output_hidden_states, dim=1)
+        # Drop the temporary merged weights once the forward pass is done.
+        self._merged_state_dict = None
+        return output_hidden_states
+class SparseWeightEnsemblingMoE_ShardGate(nn.Module):
+    """
+    Variant of the sparse weight-ensembling MoE whose gate is supplied by the caller (`sharedgate`) instead of being constructed here, with an optional layer-index positional encoding prepended to the gate input.
+    """
+    # variable to store the merged state dict temporarily
+    _merged_state_dict: StateDictType = None
+    def __init__(
+        self,
+        hidden_size: int,
+        base_model: nn.Module,
+        expert_models: List[nn.Module],
+        init_lambda: float = 0.2,
+        batch_first: bool = False,
+        router_hidden_layers: int = 2,
+        batch_reduce: bool = False,
+        num_layers: int = -1,
+        layer_idx: int = -1,
+        tv_prune_ratio: float = 0,
+        sharedgate: nn.Module = None,
+        position_encoding: bool = False,
+        position_encoding_dim: int = 0,
+    ):
+        """
+        Initializes the SparseWeightEnsemblingMoE_ShardGate class.
+        Args:
+            hidden_size (int): The size of the hidden layer in the models.
+            base_model (nn.Module): The base model that will be used as a reference for the expert models. It is frozen (`requires_grad_(False)`) here.
+            expert_models (List[nn.Module]): A list of expert models that will be combined. They are modified in place: their parameters are turned into (pruned) task vectors.
+            init_lambda (float, optional): The initial lambda value for the weight ensembling gate. Defaults to 0.2. Not used in this constructor.
+            batch_first (bool, optional): If True, the input tensors are expected to have the batch size as the first dimension. Defaults to False.
+            router_hidden_layers (int, optional): The number of hidden layers in the router. Defaults to 2. Not used in this constructor.
+            batch_reduce (bool): If True, the batch dimension of routing weights is reduced. Defaults to False.
+            num_layers (int): Total number of layers. Not used in this constructor.
+            layer_idx (int): Index of the layer; used for the positional encoding and passed to the pruning helper.
+            tv_prune_ratio (float): Prune ratio passed to the pruning helper for the task vectors.
+            sharedgate (nn.Module): Shared routing mechanism; stored as `self.gate`. `forward` reads its `num_hidden_layers` attribute.
+            position_encoding (bool): Is Positional Encoding enabled?
+            position_encoding_dim (int): Positional Encoding dimension
+        """
+        super().__init__()
+        self.num_experts = len(expert_models)
+        self.hidden_size = hidden_size
+        self.batch_first = batch_first
+        self.batch_reduce = batch_reduce
+        self.position_encoding = position_encoding
+        self.position_encoding_dim = position_encoding_dim
+        # self.layer_idx = layer_idx
+        self.gate = sharedgate
+        if self.position_encoding:
+            # Kept as a plain tensor attribute; `forward` moves it to the input's device on each call.
+            self.layer_positional_encoding = torch.from_numpy(
+                positional_encoding(layer_idx, position_encoding_dim)
+            ).float()
+        # compute the task vectors
+        for name, param in base_model.named_parameters():
+            if not param.requires_grad:
+                # Base parameters that do not require grad get no task vector: the attribute is removed from every expert.
+                for m in expert_models:
+                    del_attr(m, name.split("."))
+            else:
+                # Replace each expert parameter by its difference to the base parameter.
+                for m in expert_models:
+                    get_attr(m, name.split(".")).data = (
+                        get_attr(m, name.split(".")) - param
+                    )
+        # sparse task vectors
+        expert_models = [
+            _module_magnitude_prune(m, prune_ratio=tv_prune_ratio, layer_idx=layer_idx)
+            for m in expert_models
+        ]
+        # fix base model and expert models
+        self.base_model = base_model.requires_grad_(False)
+        for m in expert_models:
+            m.requires_grad_(False)
+        self.task_vectors = nn.ModuleList(expert_models)
+    @property
+    def forward_model(self):
+        """
+        Callable that runs the base model with the currently merged state dict.
+        The state dict is bound when the property is read, so `merge_weights` has to be called first.
+        """
+        return functools.partial(
+            functional_call,
+            self.base_model,
+            self._merged_state_dict,
+        )
+    def merge_weights(self, expert_weights):
+        """
+        Build the merged state dict: base weights plus each task vector scaled by its coefficient.
+        Args:
+            expert_weights: One coefficient per task vector, iterated in the order of `self.task_vectors`.
+        Returns:
+            The merged state dict, which is also stored in `self._merged_state_dict`.
+        """
+        state_dict = self.base_model.state_dict(keep_vars=True)
+        for weight, task_vector in zip(expert_weights, self.task_vectors):
+            for name, param in task_vector.named_parameters():
+                state_dict[name] = state_dict[name] + weight * param
+        self._merged_state_dict = state_dict
+        return state_dict
+    def forward(self, hidden_states: Tensor):
+        """
+        Compute gate coefficients with the shared gate, merge the weights accordingly and run the base model.
+        When positional encoding is enabled, the layer encoding is concatenated in front of `hidden_states` along the last dimension before it is passed to the gate; the base model always receives the original `hidden_states`.
+        """
+        gate_input = hidden_states
+        if self.gate.num_hidden_layers == 0:
+            # A gate with zero hidden layers is called without input.
+            gate_weights = self.gate()
+        else:
+            if self.position_encoding:
+                # Broadcast the layer encoding over the first two dimensions of the input.
+                layer_positional_encoding = (
+                    self.layer_positional_encoding.unsqueeze(0)
+                    .unsqueeze(0)
+                    .expand(
+                        hidden_states.size()[0],
+                        hidden_states.size()[1],
+                        self.position_encoding_dim,
+                    )
+                )
+                layer_positional_encoding = layer_positional_encoding.to(
+                    hidden_states.device
+                )
+                gate_input = torch.cat(
+                    (layer_positional_encoding, hidden_states), dim=-1
+                )
+            gate_weights = self.gate(gate_input)
+            # Average the gate output over the sequence dimension.
+            if self.batch_first:
+                # the input is in the shape of (batch_size, seq_len, hidden_size)
+                gate_weights = gate_weights.mean(dim=1)
+            else:
+                # the input is in the shape of (seq_len, batch_size, hidden_size)
+                gate_weights = gate_weights.mean(dim=0)
+        # print('self.batch_reduce'+'-------------'+str(self.batch_reduce)+'-------------')
+        if self.gate.num_hidden_layers == 0:
+            self.merge_weights(gate_weights)
+            output_hidden_states = self.forward_model(hidden_states)
+        elif self.batch_reduce:
+            # One set of coefficients for the whole batch.
+            gate_weights = gate_weights.mean(dim=0)
+            self.merge_weights(gate_weights)
+            output_hidden_states = self.forward_model(hidden_states)
+        else:
+            # Per-sample coefficients: merge and run the model once per sample, then concatenate along the batch dimension.
+            output_hidden_states = []
+            for sample_idx, weights in enumerate(gate_weights):
+                self.merge_weights(weights)
+                if self.batch_first:
+                    output_hidden_states.append(
+                        self.forward_model(hidden_states[sample_idx : sample_idx + 1])
+                    )
+                else:
+                    output_hidden_states.append(
+                        self.forward_model(
+                            hidden_states[:, sample_idx : sample_idx + 1]
+                        )
+                    )
+            if self.batch_first:
+                output_hidden_states = torch.cat(output_hidden_states, dim=0)
+            else:
+                output_hidden_states = torch.cat(output_hidden_states, dim=1)
+        # Drop the temporary merged weights once the forward pass is done.
+        self._merged_state_dict = None
+        return output_hidden_states