PyPI - fusion-bench - Versions diffs - 0.2.9__py3-none-any.whl - Mend

fusion-bench 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (727) hide show

fusion_bench/models/nyuv2/resnet_dilated.py ADDED Viewed

@@ -0,0 +1,99 @@
+from typing import Literal
+import torch.nn as nn
+from . import resnet
+class ResnetDilated(nn.Module):
+    """
+    Wrap a ResNet backbone and turn strided convolutions of its last stages into dilated ones.
+    The stem (`conv1`, `bn1`, `relu`, `maxpool`) and `layer1`..`layer4` of `orig_resnet` are reused
+    as attributes of this module; no pooling / FC head is copied. The layers of `orig_resnet`
+    are modified in place.
+    Args:
+        orig_resnet: Module exposing `conv1`, `bn1`, `relu`, `maxpool`, `layer1`..`layer4` and `feature_dim`.
+        dilate_scale: 8 rewrites `layer3` (dilate=2) and `layer4` (dilate=4); 16 rewrites only
+            `layer4` (dilate=2); any other value leaves every layer untouched.
+    """
+    def __init__(self, orig_resnet, dilate_scale=8):
+        super().__init__()
+        from functools import partial
+        # (stage name, dilation) pairs to rewrite for the requested scale
+        if dilate_scale == 8:
+            stage_dilations = (("layer3", 2), ("layer4", 4))
+        elif dilate_scale == 16:
+            stage_dilations = (("layer4", 2),)
+        else:
+            stage_dilations = ()
+        for stage_name, rate in stage_dilations:
+            getattr(orig_resnet, stage_name).apply(
+                partial(self._nostride_dilate, dilate=rate)
+            )
+        # take the pre-defined ResNet stages, except AvgPool and FC
+        reused_parts = (
+            "conv1",
+            "bn1",
+            "relu",
+            "maxpool",
+            "layer1",
+            "layer2",
+            "layer3",
+            "layer4",
+        )
+        for part in reused_parts:
+            setattr(self, part, getattr(orig_resnet, part))
+        self.feature_dim = orig_resnet.feature_dim
+    def _nostride_dilate(self, m, dilate):
+        """
+        Callback for `nn.Module.apply`: rewrite one convolution module in place.
+        Modules whose class name does not contain "Conv" are ignored. A stride-(2, 2) convolution
+        gets stride (1, 1) and uses `dilate // 2`; every other convolution uses `dilate`.
+        Dilation and padding are only overwritten for (3, 3) kernels.
+        """
+        if "Conv" not in m.__class__.__name__:
+            return
+        if m.stride == (2, 2):
+            # the convolution with stride: remove the stride, halve the dilation
+            m.stride = (1, 1)
+            rate = dilate // 2
+        else:
+            # other convolutions
+            rate = dilate
+        if m.kernel_size == (3, 3):
+            m.dilation = (rate, rate)
+            m.padding = (rate, rate)
+    def forward(self, x):
+        """Run the stem followed by `layer1`..`layer4` and return the final feature map."""
+        x = self.relu(self.bn1(self.conv1(x)))
+        x = self.maxpool(x)
+        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
+            x = stage(x)
+        return x
+    def forward_stage(self, x, stage):
+        """
+        Run a single named part of the network on `x`.
+        "conv" runs the stem; "layer1" runs the stem and then `layer1`; "layer1_without_conv" runs
+        only `layer1`; "layer2"/"layer3"/"layer4" run only that layer.
+        """
+        assert stage in [
+            "conv",
+            "layer1",
+            "layer2",
+            "layer3",
+            "layer4",
+            "layer1_without_conv",
+        ]
+        if stage in ("conv", "layer1"):
+            x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
+            return self.layer1(x) if stage == "layer1" else x
+        if stage == "layer1_without_conv":
+            return self.layer1(x)
+        # Stage 2, 3 or 4
+        return getattr(self, stage)(x)
+def resnet_dilated(
+    basenet: str, pretrained: bool = True, dilate_scale: Literal[8, 16] = 8
+):
+    r"""Build a dilated ResNet, following `"Dilated Residual Networks" <https://openaccess.thecvf.com/content_cvpr_2017/papers/Yu_Dilated_Residual_Networks_CVPR_2017_paper.pdf>`_
+    Args:
+        basenet (str): Name of the ResNet constructor to look up in the sibling `resnet` module.
+        pretrained (bool): Forwarded to that constructor as `pretrained=`.
+        dilate_scale ({8, 16}, default=8): Dilation scheme forwarded to `ResnetDilated`.
+    """
+    # look the constructor up by name in the module namespace (KeyError if it is not defined there)
+    backbone_factory = vars(resnet)[basenet]
+    backbone = backbone_factory(pretrained=pretrained)
+    return ResnetDilated(backbone, dilate_scale=dilate_scale)

fusion_bench/models/parameter_dict.py ADDED Viewed

@@ -0,0 +1,75 @@
+from typing import List, Mapping
+import torch
+from torch import nn
+# Public API of this module. `__all__` has to be a sequence of names: a bare string is iterated
+# character by character by `from ... import *`. The entry must also match the spelling of the
+# `ParameterDictModel` class defined in this module.
+__all__ = ["ParameterDictModel"]
+def set_attr(obj, names: List[str], val, check_parent: bool = False):
+    """
+    Set a nested attribute by walking a path of attribute names.
+    Args:
+        obj (object): Root object to start from.
+        names (list): Attribute names forming the path; the last one is the attribute that gets assigned.
+        val (object): Value assigned to the final attribute.
+        check_parent (bool): If True, an intermediate attribute that does not exist yet is created as an
+            empty `nn.Module` before descending into it. If False, a missing intermediate attribute
+            makes `getattr` raise `AttributeError`.
+    """
+    node = obj
+    for parent_name in names[:-1]:
+        if check_parent and not hasattr(node, parent_name):
+            setattr(node, parent_name, nn.Module())
+        node = getattr(node, parent_name)
+    setattr(node, names[-1], val)
+def has_attr(obj, names: List[str]):
+    """
+    Check whether a nested attribute path exists on an object.
+    Args:
+        obj (object): Root object to start from.
+        names (list): Attribute names forming the path, outermost first.
+    Returns:
+        bool: True if every attribute along the path exists; False as soon as one of them
+            (an intermediate attribute or the final one) is missing.
+    """
+    current = obj
+    for parent_name in names[:-1]:
+        # a missing intermediate attribute means the full path cannot exist;
+        # report False instead of letting getattr raise AttributeError
+        if not hasattr(current, parent_name):
+            return False
+        current = getattr(current, parent_name)
+    return hasattr(current, names[-1])
+class ParameterDictModel(nn.Module):
+    """
+    A module whose parameters are populated from a name -> parameter mapping.
+    Intended as an `nn.ParameterDict`-like container that also supports keys with dots: each key is
+    split on "." and assigned as a nested attribute, with missing intermediate levels created as
+    empty `nn.Module` containers.
+    """
+    def __init__(
+        self,
+        parameters: Mapping[str, nn.Parameter],
+    ):
+        super().__init__()
+        for key, value in parameters.items():
+            assert isinstance(value, nn.Parameter), f"{key} is not a nn.Parameter"
+            attribute_path = key.split(".")
+            set_attr(self, attribute_path, value, check_parent=True)
+    def __repr__(self):
+        """
+        Build a one-line summary listing every parameter name with its size.
+        Returns:
+            str: `ClassName(name: size, ...)` for all entries of `named_parameters()`.
+        """
+        summary = ", ".join(
+            f"{name}: {param.size()}" for name, param in self.named_parameters()
+        )
+        return f"{self.__class__.__name__}({summary})"

fusion_bench/models/rankone_moe.py ADDED Viewed

@@ -0,0 +1,410 @@
+import functools
+import logging
+from typing import Dict, List, Tuple  # noqa: F401
+import torch
+import torch.func
+from torch import Tensor, nn
+from torch.func import functional_call
+from torch.nn import functional as F
+from fusion_bench.utils.type import StateDictType
+log = logging.getLogger(__name__)
+def join_list(list_of_list: List[List]):
+    """Flatten one level of nesting: concatenate the inner lists, in order, into a new list."""
+    return [item for inner in list_of_list for item in inner]
+def del_attr(obj, names: List[str]):
+    """
+    Delete a nested attribute by walking a path of attribute names.
+    Args:
+        obj (object): Root object to start from.
+        names (list): Attribute names forming the path; the last one is the attribute that gets deleted.
+    """
+    owner = obj
+    for parent_name in names[:-1]:
+        owner = getattr(owner, parent_name)
+    delattr(owner, names[-1])
+def set_attr(obj, names: List[str], val):
+    """
+    Set a nested attribute by walking a path of attribute names.
+    Args:
+        obj (object): Root object to start from.
+        names (list): Attribute names forming the path; the last one is the attribute that gets assigned.
+        val (object): Value assigned to the final attribute.
+    """
+    owner = obj
+    for parent_name in names[:-1]:
+        owner = getattr(owner, parent_name)
+    setattr(owner, names[-1], val)
+def get_attr(obj, names: List[str]):
+    """
+    Read a nested attribute by walking a path of attribute names.
+    Args:
+        obj (object): Root object to start from.
+        names (list): Attribute names forming the path, outermost first.
+    Returns:
+        object: The value found at the end of the path.
+    """
+    owner = obj
+    for parent_name in names[:-1]:
+        owner = getattr(owner, parent_name)
+    return getattr(owner, names[-1])
+class Depth_0_Gate(nn.Module):
+    """Input-independent gate: a single learnable vector with one entry per expert."""
+    def __init__(self, num_experts: int):
+        super().__init__()
+        # left uninitialised here; `init_weight` is expected to fill it
+        routing_vector = torch.empty(num_experts)
+        self.weight = nn.Parameter(routing_vector, requires_grad=True)
+    def init_weight(self, init_lambda: float):
+        """Fill every entry of the gate vector with `init_lambda`."""
+        nn.init.constant_(self.weight, val=init_lambda)
+    def forward(self, *args, **kwargs) -> Tensor:
+        """Return the gate vector itself; all positional and keyword arguments are ignored."""
+        return self.weight
+class Depth_1_Gate(nn.Module):
+    """Gate made of a single linear layer mapping hidden states to one logit per expert."""
+    def __init__(self, hidden_size: int, num_experts: int):
+        super().__init__()
+        self.fc = nn.Linear(
+            in_features=hidden_size, out_features=num_experts, bias=True
+        )
+    def init_weight(self, init_lambda: float):
+        """Draw the weight from N(0, 0.01^2) and set every bias entry to `init_lambda`."""
+        projection = self.fc
+        nn.init.normal_(projection.weight, std=0.01)
+        nn.init.constant_(projection.bias, init_lambda)
+    def forward(self, hidden_states: Tensor) -> Tensor:
+        """Apply the linear layer to `hidden_states` and return the resulting logits."""
+        logits = self.fc(hidden_states)
+        return logits
+class Depth_2_Gate(nn.Module):
+    """Two-layer gate: linear -> ReLU -> linear, with a hidden width of `2 * num_experts`."""
+    def __init__(self, hidden_size: int, num_experts: int):
+        super().__init__()
+        inner_width = num_experts * 2
+        self.fc1 = nn.Linear(hidden_size, inner_width, bias=True)
+        self.fc2 = nn.Linear(inner_width, num_experts, bias=True)
+    def init_weight(self, init_lambda: float):
+        """
+        Draw both weights from N(0, 0.01^2), zero the first bias and set the output bias to `init_lambda`.
+        """
+        first, second = self.fc1, self.fc2
+        # the two normal_ draws stay in this order so the random stream is consumed identically
+        nn.init.normal_(first.weight, std=0.01)
+        nn.init.zeros_(first.bias)
+        nn.init.normal_(second.weight, std=0.01)
+        nn.init.constant_(second.bias, init_lambda)
+    def forward(self, hidden_states: Tensor) -> Tensor:
+        """Return the logits produced by `fc2(relu(fc1(hidden_states)))`."""
+        return self.fc2(F.relu(self.fc1(hidden_states)))
+def construct_rankone_moe_gate(
+    hidden_size: int,
+    num_experts: int,
+    init_lambda: float,
+    num_hidden_layers: int = 2,
+):
+    """
+    Build and initialise the gate (router) with the requested depth.
+    Args:
+        hidden_size (int): Input width of the gate; unused by the depth-0 gate.
+        num_experts (int): Number of outputs of the gate.
+        init_lambda (float): Value passed to the gate's `init_weight`.
+        num_hidden_layers (int): 0, 1 or 2, selecting `Depth_0_Gate`, `Depth_1_Gate` or `Depth_2_Gate`.
+    Returns:
+        nn.Module: The initialised gate, with `num_hidden_layers` stored on it as an attribute.
+    Raises:
+        ValueError: If `num_hidden_layers` is not 0, 1 or 2.
+    """
+    if num_hidden_layers not in (0, 1, 2):
+        raise ValueError(f"Unsupported number of hidden layers: {num_hidden_layers}")
+    if num_hidden_layers == 0:
+        gate = Depth_0_Gate(num_experts)
+    else:
+        gate_cls = Depth_1_Gate if num_hidden_layers == 1 else Depth_2_Gate
+        gate = gate_cls(hidden_size, num_experts)
+    # remembered on the gate so callers can tell whether it needs an input
+    gate.num_hidden_layers = num_hidden_layers
+    gate.init_weight(init_lambda)
+    return gate
+class ExpertNotTrainedError(Exception):
+    """Raised when an expert's weight task vector is all zeros, i.e. it equals the base model."""
+def _is_all_zeros(tensor: Tensor | List[Tensor]) -> bool:
+    """
+    Tell whether a tensor, or every tensor in a (possibly nested) iterable, is numerically zero.
+    A tensor counts as zero when `torch.allclose` with its default tolerances matches it against zeros.
+    """
+    if not isinstance(tensor, Tensor):
+        # a collection: every member has to be zero (an empty collection is vacuously zero)
+        for member in tensor:
+            if not _is_all_zeros(member):
+                return False
+        return True
+    reference = torch.zeros_like(tensor)
+    return torch.allclose(tensor, reference)
+def _svd(w: Tensor, full_matrices=True) -> Tuple[Tensor, Tensor, Tensor]:
+    """
+    Run `torch.linalg.svd` on `w` and return `(u, s, v)`, where `v` is the transpose (`.T`) of the returned `vh`.
+    The "gesvd" driver is requested when `w` lives on a CUDA device; otherwise the default driver is used.
+    """
+    driver = "gesvd" if w.is_cuda else None
+    left, singular_values, right_h = torch.linalg.svd(
+        w, full_matrices=full_matrices, driver=driver
+    )
+    return left, singular_values, right_h.T
+def svd(
+    w: Tensor, full_matrices=True, accelerator=None
+) -> Tuple[Tensor, Tensor, Tensor]:
+    """
+    Perform SVD on a tensor, optionally computing it on another device.
+    Args:
+        w (Tensor): Matrix to decompose.
+        full_matrices (bool): Forwarded to `_svd` on both code paths.
+        accelerator: Device to run the decomposition on. `None` or `False` means "decompose `w`
+            where it is"; any other value is passed to `Tensor.to`.
+    Returns:
+        Tuple[Tensor, Tensor, Tensor]: `(u, s, v)` as returned by `_svd`, located on the device `w`
+            was on when the function was called.
+    """
+    # `False` is accepted alongside `None` because `RankOneMoE` uses `svd_accelerator=False` as its
+    # default, and `False` is not a valid target for `Tensor.to`.
+    if accelerator is None or accelerator is False:
+        return _svd(w, full_matrices=full_matrices)
+    original_device = w.device
+    # `full_matrices` has to be forwarded here as well so both paths return factors of the same shape
+    u, s, v = _svd(w.to(accelerator), full_matrices=full_matrices)
+    return u.to(original_device), s.to(original_device), v.to(original_device)
+def fun_joint_svd(
+    w_list: List[Tensor], accelerator=None
+) -> List[List[Tensor]]:
+    """
+    Decompose several matrices with one shared SVD of their column-wise concatenation.
+    The matrices are concatenated along dim=1, a reduced SVD (`full_matrices=False`) of the stacked
+    matrix is computed, and `vh` is cut back into one column block per input matrix.
+    Args:
+        w_list (List[Tensor]): Matrices to decompose; they are concatenated along dim=1.
+        accelerator: If not None, the stacked matrix is moved there with `Tensor.to` for the SVD.
+    Returns:
+        List[List[Tensor]]: One `[u, s, v]` entry per input matrix, on the device of the stacked
+            matrix before any move. `u` and `s` are shared by all entries; `v` is the transpose of
+            that matrix's column block of `vh`.
+    """
+    w = torch.cat(w_list, dim=1)  # stacked_matrix
+    original_device = w.device
+    if accelerator is not None:
+        w = w.to(accelerator)
+    u_c, s_c, vh_c = torch.linalg.svd(
+        w, full_matrices=False, driver="gesvd" if w.is_cuda else None
+    )
+    # u and s are identical for every input matrix, so move them back only once
+    u = u_c.to(original_device)
+    s = s_c.to(original_device)
+    svd_list = []
+    offset = 0
+    for matrix in w_list:
+        n_cols = matrix.size(1)
+        # columns [offset, offset + n_cols) of vh belong to this matrix
+        v = vh_c[:, offset : offset + n_cols].T
+        svd_list.append([u, s, v.to(original_device)])
+        offset += n_cols
+    return svd_list
+class RankOneMoE(nn.Module):
+    """
+    Mixture-of-experts wrapper that merges truncated-SVD task vectors of `fc1`/`fc2` into a frozen base model.
+    At construction every expert is turned into a task vector (expert minus base), the `fc1.weight` /
+    `fc2.weight` task vectors are factorised with SVD, and the biases are stacked. At call time a gate
+    produces one coefficient per kept singular component, the coefficients scale those components,
+    and the result is added to the base weights before the base model is invoked functionally.
+    """
+    # variable to store the merged state dict temporarily
+    _merged_state_dict: StateDictType = None
+    def __init__(
+        self,
+        hidden_size: int,
+        base_model: nn.Module,
+        expert_models: List[nn.Module],
+        init_lambda: float = 0.2,
+        batch_first: bool = False,
+        router_hidden_layers: int = 2,
+        batch_reduce: bool = False,
+        svd_accelerator=False,
+        rank_k: int = -1,
+        select_k: int = -1,
+    ):
+        """
+        Initializes the RankOneMoE class.
+        https://github.com/EnnengYang/RankOne-MoE
+        Args:
+            hidden_size (int): The size of the hidden layer in the models.
+            base_model (nn.Module): The base model that will be used as a reference for the expert models.
+            expert_models (List[nn.Module]): A list of expert models that will be combined. They are modified in place (turned into task vectors, then emptied).
+            init_lambda (float, optional): The initial lambda value for the weight ensembling gate; also the fixed scale applied to the bias task vectors in `merge_weights`. Defaults to 0.2.
+            batch_first (bool, optional): If True, the input tensors are expected to have the batch size as the first dimension. Defaults to False.
+            router_hidden_layers (int, optional): The number of hidden layers in the router. Defaults to 2.
+            batch_reduce (bool): If True, the batch dimension of routing weights is reduced. Defaults to False.
+            svd_accelerator: Passed as `accelerator` to `svd` for every expert weight.
+            rank_k (int): Number of leading singular components kept per expert weight (used as a slice bound); the gate has `num_experts * rank_k` outputs.
+            select_k (int): If greater than 0, `merge_weights` first applies `top_k_soft` with this `k` to the gate output.
+        Raises:
+            ExpertNotTrainedError: If any expert parameter whose name contains ".weight" is all zeros after subtracting the base model.
+        """
+        super().__init__()
+        self.num_experts = len(expert_models)
+        self.hidden_size = hidden_size
+        self.batch_first = batch_first
+        self.batch_reduce = batch_reduce
+        self.svd_accelerator = svd_accelerator
+        self.rank_k = rank_k
+        self.select_k = select_k
+        self.init_lambda = init_lambda
+        # NOTE(review): with the default rank_k=-1 the gate size below becomes negative and the
+        # `[: self.rank_k]` slices further down drop the last component — presumably callers always
+        # pass a positive rank_k; confirm.
+        self.gate = construct_rankone_moe_gate(
+            hidden_size=hidden_size,
+            num_experts=int(self.num_experts * self.rank_k),
+            init_lambda=init_lambda,
+            num_hidden_layers=router_hidden_layers,
+        )
+        # compute the task vectors
+        for name, param in base_model.named_parameters():
+            if not param.requires_grad:
+                # parameters frozen in the base model are removed from every expert
+                for m in expert_models:
+                    del_attr(m, name.split("."))
+            else:
+                # overwrite the expert parameter in place with (expert - base)
+                for m in expert_models:
+                    get_attr(m, name.split(".")).data = (
+                        get_attr(m, name.split(".")) - param
+                    )
+        # fix base model and expert models
+        self.base_model = base_model.requires_grad_(False)
+        for m in expert_models:
+            m.requires_grad_(False)
+        # task vector (only bias term); requires every expert to expose `fc1` and `fc2`
+        self.task_vectors_fc1_bias = nn.Parameter(
+            torch.stack([e.fc1.bias for e in expert_models], dim=0), requires_grad=False
+        )
+        self.task_vectors_fc2_bias = nn.Parameter(
+            torch.stack([e.fc2.bias for e in expert_models], dim=0), requires_grad=False
+        )
+        # SVD representation of task vector (only weight term)
+        self.task_vectors_fc1_u = nn.ParameterList()
+        self.task_vectors_fc1_svh = nn.ParameterList()
+        self.task_vectors_fc2_u = nn.ParameterList()
+        self.task_vectors_fc2_svh = nn.ParameterList()
+        for m in expert_models:
+            for name, param in m.named_parameters():
+                if ".weight" in name:
+                    if _is_all_zeros(param):
+                        # All fine-tuned models are identical to the pretrained model
+                        raise ExpertNotTrainedError()
+                    # NOTE(review): the default svd_accelerator=False is forwarded here, while `svd`
+                    # only skips the device move for `None` — confirm callers pass None or a device.
+                    u, s, v = svd(param, accelerator=self.svd_accelerator)
+                    # keep the leading rank_k singular components
+                    u = u[:, : self.rank_k]
+                    s = s[: self.rank_k]
+                    v = v[:, : self.rank_k]
+                    # only fc1.weight / fc2.weight are stored: `u` transposed, and `v` scaled
+                    # column-wise by `s` and then transposed
+                    if "fc1.weight" == name:
+                        self.task_vectors_fc1_u.append(
+                            nn.Parameter(u.T, requires_grad=False)
+                        )
+                        self.task_vectors_fc1_svh.append(
+                            nn.Parameter((s * v).T, requires_grad=False)
+                        )
+                    elif "fc2.weight" == name:
+                        self.task_vectors_fc2_u.append(
+                            nn.Parameter(u.T, requires_grad=False)
+                        )
+                        self.task_vectors_fc2_svh.append(
+                            nn.Parameter((s * v).T, requires_grad=False)
+                        )
+        # remove the original module from fine-tuned models to save memory
+        for name, param in base_model.named_parameters():
+            name_list = name.split(".")
+            for m in expert_models:
+                set_attr(m, name_list, None)
+    @property
+    def forward_model(self):
+        """Return a callable that runs `base_model` via `functional_call` with the current `_merged_state_dict`."""
+        return functools.partial(
+            functional_call,
+            self.base_model,
+            self._merged_state_dict,
+        )
+    def top_k_soft(self, s, k):
+        """
+        Softly keep the `k` largest entries of `s`.
+        The smallest of the top-k values is used as a threshold; each entry is multiplied by
+        `sigmoid(100 * (entry - threshold))`, so an entry equal to the threshold is scaled by 0.5.
+        """
+        threshold, _ = torch.topk(s, k, largest=True, sorted=False)
+        min_threshold = threshold.min()
+        # sigmoid -> mask
+        mask = torch.sigmoid(100 * (s - min_threshold))
+        result = s * mask
+        return result
+    def merge_weights(self, expert_weights):
+        """
+        Build the merged state dict for the given gate output and store it in `_merged_state_dict`.
+        Args:
+            expert_weights: Gate coefficients; split along the last dim into `num_experts` chunks.
+                Assumes a 1-D tensor, since each chunk is passed to `torch.diag` and then `torch.mm` — TODO confirm.
+        Returns:
+            The merged state dict (base entries plus the weighted task vectors for `fc1`/`fc2`).
+        """
+        state_dict = self.base_model.state_dict(keep_vars=True)
+        # Select top-k experts from the expert pool for fusion
+        if self.select_k > 0:
+            expert_weights = self.top_k_soft(expert_weights, self.select_k)
+        for name in state_dict:
+            if name == "fc1.bias":
+                # biases are merged with the fixed `init_lambda`, not with the gate output
+                for param in self.task_vectors_fc1_bias:
+                    state_dict[name] = state_dict[name] + self.init_lambda * param
+            elif name == "fc2.bias":
+                for param in self.task_vectors_fc2_bias:
+                    state_dict[name] = state_dict[name] + self.init_lambda * param
+            elif name == "fc1.weight":
+                # one chunk of coefficients per expert
+                w_list = torch.split(
+                    expert_weights,
+                    int(expert_weights.size(-1) / self.num_experts),
+                    dim=-1,
+                )
+                for weight, u, svh in zip(
+                    w_list, self.task_vectors_fc1_u, self.task_vectors_fc1_svh
+                ):
+                    # scale each stored component (row of `u`) by its coefficient, then recombine with `svh`
+                    weight_diag = torch.diag(weight)
+                    weight_u = torch.mm(weight_diag, u)
+                    result = torch.matmul(weight_u.T, svh)
+                    state_dict[name] = state_dict[name] + result
+            elif name == "fc2.weight":
+                # same procedure as for fc1.weight, using the fc2 factors
+                w_list = torch.split(
+                    expert_weights,
+                    int(expert_weights.size(-1) / self.num_experts),
+                    dim=-1,
+                )
+                for weight, u, svh in zip(
+                    w_list, self.task_vectors_fc2_u, self.task_vectors_fc2_svh
+                ):
+                    weight_diag = torch.diag(weight)
+                    weight_u = torch.mm(weight_diag, u)
+                    result = torch.matmul(weight_u.T, svh)
+                    state_dict[name] = state_dict[name] + result
+        self._merged_state_dict = state_dict
+        return state_dict
+    def forward(self, hidden_states: Tensor):
+        """
+        Compute gate coefficients, merge the weights and run the base model on `hidden_states`.
+        With a depth-0 gate, or with `batch_reduce=True`, a single merged model is used for the whole
+        input. Otherwise the weights are merged and the base model is run once per entry of the gate
+        output, and the per-sample outputs are concatenated along the batch dimension.
+        """
+        if self.gate.num_hidden_layers == 0:
+            # the depth-0 gate does not look at the input
+            gate_weights = self.gate()
+        else:
+            gate_weights = self.gate(hidden_states)
+            if self.batch_first:
+                # the input is in the shape of (batch_size, seq_len, hidden_size)
+                gate_weights = gate_weights.mean(dim=1)
+            else:
+                # the input is in the shape of (seq_len, batch_size, hidden_size)
+                gate_weights = gate_weights.mean(dim=0)
+        if self.gate.num_hidden_layers == 0:
+            self.merge_weights(gate_weights)
+            output_hidden_states = self.forward_model(hidden_states)
+        elif self.batch_reduce:
+            # average the coefficients over the remaining leading dimension
+            gate_weights = gate_weights.mean(dim=0)
+            self.merge_weights(gate_weights)
+            output_hidden_states = self.forward_model(hidden_states)
+        else:
+            output_hidden_states = []
+            for sample_idx, weights in enumerate(gate_weights):
+                # merged weights differ per sample, so each sample is run on its own
+                self.merge_weights(weights)
+                if self.batch_first:
+                    output_hidden_states.append(
+                        self.forward_model(hidden_states[sample_idx : sample_idx + 1])
+                    )
+                else:
+                    output_hidden_states.append(
+                        self.forward_model(
+                            hidden_states[:, sample_idx : sample_idx + 1]
+                        )
+                    )
+            if self.batch_first:
+                output_hidden_states = torch.cat(output_hidden_states, dim=0)
+            else:
+                output_hidden_states = torch.cat(output_hidden_states, dim=1)
+        # drop the reference to the merged state dict once the forward pass is done
+        self._merged_state_dict = None
+        return output_hidden_states

fusion_bench/models/separate_io.py ADDED Viewed

@@ -0,0 +1,105 @@
+import os
+from copy import deepcopy
+import torch
+from safetensors import safe_open
+from safetensors.torch import save_file
+from torch import nn
+from fusion_bench.utils.dtype import parse_dtype
+__all__ = ["separate_save", "separate_load"]
+def dir_is_empty(path: str) -> bool:
+    """Return True when `path` does not exist or is a directory with no entries."""
+    if not os.path.exists(path):
+        return True
+    return not os.listdir(path)
+def separate_save(
+    model: nn.Module,
+    save_dir: str,
+    dtype=None,
+    in_place: bool = True,
+    model_file="functional.bin",
+    state_dict_file="state_dict.bin",
+    use_safe_tensors: bool = True,
+):
+    """
+    Save a model as two files: the module object moved to the "meta" device, and its tensors.
+    Args:
+        model (nn.Module): The PyTorch model to save.
+        save_dir (str): Target directory; created if missing. It must not already contain anything.
+        dtype: Passed to `Tensor.to(dtype=...)` for every tensor that is saved.
+        in_place (bool, optional): If True, `model` itself is moved to the "meta" device. If False, a deepcopy is used. Default is True.
+        model_file (str, optional): File name for the pickled module. Default is "functional.bin".
+        state_dict_file (str, optional): File name for the tensors. Default is "state_dict.bin".
+        use_safe_tensors (bool, optional): If True, tensors are written with safetensors' `save_file`; otherwise with `torch.save`. Default is True.
+    Raises:
+        FileExistsError: If `save_dir` exists and is not empty.
+    """
+    if not dir_is_empty(save_dir):
+        raise FileExistsError(f"Directory exists and is not empty. {save_dir}")
+    target = model if in_place else deepcopy(model)
+    # snapshot the tensors on CPU before the module is emptied
+    tensors = {
+        key: value.clone().detach().to(dtype=dtype).cpu()
+        for key, value in target.state_dict().items()
+    }
+    target = target.to_empty(device="meta")
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+    torch.save(target, os.path.join(save_dir, model_file))
+    weights_path = os.path.join(save_dir, state_dict_file)
+    if use_safe_tensors:
+        save_file(tensors, weights_path)
+    else:
+        torch.save(tensors, weights_path)
+def separate_load(
+    load_dir: str,
+    strict: bool = True,
+    dtype: torch.dtype = None,
+    device: torch.device = "cpu",
+    model_file="functional.bin",
+    state_dict_file="state_dict.bin",
+    use_safe_tensors: bool = True,
+):
+    """
+    Load a model from a directory holding a pickled module and a separate tensor file.
+    Args:
+        load_dir (str): The directory from which the model and state dictionary will be loaded.
+        strict (bool, optional): Forwarded to `load_state_dict`. Default is True.
+        dtype: Run through `parse_dtype`; the result is applied to the module and, when not None, to every loaded tensor.
+        device: Device the empty module is materialised on; a falsy value falls back to "cpu".
+        model_file (str, optional): File name of the pickled module. Default is "functional.bin".
+        state_dict_file (str, optional): File name of the tensors; if None, no tensors are loaded. Default is "state_dict.bin".
+        use_safe_tensors (bool, optional): If True, tensors are read with safetensors' `safe_open`; otherwise with `torch.load`. Default is True.
+    Returns:
+        nn.Module: The loaded PyTorch model with the state dictionary applied.
+    Raises:
+        FileNotFoundError: If `load_dir` does not exist.
+    """
+    if not os.path.exists(load_dir):
+        raise FileNotFoundError(f"Directory {load_dir} does not exist.")
+    dtype = parse_dtype(dtype)
+    # NOTE(review): this unpickles a whole module object — only use it on trusted files
+    skeleton = torch.load(os.path.join(load_dir, model_file))
+    model: nn.Module = skeleton.to(dtype=dtype).to_empty(device=device or "cpu")
+    if state_dict_file is None:
+        return model
+    weights_path = os.path.join(load_dir, state_dict_file)
+    if use_safe_tensors:
+        with safe_open(weights_path, framework="pt", device="cpu") as f:
+            state_dict = {k: f.get_tensor(k) for k in f.keys()}
+    else:
+        state_dict = torch.load(
+            weights_path,
+            map_location="cpu",
+        )
+    if dtype is not None:
+        # cast in place so the loaded mapping object itself is what reaches load_state_dict
+        for name, param in state_dict.items():
+            state_dict[name] = param.to(dtype=dtype, non_blocking=True)
+    model.load_state_dict(state_dict, strict=strict)
+    return model

fusion_bench/models/smile_moe/__init__.py ADDED Viewed

File without changes