PyPI - fusion-bench - Versions diffs - 0.2.9__py3-none-any.whl - Mend

fusion-bench 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (727) hide show

fusion_bench/models/surgery/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .surgerymodelwrapper import SurgeryModelWrapper

fusion_bench/models/surgery/surgerymodelwrapper.py ADDED Viewed

@@ -0,0 +1,158 @@
+import math
+from typing import TYPE_CHECKING, Callable, Generic, List, Union
+import torch
+from torch import nn
+from transformers.models.clip.modeling_clip import (
+    CLIPVisionModel,
+    CLIPVisionTransformer,
+)
+from fusion_bench.utils.type import TorchModelType
+def regularize_name(name: str):
+    """
+    Replace every ``-`` and ``.`` in a name with ``_``.
+    Args:
+        name (str): The raw name (used in this file for dataset names).
+    Returns:
+        str: The name with ``-`` and ``.`` replaced by ``_``.
+    """
+    for separator in ("-", "."):
+        name = name.replace(separator, "_")
+    return name
+class SurgeryModelWrapper(torch.nn.Module, Generic[TorchModelType]):
+    """
+    Wrap a frozen model and attach one pair of bias-free linear projections per dataset.
+    For every dataset a down-projection (``projection_dim -> hidden_dim``) and an
+    up-projection (``hidden_dim -> projection_dim``) are registered as submodules.
+    ``compute_surgery_features`` subtracts their output from the given features.
+    """
+    # A flag to indicate that this is a surgery model.
+    is_surgery_model = True
+    def __init__(
+        self,
+        model: TorchModelType,
+        test_datasets: List[str],
+        projection_dim: int = 512,
+        hidden_dim: int = 16,
+    ):
+        """
+        Args:
+            model (TorchModelType): The model to wrap; its parameters are frozen here.
+            test_datasets (List[str]): Dataset names; one surgery module pair is created per name.
+            projection_dim (int): Input size of the down-projection and output size of the up-projection.
+            hidden_dim (int): Size of the bottleneck between the two projections.
+        """
+        super().__init__()
+        model.requires_grad_(False)
+        self.model = model
+        self.non_linear_func = torch.nn.ReLU()
+        self.test_datasets = test_datasets
+        self.projection_dim = projection_dim
+        self.hidden_dim = hidden_dim
+        for name in test_datasets:
+            self.add_surgery_module(name)
+    @staticmethod
+    def _projection_names(dataset_name: str):
+        """Return the (down-projection, up-projection) submodule names for a dataset."""
+        key = regularize_name(dataset_name)
+        return (
+            "feature_mapping_to_head_down_proj_{}".format(key),
+            "feature_mapping_to_head_up_proj_{}".format(key),
+        )
+    def add_surgery_module(self, dataset_name: str):
+        """
+        Register the down/up projection pair for one dataset.
+        The down-projection is initialised with ``kaiming_uniform_(a=sqrt(5))`` and the
+        up-projection with zeros, so the pair initially outputs zeros.
+        Args:
+            dataset_name (str): The name of the dataset.
+        """
+        down_name, up_name = self._projection_names(dataset_name)
+        # Both layers are built before the re-initialisation so that the order in
+        # which random numbers are consumed stays the same.
+        down_proj = torch.nn.Linear(self.projection_dim, self.hidden_dim, bias=False)
+        up_proj = torch.nn.Linear(self.hidden_dim, self.projection_dim, bias=False)
+        torch.nn.init.kaiming_uniform_(down_proj.weight, a=math.sqrt(5))
+        torch.nn.init.zeros_(up_proj.weight)
+        self.add_module(down_name, down_proj)
+        self.add_module(up_name, up_proj)
+    def collect_trainable_params(self):
+        """
+        Gather the projection weights of all datasets in ``self.test_datasets``.
+        Returns:
+            list: For each dataset, the down-projection weight followed by the up-projection weight.
+        """
+        weights = []
+        for dataset_name in self.test_datasets:
+            for module_name in self._projection_names(dataset_name):
+                weights.append(getattr(self, module_name).weight)
+        return weights
+    def collect_surgery_module(self):
+        """
+        Gather all surgery submodules into a dictionary.
+        Returns:
+            dict: Maps each projection's submodule name to the module, plus the
+                activation under the key ``"non_linear_func"``.
+        """
+        modules = {}
+        for dataset_name in self.test_datasets:
+            for module_name in self._projection_names(dataset_name):
+                modules[module_name] = getattr(self, module_name)
+        modules["non_linear_func"] = self.non_linear_func
+        return modules
+    def compute_surgery_features(
+        self,
+        compute_features_fn: Union[
+            torch.Tensor, Callable[[TorchModelType], torch.Tensor]
+        ],
+        dataset_name: str,
+    ):
+        """
+        Compute the surgery features for one dataset.
+        Args:
+            compute_features_fn (Union[torch.Tensor, Callable[[nn.Module], torch.Tensor]]): Either the
+                features themselves, or a callable that receives the wrapped model and returns them.
+            dataset_name (str): The name of the dataset whose projections are used.
+        Returns:
+            feature (torch.Tensor): The surgery features, ``feature0 - feature_sub``.
+            feature0 (torch.Tensor): The original features.
+            feature_sub (torch.Tensor): Output of up-projection(activation(down-projection(feature0))).
+        Raises:
+            ValueError: If ``compute_features_fn`` is neither a tensor nor a callable.
+        """
+        down_name, up_name = self._projection_names(dataset_name)
+        if isinstance(compute_features_fn, torch.Tensor):
+            feature0 = compute_features_fn
+        elif callable(compute_features_fn):
+            feature0 = compute_features_fn(self.model)
+        else:
+            raise ValueError(
+                "compute_features_fn must be a tensor or a callable, but got {}".format(
+                    type(compute_features_fn)
+                )
+            )
+        # feature bias
+        down_proj = getattr(self, down_name)
+        up_proj = getattr(self, up_name)
+        feature_sub = up_proj(self.non_linear_func(down_proj(feature0)))
+        # surgery feature
+        return feature0 - feature_sub, feature0, feature_sub
+    def forward(self, *args, **kwargs):
+        """Forward all arguments to the wrapped model unchanged."""
+        return self.model(*args, **kwargs)

fusion_bench/models/utils.py ADDED Viewed

@@ -0,0 +1,80 @@
+from typing import List
+import torch
+from torch import nn
+def del_attr(obj, names: List[str]):
+    """
+    Delete a nested attribute addressed by a path of attribute names.
+    Args:
+        obj (object): Root object the path starts from.
+        names (list): Attribute names. All but the last are followed with
+            ``getattr``; the last one is deleted from the object reached.
+    """
+    owner = obj
+    for attr_name in names[:-1]:
+        owner = getattr(owner, attr_name)
+    delattr(owner, names[-1])
+def set_attr(obj, names: List[str], val):
+    """
+    Set a nested attribute addressed by a path of attribute names.
+    Args:
+        obj (object): Root object the path starts from.
+        names (list): Attribute names. All but the last are followed with
+            ``getattr``; the last one is assigned on the object reached.
+        val (object): Value to assign.
+    """
+    owner = obj
+    for attr_name in names[:-1]:
+        owner = getattr(owner, attr_name)
+    setattr(owner, names[-1], val)
+def get_attr(obj, names: List[str]):
+    """
+    Read a nested attribute addressed by a path of attribute names.
+    Args:
+        obj (object): Root object the path starts from.
+        names (list): Attribute names, followed one after another with ``getattr``.
+    Returns:
+        object: The attribute found at the end of the path.
+    """
+    owner = obj
+    for attr_name in names[:-1]:
+        owner = getattr(owner, attr_name)
+    return getattr(owner, names[-1])
+def find_layers_with_type(
+    module: nn.Module,
+    layer_types=(nn.Linear,),
+    prefix="",
+):
+    """
+    Find all submodules of the given type(s) in a module.
+    Args:
+        module (nn.Module): PyTorch module to search (the module itself is included).
+        layer_types (type or iterable of types): Layer class, or classes, to look for.
+            Defaults to ``(nn.Linear,)``.
+        prefix (str): A prefix to add to the layer names.
+    Returns:
+        dict: Maps each matching layer's qualified name to the layer.
+    """
+    # Accept a bare class as well as a list/tuple of classes.
+    if isinstance(layer_types, type):
+        layer_types = (layer_types,)
+    # Build the tuple once instead of once per visited submodule.
+    wanted = tuple(layer_types)
+    return {
+        name: submodule
+        for name, submodule in module.named_modules(prefix=prefix)
+        if isinstance(submodule, wanted)
+    }
+def disable_dropout(model: torch.nn.Module):
+    """Set ``p`` to 0 on every ``torch.nn.Dropout`` instance found in ``model.modules()``."""
+    dropout_layers = [
+        layer for layer in model.modules() if isinstance(layer, torch.nn.Dropout)
+    ]
+    for layer in dropout_layers:
+        layer.p = 0

fusion_bench/models/we_moe.py ADDED Viewed

@@ -0,0 +1,247 @@
+import functools
+import logging
+from typing import List
+import torch
+import torch.func
+from torch import Tensor, nn
+from torch.func import functional_call
+from torch.nn import functional as F
+from fusion_bench.utils.type import StateDictType
+log = logging.getLogger(__name__)
+def join_list(list_of_list: List[List]):
+    """Concatenate the inner lists into one new flat list, preserving order."""
+    return [item for inner in list_of_list for item in inner]
+def del_attr(obj, names: List[str]):
+    """
+    Delete the attribute reached by following a path of attribute names.
+    Args:
+        obj (object): Object the path starts from.
+        names (list): Path of attribute names; the final name is the one deleted.
+    """
+    parent = functools.reduce(getattr, names[:-1], obj)
+    delattr(parent, names[-1])
+def set_attr(obj, names: List[str], val):
+    """
+    Assign the attribute reached by following a path of attribute names.
+    Args:
+        obj (object): Object the path starts from.
+        names (list): Path of attribute names; the final name is the one assigned.
+        val (object): Value to store.
+    """
+    parent = functools.reduce(getattr, names[:-1], obj)
+    setattr(parent, names[-1], val)
+def get_attr(obj, names: List[str]):
+    """
+    Return the attribute reached by following a path of attribute names.
+    Args:
+        obj (object): Object the path starts from.
+        names (list): Path of attribute names.
+    Returns:
+        object: The attribute at the end of the path.
+    """
+    parent = functools.reduce(getattr, names[:-1], obj)
+    return getattr(parent, names[-1])
+class Depth_0_Gate(nn.Module):
+    """
+    Input-independent gate: one learnable scalar per expert.
+    The weight vector is allocated uninitialised; ``init_weight`` fills it.
+    """
+    def __init__(self, num_experts: int):
+        super().__init__()
+        uninitialised = torch.empty(num_experts)
+        self.weight = nn.Parameter(uninitialised, requires_grad=True)
+    def init_weight(self, init_lambda: float):
+        """Fill every entry of the weight vector with ``init_lambda``."""
+        with torch.no_grad():
+            self.weight.fill_(init_lambda)
+    def forward(self, *args, **kwargs) -> Tensor:
+        """Return the weight vector; all arguments are ignored."""
+        return self.weight
+class Depth_1_Gate(nn.Module):
+    """Gate made of a single linear layer mapping ``hidden_size`` to ``num_experts``."""
+    def __init__(self, hidden_size: int, num_experts: int):
+        super().__init__()
+        self.fc = nn.Linear(
+            in_features=hidden_size, out_features=num_experts, bias=True
+        )
+    def init_weight(self, init_lambda: float):
+        """Draw the weight from N(0, 0.01^2) and set every bias entry to ``init_lambda``."""
+        layer = self.fc
+        nn.init.normal_(layer.weight, std=0.01)
+        nn.init.constant_(layer.bias, init_lambda)
+    def forward(self, hidden_states: Tensor) -> Tensor:
+        """Apply the linear layer to ``hidden_states``."""
+        logits = self.fc(hidden_states)
+        return logits
+class Depth_2_Gate(nn.Module):
+    """
+    Two-layer gate: ``hidden_size -> hidden_size`` linear layer, ReLU, then a
+    ``hidden_size -> num_experts`` linear layer.
+    """
+    def __init__(self, hidden_size: int, num_experts: int):
+        super().__init__()
+        self.fc1 = nn.Linear(hidden_size, hidden_size, bias=True)
+        self.fc2 = nn.Linear(hidden_size, num_experts, bias=True)
+    def init_weight(self, init_lambda: float):
+        """
+        Draw both weights from N(0, 0.01^2), zero the first bias and set every
+        entry of the second bias to ``init_lambda``.
+        """
+        # The two normal_ calls stay in their original order so the same random
+        # numbers land in the same layer.
+        nn.init.normal_(self.fc1.weight, std=0.01)
+        nn.init.normal_(self.fc2.weight, std=0.01)
+        nn.init.zeros_(self.fc1.bias)
+        nn.init.constant_(self.fc2.bias, init_lambda)
+    def forward(self, hidden_states: Tensor) -> Tensor:
+        """Return ``fc2(relu(fc1(hidden_states)))``."""
+        return self.fc2(F.relu(self.fc1(hidden_states)))
+def construct_weight_ensembling_gate(
+    hidden_size: int,
+    num_experts: int,
+    init_lambda: float,
+    num_hidden_layers: int = 2,
+):
+    """
+    Build and initialise a routing gate of the requested depth.
+    Args:
+        hidden_size (int): Input size of the gate (unused when ``num_hidden_layers`` is 0).
+        num_experts (int): Number of outputs of the gate.
+        init_lambda (float): Value passed to the gate's ``init_weight``.
+        num_hidden_layers (int): 0, 1 or 2; selects ``Depth_0_Gate``, ``Depth_1_Gate``
+            or ``Depth_2_Gate``. Defaults to 2.
+    Returns:
+        nn.Module: The initialised gate, with ``num_hidden_layers`` stored on it as an attribute.
+    Raises:
+        ValueError: If ``num_hidden_layers`` is not 0, 1 or 2.
+    """
+    if num_hidden_layers not in (0, 1, 2):
+        raise ValueError(f"Unsupported number of hidden layers: {num_hidden_layers}")
+    if num_hidden_layers == 0:
+        gate = Depth_0_Gate(num_experts)
+    else:
+        gate_cls = Depth_1_Gate if num_hidden_layers == 1 else Depth_2_Gate
+        gate = gate_cls(hidden_size, num_experts)
+    gate.num_hidden_layers = num_hidden_layers
+    gate.init_weight(init_lambda)
+    return gate
+class WeightEnsemblingMoE(nn.Module):
+    """
+    Module that merges a base model with weighted task vectors on every forward pass.
+    The expert models are converted in place into task vectors (expert - base) at
+    construction. ``forward`` obtains routing weights from the gate, builds a merged
+    state dict from them and runs the base model functionally with that state dict.
+    """
+    # variable to store the merged state dict temporarily
+    # (set by ``merge_weights`` and reset to None at the end of ``forward``)
+    _merged_state_dict: StateDictType = None
+    def __init__(
+        self,
+        hidden_size: int,
+        base_model: nn.Module,
+        expert_models: List[nn.Module],
+        init_lambda: float = 0.2,
+        batch_first: bool = False,
+        router_hidden_layers: int = 2,
+        batch_reduce: bool = False,
+    ):
+        """
+        Initializes the WeightEnsemblingMoE class.
+        References:
+            (ICML 2024) Merging Multi-Task Models via Weight-Ensembling Mixture of Experts
+            http://arxiv.org/abs/2402.00433
+        Args:
+            hidden_size (int): The size of the hidden layer in the models.
+            base_model (nn.Module): The base model that will be used as a reference for the expert models.
+            expert_models (List[nn.Module]): A list of expert models that will be combined.
+            init_lambda (float, optional): The initial lambda value for the weight ensembling gate. Defaults to 0.2.
+            batch_first (bool, optional): If True, the input tensors are expected to have the batch size as the first dimension. Defaults to False.
+            router_hidden_layers (int, optional): The number of hidden layers in the router. Defaults to 2.
+            batch_reduce (bool): If True, the batch dimension of routing weights is reduced. Defaults to False.
+        """
+        super().__init__()
+        self.num_experts = len(expert_models)
+        self.hidden_size = hidden_size
+        self.batch_first = batch_first
+        self.batch_reduce = batch_reduce
+        self.gate = construct_weight_ensembling_gate(
+            hidden_size,
+            self.num_experts,
+            init_lambda=init_lambda,
+            num_hidden_layers=router_hidden_layers,
+        )
+        # compute the task vectors
+        # NOTE: this mutates the passed-in expert models in place.
+        for name, param in base_model.named_parameters():
+            if not param.requires_grad:
+                # parameters frozen in the base model are removed from every expert,
+                # so they do not appear in the experts' named_parameters()
+                for m in expert_models:
+                    del_attr(m, name.split("."))
+            else:
+                # overwrite the expert parameter's data with (expert - base)
+                for m in expert_models:
+                    get_attr(m, name.split(".")).data = (
+                        get_attr(m, name.split(".")) - param
+                    )
+        # fix base model and expert models
+        self.base_model = base_model.requires_grad_(False)
+        for m in expert_models:
+            m.requires_grad_(False)
+        self.task_vectors = nn.ModuleList(expert_models)
+    @property
+    def forward_model(self):
+        """
+        Return a callable that runs ``base_model`` via ``functional_call`` with the
+        state dict currently stored in ``_merged_state_dict``.
+        The state dict is bound when the property is read, so ``merge_weights``
+        has to be called before accessing it.
+        """
+        return functools.partial(
+            functional_call,
+            self.base_model,
+            self._merged_state_dict,
+        )
+    def merge_weights(self, expert_weights):
+        """
+        Build the merged state dict: base state dict plus ``weight * task_vector``
+        for each (weight, task vector) pair.
+        Args:
+            expert_weights: Iterable of per-expert weights, zipped with ``self.task_vectors``.
+        Returns:
+            The merged state dict, which is also stored in ``self._merged_state_dict``.
+        """
+        # keep_vars=True returns the stored tensors themselves rather than detached ones
+        state_dict = self.base_model.state_dict(keep_vars=True)
+        for weight, task_vector in zip(expert_weights, self.task_vectors):
+            for name, param in task_vector.named_parameters():
+                # out-of-place add: the dict entry is rebound to a new tensor
+                state_dict[name] = state_dict[name] + weight * param
+        self._merged_state_dict = state_dict
+        return state_dict
+    def forward(self, hidden_states: Tensor):
+        """
+        Compute routing weights, merge the weights accordingly and run the base model.
+        Args:
+            hidden_states (Tensor): Input passed to the gate (unless its depth is 0)
+                and to the merged model.
+        Returns:
+            The output of the merged model; when weights are merged per sample, the
+            per-sample outputs concatenated along the batch dimension.
+        """
+        if self.gate.num_hidden_layers == 0:
+            # depth-0 gate does not depend on the input
+            gate_weights = self.gate()
+        else:
+            gate_weights = self.gate(hidden_states)
+            # average the routing weights over the sequence dimension
+            if self.batch_first:
+                # the input is in the shape of (batch_size, seq_len, hidden_size)
+                gate_weights = gate_weights.mean(dim=1)
+            else:
+                # the input is in the shape of (seq_len, batch_size, hidden_size)
+                gate_weights = gate_weights.mean(dim=0)
+        if self.gate.num_hidden_layers == 0:
+            # one merge for the whole batch
+            self.merge_weights(gate_weights)
+            output_hidden_states = self.forward_model(hidden_states)
+        elif self.batch_reduce:
+            # average routing weights over the batch, then one merge for the whole batch
+            gate_weights = gate_weights.mean(dim=0)
+            self.merge_weights(gate_weights)
+            output_hidden_states = self.forward_model(hidden_states)
+        else:
+            # one merge and one forward call per sample, each on a slice of size 1
+            output_hidden_states = []
+            for sample_idx, weights in enumerate(gate_weights):
+                self.merge_weights(weights)
+                if self.batch_first:
+                    output_hidden_states.append(
+                        self.forward_model(hidden_states[sample_idx : sample_idx + 1])
+                    )
+                else:
+                    output_hidden_states.append(
+                        self.forward_model(
+                            hidden_states[:, sample_idx : sample_idx + 1]
+                        )
+                    )
+            # stitch the per-sample outputs back together along the batch dimension
+            if self.batch_first:
+                output_hidden_states = torch.cat(output_hidden_states, dim=0)
+            else:
+                output_hidden_states = torch.cat(output_hidden_states, dim=1)
+        # drop the reference to the merged state dict
+        self._merged_state_dict = None
+        return output_hidden_states

fusion_bench/models/wrappers/__init__.py ADDED Viewed

File without changes

fusion_bench/models/wrappers/ensemble.py ADDED Viewed

@@ -0,0 +1,183 @@
+from typing import Any, Callable, Dict, List, cast
+import numpy as np
+import torch
+from omegaconf import ListConfig
+from torch import Tensor, nn
+def aggregate_tensors(outputs: List[Any], aggregate_fn: Callable) -> Any:
+    """
+    Aggregates a list of outputs using the provided aggregation function.
+    The structure is decided by the first output:
+    - If it is a Tensor, the aggregation function is applied to the whole list.
+    - If it is a dictionary, each value is aggregated recursively and the result
+      is a new instance of the same dictionary type.
+    - If it is a tuple or list, each element is aggregated recursively and the
+      result is always returned as a tuple.
+    - If all outputs are None, None is returned.
+    - Anything else raises a ValueError.
+    Args:
+        outputs (list): A non-empty list of outputs to be aggregated. The outputs can be Tensors, dictionaries, tuples, lists, or None.
+        aggregate_fn (callable): A function that receives a list of Tensors and returns the aggregated Tensor.
+    Returns:
+        Tensor or dict or tuple or None: The aggregated output.
+    Raises:
+        ValueError: If ``outputs`` is empty or the outputs are of an unsupported type.
+    """
+    if len(outputs) == 0:
+        # Fail with a clear message instead of an IndexError on outputs[0].
+        raise ValueError("Cannot aggregate an empty list of outputs")
+    first = outputs[0]
+    # Tensors are handed to the aggregation function as a list.
+    if isinstance(first, torch.Tensor):
+        return aggregate_fn(outputs)
+    # Dictionaries are aggregated key by key, following the keys of the first output.
+    if isinstance(first, dict):
+        result = type(first)()
+        for key in first:
+            result[key] = aggregate_tensors(
+                [output[key] for output in outputs], aggregate_fn
+            )
+        return result
+    # Tuples and lists are aggregated position by position.
+    if isinstance(first, (tuple, list)):
+        return tuple(
+            aggregate_tensors([output[i] for output in outputs], aggregate_fn)
+            for i in range(len(first))
+        )
+    # If every output is None, return None.
+    if all(output is None for output in outputs):
+        return None
+    # Anything else is not supported.
+    raise ValueError("Unsupported type for outputs")
+class EnsembleModule(nn.Module):
+    """
+    Ensemble module that averages the outputs of multiple models.
+    """
+    def __init__(self, models: List[nn.Module]):
+        """
+        Store the member models in an ``nn.ModuleList``.
+        Args:
+            models (List[nn.Module]): List of models to ensemble.
+        """
+        super().__init__()
+        # TODO: distribute models to devices
+        self.model_list = nn.ModuleList(models)
+    def _aggregate_tensors(self, outputs: List[Tensor]) -> Tensor:
+        """
+        Stack the tensors along a new leading dimension and average over it.
+        Args:
+            outputs (List[Tensor]): List of tensors to aggregate.
+        Returns:
+            Tensor: The element-wise mean of the tensors.
+        """
+        stacked = torch.stack(outputs)
+        return stacked.mean(dim=0)
+    def forward(self, *args, **kwargs):
+        """
+        Call every member model with the same arguments and average the results.
+        Args:
+            *args: Positional arguments forwarded to each model.
+            **kwargs: Keyword arguments forwarded to each model.
+        Returns:
+            Aggregated output from the ensemble of models.
+        """
+        per_model_outputs = []
+        for member in self.model_list:
+            per_model_outputs.append(member(*args, **kwargs))
+        return aggregate_tensors(per_model_outputs, self._aggregate_tensors)
+class WeightedEnsembleModule(nn.Module):
+    """
+    Ensemble module that computes a weighted average of the outputs from multiple models.
+    """
+    def __init__(
+        self,
+        models: List[nn.Module],
+        weights: List[float] | Tensor | np.ndarray,
+        normalize: bool = True,
+    ):
+        """
+        Initializes the WeightedEnsembleModule with models and their corresponding weights.
+        Args:
+            models (List[nn.Module]): List of models to ensemble.
+            weights (List[float] | Tensor | np.ndarray): Weights for each model. Lists, tuples and
+                ``ListConfig`` objects are converted with ``torch.tensor``.
+            normalize (bool, optional): If True, the weights are divided by their sum. Defaults to True.
+        Raises:
+            ValueError: If ``weights`` has an unsupported type, is not one-dimensional,
+                or does not have one entry per model.
+        """
+        super().__init__()
+        self.model_list = nn.ModuleList(models)
+        if isinstance(weights, (list, tuple, ListConfig)):
+            weights = torch.tensor(weights)
+        elif isinstance(weights, np.ndarray):
+            weights = torch.from_numpy(weights)
+        elif not isinstance(weights, Tensor):
+            raise ValueError(f"Unsupported type for weights: {type(weights)=}")
+        # Validate with an exception rather than ``assert`` so the check survives
+        # ``python -O``. The dimension is tested first because ``len()`` of a
+        # 0-d tensor raises a TypeError.
+        if weights.dim() != 1 or len(models) != len(weights):
+            raise ValueError(
+                "weights must be a 1D tensor of the same length as models. "
+                f"But got {len(models)=}, {weights.dim()=}, {tuple(weights.shape)=}"
+            )
+        if normalize:
+            weights = weights / weights.sum()
+        # A buffer (not a parameter): saved with the module's state, not trained.
+        self.register_buffer("weights", weights)
+    def _aggregate_tensors(self, outputs: List[Tensor]) -> Tensor:
+        """
+        Aggregates a list of tensors using the stored weights.
+        Args:
+            outputs (List[Tensor]): List of tensors to aggregate, one per model.
+        Returns:
+            Tensor: The weighted sum of the tensors.
+        """
+        # Reshape to (num_models, 1, ..., 1) so the weights broadcast over each output.
+        weights = cast(Tensor, self.weights).view(-1, *([1] * outputs[0].dim()))
+        return (torch.stack(outputs) * weights).sum(dim=0)
+    def forward(self, *args, **kwargs):
+        """
+        Performs a forward pass by computing the weighted average of the models' outputs.
+        Args:
+            *args: Positional arguments forwarded to each model.
+            **kwargs: Keyword arguments forwarded to each model.
+        Returns:
+            Weighted aggregated output from the ensemble of models.
+        """
+        outputs = [model(*args, **kwargs) for model in self.model_list]
+        return aggregate_tensors(outputs, self._aggregate_tensors)
+class MaxModelPredictor(EnsembleModule):
+    """
+    Ensemble module that selects the maximum output among multiple models.
+    """
+    def _aggregate_tensors(self, outputs: List[Tensor]) -> Tensor:
+        """
+        Stack the tensors along a new leading dimension and take the maximum over it.
+        Args:
+            outputs (List[Tensor]): List of tensors to aggregate.
+        Returns:
+            Tensor: The element-wise maximum of the tensors.
+        """
+        stacked = torch.stack(outputs)
+        max_values, _ = torch.max(stacked, dim=0)
+        return max_values