fusion_bench-0.2.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/__init__.py +20 -0
- fusion_bench/__main__.py +4 -0
- fusion_bench/compat/__init__.py +0 -0
- fusion_bench/compat/method/__init__.py +109 -0
- fusion_bench/compat/method/base_algorithm.py +58 -0
- fusion_bench/compat/modelpool/AutoModelForSeq2SeqLM.py +34 -0
- fusion_bench/compat/modelpool/__init__.py +116 -0
- fusion_bench/compat/modelpool/base_pool.py +328 -0
- fusion_bench/compat/modelpool/huggingface_clip_vision.py +178 -0
- fusion_bench/compat/taskpool/__init__.py +95 -0
- fusion_bench/compat/taskpool/base_pool.py +111 -0
- fusion_bench/compat/taskpool/clip_image_classification.py +210 -0
- fusion_bench/compat/taskpool/flan_t5_glue_text_generation.py +175 -0
- fusion_bench/constants/__init__.py +2 -0
- fusion_bench/constants/paths.py +18 -0
- fusion_bench/dataset/__init__.py +29 -0
- fusion_bench/dataset/arc_agi/__init__.py +6 -0
- fusion_bench/dataset/arc_agi/arc.py +308 -0
- fusion_bench/dataset/arc_agi/arc_agi.py +365 -0
- fusion_bench/dataset/arc_agi/augmenters.py +1036 -0
- fusion_bench/dataset/arc_agi/messagers.py +1355 -0
- fusion_bench/dataset/arc_agi/np_cache.py +168 -0
- fusion_bench/dataset/arc_agi/preprocess.py +298 -0
- fusion_bench/dataset/arc_agi/representers.py +1019 -0
- fusion_bench/dataset/clip_dataset.py +71 -0
- fusion_bench/dataset/fer2013.py +12 -0
- fusion_bench/dataset/gpt2_glue.py +300 -0
- fusion_bench/dataset/gsm8k.py +60 -0
- fusion_bench/dataset/image_dataset.py +55 -0
- fusion_bench/dataset/imdb.py +11 -0
- fusion_bench/dataset/llama/__init__.py +1 -0
- fusion_bench/dataset/llama/alpaca.py +232 -0
- fusion_bench/dataset/llama/collate.py +120 -0
- fusion_bench/dataset/llama/metamathqa.py +50 -0
- fusion_bench/dataset/llama/openai.py +160 -0
- fusion_bench/dataset/llama/preference_700k.py +70 -0
- fusion_bench/dataset/llama/sharegpt.py +141 -0
- fusion_bench/dataset/llama/squad.py +125 -0
- fusion_bench/dataset/llama/stanford_shp.py +90 -0
- fusion_bench/dataset/llama/ultrachat.py +58 -0
- fusion_bench/dataset/llama/utils/__init__.py +0 -0
- fusion_bench/dataset/llama/wikitext.py +89 -0
- fusion_bench/dataset/nyuv2.py +119 -0
- fusion_bench/method/__init__.py +177 -0
- fusion_bench/method/ada_svd/__init__.py +2 -0
- fusion_bench/method/ada_svd/clip_vision.py +319 -0
- fusion_bench/method/adamerging/__init__.py +6 -0
- fusion_bench/method/adamerging/clip_layer_wise_adamerging.py +46 -0
- fusion_bench/method/adamerging/clip_task_wise_adamerging.py +187 -0
- fusion_bench/method/adamerging/entropy_loss.py +25 -0
- fusion_bench/method/adamerging/flan_t5_layer_wise_adamerging.py +332 -0
- fusion_bench/method/adamerging/gpt2_layer_wise_adamerging.py +351 -0
- fusion_bench/method/adamerging/layer_wise_adamerging.py +252 -0
- fusion_bench/method/adamerging/llama_adamerging.py +335 -0
- fusion_bench/method/adamerging/min_norm_solvers.py +227 -0
- fusion_bench/method/adamerging/task_wise_adamerging.py +174 -0
- fusion_bench/method/adamerging/utils.py +15 -0
- fusion_bench/method/analysis/__init__.py +2 -0
- fusion_bench/method/analysis/task_vector_cos_similarity.py +172 -0
- fusion_bench/method/analysis/task_vector_violin_plot.py +205 -0
- fusion_bench/method/base_algorithm.py +44 -0
- fusion_bench/method/classification/__init__.py +3 -0
- fusion_bench/method/classification/clip_finetune.py +444 -0
- fusion_bench/method/classification/continual_clip_finetune.py +297 -0
- fusion_bench/method/concrete_subspace/__init__.py +6 -0
- fusion_bench/method/concrete_subspace/clip_concrete_adamerging.py +595 -0
- fusion_bench/method/concrete_subspace/clip_concrete_task_arithmetic.py +263 -0
- fusion_bench/method/dare/__init__.py +4 -0
- fusion_bench/method/dare/simple_average.py +31 -0
- fusion_bench/method/dare/task_arithmetic.py +82 -0
- fusion_bench/method/dare/ties_merging.py +100 -0
- fusion_bench/method/dare/utils.py +87 -0
- fusion_bench/method/dawe/__init__.py +2 -0
- fusion_bench/method/dawe/dawe_for_clip.py +274 -0
- fusion_bench/method/dawe/warppers/__init__.py +13 -0
- fusion_bench/method/dawe/warppers/dawe_model.py +256 -0
- fusion_bench/method/depth_upscaling/__init__.py +3 -0
- fusion_bench/method/depth_upscaling/depth_upscaling.py +89 -0
- fusion_bench/method/depth_upscaling/depth_upscaling_for_llama.py +57 -0
- fusion_bench/method/dummy.py +35 -0
- fusion_bench/method/ensemble.py +98 -0
- fusion_bench/method/fisher_merging/__init__.py +4 -0
- fusion_bench/method/fisher_merging/clip_fisher_merging.py +191 -0
- fusion_bench/method/fisher_merging/fisher_merging.py +484 -0
- fusion_bench/method/fisher_merging/gpt2_fisher_merging.py +193 -0
- fusion_bench/method/linear/__init__.py +6 -0
- fusion_bench/method/linear/expo.py +118 -0
- fusion_bench/method/linear/linear_interpolation.py +60 -0
- fusion_bench/method/linear/llama_expo.py +229 -0
- fusion_bench/method/linear/simple_average_for_llama.py +54 -0
- fusion_bench/method/linear/task_arithmetic_for_llama.py +57 -0
- fusion_bench/method/lm_finetune/__init__.py +3 -0
- fusion_bench/method/lm_finetune/bradley_terry_rm.py +432 -0
- fusion_bench/method/lm_finetune/causal_lm_pretrain.py +7 -0
- fusion_bench/method/lm_finetune/fullfinetune_sft.py +375 -0
- fusion_bench/method/lm_finetune/peftfinetune_sft.py +370 -0
- fusion_bench/method/mixture_of_experts/__init__.py +7 -0
- fusion_bench/method/mixture_of_experts/mixtral_merging.py +112 -0
- fusion_bench/method/mixture_of_experts/mixtral_upcycling.py +329 -0
- fusion_bench/method/model_recombination.py +121 -0
- fusion_bench/method/opcm/__init__.py +4 -0
- fusion_bench/method/opcm/opcm.py +277 -0
- fusion_bench/method/opcm/task_arithmetic.py +115 -0
- fusion_bench/method/opcm/ties_merging.py +156 -0
- fusion_bench/method/opcm/utils.py +73 -0
- fusion_bench/method/opcm/weight_average.py +120 -0
- fusion_bench/method/pruning/__init__.py +5 -0
- fusion_bench/method/pruning/llama_magnitude_prune.py +202 -0
- fusion_bench/method/pruning/llama_random_prune.py +143 -0
- fusion_bench/method/pruning/llama_wanda_prune.py +359 -0
- fusion_bench/method/pruning/magnitude_diff_pruning.py +180 -0
- fusion_bench/method/pruning/prune_utils.py +165 -0
- fusion_bench/method/pruning/wanda_utils/__init__.py +7 -0
- fusion_bench/method/pruning/wanda_utils/ablate.py +188 -0
- fusion_bench/method/pruning/wanda_utils/data.py +135 -0
- fusion_bench/method/pruning/wanda_utils/eval.py +245 -0
- fusion_bench/method/pruning/wanda_utils/layerwrapper.py +61 -0
- fusion_bench/method/pruning/wanda_utils/prune.py +581 -0
- fusion_bench/method/pruning/wanda_utils/prune_opt.py +539 -0
- fusion_bench/method/pruning/wanda_utils/sparsegpt.py +165 -0
- fusion_bench/method/pwe_moe/__init__.py +5 -0
- fusion_bench/method/pwe_moe/clip_pwe_moe.py +315 -0
- fusion_bench/method/pwe_moe/module.py +316 -0
- fusion_bench/method/pwe_moe/phn/__init__.py +2 -0
- fusion_bench/method/pwe_moe/phn/solvers.py +195 -0
- fusion_bench/method/pwe_moe/utils.py +43 -0
- fusion_bench/method/rankone_moe/__init__.py +3 -0
- fusion_bench/method/rankone_moe/clip_rankone_moe.py +160 -0
- fusion_bench/method/rankone_moe/rankone_moe.py +249 -0
- fusion_bench/method/regmean/__init__.py +4 -0
- fusion_bench/method/regmean/clip_regmean.py +131 -0
- fusion_bench/method/regmean/gpt2_regmean.py +147 -0
- fusion_bench/method/regmean/regmean.py +375 -0
- fusion_bench/method/simple_average.py +112 -0
- fusion_bench/method/slerp/__init__.py +2 -0
- fusion_bench/method/slerp/slerp.py +101 -0
- fusion_bench/method/slerp/slerp_utils.py +107 -0
- fusion_bench/method/smile_upscaling/__init__.py +3 -0
- fusion_bench/method/smile_upscaling/singular_projection_merging.py +198 -0
- fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py +331 -0
- fusion_bench/method/smile_upscaling/smile_upscaling.py +573 -0
- fusion_bench/method/sparse_we_moe/__init__.py +2 -0
- fusion_bench/method/sparse_we_moe/sparse_clip_we_moe.py +248 -0
- fusion_bench/method/sparse_we_moe/sparse_we_moe.py +301 -0
- fusion_bench/method/sparselo/__init__.py +2 -0
- fusion_bench/method/sparselo/sparselo.py +955 -0
- fusion_bench/method/surgery/__init__.py +1 -0
- fusion_bench/method/surgery/clip_layer_wise_adamerging_surgery.py +157 -0
- fusion_bench/method/tall_mask/__init__.py +0 -0
- fusion_bench/method/tall_mask/utils.py +234 -0
- fusion_bench/method/task_arithmetic/__init__.py +2 -0
- fusion_bench/method/task_arithmetic/task_arithmetic.py +151 -0
- fusion_bench/method/task_singular_vector/TSVC.py +16 -0
- fusion_bench/method/task_singular_vector/TSVM.py +63 -0
- fusion_bench/method/task_singular_vector/__init__.py +9 -0
- fusion_bench/method/task_singular_vector/utils/TSVC_utils.py +50 -0
- fusion_bench/method/task_singular_vector/utils/TSVM_utils.py +640 -0
- fusion_bench/method/task_singular_vector/utils/__init__.py +7 -0
- fusion_bench/method/ties_merging/__init__.py +2 -0
- fusion_bench/method/ties_merging/ties_merging.py +117 -0
- fusion_bench/method/ties_merging/ties_merging_utils.py +331 -0
- fusion_bench/method/trust_region/__init__.py +2 -0
- fusion_bench/method/trust_region/clip_task_arithmetic.py +205 -0
- fusion_bench/method/trust_region/utils.py +58 -0
- fusion_bench/method/we_moe/__init__.py +2 -0
- fusion_bench/method/we_moe/clip_we_moe.py +161 -0
- fusion_bench/method/we_moe/we_moe.py +247 -0
- fusion_bench/method/weighted_average/__init__.py +3 -0
- fusion_bench/method/weighted_average/llama.py +113 -0
- fusion_bench/method/weighted_average/weighted_average.py +102 -0
- fusion_bench/metrics/__init__.py +0 -0
- fusion_bench/metrics/continual_learning/backward_transfer.py +22 -0
- fusion_bench/metrics/nyuv2/__init__.py +11 -0
- fusion_bench/metrics/nyuv2/depth.py +45 -0
- fusion_bench/metrics/nyuv2/loss.py +31 -0
- fusion_bench/metrics/nyuv2/noise.py +16 -0
- fusion_bench/metrics/nyuv2/normal.py +48 -0
- fusion_bench/metrics/nyuv2/segmentation.py +43 -0
- fusion_bench/metrics/text_to_image_generation/__init__.py +9 -0
- fusion_bench/metrics/text_to_image_generation/aesthetic_scorer.py +123 -0
- fusion_bench/metrics/text_to_image_generation/compressibility.py +49 -0
- fusion_bench/metrics/text_to_image_generation/pickscore_scorer.py +95 -0
- fusion_bench/mixins/__init__.py +28 -0
- fusion_bench/mixins/clip_classification.py +252 -0
- fusion_bench/mixins/fabric_training.py +320 -0
- fusion_bench/mixins/lightning_fabric.py +174 -0
- fusion_bench/mixins/optim/__init__.py +0 -0
- fusion_bench/mixins/optim/adamw_with_warmup.py +42 -0
- fusion_bench/mixins/rich_live.py +21 -0
- fusion_bench/mixins/serialization.py +132 -0
- fusion_bench/mixins/simple_profiler.py +79 -0
- fusion_bench/modelpool/PeftModelForSeq2SeqLM.py +49 -0
- fusion_bench/modelpool/__init__.py +42 -0
- fusion_bench/modelpool/base_pool.py +268 -0
- fusion_bench/modelpool/causal_lm/__init__.py +2 -0
- fusion_bench/modelpool/causal_lm/causal_lm.py +139 -0
- fusion_bench/modelpool/clip_vision/__init__.py +1 -0
- fusion_bench/modelpool/clip_vision/modelpool.py +145 -0
- fusion_bench/modelpool/huggingface_automodel.py +20 -0
- fusion_bench/modelpool/huggingface_gpt2_classification.py +63 -0
- fusion_bench/modelpool/nyuv2_modelpool.py +40 -0
- fusion_bench/modelpool/seq2seq_lm/__init__.py +2 -0
- fusion_bench/modelpool/seq2seq_lm/modelpool.py +65 -0
- fusion_bench/modelpool/seq_classification_lm/__init__.py +2 -0
- fusion_bench/modelpool/seq_classification_lm/reward_model.py +15 -0
- fusion_bench/modelpool/seq_classification_lm/seq_classification_lm.py +98 -0
- fusion_bench/models/__init__.py +3 -0
- fusion_bench/models/chat_templates/__init__.py +1 -0
- fusion_bench/models/chat_templates/llama_3_Instruct.py +1 -0
- fusion_bench/models/chat_templates/load_tokenizer.py +43 -0
- fusion_bench/models/hf_clip.py +199 -0
- fusion_bench/models/linearized/__init__.py +0 -0
- fusion_bench/models/linearized/linearized_model_utils.py +91 -0
- fusion_bench/models/linearized/vision_model.py +122 -0
- fusion_bench/models/llama/__init__.py +16 -0
- fusion_bench/models/llama/model_utils/__init__.py +0 -0
- fusion_bench/models/llama/model_utils/embedding.py +87 -0
- fusion_bench/models/llama/model_utils/liger_kernel.py +86 -0
- fusion_bench/models/llama/model_utils/misc.py +112 -0
- fusion_bench/models/llama/model_utils/mod.py +52 -0
- fusion_bench/models/llama/model_utils/visual.py +241 -0
- fusion_bench/models/llama/patcher.py +78 -0
- fusion_bench/models/llama/tokenizer_loader.py +153 -0
- fusion_bench/models/masks/__init__.py +2 -0
- fusion_bench/models/masks/mask_model.py +160 -0
- fusion_bench/models/modeling_losparse_llama/__init__.py +4 -0
- fusion_bench/models/modeling_losparse_llama/configuration_losparse_llama.py +205 -0
- fusion_bench/models/modeling_losparse_llama/losparse_linear.py +67 -0
- fusion_bench/models/modeling_losparse_llama/modeling_losparse_llama.py +1825 -0
- fusion_bench/models/modeling_losparse_llama/register.py +8 -0
- fusion_bench/models/modeling_losparse_llama/utils.py +60 -0
- fusion_bench/models/modeling_smile_mistral/__init__.py +48 -0
- fusion_bench/models/modeling_smile_mistral/configuration_smile_mistral.py +21 -0
- fusion_bench/models/modeling_smile_mistral/modeling_smile_mistral.py +1034 -0
- fusion_bench/models/modeling_smile_mistral/register.py +8 -0
- fusion_bench/models/nyuv2/__init__.py +0 -0
- fusion_bench/models/nyuv2/aspp.py +82 -0
- fusion_bench/models/nyuv2/lightning_module.py +176 -0
- fusion_bench/models/nyuv2/resnet.py +405 -0
- fusion_bench/models/nyuv2/resnet_dilated.py +99 -0
- fusion_bench/models/parameter_dict.py +75 -0
- fusion_bench/models/rankone_moe.py +410 -0
- fusion_bench/models/separate_io.py +105 -0
- fusion_bench/models/smile_moe/__init__.py +0 -0
- fusion_bench/models/smile_moe/linear.py +256 -0
- fusion_bench/models/sparse_we_moe.py +459 -0
- fusion_bench/models/surgery/__init__.py +1 -0
- fusion_bench/models/surgery/surgerymodelwrapper.py +158 -0
- fusion_bench/models/utils.py +80 -0
- fusion_bench/models/we_moe.py +247 -0
- fusion_bench/models/wrappers/__init__.py +0 -0
- fusion_bench/models/wrappers/ensemble.py +183 -0
- fusion_bench/models/wrappers/layer_wise_fusion.py +336 -0
- fusion_bench/models/wrappers/task_wise_fusion.py +249 -0
- fusion_bench/optim/__init__.py +2 -0
- fusion_bench/optim/exception.py +47 -0
- fusion_bench/optim/lr_scheduler/__init__.py +1 -0
- fusion_bench/optim/lr_scheduler/linear_warmup.py +222 -0
- fusion_bench/optim/lr_scheduler/utils/__init__.py +1 -0
- fusion_bench/optim/lr_scheduler/utils/visualization.py +119 -0
- fusion_bench/optim/mezo.py +118 -0
- fusion_bench/programs/__init__.py +20 -0
- fusion_bench/programs/base_program.py +9 -0
- fusion_bench/programs/fabric_fusion_program.py +299 -0
- fusion_bench/scripts/__init__.py +0 -0
- fusion_bench/scripts/cli.py +43 -0
- fusion_bench/scripts/clip/__init__.py +0 -0
- fusion_bench/scripts/clip/convert_checkpoint.py +39 -0
- fusion_bench/scripts/imgui.py +218 -0
- fusion_bench/scripts/nyuv2_mtl_train.py +137 -0
- fusion_bench/scripts/webui.py +405 -0
- fusion_bench/taskpool/__init__.py +39 -0
- fusion_bench/taskpool/base_pool.py +35 -0
- fusion_bench/taskpool/clip_vision/__init__.py +4 -0
- fusion_bench/taskpool/clip_vision/clip_rankone_moe_taskpool.py +112 -0
- fusion_bench/taskpool/clip_vision/clip_sparse_wemoe_taskpool.py +120 -0
- fusion_bench/taskpool/clip_vision/taskpool.py +392 -0
- fusion_bench/taskpool/dummy.py +58 -0
- fusion_bench/taskpool/gpt2_text_classification.py +149 -0
- fusion_bench/taskpool/llama/__init__.py +1 -0
- fusion_bench/taskpool/llama/reward_model.py +157 -0
- fusion_bench/taskpool/llama/test_generation.py +185 -0
- fusion_bench/taskpool/nyuv2_taskpool.py +65 -0
- fusion_bench/tasks/__init__.py +2 -0
- fusion_bench/tasks/base_task.py +18 -0
- fusion_bench/tasks/classification.py +75 -0
- fusion_bench/tasks/clip_classification/__init__.py +183 -0
- fusion_bench/tasks/clip_classification/cifar10.py +33 -0
- fusion_bench/tasks/clip_classification/cifar100.py +146 -0
- fusion_bench/tasks/clip_classification/clip_dataset.py +1 -0
- fusion_bench/tasks/clip_classification/cub_200_2011.py +208 -0
- fusion_bench/tasks/clip_classification/dtd.py +60 -0
- fusion_bench/tasks/clip_classification/emnist_letters.py +31 -0
- fusion_bench/tasks/clip_classification/emnist_mnist.py +5 -0
- fusion_bench/tasks/clip_classification/eurosat.py +18 -0
- fusion_bench/tasks/clip_classification/fashion_mnist.py +18 -0
- fusion_bench/tasks/clip_classification/fer2013.py +18 -0
- fusion_bench/tasks/clip_classification/flower102.py +106 -0
- fusion_bench/tasks/clip_classification/food101.py +105 -0
- fusion_bench/tasks/clip_classification/gtsrb.py +51 -0
- fusion_bench/tasks/clip_classification/imagenet.py +2103 -0
- fusion_bench/tasks/clip_classification/kmnist.py +17 -0
- fusion_bench/tasks/clip_classification/mnist.py +5 -0
- fusion_bench/tasks/clip_classification/mongo_leaf_disease.py +19 -0
- fusion_bench/tasks/clip_classification/oxford_iiit_pet.py +41 -0
- fusion_bench/tasks/clip_classification/pcam.py +5 -0
- fusion_bench/tasks/clip_classification/rendered_sst2.py +3 -0
- fusion_bench/tasks/clip_classification/resisc45.py +68 -0
- fusion_bench/tasks/clip_classification/stanford_cars.py +209 -0
- fusion_bench/tasks/clip_classification/stl10.py +17 -0
- fusion_bench/tasks/clip_classification/sun397.py +404 -0
- fusion_bench/tasks/clip_classification/svhn.py +5 -0
- fusion_bench/tasks/clip_classification/tiny_imagenet.py +208 -0
- fusion_bench/tasks/flan_t5_text_generation/__init__.py +0 -0
- fusion_bench/tasks/flan_t5_text_generation/datasets_preprocess.py +71 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_evaluation.py +132 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_load_dataset.py +64 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py +379 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_prompt_templates.py +52 -0
- fusion_bench/utils/__init__.py +14 -0
- fusion_bench/utils/auto.py +31 -0
- fusion_bench/utils/cache_utils.py +58 -0
- fusion_bench/utils/data.py +165 -0
- fusion_bench/utils/devices.py +231 -0
- fusion_bench/utils/dict.py +43 -0
- fusion_bench/utils/dtype.py +146 -0
- fusion_bench/utils/expr.py +90 -0
- fusion_bench/utils/fabric.py +17 -0
- fusion_bench/utils/functools.py +37 -0
- fusion_bench/utils/hydra_utils.py +28 -0
- fusion_bench/utils/instantiate.py +450 -0
- fusion_bench/utils/json.py +93 -0
- fusion_bench/utils/lazy_imports.py +74 -0
- fusion_bench/utils/misc.py +18 -0
- fusion_bench/utils/packages.py +84 -0
- fusion_bench/utils/parameters.py +323 -0
- fusion_bench/utils/path.py +22 -0
- fusion_bench/utils/plot/__init__.py +0 -0
- fusion_bench/utils/plot/color_data.py +1726 -0
- fusion_bench/utils/plot/token.py +52 -0
- fusion_bench/utils/plot/token_notebook.py +127 -0
- fusion_bench/utils/pylogger.py +55 -0
- fusion_bench/utils/rich_utils.py +201 -0
- fusion_bench/utils/set.py +8 -0
- fusion_bench/utils/state_dict_arithmetic.py +297 -0
- fusion_bench/utils/strenum/__init__.py +326 -0
- fusion_bench/utils/strenum/_name_mangler.py +127 -0
- fusion_bench/utils/strenum/_version.py +556 -0
- fusion_bench/utils/tensorboard.py +51 -0
- fusion_bench/utils/timer.py +49 -0
- fusion_bench/utils/type.py +34 -0
- fusion_bench-0.2.9.dist-info/LICENSE +21 -0
- fusion_bench-0.2.9.dist-info/METADATA +258 -0
- fusion_bench-0.2.9.dist-info/RECORD +727 -0
- fusion_bench-0.2.9.dist-info/WHEEL +5 -0
- fusion_bench-0.2.9.dist-info/entry_points.txt +3 -0
- fusion_bench-0.2.9.dist-info/top_level.txt +1 -0
- fusion_bench_config/README.md +12 -0
- fusion_bench_config/clip-vit-base-patch32_robustness_corrupted.yaml +23 -0
- fusion_bench_config/dataset/image_classification/README.md +6 -0
- fusion_bench_config/dataset/image_classification/test/TALL14.yaml +20 -0
- fusion_bench_config/dataset/image_classification/test/TALL20.yaml +28 -0
- fusion_bench_config/dataset/image_classification/test/cifar10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/cifar100.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/cub-200-2011.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/dtd.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/emnist_letters.yaml +5 -0
- fusion_bench_config/dataset/image_classification/test/emnist_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/eurosat.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/fashion_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/fer2013.yaml +3 -0
- fusion_bench_config/dataset/image_classification/test/food101.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/gtsrb.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/kmnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/mango-leaf-disease.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/oxford-iiit-pet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/oxford_flowers102.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/pcam.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/rendered-sst2.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/resisc45.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/stanford-cars.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/stl10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/sun397.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/svhn.yaml +6 -0
- fusion_bench_config/dataset/image_classification/test/the_eight_tasks.yaml +9 -0
- fusion_bench_config/dataset/image_classification/test/tiny-imagenet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/TALL14.yaml +20 -0
- fusion_bench_config/dataset/image_classification/train/TALL20.yaml +28 -0
- fusion_bench_config/dataset/image_classification/train/cifar10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/cifar100.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/cub-200-2011.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/dtd.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/emnist_letters.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/emnist_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/eurosat.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/fashion_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/fer2013.yaml +3 -0
- fusion_bench_config/dataset/image_classification/train/food101.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/gtsrb.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/kmnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/mango-leaf-disease.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/oxford-iiit-pet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/oxford_flowers102.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/pcam.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/rendered-sst2.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/resisc45.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/stanford-cars.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/stl10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/sun397.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/svhn.yaml +6 -0
- fusion_bench_config/dataset/image_classification/train/the_eight_tasks.yaml +9 -0
- fusion_bench_config/dataset/image_classification/train/tiny-imagenet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/val/dtd.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/eurosat.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/gtsrb.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/mnist.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/resisc45.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/stanford-cars.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/sun397.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/svhn.yaml +12 -0
- fusion_bench_config/dataset/image_classification/val/the_eight_tasks.yaml +9 -0
- fusion_bench_config/dataset/llm_sft/alpaca_cleaned.yaml +6 -0
- fusion_bench_config/dataset/llm_sft/ultrachat_200k.yaml +3 -0
- fusion_bench_config/dataset/question_answering/search_qa.yaml +6 -0
- fusion_bench_config/dataset/question_answering/test/search_qa.yaml +7 -0
- fusion_bench_config/dataset/question_answering/train/MetaMathQA.yaml +4 -0
- fusion_bench_config/dataset/question_answering/train/search_qa.yaml +7 -0
- fusion_bench_config/dataset/question_answering/val/search_qa.yaml +7 -0
- fusion_bench_config/dataset/summarization/test/xsum.yaml +4 -0
- fusion_bench_config/dataset/summarization/train/xsum.yaml +4 -0
- fusion_bench_config/dataset/summarization/val/xsum.yaml +4 -0
- fusion_bench_config/dataset/summarization/xsum.yaml +3 -0
- fusion_bench_config/dataset/text_generation/test/gsm-hard.yaml +4 -0
- fusion_bench_config/dataset/text_generation/test/gsm8k.yaml +5 -0
- fusion_bench_config/dataset/text_generation/test/gsm8k_question_label.yaml +3 -0
- fusion_bench_config/dataset/text_generation/train/CodeAlpaca-20k.yaml +4 -0
- fusion_bench_config/dataset/text_generation/train/gsm8k.yaml +5 -0
- fusion_bench_config/dataset/text_generation/train/gsm8k_question_label.yaml +3 -0
- fusion_bench_config/fabric/auto.yaml +16 -0
- fusion_bench_config/fabric/llama_ddp.yaml +18 -0
- fusion_bench_config/fabric/llama_fsdp.yaml +16 -0
- fusion_bench_config/fabric/llama_peft_fsdp.yaml +16 -0
- fusion_bench_config/fabric/loggers/csv_logger.yaml +11 -0
- fusion_bench_config/fabric/loggers/tensorboard_logger.yaml +11 -0
- fusion_bench_config/fabric/loggers/wandb_logger.yaml +2 -0
- fusion_bench_config/fabric/strategy/deepspeed.yaml +10 -0
- fusion_bench_config/fabric/strategy/llama_fsdp.yaml +8 -0
- fusion_bench_config/fabric/strategy/llama_peft_fsdp.yaml +9 -0
- fusion_bench_config/fabric_model_fusion.yaml +20 -0
- fusion_bench_config/hydra/default.yaml +8 -0
- fusion_bench_config/hydra/help/fusion_bench_help.yaml +47 -0
- fusion_bench_config/hydra/job_logging/rich_logging.yaml +20 -0
- fusion_bench_config/llama_full_finetune.yaml +19 -0
- fusion_bench_config/llama_magnitude_pruning.yaml +16 -0
- fusion_bench_config/llama_model_fusion.yaml +17 -0
- fusion_bench_config/method/ada_svd/clip_vision.yaml +9 -0
- fusion_bench_config/method/adamerging/clip.yaml +23 -0
- fusion_bench_config/method/adamerging/layer_wise_flan_t5.yaml +23 -0
- fusion_bench_config/method/adamerging/layer_wise_gpt2.yaml +23 -0
- fusion_bench_config/method/adamerging/llama_sft.yaml +33 -0
- fusion_bench_config/method/adamerging.yaml +23 -0
- fusion_bench_config/method/analysis/task_vector_cos_similarity.yaml +6 -0
- fusion_bench_config/method/analysis/task_vector_violin_plot.yaml +6 -0
- fusion_bench_config/method/classification/clip_continual_finetune.yaml +28 -0
- fusion_bench_config/method/classification/clip_finetune.yaml +26 -0
- fusion_bench_config/method/clip_finetune.yaml +26 -0
- fusion_bench_config/method/concrete_subspace/clip_concrete_layer_wise_adamerging.yaml +27 -0
- fusion_bench_config/method/concrete_subspace/clip_concrete_task_arithmetic.yaml +25 -0
- fusion_bench_config/method/concrete_subspace/clip_concrete_task_wise_adamerging.yaml +27 -0
- fusion_bench_config/method/dare/simple_average.yaml +5 -0
- fusion_bench_config/method/dare/task_arithmetic.yaml +6 -0
- fusion_bench_config/method/dare/ties_merging.yaml +15 -0
- fusion_bench_config/method/dawe/dawe_for_clip.yaml +32 -0
- fusion_bench_config/method/depth_upscaling.yaml +5 -0
- fusion_bench_config/method/dummy.yaml +1 -0
- fusion_bench_config/method/ensemble/max_model_predictor.yaml +1 -0
- fusion_bench_config/method/ensemble/simple_ensemble.yaml +2 -0
- fusion_bench_config/method/ensemble/weighted_ensemble.yaml +6 -0
- fusion_bench_config/method/fisher_merging/clip_fisher_merging.yaml +13 -0
- fusion_bench_config/method/fisher_merging/fisher_merging.yaml +9 -0
- fusion_bench_config/method/fisher_merging/gpt2_fisher_merging.yaml +12 -0
- fusion_bench_config/method/linear/expo.yaml +8 -0
- fusion_bench_config/method/linear/linear_interpolation.yaml +3 -0
- fusion_bench_config/method/linear/llama_expo.yaml +19 -0
- fusion_bench_config/method/linear/llama_expo_with_dare.yaml +19 -0
- fusion_bench_config/method/linear/simple_average_for_llama.yaml +5 -0
- fusion_bench_config/method/linear/task_arithmetic_for_llama.yaml +4 -0
- fusion_bench_config/method/linear/weighted_average.yaml +6 -0
- fusion_bench_config/method/linear/weighted_average_for_llama.yaml +12 -0
- fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml +47 -0
- fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml +47 -0
- fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml +63 -0
- fusion_bench_config/method/mixtral_moe_merging.yaml +4 -0
- fusion_bench_config/method/mixtral_moe_upscaling.yaml +7 -0
- fusion_bench_config/method/model_recombination.yaml +4 -0
- fusion_bench_config/method/opcm/opcm.yaml +12 -0
- fusion_bench_config/method/opcm/task_arithmetic.yaml +12 -0
- fusion_bench_config/method/opcm/ties_merging.yaml +18 -0
- fusion_bench_config/method/opcm/weight_average.yaml +10 -0
- fusion_bench_config/method/pruning/llama_magnitude_pruning.yaml +14 -0
- fusion_bench_config/method/pruning/llama_random_pruning.yaml +9 -0
- fusion_bench_config/method/pruning/llama_wanda_pruning.yaml +16 -0
- fusion_bench_config/method/pruning/magnitude_diff_pruning.yaml +5 -0
- fusion_bench_config/method/pwe_moe_ls_for_clip.yaml +22 -0
- fusion_bench_config/method/rankone_moe/rankone_moe.yaml +26 -0
- fusion_bench_config/method/regmean/clip_regmean.yaml +11 -0
- fusion_bench_config/method/regmean/gpt2_regmean.yaml +12 -0
- fusion_bench_config/method/regmean/regmean.yaml +4 -0
- fusion_bench_config/method/simple_average.yaml +1 -0
- fusion_bench_config/method/slerp/slerp.yaml +6 -0
- fusion_bench_config/method/smile_upscaling/singular_projection_merging.yaml +8 -0
- fusion_bench_config/method/smile_upscaling/smile_mistral_upscaling.yaml +10 -0
- fusion_bench_config/method/smile_upscaling/smile_upscaling.yaml +14 -0
- fusion_bench_config/method/sparselo_pruning/llama_iterative_sparselo.yaml +20 -0
- fusion_bench_config/method/sparselo_pruning/llama_pcp_sparselo.yaml +20 -0
- fusion_bench_config/method/sparselo_pruning/llama_sparselo.yaml +19 -0
- fusion_bench_config/method/surgery/adamerging_surgery.yaml +27 -0
- fusion_bench_config/method/task_arithmetic.yaml +2 -0
- fusion_bench_config/method/task_singular_vector/TaskSingularVectorMerging.yaml +2 -0
- fusion_bench_config/method/ties_merging.yaml +8 -0
- fusion_bench_config/method/trust_region/clip_task_arithmetic.yaml +7 -0
- fusion_bench_config/method/wemoe/sparse_weight_ensembling_moe.yaml +39 -0
- fusion_bench_config/method/wemoe/weight_ensembling_moe.yaml +20 -0
- fusion_bench_config/model/clip-vit/README.md +38 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_TALL14.yaml +22 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_TALL20.yaml +29 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_cifar10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_cifar100.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_dtd.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_eight_tasks.yaml +10 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_emnist_letters.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_eurosat.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_fashion_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_fer2013.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_food101.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_gtsrb.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_kmnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_oxford-iiit-pet.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_oxford_flowers102.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_pcam.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_rendered-sst2.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_resisc45.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_stanford-cars.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_stl10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_sun397.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_svhn.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL14.yaml +22 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL20.yaml +29 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_cifar10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_cifar100.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_dtd.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_eight_tasks.yaml +11 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_emnist_letters.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_eurosat.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_fashion_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_fer2013.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_food101.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_gtsrb.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_kmnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_oxford-iiit-pet.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_oxford_flowers102.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_pcam.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_rendered-sst2.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_resisc45.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_stanford-cars.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_stl10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_sun397.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_svhn.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_TALL14.yaml +22 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_TALL20.yaml +29 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_cifar10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_cifar100.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_dtd.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_eight_tasks.yaml +10 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_emnist_letters.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_eurosat.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_fashion_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_fer2013.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_food101.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_gtsrb.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_kmnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford-iiit-pet.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford_flowers102.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_pcam.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_rendered-sst2.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_resisc45.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stanford-cars.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stl10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_sun397.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_svhn.yaml +1 -0
- fusion_bench_config/model/clip-vit/download_TALL20_models.sh +6 -0
- fusion_bench_config/model/clip-vit/generate_vit_model_config.sh +23 -0
- fusion_bench_config/model/flan-t5/flan-t5-base.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-cola.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-cola_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mnli.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mrpc.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mrpc_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qnli.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qqp.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qqp_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-rte.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-rte_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-sst2.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-sst2_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-stsb.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-stsb_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-cola_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-mnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-mrpc_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-qnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-qqp_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-rte_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-sst2_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-stsb_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/generate_flan-t5.sh +38 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/_template.yaml +12 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8_lora.yaml +53 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL14.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL14_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL20.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL20_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_individual.yaml +19 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_individual_lora.yaml +14 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8.yaml +5 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8_control_task.yaml +24 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8_model_only.yaml +3 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL14.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL14_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL20.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL20_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_generalization_exp1.yaml +24 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_generalization_exp2.yaml +24 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_individual.yaml +13 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_mtl.yaml +5 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_robustness_clean.yaml +18 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_robustness_corrupted.yaml +29 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_finetuned.yaml +5 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_task_projection.yaml +15 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_svhn_and_mnist.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_two_tasks_control_task.yaml +18 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL14.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL14_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL20.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL20_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_individual.yaml +19 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_codealpaca.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_for_causallm.yaml +20 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_metamathqa.yaml +19 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_ultrachat.yaml +18 -0
- fusion_bench_config/modelpool/CausalLMPool/simle_mixtral_exp_v4.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/single_llama_model.yaml +17 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/_template.yaml +8 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue.yaml +13 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_lora16.yaml +41 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_lora16_tta.yaml +68 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_individual.yaml +7 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-large_glue_lora16.yaml +45 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/llama_preference700k.yaml +23 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/single_reward_model.yaml +14 -0
- fusion_bench_config/modelpool/automodelpool.yaml +12 -0
- fusion_bench_config/modelpool/gpt-2_glue.yaml +64 -0
- fusion_bench_config/modelpool/mixtral_moe_merging.yaml +14 -0
- fusion_bench_config/modelpool/mixtral_moe_upscaling.yaml +6 -0
- fusion_bench_config/modelpool/nyuv2_modelpool.yaml +26 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v1.yaml +9 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v2.yaml +9 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v3.yaml +9 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v4.yaml +13 -0
- fusion_bench_config/nyuv2_config.yaml +17 -0
- fusion_bench_config/nyuv2_mtl_train.yaml +32 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/_template.yaml +31 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-base-patch32_robustness_corrupted.yaml +27 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8.yaml +11 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_B16.yaml +31 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_L14.yaml +12 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_val.yaml +12 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_with_control_task.yaml +12 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TALL14.yaml +19 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TALL20.yaml +26 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_cifar10.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_cifar100.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_dtd.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_emnist_letters.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_eurosat.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_fashion_mnist.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_fer2013.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_food101.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_gtsrb.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_kmnist.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_mnist.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford-iiit-pet.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford_flowers102.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford_flowers102_val.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_pcam.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_rendered-sst2.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_resisc45.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_stanford-cars.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_stl10.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_sun397.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_svhn.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_rankone_wemoe_clip-vit-classification_TA8.yaml +18 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_sparse_wemoe_clip-vit-classification_TA8.yaml +18 -0
- fusion_bench_config/taskpool/clip-vit-base-patch32_robustness_clean.yaml +24 -0
- fusion_bench_config/taskpool/clip-vit-base-patch32_robustness_corrupted.yaml +27 -0
- fusion_bench_config/taskpool/clip-vit-base-patch32_svhn_and_mnist.yaml +22 -0
- fusion_bench_config/taskpool/dummy.yaml +2 -0
- fusion_bench_config/taskpool/flan-t5_glue_text_generation.yaml +44 -0
- fusion_bench_config/taskpool/gpt-2_glue.yaml +39 -0
- fusion_bench_config/taskpool/nyuv2_taskpool.yaml +9 -0
- fusion_bench_config/taskpool/reward_model_evaluation.yaml +18 -0
fusion_bench/method/ties_merging/ties_merging.py
@@ -0,0 +1,117 @@
+R"""
+Overview of Ties-Merging:
+
+1. Trim: For each task t, we trim the redundant parameters from the task vector $\tau_t$ to create $\hat{\tau}_t$ by keeping the top-k% values according to their magnitude and trimming the bottom $(100 - k)\%$ of the redundant parameters by resetting them to 0. This can be decomposed further as $\hat{\tau}_t = \hat{\gamma}_t \odot \hat{\mu}_t$.
+
+2. Elect: Next, we create an aggregate elected sign vector $\gamma_m$ for the merged model that resolves the disagreements in the sign for each parameter $p$ across different models. To create the elected sign vector, we choose the sign with the highest total magnitude across all relevant models. For each parameter $p \in \{1, 2, \ldots, d\}$, we separate the values $\{\hat{\tau}_t^p\}_{t=1}^n$ based on their sign ($+1$ or $-1$) and take their sum to calculate the total mass (i.e., total magnitude) in the positive and the negative direction. We then assign $\gamma_m^p$ as the sign with greater total movement. This can be efficiently computed using $\gamma_m^p = \text{sgn}(\sum_{t=1}^n \hat{\tau}_t^p)$.
+
+3. Disjoint Merge: Then, for each parameter $p$, we compute a disjoint mean by only keeping the parameter values from the models whose signs are the same as the aggregated elected sign and calculate their mean. Formally, let $A_p = \{t \in [n] \mid \hat{\gamma}_t^p = \gamma_m^p\}$, then $\tau_m^p = \frac{1}{|A_p|}\sum_{t\in A_p} \hat{\tau}_t^p$. Note that the disjoint mean always ignores the zero values.
+"""
+
+import logging
+from typing import Dict, List, Literal, Mapping, Union  # noqa: F401
+
+import torch
+from torch import Tensor, nn
+
+from fusion_bench.compat.modelpool import to_modelpool
+from fusion_bench.method import BaseAlgorithm
+from fusion_bench.modelpool import BaseModelPool
+from fusion_bench.utils.type import StateDictType
+
+from .ties_merging_utils import state_dict_to_vector, ties_merging, vector_to_state_dict
+
+log = logging.getLogger(__name__)
+
+
+class TiesMergingAlgorithm(BaseAlgorithm):
+    """
+    TiesMergingAlgorithm is a class for fusing multiple models using the TIES merging technique.
+
+    Attributes:
+        scaling_factor (float): The scaling factor to apply to the merged task vector.
+        threshold (float): The threshold for resetting values in the task vector.
+        remove_keys (List[str]): List of keys to remove from the state dictionary.
+        merge_func (Literal["sum", "mean", "max"]): The merge function to use for disjoint merging.
+    """
+
+    _config_mapping = BaseAlgorithm._config_mapping | {
+        "scaling_factor": "scaling_factor",
+        "threshold": "threshold",
+        "remove_keys": "remove_keys",
+        "merge_func": "merge_func",
+    }
+
+    def __init__(
+        self,
+        scaling_factor: float,
+        threshold: float,
+        remove_keys: List[str],
+        merge_func: Literal["sum", "mean", "max"],
+        **kwargs,
+    ):
+        """
+        Initialize the TiesMergingAlgorithm with the given parameters.
+
+        Args:
+            scaling_factor (float): The scaling factor to apply to the merged task vector.
+            threshold (float): The threshold for resetting values in the task vector.
+            remove_keys (List[str]): List of keys to remove from the state dictionary.
+            merge_func (Literal["sum", "mean", "max"]): The merge function to use for disjoint merging.
+            **kwargs: Additional keyword arguments for the base class.
+        """
+        self.scaling_factor = scaling_factor
+        self.threshold = threshold
+        self.remove_keys = remove_keys
+        self.merge_func = merge_func
+        super().__init__(**kwargs)
+
+    @torch.no_grad()
+    def run(self, modelpool: BaseModelPool | Dict[str, nn.Module], **kwargs):
+        """
+        Run the TIES merging algorithm to fuse models in the model pool.
+
+        Args:
+            modelpool (BaseModelPool | Dict[str, nn.Module]): The model pool containing the models to fuse.
+
+        Returns:
+            nn.Module: The fused model.
+        """
+        log.info("Fusing models using ties merging.")
+        modelpool = to_modelpool(modelpool)
+        remove_keys = self.config.get("remove_keys", [])
+        merge_func = self.config.get("merge_func", "sum")
+        scaling_factor = self.scaling_factor
+        threshold = self.threshold
+
+        # Load the pretrained model
+        pretrained_model = modelpool.load_model("_pretrained_")
+
+        # Load the state dicts of the models
+        ft_checks: List[StateDictType] = [
+            modelpool.load_model(model_name).state_dict(keep_vars=True)
+            for model_name in modelpool.model_names
+        ]
+        ptm_check: StateDictType = pretrained_model.state_dict(keep_vars=True)
+
+        # Compute the task vectors
+        flat_ft: Tensor = torch.vstack(
+            [state_dict_to_vector(check, remove_keys) for check in ft_checks]
+        )
+        flat_ptm: Tensor = state_dict_to_vector(ptm_check, remove_keys)
+        tv_flat_checks = flat_ft - flat_ptm
+
+        # Perform TIES Merging
+        merged_tv = ties_merging(
+            tv_flat_checks,
+            reset_thresh=threshold,
+            merge_func=merge_func,
+        )
+        merged_check = flat_ptm + scaling_factor * merged_tv
+        merged_state_dict = vector_to_state_dict(
+            merged_check, ptm_check, remove_keys=remove_keys
+        )
+
+        # Load the merged state dict into the pretrained model
+        pretrained_model.load_state_dict(merged_state_dict)
+        return pretrained_model
|
@@ -0,0 +1,331 @@
"""
This is modified based on https://github.com/EnnengYang/AdaMerging/blob/main/src/ties_merging_utils.py
"""

import copy
from collections import OrderedDict
from typing import List

import torch
from torch import Tensor, nn

from fusion_bench.utils.type import StateDictType


# Model conversion utils
def state_dict_to_vector(state_dict, remove_keys=[]):
    """
    Flatten a state dictionary into a single 1-D vector, skipping the specified keys.

    Args:
        state_dict (dict): The state dictionary to convert.
        remove_keys (list): Keys to exclude from the flattened vector.

    Returns:
        Tensor: A vector representation of the state dictionary, with entries
        ordered by sorted parameter name.
    """
    shared_state_dict = copy.deepcopy(state_dict)
    for key in remove_keys:
        if key in shared_state_dict:
            del shared_state_dict[key]
    sorted_shared_state_dict = OrderedDict(sorted(shared_state_dict.items()))
    return nn.utils.parameters_to_vector(
        [value.reshape(-1) for key, value in sorted_shared_state_dict.items()]
    )


def vector_to_state_dict(vector, state_dict, remove_keys=[]):
    """
    Unflatten a vector back into a state dictionary, using a reference state
    dictionary to define shapes and ordering. Keys in `remove_keys` are
    excluded from the unflattening and, for T5-style models, restored
    afterwards from the shared embedding weight.

    Args:
        vector (Tensor): The vector to convert.
        state_dict (dict): The reference state dictionary.
        remove_keys (list): Keys excluded from the flattened representation.

    Returns:
        dict: A state dictionary representation of the vector.
    """
    # create a reference dict to define the order of the vector
    reference_dict = copy.deepcopy(state_dict)
    for key in remove_keys:
        if key in reference_dict:
            del reference_dict[key]
    sorted_reference_dict = OrderedDict(sorted(reference_dict.items()))

    # create a shared state dict using the reference dict
    nn.utils.vector_to_parameters(vector, sorted_reference_dict.values())

    # add back the encoder and decoder embedding weights.
    if "transformer.shared.weight" in sorted_reference_dict:
        for key in remove_keys:
            sorted_reference_dict[key] = sorted_reference_dict[
                "transformer.shared.weight"
            ]
    return sorted_reference_dict
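

# Illustrative sketch (toy state dict): `vector_to_state_dict` inverts
# `state_dict_to_vector` for keys that are not removed, so a round trip
# reproduces the original values (entries are ordered by sorted key).
#
# >>> sd = {"b": torch.tensor([3.0]), "w": torch.tensor([[1.0, 2.0]])}
# >>> vec = state_dict_to_vector(sd)
# >>> vec
# tensor([3., 1., 2.])
# >>> restored = vector_to_state_dict(vec, sd)
# >>> torch.equal(restored["w"], sd["w"])
# True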


def add_ptm_to_tv(tv_dict, ptm_dict):
    """
    Add the pretrained-model values to a task-vector state dictionary, key by key.

    Args:
        tv_dict (dict): The task-vector state dictionary.
        ptm_dict (dict): The pretrained-model state dictionary to add.

    Returns:
        dict: The resulting state dictionary after addition.
    """
    assert set(tv_dict.keys()) == set(
        ptm_dict.keys()
    ), "Differing parameter names in models."
    final_dict = copy.deepcopy(tv_dict)
    for k, v in ptm_dict.items():
        final_dict[k] = tv_dict[k] + v
    return final_dict


def check_parameterNamesMatch(checkpoints: List[StateDictType]) -> None:
    """
    Check that the parameter names match across multiple checkpoints.

    Args:
        checkpoints (list): List of state dictionaries to check.

    Raises:
        ValueError: If the parameter names do not match.
    """
    parameter_names = set(checkpoints[0].keys())

    if len(checkpoints) >= 2:
        for checkpoint in checkpoints[1:]:
            current_parameterNames = set(checkpoint.keys())
            if current_parameterNames != parameter_names:
                raise ValueError(
                    "Differing parameter names in models. "
                    f"The different parameters are {parameter_names.symmetric_difference(current_parameterNames)}"
                )


def check_state_dicts_equal(
    state_dict1: StateDictType, state_dict2: StateDictType
) -> bool:
    """
    Check if two state dictionaries are equal.

    Args:
        state_dict1 (dict): The first state dictionary.
        state_dict2 (dict): The second state dictionary.

    Returns:
        bool: True if the state dictionaries are equal, False otherwise.
    """
    if set(state_dict1.keys()) != set(state_dict2.keys()):
        return False

    for key in state_dict1.keys():
        if not torch.equal(state_dict1[key], state_dict2[key]):
            return False

    return True
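

# Illustrative sketch (toy dicts): adding the pretrained values back turns a
# task vector into a full set of merged weights.
#
# >>> tv = {"w": torch.tensor([1.0, -1.0])}
# >>> ptm = {"w": torch.tensor([0.5, 0.5])}
# >>> add_ptm_to_tv(tv, ptm)["w"]
# tensor([ 1.5000, -0.5000])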


# TIES MERGING UTILS


def topk_values_mask(M, K=0.7, return_mask=False):
    """
    Keep only the largest-magnitude entries in each row of a tensor, zeroing the rest.

    Args:
        M (Tensor): The input tensor (rows are flattened task vectors).
        K (float): The proportion of top-magnitude values to keep. Values
            greater than 1 are interpreted as percentages.
        return_mask (bool): Whether to also return the boolean mask tensor.

    Returns:
        tuple: The masked tensor, the kept fraction per row, and optionally the mask.
    """
    if K > 1:
        K /= 100

    original_shape = M.shape
    if M.dim() == 1:
        M = M.unsqueeze(0)

    n, d = M.shape
    k = int(d * K)
    k = d - k  # Keep top k elements instead of bottom k elements

    # Find the k-th smallest element by magnitude for each row
    kth_values, _ = M.abs().kthvalue(k, dim=1, keepdim=True)
    # Create a mask tensor with True for the top k elements in each row
    mask = M.abs() >= kth_values
    final_mask = mask.squeeze() if original_shape == M.squeeze().shape else mask

    if return_mask:
        return M * final_mask, final_mask.float().mean(dim=1), final_mask
    return M * final_mask, final_mask.float().mean(dim=1)
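

# Illustrative sketch (toy input): entries below the per-row magnitude
# threshold are zeroed. Note the `>=` comparison keeps int(d * K) + 1 entries
# when magnitudes are distinct, slightly more than the nominal fraction K on
# small rows.
#
# >>> M = torch.tensor([[1.0, -4.0, 0.5, 3.0]])
# >>> masked, kept_frac = topk_values_mask(M, K=0.5)
# >>> masked
# tensor([[ 1., -4.,  0.,  3.]])
# >>> kept_frac
# tensor([0.7500])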


def resolve_zero_signs(sign_to_mult, method="majority"):
    """
    Resolve zero signs in a sign tensor by majority or minority rule.

    Args:
        sign_to_mult (Tensor): The tensor of signs to resolve.
        method (str): The rule for resolving zero signs ("majority" or "minority").

    Returns:
        Tensor: The tensor with resolved signs.
    """
    majority_sign = torch.sign(sign_to_mult.sum())

    if method == "majority":
        sign_to_mult[sign_to_mult == 0] = majority_sign
    elif method == "minority":
        sign_to_mult[sign_to_mult == 0] = -1 * majority_sign
    return sign_to_mult


def resolve_sign(v: Tensor):
    """
    Elect a sign for each coordinate by majority rule across rows.

    Args:
        v (Tensor): The input tensor (rows are flattened task vectors).

    Returns:
        Tensor: The elected sign per coordinate.
    """
    sign_to_mult = torch.sign(v.sum(dim=0))
    sign_to_mult = resolve_zero_signs(sign_to_mult, "majority")
    return sign_to_mult
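

# Illustrative sketch (toy input): the elected sign per coordinate is the sign
# of the column-wise sum, with exact ties broken toward the overall majority sign.
#
# >>> v = torch.tensor([[2.0, 1.0, 1.0, 1.0], [1.0, -2.0, -1.0, 1.0]])
# >>> resolve_sign(v)
# tensor([ 1., -1.,  1.,  1.])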


def disjoint_merge(v: Tensor, merge_func: str, sign_to_mult):
    """
    Aggregate each coordinate over only the rows whose sign agrees with the
    elected sign, using the specified merge function.

    Args:
        v (Tensor): The input tensor (rows are flattened task vectors).
        merge_func (str): The merge function to use ("mean", "sum", or "max").
        sign_to_mult (Tensor): The elected signs to use for merging.

    Returns:
        Tensor: The merged tensor.
    """
    merge_func = merge_func.split("-")[-1]

    # If sign is provided then we select the corresponding entries and aggregate.
    if sign_to_mult is not None:
        rows_to_keep = torch.where(sign_to_mult.unsqueeze(0) > 0, v > 0, v < 0)
        selected_entries = v * rows_to_keep
    # Else we select all non-zero entries and aggregate.
    else:
        rows_to_keep = v != 0
        selected_entries = v * rows_to_keep

    if merge_func == "mean":
        non_zero_counts = (selected_entries != 0).sum(dim=0).float()
        disjoint_aggs = torch.sum(selected_entries, dim=0) / torch.clamp(
            non_zero_counts, min=1
        )
    elif merge_func == "sum":
        disjoint_aggs = torch.sum(selected_entries, dim=0)
    elif merge_func == "max":
        disjoint_aggs = selected_entries.abs().max(dim=0)[0]
        disjoint_aggs *= sign_to_mult
    else:
        raise ValueError(f"Merge method {merge_func} is not defined.")

    return disjoint_aggs
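

# Illustrative sketch (continuing the toy input above): per coordinate, only
# the entries whose sign agrees with the elected sign contribute to the mean.
#
# >>> v = torch.tensor([[2.0, 1.0, 1.0, 1.0], [1.0, -2.0, -1.0, 1.0]])
# >>> disjoint_merge(v, "mean", resolve_sign(v))
# tensor([ 1.5000, -2.0000,  1.0000,  1.0000])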


def ties_merging(
    flat_task_checks,
    reset_thresh=None,
    merge_func="",
):
    """
    Perform TIES merging on a stack of flattened task vectors: trim to the
    top-magnitude entries, elect signs, then disjointly aggregate.

    Args:
        flat_task_checks (Tensor): The stacked flattened task vectors.
        reset_thresh (float): The proportion of top-magnitude values to keep.
        merge_func (str): The merge function to use ("mean", "sum", or "max").

    Returns:
        Tensor: The merged task vector.
    """
    all_checks = flat_task_checks.clone()
    updated_checks, *_ = topk_values_mask(all_checks, K=reset_thresh, return_mask=False)
    print("RESOLVING SIGN")
    final_signs = resolve_sign(updated_checks)
    assert final_signs is not None

    print(f"Disjoint AGGREGATION: {merge_func}")
    merged_tv = disjoint_merge(updated_checks, merge_func, final_signs)

    return merged_tv
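

# Illustrative sketch (toy task vectors): trim each row to its top-magnitude
# entries, elect signs, then average the agreeing entries.
#
# >>> tvs = torch.tensor([[0.1, -4.0, 0.5, 3.0], [2.0, 1.0, -0.2, 3.0]])
# >>> ties_merging(tvs, reset_thresh=0.5, merge_func="mean")  # also prints progress messages
# tensor([ 2.0000, -4.0000,  0.5000,  3.0000])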


def disjoint_merge_split(v: Tensor, merge_func: str, sign_to_mult):
    """
    Like `disjoint_merge`, but also return the per-model selected entries
    alongside the aggregate. Only "sum" is supported.

    Args:
        v (Tensor): The input tensor (rows are flattened task vectors).
        merge_func (str): The merge function to use (only "sum").
        sign_to_mult (Tensor): The elected signs to use for merging.

    Returns:
        tuple: The selected entries and the merged tensor.
    """
    merge_func = merge_func.split("-")[-1]

    # If sign is provided then we select the corresponding entries and aggregate.
    if sign_to_mult is not None:
        rows_to_keep = torch.where(sign_to_mult.unsqueeze(0) > 0, v > 0, v < 0)
        selected_entries = v * rows_to_keep
    # Else we select all non-zero entries and aggregate.
    else:
        rows_to_keep = v != 0
        selected_entries = v * rows_to_keep

    if merge_func == "sum":
        disjoint_aggs = torch.sum(selected_entries, dim=0)
    else:
        raise ValueError(f"Merge method {merge_func} is not defined.")

    return selected_entries, disjoint_aggs


def ties_merging_split(
    flat_task_checks,
    reset_thresh=None,
    merge_func: str = "",
):
    """
    Perform TIES merging and also return the per-model selected entries.

    Args:
        flat_task_checks (Tensor): The stacked flattened task vectors.
        reset_thresh (float): The proportion of top-magnitude values to keep.
        merge_func (str): The merge function to use (only "sum").

    Returns:
        tuple: The selected entries and the merged task vector.
    """
    all_checks = flat_task_checks.clone()
    updated_checks, *_ = topk_values_mask(all_checks, K=reset_thresh, return_mask=False)
    print("RESOLVING SIGN")
    final_signs = resolve_sign(updated_checks)
    assert final_signs is not None

    print(f"Disjoint AGGREGATION: {merge_func}")
    selected_entries, merged_tv = disjoint_merge_split(
        updated_checks, merge_func, final_signs
    )

    return selected_entries, merged_tv
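

# Illustrative sketch (continuing the toy task vectors above): the split
# variant additionally returns each model's surviving entries, e.g. for
# later per-model reweighting.
#
# >>> selected, merged = ties_merging_split(tvs, reset_thresh=0.5, merge_func="sum")
# >>> merged
# tensor([ 2.0000, -4.0000,  0.5000,  6.0000])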
@@ -0,0 +1,205 @@
"""
Implementation of "Task Arithmetic in Trust Region: A Training-Free Model Merging Approach to Navigate Knowledge Conflicts"
https://openreview.net/forum?id=q3ztjJRQuJ
"""

import logging
from collections import defaultdict
from copy import deepcopy
from typing import Dict, Iterable, List, Union

import torch
import torch.nn.functional as F
from torch import Tensor, nn
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from typing_extensions import override

from fusion_bench import BaseAlgorithm, BaseModelPool
from fusion_bench.dataset.clip_dataset import CLIPDataset
from fusion_bench.mixins import CLIPClassificationMixin, SimpleProfilerMixin
from fusion_bench.utils import first
from fusion_bench.utils.state_dict_arithmetic import state_dict_sub
from fusion_bench.utils.type import StateDictType

from .utils import state_dict_to_vector, vector_to_state_dict

log = logging.getLogger(__name__)


def trainable_state_dict(module: nn.Module) -> StateDictType:
    """
    Return the state dictionary of the module containing only the trainable parameters.

    Args:
        module (nn.Module): The neural network module.

    Returns:
        Dict[str, Tensor]: A dictionary mapping names to trainable parameter tensors.
    """
    return {
        name: param for name, param in module.named_parameters() if param.requires_grad
    }
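

# Illustrative sketch (toy module): only parameters with requires_grad=True
# are included.
#
# >>> layer = nn.Linear(2, 2)
# >>> _ = layer.bias.requires_grad_(False)
# >>> sorted(trainable_state_dict(layer).keys())
# ['weight']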


class TaskArithmeticWithTrustRegionForCLIP(
    BaseAlgorithm,
    SimpleProfilerMixin,
    CLIPClassificationMixin,
):
    def __init__(
        self,
        scaling_factor: Union[float, List[float]],
        threshold_quantile: float,
        max_samples: int,
        batch_size: int,
        zero_shot: bool,
        **kwargs,
    ):
        self.scaling_factor = scaling_factor
        self.threshold_quantile = threshold_quantile
        self.max_samples = max_samples
        self.batch_size = batch_size
        self.zero_shot = zero_shot
        super().__init__(**kwargs)

    @override
    def run(self, modelpool: BaseModelPool):
        self.modelpool = modelpool

        # compute the task vectors and flatten them into vectors
        pretrained_model, task_vectors = self.compute_vanilla_task_vectors()
        task_vectors = {
            name: state_dict_to_vector(task_vector)
            for name, task_vector in task_vectors.items()
        }

        if not self.zero_shot:
            all_avg_abs_grads = self.compute_avg_abs_grads(pretrained_model)
            all_avg_abs_grads = {
                n: state_dict_to_vector(grad) for n, grad in all_avg_abs_grads.items()
            }
        else:
            # in the zero-shot setting, the task vector magnitudes are used to
            # estimate the gradients
            all_avg_abs_grads = {name: tv.abs() for name, tv in task_vectors.items()}

        # compute the trust region: Omega scores cross-task conflict per coordinate
        Omega = torch.zeros_like(first(all_avg_abs_grads.values()))

        for i in all_avg_abs_grads:
            for j in all_avg_abs_grads:
                if i != j:
                    vector1 = all_avg_abs_grads[i]
                    vector2 = torch.abs(task_vectors[j])
                    Omega += vector1 * vector2

        values, _ = Omega.sort(descending=False)
        threshold = values[
            max(0, min(int(Omega.numel() * self.threshold_quantile), Omega.numel() - 1))
        ]

        mask = (Omega < threshold).bool()

        # keep only the low-conflict entries of each task vector
        for task in task_vectors:
            task_vectors[task] = task_vectors[task] * mask

        task_vector_sum = sum(task_vectors.values())
        task_vector_sum = vector_to_state_dict(
            task_vector_sum, trainable_state_dict(pretrained_model)
        )

        if isinstance(self.scaling_factor, (int, float)):
            model = pretrained_model
            for name, param in model.named_parameters():
                param.data += task_vector_sum[name] * self.scaling_factor
            return model
        elif isinstance(self.scaling_factor, Iterable):
            models = {}
            for scaling_factor in self.scaling_factor:
                model = deepcopy(pretrained_model)
                # apply the scaled task vector to the copy, leaving the
                # pretrained model unchanged for the next scaling factor
                for name, param in model.named_parameters():
                    param.data += task_vector_sum[name] * scaling_factor
                models[scaling_factor] = model
            return models
        else:
            raise ValueError(
                f"Incorrect type of `scaling_factor`: {type(self.scaling_factor)}. "
                "It should be a single real number or a list of real numbers."
            )
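
    # Illustrative sketch of the trust-region step in `run` (toy tensors, names
    # local to this example): a coordinate scores high Omega when one task's
    # gradient magnitude overlaps another task's update, and high-Omega
    # coordinates are masked out.
    #
    # >>> grads = {"a": torch.tensor([1.0, 0.1]), "b": torch.tensor([0.1, 1.0])}
    # >>> tvs = {"a": torch.tensor([1.0, 0.5]), "b": torch.tensor([0.2, 1.0])}
    # >>> Omega = sum(grads[i] * tvs[j].abs() for i in grads for j in tvs if i != j)
    # >>> Omega
    # tensor([0.3000, 0.6000])
    # >>> threshold = Omega.sort().values[min(int(Omega.numel() * 0.5), Omega.numel() - 1)]
    # >>> Omega < threshold
    # tensor([ True, False])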

    def compute_avg_abs_grads(self, pretrained_model):
        modelpool = self.modelpool

        self.setup_zero_shot_classification_head()

        pretrained_model = (
            deepcopy(pretrained_model)
            if pretrained_model is not None
            else modelpool.load_pretrained_model()
        )
        pretrained_model = self.fabric.setup_module(pretrained_model)
        pretrained_model.train()

        all_avg_abs_grads: Dict[str, StateDictType] = {}
        for train_dataset_name in (
            pbar := tqdm(
                modelpool.train_dataset_names, desc="Train datasets", dynamic_ncols=True
            )
        ):
            pbar.set_description(f"Train dataset: {train_dataset_name}")
            dataset = modelpool.load_train_dataset(train_dataset_name)
            dataset = CLIPDataset(dataset, self.clip_processor)
            dataloader = DataLoader(dataset, shuffle=True, batch_size=self.batch_size)
            dataloader = self.fabric.setup_dataloaders(dataloader)

            grad: StateDictType = defaultdict(float)
            num_samples = 0
            for batch in dataloader:
                images, labels = batch
                batch_size = images.size(0)

                if num_samples + batch_size > self.max_samples:
                    batch_size = self.max_samples - num_samples
                    images = images[:batch_size]
                    labels = labels[:batch_size]

                logits = self.compute_logits(
                    pretrained_model, images, task=train_dataset_name
                )
                for i in range(batch_size):
                    pretrained_model.zero_grad()
                    loss = F.cross_entropy(logits[i], labels[i])
                    self.fabric.backward(loss, retain_graph=(i != batch_size - 1))
                    for name, param in pretrained_model.module.named_parameters():
                        if param.requires_grad:
                            grad[name] += torch.abs(param.grad).detach()

                num_samples += batch_size
                if num_samples >= self.max_samples:
                    break

            # average the per-sample gradient magnitudes and move them off-device
            for name in grad:
                grad[name] = (grad[name] / num_samples).cpu()

            all_avg_abs_grads[train_dataset_name] = grad
        return all_avg_abs_grads

    @torch.no_grad()
    def compute_vanilla_task_vectors(self):
        modelpool = self.modelpool

        pretrained_model = modelpool.load_pretrained_model()
        pretrained_sd = trainable_state_dict(pretrained_model)
        finetuned_sds = {
            name: trainable_state_dict(model)
            for name, model in modelpool.named_models()
        }

        task_vectors = {
            name: state_dict_sub(finetuned, pretrained_sd)
            for name, finetuned in finetuned_sds.items()
        }
        return pretrained_model, task_vectors
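

# Illustrative usage sketch (hypothetical configuration; constructing the CLIP
# model pool is elided and depends on the surrounding fusion_bench config):
#
# >>> algorithm = TaskArithmeticWithTrustRegionForCLIP(
# ...     scaling_factor=[0.3, 0.5, 0.7],
# ...     threshold_quantile=0.5,
# ...     max_samples=256,
# ...     batch_size=16,
# ...     zero_shot=True,
# ... )
# >>> merged = algorithm.run(modelpool)  # with a list of factors: a dict keyed by scaling factor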