PyPI - fusion-bench - Versions diffs - 0.2.9__py3-none-any.whl - Mend

fusion-bench 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (727) hide show

fusion_bench/method/simple_average.py ADDED Viewed

@@ -0,0 +1,112 @@
+import logging
+from copy import deepcopy
+from typing import Dict, List, Mapping, Optional, Union
+import torch
+from torch import nn
+from fusion_bench.method.base_algorithm import BaseAlgorithm
+from fusion_bench.mixins.simple_profiler import SimpleProfilerMixin
+from fusion_bench.modelpool import BaseModelPool
+from fusion_bench.utils.state_dict_arithmetic import (
+    state_dict_add,
+    state_dict_avg,
+    state_dict_div,
+    state_dict_mul,
+)
+from fusion_bench.utils.type import StateDictType
+log = logging.getLogger(__name__)
+def simple_average(
+    modules: List[Union[nn.Module, StateDictType]],
+    base_module: Optional[nn.Module] = None,
+):
+    R"""
+    Average the parameters of a list of PyTorch modules or state dictionaries.
+    The type of the first element of `modules` selects the mode: for modules, the averaged state dictionary is loaded into a module which is returned; for mappings, the result of `state_dict_avg` on the list is returned directly.
+    Args:
+        modules (List[Union[nn.Module, StateDictType]]): A non-empty list of PyTorch modules or state dictionaries. All elements are expected to be of the same kind as the first one.
+        base_module (Optional[nn.Module]): Only used when `modules` holds modules. If provided, the averaged parameters are loaded into this module (in place) and it is returned. If not provided, a deep copy of the first module in the list is used instead.
+    Returns:
+        module_or_state_dict (Union[nn.Module, StateDictType]): A PyTorch module with the averaged parameters, or a state dictionary with the averaged parameters.
+    Raises:
+        ValueError: If `modules` is empty.
+        TypeError: If the first element is neither an `nn.Module` nor a mapping.
+    Examples:
+        >>> import torch.nn as nn
+        >>> model1 = nn.Linear(10, 10)
+        >>> model2 = nn.Linear(10, 10)
+        >>> averaged_model = simple_average([model1, model2])
+        >>> state_dict1 = model1.state_dict()
+        >>> state_dict2 = model2.state_dict()
+        >>> averaged_state_dict = simple_average([state_dict1, state_dict2])
+    """
+    if len(modules) == 0:
+        raise ValueError("`modules` must contain at least one module or state dict.")
+    first = modules[0]
+    if isinstance(first, nn.Module):
+        # Reuse the caller-supplied container when given, otherwise clone the first module
+        new_module = deepcopy(first) if base_module is None else base_module
+        state_dict = state_dict_avg([module.state_dict() for module in modules])
+        new_module.load_state_dict(state_dict)
+        return new_module
+    if isinstance(first, Mapping):
+        return state_dict_avg(modules)
+    # Previously this case fell through and silently returned None
+    raise TypeError(
+        f"Expected a list of nn.Module or state dicts, got {type(first).__name__}."
+    )
+class SimpleAverageAlgorithm(
+    BaseAlgorithm,
+    SimpleProfilerMixin,
+):
+    """
+    Fusion algorithm that merges all models of a model pool by averaging their state dictionaries.
+    """
+    @torch.no_grad()
+    def run(self, modelpool: Union[BaseModelPool, Dict[str, nn.Module]]):
+        """
+        Fuse the models in the given model pool using simple averaging.
+        The models are loaded one at a time. Their state dictionaries are summed with `state_dict_add`, and the sum is finally divided by the number of models with `state_dict_div`. The averaged state dictionary is loaded into the first loaded model, which is returned.
+        Args:
+            modelpool: The pool of models to fuse. A plain dict is wrapped in a `BaseModelPool`.
+        Returns:
+            The first loaded model, carrying the averaged parameters.
+        Raises:
+            ValueError: If the model pool contains no models.
+        """
+        if isinstance(modelpool, dict):
+            modelpool = BaseModelPool(modelpool)
+        if len(modelpool.model_names) == 0:
+            # Fail early with a clear message instead of dividing `None` by zero below
+            raise ValueError("Cannot compute a simple average of an empty model pool.")
+        log.info(
+            f"Fusing models using simple average on {len(modelpool.model_names)} models. "
+            f"models: {modelpool.model_names}"
+        )
+        sd: Optional[StateDictType] = None
+        forward_model = None
+        merged_model_names = []
+        for model_name in modelpool.model_names:
+            with self.profile("load model"):
+                model = modelpool.load_model(model_name)
+                merged_model_names.append(model_name)
+                log.info(f"load model of type: {type(model).__name__}")
+            with self.profile("merge weights"):
+                if sd is None:
+                    # The first model seeds the running sum and is kept as the output container
+                    sd = model.state_dict(keep_vars=True)
+                    forward_model = model
+                else:
+                    # Add the current model's state dictionary to the running sum
+                    sd = state_dict_add(sd, model.state_dict(keep_vars=True))
+        with self.profile("merge weights"):
+            # Divide the accumulated sum by the number of models to get the average
+            sd = state_dict_div(sd, len(modelpool.model_names))
+        forward_model.load_state_dict(sd)
+        # print profile report and log the merged models
+        self.print_profile_summary()
+        log.info(f"merged {len(merged_model_names)} models:")
+        for model_name in merged_model_names:
+            log.info(f"  - {model_name}")
+        return forward_model

fusion_bench/method/slerp/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ # flake8: noqa F401
2	+ from .slerp import SlerpMergeAlgorithm

fusion_bench/method/slerp/slerp.py ADDED Viewed

@@ -0,0 +1,101 @@
+import logging
+import torch
+from typing_extensions import override
+from fusion_bench.method import BaseAlgorithm
+from fusion_bench.modelpool import BaseModelPool
+from .slerp_utils import slerp
+log = logging.getLogger(__name__)
+def slerp_on_state_dicts(
+    t,
+    primary_state_dict,
+    secondary_state_dict,
+    *,
+    DOT_THRESHOLD: float = 0.9995,
+    epsilon: float = 1e-8,
+):
+    """
+    Interpolate two state dictionaries entry by entry with spherical linear interpolation (slerp).
+    The keys of `secondary_state_dict` drive the iteration; each key is looked up in `primary_state_dict` as well. Entries whose shapes differ are not interpolated: the primary tensor is kept and a warning is logged.
+    Args:
+        t (float): The interpolation factor, typically between 0 and 1.
+        primary_state_dict (dict): The state dictionary of the primary model.
+        secondary_state_dict (dict): The state dictionary of the secondary model.
+        DOT_THRESHOLD (float, optional): Threshold for considering the vectors as collinear, forwarded to `slerp`. Defaults to 0.9995.
+        epsilon (float, optional): Small value to avoid division by zero, forwarded to `slerp`. Defaults to 1e-8.
+    Returns:
+        dict: The interpolated state dictionary, with one entry per key of `secondary_state_dict`.
+    """
+    merged = {}
+    for key, v1 in secondary_state_dict.items():
+        v0 = primary_state_dict[key]
+        if v0.shape == v1.shape:
+            merged[key] = slerp(t, v0, v1, DOT_THRESHOLD, epsilon)
+            continue
+        # Incompatible shapes: fall back to the primary model's tensor
+        log.warning(
+            f"Skipping key {key} because the shapes of the tensors are different: {v0.shape} vs {v1.shape}. Base model parameters will be used."
+        )
+        merged[key] = v0
+    return merged
+class SlerpMergeAlgorithm(BaseAlgorithm):
+    """
+    Model fusion algorithm that merges exactly two models by spherical linear interpolation (Slerp) of their state dictionaries.
+    """
+    _config_mapping = BaseAlgorithm._config_mapping | {
+        "t": "t",
+        "DOT_THRESHOLD": "DOT_THRESHOLD",
+        "epsilon": "epsilon",
+    }
+    def __init__(self, t: float, DOT_THRESHOLD: float = 0.9995, epsilon: float = 1e-8):
+        """
+        Store the interpolation settings and initialize the base algorithm.
+        Args:
+            t (float): The interpolation parameter, documented to lie in the range [0, 1] (not checked here).
+            DOT_THRESHOLD (float, optional): The threshold for the dot product of the two vectors. Defaults to 0.9995.
+            epsilon (float, optional): The epsilon value for numerical stability. Defaults to 1e-8.
+        """
+        self.t, self.DOT_THRESHOLD, self.epsilon = t, DOT_THRESHOLD, epsilon
+        super().__init__()
+    @override
+    def run(self, modelpool: BaseModelPool):
+        """
+        Merge the two models of the pool with Slerp.
+        The first model in `modelpool.all_model_names` is the primary model; the interpolated state dictionary is loaded into it and it is returned.
+        Args:
+            modelpool (BaseModelPool): The pool of models to fuse. Must contain exactly two models.
+        Returns:
+            nn.Module: The primary model, carrying the interpolated parameters.
+        """
+        model_names = modelpool.all_model_names
+        assert len(model_names) == 2, "Slerp expect exactly 2 models"
+        primary_model = modelpool.load_model(model_names[0])
+        secondary_model = modelpool.load_model(model_names[1])
+        with torch.no_grad():
+            merged_state_dict = slerp_on_state_dicts(
+                self.t,
+                primary_model.state_dict(),
+                secondary_model.state_dict(),
+                DOT_THRESHOLD=self.DOT_THRESHOLD,
+                epsilon=self.epsilon,
+            )
+        primary_model.load_state_dict(merged_state_dict)
+        return primary_model

fusion_bench/method/slerp/slerp_utils.py ADDED Viewed

@@ -0,0 +1,107 @@
+# Modification of: https://github.com/Digitous/LLM-SLERP-Merge/blob/main/slerpmergelm.py
+# LLM HF SLERP Merge
+# Retrofitted from dvschultz's script at https://gist.github.com/dvschultz/3af50c40df002da3b751efab1daddf2c
+# to work with Huggingface Pretrained Language Models [by Chasm (AKA Digitous) and CalderaAI (on HuggingFace)].
+# Original language model linear interpolation methods pioneered by Concedo AKA LostRuins on Github and HF.
+# Idea for SLERP on LLMs sparked by discussion in Automatic1111 Stable Diffusion UI feature request for SLERP
+# model merging for image diffusion domain models.
+import logging
+from typing import TypeVar
+import numpy as np
+import torch
+log = logging.getLogger(__name__)
+T = TypeVar("T", torch.Tensor, np.ndarray, float)
+def lerp(t: float, v0: T, v1: T) -> T:
+    """
+    Linearly interpolate between v0 and v1.
+    Args:
+        t (float): The interpolation factor, typically between 0 and 1. `t = 0` yields v0 and `t = 1` yields v1.
+        v0 (T): The starting value.
+        v1 (T): The ending value.
+    Returns:
+        T: The weighted combination `(1 - t) * v0 + t * v1`.
+    """
+    weight_start = 1 - t
+    weight_end = t
+    return weight_start * v0 + weight_end * v1
+def normalize(v: torch.Tensor, epsilon: float) -> torch.Tensor:
+    """
+    Scale a tensor to unit norm, unless its norm is too small.
+    Args:
+        v (torch.Tensor): The tensor to normalize.
+        epsilon (float): Norms that are not greater than this value are treated as too small to divide by.
+    Returns:
+        torch.Tensor: `v` divided by its norm, or `v` itself (unchanged) when the norm does not exceed `epsilon`.
+    """
+    norm = torch.linalg.norm(v)
+    if norm > epsilon:
+        return v / norm
+    # Dividing by a (near-)zero norm would blow up, so hand the input back untouched
+    log.debug(f"Warning: Norm of v is very small ({norm}). Skipping normalization.")
+    return v
+def slerp(
+    t: float,
+    v0: torch.Tensor,
+    v1: torch.Tensor,
+    DOT_THRESHOLD=0.9995,
+    epsilon=1e-8,
+):
+    """
+    Performs spherical linear interpolation (slerp) between two tensors v0 and v1.
+    The angle between the tensors is measured on their normalized versions (the dot product is summed over all elements), while the interpolation weights are applied to the original, unnormalized tensors. When the tensors are almost collinear, plain linear interpolation is used instead.
+    Args:
+        t (float): The interpolation factor, typically between 0 and 1.
+        v0 (torch.Tensor): The starting tensor.
+        v1 (torch.Tensor): The ending tensor.
+        DOT_THRESHOLD (float, optional): Threshold for considering the vectors as collinear. Defaults to 0.9995.
+        epsilon (float, optional): Small value to avoid division by zero. Defaults to 1e-8.
+    Returns:
+        torch.Tensor: The interpolated tensor on the device of `v0`. If `v0` is neither float32 nor float64, both inputs are converted to float32 and the result is not cast back.
+    """
+    device = v0.device
+    # Convert tensors to a common format, at least float32
+    if v0.dtype != torch.float32 and v0.dtype != torch.float64:
+        v0 = v0.to(dtype=torch.float32, non_blocking=True)
+        v1 = v1.to(dtype=torch.float32, non_blocking=True)
+    # Keep the unnormalized tensors: they are what actually gets interpolated
+    v0_copy = v0.clone()
+    v1_copy = v1.clone()
+    # Normalize the vectors to get the directions and angles
+    v0 = normalize(v0, epsilon)
+    v1 = normalize(v1, epsilon)
+    # Dot product of the normalized tensors, summed over all elements
+    dot = torch.sum(v0 * v1)
+    # If absolute value of dot product is almost 1, vectors are ~collinear, so use lerp
+    if torch.abs(dot) > DOT_THRESHOLD:
+        res = lerp(t, v0_copy, v1_copy)
+    else:
+        # Calculate initial angle between v0 and v1
+        theta_0 = torch.arccos(dot)
+        # Use torch.sin rather than np.sin so the whole computation stays in torch
+        # (np.sin cannot handle CUDA tensors)
+        sin_theta_0 = torch.sin(theta_0)
+        # Angle at timestep t
+        theta_t = theta_0 * t
+        sin_theta_t = torch.sin(theta_t)
+        # Finish the slerp algorithm
+        s0 = torch.sin(theta_0 - theta_t) / sin_theta_0
+        s1 = sin_theta_t / sin_theta_0
+        res = s0 * v0_copy + s1 * v1_copy
+    return res.to(device, non_blocking=True)

fusion_bench/method/smile_upscaling/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+# flake8: noqa F401
+from .singular_projection_merging import SingularProjectionMergingAlgorithm
+from .smile_upscaling import SmileUpscalingAlgorithm

fusion_bench/method/smile_upscaling/singular_projection_merging.py ADDED Viewed

@@ -0,0 +1,198 @@
+import logging
+import os
+import re
+from copy import deepcopy
+from typing import Dict, List, Tuple  # noqa: F401
+import torch
+from torch import Tensor, nn
+from tqdm.auto import tqdm
+from fusion_bench.compat.method import ModelFusionAlgorithm
+from fusion_bench.compat.modelpool import ModelPool, to_modelpool
+from fusion_bench.mixins.simple_profiler import SimpleProfilerMixin
+from fusion_bench.models.utils import get_attr, set_attr
+log = logging.getLogger(__name__)
+def svd(w: Tensor, full_matrices: bool) -> Tuple[Tensor, Tensor, Tensor]:
+    """
+    Compute the Singular Value Decomposition (SVD) of a tensor and return V instead of V^H.
+    Args:
+        w (Tensor): The input tensor to decompose.
+        full_matrices (bool): Whether to compute the full-sized U and V matrices.
+    Returns:
+        Tuple[Tensor, Tensor, Tensor]: `(u, s, v)`, where `v` is the transpose of the `Vh` factor returned by `torch.linalg.svd`.
+    """
+    # The "gesvd" driver is requested explicitly for CUDA tensors only
+    driver = "gesvd" if w.is_cuda else None
+    u, s, vh = torch.linalg.svd(w, full_matrices=full_matrices, driver=driver)
+    return u, s, vh.T
+def _is_name_matched(name: str, extract_names: List[str]) -> bool:
+    """
+    Tell whether a name matches at least one of the given regular expressions.
+    Matching uses `re.match`, i.e. each pattern is anchored at the start of `name`.
+    Args:
+        name (str): The name to check.
+        extract_names (List[str]): Regular expression patterns to try, in order.
+    Returns:
+        bool: True if any pattern matches, False otherwise (including for an empty pattern list).
+    """
+    return any(re.match(pattern, name) for pattern in extract_names)
+def _total_parameters(state) -> int:
+    """
+    Count the elements of a Tensor, or of all Tensors in a (possibly nested) dictionary.
+    Args:
+        state: A Tensor, or a dictionary whose values are Tensors or further such dictionaries.
+    Returns:
+        int: The total number of elements.
+    Raises:
+        ValueError: If `state` (or a nested value) is neither a Tensor nor a dictionary.
+    """
+    if isinstance(state, dict):
+        # Recurse into every value and add up the counts
+        total = 0
+        for value in state.values():
+            total += _total_parameters(value)
+        return total
+    if isinstance(state, Tensor):
+        return state.numel()
+    raise ValueError(f"Unsupported type: {type(state)}")
+class SingularProjectionMergingAlgorithm(ModelFusionAlgorithm, SimpleProfilerMixin):
+    """
+    A model fusion algorithm that projects parameter differences into the SVD subspace of a pretrained model.
+    This algorithm is experimental and aims to investigate the location of task-specific knowledge.
+    The configuration entries read by this class are `model_path`, `device`, `k`, `rank` and `full_matrices`.
+    """
+    @torch.no_grad()
+    def run(self, modelpool: ModelPool) -> nn.Module:
+        """
+        Run the singular projection merging algorithm on the given model pool.
+        If `config.model_path` points to an existing file, the previously saved merged model is loaded from it and returned without recomputation. Otherwise the "_pretrained_" model and the first model in `modelpool.model_names` are merged, and the result is saved to `config.model_path` when that is set.
+        Args:
+            modelpool (ModelPool): The pool of models to merge.
+        Returns:
+            nn.Module: The merged model.
+        """
+        modelpool = to_modelpool(modelpool)
+        model_path = self.config.model_path
+        if model_path is not None and os.path.exists(model_path):
+            log.info(f"loading merged model from {model_path}")
+            # NOTE: torch.load unpickles the file, so `model_path` must come from a trusted source.
+            # Return right away: the cached result would otherwise be discarded and recomputed.
+            return torch.load(model_path)
+        with self.profile("load pretrained model"):
+            pretrained_model = modelpool.load_model("_pretrained_").to(
+                self.config.device
+            )
+        with self.profile("load fine-tuned model"):
+            # Only the first fine-tuned model of the pool takes part in the merge
+            finetuned_model = modelpool.load_model(modelpool.model_names[0]).to(
+                self.config.device
+            )
+        with self.profile("merge model"):
+            model = self.merge(pretrained_model, finetuned_model)
+        if model_path is not None:
+            # `os.makedirs` (there is no `os.path.makedirs`); skip it for a bare file name,
+            # whose dirname is "" and would make `os.makedirs` fail
+            save_dir = os.path.dirname(model_path)
+            if save_dir:
+                os.makedirs(save_dir, exist_ok=True)
+            torch.save(model, model_path)
+        self.print_profile_summary()
+        return model
+    def merge(
+        self,
+        pretrained_model: nn.Module,
+        finetuned_model: nn.Module,
+        in_place: bool = True,
+    ) -> nn.Module:
+        """
+        Merges the pretrained model with the fine-tuned model by projecting parameter differences
+        into the SVD subspace of the pretrained model.
+        Every `nn.Linear` submodule of the result is replaced by the corresponding (updated) linear layer of the pretrained model; all other submodules stay those of the fine-tuned model.
+        Args:
+            pretrained_model (nn.Module): The pretrained model. Its linear layers are updated in place.
+            finetuned_model (nn.Module): The fine-tuned model.
+            in_place (bool): If True, modifies the fine-tuned model in place. Otherwise, creates a copy.
+        Returns:
+            nn.Module: The merged model.
+        """
+        if in_place:
+            model = finetuned_model
+        else:
+            model = deepcopy(finetuned_model)
+        # Materialize the module list first: modules are swapped while iterating
+        for name, module in tqdm(
+            tuple(model.named_modules()),
+            "Projection merging in SVD subspace of pretrained model",
+        ):
+            if isinstance(module, nn.Linear):
+                name_list = name.split(".")
+                set_attr(
+                    model,
+                    name_list,
+                    self.projection_merge_linear(
+                        get_attr(pretrained_model, name_list),
+                        get_attr(finetuned_model, name_list),
+                        k=self.config.k,
+                    ),
+                )
+        return model
+    def projection_merge_linear(
+        self, pretrained_model: nn.Linear, finetuned_model: nn.Linear, k: int
+    ) -> nn.Linear:
+        """
+        Projects the parameter differences of linear layers into the SVD subspace of the pretrained model.
+        With `config.rank == "low"` the first `k` singular directions of the pretrained weight are used, otherwise the remaining ones. The projected difference is added to the pretrained weight, and the bias is taken from the fine-tuned layer.
+        Args:
+            pretrained_model (nn.Linear): The linear layer of the pretrained model. It is modified in place.
+            finetuned_model (nn.Linear): The linear layer of the fine-tuned model.
+            k (int): The number of singular values to keep. If negative, it is determined based on the sum of singular values.
+        Returns:
+            nn.Linear: The pretrained linear layer, updated with the projected parameter differences.
+        """
+        w = pretrained_model.weight
+        w_ft = finetuned_model.weight
+        u, s, v = svd(w, full_matrices=self.config.full_matrices)
+        if k < 0:
+            # find the position where the sum of singular values is larger than 50% of the total sum
+            cumsum = s.cumsum(0)
+            k = (cumsum < cumsum[-1] * 0.5).sum().item() + 1
+        if self.config.rank == "low":
+            u = u[:, :k]
+            s = s[:k]
+            v = v[:, :k]
+        else:
+            u = u[:, k:]
+            s = s[k:]
+            v = v[:, k:]
+        w_diff = w_ft - w
+        # Coordinates of the weight difference in the selected singular subspace
+        w_diff_proj = u.T @ w_diff @ v
+        # Map the projected difference back and add it onto the pretrained weight
+        w.data = w + u @ w_diff_proj @ v.T
+        if pretrained_model.bias is not None:
+            pretrained_model.bias.data = finetuned_model.bias.data
+        return pretrained_model