fusion-bench 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/compat/method/__init__.py +5 -0
- fusion_bench/dataset/fer2013.py +1 -0
- fusion_bench/method/DOGE_TA/DOGE_TA.py +364 -0
- fusion_bench/method/DOGE_TA/__init__.py +2 -0
- fusion_bench/method/DOGE_TA/clip_layer_wise_adamerging.py +46 -0
- fusion_bench/method/DOGE_TA/layer_wise_adamerging.py +250 -0
- fusion_bench/method/__init__.py +22 -0
- fusion_bench/method/classification/continual_clip_finetune.py +1 -1
- fusion_bench/method/concrete_subspace/__init__.py +8 -0
- fusion_bench/method/concrete_subspace/clip_post_defense.py +744 -0
- fusion_bench/method/concrete_subspace/clip_safe_concrete_adamerging.py +832 -0
- fusion_bench/method/isotropic_merging/__init__.py +15 -0
- fusion_bench/method/isotropic_merging/iso.py +114 -0
- fusion_bench/method/isotropic_merging/iso_utils.py +176 -0
- fusion_bench/method/task_singular_vector/TSVM.py +22 -2
- fusion_bench/models/wrappers/layer_wise_fusion_doge_ta.py +531 -0
- {fusion_bench-0.2.9.dist-info → fusion_bench-0.2.11.dist-info}/METADATA +1 -1
- {fusion_bench-0.2.9.dist-info → fusion_bench-0.2.11.dist-info}/RECORD +30 -13
- {fusion_bench-0.2.9.dist-info → fusion_bench-0.2.11.dist-info}/WHEEL +1 -1
- fusion_bench_config/method/DOGE_TA/DOGE_TA.yaml +4 -0
- fusion_bench_config/method/concrete_subspace/clip_post_defense_AWM.yaml +38 -0
- fusion_bench_config/method/concrete_subspace/clip_post_defense_SAU.yaml +41 -0
- fusion_bench_config/method/concrete_subspace/clip_safe_concrete_layer_wise_adamerging.yaml +39 -0
- fusion_bench_config/method/concrete_subspace/clip_safe_concrete_task_arithmetic.yaml +40 -0
- fusion_bench_config/method/isotropic_merging/iso_c.yaml +4 -0
- fusion_bench_config/method/isotropic_merging/iso_cts.yaml +5 -0
- fusion_bench_config/method/task_singular_vector/TaskSingularVectorMerging.yaml +6 -0
- {fusion_bench-0.2.9.dist-info → fusion_bench-0.2.11.dist-info}/LICENSE +0 -0
- {fusion_bench-0.2.9.dist-info → fusion_bench-0.2.11.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.9.dist-info → fusion_bench-0.2.11.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This module contains the implementation of the Isotropic Merging in Common Subspace (ISO-C) algorithm and Isotropic Merging in Common and Task-Specific Subspaces (Iso-CTS) algorithm.
|
|
3
|
+
Modified from the original implementation: https://github.com/danielm1405/iso-merging
|
|
4
|
+
|
|
5
|
+
Reference:
|
|
6
|
+
- Daniel Marczak, et al. No Task Left Behind: Isotropic Model Merging with Common and Task-Specific Subspaces. 2025.
|
|
7
|
+
https://arxiv.org/abs/2502.04959
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from .iso import (
|
|
11
|
+
ISO_C_Merge,
|
|
12
|
+
ISO_CTS_Merge,
|
|
13
|
+
IsotropicMergingInCommonAndTaskSubspace,
|
|
14
|
+
IsotropicMergingInCommonSubspace,
|
|
15
|
+
)
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
import torch
|
|
4
|
+
|
|
5
|
+
from fusion_bench import BaseAlgorithm, BaseModelPool
|
|
6
|
+
from fusion_bench.mixins import LightningFabricMixin
|
|
7
|
+
from fusion_bench.utils.state_dict_arithmetic import (
|
|
8
|
+
state_dict_add,
|
|
9
|
+
state_dict_mul,
|
|
10
|
+
state_dict_sub,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
from .iso_utils import check_parameterNamesMatch, iso_c, iso_cts
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class IsotropicMergingInCommonSubspace(BaseAlgorithm, LightningFabricMixin):
    """Isotropic Merging in Common Subspace (Iso-C).

    Builds one task vector (finetuned - pretrained) per model in the pool,
    merges them with :func:`iso_c` (which flattens the singular-value
    spectrum of 2D weight deltas), and adds the scaled result back onto
    the pretrained checkpoint.
    """

    def __init__(
        self,
        scaling_factor: float,
        exclude_keys: List[str] = None,
    ):
        # scaling_factor: multiplier applied to the merged task vector.
        # exclude_keys: parameter names passed through to `iso_c` that
        # should not be spectrally flattened (None means no exclusions).
        self.scaling_factor = scaling_factor
        self.exclude_keys = exclude_keys
        super().__init__()

    def run(self, modelpool: BaseModelPool):
        """Merge every model in ``modelpool`` and return the merged model."""
        with torch.no_grad():
            base_model = modelpool.load_pretrained_model()
            base_state = base_model.state_dict()

            # One task vector (finetuned - pretrained) per model in the pool.
            task_vectors = []
            for name in modelpool.model_names:
                finetuned = modelpool.load_model(name)
                task_vectors.append(
                    state_dict_sub(finetuned.state_dict(), base_state)
                )
                del finetuned  # free memory
            check_parameterNamesMatch(task_vectors)

            # Compute the merged task vector in the common subspace.
            merged_tv = iso_c(
                task_vectors,
                accelerator=self.fabric.device,
                exclude_keys=self.exclude_keys,
            )

            # merged_parameters = pretrained_parameters + scaling_factor * merged_task_vector
            base_model.load_state_dict(
                state_dict_add(
                    base_state,
                    state_dict_mul(merged_tv, self.scaling_factor),
                )
            )

        return base_model
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class IsotropicMergingInCommonAndTaskSubspace(BaseAlgorithm, LightningFabricMixin):
    """Isotropic Merging in Common and Task-Specific Subspaces (Iso-CTS).

    Like Iso-C, but the 2D weight deltas are merged by :func:`iso_cts`,
    which combines a subspace common to all tasks with an equally sized
    task-specific subspace per model.
    """

    def __init__(
        self,
        scaling_factor: float,
        common_space_fraction: float,
        exclude_keys: List[str] = None,
    ):
        # common_space_fraction: fraction of the spectrum reserved for the
        # subspace shared by all tasks (forwarded to `iso_cts`).
        # scaling_factor: multiplier applied to the merged task vector.
        # exclude_keys: parameter names to exclude from the subspace
        # treatment (None means no exclusions).
        self.common_space_fraction = common_space_fraction
        self.scaling_factor = scaling_factor
        self.exclude_keys = exclude_keys
        super().__init__()

    def run(self, modelpool: BaseModelPool):
        """Merge every model in ``modelpool`` and return the merged model."""
        with torch.no_grad():
            base_model = modelpool.load_pretrained_model()
            base_state = base_model.state_dict()

            # One task vector (finetuned - pretrained) per model in the pool.
            task_vectors = []
            for name in modelpool.model_names:
                finetuned = modelpool.load_model(name)
                task_vectors.append(
                    state_dict_sub(finetuned.state_dict(), base_state)
                )
                del finetuned  # free memory
            check_parameterNamesMatch(task_vectors)

            # Compute the merged task vector in the common + task subspaces.
            merged_tv = iso_cts(
                task_vectors,
                common_space_fraction=self.common_space_fraction,
                accelerator=self.fabric.device,
                exclude_keys=self.exclude_keys,
            )

            # merged_parameters = pretrained_parameters + scaling_factor * merged_task_vector
            base_model.load_state_dict(
                state_dict_add(
                    base_state,
                    state_dict_mul(merged_tv, self.scaling_factor),
                )
            )

        return base_model
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# Short public aliases for the two merging algorithms above.
ISO_C_Merge = IsotropicMergingInCommonSubspace  # alias
ISO_CTS_Merge = IsotropicMergingInCommonAndTaskSubspace  # alias
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
import math
|
|
2
|
+
from typing import List
|
|
3
|
+
|
|
4
|
+
import torch
|
|
5
|
+
|
|
6
|
+
from fusion_bench.utils import timeit_context
|
|
7
|
+
from fusion_bench.utils.type import StateDictType
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def iso_c(
    task_vectors: List[StateDictType],
    accelerator="cuda",
    exclude_keys: List[str] = None,
) -> StateDictType:
    """Isotropic Merging in Common Subspace (Iso-C).

    For every parameter key, the per-task deltas are combined across models.
    2D weight matrices (unless listed in ``exclude_keys``) are summed and
    their spectrum is flattened: the singular values of the sum are replaced
    by their mean, making the merged update isotropic. All other parameters
    are plainly averaged.

    Args:
        task_vectors: one state dict of deltas (finetuned - pretrained) per
            model; all dicts must share the same keys.
        accelerator: device on which the merge/SVD is computed.
        exclude_keys: parameter names to average instead of isotropize
            (``None`` means no exclusions).

    Returns:
        A state dict holding the merged task vector, with each tensor moved
        back to its original device.
    """
    exclude_keys = [] if exclude_keys is None else exclude_keys

    with torch.no_grad(), timeit_context("ISO-C Merging"):
        new_vector = {}
        for key in task_vectors[0]:
            print(f"Merging {key}...")
            original_device = task_vectors[0][key].device
            tvs = [
                task_vector[key].to(device=accelerator, non_blocking=True)
                for task_vector in task_vectors
            ]
            num_tvs = len(tvs)
            summed = sum(tvs)
            del tvs  # free memory

            if summed.ndim == 2 and key not in exclude_keys:
                # 2D matrix: keep the singular vectors of the summed delta
                # but replace every singular value by the mean, flattening
                # the spectrum. (The previous implementation averaged first
                # and then multiplied back by num_tvs before the SVD; using
                # the sum directly avoids that redundant divide/multiply
                # round-trip.)
                U, S, V = torch.linalg.svd(summed, full_matrices=False)
                S_mean = torch.ones_like(S) * S.mean()
                new_vector[key] = torch.linalg.multi_dot(
                    (
                        U,
                        torch.diag(S_mean),
                        V,
                    )
                )
            else:
                # Non-matrix parameters (biases, norms, ...) are averaged.
                new_vector[key] = summed / num_tvs

            new_vector[key] = new_vector[key].to(
                device=original_device, non_blocking=True
            )
    return new_vector
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@torch.no_grad()
def iso_cts(
    task_vectors: List[StateDictType],
    common_space_fraction: float,
    accelerator: str = "cuda",
    exclude_keys: List[str] = None,
):
    """Isotropic Merging in Common and Task-Specific Subspaces (Iso-CTS).

    Non-matrix parameters (and keys in ``exclude_keys``) are averaged
    across tasks. For each 2D weight matrix, a "common" subspace is taken
    from the top singular directions of the summed task vectors; each task
    then contributes an equal slice of task-specific directions orthogonal
    to that common subspace. The stitched bases are re-orthogonalized and
    the singular values are replaced by their mean (isotropic spectrum).

    Args:
        task_vectors: one state dict of deltas (finetuned - pretrained) per
            model; all dicts must share the same keys.
        common_space_fraction: requested fraction of min(shape) reserved
            for the common subspace (adjusted so the remainder divides
            evenly among tasks).
        accelerator: device on which the SVDs are computed.
        exclude_keys: parameter names to average instead of merging in
            subspaces (``None`` means no exclusions).

    Returns:
        A state dict holding the merged task vector, with each tensor moved
        back to its original device.
    """
    exclude_keys = [] if exclude_keys is None else exclude_keys
    new_vector = {}

    print("ISO-CTS Merging")
    for key in task_vectors[0]:
        shape_ = task_vectors[0][key].shape
        original_device = task_vectors[0][key].device
        is_2d_matrix = (len(shape_) == 2) and (key not in exclude_keys)
        if not is_2d_matrix:
            print(f"Combining by avg {key}...")
            # Running (incremental) mean across tasks: after step i the
            # accumulator holds the average of the first i+1 vectors.
            for i, task_vector in enumerate(task_vectors):
                vec = task_vector[key].to(device=accelerator, non_blocking=True)
                if i == 0:
                    new_vector[key] = vec.clone()
                else:
                    new_vector[key] += (vec - new_vector[key]) / (i + 1)

            # move the new vector to the original device
            new_vector[key] = new_vector[key].to(
                device=original_device, non_blocking=True
            )
            continue

        print(f"Computing common space using sum for {key}...")
        combined_w = sum(
            [
                task_vector[key].to(device=accelerator, non_blocking=True)
                for task_vector in task_vectors
            ]
        )

        ### Calculate the common space size (making sure that task specific space is equally divisible) ###
        # After this adjustment: common_space_index_s +
        # len(task_vectors) * n_dims_per_task == min(shape_), so the
        # stitched bases below exactly fill the available rank.
        common_space_index_s = int(min(shape_) * common_space_fraction)
        _task_specific_total_space_index_s = round(
            (min(shape_) - common_space_index_s) / len(task_vectors)
        ) * len(task_vectors)
        common_space_index_s = min(shape_) - _task_specific_total_space_index_s

        # Top singular directions of the summed delta form the common space.
        u, s, v = torch.linalg.svd(combined_w, full_matrices=False)
        common_space_u = u[:, :common_space_index_s]
        common_space_s = s[:common_space_index_s]
        common_space_v = v[:common_space_index_s, :]
        ###################################################################

        ### Calculate task specific space ###
        n_dims_per_task = int((min(shape_) - common_space_index_s) / len(task_vectors))
        for i, task_vector in enumerate(task_vectors):
            w = task_vector[key].to(device=accelerator)

            # calculate the projection onto task specific space to remove the common space
            w_ts = w - common_space_u @ common_space_u.T @ w
            u_ts, s_ts, v_ts = torch.linalg.svd(w_ts, full_matrices=False)

            # Allocate the stitched bases once, shaped like a full SVD of w.
            if i == 0:
                combined_space_u = torch.zeros_like(u_ts, device=accelerator)
                combined_space_s = torch.zeros_like(s_ts, device=accelerator)
                combined_space_v = torch.zeros_like(v_ts, device=accelerator)

            # Each task fills its own slice of n_dims_per_task columns/rows
            # with its top task-specific singular directions.
            combined_space_u[:, i * n_dims_per_task : (i + 1) * n_dims_per_task] = u_ts[
                :, :n_dims_per_task
            ]
            combined_space_s[i * n_dims_per_task : (i + 1) * n_dims_per_task] = s_ts[
                :n_dims_per_task
            ]
            combined_space_v[i * n_dims_per_task : (i + 1) * n_dims_per_task, :] = v_ts[
                :n_dims_per_task, :
            ]
        ###################################################################

        # Append the common-space directions after the task-specific slices.
        combined_space_u[
            :,
            len(task_vectors) * n_dims_per_task : len(task_vectors) * n_dims_per_task
            + common_space_index_s,
        ] = common_space_u
        combined_space_s[
            len(task_vectors) * n_dims_per_task : len(task_vectors) * n_dims_per_task
            + common_space_index_s
        ] = common_space_s
        combined_space_v[
            len(task_vectors) * n_dims_per_task : len(task_vectors) * n_dims_per_task
            + common_space_index_s,
            :,
        ] = common_space_v

        ### Orthogonalize combined_space_u and combined_space_v ###
        # The stitched bases are not orthonormal; project each onto the
        # nearest orthogonal matrix via U @ V^T of its own SVD.
        u_combined_space_u, s_combined_space_u, v_combined_space_u = torch.linalg.svd(
            combined_space_u, full_matrices=False
        )
        u_combined_space_v, s_combined_space_v, v_combined_space_v = torch.linalg.svd(
            combined_space_v, full_matrices=False
        )
        combined_space_u = u_combined_space_u @ v_combined_space_u
        combined_space_v = u_combined_space_v @ v_combined_space_v
        ###################################################################

        # Isotropic spectrum: replace all singular values by their mean.
        combined_space_s = torch.ones_like(combined_space_s) * combined_space_s.mean()

        new_vector[key] = torch.linalg.multi_dot(
            (
                combined_space_u,
                torch.diag(combined_space_s),
                combined_space_v,
            )
        )
        new_vector[key] = new_vector[key].to(device=original_device, non_blocking=True)

    return new_vector
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def check_parameterNamesMatch(checkpoints):
    """Verify that every checkpoint exposes the same set of parameter names.

    Args:
        checkpoints: a non-empty sequence of state dicts.

    Raises:
        ValueError: if any checkpoint's keys differ from the first one's;
            the message lists the symmetric difference of the key sets.
    """
    reference_names = set(checkpoints[0].keys())

    # Iterating the empty slice when there is only one checkpoint makes an
    # explicit length guard unnecessary.
    for other in checkpoints[1:]:
        other_names = set(other.keys())
        if other_names == reference_names:
            continue
        mismatch = reference_names.symmetric_difference(other_names)
        raise ValueError(
            "Differing parameter names in models. "
            f"The different parameters are {mismatch}"
        )
|
|
@@ -9,15 +9,20 @@ fusion_bench \
|
|
|
9
9
|
```
|
|
10
10
|
"""
|
|
11
11
|
|
|
12
|
-
from typing import List, Optional
|
|
12
|
+
from typing import Iterable, List, Optional, Union
|
|
13
13
|
|
|
14
14
|
import torch
|
|
15
|
+
from omegaconf import ListConfig
|
|
15
16
|
from torch import Tensor, nn
|
|
16
17
|
|
|
17
18
|
from fusion_bench import BaseAlgorithm
|
|
18
19
|
from fusion_bench.mixins import LightningFabricMixin
|
|
19
20
|
from fusion_bench.utils import timeit_context
|
|
20
|
-
from fusion_bench.utils.state_dict_arithmetic import
|
|
21
|
+
from fusion_bench.utils.state_dict_arithmetic import (
|
|
22
|
+
state_dict_add,
|
|
23
|
+
state_dict_mul,
|
|
24
|
+
state_dict_sub,
|
|
25
|
+
)
|
|
21
26
|
from fusion_bench.utils.type import StateDictType
|
|
22
27
|
|
|
23
28
|
from .utils import (
|
|
@@ -33,9 +38,11 @@ class TaskSingularVectorMerging(BaseAlgorithm, LightningFabricMixin):
|
|
|
33
38
|
|
|
34
39
|
def __init__(
    self,
    alpha: Optional[Union[float, Iterable[float]]] = None,
    remove_keys: Optional[List[str]] = None,
    **kwargs,
):
    """Initialize the Task Singular Vector merging algorithm.

    Args:
        alpha: scaling for task vectors. A single float scales the final
            merged task vector; an iterable of floats (one per model)
            scales each task vector individually before merging. ``None``
            applies no scaling.
        remove_keys: parameter names to exclude; defaults to an empty
            list. (Exact use is delegated to the merging utilities —
            NOTE(review): confirm against `TSVM_utils`.)
        **kwargs: forwarded to the base algorithm constructor.
    """
    self.alpha = alpha
    self.remove_keys = remove_keys if remove_keys is not None else []
    super().__init__(**kwargs)
|
|
41
48
|
|
|
@@ -50,6 +57,14 @@ class TaskSingularVectorMerging(BaseAlgorithm, LightningFabricMixin):
|
|
|
50
57
|
|
|
51
58
|
with timeit_context("Flattening out Checkpoints"):
|
|
52
59
|
task_vectors = [state_dict_sub(check, ptm_check) for check in ft_checks]
|
|
60
|
+
if isinstance(self.alpha, Iterable):
|
|
61
|
+
assert len(self.alpha) == len(
|
|
62
|
+
task_vectors
|
|
63
|
+
), "Alpha and task vectors must have the same length"
|
|
64
|
+
task_vectors = [
|
|
65
|
+
state_dict_mul(state_dict=tv, scalar=alpha)
|
|
66
|
+
for alpha, tv in zip(self.alpha, task_vectors)
|
|
67
|
+
]
|
|
53
68
|
|
|
54
69
|
new_merged_tv = TSVM_utils.compute_and_sum_svd_mem_reduction(
|
|
55
70
|
task_vectors,
|
|
@@ -57,6 +72,11 @@ class TaskSingularVectorMerging(BaseAlgorithm, LightningFabricMixin):
|
|
|
57
72
|
accelerator=self.fabric.device,
|
|
58
73
|
)
|
|
59
74
|
|
|
75
|
+
# If alpha is a float, we need to scale the new merged task vector by alpha
|
|
76
|
+
if self.alpha is not None and isinstance(self.alpha, float):
|
|
77
|
+
print(f"Scaling new merged task vector by alpha: {self.alpha}")
|
|
78
|
+
new_merged_tv = state_dict_mul(state_dict=new_merged_tv, scalar=self.alpha)
|
|
79
|
+
|
|
60
80
|
pretrained_model.load_state_dict(
|
|
61
81
|
state_dict_add(new_merged_tv, pretrained_model.state_dict())
|
|
62
82
|
)
|