PyPI - fusion-bench - Versions diffs - 0.2.25__py3-none-any.whl → 0.2.27__py3-none-any.whl - Mend

fusion-bench 0.2.25__py3-none-any.whl → 0.2.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (141) hide show

fusion_bench/method/dop/min_norm_solvers.py ADDED Viewed

@@ -0,0 +1,227 @@
+# This code is from
+# Multi-Task Learning as Multi-Objective Optimization
+# Ozan Sener, Vladlen Koltun
+# Neural Information Processing Systems (NeurIPS) 2018
+# https://github.com/intel-isl/MultiObjectiveOptimization
+from typing import Union
+import numpy as np
+import torch
+def np_sum(x: Union[torch.Tensor, np.ndarray]) -> float:
+    if isinstance(x, torch.Tensor):
+        return x.sum().item()
+    return np.sum(x)
+def to_numpy(x: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
+    if isinstance(x, torch.Tensor):
+        return x.detach().cpu().numpy()
+    return x
+class MinNormSolver:
+    MAX_ITER = 250
+    STOP_CRIT = 1e-5
+    def _min_norm_element_from2(v1v1, v1v2, v2v2):
+        """
+        Analytical solution for min_{c} |cx_1 + (1-c)x_2|_2^2
+        d is the distance (objective) optimzed
+        v1v1 = <x1,x1>
+        v1v2 = <x1,x2>
+        v2v2 = <x2,x2>
+        """
+        if v1v2 >= v1v1:
+            # Case: Fig 1, third column
+            gamma = 0.999
+            cost = v1v1
+            return gamma, cost
+        if v1v2 >= v2v2:
+            # Case: Fig 1, first column
+            gamma = 0.001
+            cost = v2v2
+            return gamma, cost
+        # Case: Fig 1, second column
+        gamma = -1.0 * ((v1v2 - v2v2) / (v1v1 + v2v2 - 2 * v1v2))
+        cost = v2v2 + gamma * (v1v2 - v2v2)
+        return gamma, cost
+    def _min_norm_2d(vecs, dps):
+        R"""
+        Find the minimum norm solution as combination of two points
+        This is correct only in 2D
+        ie. min_c |\sum c_i x_i|_2^2 st. \sum c_i = 1 , 1 >= c_1 >= 0 for all i, c_i + c_j = 1.0 for some i, j
+        """
+        dmin = 1e8
+        for i in range(len(vecs)):
+            for j in range(i + 1, len(vecs)):
+                if (i, j) not in dps:
+                    dps[(i, j)] = 0.0
+                    for k in range(len(vecs[i])):
+                        dps[(i, j)] += (
+                            torch.mul(vecs[i][k], vecs[j][k]).sum().data.cpu()
+                        )
+                    dps[(j, i)] = dps[(i, j)]
+                if (i, i) not in dps:
+                    dps[(i, i)] = 0.0
+                    for k in range(len(vecs[i])):
+                        dps[(i, i)] += (
+                            torch.mul(vecs[i][k], vecs[i][k]).sum().data.cpu()
+                        )
+                if (j, j) not in dps:
+                    dps[(j, j)] = 0.0
+                    for k in range(len(vecs[i])):
+                        dps[(j, j)] += (
+                            torch.mul(vecs[j][k], vecs[j][k]).sum().data.cpu()
+                        )
+                c, d = MinNormSolver._min_norm_element_from2(
+                    dps[(i, i)], dps[(i, j)], dps[(j, j)]
+                )
+                if d < dmin:
+                    dmin = d
+                    sol = [(i, j), c, d]
+        return sol, dps
+    def _projection2simplex(y):
+        R"""
+        Given y, it solves argmin_z |y-z|_2 st \sum z = 1 , 1 >= z_i >= 0 for all i
+        """
+        m = len(y)
+        sorted_y = np.flip(np.sort(y), axis=0)
+        tmpsum = 0.0
+        tmax_f = (np.sum(y) - 1.0) / m
+        for i in range(m - 1):
+            tmpsum += sorted_y[i]
+            tmax = (tmpsum - 1) / (i + 1.0)
+            if tmax > sorted_y[i + 1]:
+                tmax_f = tmax
+                break
+        return np.maximum(y - tmax_f, np.zeros(y.shape))
+    def _next_point(cur_val, grad, n):
+        proj_grad = grad - (np.sum(grad) / n)
+        tm1 = -1.0 * cur_val[proj_grad < 0] / proj_grad[proj_grad < 0]
+        tm2 = (1.0 - cur_val[proj_grad > 0]) / (proj_grad[proj_grad > 0])
+        skippers = np_sum(tm1 < 1e-7) + np_sum(tm2 < 1e-7)
+        t = 1
+        if len(tm1[tm1 > 1e-7]) > 0:
+            t = np.min(to_numpy(tm1[tm1 > 1e-7]))
+        if len(tm2[tm2 > 1e-7]) > 0:
+            t = min(t, np.min(to_numpy(tm2[tm2 > 1e-7])))
+        next_point = proj_grad * t + to_numpy(cur_val)
+        next_point = MinNormSolver._projection2simplex(next_point)
+        return next_point
+    def find_min_norm_element(vecs):
+        R"""
+        Given a list of vectors (vecs), this method finds the minimum norm element in the convex hull
+        as min |u|_2 st. u = \sum c_i vecs[i] and \sum c_i = 1.
+        It is quite geometric, and the main idea is the fact that if d_{ij} = min |u|_2 st u = c x_i + (1-c) x_j; the solution lies in (0, d_{i,j})
+        Hence, we find the best 2-task solution, and then run the projected gradient descent until convergence
+        """
+        # Solution lying at the combination of two points
+        dps = {}
+        init_sol, dps = MinNormSolver._min_norm_2d(vecs, dps)
+        n = len(vecs)
+        sol_vec = np.zeros(n)
+        sol_vec[init_sol[0][0]] = init_sol[1]
+        sol_vec[init_sol[0][1]] = 1 - init_sol[1]
+        if n < 3:
+            # This is optimal for n=2, so return the solution
+            return sol_vec, init_sol[2]
+        iter_count = 0
+        grad_mat = np.zeros((n, n))
+        for i in range(n):
+            for j in range(n):
+                grad_mat[i, j] = dps[(i, j)]
+        while iter_count < MinNormSolver.MAX_ITER:
+            grad_dir = -1.0 * np.dot(grad_mat, sol_vec)
+            new_point = MinNormSolver._next_point(sol_vec, grad_dir, n)
+            # Re-compute the inner products for line search
+            v1v1 = 0.0
+            v1v2 = 0.0
+            v2v2 = 0.0
+            for i in range(n):
+                for j in range(n):
+                    v1v1 += sol_vec[i] * sol_vec[j] * dps[(i, j)]
+                    v1v2 += sol_vec[i] * new_point[j] * dps[(i, j)]
+                    v2v2 += new_point[i] * new_point[j] * dps[(i, j)]
+            nc, nd = MinNormSolver._min_norm_element_from2(v1v1, v1v2, v2v2)
+            new_sol_vec = nc * sol_vec + (1 - nc) * new_point
+            change = new_sol_vec - sol_vec
+            if np_sum(np.abs(change)) < MinNormSolver.STOP_CRIT:
+                return sol_vec, nd
+            sol_vec = new_sol_vec
+    def find_min_norm_element_FW(vecs):
+        R"""
+        Given a list of vectors (vecs), this method finds the minimum norm element in the convex hull
+        as min |u|_2 st. u = \sum c_i vecs[i] and \sum c_i = 1.
+        It is quite geometric, and the main idea is the fact that if d_{ij} = min |u|_2 st u = c x_i + (1-c) x_j; the solution lies in (0, d_{i,j})
+        Hence, we find the best 2-task solution, and then run the Frank Wolfe until convergence
+        """
+        # Solution lying at the combination of two points
+        dps = {}
+        init_sol, dps = MinNormSolver._min_norm_2d(vecs, dps)
+        n = len(vecs)
+        sol_vec = np.zeros(n)
+        sol_vec[init_sol[0][0]] = init_sol[1]
+        sol_vec[init_sol[0][1]] = 1 - init_sol[1]
+        if n < 3:
+            # This is optimal for n=2, so return the solution
+            return sol_vec, init_sol[2]
+        iter_count = 0
+        grad_mat = np.zeros((n, n))
+        for i in range(n):
+            for j in range(n):
+                grad_mat[i, j] = dps[(i, j)]
+        while iter_count < MinNormSolver.MAX_ITER:
+            t_iter = np.argmin(np.dot(grad_mat, sol_vec))
+            v1v1 = np.dot(sol_vec, np.dot(grad_mat, sol_vec))
+            v1v2 = np.dot(sol_vec, grad_mat[:, t_iter])
+            v2v2 = grad_mat[t_iter, t_iter]
+            nc, nd = MinNormSolver._min_norm_element_from2(v1v1, v1v2, v2v2)
+            new_sol_vec = nc * sol_vec
+            new_sol_vec[t_iter] += 1 - nc
+            change = new_sol_vec - sol_vec
+            if np_sum(np.abs(change)) < MinNormSolver.STOP_CRIT:
+                return sol_vec, nd
+            sol_vec = new_sol_vec
+def gradient_normalizers(grads, losses, normalization_type):
+    gn = {}
+    if normalization_type == "l2":
+        for t in grads:
+            gn[t] = np.sqrt(np.sum([gr.pow(2).sum().data.cpu() for gr in grads[t]]))
+    elif normalization_type == "loss":
+        for t in grads:
+            gn[t] = losses[t]
+    elif normalization_type == "loss+":
+        for t in grads:
+            gn[t] = losses[t] * np.sqrt(
+                np.sum([gr.pow(2).sum().data.cpu() for gr in grads[t]])
+            )
+    elif normalization_type == "none":
+        for t in grads:
+            gn[t] = 1.0
+    else:
+        print("ERROR: Invalid Normalization Type")
+    return gn

fusion_bench/method/dop/utils.py ADDED Viewed

@@ -0,0 +1,73 @@
+from typing import Tuple
+import torch
+from torch import Tensor, nn
+from fusion_bench.utils.parameters import state_dict_to_vector
+from fusion_bench.utils.state_dict_arithmetic import state_dict_sub
+def _svd(w: Tensor, full_matrices=True) -> Tuple[Tensor, Tensor, Tensor]:
+    """
+    Perform Singular Value Decomposition (SVD) on a tensor.
+    Args:
+        w (Tensor): The input tensor.
+        full_matrices (bool): Whether to compute the full-sized U and V matrices.
+    Returns:
+        Tuple[Tensor, Tensor, Tensor]: The U, S, and V matrices from SVD.
+    """
+    u, s, vh = torch.linalg.svd(
+        w, full_matrices=full_matrices, driver="gesvd" if w.is_cuda else None
+    )
+    v = vh.T
+    return u, s, v
+def svd(
+    w: Tensor, full_matrices=True, accelerator=None
+) -> Tuple[Tensor, Tensor, Tensor]:
+    """
+    Perform SVD on a tensor, optionally using a specified accelerator.
+    Args:
+        w (Tensor): The input tensor.
+        full_matrices (bool): Whether to compute the full-sized U and V matrices.
+        accelerator (str): The device to perform the computation on.
+    Returns:
+        Tuple[Tensor, Tensor, Tensor]: The U, S, and V matrices from SVD.
+    """
+    if accelerator is None:
+        return _svd(w, full_matrices=full_matrices)
+    original_device = w.device
+    w = w.to(accelerator)
+    u, s, v = _svd(w)
+    return u.to(original_device), s.to(original_device), v.to(original_device)
+def frobenius_inner_product(w1: Tensor, w2: Tensor) -> Tensor:
+    return torch.trace(w1.T @ w2)
+def is_leaf_module(module: nn.Module) -> bool:
+    return len(list(module.children())) == 0
+def get_task_vector_norm(model: nn.Module, pretrained_model: nn.Module) -> Tensor:
+    """
+    Get the vector norm of the task model.
+    Args:
+        model (nn.Module): The task model.
+        pretrained_model (nn.Module): The pretrained model.
+    Returns:
+        Tensor: The vector norm of the task model.
+    """
+    return torch.linalg.norm(
+        state_dict_to_vector(
+            state_dict_sub(model.state_dict(), pretrained_model.state_dict())
+        )
+    )

fusion_bench/method/simple_average.py CHANGED Viewed

@@ -64,10 +64,12 @@ class SimpleAverageAlgorithm(
     SimpleProfilerMixin,
     BaseAlgorithm,
 ):
-    def __init__(self, show_pbar: bool = False, **kwargs):
+    def __init__(self, show_pbar: bool = False, inplace: bool = True, **kwargs):
         """
         Args:
             show_pbar (bool): If True, shows a progress bar during model loading and merging. Default is False.
+            inplace (bool): If True, overwrites the weights of the first model in the model pool.
+                If False, creates a new model for the merged weights. Default is True.
         """
         super().__init__(**kwargs)
@@ -104,12 +106,12 @@ class SimpleAverageAlgorithm(
             with self.profile("merge weights"):
                 if sd is None:
                     # Initialize the state dictionary with the first model's state dictionary
-                    sd = model.state_dict(keep_vars=True)
-                    forward_model = model
+                    sd = model.state_dict()
+                    forward_model = model if self.inplace else deepcopy(model)
                 else:
                     # Add the current model's state dictionary to the accumulated state dictionary
                     sd = state_dict_add(
-                        sd, model.state_dict(keep_vars=True), show_pbar=self.show_pbar
+                        sd, model.state_dict(), show_pbar=self.show_pbar
                     )
         with self.profile("merge weights"):
             # Divide the accumulated state dictionary by the number of models to get the average

fusion_bench/mixins/lightning_fabric.py CHANGED Viewed

@@ -111,6 +111,15 @@ class LightningFabricMixin:
         """
         if self.fabric is not None and len(self.fabric._loggers) > 0:
             log_dir = self.fabric.logger.log_dir
+            # Special handling for SwanLabLogger to get the correct log directory
+            if (
+                log_dir is None
+                and self.fabric.logger.__class__.__name__ == "SwanLabLogger"
+            ):
+                log_dir = self.fabric.logger.save_dir or self.fabric.logger._logdir
+            assert log_dir is not None, "log_dir should not be None"
             if self.fabric.is_global_zero and not os.path.exists(log_dir):
                 os.makedirs(log_dir, exist_ok=True)
             return log_dir

fusion_bench/modelpool/causal_lm/causal_lm.py CHANGED Viewed

@@ -8,6 +8,7 @@ from copy import deepcopy
 from typing import Any, Dict, Optional, TypeAlias, Union, cast  # noqa: F401
 import peft
+from lightning_utilities.core.rank_zero import rank_zero_only
 from omegaconf import DictConfig, OmegaConf, flag_override
 from torch import nn
 from torch.nn.modules import Module
@@ -342,7 +343,7 @@ class CausalLMPool(BaseModelPool):
         )
         # Create and save model card if algorithm_config is provided
-        if algorithm_config is not None:
+        if algorithm_config is not None and rank_zero_only.rank == 0:
             if description is None:
                 description = "Model created using FusionBench."
             model_card_str = create_default_model_card(

fusion-bench 0.2.25__py3-none-any.whl → 0.2.27__py3-none-any.whl

fusion-bench 0.2.25__py3-none-any.whl → 0.2.27__py3-none-any.whl