fusion-bench 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (264)
  1. fusion_bench/compat/method/__init__.py +1 -0
  2. fusion_bench/compat/method/base_algorithm.py +7 -1
  3. fusion_bench/compat/modelpool/__init__.py +1 -1
  4. fusion_bench/compat/taskpool/__init__.py +1 -1
  5. fusion_bench/dataset/arc_agi/arc.py +5 -0
  6. fusion_bench/dataset/arc_agi/preprocess.py +1 -1
  7. fusion_bench/dataset/clip_dataset.py +3 -0
  8. fusion_bench/dataset/fer2013.py +12 -0
  9. fusion_bench/dataset/llama/__init__.py +1 -0
  10. fusion_bench/dataset/llama/alpaca.py +93 -3
  11. fusion_bench/dataset/llama/collate.py +62 -2
  12. fusion_bench/dataset/llama/metamathqa.py +50 -0
  13. fusion_bench/dataset/llama/preference_700k.py +70 -0
  14. fusion_bench/dataset/llama/stanford_shp.py +90 -0
  15. fusion_bench/dataset/llama/ultrachat.py +58 -0
  16. fusion_bench/dataset/llama/utils/__init__.py +0 -0
  17. fusion_bench/method/__init__.py +3 -1
  18. fusion_bench/method/adamerging/layer_wise_adamerging.py +11 -4
  19. fusion_bench/method/adamerging/min_norm_solvers.py +4 -4
  20. fusion_bench/method/classification/clip_finetune.py +10 -13
  21. fusion_bench/method/linear/expo.py +39 -0
  22. fusion_bench/method/lm_finetune/__init__.py +1 -0
  23. fusion_bench/method/lm_finetune/bradley_terry_rm.py +432 -0
  24. fusion_bench/method/lm_finetune/fullfinetune_sft.py +90 -160
  25. fusion_bench/method/lm_finetune/peftfinetune_sft.py +49 -139
  26. fusion_bench/method/pruning/llama_magnitude_prune.py +2 -2
  27. fusion_bench/method/pruning/llama_random_prune.py +2 -2
  28. fusion_bench/method/surgery/__init__.py +1 -0
  29. fusion_bench/method/surgery/clip_layer_wise_adamerging_surgery.py +157 -0
  30. fusion_bench/method/tall_mask/__init__.py +0 -0
  31. fusion_bench/method/tall_mask/utils.py +234 -0
  32. fusion_bench/method/task_singular_vector/TSVC.py +16 -0
  33. fusion_bench/method/task_singular_vector/TSVM.py +63 -0
  34. fusion_bench/method/task_singular_vector/__init__.py +9 -0
  35. fusion_bench/method/task_singular_vector/utils/TSVC_utils.py +50 -0
  36. fusion_bench/method/task_singular_vector/utils/TSVM_utils.py +642 -0
  37. fusion_bench/method/task_singular_vector/utils/__init__.py +7 -0
  38. fusion_bench/method/ties_merging/ties_merging_utils.py +7 -2
  39. fusion_bench/mixins/__init__.py +2 -0
  40. fusion_bench/mixins/clip_classification.py +64 -11
  41. fusion_bench/mixins/fabric_training.py +320 -0
  42. fusion_bench/mixins/lightning_fabric.py +12 -1
  43. fusion_bench/modelpool/__init__.py +2 -0
  44. fusion_bench/modelpool/base_pool.py +0 -1
  45. fusion_bench/modelpool/causal_lm/__init__.py +1 -1
  46. fusion_bench/modelpool/causal_lm/causal_lm.py +21 -22
  47. fusion_bench/modelpool/clip_vision/modelpool.py +92 -8
  48. fusion_bench/modelpool/seq_classification_lm/__init__.py +2 -0
  49. fusion_bench/modelpool/seq_classification_lm/reward_model.py +15 -0
  50. fusion_bench/modelpool/seq_classification_lm/seq_classification_lm.py +98 -0
  51. fusion_bench/models/chat_templates/__init__.py +1 -0
  52. fusion_bench/models/chat_templates/llama_3_Instruct.py +1 -0
  53. fusion_bench/models/chat_templates/load_tokenizer.py +43 -0
  54. fusion_bench/models/hf_clip.py +50 -9
  55. fusion_bench/models/surgery/__init__.py +1 -0
  56. fusion_bench/models/surgery/surgerymodelwrapper.py +158 -0
  57. fusion_bench/models/utils.py +8 -0
  58. fusion_bench/models/wrappers/layer_wise_fusion.py +14 -5
  59. fusion_bench/models/wrappers/task_wise_fusion.py +5 -5
  60. fusion_bench/optim/__init__.py +2 -0
  61. fusion_bench/optim/exception.py +47 -0
  62. fusion_bench/optim/lr_scheduler/__init__.py +1 -0
  63. fusion_bench/optim/lr_scheduler/linear_warmup.py +222 -0
  64. fusion_bench/optim/lr_scheduler/utils/__init__.py +1 -0
  65. fusion_bench/optim/lr_scheduler/utils/visualization.py +119 -0
  66. fusion_bench/optim/mezo.py +0 -2
  67. fusion_bench/programs/fabric_fusion_program.py +12 -5
  68. fusion_bench/taskpool/clip_vision/taskpool.py +43 -6
  69. fusion_bench/taskpool/llama/reward_model.py +157 -0
  70. fusion_bench/taskpool/nyuv2_taskpool.py +2 -0
  71. fusion_bench/tasks/clip_classification/__init__.py +13 -45
  72. fusion_bench/tasks/clip_classification/clip_dataset.py +1 -16
  73. fusion_bench/tasks/clip_classification/cub_200_2011.py +208 -0
  74. fusion_bench/tasks/clip_classification/emnist_letters.py +31 -0
  75. fusion_bench/tasks/clip_classification/emnist_mnist.py +5 -0
  76. fusion_bench/tasks/clip_classification/fashion_mnist.py +18 -0
  77. fusion_bench/tasks/clip_classification/fer2013.py +18 -0
  78. fusion_bench/tasks/clip_classification/food101.py +105 -0
  79. fusion_bench/tasks/clip_classification/kmnist.py +17 -0
  80. fusion_bench/tasks/clip_classification/mongo_leaf_disease.py +19 -0
  81. fusion_bench/tasks/clip_classification/pcam.py +5 -0
  82. fusion_bench/utils/hydra_utils.py +22 -0
  83. fusion_bench/utils/parameters.py +12 -3
  84. fusion_bench/utils/plot/__init__.py +0 -0
  85. fusion_bench/utils/plot/token.py +52 -0
  86. fusion_bench/utils/plot/token_notebook.py +127 -0
  87. fusion_bench/utils/type.py +14 -3
  88. {fusion_bench-0.2.6.dist-info → fusion_bench-0.2.8.dist-info}/METADATA +1 -1
  89. {fusion_bench-0.2.6.dist-info → fusion_bench-0.2.8.dist-info}/RECORD +263 -90
  90. fusion_bench_config/clip-vit-base-patch32_robustness_corrupted.yaml +1 -1
  91. fusion_bench_config/dataset/image_classification/README.md +6 -0
  92. fusion_bench_config/dataset/image_classification/test/TALL14.yaml +20 -0
  93. fusion_bench_config/dataset/image_classification/test/TALL20.yaml +28 -0
  94. fusion_bench_config/dataset/image_classification/test/cifar10.yaml +1 -1
  95. fusion_bench_config/dataset/image_classification/test/cifar100.yaml +1 -1
  96. fusion_bench_config/dataset/image_classification/test/cub-200-2011.yaml +4 -0
  97. fusion_bench_config/dataset/image_classification/test/emnist_letters.yaml +5 -0
  98. fusion_bench_config/dataset/image_classification/test/emnist_mnist.yaml +4 -0
  99. fusion_bench_config/dataset/image_classification/test/fashion_mnist.yaml +4 -0
  100. fusion_bench_config/dataset/image_classification/test/fer2013.yaml +3 -0
  101. fusion_bench_config/dataset/image_classification/test/food101.yaml +4 -0
  102. fusion_bench_config/dataset/image_classification/test/kmnist.yaml +4 -0
  103. fusion_bench_config/dataset/image_classification/test/mango-leaf-disease.yaml +4 -0
  104. fusion_bench_config/dataset/image_classification/test/oxford-iiit-pet.yaml +4 -0
  105. fusion_bench_config/dataset/image_classification/test/oxford_flowers102.yaml +4 -0
  106. fusion_bench_config/dataset/image_classification/test/pcam.yaml +4 -0
  107. fusion_bench_config/dataset/image_classification/test/rendered-sst2.yaml +4 -0
  108. fusion_bench_config/dataset/image_classification/test/stl10.yaml +4 -0
  109. fusion_bench_config/dataset/image_classification/train/TALL14.yaml +20 -0
  110. fusion_bench_config/dataset/image_classification/train/TALL20.yaml +28 -0
  111. fusion_bench_config/dataset/image_classification/train/cifar10.yaml +1 -1
  112. fusion_bench_config/dataset/image_classification/train/cifar100.yaml +1 -1
  113. fusion_bench_config/dataset/image_classification/train/cub-200-2011.yaml +4 -0
  114. fusion_bench_config/dataset/image_classification/train/emnist_letters.yaml +4 -0
  115. fusion_bench_config/dataset/image_classification/train/emnist_mnist.yaml +4 -0
  116. fusion_bench_config/dataset/image_classification/train/fashion_mnist.yaml +4 -0
  117. fusion_bench_config/dataset/image_classification/train/fer2013.yaml +3 -0
  118. fusion_bench_config/dataset/image_classification/train/food101.yaml +4 -0
  119. fusion_bench_config/dataset/image_classification/train/kmnist.yaml +4 -0
  120. fusion_bench_config/dataset/image_classification/train/mango-leaf-disease.yaml +4 -0
  121. fusion_bench_config/dataset/image_classification/train/oxford-iiit-pet.yaml +4 -0
  122. fusion_bench_config/dataset/image_classification/train/oxford_flowers102.yaml +4 -0
  123. fusion_bench_config/dataset/image_classification/train/pcam.yaml +4 -0
  124. fusion_bench_config/dataset/image_classification/train/rendered-sst2.yaml +4 -0
  125. fusion_bench_config/dataset/image_classification/train/stl10.yaml +4 -0
  126. fusion_bench_config/dataset/llm_sft/alpaca_cleaned.yaml +6 -0
  127. fusion_bench_config/dataset/llm_sft/ultrachat_200k.yaml +3 -0
  128. fusion_bench_config/fabric/llama_peft_fsdp.yaml +16 -0
  129. fusion_bench_config/fabric/loggers/wandb_logger.yaml +2 -0
  130. fusion_bench_config/fabric/strategy/deepspeed.yaml +10 -0
  131. fusion_bench_config/fabric/strategy/llama_peft_fsdp.yaml +9 -0
  132. fusion_bench_config/fabric_model_fusion.yaml +1 -1
  133. fusion_bench_config/llama_full_finetune.yaml +19 -0
  134. fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml +47 -0
  135. fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml +11 -4
  136. fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml +4 -2
  137. fusion_bench_config/method/surgery/adamerging_surgery.yaml +27 -0
  138. fusion_bench_config/method/task_singular_vector/TaskSingularVectorMerging.yaml +2 -0
  139. fusion_bench_config/model/clip-vit/README.md +38 -0
  140. fusion_bench_config/model/clip-vit/clip-vit-base-patch16.yaml +1 -3
  141. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_TALL14.yaml +22 -0
  142. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_TALL20.yaml +29 -0
  143. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_cifar10.yaml +1 -0
  144. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_cifar100.yaml +1 -0
  145. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_dtd.yaml +1 -3
  146. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_emnist_letters.yaml +1 -0
  147. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_eurosat.yaml +1 -3
  148. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_fashion_mnist.yaml +1 -0
  149. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_fer2013.yaml +1 -0
  150. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_food101.yaml +1 -0
  151. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_gtsrb.yaml +1 -3
  152. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_kmnist.yaml +1 -0
  153. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_mnist.yaml +1 -3
  154. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_oxford-iiit-pet.yaml +1 -0
  155. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_oxford_flowers102.yaml +1 -0
  156. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_pcam.yaml +1 -0
  157. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_rendered-sst2.yaml +1 -0
  158. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_resisc45.yaml +1 -3
  159. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_stanford-cars.yaml +1 -3
  160. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_stl10.yaml +1 -0
  161. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_sun397.yaml +1 -3
  162. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_svhn.yaml +1 -3
  163. fusion_bench_config/model/clip-vit/clip-vit-base-patch32.yaml +1 -3
  164. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL14.yaml +22 -0
  165. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL20.yaml +29 -0
  166. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_cifar10.yaml +1 -0
  167. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_cifar100.yaml +1 -0
  168. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_dtd.yaml +1 -3
  169. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_eight_tasks.yaml +1 -0
  170. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_emnist_letters.yaml +1 -0
  171. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_eurosat.yaml +1 -3
  172. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_fashion_mnist.yaml +1 -0
  173. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_fer2013.yaml +1 -0
  174. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_food101.yaml +1 -0
  175. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_gtsrb.yaml +1 -3
  176. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_kmnist.yaml +1 -0
  177. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_mnist.yaml +1 -3
  178. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_oxford-iiit-pet.yaml +1 -0
  179. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_oxford_flowers102.yaml +1 -0
  180. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_pcam.yaml +1 -0
  181. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_rendered-sst2.yaml +1 -0
  182. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_resisc45.yaml +1 -3
  183. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_stanford-cars.yaml +1 -3
  184. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_stl10.yaml +1 -0
  185. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_sun397.yaml +1 -3
  186. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_svhn.yaml +1 -3
  187. fusion_bench_config/model/clip-vit/clip-vit-large-patch14.yaml +1 -3
  188. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_TALL14.yaml +22 -0
  189. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_TALL20.yaml +29 -0
  190. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_cifar10.yaml +1 -0
  191. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_cifar100.yaml +1 -0
  192. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_dtd.yaml +1 -3
  193. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_emnist_letters.yaml +1 -0
  194. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_eurosat.yaml +1 -3
  195. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_fashion_mnist.yaml +1 -0
  196. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_fer2013.yaml +1 -0
  197. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_food101.yaml +1 -0
  198. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_gtsrb.yaml +1 -3
  199. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_kmnist.yaml +1 -0
  200. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_mnist.yaml +1 -3
  201. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford-iiit-pet.yaml +1 -0
  202. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford_flowers102.yaml +1 -0
  203. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_pcam.yaml +1 -0
  204. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_rendered-sst2.yaml +1 -0
  205. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_resisc45.yaml +1 -3
  206. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stanford-cars.yaml +1 -3
  207. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stl10.yaml +1 -0
  208. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_sun397.yaml +1 -3
  209. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_svhn.yaml +1 -3
  210. fusion_bench_config/model/clip-vit/download_TALL20_models.sh +6 -0
  211. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8_model_only.yaml +6 -0
  212. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL14.yaml +11 -0
  213. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL14_model_only.yaml +9 -0
  214. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL20.yaml +11 -0
  215. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL20_model_only.yaml +9 -0
  216. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_individual.yaml +15 -3
  217. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL14.yaml +8 -0
  218. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL14_model_only.yaml +6 -0
  219. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL20.yaml +8 -0
  220. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL20_model_only.yaml +6 -0
  221. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_individual.yaml +9 -3
  222. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_task_projection.yaml +15 -0
  223. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL14.yaml +11 -0
  224. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL14_model_only.yaml +9 -0
  225. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL20.yaml +11 -0
  226. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL20_model_only.yaml +9 -0
  227. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_individual.yaml +15 -3
  228. fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml +21 -0
  229. fusion_bench_config/modelpool/CausalLMPool/llama_codealpaca.yaml +21 -0
  230. fusion_bench_config/modelpool/CausalLMPool/llama_metamathqa.yaml +19 -0
  231. fusion_bench_config/modelpool/CausalLMPool/llama_ultrachat.yaml +18 -0
  232. fusion_bench_config/modelpool/SeqenceClassificationModelPool/llama_preference700k.yaml +23 -0
  233. fusion_bench_config/modelpool/SeqenceClassificationModelPool/single_reward_model.yaml +14 -0
  234. fusion_bench_config/nyuv2_config.yaml +5 -1
  235. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-base-patch32_robustness_corrupted.yaml +27 -0
  236. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TALL14.yaml +19 -0
  237. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TALL20.yaml +26 -0
  238. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_cifar10.yaml +3 -0
  239. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_cifar100.yaml +3 -0
  240. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_dtd.yaml +3 -0
  241. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_emnist_letters.yaml +3 -0
  242. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_eurosat.yaml +3 -0
  243. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_fashion_mnist.yaml +3 -0
  244. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_fer2013.yaml +3 -0
  245. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_food101.yaml +3 -0
  246. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_gtsrb.yaml +3 -0
  247. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_kmnist.yaml +3 -0
  248. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_mnist.yaml +3 -0
  249. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford-iiit-pet.yaml +3 -0
  250. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford_flowers102.yaml +3 -0
  251. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford_flowers102_val.yaml +3 -0
  252. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_pcam.yaml +3 -0
  253. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_rendered-sst2.yaml +3 -0
  254. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_resisc45.yaml +3 -0
  255. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_stanford-cars.yaml +3 -0
  256. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_stl10.yaml +3 -0
  257. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_sun397.yaml +3 -0
  258. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_svhn.yaml +3 -0
  259. fusion_bench_config/taskpool/reward_model_evaluation.yaml +18 -0
  260. fusion_bench_config/llama_weighted_average.yaml +0 -26
  261. {fusion_bench-0.2.6.dist-info → fusion_bench-0.2.8.dist-info}/LICENSE +0 -0
  262. {fusion_bench-0.2.6.dist-info → fusion_bench-0.2.8.dist-info}/WHEEL +0 -0
  263. {fusion_bench-0.2.6.dist-info → fusion_bench-0.2.8.dist-info}/entry_points.txt +0 -0
  264. {fusion_bench-0.2.6.dist-info → fusion_bench-0.2.8.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
- from typing import Literal, Optional, Union # noqa: F401
+ from typing import Dict, Literal, Optional, Union # noqa: F401
 
  import torch
- from torch import Dict, nn
+ from torch import nn
  from tqdm.auto import tqdm
  from transformers import LlamaForCausalLM, LlamaModel
 
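Note: this hunk corrects a latent import bug. `Dict` is a `typing` construct rather than a public `torch` export, so `from torch import Dict, nn` was at best relying on an accidental re-export. A minimal sketch of the corrected idiom (the annotation example is illustrative):

```python
from typing import Dict  # typing, not torch, provides Dict

from torch import nn

# e.g. annotating a mapping from submodule names to modules
layers: Dict[str, nn.Module] = {"proj": nn.Linear(8, 8)}
```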
@@ -0,0 +1 @@
+ from .clip_layer_wise_adamerging_surgery import CLIPLayerWiseAdaMergingSurgeryAlgorithm
@@ -0,0 +1,157 @@
+ """
+ Implementation of the Layer-Wise AdaMerging+Surgery Algorithm.
+
+ For more details, please refer to:
+
+ - (ICLR 2024) Yang, et.al. AdaMerging: Adaptive Model Merging for Multi-Task Learning. http://arxiv.org/abs/2310.02575
+ - (ICML 2024) Yang, et.al. Representation Surgery for Multi-Task Model Merging. https://arxiv.org/abs/2402.02705
+
+ Basic Example:
+
+ ```shell
+ fusion_bench \
+     method=surgery/adamerging_surgery \
+     modelpool=CLIPVisionModelPool/clip-vit-base-patch32_TA8 \
+     taskpool=CLIPVisionModelTaskPool/clip-vit-classification_TA8
+ ```
+ """
+
+ import copy
+ import functools
+ import gc
+ import logging
+ from typing import TYPE_CHECKING, cast
+
+ import torch
+ import torch.nn.functional as F
+ from torch.utils.data import DataLoader
+ from tqdm import tqdm
+ from transformers import CLIPVisionModel
+
+ from fusion_bench.dataset.clip_dataset import CLIPDataset
+ from fusion_bench.method.adamerging.layer_wise_adamerging import (
+     LayerWiseAdaMergingAlgorithm,
+ )
+ from fusion_bench.method.adamerging.utils import get_memory_usage
+ from fusion_bench.mixins import CLIPClassificationMixin
+ from fusion_bench.modelpool import CLIPVisionModelPool
+ from fusion_bench.models.surgery.surgerymodelwrapper import SurgeryModelWrapper
+ from fusion_bench.models.wrappers.layer_wise_fusion import LayerWiseMergedModel
+
+ log = logging.getLogger(__name__)
+
+
+ class CLIPLayerWiseAdaMergingSurgeryAlgorithm(
+     CLIPClassificationMixin,
+     LayerWiseAdaMergingAlgorithm,
+ ):
+
+     def on_test_time_adaptation_start(self):
+         """
+         Here we load the CLIP processor and construct the zero-shot classification head for each task.
+         """
+         self.setup_zero_shot_classification_head()
+
+     @functools.cache
+     def get_shuffled_test_loader_iter(self, task: str):
+         return super().get_shuffled_test_loader_iter(
+             task,
+             batch_size=self.config.batch_size,
+             num_workers=self.config.num_workers,
+         )
+
+     def run(self, modelpool: CLIPVisionModelPool, **kwargs):
+         """
+         Run the Layer-Wise AdaMerging+Surgery Algorithm.
+
+         This method constructs the wrapped model and performs test-time adaptation if necessary. Then, it will perform surgery.
+
+         Args:
+             modelpool (ModelPool): The model pool containing the pretrained and fine-tuned models.
+
+         Returns:
+             LayerWiseMergedModel: The merged model after test-time adaptation.
+         """
+         log.info("Fusing models using layer-wise adaptive merging.")
+         self.modelpool = modelpool
+         self.log_hyperparams(self.config)
+
+         # === Start of the AdaMerging Algorithm ===
+         with self.profile("construct the wrapped model"):
+             module = cast(
+                 LayerWiseMergedModel[CLIPVisionModel],
+                 self.construct_layer_wise_merged_model(modelpool),
+             )
+
+         if self.config.weights is not None:
+             # skip the test-time adaptation
+             merged_model = copy.deepcopy(module.merge_and_unload())
+         else:
+             with self.profile("test-time adaptation"):
+                 module = self.test_time_adaptation(module)
+             if self.config.get("save_merging_weights", False):
+                 self.save_merging_weights(
+                     self.config.save_merging_weights, module.merge_weight
+                 )
+             merged_model = copy.deepcopy(module.merge_and_unload())
+
+         # free memory
+         del module
+         gc.collect()
+         torch.cuda.empty_cache()
+
+         # === Start of the Surgery Algorithm ===
+         log.info("start performing Surgery")
+         alpha_model = SurgeryModelWrapper(
+             merged_model,
+             modelpool.model_names,
+             projection_dim=merged_model.config.projection_dim,
+         )
+         alpha_model = self.fabric.setup(alpha_model)
+         log.info(get_memory_usage("after freeing memory, the memory usage of GPU is:"))
+
+         optimizer = torch.optim.Adam(
+             alpha_model.collect_trainable_params(),
+             lr=1e-3,
+             betas=(0.9, 0.999),
+             weight_decay=0.0,
+         )
+
+         finetuned_models = {
+             model_name: modelpool.load_model(model_name)
+             for model_name in modelpool.model_names
+         }
+         for name, model in finetuned_models.items():
+             model.requires_grad_(False)
+             model = self.fabric.to_device(model)
+             model.eval()
+
+         for iteration in tqdm(
+             range(self.config.surgery_steps),
+             "surgery",
+             dynamic_ncols=True,
+         ):
+             for dataset_name in modelpool.model_names:
+                 batch = next(self.get_shuffled_test_loader_iter(dataset_name))
+                 finetuned_feature = self.compute_features(
+                     finetuned_models[dataset_name], batch[0]
+                 )
+                 features, _, _ = alpha_model.compute_surgery_features(
+                     lambda model: self.compute_features(model, batch[0]),
+                     dataset_name,
+                 )
+
+                 loss = F.l1_loss(features, finetuned_feature)
+
+                 optimizer.zero_grad()
+                 loss.backward()
+                 optimizer.step()
+
+             if ((iteration + 1) % self.config.eval_iterations) == 0:
+                 # print(list(alpha_model.collect_trainable_params()))
+                 # Evaluate try to use the test module in fusion bench
+                 log.info(f"iteration: {iteration+1}")
+                 self._program.evaluate_merged_model(self._program.taskpool, alpha_model)
+
+         log.info("test the result of Adamerging")
+         return {"adamerging": merged_model, "surgery": alpha_model}
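For orientation, the surgery stage above reduces to a feature-alignment objective: the merged backbone and the per-task fine-tuned backbones stay frozen, and only a small task-specific adapter is trained so that adapted merged features match the fine-tuned features under an L1 loss. A self-contained toy sketch of that objective (the linear backbones, feature width, and step count are illustrative assumptions, not the package's API):

```python
import torch
import torch.nn.functional as F
from torch import nn

torch.manual_seed(0)

# Frozen stand-ins for the merged and per-task fine-tuned feature extractors
# (assumption: 512-d features, matching CLIP ViT-B/32's projection_dim).
merged_backbone = nn.Linear(32, 512).requires_grad_(False)
finetuned_backbone = nn.Linear(32, 512).requires_grad_(False)

# The "surgery" part: a small trainable adapter on top of merged features,
# playing the role of SurgeryModelWrapper's task-specific parameters.
adapter = nn.Linear(512, 512)
optimizer = torch.optim.Adam(adapter.parameters(), lr=1e-3)

for step in range(100):
    x = torch.randn(16, 32)  # a batch of unlabeled test inputs
    with torch.no_grad():
        merged_feat = merged_backbone(x)
        target_feat = finetuned_backbone(x)  # alignment target
    loss = F.l1_loss(adapter(merged_feat), target_feat)  # same loss as the loop above
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
```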
@@ -0,0 +1,234 @@
+ import copy
+ import os
+ from typing import List, Optional
+
+ import numpy as np
+ import torch
+
+ from fusion_bench.utils import state_dict_to_vector, vector_to_state_dict
+
+
+ def generate_task_masks(
+     tv_flat_checks: torch.Tensor,
+     flat_ft: torch.Tensor,
+     flat_ptm: torch.Tensor,
+     tv: Optional[torch.Tensor] = None,
+     tall_mask_lambda: float = 1.0,
+ ) -> torch.Tensor:
+     """
+     Generate task-specific TALL masks
+     TALL masks are generated as: mask_t = |theta_0 - theta_t| > |theta_mt - theta_t| * lambda
+
+     Args:
+         tv_flat_checks: individual task vectors
+         flat_ft: individual theta_t (fine-tuned weights)
+         flat_ptm: theta_0 (pre-trained weight)
+         tv: multi-task vector
+         tall_mask_lambda: hyper-parameter lambda for generating TALL masks
+     Returns:
+         final_mask: generated TALL masks with the given lambda, in shape (n_task, n_parameter)
+     """
+
+     print(f"Generating TALL masks.")
+
+     if tv is None:
+         tv = tv_flat_checks.sum(0)
+
+     flat_multi = flat_ptm + tv
+
+     original_shape = flat_ft.shape
+
+     # generate masks by comparing the l1 distance between |theta_0 - theta_t| and |theta_mt - theta_t|
+     diff_pt_ft = (flat_ptm - flat_ft).abs()
+     diff_multi_ft = (flat_multi - flat_ft).abs()
+     # compare the l1 distance, scaled with hyper-parameter lambda
+     mask = diff_pt_ft > diff_multi_ft * tall_mask_lambda
+
+     final_mask = (
+         mask.squeeze() if original_shape == tv_flat_checks.squeeze().shape else mask
+     )
+
+     print(
+         f"Average sparsity for the mask with tall_mask_lambda of {tall_mask_lambda}: {final_mask.float().mean():.4f}"
+     )
+     return final_mask
+
+
+ def construct_tall_mask(
+     tv_flat_checks: torch.Tensor,
+     flat_ft: torch.Tensor,
+     flat_ptm: torch.Tensor,
+     merged_tv: torch.Tensor,
+     ptm_check: torch.Tensor,
+     remove_keys: List[str],
+     config,
+ ):
+     """
+     Construct TALL masks for all tasks for each lambda, and store in dictionary
+
+     Args:
+         tv_flat_checks: individual task vectors
+         flat_ft: individual theta_t (fine-tuned weights)
+         flat_ptm: theta_0 (pre-trained weight)
+         merged_tv: multi-task vector
+         ptm_check: pre-trained weight as state dictionary
+         remove_keys: the keys to be removed when converting between dictionary and vector
+     Returns:
+         tall_masks: constructed TALL masks in dictionary format of {lambda: {task: mask}}
+     """
+     tall_masks = {}
+     for tall_mask_lambda in [0.2, 0.3, 0.4, 0.5, 0.6]:
+         # generate tall masks for each lambda
+         masks_at_scale = generate_task_masks(
+             tv_flat_checks,
+             flat_ft,
+             flat_ptm,
+             tall_mask_lambda=tall_mask_lambda,
+             tv=merged_tv,
+         )
+         # convert vectors to dictionary
+         masks_at_scale = [
+             vector_to_state_dict(mask, ptm_check, remove_keys=remove_keys)
+             for mask in masks_at_scale
+         ]
+         # store the masks with {dataset: mask}
+         tall_masks[tall_mask_lambda] = {
+             key: value for key, value in zip(config.DATASETS, masks_at_scale)
+         }
+     return tall_masks
+
+
+ def find_optimal_mask(val_metrics, eval_masks, args, save_masks=True):
+     """
+     Respectively finds the optimal mask for each data task based on the validation accuracy
+
+     Args:
+         val_metrics: validation metrics for each lambda
+         eval_masks: all generated masks
+
+     Returns:
+         best_masks_for_test: the best masks for each task, selected based on validation accuracy from each task
+         best_val_metrics: best validation metrics for each task
+     """
+     # transpose the dict from lambda-task to task-lambda
+     transposed_dict = {}
+     for key, inner_dict in val_metrics.items():
+         for inner_key, value in inner_dict.items():
+             if inner_key not in transposed_dict:
+                 transposed_dict[inner_key] = {}
+             transposed_dict[inner_key][key] = value
+
+     # for each task, find the best lambda
+     max_subkeys = {
+         key: max(inner_dict, key=inner_dict.get)
+         for key, inner_dict in transposed_dict.items()
+     }
+
+     # select the best mask for each task, which will be used for testing later
+     best_masks_for_test = {}
+     best_masks_for_test_vector = {}  # the selected masks as vectors
+     best_val_metrics = {}
+     # respectively for each task:
+     for ds in args.DATASETS:
+         # select the lambda which achieves the best valdiation accuracy
+         best_lambda = float(max_subkeys[ds + "Val:top1"])
+         # select the mask based on the selected lambda, save as dictionaries
+         best_masks_for_test[ds] = eval_masks[best_lambda][ds]
+         # select the mask based on the selected lambda, save as vectors
+         best_masks_for_test_vector[ds] = state_dict_to_vector(
+             eval_masks[best_lambda][ds], remove_keys=[]
+         )
+         print(f"Best lambda for {ds} is {best_lambda}")
+         # save the best validation metric based on the selected lambda
+         best_val_metrics[ds + "Val:top1"] = val_metrics[best_lambda][ds + "Val:top1"]
+
+     # save the best masks in disk
+     if save_masks and not args.method.load_mask:
+         # convert to numpy to save with np.packbits for saving storage
+         best_masks_for_test_vector = {
+             k: np.packbits(v) for k, v in best_masks_for_test_vector.items()
+         }
+         mask_save_dir = args.model_location.replace("checkpoints", "tall_masks")
+         mask_name = (
+             f"TALL_mask_{args.num_tasks}task.npy"
+             if not args.method.use_ties
+             else f"TALL_mask_{args.num_tasks}task_use_ties_{args.method.ties_agg}.npy"
+         )
+         np.save(
+             os.path.join(mask_save_dir, args.model, mask_name),
+             best_masks_for_test_vector,
+         )
+         del best_masks_for_test_vector
+
+     return best_masks_for_test, best_val_metrics
+
+
+ def load_tall_mask(remove_keys, ptm_check, config):
+     """Loads TALL masks from disk, unpack and transform to state dictionaries."""
+     mask_location = config.model_location.replace("checkpoints", "tall_masks")
+     try:
+         if config.method.use_ties:
+             print("==== Loading TALL Masks built with TIES ====")
+             tall_masks = torch.load(
+                 os.path.join(
+                     mask_location,
+                     config.model,
+                     f"TALL_mask_{config.num_tasks}task_use_ties.npy",
+                 )
+             )
+         else:
+             print("==== Loading TALL Masks built with Task Arithmetic ====")
+             tall_masks = torch.load(
+                 os.path.join(
+                     mask_location, config.model, f"TALL_mask_{config.num_tasks}task.npy"
+                 )
+             )
+     except:
+         raise Exception("TALL Masks are not constructed yet.")
+
+     # unpack masks and convert back to torch tensors
+     tall_masks = {k: torch.from_numpy(np.unpackbits(v)) for k, v in tall_masks.items()}
+
+     # convert vectors to dictionaries
+     tall_masks = {
+         dataset: vector_to_state_dict(mask, ptm_check, remove_keys=remove_keys)
+         for dataset, mask in tall_masks.items()
+     }
+
+     return tall_masks
+
+
+ def construct_consensus_mask(ptm_check, prun_thre_k, config, remove_keys=[]):
+     """
+     Generate consensus mask by filtering out least-used parameters
+
+     Args:
+         ptm_check: pretrained_checkpoint as state dictionary
+         prun_thre_k: weight-pruning threhold, stands for the least number of activated tasks for a parameter to be preserved from pruning
+             if prun_thre_k is set to 2: remove both catastrophic and selfish weights;
+             if prun_thre_k is set to 1: remove only catastrophic weights;
+             if prun_thre_k is set to 0: remove no weights -> reduce to TA or TIES
+             if prun_thre_k is set to > num_tasks: remove all weights -> reduce to zero-shot
+     Returns:
+         consensus_mask_vector: constructed consensus mask as vector (boolean in shape (n_parameter, ))
+     """
+
+     print("==== Generating Consensus Mask ====")
+     # load TALL masks (in shape (n_task, n_parameter))
+     tall_masks = load_tall_mask(remove_keys, ptm_check, config)
+     tall_masks = list(tall_masks.values())
+
+     # generate consensus masks
+     consensus_mask = copy.deepcopy(tall_masks[0])
+     for key, value in consensus_mask.items():
+         consensus_mask[key] = torch.zeros_like(value)
+         # count for each parameter, the tasks it has been activated for
+         for mask in tall_masks:
+             consensus_mask[key] = consensus_mask[key] + mask[key].float()
+         # filter out the least-activated parameters based on given threshold
+         consensus_mask[key] = consensus_mask[key].float() >= prun_thre_k
+     consensus_mask_vector = state_dict_to_vector(
+         consensus_mask, remove_keys=remove_keys
+     )
+
+     return consensus_mask_vector
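To make the masking rule `mask_t = |theta_0 - theta_t| > |theta_mt - theta_t| * lambda` and the consensus thresholding concrete, here is a toy run over four parameters and two tasks (all numbers are illustrative):

```python
import torch

theta_0 = torch.tensor([0.0, 0.0, 0.0, 0.0])   # pre-trained weights (flattened)
theta_t = torch.tensor([1.0, 0.1, -0.5, 0.0])  # fine-tuned weights for task t
tau_other = torch.tensor([0.0, 0.8, 0.5, 0.2])  # another task's task vector
theta_mt = theta_0 + (theta_t - theta_0) + tau_other  # merged (multi-task) weights

lam = 0.4
# Keep a parameter for task t where the pre-trained model is farther from
# theta_t than the merged model is, scaled by lambda.
mask_t = (theta_0 - theta_t).abs() > (theta_mt - theta_t).abs() * lam
print(mask_t)  # tensor([ True, False,  True, False])

# Consensus mask: keep a parameter if it is selected by at least prun_thre_k
# tasks; k = 2 drops weights that only a single task (or no task) relies on.
mask_other = torch.tensor([True, True, True, False])
activation_count = mask_t.float() + mask_other.float()
consensus = activation_count >= 2
print(consensus)  # tensor([ True, False,  True, False])
```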
@@ -0,0 +1,16 @@
+ import torch
+ from torch import Tensor, nn
+
+ from fusion_bench import BaseAlgorithm
+
+ from .utils import TSVC_utils, check_parameterNamesMatch
+
+
+ class TaskSingularVectorCompression(BaseAlgorithm):
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def run(self, modelpool):
+         raise NotImplementedError(
+             "Task Singular Vector Compression is not implemented yet."
+         )
@@ -0,0 +1,63 @@
+ """
+ Example:
+
+ ```bash
+ fusion_bench \
+     method=task_singular_vector/TaskSingularVectorMerging \
+     modelpool=CLIPVisionModelPool/clip-vit-base-patch32_TALL20_model_only \
+     taskpool=CLIPVisionModelTaskPool/clip-vit-classification_TALL20
+ ```
+ """
+
+ from typing import List, Optional
+
+ import torch
+ from torch import Tensor, nn
+
+ from fusion_bench import BaseAlgorithm
+ from fusion_bench.mixins import LightningFabricMixin
+ from fusion_bench.utils import timeit_context
+ from fusion_bench.utils.state_dict_arithmetic import state_dict_sub, state_dict_add
+ from fusion_bench.utils.type import StateDictType
+
+ from .utils import (
+     TSVM_utils,
+     check_parameterNamesMatch,
+     check_state_dicts_equal,
+     state_dict_to_vector,
+     vector_to_state_dict,
+ )
+
+
+ class TaskSingularVectorMerging(BaseAlgorithm, LightningFabricMixin):
+
+     def __init__(
+         self,
+         remove_keys: Optional[List[str]] = None,
+         **kwargs,
+     ):
+         self.remove_keys = remove_keys if remove_keys is not None else []
+         super().__init__(**kwargs)
+
+     def run(self, modelpool):
+         # Load the pre-trained model and the fine-tuned models
+         pretrained_model = modelpool.load_pretrained_model()
+         finetuned_models = list(modelpool.models())
+
+         ptm_check = pretrained_model.state_dict()
+         ft_checks = [model.state_dict() for model in finetuned_models]
+         check_parameterNamesMatch(ft_checks + [ptm_check])
+
+         with timeit_context("Flattening out Checkpoints"):
+             task_vectors = [state_dict_sub(check, ptm_check) for check in ft_checks]
+
+         new_merged_tv = TSVM_utils.compute_and_sum_svd_mem_reduction(
+             task_vectors,
+             exclude_keys=self.remove_keys,
+             accelerator=self.fabric.device,
+         )
+
+         pretrained_model.load_state_dict(
+             state_dict_add(new_merged_tv, pretrained_model.state_dict())
+         )
+         return pretrained_model
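As a rough sketch of what `TSVM_utils.compute_and_sum_svd_mem_reduction` does per 2-D layer: each task's weight delta is truncated to a low-rank SVD approximation (a rank budget of roughly 1/n_tasks each) before the deltas are combined; the actual implementation additionally orthogonalizes the retained singular vectors across tasks to reduce interference. A simplified, illustrative version that only does truncate-and-sum:

```python
import torch

def low_rank_sum(task_deltas, keep_frac=None):
    """Sum SVD-truncated per-task weight deltas for a single 2-D layer.

    keep_frac defaults to 1 / n_tasks, so the merged update uses roughly
    the same rank budget as one full task vector.
    """
    n = len(task_deltas)
    keep_frac = keep_frac if keep_frac is not None else 1.0 / n
    merged = torch.zeros_like(task_deltas[0])
    for delta in task_deltas:
        u, s, vh = torch.linalg.svd(delta, full_matrices=False)
        k = max(1, int(s.numel() * keep_frac))
        merged += (u[:, :k] * s[:k]) @ vh[:k, :]  # rank-k reconstruction
    return merged

deltas = [torch.randn(64, 64) for _ in range(4)]  # toy task vectors for one layer
merged_delta = low_rank_sum(deltas)
# The merged model is then theta_pretrained + merged_delta, layer by layer,
# which is what run() assembles via state_dict_add above.
```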
@@ -0,0 +1,9 @@
+ """
+ This module is modified from the original code of the paper:
+
+ - Gargiulo, et.al. Task Singular Vectors: Reducing Task Interference in Model Merging
+ - http://arxiv.org/abs/2412.00081
+ - https://github.com/AntoAndGar/task_singular_vectors/
+ """
+
+ from .TSVM import TaskSingularVectorMerging
@@ -0,0 +1,50 @@
+ import torch
+
+
+ def compute_svd_and_compress(key, matrix, sv_reduction):
+     """
+     Computes the Singular Value Decomposition (SVD) of a given matrix and compresses it by reducing the number of singular values.
+
+     Args:
+         key (Any): An identifier for the matrix.
+         matrix (torch.Tensor): The input matrix to decompose.
+         sv_reduction (float): The fraction of singular values to retain (0 < sv_reduction <= 1).
+
+     Returns:
+         tuple: A tuple containing:
+             - key (Any): The original identifier for the matrix.
+             - u (torch.Tensor): The left singular vectors of the reduced SVD.
+             - s (torch.Tensor): The reduced singular values.
+             - v (torch.Tensor): The right singular vectors of the reduced SVD.
+     """
+     u, s, v = torch.linalg.svd(matrix, full_matrices=False)
+     reduced_index_s = int(s.shape[0] * sv_reduction)
+     return key, u[:, :reduced_index_s], s[:reduced_index_s], v[:reduced_index_s, :]
+
+
+ def compress_tv(task_vectors, sv_reduction):
+     """
+     Compress task vectors using Singular Value Decomposition (SVD).
+
+     Args:
+         task_vectors (dict): A dictionary where keys are dataset names and values are task vectors.
+             Each task vector is expected to have a 'vector' attribute which is a dictionary
+             with keys as layer names and values as layer matrices.
+         sv_reduction (int): The fraction of singular values to keep for compression.
+
+     Returns:
+         dict: A dictionary with the same structure as `task_vectors`, but with each layer matrix
+             replaced by its compressed SVD components (u, s, v) if the layer is 2-dimensional.
+             If the layer is not 2-dimensional, it is stored as is under the key "dim1".
+     """
+     with torch.no_grad():
+         svd_dict = {}
+         for dataset, task_vector in task_vectors.items():
+             svd_dict[dataset] = {}
+             for key, layer in task_vector.vector.items():
+                 if len(layer.shape) == 2:  # and "text_projection" not in key:
+                     _, u, s, v = compute_svd_and_compress(key, layer, sv_reduction)
+                     svd_dict[dataset][key] = {"u": u, "s": s, "v": v}
+                 else:
+                     svd_dict[dataset][key] = {"dim1": layer}
+     return svd_dict
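A quick, illustrative check of the truncation performed by `compute_svd_and_compress` (the shapes and the 25% fraction are arbitrary; this assumes the function from the hunk above is in scope):

```python
import torch

matrix = torch.randn(100, 80)
key, u, s, v = compute_svd_and_compress("layer.weight", matrix, sv_reduction=0.25)

# int(80 * 0.25) = 20 singular triplets are kept
assert u.shape == (100, 20) and s.shape == (20,) and v.shape == (20, 80)

approx = (u * s) @ v  # rank-20 reconstruction of the layer
rel_err = (matrix - approx).norm() / matrix.norm()
print(f"kept {s.numel()} singular values, relative error {rel_err:.3f}")
```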