fusion-bench 0.2.7__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195)
  1. fusion_bench/compat/method/base_algorithm.py +1 -1
  2. fusion_bench/dataset/clip_dataset.py +3 -0
  3. fusion_bench/dataset/fer2013.py +12 -0
  4. fusion_bench/dataset/llama/preference_700k.py +1 -1
  5. fusion_bench/method/__init__.py +2 -0
  6. fusion_bench/method/classification/clip_finetune.py +10 -13
  7. fusion_bench/method/surgery/__init__.py +1 -3
  8. fusion_bench/method/surgery/clip_layer_wise_adamerging_surgery.py +1 -1
  9. fusion_bench/method/tall_mask/__init__.py +0 -0
  10. fusion_bench/method/tall_mask/utils.py +234 -0
  11. fusion_bench/method/task_singular_vector/TSVC.py +16 -0
  12. fusion_bench/method/task_singular_vector/TSVM.py +63 -0
  13. fusion_bench/method/task_singular_vector/__init__.py +9 -0
  14. fusion_bench/method/task_singular_vector/utils/TSVC_utils.py +50 -0
  15. fusion_bench/method/task_singular_vector/utils/TSVM_utils.py +642 -0
  16. fusion_bench/method/task_singular_vector/utils/__init__.py +7 -0
  17. fusion_bench/method/ties_merging/ties_merging_utils.py +7 -2
  18. fusion_bench/mixins/clip_classification.py +6 -6
  19. fusion_bench/mixins/lightning_fabric.py +3 -1
  20. fusion_bench/modelpool/base_pool.py +0 -1
  21. fusion_bench/modelpool/clip_vision/modelpool.py +92 -8
  22. fusion_bench/models/surgery/__init__.py +1 -0
  23. fusion_bench/models/surgery/surgerymodelwrapper.py +2 -1
  24. fusion_bench/models/wrappers/layer_wise_fusion.py +1 -1
  25. fusion_bench/models/wrappers/task_wise_fusion.py +1 -1
  26. fusion_bench/programs/fabric_fusion_program.py +7 -4
  27. fusion_bench/taskpool/llama/reward_model.py +1 -1
  28. fusion_bench/tasks/clip_classification/__init__.py +13 -45
  29. fusion_bench/tasks/clip_classification/clip_dataset.py +1 -16
  30. fusion_bench/tasks/clip_classification/cub_200_2011.py +208 -0
  31. fusion_bench/tasks/clip_classification/emnist_letters.py +31 -0
  32. fusion_bench/tasks/clip_classification/emnist_mnist.py +5 -0
  33. fusion_bench/tasks/clip_classification/fashion_mnist.py +18 -0
  34. fusion_bench/tasks/clip_classification/fer2013.py +18 -0
  35. fusion_bench/tasks/clip_classification/food101.py +105 -0
  36. fusion_bench/tasks/clip_classification/kmnist.py +17 -0
  37. fusion_bench/tasks/clip_classification/mongo_leaf_disease.py +19 -0
  38. fusion_bench/tasks/clip_classification/pcam.py +5 -0
  39. fusion_bench/utils/parameters.py +12 -3
  40. fusion_bench/utils/type.py +10 -1
  41. {fusion_bench-0.2.7.dist-info → fusion_bench-0.2.8.dist-info}/METADATA +1 -1
  42. {fusion_bench-0.2.7.dist-info → fusion_bench-0.2.8.dist-info}/RECORD +195 -62
  43. fusion_bench_config/dataset/image_classification/README.md +6 -0
  44. fusion_bench_config/dataset/image_classification/test/TALL14.yaml +20 -0
  45. fusion_bench_config/dataset/image_classification/test/TALL20.yaml +28 -0
  46. fusion_bench_config/dataset/image_classification/test/cifar10.yaml +1 -1
  47. fusion_bench_config/dataset/image_classification/test/cifar100.yaml +1 -1
  48. fusion_bench_config/dataset/image_classification/test/cub-200-2011.yaml +4 -0
  49. fusion_bench_config/dataset/image_classification/test/emnist_letters.yaml +5 -0
  50. fusion_bench_config/dataset/image_classification/test/emnist_mnist.yaml +4 -0
  51. fusion_bench_config/dataset/image_classification/test/fashion_mnist.yaml +4 -0
  52. fusion_bench_config/dataset/image_classification/test/fer2013.yaml +3 -0
  53. fusion_bench_config/dataset/image_classification/test/food101.yaml +4 -0
  54. fusion_bench_config/dataset/image_classification/test/kmnist.yaml +4 -0
  55. fusion_bench_config/dataset/image_classification/test/mango-leaf-disease.yaml +4 -0
  56. fusion_bench_config/dataset/image_classification/test/oxford-iiit-pet.yaml +4 -0
  57. fusion_bench_config/dataset/image_classification/test/oxford_flowers102.yaml +4 -0
  58. fusion_bench_config/dataset/image_classification/test/pcam.yaml +4 -0
  59. fusion_bench_config/dataset/image_classification/test/rendered-sst2.yaml +4 -0
  60. fusion_bench_config/dataset/image_classification/test/stl10.yaml +4 -0
  61. fusion_bench_config/dataset/image_classification/train/TALL14.yaml +20 -0
  62. fusion_bench_config/dataset/image_classification/train/TALL20.yaml +28 -0
  63. fusion_bench_config/dataset/image_classification/train/cifar10.yaml +1 -1
  64. fusion_bench_config/dataset/image_classification/train/cifar100.yaml +1 -1
  65. fusion_bench_config/dataset/image_classification/train/cub-200-2011.yaml +4 -0
  66. fusion_bench_config/dataset/image_classification/train/emnist_letters.yaml +4 -0
  67. fusion_bench_config/dataset/image_classification/train/emnist_mnist.yaml +4 -0
  68. fusion_bench_config/dataset/image_classification/train/fashion_mnist.yaml +4 -0
  69. fusion_bench_config/dataset/image_classification/train/fer2013.yaml +3 -0
  70. fusion_bench_config/dataset/image_classification/train/food101.yaml +4 -0
  71. fusion_bench_config/dataset/image_classification/train/kmnist.yaml +4 -0
  72. fusion_bench_config/dataset/image_classification/train/mango-leaf-disease.yaml +4 -0
  73. fusion_bench_config/dataset/image_classification/train/oxford-iiit-pet.yaml +4 -0
  74. fusion_bench_config/dataset/image_classification/train/oxford_flowers102.yaml +4 -0
  75. fusion_bench_config/dataset/image_classification/train/pcam.yaml +4 -0
  76. fusion_bench_config/dataset/image_classification/train/rendered-sst2.yaml +4 -0
  77. fusion_bench_config/dataset/image_classification/train/stl10.yaml +4 -0
  78. fusion_bench_config/method/task_singular_vector/TaskSingularVectorMerging.yaml +2 -0
  79. fusion_bench_config/model/clip-vit/README.md +38 -0
  80. fusion_bench_config/model/clip-vit/clip-vit-base-patch16.yaml +1 -3
  81. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_TALL14.yaml +22 -0
  82. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_TALL20.yaml +29 -0
  83. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_cifar10.yaml +1 -0
  84. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_cifar100.yaml +1 -0
  85. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_dtd.yaml +1 -3
  86. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_emnist_letters.yaml +1 -0
  87. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_eurosat.yaml +1 -3
  88. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_fashion_mnist.yaml +1 -0
  89. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_fer2013.yaml +1 -0
  90. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_food101.yaml +1 -0
  91. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_gtsrb.yaml +1 -3
  92. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_kmnist.yaml +1 -0
  93. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_mnist.yaml +1 -3
  94. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_oxford-iiit-pet.yaml +1 -0
  95. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_oxford_flowers102.yaml +1 -0
  96. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_pcam.yaml +1 -0
  97. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_rendered-sst2.yaml +1 -0
  98. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_resisc45.yaml +1 -3
  99. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_stanford-cars.yaml +1 -3
  100. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_stl10.yaml +1 -0
  101. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_sun397.yaml +1 -3
  102. fusion_bench_config/model/clip-vit/clip-vit-base-patch16_svhn.yaml +1 -3
  103. fusion_bench_config/model/clip-vit/clip-vit-base-patch32.yaml +1 -3
  104. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL14.yaml +22 -0
  105. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL20.yaml +29 -0
  106. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_cifar10.yaml +1 -0
  107. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_cifar100.yaml +1 -0
  108. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_dtd.yaml +1 -3
  109. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_eight_tasks.yaml +1 -0
  110. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_emnist_letters.yaml +1 -0
  111. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_eurosat.yaml +1 -3
  112. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_fashion_mnist.yaml +1 -0
  113. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_fer2013.yaml +1 -0
  114. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_food101.yaml +1 -0
  115. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_gtsrb.yaml +1 -3
  116. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_kmnist.yaml +1 -0
  117. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_mnist.yaml +1 -3
  118. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_oxford-iiit-pet.yaml +1 -0
  119. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_oxford_flowers102.yaml +1 -0
  120. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_pcam.yaml +1 -0
  121. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_rendered-sst2.yaml +1 -0
  122. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_resisc45.yaml +1 -3
  123. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_stanford-cars.yaml +1 -3
  124. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_stl10.yaml +1 -0
  125. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_sun397.yaml +1 -3
  126. fusion_bench_config/model/clip-vit/clip-vit-base-patch32_svhn.yaml +1 -3
  127. fusion_bench_config/model/clip-vit/clip-vit-large-patch14.yaml +1 -3
  128. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_TALL14.yaml +22 -0
  129. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_TALL20.yaml +29 -0
  130. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_cifar10.yaml +1 -0
  131. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_cifar100.yaml +1 -0
  132. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_dtd.yaml +1 -3
  133. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_emnist_letters.yaml +1 -0
  134. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_eurosat.yaml +1 -3
  135. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_fashion_mnist.yaml +1 -0
  136. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_fer2013.yaml +1 -0
  137. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_food101.yaml +1 -0
  138. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_gtsrb.yaml +1 -3
  139. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_kmnist.yaml +1 -0
  140. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_mnist.yaml +1 -3
  141. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford-iiit-pet.yaml +1 -0
  142. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford_flowers102.yaml +1 -0
  143. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_pcam.yaml +1 -0
  144. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_rendered-sst2.yaml +1 -0
  145. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_resisc45.yaml +1 -3
  146. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stanford-cars.yaml +1 -3
  147. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stl10.yaml +1 -0
  148. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_sun397.yaml +1 -3
  149. fusion_bench_config/model/clip-vit/clip-vit-large-patch14_svhn.yaml +1 -3
  150. fusion_bench_config/model/clip-vit/download_TALL20_models.sh +6 -0
  151. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8_model_only.yaml +6 -0
  152. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL14.yaml +11 -0
  153. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL14_model_only.yaml +9 -0
  154. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL20.yaml +11 -0
  155. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL20_model_only.yaml +9 -0
  156. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_individual.yaml +15 -3
  157. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL14.yaml +8 -0
  158. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL14_model_only.yaml +6 -0
  159. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL20.yaml +8 -0
  160. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL20_model_only.yaml +6 -0
  161. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_individual.yaml +9 -3
  162. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_task_projection.yaml +15 -0
  163. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL14.yaml +11 -0
  164. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL14_model_only.yaml +9 -0
  165. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL20.yaml +11 -0
  166. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL20_model_only.yaml +9 -0
  167. fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_individual.yaml +15 -3
  168. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-base-patch32_robustness_corrupted.yaml +27 -0
  169. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TALL14.yaml +19 -0
  170. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TALL20.yaml +26 -0
  171. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_cifar10.yaml +3 -0
  172. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_cifar100.yaml +3 -0
  173. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_dtd.yaml +3 -0
  174. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_emnist_letters.yaml +3 -0
  175. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_eurosat.yaml +3 -0
  176. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_fashion_mnist.yaml +3 -0
  177. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_fer2013.yaml +3 -0
  178. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_food101.yaml +3 -0
  179. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_gtsrb.yaml +3 -0
  180. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_kmnist.yaml +3 -0
  181. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_mnist.yaml +3 -0
  182. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford-iiit-pet.yaml +3 -0
  183. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford_flowers102.yaml +3 -0
  184. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford_flowers102_val.yaml +3 -0
  185. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_pcam.yaml +3 -0
  186. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_rendered-sst2.yaml +3 -0
  187. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_resisc45.yaml +3 -0
  188. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_stanford-cars.yaml +3 -0
  189. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_stl10.yaml +3 -0
  190. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_sun397.yaml +3 -0
  191. fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_svhn.yaml +3 -0
  192. {fusion_bench-0.2.7.dist-info → fusion_bench-0.2.8.dist-info}/LICENSE +0 -0
  193. {fusion_bench-0.2.7.dist-info → fusion_bench-0.2.8.dist-info}/WHEEL +0 -0
  194. {fusion_bench-0.2.7.dist-info → fusion_bench-0.2.8.dist-info}/entry_points.txt +0 -0
  195. {fusion_bench-0.2.7.dist-info → fusion_bench-0.2.8.dist-info}/top_level.txt +0 -0
fusion_bench/compat/method/base_algorithm.py
@@ -1,5 +1,5 @@
  from abc import ABC, abstractmethod
- from typing import Optional, TYPE_CHECKING
+ from typing import TYPE_CHECKING, Optional

  from omegaconf import DictConfig

fusion_bench/dataset/clip_dataset.py
@@ -65,4 +65,7 @@ class CLIPDataset(torch.utils.data.Dataset):
          else:
              # if processor is None, return the raw image directly
              inputs = image
+         # convert boolean label to int, this is for the case when the label is a binary classification task
+         if isinstance(item["label"], bool):
+             item["label"] = 1 if item["label"] else 0
          return inputs, item["label"]
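
A minimal sketch of what this change does, assuming `CLIPDataset(dataset, processor)` indexes items with `"image"` and `"label"` keys as the diff suggests (the toy data below is illustrative):

```python
from fusion_bench.dataset.clip_dataset import CLIPDataset

# Toy dataset whose "label" column is boolean (e.g. a binary task such as PCAM).
raw = [{"image": None, "label": True}, {"image": None, "label": False}]

# With processor=None the raw image is passed through; boolean labels now map to 1/0.
ds = CLIPDataset(raw, None)
inputs, label = ds[0]
assert label == 1
```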
fusion_bench/dataset/fer2013.py
@@ -0,0 +1,12 @@
+ from datasets import load_dataset
+
+
+ def load_fer2013(path: str = "clip-benchmark/wds_fer2013", split: str = "train"):
+     dataset = load_dataset(path, split=split)
+     dataset = dataset.remove_columns(["__key__", "__url__"])
+     dataset = dataset.rename_columns({"jpg": "image", "cls": "label"})
+     return dataset
+
+ if __name__ == "__main__":
+     dataset = load_fer2013(split="test")
+     print(dataset)
fusion_bench/dataset/llama/preference_700k.py
@@ -1,3 +1,4 @@
+ import logging
  import os
  from copy import deepcopy
  from typing import TYPE_CHECKING, Optional
@@ -7,7 +8,6 @@ from lightning.fabric.utilities import rank_zero_only
  from tqdm.auto import tqdm

  from fusion_bench.utils import timeit_context
- import logging

  if TYPE_CHECKING:
      from transformers import PreTrainedTokenizer
fusion_bench/method/__init__.py
@@ -49,6 +49,7 @@ _import_structure = {
          "PWEMoExactParetoOptimalForCLIP",
      ],
      "ada_svd": ["AdaSVDMergingForCLIPVisionModel"],
+     "task_singular_vector": ["TaskSingularVectorMerging"],
      # plug-and-play model merging methods
      "concrete_subspace": [
          "ConcreteTaskArithmeticAlgorithmForCLIP",
@@ -153,6 +154,7 @@ if TYPE_CHECKING:
          SparseLoForLlama,
      )
      from .task_arithmetic import TaskArithmeticAlgorithm
+     from .task_singular_vector import TaskSingularVectorMerging
      from .ties_merging import TiesMergingAlgorithm
      from .we_moe import CLIPWeightEnsemblingMoEAlgorithm
      from .weighted_average import WeightedAverageAlgorithm, WeightedAverageForLLama
fusion_bench/method/classification/clip_finetune.py
@@ -41,11 +41,10 @@ from transformers.models.clip.modeling_clip import CLIPVisionTransformer
  from fusion_bench import print_parameters
  from fusion_bench.compat.method import ModelFusionAlgorithm
  from fusion_bench.compat.modelpool import to_modelpool
- from fusion_bench.compat.modelpool.huggingface_clip_vision import (
-     HuggingFaceClipVisionPool,
- )
+ from fusion_bench.dataset.clip_dataset import CLIPDataset
  from fusion_bench.mixins import CLIPClassificationMixin
  from fusion_bench.mixins.simple_profiler import SimpleProfilerMixin
+ from fusion_bench.modelpool import CLIPVisionModelPool
  from fusion_bench.models.hf_clip import HFCLIPClassifier
  from fusion_bench.models.linearized.linearized_model_utils import LinearizedModelWraper
  from fusion_bench.utils.data import InfiniteDataLoader
@@ -92,12 +91,12 @@ class ImageClassificationFineTuningForCLIP(
      A class for fine-tuning CLIP models for image classification tasks.
      """

-     def run(self, modelpool: HuggingFaceClipVisionPool):
+     def run(self, modelpool: CLIPVisionModelPool):
          """
          Executes the fine-tuning process.

          Args:
-             modelpool (HuggingFaceClipVisionPool): The modelpool is responsible for loading the pre-trained model and training datasets.
+             modelpool (CLIPVisionModelPool): The modelpool is responsible for loading the pre-trained model and training datasets.

          Returns:
              VisionModel: The fine-tuned vision model.
@@ -109,9 +108,7 @@ class ImageClassificationFineTuningForCLIP(

          L.seed_everything(config.seed)

-         task_names = [
-             dataset_config["name"] for dataset_config in modelpool.config.train_datasets
-         ]
+         task_names = modelpool.train_dataset_names
          with self.profile("setup model and optimizer"):
              processor, classifier, optimizer, lr_scheduler = self.setup_model()

@@ -133,7 +130,7 @@ class ImageClassificationFineTuningForCLIP(

          with self.profile("setup data"):
              train_datasets = [
-                 modelpool.get_train_dataset(task_name, processor)
+                 CLIPDataset(modelpool.load_train_dataset(task_name), processor)
                  for task_name in task_names
              ]
              train_dataloaders = [
@@ -157,6 +154,7 @@ class ImageClassificationFineTuningForCLIP(
              range(config.num_steps),
              desc=self.finetune_method,
              disable=not self.fabric.is_global_zero,
+             dynamic_ncols=True,
          ):
              optimizer.zero_grad()
              loss = 0
@@ -183,7 +181,7 @@ class ImageClassificationFineTuningForCLIP(
                  save_path = os.path.join(
                      self.log_dir, "checkpoints", f"step={step_idx}.ckpt"
                  )
-                 self.save_model(classifier, save_path, trainable_only=True)
+                 self.save_model(classifier, save_path)

          if config.state_dict_save_path is not None:
              self.save_model(
@@ -232,9 +230,8 @@ class ImageClassificationFineTuningForCLIP(
          config = self.config
          modelpool = self.modelpool

-         pretrained_model_config = modelpool.get_model_config("_pretrained_")
-         clip_model: CLIPModel = CLIPModel.from_pretrained(pretrained_model_config.path)
-         processor = CLIPProcessor.from_pretrained(pretrained_model_config.path)
+         clip_model: CLIPModel = modelpool.load_clip_model("_pretrained_")
+         processor = modelpool.load_processor()

          self.finetune_method = "full fine-tune"
          if config.use_lora or config.use_l_lora:
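
For reference, the modelpool-centric data setup that replaces the legacy HuggingFaceClipVisionPool helpers looks roughly like this. This is a sketch based only on the calls visible in this diff (`load_processor`, `load_train_dataset`, `train_dataset_names`, `CLIPDataset`); batch size and loader options are illustrative:

```python
from torch.utils.data import DataLoader

from fusion_bench.dataset.clip_dataset import CLIPDataset

# inside run(self, modelpool: CLIPVisionModelPool):
processor = modelpool.load_processor()
train_datasets = [
    CLIPDataset(modelpool.load_train_dataset(name), processor)
    for name in modelpool.train_dataset_names
]
train_dataloaders = [
    DataLoader(dataset, batch_size=32, shuffle=True) for dataset in train_datasets
]
```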
fusion_bench/method/surgery/__init__.py
@@ -1,3 +1 @@
- from .clip_layer_wise_adamerging_surgery import (
-     CLIPLayerWiseAdaMergingSurgeryAlgorithm,
- )
+ from .clip_layer_wise_adamerging_surgery import CLIPLayerWiseAdaMergingSurgeryAlgorithm
fusion_bench/method/surgery/clip_layer_wise_adamerging_surgery.py
@@ -154,4 +154,4 @@ class CLIPLayerWiseAdaMergingSurgeryAlgorithm(
          self._program.evaluate_merged_model(self._program.taskpool, alpha_model)

          log.info("test the result of Adamerging")
-         return merged_model
+         return {"adamerging": merged_model, "surgery": alpha_model}
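
Note that the return type of `run()` changes here: callers that previously consumed the merged model directly now receive a dict. A hedged sketch of the adjustment (`algorithm` and `modelpool` are placeholder names):

```python
# fusion-bench 0.2.7: run() returned the merged model directly
# merged_model = algorithm.run(modelpool)

# fusion-bench 0.2.8: run() returns both the AdaMerging result and the surgery model
result = algorithm.run(modelpool)
merged_model = result["adamerging"]
surgery_model = result["surgery"]
```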
fusion_bench/method/tall_mask/__init__.py
File without changes
fusion_bench/method/tall_mask/utils.py
@@ -0,0 +1,234 @@
+ import copy
+ import os
+ from typing import List, Optional
+
+ import numpy as np
+ import torch
+
+ from fusion_bench.utils import state_dict_to_vector, vector_to_state_dict
+
+
+ def generate_task_masks(
+     tv_flat_checks: torch.Tensor,
+     flat_ft: torch.Tensor,
+     flat_ptm: torch.Tensor,
+     tv: Optional[torch.Tensor] = None,
+     tall_mask_lambda: float = 1.0,
+ ) -> torch.Tensor:
+     """
+     Generate task-specific TALL masks
+     TALL masks are generated as: mask_t = |theta_0 - theta_t| > |theta_mt - theta_t| * lambda
+
+     Args:
+         tv_flat_checks: individual task vectors
+         flat_ft: individual theta_t (fine-tuned weights)
+         flat_ptm: theta_0 (pre-trained weight)
+         tv: multi-task vector
+         tall_mask_lambda: hyper-parameter lambda for generating TALL masks
+     Returns:
+         final_mask: generated TALL masks with the given lambda, in shape (n_task, n_parameter)
+     """
+
+     print(f"Generating TALL masks.")
+
+     if tv is None:
+         tv = tv_flat_checks.sum(0)
+
+     flat_multi = flat_ptm + tv
+
+     original_shape = flat_ft.shape
+
+     # generate masks by comparing the l1 distance between |theta_0 - theta_t| and |theta_mt - theta_t|
+     diff_pt_ft = (flat_ptm - flat_ft).abs()
+     diff_multi_ft = (flat_multi - flat_ft).abs()
+     # compare the l1 distance, scaled with hyper-parameter lambda
+     mask = diff_pt_ft > diff_multi_ft * tall_mask_lambda
+
+     final_mask = (
+         mask.squeeze() if original_shape == tv_flat_checks.squeeze().shape else mask
+     )
+
+     print(
+         f"Average sparsity for the mask with tall_mask_lambda of {tall_mask_lambda}: {final_mask.float().mean():.4f}"
+     )
+     return final_mask
+
+
+ def construct_tall_mask(
+     tv_flat_checks: torch.Tensor,
+     flat_ft: torch.Tensor,
+     flat_ptm: torch.Tensor,
+     merged_tv: torch.Tensor,
+     ptm_check: torch.Tensor,
+     remove_keys: List[str],
+     config,
+ ):
+     """
+     Construct TALL masks for all tasks for each lambda, and store in dictionary
+
+     Args:
+         tv_flat_checks: individual task vectors
+         flat_ft: individual theta_t (fine-tuned weights)
+         flat_ptm: theta_0 (pre-trained weight)
+         merged_tv: multi-task vector
+         ptm_check: pre-trained weight as state dictionary
+         remove_keys: the keys to be removed when converting between dictionary and vector
+     Returns:
+         tall_masks: constructed TALL masks in dictionary format of {lambda: {task: mask}}
+     """
+     tall_masks = {}
+     for tall_mask_lambda in [0.2, 0.3, 0.4, 0.5, 0.6]:
+         # generate tall masks for each lambda
+         masks_at_scale = generate_task_masks(
+             tv_flat_checks,
+             flat_ft,
+             flat_ptm,
+             tall_mask_lambda=tall_mask_lambda,
+             tv=merged_tv,
+         )
+         # convert vectors to dictionary
+         masks_at_scale = [
+             vector_to_state_dict(mask, ptm_check, remove_keys=remove_keys)
+             for mask in masks_at_scale
+         ]
+         # store the masks with {dataset: mask}
+         tall_masks[tall_mask_lambda] = {
+             key: value for key, value in zip(config.DATASETS, masks_at_scale)
+         }
+     return tall_masks
+
+
+ def find_optimal_mask(val_metrics, eval_masks, args, save_masks=True):
+     """
+     Respectively finds the optimal mask for each data task based on the validation accuracy
+
+     Args:
+         val_metrics: validation metrics for each lambda
+         eval_masks: all generated masks
+
+     Returns:
+         best_masks_for_test: the best masks for each task, selected based on validation accuracy from each task
+         best_val_metrics: best validation metrics for each task
+     """
+     # transpose the dict from lambda-task to task-lambda
+     transposed_dict = {}
+     for key, inner_dict in val_metrics.items():
+         for inner_key, value in inner_dict.items():
+             if inner_key not in transposed_dict:
+                 transposed_dict[inner_key] = {}
+             transposed_dict[inner_key][key] = value
+
+     # for each task, find the best lambda
+     max_subkeys = {
+         key: max(inner_dict, key=inner_dict.get)
+         for key, inner_dict in transposed_dict.items()
+     }
+
+     # select the best mask for each task, which will be used for testing later
+     best_masks_for_test = {}
+     best_masks_for_test_vector = {}  # the selected masks as vectors
+     best_val_metrics = {}
+     # respectively for each task:
+     for ds in args.DATASETS:
+         # select the lambda which achieves the best validation accuracy
+         best_lambda = float(max_subkeys[ds + "Val:top1"])
+         # select the mask based on the selected lambda, save as dictionaries
+         best_masks_for_test[ds] = eval_masks[best_lambda][ds]
+         # select the mask based on the selected lambda, save as vectors
+         best_masks_for_test_vector[ds] = state_dict_to_vector(
+             eval_masks[best_lambda][ds], remove_keys=[]
+         )
+         print(f"Best lambda for {ds} is {best_lambda}")
+         # save the best validation metric based on the selected lambda
+         best_val_metrics[ds + "Val:top1"] = val_metrics[best_lambda][ds + "Val:top1"]
+
+     # save the best masks to disk
+     if save_masks and not args.method.load_mask:
+         # convert to numpy to save with np.packbits for saving storage
+         best_masks_for_test_vector = {
+             k: np.packbits(v) for k, v in best_masks_for_test_vector.items()
+         }
+         mask_save_dir = args.model_location.replace("checkpoints", "tall_masks")
+         mask_name = (
+             f"TALL_mask_{args.num_tasks}task.npy"
+             if not args.method.use_ties
+             else f"TALL_mask_{args.num_tasks}task_use_ties_{args.method.ties_agg}.npy"
+         )
+         np.save(
+             os.path.join(mask_save_dir, args.model, mask_name),
+             best_masks_for_test_vector,
+         )
+         del best_masks_for_test_vector
+
+     return best_masks_for_test, best_val_metrics
+
+
+ def load_tall_mask(remove_keys, ptm_check, config):
+     """Loads TALL masks from disk, unpack and transform to state dictionaries."""
+     mask_location = config.model_location.replace("checkpoints", "tall_masks")
+     try:
+         if config.method.use_ties:
+             print("==== Loading TALL Masks built with TIES ====")
+             tall_masks = torch.load(
+                 os.path.join(
+                     mask_location,
+                     config.model,
+                     f"TALL_mask_{config.num_tasks}task_use_ties.npy",
+                 )
+             )
+         else:
+             print("==== Loading TALL Masks built with Task Arithmetic ====")
+             tall_masks = torch.load(
+                 os.path.join(
+                     mask_location, config.model, f"TALL_mask_{config.num_tasks}task.npy"
+                 )
+             )
+     except:
+         raise Exception("TALL Masks are not constructed yet.")
+
+     # unpack masks and convert back to torch tensors
+     tall_masks = {k: torch.from_numpy(np.unpackbits(v)) for k, v in tall_masks.items()}
+
+     # convert vectors to dictionaries
+     tall_masks = {
+         dataset: vector_to_state_dict(mask, ptm_check, remove_keys=remove_keys)
+         for dataset, mask in tall_masks.items()
+     }
+
+     return tall_masks
+
+
+ def construct_consensus_mask(ptm_check, prun_thre_k, config, remove_keys=[]):
+     """
+     Generate consensus mask by filtering out least-used parameters
+
+     Args:
+         ptm_check: pretrained_checkpoint as state dictionary
+         prun_thre_k: weight-pruning threshold, stands for the least number of activated tasks for a parameter to be preserved from pruning
+             if prun_thre_k is set to 2: remove both catastrophic and selfish weights;
+             if prun_thre_k is set to 1: remove only catastrophic weights;
+             if prun_thre_k is set to 0: remove no weights -> reduce to TA or TIES
+             if prun_thre_k is set to > num_tasks: remove all weights -> reduce to zero-shot
+     Returns:
+         consensus_mask_vector: constructed consensus mask as vector (boolean in shape (n_parameter, ))
+     """
+
+     print("==== Generating Consensus Mask ====")
+     # load TALL masks (in shape (n_task, n_parameter))
+     tall_masks = load_tall_mask(remove_keys, ptm_check, config)
+     tall_masks = list(tall_masks.values())
+
+     # generate consensus masks
+     consensus_mask = copy.deepcopy(tall_masks[0])
+     for key, value in consensus_mask.items():
+         consensus_mask[key] = torch.zeros_like(value)
+         # count for each parameter, the tasks it has been activated for
+         for mask in tall_masks:
+             consensus_mask[key] = consensus_mask[key] + mask[key].float()
+         # filter out the least-activated parameters based on given threshold
+         consensus_mask[key] = consensus_mask[key].float() >= prun_thre_k
+     consensus_mask_vector = state_dict_to_vector(
+         consensus_mask, remove_keys=remove_keys
+     )
+
+     return consensus_mask_vector
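
The core rule implemented by generate_task_masks is mask_t = |theta_0 - theta_t| > lambda * |theta_mt - theta_t|, evaluated element-wise over the flattened parameters. A minimal sketch with random tensors (shapes and lambda are illustrative):

```python
import torch

from fusion_bench.method.tall_mask.utils import generate_task_masks

n_tasks, n_params = 3, 10_000
flat_ptm = torch.randn(n_params)                        # theta_0, flattened
tv_flat_checks = 0.01 * torch.randn(n_tasks, n_params)  # per-task task vectors
flat_ft = flat_ptm + tv_flat_checks                     # per-task fine-tuned weights

masks = generate_task_masks(
    tv_flat_checks, flat_ft, flat_ptm, tall_mask_lambda=0.4
)
print(masks.shape)           # (n_tasks, n_params), boolean
print(masks.float().mean())  # fraction of parameters kept per task, on average
```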
fusion_bench/method/task_singular_vector/TSVC.py
@@ -0,0 +1,16 @@
+ import torch
+ from torch import Tensor, nn
+
+ from fusion_bench import BaseAlgorithm
+
+ from .utils import TSVC_utils, check_parameterNamesMatch
+
+
+ class TaskSingularVectorCompression(BaseAlgorithm):
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def run(self, modelpool):
+         raise NotImplementedError(
+             "Task Singular Vector Compression is not implemented yet."
+         )
fusion_bench/method/task_singular_vector/TSVM.py
@@ -0,0 +1,63 @@
+ """
+ Example:
+
+ ```bash
+ fusion_bench \
+     method=task_singular_vector/TaskSingularVectorMerging \
+     modelpool=CLIPVisionModelPool/clip-vit-base-patch32_TALL20_model_only \
+     taskpool=CLIPVisionModelTaskPool/clip-vit-classification_TALL20
+ ```
+ """
+
+ from typing import List, Optional
+
+ import torch
+ from torch import Tensor, nn
+
+ from fusion_bench import BaseAlgorithm
+ from fusion_bench.mixins import LightningFabricMixin
+ from fusion_bench.utils import timeit_context
+ from fusion_bench.utils.state_dict_arithmetic import state_dict_sub, state_dict_add
+ from fusion_bench.utils.type import StateDictType
+
+ from .utils import (
+     TSVM_utils,
+     check_parameterNamesMatch,
+     check_state_dicts_equal,
+     state_dict_to_vector,
+     vector_to_state_dict,
+ )
+
+
+ class TaskSingularVectorMerging(BaseAlgorithm, LightningFabricMixin):
+
+     def __init__(
+         self,
+         remove_keys: Optional[List[str]] = None,
+         **kwargs,
+     ):
+         self.remove_keys = remove_keys if remove_keys is not None else []
+         super().__init__(**kwargs)
+
+     def run(self, modelpool):
+         # Load the pre-trained model and the fine-tuned models
+         pretrained_model = modelpool.load_pretrained_model()
+         finetuned_models = list(modelpool.models())
+
+         ptm_check = pretrained_model.state_dict()
+         ft_checks = [model.state_dict() for model in finetuned_models]
+         check_parameterNamesMatch(ft_checks + [ptm_check])
+
+         with timeit_context("Flattening out Checkpoints"):
+             task_vectors = [state_dict_sub(check, ptm_check) for check in ft_checks]
+
+         new_merged_tv = TSVM_utils.compute_and_sum_svd_mem_reduction(
+             task_vectors,
+             exclude_keys=self.remove_keys,
+             accelerator=self.fabric.device,
+         )
+
+         pretrained_model.load_state_dict(
+             state_dict_add(new_merged_tv, pretrained_model.state_dict())
+         )
+         return pretrained_model
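
TSVM_utils.compute_and_sum_svd_mem_reduction itself lives in TSVM_utils.py (+642 lines, not shown in this section). As a rough conceptual sketch only, not the actual implementation: each 2-D layer of every task vector is compressed to its leading singular components before the per-task contributions are combined, and per the TSV paper the real routine additionally decorrelates the stacked singular vectors across tasks to reduce interference. Illustrative code under those assumptions:

```python
import torch

def low_rank_sum(task_deltas, k):
    """Simplified stand-in (NOT the TSVM_utils implementation): keep the top-k
    singular components of each per-task delta for one layer, then sum the
    low-rank reconstructions."""
    merged = torch.zeros_like(task_deltas[0])
    for delta in task_deltas:
        u, s, v = torch.linalg.svd(delta, full_matrices=False)
        merged += u[:, :k] @ torch.diag(s[:k]) @ v[:k, :]
    return merged

# e.g. three task deltas for a 768x768 layer, keeping rank 768 // 3 per task
deltas = [0.01 * torch.randn(768, 768) for _ in range(3)]
merged_delta = low_rank_sum(deltas, k=768 // 3)
```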
fusion_bench/method/task_singular_vector/__init__.py
@@ -0,0 +1,9 @@
+ """
+ This module is modified from the original code of the paper:
+
+ - Gargiulo et al. Task Singular Vectors: Reducing Task Interference in Model Merging
+ - http://arxiv.org/abs/2412.00081
+ - https://github.com/AntoAndGar/task_singular_vectors/
+ """
+
+ from .TSVM import TaskSingularVectorMerging
fusion_bench/method/task_singular_vector/utils/TSVC_utils.py
@@ -0,0 +1,50 @@
+ import torch
+
+
+ def compute_svd_and_compress(key, matrix, sv_reduction):
+     """
+     Computes the Singular Value Decomposition (SVD) of a given matrix and compresses it by reducing the number of singular values.
+
+     Args:
+         key (Any): An identifier for the matrix.
+         matrix (torch.Tensor): The input matrix to decompose.
+         sv_reduction (float): The fraction of singular values to retain (0 < sv_reduction <= 1).
+
+     Returns:
+         tuple: A tuple containing:
+             - key (Any): The original identifier for the matrix.
+             - u (torch.Tensor): The left singular vectors of the reduced SVD.
+             - s (torch.Tensor): The reduced singular values.
+             - v (torch.Tensor): The right singular vectors of the reduced SVD.
+     """
+     u, s, v = torch.linalg.svd(matrix, full_matrices=False)
+     reduced_index_s = int(s.shape[0] * sv_reduction)
+     return key, u[:, :reduced_index_s], s[:reduced_index_s], v[:reduced_index_s, :]
+
+
+ def compress_tv(task_vectors, sv_reduction):
+     """
+     Compress task vectors using Singular Value Decomposition (SVD).
+
+     Args:
+         task_vectors (dict): A dictionary where keys are dataset names and values are task vectors.
+             Each task vector is expected to have a 'vector' attribute which is a dictionary
+             with keys as layer names and values as layer matrices.
+         sv_reduction (float): The fraction of singular values to keep for compression.
+
+     Returns:
+         dict: A dictionary with the same structure as `task_vectors`, but with each layer matrix
+             replaced by its compressed SVD components (u, s, v) if the layer is 2-dimensional.
+             If the layer is not 2-dimensional, it is stored as is under the key "dim1".
+     """
+     with torch.no_grad():
+         svd_dict = {}
+         for dataset, task_vector in task_vectors.items():
+             svd_dict[dataset] = {}
+             for key, layer in task_vector.vector.items():
+                 if len(layer.shape) == 2:  # and "text_projection" not in key:
+                     _, u, s, v = compute_svd_and_compress(key, layer, sv_reduction)
+                     svd_dict[dataset][key] = {"u": u, "s": s, "v": v}
+                 else:
+                     svd_dict[dataset][key] = {"dim1": layer}
+     return svd_dict
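
A quick usage sketch of compute_svd_and_compress on a random layer delta, showing the shapes of the retained factors and the quality of the low-rank reconstruction (matrix size and sv_reduction are illustrative):

```python
import torch

from fusion_bench.method.task_singular_vector.utils.TSVC_utils import (
    compute_svd_and_compress,
)

w = torch.randn(768, 768)  # a 2-D layer of a task vector
key, u, s, v = compute_svd_and_compress("layer.weight", w, sv_reduction=0.1)
print(u.shape, s.shape, v.shape)  # (768, 76), (76,), (76, 768)

approx = u @ torch.diag(s) @ v    # rank-76 reconstruction
rel_err = torch.linalg.matrix_norm(w - approx) / torch.linalg.matrix_norm(w)
print(f"relative Frobenius error: {rel_err:.3f}")
```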