PyPI - fusion-bench - Versions diffs - 0.2.9__py3-none-any.whl - Mend

fusion-bench 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (727) hide show

fusion_bench/method/concrete_subspace/clip_concrete_task_arithmetic.py ADDED Viewed

@@ -0,0 +1,263 @@
+"""
+Examples:
+```bash
+fusion_bench \
+    fabric.loggers.name=ViT-B-32/concrete_task_arithmetic \
+    method=clip_concrete_task_arithmetic \
+    modelpool=clip-vit-base-patch32_TA8 \
+    taskpool=clip-vit-classification_TA8
+```
+"""
+import logging
+import os
+import torch
+from tqdm.autonotebook import tqdm
+from fusion_bench.compat.method import ModelFusionAlgorithm
+from fusion_bench.compat.modelpool import to_modelpool
+from fusion_bench.compat.modelpool.huggingface_clip_vision import (
+    HuggingFaceClipVisionPool,
+)
+from fusion_bench.method.adamerging.entropy_loss import entropy_loss
+from fusion_bench.mixins import CLIPClassificationMixin
+from fusion_bench.mixins.simple_profiler import SimpleProfilerMixin
+from fusion_bench.models.masks import MaskModel, mask_sparsity
+from fusion_bench.models.wrappers.task_wise_fusion import (
+    TaskWiseMergedModel,
+    get_task_wise_weights,
+)
+from fusion_bench.utils.dtype import parse_dtype
+from fusion_bench.utils.parameters import print_parameters
+log = logging.getLogger(__name__)
+class ConcreteTaskArithmeticAlgorithmForCLIP(
+    CLIPClassificationMixin,
+    SimpleProfilerMixin,
+    ModelFusionAlgorithm,
+):
+    """
+    Task arithmetic on CLIP models with a learned, shared mask over the task vectors.
+    A `MaskModel` holding mask logits is built on top of the pretrained model. During
+    test-time adaptation a continuous mask is sampled, every mask tensor is rescaled to
+    mean 1, the masked task vectors are merged through a `TaskWiseMergedModel`, and the
+    mask logits are updated by minimizing the entropy of the merged model's predictions.
+    This class extends the CLIPClassificationMixin, SimpleProfilerMixin, and ModelFusionAlgorithm classes.
+    Attributes:
+        merge_dtype (torch.dtype): The data type for merging weights (set in `setup_models`).
+        modelpool (HuggingFaceClipVisionPool): The model pool containing the pretrained and fine-tuned models.
+    """
+    @torch.no_grad()
+    def setup_models(self):
+        """
+        Set up the pretrained model, fine-tuned models, and mask model.
+        This method loads the pretrained model, constructs the PGE mask model on top of it,
+        fills the mask parameters with `config.initial_logits`, loads every fine-tuned model
+        in the pool, and wraps everything in a `TaskWiseMergedModel` whose task-wise weights
+        are initialized to `config.scaling_factor`.
+        Returns:
+            Tuple[TaskWiseMergedModel, MaskModel]: The wrapped model and mask model.
+        """
+        config = self.config
+        # `merge_dtype` is optional in the config; None means "do not cast"
+        self.merge_dtype = parse_dtype(config.get("merge_dtype", None))
+        modelpool = self.modelpool
+        # Load the pretrained model
+        pretrained_model = modelpool.load_model("_pretrained_")
+        # construct PGE mask model
+        mask_model = MaskModel(
+            pretrained_model,
+            ignore_untrained_params=True,
+            parameter_type="logits",
+        )
+        if self.merge_dtype is not None:
+            mask_model.to(self.merge_dtype)
+        mask_model.fill_(config.initial_logits)
+        # TODO: ablation study for the initialization of mask model
+        print("Summary of mask model:")
+        print_parameters(mask_model)
+        # Load the fine-tuned models
+        finetuned_models = [
+            modelpool.load_model(name) for name in modelpool.model_names
+        ]
+        task_wise_weight = get_task_wise_weights(
+            num_models=len(modelpool.model_names),
+            init_values=config.scaling_factor,
+        )
+        # create a wrapped model
+        module = TaskWiseMergedModel(
+            task_wise_weight=task_wise_weight,
+            pretrained_model=pretrained_model,
+            finetuned_models=finetuned_models,
+            clamp_weights=config.clamp_weights,
+            tie_weights=config.tie_weights,
+            strict=config.strict,
+            task_vector_dtype=self.merge_dtype,
+        )
+        return module, mask_model
+    def train_mask(self, module: TaskWiseMergedModel, mask_model: MaskModel):
+        """
+        Train the mask model by test-time adaptation.
+        Each step samples a continuous mask, rescales every mask tensor to mean 1, merges
+        the masked task vectors into `module`, sums the entropy loss over one unlabeled
+        batch per task, and takes one optimizer step on the mask parameters. `module`
+        itself is frozen; only `mask_model` is optimized. A checkpoint of the mask model
+        is written every `config.save_interval` steps.
+        Args:
+            module (TaskWiseMergedModel): The wrapped model with task-wise weights.
+            mask_model (MaskModel): The mask model to be trained.
+        Raises:
+            ValueError: If `config.optimizer` is neither "adam" nor "sgd".
+        """
+        config = self.config
+        # configure optimizer
+        lr_scheduler = None
+        if config.optimizer == "adam":
+            optimizer = torch.optim.Adam(
+                filter(lambda p: p.requires_grad, mask_model.parameters()),
+                lr=config.lr,
+            )
+            print(f"{optimizer=}")
+            # TODO: ablation study for the learning rate scheduler. It should yield similar results.
+        elif config.optimizer == "sgd":
+            optimizer = torch.optim.SGD(mask_model.parameters(), lr=config.lr)
+            print(f"{optimizer=}")
+            lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
+                optimizer, config.max_steps, eta_min=0.1
+            )
+        else:
+            raise ValueError(f"Unsupported optimizer: {config.optimizer}")
+        # both optimizer branches need the same fabric wrapping
+        mask_model, optimizer = self.fabric.setup(mask_model, optimizer)
+        module.to(mask_model.device)
+        module.requires_grad_(False)
+        mask_model.train()
+        optimizer.zero_grad()
+        # debug mode runs only a handful of steps
+        num_steps = config.max_steps if not self.is_debug_mode else 5
+        for step_idx in (
+            pbar := tqdm(
+                range(num_steps),
+                ("[DEBUG MODE] " if self.is_debug_mode else "")
+                + "Concrete Task Arithmetic Test-Time Adaptation",
+                dynamic_ncols=True,
+                disable=not self.fabric.is_global_zero,
+            )
+        ):
+            metrics = {}
+            # sample a shared mask and merge weights
+            with self.profile("sample mask"):
+                mask = mask_model.sample_mask(
+                    mask_type="continuous", temperature=config.temperature
+                )
+                metrics["train/sparsity"] = mask_sparsity(mask)
+            with self.profile("merge weights"):
+                # rescale mask so that every mask tensor has mean 1
+                for name, m in mask.items():
+                    mask[name] = m / torch.mean(m)
+                module.merge_weights(task_vector_mask=mask)
+            # ------ inner optimization goes here ------
+            # NOTE:
+            #   Because the algorithmic parameters of task arithmetic are assumed to be chosen on a validation test
+            #   set, we do not need to perform inner optimization here. So here we skip the inner optimization step.
+            # ------------------------------------------
+            total_loss = None
+            for task in self.modelpool.model_names:
+                with self.profile("data loading"):
+                    batch = next(self.get_shuffled_test_loader_iter(task))
+                    # NOTE: The labels are not allowed to be used during test-time adaptation
+                    images = batch[0].to(dtype=self.merge_dtype)
+                with self.profile("forward pass"):
+                    logits = self.compute_logits(module, images, task)
+                    loss = entropy_loss(logits)
+                    total_loss = loss if total_loss is None else total_loss + loss
+            with self.profile("compute grad"):
+                self.fabric.backward(total_loss)
+            with self.profile("optimizer step"):
+                optimizer.step()
+                optimizer.zero_grad()
+                if lr_scheduler is not None:
+                    lr_scheduler.step()
+            # report the objective that was actually optimized (sum over all tasks),
+            # not just the loss of the last task in the loop
+            metrics["train/loss"] = total_loss.item()
+            self.fabric.log_dict(metrics, step=step_idx)
+            pbar.set_postfix(metrics)
+            if (step_idx + 1) % config.save_interval == 0:
+                with self.profiler.profile("save checkpoint"):
+                    save_dir = os.path.join(self.fabric.logger.log_dir, "checkpoints")
+                    os.makedirs(save_dir, exist_ok=True)
+                    # NOTE: the file name carries the zero-based step index
+                    save_path = os.path.join(save_dir, f"mask_steps_{step_idx}.pt")
+                    print(f"saving checkpoint to {save_path}")
+                    state = {"model": mask_model}
+                    self.fabric.save(save_path, state)
+                    # Create or update `latest_checkpoint.pt` so it refers to the newest
+                    # checkpoint. This is a hard link (os.link), not a symbolic link.
+                    if self.fabric.is_global_zero:
+                        latest_path = os.path.join(save_dir, "latest_checkpoint.pt")
+                        # lexists also catches a dangling symlink left at this path
+                        if os.path.lexists(latest_path):
+                            os.remove(latest_path)
+                        os.link(os.path.abspath(save_path), latest_path)
+                self.print_profile_summary()
+    def run(self, modelpool: HuggingFaceClipVisionPool):
+        """
+        Run the Concrete Task Arithmetic algorithm.
+        This method sets up the models, obtains the mask, and performs the final merging of weights.
+        If `config.mask_checkpoint` is set, the mask model is loaded from that checkpoint;
+        otherwise it is trained with `train_mask` unless `config.skip_training` is true.
+        Args:
+            modelpool (HuggingFaceClipVisionPool): The model pool containing the pretrained and fine-tuned models.
+        Returns:
+            torch.nn.Module: The final merged model, cast to float32.
+        """
+        self.modelpool = to_modelpool(modelpool)
+        config = self.config
+        self.log_hyperparams(config, filename="method_config.yaml")
+        with self.profile("setup models"):
+            module, mask_model = self.setup_models()
+            self.setup_zero_shot_classification_head()
+        if config.mask_checkpoint is None:
+            if not config.skip_training:
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+                self.train_mask(module=module, mask_model=mask_model)
+        else:
+            if self.fabric.is_global_zero:
+                print("loading mask from checkpoint", config.mask_checkpoint)
+            self.fabric.load(config.mask_checkpoint, {"model": mask_model})
+        with torch.no_grad():
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            mask = mask_model.sample_mask(
+                mask_type=config.eval_mask_type,
+                temperature=config.temperature,
+            )
+            # rescale mask so that every mask tensor has mean 1
+            for name, m in mask.items():
+                mask[name] = m / torch.mean(m)
+            model = module.merge_and_unload(mask)
+        return model.to(dtype=torch.float32)

fusion_bench/method/dare/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+# flake8: noqa F401
+from .simple_average import DareSimpleAverage
+from .task_arithmetic import DareTaskArithmetic
+from .ties_merging import DareTiesMerging

fusion_bench/method/dare/simple_average.py ADDED Viewed

@@ -0,0 +1,31 @@
+import logging
+from fusion_bench import BaseAlgorithm, BaseModelPool
+from fusion_bench.utils.state_dict_arithmetic import state_dict_add, state_dict_mul
+from .task_arithmetic import DareTaskArithmetic
+log = logging.getLogger(__name__)
+class DareSimpleAverage(BaseAlgorithm):
+    """
+    Simple averaging with DARE.
+    Implemented by delegating to `DareTaskArithmetic` with a scaling factor of
+    `1 / len(modelpool)`.
+    """
+    def __init__(
+        self,
+        sparsity_ratio: float,
+        only_on_linear_weights: bool,
+        rescale: bool = True,
+        **kwargs,
+    ):
+        """
+        Args:
+            sparsity_ratio (float): Forwarded to `DareTaskArithmetic` as `sparsity_ratio`.
+            only_on_linear_weights (bool): Forwarded to `DareTaskArithmetic` as `only_on_linear_weights`.
+            rescale (bool): Forwarded to `DareTaskArithmetic` as `rescale`.
+            **kwargs: Passed through to `BaseAlgorithm.__init__`.
+        """
+        # NOTE: the attribute is spelled without the trailing "s", unlike the argument
+        self.sparsity_ratio = sparsity_ratio
+        self.only_on_linear_weight = only_on_linear_weights
+        self.rescale = rescale
+        super().__init__(**kwargs)
+    def run(self, modelpool: BaseModelPool):
+        """
+        Merge the models of `modelpool` and return the result of `DareTaskArithmetic.run`.
+        """
+        # averaging == task arithmetic with every task vector weighted by 1 / N
+        average_weight = 1 / len(modelpool)
+        delegate = DareTaskArithmetic(
+            scaling_factor=average_weight,
+            sparsity_ratio=self.sparsity_ratio,
+            only_on_linear_weights=self.only_on_linear_weight,
+            rescale=self.rescale,
+        )
+        return delegate.run(modelpool)

fusion_bench/method/dare/task_arithmetic.py ADDED Viewed

@@ -0,0 +1,82 @@
+import torch
+from torch import Tensor, nn
+from fusion_bench import BaseAlgorithm, BaseModelPool
+from fusion_bench.utils.state_dict_arithmetic import state_dict_sum
+from .utils import (
+    module_random_drop_,
+    module_sub_,
+    param_random_drop_,
+    trainable_state_dict,
+)
+class DareTaskArithmetic(BaseAlgorithm):
+    """
+    Task Arithmetic combined with DARE (random drop and optional rescale of task vectors).
+    - Yu et al. Language Models are Super Mario: Absorbing Abilities from Homologous Models as a Free Lunch. 2023. http://arxiv.org/abs/2311.03099
+    """
+    def __init__(
+        self,
+        scaling_factor: float,
+        sparsity_ratio: float,
+        only_on_linear_weights: bool,
+        rescale: bool = True,
+        **kwargs,
+    ):
+        """
+        Args:
+            scaling_factor (float): Multiplier applied to the summed task vectors before they are added to the pretrained model.
+            sparsity_ratio (float): Drop level passed to the random-drop helpers; `run` asserts it lies in [0, 1].
+            only_on_linear_weights (bool): If True, only the `weight` of `nn.Linear` modules is dropped.
+            rescale (bool): Passed to the random-drop helpers as `rescale`.
+            **kwargs: Passed through to `BaseAlgorithm.__init__`.
+        """
+        self.scaling_factor = scaling_factor
+        self.sparsity_ratio = sparsity_ratio
+        self.only_on_linear_weights = only_on_linear_weights
+        self.rescale = rescale
+        super().__init__(**kwargs)
+    def _load_task_vector(
+        self,
+        modelpool: BaseModelPool,
+        model_name: str,
+        pretrained_model: nn.Module,
+    ):
+        """
+        Load model `model_name` from the pool and turn it into a task vector by
+        subtracting `pretrained_model` from it in place (via `module_sub_`).
+        """
+        return module_sub_(modelpool.load_model(model_name), pretrained_model)
+    @torch.no_grad()
+    def run(self, modelpool: BaseModelPool):
+        """
+        Merge all models in `modelpool` into the pretrained model and return it.
+        Steps: build one task vector per model, randomly drop (and optionally rescale)
+        their entries, sum the trainable entries, scale the sum by `scaling_factor`, and
+        add it in place to the pretrained model's parameters.
+        """
+        assert (
+            0 <= self.sparsity_ratio <= 1
+        ), "Sparsity ratio must be between 0 and 1"
+        base_model = modelpool.load_pretrained_model()
+        # load task vectors
+        task_vectors = {}
+        for model_name in modelpool.model_names:
+            task_vectors[model_name] = self._load_task_vector(
+                modelpool, model_name, base_model
+            )
+        # drop and rescale task vectors
+        for model_name, tv in task_vectors.items():
+            if not self.only_on_linear_weights:
+                print(f"pruning model: `{model_name}`")
+                module_random_drop_(tv, self.sparsity_ratio, rescale=self.rescale)
+                continue
+            for module_name, module in tv.named_modules():
+                if not isinstance(module, nn.Linear):
+                    continue
+                print(f"pruning model: `{model_name}`, layer: {module_name}.")
+                param_random_drop_(
+                    module.weight, self.sparsity_ratio, rescale=self.rescale
+                )
+        # merge task vectors
+        trainable_parts = [trainable_state_dict(tv) for tv in task_vectors.values()]
+        summed_task_vector = state_dict_sum(trainable_parts)
+        # scale the task vector and add it to the pretrained model
+        for name, delta in summed_task_vector.items():
+            scaled_delta = delta * self.scaling_factor
+            base_model.get_parameter(name).data.add_(scaled_delta)
+        return base_model

fusion_bench/method/dare/ties_merging.py ADDED Viewed

@@ -0,0 +1,100 @@
+from typing import Literal
+import torch
+from torch import Tensor, nn
+from fusion_bench import BaseAlgorithm, BaseModelPool
+from fusion_bench.method.ties_merging.ties_merging_utils import ties_merging
+from fusion_bench.utils.parameters import state_dict_to_vector, vector_to_state_dict
+from fusion_bench.utils.state_dict_arithmetic import state_dict_sum
+from .utils import (
+    module_random_drop_,
+    module_sub_,
+    param_random_drop_,
+    trainable_state_dict,
+)
+class DareTiesMerging(BaseAlgorithm):
+    """
+    TIES merging applied to task vectors that were first sparsified with DARE
+    (random drop and optional rescale).
+    """
+    def __init__(
+        self,
+        # DARE parameters
+        sparsity_ratio: float,
+        only_on_linear_weights: bool,
+        rescale: bool,
+        # Ties merging parameters
+        scaling_factor: float,
+        threshold: int,
+        remove_keys: list[str],
+        merge_func: Literal["sum", "mean", "max"],
+        **kwargs,
+    ):
+        """
+        Args:
+            sparsity_ratio (float): Drop level passed to the random-drop helpers; `run` asserts it lies in [0, 1].
+            only_on_linear_weights (bool): If True, only the `weight` of `nn.Linear` modules is dropped.
+            rescale (bool): Passed to the random-drop helpers as `rescale`.
+            scaling_factor (float): Multiplier applied to the merged task vector before it is added to the pretrained weights.
+            threshold (int): Passed to `ties_merging` as `reset_thresh`.
+            remove_keys (list[str]): Passed to `state_dict_to_vector` / `vector_to_state_dict` as the keys to leave out.
+            merge_func: Passed to `ties_merging` as `merge_func`.
+            **kwargs: Passed through to `BaseAlgorithm.__init__`.
+        """
+        self.sparsity_ratio = sparsity_ratio
+        self.only_on_linear_weights = only_on_linear_weights
+        self.rescale = rescale
+        self.scaling_factor = scaling_factor
+        self.threshold = threshold
+        self.remove_keys = remove_keys
+        self.merge_func = merge_func
+        super().__init__(**kwargs)
+    @torch.no_grad()
+    def _load_task_vector(
+        self,
+        modelpool: BaseModelPool,
+        model_name: str,
+        pretrained_model: nn.Module,
+    ):
+        """
+        Load model `model_name` from the pool and turn it into a task vector by
+        subtracting `pretrained_model` from it in place (via `module_sub_`).
+        """
+        return module_sub_(modelpool.load_model(model_name), pretrained_model)
+    def run(self, modelpool: BaseModelPool):
+        """
+        Merge all models in `modelpool` into the pretrained model and return it.
+        Steps: build one task vector per model, randomly drop (and optionally rescale)
+        their entries, flatten the task vectors' state dicts, combine them with
+        `ties_merging`, and load `pretrained + scaling_factor * merged` back into the
+        pretrained model.
+        """
+        assert (
+            0 <= self.sparsity_ratio <= 1
+        ), "Sparsity ratio must be between 0 and 1"
+        pretrained_model = modelpool.load_pretrained_model()
+        # load task vectors
+        task_vectors = {}
+        for model_name in modelpool.model_names:
+            task_vectors[model_name] = self._load_task_vector(
+                modelpool, model_name, pretrained_model
+            )
+        # drop and rescale task vectors
+        for model_name, tv in task_vectors.items():
+            if not self.only_on_linear_weights:
+                print(f"pruning model: `{model_name}`")
+                module_random_drop_(tv, self.sparsity_ratio, rescale=self.rescale)
+                continue
+            for module_name, module in tv.named_modules():
+                if not isinstance(module, nn.Linear):
+                    continue
+                print(f"pruning model: `{model_name}`, layer: {module_name}.")
+                param_random_drop_(
+                    module.weight, self.sparsity_ratio, rescale=self.rescale
+                )
+        # flatten the pretrained weights and stack one flattened task vector per row
+        pretrained_state = pretrained_model.state_dict()
+        pretrained_flat = state_dict_to_vector(pretrained_state, self.remove_keys)
+        stacked_task_vectors = torch.vstack(
+            [
+                state_dict_to_vector(tv.state_dict(), self.remove_keys)
+                for tv in task_vectors.values()
+            ]
+        )
+        # the per-model task vectors are no longer needed once they are stacked
+        del task_vectors
+        # Perform TIES Merging
+        merged_tv = ties_merging(
+            stacked_task_vectors,
+            reset_thresh=self.threshold,
+            merge_func=self.merge_func,
+        )
+        merged_flat = pretrained_flat + self.scaling_factor * merged_tv
+        merged_state_dict = vector_to_state_dict(
+            merged_flat, pretrained_state, remove_keys=self.remove_keys
+        )
+        pretrained_model.load_state_dict(merged_state_dict)
+        return pretrained_model

fusion_bench/method/dare/utils.py ADDED Viewed

@@ -0,0 +1,87 @@
+from typing import Dict, Union
+import torch
+from torch import Tensor, nn
+from fusion_bench.utils.type import StateDictType
+def param_random_drop_(param: Tensor, sparsity_level: float, rescale: bool):
+    """
+    Randomly zeroes elements of the given tensor in place.
+    Each element is kept independently with probability `1 - sparsity_level`
+    and set to zero otherwise.
+    Args:
+        param (Tensor): The tensor whose elements are to be randomly dropped.
+        sparsity_level (float): The fraction of elements to drop (between 0 and 1).
+        rescale (bool): If True, divide the surviving elements by `1 - sparsity_level`
+            so that the tensor is preserved in expectation. Skipped when nothing
+            survives (`sparsity_level >= 1`).
+    Returns:
+        None
+    """
+    keep_prob = 1 - sparsity_level
+    mask = torch.rand_like(param) > sparsity_level
+    param.data = param.data * mask
+    # With keep_prob <= 0 every element is already zero; dividing by zero here
+    # would turn the whole tensor into NaN (0 / 0), so leave the zeros as they are.
+    if rescale and keep_prob > 0:
+        param.data = param.data / keep_prob
+def module_random_drop_(
+    tv: Union[nn.Module, StateDictType], sparsity_level: float, rescale: bool
+):
+    """
+    Runs `param_random_drop_` on every parameter of a module, or on every value of a state dictionary.
+    Args:
+        tv (Union[nn.Module, StateDictType]): The module or state dictionary whose parameters are to be randomly dropped.
+        sparsity_level (float): The fraction of elements to drop (between 0 and 1).
+        rescale (bool): Forwarded to `param_random_drop_` as `rescale`.
+    Returns:
+        None
+    """
+    # a module exposes its tensors through parameters(), a state dict through values()
+    tensors = tv.parameters() if isinstance(tv, nn.Module) else tv.values()
+    for tensor in tensors:
+        param_random_drop_(tensor, sparsity_level, rescale)
+def trainable_state_dict(module: nn.Module):
+    """
+    Collects the parameters of `module` that have `requires_grad` set.
+    Args:
+        module (nn.Module): The module from which to extract the trainable parameters.
+    Returns:
+        dict: Maps each parameter name (as yielded by `named_parameters`) to the parameter object itself (not a copy).
+    """
+    trainable = {}
+    for param_name, parameter in module.named_parameters():
+        if parameter.requires_grad:
+            trainable[param_name] = parameter
+    return trainable
+def module_sub_(
+    a: Union[nn.Module, StateDictType],
+    b: Union[nn.Module, StateDictType],
+    trainable_only: bool = True,
+):
+    """
+    Subtracts the parameters of b from the parameters of a in-place.
+    Parameters are paired by iteration order, and each pair must carry the same name.
+    Args:
+        a (Union[nn.Module, StateDictType]): The module or state dictionary whose parameters will be subtracted from.
+        b (Union[nn.Module, StateDictType]): The module or state dictionary whose parameters will be subtracted.
+        trainable_only (bool): If True, entries of a whose `requires_grad` is False are left untouched.
+    Returns:
+        Union[nn.Module, StateDictType]: The modified a with updated parameters.
+    Raises:
+        AssertionError: If a and b differ in the number of parameters or in their names.
+    """
+    a_items = list(a.named_parameters() if isinstance(a, nn.Module) else a.items())
+    b_items = list(b.named_parameters() if isinstance(b, nn.Module) else b.items())
+    # zip() stops at the shorter input, which would silently leave trailing
+    # parameters of `a` unsubtracted -- so check the lengths up front
+    assert len(a_items) == len(b_items), "Mismatch in number of parameters"
+    for (a_name, a_param), (b_name, b_param) in zip(a_items, b_items):
+        assert a_name == b_name, "Mismatch in parameter names"
+        if trainable_only and not a_param.requires_grad:
+            continue
+        a_param.data = a_param.data - b_param.data
+    return a

fusion_bench/method/dawe/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ # flake8: noqa F401
2	+ from .dawe_for_clip import DataAdaptiveWeightEnsemblingForCLIP