PyPI - fusion-bench - Versions diffs - 0.2.9__py3-none-any.whl - Mend

fusion-bench 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (727) hide show

fusion_bench/method/__init__.py ADDED Viewed

@@ -0,0 +1,177 @@
+# flake8: noqa F401
+import sys
+from typing import TYPE_CHECKING
+from fusion_bench.utils.lazy_imports import LazyImporter
+_import_structure = {
+    # --------------
+    "base_algorithm": ["BaseModelFusionAlgorithm", "BaseAlgorithm"],
+    "dummy": ["DummyAlgorithm"],
+    # single task learning (fine-tuning)
+    "classification": [
+        "ImageClassificationFineTuningForCLIP",
+        "ContinualImageClassificationFineTuningForCLIP",
+    ],
+    "lm_finetune": ["FullFinetuneSFT", "PeftFinetuneSFT", "BradleyTerryRewardModeling"],
+    # analysis
+    "analysis": ["TaskVectorCosSimilarity", "TaskVectorViolinPlot"],
+    # model ensemble methods
+    "ensemble": [
+        "SimpleEnsembleAlgorithm",
+        "WeightedEnsembleAlgorithm",
+        "MaxModelPredictorAlgorithm",
+    ],
+    # model merging methods
+    "linear": [
+        "ExPOAlgorithm",
+        "ExPOAlgorithmForLlama",
+        "SimpleAverageForLlama",
+        "TaskArithmeticForLlama",
+        "LinearInterpolationAlgorithm",
+    ],
+    "slerp": ["SlerpMergeAlgorithm"],
+    "simple_average": ["SimpleAverageAlgorithm"],
+    "weighted_average": ["WeightedAverageAlgorithm", "WeightedAverageForLLama"],
+    "task_arithmetic": ["TaskArithmeticAlgorithm"],
+    "ties_merging": ["TiesMergingAlgorithm"],
+    "dare": ["DareSimpleAverage", "DareTaskArithmetic", "DareTiesMerging"],
+    "fisher_merging": [
+        "FisherMergingForCLIPVisionModel",
+        "FisherMergingAlgorithmForGPT2",
+    ],
+    "regmean": ["RegMeanAlgorithmForCLIP", "RegMeanAlgorithmForGPT2"],
+    "adamerging": [
+        "CLIPTaskWiseAdaMergingAlgorithm",
+        "CLIPLayerWiseAdaMergingAlgorithm",
+        "GPT2LayerWiseAdaMergingAlgorithm",
+        "LayerWiseAdaMergingForLlamaSFT",
+        "FlanT5LayerWiseAdaMergingAlgorithm",
+    ],
+    "pwe_moe": [
+        "PWEMoELinearScalarizationForCLIP",
+        "PWEMoExactParetoOptimalForCLIP",
+    ],
+    "ada_svd": ["AdaSVDMergingForCLIPVisionModel"],
+    "task_singular_vector": ["TaskSingularVectorMerging"],
+    "opcm": ["OPCMForCLIP"],
+    # plug-and-play model merging methods
+    "concrete_subspace": [
+        "ConcreteTaskArithmeticAlgorithmForCLIP",
+        "ConcreteTaskWiseAdaMergingForCLIP",
+        "ConcreteLayerWiseAdaMergingForCLIP",
+    ],
+    # model mixing methods
+    "depth_upscaling": ["DepthUpscalingAlgorithm", "DepthUpscalingForLlama"],
+    "mixture_of_experts": [
+        "MixtralUpscalingAlgorithm",
+        "MixtralForCausalLMUpscalingAlgorithm",
+        "MixtralMoEMergingAlgorithm",
+        "MixtralForCausalLMMergingAlgorithm",
+    ],
+    "dawe": ["DataAdaptiveWeightEnsemblingForCLIP"],
+    "we_moe": ["CLIPWeightEnsemblingMoEAlgorithm"],
+    "rankone_moe": ["CLIPRankOneMoEAlgorithm", "RankOneMoEAlgorithm"],
+    "sparse_we_moe": [
+        "SparseWeightEnsemblingMoEAlgorithm",
+        "SparseCLIPWeightEnsemblingMoEAlgorithm",
+    ],
+    "model_recombination": ["ModelRecombinationAlgorithm"],
+    "smile_upscaling": [
+        "SmileUpscalingAlgorithm",
+        "SingularProjectionMergingAlgorithm",
+    ],
+    # pruning methods
+    "pruning": [
+        "MagnitudeDiffPruningAlgorithm",
+        "RandomPruningForLlama",
+        "MagnitudePruningForLlama",
+        "WandaPruningForLlama",
+    ],
+    "sparselo": [
+        "IterativeSparseLoForLlama",
+        "SparseLoForLlama",
+        "PCPSparseLoForLlama",
+    ],
+}
+if TYPE_CHECKING:
+    from .ada_svd import AdaSVDMergingForCLIPVisionModel
+    from .adamerging import *
+    from .analysis import TaskVectorCosSimilarity, TaskVectorViolinPlot
+    from .base_algorithm import BaseAlgorithm, BaseModelFusionAlgorithm
+    from .classification import (
+        ContinualImageClassificationFineTuningForCLIP,
+        ImageClassificationFineTuningForCLIP,
+    )
+    from .concrete_subspace import (
+        ConcreteLayerWiseAdaMergingForCLIP,
+        ConcreteTaskArithmeticAlgorithmForCLIP,
+        ConcreteTaskWiseAdaMergingForCLIP,
+    )
+    from .dare import DareSimpleAverage, DareTaskArithmetic, DareTiesMerging
+    from .dawe import DataAdaptiveWeightEnsemblingForCLIP
+    from .depth_upscaling import DepthUpscalingAlgorithm, DepthUpscalingForLlama
+    from .dummy import DummyAlgorithm
+    from .ensemble import (
+        MaxModelPredictorAlgorithm,
+        SimpleEnsembleAlgorithm,
+        WeightedEnsembleAlgorithm,
+    )
+    from .fisher_merging import FisherMergingForCLIPVisionModel
+    from .linear import (
+        ExPOAlgorithm,
+        ExPOAlgorithmForLlama,
+        LinearInterpolationAlgorithm,
+        SimpleAverageForLlama,
+        TaskArithmeticForLlama,
+    )
+    from .lm_finetune import *
+    from .mixture_of_experts import (
+        MixtralForCausalLMMergingAlgorithm,
+        MixtralForCausalLMUpscalingAlgorithm,
+        MixtralMoEMergingAlgorithm,
+        MixtralUpscalingAlgorithm,
+    )
+    from .model_recombination import ModelRecombinationAlgorithm
+    from .opcm import OPCMForCLIP
+    from .pruning import (
+        MagnitudeDiffPruningAlgorithm,
+        MagnitudePruningForLlama,
+        RandomPruningForLlama,
+        WandaPruningForLlama,
+    )
+    from .pwe_moe import (
+        PWEMoELinearScalarizationForCLIP,
+        PWEMoExactParetoOptimalForCLIP,
+    )
+    from .rankone_moe import CLIPRankOneMoEAlgorithm, RankOneMoEAlgorithm
+    from .regmean import RegMeanAlgorithmForCLIP, RegMeanAlgorithmForGPT2
+    from .simple_average import SimpleAverageAlgorithm
+    from .slerp import SlerpMergeAlgorithm
+    from .smile_upscaling import (
+        SingularProjectionMergingAlgorithm,
+        SmileUpscalingAlgorithm,
+    )
+    from .sparse_we_moe import (
+        SparseCLIPWeightEnsemblingMoEAlgorithm,
+        SparseWeightEnsemblingMoEAlgorithm,
+    )
+    from .sparselo import (
+        IterativeSparseLoForLlama,
+        PCPSparseLoForLlama,
+        SparseLoForLlama,
+    )
+    from .task_arithmetic import TaskArithmeticAlgorithm
+    from .task_singular_vector import TaskSingularVectorMerging
+    from .ties_merging import TiesMergingAlgorithm
+    from .we_moe import CLIPWeightEnsemblingMoEAlgorithm
+    from .weighted_average import WeightedAverageAlgorithm, WeightedAverageForLLama
+else:
+    sys.modules[__name__] = LazyImporter(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+    )

fusion_bench/method/ada_svd/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ # flake8: noqa F401
2	+ from .clip_vision import AdaSVDMergingForCLIPVisionModel

fusion_bench/method/ada_svd/clip_vision.py ADDED Viewed

@@ -0,0 +1,319 @@
+R"""
+This is an experimental implementation of an adaptive SVD-based merging algorithm for CLIP vision encoders.
+This implementation is based on the following paper:
+    - Ante Tang et al. SMILE: Zero-Shot Sparse Mixture of Low-Rank Experts Construction From Pre-Trained Foundation Models. Aug, 2024. http://arxiv.org/abs/2408.10174
+After the upscaled model is created, the algorithm reduces and unload the MoE modules to the original linear modules by averaging the routing weights of the experts.
+"""
+from copy import deepcopy
+from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union, cast
+import lightning as L
+import torch
+import torch.nn.functional as F
+import torch.utils.hooks
+from torch import Tensor, nn
+from torch.utils.data import ConcatDataset, random_split
+from tqdm.auto import tqdm
+from transformers import CLIPVisionModel
+from transformers.models.clip.modeling_clip import (
+    CLIPEncoderLayer,
+    CLIPVisionTransformer,
+)
+from typing_extensions import override
+from fusion_bench import BaseAlgorithm, BaseModelPool
+from fusion_bench.dataset import CLIPDataset
+from fusion_bench.method import WeightedAverageAlgorithm
+from fusion_bench.method.simple_average import simple_average
+from fusion_bench.mixins import SimpleProfilerMixin
+from fusion_bench.modelpool import CLIPVisionModelPool
+from fusion_bench.models.smile_moe.linear import ExpertNotTrainedError, SmileMoELinear
+from fusion_bench.models.utils import find_layers_with_type, get_attr, set_attr
+from fusion_bench.utils.devices import get_device
+class AvgRoutingWeightsMetric:
+    def __init__(
+        self,
+        linear: SmileMoELinear,
+    ):
+        super().__init__()
+        self.linear = linear
+        self.reset()
+    def reset(self):
+        linear = self.linear
+        self.sample_count = 0
+        self.accumulated_routing_weights = torch.zeros(
+            linear.num_experts,
+            dtype=torch.float32,
+            device=get_device(linear),
+        )
+    def __call__(
+        self,
+        linear: SmileMoELinear,
+        inps: Tuple[Tensor],
+        out: Tensor,
+    ):
+        assert len(inps) == 1
+        inp = inps[0]
+        inp = inp.view(-1, inp.size(-1))
+        # compute routing weights
+        router_logits = linear.gate(inp)
+        routing_weights = F.softmax(router_logits, dim=-1)
+        # accumulate routing weights
+        self.accumulated_routing_weights += routing_weights.sum(dim=0)
+        self.sample_count += inp.size(0)
+    def compute(self):
+        return self.accumulated_routing_weights / self.sample_count
+class AdaSVDMergingForCLIPVisionModel(
+    BaseAlgorithm,
+    SimpleProfilerMixin,
+):
+    _linear_layer_cls = (nn.Linear,)
+    def __init__(
+        self,
+        scaling_factor: Optional[Union[float, List[float]]],
+        num_samples: int,  # budget of the number of samples to be used
+        gate_k: int,
+        average_experts: bool,  # if True, the non-linear modules of experts are averaged. Otherwise, copy the pretrained one.
+        device: Optional[Literal["cuda", "cpu"]],
+        upscaling_accelerator: str,
+        seed: Optional[int],
+        **kwargs,
+    ):
+        if seed is not None:
+            L.seed_everything(seed)
+        self.scaling_factor = scaling_factor
+        self.num_samples = num_samples
+        self.gate_k = gate_k
+        self.average_experts = average_experts
+        if device is not None:
+            self.device = device
+        else:
+            self.device = "cpu"  # default device
+        self.upscaling_accelerator = upscaling_accelerator
+        super().__init__(**kwargs)
+    @override
+    @torch.no_grad()
+    def run(self, modelpool: CLIPVisionModelPool):
+        # check preconditions
+        if isinstance(self.scaling_factor, Iterable):
+            assert len(modelpool.model_names) == len(
+                self.scaling_factor
+            ), "The number of scaling factors should be equal to the number of expert models."
+        # setup the data and model
+        dataset = self.prepare_data(modelpool)
+        upscaled_model = self.prepare_model(modelpool)  # SMILE Upscaling
+        # forward pass
+        vision_model: CLIPVisionTransformer = upscaled_model.vision_model
+        all_hidden_states = []
+        for sample_idx, sample in enumerate(
+            tqdm(dataset, desc="Extracting hidden states to the first layer")
+        ):
+            image, _ = sample
+            hidden_states = vision_model.embeddings(image.unsqueeze(0).to(self.device))
+            hidden_states = vision_model.pre_layrnorm(hidden_states)
+            all_hidden_states.append(hidden_states)
+        all_hidden_states = torch.concat(all_hidden_states, dim=0)
+        # reduce the MoE modules to the original linear modules
+        for layer_idx, layer in enumerate(
+            tqdm(vision_model.encoder.layers, desc="Reduce MoE to Linear")
+        ):
+            layer = cast(CLIPEncoderLayer, layer)
+            linear_layers = cast(
+                Dict[str, SmileMoELinear],
+                find_layers_with_type(layer, layer_types=[SmileMoELinear]),
+            )
+            def get_hook_fn(linear: SmileMoELinear):
+                hook_fn = AvgRoutingWeightsMetric(linear)
+                return hook_fn
+            hooks: Dict[str, AvgRoutingWeightsMetric] = {}
+            handles: List[torch.utils.hooks.RemovableHandle] = []
+            for name, linear in linear_layers.items():
+                hook_fn = get_hook_fn(linear)
+                hooks[name] = hook_fn
+                handles.append(linear.register_forward_hook(hook_fn))
+            # forward pass
+            hidden_states_to_next_layer = []
+            for sample_idx, _ in enumerate(all_hidden_states):
+                hidden_states = all_hidden_states[sample_idx : sample_idx + 1]
+                hidden_states = layer(
+                    hidden_states,
+                    attention_mask=None,
+                    causal_attention_mask=None,
+                    output_attentions=False,
+                )[0]
+                hidden_states_to_next_layer.append(hidden_states)
+            all_hidden_states = torch.concat(hidden_states_to_next_layer, dim=0)
+            # compute merge weights and merge
+            average_routing_weights = {}
+            for name, hook_fn in hooks.items():
+                average_routing_weights[name] = hook_fn.compute()
+            for h in handles:
+                h.remove()
+            # merge the MoE modules to the original linear modules
+            for name, linear in linear_layers.items():
+                weights: Tensor = average_routing_weights[name].cpu()
+                if self.scaling_factor is not None:
+                    if isinstance(self.scaling_factor, (int, float)):
+                        weights = weights * self.scaling_factor
+                    elif isinstance(self.scaling_factor, Iterable):
+                        weights = weights * torch.asarray(self.scaling_factor)
+                    else:
+                        raise ValueError(
+                            f"scaling_factor should be int, float, or a list/tuple of float. Got {type(self.scaling_factor)}"
+                        )
+                source_models = {"pretrained_model": linear.pretrained_model} | {
+                    f"expert_{i}": expert for i, expert in enumerate(linear.experts)
+                }
+                merged_linear = WeightedAverageAlgorithm(
+                    normalize=False, weights=[1] + weights.tolist(), verbose=False
+                ).run(BaseModelPool(source_models))
+                set_attr(layer, name.split("."), merged_linear)
+        return upscaled_model
+    def prepare_data(self, modelpool: CLIPVisionModelPool):
+        all_datasets = []
+        processor = modelpool.load_processor()
+        for dataset_name in modelpool.train_dataset_names:
+            dataset = modelpool.load_train_dataset(dataset_name)
+            assert len(dataset) >= self.num_samples, (
+                f"Number of samples in the dataset ({len(dataset)}) "
+                f"should be greater than or equal to the budget ({self.num_samples})"
+            )
+            dataset = random_split(
+                dataset, [self.num_samples, len(dataset) - self.num_samples]
+            )[0]
+            dataset = CLIPDataset(dataset, processor)
+            all_datasets.append(dataset)
+        dataset = ConcatDataset(all_datasets)
+        return dataset
+    def prepare_model(self, modelpool: CLIPVisionModelPool):
+        with self.profile("load pretrained model"):
+            pretrained_model: CLIPVisionModel = modelpool.load_pretrained_model()
+        with self.profile("load fine-tuned model"):
+            finetuned_models: List[CLIPVisionModel] = [
+                m for m in tqdm(modelpool.models(), total=len(modelpool.model_names))
+            ]
+        if self.device == "cuda" and torch.cuda.is_available():
+            pretrained_model = pretrained_model.cuda()
+            finetuned_models = [m.cuda() for m in finetuned_models]
+        with self.profile("merge model"):
+            # SMILE upscaling, with dense experts
+            upscaled_model = self.merge(pretrained_model, finetuned_models)
+        return upscaled_model
+    def merge(
+        self,
+        pretrained_model: nn.Module,
+        finetuned_models: List[nn.Module],
+        in_place: bool = True,
+    ):
+        """
+        Merges the pretrained model with the fine-tuned models to create an upscaled model.
+        Args:
+            pretrained_model (nn.Module): The pretrained model.
+            finetuned_models (List[nn.Module]): A list of fine-tuned models.
+            in_place (bool): If True, modifies the pretrained model in place. Otherwise, creates a copy.
+        Returns:
+            nn.Module: The merged model.
+        """
+        if in_place:
+            model = pretrained_model
+        else:
+            model = deepcopy(pretrained_model)
+        self._upscale_submodules(model, finetuned_models)
+        return model
+    def _upscale_linear_layer(
+        self,
+        pretrained_model,
+        finetuned_models,
+        name: str,
+    ):
+        name_list = name.split(".")
+        module = get_attr(pretrained_model, name_list)
+        experts = [get_attr(m, name_list) for m in finetuned_models]
+        try:
+            moe_linear = SmileMoELinear(
+                module,
+                experts,
+                gate_k=self.gate_k,
+                k=-1,  # we set k to -1 to use dense experts
+                top_k=len(finetuned_models),
+                routing_use_diff=True,
+                full_matrices=True,
+                upscaling_accelerator=self.upscaling_accelerator,
+            )
+        except ExpertNotTrainedError:
+            print(f"skip {name} because the experts are not trained.")
+            return
+        set_attr(pretrained_model, name_list, moe_linear)
+        # remove the original module from fine-tuned models to save memory
+        for m in finetuned_models:
+            set_attr(m, name_list, None)
+    def _average_experts(self, pretarined_model, finetuned_models, name: str):
+        name_list = name.split(".")
+        experts = [get_attr(m, name_list) for m in finetuned_models]
+        averaged_module = simple_average(experts)
+        set_attr(pretarined_model, name_list, averaged_module)
+    def _upscale_submodules(
+        self,
+        pretrained_model: nn.Module,
+        finetuned_models: List[nn.Module],
+        tqdm_desc: str = "Upscaling Linear Modules",
+    ):
+        """
+        Upscales the submodules of the pretrained model by merging them with the corresponding submodules from the fine-tuned models.
+        Args:
+            pretrained_model (nn.Module): The pretrained model.
+            finetuned_models (List[nn.Module]): A list of fine-tuned models.
+            tqdm_desc (str): Description for the tqdm progress bar.
+        """
+        for name, module in tqdm(
+            tuple(pretrained_model.named_modules()),
+            tqdm_desc,
+            leave=False,
+            dynamic_ncols=True,
+        ):
+            if isinstance(module, self._linear_layer_cls):
+                self._upscale_linear_layer(
+                    pretrained_model=pretrained_model,
+                    finetuned_models=finetuned_models,
+                    name=name,
+                )
+            elif self.average_experts and len(tuple(module.named_modules())) == 1:
+                # if the module is a leaf module, we perform a parameter average
+                self._average_experts(pretrained_model, finetuned_models, name)

fusion_bench/method/adamerging/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+# flake8: noqa F401
+from .clip_layer_wise_adamerging import CLIPLayerWiseAdaMergingAlgorithm
+from .clip_task_wise_adamerging import CLIPTaskWiseAdaMergingAlgorithm
+from .flan_t5_layer_wise_adamerging import FlanT5LayerWiseAdaMergingAlgorithm
+from .gpt2_layer_wise_adamerging import GPT2LayerWiseAdaMergingAlgorithm
+from .llama_adamerging import LayerWiseAdaMergingForLlamaSFT

fusion_bench/method/adamerging/clip_layer_wise_adamerging.py ADDED Viewed

@@ -0,0 +1,46 @@
+"""
+Example Usage:
+```bash
+fusion_bench \
+    method=adamerging \
+        method.name=clip_layer_wise_adamerging \
+        method.save_merging_weights=merging_weights.pt \
+    modelpool=clip-vit-base-patch32_TA8 \
+    taskpool=clip-vit-classification_TA8 \
+    fabric.loggers.root_dir=outputs/logs/ViT-B-32 \
+    fabric.loggers.name=clip_layer_wise_adamerging_adam
+```
+"""
+import functools
+import logging
+from torch.utils.data import DataLoader
+from fusion_bench.dataset.clip_dataset import CLIPDataset
+from fusion_bench.mixins import CLIPClassificationMixin
+from fusion_bench.utils.data import InfiniteDataLoader
+from .layer_wise_adamerging import LayerWiseAdaMergingAlgorithm
+log = logging.getLogger(__name__)
+class CLIPLayerWiseAdaMergingAlgorithm(
+    CLIPClassificationMixin,
+    LayerWiseAdaMergingAlgorithm,
+):
+    def on_test_time_adaptation_start(self):
+        """
+        Here we load the CLIP processor and construct the zero-shot classification head for each task.
+        """
+        self.setup_zero_shot_classification_head()
+    @functools.cache
+    def get_shuffled_test_loader_iter(self, task: str):
+        return super().get_shuffled_test_loader_iter(
+            task,
+            batch_size=self.config.batch_size,
+            num_workers=self.config.num_workers,
+        )