fusion-bench 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/__init__.py +20 -0
- fusion_bench/__main__.py +4 -0
- fusion_bench/compat/__init__.py +0 -0
- fusion_bench/compat/method/__init__.py +109 -0
- fusion_bench/compat/method/base_algorithm.py +58 -0
- fusion_bench/compat/modelpool/AutoModelForSeq2SeqLM.py +34 -0
- fusion_bench/compat/modelpool/__init__.py +116 -0
- fusion_bench/compat/modelpool/base_pool.py +328 -0
- fusion_bench/compat/modelpool/huggingface_clip_vision.py +178 -0
- fusion_bench/compat/taskpool/__init__.py +95 -0
- fusion_bench/compat/taskpool/base_pool.py +111 -0
- fusion_bench/compat/taskpool/clip_image_classification.py +210 -0
- fusion_bench/compat/taskpool/flan_t5_glue_text_generation.py +175 -0
- fusion_bench/constants/__init__.py +2 -0
- fusion_bench/constants/paths.py +18 -0
- fusion_bench/dataset/__init__.py +29 -0
- fusion_bench/dataset/arc_agi/__init__.py +6 -0
- fusion_bench/dataset/arc_agi/arc.py +308 -0
- fusion_bench/dataset/arc_agi/arc_agi.py +365 -0
- fusion_bench/dataset/arc_agi/augmenters.py +1036 -0
- fusion_bench/dataset/arc_agi/messagers.py +1355 -0
- fusion_bench/dataset/arc_agi/np_cache.py +168 -0
- fusion_bench/dataset/arc_agi/preprocess.py +298 -0
- fusion_bench/dataset/arc_agi/representers.py +1019 -0
- fusion_bench/dataset/clip_dataset.py +71 -0
- fusion_bench/dataset/fer2013.py +12 -0
- fusion_bench/dataset/gpt2_glue.py +300 -0
- fusion_bench/dataset/gsm8k.py +60 -0
- fusion_bench/dataset/image_dataset.py +55 -0
- fusion_bench/dataset/imdb.py +11 -0
- fusion_bench/dataset/llama/__init__.py +1 -0
- fusion_bench/dataset/llama/alpaca.py +232 -0
- fusion_bench/dataset/llama/collate.py +120 -0
- fusion_bench/dataset/llama/metamathqa.py +50 -0
- fusion_bench/dataset/llama/openai.py +160 -0
- fusion_bench/dataset/llama/preference_700k.py +70 -0
- fusion_bench/dataset/llama/sharegpt.py +141 -0
- fusion_bench/dataset/llama/squad.py +125 -0
- fusion_bench/dataset/llama/stanford_shp.py +90 -0
- fusion_bench/dataset/llama/ultrachat.py +58 -0
- fusion_bench/dataset/llama/utils/__init__.py +0 -0
- fusion_bench/dataset/llama/wikitext.py +89 -0
- fusion_bench/dataset/nyuv2.py +119 -0
- fusion_bench/method/__init__.py +177 -0
- fusion_bench/method/ada_svd/__init__.py +2 -0
- fusion_bench/method/ada_svd/clip_vision.py +319 -0
- fusion_bench/method/adamerging/__init__.py +6 -0
- fusion_bench/method/adamerging/clip_layer_wise_adamerging.py +46 -0
- fusion_bench/method/adamerging/clip_task_wise_adamerging.py +187 -0
- fusion_bench/method/adamerging/entropy_loss.py +25 -0
- fusion_bench/method/adamerging/flan_t5_layer_wise_adamerging.py +332 -0
- fusion_bench/method/adamerging/gpt2_layer_wise_adamerging.py +351 -0
- fusion_bench/method/adamerging/layer_wise_adamerging.py +252 -0
- fusion_bench/method/adamerging/llama_adamerging.py +335 -0
- fusion_bench/method/adamerging/min_norm_solvers.py +227 -0
- fusion_bench/method/adamerging/task_wise_adamerging.py +174 -0
- fusion_bench/method/adamerging/utils.py +15 -0
- fusion_bench/method/analysis/__init__.py +2 -0
- fusion_bench/method/analysis/task_vector_cos_similarity.py +172 -0
- fusion_bench/method/analysis/task_vector_violin_plot.py +205 -0
- fusion_bench/method/base_algorithm.py +44 -0
- fusion_bench/method/classification/__init__.py +3 -0
- fusion_bench/method/classification/clip_finetune.py +444 -0
- fusion_bench/method/classification/continual_clip_finetune.py +297 -0
- fusion_bench/method/concrete_subspace/__init__.py +6 -0
- fusion_bench/method/concrete_subspace/clip_concrete_adamerging.py +595 -0
- fusion_bench/method/concrete_subspace/clip_concrete_task_arithmetic.py +263 -0
- fusion_bench/method/dare/__init__.py +4 -0
- fusion_bench/method/dare/simple_average.py +31 -0
- fusion_bench/method/dare/task_arithmetic.py +82 -0
- fusion_bench/method/dare/ties_merging.py +100 -0
- fusion_bench/method/dare/utils.py +87 -0
- fusion_bench/method/dawe/__init__.py +2 -0
- fusion_bench/method/dawe/dawe_for_clip.py +274 -0
- fusion_bench/method/dawe/warppers/__init__.py +13 -0
- fusion_bench/method/dawe/warppers/dawe_model.py +256 -0
- fusion_bench/method/depth_upscaling/__init__.py +3 -0
- fusion_bench/method/depth_upscaling/depth_upscaling.py +89 -0
- fusion_bench/method/depth_upscaling/depth_upscaling_for_llama.py +57 -0
- fusion_bench/method/dummy.py +35 -0
- fusion_bench/method/ensemble.py +98 -0
- fusion_bench/method/fisher_merging/__init__.py +4 -0
- fusion_bench/method/fisher_merging/clip_fisher_merging.py +191 -0
- fusion_bench/method/fisher_merging/fisher_merging.py +484 -0
- fusion_bench/method/fisher_merging/gpt2_fisher_merging.py +193 -0
- fusion_bench/method/linear/__init__.py +6 -0
- fusion_bench/method/linear/expo.py +118 -0
- fusion_bench/method/linear/linear_interpolation.py +60 -0
- fusion_bench/method/linear/llama_expo.py +229 -0
- fusion_bench/method/linear/simple_average_for_llama.py +54 -0
- fusion_bench/method/linear/task_arithmetic_for_llama.py +57 -0
- fusion_bench/method/lm_finetune/__init__.py +3 -0
- fusion_bench/method/lm_finetune/bradley_terry_rm.py +432 -0
- fusion_bench/method/lm_finetune/causal_lm_pretrain.py +7 -0
- fusion_bench/method/lm_finetune/fullfinetune_sft.py +375 -0
- fusion_bench/method/lm_finetune/peftfinetune_sft.py +370 -0
- fusion_bench/method/mixture_of_experts/__init__.py +7 -0
- fusion_bench/method/mixture_of_experts/mixtral_merging.py +112 -0
- fusion_bench/method/mixture_of_experts/mixtral_upcycling.py +329 -0
- fusion_bench/method/model_recombination.py +121 -0
- fusion_bench/method/opcm/__init__.py +4 -0
- fusion_bench/method/opcm/opcm.py +277 -0
- fusion_bench/method/opcm/task_arithmetic.py +115 -0
- fusion_bench/method/opcm/ties_merging.py +156 -0
- fusion_bench/method/opcm/utils.py +73 -0
- fusion_bench/method/opcm/weight_average.py +120 -0
- fusion_bench/method/pruning/__init__.py +5 -0
- fusion_bench/method/pruning/llama_magnitude_prune.py +202 -0
- fusion_bench/method/pruning/llama_random_prune.py +143 -0
- fusion_bench/method/pruning/llama_wanda_prune.py +359 -0
- fusion_bench/method/pruning/magnitude_diff_pruning.py +180 -0
- fusion_bench/method/pruning/prune_utils.py +165 -0
- fusion_bench/method/pruning/wanda_utils/__init__.py +7 -0
- fusion_bench/method/pruning/wanda_utils/ablate.py +188 -0
- fusion_bench/method/pruning/wanda_utils/data.py +135 -0
- fusion_bench/method/pruning/wanda_utils/eval.py +245 -0
- fusion_bench/method/pruning/wanda_utils/layerwrapper.py +61 -0
- fusion_bench/method/pruning/wanda_utils/prune.py +581 -0
- fusion_bench/method/pruning/wanda_utils/prune_opt.py +539 -0
- fusion_bench/method/pruning/wanda_utils/sparsegpt.py +165 -0
- fusion_bench/method/pwe_moe/__init__.py +5 -0
- fusion_bench/method/pwe_moe/clip_pwe_moe.py +315 -0
- fusion_bench/method/pwe_moe/module.py +316 -0
- fusion_bench/method/pwe_moe/phn/__init__.py +2 -0
- fusion_bench/method/pwe_moe/phn/solvers.py +195 -0
- fusion_bench/method/pwe_moe/utils.py +43 -0
- fusion_bench/method/rankone_moe/__init__.py +3 -0
- fusion_bench/method/rankone_moe/clip_rankone_moe.py +160 -0
- fusion_bench/method/rankone_moe/rankone_moe.py +249 -0
- fusion_bench/method/regmean/__init__.py +4 -0
- fusion_bench/method/regmean/clip_regmean.py +131 -0
- fusion_bench/method/regmean/gpt2_regmean.py +147 -0
- fusion_bench/method/regmean/regmean.py +375 -0
- fusion_bench/method/simple_average.py +112 -0
- fusion_bench/method/slerp/__init__.py +2 -0
- fusion_bench/method/slerp/slerp.py +101 -0
- fusion_bench/method/slerp/slerp_utils.py +107 -0
- fusion_bench/method/smile_upscaling/__init__.py +3 -0
- fusion_bench/method/smile_upscaling/singular_projection_merging.py +198 -0
- fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py +331 -0
- fusion_bench/method/smile_upscaling/smile_upscaling.py +573 -0
- fusion_bench/method/sparse_we_moe/__init__.py +2 -0
- fusion_bench/method/sparse_we_moe/sparse_clip_we_moe.py +248 -0
- fusion_bench/method/sparse_we_moe/sparse_we_moe.py +301 -0
- fusion_bench/method/sparselo/__init__.py +2 -0
- fusion_bench/method/sparselo/sparselo.py +955 -0
- fusion_bench/method/surgery/__init__.py +1 -0
- fusion_bench/method/surgery/clip_layer_wise_adamerging_surgery.py +157 -0
- fusion_bench/method/tall_mask/__init__.py +0 -0
- fusion_bench/method/tall_mask/utils.py +234 -0
- fusion_bench/method/task_arithmetic/__init__.py +2 -0
- fusion_bench/method/task_arithmetic/task_arithmetic.py +151 -0
- fusion_bench/method/task_singular_vector/TSVC.py +16 -0
- fusion_bench/method/task_singular_vector/TSVM.py +63 -0
- fusion_bench/method/task_singular_vector/__init__.py +9 -0
- fusion_bench/method/task_singular_vector/utils/TSVC_utils.py +50 -0
- fusion_bench/method/task_singular_vector/utils/TSVM_utils.py +640 -0
- fusion_bench/method/task_singular_vector/utils/__init__.py +7 -0
- fusion_bench/method/ties_merging/__init__.py +2 -0
- fusion_bench/method/ties_merging/ties_merging.py +117 -0
- fusion_bench/method/ties_merging/ties_merging_utils.py +331 -0
- fusion_bench/method/trust_region/__init__.py +2 -0
- fusion_bench/method/trust_region/clip_task_arithmetic.py +205 -0
- fusion_bench/method/trust_region/utils.py +58 -0
- fusion_bench/method/we_moe/__init__.py +2 -0
- fusion_bench/method/we_moe/clip_we_moe.py +161 -0
- fusion_bench/method/we_moe/we_moe.py +247 -0
- fusion_bench/method/weighted_average/__init__.py +3 -0
- fusion_bench/method/weighted_average/llama.py +113 -0
- fusion_bench/method/weighted_average/weighted_average.py +102 -0
- fusion_bench/metrics/__init__.py +0 -0
- fusion_bench/metrics/continual_learning/backward_transfer.py +22 -0
- fusion_bench/metrics/nyuv2/__init__.py +11 -0
- fusion_bench/metrics/nyuv2/depth.py +45 -0
- fusion_bench/metrics/nyuv2/loss.py +31 -0
- fusion_bench/metrics/nyuv2/noise.py +16 -0
- fusion_bench/metrics/nyuv2/normal.py +48 -0
- fusion_bench/metrics/nyuv2/segmentation.py +43 -0
- fusion_bench/metrics/text_to_image_generation/__init__.py +9 -0
- fusion_bench/metrics/text_to_image_generation/aesthetic_scorer.py +123 -0
- fusion_bench/metrics/text_to_image_generation/compressibility.py +49 -0
- fusion_bench/metrics/text_to_image_generation/pickscore_scorer.py +95 -0
- fusion_bench/mixins/__init__.py +28 -0
- fusion_bench/mixins/clip_classification.py +252 -0
- fusion_bench/mixins/fabric_training.py +320 -0
- fusion_bench/mixins/lightning_fabric.py +174 -0
- fusion_bench/mixins/optim/__init__.py +0 -0
- fusion_bench/mixins/optim/adamw_with_warmup.py +42 -0
- fusion_bench/mixins/rich_live.py +21 -0
- fusion_bench/mixins/serialization.py +132 -0
- fusion_bench/mixins/simple_profiler.py +79 -0
- fusion_bench/modelpool/PeftModelForSeq2SeqLM.py +49 -0
- fusion_bench/modelpool/__init__.py +42 -0
- fusion_bench/modelpool/base_pool.py +268 -0
- fusion_bench/modelpool/causal_lm/__init__.py +2 -0
- fusion_bench/modelpool/causal_lm/causal_lm.py +139 -0
- fusion_bench/modelpool/clip_vision/__init__.py +1 -0
- fusion_bench/modelpool/clip_vision/modelpool.py +145 -0
- fusion_bench/modelpool/huggingface_automodel.py +20 -0
- fusion_bench/modelpool/huggingface_gpt2_classification.py +63 -0
- fusion_bench/modelpool/nyuv2_modelpool.py +40 -0
- fusion_bench/modelpool/seq2seq_lm/__init__.py +2 -0
- fusion_bench/modelpool/seq2seq_lm/modelpool.py +65 -0
- fusion_bench/modelpool/seq_classification_lm/__init__.py +2 -0
- fusion_bench/modelpool/seq_classification_lm/reward_model.py +15 -0
- fusion_bench/modelpool/seq_classification_lm/seq_classification_lm.py +98 -0
- fusion_bench/models/__init__.py +3 -0
- fusion_bench/models/chat_templates/__init__.py +1 -0
- fusion_bench/models/chat_templates/llama_3_Instruct.py +1 -0
- fusion_bench/models/chat_templates/load_tokenizer.py +43 -0
- fusion_bench/models/hf_clip.py +199 -0
- fusion_bench/models/linearized/__init__.py +0 -0
- fusion_bench/models/linearized/linearized_model_utils.py +91 -0
- fusion_bench/models/linearized/vision_model.py +122 -0
- fusion_bench/models/llama/__init__.py +16 -0
- fusion_bench/models/llama/model_utils/__init__.py +0 -0
- fusion_bench/models/llama/model_utils/embedding.py +87 -0
- fusion_bench/models/llama/model_utils/liger_kernel.py +86 -0
- fusion_bench/models/llama/model_utils/misc.py +112 -0
- fusion_bench/models/llama/model_utils/mod.py +52 -0
- fusion_bench/models/llama/model_utils/visual.py +241 -0
- fusion_bench/models/llama/patcher.py +78 -0
- fusion_bench/models/llama/tokenizer_loader.py +153 -0
- fusion_bench/models/masks/__init__.py +2 -0
- fusion_bench/models/masks/mask_model.py +160 -0
- fusion_bench/models/modeling_losparse_llama/__init__.py +4 -0
- fusion_bench/models/modeling_losparse_llama/configuration_losparse_llama.py +205 -0
- fusion_bench/models/modeling_losparse_llama/losparse_linear.py +67 -0
- fusion_bench/models/modeling_losparse_llama/modeling_losparse_llama.py +1825 -0
- fusion_bench/models/modeling_losparse_llama/register.py +8 -0
- fusion_bench/models/modeling_losparse_llama/utils.py +60 -0
- fusion_bench/models/modeling_smile_mistral/__init__.py +48 -0
- fusion_bench/models/modeling_smile_mistral/configuration_smile_mistral.py +21 -0
- fusion_bench/models/modeling_smile_mistral/modeling_smile_mistral.py +1034 -0
- fusion_bench/models/modeling_smile_mistral/register.py +8 -0
- fusion_bench/models/nyuv2/__init__.py +0 -0
- fusion_bench/models/nyuv2/aspp.py +82 -0
- fusion_bench/models/nyuv2/lightning_module.py +176 -0
- fusion_bench/models/nyuv2/resnet.py +405 -0
- fusion_bench/models/nyuv2/resnet_dilated.py +99 -0
- fusion_bench/models/parameter_dict.py +75 -0
- fusion_bench/models/rankone_moe.py +410 -0
- fusion_bench/models/separate_io.py +105 -0
- fusion_bench/models/smile_moe/__init__.py +0 -0
- fusion_bench/models/smile_moe/linear.py +256 -0
- fusion_bench/models/sparse_we_moe.py +459 -0
- fusion_bench/models/surgery/__init__.py +1 -0
- fusion_bench/models/surgery/surgerymodelwrapper.py +158 -0
- fusion_bench/models/utils.py +80 -0
- fusion_bench/models/we_moe.py +247 -0
- fusion_bench/models/wrappers/__init__.py +0 -0
- fusion_bench/models/wrappers/ensemble.py +183 -0
- fusion_bench/models/wrappers/layer_wise_fusion.py +336 -0
- fusion_bench/models/wrappers/task_wise_fusion.py +249 -0
- fusion_bench/optim/__init__.py +2 -0
- fusion_bench/optim/exception.py +47 -0
- fusion_bench/optim/lr_scheduler/__init__.py +1 -0
- fusion_bench/optim/lr_scheduler/linear_warmup.py +222 -0
- fusion_bench/optim/lr_scheduler/utils/__init__.py +1 -0
- fusion_bench/optim/lr_scheduler/utils/visualization.py +119 -0
- fusion_bench/optim/mezo.py +118 -0
- fusion_bench/programs/__init__.py +20 -0
- fusion_bench/programs/base_program.py +9 -0
- fusion_bench/programs/fabric_fusion_program.py +299 -0
- fusion_bench/scripts/__init__.py +0 -0
- fusion_bench/scripts/cli.py +43 -0
- fusion_bench/scripts/clip/__init__.py +0 -0
- fusion_bench/scripts/clip/convert_checkpoint.py +39 -0
- fusion_bench/scripts/imgui.py +218 -0
- fusion_bench/scripts/nyuv2_mtl_train.py +137 -0
- fusion_bench/scripts/webui.py +405 -0
- fusion_bench/taskpool/__init__.py +39 -0
- fusion_bench/taskpool/base_pool.py +35 -0
- fusion_bench/taskpool/clip_vision/__init__.py +4 -0
- fusion_bench/taskpool/clip_vision/clip_rankone_moe_taskpool.py +112 -0
- fusion_bench/taskpool/clip_vision/clip_sparse_wemoe_taskpool.py +120 -0
- fusion_bench/taskpool/clip_vision/taskpool.py +392 -0
- fusion_bench/taskpool/dummy.py +58 -0
- fusion_bench/taskpool/gpt2_text_classification.py +149 -0
- fusion_bench/taskpool/llama/__init__.py +1 -0
- fusion_bench/taskpool/llama/reward_model.py +157 -0
- fusion_bench/taskpool/llama/test_generation.py +185 -0
- fusion_bench/taskpool/nyuv2_taskpool.py +65 -0
- fusion_bench/tasks/__init__.py +2 -0
- fusion_bench/tasks/base_task.py +18 -0
- fusion_bench/tasks/classification.py +75 -0
- fusion_bench/tasks/clip_classification/__init__.py +183 -0
- fusion_bench/tasks/clip_classification/cifar10.py +33 -0
- fusion_bench/tasks/clip_classification/cifar100.py +146 -0
- fusion_bench/tasks/clip_classification/clip_dataset.py +1 -0
- fusion_bench/tasks/clip_classification/cub_200_2011.py +208 -0
- fusion_bench/tasks/clip_classification/dtd.py +60 -0
- fusion_bench/tasks/clip_classification/emnist_letters.py +31 -0
- fusion_bench/tasks/clip_classification/emnist_mnist.py +5 -0
- fusion_bench/tasks/clip_classification/eurosat.py +18 -0
- fusion_bench/tasks/clip_classification/fashion_mnist.py +18 -0
- fusion_bench/tasks/clip_classification/fer2013.py +18 -0
- fusion_bench/tasks/clip_classification/flower102.py +106 -0
- fusion_bench/tasks/clip_classification/food101.py +105 -0
- fusion_bench/tasks/clip_classification/gtsrb.py +51 -0
- fusion_bench/tasks/clip_classification/imagenet.py +2103 -0
- fusion_bench/tasks/clip_classification/kmnist.py +17 -0
- fusion_bench/tasks/clip_classification/mnist.py +5 -0
- fusion_bench/tasks/clip_classification/mongo_leaf_disease.py +19 -0
- fusion_bench/tasks/clip_classification/oxford_iiit_pet.py +41 -0
- fusion_bench/tasks/clip_classification/pcam.py +5 -0
- fusion_bench/tasks/clip_classification/rendered_sst2.py +3 -0
- fusion_bench/tasks/clip_classification/resisc45.py +68 -0
- fusion_bench/tasks/clip_classification/stanford_cars.py +209 -0
- fusion_bench/tasks/clip_classification/stl10.py +17 -0
- fusion_bench/tasks/clip_classification/sun397.py +404 -0
- fusion_bench/tasks/clip_classification/svhn.py +5 -0
- fusion_bench/tasks/clip_classification/tiny_imagenet.py +208 -0
- fusion_bench/tasks/flan_t5_text_generation/__init__.py +0 -0
- fusion_bench/tasks/flan_t5_text_generation/datasets_preprocess.py +71 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_evaluation.py +132 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_load_dataset.py +64 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py +379 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_prompt_templates.py +52 -0
- fusion_bench/utils/__init__.py +14 -0
- fusion_bench/utils/auto.py +31 -0
- fusion_bench/utils/cache_utils.py +58 -0
- fusion_bench/utils/data.py +165 -0
- fusion_bench/utils/devices.py +231 -0
- fusion_bench/utils/dict.py +43 -0
- fusion_bench/utils/dtype.py +146 -0
- fusion_bench/utils/expr.py +90 -0
- fusion_bench/utils/fabric.py +17 -0
- fusion_bench/utils/functools.py +37 -0
- fusion_bench/utils/hydra_utils.py +28 -0
- fusion_bench/utils/instantiate.py +450 -0
- fusion_bench/utils/json.py +93 -0
- fusion_bench/utils/lazy_imports.py +74 -0
- fusion_bench/utils/misc.py +18 -0
- fusion_bench/utils/packages.py +84 -0
- fusion_bench/utils/parameters.py +323 -0
- fusion_bench/utils/path.py +22 -0
- fusion_bench/utils/plot/__init__.py +0 -0
- fusion_bench/utils/plot/color_data.py +1726 -0
- fusion_bench/utils/plot/token.py +52 -0
- fusion_bench/utils/plot/token_notebook.py +127 -0
- fusion_bench/utils/pylogger.py +55 -0
- fusion_bench/utils/rich_utils.py +201 -0
- fusion_bench/utils/set.py +8 -0
- fusion_bench/utils/state_dict_arithmetic.py +297 -0
- fusion_bench/utils/strenum/__init__.py +326 -0
- fusion_bench/utils/strenum/_name_mangler.py +127 -0
- fusion_bench/utils/strenum/_version.py +556 -0
- fusion_bench/utils/tensorboard.py +51 -0
- fusion_bench/utils/timer.py +49 -0
- fusion_bench/utils/type.py +34 -0
- fusion_bench-0.2.9.dist-info/LICENSE +21 -0
- fusion_bench-0.2.9.dist-info/METADATA +258 -0
- fusion_bench-0.2.9.dist-info/RECORD +727 -0
- fusion_bench-0.2.9.dist-info/WHEEL +5 -0
- fusion_bench-0.2.9.dist-info/entry_points.txt +3 -0
- fusion_bench-0.2.9.dist-info/top_level.txt +1 -0
- fusion_bench_config/README.md +12 -0
- fusion_bench_config/clip-vit-base-patch32_robustness_corrupted.yaml +23 -0
- fusion_bench_config/dataset/image_classification/README.md +6 -0
- fusion_bench_config/dataset/image_classification/test/TALL14.yaml +20 -0
- fusion_bench_config/dataset/image_classification/test/TALL20.yaml +28 -0
- fusion_bench_config/dataset/image_classification/test/cifar10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/cifar100.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/cub-200-2011.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/dtd.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/emnist_letters.yaml +5 -0
- fusion_bench_config/dataset/image_classification/test/emnist_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/eurosat.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/fashion_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/fer2013.yaml +3 -0
- fusion_bench_config/dataset/image_classification/test/food101.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/gtsrb.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/kmnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/mango-leaf-disease.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/oxford-iiit-pet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/oxford_flowers102.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/pcam.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/rendered-sst2.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/resisc45.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/stanford-cars.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/stl10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/sun397.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/svhn.yaml +6 -0
- fusion_bench_config/dataset/image_classification/test/the_eight_tasks.yaml +9 -0
- fusion_bench_config/dataset/image_classification/test/tiny-imagenet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/TALL14.yaml +20 -0
- fusion_bench_config/dataset/image_classification/train/TALL20.yaml +28 -0
- fusion_bench_config/dataset/image_classification/train/cifar10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/cifar100.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/cub-200-2011.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/dtd.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/emnist_letters.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/emnist_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/eurosat.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/fashion_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/fer2013.yaml +3 -0
- fusion_bench_config/dataset/image_classification/train/food101.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/gtsrb.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/kmnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/mango-leaf-disease.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/oxford-iiit-pet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/oxford_flowers102.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/pcam.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/rendered-sst2.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/resisc45.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/stanford-cars.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/stl10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/sun397.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/svhn.yaml +6 -0
- fusion_bench_config/dataset/image_classification/train/the_eight_tasks.yaml +9 -0
- fusion_bench_config/dataset/image_classification/train/tiny-imagenet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/val/dtd.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/eurosat.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/gtsrb.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/mnist.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/resisc45.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/stanford-cars.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/sun397.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/svhn.yaml +12 -0
- fusion_bench_config/dataset/image_classification/val/the_eight_tasks.yaml +9 -0
- fusion_bench_config/dataset/llm_sft/alpaca_cleaned.yaml +6 -0
- fusion_bench_config/dataset/llm_sft/ultrachat_200k.yaml +3 -0
- fusion_bench_config/dataset/question_answering/search_qa.yaml +6 -0
- fusion_bench_config/dataset/question_answering/test/search_qa.yaml +7 -0
- fusion_bench_config/dataset/question_answering/train/MetaMathQA.yaml +4 -0
- fusion_bench_config/dataset/question_answering/train/search_qa.yaml +7 -0
- fusion_bench_config/dataset/question_answering/val/search_qa.yaml +7 -0
- fusion_bench_config/dataset/summarization/test/xsum.yaml +4 -0
- fusion_bench_config/dataset/summarization/train/xsum.yaml +4 -0
- fusion_bench_config/dataset/summarization/val/xsum.yaml +4 -0
- fusion_bench_config/dataset/summarization/xsum.yaml +3 -0
- fusion_bench_config/dataset/text_generation/test/gsm-hard.yaml +4 -0
- fusion_bench_config/dataset/text_generation/test/gsm8k.yaml +5 -0
- fusion_bench_config/dataset/text_generation/test/gsm8k_question_label.yaml +3 -0
- fusion_bench_config/dataset/text_generation/train/CodeAlpaca-20k.yaml +4 -0
- fusion_bench_config/dataset/text_generation/train/gsm8k.yaml +5 -0
- fusion_bench_config/dataset/text_generation/train/gsm8k_question_label.yaml +3 -0
- fusion_bench_config/fabric/auto.yaml +16 -0
- fusion_bench_config/fabric/llama_ddp.yaml +18 -0
- fusion_bench_config/fabric/llama_fsdp.yaml +16 -0
- fusion_bench_config/fabric/llama_peft_fsdp.yaml +16 -0
- fusion_bench_config/fabric/loggers/csv_logger.yaml +11 -0
- fusion_bench_config/fabric/loggers/tensorboard_logger.yaml +11 -0
- fusion_bench_config/fabric/loggers/wandb_logger.yaml +2 -0
- fusion_bench_config/fabric/strategy/deepspeed.yaml +10 -0
- fusion_bench_config/fabric/strategy/llama_fsdp.yaml +8 -0
- fusion_bench_config/fabric/strategy/llama_peft_fsdp.yaml +9 -0
- fusion_bench_config/fabric_model_fusion.yaml +20 -0
- fusion_bench_config/hydra/default.yaml +8 -0
- fusion_bench_config/hydra/help/fusion_bench_help.yaml +47 -0
- fusion_bench_config/hydra/job_logging/rich_logging.yaml +20 -0
- fusion_bench_config/llama_full_finetune.yaml +19 -0
- fusion_bench_config/llama_magnitude_pruning.yaml +16 -0
- fusion_bench_config/llama_model_fusion.yaml +17 -0
- fusion_bench_config/method/ada_svd/clip_vision.yaml +9 -0
- fusion_bench_config/method/adamerging/clip.yaml +23 -0
- fusion_bench_config/method/adamerging/layer_wise_flan_t5.yaml +23 -0
- fusion_bench_config/method/adamerging/layer_wise_gpt2.yaml +23 -0
- fusion_bench_config/method/adamerging/llama_sft.yaml +33 -0
- fusion_bench_config/method/adamerging.yaml +23 -0
- fusion_bench_config/method/analysis/task_vector_cos_similarity.yaml +6 -0
- fusion_bench_config/method/analysis/task_vector_violin_plot.yaml +6 -0
- fusion_bench_config/method/classification/clip_continual_finetune.yaml +28 -0
- fusion_bench_config/method/classification/clip_finetune.yaml +26 -0
- fusion_bench_config/method/clip_finetune.yaml +26 -0
- fusion_bench_config/method/concrete_subspace/clip_concrete_layer_wise_adamerging.yaml +27 -0
- fusion_bench_config/method/concrete_subspace/clip_concrete_task_arithmetic.yaml +25 -0
- fusion_bench_config/method/concrete_subspace/clip_concrete_task_wise_adamerging.yaml +27 -0
- fusion_bench_config/method/dare/simple_average.yaml +5 -0
- fusion_bench_config/method/dare/task_arithmetic.yaml +6 -0
- fusion_bench_config/method/dare/ties_merging.yaml +15 -0
- fusion_bench_config/method/dawe/dawe_for_clip.yaml +32 -0
- fusion_bench_config/method/depth_upscaling.yaml +5 -0
- fusion_bench_config/method/dummy.yaml +1 -0
- fusion_bench_config/method/ensemble/max_model_predictor.yaml +1 -0
- fusion_bench_config/method/ensemble/simple_ensemble.yaml +2 -0
- fusion_bench_config/method/ensemble/weighted_ensemble.yaml +6 -0
- fusion_bench_config/method/fisher_merging/clip_fisher_merging.yaml +13 -0
- fusion_bench_config/method/fisher_merging/fisher_merging.yaml +9 -0
- fusion_bench_config/method/fisher_merging/gpt2_fisher_merging.yaml +12 -0
- fusion_bench_config/method/linear/expo.yaml +8 -0
- fusion_bench_config/method/linear/linear_interpolation.yaml +3 -0
- fusion_bench_config/method/linear/llama_expo.yaml +19 -0
- fusion_bench_config/method/linear/llama_expo_with_dare.yaml +19 -0
- fusion_bench_config/method/linear/simple_average_for_llama.yaml +5 -0
- fusion_bench_config/method/linear/task_arithmetic_for_llama.yaml +4 -0
- fusion_bench_config/method/linear/weighted_average.yaml +6 -0
- fusion_bench_config/method/linear/weighted_average_for_llama.yaml +12 -0
- fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml +47 -0
- fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml +47 -0
- fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml +63 -0
- fusion_bench_config/method/mixtral_moe_merging.yaml +4 -0
- fusion_bench_config/method/mixtral_moe_upscaling.yaml +7 -0
- fusion_bench_config/method/model_recombination.yaml +4 -0
- fusion_bench_config/method/opcm/opcm.yaml +12 -0
- fusion_bench_config/method/opcm/task_arithmetic.yaml +12 -0
- fusion_bench_config/method/opcm/ties_merging.yaml +18 -0
- fusion_bench_config/method/opcm/weight_average.yaml +10 -0
- fusion_bench_config/method/pruning/llama_magnitude_pruning.yaml +14 -0
- fusion_bench_config/method/pruning/llama_random_pruning.yaml +9 -0
- fusion_bench_config/method/pruning/llama_wanda_pruning.yaml +16 -0
- fusion_bench_config/method/pruning/magnitude_diff_pruning.yaml +5 -0
- fusion_bench_config/method/pwe_moe_ls_for_clip.yaml +22 -0
- fusion_bench_config/method/rankone_moe/rankone_moe.yaml +26 -0
- fusion_bench_config/method/regmean/clip_regmean.yaml +11 -0
- fusion_bench_config/method/regmean/gpt2_regmean.yaml +12 -0
- fusion_bench_config/method/regmean/regmean.yaml +4 -0
- fusion_bench_config/method/simple_average.yaml +1 -0
- fusion_bench_config/method/slerp/slerp.yaml +6 -0
- fusion_bench_config/method/smile_upscaling/singular_projection_merging.yaml +8 -0
- fusion_bench_config/method/smile_upscaling/smile_mistral_upscaling.yaml +10 -0
- fusion_bench_config/method/smile_upscaling/smile_upscaling.yaml +14 -0
- fusion_bench_config/method/sparselo_pruning/llama_iterative_sparselo.yaml +20 -0
- fusion_bench_config/method/sparselo_pruning/llama_pcp_sparselo.yaml +20 -0
- fusion_bench_config/method/sparselo_pruning/llama_sparselo.yaml +19 -0
- fusion_bench_config/method/surgery/adamerging_surgery.yaml +27 -0
- fusion_bench_config/method/task_arithmetic.yaml +2 -0
- fusion_bench_config/method/task_singular_vector/TaskSingularVectorMerging.yaml +2 -0
- fusion_bench_config/method/ties_merging.yaml +8 -0
- fusion_bench_config/method/trust_region/clip_task_arithmetic.yaml +7 -0
- fusion_bench_config/method/wemoe/sparse_weight_ensembling_moe.yaml +39 -0
- fusion_bench_config/method/wemoe/weight_ensembling_moe.yaml +20 -0
- fusion_bench_config/model/clip-vit/README.md +38 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_TALL14.yaml +22 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_TALL20.yaml +29 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_cifar10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_cifar100.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_dtd.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_eight_tasks.yaml +10 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_emnist_letters.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_eurosat.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_fashion_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_fer2013.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_food101.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_gtsrb.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_kmnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_oxford-iiit-pet.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_oxford_flowers102.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_pcam.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_rendered-sst2.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_resisc45.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_stanford-cars.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_stl10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_sun397.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_svhn.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL14.yaml +22 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL20.yaml +29 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_cifar10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_cifar100.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_dtd.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_eight_tasks.yaml +11 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_emnist_letters.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_eurosat.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_fashion_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_fer2013.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_food101.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_gtsrb.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_kmnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_oxford-iiit-pet.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_oxford_flowers102.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_pcam.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_rendered-sst2.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_resisc45.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_stanford-cars.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_stl10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_sun397.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_svhn.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_TALL14.yaml +22 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_TALL20.yaml +29 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_cifar10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_cifar100.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_dtd.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_eight_tasks.yaml +10 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_emnist_letters.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_eurosat.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_fashion_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_fer2013.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_food101.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_gtsrb.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_kmnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford-iiit-pet.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford_flowers102.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_pcam.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_rendered-sst2.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_resisc45.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stanford-cars.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stl10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_sun397.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_svhn.yaml +1 -0
- fusion_bench_config/model/clip-vit/download_TALL20_models.sh +6 -0
- fusion_bench_config/model/clip-vit/generate_vit_model_config.sh +23 -0
- fusion_bench_config/model/flan-t5/flan-t5-base.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-cola.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-cola_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mnli.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mrpc.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mrpc_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qnli.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qqp.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qqp_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-rte.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-rte_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-sst2.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-sst2_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-stsb.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-stsb_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-cola_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-mnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-mrpc_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-qnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-qqp_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-rte_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-sst2_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-stsb_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/generate_flan-t5.sh +38 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/_template.yaml +12 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8_lora.yaml +53 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL14.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL14_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL20.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL20_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_individual.yaml +19 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_individual_lora.yaml +14 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8.yaml +5 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8_control_task.yaml +24 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8_model_only.yaml +3 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL14.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL14_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL20.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL20_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_generalization_exp1.yaml +24 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_generalization_exp2.yaml +24 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_individual.yaml +13 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_mtl.yaml +5 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_robustness_clean.yaml +18 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_robustness_corrupted.yaml +29 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_finetuned.yaml +5 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_task_projection.yaml +15 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_svhn_and_mnist.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_two_tasks_control_task.yaml +18 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL14.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL14_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL20.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL20_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_individual.yaml +19 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_codealpaca.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_for_causallm.yaml +20 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_metamathqa.yaml +19 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_ultrachat.yaml +18 -0
- fusion_bench_config/modelpool/CausalLMPool/simle_mixtral_exp_v4.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/single_llama_model.yaml +17 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/_template.yaml +8 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue.yaml +13 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_lora16.yaml +41 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_lora16_tta.yaml +68 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_individual.yaml +7 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-large_glue_lora16.yaml +45 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/llama_preference700k.yaml +23 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/single_reward_model.yaml +14 -0
- fusion_bench_config/modelpool/automodelpool.yaml +12 -0
- fusion_bench_config/modelpool/gpt-2_glue.yaml +64 -0
- fusion_bench_config/modelpool/mixtral_moe_merging.yaml +14 -0
- fusion_bench_config/modelpool/mixtral_moe_upscaling.yaml +6 -0
- fusion_bench_config/modelpool/nyuv2_modelpool.yaml +26 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v1.yaml +9 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v2.yaml +9 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v3.yaml +9 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v4.yaml +13 -0
- fusion_bench_config/nyuv2_config.yaml +17 -0
- fusion_bench_config/nyuv2_mtl_train.yaml +32 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/_template.yaml +31 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-base-patch32_robustness_corrupted.yaml +27 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8.yaml +11 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_B16.yaml +31 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_L14.yaml +12 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_val.yaml +12 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_with_control_task.yaml +12 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TALL14.yaml +19 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TALL20.yaml +26 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_cifar10.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_cifar100.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_dtd.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_emnist_letters.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_eurosat.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_fashion_mnist.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_fer2013.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_food101.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_gtsrb.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_kmnist.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_mnist.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford-iiit-pet.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford_flowers102.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford_flowers102_val.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_pcam.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_rendered-sst2.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_resisc45.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_stanford-cars.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_stl10.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_sun397.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_svhn.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_rankone_wemoe_clip-vit-classification_TA8.yaml +18 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_sparse_wemoe_clip-vit-classification_TA8.yaml +18 -0
- fusion_bench_config/taskpool/clip-vit-base-patch32_robustness_clean.yaml +24 -0
- fusion_bench_config/taskpool/clip-vit-base-patch32_robustness_corrupted.yaml +27 -0
- fusion_bench_config/taskpool/clip-vit-base-patch32_svhn_and_mnist.yaml +22 -0
- fusion_bench_config/taskpool/dummy.yaml +2 -0
- fusion_bench_config/taskpool/flan-t5_glue_text_generation.yaml +44 -0
- fusion_bench_config/taskpool/gpt-2_glue.yaml +39 -0
- fusion_bench_config/taskpool/nyuv2_taskpool.yaml +9 -0
- fusion_bench_config/taskpool/reward_model_evaluation.yaml +18 -0
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
import torch
|
|
6
|
+
from omegaconf import DictConfig
|
|
7
|
+
from torch import Tensor
|
|
8
|
+
from torch.utils.data import DataLoader
|
|
9
|
+
from transformers import CLIPModel, CLIPProcessor
|
|
10
|
+
|
|
11
|
+
from fusion_bench.dataset import CLIPDataset
|
|
12
|
+
from fusion_bench.modelpool import CLIPVisionModelPool
|
|
13
|
+
from fusion_bench.models.hf_clip import HFCLIPClassifier
|
|
14
|
+
from fusion_bench.tasks.clip_classification import get_classnames_and_templates
|
|
15
|
+
from fusion_bench.utils import timeit_context
|
|
16
|
+
|
|
17
|
+
from .task_wise_adamerging import TaskWiseAdaMergingAlgorithm
|
|
18
|
+
|
|
19
|
+
log = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class InfiniteDataLoader:
    """Endlessly iterate over a ``DataLoader``.

    Wraps a finite ``DataLoader`` so that reaching the end of the underlying
    dataset transparently restarts iteration from the beginning. This is
    handy when training is driven by a fixed number of steps rather than a
    number of epochs.

    Attributes:
        data_loader: The wrapped ``DataLoader``.
        data_iter: The current iterator over ``data_loader``.
    """

    def __init__(self, data_loader):
        self.data_loader = data_loader
        self.data_iter = iter(data_loader)

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.data_iter)
        except StopIteration:
            # Exhausted the underlying loader: restart and yield the first
            # batch of the new pass.
            self.data_iter = iter(self.data_loader)
            return next(self.data_iter)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class CLIPTaskWiseAdaMergingAlgorithm(TaskWiseAdaMergingAlgorithm):
    """
    A class for task-wise adaptive merging of CLIP models.

    This class extends the TaskWiseAdaMergingAlgorithm to provide specific
    functionality for CLIP models, including loading datasets, constructing
    zero-shot classification heads, and computing logits.

    Attributes:
        modelpool (CLIPVisionModelPool): The model pool containing CLIP models.
        _clip_processor (CLIPProcessor): The CLIP processor for preparing inputs.
        zeroshot_weights (dict): Per-task zero-shot classification weights.
    """

    modelpool: CLIPVisionModelPool = None
    _clip_processor: CLIPProcessor = None

    def __init__(self, algorithm_config: DictConfig):
        super().__init__(algorithm_config)
        # Instance-level state. `zeroshot_weights` used to be a mutable class
        # attribute, silently shared by every instance of this class and its
        # subclasses -- a shared-mutable-state bug.
        self.zeroshot_weights = {}
        # Explicit per-instance caches replacing `@functools.cache` on the
        # methods below: caching on a bound method keys the cache on `self`
        # and keeps each instance alive for the lifetime of the process
        # (ruff B019). The per-task semantics are preserved.
        self._test_dataset_cache: dict = {}
        self._test_loader_iter_cache: dict = {}

    def get_test_dataset(self, task: str):
        """
        Load the test dataset for the task.
        The result is cached per task, so the dataset is loaded only once.

        Args:
            task (str): The name of the task.

        Returns:
            CLIPDataset: The test dataset for the task.
        """
        if task not in self._test_dataset_cache:
            log.info(f"Loading test dataset: {task}")
            dataset = self.modelpool.load_test_dataset(task)
            self._test_dataset_cache[task] = CLIPDataset(dataset, self._clip_processor)
        return self._test_dataset_cache[task]

    def get_shuffled_test_loader_iter(self, task: str):
        """
        Get an iterator over the shuffled test DataLoader for the task.

        The iterator is cached per task so that successive calls keep
        advancing the *same* infinite iterator instead of restarting it.

        Args:
            task (str): The name of the task.

        Returns:
            iterator: An iterator over the shuffled test DataLoader.
        """
        if task not in self._test_loader_iter_cache:
            loader = DataLoader(
                self.get_test_dataset(task),
                batch_size=self.config.batch_size,
                shuffle=True,
                num_workers=self.config.num_workers,
                pin_memory=True,
            )
            if self._fabric is not None:
                loader = self._fabric.setup_dataloaders(loader)
            self._test_loader_iter_cache[task] = iter(InfiniteDataLoader(loader))
        return self._test_loader_iter_cache[task]

    def on_test_time_adaptation_start(self):
        """
        Prepare for test-time adaptation.

        This method loads the CLIP processor and constructs the zero-shot
        classification head for each task. Computed heads are cached on disk
        under ``self.config.cache_dir``.
        """
        clip_model_config = self.modelpool.get_model_config("_pretrained_")
        # Support both config layouts: `pretrained_model_name_or_path` and
        # the legacy `path` attribute.
        pretrained_path = (
            clip_model_config.pretrained_model_name_or_path
            if hasattr(clip_model_config, "pretrained_model_name_or_path")
            else clip_model_config.path
        )

        with timeit_context("Loading CLIP processor and pretrained CLIP model."):
            self._clip_processor = CLIPProcessor.from_pretrained(pretrained_path)
            clip_model: CLIPModel = CLIPModel.from_pretrained(pretrained_path)

            clip_classifier = HFCLIPClassifier(clip_model, self._clip_processor)
            # The projection and temperature are frozen: only the merging
            # weights are adapted at test time.
            self.visual_projection = clip_model.visual_projection.requires_grad_(False)
            self.logit_scale_exp = clip_model.logit_scale.exp()
            if self._fabric is not None:
                self.visual_projection = self._fabric.to_device(self.visual_projection)
                self.logit_scale_exp = self._fabric.to_device(self.logit_scale_exp)

        # Ensure the cache directory exists up front; previously the first
        # `torch.save` below crashed when `cache_dir` was missing.
        os.makedirs(self.config.cache_dir, exist_ok=True)
        for task in self.modelpool.model_names:
            cache_file = os.path.join(
                self.config.cache_dir,
                f"{os.path.basename(pretrained_path)}_{task}_zeroshot_weights.pt",
            )
            if os.path.exists(cache_file):
                log.info(f"Loading cached zeroshot weights for task: {task}")
                zeroshot_weights = torch.load(cache_file, map_location="cpu")
            else:
                log.info(f"Construct zero shot classification head for task: {task}")
                classnames, templates = get_classnames_and_templates(task)
                clip_classifier.set_classification_task(classnames, templates)
                zeroshot_weights = clip_classifier.zeroshot_weights
                log.info(f"save zeroshot weights to {cache_file}")
                torch.save(zeroshot_weights, cache_file)
            self.zeroshot_weights[task] = zeroshot_weights
            if self._fabric is not None:
                self.zeroshot_weights[task] = self._fabric.to_device(
                    self.zeroshot_weights[task]
                )

    def compute_logits(self, module, batch, task: str) -> Tensor:
        """
        Compute the classification logits for the given batch and task.

        This method computes the image embeddings, normalizes them, and calculates
        the cosine similarity with the text embeddings to produce classification logits.

        Args:
            module (nn.Module): The vision model to evaluate.
            batch (tuple): A batch of input data; only the images are used,
                labels are ignored.
            task (str): The name of the task.

        Returns:
            Tensor: The classification logits for the batch.
        """
        images, _ = batch
        text_embeds = self.zeroshot_weights[task]

        # index [1] of the encoder output feeds the visual projection
        # (assumed to be the pooled output -- TODO confirm against the
        # vision encoder's return signature)
        image_embeds = module(images)[1]
        image_embeds = self.visual_projection(image_embeds)

        # normalize embeddings
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity, scaled by the model's learned temperature
        logits_per_text = (
            torch.matmul(text_embeds, image_embeds.t()) * self.logit_scale_exp
        )
        logits_per_image = logits_per_text.t()

        return logits_per_image
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
from torch import Tensor
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def entropy_loss(logits: Tensor, eps: float = 1e-8) -> Tensor:
    """
    Mean Shannon entropy of the softmax distributions defined by ``logits``.

    Args:
        logits (Tensor): A 2-D tensor of shape (batch, num_classes).
        eps (float): A small value to avoid log(0). Default is 1e-8.

    Returns:
        Tensor: Scalar tensor holding the mean entropy over the batch.
    """
    # Only 2-D inputs are supported: one probability distribution per row.
    assert (
        logits.dim() == 2
    ), f"Expected logits to have 2 dimensions, found {logits.dim()}, {logits.size()=}"

    probabilities = logits.softmax(dim=-1)
    # H(p) = -sum_i p_i * log(p_i + eps), computed per row.
    per_sample_entropy = -(probabilities * torch.log(probabilities + eps)).sum(dim=-1)
    return per_sample_entropy.mean()
|
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This is an experimental implementation of the Layer-Wise AdaMerging Algorithm for Flan-T5 models.
|
|
3
|
+
The efficiency of the algorithm is not guaranteed, and it may not work as expected.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import functools
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
from abc import abstractmethod
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any, Dict, List, Mapping, Optional, Union, cast # noqa: F401
|
|
12
|
+
|
|
13
|
+
import torch
|
|
14
|
+
from lightning.fabric.utilities.rank_zero import rank_zero_only
|
|
15
|
+
from omegaconf import DictConfig
|
|
16
|
+
from torch import Tensor, nn
|
|
17
|
+
from torch.utils.data import DataLoader
|
|
18
|
+
from tqdm.autonotebook import tqdm
|
|
19
|
+
from transformers import T5ForConditionalGeneration
|
|
20
|
+
from transformers.data import default_data_collator
|
|
21
|
+
|
|
22
|
+
from fusion_bench.method import BaseAlgorithm
|
|
23
|
+
from fusion_bench.method.simple_average import simple_average
|
|
24
|
+
from fusion_bench.mixins.lightning_fabric import LightningFabricMixin
|
|
25
|
+
from fusion_bench.mixins.simple_profiler import SimpleProfilerMixin
|
|
26
|
+
from fusion_bench.modelpool import Seq2SeqLMPool
|
|
27
|
+
from fusion_bench.models.wrappers.layer_wise_fusion import (
|
|
28
|
+
LayerWiseMergedModel,
|
|
29
|
+
get_layer_wise_weights,
|
|
30
|
+
)
|
|
31
|
+
from fusion_bench.utils.data import InfiniteDataLoader, load_tensor_from_file
|
|
32
|
+
from fusion_bench.utils.instantiate import instantiate
|
|
33
|
+
|
|
34
|
+
from .entropy_loss import entropy_loss
|
|
35
|
+
from .min_norm_solvers import MinNormSolver
|
|
36
|
+
from .utils import get_memory_usage
|
|
37
|
+
|
|
38
|
+
log = logging.getLogger(__name__)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class FlanT5LayerWiseAdaMergingAlgorithm(
|
|
42
|
+
BaseAlgorithm,
|
|
43
|
+
LightningFabricMixin,
|
|
44
|
+
SimpleProfilerMixin,
|
|
45
|
+
):
|
|
46
|
+
|
|
47
|
+
    def __init__(
        self,
        optimizer: DictConfig,
        dataloader_kwargs: DictConfig,
        init_values: float,
        max_steps: int,
        merging_weights_load_path: Optional[Union[str, Path]] = None,
        merging_weights_save_path: Optional[Union[str, Path]] = None,
        clamp_weights: bool = False,
        tie_weights: bool = True,
        strict: bool = False,
        cache_dir: str = "outputs/cache",
        variant: Optional[str] = None,
        **kwargs,
    ):
        """
        Initialize the layer-wise AdaMerging algorithm for Flan-T5 models.

        Args:
            optimizer (DictConfig): Configuration used to build the optimizer
                for test-time adaptation (stored, not instantiated here).
            dataloader_kwargs (DictConfig): Keyword arguments forwarded to the
                test ``DataLoader``.
            init_values (float): Initial value for each layer-wise merging weight.
            max_steps (int): Number of test-time adaptation steps.
            merging_weights_load_path: If given, merging weights are loaded from
                this file and test-time adaptation is skipped.
            merging_weights_save_path: If given, learned merging weights are
                saved to this path after adaptation.
            clamp_weights (bool): Forwarded to ``LayerWiseMergedModel``.
            tie_weights (bool): Forwarded to ``LayerWiseMergedModel``.
            strict (bool): Forwarded to ``LayerWiseMergedModel``.
            cache_dir (str): Directory for cached artifacts.
            variant (Optional[str]): Variant selector; not used in this
                constructor -- presumably consumed by the adaptation loop
                (TODO confirm).
            **kwargs: Forwarded to ``BaseAlgorithm.__init__``.
        """
        # Stored under a private name; presumably instantiated later via
        # `instantiate` during test-time adaptation (TODO confirm).
        self._optimizer = optimizer
        self.dataloader_kwargs = dataloader_kwargs
        self.init_values = init_values
        self.merging_weights_load_path = merging_weights_load_path
        self.merging_weights_save_path = merging_weights_save_path
        self.clamp_weights = clamp_weights
        self.tie_weights = tie_weights
        self.strict = strict
        self.max_steps = max_steps
        self.cache_dir = cache_dir
        self.variant = variant
        super().__init__(**kwargs)
|
|
74
|
+
|
|
75
|
+
@torch.no_grad()
|
|
76
|
+
def construct_layer_wise_merged_model(self, modelpool: Seq2SeqLMPool):
|
|
77
|
+
"""
|
|
78
|
+
Constructs a wrapped layer-wise merged model from model pool.
|
|
79
|
+
|
|
80
|
+
This method creates a new wrapped model by merging the layers of a pretrained model with those of several fine-tuned models.
|
|
81
|
+
The merging is controlled by layer-wise weights, which is a `torch.Tensor` of the shape `(num_models, num_layers)`.
|
|
82
|
+
The merging weights can be initialized based on a provided configuration or loaded from a file.
|
|
83
|
+
|
|
84
|
+
Args:
|
|
85
|
+
modelpool (ModelPool): An object containing the pretrained model and fine-tuned models to be merged.
|
|
86
|
+
|
|
87
|
+
Returns:
|
|
88
|
+
LayerWiseMergedModel: An instance of the merged model with layer-wise weights applied.
|
|
89
|
+
"""
|
|
90
|
+
pretrained_model = modelpool.load_model("_pretrained_")
|
|
91
|
+
finetuned_models = [
|
|
92
|
+
modelpool.load_model(name) for name in modelpool.model_names
|
|
93
|
+
]
|
|
94
|
+
|
|
95
|
+
# initialize layer-wise weights using the provided configuration `init_values` or load from file if `weights` is provided
|
|
96
|
+
if self.merging_weights_load_path is None:
|
|
97
|
+
layer_wise_weight = get_layer_wise_weights(
|
|
98
|
+
num_models=len(modelpool.model_names),
|
|
99
|
+
num_layers=len(
|
|
100
|
+
tuple(
|
|
101
|
+
filter(lambda p: p.requires_grad, pretrained_model.parameters())
|
|
102
|
+
)
|
|
103
|
+
),
|
|
104
|
+
init_values=self.init_values,
|
|
105
|
+
)
|
|
106
|
+
else:
|
|
107
|
+
if isinstance(self.merging_weights_load_path, str):
|
|
108
|
+
# load the merging weights from a file
|
|
109
|
+
layer_wise_weight = load_tensor_from_file(
|
|
110
|
+
self.merging_weights_load_path
|
|
111
|
+
)
|
|
112
|
+
else:
|
|
113
|
+
raise ValueError(
|
|
114
|
+
f"Unsupported weights format: {self.merging_weights_load_path}"
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
module = LayerWiseMergedModel(
|
|
118
|
+
layer_wise_weight=layer_wise_weight,
|
|
119
|
+
pretrained_model=pretrained_model,
|
|
120
|
+
finetuned_models=finetuned_models,
|
|
121
|
+
clamp_weights=self.clamp_weights,
|
|
122
|
+
tie_weights=self.tie_weights,
|
|
123
|
+
strict=self.strict,
|
|
124
|
+
)
|
|
125
|
+
print(f"{layer_wise_weight.size()=}, {layer_wise_weight.numel()=}")
|
|
126
|
+
return module
|
|
127
|
+
|
|
128
|
+
    @rank_zero_only
    def save_merging_weights(self, file_path: str, merging_weights: torch.Tensor):
        """
        Save the merging weights to a file.

        Runs only on the global rank-zero process: the ``@rank_zero_only``
        decorator already guarantees this, so the explicit ``is_global_zero``
        check below is redundant but harmless.

        Args:
            file_path (str): The path to save the merging weights.
            merging_weights (torch.Tensor): The merging weights to save.
        """
        if self.fabric.is_global_zero and self.merging_weights_save_path is not None:
            if isinstance(file_path, str) and not file_path.startswith(("/", ".")):
                # if the file path is not absolute or relative to current working directory, save it in the log directory
                save_path = os.path.join(self.log_dir, file_path)
            else:
                save_path = file_path
            log.info(f"saving merging weights to {save_path}.")
            if os.path.dirname(save_path):
                os.makedirs(os.path.dirname(save_path), exist_ok=True)
            # Detach and move to CPU so the checkpoint is device-agnostic.
            torch.save(merging_weights.detach().cpu(), save_path)
|
|
147
|
+
|
|
148
|
+
def run(self, modelpool: Seq2SeqLMPool, **kwargs):
    """
    Execute the Layer-Wise AdaMerging algorithm.

    Builds the layer-wise merged model, then either loads pre-computed
    merging weights (skipping test-time adaptation) or runs test-time
    adaptation and optionally saves the learned weights.

    Args:
        modelpool (ModelPool): The model pool containing the pretrained and
            fine-tuned models.

    Returns:
        LayerWiseMergedModel: The merged model after (optional) test-time
            adaptation, unloaded into a plain model.
    """
    log.info("Fusing models using layer-wise adaptive merging.")
    self.modelpool = modelpool

    with self.profile("construct the wrapped model"):
        merged_module = self.construct_layer_wise_merged_model(modelpool)

    # pre-computed merging weights make test-time adaptation unnecessary
    if self.merging_weights_load_path is not None:
        return merged_module.merge_and_unload()

    with self.profile("test-time adaptation"):
        merged_module = self.test_time_adaptation(merged_module)
    if self.merging_weights_save_path is not None:
        self.save_merging_weights(
            self.merging_weights_save_path, merged_module.merge_weight
        )
    return merged_module.merge_and_unload()
|
|
177
|
+
|
|
178
|
+
def get_shuffled_test_loader_iter(self, task: str) -> "Iterator":
    """
    Return a cached infinite iterator over the shuffled test set of *task*.

    Loader of the test dataset for test-time adaptation; labels are not
    needed. The iterator is created once per task and reused across
    adaptation steps so each call continues from where the previous one
    stopped.

    Args:
        task (str): The name of the task.

    Returns:
        Iterator: An infinite iterator yielding shuffled test batches.
    """
    # Per-instance cache instead of `@functools.cache` on the method: caching
    # on the method keys on `self` and keeps the instance (and every cached
    # dataloader) alive for the lifetime of the process (ruff B019).
    if not hasattr(self, "_shuffled_test_loader_iters"):
        self._shuffled_test_loader_iters = {}
    if task not in self._shuffled_test_loader_iters:
        dataloader_kwargs = dict(self.dataloader_kwargs)
        dataloader_kwargs.update(dict(shuffle=True, collate_fn=default_data_collator))

        dataset = self.modelpool.load_test_dataset(task)
        loader = DataLoader(dataset, **dataloader_kwargs)

        if self.fabric is not None:
            loader = self.fabric.setup_dataloaders(loader)
        self._shuffled_test_loader_iters[task] = iter(InfiniteDataLoader(loader))
    return self._shuffled_test_loader_iters[task]
|
|
198
|
+
|
|
199
|
+
def compute_logits(
    self,
    module: "Union[T5ForConditionalGeneration, LayerWiseMergedModel]",
    batch,
    task: str,
) -> Tensor:
    """
    Compute the logits of the first decoded token for the given batch.

    Args:
        module: The (merged) seq2seq model to run.
        batch: A mapping providing "input_ids" and "attention_mask" tensors.
        task (str): The name of the task (unused here; kept for a uniform
            interface with overriding implementations).

    Returns:
        Tensor: Logits of the first decoder step, shape (batch_size, vocab_size).
    """
    input_ids: Tensor = batch["input_ids"]
    attention_mask: Tensor = batch["attention_mask"]

    # remove trailing all-padding columns to shorten the encoder input;
    # the size guard keeps at least one column so a fully-padded batch
    # cannot strip the sequence to width zero and crash on `[:, -1]`
    while attention_mask.size(1) > 1 and attention_mask[:, -1].eq(0).all():
        input_ids = input_ids[:, :-1]
        attention_mask = attention_mask[:, :-1]

    outputs = module(
        input_ids=input_ids,
        attention_mask=attention_mask,
        # a single constant decoder token; only the first decoding step is
        # needed for the entropy objective
        decoder_input_ids=torch.ones(
            input_ids.size(0), 1, dtype=torch.long, device=input_ids.device
        ),
    )
    logits = outputs.logits[:, 0, :]
    return logits
|
|
233
|
+
|
|
234
|
+
def on_test_time_adaptation_start(self):
    """
    Hook invoked right before test-time adaptation begins.

    The base implementation is a no-op; subclasses may override it to set up
    task-specific components such as classification heads.
    """
    pass
|
|
239
|
+
|
|
240
|
+
def test_time_adaptation(self, module: LayerWiseMergedModel):
    """
    Perform test-time adaptation on the merged model.

    This method adapts the merging weights during test-time to improve
    performance. The objective is the entropy of the model's first-step
    output distribution on unlabeled test batches; the merging weights are
    the only parameters passed to the optimizer.

    Args:
        module (LayerWiseMergedModel): The merged model.

    Returns:
        LayerWiseMergedModel: The adapted merged model.
    """
    # let subclasses set up task-specific components first
    self.on_test_time_adaptation_start()

    # configure optimizer; only the layer-wise merging weights are trained
    optimizer = instantiate(self._optimizer, [module.merge_weight])
    module, optimizer = self.fabric.setup(module, optimizer)

    module.train()
    # materialize an initial merged model before the first forward pass
    module.merge_weights()
    for step_idx in (
        pbar := tqdm(
            range(self.max_steps if not self.is_debug_mode else 1),
            ("[DEBUG MODE] " if self.is_debug_mode else "")
            + "AdaMerging Test-time adaptation",
            dynamic_ncols=True,
        )
    ):
        if self.variant == "mgda":
            # MGDA variant: the helper combines per-task gradients via a
            # min-norm solver and writes merge_weight.grad itself
            total_loss = self._compute_gradients_using_mgda(module)
        else:
            total_loss = 0
            for task in self.modelpool.model_names:
                with self.profile("data loading"):
                    batch = next(self.get_shuffled_test_loader_iter(task))
                with self.profile("forward pass"):
                    logits = self.compute_logits(module, batch, task)
                    logits = logits.mean(dim=0, keepdim=True)
                    loss = entropy_loss(logits)
                    total_loss += loss
                with self.profile("backward pass"):
                    # retain_graph: each task's loss shares the same merge
                    # graph, so it must survive successive backward calls
                    self.fabric.backward(loss, retain_graph=True)

        with self.profile("optimizer step"):
            optimizer.step()
            optimizer.zero_grad()
        with self.profile("merging weights"):
            # re-merge with the updated weights for the next iteration
            module.merge_weights()

        metrics = {
            "train/loss": total_loss.item(),
            "train/weight_max": module.merge_weight.max().item(),
            "train/weight_min": module.merge_weight.min().item(),
            "train/weight_mean": module.merge_weight.mean().item(),
        }
        self.fabric.log_dict(metrics, step=step_idx)
        pbar.set_postfix(metrics)

    log.info(get_memory_usage(f"after adamerging, the memory usage of GPU is:"))
    self.print_profile_summary()
    return module
|
|
301
|
+
|
|
302
|
+
def _compute_gradients_using_mgda(self, module: LayerWiseMergedModel):
    """
    Set MGDA-combined gradients on ``module.merge_weight``.

    Computes one entropy-loss gradient per task, solves the min-norm problem
    over those gradients, and writes the resulting convex combination into
    ``module.merge_weight.grad`` for the optimizer to apply.

    Returns:
        The summed (unweighted) entropy loss over all tasks.
    """
    per_task_grads = []
    total_loss = 0
    for task in self.modelpool.model_names:
        with self.profile("data loading"):
            batch = next(self.get_shuffled_test_loader_iter(task))
        with self.profile("forward pass"):
            logits = self.compute_logits(module, batch, task)
            logits = logits.mean(dim=0, keepdim=True)
            loss = entropy_loss(logits)
            total_loss += loss
        with self.profile("backward pass"):
            # gradient w.r.t. the merging weights only; keep the graph alive
            # so the remaining tasks can backpropagate through it as well
            (grad_wrt_weight,) = torch.autograd.grad(
                loss,
                [module.merge_weight],
                create_graph=False,
                retain_graph=True,
            )
            per_task_grads.append(grad_wrt_weight.flatten().detach())

    scaling, _min_norm = MinNormSolver.find_min_norm_element(per_task_grads)
    if not isinstance(scaling, torch.Tensor):
        scaling = torch.from_numpy(scaling)
    scaling = scaling.to(
        device=module.merge_weight.device,
        dtype=module.merge_weight.dtype,
    )
    # convex combination of the per-task gradients
    combined = torch.stack(per_task_grads) * scaling.view(-1, 1)
    module.merge_weight.grad = combined.sum(dim=0).view_as(module.merge_weight)
    return total_loss
|