fusion-bench 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/__init__.py +22 -2
- fusion_bench/_get_started/__init__.py +3 -0
- fusion_bench/_get_started/greeting_program.py +49 -0
- fusion_bench/compat/method/base_algorithm.py +14 -0
- fusion_bench/constants/__init__.py +6 -0
- fusion_bench/constants/clip_vision.py +26 -2
- fusion_bench/constants/paths.py +4 -0
- fusion_bench/constants/runtime.py +57 -0
- fusion_bench/dataset/clip_dataset.py +2 -1
- fusion_bench/dataset/gpt2_glue.py +9 -9
- fusion_bench/dataset/image_corruption/__init__.py +0 -0
- fusion_bench/dataset/image_corruption/make_corruption.py +179 -0
- fusion_bench/dataset/image_dataset.py +1 -1
- fusion_bench/dataset/nyuv2.py +2 -2
- fusion_bench/method/__init__.py +24 -5
- fusion_bench/method/adamerging/clip_layer_wise_adamerging.py +1 -1
- fusion_bench/method/adamerging/clip_task_wise_adamerging.py +11 -7
- fusion_bench/method/adamerging/layer_wise_adamerging.py +11 -5
- fusion_bench/method/base_algorithm.py +195 -12
- fusion_bench/method/bitdelta/__init__.py +5 -0
- fusion_bench/method/bitdelta/bitdelta.py +156 -0
- fusion_bench/method/bitdelta/bitdelta_utils/__init__.py +0 -0
- fusion_bench/method/bitdelta/bitdelta_utils/binary_gemm_kernel.py +462 -0
- fusion_bench/method/bitdelta/bitdelta_utils/data.py +35 -0
- fusion_bench/method/bitdelta/bitdelta_utils/diff.py +129 -0
- fusion_bench/method/classification/clip_finetune.py +1 -1
- fusion_bench/method/concrete_subspace/clip_concrete_adamerging.py +0 -1
- fusion_bench/method/depth_upscaling/depth_upscaling.py +4 -9
- fusion_bench/method/doge_ta/clip_layer_wise_adamerging.py +4 -5
- fusion_bench/method/doge_ta/doge_ta.py +1 -1
- fusion_bench/method/ensemble.py +12 -12
- fusion_bench/method/expert_sparsity/utils/calibration_data.py +1 -1
- fusion_bench/method/fisher_merging/clip_fisher_merging.py +2 -6
- fusion_bench/method/fisher_merging/fisher_merging.py +6 -15
- fusion_bench/method/fisher_merging/gpt2_fisher_merging.py +3 -10
- fusion_bench/method/fw_merging/fw_hard.py +1 -1
- fusion_bench/method/fw_merging/fw_soft.py +1 -1
- fusion_bench/method/gossip/clip_layer_wise_gossip.py +4 -5
- fusion_bench/method/linear/expo.py +2 -1
- fusion_bench/method/linear/linear_interpolation.py +6 -4
- fusion_bench/method/linear/simple_average_for_llama.py +17 -13
- fusion_bench/method/lm_finetune/bradley_terry_rm.py +2 -2
- fusion_bench/method/mixture_of_experts/mixtral_upcycling.py +9 -26
- fusion_bench/method/model_recombination.py +2 -5
- fusion_bench/method/moe_pruner/hooks/__init__.py +1 -2
- fusion_bench/method/moe_pruner/utils/data.py +2 -1
- fusion_bench/method/moe_pruner/utils/prune.py +6 -1
- fusion_bench/method/pruning/llama_magnitude_prune.py +1 -1
- fusion_bench/method/pruning/wanda_utils/data.py +1 -2
- fusion_bench/method/pwe_moe/clip_pwe_moe.py +12 -34
- fusion_bench/method/randes/modelsoup.py +1 -3
- fusion_bench/method/regmean/clip_regmean.py +2 -2
- fusion_bench/method/regmean/gpt2_regmean.py +3 -10
- fusion_bench/method/regmean/regmean.py +2 -11
- fusion_bench/method/regmean_plusplus/__init__.py +1 -1
- fusion_bench/method/regmean_plusplus/clip_regmean_plusplus.py +24 -17
- fusion_bench/method/regmean_plusplus/regmean_plusplus.py +56 -38
- fusion_bench/method/simple_average.py +12 -16
- fusion_bench/method/slerp/slerp.py +5 -2
- fusion_bench/method/smile_upscaling/causal_lm_upscaling.py +371 -0
- fusion_bench/method/smile_upscaling/error_accumulation.py +177 -0
- fusion_bench/method/smile_upscaling/projected_energy.py +144 -0
- fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py +5 -1
- fusion_bench/method/smile_upscaling/smile_qwen2_upscaling.py +71 -51
- fusion_bench/method/smile_upscaling/smile_upscaling.py +12 -5
- fusion_bench/method/tall_mask/task_arithmetic.py +3 -11
- fusion_bench/method/task_arithmetic/task_arithmetic.py +6 -10
- fusion_bench/method/ties_merging/ties_merging.py +13 -26
- fusion_bench/method/we_moe/__init__.py +1 -0
- fusion_bench/method/we_moe/clip_we_moe.py +5 -4
- fusion_bench/method/we_moe/entropy_loss.py +25 -0
- fusion_bench/method/we_moe/flan_t5_we_moe.py +331 -0
- fusion_bench/method/we_moe/utils.py +15 -0
- fusion_bench/method/we_moe/we_moe.py +6 -6
- fusion_bench/method/weighted_average/llama.py +4 -16
- fusion_bench/metrics/continual_learning/__init__.py +1 -0
- fusion_bench/metrics/continual_learning/backward_transfer.py +1 -1
- fusion_bench/metrics/nyuv2/__init__.py +2 -2
- fusion_bench/metrics/nyuv2/segmentation.py +1 -1
- fusion_bench/mixins/__init__.py +10 -2
- fusion_bench/mixins/clip_classification.py +15 -45
- fusion_bench/mixins/hydra_config.py +105 -7
- fusion_bench/mixins/lightning_fabric.py +2 -0
- fusion_bench/mixins/serialization.py +275 -48
- fusion_bench/modelpool/__init__.py +2 -2
- fusion_bench/modelpool/base_pool.py +29 -9
- fusion_bench/modelpool/causal_lm/causal_lm.py +41 -33
- fusion_bench/modelpool/clip_vision/modelpool.py +1 -3
- fusion_bench/modelpool/seq_classification_lm/__init__.py +1 -1
- fusion_bench/modelpool/seq_classification_lm/seq_classification_lm.py +1 -1
- fusion_bench/models/__init__.py +7 -1
- fusion_bench/models/expert_sparsity/mixtral/__init__.py +1 -1
- fusion_bench/models/hf_utils.py +160 -0
- fusion_bench/models/linearized/linearized_model_utils.py +4 -4
- fusion_bench/models/linearized/vision_model.py +1 -1
- fusion_bench/models/model_card_templates/default.md +46 -0
- fusion_bench/models/modeling_deepseek_v2/__init__.py +1 -1
- fusion_bench/models/modeling_deepseek_v2/modeling_deepseek.py +4 -4
- fusion_bench/models/modeling_deepseek_v2/tokenization_deepseek_fast.py +0 -1
- fusion_bench/models/modeling_smile_gemma2/__init__.py +9 -0
- fusion_bench/models/modeling_smile_gemma2/configuration_smile_gemma2.py +20 -0
- fusion_bench/models/modeling_smile_gemma2/modeling_smile_gemma2.py +986 -0
- fusion_bench/models/modeling_smile_gemma2/register.py +26 -0
- fusion_bench/models/modeling_smile_llama/__init__.py +7 -0
- fusion_bench/models/modeling_smile_llama/configuration_smile_llama.py +20 -0
- fusion_bench/models/modeling_smile_llama/modeling_smile_llama.py +698 -0
- fusion_bench/models/modeling_smile_llama/register.py +8 -0
- fusion_bench/models/modeling_smile_mistral/__init__.py +5 -47
- fusion_bench/models/modeling_smile_qwen2/__init__.py +1 -1
- fusion_bench/models/modeling_smile_qwen2/modeling_smile_qwen2.py +7 -12
- fusion_bench/models/modeling_smile_qwen2/register.py +1 -4
- fusion_bench/models/parameter_dict.py +1 -1
- fusion_bench/models/sparse_we_moe.py +1 -53
- fusion_bench/models/utils.py +26 -0
- fusion_bench/models/we_moe.py +1 -53
- fusion_bench/models/wrappers/ensemble.py +6 -4
- fusion_bench/models/wrappers/layer_wise_fusion.py +1 -1
- fusion_bench/models/wrappers/task_wise_fusion.py +250 -72
- fusion_bench/programs/base_program.py +81 -2
- fusion_bench/programs/fabric_fusion_program.py +46 -61
- fusion_bench/scripts/cli.py +38 -5
- fusion_bench/taskpool/base_pool.py +4 -3
- fusion_bench/taskpool/clip_vision/taskpool.py +43 -22
- fusion_bench/taskpool/dummy.py +1 -1
- fusion_bench/taskpool/lm_eval_harness/taskpool.py +1 -2
- fusion_bench/tasks/clip_classification/__init__.py +6 -4
- fusion_bench/utils/__init__.py +7 -1
- fusion_bench/utils/cache_utils.py +101 -1
- fusion_bench/utils/devices.py +14 -4
- fusion_bench/utils/fabric.py +2 -2
- fusion_bench/utils/instantiate_utils.py +3 -1
- fusion_bench/utils/lazy_imports.py +23 -0
- fusion_bench/utils/lazy_state_dict.py +38 -3
- fusion_bench/utils/modelscope.py +127 -8
- fusion_bench/utils/parameters.py +2 -2
- fusion_bench/utils/path.py +56 -0
- fusion_bench/utils/pylogger.py +1 -1
- fusion_bench/utils/rich_utils.py +3 -0
- fusion_bench/utils/state_dict_arithmetic.py +25 -23
- {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/METADATA +24 -47
- {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/RECORD +184 -145
- fusion_bench_config/_get_started/clip_evaluate_single_model.yaml +21 -0
- fusion_bench_config/_get_started/clip_simple_average.yaml +23 -0
- fusion_bench_config/_get_started/clip_task_arithmetic.yaml +24 -0
- fusion_bench_config/_get_started/greeting_program.yaml +4 -0
- fusion_bench_config/fabric/loggers/csv_logger.yaml +3 -3
- fusion_bench_config/fabric/loggers/tensorboard_logger.yaml +3 -3
- fusion_bench_config/fabric_model_fusion.yaml +45 -17
- fusion_bench_config/hydra/default.yaml +6 -2
- fusion_bench_config/llama_full_finetune.yaml +1 -0
- fusion_bench_config/method/adamerging/clip.yaml +1 -1
- fusion_bench_config/method/bitdelta/bitdelta.yaml +12 -0
- fusion_bench_config/method/depth_upscaling.yaml +4 -1
- fusion_bench_config/method/fisher_merging/clip_fisher_merging.yaml +0 -1
- fusion_bench_config/method/linear/simple_average_for_llama.yaml +3 -2
- fusion_bench_config/method/smile_upscaling/causal_lm_upscaling.yaml +21 -0
- fusion_bench_config/method/smile_upscaling/error_accumulation.yaml +5 -0
- fusion_bench_config/method/smile_upscaling/projected_energy.yaml +2 -0
- fusion_bench_config/method/smile_upscaling/smile_qwen2_upscaling.yaml +2 -1
- fusion_bench_config/method/wemoe/flan_t5_weight_ensembling_moe.yaml +20 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/_template.yaml +1 -4
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_individual.yaml +4 -9
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_finetuned.yaml +1 -1
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_svhn_and_mnist.yaml +0 -6
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8.yaml +1 -1
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8_model_only.yaml +1 -1
- fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-1.5B_math_and_coder.yaml +3 -3
- fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-7B-math_and_coder.yaml +9 -0
- fusion_bench_config/modelpool/CausalLMPool/mistral-7b.yaml +6 -0
- fusion_bench_config/modelpool/CausalLMPool/mixtral_moe_merging.yaml +10 -0
- fusion_bench_config/modelpool/CausalLMPool/qwen2_math_1.5B_and_R1.yaml +4 -12
- fusion_bench_config/modelpool/CausalLMPool/simle_mixtral_exp_v4.yaml +6 -16
- fusion_bench_config/modelpool/CausalLMPool/vicuna-7b-v1.5.yaml +8 -0
- fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/llama_preference700k.yaml +1 -1
- fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/single_reward_model.yaml +1 -1
- fusion_bench_config/nyuv2_config.yaml +3 -1
- fusion_bench_config/nyuv2_mtl_train.yaml +1 -0
- fusion_bench_config/path/default.yaml +28 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-base-patch32_svhn_and_mnist.yaml +24 -0
- fusion_bench_config/method/adamerging.yaml +0 -23
- fusion_bench_config/modelpool/mixtral_moe_merging.yaml +0 -14
- fusion_bench_config/modelpool/mixtral_moe_upscaling.yaml +0 -6
- fusion_bench_config/taskpool/clip-vit-base-patch32_svhn_and_mnist.yaml +0 -22
- {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/WHEEL +0 -0
- {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/licenses/LICENSE +0 -0
- {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/top_level.txt +0 -0
- /fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/roberta-base_glue.yaml +0 -0
fusion_bench/method/we_moe/flan_t5_we_moe.py
ADDED
@@ -0,0 +1,331 @@
+import functools
+import logging
+import os
+from copy import deepcopy
+from typing import Any, Dict, List, Mapping, Optional, Union, cast  # noqa: F401
+
+import lightning
+import lightning as L
+import lightning.fabric.wrappers
+import torch
+from torch import Tensor
+from torch.utils.data import DataLoader
+from tqdm.autonotebook import tqdm
+from transformers import T5ForConditionalGeneration
+from transformers.data import default_data_collator
+
+from fusion_bench.method import BaseAlgorithm
+from fusion_bench.method.task_arithmetic.task_arithmetic import task_arithmetic_merge
+from fusion_bench.mixins.lightning_fabric import LightningFabricMixin
+from fusion_bench.mixins.simple_profiler import SimpleProfilerMixin
+from fusion_bench.modelpool import Seq2SeqLMPool
+from fusion_bench.models.we_moe import WeightEnsemblingMoE
+from fusion_bench.utils import timeit_context
+from fusion_bench.utils.data import InfiniteDataLoader, load_tensor_from_file
+from fusion_bench.utils.instantiate_utils import instantiate
+from fusion_bench.utils.parameters import print_parameters
+
+from .entropy_loss import entropy_loss
+from .utils import get_memory_usage
+
+log = logging.getLogger(__name__)
+
+
+class FlanT5WeightEnsemblingMoEAlgorithm(
+    BaseAlgorithm,
+    LightningFabricMixin,
+    SimpleProfilerMixin,
+):
+    """
+    FlanT5WeightEnsemblingMoEAlgorithm is a class that implements the WeightEnsemblingMoEAlgorithm
+    for FlanT5 models. It extends the WeightEnsemblingMoEAlgorithm and CLIPClassificationMixin classes.
+
+    Attributes:
+        modelpool (Seq2SeqLMPool): The model pool containing the FlanT5 models.
+    """
+
+    modelpool: Seq2SeqLMPool = None
+
+    def __init__(
+        self,
+        checkpoint: bool = False,
+        save_checkpoint: bool = False,
+        router_hidden_layers: int = 2,
+        init_lambda: float = 0.3,
+        batch_reduce: bool = True,
+        lr: float = 1e-4,
+        optimizer: str = "adam",
+        devices: int = 1,
+        batch_size: int = 16,
+        num_workers: int = 0,
+        max_steps: int = 1000,
+        use_grad_accumulate: bool = True,
+        cache_dir: bool = "outputs",
+        fast_dev_run: bool = False,
+        **kwargs,
+    ):
+        """
+        Initialize the WeightEnsemblingMoEAlgorithm with the given configuration.
+
+        Args:
+            algorithm_config (DictConfig): The configuration for the algorithm.
+        """
+        self.checkpoint = checkpoint
+        self.save_checkpoint = save_checkpoint
+        self.router_hidden_layers = router_hidden_layers
+        self.init_lambda = init_lambda
+        self.batch_reduce = batch_reduce
+        self.lr = lr
+        self.optimizer = optimizer
+        self.devices = devices
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+        self.max_steps = max_steps
+        self.use_grad_accumulate = use_grad_accumulate
+        self.cache_dir = cache_dir
+        self.fast_dev_run = fast_dev_run
+        super().__init__(**kwargs)
+
+    def construct_moe_model(self) -> WeightEnsemblingMoE:
+        """
+        Construct the Mixture of Experts (MoE) model using the models in the model pool.
+
+        Returns:
+            WeightEnsemblingMoE: The constructed MoE model.
+        """
+        base_model = self.modelpool.load_model("_pretrained_")
+        expert_models = [
+            self.modelpool.load_model(name) for name in self.modelpool.model_names
+        ]
+
+        # Merge the models using task arithmetic
+        moe_model = task_arithmetic_merge(
+            # This function modifies the model in place, so we need to pass a deepcopy
+            deepcopy(base_model),
+            expert_models,
+            scaling_factor=self.init_lambda,
+        ).requires_grad_(False)
+
+        print(base_model)
+
+        # Up-scale MLP modules
+        num_layer = 12
+        encoder_mlp_index = 1
+        base_encoder = base_model.encoder
+        moe_encoder = moe_model.encoder
+        expert_encoders = [m.encoder for m in expert_models]
+
+        for layer_idx in range(num_layer):
+            base_mlp = (
+                base_encoder.block[layer_idx].layer[encoder_mlp_index].DenseReluDense
+            )
+            expert_mlps = [
+                e.block[layer_idx].layer[encoder_mlp_index].DenseReluDense
+                for e in expert_encoders
+            ]
+
+            moe_encoder.block[layer_idx].layer[encoder_mlp_index].DenseReluDense = (
+                WeightEnsemblingMoE(
+                    hidden_size=base_encoder.config.hidden_size,
+                    base_model=base_mlp,
+                    expert_models=expert_mlps,
+                    init_lambda=self.init_lambda,
+                    batch_first=True,
+                    router_hidden_layers=self.router_hidden_layers,
+                    batch_reduce=self.batch_reduce,
+                )
+            )
+
+        decoder_mlp_index = 2
+        base_decoder = base_model.decoder
+        moe_decoder = moe_model.decoder
+        expert_decoders = [m.decoder for m in expert_models]
+
+        for layer_idx in range(num_layer):
+            base_mlp = (
+                base_decoder.block[layer_idx].layer[decoder_mlp_index].DenseReluDense
+            )
+            expert_mlps = [
+                e.block[layer_idx].layer[decoder_mlp_index].DenseReluDense
+                for e in expert_decoders
+            ]
+
+            moe_decoder.block[layer_idx].layer[decoder_mlp_index].DenseReluDense = (
+                WeightEnsemblingMoE(
+                    hidden_size=base_decoder.config.hidden_size,
+                    base_model=base_mlp,
+                    expert_models=expert_mlps,
+                    init_lambda=self.init_lambda,
+                    batch_first=True,
+                    router_hidden_layers=self.router_hidden_layers,
+                    batch_reduce=self.batch_reduce,
+                )
+            )
+
+        print(moe_model)
+        return moe_model
+
+    @functools.cache
+    def get_shuffled_test_loader_iter(self, task: str) -> DataLoader:
+        """
+        Loader of test dataset for test-time adaptation. labels are not needed.
+
+        Args:
+            task (str): The name of the task.
+
+        Returns:
+            DataLoader: The data loader for the test dataset.
+        """
+        # dataloader_kwargs = dict(self.dataloader_kwargs)
+        # dataloader_kwargs.update(dict(shuffle=True, collate_fn=default_data_collator))
+
+        dataset = self.modelpool.load_test_dataset(task)
+        log.info("get_shuffled_test_loader_iter")
+        loader = DataLoader(
+            dataset,
+            batch_size=self.batch_size,
+            shuffle=True,
+            num_workers=self.num_workers,
+            collate_fn=default_data_collator,
+        )
+        # loader = DataLoader(dataset, **dataloader_kwargs)
+        if self.fabric is not None:
+            loader = self.fabric.setup_dataloaders(loader)
+        return iter(InfiniteDataLoader(loader))
+
+    def compute_logits(
+        self,
+        module: Union[T5ForConditionalGeneration],
+        batch,
+        task: str,
+    ) -> Tensor:
+        """
+        Compute the logits for the given images and task.
+
+        Args:
+            module: The model module.
+            images (Tensor): The input images.
+            task (str): The name of the task.
+
+        Returns:
+            Tensor: The computed logits.
+        """
+        input_ids: Tensor = batch["input_ids"]
+        attention_mask: Tensor = batch["attention_mask"]
+
+        # remove padding tokens from the input
+        while attention_mask[:, -1].eq(0).all():
+            input_ids = input_ids[:, :-1]
+            attention_mask = attention_mask[:, :-1]
+
+        outputs = module(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=torch.ones(
+                input_ids.size(0), 1, dtype=torch.long, device=input_ids.device
+            ),
+        )
+        logits = outputs.logits[:, 0, :]
+        return logits
+
+    def test_time_adaptation(self, module):
+        """
+        Perform test-time adaptation for the given module.
+
+        Args:
+            module (WeightEnsemblingMoE): The MoE module to adapt.
+
+        Returns:
+            WeightEnsemblingMoE: The adapted MoE module.
+        """
+        self.on_test_time_adaptation_start()
+
+        # configure optimizer
+        if self.optimizer == "adam":
+            print([name for name, p in module.named_parameters() if p.requires_grad])
+            optimizer = torch.optim.Adam(
+                [p for p in module.parameters() if p.requires_grad], lr=self.lr
+            )
+        else:
+            raise ValueError(f"Unsupported optimizer: {self.optimizer}")
+
+        module, optimizer = self.fabric.setup(module, optimizer)
+
+        module.train()
+        # module.merge_weights()
+        for step_idx in (
+            pbar := tqdm(
+                range(self.max_steps if not self.is_debug_mode else 1),
+                ("[DEBUG MODE] " if self.is_debug_mode else "")
+                + "WEMoE Test-time adaptation",
+                dynamic_ncols=True,
+            )
+        ):
+            total_loss = 0
+            for task in self.modelpool.model_names:
+                with self.profile("data loading"):
+                    batch = next(self.get_shuffled_test_loader_iter(task))
+                with self.profile("forward pass"):
+                    logits = self.compute_logits(module, batch, task)
+                    logits = logits.mean(dim=0, keepdim=True)
+                    loss = entropy_loss(logits)
+                    total_loss += loss
+                with self.profile("backward pass"):
+                    self.fabric.backward(loss, retain_graph=True)
+
+            with self.profile("optimizer step"):
+                optimizer.step()
+                optimizer.zero_grad()
+
+            metrics = {
+                "train/loss": total_loss.item(),
+            }
+            self.fabric.log_dict(metrics, step=step_idx)
+            pbar.set_postfix(metrics)
+
+        log.info(get_memory_usage(f"after adamerging, the memory usage of GPU is:"))
+        self.print_profile_summary()
+        return module
+
+    def on_test_time_adaptation_start(self):
+        """
+        Something to do before the test-time adaptation starts. Such as setting up the task-specific heads.
+        """
+        pass
+
+    def run(self, modelpool: Seq2SeqLMPool, **kwargs):
+        """
+        Run the WeightEnsemblingMoEAlgorithm to fuse models using Weight Ensembling Mixture of Experts.
+
+        Args:
+            modelpool (ModelPool): The pool of models to be fused.
+
+        Returns:
+            WeightEnsemblingMoE: The fused MoE model.
+        """
+        log.info("Fusing models using layer-wise adaptive merging.")
+        self.modelpool = modelpool
+
+        with timeit_context("upscaling models to a weight-ensembling MoE model"):
+            moe_model = self.construct_moe_model()
+            print_parameters(moe_model)
+
+        if self.checkpoint != False:
+            log.info(
+                f"load checkpoint from {self.checkpoint}, test-time adaptation will be skipped."
+            )
+            self.load_checkpoint(moe_model, self.checkpoint)
+        else:
+            with self.profile("test-time adaptation"):
+                moe_model = self.test_time_adaptation(moe_model)
+            if self.save_checkpoint != False:
+                log.info(f"save checkpoint to {self.save_checkpoint}")
+                self.save_checkpoint(moe_model, self.save_checkpoint)
+
+        if lightning.fabric.wrappers.is_wrapped(moe_model):
+            moe_model = lightning.fabric.wrappers._unwrap_objects(moe_model)
+
+        # enable sample-wise adaptation
+        moe_model.batch_reduce = False
+        self.print_profile_summary()
+        return moe_model
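For orientation, below is a minimal usage sketch of the new algorithm, not code from the package: modelpool_config is a hypothetical Hydra config node, and in practice both objects are built by the fusion_bench CLI (see the fusion_bench_config/method/wemoe/flan_t5_weight_ensembling_moe.yaml config added in this release).

from fusion_bench.method.we_moe.flan_t5_we_moe import FlanT5WeightEnsemblingMoEAlgorithm
from fusion_bench.utils.instantiate_utils import instantiate

# Hypothetical: a Seq2SeqLMPool config with a "_pretrained_" Flan-T5 base model
# and one fine-tuned expert per task.
modelpool = instantiate(modelpool_config)

algorithm = FlanT5WeightEnsemblingMoEAlgorithm(
    init_lambda=0.3,         # scaling factor for the task-arithmetic base merge
    router_hidden_layers=2,  # depth of each WeightEnsemblingMoE router
    lr=1e-4,                 # Adam learning rate for test-time adaptation
    max_steps=1000,          # number of entropy-minimization steps
)
moe_model = algorithm.run(modelpool)  # upscale MLPs, adapt routers, return the MoE model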
fusion_bench/method/we_moe/utils.py
ADDED
@@ -0,0 +1,15 @@
+import torch
+
+
+def get_memory_usage(desc):
+    """
+    Obtain the current GPU memory usage.
+
+    Returns:
+        str: A string containing the allocated and cached memory in MB.
+    """
+    allocated = torch.cuda.memory_allocated() / 1024**2  # convert to MB
+    cached = torch.cuda.memory_reserved() / 1024**2  # convert to MB
+    return (
+        f"{desc}\nAllocated Memory: {allocated:.2f} MB\nCached Memory: {cached:.2f} MB"
+    )
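This helper is what flan_t5_we_moe.py logs after adaptation; with made-up numbers, a call renders as:

print(get_memory_usage("after merging"))
# after merging
# Allocated Memory: 1234.56 MB
# Cached Memory: 2048.00 MB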
fusion_bench/method/we_moe/we_moe.py
CHANGED
@@ -1,6 +1,6 @@
 import logging
 from abc import abstractmethod
-from typing import cast  # noqa: F401
+from typing import Any, cast  # noqa: F401
 
 import lightning as L
 import lightning.fabric.wrappers
@@ -70,7 +70,7 @@ class WeightEnsemblingMoEAlgorithm(
         assert "No CUDA device available."
 
     @abstractmethod
-    def load_checkpoint(self, model, checkpoint):
+    def load_checkpoint(self, model: Any, checkpoint: Any):
         """
         Load the checkpoint file.
 
@@ -81,7 +81,7 @@ class WeightEnsemblingMoEAlgorithm(
        pass
 
    @abstractmethod
-    def save_checkpoint(self, model, checkpoint):
+    def save_checkpoint(self, model: Any, checkpoint: Any):
         """
         Save the checkpoint file.
 
@@ -121,7 +121,7 @@ class WeightEnsemblingMoEAlgorithm(
        pass
 
    @abstractmethod
-    def compute_logits(self, module, batch, task) -> Tensor:
+    def compute_logits(self, module: Any, batch: Any, task: Any) -> Tensor:
         """
         Compute the logits for a given batch and task.
 
@@ -135,7 +135,7 @@ class WeightEnsemblingMoEAlgorithm(
         """
        pass
 
-    def test_time_adaptation(self, module: WeightEnsemblingMoE):
+    def test_time_adaptation(self, module: WeightEnsemblingMoE) -> WeightEnsemblingMoE:
         """
         Perform test-time adaptation for the given module.
 
@@ -208,7 +208,7 @@ class WeightEnsemblingMoEAlgorithm(
 
         return module
 
-    def run(self, modelpool: ModelPool):
+    def run(self, modelpool: ModelPool) -> WeightEnsemblingMoE:
         """
         Run the WeightEnsemblingMoEAlgorithm to fuse models using Weight Ensembling Mixture of Experts.
 
fusion_bench/method/weighted_average/llama.py
CHANGED
@@ -3,9 +3,11 @@ from typing import List, Mapping, Union  # noqa: F401
 
 import numpy as np
 import torch
+from transformers import PreTrainedModel
 from typing_extensions import override
 
 from fusion_bench.method import BaseAlgorithm
+from fusion_bench.mixins import auto_register_config
 from fusion_bench.modelpool import CausalLMPool
 from fusion_bench.utils import timeit_context
 from fusion_bench.utils.state_dict_arithmetic import state_dict_add, state_dict_mul
@@ -14,20 +16,12 @@ from fusion_bench.utils.type import StateDictType
 log = logging.getLogger(__name__)
 
 
+@auto_register_config
 class WeightedAverageForLLama(BaseAlgorithm):
     """
     A class to perform weighted averaging of LlaMa/Mistral models.
     """
 
-    _config_mapping = BaseAlgorithm._config_mapping | {
-        "normalize": "normalize",
-        "weights": "weights",
-        "backbone_only": "backbone_only",
-        "merged_model_save_path": "merged_model_save_path",
-        "save_tokenizer": "save_tokenizer",
-        "push_to_hub": "push_to_hub",
-    }
-
     def __init__(
         self,
         normalize: bool,
@@ -49,17 +43,11 @@ class WeightedAverageForLLama(BaseAlgorithm):
             save_tokenizer (bool): Whether to save the tokenizer.
             push_to_hub (bool): Whether to push the model to the hub.
         """
-        self.normalize = normalize
-        self.weights = weights
-        self.backbone_only = backbone_only
-        self.merged_model_save_path = merged_model_save_path
-        self.save_tokenizer = save_tokenizer
-        self.push_to_hub = push_to_hub
         super().__init__(**kwargs)
 
     @override
     @torch.no_grad()
-    def run(self, modelpool: CausalLMPool):
+    def run(self, modelpool: CausalLMPool) -> PreTrainedModel:
         """
         Executes the weighted averaging of models in the provided model pool.
 
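The @auto_register_config decorator absorbs both removed blocks here: the hand-written _config_mapping and the self.x = x assignments. Its actual implementation lives in fusion_bench/mixins/serialization.py (also changed in this release); the following is only a sketch of the idea, assuming it works from the __init__ signature:

import inspect
from functools import wraps

def auto_register_config_sketch(cls):
    """Illustrative approximation (not the real fusion_bench code): derive the
    config mapping from the __init__ signature and assign each argument onto
    self, replacing the boilerplate removed above."""
    init = cls.__init__
    sig = inspect.signature(init)
    names = [
        p.name
        for p in sig.parameters.values()
        if p.name != "self" and p.kind not in (p.VAR_POSITIONAL, p.VAR_KEYWORD)
    ]
    cls._config_mapping = getattr(cls, "_config_mapping", {}) | {n: n for n in names}

    @wraps(init)
    def wrapped_init(self, *args, **kwargs):
        bound = sig.bind(self, *args, **kwargs)
        bound.apply_defaults()
        for n in names:
            setattr(self, n, bound.arguments[n])  # what `self.x = x` used to do
        init(self, *args, **kwargs)

    cls.__init__ = wrapped_init
    return cls

Deriving the mapping from the signature keeps the YAML-serializable config in lockstep with the constructor, which the removed boilerplate had to maintain by hand.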
fusion_bench/metrics/continual_learning/__init__.py
ADDED
@@ -0,0 +1 @@
+from .backward_transfer import compute_backward_transfer
fusion_bench/metrics/continual_learning/backward_transfer.py
CHANGED
@@ -10,7 +10,7 @@ def compute_backward_transfer(
     Compute the backward transfer (BWT) of a model on a set of tasks.
 
     Equation:
-        BWT = \frac{1}{n} \sum_{k=1}^{n} (acc_{
+        $BWT = \frac{1}{n} \sum_{k=1}^{n} (acc_{T,i}[k] - acc_{i,i}[k])$
 
     Returns:
         float: The backward transfer of the model.
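To make the docstring's equation concrete, here is a worked example under the usual reading that acc_{T,i}[k] is the accuracy on task k after the final task T and acc_{i,i}[k] is the accuracy on task k right after it was learned; the numbers are made up:

# Hypothetical accuracies over n = 3 tasks.
acc_final = [0.80, 0.75, 0.90]         # accuracy on each task after the last task is learned
acc_when_learned = [0.85, 0.80, 0.90]  # accuracy on each task right after learning it

n = len(acc_final)
bwt = sum(final - learned for final, learned in zip(acc_final, acc_when_learned)) / n
print(f"{bwt:.4f}")  # -0.0333: negative BWT indicates forgetting of earlier tasks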
fusion_bench/metrics/nyuv2/__init__.py
CHANGED
@@ -1,10 +1,10 @@
 from .depth import DepthMetric
 from .noise import NoiseMetric
 from .normal import NormalMetric
-from .segmentation import
+from .segmentation import SegmentationMetric
 
 metric_classes = {
-    "segmentation":
+    "segmentation": SegmentationMetric,
     "depth": DepthMetric,
     "normal": NormalMetric,
     "noise": NoiseMetric,
fusion_bench/mixins/__init__.py
CHANGED
@@ -11,7 +11,11 @@ _import_structure = {
     "hydra_config": ["HydraConfigMixin"],
     "lightning_fabric": ["LightningFabricMixin"],
     "openclip_classification": ["OpenCLIPClassificationMixin"],
-    "serialization": [
+    "serialization": [
+        "BaseYAMLSerializable",
+        "YAMLSerializationMixin",
+        "auto_register_config",
+    ],
     "simple_profiler": ["SimpleProfilerMixin"],
 }
 
@@ -21,7 +25,11 @@ if TYPE_CHECKING:
     from .hydra_config import HydraConfigMixin
     from .lightning_fabric import LightningFabricMixin
     from .openclip_classification import OpenCLIPClassificationMixin
-    from .serialization import
+    from .serialization import (
+        BaseYAMLSerializable,
+        YAMLSerializationMixin,
+        auto_register_config,
+    )
     from .simple_profiler import SimpleProfilerMixin
 else:
     sys.modules[__name__] = LazyImporter(
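These names are resolved through the _import_structure / LazyImporter pattern visible in the context lines, so importing fusion_bench.mixins stays cheap. A minimal sketch of how such a lazy importer can work (fusion_bench's own LazyImporter, in fusion_bench/utils/lazy_imports.py per the file list above, may differ):

import importlib
from types import ModuleType

class LazySketch(ModuleType):
    """Illustrative only: resolve attributes to submodule members on first access."""

    def __init__(self, name: str, import_structure: dict):
        super().__init__(name)
        # invert {"submodule": ["Attr", ...]} into {"Attr": "submodule"}
        self._attr_to_submodule = {
            attr: submodule
            for submodule, attrs in import_structure.items()
            for attr in attrs
        }

    def __getattr__(self, attr: str):
        if attr not in self._attr_to_submodule:
            raise AttributeError(f"module {self.__name__!r} has no attribute {attr!r}")
        submodule = importlib.import_module(
            "." + self._attr_to_submodule[attr], self.__name__
        )
        value = getattr(submodule, attr)
        setattr(self, attr, value)  # cache so __getattr__ is not hit again
        return value

# Used as in the diff's else-branch:
#     sys.modules[__name__] = LazySketch(__name__, _import_structure)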
fusion_bench/mixins/clip_classification.py
CHANGED
@@ -6,6 +6,7 @@ from typing import (  # noqa: F401
     TYPE_CHECKING,
     Any,
     Dict,
+    Iterator,
     List,
     Optional,
     Tuple,
@@ -21,6 +22,7 @@ from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
 
+from fusion_bench import cache_with_joblib
 from fusion_bench.dataset.clip_dataset import CLIPDataset
 from fusion_bench.mixins import LightningFabricMixin
 from fusion_bench.modelpool import CLIPVisionModelPool
@@ -45,15 +47,13 @@ class CLIPClassificationMixin(LightningFabricMixin):
 
     - `_dataloader_kwargs` (Dict[str, Any]): Keyword arguments for the dataloader.
     - `modelpool` (CLIPVisionModelPool): The model pool containing the CLIP models.
-    - `zeroshot_weights_cache_dir` (Optional[str]): The directory to cache the zero-shot weights.
     """
 
-
+    dataloader_kwargs: Dict[str, Any] = {}
     # the modelpool is set by inheriting class
     modelpool: CLIPVisionModelPool = None
     _clip_processor: CLIPProcessor = None
     # a dict of zeroshot weights for each task, each key is the task name
-    zeroshot_weights_cache_dir: str = "outputs/cache/clip_zeroshot_weights"
     zeroshot_weights: Dict[str, torch.Tensor] = {}
     whether_setup_zero_shot_classification_head = False
 
@@ -71,7 +71,7 @@ class CLIPClassificationMixin(LightningFabricMixin):
         batch_size: Optional[int] = None,
         num_workers: Optional[int] = None,
         **loader_kwargs,
-    ):
+    ) -> Iterator:
         """
         Get an iterator for a shuffled test DataLoader.
 
@@ -89,7 +89,7 @@ class CLIPClassificationMixin(LightningFabricMixin):
             Iterator: An iterator over the shuffled test DataLoader.
         """
         # get dataloader kwargs
-        dataloader_kwargs = self.
+        dataloader_kwargs = self.dataloader_kwargs.copy()
         dataloader_kwargs["shuffle"] = True
         if batch_size is not None:
             dataloader_kwargs["batch_size"] = batch_size
@@ -130,26 +130,16 @@ class CLIPClassificationMixin(LightningFabricMixin):
         self.visual_projection = self.fabric.to_device(self.visual_projection)
         self.logit_scale_exp = self.fabric.to_device(self.logit_scale_exp)
 
-
-
-
-
-
-
-
-
-
-        cache_dir = os.path.join(
-            self.zeroshot_weights_cache_dir,
-            os.path.normpath(model_name.split("/")[-1]),
-        )
-        if not os.path.exists(cache_dir):
-            log.info(
-                f"Creating cache directory for zero-shot classification head at {cache_dir}"
-            )
-            os.makedirs(cache_dir)
+        @cache_with_joblib()
+        def construct_classification_head(task: str):
+            nonlocal clip_classifier
+
+            classnames, templates = get_classnames_and_templates(task)
+            clip_classifier.set_classification_task(classnames, templates)
+            zeroshot_weights = clip_classifier.zeroshot_weights.detach().clone()
+
+            return zeroshot_weights
 
-        log.info(f"cache directory for zero-shot classification head: {cache_dir}")
         for task in tqdm(
             self.modelpool.model_names if task_names is None else task_names,
             "Setting up zero-shot classification head",
@@ -157,27 +147,7 @@ class CLIPClassificationMixin(LightningFabricMixin):
         ):
             zeroshot_weights = None
             if self.fabric.is_global_zero:
-                cache_file = os.path.join(
-                    cache_dir, os.path.normpath(f"{task}_zeroshot_weights.pt")
-                )
-                if os.path.exists(cache_file):
-                    zeroshot_weights = torch.load(
-                        cache_file,
-                        map_location="cpu",
-                        weights_only=True,
-                    ).detach()
-                    log.info(
-                        f"Loadded cached zeroshot weights for task: {task}, shape: {zeroshot_weights.shape}"
-                    )
-                else:
-                    log.info(
-                        f"Construct zero shot classification head for task: {task}"
-                    )
-                    classnames, templates = get_classnames_and_templates(task)
-                    clip_classifier.set_classification_task(classnames, templates)
-                    zeroshot_weights = clip_classifier.zeroshot_weights.detach().clone()
-                    log.info(f"save zeroshot weights to {cache_file}")
-                    torch.save(zeroshot_weights, cache_file)
+                zeroshot_weights = construct_classification_head(task)
 
             self.fabric.barrier()
             self.zeroshot_weights[task] = self.fabric.broadcast(zeroshot_weights, src=0)
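cache_with_joblib, exported from the top-level fusion_bench package per the import above, replaces the manual torch.save/torch.load cache. A plausible shape for such a decorator, assuming it is backed by joblib.Memory (the cache location and signature here are guesses, not the package's implementation):

from functools import wraps

from joblib import Memory

def cache_with_joblib_sketch(cache_dir: str = "outputs/cache"):
    """Illustrative only: disk-memoize a function on its arguments via joblib,
    the way zero-shot classification heads are now cached per task."""
    memory = Memory(location=cache_dir, verbose=0)

    def decorator(fn):
        cached_fn = memory.cache(fn)

        @wraps(fn)
        def wrapper(*args, **kwargs):
            return cached_fn(*args, **kwargs)

        return wrapper

    return decorator

One design note, if the decorator is indeed joblib-backed: construct_classification_head closes over clip_classifier via nonlocal, so only the task argument would participate in the cache key, whereas the removed code also keyed its cache directory on model_name.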