fusion-bench 0.2.5-py3-none-any.whl → 0.2.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/compat/method/__init__.py +2 -0
- fusion_bench/compat/method/base_algorithm.py +7 -2
- fusion_bench/compat/modelpool/__init__.py +3 -2
- fusion_bench/compat/taskpool/__init__.py +1 -1
- fusion_bench/dataset/arc_agi/__init__.py +6 -1
- fusion_bench/dataset/arc_agi/arc.py +26 -7
- fusion_bench/dataset/arc_agi/arc_agi.py +156 -25
- fusion_bench/dataset/arc_agi/np_cache.py +0 -1
- fusion_bench/dataset/arc_agi/preprocess.py +51 -9
- fusion_bench/dataset/llama/__init__.py +1 -0
- fusion_bench/dataset/llama/alpaca.py +93 -3
- fusion_bench/dataset/llama/collate.py +72 -5
- fusion_bench/dataset/llama/metamathqa.py +50 -0
- fusion_bench/dataset/llama/preference_700k.py +70 -0
- fusion_bench/dataset/llama/stanford_shp.py +90 -0
- fusion_bench/dataset/llama/ultrachat.py +58 -0
- fusion_bench/dataset/llama/utils/__init__.py +0 -0
- fusion_bench/method/__init__.py +4 -1
- fusion_bench/method/adamerging/__init__.py +1 -1
- fusion_bench/method/adamerging/layer_wise_adamerging.py +11 -4
- fusion_bench/method/adamerging/min_norm_solvers.py +4 -4
- fusion_bench/method/linear/expo.py +39 -0
- fusion_bench/method/lm_finetune/__init__.py +1 -0
- fusion_bench/method/lm_finetune/bradley_terry_rm.py +432 -0
- fusion_bench/method/lm_finetune/fullfinetune_sft.py +122 -150
- fusion_bench/method/lm_finetune/peftfinetune_sft.py +102 -157
- fusion_bench/method/pruning/llama_magnitude_prune.py +2 -2
- fusion_bench/method/pruning/llama_random_prune.py +2 -2
- fusion_bench/method/pruning/magnitude_diff_pruning.py +2 -1
- fusion_bench/method/rankone_moe/__init__.py +3 -0
- fusion_bench/method/rankone_moe/clip_rankone_moe.py +160 -0
- fusion_bench/method/rankone_moe/rankone_moe.py +249 -0
- fusion_bench/method/simple_average.py +1 -1
- fusion_bench/method/surgery/__init__.py +3 -0
- fusion_bench/method/surgery/clip_layer_wise_adamerging_surgery.py +157 -0
- fusion_bench/mixins/__init__.py +2 -0
- fusion_bench/mixins/clip_classification.py +60 -12
- fusion_bench/mixins/fabric_training.py +320 -0
- fusion_bench/mixins/lightning_fabric.py +11 -2
- fusion_bench/modelpool/__init__.py +2 -0
- fusion_bench/modelpool/causal_lm/__init__.py +1 -1
- fusion_bench/modelpool/causal_lm/causal_lm.py +21 -22
- fusion_bench/modelpool/seq_classification_lm/__init__.py +2 -0
- fusion_bench/modelpool/seq_classification_lm/reward_model.py +15 -0
- fusion_bench/modelpool/seq_classification_lm/seq_classification_lm.py +98 -0
- fusion_bench/models/chat_templates/__init__.py +1 -0
- fusion_bench/models/chat_templates/llama_3_Instruct.py +1 -0
- fusion_bench/models/chat_templates/load_tokenizer.py +43 -0
- fusion_bench/models/hf_clip.py +50 -9
- fusion_bench/models/rankone_moe.py +410 -0
- fusion_bench/models/surgery/surgerymodelwrapper.py +157 -0
- fusion_bench/models/utils.py +8 -0
- fusion_bench/models/wrappers/layer_wise_fusion.py +14 -5
- fusion_bench/models/wrappers/task_wise_fusion.py +5 -5
- fusion_bench/optim/__init__.py +2 -0
- fusion_bench/optim/exception.py +47 -0
- fusion_bench/optim/lr_scheduler/__init__.py +1 -0
- fusion_bench/optim/lr_scheduler/linear_warmup.py +222 -0
- fusion_bench/optim/lr_scheduler/utils/__init__.py +1 -0
- fusion_bench/optim/lr_scheduler/utils/visualization.py +119 -0
- fusion_bench/optim/mezo.py +0 -2
- fusion_bench/programs/fabric_fusion_program.py +5 -1
- fusion_bench/taskpool/__init__.py +10 -2
- fusion_bench/taskpool/clip_vision/__init__.py +1 -0
- fusion_bench/taskpool/clip_vision/clip_rankone_moe_taskpool.py +112 -0
- fusion_bench/taskpool/clip_vision/taskpool.py +43 -6
- fusion_bench/taskpool/llama/reward_model.py +157 -0
- fusion_bench/taskpool/nyuv2_taskpool.py +2 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_load_dataset.py +2 -1
- fusion_bench/utils/hydra_utils.py +22 -0
- fusion_bench/utils/plot/__init__.py +0 -0
- fusion_bench/utils/plot/token.py +52 -0
- fusion_bench/utils/plot/token_notebook.py +127 -0
- fusion_bench/utils/type.py +5 -3
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/METADATA +1 -1
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/RECORD +104 -57
- fusion_bench_config/clip-vit-base-patch32_robustness_corrupted.yaml +1 -1
- fusion_bench_config/dataset/llm_sft/alpaca_cleaned.yaml +6 -0
- fusion_bench_config/dataset/llm_sft/ultrachat_200k.yaml +3 -0
- fusion_bench_config/fabric/llama_peft_fsdp.yaml +16 -0
- fusion_bench_config/fabric/loggers/wandb_logger.yaml +2 -0
- fusion_bench_config/fabric/strategy/deepspeed.yaml +10 -0
- fusion_bench_config/fabric/strategy/llama_peft_fsdp.yaml +9 -0
- fusion_bench_config/fabric_model_fusion.yaml +1 -1
- fusion_bench_config/llama_full_finetune.yaml +19 -0
- fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml +47 -0
- fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml +13 -6
- fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml +17 -9
- fusion_bench_config/method/rankone_moe/rankone_moe.yaml +26 -0
- fusion_bench_config/method/regmean/clip_regmean.yaml +1 -0
- fusion_bench_config/method/surgery/adamerging_surgery.yaml +27 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_codealpaca.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_metamathqa.yaml +19 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_ultrachat.yaml +18 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/llama_preference700k.yaml +23 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/single_reward_model.yaml +14 -0
- fusion_bench_config/nyuv2_config.yaml +5 -1
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_rankone_wemoe_clip-vit-classification_TA8.yaml +18 -0
- fusion_bench_config/taskpool/reward_model_evaluation.yaml +18 -0
- fusion_bench_config/llama_weighted_average.yaml +0 -26
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/LICENSE +0 -0
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/WHEEL +0 -0
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/top_level.txt +0 -0

fusion_bench/models/wrappers/layer_wise_fusion.py CHANGED

```diff
@@ -1,13 +1,22 @@
 import functools
 import logging
 from copy import deepcopy
-from typing import
+from typing import (  # noqa: F401
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    Iterator,
+    List,
+    Optional,
+    TypeVar,
+)
 
 import torch
 from torch import Tensor, nn
 from torch.func import functional_call
 
-from fusion_bench.utils.type import StateDictType
+from fusion_bench.utils.type import TorchModelType, StateDictType
 
 __all__ = ["get_layer_wise_weights", "fuse_weights", "LayerWiseMergedModel"]
 
@@ -132,14 +141,14 @@ def fuse_weights(
     }
 
 
-class LayerWiseMergedModel(nn.Module):
+class LayerWiseMergedModel(nn.Module, Generic[TorchModelType]):
     _merged_state_dict: StateDictType = None
 
     def __init__(
         self,
         layer_wise_weight: Tensor,
-        pretrained_model:
-        finetuned_models: List[
+        pretrained_model: TorchModelType,
+        finetuned_models: List[TorchModelType],
         clamp_weights: bool = True,
         tie_weights: bool = False,
         strict: bool = True,
```
fusion_bench/models/wrappers/task_wise_fusion.py CHANGED

```diff
@@ -16,13 +16,13 @@ outputs = merged_model(inputs)
 
 import functools
 import logging
-from typing import Any, Callable, Dict, Iterator, List, Optional  # noqa: F401
+from typing import Any, Callable, Dict, Generic, Iterator, List, Optional  # noqa: F401
 
 import torch
 from torch import Tensor, nn
 from torch.func import functional_call
 
-from fusion_bench.utils.type import StateDictType
+from fusion_bench.utils.type import TorchModelType, StateDictType
 
 log = logging.getLogger(__name__)
 
@@ -157,14 +157,14 @@ def fuse_weights(
     }
 
 
-class TaskWiseMergedModel(nn.Module):
+class TaskWiseMergedModel(nn.Module, Generic[TorchModelType]):
     _merged_state_dict: StateDictType = None
 
     def __init__(
         self,
         task_wise_weight: Tensor,
-        pretrained_model:
-        finetuned_models: List[
+        pretrained_model: TorchModelType,
+        finetuned_models: List[TorchModelType],
         clamp_weights: bool = True,
         tie_weights: bool = False,
         strict: bool = True,
```
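Both wrapper classes above are now parameterized with `Generic[TorchModelType]`, so annotations can carry the concrete model type being merged. A minimal, hypothetical sketch of how that might be used downstream (the choice of `CLIPVisionModel` is illustrative and not part of the diff):

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from transformers import CLIPVisionModel

    from fusion_bench.models.wrappers.layer_wise_fusion import LayerWiseMergedModel


def describe(merged: "LayerWiseMergedModel[CLIPVisionModel]") -> str:
    # With the Generic[TorchModelType] base class, static type checkers can
    # track which model type the merged wrapper holds; at runtime this
    # annotation is just a string and has no effect.
    return type(merged).__name__
```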
fusion_bench/optim/exception.py ADDED

```diff
@@ -0,0 +1,47 @@
+class NoSparseGradientError(Exception):
+    """Raised when the gradient is sparse gradient.
+
+    :param optimizer_name: str. optimizer name.
+    :param note: str. special conditions to note (default '').
+    """
+
+    def __init__(self, optimizer_name: str, note: str = ""):
+        self.note: str = " " if not note else f" w/ {note} "
+        self.message: str = (
+            f"[-] {optimizer_name}{self.note}does not support sparse gradient."
+        )
+        super().__init__(self.message)
+
+
+class ZeroParameterSizeError(Exception):
+    """Raised when the parameter size is 0."""
+
+    def __init__(self):
+        self.message: str = "[-] parameter size is 0"
+        super().__init__(self.message)
+
+
+class NoClosureError(Exception):
+    """Raised when there's no closure function."""
+
+    def __init__(self, optimizer_name: str, note: str = ""):
+        self.message: str = f"[-] {optimizer_name} requires closure.{note}"
+        super().__init__(self.message)
+
+
+class NegativeLRError(Exception):
+    """Raised when learning rate is negative."""
+
+    def __init__(self, lr: float, lr_type: str = ""):
+        self.note: str = lr_type if lr_type else "learning rate"
+        self.message: str = f"[-] {self.note} must be positive. ({lr} > 0)"
+        super().__init__(self.message)
+
+
+class NegativeStepError(Exception):
+    """Raised when step is negative."""
+
+    def __init__(self, num_steps: int, step_type: str = ""):
+        self.note: str = step_type if step_type else "step"
+        self.message: str = f"[-] {self.note} must be positive. ({num_steps} > 0)"
+        super().__init__(self.message)
```
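A short sketch of how these exception types are meant to be used (the `check_schedule_args` helper is hypothetical; the real validation lives in the schedulers added below):

```python
from fusion_bench.optim.exception import NegativeLRError, NegativeStepError


def check_schedule_args(max_lr: float, warmup_steps: int) -> None:
    # Hypothetical helper mirroring the validation style of the new schedulers.
    if max_lr < 0:
        raise NegativeLRError(max_lr, "max_lr")
    if warmup_steps < 0:
        raise NegativeStepError(warmup_steps, "warmup_steps")


try:
    check_schedule_args(max_lr=-0.1, warmup_steps=10)
except NegativeLRError as err:
    print(err)  # [-] max_lr must be positive. (-0.1 > 0)
```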
fusion_bench/optim/lr_scheduler/__init__.py ADDED

```diff
@@ -0,0 +1 @@
+from .linear_warmup import *
```
fusion_bench/optim/lr_scheduler/linear_warmup.py ADDED

```diff
@@ -0,0 +1,222 @@
+"""
+Modified from pytorch_optimizer: https://github.com/kozistr/pytorch_optimizer/blob/main/pytorch_optimizer/lr_scheduler/linear_warmup.py
+"""
+
+import math
+from abc import ABC, abstractmethod
+from typing import List
+
+import numpy as np
+import torch
+
+from fusion_bench.optim.exception import NegativeLRError, NegativeStepError
+
+__all__ = [
+    "BaseLinearWarmupScheduler",
+    "LinearWarmupScheduler",
+    "CosineDecayWithWarmup",
+    "PolySchedulerWithWarmup",
+]
+
+
+class BaseLinearWarmupScheduler(ABC):
+    r"""BaseLinearWarmupScheduler class.
+
+    The LR Scheduler class based on this class has linear warmup strategy.
+
+    Args:
+        optimizer (torch.optim.Optimizer): Optimizer. It will set learning rate to all trainable parameters in optimizer.
+        T_max (int): Total steps to train.
+        max_lr (float): Maximum learning rate.
+        min_lr (float): Minimum learning rate.
+        init_lr (float): Initial learning rate.
+        warmup_steps (int): Steps to warm-up.
+    """
+
+    def __init__(
+        self,
+        optimizer: torch.optim.Optimizer,
+        T_max: int,
+        max_lr: float,
+        min_lr: float = 0.0,
+        init_lr: float = 0.0,
+        warmup_steps: int = 0,
+    ):
+        """
+        Initialize the BaseLinearWarmupScheduler.
+
+        Args:
+            optimizer (torch.optim.Optimizer): Optimizer to apply the learning rate schedule.
+            T_max (int): Total number of training steps.
+            max_lr (float): Maximum learning rate.
+            min_lr (float): Minimum learning rate.
+            init_lr (float): Initial learning rate.
+            warmup_steps (int): Number of steps for the warm-up phase.
+        """
+        self.optimizer = optimizer
+        self.total_steps = T_max
+        self.max_lr = max_lr
+        self.min_lr = min_lr
+        self.init_lr = init_lr
+        self.warmup_steps = warmup_steps
+
+        self.step_t: int = 0
+        self.base_lrs: List[float] = []
+
+        # record current value in self._last_lr to match API from torch.optim.lr_scheduler
+        self.last_lr: List[float] = [init_lr]
+
+        self.validate_parameters()
+
+        self._init_lr()
+
+    def validate_parameters(self):
+        """
+        Validate the parameters to ensure they are non-negative.
+
+        Raises:
+            NegativeLRError: If any of the learning rates are negative.
+            NegativeStepError: If any of the step values are negative.
+        """
+        if self.min_lr < 0:
+            raise NegativeLRError(self.min_lr, "min_lr")
+
+        if self.max_lr < 0:
+            raise NegativeLRError(self.max_lr, "max_lr")
+
+        if self.init_lr < 0:
+            raise NegativeLRError(self.init_lr, "init_lr")
+
+        if self.total_steps < 0:
+            raise NegativeStepError(self.total_steps, "T_max")
+
+        if self.warmup_steps < 0:
+            raise NegativeStepError(self.warmup_steps, "warmup_steps")
+
+    def _init_lr(self):
+        """
+        Initialize the learning rate for each parameter group in the optimizer.
+        """
+        self.base_lrs = []
+        for param_group in self.optimizer.param_groups:
+            param_group["lr"] = self.min_lr
+            self.base_lrs.append(self.min_lr)
+
+    def step(self):
+        """
+        Update the learning rate for the current step.
+
+        Returns:
+            float: The updated learning rate.
+        """
+        if self.step_t < self.warmup_steps:
+            value = (
+                self.init_lr
+                + (self.max_lr - self.init_lr) * self.step_t / self.warmup_steps
+            )
+        elif self.step_t == self.warmup_steps:
+            value = self.max_lr
+        else:
+            value = self._step()
+
+        self.step_t += 1
+
+        if self.optimizer is not None:
+            for param_group in self.optimizer.param_groups:
+                param_group["lr"] = value
+
+        self.last_lr = [value]
+
+        return value
+
+    @abstractmethod
+    def _step(self) -> float:  # pragma: no cover
+        """
+        Abstract method to calculate the learning rate for the current step.
+
+        Returns:
+            float: The calculated learning rate.
+        """
+        raise NotImplementedError
+
+    def get_lr(self) -> float:
+        """
+        Get the current learning rate.
+
+        Returns:
+            float: The current learning rate.
+        """
+        return self.last_lr[0]
+
+
+class LinearWarmupScheduler(BaseLinearWarmupScheduler):
+    r"""Linear LR Scheduler w/ linear warmup."""
+
+    def _step(self) -> float:
+        """
+        Calculate the learning rate for the current step using a linear decay.
+
+        Returns:
+            float: The calculated learning rate.
+        """
+        return self.max_lr + (self.min_lr - self.max_lr) * (
+            self.step_t - self.warmup_steps
+        ) / (self.total_steps - self.warmup_steps)
+
+
+class CosineDecayWithWarmup(BaseLinearWarmupScheduler):
+    r"""Cosine LR Scheduler w/ linear warmup."""
+
+    def _step(self) -> float:
+        """
+        Calculate the learning rate for the current step using a cosine decay.
+
+        Returns:
+            float: The calculated learning rate.
+        """
+        phase: float = (
+            (self.step_t - self.warmup_steps)
+            / (self.total_steps - self.warmup_steps)
+            * math.pi
+        )
+        return self.min_lr + (self.max_lr - self.min_lr) * (np.cos(phase) + 1.0) / 2.0
+
+
+class PolySchedulerWithWarmup(BaseLinearWarmupScheduler):
+    r"""Poly LR Scheduler.
+
+    Args:
+        poly_order (float): LR scheduler decreases with steps.
+    """
+
+    def __init__(self, optimizer, poly_order: float = 0.5, **kwargs):
+        """
+        Initialize the PolySchedulerWithWarmup.
+
+        Args:
+            optimizer (torch.optim.Optimizer): Optimizer to apply the learning rate schedule.
+            poly_order (float): Order of the polynomial for the learning rate decay.
+            kwargs: Additional arguments for the base class.
+
+        Raises:
+            ValueError: If poly_order is not positive.
+        """
+        self.poly_order = poly_order
+
+        if poly_order <= 0:
+            raise ValueError(f"[-] poly_order must be positive. {poly_order}")
+
+        super().__init__(optimizer, **kwargs)
+
+    def _step(self) -> float:
+        """
+        Calculate the learning rate for the current step using a polynomial decay.
+
+        Returns:
+            float: The calculated learning rate.
+        """
+        return (
+            self.min_lr
+            + (self.max_lr - self.min_lr)
+            * (self.step_t - self.warmup_steps) ** self.poly_order
+        )
```
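A usage sketch for the new schedulers, assuming only what the module above defines. Unlike `torch.optim.lr_scheduler`, `step()` here both writes the new learning rate into the optimizer's parameter groups and returns it:

```python
import torch

from fusion_bench.optim.lr_scheduler.linear_warmup import CosineDecayWithWarmup

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0)
scheduler = CosineDecayWithWarmup(
    optimizer, T_max=1000, max_lr=1e-3, min_lr=1e-5, init_lr=0.0, warmup_steps=100
)

for _ in range(1000):
    # ... forward / backward / optimizer.step() would normally go here ...
    lr = scheduler.step()  # linear ramp to max_lr for 100 steps, then cosine decay toward min_lr
```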
fusion_bench/optim/lr_scheduler/utils/__init__.py ADDED

```diff
@@ -0,0 +1 @@
+from .visualization import *
```
fusion_bench/optim/lr_scheduler/utils/visualization.py ADDED

```diff
@@ -0,0 +1,119 @@
+"""
+This module provides utilities for visualizing learning rate schedulers.
+
+Functions:
+    simulate_scheduler(lr_scheduler, steps): Simulates the learning rate scheduler for a given number of steps.
+    plot_lr_schedulers(lr_schedulers, steps, titles): Plots the learning rates of one or more schedulers over a number of steps.
+"""
+
+from typing import TYPE_CHECKING, List, Union
+
+import matplotlib.pyplot as plt
+import torch
+
+if TYPE_CHECKING:
+    from torch.optim.lr_scheduler import LRScheduler
+
+__all__ = ["simulate_scheduler", "plot_lr_schedulers"]
+
+
+def simulate_scheduler(lr_scheduler, steps: int):
+    """
+    Simulates the learning rate scheduler for a given number of steps.
+
+    Args:
+        lr_scheduler (torch.optim.lr_scheduler.LRScheduler): The learning rate scheduler object.
+        steps (int): The number of steps to simulate.
+
+    Returns:
+        List[float]: A list of learning rates for each step.
+    """
+    lrs = []
+    for _ in range(steps):
+        lr = lr_scheduler.step()
+        lrs.append(lr)
+    return lrs
+
+
+def plot_lr_schedulers(
+    lr_schedulers: Union["LRScheduler", List["LRScheduler"]],
+    steps: int,
+    titles: Union[str, List[str]],
+    show_plot: bool = True,
+):
+    """
+    Plots the learning rates of one or more schedulers over a number of steps.
+
+    Args:
+        lr_schedulers (Union[LRScheduler, List[LRScheduler]]): One or more learning rate scheduler objects.
+        steps (int): The number of steps to simulate.
+        titles (Union[str, List[str]]): Titles for the plots.
+
+    Returns:
+        fig, axes: The matplotlib figure and axes objects.
+    """
+    # Handle single scheduler
+    if isinstance(lr_schedulers, torch.optim.lr_scheduler.LRScheduler):
+        lr_schedulers = [lr_schedulers]
+    if isinstance(titles, str):
+        titles = [titles]
+
+    fig, axs = plt.subplots(len(lr_schedulers), 1, figsize=(5, 3 * len(lr_schedulers)))
+    if len(lr_schedulers) == 1:
+        axs = [axs]
+
+    for i, (scheduler, title) in enumerate(zip(lr_schedulers, titles)):
+        lrs = simulate_scheduler(scheduler, steps)
+        axs[i].plot(lrs, label=title)
+        axs[i].set_title(title)
+        axs[i].set_xlabel("Steps")
+        axs[i].set_ylabel("Learning Rate")
+        axs[i].legend()
+        axs[i].grid(True)
+
+    plt.tight_layout()
+    if show_plot:
+        plt.show()
+    return fig, axs
+
+
+# Example usage
+if __name__ == "__main__":
+    from fusion_bench.optim.lr_scheduler.linear_warmup import (
+        CosineDecayWithWarmup,
+        LinearWarmupScheduler,
+        PolySchedulerWithWarmup,
+    )
+
+    # Dummy optimizer
+    optimizer = torch.optim.SGD(
+        [torch.nn.Parameter(torch.randn(2, 2, requires_grad=True))], lr=0.1
+    )
+
+    # Define the schedulers
+    linear_scheduler = LinearWarmupScheduler(
+        optimizer, t_max=100, max_lr=0.1, min_lr=0.01, init_lr=0.0, warmup_steps=10
+    )
+    cosine_scheduler = CosineDecayWithWarmup(
+        optimizer, t_max=100, max_lr=0.1, min_lr=0.01, init_lr=0.0, warmup_steps=10
+    )
+    poly_scheduler = PolySchedulerWithWarmup(
+        optimizer,
+        t_max=100,
+        max_lr=0.1,
+        min_lr=0.01,
+        init_lr=0.0,
+        warmup_steps=40,
+        poly_order=2.0,
+    )
+
+    # Plot the learning rates
+    plot_lr_schedulers(
+        [linear_scheduler, cosine_scheduler, poly_scheduler],
+        steps=100,
+        titles=[
+            "Linear Warmup",
+            "Cosine Decay with Warmup",
+            "Poly Scheduler with Warmup",
+        ],
+    )
```
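A small usage sketch for the plotting helper, assuming the schedulers from `linear_warmup.py`; note that their constructors expect `T_max=` (the `__main__` block above passes `t_max=`), and that passing the schedulers in a list avoids the single-scheduler branch, which only recognizes `torch.optim.lr_scheduler.LRScheduler` instances:

```python
import torch

from fusion_bench.optim.lr_scheduler.linear_warmup import LinearWarmupScheduler
from fusion_bench.optim.lr_scheduler.utils.visualization import plot_lr_schedulers

optimizer = torch.optim.SGD([torch.nn.Parameter(torch.randn(2, 2))], lr=0.1)
scheduler = LinearWarmupScheduler(
    optimizer, T_max=100, max_lr=0.1, min_lr=0.01, init_lr=0.0, warmup_steps=10
)

# Simulate 100 steps and save the resulting schedule without opening a window.
fig, axes = plot_lr_schedulers([scheduler], steps=100, titles=["Linear Warmup"], show_plot=False)
fig.savefig("linear_warmup_schedule.png")
```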
fusion_bench/programs/fabric_fusion_program.py CHANGED

```diff
@@ -236,7 +236,11 @@ class FabricModelFusionProgram(
             self.save_merged_model(merged_model)
         if self.taskpool is not None:
             report = self.evaluate_merged_model(self.taskpool, merged_model)
-
+            try:
+                print_json(report, print_type=False)
+            except Exception as e:
+                log.warning(f"Failed to pretty print the report: {e}")
+                print(report)
         if self.report_save_path is not None:
             # save report (Dict) to a file
             # if the directory of `save_report` does not exists, create it
```
fusion_bench/taskpool/__init__.py CHANGED

```diff
@@ -7,7 +7,11 @@ from fusion_bench.utils.lazy_imports import LazyImporter
 
 _import_structure = {
     "base_pool": ["BaseTaskPool"],
-    "clip_vision": [
+    "clip_vision": [
+        "CLIPVisionModelTaskPool",
+        "SparseWEMoECLIPVisionModelTaskPool",
+        "RankoneWEMoECLIPVisionModelTaskPool",
+    ],
     "dummy": ["DummyTaskPool"],
     "gpt2_text_classification": ["GPT2TextClassificationTaskPool"],
     "nyuv2_taskpool": ["NYUv2TaskPool"],
@@ -17,7 +21,11 @@ _import_structure = {
 
 if TYPE_CHECKING:
     from .base_pool import BaseTaskPool
-    from .clip_vision import
+    from .clip_vision import (
+        CLIPVisionModelTaskPool,
+        RankoneWEMoECLIPVisionModelTaskPool,
+        SparseWEMoECLIPVisionModelTaskPool,
+    )
     from .dummy import DummyTaskPool
     from .gpt2_text_classification import GPT2TextClassificationTaskPool
     from .llama import LlamaTestGenerationTaskPool
```
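The taskpool package registers the new CLIP task pools in its lazy-import table and mirrors them under `TYPE_CHECKING` for static analysis. A simplified sketch of the general pattern (this is not the package's actual `LazyImporter` implementation, only the idea behind it):

```python
import importlib
from typing import TYPE_CHECKING

_import_structure = {
    "clip_vision": [
        "CLIPVisionModelTaskPool",
        "RankoneWEMoECLIPVisionModelTaskPool",
    ],
}

if TYPE_CHECKING:
    # Type checkers and IDEs resolve the real symbols eagerly...
    from .clip_vision import CLIPVisionModelTaskPool  # noqa: F401


def __getattr__(name: str):
    # ...while at runtime the submodule is imported only on first access.
    for module_name, symbols in _import_structure.items():
        if name in symbols:
            module = importlib.import_module(f".{module_name}", __name__)
            return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```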
fusion_bench/taskpool/clip_vision/clip_rankone_moe_taskpool.py ADDED

```diff
@@ -0,0 +1,112 @@
+from copy import deepcopy
+from pathlib import Path
+from typing import Any, Dict, Optional, Tuple
+
+import torch
+from torch import Tensor
+from torch.utils.hooks import RemovableHandle
+from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
+from transformers.models.clip.modeling_clip import CLIPVisionTransformer
+
+from fusion_bench.models.hf_clip import HFCLIPClassifier
+from fusion_bench.models.rankone_moe import RankOneMoE
+
+from .taskpool import CLIPVisionModelTaskPool
+
+
+class LayerWiseRoutingWeightSaver:
+    def __init__(self, save_path: Path, max_num: Optional[int] = None):
+        self.save_path = save_path
+        self.max_num = max_num
+        self.routing_weights = []
+
+    def __call__(self, module, input: Tuple[Tensor], output: Tensor):
+        assert isinstance(output, Tensor), "Output is expected to be a Tensor"
+        # (batch_size, num_tokens, num_experts)
+        routing_weights = output.detach().cpu()
+        if self.max_num is not None and self.max_num > 0:
+            if len(self.routing_weights) > self.max_num:
+                return
+            elif routing_weights.size(0) + len(self.routing_weights) > self.max_num:
+                self.routing_weights.append(
+                    routing_weights[: self.max_num - len(self.routing_weights)]
+                )
+            else:
+                self.routing_weights.append(routing_weights)
+        else:
+            self.routing_weights.append(routing_weights)
+
+    def save_routing_weights(self):
+        routing_weights = torch.cat(self.routing_weights, dim=0)
+        if self.save_path is not None:
+            self.save_path.parent.mkdir(parents=True, exist_ok=True)
+            print(f"Saving routing weights to {self.save_path}")
+            torch.save(routing_weights, self.save_path)
+
+
+class RankoneMoECLIPVisionModelTaskPool(CLIPVisionModelTaskPool):
+
+    # hooks and handles for saving layer-wise routing weights
+    _layer_wise_routing_weights_save_hooks: Dict[Any, LayerWiseRoutingWeightSaver] = {}
+    _layer_wise_routing_weights_save_hook_handles: Dict[Any, RemovableHandle] = {}
+
+    _config_mapping = CLIPVisionModelTaskPool._config_mapping | {
+        "_layer_wise_routing_weights_save_path": "layer_wise_routing_weights_save_path",
+    }
+
+    def __init__(
+        self,
+        layer_wise_routing_weights_save_path: Optional[str],
+        layer_wise_routing_weights_max_num: Optional[int] = None,
+        **kwargs,
+    ):
+        # save path for layer-wise routing weights
+        self._layer_wise_routing_weights_save_path = (
+            layer_wise_routing_weights_save_path
+        )
+        self.layer_wise_routing_weights_save_path = (
+            Path(layer_wise_routing_weights_save_path)
+            if layer_wise_routing_weights_save_path is not None
+            else None
+        )
+        self.layer_wise_routing_weights_max_num = layer_wise_routing_weights_max_num
+        super().__init__(**kwargs)
+
+    def on_task_evaluation_begin(self, classifier: HFCLIPClassifier, task_name: str):
+        super().on_task_evaluation_begin(classifier, task_name)
+        if self.layer_wise_routing_weights_save_path is not None:
+            # setup hooks for saving layer-wise routing weights
+            assert isinstance(
+                classifier.clip_model.vision_model,
+                (CLIPVisionTransformer, CLIPVisionModel),
+            ), "Vision model is expected to be a CLIPVisionTransformer"
+            vision_model = classifier.clip_model.vision_model
+            if isinstance(vision_model, CLIPVisionModel):
+                vision_model = vision_model.vision_model
+            # assign forward hooks for each layer
+
+            for i, layer in enumerate(vision_model.encoder.layers):
+                mlp = layer.mlp
+                assert isinstance(
+                    mlp,
+                    (RankOneMoE),
+                ), f"MLP is expected to be a RankOneWeightEnsemblingMoE, but got {type(mlp)}"
+                # layer-wise routing weights
+                hook = LayerWiseRoutingWeightSaver(
+                    self.layer_wise_routing_weights_save_path
+                    / task_name
+                    / f"layer_{i}.pt",
+                    max_num=self.layer_wise_routing_weights_max_num,
+                )
+                self._layer_wise_routing_weights_save_hooks[i] = hook
+                self._layer_wise_routing_weights_save_hook_handles[i] = (
+                    mlp.gate.register_forward_hook(hook)
+                )
+
+    def on_task_evaluation_end(self):
+        super().on_task_evaluation_end()
+        if self.layer_wise_routing_weights_save_path is not None:
+            # remove hooks for saving layer-wise routing weights
+            for i, handle in self._layer_wise_routing_weights_save_hook_handles.items():
+                self._layer_wise_routing_weights_save_hooks[i].save_routing_weights()
+                handle.remove()
```