fusion-bench 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/__init__.py +22 -2
- fusion_bench/_get_started/__init__.py +3 -0
- fusion_bench/_get_started/greeting_program.py +49 -0
- fusion_bench/compat/method/base_algorithm.py +14 -0
- fusion_bench/constants/__init__.py +6 -0
- fusion_bench/constants/clip_vision.py +26 -2
- fusion_bench/constants/paths.py +4 -0
- fusion_bench/constants/runtime.py +57 -0
- fusion_bench/dataset/clip_dataset.py +2 -1
- fusion_bench/dataset/gpt2_glue.py +9 -9
- fusion_bench/dataset/image_corruption/__init__.py +0 -0
- fusion_bench/dataset/image_corruption/make_corruption.py +179 -0
- fusion_bench/dataset/image_dataset.py +1 -1
- fusion_bench/dataset/nyuv2.py +2 -2
- fusion_bench/method/__init__.py +24 -5
- fusion_bench/method/adamerging/clip_layer_wise_adamerging.py +1 -1
- fusion_bench/method/adamerging/clip_task_wise_adamerging.py +11 -7
- fusion_bench/method/adamerging/layer_wise_adamerging.py +11 -5
- fusion_bench/method/base_algorithm.py +195 -12
- fusion_bench/method/bitdelta/__init__.py +5 -0
- fusion_bench/method/bitdelta/bitdelta.py +156 -0
- fusion_bench/method/bitdelta/bitdelta_utils/__init__.py +0 -0
- fusion_bench/method/bitdelta/bitdelta_utils/binary_gemm_kernel.py +462 -0
- fusion_bench/method/bitdelta/bitdelta_utils/data.py +35 -0
- fusion_bench/method/bitdelta/bitdelta_utils/diff.py +129 -0
- fusion_bench/method/classification/clip_finetune.py +1 -1
- fusion_bench/method/concrete_subspace/clip_concrete_adamerging.py +0 -1
- fusion_bench/method/depth_upscaling/depth_upscaling.py +4 -9
- fusion_bench/method/doge_ta/clip_layer_wise_adamerging.py +4 -5
- fusion_bench/method/doge_ta/doge_ta.py +1 -1
- fusion_bench/method/ensemble.py +12 -12
- fusion_bench/method/expert_sparsity/utils/calibration_data.py +1 -1
- fusion_bench/method/fisher_merging/clip_fisher_merging.py +2 -6
- fusion_bench/method/fisher_merging/fisher_merging.py +6 -15
- fusion_bench/method/fisher_merging/gpt2_fisher_merging.py +3 -10
- fusion_bench/method/fw_merging/fw_hard.py +1 -1
- fusion_bench/method/fw_merging/fw_soft.py +1 -1
- fusion_bench/method/gossip/clip_layer_wise_gossip.py +4 -5
- fusion_bench/method/linear/expo.py +2 -1
- fusion_bench/method/linear/linear_interpolation.py +6 -4
- fusion_bench/method/linear/simple_average_for_llama.py +17 -13
- fusion_bench/method/lm_finetune/bradley_terry_rm.py +2 -2
- fusion_bench/method/mixture_of_experts/mixtral_upcycling.py +9 -26
- fusion_bench/method/model_recombination.py +2 -5
- fusion_bench/method/moe_pruner/hooks/__init__.py +1 -2
- fusion_bench/method/moe_pruner/utils/data.py +2 -1
- fusion_bench/method/moe_pruner/utils/prune.py +6 -1
- fusion_bench/method/pruning/llama_magnitude_prune.py +1 -1
- fusion_bench/method/pruning/wanda_utils/data.py +1 -2
- fusion_bench/method/pwe_moe/clip_pwe_moe.py +12 -34
- fusion_bench/method/randes/modelsoup.py +1 -3
- fusion_bench/method/regmean/clip_regmean.py +2 -2
- fusion_bench/method/regmean/gpt2_regmean.py +3 -10
- fusion_bench/method/regmean/regmean.py +2 -11
- fusion_bench/method/regmean_plusplus/__init__.py +1 -1
- fusion_bench/method/regmean_plusplus/clip_regmean_plusplus.py +24 -17
- fusion_bench/method/regmean_plusplus/regmean_plusplus.py +56 -38
- fusion_bench/method/simple_average.py +12 -16
- fusion_bench/method/slerp/slerp.py +5 -2
- fusion_bench/method/smile_upscaling/causal_lm_upscaling.py +371 -0
- fusion_bench/method/smile_upscaling/error_accumulation.py +177 -0
- fusion_bench/method/smile_upscaling/projected_energy.py +144 -0
- fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py +5 -1
- fusion_bench/method/smile_upscaling/smile_qwen2_upscaling.py +71 -51
- fusion_bench/method/smile_upscaling/smile_upscaling.py +12 -5
- fusion_bench/method/tall_mask/task_arithmetic.py +3 -11
- fusion_bench/method/task_arithmetic/task_arithmetic.py +6 -10
- fusion_bench/method/ties_merging/ties_merging.py +13 -26
- fusion_bench/method/we_moe/__init__.py +1 -0
- fusion_bench/method/we_moe/clip_we_moe.py +5 -4
- fusion_bench/method/we_moe/entropy_loss.py +25 -0
- fusion_bench/method/we_moe/flan_t5_we_moe.py +331 -0
- fusion_bench/method/we_moe/utils.py +15 -0
- fusion_bench/method/we_moe/we_moe.py +6 -6
- fusion_bench/method/weighted_average/llama.py +4 -16
- fusion_bench/metrics/continual_learning/__init__.py +1 -0
- fusion_bench/metrics/continual_learning/backward_transfer.py +1 -1
- fusion_bench/metrics/nyuv2/__init__.py +2 -2
- fusion_bench/metrics/nyuv2/segmentation.py +1 -1
- fusion_bench/mixins/__init__.py +10 -2
- fusion_bench/mixins/clip_classification.py +15 -45
- fusion_bench/mixins/hydra_config.py +105 -7
- fusion_bench/mixins/lightning_fabric.py +2 -0
- fusion_bench/mixins/serialization.py +275 -48
- fusion_bench/modelpool/__init__.py +2 -2
- fusion_bench/modelpool/base_pool.py +29 -9
- fusion_bench/modelpool/causal_lm/causal_lm.py +41 -33
- fusion_bench/modelpool/clip_vision/modelpool.py +1 -3
- fusion_bench/modelpool/seq_classification_lm/__init__.py +1 -1
- fusion_bench/modelpool/seq_classification_lm/seq_classification_lm.py +1 -1
- fusion_bench/models/__init__.py +7 -1
- fusion_bench/models/expert_sparsity/mixtral/__init__.py +1 -1
- fusion_bench/models/hf_utils.py +160 -0
- fusion_bench/models/linearized/linearized_model_utils.py +4 -4
- fusion_bench/models/linearized/vision_model.py +1 -1
- fusion_bench/models/model_card_templates/default.md +46 -0
- fusion_bench/models/modeling_deepseek_v2/__init__.py +1 -1
- fusion_bench/models/modeling_deepseek_v2/modeling_deepseek.py +4 -4
- fusion_bench/models/modeling_deepseek_v2/tokenization_deepseek_fast.py +0 -1
- fusion_bench/models/modeling_smile_gemma2/__init__.py +9 -0
- fusion_bench/models/modeling_smile_gemma2/configuration_smile_gemma2.py +20 -0
- fusion_bench/models/modeling_smile_gemma2/modeling_smile_gemma2.py +986 -0
- fusion_bench/models/modeling_smile_gemma2/register.py +26 -0
- fusion_bench/models/modeling_smile_llama/__init__.py +7 -0
- fusion_bench/models/modeling_smile_llama/configuration_smile_llama.py +20 -0
- fusion_bench/models/modeling_smile_llama/modeling_smile_llama.py +698 -0
- fusion_bench/models/modeling_smile_llama/register.py +8 -0
- fusion_bench/models/modeling_smile_mistral/__init__.py +5 -47
- fusion_bench/models/modeling_smile_qwen2/__init__.py +1 -1
- fusion_bench/models/modeling_smile_qwen2/modeling_smile_qwen2.py +7 -12
- fusion_bench/models/modeling_smile_qwen2/register.py +1 -4
- fusion_bench/models/parameter_dict.py +1 -1
- fusion_bench/models/sparse_we_moe.py +1 -53
- fusion_bench/models/utils.py +26 -0
- fusion_bench/models/we_moe.py +1 -53
- fusion_bench/models/wrappers/ensemble.py +6 -4
- fusion_bench/models/wrappers/layer_wise_fusion.py +1 -1
- fusion_bench/models/wrappers/task_wise_fusion.py +250 -72
- fusion_bench/programs/base_program.py +81 -2
- fusion_bench/programs/fabric_fusion_program.py +46 -61
- fusion_bench/scripts/cli.py +38 -5
- fusion_bench/taskpool/base_pool.py +4 -3
- fusion_bench/taskpool/clip_vision/taskpool.py +43 -22
- fusion_bench/taskpool/dummy.py +1 -1
- fusion_bench/taskpool/lm_eval_harness/taskpool.py +1 -2
- fusion_bench/tasks/clip_classification/__init__.py +6 -4
- fusion_bench/utils/__init__.py +7 -1
- fusion_bench/utils/cache_utils.py +101 -1
- fusion_bench/utils/devices.py +14 -4
- fusion_bench/utils/fabric.py +2 -2
- fusion_bench/utils/instantiate_utils.py +3 -1
- fusion_bench/utils/lazy_imports.py +23 -0
- fusion_bench/utils/lazy_state_dict.py +38 -3
- fusion_bench/utils/modelscope.py +127 -8
- fusion_bench/utils/parameters.py +2 -2
- fusion_bench/utils/path.py +56 -0
- fusion_bench/utils/pylogger.py +1 -1
- fusion_bench/utils/rich_utils.py +3 -0
- fusion_bench/utils/state_dict_arithmetic.py +25 -23
- {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/METADATA +24 -47
- {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/RECORD +184 -145
- fusion_bench_config/_get_started/clip_evaluate_single_model.yaml +21 -0
- fusion_bench_config/_get_started/clip_simple_average.yaml +23 -0
- fusion_bench_config/_get_started/clip_task_arithmetic.yaml +24 -0
- fusion_bench_config/_get_started/greeting_program.yaml +4 -0
- fusion_bench_config/fabric/loggers/csv_logger.yaml +3 -3
- fusion_bench_config/fabric/loggers/tensorboard_logger.yaml +3 -3
- fusion_bench_config/fabric_model_fusion.yaml +45 -17
- fusion_bench_config/hydra/default.yaml +6 -2
- fusion_bench_config/llama_full_finetune.yaml +1 -0
- fusion_bench_config/method/adamerging/clip.yaml +1 -1
- fusion_bench_config/method/bitdelta/bitdelta.yaml +12 -0
- fusion_bench_config/method/depth_upscaling.yaml +4 -1
- fusion_bench_config/method/fisher_merging/clip_fisher_merging.yaml +0 -1
- fusion_bench_config/method/linear/simple_average_for_llama.yaml +3 -2
- fusion_bench_config/method/smile_upscaling/causal_lm_upscaling.yaml +21 -0
- fusion_bench_config/method/smile_upscaling/error_accumulation.yaml +5 -0
- fusion_bench_config/method/smile_upscaling/projected_energy.yaml +2 -0
- fusion_bench_config/method/smile_upscaling/smile_qwen2_upscaling.yaml +2 -1
- fusion_bench_config/method/wemoe/flan_t5_weight_ensembling_moe.yaml +20 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/_template.yaml +1 -4
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_individual.yaml +4 -9
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_finetuned.yaml +1 -1
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_svhn_and_mnist.yaml +0 -6
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8.yaml +1 -1
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8_model_only.yaml +1 -1
- fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-1.5B_math_and_coder.yaml +3 -3
- fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-7B-math_and_coder.yaml +9 -0
- fusion_bench_config/modelpool/CausalLMPool/mistral-7b.yaml +6 -0
- fusion_bench_config/modelpool/CausalLMPool/mixtral_moe_merging.yaml +10 -0
- fusion_bench_config/modelpool/CausalLMPool/qwen2_math_1.5B_and_R1.yaml +4 -12
- fusion_bench_config/modelpool/CausalLMPool/simle_mixtral_exp_v4.yaml +6 -16
- fusion_bench_config/modelpool/CausalLMPool/vicuna-7b-v1.5.yaml +8 -0
- fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/llama_preference700k.yaml +1 -1
- fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/single_reward_model.yaml +1 -1
- fusion_bench_config/nyuv2_config.yaml +3 -1
- fusion_bench_config/nyuv2_mtl_train.yaml +1 -0
- fusion_bench_config/path/default.yaml +28 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-base-patch32_svhn_and_mnist.yaml +24 -0
- fusion_bench_config/method/adamerging.yaml +0 -23
- fusion_bench_config/modelpool/mixtral_moe_merging.yaml +0 -14
- fusion_bench_config/modelpool/mixtral_moe_upscaling.yaml +0 -6
- fusion_bench_config/taskpool/clip-vit-base-patch32_svhn_and_mnist.yaml +0 -22
- {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/WHEEL +0 -0
- {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/licenses/LICENSE +0 -0
- {fusion_bench-0.2.20.dist-info → fusion_bench-0.2.22.dist-info}/top_level.txt +0 -0
- /fusion_bench_config/modelpool/{SeqenceClassificationModelPool → SequenceClassificationModelPool}/roberta-base_glue.yaml +0 -0
fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py

@@ -20,6 +20,7 @@ from transformers.models.mistral.modeling_mistral import MistralDecoderLayer
 from fusion_bench.compat.modelpool import to_modelpool
 from fusion_bench.method import BaseAlgorithm
 from fusion_bench.method.simple_average import simple_average
+from fusion_bench.mixins import auto_register_config
 from fusion_bench.mixins.simple_profiler import SimpleProfilerMixin
 from fusion_bench.modelpool import BaseModelPool
 from fusion_bench.models.modeling_smile_mistral import (
@@ -40,7 +41,10 @@ from fusion_bench.utils.parameters import print_parameters
 log = logging.getLogger(__name__)


-class SmileMistralUpscalingAlgorithm(
+class SmileMistralUpscalingAlgorithm(
+    SimpleProfilerMixin,
+    BaseAlgorithm,
+):
     R"""
     SmileMistralUpscalingAlgorithm is a model fusion algorithm designed to upscale
     a pretrained Mistral model using a set of fine-tuned expert models. The algorithm
fusion_bench/method/smile_upscaling/smile_qwen2_upscaling.py

@@ -16,10 +16,17 @@ from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer

 from fusion_bench import BaseAlgorithm, BaseModelPool
 from fusion_bench.compat.modelpool import to_modelpool
-from fusion_bench.
+from fusion_bench.constants import RuntimeConstants
+from fusion_bench.mixins import SimpleProfilerMixin, auto_register_config
+from fusion_bench.modelpool import CausalLMPool
+from fusion_bench.models.hf_utils import (
+    create_default_model_card,
+    save_pretrained_with_remote_code,
+)
 from fusion_bench.models.modeling_smile_qwen2 import (
     SmileQwen2Config,
     SmileQwen2ForCausalLM,
+    SmileQwen2Model,
 )
 from fusion_bench.models.modeling_smile_qwen2.modeling_smile_qwen2 import (
     SmileQwen2DecoderLayer,
@@ -34,7 +41,11 @@ from fusion_bench.utils.parameters import print_parameters
 log = logging.getLogger(__name__)


-class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
+@auto_register_config
+class SmileQwen2UpscalingAlgorithm(
+    SimpleProfilerMixin,
+    BaseAlgorithm,
+):
     R"""
     SmileQwen2UpscalingAlgorithm is a model fusion algorithm designed to upscale
     a pretrained Qwen2 model using a set of fine-tuned expert models. The algorithm
@@ -49,39 +60,29 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
     Merges the pretrained model with the fine-tuned models to create an upscaled model.
     """

-    _config_mapping = BaseAlgorithm._config_mapping | {
-        "device": "device",
-        "accelerator": "accelerator",
-        "model_path": "model_path",
-        "model_dtype": "model_dtype",
-        "num_experts_per_tok": "num_experts_per_tok",
-        "rank_of_router": "rank_of_router",
-        "rank_of_expert": "rank_of_expert",
-    }
+    modelpool: CausalLMPool

     def __init__(
         self,
         device,
         accelerator,
-        model_path,
+        model_save_path,
         model_dtype,
         num_experts_per_tok,
         rank_of_router,
         rank_of_expert,
+        save_with_remote_code: bool = True,
         **kwargs,
     ):
-        self.device = device
-        self.accelerator = accelerator
-        self.model_path = model_path
-        self.model_dtype = model_dtype
-        # SmileMoE parameters, except `num_local_experts` which is set later according to the number of finetuned models
-        self.num_experts_per_tok = num_experts_per_tok
-        self.rank_of_router = rank_of_router
-        self.rank_of_expert = rank_of_expert
         super().__init__(**kwargs)
+        if not torch.cuda.is_available():
+            if "cuda" in self.device:
+                self.device = "cpu"
+            if "cuda" in self.accelerator:
+                self.accelerator = "cpu"

     @torch.no_grad()
-    def run(self, modelpool
+    def run(self, modelpool) -> SmileQwen2ForCausalLM:
         """
         Executes the upscaling process.

@@ -94,13 +95,6 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
         self.modelpool = modelpool = to_modelpool(modelpool)
         config = self.config

-        # load model from path if provided and return directly
-        if config.model_path is not None and os.path.exists(config.model_path):
-            log.info(f"Loading model from {config.model_path}")
-            model = AutoModelForCausalLM.from_pretrained(config.model_path)
-            print_parameters(model)
-            return model
-
         with self.profile("load pretrained model"):
             pretrained_model = modelpool.load_pretrained_model()
         with self.profile("load fine-tuned model"):
@@ -108,7 +102,7 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
             m for m in tqdm(modelpool.models(), total=len(modelpool.model_names))
         ]

-        if
+        if self.device == "cuda" and torch.cuda.is_available():
             pretrained_model = pretrained_model.cuda()
         print("parameter count of pretrained model:")
         print_parameters(pretrained_model)
@@ -122,20 +116,37 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
         print_parameters(model)
         print(model)

-        if
-            model.to(dtype=parse_dtype(
-
-        if
-            if os.path.dirname(
-                os.makedirs(os.path.dirname(
-            log.info(f"Saving model to {
-
-
-
+        if self.model_dtype is not None:
+            model.to(dtype=parse_dtype(self.model_dtype))
+
+        if self.model_save_path is not None:
+            if os.path.dirname(self.model_save_path):
+                os.makedirs(os.path.dirname(self.model_save_path), exist_ok=True)
+            log.info(f"Saving model to {self.model_save_path}")
+            tokenizer = self.modelpool.load_tokenizer()
+            tokenizer.save_pretrained(self.model_save_path)
+            if not self.save_with_remote_code:
+                model.save_pretrained(self.model_save_path)
+            else:
+                save_pretrained_with_remote_code(
+                    model,
+                    auto_map={
+                        "AutoConfig": SmileQwen2Config,
+                        "AutoModel": SmileQwen2Model,
+                        "AutoModelForCausalLM": SmileQwen2ForCausalLM,
+                    },
+                    save_directory=self.model_save_path,
+                )
+
+            # save readme
+            model_card_str = create_default_model_card(
+                models=[modelpool.get_model_path(m) for m in modelpool.all_model_names],
+                description="Merged Qwen model using SMILE Upscaling",
+                algorithm_config=self.config,
+                modelpool_config=modelpool.config,
             )
-
-
-            model.save_pretrained(config.model_path)
+            with open(os.path.join(self.model_save_path, "README.md"), "w") as f:
+                f.write(model_card_str)

         return model

@@ -158,14 +169,17 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):

         with init_empty_weights():
             pretrained_model_config = self.modelpool.get_model_config("_pretrained_")
-
-
-
+            if isinstance(pretrained_model_config, str):
+                pretrained_path = pretrained_model_config
+            else:
+                pretrained_path = pretrained_model_config.get(
+                    "path", pretrained_model_config["pretrained_model_name_or_path"]
+                )
             base_config = AutoConfig.from_pretrained(pretrained_path)
             model_config = SmileQwen2Config(
-                num_experts_per_tok=
-                rank_of_router=
-                rank_of_expert=
+                num_experts_per_tok=self.num_experts_per_tok,
+                rank_of_router=self.rank_of_router,
+                rank_of_expert=self.rank_of_expert,
                 num_local_experts=len(finetuned_models),
                 **base_config.to_dict(),
             )
@@ -175,7 +189,7 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):

         # copy pretrained model weights
         state_dict = model.state_dict()
-        pretrained_state_dict =
+        pretrained_state_dict = pretrained_model.state_dict()
         for key in list(pretrained_state_dict.keys()):
             if key not in state_dict:
                 pretrained_state_dict.pop(key)
@@ -187,6 +201,12 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
             "Upscaling Modules (layer)",
             dynamic_ncols=True,
         ):
+            if RuntimeConstants.debug and layer_idx > 0:
+                log.info(
+                    "Debug mode enabled: processing only the first layer, skipping remaining layers"
+                )
+                break
+
             pretrained_layer: Qwen2DecoderLayer = pretrained_model.model.layers[
                 layer_idx
             ]
@@ -202,7 +222,7 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
                         base=getattr(pretrained_layer.self_attn, n),
                         experts=[getattr(m.self_attn, n) for m in finetuned_layers],
                         target=getattr(target_layer.self_attn, n),
-                        accelerator=
+                        accelerator=self.accelerator,
                     )
                 except ExpertNotTrainedError:
                     setattr(
@@ -217,7 +237,7 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
                         base=getattr(pretrained_layer.mlp, n),
                         experts=[getattr(m.mlp, n) for m in finetuned_layers],
                         target=getattr(target_layer.mlp, n),
-                        accelerator=
+                        accelerator=self.accelerator,
                     )
                 except ExpertNotTrainedError:
                     setattr(
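The Qwen2 upscaling changes above replace the plain `model.save_pretrained` call with `save_pretrained_with_remote_code`, which bundles the custom `SmileQwen2*` modeling code with the checkpoint and records it in an `auto_map`. A minimal sketch of how such a checkpoint would then be reloaded through the standard transformers API (the directory name is hypothetical):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical directory matching `model_save_path` in the method config.
save_dir = "outputs/smile_qwen2_upscaled"

# trust_remote_code=True lets transformers import the modeling files
# referenced by the auto_map entries written at save time.
model = AutoModelForCausalLM.from_pretrained(save_dir, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(save_dir)
```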
fusion_bench/method/smile_upscaling/smile_upscaling.py

@@ -1,7 +1,7 @@
 import logging
 import os
 from copy import deepcopy
-from typing import Dict, List, Tuple  # noqa: F401
+from typing import Any, Dict, List, Tuple  # noqa: F401

 import torch
 import torch.nn.functional as F
@@ -20,6 +20,7 @@ from fusion_bench.models.smile_moe.linear_from_module import (
     SmileMoELinear,
 )
 from fusion_bench.models.utils import get_attr, set_attr
+from fusion_bench.utils.devices import get_device
 from fusion_bench.utils.parameters import print_parameters

 log = logging.getLogger(__name__)
@@ -54,7 +55,7 @@ class SmileUpscalingAlgorithm(
         routing_use_diff: bool = True,
         average_experts: bool = False,
         model_path: str = None,
-        **kwargs,
+        **kwargs: Any,
     ):
         """
         Initialize the SmileUpscalingAlgorithm.
@@ -91,7 +92,7 @@ class SmileUpscalingAlgorithm(
         print(f"=== Config for `{type(self).__name__}` ===")

     @torch.no_grad()
-    def run(self, modelpool: BaseModelPool):
+    def run(self, modelpool: BaseModelPool) -> nn.Module:
         """
         Executes the upscaling process.

@@ -142,7 +143,7 @@ class SmileUpscalingAlgorithm(
         pretrained_model: nn.Module,
         finetuned_models: List[nn.Module],
         in_place: bool = True,
-    ):
+    ) -> nn.Module:
         """
         Merges the pretrained model with the fine-tuned models to create an upscaled model.

@@ -180,7 +181,12 @@ class SmileUpscalingAlgorithm(

         name_list = name.split(".")
         module = get_attr(pretrained_model, name_list)
-
+        original_device = get_device(module)
+        module = module.to(self.device, non_blocking=True)
+        experts = [
+            get_attr(m, name_list).to(self.device, non_blocking=True)
+            for m in finetuned_models
+        ]
         try:
             moe_linear = SmileMoELinear(
                 module,
@@ -192,6 +198,7 @@ class SmileUpscalingAlgorithm(
                 full_matrices=self.full_matrices,
                 upscaling_accelerator=self.upscaling_accelerator,
             )
+            moe_linear = moe_linear.to(original_device, non_blocking=True)
         except ExpertNotTrainedError:
             print(f"skip {name} because the experts are not trained.")
             return
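The new device-handling lines above follow a record/move/compute/restore pattern: remember each linear module's original device, move it and the expert copies to the working device for the SVD-based `SmileMoELinear` construction, then move the fused module back. A generic sketch of the pattern, assuming the module has at least one parameter (`build_fused` is an illustrative placeholder, not fusion_bench API):

```python
import torch
from torch import nn


def build_on_device(module: nn.Module, work_device: str, build_fused) -> nn.Module:
    # Run an expensive construction step on `work_device`, then restore
    # the result to the module's original placement.
    original_device = next(module.parameters()).device
    module = module.to(work_device, non_blocking=True)
    fused = build_fused(module)
    return fused.to(original_device, non_blocking=True)
```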
fusion_bench/method/tall_mask/task_arithmetic.py

@@ -9,7 +9,7 @@ from copy import deepcopy
 import torch

 from fusion_bench import BaseAlgorithm
-from fusion_bench.mixins import SimpleProfilerMixin
+from fusion_bench.mixins import SimpleProfilerMixin, auto_register_config
 from fusion_bench.modelpool import BaseModelPool
 from fusion_bench.utils.state_dict_arithmetic import (
     state_dict_add,
@@ -58,16 +58,11 @@ def generate_task_masks(
     return final_mask


+@auto_register_config
 class TallMaskTaskArithmeticAlgorithm(
-    BaseAlgorithm,
     SimpleProfilerMixin,
+    BaseAlgorithm,
 ):
-    _config_mapping = BaseAlgorithm._config_mapping | {
-        "tall_mask_lambda": "tall_mask_lambda",
-        "debug": "debug",
-        "verbose": "verbose",
-    }
-
     def __init__(
         self,
         tall_mask_lambda: float,
@@ -76,9 +71,6 @@ class TallMaskTaskArithmeticAlgorithm(
         **kwargs,
     ):
         super().__init__(**kwargs)
-        self.tall_mask_lambda = tall_mask_lambda
-        self.debug = debug
-        self.verbose = verbose

     @torch.no_grad()
     def run(self, modelpool: BaseModelPool):
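This file shows the refactor pattern repeated across these method files: the hand-written `_config_mapping` dict and the `self.x = x` assignments are dropped in favor of the `@auto_register_config` decorator. Judging from the removed code, the decorator binds named `__init__` arguments to instance attributes (and the serialized config) automatically. A rough stand-in illustrating the idea, not the actual fusion_bench implementation:

```python
import functools
import inspect


def auto_register_config(cls):
    # Illustrative stand-in: copy every named __init__ argument onto the
    # instance before the original __init__ body runs.
    original_init = cls.__init__
    sig = inspect.signature(original_init)

    @functools.wraps(original_init)
    def wrapped_init(self, *args, **kwargs):
        bound = sig.bind(self, *args, **kwargs)
        bound.apply_defaults()
        skip = (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD)
        for name, value in bound.arguments.items():
            if name != "self" and sig.parameters[name].kind not in skip:
                setattr(self, name, value)
        original_init(self, *args, **kwargs)

    cls.__init__ = wrapped_init
    return cls
```

With something like this in place, `TallMaskTaskArithmeticAlgorithm(tall_mask_lambda=0.4, debug=False, verbose=True)` would expose `self.tall_mask_lambda` and friends without any boilerplate in `__init__`.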
fusion_bench/method/task_arithmetic/task_arithmetic.py

@@ -12,7 +12,7 @@ import torch
 from torch import nn

 from fusion_bench.method.base_algorithm import BaseAlgorithm
-from fusion_bench.mixins
+from fusion_bench.mixins import SimpleProfilerMixin, auto_register_config
 from fusion_bench.modelpool import BaseModelPool
 from fusion_bench.utils.state_dict_arithmetic import (
     state_dict_add,
@@ -74,9 +74,10 @@ def task_arithmetic_merge(
     return pretrained_model


+@auto_register_config
 class TaskArithmeticAlgorithm(
-    BaseAlgorithm,
     SimpleProfilerMixin,
+    BaseAlgorithm,
 ):
     """
     Task Arithmetic Algorithm for model fusion.
@@ -89,22 +90,17 @@ class TaskArithmeticAlgorithm(
         scaling_factor (int): The factor by which the task vectors will be scaled before merging.
     """

-    _config_mapping = BaseAlgorithm._config_mapping | {
-        "scaling_factor": "scaling_factor"
-    }
-
-    def __init__(self, scaling_factor: int):
+    def __init__(self, scaling_factor: int, **kwargs):
         """
         Initializes the TaskArithmeticAlgorithm with the given scaling factor.

         Args:
             scaling_factor (int): The factor by which the task vectors will be scaled before merging.
         """
-
-        super().__init__()
+        super().__init__(**kwargs)

     @torch.no_grad()
-    def run(self, modelpool: Union[BaseModelPool, Dict[str, nn.Module]]):
+    def run(self, modelpool: Union[BaseModelPool, Dict[str, nn.Module]]) -> nn.Module:
         """
         Runs the Task Arithmetic Algorithm to fuse models in the given model pool.

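For context, the merge rule implemented by `TaskArithmeticAlgorithm` is untouched by this refactor: each fine-tuned model contributes a task vector (its parameter delta from the pretrained checkpoint), and the merged weights are the pretrained weights plus the scaled sum of those deltas. Roughly, in state-dict form:

```python
from typing import Dict, List

from torch import Tensor


def task_arithmetic_sketch(
    pretrained: Dict[str, Tensor],
    finetuned: List[Dict[str, Tensor]],
    scaling_factor: float,
) -> Dict[str, Tensor]:
    # merged = pretrained + scaling_factor * sum_i (finetuned_i - pretrained)
    merged = {}
    for key, base in pretrained.items():
        delta_sum = sum(sd[key] - base for sd in finetuned)
        merged[key] = base + scaling_factor * delta_sum
    return merged
```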
fusion_bench/method/ties_merging/ties_merging.py

@@ -9,14 +9,14 @@ Overview of Ties-Merging:
 """

 import logging
-from typing import Dict, List, Literal, Mapping, Union  # noqa: F401
+from typing import Any, Dict, List, Literal, Mapping, Union  # noqa: F401

 import torch
 from torch import Tensor, nn

 from fusion_bench.compat.modelpool import to_modelpool
 from fusion_bench.method import BaseAlgorithm
-from fusion_bench.mixins import SimpleProfilerMixin
+from fusion_bench.mixins import SimpleProfilerMixin, auto_register_config
 from fusion_bench.modelpool import BaseModelPool
 from fusion_bench.utils.type import StateDictType

@@ -25,33 +25,22 @@ from .ties_merging_utils import state_dict_to_vector, ties_merging, vector_to_state_dict
 log = logging.getLogger(__name__)


-class TiesMergingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
-
-
-
-
-        scaling_factor (float): The scaling factor to apply to the merged task vector.
-        threshold (float): The threshold for resetting values in the task vector.
-        remove_keys (List[str]): List of keys to remove from the state dictionary.
-        merge_func (Literal["sum", "mean", "max"]): The merge function to use for disjoint merging.
-    """
-
-    _config_mapping = BaseAlgorithm._config_mapping | {
-        "scaling_factor": "scaling_factor",
-        "threshold": "threshold",
-        "remove_keys": "remove_keys",
-        "merge_func": "merge_func",
-    }
-
+@auto_register_config
+class TiesMergingAlgorithm(
+    SimpleProfilerMixin,
+    BaseAlgorithm,
+):
     def __init__(
         self,
         scaling_factor: float,
         threshold: float,
         remove_keys: List[str],
         merge_func: Literal["sum", "mean", "max"],
-        **kwargs,
+        **kwargs: Any,
     ):
         """
+        TiesMergingAlgorithm is a class for fusing multiple models using the TIES merging technique.
+
         Initialize the TiesMergingAlgorithm with the given parameters.

         Args:
@@ -61,14 +50,12 @@ class TiesMergingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
             merge_func (Literal["sum", "mean", "max"]): The merge function to use for disjoint merging.
             **kwargs: Additional keyword arguments for the base class.
         """
-        self.scaling_factor = scaling_factor
-        self.threshold = threshold
-        self.remove_keys = remove_keys
-        self.merge_func = merge_func
         super().__init__(**kwargs)

     @torch.no_grad()
-    def run(
+    def run(
+        self, modelpool: BaseModelPool | Dict[str, nn.Module], **kwargs: Any
+    ) -> nn.Module:
         """
         Run the TIES merging algorithm to fuse models in the model pool.

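TIES merging, by contrast, filters the task vectors before combining them: trim low-magnitude entries (controlled by `threshold`), elect a per-parameter sign, and disjoint-merge only the values that agree with the elected sign (the real logic lives in `ties_merging_utils`, imported above). A condensed sketch of those three steps over stacked, flattened task vectors:

```python
import torch


def ties_sketch(task_vectors: torch.Tensor, keep_fraction: float) -> torch.Tensor:
    # task_vectors: (num_models, num_params), one flattened delta per model.
    # 1) Trim: zero out all but the top `keep_fraction` of entries by magnitude.
    magnitudes = task_vectors.abs()
    cutoff = magnitudes.quantile(1.0 - keep_fraction, dim=1, keepdim=True)
    trimmed = torch.where(magnitudes >= cutoff, task_vectors, torch.zeros_like(task_vectors))
    # 2) Elect sign: per parameter, the sign with the larger total magnitude wins.
    elected = torch.sign(trimmed.sum(dim=0))
    # 3) Disjoint merge: average only the entries that agree with the elected sign.
    agrees = torch.sign(trimmed) == elected
    counts = agrees.sum(dim=0).clamp(min=1)
    return (trimmed * agrees).sum(dim=0) / counts
```

The merged vector is then scaled by `scaling_factor` and added back to the pretrained weights, as in task arithmetic.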
fusion_bench/method/we_moe/clip_we_moe.py

@@ -2,6 +2,7 @@ import functools
 import logging
 import os
 from copy import deepcopy
+from typing import Any, Iterator

 import torch
 from torch import Tensor
@@ -38,7 +39,7 @@ class CLIPWeightEnsemblingMoEAlgorithm(

     modelpool: CLIPVisionModelPool = None

-    def load_checkpoint(self, model, checkpoint):
+    def load_checkpoint(self, model: Any, checkpoint: Any):
         """
         Load the checkpoint file.

@@ -49,7 +50,7 @@ class CLIPWeightEnsemblingMoEAlgorithm(
         state = {"model": model}
         self._fabric.load(checkpoint, state)

-    def save_checkpoint(self, model, checkpoint):
+    def save_checkpoint(self, model: Any, checkpoint: Any):
         """
         Save the checkpoint file.

@@ -102,7 +103,7 @@ class CLIPWeightEnsemblingMoEAlgorithm(
         return moe_model

     @functools.cache
-    def get_shuffled_test_loader_iter(self, tta_dataset: str):
+    def get_shuffled_test_loader_iter(self, tta_dataset: str) -> Iterator:
         """
         Get an iterator for the shuffled test data loader.

@@ -131,7 +132,7 @@ class CLIPWeightEnsemblingMoEAlgorithm(
         """
         self.setup_zero_shot_classification_head()

-    def compute_logits(self, module, batch, task) -> Tensor:
+    def compute_logits(self, module: Any, batch: Any, task: Any) -> Tensor:
         """
         Compute the logits for the given batch and task.

fusion_bench/method/we_moe/entropy_loss.py (new file)

@@ -0,0 +1,25 @@
+import torch
+from torch import Tensor
+
+
+def entropy_loss(logits: Tensor, eps: float = 1e-8) -> Tensor:
+    """
+    Compute the entropy loss of a set of logits.
+
+    Args:
+        logits (Tensor): The logits to compute the entropy loss of.
+        eps (float): A small value to avoid log(0). Default is 1e-8.
+
+    Returns:
+        Tensor: The entropy loss of the logits.
+    """
+    # Ensure the logits tensor has 2 dimensions
+    assert (
+        logits.dim() == 2
+    ), f"Expected logits to have 2 dimensions, found {logits.dim()}, {logits.size()=}"
+
+    # Compute the softmax probabilities
+    probs = torch.softmax(logits, dim=-1)
+
+    # Compute the entropy loss
+    return -torch.sum(probs * torch.log(probs + eps), dim=-1).mean()
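This `entropy_loss` appears to serve as the test-time adaptation objective for the WeMoE methods: minimizing prediction entropy on unlabeled test batches pushes the merged model toward confident outputs. A quick sanity check of its behavior, assuming the function is imported from the new module (`from fusion_bench.method.we_moe.entropy_loss import entropy_loss`):

```python
import torch

# Near-one-hot logits give low entropy; uniform logits give the maximum,
# log(num_classes) = log(3) ≈ 1.0986 for three classes.
confident = torch.tensor([[10.0, 0.0, 0.0]])
uniform = torch.tensor([[1.0, 1.0, 1.0]])
print(entropy_loss(confident))  # tensor close to 0
print(entropy_loss(uniform))    # tensor close to 1.0986
```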