PyPI - fusion-bench - Versions diffs - 0.2.13__py3-none-any.whl → 0.2.14__py3-none-any.whl - Mend

fusion-bench 0.2.13__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

fusion_bench/method/ada_svd/clip_vision.py CHANGED Viewed

@@ -31,7 +31,10 @@ from fusion_bench.method import WeightedAverageAlgorithm
 from fusion_bench.method.simple_average import simple_average
 from fusion_bench.mixins import SimpleProfilerMixin
 from fusion_bench.modelpool import CLIPVisionModelPool
-from fusion_bench.models.smile_moe.linear import ExpertNotTrainedError, SmileMoELinear
+from fusion_bench.models.smile_moe.linear_from_module import (
+    ExpertNotTrainedError,
+    SmileMoELinear,
+)
 from fusion_bench.models.utils import find_layers_with_type, get_attr, set_attr
 from fusion_bench.utils.devices import get_device

fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py CHANGED Viewed

@@ -9,11 +9,16 @@ import torch.nn.functional as F
 from accelerate import init_empty_weights
 from torch import Tensor, nn
 from tqdm.auto import tqdm
-from transformers import AutoConfig, AutoTokenizer, MistralForCausalLM
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    MistralForCausalLM,
+)
 from transformers.models.mistral.modeling_mistral import MistralDecoderLayer
-from fusion_bench.compat.method import ModelFusionAlgorithm
 from fusion_bench.compat.modelpool import to_modelpool
+from fusion_bench.method import BaseAlgorithm
 from fusion_bench.method.simple_average import simple_average
 from fusion_bench.mixins.simple_profiler import SimpleProfilerMixin
 from fusion_bench.modelpool import BaseModelPool
@@ -25,158 +30,23 @@ from fusion_bench.models.modeling_smile_mistral.modeling_smile_mistral import (
     SmileLinear,
     SmileMistralDecoderLayer,
 )
-from fusion_bench.models.utils import get_attr, set_attr
+from fusion_bench.models.smile_moe.linear_from_hf_config import (
+    ExpertNotTrainedError,
+    upscale_to_smile_linear,
+)
 from fusion_bench.utils.dtype import parse_dtype
 from fusion_bench.utils.parameters import print_parameters
-from fusion_bench.utils.state_dict_arithmetic import state_dict_sub
 log = logging.getLogger(__name__)
-class ExpertNotTrainedError(Exception):
-    pass
-def _is_all_zeros(tensor: Tensor | List[Tensor]) -> bool:
-    """
-    Check if a tensor or a list of tensors are all zeros.
-    Args:
-        tensor (Tensor | List[Tensor]): The tensor or list of tensors to check.
-    Returns:
-        bool: True if all elements are zeros, False otherwise.
-    """
-    if isinstance(tensor, Tensor):
-        return torch.allclose(tensor, torch.zeros_like(tensor))
-    else:
-        return all(_is_all_zeros(t) for t in tensor)
-def _svd(w: Tensor, full_matrices=False) -> Tuple[Tensor, Tensor, Tensor]:
-    """
-    Perform Singular Value Decomposition (SVD) on a tensor.
-    Args:
-        w (Tensor): The input tensor.
-        full_matrices (bool, optional): Whether to compute the full-sized U and V matrices. Defaults to False.
-    Returns:
-        Tuple[Tensor, Tensor, Tensor]: The U, S, and V matrices from SVD.
-    """
-    device = w.device
-    if w.device != torch.float32 or w.device != torch.float64:
-        w = w.float()
-    u, s, vh = torch.linalg.svd(
-        w,
-        full_matrices=full_matrices,
-        # driver="gesvd" if w.is_cuda else None
-    )
-    v = vh.T
-    u = u.to(device)
-    s = s.to(device)
-    v = v.to(device)
-    return u, s, v
-def svd(
-    w: Tensor, full_matrices=True, accelerator=None
-) -> Tuple[Tensor, Tensor, Tensor]:
-    """
-    Perform SVD on a tensor with optional acceleration.
-    Args:
-        w (Tensor): The input tensor.
-        full_matrices (bool, optional): Whether to compute the full-sized U and V matrices. Defaults to True.
-        accelerator (optional): The device to perform the computation on. Defaults to None.
-    Returns:
-        Tuple[Tensor, Tensor, Tensor]: The U, S, and V matrices from SVD.
-    """
-    if accelerator is None:
-        return _svd(w, full_matrices=full_matrices)
-    original_device = w.device
-    w = w.to(accelerator)
-    u, s, v = _svd(w)
-    return u, s, v
-@torch.no_grad()
-def upscale_to_smile_linear(
-    base: nn.Linear, experts: List[nn.Linear], target: SmileLinear, accelerator=None
-):
-    """
-    Upscale a base linear layer to a SmileLinear layer using expert models.
-    Args:
-        base (nn.Linear): The base linear layer.
-        experts (List[nn.Linear]): A list of expert linear layers.
-        target (SmileLinear): The target SmileLinear layer.
-        accelerator (optional): The device to perform the computation on. Defaults to None.
-    Returns:
-        SmileLinear: The upscaled SmileLinear layer.
-    """
-    w = base.weight
-    w_ft_list = [e.weight for e in experts]
-    dw_list = [w_ft - w for w_ft in w_ft_list]
-    if _is_all_zeros(dw_list):
-        raise ExpertNotTrainedError("Expert models are not trained")
-    rank_of_router = target.rank_of_router
-    rank_of_expert = target.rank_of_expert
-    num_local_experts = target.num_local_experts
-    svd_list = [svd(dw, accelerator=accelerator) for dw in dw_list]
-    # gate
-    gate_weight = []
-    for u, s, v in svd_list:
-        gate_weight.append(v[:, :rank_of_router].T)
-    gate_weight = (
-        torch.stack(gate_weight, dim=0)
-        .reshape(num_local_experts * rank_of_router, -1)
-        .contiguous()
-    )
-    target.gate.load_state_dict({"weight": gate_weight})
-    # shared linear
-    target.shared_linear.load_state_dict(base.state_dict())
-    # experts
-    if rank_of_expert > 0:
-        for expert_idx, target_expert in enumerate(target.experts):
-            u, s, v = svd_list[expert_idx]
-            u = u[:, :rank_of_expert]
-            s = s[:rank_of_expert]
-            v = v[:, :rank_of_expert]
-            state_dict = {"u": u, "svh": (s * v).T}
-            if experts[expert_idx].bias is not None:
-                state_dict["bias"] = experts[expert_idx].bias.data
-            target_expert.load_state_dict(state_dict)
-    else:
-        for expert_idx, target_expert in enumerate(target.experts):
-            target_expert.load_state_dict(
-                state_dict_sub(experts[expert_idx].state_dict(), base.state_dict())
-            )
-    return target
-class SmileMistralUpscalingAlgorithm(ModelFusionAlgorithm, SimpleProfilerMixin):
+class SmileMistralUpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
     R"""
     SmileMistralUpscalingAlgorithm is a model fusion algorithm designed to upscale
     a pretrained Mistral model using a set of fine-tuned expert models. The algorithm
     leverages Singular Value Decomposition (SVD) to merge the weights of the pretrained
     model and the expert models into a new upscaled model.
-    Attributes:
-        modelpool (BaseModelPool): The pool of models to be used for upscaling.
-        config (dict): Configuration parameters for the upscaling process.
     Methods:
         run(modelpool: BaseModelPool) -> SmileMistralForCausalLM:
             Executes the upscaling process and returns the upscaled model.
@@ -185,6 +55,37 @@ class SmileMistralUpscalingAlgorithm(ModelFusionAlgorithm, SimpleProfilerMixin):
             Merges the pretrained model with the fine-tuned models to create an upscaled model.
     """
+    _config_mapping = BaseAlgorithm._config_mapping | {
+        "device": "device",
+        "accelerator": "accelerator",
+        "model_path": "model_path",
+        "model_dtype": "model_dtype",
+        "num_experts_per_tok": "num_experts_per_tok",
+        "rank_of_router": "rank_of_router",
+        "rank_of_expert": "rank_of_expert",
+    }
+    def __init__(
+        self,
+        device,
+        accelerator,
+        model_path,
+        model_dtype,
+        num_experts_per_tok,
+        rank_of_router,
+        rank_of_expert,
+        **kwargs,
+    ):
+        self.device = device
+        self.accelerator = accelerator
+        self.model_path = model_path
+        self.model_dtype = model_dtype
+        # SmileMoE parameters, except `num_local_experts` which is set later according to the number of finetuned models
+        self.num_experts_per_tok = num_experts_per_tok
+        self.rank_of_router = rank_of_router
+        self.rank_of_expert = rank_of_expert
+        super().__init__(**kwargs)
     @torch.no_grad()
     def run(self, modelpool: BaseModelPool) -> SmileMistralForCausalLM:
         """
@@ -199,15 +100,15 @@ class SmileMistralUpscalingAlgorithm(ModelFusionAlgorithm, SimpleProfilerMixin):
         self.modelpool = modelpool = to_modelpool(modelpool)
         config = self.config
-        print(config)
+        # load model from path if provided and return directly
         if config.model_path is not None and os.path.exists(config.model_path):
             log.info(f"Loading model from {config.model_path}")
-            model = torch.load(config.model_path)
+            model = AutoModelForCausalLM.from_pretrained(config.model_path)
             print_parameters(model)
             return model
         with self.profile("load pretrained model"):
-            pretrained_model = modelpool.load_model("_pretrained_")
+            pretrained_model = modelpool.load_pretrained_model()
         with self.profile("load fine-tuned model"):
             finetuned_models = [
                 m for m in tqdm(modelpool.models(), total=len(modelpool.model_names))

fusion_bench/method/smile_upscaling/smile_qwen2_upscaling.py ADDED Viewed

@@ -0,0 +1,229 @@
+import logging
+import os
+from copy import deepcopy
+from typing import TYPE_CHECKING, Dict, List, Tuple
+import torch
+from accelerate import init_empty_weights
+from tqdm.auto import tqdm
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    Qwen2ForCausalLM,
+)
+from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer
+from fusion_bench import BaseAlgorithm, BaseModelPool
+from fusion_bench.compat.modelpool import to_modelpool
+from fusion_bench.mixins import SimpleProfilerMixin
+from fusion_bench.models.modeling_smile_qwen2 import (
+    SmileQwen2Config,
+    SmileQwen2ForCausalLM,
+)
+from fusion_bench.models.modeling_smile_qwen2.modeling_smile_qwen2 import (
+    SmileQwen2DecoderLayer,
+)
+from fusion_bench.models.smile_moe.linear_from_hf_config import (
+    ExpertNotTrainedError,
+    upscale_to_smile_linear,
+)
+from fusion_bench.utils.dtype import parse_dtype
+from fusion_bench.utils.parameters import print_parameters
+log = logging.getLogger(__name__)
+class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
+    R"""
+    SmileQwen2UpscalingAlgorithm is a model fusion algorithm designed to upscale
+    a pretrained Qwen2 model using a set of fine-tuned expert models. The algorithm
+    leverages Singular Value Decomposition (SVD) to merge the weights of the pretrained
+    model and the expert models into a new upscaled model.
+    Methods:
+        run(modelpool: BaseModelPool) -> SmileQwen2ForCausalLM:
+            Executes the upscaling process and returns the upscaled model.
+        merge(pretrained_model: Qwen2ForCausalLM, finetuned_models: List[Qwen2ForCausalLM]) -> SmileQwen2ForCausalLM:
+            Merges the pretrained model with the fine-tuned models to create an upscaled model.
+    """
+    _config_mapping = BaseAlgorithm._config_mapping | {
+        "device": "device",
+        "accelerator": "accelerator",
+        "model_path": "model_path",
+        "model_dtype": "model_dtype",
+        "num_experts_per_tok": "num_experts_per_tok",
+        "rank_of_router": "rank_of_router",
+        "rank_of_expert": "rank_of_expert",
+    }
+    def __init__(
+        self,
+        device,
+        accelerator,
+        model_path,
+        model_dtype,
+        num_experts_per_tok,
+        rank_of_router,
+        rank_of_expert,
+        **kwargs,
+    ):
+        self.device = device
+        self.accelerator = accelerator
+        self.model_path = model_path
+        self.model_dtype = model_dtype
+        # SmileMoE parameters, except `num_local_experts` which is set later according to the number of finetuned models
+        self.num_experts_per_tok = num_experts_per_tok
+        self.rank_of_router = rank_of_router
+        self.rank_of_expert = rank_of_expert
+        super().__init__(**kwargs)
+    @torch.no_grad()
+    def run(self, modelpool: BaseModelPool) -> SmileQwen2ForCausalLM:
+        """
+        Executes the upscaling process.
+        Args:
+            modelpool (ModelPool): The pool of models to be used for upscaling.
+        Returns:
+            SmileQwen2ForCausalLM: The upscaled model.
+        """
+        self.modelpool = modelpool = to_modelpool(modelpool)
+        config = self.config
+        # load model from path if provided and return directly
+        if config.model_path is not None and os.path.exists(config.model_path):
+            log.info(f"Loading model from {config.model_path}")
+            model = AutoModelForCausalLM.from_pretrained(config.model_path)
+            print_parameters(model)
+            return model
+        with self.profile("load pretrained model"):
+            pretrained_model = modelpool.load_pretrained_model()
+        with self.profile("load fine-tuned model"):
+            finetuned_models = [
+                m for m in tqdm(modelpool.models(), total=len(modelpool.model_names))
+            ]
+        if config.device == "cuda" and torch.cuda.is_available():
+            pretrained_model = pretrained_model.cuda()
+            print("parameter count of pretrained model:")
+            print_parameters(pretrained_model)
+            finetuned_models = [m.cuda() for m in finetuned_models]
+        with self.profile("merge model"):
+            model = self.merge(pretrained_model, finetuned_models)
+        self.print_profile_summary()
+        print("parameter count of upscaled MoE model:")
+        print_parameters(model)
+        print(model)
+        if config.model_dtype is not None:
+            model.to(dtype=parse_dtype(config.model_dtype))
+        if config.model_path is not None:
+            if os.path.dirname(config.model_path):
+                os.makedirs(os.path.dirname(config.model_path), exist_ok=True)
+            log.info(f"Saving model to {config.model_path}")
+            pretrained_model_config = self.modelpool.get_model_config("_pretrained_")
+            pretrained_path = pretrained_model_config.get(
+                "path", pretrained_model_config["pretrained_model_name_or_path"]
+            )
+            tokenizer = AutoTokenizer.from_pretrained(pretrained_path)
+            tokenizer.save_pretrained(config.model_path)
+            model.save_pretrained(config.model_path)
+        return model
+    def merge(
+        self,
+        pretrained_model: Qwen2ForCausalLM,
+        finetuned_models: List[Qwen2ForCausalLM],
+    ):
+        """
+        Merges the pretrained model with the fine-tuned models to create an upscaled model.
+        Args:
+            pretrained_model (Qwen2ForCausalLM): The pretrained model.
+            finetuned_models (List[Qwen2ForCausalLM]): A list of fine-tuned models.
+        Returns:
+            SmileQwen2ForCausalLM: The upscaled model.
+        """
+        config = self.config
+        with init_empty_weights():
+            pretrained_model_config = self.modelpool.get_model_config("_pretrained_")
+            pretrained_path = pretrained_model_config.get(
+                "path", pretrained_model_config["pretrained_model_name_or_path"]
+            )
+            base_config = AutoConfig.from_pretrained(pretrained_path)
+            model_config = SmileQwen2Config(
+                num_experts_per_tok=config.num_experts_per_tok,
+                rank_of_router=config.rank_of_router,
+                rank_of_expert=config.rank_of_expert,
+                num_local_experts=len(finetuned_models),
+                **base_config.to_dict(),
+            )
+            model = SmileQwen2ForCausalLM(model_config)
+        model.to(dtype=pretrained_model.dtype).to_empty(device="cpu")
+        # copy pretrained model weights
+        state_dict = model.state_dict()
+        pretrained_state_dict = dict(pretrained_model.state_dict())
+        for key in list(pretrained_state_dict.keys()):
+            if key not in state_dict:
+                pretrained_state_dict.pop(key)
+        model.load_state_dict(pretrained_state_dict, strict=False)
+        # upscale model
+        for layer_idx in tqdm(
+            range(len(pretrained_model.model.layers)),
+            "Upscaling Modules (layer)",
+            dynamic_ncols=True,
+        ):
+            pretrained_layer: Qwen2DecoderLayer = pretrained_model.model.layers[
+                layer_idx
+            ]
+            finetuned_layers: List[Qwen2DecoderLayer] = [
+                m.model.layers[layer_idx] for m in finetuned_models
+            ]
+            target_layer: SmileQwen2DecoderLayer = model.model.layers[layer_idx]
+            for n in ["q_proj", "k_proj", "v_proj", "o_proj"]:
+                try:
+                    upscale_to_smile_linear(
+                        base=getattr(pretrained_layer.self_attn, n),
+                        experts=[getattr(m.self_attn, n) for m in finetuned_layers],
+                        target=getattr(target_layer.self_attn, n),
+                        accelerator=config.accelerator,
+                    )
+                except ExpertNotTrainedError:
+                    setattr(
+                        target_layer.self_attn,
+                        n,
+                        getattr(pretrained_layer.self_attn, n),
+                    )
+            for n in ["gate_proj", "up_proj", "down_proj"]:
+                try:
+                    upscale_to_smile_linear(
+                        base=getattr(pretrained_layer.mlp, n),
+                        experts=[getattr(m.mlp, n) for m in finetuned_layers],
+                        target=getattr(target_layer.mlp, n),
+                        accelerator=config.accelerator,
+                    )
+                except ExpertNotTrainedError:
+                    setattr(
+                        target_layer.mlp,
+                        n,
+                        getattr(pretrained_layer.mlp, n),
+                    )
+        return model

fusion-bench 0.2.13__py3-none-any.whl → 0.2.14__py3-none-any.whl

fusion-bench 0.2.13__py3-none-any.whl → 0.2.14__py3-none-any.whl