fusion-bench 0.2.22__py3-none-any.whl → 0.2.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/__init__.py +4 -0
- fusion_bench/compat/method/__init__.py +5 -2
- fusion_bench/compat/method/base_algorithm.py +3 -2
- fusion_bench/compat/modelpool/base_pool.py +3 -3
- fusion_bench/compat/taskpool/clip_image_classification.py +1 -1
- fusion_bench/dataset/gpt2_glue.py +1 -1
- fusion_bench/method/__init__.py +12 -2
- fusion_bench/method/analysis/task_vector_cos_similarity.py +95 -12
- fusion_bench/method/analysis/task_vector_violin_plot.py +160 -52
- fusion_bench/method/bitdelta/bitdelta.py +7 -23
- fusion_bench/method/ensemble.py +17 -2
- fusion_bench/method/expert_sparsity/mixtral/dynamic_skipping.py +2 -0
- fusion_bench/method/expert_sparsity/mixtral/layer_wise_pruning.py +2 -0
- fusion_bench/method/expert_sparsity/mixtral/progressive_pruning.py +2 -0
- fusion_bench/method/linear/__init__.py +6 -2
- fusion_bench/method/linear/{simple_average_for_llama.py → simple_average_for_causallm.py} +8 -4
- fusion_bench/method/linear/{task_arithmetic_for_llama.py → task_arithmetic_for_causallm.py} +22 -12
- fusion_bench/method/linear/ties_merging_for_causallm.py +70 -0
- fusion_bench/method/model_stock/__init__.py +1 -0
- fusion_bench/method/model_stock/model_stock.py +309 -0
- fusion_bench/method/regmean/clip_regmean.py +3 -6
- fusion_bench/method/regmean/regmean.py +27 -56
- fusion_bench/method/regmean/utils.py +56 -0
- fusion_bench/method/regmean_plusplus/regmean_plusplus.py +21 -60
- fusion_bench/method/simple_average.py +2 -2
- fusion_bench/method/slerp/__init__.py +1 -1
- fusion_bench/method/slerp/slerp.py +110 -14
- fusion_bench/method/task_arithmetic/task_arithmetic.py +35 -10
- fusion_bench/method/ties_merging/ties_merging.py +22 -6
- fusion_bench/method/we_moe/flan_t5_we_moe.py +9 -20
- fusion_bench/method/wudi/__init__.py +1 -0
- fusion_bench/method/wudi/wudi.py +105 -0
- fusion_bench/mixins/clip_classification.py +26 -6
- fusion_bench/mixins/lightning_fabric.py +4 -0
- fusion_bench/mixins/serialization.py +40 -83
- fusion_bench/modelpool/base_pool.py +1 -1
- fusion_bench/modelpool/causal_lm/causal_lm.py +285 -44
- fusion_bench/modelpool/seq2seq_lm/modelpool.py +146 -0
- fusion_bench/models/hf_clip.py +4 -0
- fusion_bench/models/hf_utils.py +10 -4
- fusion_bench/models/linearized/vision_model.py +6 -6
- fusion_bench/models/model_card_templates/default.md +8 -1
- fusion_bench/models/modeling_smile_mistral/__init__.py +1 -0
- fusion_bench/models/we_moe.py +8 -8
- fusion_bench/models/wrappers/ensemble.py +136 -7
- fusion_bench/scripts/cli.py +2 -2
- fusion_bench/taskpool/base_pool.py +99 -17
- fusion_bench/taskpool/clip_vision/taskpool.py +12 -5
- fusion_bench/taskpool/dummy.py +101 -13
- fusion_bench/taskpool/lm_eval_harness/taskpool.py +80 -0
- fusion_bench/taskpool/nyuv2_taskpool.py +28 -0
- fusion_bench/utils/__init__.py +1 -0
- fusion_bench/utils/data.py +6 -4
- fusion_bench/utils/devices.py +36 -11
- fusion_bench/utils/dtype.py +3 -2
- fusion_bench/utils/lazy_state_dict.py +85 -19
- fusion_bench/utils/packages.py +3 -3
- fusion_bench/utils/parameters.py +0 -2
- fusion_bench/utils/rich_utils.py +7 -3
- fusion_bench/utils/timer.py +92 -10
- {fusion_bench-0.2.22.dist-info → fusion_bench-0.2.24.dist-info}/METADATA +10 -3
- {fusion_bench-0.2.22.dist-info → fusion_bench-0.2.24.dist-info}/RECORD +77 -64
- fusion_bench_config/_get_started/llm_slerp.yaml +12 -0
- fusion_bench_config/method/ensemble/simple_ensemble.yaml +1 -0
- fusion_bench_config/method/linear/{simple_average_for_llama.yaml → simple_average_for_causallm.yaml} +1 -1
- fusion_bench_config/method/linear/task_arithmetic_for_causallm.yaml +4 -0
- fusion_bench_config/method/linear/ties_merging_for_causallm.yaml +13 -0
- fusion_bench_config/method/model_stock/model_stock.yaml +12 -0
- fusion_bench_config/method/slerp/slerp_lm.yaml +4 -0
- fusion_bench_config/method/wudi/wudi.yaml +4 -0
- fusion_bench_config/modelpool/CausalLMPool/{Qwen2.5-1.5B_math_and_coder.yaml → Qwen2.5-1.5B_math_and_code.yaml} +1 -2
- fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-1.5B_three_models.yaml +11 -0
- fusion_bench_config/modelpool/CausalLMPool/llama-7b_3-models_v1.yaml +11 -0
- fusion_bench_config/method/linear/task_arithmetic_for_llama.yaml +0 -4
- {fusion_bench-0.2.22.dist-info → fusion_bench-0.2.24.dist-info}/WHEEL +0 -0
- {fusion_bench-0.2.22.dist-info → fusion_bench-0.2.24.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.22.dist-info → fusion_bench-0.2.24.dist-info}/licenses/LICENSE +0 -0
- {fusion_bench-0.2.22.dist-info → fusion_bench-0.2.24.dist-info}/top_level.txt +0 -0
--- a/fusion_bench/method/regmean/regmean.py
+++ b/fusion_bench/method/regmean/regmean.py
@@ -16,49 +16,9 @@ from fusion_bench.method import BaseAlgorithm
 from fusion_bench.mixins import SimpleProfilerMixin, auto_register_config
 from fusion_bench.modelpool import BaseModelPool
 
-
-
-
-def get_param_names_to_merge(
-    input_param_names: List[str], exclude_param_names_regex: list
-):
-    """
-    get the names of parameters that need to be merged
-    :param input_param_names: list, names of input parameters
-    :param exclude_param_names_regex: list, regular expression of names of parameters that need to be excluded
-    :return:
-    """
-    param_names_to_merge = []
-    for param_name in input_param_names:
-        exclude = any(
-            [
-                re.match(exclude_pattern, param_name)
-                for exclude_pattern in exclude_param_names_regex
-            ]
-        )
-        if not exclude:
-            param_names_to_merge.append(param_name)
-    return param_names_to_merge
-
+from .utils import get_modules_to_merge, get_param_names_to_merge
 
-def get_modules_to_merge(model: nn.Module, include_module_types: list):
-    """
-    get the model modules that need to be merged, whose type is in include_module_types
-    :param model: nn.Module, input model
-    :param include_module_types: list, module types that want to include
-    :return:
-    """
-    modules_to_merge: Dict[str, nn.Module] = {}
-    for module_name, module in model.named_modules():
-        is_valid_type = not include_module_types or any(
-            [
-                isinstance(module, include_module_type)
-                for include_module_type in include_module_types
-            ]
-        )
-        if is_valid_type:
-            modules_to_merge[module_name] = module
-    return modules_to_merge
+log = logging.getLogger(__name__)
 
 
 def reduce_non_diagonal_elements(
@@ -88,12 +48,16 @@ def merging_with_regmean_weights(
 ):
     """
     merge parameters of different models with computed regmean weights
-    :param models_to_merge_param_dict: dict, dictionary of list, where key is the parameter name,
-    value is a list of the corresponding parameters of all the models that need to be merged
-    :param models_to_merge_regmean_weights_list: list, list of dictionaries with length len(models_to_merge),
-    each dictionary records the regmean weights (matrix) of parameters for each model that needs to be merged, key is module name
-    :param reduce_non_diagonal_ratio: float, reduce non-diagonal elements in regmean weights by multiplying this scalar
-    :return:
+
+    Args:
+        models_to_merge_param_dict: dict, dictionary of list, where key is the parameter name,
+            value is a list of the corresponding parameters of all the models that need to be merged
+        models_to_merge_regmean_weights_list: list, list of dictionaries with length len(models_to_merge),
+            each dictionary records the regmean weights (matrix) of parameters for each model that needs to be merged, key is module name
+        reduce_non_diagonal_ratio: float, reduce non-diagonal elements in regmean weights by multiplying this scalar
+
+    Returns:
+        dict: merged model parameters
     """
     # dict, dictionary of model parameters
     merged_params = {}
@@ -164,13 +128,17 @@ def regmean_merging(
     reduce_non_diagonal_ratio: float = 1.0,
 ):
     """
-    regmean merging method
-    :param models_to_merge: list, individual models that need to be merged
-    :param trainers: list, trainers of individual models
-    :param exclude_param_names_regex: list, regular expression of names of parameters that need to be excluded
-    :param nums_regmean_examples: list, numbers of examples to compute regmean weights
-    :param reduce_non_diagonal_ratio: float, reduce non-diagonal elements in regmean weights by multiplying this scalar
-    :return:
+    regmean merging method.
+
+    Args:
+        models_to_merge: list, individual models that need to be merged
+        trainers: list, trainers of individual models
+        exclude_param_names_regex: list, regular expression of names of parameters that need to be excluded
+        nums_regmean_examples: list, numbers of examples to compute regmean weights
+        reduce_non_diagonal_ratio: float, reduce non-diagonal elements in regmean weights by multiplying this scalar
+
+    Returns:
+        dict: merged model parameters
     """
 
     def compute_regmean_weights(module_name: str):
@@ -281,7 +249,10 @@ def regmean_merging(
 
 
 @auto_register_config
-class RegMeanAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
+class RegMeanAlgorithm(
+    SimpleProfilerMixin,
+    BaseAlgorithm,
+):
     _include_module_type = [nn.Linear]
 
     def __init__(
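Note: for orientation, the update these RegMean functions implement solves a per-layer least-squares problem in closed form. Each model i contributes a Gram matrix G_i = X_i^T X_i computed on its own inputs, and the merged linear weight is W* = (sum_i G_i)^(-1) sum_i G_i W_i, with the off-diagonal entries of each G_i optionally shrunk by reduce_non_diagonal_ratio. A minimal self-contained sketch, not the library code (function names here are illustrative):

    import torch

    def shrink_non_diagonal(gram: torch.Tensor, ratio: float) -> torch.Tensor:
        # keep the diagonal, scale every off-diagonal entry by `ratio`
        diag = torch.diag(torch.diagonal(gram))
        return ratio * (gram - diag) + diag

    def regmean_merge(weights, grams, ratio=1.0):
        # weights: list of (in_features, out_features) matrices W_i
        # grams:   list of (in_features, in_features) Gram matrices G_i
        grams = [shrink_non_diagonal(g, ratio) for g in grams]
        numerator = sum(g @ w for g, w in zip(grams, weights))
        return torch.linalg.solve(sum(grams), numerator)

With ratio below 1 the solution is pulled toward a diagonal (per-feature) regression, and with identical Gram matrices across models it degenerates to simple averaging.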
--- /dev/null
+++ b/fusion_bench/method/regmean/utils.py
@@ -0,0 +1,56 @@
+import re
+from typing import Dict, List
+
+from torch import nn
+
+
+def get_param_names_to_merge(
+    input_param_names: List[str], exclude_param_names_regex: list
+) -> List[str]:
+    """
+    get the names of parameters that need to be merged
+
+    Args:
+        input_param_names: list, names of input parameters
+        exclude_param_names_regex: list, regular expression of names of parameters that need to be excluded
+
+    Returns:
+        list: names of parameters that need to be merged
+    """
+    param_names_to_merge = []
+    for param_name in input_param_names:
+        exclude = any(
+            [
+                re.match(exclude_pattern, param_name)
+                for exclude_pattern in exclude_param_names_regex
+            ]
+        )
+        if not exclude:
+            param_names_to_merge.append(param_name)
+    return param_names_to_merge
+
+
+def get_modules_to_merge(
+    model: nn.Module, include_module_types: list
+) -> Dict[str, nn.Module]:
+    """
+    get the model modules that need to be merged, whose type is in include_module_types
+
+    Args:
+        model: nn.Module, input model
+        include_module_types: list, module types that want to include
+
+    Returns:
+        Dict[str, nn.Module]: a dictionary of modules to merge
+    """
+    modules_to_merge: Dict[str, nn.Module] = {}
+    for module_name, module in model.named_modules():
+        is_valid_type = not include_module_types or any(
+            [
+                isinstance(module, include_module_type)
+                for include_module_type in include_module_types
+            ]
+        )
+        if is_valid_type:
+            modules_to_merge[module_name] = module
+    return modules_to_merge
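Note: both helpers are pure functions over parameter names and module types, so the extracted module is easy to exercise on its own. A quick check (the toy model below is made up for illustration):

    from torch import nn

    from fusion_bench.method.regmean.utils import (
        get_modules_to_merge,
        get_param_names_to_merge,
    )

    model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))
    # only the two Linear submodules qualify: keys '0' and '2'
    print(get_modules_to_merge(model, [nn.Linear]).keys())
    # drop biases via regex, keeping '0.weight' and '2.weight'
    names = [name for name, _ in model.named_parameters()]
    print(get_param_names_to_merge(names, exclude_param_names_regex=[".*bias"]))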
--- a/fusion_bench/method/regmean_plusplus/regmean_plusplus.py
+++ b/fusion_bench/method/regmean_plusplus/regmean_plusplus.py
@@ -7,55 +7,14 @@ import torch
 from torch import Tensor, nn
 from tqdm.autonotebook import tqdm
 
-
+import fusion_bench.method.regmean.utils as regmean_utils
+from fusion_bench import BaseAlgorithm, auto_register_config
 from fusion_bench.mixins import SimpleProfilerMixin
 from fusion_bench.modelpool import BaseModelPool
 
 log = logging.getLogger(__name__)
 
 
-def get_param_names_to_merge(
-    input_param_names: List[str], exclude_param_names_regex: list
-):
-    """
-    get the names of parameters that need to be merged
-    :param input_param_names: list, names of input parameters
-    :param exclude_param_names_regex: list, regular expression of names of parameters that need to be excluded
-    :return:
-    """
-    param_names_to_merge = []
-    for param_name in input_param_names:
-        exclude = any(
-            [
-                re.match(exclude_pattern, param_name)
-                for exclude_pattern in exclude_param_names_regex
-            ]
-        )
-        if not exclude:
-            param_names_to_merge.append(param_name)
-    return param_names_to_merge
-
-
-def get_modules_to_merge(model: nn.Module, include_module_types: list):
-    """
-    get the model modules that need to be merged, whose type is in include_module_types
-    :param model: nn.Module, input model
-    :param include_module_types: list, module types that want to include
-    :return:
-    """
-    modules_to_merge: Dict[str, nn.Module] = {}
-    for module_name, module in model.named_modules():
-        is_valid_type = not include_module_types or any(
-            [
-                isinstance(module, include_module_type)
-                for include_module_type in include_module_types
-            ]
-        )
-        if is_valid_type:
-            modules_to_merge[module_name] = module
-    return modules_to_merge
-
-
 def reduce_non_diagonal_elements(
     regmean_weights: torch.Tensor, reduce_non_diagonal_ratio: float
 ):
@@ -130,12 +89,16 @@ def merging_with_regmean_weights(
 ):
     """
     merge parameters of different models with computed regmean weights
-    :param models_to_merge_param_dict: dict, dictionary of list, where key is the parameter name,
-    value is a list of the corresponding parameters of all the models that need to be merged
-    :param models_to_merge_regmean_weights_list: list, list of dictionaries with length len(models_to_merge),
-    each dictionary records the regmean weights (matrix) of parameters for each model that needs to be merged, key is module name
-    :param reduce_non_diagonal_ratio: float, reduce non-diagonal elements in regmean weights by multiplying this scalar
-    :return:
+
+    Args:
+        models_to_merge_param_dict: dict, dictionary of list, where key is the parameter name,
+            value is a list of the corresponding parameters of all the models that need to be merged
+        models_to_merge_regmean_weights_list: list, list of dictionaries with length len(models_to_merge),
+            each dictionary records the regmean weights (matrix) of parameters for each model that needs to be merged, key is module name
+        reduce_non_diagonal_ratio: float, reduce non-diagonal elements in regmean weights by multiplying this scalar
+
+    Returns:
+        dict: merged model parameters
     """
     # dict, dictionary of model parameters
     merged_params = {}
@@ -176,14 +139,12 @@ def merging_with_regmean_weights(
     return merged_params
 
 
-class RegMeanAlgorithmPlusPlus(BaseAlgorithm, SimpleProfilerMixin):
+@auto_register_config
+class RegMeanAlgorithmPlusPlus(
+    SimpleProfilerMixin,
+    BaseAlgorithm,
+):
     _include_module_type = [nn.Linear]
-    _config_mapping = {
-        "num_regmean_examples": "num_regmean_examples",
-        "exclude_param_names_regex": "exclude_param_names_regex",
-        "reduce_non_diagonal_ratio": "reduce_non_diagonal_ratio",
-        "weight_transpose": "weight_transpose",
-    }
 
     def __init__(
         self,
@@ -194,11 +155,11 @@ class RegMeanAlgorithmPlusPlus(BaseAlgorithm, SimpleProfilerMixin):
         weight_transpose: bool,
         **kwargs,
     ):
+        super().__init__(**kwargs)
         self.num_regmean_examples = num_regmean_examples
         self.exclude_param_names_regex = exclude_param_names_regex
         self.reduce_non_diagonal_ratio = reduce_non_diagonal_ratio
         self.weight_transpose = weight_transpose
-        super().__init__(**kwargs)
 
     def run(self, modelpool: BaseModelPool, **kwargs):
         if not isinstance(modelpool, BaseModelPool):
@@ -262,7 +223,7 @@ class RegMeanAlgorithmPlusPlus(BaseAlgorithm, SimpleProfilerMixin):
 
         # exclude parameter whose name matches element in exclude_param_names_regex
         if param_names_to_merge is None:
-            param_names_to_merge = get_param_names_to_merge(
+            param_names_to_merge = regmean_utils.get_param_names_to_merge(
                 input_param_names=list(param_dict.keys()),
                 exclude_param_names_regex=self.config.get(
                     "exclude_param_names_regex", []
@@ -274,7 +235,7 @@ class RegMeanAlgorithmPlusPlus(BaseAlgorithm, SimpleProfilerMixin):
                 param_dict[param_name]
             )
 
-        linear_modules_to_merge = get_modules_to_merge(
+        linear_modules_to_merge = regmean_utils.get_modules_to_merge(
             model=layer_to_merge,
             include_module_types=self._include_module_type,
         )
@@ -294,7 +255,7 @@ class RegMeanAlgorithmPlusPlus(BaseAlgorithm, SimpleProfilerMixin):
             linear_modules_to_merge=linear_modules_to_merge,
         )
 
-        module_subset = get_param_names_to_merge(
+        module_subset = regmean_utils.get_param_names_to_merge(
             input_param_names=list(param_dict.keys()),
             exclude_param_names_regex=self.exclude_param_names_regex,
         )
--- a/fusion_bench/method/simple_average.py
+++ b/fusion_bench/method/simple_average.py
@@ -89,7 +89,7 @@ class SimpleAverageAlgorithm(
             modelpool = BaseModelPool(modelpool)
 
         log.info(
-            f"Fusing models using simple average on {len(modelpool.model_names)} models."
+            f"Fusing models using simple average on {len(modelpool.model_names)} models. "
             f"models: {modelpool.model_names}"
         )
         sd: Optional[StateDictType] = None
@@ -119,7 +119,7 @@ class SimpleAverageAlgorithm(
 
         if isinstance(forward_model, LazyStateDict):
             # if the model is a LazyStateDict, convert it to an empty module
-            forward_model = forward_model.meta_module.to_empty(
+            forward_model = deepcopy(forward_model.meta_module).to_empty(
                 device=forward_model._device
            )
             result = forward_model.load_state_dict(sd, strict=False)
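Note: the deepcopy in the hunk above is the substantive fix. nn.Module.to_empty mutates the module in place, so materializing meta_module directly would corrupt the shared skeleton that LazyStateDict keeps on the meta device. The underlying PyTorch pattern, as a standalone sketch (assuming only that meta_module wraps a model built under torch.device("meta")):

    from copy import deepcopy

    import torch
    from torch import nn

    # a parameter-shaped skeleton that allocates no real storage
    with torch.device("meta"):
        meta_module = nn.Linear(4, 4)

    state_dict = {"weight": torch.randn(4, 4), "bias": torch.zeros(4)}

    # copy first so the shared skeleton is never mutated in place,
    # then give the copy real (uninitialized) storage and fill it
    model = deepcopy(meta_module).to_empty(device="cpu")
    result = model.load_state_dict(state_dict, strict=False)
    assert not result.missing_keys and not result.unexpected_keys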
--- a/fusion_bench/method/slerp/__init__.py
+++ b/fusion_bench/method/slerp/__init__.py
@@ -1,2 +1,2 @@
 # flake8: noqa F401
-from .slerp import SlerpMergeAlgorithm
+from .slerp import SlerpForCausalLM, SlerpMergeAlgorithm
--- a/fusion_bench/method/slerp/slerp.py
+++ b/fusion_bench/method/slerp/slerp.py
@@ -1,16 +1,24 @@
 import logging
-
+import os
+from copy import deepcopy
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import torch
 from torch import nn
+from tqdm import tqdm
 from typing_extensions import override
 
+from fusion_bench import LazyStateDict, create_default_model_card, timeit_context
 from fusion_bench.method import BaseAlgorithm
-from fusion_bench.modelpool import BaseModelPool
+from fusion_bench.mixins import SimpleProfilerMixin, auto_register_config
+from fusion_bench.modelpool import BaseModelPool, CausalLMPool
 from fusion_bench.utils.type import StateDictType
 
 from .slerp_utils import slerp
 
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel
+
 log = logging.getLogger(__name__)
 
 
@@ -21,6 +29,7 @@ def slerp_on_state_dicts(
     *,
     DOT_THRESHOLD: float = 0.9995,
     epsilon: float = 1e-8,
+    show_pbar: bool = False,
 ) -> StateDictType:
     """
     Perform spherical linear interpolation (slerp) on the state dictionaries of two models.
@@ -36,7 +45,8 @@ def slerp_on_state_dicts(
         dict: The interpolated state dictionary.
     """
     state_dict = {}
-    for key in secondary_state_dict:
+    pbar = secondary_state_dict if not show_pbar else tqdm(secondary_state_dict)
+    for key in pbar:
         v0 = primary_state_dict[key]
         v1 = secondary_state_dict[key]
         if v0.shape != v1.shape:
@@ -49,18 +59,19 @@ def slerp_on_state_dicts(
     return state_dict
 
 
+@auto_register_config
 class SlerpMergeAlgorithm(BaseAlgorithm):
     """
     General purpose implementation of Slerp (Spherical Linear Interpolation) for PyTorch models.
     """
 
-    _config_mapping = BaseAlgorithm._config_mapping | {
-        "t": "t",
-        "DOT_THRESHOLD": "DOT_THRESHOLD",
-        "epsilon": "epsilon",
-    }
-
-    def __init__(self, t: float, DOT_THRESHOLD: float = 0.9995, epsilon: float = 1e-8):
+    def __init__(
+        self,
+        t: float,
+        DOT_THRESHOLD: float = 0.9995,
+        epsilon: float = 1e-8,
+        **kwargs,
+    ):
         """
         Initialize the SlerpMergeAlgorithm.
 
@@ -69,10 +80,7 @@ class SlerpMergeAlgorithm(BaseAlgorithm):
             DOT_THRESHOLD (float, optional): The threshold for the dot product of the two vectors. Defaults to 0.9995.
             epsilon (float, optional): The epsilon value for numerical stability. Defaults to 1e-8.
         """
-        self.t = t
-        self.DOT_THRESHOLD = DOT_THRESHOLD
-        self.epsilon = epsilon
-        super().__init__()
+        super().__init__(**kwargs)
 
     @override
     def run(self, modelpool: BaseModelPool) -> nn.Module:
@@ -102,3 +110,91 @@ class SlerpMergeAlgorithm(BaseAlgorithm):
 
         primary_model.load_state_dict(state_dict)
         return primary_model
+
+
+@auto_register_config
+class SlerpForCausalLM(
+    SimpleProfilerMixin,
+    BaseAlgorithm,
+):
+    """
+    Slerp (Spherical Linear Interpolation) for Causal Language Models.
+    """
+
+    def __init__(
+        self,
+        t: float,
+        DOT_THRESHOLD: float = 0.9995,
+        epsilon: float = 1e-8,
+        model_save_path: Optional[str] = None,
+        show_pbar: bool = False,
+        **kwargs,
+    ):
+        """
+        Initialize the SlerpForCausalLM algorithm.
+
+        Args:
+            t (float): The interpolation parameter. Must be in the range [0, 1].
+                t=0 returns the first model, t=1 returns the second model,
+                t=0.5 provides balanced interpolation.
+            DOT_THRESHOLD (float, optional): The threshold for the dot product of normalized vectors.
+                When the absolute dot product exceeds this threshold,
+                vectors are considered nearly collinear and linear
+                interpolation (LERP) is used instead of SLERP for
+                numerical stability. Defaults to 0.9995.
+            epsilon (float, optional): Small value used for numerical stability to avoid
+                division by zero during vector normalization.
+                Defaults to 1e-8.
+            model_save_path (Optional[str], optional): Path where the merged model should be saved.
+                If None, the model is not saved to disk.
+                Defaults to None.
+            show_pbar (bool, optional): Whether to display a progress bar during the interpolation
+                process. Useful for debugging or monitoring progress with
+                large models. Defaults to False.
+            **kwargs: Additional keyword arguments passed to the parent BaseAlgorithm class.
+        """
+        super().__init__(**kwargs)
+
+    @override
+    def run(self, modelpool: CausalLMPool):
+        assert len(modelpool.all_model_names) == 2, "Slerp expect exactly 2 models"
+        primary_model = modelpool.load_model(modelpool.all_model_names[0])
+        secondary_model = modelpool.load_model(modelpool.all_model_names[1])
+
+        with torch.no_grad():
+            primary_state_dict = primary_model.state_dict()
+            secondary_state_dict = secondary_model.state_dict()
+            state_dict = slerp_on_state_dicts(
+                self.t,
+                primary_state_dict,
+                secondary_state_dict,
+                DOT_THRESHOLD=self.DOT_THRESHOLD,
+                epsilon=self.epsilon,
+            )
+
+        if isinstance(primary_model, nn.Module):
+            model = primary_model
+            model.load_state_dict(state_dict)
+        elif isinstance(primary_model, LazyStateDict):
+            model: "PreTrainedModel" = deepcopy(primary_model.meta_module)
+            model.to(device=primary_model._device)
+            model.load_state_dict(state_dict)
+        else:
+            raise TypeError(
+                f"Unsupported model type: {type(primary_model)}. "
+                "Expected nn.Module or LazyStateDict."
+            )
+        if self.model_save_path is not None:
+            with timeit_context(f"Saving the model to {self.model_save_path}"):
+                tokenizer = modelpool.load_tokenizer()
+                tokenizer.save_pretrained(self.model_save_path)
+                model.save_pretrained(self.model_save_path)
+            model_card_str = create_default_model_card(
+                models=[modelpool.get_model_path(m) for m in modelpool.model_names],
+                description="Merged model using Slerp.",
+                algorithm_config=self.config,
+                modelpool_config=modelpool.config,
+            )
+            with open(os.path.join(self.model_save_path, "README.md"), "w") as f:
+                f.write(model_card_str)
+        return model
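Note: per tensor, slerp_on_state_dicts delegates to the slerp helper from .slerp_utils. What that computes is, in miniature (a sketch, not the library implementation): interpolate along the great circle between the two flattened weight vectors, falling back to plain LERP when they are nearly collinear, which is exactly what DOT_THRESHOLD guards:

    import torch

    def slerp_sketch(t, v0, v1, DOT_THRESHOLD=0.9995, eps=1e-8):
        u0 = v0 / (v0.norm() + eps)          # normalize for the angle only
        u1 = v1 / (v1.norm() + eps)
        dot = (u0 * u1).sum().clamp(-1.0, 1.0)
        if dot.abs() > DOT_THRESHOLD:        # nearly collinear: use LERP
            return (1 - t) * v0 + t * v1
        omega = torch.acos(dot)              # angle between the two vectors
        sin_omega = torch.sin(omega)
        return (torch.sin((1 - t) * omega) * v0 + torch.sin(t * omega) * v1) / sin_omega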
--- a/fusion_bench/method/task_arithmetic/task_arithmetic.py
+++ b/fusion_bench/method/task_arithmetic/task_arithmetic.py
@@ -6,11 +6,20 @@ http://arxiv.org/abs/2212.04089
 
 import logging
 from copy import deepcopy
-from typing import Dict, List, Mapping, Optional, TypeVar, Union  # noqa: F401
+from typing import (  # noqa: F401
+    TYPE_CHECKING,
+    Dict,
+    List,
+    Mapping,
+    Optional,
+    TypeVar,
+    Union,
+)
 
 import torch
 from torch import nn
 
+from fusion_bench import LazyStateDict
 from fusion_bench.method.base_algorithm import BaseAlgorithm
 from fusion_bench.mixins import SimpleProfilerMixin, auto_register_config
 from fusion_bench.modelpool import BaseModelPool
@@ -21,6 +30,8 @@ from fusion_bench.utils.state_dict_arithmetic import (
 )
 from fusion_bench.utils.type import StateDictType, TorchModelType
 
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel
 log = logging.getLogger(__name__)
 
 
@@ -125,25 +136,39 @@ class TaskArithmeticAlgorithm(
         with self.profile("merge weights"):
             if task_vector is None:
                 task_vector = state_dict_sub(
-                    model.state_dict(keep_vars=True),
-                    pretrained_model.state_dict(keep_vars=True),
+                    model.state_dict(),
+                    pretrained_model.state_dict(),
                 )
             else:
                 task_vector = state_dict_add(
                     task_vector,
                     state_dict_sub(
-                        model.state_dict(keep_vars=True),
-                        pretrained_model.state_dict(keep_vars=True),
+                        model.state_dict(),
+                        pretrained_model.state_dict(),
                     ),
                 )
         with self.profile("merge weights"):
             # scale the task vector
             task_vector = state_dict_mul(task_vector, self.config.scaling_factor)
             # add the task vector to the pretrained model
-            state_dict = state_dict_add(
-                pretrained_model.state_dict(keep_vars=True), task_vector
-            )
+            state_dict = state_dict_add(pretrained_model.state_dict(), task_vector)
 
         self.print_profile_summary()
-        pretrained_model.load_state_dict(state_dict)
-        return pretrained_model
+
+        # apply state dict to model
+        if isinstance(pretrained_model, nn.Module):
+            model = pretrained_model
+            model.load_state_dict(state_dict)
+        elif isinstance(pretrained_model, LazyStateDict):
+            model = deepcopy(pretrained_model.meta_module)
+            model = model.to_empty(device=pretrained_model._device)
+            result = model.load_state_dict(state_dict, strict=False)
+            if result.unexpected_keys:
+                raise ValueError(
+                    f"Unexpected keys in state dict: {result.unexpected_keys}"
+                )
+            if result.missing_keys:
+                log.warning(f"Missing keys in state dict: {result.missing_keys}")
+        else:
+            raise TypeError(f"Unsupported model type: {type(pretrained_model)}")
+        return model
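Note: the arithmetic itself is unchanged, theta_merged = theta_pre + lambda * sum_i (theta_i - theta_pre); the new code only changes how the result is applied when pretrained_model is a LazyStateDict. In state-dict terms (a minimal sketch with plain dicts, standing in for the state_dict_arithmetic helpers):

    import torch

    def task_arithmetic_sketch(pretrained, finetuned, scaling_factor):
        # task vector per model = finetuned - pretrained; sum, scale, add back
        merged = {}
        for key, base in pretrained.items():
            task_vector = sum(sd[key] - base for sd in finetuned)
            merged[key] = base + scaling_factor * task_vector
        return merged

    base = {"w": torch.zeros(2)}
    experts = [{"w": torch.tensor([1.0, 0.0])}, {"w": torch.tensor([0.0, 1.0])}]
    print(task_arithmetic_sketch(base, experts, 0.5))  # {'w': tensor([0.5000, 0.5000])}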
--- a/fusion_bench/method/ties_merging/ties_merging.py
+++ b/fusion_bench/method/ties_merging/ties_merging.py
@@ -9,11 +9,14 @@ Overview of Ties-Merging:
 """
 
 import logging
+from copy import deepcopy
 from typing import Any, Dict, List, Literal, Mapping, Union  # noqa: F401
 
 import torch
 from torch import Tensor, nn
+from transformers import PreTrainedModel
 
+from fusion_bench import LazyStateDict
 from fusion_bench.compat.modelpool import to_modelpool
 from fusion_bench.method import BaseAlgorithm
 from fusion_bench.mixins import SimpleProfilerMixin, auto_register_config
@@ -98,12 +101,25 @@ class TiesMergingAlgorithm(
                 merge_func=merge_func,
             )
             merged_check = flat_ptm + scaling_factor * merged_tv
-            merged_state_dict = vector_to_state_dict(
+            state_dict = vector_to_state_dict(
                 merged_check, ptm_check, remove_keys=remove_keys
             )
-
-        # Load the merged state dict into the pretrained model
-        pretrained_model.load_state_dict(merged_state_dict)
-
         self.print_profile_summary()
-        return pretrained_model
+
+        # apply state dict to model
+        if isinstance(pretrained_model, nn.Module):
+            model = pretrained_model
+            model.load_state_dict(state_dict)
+        elif isinstance(pretrained_model, LazyStateDict):
+            model = deepcopy(pretrained_model.meta_module)
+            model = model.to_empty(device=pretrained_model._device)
+            result = model.load_state_dict(state_dict, strict=False)
+            if result.unexpected_keys:
+                raise ValueError(
+                    f"Unexpected keys in state dict: {result.unexpected_keys}"
+                )
+            if result.missing_keys:
+                log.warning(f"Missing keys in state dict: {result.missing_keys}")
+        else:
+            raise TypeError(f"Unsupported model type: {type(pretrained_model)}")
+        return model
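Note: for reference, the merged_tv that the hunk combines as flat_ptm + scaling_factor * merged_tv comes out of the usual TIES pipeline over stacked flat task vectors: trim small-magnitude entries, elect a sign per parameter, then average only the entries that agree with it. A compact sketch of those three steps (illustrative, not the library's implementation):

    import torch

    def ties_merge_sketch(task_vectors: torch.Tensor, k: float = 0.2) -> torch.Tensor:
        # task_vectors: (num_models, num_params) stacked flat task vectors
        # 1. trim: per model, keep only the top-k fraction of entries by magnitude
        magnitudes = task_vectors.abs()
        threshold = magnitudes.quantile(1 - k, dim=1, keepdim=True)
        trimmed = torch.where(
            magnitudes >= threshold, task_vectors, torch.zeros_like(task_vectors)
        )
        # 2. elect sign: majority sign by total mass at each parameter position
        elected = trimmed.sum(dim=0).sign()
        # 3. disjoint merge: mean over the entries whose sign matches the elected one
        agree = (trimmed.sign() == elected) & (trimmed != 0)
        count = agree.sum(dim=0).clamp(min=1)
        return (trimmed * agree).sum(dim=0) / count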