fusion-bench 0.2.21__py3-none-any.whl → 0.2.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/__init__.py +25 -2
- fusion_bench/compat/method/__init__.py +5 -2
- fusion_bench/compat/method/base_algorithm.py +3 -2
- fusion_bench/compat/modelpool/base_pool.py +3 -3
- fusion_bench/compat/taskpool/clip_image_classification.py +1 -1
- fusion_bench/constants/__init__.py +1 -0
- fusion_bench/constants/runtime.py +57 -0
- fusion_bench/dataset/gpt2_glue.py +1 -1
- fusion_bench/method/__init__.py +12 -4
- fusion_bench/method/analysis/task_vector_cos_similarity.py +95 -12
- fusion_bench/method/analysis/task_vector_violin_plot.py +160 -52
- fusion_bench/method/bitdelta/__init__.py +1 -0
- fusion_bench/method/bitdelta/bitdelta.py +7 -23
- fusion_bench/method/classification/clip_finetune.py +1 -1
- fusion_bench/method/expert_sparsity/mixtral/dynamic_skipping.py +2 -0
- fusion_bench/method/expert_sparsity/mixtral/layer_wise_pruning.py +2 -0
- fusion_bench/method/expert_sparsity/mixtral/progressive_pruning.py +2 -0
- fusion_bench/method/fisher_merging/clip_fisher_merging.py +0 -4
- fusion_bench/method/fisher_merging/gpt2_fisher_merging.py +2 -2
- fusion_bench/method/linear/simple_average_for_llama.py +16 -11
- fusion_bench/method/model_stock/__init__.py +1 -0
- fusion_bench/method/model_stock/model_stock.py +309 -0
- fusion_bench/method/regmean/clip_regmean.py +3 -6
- fusion_bench/method/regmean/regmean.py +27 -56
- fusion_bench/method/regmean/utils.py +56 -0
- fusion_bench/method/regmean_plusplus/regmean_plusplus.py +21 -60
- fusion_bench/method/simple_average.py +7 -7
- fusion_bench/method/slerp/__init__.py +1 -1
- fusion_bench/method/slerp/slerp.py +110 -14
- fusion_bench/method/smile_upscaling/causal_lm_upscaling.py +371 -0
- fusion_bench/method/smile_upscaling/projected_energy.py +1 -2
- fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py +5 -1
- fusion_bench/method/smile_upscaling/smile_qwen2_upscaling.py +40 -31
- fusion_bench/method/smile_upscaling/smile_upscaling.py +1 -1
- fusion_bench/method/we_moe/__init__.py +1 -0
- fusion_bench/method/we_moe/entropy_loss.py +25 -0
- fusion_bench/method/we_moe/flan_t5_we_moe.py +320 -0
- fusion_bench/method/we_moe/utils.py +15 -0
- fusion_bench/method/weighted_average/llama.py +1 -1
- fusion_bench/mixins/clip_classification.py +37 -48
- fusion_bench/mixins/serialization.py +30 -10
- fusion_bench/modelpool/base_pool.py +1 -1
- fusion_bench/modelpool/causal_lm/causal_lm.py +293 -75
- fusion_bench/modelpool/seq2seq_lm/modelpool.py +146 -0
- fusion_bench/models/__init__.py +5 -0
- fusion_bench/models/hf_utils.py +69 -86
- fusion_bench/models/linearized/vision_model.py +6 -6
- fusion_bench/models/model_card_templates/default.md +46 -0
- fusion_bench/models/modeling_smile_llama/__init__.py +7 -0
- fusion_bench/models/modeling_smile_llama/modeling_smile_llama.py +1 -8
- fusion_bench/models/modeling_smile_mistral/__init__.py +2 -1
- fusion_bench/models/modeling_smile_qwen2/modeling_smile_qwen2.py +1 -5
- fusion_bench/models/we_moe.py +8 -8
- fusion_bench/programs/fabric_fusion_program.py +29 -60
- fusion_bench/scripts/cli.py +34 -1
- fusion_bench/taskpool/base_pool.py +99 -17
- fusion_bench/taskpool/clip_vision/taskpool.py +10 -5
- fusion_bench/taskpool/dummy.py +101 -13
- fusion_bench/taskpool/lm_eval_harness/taskpool.py +80 -0
- fusion_bench/taskpool/nyuv2_taskpool.py +28 -0
- fusion_bench/utils/__init__.py +2 -0
- fusion_bench/utils/cache_utils.py +101 -1
- fusion_bench/utils/data.py +6 -4
- fusion_bench/utils/devices.py +7 -4
- fusion_bench/utils/dtype.py +3 -2
- fusion_bench/utils/fabric.py +2 -2
- fusion_bench/utils/lazy_imports.py +23 -0
- fusion_bench/utils/lazy_state_dict.py +117 -19
- fusion_bench/utils/modelscope.py +3 -3
- fusion_bench/utils/packages.py +3 -3
- fusion_bench/utils/parameters.py +0 -2
- fusion_bench/utils/path.py +56 -0
- fusion_bench/utils/pylogger.py +1 -1
- fusion_bench/utils/timer.py +92 -10
- {fusion_bench-0.2.21.dist-info → fusion_bench-0.2.23.dist-info}/METADATA +1 -23
- {fusion_bench-0.2.21.dist-info → fusion_bench-0.2.23.dist-info}/RECORD +89 -75
- fusion_bench_config/_get_started/llm_slerp.yaml +12 -0
- fusion_bench_config/method/fisher_merging/clip_fisher_merging.yaml +0 -1
- fusion_bench_config/method/linear/simple_average_for_llama.yaml +3 -2
- fusion_bench_config/method/model_stock/model_stock.yaml +12 -0
- fusion_bench_config/method/slerp/slerp_lm.yaml +4 -0
- fusion_bench_config/method/smile_upscaling/causal_lm_upscaling.yaml +21 -0
- fusion_bench_config/method/smile_upscaling/smile_qwen2_upscaling.yaml +1 -1
- fusion_bench_config/method/wemoe/flan_t5_weight_ensembling_moe.yaml +20 -0
- fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-1.5B_math_and_coder.yaml +1 -1
- {fusion_bench-0.2.21.dist-info → fusion_bench-0.2.23.dist-info}/WHEEL +0 -0
- {fusion_bench-0.2.21.dist-info → fusion_bench-0.2.23.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.21.dist-info → fusion_bench-0.2.23.dist-info}/licenses/LICENSE +0 -0
- {fusion_bench-0.2.21.dist-info → fusion_bench-0.2.23.dist-info}/top_level.txt +0 -0
fusion_bench/method/model_stock/model_stock.py (new file):

```diff
@@ -0,0 +1,309 @@
+import copy
+import logging
+import math
+import os
+from collections import OrderedDict
+from copy import deepcopy
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+import torch
+from omegaconf import DictConfig
+from torch import nn
+from transformers import PreTrainedModel
+
+import fusion_bench
+from fusion_bench import BaseAlgorithm, BaseModelPool
+from fusion_bench.mixins import SimpleProfilerMixin
+from fusion_bench.models import create_default_model_card
+from fusion_bench.utils.type import StateDictType
+
+log = logging.getLogger(__name__)
+
+EPS = 1e-8
+
+
+def compute_angle(
+    state_dict_1: StateDictType,
+    state_dict_2: StateDictType,
+    ref_state_dict: StateDictType,
+    ignore_keys: List[str] = [],
+    return_cos: bool = False,
+) -> Dict[str, float]:
+    """
+    Compute the angle between two state dictionaries relative to a reference state dictionary.
+
+    Args:
+        state_dict_1: First state dictionary
+        state_dict_2: Second state dictionary
+        ref_state_dict: Reference state dictionary (typically pre-trained model)
+        ignore_keys: Keys to ignore during computation
+        return_cos: If True, return cosine values instead of angles in degrees
+
+    Returns:
+        Dictionary mapping parameter names to angles (in degrees) or cosine values
+    """
+    # Remove the keys not used for CLIP fine-tuning (from the notebook example)
+
+    return_dict = OrderedDict()
+
+    with torch.no_grad():
+        for key in ref_state_dict:
+            if key in ignore_keys:
+                log.info(f"Ignoring key '{key}'")
+                continue
+
+            state_dict_1_val = state_dict_1[key]
+            state_dict_2_val = state_dict_2[key]
+            ref_val = ref_state_dict[key]
+
+            if not (state_dict_1_val.shape == state_dict_2_val.shape == ref_val.shape):
+                log.warning(
+                    f"Shape mismatch for key '{key}', ignored during merging: "
+                    f"({state_dict_1_val.shape}, {state_dict_2_val.shape}, {ref_val.shape})"
+                )
+                continue
+
+            vector1 = (state_dict_1_val - ref_val).clone().detach()
+            vector2 = (state_dict_2_val - ref_val).clone().detach()
+
+            vector1 = vector1.float()
+            vector2 = vector2.float()
+
+            cosine_val = torch.sum(vector1 * vector2) / (
+                math.sqrt(torch.sum(vector1**2) * torch.sum(vector2**2)) + EPS
+            )
+            cosine_val = torch.clamp(
+                cosine_val, min=-1.0, max=1.0
+            )  # Prevent nan from acos
+
+            if return_cos:
+                return_dict[key] = cosine_val.item()
+            else:
+                return_dict[key] = np.rad2deg(
+                    torch.acos(cosine_val).detach().cpu().item()
+                )
+
+    return return_dict
+
+
+def compute_ratio(angle_dict: Dict[str, float], k: int = 2) -> Dict[str, float]:
+    """
+    Compute interpolation ratios based on angles between fine-tuned models.
+
+    Args:
+        angle_dict: Dictionary mapping parameter names to angles in degrees
+        k: Number of fine-tuned models (default: 2)
+
+    Returns:
+        Dictionary mapping parameter names to interpolation ratios
+    """
+    ratio_dict = {}
+    for key in angle_dict.keys():
+        angle = np.deg2rad(angle_dict[key])
+        ratio_dict[key] = k * np.cos(angle) / ((k - 1) * np.cos(angle) + 1 + EPS)
+    return ratio_dict
+
+
+def merge_weights(
+    w1: StateDictType, w2: StateDictType, w0: StateDictType, ratio: Dict[str, float]
+) -> StateDictType:
+    """
+    Merge model weights using ModelStock formula.
+
+    Args:
+        w1: First fine-tuned model weights
+        w2: Second fine-tuned model weights
+        w0: Pre-trained model weights
+        ratio: Interpolation ratios for each parameter
+
+    Returns:
+        Merged model weights
+    """
+    # Compute w12 = (w1 + w2) / 2
+    w12 = {}
+    for key in w1.keys():
+        w12[key] = (w1[key].clone() + w2[key].clone()) / 2.0
+
+    # Apply ModelStock formula: w_merge = t * w12 + (1-t) * w0
+    w_merge = copy.deepcopy(w12)
+    for key, r in ratio.items():
+        w_merge[key] = w12[key].clone() * r + w0[key].clone() * (1.0 - r)
+
+    return w_merge
+
+
+@fusion_bench.auto_register_config
+class ModelStock(SimpleProfilerMixin, BaseAlgorithm):
+    """
+    Model Stock: All we need is just a few fine-tuned models
+
+    This method merges fine-tuned models by interpolating between their average
+    and a pre-trained anchor model, with interpolation ratios determined by
+    the angle between fine-tuned models in parameter space.
+    """
+
+    def __init__(
+        self,
+        ignore_keys: Optional[List[str]] = None,
+        model_save_path: Optional[str] = None,
+        model_save_kwargs: Optional[DictConfig] = None,
+        **kwargs,
+    ):
+        """
+        Initialize ModelStock algorithm.
+
+        Args:
+            ignore_keys: Additional parameter keys to ignore during merging
+        """
+        super().__init__(**kwargs)
+        if self.ignore_keys is None:
+            self.ignore_keys = []
+        if self.model_save_kwargs is None:
+            self.model_save_kwargs = DictConfig({})
+
+    def run(self, modelpool: BaseModelPool) -> nn.Module:
+        """
+        Run the ModelStock merging algorithm.
+
+        Args:
+            modelpool: Pool of models containing pre-trained and fine-tuned models
+
+        Returns:
+            Merged model
+        """
+        with self.profile("model loading"):
+            # Load the pre-trained model (anchor)
+            pretrained_model = modelpool.load_pretrained_model()
+            if isinstance(pretrained_model, fusion_bench.LazyStateDict):
+                assert (
+                    pretrained_model.meta_module is not None
+                ), "Meta module is not initialized"
+            pretrained_state_dict = pretrained_model.state_dict()
+
+            # Load all fine-tuned models
+            finetuned_models = []
+            finetuned_state_dicts = []
+
+            for model_name in modelpool.model_names:
+                model = modelpool.load_model(model_name)
+                finetuned_models.append(model)
+                finetuned_state_dicts.append(model.state_dict())
+                log.info(f"Loaded fine-tuned model: {model_name}")
+
+        if len(finetuned_models) < 2:
+            raise ValueError("ModelStock requires at least 2 fine-tuned models")
+
+        log.info(f"Running ModelStock with {len(finetuned_models)} fine-tuned models")
+
+        with self.profile("compute angles and ratios"):
+            if len(finetuned_models) == 2:
+                # Two fine-tuned models case
+                angle_dict = compute_angle(
+                    finetuned_state_dicts[0],
+                    finetuned_state_dicts[1],
+                    pretrained_state_dict,
+                    ignore_keys=self.ignore_keys,
+                )
+                ratio_dict = compute_ratio(angle_dict, k=2)
+
+                log.info(f"Computed angles for {len(angle_dict)} parameter groups")
+
+            else:
+                # N fine-tuned models case - compute average angle
+                angles_sum = {}
+                angles_count = {}
+
+                # Compute pairwise angles and average them
+                for i in range(len(finetuned_models)):
+                    for j in range(i + 1, len(finetuned_models)):
+                        angle_dict = compute_angle(
+                            finetuned_state_dicts[i],
+                            finetuned_state_dicts[j],
+                            pretrained_state_dict,
+                            ignore_keys=self.ignore_keys,
+                        )
+
+                        for key, angle in angle_dict.items():
+                            if key not in angles_sum:
+                                angles_sum[key] = 0
+                                angles_count[key] = 0
+                            angles_sum[key] += angle
+                            angles_count[key] += 1
+
+                # Average the angles
+                avg_angle_dict = {}
+                for key in angles_sum:
+                    avg_angle_dict[key] = angles_sum[key] / angles_count[key]
+
+                ratio_dict = compute_ratio(avg_angle_dict, k=len(finetuned_models))
+
+                log.info(
+                    f"Computed average angles for {len(avg_angle_dict)} parameter groups"
+                )
+
+        with self.profile("merge weights"):
+            if len(finetuned_models) == 2:
+                # Direct merging for two models
+                merged_state_dict = merge_weights(
+                    finetuned_state_dicts[0],
+                    finetuned_state_dicts[1],
+                    pretrained_state_dict,
+                    ratio_dict,
+                )
+            else:
+                # For N models, first compute the average of fine-tuned models
+                avg_finetuned_state_dict = {}
+                for key in finetuned_state_dicts[0].keys():
+                    avg_finetuned_state_dict[key] = torch.zeros_like(
+                        finetuned_state_dicts[0][key]
+                    )
+                    for state_dict in finetuned_state_dicts:
+                        avg_finetuned_state_dict[key] += state_dict[key]
+                    avg_finetuned_state_dict[key] /= len(finetuned_state_dicts)
+
+                # Apply ModelStock formula: w_H = t * w_avg + (1-t) * w_0
+                merged_state_dict = copy.deepcopy(avg_finetuned_state_dict)
+                for key, r in ratio_dict.items():
+                    merged_state_dict[key] = avg_finetuned_state_dict[
+                        key
+                    ].clone() * r + pretrained_state_dict[key].clone() * (1.0 - r)
+
+        # Load merged weights into the model
+        if isinstance(pretrained_model, nn.Module):
+            result_model = pretrained_model
+        elif isinstance(pretrained_model, fusion_bench.LazyStateDict):
+            result_model = deepcopy(pretrained_model.meta_module)
+            result_model.to(device=pretrained_model._device)
+        result = result_model.load_state_dict(merged_state_dict, strict=False)
+
+        if result.unexpected_keys:
+            raise RuntimeError(
+                f"Unexpected keys in state dict: {result.unexpected_keys}"
+            )
+        if result.missing_keys:
+            log.warning(f"Missing keys in state dict: {result.missing_keys}")
+
+        if self.model_save_path is not None:
+            with self.profile("model saving"):
+                modelpool.save_model(
+                    model, path=self.model_save_path, **self.model_save_kwargs
+                )
+                if isinstance(model, PreTrainedModel):
+                    modelcard = create_default_model_card(
+                        models=[
+                            modelpool.get_model_path(m)
+                            for m in modelpool.all_model_names
+                        ],
+                        description="Merged model using [Model Stock](https://arxiv.org/abs/2403.19522).",
+                        algorithm_config=self.config,
+                        modelpool_config=modelpool.config,
+                    )
+                    with open(
+                        os.path.join(self.model_save_path, "README.md"), "w"
+                    ) as f:
+                        f.write(modelcard)
+
+        self.print_profile_summary()
+        log.info("ModelStock merging completed successfully")
+        return result_model
```
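For orientation, the ratio computed by `compute_ratio` above is the Model Stock closed form t = k·cos(θ) / ((k−1)·cos(θ) + 1): a small angle between task vectors (the fine-tuned models agree) pushes t toward 1 and keeps the merge near the fine-tuned average, while a large angle pushes t toward 0 and falls back to the pre-trained anchor. A minimal sketch, not part of the diff:

```python
import numpy as np

# Behavior of the Model Stock interpolation ratio for k = 2 fine-tuned models.
EPS = 1e-8
k = 2
for angle_deg in (10.0, 45.0, 90.0):
    cos_t = np.cos(np.deg2rad(angle_deg))
    t = k * cos_t / ((k - 1) * cos_t + 1 + EPS)
    print(f"angle={angle_deg:5.1f} deg -> t={t:.3f}")
# angle= 10.0 deg -> t=0.992
# angle= 45.0 deg -> t=0.828
# angle= 90.0 deg -> t=0.000
```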
fusion_bench/method/regmean/clip_regmean.py:

```diff
@@ -9,6 +9,7 @@ from torch.nn.modules import Module
 from torch.utils.data import DataLoader
 from tqdm.autonotebook import tqdm
 
+from fusion_bench import auto_register_config
 from fusion_bench.dataset.clip_dataset import CLIPDataset
 from fusion_bench.mixins import CLIPClassificationMixin
 
@@ -17,17 +18,13 @@ from .regmean import RegMeanAlgorithm
 log = logging.getLogger(__name__)
 
 
+@auto_register_config
 class RegMeanAlgorithmForCLIP(
-    RegMeanAlgorithm,
     CLIPClassificationMixin,
+    RegMeanAlgorithm,
 ):
-    _config_mapping = {
-        "_dataloader_kwargs": "dataloader_kwargs",
-    }
-
     def __init__(self, *, dataloader_kwargs: DictConfig, **kwargs):
         super().__init__(**kwargs)
-        self.dataloader_kwargs = dataloader_kwargs
 
     def on_regmean_start(self):
         self.setup_zero_shot_classification_head()
```
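The second hunk swaps the base-class order so `CLIPClassificationMixin` now precedes `RegMeanAlgorithm`. Python resolves attributes left to right along the MRO, so the mixin listed first can override or wrap methods of the algorithm base in cooperative `super()` chains. A minimal sketch with hypothetical classes:

```python
# Illustration only: why base-class order matters for method resolution.
class Base:
    def setup(self) -> str:
        return "base"

class Mixin:
    def setup(self) -> str:
        # cooperative override: runs first, then delegates down the MRO
        return "mixin before " + super().setup()

class Combined(Mixin, Base):  # mixin listed first, as in the diff
    pass

print(Combined().setup())                       # -> "mixin before base"
print([c.__name__ for c in Combined.__mro__])   # Combined, Mixin, Base, object
```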
fusion_bench/method/regmean/regmean.py:

```diff
@@ -16,49 +16,9 @@ from fusion_bench.method import BaseAlgorithm
 from fusion_bench.mixins import SimpleProfilerMixin, auto_register_config
 from fusion_bench.modelpool import BaseModelPool
 
-
-
-
-def get_param_names_to_merge(
-    input_param_names: List[str], exclude_param_names_regex: list
-):
-    """
-    get the names of parameters that need to be merged
-    :param input_param_names: list, names of input parameters
-    :param exclude_param_names_regex: list, regular expression of names of parameters that need to be excluded
-    :return:
-    """
-    param_names_to_merge = []
-    for param_name in input_param_names:
-        exclude = any(
-            [
-                re.match(exclude_pattern, param_name)
-                for exclude_pattern in exclude_param_names_regex
-            ]
-        )
-        if not exclude:
-            param_names_to_merge.append(param_name)
-    return param_names_to_merge
-
+from .utils import get_modules_to_merge, get_param_names_to_merge
 
-def get_modules_to_merge(model: nn.Module, include_module_types: list):
-    """
-    get the model modules that need to be merged, whose type is in include_module_types
-    :param model: nn.Module, input model
-    :param include_module_types: list, module types that want to include
-    :return:
-    """
-    modules_to_merge: Dict[str, nn.Module] = {}
-    for module_name, module in model.named_modules():
-        is_valid_type = not include_module_types or any(
-            [
-                isinstance(module, include_module_type)
-                for include_module_type in include_module_types
-            ]
-        )
-        if is_valid_type:
-            modules_to_merge[module_name] = module
-    return modules_to_merge
+log = logging.getLogger(__name__)
 
 
 def reduce_non_diagonal_elements(
@@ -88,12 +48,16 @@ def merging_with_regmean_weights(
 ):
     """
     merge parameters of different models with computed regmean weights
-
-
-
-
-
-
+
+    Args:
+        models_to_merge_param_dict: dict, dictionary of list, where key is the parameter name,
+            value is a list of the corresponding parameters of all the models that need to be merged
+        models_to_merge_regmean_weights_list: list, list of dictionaries with length len(models_to_merge),
+            each dictionary records the regmean weights (matrix) of parameters for each model that needs to be merged, key is module name
+        reduce_non_diagonal_ratio: float, reduce non-diagonal elements in regmean weights by multiplying this scalar
+
+    Returns:
+        dict: merged model parameters
     """
     # dict, dictionary of model parameters
     merged_params = {}
@@ -164,13 +128,17 @@ def regmean_merging(
     reduce_non_diagonal_ratio: float = 1.0,
 ):
     """
-    regmean merging method
-
-    :
-
-
-
-
+    regmean merging method.
+
+    Args:
+        models_to_merge: list, individual models that need to be merged
+        trainers: list, trainers of individual models
+        exclude_param_names_regex: list, regular expression of names of parameters that need to be excluded
+        nums_regmean_examples: list, numbers of examples to compute regmean weights
+        reduce_non_diagonal_ratio: float, reduce non-diagonal elements in regmean weights by multiplying this scalar
+
+    Returns:
+        dict: merged model parameters
     """
 
     def compute_regmean_weights(module_name: str):
@@ -281,7 +249,10 @@ def regmean_merging(
 
 
 @auto_register_config
-class RegMeanAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
+class RegMeanAlgorithm(
+    SimpleProfilerMixin,
+    BaseAlgorithm,
+):
     _include_module_type = [nn.Linear]
 
     def __init__(
```
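For context, RegMean merges each linear layer in closed form from per-model inner-product (gram) matrices of the layer inputs, and `reduce_non_diagonal_ratio` shrinks the off-diagonal entries of each gram matrix, pulling the result toward a plain average. A minimal sketch of the closed form, not the package implementation (weights are assumed here in (in_features, out_features) orientation; the package's `weight_transpose` flag handles orientation):

```python
import torch

# W_merged = (sum_i G_i)^(-1) (sum_i G_i W_i), with G_i = X_i^T X_i.
def regmean_merge_linear(grams, weights, reduce_non_diagonal_ratio=1.0):
    merged_gram = torch.zeros_like(grams[0])
    merged_gw = torch.zeros_like(weights[0])
    for G, W in zip(grams, weights):
        # scale off-diagonal entries toward zero; ratio 0 recovers simple averaging
        diag = torch.diag(torch.diagonal(G))
        G = reduce_non_diagonal_ratio * G + (1 - reduce_non_diagonal_ratio) * diag
        merged_gram += G
        merged_gw += G @ W
    return torch.linalg.solve(merged_gram, merged_gw)

# toy usage: two "models" with random activations (16 samples, in_dim=4, out_dim=3)
xs = [torch.randn(16, 4) for _ in range(2)]
ws = [torch.randn(4, 3) for _ in range(2)]
merged = regmean_merge_linear([x.T @ x for x in xs], ws, reduce_non_diagonal_ratio=0.9)
```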
fusion_bench/method/regmean/utils.py (new file):

```diff
@@ -0,0 +1,56 @@
+import re
+from typing import Dict, List
+
+from torch import nn
+
+
+def get_param_names_to_merge(
+    input_param_names: List[str], exclude_param_names_regex: list
+) -> List[str]:
+    """
+    get the names of parameters that need to be merged
+
+    Args:
+        input_param_names: list, names of input parameters
+        exclude_param_names_regex: list, regular expression of names of parameters that need to be excluded
+
+    Returns:
+        list: names of parameters that need to be merged
+    """
+    param_names_to_merge = []
+    for param_name in input_param_names:
+        exclude = any(
+            [
+                re.match(exclude_pattern, param_name)
+                for exclude_pattern in exclude_param_names_regex
+            ]
+        )
+        if not exclude:
+            param_names_to_merge.append(param_name)
+    return param_names_to_merge
+
+
+def get_modules_to_merge(
+    model: nn.Module, include_module_types: list
+) -> Dict[str, nn.Module]:
+    """
+    get the model modules that need to be merged, whose type is in include_module_types
+
+    Args:
+        model: nn.Module, input model
+        include_module_types: list, module types that want to include
+
+    Returns:
+        Dict[str, nn.Module]: a dictionary of modules to merge
+    """
+    modules_to_merge: Dict[str, nn.Module] = {}
+    for module_name, module in model.named_modules():
+        is_valid_type = not include_module_types or any(
+            [
+                isinstance(module, include_module_type)
+                for include_module_type in include_module_types
+            ]
+        )
+        if is_valid_type:
+            modules_to_merge[module_name] = module
+    return modules_to_merge
```
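An illustrative usage of the two extracted helpers:

```python
from torch import nn
from fusion_bench.method.regmean.utils import (
    get_modules_to_merge,
    get_param_names_to_merge,
)

model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))
# keep every parameter except biases
names = get_param_names_to_merge(
    [n for n, _ in model.named_parameters()],
    exclude_param_names_regex=[r".*\.bias"],
)
# collect only the nn.Linear submodules, keyed by module name
linears = get_modules_to_merge(model, include_module_types=[nn.Linear])
print(names)          # ['0.weight', '2.weight']
print(list(linears))  # ['0', '2']
```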
fusion_bench/method/regmean_plusplus/regmean_plusplus.py:

```diff
@@ -7,55 +7,14 @@ import torch
 from torch import Tensor, nn
 from tqdm.autonotebook import tqdm
 
-
+import fusion_bench.method.regmean.utils as regmean_utils
+from fusion_bench import BaseAlgorithm, auto_register_config
 from fusion_bench.mixins import SimpleProfilerMixin
 from fusion_bench.modelpool import BaseModelPool
 
 log = logging.getLogger(__name__)
 
 
-def get_param_names_to_merge(
-    input_param_names: List[str], exclude_param_names_regex: list
-):
-    """
-    get the names of parameters that need to be merged
-    :param input_param_names: list, names of input parameters
-    :param exclude_param_names_regex: list, regular expression of names of parameters that need to be excluded
-    :return:
-    """
-    param_names_to_merge = []
-    for param_name in input_param_names:
-        exclude = any(
-            [
-                re.match(exclude_pattern, param_name)
-                for exclude_pattern in exclude_param_names_regex
-            ]
-        )
-        if not exclude:
-            param_names_to_merge.append(param_name)
-    return param_names_to_merge
-
-
-def get_modules_to_merge(model: nn.Module, include_module_types: list):
-    """
-    get the model modules that need to be merged, whose type is in include_module_types
-    :param model: nn.Module, input model
-    :param include_module_types: list, module types that want to include
-    :return:
-    """
-    modules_to_merge: Dict[str, nn.Module] = {}
-    for module_name, module in model.named_modules():
-        is_valid_type = not include_module_types or any(
-            [
-                isinstance(module, include_module_type)
-                for include_module_type in include_module_types
-            ]
-        )
-        if is_valid_type:
-            modules_to_merge[module_name] = module
-    return modules_to_merge
-
-
 def reduce_non_diagonal_elements(
     regmean_weights: torch.Tensor, reduce_non_diagonal_ratio: float
 ):
@@ -130,12 +89,16 @@ def merging_with_regmean_weights(
 ):
     """
     merge parameters of different models with computed regmean weights
-
-
-
-
-
-
+
+    Args:
+        models_to_merge_param_dict: dict, dictionary of list, where key is the parameter name,
+            value is a list of the corresponding parameters of all the models that need to be merged
+        models_to_merge_regmean_weights_list: list, list of dictionaries with length len(models_to_merge),
+            each dictionary records the regmean weights (matrix) of parameters for each model that needs to be merged, key is module name
+        reduce_non_diagonal_ratio: float, reduce non-diagonal elements in regmean weights by multiplying this scalar
+
+    Returns:
+        dict: merged model parameters
     """
     # dict, dictionary of model parameters
     merged_params = {}
@@ -176,14 +139,12 @@ def merging_with_regmean_weights(
     return merged_params
 
 
-class RegMeanAlgorithmPlusPlus(BaseAlgorithm, SimpleProfilerMixin):
+@auto_register_config
+class RegMeanAlgorithmPlusPlus(
+    SimpleProfilerMixin,
+    BaseAlgorithm,
+):
     _include_module_type = [nn.Linear]
-    _config_mapping = {
-        "num_regmean_examples": "num_regmean_examples",
-        "exclude_param_names_regex": "exclude_param_names_regex",
-        "reduce_non_diagonal_ratio": "reduce_non_diagonal_ratio",
-        "weight_transpose": "weight_transpose",
-    }
 
     def __init__(
         self,
@@ -194,11 +155,11 @@ class RegMeanAlgorithmPlusPlus(BaseAlgorithm, SimpleProfilerMixin):
         weight_transpose: bool,
         **kwargs,
     ):
+        super().__init__(**kwargs)
         self.num_regmean_examples = num_regmean_examples
         self.exclude_param_names_regex = exclude_param_names_regex
         self.reduce_non_diagonal_ratio = reduce_non_diagonal_ratio
         self.weight_transpose = weight_transpose
-        super().__init__(**kwargs)
 
     def run(self, modelpool: BaseModelPool, **kwargs):
         if not isinstance(modelpool, BaseModelPool):
@@ -262,7 +223,7 @@ class RegMeanAlgorithmPlusPlus(BaseAlgorithm, SimpleProfilerMixin):
 
         # exclude parameter whose name matches element in exclude_param_names_regex
         if param_names_to_merge is None:
-            param_names_to_merge = get_param_names_to_merge(
+            param_names_to_merge = regmean_utils.get_param_names_to_merge(
                 input_param_names=list(param_dict.keys()),
                 exclude_param_names_regex=self.config.get(
                     "exclude_param_names_regex", []
@@ -274,7 +235,7 @@ class RegMeanAlgorithmPlusPlus(BaseAlgorithm, SimpleProfilerMixin):
                 param_dict[param_name]
             )
 
-            linear_modules_to_merge = get_modules_to_merge(
+            linear_modules_to_merge = regmean_utils.get_modules_to_merge(
                 model=layer_to_merge,
                 include_module_types=self._include_module_type,
             )
@@ -294,7 +255,7 @@ class RegMeanAlgorithmPlusPlus(BaseAlgorithm, SimpleProfilerMixin):
                 linear_modules_to_merge=linear_modules_to_merge,
             )
 
-            module_subset = get_param_names_to_merge(
+            module_subset = regmean_utils.get_param_names_to_merge(
                 input_param_names=list(param_dict.keys()),
                 exclude_param_names_regex=self.exclude_param_names_regex,
             )
```
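The `__init__` hunk moves `super().__init__(**kwargs)` ahead of the attribute assignments. If a base initializer sets defaults for the same attributes (as config-registering bases typically do), calling it last would clobber the explicit arguments. A minimal sketch with a hypothetical base class:

```python
# Illustration only: ordering of super().__init__ vs. attribute assignment.
class ConfigBase:
    def __init__(self, **kwargs):
        # base sets a default; a real config-registering base behaves similarly
        self.weight_transpose = kwargs.get("weight_transpose", False)

class Wrong(ConfigBase):
    def __init__(self, weight_transpose: bool, **kwargs):
        self.weight_transpose = weight_transpose
        super().__init__(**kwargs)  # silently resets the attribute to False

class Right(ConfigBase):
    def __init__(self, weight_transpose: bool, **kwargs):
        super().__init__(**kwargs)
        self.weight_transpose = weight_transpose  # explicit argument sticks

print(Wrong(weight_transpose=True).weight_transpose)  # False
print(Right(weight_transpose=True).weight_transpose)  # True
```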
fusion_bench/method/simple_average.py:

```diff
@@ -61,8 +61,8 @@ def simple_average(
 
 @auto_register_config
 class SimpleAverageAlgorithm(
-    BaseAlgorithm,
     SimpleProfilerMixin,
+    BaseAlgorithm,
 ):
     def __init__(self, show_pbar: bool = False, **kwargs):
         """
@@ -120,13 +120,13 @@ class SimpleAverageAlgorithm(
         if isinstance(forward_model, LazyStateDict):
             # if the model is a LazyStateDict, convert it to an empty module
             forward_model = forward_model.meta_module.to_empty(
-                device=
-                "cpu"
-                if forward_model._torch_dtype is None
-                else forward_model._torch_dtype
-            )
+                device=forward_model._device
             )
-        forward_model.load_state_dict(sd)
+        result = forward_model.load_state_dict(sd, strict=False)
+        if result.unexpected_keys:
+            raise ValueError(f"Unexpected keys in state dict: {result.unexpected_keys}")
+        if result.missing_keys:
+            log.warning(f"Missing keys in state dict: {result.missing_keys}")
         # print profile report and log the merged models
         self.print_profile_summary()
         log.info(f"merged {len(merged_model_names)} models:")
```
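Background on the `load_state_dict` change: with `strict=False`, PyTorch returns the mismatched keys instead of raising, so the caller can decide which mismatches are fatal, as the new code does:

```python
import torch
from torch import nn

# load_state_dict(strict=False) returns a NamedTuple of missing_keys
# and unexpected_keys rather than raising a RuntimeError.
model = nn.Linear(4, 2)
sd = {"weight": torch.zeros(2, 4)}  # no "bias" entry
result = model.load_state_dict(sd, strict=False)
print(result.missing_keys)     # ['bias']
print(result.unexpected_keys)  # []
```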
fusion_bench/method/slerp/__init__.py:

```diff
@@ -1,2 +1,2 @@
 # flake8: noqa F401
-from .slerp import SlerpMergeAlgorithm
+from .slerp import SlerpForCausalLM, SlerpMergeAlgorithm
```
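The `slerp` module now also exports `SlerpForCausalLM`. For reference, spherical linear interpolation of flattened parameter vectors follows slerp(t; v0, v1) = [sin((1−t)ω)·v0 + sin(tω)·v1] / sin(ω), where ω is the angle between the normalized vectors. A minimal sketch, not the package implementation:

```python
import torch

def slerp(v0: torch.Tensor, v1: torch.Tensor, t: float, eps: float = 1e-8) -> torch.Tensor:
    # angle between the two directions
    v0_u = v0 / (v0.norm() + eps)
    v1_u = v1 / (v1.norm() + eps)
    omega = torch.acos(torch.clamp(torch.dot(v0_u, v1_u), -1.0, 1.0))
    if omega.abs() < eps:  # nearly parallel: fall back to linear interpolation
        return (1 - t) * v0 + t * v1
    return (torch.sin((1 - t) * omega) * v0 + torch.sin(t * omega) * v1) / torch.sin(omega)

merged = slerp(torch.randn(8), torch.randn(8), t=0.5)
```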