fusion-bench 0.2.22__py3-none-any.whl → 0.2.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/__init__.py +4 -0
- fusion_bench/compat/method/__init__.py +5 -2
- fusion_bench/compat/method/base_algorithm.py +3 -2
- fusion_bench/compat/modelpool/base_pool.py +3 -3
- fusion_bench/compat/taskpool/clip_image_classification.py +1 -1
- fusion_bench/dataset/gpt2_glue.py +1 -1
- fusion_bench/method/__init__.py +12 -2
- fusion_bench/method/analysis/task_vector_cos_similarity.py +95 -12
- fusion_bench/method/analysis/task_vector_violin_plot.py +160 -52
- fusion_bench/method/bitdelta/bitdelta.py +7 -23
- fusion_bench/method/ensemble.py +17 -2
- fusion_bench/method/expert_sparsity/mixtral/dynamic_skipping.py +2 -0
- fusion_bench/method/expert_sparsity/mixtral/layer_wise_pruning.py +2 -0
- fusion_bench/method/expert_sparsity/mixtral/progressive_pruning.py +2 -0
- fusion_bench/method/linear/__init__.py +6 -2
- fusion_bench/method/linear/{simple_average_for_llama.py → simple_average_for_causallm.py} +8 -4
- fusion_bench/method/linear/{task_arithmetic_for_llama.py → task_arithmetic_for_causallm.py} +22 -12
- fusion_bench/method/linear/ties_merging_for_causallm.py +70 -0
- fusion_bench/method/model_stock/__init__.py +1 -0
- fusion_bench/method/model_stock/model_stock.py +309 -0
- fusion_bench/method/regmean/clip_regmean.py +3 -6
- fusion_bench/method/regmean/regmean.py +27 -56
- fusion_bench/method/regmean/utils.py +56 -0
- fusion_bench/method/regmean_plusplus/regmean_plusplus.py +21 -60
- fusion_bench/method/simple_average.py +2 -2
- fusion_bench/method/slerp/__init__.py +1 -1
- fusion_bench/method/slerp/slerp.py +110 -14
- fusion_bench/method/task_arithmetic/task_arithmetic.py +35 -10
- fusion_bench/method/ties_merging/ties_merging.py +22 -6
- fusion_bench/method/we_moe/flan_t5_we_moe.py +9 -20
- fusion_bench/method/wudi/__init__.py +1 -0
- fusion_bench/method/wudi/wudi.py +105 -0
- fusion_bench/mixins/clip_classification.py +26 -6
- fusion_bench/mixins/lightning_fabric.py +4 -0
- fusion_bench/mixins/serialization.py +40 -83
- fusion_bench/modelpool/base_pool.py +1 -1
- fusion_bench/modelpool/causal_lm/causal_lm.py +285 -44
- fusion_bench/modelpool/seq2seq_lm/modelpool.py +146 -0
- fusion_bench/models/hf_clip.py +4 -0
- fusion_bench/models/hf_utils.py +10 -4
- fusion_bench/models/linearized/vision_model.py +6 -6
- fusion_bench/models/model_card_templates/default.md +8 -1
- fusion_bench/models/modeling_smile_mistral/__init__.py +1 -0
- fusion_bench/models/we_moe.py +8 -8
- fusion_bench/models/wrappers/ensemble.py +136 -7
- fusion_bench/scripts/cli.py +2 -2
- fusion_bench/taskpool/base_pool.py +99 -17
- fusion_bench/taskpool/clip_vision/taskpool.py +12 -5
- fusion_bench/taskpool/dummy.py +101 -13
- fusion_bench/taskpool/lm_eval_harness/taskpool.py +80 -0
- fusion_bench/taskpool/nyuv2_taskpool.py +28 -0
- fusion_bench/utils/__init__.py +1 -0
- fusion_bench/utils/data.py +6 -4
- fusion_bench/utils/devices.py +36 -11
- fusion_bench/utils/dtype.py +3 -2
- fusion_bench/utils/lazy_state_dict.py +85 -19
- fusion_bench/utils/packages.py +3 -3
- fusion_bench/utils/parameters.py +0 -2
- fusion_bench/utils/rich_utils.py +7 -3
- fusion_bench/utils/timer.py +92 -10
- {fusion_bench-0.2.22.dist-info → fusion_bench-0.2.24.dist-info}/METADATA +10 -3
- {fusion_bench-0.2.22.dist-info → fusion_bench-0.2.24.dist-info}/RECORD +77 -64
- fusion_bench_config/_get_started/llm_slerp.yaml +12 -0
- fusion_bench_config/method/ensemble/simple_ensemble.yaml +1 -0
- fusion_bench_config/method/linear/{simple_average_for_llama.yaml → simple_average_for_causallm.yaml} +1 -1
- fusion_bench_config/method/linear/task_arithmetic_for_causallm.yaml +4 -0
- fusion_bench_config/method/linear/ties_merging_for_causallm.yaml +13 -0
- fusion_bench_config/method/model_stock/model_stock.yaml +12 -0
- fusion_bench_config/method/slerp/slerp_lm.yaml +4 -0
- fusion_bench_config/method/wudi/wudi.yaml +4 -0
- fusion_bench_config/modelpool/CausalLMPool/{Qwen2.5-1.5B_math_and_coder.yaml → Qwen2.5-1.5B_math_and_code.yaml} +1 -2
- fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-1.5B_three_models.yaml +11 -0
- fusion_bench_config/modelpool/CausalLMPool/llama-7b_3-models_v1.yaml +11 -0
- fusion_bench_config/method/linear/task_arithmetic_for_llama.yaml +0 -4
- {fusion_bench-0.2.22.dist-info → fusion_bench-0.2.24.dist-info}/WHEEL +0 -0
- {fusion_bench-0.2.22.dist-info → fusion_bench-0.2.24.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.22.dist-info → fusion_bench-0.2.24.dist-info}/licenses/LICENSE +0 -0
- {fusion_bench-0.2.22.dist-info → fusion_bench-0.2.24.dist-info}/top_level.txt +0 -0
fusion_bench/method/ensemble.py
CHANGED
@@ -17,7 +17,21 @@ from fusion_bench.models.wrappers.ensemble import (
 log = logging.getLogger(__name__)
 
 
+@auto_register_config
 class SimpleEnsembleAlgorithm(BaseAlgorithm):
+    def __init__(
+        self,
+        device_map: Optional[Mapping[int, Union[str, torch.device]]] = None,
+        **kwargs,
+    ):
+        """
+        Initializes the SimpleEnsembleAlgorithm with an optional device map.
+
+        Args:
+            device_map (Optional[Mapping[int, Union[str, torch.device]]], optional): A mapping from model index to device. Defaults to None.
+        """
+        super().__init__(**kwargs)
+
     @torch.no_grad()
     def run(self, modelpool: BaseModelPool | List[nn.Module]) -> EnsembleModule:
         """
@@ -30,9 +44,10 @@ class SimpleEnsembleAlgorithm(BaseAlgorithm):
             EnsembleModule: The ensembled model.
         """
         log.info(f"Running ensemble algorithm with {len(modelpool)} models")
-
         models = [modelpool.load_model(m) for m in modelpool.model_names]
-        ensemble = EnsembleModule(models=models)
+
+        log.info("creating ensemble module")
+        ensemble = EnsembleModule(models=models, device_map=self.device_map)
         return ensemble
 
 
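In 0.2.24, `SimpleEnsembleAlgorithm` gains an optional `device_map` that is forwarded to `EnsembleModule`, so ensemble members can be pinned to specific devices. A minimal sketch of that call path, assuming the constructor shown in the diff; the toy `nn.Linear` modules stand in for real checkpoints loaded from a modelpool:

```python
import torch
from torch import nn

from fusion_bench.models.wrappers.ensemble import EnsembleModule

# Toy members standing in for real fine-tuned models.
models = [nn.Linear(8, 2) for _ in range(3)]

# Mirrors the call added in SimpleEnsembleAlgorithm.run():
#     EnsembleModule(models=models, device_map=self.device_map)
# device_map is assumed to map model index -> device; None keeps the default placement.
ensemble = EnsembleModule(models=models, device_map={0: "cpu", 1: "cpu", 2: "cpu"})
output = ensemble(torch.randn(4, 8))
```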
fusion_bench/method/expert_sparsity/mixtral/dynamic_skipping.py
CHANGED
@@ -23,6 +23,7 @@ from transformers import MixtralForCausalLM
 from transformers.models.mixtral.modeling_mixtral import MixtralForCausalLM
 
 import fusion_bench as fb
+from fusion_bench import auto_register_config
 from fusion_bench.method.expert_sparsity.utils.calibration_data import (
     build_calib_loader,
 )
@@ -97,6 +98,7 @@ def dynamic_skipping(
     return model, (res_median, res_mean)
 
 
+@auto_register_config
 class DynamicSkippingPruningForMixtral(
     fb.BaseAlgorithm,
     fb.mixins.LightningFabricMixin,
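All three Mixtral expert-sparsity algorithms (dynamic skipping, layer-wise pruning, progressive pruning) are now decorated with `@auto_register_config`. Judging from the other diffs in this release, where manual `_config_mapping` entries and `self.x = x` assignments are dropped once the decorator is applied (see `clip_regmean.py` below), the decorator records `__init__` keyword arguments in the algorithm config and exposes them as attributes. A rough sketch of the pattern with hypothetical parameters; the decorator's exact behaviour is inferred here, not documented in this diff:

```python
from fusion_bench import BaseAlgorithm, auto_register_config


@auto_register_config
class MyPruningAlgorithm(BaseAlgorithm):
    # Hypothetical example: the decorator is expected to register these kwargs in the
    # algorithm config and make them available as self.num_experts / self.calib_split,
    # so no manual _config_mapping or self.x = x boilerplate is needed.
    def __init__(self, num_experts: int = 8, calib_split: str = "train", **kwargs):
        super().__init__(**kwargs)
```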
fusion_bench/method/expert_sparsity/mixtral/layer_wise_pruning.py
CHANGED
@@ -22,6 +22,7 @@ from transformers import MixtralForCausalLM
 from transformers.models.mixtral.modeling_mixtral import MixtralDecoderLayer
 
 import fusion_bench as fb
+from fusion_bench import auto_register_config
 from fusion_bench.method.expert_sparsity.utils.calibration_data import (
     build_calib_loader,
 )
@@ -81,6 +82,7 @@ def layerwise_pruning(
     return model, (global_loss_history,)
 
 
+@auto_register_config
 class LayerWisePruningForMixtral(
     fb.BaseAlgorithm,
     fb.mixins.LightningFabricMixin,
fusion_bench/method/expert_sparsity/mixtral/progressive_pruning.py
CHANGED
@@ -20,6 +20,7 @@ from tqdm import tqdm
 from transformers import MixtralForCausalLM
 
 import fusion_bench as fb
+from fusion_bench import auto_register_config
 from fusion_bench.method.expert_sparsity.utils.calibration_data import (
     build_calib_loader,
 )
@@ -95,6 +96,7 @@ def progressive_pruning(
     return model, (global_loss_history,)
 
 
+@auto_register_config
 class ProgressivePruningForMixtral(
     fb.BaseAlgorithm,
     fb.mixins.LightningFabricMixin,
fusion_bench/method/linear/__init__.py
CHANGED
@@ -2,5 +2,9 @@
 from .expo import ExPOAlgorithm
 from .linear_interpolation import LinearInterpolationAlgorithm
 from .llama_expo import ExPOAlgorithmForLlama
-from .simple_average_for_llama import SimpleAverageForLlama
-from .task_arithmetic_for_llama import TaskArithmeticForLlama
+from .simple_average_for_causallm import SimpleAverageForCausalLM, SimpleAverageForLlama
+from .task_arithmetic_for_causallm import (
+    TaskArithmeticForCausalLM,
+    TaskArithmeticForLlama,
+)
+from .ties_merging_for_causallm import TiesMergingForCausalLM
fusion_bench/method/linear/{simple_average_for_llama.py → simple_average_for_causallm.py}
CHANGED
@@ -18,16 +18,16 @@ log = get_rankzero_logger(__name__)
 
 
 @auto_register_config
-class SimpleAverageForLlama(BaseAlgorithm):
+class SimpleAverageForCausalLM(BaseAlgorithm):
     R"""
     A simple averaging algorithm for LLama models. If `merge_backbone` is set to `True`, the backbone of the model will be averaged and the rest of the model will be loaded from the pre-trained model.
 
     Examples:
-    The following example demonstrates how to use the `SimpleAverageForLlama` algorithm to merge Mistral models.
+    The following example demonstrates how to use the `SimpleAverageForCausalLM` algorithm to merge Mistral models.
 
     ```bash
     fusion_bench \
-        method=linear/simple_average_for_llama \
+        method=linear/simple_average_for_causallm \
         method.model_save_path=outputs/simle_mixtral_exp_v4/simple_average \
         modelpool=CausalLMPool/simle_mixtral_exp_v4.yaml
     ```
@@ -35,7 +35,7 @@ class SimpleAverageForLlama(BaseAlgorithm):
 
     def __init__(
         self,
-        merge_backbone: bool,
+        merge_backbone: bool = False,
         model_save_path: Optional[str] = None,
         show_pbar: bool = False,
         **kwargs,
@@ -81,3 +81,7 @@ class SimpleAverageForLlama(BaseAlgorithm):
         with open(os.path.join(self.model_save_path, "README.md"), "w") as f:
             f.write(model_card_str)
         return model
+
+
+SimpleAverageForLlama = SimpleAverageForCausalLM
+"""Alias for SimpleAverageForCausalLM"""
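The rename from `*_for_llama` to `*_for_causallm` is backwards compatible: the old class name is kept as a module-level alias, and both names are re-exported from `fusion_bench.method.linear` (see the `__init__.py` hunk above). A minimal check of that, assuming the import paths shown in the diff:

```python
from fusion_bench.method.linear import (
    SimpleAverageForCausalLM,
    SimpleAverageForLlama,
)

# The old name is a plain alias, so existing imports and configs keep working.
assert SimpleAverageForLlama is SimpleAverageForCausalLM
```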
fusion_bench/method/linear/{task_arithmetic_for_llama.py → task_arithmetic_for_causallm.py}
CHANGED
@@ -1,22 +1,27 @@
 import logging
+import os
 from typing import Dict, List, Mapping, Optional, TypeVar, Union  # noqa: F401
 
 from typing_extensions import override
 
-from fusion_bench import timeit_context
+from fusion_bench import auto_register_config, timeit_context
 from fusion_bench.method import TaskArithmeticAlgorithm
 from fusion_bench.mixins.simple_profiler import SimpleProfilerMixin
 from fusion_bench.modelpool import CausalLMBackbonePool, CausalLMPool
+from fusion_bench.models.hf_utils import create_default_model_card
 
 log = logging.getLogger(__name__)
 
 
-class TaskArithmeticForLlama(TaskArithmeticAlgorithm, SimpleProfilerMixin):
+@auto_register_config
+class TaskArithmeticForCausalLM(
+    TaskArithmeticAlgorithm,
+):
     R"""
     Examples:
 
     fusion_bench \
-        method=linear/task_arithmetic_for_llama \
+        method=linear/task_arithmetic_for_causallm \
         method.scaling_factor=0.3 \
         method.model_save_path=outputs/simle_mixtral_exp_v4/task_arithmetic_0.3 \
         modelpool=CausalLMPool/simle_mixtral_exp_v4.yaml
@@ -29,18 +34,14 @@ class TaskArithmeticForLlama(TaskArithmeticAlgorithm, SimpleProfilerMixin):
     def __init__(
         self,
         scaling_factor: float,
-        merge_backbone: bool,
+        merge_backbone: bool = False,
         model_save_path: Optional[str] = None,
+        **kwargs,
     ):
-        self.merge_backbone = merge_backbone
-        self.model_save_path = model_save_path
-        super().__init__(scaling_factor=scaling_factor)
+        super().__init__(scaling_factor=scaling_factor, **kwargs)
 
     @override
     def run(self, modelpool: CausalLMPool):
-        if self.model_save_path:
-            tokenizer = modelpool.load_tokenizer()
-
         if self.merge_backbone:
             assert modelpool.has_pretrained
             backbone_modelpool = CausalLMBackbonePool(**modelpool.config)
@@ -52,6 +53,15 @@ class TaskArithmeticForLlama(TaskArithmeticAlgorithm, SimpleProfilerMixin):
 
         if self.model_save_path is not None:
             with timeit_context(f"Saving the model to {self.model_save_path}"):
-                model.save_pretrained(self.model_save_path)
-                tokenizer.save_pretrained(self.model_save_path)
+                description = f"Merged model using task arithmetic with scaling factor {self.scaling_factor}."
+                modelpool.save_model(
+                    model=model,
+                    path=self.model_save_path,
+                    save_tokenizer=True,
+                    algorithm_config=self.config,
+                    description=description,
+                )
         return model
+
+
+TaskArithmeticForLlama = TaskArithmeticForCausalLM
fusion_bench/method/linear/ties_merging_for_causallm.py
ADDED
@@ -0,0 +1,70 @@
+import logging
+import os
+from typing import Dict, List, Mapping, Optional, TypeVar, Union  # noqa: F401
+
+from typing_extensions import override
+
+from fusion_bench import auto_register_config, timeit_context
+from fusion_bench.method import TiesMergingAlgorithm
+from fusion_bench.mixins.simple_profiler import SimpleProfilerMixin
+from fusion_bench.modelpool import CausalLMBackbonePool, CausalLMPool
+from fusion_bench.models.hf_utils import create_default_model_card
+
+log = logging.getLogger(__name__)
+
+
+@auto_register_config
+class TiesMergingForCausalLM(
+    TiesMergingAlgorithm,
+):
+    R"""
+    TIES merging algorithm for CausalLM models.
+
+    This class extends the TiesMergingAlgorithm to work specifically with CausalLM models,
+    providing model saving capabilities and backbone merging support.
+    """
+
+    _config_mapping = TiesMergingAlgorithm._config_mapping | {
+        "merge_backbone": "merge_backbone",
+    }
+
+    def __init__(
+        self,
+        scaling_factor: float,
+        threshold: float,
+        remove_keys: List[str] = None,
+        merge_func: str = "sum",
+        merge_backbone: bool = False,
+        model_save_path: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            scaling_factor=scaling_factor,
+            threshold=threshold,
+            remove_keys=remove_keys,
+            merge_func=merge_func,
+            **kwargs,
+        )
+
+    @override
+    def run(self, modelpool: CausalLMPool):
+        if self.merge_backbone:
+            assert modelpool.has_pretrained
+            backbone_modelpool = CausalLMBackbonePool(**modelpool.config)
+            model = modelpool.load_model("_pretrained_")
+            backbone_model = super().run(backbone_modelpool)
+            model.model.layers = backbone_model
+        else:
+            model = super().run(modelpool)
+
+        if self.model_save_path is not None:
+            with timeit_context(f"Saving the model to {self.model_save_path}"):
+                description = f"Merged model using TIES merging with scaling factor {self.scaling_factor} and threshold {self.threshold}."
+                modelpool.save_model(
+                    model=model,
+                    path=self.model_save_path,
+                    save_tokenizer=True,
+                    algorithm_config=self.config,
+                    description=description,
+                )
+        return model
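`TiesMergingForCausalLM` ships with a matching Hydra config (`fusion_bench_config/method/linear/ties_merging_for_causallm.yaml`, listed above). A sketch of direct, programmatic use based on the constructor in this diff; the argument values are illustrative only:

```python
from fusion_bench.method.linear import TiesMergingForCausalLM

algorithm = TiesMergingForCausalLM(
    scaling_factor=0.5,    # scaling of the merged task vector (inherited from TiesMergingAlgorithm)
    threshold=20,          # TIES trimming threshold
    remove_keys=None,
    merge_func="sum",
    merge_backbone=False,  # True merges only model.model.layers and keeps the rest from the pretrained model
    model_save_path=None,  # set a path to save the merged model, tokenizer, and model card
)
# merged_model = algorithm.run(modelpool)  # modelpool: a CausalLMPool holding the models to merge
```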
fusion_bench/method/model_stock/__init__.py
ADDED
@@ -0,0 +1 @@
+from .model_stock import ModelStock
fusion_bench/method/model_stock/model_stock.py
ADDED
@@ -0,0 +1,309 @@
+import copy
+import logging
+import math
+import os
+from collections import OrderedDict
+from copy import deepcopy
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+import torch
+from omegaconf import DictConfig
+from torch import nn
+from transformers import PreTrainedModel
+
+import fusion_bench
+from fusion_bench import BaseAlgorithm, BaseModelPool
+from fusion_bench.mixins import SimpleProfilerMixin
+from fusion_bench.models import create_default_model_card
+from fusion_bench.utils.type import StateDictType
+
+log = logging.getLogger(__name__)
+
+EPS = 1e-8
+
+
+def compute_angle(
+    state_dict_1: StateDictType,
+    state_dict_2: StateDictType,
+    ref_state_dict: StateDictType,
+    ignore_keys: List[str] = [],
+    return_cos: bool = False,
+) -> Dict[str, float]:
+    """
+    Compute the angle between two state dictionaries relative to a reference state dictionary.
+
+    Args:
+        state_dict_1: First state dictionary
+        state_dict_2: Second state dictionary
+        ref_state_dict: Reference state dictionary (typically pre-trained model)
+        ignore_keys: Keys to ignore during computation
+        return_cos: If True, return cosine values instead of angles in degrees
+
+    Returns:
+        Dictionary mapping parameter names to angles (in degrees) or cosine values
+    """
+    # Remove the keys not used for CLIP fine-tuning (from the notebook example)
+
+    return_dict = OrderedDict()
+
+    with torch.no_grad():
+        for key in ref_state_dict:
+            if key in ignore_keys:
+                log.info(f"Ignoring key '{key}'")
+                continue
+
+            state_dict_1_val = state_dict_1[key]
+            state_dict_2_val = state_dict_2[key]
+            ref_val = ref_state_dict[key]
+
+            if not (state_dict_1_val.shape == state_dict_2_val.shape == ref_val.shape):
+                log.warning(
+                    f"Shape mismatch for key '{key}', ignored during merging: "
+                    f"({state_dict_1_val.shape}, {state_dict_2_val.shape}, {ref_val.shape})"
+                )
+                continue
+
+            vector1 = (state_dict_1_val - ref_val).clone().detach()
+            vector2 = (state_dict_2_val - ref_val).clone().detach()
+
+            vector1 = vector1.float()
+            vector2 = vector2.float()
+
+            cosine_val = torch.sum(vector1 * vector2) / (
+                math.sqrt(torch.sum(vector1**2) * torch.sum(vector2**2)) + EPS
+            )
+            cosine_val = torch.clamp(
+                cosine_val, min=-1.0, max=1.0
+            )  # Prevent nan from acos
+
+            if return_cos:
+                return_dict[key] = cosine_val.item()
+            else:
+                return_dict[key] = np.rad2deg(
+                    torch.acos(cosine_val).detach().cpu().item()
+                )
+
+    return return_dict
+
+
+def compute_ratio(angle_dict: Dict[str, float], k: int = 2) -> Dict[str, float]:
+    """
+    Compute interpolation ratios based on angles between fine-tuned models.
+
+    Args:
+        angle_dict: Dictionary mapping parameter names to angles in degrees
+        k: Number of fine-tuned models (default: 2)
+
+    Returns:
+        Dictionary mapping parameter names to interpolation ratios
+    """
+    ratio_dict = {}
+    for key in angle_dict.keys():
+        angle = np.deg2rad(angle_dict[key])
+        ratio_dict[key] = k * np.cos(angle) / ((k - 1) * np.cos(angle) + 1 + EPS)
+    return ratio_dict
+
+
+def merge_weights(
+    w1: StateDictType, w2: StateDictType, w0: StateDictType, ratio: Dict[str, float]
+) -> StateDictType:
+    """
+    Merge model weights using ModelStock formula.
+
+    Args:
+        w1: First fine-tuned model weights
+        w2: Second fine-tuned model weights
+        w0: Pre-trained model weights
+        ratio: Interpolation ratios for each parameter
+
+    Returns:
+        Merged model weights
+    """
+    # Compute w12 = (w1 + w2) / 2
+    w12 = {}
+    for key in w1.keys():
+        w12[key] = (w1[key].clone() + w2[key].clone()) / 2.0
+
+    # Apply ModelStock formula: w_merge = t * w12 + (1-t) * w0
+    w_merge = copy.deepcopy(w12)
+    for key, r in ratio.items():
+        w_merge[key] = w12[key].clone() * r + w0[key].clone() * (1.0 - r)
+
+    return w_merge
+
+
+@fusion_bench.auto_register_config
+class ModelStock(SimpleProfilerMixin, BaseAlgorithm):
+    """
+    Model Stock: All we need is just a few fine-tuned models
+
+    This method merges fine-tuned models by interpolating between their average
+    and a pre-trained anchor model, with interpolation ratios determined by
+    the angle between fine-tuned models in parameter space.
+    """
+
+    def __init__(
+        self,
+        ignore_keys: Optional[List[str]] = None,
+        model_save_path: Optional[str] = None,
+        model_save_kwargs: Optional[DictConfig] = None,
+        **kwargs,
+    ):
+        """
+        Initialize ModelStock algorithm.
+
+        Args:
+            ignore_keys: Additional parameter keys to ignore during merging
+        """
+        super().__init__(**kwargs)
+        if self.ignore_keys is None:
+            self.ignore_keys = []
+        if self.model_save_kwargs is None:
+            self.model_save_kwargs = DictConfig({})
+
+    def run(self, modelpool: BaseModelPool) -> nn.Module:
+        """
+        Run the ModelStock merging algorithm.
+
+        Args:
+            modelpool: Pool of models containing pre-trained and fine-tuned models
+
+        Returns:
+            Merged model
+        """
+        with self.profile("model loading"):
+            # Load the pre-trained model (anchor)
+            pretrained_model = modelpool.load_pretrained_model()
+            if isinstance(pretrained_model, fusion_bench.LazyStateDict):
+                assert (
+                    pretrained_model.meta_module is not None
+                ), "Meta module is not initialized"
+            pretrained_state_dict = pretrained_model.state_dict()
+
+            # Load all fine-tuned models
+            finetuned_models = []
+            finetuned_state_dicts = []
+
+            for model_name in modelpool.model_names:
+                model = modelpool.load_model(model_name)
+                finetuned_models.append(model)
+                finetuned_state_dicts.append(model.state_dict())
+                log.info(f"Loaded fine-tuned model: {model_name}")
+
+        if len(finetuned_models) < 2:
+            raise ValueError("ModelStock requires at least 2 fine-tuned models")
+
+        log.info(f"Running ModelStock with {len(finetuned_models)} fine-tuned models")
+
+        with self.profile("compute angles and ratios"):
+            if len(finetuned_models) == 2:
+                # Two fine-tuned models case
+                angle_dict = compute_angle(
+                    finetuned_state_dicts[0],
+                    finetuned_state_dicts[1],
+                    pretrained_state_dict,
+                    ignore_keys=self.ignore_keys,
+                )
+                ratio_dict = compute_ratio(angle_dict, k=2)
+
+                log.info(f"Computed angles for {len(angle_dict)} parameter groups")
+
+            else:
+                # N fine-tuned models case - compute average angle
+                angles_sum = {}
+                angles_count = {}
+
+                # Compute pairwise angles and average them
+                for i in range(len(finetuned_models)):
+                    for j in range(i + 1, len(finetuned_models)):
+                        angle_dict = compute_angle(
+                            finetuned_state_dicts[i],
+                            finetuned_state_dicts[j],
+                            pretrained_state_dict,
+                            ignore_keys=self.ignore_keys,
+                        )
+
+                        for key, angle in angle_dict.items():
+                            if key not in angles_sum:
+                                angles_sum[key] = 0
+                                angles_count[key] = 0
+                            angles_sum[key] += angle
+                            angles_count[key] += 1
+
+                # Average the angles
+                avg_angle_dict = {}
+                for key in angles_sum:
+                    avg_angle_dict[key] = angles_sum[key] / angles_count[key]
+
+                ratio_dict = compute_ratio(avg_angle_dict, k=len(finetuned_models))
+
+                log.info(
+                    f"Computed average angles for {len(avg_angle_dict)} parameter groups"
+                )
+
+        with self.profile("merge weights"):
+            if len(finetuned_models) == 2:
+                # Direct merging for two models
+                merged_state_dict = merge_weights(
+                    finetuned_state_dicts[0],
+                    finetuned_state_dicts[1],
+                    pretrained_state_dict,
+                    ratio_dict,
+                )
+            else:
+                # For N models, first compute the average of fine-tuned models
+                avg_finetuned_state_dict = {}
+                for key in finetuned_state_dicts[0].keys():
+                    avg_finetuned_state_dict[key] = torch.zeros_like(
+                        finetuned_state_dicts[0][key]
+                    )
+                    for state_dict in finetuned_state_dicts:
+                        avg_finetuned_state_dict[key] += state_dict[key]
+                    avg_finetuned_state_dict[key] /= len(finetuned_state_dicts)
+
+                # Apply ModelStock formula: w_H = t * w_avg + (1-t) * w_0
+                merged_state_dict = copy.deepcopy(avg_finetuned_state_dict)
+                for key, r in ratio_dict.items():
+                    merged_state_dict[key] = avg_finetuned_state_dict[
+                        key
+                    ].clone() * r + pretrained_state_dict[key].clone() * (1.0 - r)
+
+        # Load merged weights into the model
+        if isinstance(pretrained_model, nn.Module):
+            result_model = pretrained_model
+        elif isinstance(pretrained_model, fusion_bench.LazyStateDict):
+            result_model = deepcopy(pretrained_model.meta_module)
+            result_model.to(device=pretrained_model._device)
+        result = result_model.load_state_dict(merged_state_dict, strict=False)
+
+        if result.unexpected_keys:
+            raise RuntimeError(
+                f"Unexpected keys in state dict: {result.unexpected_keys}"
+            )
+        if result.missing_keys:
+            log.warning(f"Missing keys in state dict: {result.missing_keys}")
+
+        if self.model_save_path is not None:
+            with self.profile("model saving"):
+                modelpool.save_model(
+                    model, path=self.model_save_path, **self.model_save_kwargs
+                )
+                if isinstance(model, PreTrainedModel):
+                    modelcard = create_default_model_card(
+                        models=[
+                            modelpool.get_model_path(m)
+                            for m in modelpool.all_model_names
+                        ],
+                        description="Merged model using [Model Stock](https://arxiv.org/abs/2403.19522).",
+                        algorithm_config=self.config,
+                        modelpool_config=modelpool.config,
+                    )
+                    with open(
+                        os.path.join(self.model_save_path, "README.md"), "w"
+                    ) as f:
+                        f.write(modelcard)
+
+        self.print_profile_summary()
+        log.info("ModelStock merging completed successfully")
+        return result_model
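The interpolation ratio implemented in `compute_ratio` follows the Model Stock paper (arXiv:2403.19522): for an angle θ between the two fine-tuned task vectors and k fine-tuned models, t = k·cos(θ) / ((k−1)·cos(θ) + 1). A small, illustrative sanity check of that behaviour (not part of the package tests), importing from the new module above:

```python
from fusion_bench.method.model_stock.model_stock import compute_ratio

# Nearly parallel fine-tuned updates (theta ~ 0 degrees) give t close to 1:
# the merge stays near the average of the fine-tuned weights.
print(compute_ratio({"layer.weight": 5.0}, k=2))   # ~0.998

# Nearly orthogonal updates (theta ~ 90 degrees) give t close to 0:
# the merge falls back towards the pre-trained anchor.
print(compute_ratio({"layer.weight": 89.0}, k=2))  # ~0.034
```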
fusion_bench/method/regmean/clip_regmean.py
CHANGED
@@ -9,6 +9,7 @@ from torch.nn.modules import Module
 from torch.utils.data import DataLoader
 from tqdm.autonotebook import tqdm
 
+from fusion_bench import auto_register_config
 from fusion_bench.dataset.clip_dataset import CLIPDataset
 from fusion_bench.mixins import CLIPClassificationMixin
 
@@ -17,17 +18,13 @@ from .regmean import RegMeanAlgorithm
 log = logging.getLogger(__name__)
 
 
+@auto_register_config
 class RegMeanAlgorithmForCLIP(
-    RegMeanAlgorithm,
     CLIPClassificationMixin,
+    RegMeanAlgorithm,
 ):
-    _config_mapping = {
-        "_dataloader_kwargs": "dataloader_kwargs",
-    }
-
     def __init__(self, *, dataloader_kwargs: DictConfig, **kwargs):
         super().__init__(**kwargs)
-        self.dataloader_kwargs = dataloader_kwargs
 
     def on_regmean_start(self):
         self.setup_zero_shot_classification_head()