PyPI - fusion-bench - Versions diffs - 0.2.23__py3-none-any.whl → 0.2.24__py3-none-any.whl - Mend

fusion-bench 0.2.23__py3-none-any.whl → 0.2.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

fusion_bench/method/__init__.py CHANGED Viewed

@@ -26,9 +26,12 @@ _import_structure = {
     "linear": [
         "ExPOAlgorithm",
         "ExPOAlgorithmForLlama",
+        "SimpleAverageForCausalLM",
         "SimpleAverageForLlama",
+        "TaskArithmeticForCausalLM",
         "TaskArithmeticForLlama",
         "LinearInterpolationAlgorithm",
+        "TiesMergingForCausalLM",
     ],
     "slerp": ["SlerpMergeAlgorithm", "SlerpForCausalLM"],
     "simple_average": ["SimpleAverageAlgorithm"],
@@ -72,6 +75,7 @@ _import_structure = {
     "fw_merging": ["FrankWolfeHardAlgorithm", "FrankWolfeSoftAlgorithm"],
     "tall_mask": ["TallMaskTaskArithmeticAlgorithm"],
     "model_stock": ["ModelStock"],
+    "wudi": ["wudi_merging", "WUDIMerging"],
     # plug-and-play model merging methods
     "concrete_subspace": [
         "ConcreteTaskArithmeticAlgorithmForCLIP",
@@ -184,8 +188,11 @@ if TYPE_CHECKING:
         ExPOAlgorithm,
         ExPOAlgorithmForLlama,
         LinearInterpolationAlgorithm,
+        SimpleAverageForCausalLM,
         SimpleAverageForLlama,
+        TaskArithmeticForCausalLM,
         TaskArithmeticForLlama,
+        TiesMergingForCausalLM,
     )
     from .lm_finetune import *
     from .mixture_of_experts import (
@@ -238,6 +245,7 @@ if TYPE_CHECKING:
         FlanT5WeightEnsemblingMoEAlgorithm,
     )
     from .weighted_average import WeightedAverageAlgorithm, WeightedAverageForLLama
+    from .wudi import WUDIMerging, wudi_merging
 else:
     sys.modules[__name__] = LazyImporter(

fusion_bench/method/ensemble.py CHANGED Viewed

@@ -17,7 +17,21 @@ from fusion_bench.models.wrappers.ensemble import (
 log = logging.getLogger(__name__)
+@auto_register_config
 class SimpleEnsembleAlgorithm(BaseAlgorithm):
+    def __init__(
+        self,
+        device_map: Optional[Mapping[int, Union[str, torch.device]]] = None,
+        **kwargs,
+    ):
+        """
+        Initializes the SimpleEnsembleAlgorithm with an optional device map.
+        Args:
+            device_map (Optional[Mapping[int, Union[str, torch.device]]], optional): A mapping from model index to device. Defaults to None.
+        """
+        super().__init__(**kwargs)
     @torch.no_grad()
     def run(self, modelpool: BaseModelPool | List[nn.Module]) -> EnsembleModule:
         """
@@ -30,9 +44,10 @@ class SimpleEnsembleAlgorithm(BaseAlgorithm):
             EnsembleModule: The ensembled model.
         """
         log.info(f"Running ensemble algorithm with {len(modelpool)} models")
         models = [modelpool.load_model(m) for m in modelpool.model_names]
-        ensemble = EnsembleModule(models=models)
+        log.info("creating ensemble module")
+        ensemble = EnsembleModule(models=models, device_map=self.device_map)
         return ensemble

fusion_bench/method/linear/__init__.py CHANGED Viewed

@@ -2,5 +2,9 @@
 from .expo import ExPOAlgorithm
 from .linear_interpolation import LinearInterpolationAlgorithm
 from .llama_expo import ExPOAlgorithmForLlama
-from .simple_average_for_llama import SimpleAverageForLlama
-from .task_arithmetic_for_llama import TaskArithmeticForLlama
+from .simple_average_for_causallm import SimpleAverageForCausalLM, SimpleAverageForLlama
+from .task_arithmetic_for_causallm import (
+    TaskArithmeticForCausalLM,
+    TaskArithmeticForLlama,
+)
+from .ties_merging_for_causallm import TiesMergingForCausalLM

fusion_bench/method/linear/{simple_average_for_llama.py → simple_average_for_causallm.py} RENAMED Viewed

@@ -18,16 +18,16 @@ log = get_rankzero_logger(__name__)
 @auto_register_config
-class SimpleAverageForLlama(BaseAlgorithm):
+class SimpleAverageForCausalLM(BaseAlgorithm):
     R"""
     A simple averaging algorithm for LLama models. If `merge_backbone` is set to `True`, the backbone of the model will be averaged and the rest of the model will be loaded from the pre-trained model.
     Examples:
-        The following example demonstrates how to use the `SimpleAverageForLlama` algorithm to merge Mistral models.
+        The following example demonstrates how to use the `SimpleAverageForCausalLM` algorithm to merge Mistral models.
         ```bash
         fusion_bench \
-            method=linear/simple_average_for_llama \
+            method=linear/simple_average_for_causallm \
             method.model_save_path=outputs/simle_mixtral_exp_v4/simple_average \
             modelpool=CausalLMPool/simle_mixtral_exp_v4.yaml
         ```
@@ -35,7 +35,7 @@ class SimpleAverageForLlama(BaseAlgorithm):
     def __init__(
         self,
-        merge_backbone: bool,
+        merge_backbone: bool = False,
         model_save_path: Optional[str] = None,
         show_pbar: bool = False,
         **kwargs,
@@ -81,3 +81,7 @@ class SimpleAverageForLlama(BaseAlgorithm):
                 with open(os.path.join(self.model_save_path, "README.md"), "w") as f:
                     f.write(model_card_str)
         return model
+SimpleAverageForLlama = SimpleAverageForCausalLM
+"""Alias for SimpleAverageForCausalLM"""

fusion_bench/method/linear/{task_arithmetic_for_llama.py → task_arithmetic_for_causallm.py} RENAMED Viewed

@@ -1,22 +1,27 @@
 import logging
+import os
 from typing import Dict, List, Mapping, Optional, TypeVar, Union  # noqa: F401
 from typing_extensions import override
-from fusion_bench import timeit_context
+from fusion_bench import auto_register_config, timeit_context
 from fusion_bench.method import TaskArithmeticAlgorithm
 from fusion_bench.mixins.simple_profiler import SimpleProfilerMixin
 from fusion_bench.modelpool import CausalLMBackbonePool, CausalLMPool
+from fusion_bench.models.hf_utils import create_default_model_card
 log = logging.getLogger(__name__)
-class TaskArithmeticForLlama(TaskArithmeticAlgorithm, SimpleProfilerMixin):
+@auto_register_config
+class TaskArithmeticForCausalLM(
+    TaskArithmeticAlgorithm,
+):
     R"""
     Examples:
     fusion_bench \
-        method=linear/task_arithmetic_for_llama \
+        method=linear/task_arithmetic_for_causallm \
             method.scaling_factor=0.3 \
         method.model_save_path=outputs/simle_mixtral_exp_v4/task_arithmetic_0.3 \
         modelpool=CausalLMPool/simle_mixtral_exp_v4.yaml
@@ -29,18 +34,14 @@ class TaskArithmeticForLlama(TaskArithmeticAlgorithm, SimpleProfilerMixin):
     def __init__(
         self,
         scaling_factor: float,
-        merge_backbone: bool,
+        merge_backbone: bool = False,
         model_save_path: Optional[str] = None,
+        **kwargs,
     ):
-        self.merge_backbone = merge_backbone
-        self.model_save_path = model_save_path
-        super().__init__(scaling_factor=scaling_factor)
+        super().__init__(scaling_factor=scaling_factor, **kwargs)
     @override
     def run(self, modelpool: CausalLMPool):
-        if self.model_save_path:
-            tokenizer = modelpool.load_tokenizer()
         if self.merge_backbone:
             assert modelpool.has_pretrained
             backbone_modelpool = CausalLMBackbonePool(**modelpool.config)
@@ -52,6 +53,15 @@ class TaskArithmeticForLlama(TaskArithmeticAlgorithm, SimpleProfilerMixin):
         if self.model_save_path is not None:
             with timeit_context(f"Saving the model to {self.model_save_path}"):
-                tokenizer.save_pretrained(self.model_save_path)
-                model.save_pretrained(self.model_save_path)
+                description = f"Merged model using task arithmetic with scaling factor {self.scaling_factor}."
+                modelpool.save_model(
+                    model=model,
+                    path=self.model_save_path,
+                    save_tokenizer=True,
+                    algorithm_config=self.config,
+                    description=description,
+                )
         return model
+TaskArithmeticForLlama = TaskArithmeticForCausalLM

fusion_bench/method/linear/ties_merging_for_causallm.py ADDED Viewed

@@ -0,0 +1,70 @@
+import logging
+import os
+from typing import Dict, List, Mapping, Optional, TypeVar, Union  # noqa: F401
+from typing_extensions import override
+from fusion_bench import auto_register_config, timeit_context
+from fusion_bench.method import TiesMergingAlgorithm
+from fusion_bench.mixins.simple_profiler import SimpleProfilerMixin
+from fusion_bench.modelpool import CausalLMBackbonePool, CausalLMPool
+from fusion_bench.models.hf_utils import create_default_model_card
+log = logging.getLogger(__name__)
+@auto_register_config
+class TiesMergingForCausalLM(
+    TiesMergingAlgorithm,
+):
+    R"""
+    TIES merging algorithm for CausalLM models.
+    This class extends the TiesMergingAlgorithm to work specifically with CausalLM models,
+    providing model saving capabilities and backbone merging support.
+    """
+    _config_mapping = TiesMergingAlgorithm._config_mapping | {
+        "merge_backbone": "merge_backbone",
+    }
+    def __init__(
+        self,
+        scaling_factor: float,
+        threshold: float,
+        remove_keys: List[str] = None,
+        merge_func: str = "sum",
+        merge_backbone: bool = False,
+        model_save_path: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            scaling_factor=scaling_factor,
+            threshold=threshold,
+            remove_keys=remove_keys,
+            merge_func=merge_func,
+            **kwargs,
+        )
+    @override
+    def run(self, modelpool: CausalLMPool):
+        if self.merge_backbone:
+            assert modelpool.has_pretrained
+            backbone_modelpool = CausalLMBackbonePool(**modelpool.config)
+            model = modelpool.load_model("_pretrained_")
+            backbone_model = super().run(backbone_modelpool)
+            model.model.layers = backbone_model
+        else:
+            model = super().run(modelpool)
+        if self.model_save_path is not None:
+            with timeit_context(f"Saving the model to {self.model_save_path}"):
+                description = f"Merged model using TIES merging with scaling factor {self.scaling_factor} and threshold {self.threshold}."
+                modelpool.save_model(
+                    model=model,
+                    path=self.model_save_path,
+                    save_tokenizer=True,
+                    algorithm_config=self.config,
+                    description=description,
+                )
+        return model

fusion_bench/method/simple_average.py CHANGED Viewed

@@ -89,7 +89,7 @@ class SimpleAverageAlgorithm(
             modelpool = BaseModelPool(modelpool)
         log.info(
-            f"Fusing models using simple average on {len(modelpool.model_names)} models."
+            f"Fusing models using simple average on {len(modelpool.model_names)} models. "
             f"models: {modelpool.model_names}"
         )
         sd: Optional[StateDictType] = None
@@ -119,7 +119,7 @@ class SimpleAverageAlgorithm(
         if isinstance(forward_model, LazyStateDict):
             # if the model is a LazyStateDict, convert it to an empty module
-            forward_model = forward_model.meta_module.to_empty(
+            forward_model = deepcopy(forward_model.meta_module).to_empty(
                 device=forward_model._device
             )
         result = forward_model.load_state_dict(sd, strict=False)

fusion_bench/method/task_arithmetic/task_arithmetic.py CHANGED Viewed

@@ -6,11 +6,20 @@ http://arxiv.org/abs/2212.04089
 import logging
 from copy import deepcopy
-from typing import Dict, List, Mapping, Optional, TypeVar, Union  # noqa: F401
+from typing import (  # noqa: F401
+    TYPE_CHECKING,
+    Dict,
+    List,
+    Mapping,
+    Optional,
+    TypeVar,
+    Union,
+)
 import torch
 from torch import nn
+from fusion_bench import LazyStateDict
 from fusion_bench.method.base_algorithm import BaseAlgorithm
 from fusion_bench.mixins import SimpleProfilerMixin, auto_register_config
 from fusion_bench.modelpool import BaseModelPool
@@ -21,6 +30,8 @@ from fusion_bench.utils.state_dict_arithmetic import (
 )
 from fusion_bench.utils.type import StateDictType, TorchModelType
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel
 log = logging.getLogger(__name__)
@@ -125,25 +136,39 @@ class TaskArithmeticAlgorithm(
             with self.profile("merge weights"):
                 if task_vector is None:
                     task_vector = state_dict_sub(
-                        model.state_dict(keep_vars=True),
-                        pretrained_model.state_dict(keep_vars=True),
+                        model.state_dict(),
+                        pretrained_model.state_dict(),
                     )
                 else:
                     task_vector = state_dict_add(
                         task_vector,
                         state_dict_sub(
-                            model.state_dict(keep_vars=True),
-                            pretrained_model.state_dict(keep_vars=True),
+                            model.state_dict(),
+                            pretrained_model.state_dict(),
                         ),
                     )
         with self.profile("merge weights"):
             # scale the task vector
             task_vector = state_dict_mul(task_vector, self.config.scaling_factor)
             # add the task vector to the pretrained model
-            state_dict = state_dict_add(
-                pretrained_model.state_dict(keep_vars=True), task_vector
-            )
+            state_dict = state_dict_add(pretrained_model.state_dict(), task_vector)
         self.print_profile_summary()
-        pretrained_model.load_state_dict(state_dict)
-        return pretrained_model
+        # apply state dict to model
+        if isinstance(pretrained_model, nn.Module):
+            model = pretrained_model
+            model.load_state_dict(state_dict)
+        elif isinstance(pretrained_model, LazyStateDict):
+            model = deepcopy(pretrained_model.meta_module)
+            model = model.to_empty(device=pretrained_model._device)
+            result = model.load_state_dict(state_dict, strict=False)
+            if result.unexpected_keys:
+                raise ValueError(
+                    f"Unexpected keys in state dict: {result.unexpected_keys}"
+                )
+            if result.missing_keys:
+                log.warning(f"Missing keys in state dict: {result.missing_keys}")
+        else:
+            raise TypeError(f"Unsupported model type: {type(pretrained_model)}")
+        return model

fusion_bench/method/ties_merging/ties_merging.py CHANGED Viewed

@@ -9,11 +9,14 @@ Overview of Ties-Merging:
 """
 import logging
+from copy import deepcopy
 from typing import Any, Dict, List, Literal, Mapping, Union  # noqa: F401
 import torch
 from torch import Tensor, nn
+from transformers import PreTrainedModel
+from fusion_bench import LazyStateDict
 from fusion_bench.compat.modelpool import to_modelpool
 from fusion_bench.method import BaseAlgorithm
 from fusion_bench.mixins import SimpleProfilerMixin, auto_register_config
@@ -98,12 +101,25 @@ class TiesMergingAlgorithm(
                 merge_func=merge_func,
             )
             merged_check = flat_ptm + scaling_factor * merged_tv
-            merged_state_dict = vector_to_state_dict(
+            state_dict = vector_to_state_dict(
                 merged_check, ptm_check, remove_keys=remove_keys
             )
-            # Load the merged state dict into the pretrained model
-            pretrained_model.load_state_dict(merged_state_dict)
         self.print_profile_summary()
-        return pretrained_model
+        # apply state dict to model
+        if isinstance(pretrained_model, nn.Module):
+            model = pretrained_model
+            model.load_state_dict(state_dict)
+        elif isinstance(pretrained_model, LazyStateDict):
+            model = deepcopy(pretrained_model.meta_module)
+            model = model.to_empty(device=pretrained_model._device)
+            result = model.load_state_dict(state_dict, strict=False)
+            if result.unexpected_keys:
+                raise ValueError(
+                    f"Unexpected keys in state dict: {result.unexpected_keys}"
+                )
+            if result.missing_keys:
+                log.warning(f"Missing keys in state dict: {result.missing_keys}")
+        else:
+            raise TypeError(f"Unsupported model type: {type(pretrained_model)}")
+        return model

fusion_bench/method/wudi/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+from .wudi import WUDIMerging, wudi_merging

fusion_bench/method/wudi/wudi.py ADDED Viewed

@@ -0,0 +1,105 @@
+"""
+Whoever Started the Interference Should End It:  Guiding Data-Free Model Merging via Task Vectors
+Arxiv: http://arxiv.org/abs/2503.08099
+"""
+from typing import List
+import torch
+from tqdm import tqdm
+from fusion_bench import BaseAlgorithm, BaseModelPool, auto_register_config
+from fusion_bench.mixins import LightningFabricMixin
+from fusion_bench.utils import timeit_context
+from fusion_bench.utils.state_dict_arithmetic import state_dict_add, state_dict_sub
+def wudi_merging(
+    task_vectors: List[torch.Tensor],
+    accelerator="cuda",
+    iter_num: int = 300,
+    exclude_keys: List[str] = None,
+):
+    exclude_keys = [] if exclude_keys is None else exclude_keys
+    with timeit_context("WUDI Merging"):
+        new_vector = {}
+        for key in tqdm(task_vectors[0], desc="WUDI Merging", leave=False):
+            tqdm.write(f"key: {key}")
+            original_device = task_vectors[0][key].device
+            tvs = torch.stack(
+                [
+                    task_vector[key].to(device=accelerator, non_blocking=True)
+                    for task_vector in task_vectors
+                ]
+            )
+            num_tvs = len(tvs)
+            new_vector[key] = torch.nn.Parameter(torch.sum(tvs, dim=0))
+            if len(task_vectors[0][key].shape) == 2 and key not in exclude_keys:
+                optimizer = torch.optim.Adam([new_vector[key]], lr=1e-5, weight_decay=0)
+                l2_norms = torch.square(
+                    torch.norm(tvs.reshape(tvs.shape[0], -1), p=2, dim=-1)
+                )
+                for i in tqdm(
+                    range(iter_num),
+                ):
+                    disturbing_vectors = new_vector[key].unsqueeze(0) - tvs
+                    product = torch.matmul(disturbing_vectors, tvs.transpose(1, 2))
+                    loss = torch.sum(
+                        torch.square(product) / l2_norms.unsqueeze(-1).unsqueeze(-1)
+                    )
+                    optimizer.zero_grad()
+                    loss.backward()
+                    optimizer.step()
+            else:
+                new_vector[key] = new_vector[key] / num_tvs
+            new_vector[key] = new_vector[key].to(
+                device=original_device, non_blocking=True
+            )
+    return new_vector
+@auto_register_config
+class WUDIMerging(
+    LightningFabricMixin,
+    BaseAlgorithm,
+):
+    """
+    Whoever Started the Interference Should End It:  Guiding Data-Free Model Merging via Task Vectors
+    """
+    def __init__(
+        self,
+        iter_num: int,
+        exclude_keys: List[str] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+    def run(self, modelpool: BaseModelPool):
+        # load the pretrained model and the task vectors of all the finetuned models
+        with torch.no_grad():
+            pretrained_model = modelpool.load_pretrained_model()
+            task_vectors = []
+            for model_name in modelpool.model_names:
+                finetuned_model = modelpool.load_model(model_name)
+                task_vectors.append(
+                    state_dict_sub(
+                        finetuned_model.state_dict(), pretrained_model.state_dict()
+                    )
+                )
+                del finetuned_model  # free memory
+        merged_tv = wudi_merging(
+            task_vectors,
+            accelerator=self.fabric.device,
+            iter_num=self.iter_num,
+            exclude_keys=self.exclude_keys,
+        )
+        pretrained_model.load_state_dict(
+            state_dict_add(pretrained_model.state_dict(), merged_tv)
+        )
+        return pretrained_model

fusion_bench/mixins/lightning_fabric.py CHANGED Viewed

@@ -100,6 +100,10 @@ class LightningFabricMixin:
             self.setup_lightning_fabric(getattr(self, "config", DictConfig({})))
         return self._fabric_instance
+    @fabric.setter
+    def fabric(self, instance: L.Fabric):
+        self._fabric_instance = instance
     @property
     def log_dir(self):
         """

fusion-bench 0.2.23__py3-none-any.whl → 0.2.24__py3-none-any.whl

fusion-bench 0.2.23__py3-none-any.whl → 0.2.24__py3-none-any.whl