fusion-bench 0.2.12__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/compat/method/__init__.py +2 -0
- fusion_bench/compat/taskpool/flan_t5_glue_text_generation.py +4 -1
- fusion_bench/constants/clip_vision.py +22 -0
- fusion_bench/dataset/clip_dataset.py +10 -2
- fusion_bench/dataset/fer2013.py +1 -0
- fusion_bench/dataset/gsm8k.py +2 -2
- fusion_bench/method/__init__.py +10 -0
- fusion_bench/method/ada_svd/clip_vision.py +4 -1
- fusion_bench/method/adamerging/clip_task_wise_adamerging.py +1 -29
- fusion_bench/method/fisher_merging/fisher_merging.py +29 -17
- fusion_bench/method/gossip/__init__.py +3 -0
- fusion_bench/method/gossip/clip_layer_wise_gossip.py +43 -0
- fusion_bench/method/gossip/clip_task_wise_gossip.py +190 -0
- fusion_bench/method/gossip/entropy_loss.py +25 -0
- fusion_bench/method/gossip/flan_t5_layer_wise_gossip.py +388 -0
- fusion_bench/method/gossip/layer_wise_gossip.py +434 -0
- fusion_bench/method/gossip/min_norm_solvers.py +227 -0
- fusion_bench/method/gossip/task_wise_gossip.py +265 -0
- fusion_bench/method/gossip/utils.py +74 -0
- fusion_bench/method/isotropic_merging/__init__.py +1 -1
- fusion_bench/method/opcm/opcm.py +16 -7
- fusion_bench/method/pwe_moe/module.py +1 -1
- fusion_bench/method/pwe_moe/openclip_pwe_moe.py +476 -0
- fusion_bench/method/regmean/regmean.py +25 -17
- fusion_bench/method/smile_upscaling/__init__.py +1 -1
- fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py +46 -145
- fusion_bench/method/smile_upscaling/smile_qwen2_upscaling.py +229 -0
- fusion_bench/method/smile_upscaling/smile_upscaling.py +19 -346
- fusion_bench/method/surgery/clip_layer_wise_adamerging_surgery.py +7 -0
- fusion_bench/method/task_arithmetic/task_arithmetic.py +8 -6
- fusion_bench/method/ties_merging/ties_merging.py +36 -31
- fusion_bench/method/we_moe/we_moe.py +14 -15
- fusion_bench/mixins/__init__.py +6 -3
- fusion_bench/mixins/hydra_config.py +49 -0
- fusion_bench/mixins/openclip_classification.py +11 -0
- fusion_bench/mixins/simple_profiler.py +4 -2
- fusion_bench/modelpool/__init__.py +3 -1
- fusion_bench/modelpool/base_pool.py +2 -2
- fusion_bench/modelpool/openclip_vision/__init__.py +1 -0
- fusion_bench/modelpool/openclip_vision/modelpool.py +255 -0
- fusion_bench/models/modeling_smile_mistral/modeling_smile_mistral.py +2 -203
- fusion_bench/models/modeling_smile_qwen2/__init__.py +8 -0
- fusion_bench/models/modeling_smile_qwen2/configuration_smile_qwen2.py +21 -0
- fusion_bench/models/modeling_smile_qwen2/modeling_smile_qwen2.py +922 -0
- fusion_bench/models/modeling_smile_qwen2/register.py +11 -0
- fusion_bench/models/open_clip/__init__.py +6 -0
- fusion_bench/models/open_clip/modeling.py +176 -0
- fusion_bench/models/open_clip/utils.py +311 -0
- fusion_bench/models/open_clip/variables_and_paths.py +56 -0
- fusion_bench/models/parameter_dict.py +54 -13
- fusion_bench/models/rankone_moe.py +2 -88
- fusion_bench/models/smile_moe/linear_from_hf_config.py +373 -0
- fusion_bench/models/smile_moe/{linear.py → linear_from_module.py} +103 -33
- fusion_bench/models/smile_moe/utils/__init__.py +24 -0
- fusion_bench/models/smile_moe/utils/svd_utils.py +46 -0
- fusion_bench/scripts/nyuv2_mtl_train.py +1 -1
- fusion_bench/taskpool/__init__.py +7 -3
- fusion_bench/taskpool/clip_vision/__init__.py +1 -0
- fusion_bench/taskpool/clip_vision/clip_rankone_moe_taskpool.py +2 -30
- fusion_bench/taskpool/clip_vision/clip_smile_taskpool.py +102 -0
- fusion_bench/taskpool/clip_vision/clip_sparse_wemoe_taskpool.py +2 -30
- fusion_bench/taskpool/clip_vision/taskpool.py +1 -2
- fusion_bench/taskpool/clip_vision/utils/__init__.py +0 -0
- fusion_bench/taskpool/clip_vision/utils/routing_analysis_utils.py +65 -0
- fusion_bench/taskpool/gpt2_text_classification.py +30 -1
- fusion_bench/taskpool/lm_eval_harness/__init__.py +3 -0
- fusion_bench/taskpool/lm_eval_harness/taskpool.py +87 -0
- fusion_bench/taskpool/openclip_vision/__init__.py +1 -0
- fusion_bench/taskpool/openclip_vision/openclip_taskpool.py +196 -0
- fusion_bench/utils/data.py +12 -0
- fusion_bench/utils/devices.py +14 -0
- fusion_bench/utils/instantiate.py +12 -0
- fusion_bench/utils/misc.py +9 -2
- fusion_bench/utils/packages.py +14 -0
- fusion_bench/utils/parameters.py +1 -1
- fusion_bench/utils/tensorboard.py +1 -1
- {fusion_bench-0.2.12.dist-info → fusion_bench-0.2.14.dist-info}/METADATA +22 -2
- {fusion_bench-0.2.12.dist-info → fusion_bench-0.2.14.dist-info}/RECORD +209 -157
- {fusion_bench-0.2.12.dist-info → fusion_bench-0.2.14.dist-info}/WHEEL +1 -1
- fusion_bench_config/clip-vit-base-patch32_robustness_corrupted.yaml +1 -2
- fusion_bench_config/dataset/image_classification/test/TALL20.yaml +0 -1
- fusion_bench_config/dataset/image_classification/test/emnist_letters.yaml +0 -1
- fusion_bench_config/dataset/image_classification/test/fashion_mnist.yaml +1 -1
- fusion_bench_config/dataset/image_classification/train/TALL20.yaml +0 -1
- fusion_bench_config/dataset/image_classification/train/fashion_mnist.yaml +1 -1
- fusion_bench_config/fabric/auto.yaml +0 -1
- fusion_bench_config/fabric/llama_ddp.yaml +0 -1
- fusion_bench_config/fabric/llama_fsdp.yaml +0 -1
- fusion_bench_config/fabric/llama_peft_fsdp.yaml +0 -1
- fusion_bench_config/fabric/strategy/deepspeed.yaml +0 -1
- fusion_bench_config/fabric/strategy/llama_peft_fsdp.yaml +0 -1
- fusion_bench_config/fabric_model_fusion.yaml +0 -1
- fusion_bench_config/llama_full_finetune.yaml +0 -2
- fusion_bench_config/llama_model_fusion.yaml +0 -2
- fusion_bench_config/method/ada_svd/clip_vision.yaml +0 -1
- fusion_bench_config/method/adamerging/layer_wise_flan_t5.yaml +0 -5
- fusion_bench_config/method/adamerging/layer_wise_gpt2.yaml +0 -5
- fusion_bench_config/method/adamerging/llama_sft.yaml +0 -2
- fusion_bench_config/method/adamerging.yaml +2 -2
- fusion_bench_config/method/analysis/task_vector_cos_similarity.yaml +0 -1
- fusion_bench_config/method/analysis/task_vector_violin_plot.yaml +0 -1
- fusion_bench_config/method/classification/clip_continual_finetune.yaml +0 -1
- fusion_bench_config/method/concrete_subspace/clip_concrete_layer_wise_adamerging.yaml +0 -1
- fusion_bench_config/method/concrete_subspace/clip_concrete_task_wise_adamerging.yaml +0 -1
- fusion_bench_config/method/concrete_subspace/clip_post_defense_AWM.yaml +1 -12
- fusion_bench_config/method/concrete_subspace/clip_post_defense_SAU.yaml +1 -12
- fusion_bench_config/method/concrete_subspace/clip_safe_concrete_layer_wise_adamerging.yaml +1 -10
- fusion_bench_config/method/concrete_subspace/clip_safe_concrete_task_arithmetic.yaml +1 -14
- fusion_bench_config/method/dare/simple_average.yaml +0 -1
- fusion_bench_config/method/dare/task_arithmetic.yaml +0 -1
- fusion_bench_config/method/dare/ties_merging.yaml +0 -2
- fusion_bench_config/method/dawe/dawe_for_clip.yaml +0 -3
- fusion_bench_config/method/doge_ta/doge_ta.yaml +1 -1
- fusion_bench_config/method/ensemble/max_model_predictor.yaml +1 -1
- fusion_bench_config/method/ensemble/simple_ensemble.yaml +0 -1
- fusion_bench_config/method/ensemble/weighted_ensemble.yaml +0 -1
- fusion_bench_config/method/gossip/layer_wise_clip.yaml +30 -0
- fusion_bench_config/method/gossip/layer_wise_flan_t5.yaml +25 -0
- fusion_bench_config/method/isotropic_merging/iso_c.yaml +0 -1
- fusion_bench_config/method/isotropic_merging/iso_cts.yaml +0 -1
- fusion_bench_config/method/linear/linear_interpolation.yaml +0 -1
- fusion_bench_config/method/linear/llama_expo.yaml +0 -3
- fusion_bench_config/method/linear/llama_expo_with_dare.yaml +0 -5
- fusion_bench_config/method/linear/weighted_average.yaml +0 -1
- fusion_bench_config/method/linear/weighted_average_for_llama.yaml +0 -1
- fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml +0 -4
- fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml +0 -4
- fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml +0 -6
- fusion_bench_config/method/mixtral_moe_upscaling.yaml +1 -2
- fusion_bench_config/method/model_recombination.yaml +0 -1
- fusion_bench_config/method/opcm/opcm.yaml +0 -1
- fusion_bench_config/method/opcm/task_arithmetic.yaml +0 -2
- fusion_bench_config/method/opcm/ties_merging.yaml +0 -2
- fusion_bench_config/method/opcm/weight_average.yaml +0 -1
- fusion_bench_config/method/pwe_moe/epo_for_openclip.yaml +30 -0
- fusion_bench_config/method/pwe_moe/ls_for_openclip.yaml +30 -0
- fusion_bench_config/method/{pwe_moe_ls_for_clip.yaml → pwe_moe/pwe_moe_ls_for_clip.yaml} +7 -6
- fusion_bench_config/method/rankone_moe/rankone_moe.yaml +1 -3
- fusion_bench_config/method/regmean/gpt2_regmean.yaml +0 -1
- fusion_bench_config/method/slerp/slerp.yaml +0 -2
- fusion_bench_config/method/smile_upscaling/smile_mistral_upscaling.yaml +5 -2
- fusion_bench_config/method/smile_upscaling/smile_qwen2_upscaling.yaml +13 -0
- fusion_bench_config/method/sparselo_pruning/llama_iterative_sparselo.yaml +1 -1
- fusion_bench_config/method/sparselo_pruning/llama_pcp_sparselo.yaml +1 -1
- fusion_bench_config/method/sparselo_pruning/llama_sparselo.yaml +1 -1
- fusion_bench_config/method/surgery/adamerging_surgery.yaml +1 -2
- fusion_bench_config/method/task_arithmetic.yaml +1 -1
- fusion_bench_config/method/task_singular_vector/TaskSingularVectorMerging.yaml +0 -1
- fusion_bench_config/method/ties_merging.yaml +1 -1
- fusion_bench_config/method/trust_region/clip_task_arithmetic.yaml +0 -1
- fusion_bench_config/method/wemoe/sparse_weight_ensembling_moe.yaml +0 -8
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_cifar10.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford-iiit-pet.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford_flowers102.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_pcam.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_rendered-sst2.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_resisc45.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stanford-cars.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stl10.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_sun397.yaml +1 -1
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_svhn.yaml +1 -1
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8_lora.yaml +0 -3
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_individual.yaml +0 -3
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_individual_lora.yaml +0 -3
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8_control_task.yaml +0 -3
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_individual.yaml +0 -3
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_task_projection.yaml +0 -3
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_two_tasks_control_task.yaml +0 -4
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_individual.yaml +0 -3
- fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml +0 -4
- fusion_bench_config/modelpool/CausalLMPool/llama_codealpaca.yaml +0 -4
- fusion_bench_config/modelpool/CausalLMPool/llama_for_causallm.yaml +0 -1
- fusion_bench_config/modelpool/CausalLMPool/llama_metamathqa.yaml +0 -4
- fusion_bench_config/modelpool/CausalLMPool/llama_ultrachat.yaml +0 -4
- fusion_bench_config/modelpool/CausalLMPool/qwen2_math_1.5B_and_R1.yaml +17 -0
- fusion_bench_config/modelpool/CausalLMPool/simle_mixtral_exp_v4.yaml +0 -1
- fusion_bench_config/modelpool/CausalLMPool/single_llama_model.yaml +0 -3
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/README.md +90 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-B-16_TA8.yaml +27 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-B-32_TA8.yaml +45 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-B-32_TA_cars_dtd.yaml +23 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-B-32_TA_sun397_cars.yaml +23 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-B-32_TA_sun397_dtd.yaml +23 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-B-32_individual.yaml +7 -0
- fusion_bench_config/modelpool/OpenCLIPVisionModelPool/ViT-L-14_TA8.yaml +26 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue.yaml +0 -1
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_lora16.yaml +0 -2
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_lora16_tta.yaml +0 -2
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_tta.yaml +1 -3
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_individual.yaml +0 -1
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-large_glue_lora16.yaml +0 -3
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/llama_preference700k.yaml +0 -4
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/single_reward_model.yaml +0 -3
- fusion_bench_config/modelpool/gpt-2_glue.yaml +0 -3
- fusion_bench_config/nyuv2_config.yaml +0 -2
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/_template.yaml +0 -3
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_B16.yaml +0 -2
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_rankone_wemoe_clip-vit-classification_TA8.yaml +0 -2
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_sparse_wemoe_clip-vit-classification_TA8.yaml +0 -2
- fusion_bench_config/taskpool/LMEvalHarnessTaskPool/lm_eval.yaml +12 -0
- fusion_bench_config/taskpool/OpenCLIPVisionModelTaskPool/ViT-B-16_TA8.yaml +24 -0
- fusion_bench_config/taskpool/OpenCLIPVisionModelTaskPool/ViT-B-32_TA8.yaml +24 -0
- fusion_bench_config/taskpool/OpenCLIPVisionModelTaskPool/ViT-L-14_TA8.yaml +24 -0
- fusion_bench_config/taskpool/gpt-2_glue.yaml +0 -1
- fusion_bench_config/taskpool/reward_model_evaluation.yaml +0 -4
- {fusion_bench-0.2.12.dist-info → fusion_bench-0.2.14.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.12.dist-info → fusion_bench-0.2.14.dist-info}/licenses/LICENSE +0 -0
- {fusion_bench-0.2.12.dist-info → fusion_bench-0.2.14.dist-info}/top_level.txt +0 -0
fusion_bench/method/gossip/task_wise_gossip.py
ADDED
@@ -0,0 +1,265 @@
+import copy
+import gc
+import logging
+from abc import abstractmethod
+from typing import List, Mapping, Union  # noqa: F401
+
+import lightning as L
+import numpy as np
+import torch
+import torch.nn as nn
+from omegaconf import DictConfig
+from torch import Tensor
+from torch.utils.data import DataLoader
+from tqdm.autonotebook import tqdm
+
+from fusion_bench.compat.method import ModelFusionAlgorithm
+from fusion_bench.compat.modelpool import ModelPool
+from fusion_bench.models.wrappers.task_wise_fusion import (
+    TaskWiseMergedModel,
+    get_task_wise_weights,
+)
+
+log = logging.getLogger(__name__)
+
+
+# obtain the current GPU memory usage
+def print_memory_usage(desc):
+    print(desc)
+    allocated = torch.cuda.memory_allocated() / 1024**2  # convert to MB
+    cached = torch.cuda.memory_reserved() / 1024**2  # convert to MB
+    print(f"Allocated Memory: {allocated:.2f} MB")
+    print(f"Cached Memory: {cached:.2f} MB")
+
+
+def entropy_loss(logits: Tensor) -> Tensor:
+    """
+    Compute the entropy loss of a set of logits.
+
+    Args:
+        logits (Tensor): The logits to compute the entropy loss of.
+
+    Returns:
+        Tensor: The entropy loss of the logits.
+    """
+    probs = torch.softmax(logits, dim=-1)
+    return -torch.sum(probs * torch.log(probs + 1e-8), dim=-1).mean()
+
+
+class ModelScheduler:
+    """
+    Manage the storage of models, schedule the order in which models are loaded to GPU,
+    transfer data between the CPU and GPU
+    """
+
+    def __init__(
+        self,
+        modelpool: ModelPool,
+        config: DictConfig,
+    ):
+        self.pretrained_model = modelpool.load_model("_pretrained_")
+        self.finetuned_models = [
+            modelpool.load_model(name) for name in modelpool.model_names
+        ]
+        self.num_finetuned_models = len(self.finetuned_models)
+        self.new_finetuned_models = copy.deepcopy(self.finetuned_models)
+        self.finetuned_model_names = [name for name in modelpool.model_names]
+
+        self.config = config
+
+    @torch.no_grad()  # not sure whether to use this
+    def __call__(self, model_id):
+        """
+        return models and relevant data in each step
+        """
+        # TODO: use a mixing matrix to determine which models to use in step idx
+
+        pretrained_model = copy.deepcopy(self.finetuned_models[model_id])
+        finetuned_models = [
+            copy.deepcopy(
+                self.finetuned_models[(model_id + 1) % self.num_finetuned_models]
+            ),
+            copy.deepcopy(
+                self.finetuned_models[(model_id - 1) % self.num_finetuned_models]
+            ),
+        ]
+
+        if self.config.weights is None:
+            task_wise_weight = get_task_wise_weights(
+                num_models=len(finetuned_models),
+                init_values=self.config.init_values,
+            )
+        else:
+            pass
+
+        module = TaskWiseMergedModel(
+            task_wise_weight=task_wise_weight,
+            pretrained_model=pretrained_model,
+            finetuned_models=finetuned_models,
+            clamp_weights=self.config.clamp_weights,
+            tie_weights=self.config.tie_weights,
+            strict=self.config.strict,
+        )
+        return module
+
+    def store_model(self, new_finetuned_model_dict, model_id):
+        """
+        store new finetuned model after every turn of adamerging
+        """
+        self.new_finetuned_models[model_id].load_state_dict(new_finetuned_model_dict)
+
+    def update_models(self):
+        self.finetuned_models = copy.deepcopy(self.new_finetuned_models)
+
+    def get_final_models(self):
+        # need a check
+        final_models = [
+            {"name": name, "model": model}
+            for name, model in zip(self.finetuned_model_names, self.finetuned_models)
+        ]
+        num_finetuned_models = len(self.finetuned_models)
+
+        state_dict = self.pretrained_model.state_dict(keep_vars=True)
+        for name in state_dict.keys():
+            state_dict[name].data.zero_()
+        for model in self.finetuned_models:
+            for name, param in model.named_parameters():
+                state_dict[name] = state_dict[name] + 1 / num_finetuned_models * param
+
+        self.pretrained_model.load_state_dict(state_dict)
+        final_models += [{"name": "average model", "model": self.pretrained_model}]
+
+        return final_models
+
+
+class TaskWiseGossipAlgorithm(ModelFusionAlgorithm):
+    _fabric: L.Fabric = None
+
+    def __init__(self, algorithm_config: DictConfig):
+        super().__init__(algorithm_config)
+
+        if self._fabric is None and torch.cuda.is_available():
+            self._fabric = L.Fabric(devices=self.config.get("devices", 1))
+            self._fabric.launch()
+
+        self.optimizer = None  # we want to reuse it in Gossip using single GPU
+
+    def free_gpu_memory(self, module: TaskWiseMergedModel):
+        module.pretrained_model.to("cpu")
+        for model in module.task_vectors:
+            model.to("cpu")
+        del module
+        gc.collect()
+        torch.cuda.empty_cache()
+        print_memory_usage(
+            "finish local adamerging, after freeing memory, the memory usage of GPU is:"
+        )
+
+    def run(self, modelpool: ModelPool):
+        log.info("Fusing models using task-wise adaptive merging with gossip.")
+        self.modelpool = modelpool
+        self.num_finetuned_models = len(modelpool.model_names)
+
+        model_scheduler = ModelScheduler(self.modelpool, self.config)
+
+        pbar = tqdm(
+            range(self.config.gossip_max_steps), "Gossip merging", dynamic_ncols=True
+        )
+        for step_idx in pbar:
+            log.info(f"step: {step_idx}")
+            for model_id in tqdm(
+                range(self.num_finetuned_models), "local adamerging", dynamic_ncols=True
+            ):
+                # log.info(f"adamerging model: {model_scheduler.finetuned_midels_name[model_id]}")
+                module = model_scheduler(model_id)
+                module = self.test_time_adaptation(module)
+                # if self.config.get("save_merging_weights", False):
+                #     torch.save(module.merge_weight, self.config.save_merging_weights)
+                print_memory_usage(
+                    "local adamerging almost done, the memory usage of GPU is:"
+                )
+                model_scheduler.store_model(module.merge_weights(), model_id)
+                print_memory_usage(
+                    "local adamerging almost done, the memory usage of GPU is:"
+                )
+                self.free_gpu_memory(
+                    module
+                )  # simulate distributed GPU memory usage as much as possible
+
+            model_scheduler.update_models()
+
+        return model_scheduler.get_final_models()
+
+    def on_test_time_adaptation_start(self):
+        pass
+
+    @abstractmethod
+    def get_shuffled_test_loader_iter(self, task: str) -> DataLoader:
+        pass
+
+    @abstractmethod
+    def compute_logits(self, module: nn.Module, batch, task: str) -> Tensor:
+        """
+        Compute the logits for the given batch and task.
+
+        Args:
+            module (nn.Module): The model module.
+            batch (tuple): A batch of input data.
+            task (str): The name of the task.
+
+        Returns:
+            Tensor: The classification logits for the batch.
+        """
+        pass
+
+    def test_time_adaptation(self, module: TaskWiseMergedModel):
+        self.on_test_time_adaptation_start()
+
+        # configure optimizer
+        if self.config.optimizer == "adam":
+            self.optimizer = torch.optim.Adam([module.merge_weight], lr=self.config.lr)
+        else:
+            raise ValueError(f"Unsupported optimizer: {self.config.optimizer}")
+
+        if self._fabric is not None:
+            module, self.optimizer = self._fabric.setup(module, self.optimizer)
+        print_memory_usage(
+            "load model and optimizer to GPU, the memory usage of GPU is:"
+        )
+        module.train()
+        module.merge_weights()
+
+        if self.config.get("fast_dev_run", False):
+            log.info("Running fast_dev_run, only one step")
+            pbar = tqdm(
+                range(1),
+                "AdaMerging Test-time adaptation",
+                dynamic_ncols=True,
+            )
+        else:
+            pbar = tqdm(
+                range(self.config.max_steps),
+                "AdaMerging Test-time adaptation",
+                dynamic_ncols=True,
+            )
+        for step_idx in pbar:
+            for task in self.modelpool.model_names:
+                batch = next(self.get_shuffled_test_loader_iter(task))
+                logits = self.compute_logits(module, batch, task)
+                assert (
+                    logits.dim() == 2
+                ), f"Expected logits to be 2D, got {logits.dim()}"
+                loss = entropy_loss(logits)
+                # .backward() accumulates when .zero_grad() wasn't called
+                # this can save memory
+                self._fabric.backward(loss, retain_graph=True)
+
+            # print_memory_usage('model + dataset: ')
+            self.optimizer.step()
+            self.optimizer.zero_grad()
+            module.merge_weights()
+
+        del self.optimizer
+        gc.collect()
+        torch.cuda.empty_cache()
+        return module
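The test-time adaptation loop above tunes the task-wise merging weights by minimizing the Shannon entropy of the merged model's predictions on unlabeled test batches, while `ModelScheduler.__call__` builds each local merge from a model and its two ring neighbors, `(model_id ± 1) % num_finetuned_models`. A minimal, self-contained sketch of the entropy objective follows; the toy logits are illustrative, and the import assumes the module is importable as packaged above.

import torch

from fusion_bench.method.gossip.task_wise_gossip import entropy_loss

# Peaked (confident) predictions have lower entropy than uniform ones, so
# minimizing entropy_loss drives the merging weights toward values that make
# the merged model confident on the unlabeled test data.
uniform_logits = torch.zeros(4, 10)  # softmax gives a uniform distribution
peaked_logits = torch.full((4, 10), -10.0)
peaked_logits[:, 0] = 10.0  # softmax gives a nearly one-hot distribution

print(entropy_loss(uniform_logits))  # ~= ln(10) ~= 2.30
print(entropy_loss(peaked_logits))   # ~= 0.0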
fusion_bench/method/gossip/utils.py
ADDED
@@ -0,0 +1,74 @@
+import copy
+from collections import OrderedDict
+
+import torch
+from torch import nn
+
+
+def get_memory_usage(desc):
+    """
+    obtain the current GPU memory usage
+
+    Returns:
+        str: A string containing the allocated and cached memory in MB.
+    """
+    allocated = torch.cuda.memory_allocated() / 1024**2  # convert to MB
+    cached = torch.cuda.memory_reserved() / 1024**2  # convert to MB
+    return (
+        f"{desc}\nAllocated Memory: {allocated:.2f} MB\nCached Memory: {cached:.2f} MB"
+    )
+
+
+# Model conversion utils
+
+
+def state_dict_to_vector(state_dict, remove_keys=[]):
+    """
+    Convert a state dictionary to a vector.
+
+    Args:
+        state_dict (dict): The state dictionary to convert.
+        remove_keys (list, optional): List of keys to remove from the state dictionary. Defaults to [].
+
+    Returns:
+        torch.Tensor: The converted vector.
+    """
+    shared_state_dict = copy.deepcopy(state_dict)
+    for key in remove_keys:
+        if key in shared_state_dict:
+            del shared_state_dict[key]
+    sorted_shared_state_dict = OrderedDict(sorted(shared_state_dict.items()))
+    return nn.utils.parameters_to_vector(
+        [value.reshape(-1) for key, value in sorted_shared_state_dict.items()]
+    )
+
+
+def vector_to_state_dict(vector, state_dict, remove_keys=[]):
+    """
+    Convert a vector to a state dictionary.
+
+    Args:
+        vector (torch.Tensor): The vector to convert.
+        state_dict (dict): The reference state dictionary to define the order of the vector.
+        remove_keys (list, optional): List of keys to remove from the reference state dictionary. Defaults to [].
+
+    Returns:
+        dict: The converted state dictionary.
+    """
+    # create a reference dict to define the order of the vector
+    reference_dict = copy.deepcopy(state_dict)
+    for key in remove_keys:
+        if key in reference_dict:
+            del reference_dict[key]
+    sorted_reference_dict = OrderedDict(sorted(reference_dict.items()))
+
+    # create a shared state dict using the reference dict
+    nn.utils.vector_to_parameters(vector, sorted_reference_dict.values())
+
+    # add back the encoder and decoder embedding weights.
+    if "transformer.shared.weight" in sorted_reference_dict:
+        for key in remove_keys:
+            sorted_reference_dict[key] = sorted_reference_dict[
+                "transformer.shared.weight"
+            ]
+    return sorted_reference_dict
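`state_dict_to_vector` and `vector_to_state_dict` are inverses as long as the same reference `state_dict` and `remove_keys` are used, since both sort the keys before flattening. A small round-trip sketch on a toy module; the `nn.Linear` here is purely illustrative, and the import assumes the module is importable as packaged above.

import torch
from torch import nn

from fusion_bench.method.gossip.utils import (
    state_dict_to_vector,
    vector_to_state_dict,
)

model = nn.Linear(4, 3)
sd = model.state_dict()

# Flatten in sorted-key order: bias (3) then weight (3 x 4).
vec = state_dict_to_vector(sd)
assert vec.numel() == 3 + 3 * 4

# Unflatten using sd as the template; the round trip is exact.
restored = vector_to_state_dict(vec, sd)
for key in sd:
    assert torch.equal(restored[key], sd[key])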
fusion_bench/method/isotropic_merging/__init__.py
CHANGED
@@ -3,7 +3,7 @@ This module contains the implementation of the Isotropic Merging in Common Subsp
 Modified from the original implementation: https://github.com/danielm1405/iso-merging
 
 Reference:
-    - Daniel Marczak, et al. No Task Left Behind: Isotropic Model Merging with Common and Task-Specific Subspaces. 2025.
+    - Daniel Marczak, et al. No Task Left Behind: Isotropic Model Merging with Common and Task-Specific Subspaces. 2025.
     https://arxiv.org/abs/2502.04959
 """
 
fusion_bench/method/opcm/opcm.py
CHANGED
@@ -126,10 +126,14 @@ class OPCMForCLIP(
         )
         self.avg_task_vector_norm = np.mean(self.all_task_vector_norm)
         self.fabric.log(
-            "model/task_vector_norm",
+            "model/task_vector_norm",
+            self.all_task_vector_norm[-1],
+            step=model_idx,
         )
         self.fabric.log(
-            "model/avg_task_vector_norm",
+            "model/avg_task_vector_norm",
+            self.avg_task_vector_norm,
+            step=model_idx,
         )
 
         self.lambda_t = 1  # temporary value
@@ -166,9 +170,9 @@ class OPCMForCLIP(
                 pretrained_W=pretrained_model.get_submodule(
                     module_name
                 ).get_parameter(param_name),
-                task_W=task_model.get_submodule(
-                    module_name
-                ),
+                task_W=task_model.get_submodule(
+                    module_name
+                ).get_parameter(param_name),
                 param_name=".".join([module_name, param_name]),
                 accelerator=accelerator,
             )
@@ -200,10 +204,15 @@ class OPCMForCLIP(
         with self.profile("evaluating model"):
            self.taskpool._is_setup = False
            self.taskpool._test_datasets = DictConfig(
-                {
+                {
+                    n: self._test_datasets[n]
+                    for n in model_names[: model_idx + 1]
+                }
            )
            report = self.taskpool.evaluate(deepcopy(merged_model))
-            save_to_json(
+            save_to_json(
+                report, Path(self.log_dir) / f"report_{model_idx}.json"
+            )
 
         self.print_profile_summary()
         return merged_model
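The second hunk above is a behavioral fix: `task_W` previously stopped at the submodule returned by `get_submodule`, while `pretrained_W` resolved one step further to the parameter; 0.2.14 makes both resolve via `get_parameter`. A toy illustration of the difference between the two calls; the module and names here are hypothetical, not taken from OPCM.

import torch
from torch import nn

model = nn.Sequential()
model.add_module("encoder", nn.Linear(4, 4))

module_name, param_name = "encoder", "weight"

# get_submodule returns the nn.Module itself ...
submodule = model.get_submodule(module_name)
assert isinstance(submodule, nn.Linear)

# ... while chaining get_parameter yields the actual weight tensor,
# which is what the merging arithmetic expects.
task_W = model.get_submodule(module_name).get_parameter(param_name)
assert isinstance(task_W, nn.Parameter)
assert task_W.shape == torch.Size([4, 4])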