fusion-bench 0.2.21__py3-none-any.whl → 0.2.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. fusion_bench/__init__.py +25 -2
  2. fusion_bench/compat/method/__init__.py +5 -2
  3. fusion_bench/compat/method/base_algorithm.py +3 -2
  4. fusion_bench/compat/modelpool/base_pool.py +3 -3
  5. fusion_bench/compat/taskpool/clip_image_classification.py +1 -1
  6. fusion_bench/constants/__init__.py +1 -0
  7. fusion_bench/constants/runtime.py +57 -0
  8. fusion_bench/dataset/gpt2_glue.py +1 -1
  9. fusion_bench/method/__init__.py +12 -4
  10. fusion_bench/method/analysis/task_vector_cos_similarity.py +95 -12
  11. fusion_bench/method/analysis/task_vector_violin_plot.py +160 -52
  12. fusion_bench/method/bitdelta/__init__.py +1 -0
  13. fusion_bench/method/bitdelta/bitdelta.py +7 -23
  14. fusion_bench/method/classification/clip_finetune.py +1 -1
  15. fusion_bench/method/expert_sparsity/mixtral/dynamic_skipping.py +2 -0
  16. fusion_bench/method/expert_sparsity/mixtral/layer_wise_pruning.py +2 -0
  17. fusion_bench/method/expert_sparsity/mixtral/progressive_pruning.py +2 -0
  18. fusion_bench/method/fisher_merging/clip_fisher_merging.py +0 -4
  19. fusion_bench/method/fisher_merging/gpt2_fisher_merging.py +2 -2
  20. fusion_bench/method/linear/simple_average_for_llama.py +16 -11
  21. fusion_bench/method/model_stock/__init__.py +1 -0
  22. fusion_bench/method/model_stock/model_stock.py +309 -0
  23. fusion_bench/method/regmean/clip_regmean.py +3 -6
  24. fusion_bench/method/regmean/regmean.py +27 -56
  25. fusion_bench/method/regmean/utils.py +56 -0
  26. fusion_bench/method/regmean_plusplus/regmean_plusplus.py +21 -60
  27. fusion_bench/method/simple_average.py +7 -7
  28. fusion_bench/method/slerp/__init__.py +1 -1
  29. fusion_bench/method/slerp/slerp.py +110 -14
  30. fusion_bench/method/smile_upscaling/causal_lm_upscaling.py +371 -0
  31. fusion_bench/method/smile_upscaling/projected_energy.py +1 -2
  32. fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py +5 -1
  33. fusion_bench/method/smile_upscaling/smile_qwen2_upscaling.py +40 -31
  34. fusion_bench/method/smile_upscaling/smile_upscaling.py +1 -1
  35. fusion_bench/method/we_moe/__init__.py +1 -0
  36. fusion_bench/method/we_moe/entropy_loss.py +25 -0
  37. fusion_bench/method/we_moe/flan_t5_we_moe.py +320 -0
  38. fusion_bench/method/we_moe/utils.py +15 -0
  39. fusion_bench/method/weighted_average/llama.py +1 -1
  40. fusion_bench/mixins/clip_classification.py +37 -48
  41. fusion_bench/mixins/serialization.py +30 -10
  42. fusion_bench/modelpool/base_pool.py +1 -1
  43. fusion_bench/modelpool/causal_lm/causal_lm.py +293 -75
  44. fusion_bench/modelpool/seq2seq_lm/modelpool.py +146 -0
  45. fusion_bench/models/__init__.py +5 -0
  46. fusion_bench/models/hf_utils.py +69 -86
  47. fusion_bench/models/linearized/vision_model.py +6 -6
  48. fusion_bench/models/model_card_templates/default.md +46 -0
  49. fusion_bench/models/modeling_smile_llama/__init__.py +7 -0
  50. fusion_bench/models/modeling_smile_llama/modeling_smile_llama.py +1 -8
  51. fusion_bench/models/modeling_smile_mistral/__init__.py +2 -1
  52. fusion_bench/models/modeling_smile_qwen2/modeling_smile_qwen2.py +1 -5
  53. fusion_bench/models/we_moe.py +8 -8
  54. fusion_bench/programs/fabric_fusion_program.py +29 -60
  55. fusion_bench/scripts/cli.py +34 -1
  56. fusion_bench/taskpool/base_pool.py +99 -17
  57. fusion_bench/taskpool/clip_vision/taskpool.py +10 -5
  58. fusion_bench/taskpool/dummy.py +101 -13
  59. fusion_bench/taskpool/lm_eval_harness/taskpool.py +80 -0
  60. fusion_bench/taskpool/nyuv2_taskpool.py +28 -0
  61. fusion_bench/utils/__init__.py +2 -0
  62. fusion_bench/utils/cache_utils.py +101 -1
  63. fusion_bench/utils/data.py +6 -4
  64. fusion_bench/utils/devices.py +7 -4
  65. fusion_bench/utils/dtype.py +3 -2
  66. fusion_bench/utils/fabric.py +2 -2
  67. fusion_bench/utils/lazy_imports.py +23 -0
  68. fusion_bench/utils/lazy_state_dict.py +117 -19
  69. fusion_bench/utils/modelscope.py +3 -3
  70. fusion_bench/utils/packages.py +3 -3
  71. fusion_bench/utils/parameters.py +0 -2
  72. fusion_bench/utils/path.py +56 -0
  73. fusion_bench/utils/pylogger.py +1 -1
  74. fusion_bench/utils/timer.py +92 -10
  75. {fusion_bench-0.2.21.dist-info → fusion_bench-0.2.23.dist-info}/METADATA +1 -23
  76. {fusion_bench-0.2.21.dist-info → fusion_bench-0.2.23.dist-info}/RECORD +89 -75
  77. fusion_bench_config/_get_started/llm_slerp.yaml +12 -0
  78. fusion_bench_config/method/fisher_merging/clip_fisher_merging.yaml +0 -1
  79. fusion_bench_config/method/linear/simple_average_for_llama.yaml +3 -2
  80. fusion_bench_config/method/model_stock/model_stock.yaml +12 -0
  81. fusion_bench_config/method/slerp/slerp_lm.yaml +4 -0
  82. fusion_bench_config/method/smile_upscaling/causal_lm_upscaling.yaml +21 -0
  83. fusion_bench_config/method/smile_upscaling/smile_qwen2_upscaling.yaml +1 -1
  84. fusion_bench_config/method/wemoe/flan_t5_weight_ensembling_moe.yaml +20 -0
  85. fusion_bench_config/modelpool/CausalLMPool/Qwen2.5-1.5B_math_and_coder.yaml +1 -1
  86. {fusion_bench-0.2.21.dist-info → fusion_bench-0.2.23.dist-info}/WHEEL +0 -0
  87. {fusion_bench-0.2.21.dist-info → fusion_bench-0.2.23.dist-info}/entry_points.txt +0 -0
  88. {fusion_bench-0.2.21.dist-info → fusion_bench-0.2.23.dist-info}/licenses/LICENSE +0 -0
  89. {fusion_bench-0.2.21.dist-info → fusion_bench-0.2.23.dist-info}/top_level.txt +0 -0
@@ -16,10 +16,11 @@ from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer
16
16
 
17
17
  from fusion_bench import BaseAlgorithm, BaseModelPool
18
18
  from fusion_bench.compat.modelpool import to_modelpool
19
+ from fusion_bench.constants import RuntimeConstants
19
20
  from fusion_bench.mixins import SimpleProfilerMixin, auto_register_config
20
21
  from fusion_bench.modelpool import CausalLMPool
21
22
  from fusion_bench.models.hf_utils import (
22
- generate_complete_readme,
23
+ create_default_model_card,
23
24
  save_pretrained_with_remote_code,
24
25
  )
25
26
  from fusion_bench.models.modeling_smile_qwen2 import (
@@ -41,7 +42,10 @@ log = logging.getLogger(__name__)
41
42
 
42
43
 
43
44
  @auto_register_config
44
- class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
45
+ class SmileQwen2UpscalingAlgorithm(
46
+ SimpleProfilerMixin,
47
+ BaseAlgorithm,
48
+ ):
45
49
  R"""
46
50
  SmileQwen2UpscalingAlgorithm is a model fusion algorithm designed to upscale
47
51
  a pretrained Qwen2 model using a set of fine-tuned expert models. The algorithm
@@ -62,7 +66,7 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
62
66
  self,
63
67
  device,
64
68
  accelerator,
65
- model_path,
69
+ model_save_path,
66
70
  model_dtype,
67
71
  num_experts_per_tok,
68
72
  rank_of_router,
@@ -71,6 +75,11 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
71
75
  **kwargs,
72
76
  ):
73
77
  super().__init__(**kwargs)
78
+ if not torch.cuda.is_available():
79
+ if "cuda" in self.device:
80
+ self.device = "cpu"
81
+ if "cuda" in self.accelerator:
82
+ self.accelerator = "cpu"
74
83
 
75
84
  @torch.no_grad()
76
85
  def run(self, modelpool) -> SmileQwen2ForCausalLM:
@@ -86,13 +95,6 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
86
95
  self.modelpool = modelpool = to_modelpool(modelpool)
87
96
  config = self.config
88
97
 
89
- # load model from path if provided and return directly
90
- if config.model_path is not None and os.path.exists(config.model_path):
91
- log.info(f"Loading model from {config.model_path}")
92
- model = AutoModelForCausalLM.from_pretrained(config.model_path)
93
- print_parameters(model)
94
- return model
95
-
96
98
  with self.profile("load pretrained model"):
97
99
  pretrained_model = modelpool.load_pretrained_model()
98
100
  with self.profile("load fine-tuned model"):
@@ -100,7 +102,7 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
100
102
  m for m in tqdm(modelpool.models(), total=len(modelpool.model_names))
101
103
  ]
102
104
 
103
- if config.device == "cuda" and torch.cuda.is_available():
105
+ if self.device == "cuda" and torch.cuda.is_available():
104
106
  pretrained_model = pretrained_model.cuda()
105
107
  print("parameter count of pretrained model:")
106
108
  print_parameters(pretrained_model)
@@ -114,17 +116,17 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
114
116
  print_parameters(model)
115
117
  print(model)
116
118
 
117
- if config.model_dtype is not None:
118
- model.to(dtype=parse_dtype(config.model_dtype))
119
+ if self.model_dtype is not None:
120
+ model.to(dtype=parse_dtype(self.model_dtype))
119
121
 
120
- if config.model_path is not None:
121
- if os.path.dirname(config.model_path):
122
- os.makedirs(os.path.dirname(config.model_path), exist_ok=True)
123
- log.info(f"Saving model to {config.model_path}")
122
+ if self.model_save_path is not None:
123
+ if os.path.dirname(self.model_save_path):
124
+ os.makedirs(os.path.dirname(self.model_save_path), exist_ok=True)
125
+ log.info(f"Saving model to {self.model_save_path}")
124
126
  tokenizer = self.modelpool.load_tokenizer()
125
- tokenizer.save_pretrained(config.model_path)
127
+ tokenizer.save_pretrained(self.model_save_path)
126
128
  if not self.save_with_remote_code:
127
- model.save_pretrained(config.model_path)
129
+ model.save_pretrained(self.model_save_path)
128
130
  else:
129
131
  save_pretrained_with_remote_code(
130
132
  model,
@@ -133,17 +135,18 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
133
135
  "AutoModel": SmileQwen2Model,
134
136
  "AutoModelForCausalLM": SmileQwen2ForCausalLM,
135
137
  },
136
- save_directory=config.model_path,
138
+ save_directory=self.model_save_path,
137
139
  )
138
140
 
139
141
  # save readme
140
- complete_readme = generate_complete_readme(
141
- algorithm=self,
142
- modelpool=modelpool,
142
+ model_card_str = create_default_model_card(
143
143
  models=[modelpool.get_model_path(m) for m in modelpool.all_model_names],
144
+ description="Merged Qwen model using SMILE Upscaling",
145
+ algorithm_config=self.config,
146
+ modelpool_config=modelpool.config,
144
147
  )
145
- with open(os.path.join(config.model_path, "README.md"), "w") as f:
146
- f.write(complete_readme)
148
+ with open(os.path.join(self.model_save_path, "README.md"), "w") as f:
149
+ f.write(model_card_str)
147
150
 
148
151
  return model
149
152
 
@@ -174,9 +177,9 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
174
177
  )
175
178
  base_config = AutoConfig.from_pretrained(pretrained_path)
176
179
  model_config = SmileQwen2Config(
177
- num_experts_per_tok=config.num_experts_per_tok,
178
- rank_of_router=config.rank_of_router,
179
- rank_of_expert=config.rank_of_expert,
180
+ num_experts_per_tok=self.num_experts_per_tok,
181
+ rank_of_router=self.rank_of_router,
182
+ rank_of_expert=self.rank_of_expert,
180
183
  num_local_experts=len(finetuned_models),
181
184
  **base_config.to_dict(),
182
185
  )
@@ -186,7 +189,7 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
186
189
 
187
190
  # copy pretrained model weights
188
191
  state_dict = model.state_dict()
189
- pretrained_state_dict = dict(pretrained_model.state_dict())
192
+ pretrained_state_dict = pretrained_model.state_dict()
190
193
  for key in list(pretrained_state_dict.keys()):
191
194
  if key not in state_dict:
192
195
  pretrained_state_dict.pop(key)
@@ -198,6 +201,12 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
198
201
  "Upscaling Modules (layer)",
199
202
  dynamic_ncols=True,
200
203
  ):
204
+ if RuntimeConstants.debug and layer_idx > 0:
205
+ log.info(
206
+ "Debug mode enabled: processing only the first layer, skipping remaining layers"
207
+ )
208
+ break
209
+
201
210
  pretrained_layer: Qwen2DecoderLayer = pretrained_model.model.layers[
202
211
  layer_idx
203
212
  ]
@@ -213,7 +222,7 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
213
222
  base=getattr(pretrained_layer.self_attn, n),
214
223
  experts=[getattr(m.self_attn, n) for m in finetuned_layers],
215
224
  target=getattr(target_layer.self_attn, n),
216
- accelerator=config.accelerator,
225
+ accelerator=self.accelerator,
217
226
  )
218
227
  except ExpertNotTrainedError:
219
228
  setattr(
@@ -228,7 +237,7 @@ class SmileQwen2UpscalingAlgorithm(BaseAlgorithm, SimpleProfilerMixin):
228
237
  base=getattr(pretrained_layer.mlp, n),
229
238
  experts=[getattr(m.mlp, n) for m in finetuned_layers],
230
239
  target=getattr(target_layer.mlp, n),
231
- accelerator=config.accelerator,
240
+ accelerator=self.accelerator,
232
241
  )
233
242
  except ExpertNotTrainedError:
234
243
  setattr(
@@ -20,8 +20,8 @@ from fusion_bench.models.smile_moe.linear_from_module import (
20
20
  SmileMoELinear,
21
21
  )
22
22
  from fusion_bench.models.utils import get_attr, set_attr
23
- from fusion_bench.utils.parameters import print_parameters
24
23
  from fusion_bench.utils.devices import get_device
24
+ from fusion_bench.utils.parameters import print_parameters
25
25
 
26
26
  log = logging.getLogger(__name__)
27
27
 
@@ -1,2 +1,3 @@
1
1
  # flake8: noqa F401
2
2
  from .clip_we_moe import CLIPWeightEnsemblingMoEAlgorithm
3
+ from .flan_t5_we_moe import FlanT5WeightEnsemblingMoEAlgorithm
import torch
from torch import Tensor


def entropy_loss(logits: Tensor, eps: float = 1e-8) -> Tensor:
    """Compute the mean Shannon entropy of a batch of logits.

    Used as a test-time adaptation objective: minimizing this loss
    encourages confident (low-entropy) predictions.

    Args:
        logits (Tensor): A 2-D tensor of shape ``(batch_size, num_classes)``.
        eps (float): A small value added inside the log to avoid ``log(0)``.
            Default is 1e-8.

    Returns:
        Tensor: A scalar tensor holding the entropy averaged over the batch.

    Raises:
        ValueError: If ``logits`` is not 2-dimensional.
    """
    # Validate with an explicit exception rather than `assert`, which is
    # silently stripped when Python runs with the -O flag.
    if logits.dim() != 2:
        raise ValueError(
            f"Expected logits to have 2 dimensions, found {logits.dim()}, {logits.size()=}"
        )

    # Softmax probabilities over the class dimension.
    probs = torch.softmax(logits, dim=-1)

    # H(p) = -sum_i p_i * log(p_i + eps), averaged over the batch.
    return -torch.sum(probs * torch.log(probs + eps), dim=-1).mean()
import functools
import logging
import os
from copy import deepcopy
from typing import Any, Dict, List, Mapping, Optional, Union, cast  # noqa: F401

import lightning
import lightning as L
import lightning.fabric.wrappers
import torch
from torch import Tensor
from torch.utils.data import DataLoader
from tqdm.autonotebook import tqdm
from transformers import T5ForConditionalGeneration
from transformers.data import default_data_collator

from fusion_bench.method import BaseAlgorithm
from fusion_bench.method.task_arithmetic.task_arithmetic import task_arithmetic_merge
from fusion_bench.mixins import (
    LightningFabricMixin,
    SimpleProfilerMixin,
    auto_register_config,
)
from fusion_bench.modelpool import Seq2SeqLMPool
from fusion_bench.models.we_moe import WeightEnsemblingMoE
from fusion_bench.utils import print_parameters, timeit_context
from fusion_bench.utils.data import InfiniteDataLoader, load_tensor_from_file
from fusion_bench.utils.instantiate_utils import instantiate

# NOTE(review): `print_parameters` is already imported from `fusion_bench.utils`
# above; this second import is redundant and could be dropped.
from fusion_bench.utils.parameters import print_parameters

from .entropy_loss import entropy_loss
from .utils import get_memory_usage

log = logging.getLogger(__name__)


@auto_register_config
class FlanT5WeightEnsemblingMoEAlgorithm(
    LightningFabricMixin,
    SimpleProfilerMixin,
    BaseAlgorithm,
):
    """
    Weight-Ensembling Mixture-of-Experts (WEMoE) fusion for Flan-T5 models.

    The algorithm first merges the pretrained model with the expert models via
    task arithmetic, then replaces the MLP (``DenseReluDense``) sub-layers of
    both encoder and decoder with :class:`WeightEnsemblingMoE` modules whose
    routers are trained by entropy minimization on unlabeled test data
    (test-time adaptation).

    Attributes:
        modelpool (Seq2SeqLMPool): The model pool containing the Flan-T5 models.
    """

    # Populated in `run`; declared here for type checkers.
    modelpool: Seq2SeqLMPool = None

    def __init__(
        self,
        checkpoint: bool = False,
        save_checkpoint: bool = False,
        router_hidden_layers: int = 2,
        init_lambda: float = 0.3,
        batch_reduce: bool = True,
        lr: float = 1e-4,
        optimizer: str = "adam",
        devices: int = 1,
        batch_size: int = 16,
        num_workers: int = 0,
        max_steps: int = 1000,
        use_grad_accumulate: bool = True,
        fast_dev_run: bool = False,
        **kwargs,
    ):
        """
        Initialize the algorithm.

        All keyword parameters are registered as config attributes on ``self``
        by the ``@auto_register_config`` decorator (no explicit assignments
        appear in this body).

        Args:
            checkpoint: Path of a checkpoint to load instead of running
                test-time adaptation, or ``False`` to adapt from scratch.
            save_checkpoint: Path to save the adapted model to, or ``False``
                to skip saving.
                NOTE(review): this name shadows any ``save_checkpoint`` method
                inherited from a mixin, yet ``run`` later calls
                ``self.save_checkpoint(...)`` as a method — verify.
            router_hidden_layers: Number of hidden layers in each MoE router.
            init_lambda: Scaling factor for the task-arithmetic base merge and
                router initialization.
            batch_reduce: Whether routers reduce routing weights over the batch.
            lr: Learning rate for the router optimizer.
            optimizer: Optimizer name; only ``"adam"`` is supported.
            devices: Number of devices to use.
            batch_size: Test-loader batch size for adaptation.
            num_workers: Test-loader worker count.
            max_steps: Number of test-time adaptation steps.
            use_grad_accumulate: Whether to accumulate gradients across tasks.
            fast_dev_run: Debug flag for a quick run.
        """
        super().__init__(**kwargs)

    def construct_moe_model(self):
        """
        Construct the Mixture-of-Experts model from the models in the pool.

        The pretrained model is merged with the experts via task arithmetic,
        then every encoder/decoder MLP sub-layer is replaced in place by a
        :class:`WeightEnsemblingMoE` over the corresponding expert MLPs.

        Returns:
            WeightEnsemblingMoE: The constructed MoE model (a T5 model whose
            MLP sub-layers are MoE modules).
        """
        base_model = self.modelpool.load_model("_pretrained_")
        expert_models = [
            self.modelpool.load_model(name) for name in self.modelpool.model_names
        ]

        # Merge the models using task arithmetic.
        moe_model = task_arithmetic_merge(
            # This function modifies the model in place, so we need to pass a deepcopy.
            deepcopy(base_model),
            expert_models,
            scaling_factor=self.init_lambda,
        ).requires_grad_(False)

        print(base_model)

        # Up-scale MLP modules.
        # NOTE(review): the layer count and sub-layer indices are hard-coded
        # for flan-t5-base (12 layers; encoder MLP at layer[1], decoder MLP at
        # layer[2] because the decoder has an extra cross-attention sub-layer)
        # — confirm before using with other T5 sizes.
        num_layer = 12
        encoder_mlp_index = 1
        base_encoder = base_model.encoder
        moe_encoder = moe_model.encoder
        expert_encoders = [m.encoder for m in expert_models]

        for layer_idx in range(num_layer):
            base_mlp = (
                base_encoder.block[layer_idx].layer[encoder_mlp_index].DenseReluDense
            )
            expert_mlps = [
                e.block[layer_idx].layer[encoder_mlp_index].DenseReluDense
                for e in expert_encoders
            ]

            moe_encoder.block[layer_idx].layer[encoder_mlp_index].DenseReluDense = (
                WeightEnsemblingMoE(
                    hidden_size=base_encoder.config.hidden_size,
                    base_model=base_mlp,
                    expert_models=expert_mlps,
                    init_lambda=self.init_lambda,
                    batch_first=True,
                    router_hidden_layers=self.router_hidden_layers,
                    batch_reduce=self.batch_reduce,
                )
            )

        decoder_mlp_index = 2
        base_decoder = base_model.decoder
        moe_decoder = moe_model.decoder
        expert_decoders = [m.decoder for m in expert_models]

        for layer_idx in range(num_layer):
            base_mlp = (
                base_decoder.block[layer_idx].layer[decoder_mlp_index].DenseReluDense
            )
            expert_mlps = [
                e.block[layer_idx].layer[decoder_mlp_index].DenseReluDense
                for e in expert_decoders
            ]

            moe_decoder.block[layer_idx].layer[decoder_mlp_index].DenseReluDense = (
                WeightEnsemblingMoE(
                    hidden_size=base_decoder.config.hidden_size,
                    base_model=base_mlp,
                    expert_models=expert_mlps,
                    init_lambda=self.init_lambda,
                    batch_first=True,
                    router_hidden_layers=self.router_hidden_layers,
                    batch_reduce=self.batch_reduce,
                )
            )

        print(moe_model)
        return moe_model

    # NOTE(review): caching an instance method keeps `self` alive for the
    # lifetime of the cache (ruff B019); a per-instance dict keyed by task
    # would avoid that. Despite the annotation, this returns an *iterator*
    # over an infinite loader, not a DataLoader.
    @functools.cache
    def get_shuffled_test_loader_iter(self, task: str) -> DataLoader:
        """
        Build (once per task) an infinite iterator over the shuffled test
        dataset for test-time adaptation. Labels are not needed.

        Args:
            task (str): The name of the task.

        Returns:
            An iterator yielding batches from an infinite, shuffled loader.
        """
        dataset = self.modelpool.load_test_dataset(task)
        log.info("get_shuffled_test_loader_iter")
        loader = DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            collate_fn=default_data_collator,
        )
        # Let Fabric move batches to the right device when it is configured.
        if self.fabric is not None:
            loader = self.fabric.setup_dataloaders(loader)
        return iter(InfiniteDataLoader(loader))

    def compute_logits(
        self,
        module: Union[T5ForConditionalGeneration],
        batch,
        task: str,
    ) -> Tensor:
        """
        Compute first-decoding-step logits for a batch of tokenized inputs.

        Args:
            module: The (possibly Fabric-wrapped) seq2seq model.
            batch: A mapping with ``input_ids`` and ``attention_mask`` tensors.
            task (str): The name of the task (currently unused here).

        Returns:
            Tensor: Logits of shape ``(batch_size, vocab_size)`` for the first
            decoder position.
        """
        input_ids: Tensor = batch["input_ids"]
        attention_mask: Tensor = batch["attention_mask"]

        # Trim trailing all-padding columns to shorten the sequence.
        while attention_mask[:, -1].eq(0).all():
            input_ids = input_ids[:, :-1]
            attention_mask = attention_mask[:, :-1]

        # Decode a single step: feed a length-1 decoder input (token id 1,
        # presumably the decoder start token for T5 — verify against tokenizer).
        outputs = module(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=torch.ones(
                input_ids.size(0), 1, dtype=torch.long, device=input_ids.device
            ),
        )
        logits = outputs.logits[:, 0, :]
        return logits

    def test_time_adaptation(self, module):
        """
        Train the MoE routers by entropy minimization on unlabeled test data.

        Args:
            module (WeightEnsemblingMoE): The MoE module to adapt.

        Returns:
            WeightEnsemblingMoE: The adapted MoE module (Fabric-wrapped).
        """
        self.on_test_time_adaptation_start()

        # Configure optimizer over trainable (router) parameters only.
        if self.optimizer == "adam":
            print([name for name, p in module.named_parameters() if p.requires_grad])
            optimizer = torch.optim.Adam(
                [p for p in module.parameters() if p.requires_grad], lr=self.lr
            )
        else:
            raise ValueError(f"Unsupported optimizer: {self.optimizer}")

        module, optimizer = self.fabric.setup(module, optimizer)

        module.train()
        for step_idx in (
            pbar := tqdm(
                range(self.max_steps if not self.is_debug_mode else 1),
                ("[DEBUG MODE] " if self.is_debug_mode else "")
                + "WEMoE Test-time adaptation",
                dynamic_ncols=True,
            )
        ):
            total_loss = 0
            # One entropy-loss backward pass per task; gradients accumulate
            # across tasks before the single optimizer step below.
            for task in self.modelpool.model_names:
                with self.profile("data loading"):
                    batch = next(self.get_shuffled_test_loader_iter(task))
                with self.profile("forward pass"):
                    logits = self.compute_logits(module, batch, task)
                    logits = logits.mean(dim=0, keepdim=True)
                    loss = entropy_loss(logits)
                    total_loss += loss
                with self.profile("backward pass"):
                    self.fabric.backward(loss, retain_graph=True)

            with self.profile("optimizer step"):
                optimizer.step()
                optimizer.zero_grad()

            metrics = {
                "train/loss": total_loss.item(),
            }
            self.fabric.log_dict(metrics, step=step_idx)
            pbar.set_postfix(metrics)

        log.info(get_memory_usage(f"after adamerging, the memory usage of GPU is:"))
        self.print_profile_summary()
        return module

    def on_test_time_adaptation_start(self):
        """
        Hook called before test-time adaptation starts, e.g. for setting up
        task-specific heads. No-op by default.
        """
        pass

    def run(self, modelpool: Seq2SeqLMPool, **kwargs):
        """
        Fuse the pooled models into a Weight-Ensembling MoE model.

        Constructs the MoE model, then either loads a checkpoint (skipping
        adaptation) or runs test-time adaptation and optionally saves a
        checkpoint. Finally unwraps the Fabric wrapper and enables
        sample-wise routing.

        Args:
            modelpool (Seq2SeqLMPool): The pool of models to be fused.

        Returns:
            WeightEnsemblingMoE: The fused MoE model.
        """
        log.info("Fusing models using layer-wise adaptive merging.")
        self.modelpool = modelpool

        with timeit_context("upscaling models to a weight-ensembling MoE model"):
            moe_model = self.construct_moe_model()
            print_parameters(moe_model)

        if self.checkpoint != False:
            log.info(
                f"load checkpoint from {self.checkpoint}, test-time adaptation will be skipped."
            )
            self.load_checkpoint(moe_model, self.checkpoint)
        else:
            with self.profile("test-time adaptation"):
                moe_model = self.test_time_adaptation(moe_model)
            if self.save_checkpoint != False:
                log.info(f"save checkpoint to {self.save_checkpoint}")
                # NOTE(review): `self.save_checkpoint` is also the config value
                # registered in __init__; this call assumes a method of the
                # same name from a mixin — confirm it is not shadowed.
                self.save_checkpoint(moe_model, self.save_checkpoint)

        if lightning.fabric.wrappers.is_wrapped(moe_model):
            moe_model = lightning.fabric.wrappers._unwrap_objects(moe_model)

        # Enable sample-wise adaptation (disable batch-level reduction).
        moe_model.batch_reduce = False
        self.print_profile_summary()
        return moe_model
import torch


def get_memory_usage(desc: str) -> str:
    """Return a human-readable summary of current GPU memory usage.

    Args:
        desc (str): A description line prepended to the report.

    Returns:
        str: ``desc`` followed by the allocated and cached (reserved)
        CUDA memory in MB. Both values are 0.00 when CUDA is unavailable.
    """
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**2  # bytes -> MB
        cached = torch.cuda.memory_reserved() / 1024**2  # bytes -> MB
    else:
        # No CUDA device: report zeros instead of querying the CUDA runtime.
        allocated = cached = 0.0
    return (
        f"{desc}\nAllocated Memory: {allocated:.2f} MB\nCached Memory: {cached:.2f} MB"
    )
@@ -7,11 +7,11 @@ from transformers import PreTrainedModel
7
7
  from typing_extensions import override
8
8
 
9
9
  from fusion_bench.method import BaseAlgorithm
10
+ from fusion_bench.mixins import auto_register_config
10
11
  from fusion_bench.modelpool import CausalLMPool
11
12
  from fusion_bench.utils import timeit_context
12
13
  from fusion_bench.utils.state_dict_arithmetic import state_dict_add, state_dict_mul
13
14
  from fusion_bench.utils.type import StateDictType
14
- from fusion_bench.mixins import auto_register_config
15
15
 
16
16
  log = logging.getLogger(__name__)
17
17