fusion-bench 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusion_bench/compat/method/__init__.py +1 -0
- fusion_bench/compat/method/base_algorithm.py +0 -1
- fusion_bench/compat/modelpool/__init__.py +2 -1
- fusion_bench/dataset/arc_agi/__init__.py +6 -1
- fusion_bench/dataset/arc_agi/arc.py +21 -7
- fusion_bench/dataset/arc_agi/arc_agi.py +156 -25
- fusion_bench/dataset/arc_agi/np_cache.py +0 -1
- fusion_bench/dataset/arc_agi/preprocess.py +50 -8
- fusion_bench/dataset/llama/collate.py +10 -3
- fusion_bench/method/__init__.py +3 -0
- fusion_bench/method/adamerging/__init__.py +1 -1
- fusion_bench/method/lm_finetune/fullfinetune_sft.py +47 -5
- fusion_bench/method/lm_finetune/peftfinetune_sft.py +58 -23
- fusion_bench/method/pruning/magnitude_diff_pruning.py +2 -1
- fusion_bench/method/rankone_moe/__init__.py +3 -0
- fusion_bench/method/rankone_moe/clip_rankone_moe.py +160 -0
- fusion_bench/method/rankone_moe/rankone_moe.py +249 -0
- fusion_bench/method/simple_average.py +1 -1
- fusion_bench/mixins/clip_classification.py +2 -7
- fusion_bench/mixins/lightning_fabric.py +2 -2
- fusion_bench/models/rankone_moe.py +410 -0
- fusion_bench/taskpool/__init__.py +10 -2
- fusion_bench/taskpool/clip_vision/__init__.py +1 -0
- fusion_bench/taskpool/clip_vision/clip_rankone_moe_taskpool.py +112 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_load_dataset.py +2 -1
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.6.dist-info}/METADATA +1 -1
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.6.dist-info}/RECORD +36 -29
- fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml +4 -4
- fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml +13 -7
- fusion_bench_config/method/rankone_moe/rankone_moe.yaml +26 -0
- fusion_bench_config/method/regmean/clip_regmean.yaml +1 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_rankone_wemoe_clip-vit-classification_TA8.yaml +18 -0
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.6.dist-info}/LICENSE +0 -0
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.6.dist-info}/WHEEL +0 -0
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.6.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.6.dist-info}/top_level.txt +0 -0

fusion_bench/method/lm_finetune/fullfinetune_sft.py
@@ -13,6 +13,7 @@ from omegaconf import DictConfig
 from torch import nn
 from torch.utils.data import ConcatDataset, DataLoader
 from tqdm.auto import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from typing_extensions import TYPE_CHECKING, override

 from fusion_bench import BaseAlgorithm, BaseModelPool
@@ -117,6 +118,9 @@ class FullFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
             self.model.gradient_checkpointing_enable(
                 gradient_checkpointing_kwargs={"use_reentrant": True}
             )
+            self.use_cache = False
+        else:
+            self.use_cache = True
         self.model_dtype = get_dtype(self.model)

     def configure_optimizer(self):
@@ -215,7 +219,12 @@ class FullFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
             # disable gradient synchronization if accumulating gradients across steps for improved performance
             with fabric.no_backward_sync(self.model, enabled=is_accumulating):
                 # use_cache=True is not compatible with gradient checkpointing, so we disable it here
-                output = self.model(
+                output = self.model(
+                    input_ids=batch["input_ids"],
+                    attention_mask=batch["attention_mask"],
+                    labels=batch["labels"],
+                    use_cache=self.use_cache,
+                )
                 loss = output["loss"]

                 fabric.backward(loss)
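Context for the two changes above (this note is not part of the diff): Hugging Face causal LMs cannot build the generation KV cache inside a gradient-checkpointed forward pass, so the trainer now remembers whether checkpointing is on (`self.use_cache`) and passes the flag explicitly instead of relying on the model default. A minimal, self-contained sketch of the same interaction using a tiny randomly initialized GPT-2; the model size and batch are illustrative only:

```python
import torch
from transformers import GPT2Config, GPT2LMHeadModel

# Tiny randomly initialized causal LM, enough to show the interaction.
model = GPT2LMHeadModel(GPT2Config(n_layer=2, n_head=2, n_embd=64, vocab_size=128))

gradient_checkpointing = True
if gradient_checkpointing:
    model.gradient_checkpointing_enable(
        gradient_checkpointing_kwargs={"use_reentrant": True}
    )
    use_cache = False  # the KV cache cannot be reused across checkpointed segments
else:
    use_cache = True

model.train()
input_ids = torch.randint(0, 128, (2, 16))
output = model(
    input_ids=input_ids,
    attention_mask=torch.ones_like(input_ids),
    labels=input_ids,
    use_cache=use_cache,
)
output.loss.backward()
print(float(output.loss))
```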
@@ -252,7 +261,7 @@ class FullFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
             ):
                 break
             # break if max_steps is set, and exit training
-            if self.max_steps > 0 and self.global_step_idx >= self.max_steps:
+            if self.max_steps > 0 and self.global_step_idx >= self.max_steps - 1:
                 self.is_training = False
                 break

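The stopping condition moves from `>= self.max_steps` to `>= self.max_steps - 1`. Assuming the check follows the step's work, as the surrounding context suggests, the old condition executed one extra optimizer step because `global_step_idx` is zero-based. A standalone illustration of the counting (the loop body stands in for a training step):

```python
def steps_executed(max_steps: int, threshold: int) -> int:
    executed = 0
    for global_step_idx in range(10_000):
        executed += 1  # the training step happens before the check
        if max_steps > 0 and global_step_idx >= threshold:
            break
    return executed

print(steps_executed(10, threshold=10))      # old check: 11 steps
print(steps_executed(10, threshold=10 - 1))  # new check: 10 steps
```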
@@ -328,14 +337,15 @@ class FullFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
                         "checkpoints",
                         "latest_model.ckpt",
                     ),
-                    os.path.join(
+                    dst := os.path.join(
                         self.log_dir,
                         "checkpoints",
                         f"epoch={self.epoch_idx}_step={self.global_step_idx}.ckpt",
                     ),
+                    target_is_directory=os.path.isdir(dst),
                 )
             except Exception as e:
-
+                log.error(f"Failed to create symlink: {e}")
         else:
             raise ValueError(
                 f"Unknown stage: {stage}. Available options: 'end_of_step', 'end_of_epoch', 'end_of_training'"
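The `latest_model.ckpt` symlink now records its target with the walrus operator so `target_is_directory` can be set from `os.path.isdir(dst)` (the flag only matters on Windows, and sharded FSDP checkpoints can be directories rather than single files). The same pattern in isolation, written with `os.symlink`'s (target, link) argument order; the paths are made up for the example:

```python
import os
import tempfile

with tempfile.TemporaryDirectory() as log_dir:
    ckpt_dir = os.path.join(log_dir, "checkpoints")
    # Pretend the checkpoint is a directory, as sharded FSDP checkpoints are.
    os.makedirs(os.path.join(ckpt_dir, "epoch=0_step=42.ckpt"))

    os.symlink(
        # := names the target so we can inspect it in the next argument
        dst := os.path.join(ckpt_dir, "epoch=0_step=42.ckpt"),
        os.path.join(ckpt_dir, "latest_model.ckpt"),
        target_is_directory=os.path.isdir(dst),
    )
    print(os.readlink(os.path.join(ckpt_dir, "latest_model.ckpt")))
```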
@@ -364,8 +374,15 @@ class FullFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
                 }
             )

+        trainable_param_names = set(
+            name
+            for name, param in self.model.state_dict(keep_vars=True).items()
+            if param.requires_grad
+        )
         filter = (
-            None
+            None
+            if self.save_full_model
+            else {"model": lambda k, p: k in trainable_param_names}
         )

         fabric.save(path, state=state, filter=filter)
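The `filter` argument of `lightning.fabric.Fabric.save` (available in recent Lightning releases) maps a state key to a predicate over (parameter name, value); here it keeps only parameters with `requires_grad=True` unless `save_full_model` is set. A minimal sketch with a toy module; the module, file name, and frozen layer are illustrative:

```python
import torch
from torch import nn
from lightning.fabric import Fabric

fabric = Fabric(accelerator="cpu", devices=1)
fabric.launch()

model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))
model[0].requires_grad_(False)  # pretend the first layer is frozen

trainable_param_names = {
    name
    for name, param in model.state_dict(keep_vars=True).items()
    if param.requires_grad
}
fabric.save(
    "trainable_only.ckpt",
    {"model": model},
    filter={"model": lambda k, p: k in trainable_param_names},
)
# Only the unfrozen layer's tensors are written out.
print(torch.load("trainable_only.ckpt")["model"].keys())
```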
@@ -401,3 +418,28 @@ def load_checkpoint(
     state = {"model": model}
     state.update(state_components)
     fabric.load(ckpt_path, state=state, strict=strict)
+
+
+if __name__ == "__main__":
+    # convert a checkpoint to hf format
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base_model_path", type=str)
+    parser.add_argument("--ckpt_path", type=str)
+    parser.add_argument("--output_path", type=str)
+
+    args = parser.parse_args()
+
+    fabric = L.Fabric(devices=1, strategy="fsdp")
+    fabric.launch()
+
+    tokenizer = AutoTokenizer.from_pretrained(args.base_model_path)
+    tokenizer.save_pretrained(args.output_path)
+
+    model = AutoModelForCausalLM.from_pretrained(
+        args.base_model_path, torch_dtype=torch.bfloat16
+    )
+    model = fabric.setup_module(model)
+    load_checkpoint(fabric, args.ckpt_path, model=model, strict=True)
+    model.save_pretrained(args.output_path)
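The new `__main__` block turns this module into a small CLI for exporting a Fabric/FSDP checkpoint back to a Hugging Face directory: it saves the tokenizer, re-creates the base model in bfloat16, restores the trained weights with `load_checkpoint`, and writes everything out with `save_pretrained`. It relies on `torch` and `lightning` (imported as `L`) already being available at module scope, and would be invoked roughly as `python -m fusion_bench.method.lm_finetune.fullfinetune_sft --base_model_path ... --ckpt_path ... --output_path ...` (this invocation is an assumption, not documented in the diff).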

fusion_bench/method/lm_finetune/peftfinetune_sft.py
@@ -10,10 +10,10 @@ import peft
 import torch
 from lightning.fabric.strategies.fsdp import FSDPStrategy
 from lightning.fabric.utilities import rank_zero_only
-from omegaconf import DictConfig
+from omegaconf import DictConfig, OmegaConf
 from peft import PeftModel, get_peft_config, get_peft_model
 from torch import nn
-from torch.utils.data import
+from torch.utils.data import ConcatDataset, DataLoader
 from tqdm.auto import tqdm
 from typing_extensions import TYPE_CHECKING, override

@@ -65,7 +65,9 @@ class PeftFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
         gradient_clip_algorithm: Literal["value", "norm"] = "norm",
         save_optimizer_state: bool = False,
         save_full_model: bool = False,
+        save_ckpt_type: Literal["lightning", "peft"] = "peft",
         ckpt_path: Optional[str] = None,
+        max_length: int = 6150,
         **kwargs,
     ):
         """
@@ -90,6 +92,7 @@ class PeftFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
             gradient_clip_algorithm(str): Algorithm to use for gradient clipping. Available options: 'value', 'norm'. If set to 'value', the gradients will be clipped to the specified value. If set to 'norm', the gradients will be clipped to the specified norm.
             save_optimizer_state(bool): Whether to save the optimizer and lr_scheduler state along with the model checkpoint.
             save_full_model(bool): Whether to save the full model or only the trainable parameters in the model checkpoint.
+            save_ckpt_type(str): Type of checkpoint to save. Available options: 'lightning', 'peft'. If set to 'lightning', the model will be saved using the Lightning checkpointing mechanism. If set to 'peft', the model will be saved using the PEFT checkpointing mechanism.
             ckpt_path(str): Path to the checkpoint to load before training. If set to None, no checkpoint will be loaded.
         """
         self._optimizer = optimizer
@@ -110,7 +113,9 @@ class PeftFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
         self.gradient_clip_algorithm = gradient_clip_algorithm
         self.save_optimizer_state = save_optimizer_state
         self.save_full_model = save_full_model
+        self.save_ckpt_type = save_ckpt_type
         self.ckpt_path = ckpt_path
+        self.max_length = max_length
         super().__init__(**kwargs)

     def run(self, modelpool: CausalLMPool):
@@ -126,7 +131,7 @@ class PeftFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
         model = self.modelpool.load_pretrained_model()

         # get the PEFT model
-        peft_config =
+        peft_config = instantiate(self._peft_config, _convert_="all")
         peft_model = get_peft_model(model, peft_config, self.adapter_name)
         peft_model.print_trainable_parameters()

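`instantiate(self._peft_config, _convert_="all")` follows the Hydra convention: the config node names a `_target_` class plus its keyword arguments, and `_convert_="all"` turns nested OmegaConf containers into plain Python objects before the call. An illustrative sketch only; this LoRA config is made up (the real one lives under fusion_bench_config/method/lm_finetune/), and `instantiate` is assumed to be Hydra-compatible:

```python
from hydra.utils import instantiate
from omegaconf import OmegaConf

# Hypothetical PEFT config node for illustration.
cfg = OmegaConf.create(
    {
        "_target_": "peft.LoraConfig",
        "r": 16,
        "lora_alpha": 32,
        "lora_dropout": 0.05,
        "target_modules": ["q_proj", "v_proj"],
        "task_type": "CAUSAL_LM",
    }
)
peft_config = instantiate(cfg, _convert_="all")
print(type(peft_config).__name__, peft_config.r)
```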
@@ -139,6 +144,10 @@ class PeftFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
             self.model.gradient_checkpointing_enable(
                 gradient_checkpointing_kwargs={"use_reentrant": True}
             )
+            self.use_cache = False
+        else:
+            self.use_cache = True
+
         self.model_dtype = get_dtype(self.model)

     def configure_optimizer(self):
@@ -234,10 +243,22 @@ class PeftFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
         ):
             is_accumulating = (step_idx + 1) % self.accumulate_grad_batches != 0

+            if self.max_length > 0 and batch["input_ids"].shape[1] > self.max_length:
+                log.warning(
+                    f"Input length exceeds max_length: {batch['input_ids'].shape[1]} > {self.max_length}. Truncating input."
+                )
+                batch["input_ids"] = batch["input_ids"][:, : self.max_length]
+                batch["attention_mask"] = batch["attention_mask"][:, : self.max_length]
+                batch["labels"] = batch["labels"][:, : self.max_length]
             # disable gradient synchronization if accumulating gradients across steps for improved performance
             with fabric.no_backward_sync(self.model, enabled=is_accumulating):
                 # use_cache=True is not compatible with gradient checkpointing, so we disable it here
-                output = self.model(
+                output = self.model(
+                    input_ids=batch["input_ids"],
+                    attention_mask=batch["attention_mask"],
+                    labels=batch["labels"],
+                    use_cache=self.use_cache,
+                )
                 loss = output["loss"]

                 fabric.backward(loss)
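The new `max_length` guard (default 6150) truncates over-long batches, presumably to bound activation memory; `input_ids`, `attention_mask`, and `labels` are sliced together so they stay aligned. The same slicing on a toy batch:

```python
import torch

max_length = 8
batch = {
    "input_ids": torch.arange(24).reshape(2, 12),
    "attention_mask": torch.ones(2, 12, dtype=torch.long),
    "labels": torch.arange(24).reshape(2, 12),
}
if max_length > 0 and batch["input_ids"].shape[1] > max_length:
    for key in ("input_ids", "attention_mask", "labels"):
        batch[key] = batch[key][:, :max_length]
print({k: tuple(v.shape) for k, v in batch.items()})  # every tensor is now (2, 8)
```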
@@ -274,7 +295,7 @@ class PeftFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
             ):
                 break
             # break if max_steps is set, and exit training
-            if self.max_steps > 0 and self.global_step_idx >= self.max_steps:
+            if self.max_steps > 0 and self.global_step_idx >= self.max_steps - 1:
                 self.is_training = False
                 break

@@ -350,14 +371,15 @@ class PeftFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
                         "checkpoints",
                         "latest_model.ckpt",
                     ),
-                    os.path.join(
+                    dst := os.path.join(
                         self.log_dir,
                         "checkpoints",
                         f"epoch={self.epoch_idx}_step={self.global_step_idx}.ckpt",
                     ),
+                    target_is_directory=os.path.isdir(dst),
                 )
             except Exception as e:
-
+                log.error(f"Failed to create symlink: {e}")
         else:
             raise ValueError(
                 f"Unknown stage: {stage}. Available options: 'end_of_step', 'end_of_epoch', 'end_of_training'"
@@ -373,24 +395,37 @@ class PeftFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
             return log.warning(f"Checkpoint already exists at {path}. Skipping save.")

         fabric = self.fabric
-
-
-
-
-
-
-
-
-
-
-
+        if self.save_ckpt_type == "lightning":
+            state = {"model": self.model}
+
+            # save the optimizer and lr_scheduler state if needed
+            if self.save_optimizer_state and save_optimizer_state is not False:
+                state.update(
+                    {
+                        "optimizer": self.optimizer,
+                        "lr_scheduler": self.lr_scheduler,
+                        "global_step_idx": self.global_step_idx,
+                        "epoch_idx": self.epoch_idx,
+                    }
+                )
+            trainable_param_names = set(
+                name
+                for name, param in self.model.state_dict(keep_vars=True).items()
+                if param.requires_grad
+            )
+            filter = (
+                None
+                if self.save_full_model
+                else {"model": lambda k, p: k in trainable_param_names}
             )

-
-
-
-
-
+            fabric.save(path, state=state, filter=filter)
+        elif self.save_ckpt_type == "peft":
+            self.model.save_pretrained(path, is_main_process=fabric.is_global_zero)
+        else:
+            raise ValueError(
+                f"Unknown save_ckpt_type: {self.save_ckpt_type}. Available options: 'lightning', 'peft'"
+            )
         self._latest_saved_checkpoint_global_step = self.global_step_idx

     def load_checkpoint(self, path: Union[str, Path]):
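With `save_ckpt_type`, the trainer can either keep the previous Fabric checkpoint format ('lightning', reloadable via `fabric.load`) or write a standard PEFT adapter directory via `save_pretrained` ('peft', the new default). A hedged sketch of how a 'peft'-style checkpoint is typically reloaded; the paths are placeholders, not from the diff:

```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("path/to/base_model")         # placeholder
model = PeftModel.from_pretrained(base, "path/to/saved_peft_checkpoint")  # placeholder
model = model.merge_and_unload()  # optionally fold the adapter into the base weights
```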

fusion_bench/method/pruning/magnitude_diff_pruning.py
@@ -1,3 +1,4 @@
+import functools
 import logging
 import re
 from copy import deepcopy
@@ -10,7 +11,7 @@ from tqdm.auto import tqdm
 from fusion_bench.method import BaseAlgorithm
 from fusion_bench.mixins.simple_profiler import SimpleProfilerMixin
 from fusion_bench.modelpool import BaseModelPool
-
+
 from .prune_utils import unstructured_magnitude_prune_

 log = logging.getLogger(__name__)

fusion_bench/method/rankone_moe/clip_rankone_moe.py (new file)
@@ -0,0 +1,160 @@
+import functools
+import logging
+import os
+from copy import deepcopy
+
+import torch
+from torch import Tensor
+from torch.utils.data import DataLoader
+from transformers.models.clip.modeling_clip import CLIPEncoder
+
+from fusion_bench.dataset import CLIPDataset
+from fusion_bench.method.task_arithmetic.task_arithmetic import task_arithmetic_merge
+from fusion_bench.mixins import CLIPClassificationMixin
+from fusion_bench.modelpool import CLIPVisionModelPool
+from fusion_bench.models.rankone_moe import RankOneMoE
+from fusion_bench.utils.data import InfiniteDataLoader
+
+from .rankone_moe import RankOneMoEAlgorithm
+
+log = logging.getLogger(__name__)
+
+
+class CLIPRankOneMoEAlgorithm(
+    RankOneMoEAlgorithm,
+    CLIPClassificationMixin,
+):
+    """
+    CLIPRankOneMoEAlgorithm is a class that implements the RankOneMoEAlgorithm (https://github.com/EnnengYang/RankOne-MoE)
+    for CLIP models. It extends the RankOneMoEAlgorithm and CLIPClassificationMixin classes.
+
+    Attributes:
+        modelpool (CLIPVisionModelPool): The model pool containing the CLIP models.
+    """
+
+    modelpool: CLIPVisionModelPool = None
+
+    def load_checkpoint(self, model, checkpoint):
+        """
+        Load the checkpoint file.
+
+        Args:
+            model: The model to load the checkpoint into.
+            checkpoint: The path to the checkpoint file.
+        """
+        state = {"model": model}
+        self._fabric.load(checkpoint, state)
+
+    def save_checkpoint(self, model, checkpoint):
+        """
+        Save the checkpoint file.
+
+        Args:
+            model: The model to save the checkpoint from.
+            checkpoint: The path to the checkpoint file.
+        """
+        self._fabric.save(checkpoint, {"model": model})
+
+    def construct_moe_model(self) -> RankOneMoE:
+        """
+        Construct the RankOne-MoE model using the models in the model pool.
+
+        Returns:
+            RankOne-MoE: The constructed MoE model.
+        """
+        base_model = self.modelpool.load_model("_pretrained_")
+        expert_models = [
+            self.modelpool.load_model(m) for m in self.modelpool.model_names
+        ]
+
+        # Merge the models using task arithmetic
+        moe_model = task_arithmetic_merge(
+            # This function modifies the model in place, so we need to pass a deepcopy
+            deepcopy(base_model),
+            expert_models,
+            scaling_factor=self.config.init_lambda,
+        ).requires_grad_(False)
+
+        # Up-scale MLP modules
+        base_encoder: CLIPEncoder = base_model.vision_model.encoder
+        moe_encoder: CLIPEncoder = moe_model.vision_model.encoder
+        expert_encoders = [m.vision_model.encoder for m in expert_models]
+
+        num_layers = len(base_encoder.layers)
+        for layer_idx in range(num_layers):
+            base_mlp = base_encoder.layers[layer_idx].mlp
+            expert_mlps = [e.layers[layer_idx].mlp for e in expert_encoders]
+
+            moe_encoder.layers[layer_idx].mlp = RankOneMoE(
+                hidden_size=base_encoder.config.hidden_size,
+                base_model=base_mlp,
+                expert_models=expert_mlps,
+                init_lambda=self.config.init_lambda,
+                batch_first=True,  # For open_clip models this is False
+                router_hidden_layers=self.config.router_hidden_layers,
+                batch_reduce=self.config.batch_reduce,
+                svd_accelerator=self.config.svd_accelerator,
+                rank_k=self.config.rank_k,
+                select_k=self.config.select_k,
+            )
+
+        return moe_model
+
+    @functools.cache
+    def get_shuffled_test_loader_iter(self, tta_dataset: str):
+        """
+        Get an iterator for the shuffled test data loader.
+
+        Args:
+            tta_dataset (str): The name of the test-time adaptation dataset.
+
+        Returns:
+            Iterator: An iterator for the shuffled test data loader.
+        """
+        dataset = self.modelpool.load_test_dataset(tta_dataset)
+        dataset = CLIPDataset(dataset, processor=self.clip_processor)
+        log.info("get_shuffled_test_loader_iter")
+        loader = DataLoader(
+            dataset,
+            batch_size=self.config.batch_size,
+            shuffle=True,
+            num_workers=self.config.num_workers,
+            pin_memory=True,
+        )
+        loader = self.fabric.setup_dataloaders(loader)
+        return iter(InfiniteDataLoader(loader))
+
+    def on_test_time_adaptation_start(self):
+        """
+        Load the CLIP processor and construct the zero-shot classification head for each task.
+        """
+        self.setup_zero_shot_classification_head()
+
+    def compute_logits(self, module, batch, task) -> Tensor:
+        """
+        Compute the logits for the given batch and task.
+
+        Args:
+            module: The model module.
+            batch: The input batch.
+            task: The task name.
+
+        Returns:
+            Tensor: The computed logits.
+        """
+        images, _ = batch
+        text_embeds = self.zeroshot_weights[task]
+
+        image_embeds = module(images)[1]
+        image_embeds = self.visual_projection(image_embeds)
+
+        # Normalize embeddings
+        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+
+        # Cosine similarity
+        logits_per_text = (
+            torch.matmul(text_embeds, image_embeds.t()) * self.logit_scale_exp
+        )
+        logits_per_image = logits_per_text.t()
+
+        return logits_per_image