fusion-bench 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- fusion_bench/compat/method/__init__.py +2 -0
- fusion_bench/compat/method/base_algorithm.py +7 -2
- fusion_bench/compat/modelpool/__init__.py +3 -2
- fusion_bench/compat/taskpool/__init__.py +1 -1
- fusion_bench/dataset/arc_agi/__init__.py +6 -1
- fusion_bench/dataset/arc_agi/arc.py +26 -7
- fusion_bench/dataset/arc_agi/arc_agi.py +156 -25
- fusion_bench/dataset/arc_agi/np_cache.py +0 -1
- fusion_bench/dataset/arc_agi/preprocess.py +51 -9
- fusion_bench/dataset/llama/__init__.py +1 -0
- fusion_bench/dataset/llama/alpaca.py +93 -3
- fusion_bench/dataset/llama/collate.py +72 -5
- fusion_bench/dataset/llama/metamathqa.py +50 -0
- fusion_bench/dataset/llama/preference_700k.py +70 -0
- fusion_bench/dataset/llama/stanford_shp.py +90 -0
- fusion_bench/dataset/llama/ultrachat.py +58 -0
- fusion_bench/dataset/llama/utils/__init__.py +0 -0
- fusion_bench/method/__init__.py +4 -1
- fusion_bench/method/adamerging/__init__.py +1 -1
- fusion_bench/method/adamerging/layer_wise_adamerging.py +11 -4
- fusion_bench/method/adamerging/min_norm_solvers.py +4 -4
- fusion_bench/method/linear/expo.py +39 -0
- fusion_bench/method/lm_finetune/__init__.py +1 -0
- fusion_bench/method/lm_finetune/bradley_terry_rm.py +432 -0
- fusion_bench/method/lm_finetune/fullfinetune_sft.py +122 -150
- fusion_bench/method/lm_finetune/peftfinetune_sft.py +102 -157
- fusion_bench/method/pruning/llama_magnitude_prune.py +2 -2
- fusion_bench/method/pruning/llama_random_prune.py +2 -2
- fusion_bench/method/pruning/magnitude_diff_pruning.py +2 -1
- fusion_bench/method/rankone_moe/__init__.py +3 -0
- fusion_bench/method/rankone_moe/clip_rankone_moe.py +160 -0
- fusion_bench/method/rankone_moe/rankone_moe.py +249 -0
- fusion_bench/method/simple_average.py +1 -1
- fusion_bench/method/surgery/__init__.py +3 -0
- fusion_bench/method/surgery/clip_layer_wise_adamerging_surgery.py +157 -0
- fusion_bench/mixins/__init__.py +2 -0
- fusion_bench/mixins/clip_classification.py +60 -12
- fusion_bench/mixins/fabric_training.py +320 -0
- fusion_bench/mixins/lightning_fabric.py +11 -2
- fusion_bench/modelpool/__init__.py +2 -0
- fusion_bench/modelpool/causal_lm/__init__.py +1 -1
- fusion_bench/modelpool/causal_lm/causal_lm.py +21 -22
- fusion_bench/modelpool/seq_classification_lm/__init__.py +2 -0
- fusion_bench/modelpool/seq_classification_lm/reward_model.py +15 -0
- fusion_bench/modelpool/seq_classification_lm/seq_classification_lm.py +98 -0
- fusion_bench/models/chat_templates/__init__.py +1 -0
- fusion_bench/models/chat_templates/llama_3_Instruct.py +1 -0
- fusion_bench/models/chat_templates/load_tokenizer.py +43 -0
- fusion_bench/models/hf_clip.py +50 -9
- fusion_bench/models/rankone_moe.py +410 -0
- fusion_bench/models/surgery/surgerymodelwrapper.py +157 -0
- fusion_bench/models/utils.py +8 -0
- fusion_bench/models/wrappers/layer_wise_fusion.py +14 -5
- fusion_bench/models/wrappers/task_wise_fusion.py +5 -5
- fusion_bench/optim/__init__.py +2 -0
- fusion_bench/optim/exception.py +47 -0
- fusion_bench/optim/lr_scheduler/__init__.py +1 -0
- fusion_bench/optim/lr_scheduler/linear_warmup.py +222 -0
- fusion_bench/optim/lr_scheduler/utils/__init__.py +1 -0
- fusion_bench/optim/lr_scheduler/utils/visualization.py +119 -0
- fusion_bench/optim/mezo.py +0 -2
- fusion_bench/programs/fabric_fusion_program.py +5 -1
- fusion_bench/taskpool/__init__.py +10 -2
- fusion_bench/taskpool/clip_vision/__init__.py +1 -0
- fusion_bench/taskpool/clip_vision/clip_rankone_moe_taskpool.py +112 -0
- fusion_bench/taskpool/clip_vision/taskpool.py +43 -6
- fusion_bench/taskpool/llama/reward_model.py +157 -0
- fusion_bench/taskpool/nyuv2_taskpool.py +2 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_load_dataset.py +2 -1
- fusion_bench/utils/hydra_utils.py +22 -0
- fusion_bench/utils/plot/__init__.py +0 -0
- fusion_bench/utils/plot/token.py +52 -0
- fusion_bench/utils/plot/token_notebook.py +127 -0
- fusion_bench/utils/type.py +5 -3
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/METADATA +1 -1
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/RECORD +104 -57
- fusion_bench_config/clip-vit-base-patch32_robustness_corrupted.yaml +1 -1
- fusion_bench_config/dataset/llm_sft/alpaca_cleaned.yaml +6 -0
- fusion_bench_config/dataset/llm_sft/ultrachat_200k.yaml +3 -0
- fusion_bench_config/fabric/llama_peft_fsdp.yaml +16 -0
- fusion_bench_config/fabric/loggers/wandb_logger.yaml +2 -0
- fusion_bench_config/fabric/strategy/deepspeed.yaml +10 -0
- fusion_bench_config/fabric/strategy/llama_peft_fsdp.yaml +9 -0
- fusion_bench_config/fabric_model_fusion.yaml +1 -1
- fusion_bench_config/llama_full_finetune.yaml +19 -0
- fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml +47 -0
- fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml +13 -6
- fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml +17 -9
- fusion_bench_config/method/rankone_moe/rankone_moe.yaml +26 -0
- fusion_bench_config/method/regmean/clip_regmean.yaml +1 -0
- fusion_bench_config/method/surgery/adamerging_surgery.yaml +27 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_codealpaca.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_metamathqa.yaml +19 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_ultrachat.yaml +18 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/llama_preference700k.yaml +23 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/single_reward_model.yaml +14 -0
- fusion_bench_config/nyuv2_config.yaml +5 -1
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_rankone_wemoe_clip-vit-classification_TA8.yaml +18 -0
- fusion_bench_config/taskpool/reward_model_evaluation.yaml +18 -0
- fusion_bench_config/llama_weighted_average.yaml +0 -26
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/LICENSE +0 -0
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/WHEEL +0 -0
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/entry_points.txt +0 -0
- {fusion_bench-0.2.5.dist-info → fusion_bench-0.2.7.dist-info}/top_level.txt +0 -0
fusion_bench/method/lm_finetune/fullfinetune_sft.py

@@ -1,3 +1,4 @@
+import functools
 import itertools
 import logging
 import os
@@ -13,11 +14,12 @@ from omegaconf import DictConfig
 from torch import nn
 from torch.utils.data import ConcatDataset, DataLoader
 from tqdm.auto import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from typing_extensions import TYPE_CHECKING, override
 
 from fusion_bench import BaseAlgorithm, BaseModelPool
 from fusion_bench.dataset.llama.collate import padded_collate_sft
-from fusion_bench.mixins import LightningFabricMixin
+from fusion_bench.mixins import FabricTrainingMixin
 from fusion_bench.modelpool import CausalLMPool
 from fusion_bench.utils import instantiate
 from fusion_bench.utils.dtype import get_dtype
@@ -33,7 +35,7 @@ if TYPE_CHECKING:
 log = logging.getLogger(__name__)
 
 
-class FullFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
+class FullFinetuneSFT(BaseAlgorithm, FabricTrainingMixin):
 
     model: Union[nn.Module, "_FabricModule", "LlamaForCausalLM"]
     optimizer: Union[torch.optim.Optimizer, "_FabricOptimizer"]
@@ -58,7 +60,10 @@ class FullFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
         gradient_clip_algorithm: Literal["value", "norm"] = "norm",
         save_optimizer_state: bool = False,
         save_full_model: bool = False,
+        save_ckpt_type: Literal["lightning", "hf"] = "lightning",
         ckpt_path: Optional[str] = None,
+        max_length: int = 6144,
+        fix_token_embedding: bool = True,
         **kwargs,
     ):
         """
@@ -80,7 +85,10 @@ class FullFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
             gradient_clip_algorithm(str): Algorithm to use for gradient clipping. Available options: 'value', 'norm'. If set to 'value', the gradients will be clipped to the specified value. If set to 'norm', the gradients will be clipped to the specified norm.
             save_optimizer_state(bool): Whether to save the optimizer and lr_scheduler state along with the model checkpoint.
             save_full_model(bool): Whether to save the full model or only the trainable parameters in the model checkpoint.
+            save_ckpt_type (str): Type of checkpoint to save. Available options: 'lightning', 'hf'. If set to 'lightning', the checkpoint will be saved in the lightning format. If set to 'hf', the checkpoint will be saved in the huggingface format.
             ckpt_path(str): Path to the checkpoint to load before training. If set to None, no checkpoint will be loaded.
+            max_length(int): Maximum input length to consider. If the input length exceeds this value, it will be truncated.
+            fix_token_embedding(bool): Whether to fix the token embeddings during training. If set to True, the token embeddings will not be updated during training.
         """
         self._optimizer = optimizer
         self._lr_scheduler = lr_scheduler
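For reference, a sketch of overriding just the options added in 0.2.7 when constructing the algorithm; the remaining `FullFinetuneSFT` arguments (optimizer, lr_scheduler, dataloader settings, and so on) are assumed to be supplied as before and are not shown here. Illustrative only, not taken from the package:

# Defaults shown in the signature above: save_ckpt_type="lightning",
# max_length=6144, fix_token_embedding=True.
new_finetune_options = dict(
    save_ckpt_type="hf",        # write HuggingFace-format checkpoints instead of Lightning-format ones
    max_length=4096,            # truncate any batch whose sequence length exceeds 4096 tokens
    fix_token_embedding=False,  # allow the token embedding matrix to be updated during training
)
# e.g. FullFinetuneSFT(..., **new_finetune_options) alongside the existing arguments.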
@@ -97,18 +105,28 @@ class FullFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
         self.gradient_clip_algorithm = gradient_clip_algorithm
         self.save_optimizer_state = save_optimizer_state
         self.save_full_model = save_full_model
+        self.save_ckpt_type = save_ckpt_type
         self.ckpt_path = ckpt_path
+        self.max_length = max_length
+        self.fix_token_embedding = fix_token_embedding
         super().__init__(**kwargs)
 
     def run(self, modelpool: CausalLMPool):
         self.modelpool = modelpool
         self.setup()
-        self.train()
+        self.train(self.model, self.optimizer, self.lr_scheduler)
         return self.model
 
     def setup_model(self):
+        self.tokenizer = self.modelpool.load_tokenizer()
+        if self.tokenizer.pad_token_id is None:
+            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+
         model = self.modelpool.load_pretrained_model()
-        self.model = model
+        self.model: "LlamaForCausalLM" = model
+
+        if self.fix_token_embedding:
+            self.model.model.embed_tokens.requires_grad_(False)
 
         if self.fabric.strategy == "fsdp" or isinstance(
             self.fabric.strategy, FSDPStrategy
@@ -117,21 +135,14 @@ class FullFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
             self.model.gradient_checkpointing_enable(
                 gradient_checkpointing_kwargs={"use_reentrant": True}
             )
+            self.use_cache = False
+        else:
+            self.use_cache = True
         self.model_dtype = get_dtype(self.model)
 
     def configure_optimizer(self):
         # compute expected total steps
-        self.expected_total_steps = []
-        if self.max_steps > 0:
-            self.expected_total_steps.append(self.max_steps)
-        if self.max_steps_per_epoch > 0 and self.max_epochs > 0:
-            self.expected_total_steps.append(self.max_steps_per_epoch * self.max_epochs)
-        if self.max_epochs > 0:
-            self.expected_total_steps.append(
-                len(self.train_dataloader) * self.max_epochs
-            )
-        self.expected_total_steps = min(self.expected_total_steps)
-        log.info(f"Expected total steps: {self.expected_total_steps}")
+        self.compute_expected_total_steps(self.train_dataloader)
 
         optimizer = instantiate(self._optimizer, self.model.parameters())
         if self._lr_scheduler is not None:
@@ -170,7 +181,9 @@ class FullFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
             train_dataset,
             **self.dataloader_kwargs,
             shuffle=True,
-            collate_fn=padded_collate_sft,
+            collate_fn=functools.partial(
+                padded_collate_sft, pad_token_id=self.tokenizer.pad_token_id
+            ),
         )
         self.train_dataloader = fabric.setup_dataloaders(self.train_dataloader)
 
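Binding `pad_token_id` into the collate function is needed because the pad token is now taken from the tokenizer (falling back to `eos_token_id` in `setup_model`). As a point of reference, a padded SFT collate function of this shape typically right-pads `input_ids` with the pad token, `attention_mask` with 0, and `labels` with -100 so padding positions are ignored by the loss. A minimal illustrative sketch, not the actual `padded_collate_sft` implementation:

import torch
from torch.nn.utils.rnn import pad_sequence

def collate_sft_example(samples, pad_token_id, ignore_index=-100):
    # Right-pad each field of the batch to the longest sequence in the batch.
    input_ids = pad_sequence(
        [torch.as_tensor(s["input_ids"]) for s in samples],
        batch_first=True,
        padding_value=pad_token_id,
    )
    attention_mask = pad_sequence(
        [torch.ones(len(s["input_ids"]), dtype=torch.long) for s in samples],
        batch_first=True,
        padding_value=0,  # padded positions are masked out
    )
    labels = pad_sequence(
        [torch.as_tensor(s["labels"]) for s in samples],
        batch_first=True,
        padding_value=ignore_index,  # -100 is ignored by the cross-entropy loss
    )
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}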
@@ -186,25 +199,15 @@ class FullFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
         self.model, self.optimizer = fabric.setup(self.model, optimizer)
         self.lr_scheduler = lr_scheduler
 
-
+    @override
+    def train_epoch(self, *args, **kwargs):
         fabric = self.fabric
 
-
-        if self.gradient_clip_algorithm == "value":
-            fabric.clip_gradients(self.model, clip_val=self.gradient_clip_val)
-        elif self.gradient_clip_algorithm == "norm":
-            fabric.clip_gradients(self.model, max_norm=self.gradient_clip_val)
-        else:
-            raise ValueError(
-                f"Unknown gradient clip algorithm: {self.gradient_clip_algorithm}. Available options: 'value', 'norm'"
-            )
-
-    def train_epoch(self):
-        fabric = self.fabric
+        accumulated_loss = 0
         for step_idx, batch in enumerate(
             pbar := tqdm(
                 self.train_dataloader,
-                desc="Training
+                desc="Training Batches",
                 dynamic_ncols=True,
                 leave=False,
                 disable=not fabric.is_global_zero,
@@ -212,24 +215,30 @@ class FullFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
         ):
             is_accumulating = (step_idx + 1) % self.accumulate_grad_batches != 0
 
+            if self.max_length > 0 and batch["input_ids"].shape[1] > self.max_length:
+                log.warning(
+                    f"Input length exceeds max_length: {batch['input_ids'].shape[1]} > {self.max_length}. Truncating input."
+                )
+                batch["input_ids"] = batch["input_ids"][:, : self.max_length]
+                batch["attention_mask"] = batch["attention_mask"][:, : self.max_length]
+                batch["labels"] = batch["labels"][:, : self.max_length]
+
             # disable gradient synchronization if accumulating gradients across steps for improved performance
             with fabric.no_backward_sync(self.model, enabled=is_accumulating):
                 # use_cache=True is not compatible with gradient checkpointing, so we disable it here
-                output = self.model(
-
+                output = self.model(
+                    input_ids=batch["input_ids"],
+                    attention_mask=batch["attention_mask"],
+                    labels=batch["labels"],
+                    use_cache=self.use_cache,
+                )
+                loss = output["loss"] / self.accumulate_grad_batches
 
                 fabric.backward(loss)
-
-                metrics = {
-                    "train/loss": loss.item(),
-                    "train/epoch_idx": self.epoch_idx,
-                    "train/lr": self.optimizer.param_groups[0]["lr"],
-                }
-                fabric.log_dict(metrics, step=self.global_step_idx)
-                pbar.set_postfix(metrics)
+                accumulated_loss += loss.item()
 
             if not is_accumulating:
-                self.
+                self.clip_gradients_if_needed(self.model, self.optimizer)
 
                 # run lr_scheduler at the end of the step if interval is set to "step"
                 if (
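The loss handling above follows the usual gradient-accumulation pattern: each micro-batch contributes `loss / accumulate_grad_batches` to the gradients, gradient synchronization is skipped on accumulation steps via `no_backward_sync`, and the optimizer steps once every `accumulate_grad_batches` micro-batches, so the logged `accumulated_loss` approximates the mean loss over the effective batch. A self-contained sketch of the same pattern outside Fabric (illustrative, not fusion_bench code):

import torch
from torch import nn
from torch.nn import functional as F

model = nn.Linear(16, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accumulate_grad_batches = 4

micro_batches = [(torch.randn(8, 16), torch.randn(8, 1)) for _ in range(8)]
accumulated_loss = 0.0
for step_idx, (x, y) in enumerate(micro_batches):
    # Scale each loss so the summed gradients equal those of the mean loss over the effective batch.
    loss = F.mse_loss(model(x), y) / accumulate_grad_batches
    loss.backward()
    accumulated_loss += loss.item()
    if (step_idx + 1) % accumulate_grad_batches == 0:
        optimizer.step()
        optimizer.zero_grad()
        print(f"effective-batch loss: {accumulated_loss:.4f}")
        accumulated_loss = 0.0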
@@ -242,104 +251,30 @@ class FullFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
                 self.optimizer.step()
                 self.optimizer.zero_grad()
 
-
-
+                metrics = {
+                    "train/loss": accumulated_loss,
+                    "train/epoch_idx": self.epoch_idx,
+                    "train/lr": self.optimizer.param_groups[0]["lr"],
+                }
+                fabric.log_dict(metrics, step=self.global_step_idx)
+                pbar.set_postfix(metrics)
 
-
-
-                    self.max_steps_per_epoch > 0
-                    and step_idx + 1 >= self.max_steps_per_epoch
-                ):
-                    break
-                # break if max_steps is set, and exit training
-                if self.max_steps > 0 and self.global_step_idx >= self.max_steps:
-                    self.is_training = False
-                    break
+                # save the model at the end of the step if interval is set to "step" and frequency is met
+                self.conditional_checkpoint_save(stage="end_of_step")
 
-
+                # break if max_steps_per_epoch is set, and exit epoch
+                if (
+                    self.max_steps_per_epoch > 0
+                    and step_idx + 1 >= self.max_steps_per_epoch
+                ):
+                    break
+                # break if max_steps is set, and exit training
+                if self.max_steps > 0 and self.global_step_idx >= self.max_steps - 1:
+                    self.is_training = False
+                    break
 
-
-
-        self.is_training = True
-        self.global_step_idx = 0
-        self.model.train()
-        for epoch_idx in tqdm(
-            range(self.max_epochs) if self.max_epochs > 0 else itertools.count(0),
-            "Training Epoch",
-            dynamic_ncols=True,
-            leave=False,
-            disable=not fabric.is_global_zero,
-        ):
-            self.epoch_idx = epoch_idx
-            self.train_epoch()
-            # run lr_scheduler at the end of the epoch if interval is set to "epoch"
-            if (
-                self.lr_scheduler_interval == "epoch"
-                and (epoch_idx + 1) % self.lr_scheduler_frequency == 0
-            ):
-                self.lr_scheduler.step()
-
-            # save the model at the end of the epoch if interval is set to "epoch" and frequency is met
-            self._try_save_checkpoint(stage="end_of_epoch")
-
-            if not self.is_training:
-                break
-
-        # save the model at the end of training
-        self._try_save_checkpoint(stage="end_of_training")
-
-    def _try_save_checkpoint(
-        self, stage: Literal["end_of_step", "end_of_epoch", "end_of_training"]
-    ):
-        if stage == "end_of_step":
-            if (
-                self.checkpoint_save_interval == "step"
-                and (self.global_step_idx + 1) % self.checkpoint_save_frequency == 0
-            ):
-                self.save_checkpoint(
-                    os.path.join(
-                        self.log_dir, "checkpoints", f"step={self.global_step_idx}.ckpt"
-                    )
-                )
-        elif stage == "end_of_epoch":
-            if (
-                self.checkpoint_save_interval == "epoch"
-                and (self.epoch_idx + 1) % self.checkpoint_save_frequency == 0
-            ):
-                self.save_checkpoint(
-                    os.path.join(
-                        self.log_dir, "checkpoints", f"epoch={self.epoch_idx}.ckpt"
-                    )
-                )
-        elif stage == "end_of_training":
-            # if the checkpoint has not been saved yet, save it
-            if self.global_step_idx > self._latest_saved_checkpoint_global_step:
-                self.save_checkpoint(
-                    os.path.join(
-                        self.log_dir,
-                        "checkpoints",
-                        f"epoch={self.epoch_idx}_step={self.global_step_idx}.ckpt",
-                    )
-                )
-            try:
-                os.symlink(
-                    os.path.join(
-                        self.log_dir,
-                        "checkpoints",
-                        "latest_model.ckpt",
-                    ),
-                    os.path.join(
-                        self.log_dir,
-                        "checkpoints",
-                        f"epoch={self.epoch_idx}_step={self.global_step_idx}.ckpt",
-                    ),
-                )
-            except Exception as e:
-                pass
-        else:
-            raise ValueError(
-                f"Unknown stage: {stage}. Available options: 'end_of_step', 'end_of_epoch', 'end_of_training'"
-            )
+                self.global_step_idx += 1
+                accumulated_loss = 0
 
     def save_checkpoint(
         self,
@@ -351,24 +286,36 @@ class FullFinetuneSFT(BaseAlgorithm, LightningFabricMixin):
             return log.warning(f"Checkpoint already exists at {path}. Skipping save.")
 
         fabric = self.fabric
-        state = {"model": self.model}
 
-
-
-
-
-
-
-
-
-
+        if self.save_ckpt_type == "lightning":
+            state = {"model": self.model}
+
+            # save the optimizer and lr_scheduler state if needed
+            if self.save_optimizer_state and save_optimizer_state is not False:
+                state.update(
+                    {
+                        "optimizer": self.optimizer,
+                        "lr_scheduler": self.lr_scheduler,
+                        "global_step_idx": self.global_step_idx,
+                        "epoch_idx": self.epoch_idx,
+                    }
+                )
+
+            trainable_param_names = set(
+                name
+                for name, param in self.model.state_dict(keep_vars=True).items()
+                if param.requires_grad
+            )
+            filter = (
+                None
+                if self.save_full_model
+                else {"model": lambda k, p: k in trainable_param_names}
             )
 
-
-
-
+            fabric.save(path, state=state, filter=filter)
+        else:
+            self.model.save_pretrained(path, is_main_process=fabric.is_global_zero)
 
-        fabric.save(path, state=state, filter=filter)
         self._latest_saved_checkpoint_global_step = self.global_step_idx
 
     def load_checkpoint(self, path: Union[str, Path]):
@@ -401,3 +348,28 @@ def load_checkpoint(
     state = {"model": model}
     state.update(state_components)
     fabric.load(ckpt_path, state=state, strict=strict)
+
+
+if __name__ == "__main__":
+    # convert a checkpoint to hf format
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base-model-path", type=str)
+    parser.add_argument("--ckpt-path", type=str)
+    parser.add_argument("--output-path", type=str)
+
+    args = parser.parse_args()
+
+    fabric = L.Fabric(devices=1, strategy="fsdp")
+    fabric.launch()
+
+    tokenizer = AutoTokenizer.from_pretrained(args.base_model_path)
+    tokenizer.save_pretrained(args.output_path)
+
+    model = AutoModelForCausalLM.from_pretrained(
+        args.base_model_path, torch_dtype=torch.bfloat16
+    )
+    model = fabric.setup_module(model)
+    load_checkpoint(fabric, args.ckpt_path, model=model, strict=True)
+    model.save_pretrained(args.output_path)