PyPI - fusion-bench - Versions diffs - 0.2.9__py3-none-any.whl - Mend

fusion-bench 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (727) hide show

fusion_bench/method/dawe/dawe_for_clip.py ADDED Viewed

@@ -0,0 +1,274 @@
+# NOTE: Work in progress.
+import logging
+from pathlib import Path
+from typing import Any, Literal, Optional, Union  # noqa: F401
+import torch
+from omegaconf import DictConfig
+from PIL.Image import Image
+from torch import nn
+from torch.utils.data import DataLoader
+from tqdm.auto import tqdm
+from transformers import (
+    AutoFeatureExtractor,
+    CLIPProcessor,
+    PreTrainedModel,
+    ResNetForImageClassification,
+)
+from fusion_bench.dataset.clip_dataset import CLIPDataset
+from fusion_bench.method import BaseAlgorithm
+from fusion_bench.method.adamerging.entropy_loss import entropy_loss
+from fusion_bench.mixins import CLIPClassificationMixin
+from fusion_bench.modelpool import CLIPVisionModelPool
+from fusion_bench.utils import timeit_context
+from fusion_bench.utils.data import InfiniteDataLoader
+from fusion_bench.utils.instantiate import instantiate
+from .warppers.dawe_model import DataAdaptiveWeightEnsemblingCLIPVisionModel
+log = logging.getLogger(__name__)
+def convert_to_rgb(image: Image | list[Image]) -> Image | list[Image]:
+    """
+    Convert an image, or every image of a list/tuple of images, to RGB mode.
+    Args:
+        image (Image | list[Image]): A single image or a (possibly nested) list/tuple of images.
+    Returns:
+        Image | list[Image]: The converted image; sequences are always returned as a list.
+    """
+    if not isinstance(image, (list, tuple)):
+        return image.convert("RGB")
+    # sequences are handled element by element (recursively)
+    return list(map(convert_to_rgb, image))
+def load_resnet_processor(pretrained_model_name_or_path: str):
+    """
+    Build an image preprocessing function backed by a pretrained feature extractor.
+    Args:
+        pretrained_model_name_or_path (str): The path or name of the pretrained ResNet model.
+    Returns:
+        function: A callable that converts its input image(s) to RGB, runs them through the loaded processor and returns the resulting `pixel_values`.
+    """
+    processor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path)
+    def preprocess(img):
+        rgb_images = convert_to_rgb(img)
+        # NOTE(review): `do_rescale=False` is forwarded as-is; confirm that the inputs are already in the value range the processor expects.
+        encoded = processor(images=rgb_images, return_tensors="pt", do_rescale=False)
+        return encoded.pixel_values
+    return preprocess
+class ResNetFeatureExtractor(nn.Module):
+    """
+    Wrap `ResNetForImageClassification` so that it returns features instead of class scores.
+    The classifier of the loaded model is replaced by `nn.Flatten(1, -1)`, so the `logits` field of the model output holds the flattened input of the original classifier.
+    Args:
+        pretrained_model_name_or_path: The path or name of the pretrained ResNet model.
+    """
+    def __init__(self, pretrained_model_name_or_path):
+        super().__init__()
+        backbone = ResNetForImageClassification.from_pretrained(
+            pretrained_model_name_or_path
+        )
+        # drop the classification head and only flatten what it used to receive
+        backbone.classifier = nn.Flatten(1, -1)
+        self.model = backbone
+        # expose the config of the wrapped model on the wrapper
+        self.config = backbone.config
+    def forward(self, *args, **kwargs):
+        """Run the wrapped model with the given arguments and return the `logits` field of its output."""
+        return self.model(*args, **kwargs).logits
+def load_resnet_feature_extractor(pretrained_model_name_or_path: str):
+    """
+    Create a `ResNetFeatureExtractor` for the given pretrained model.
+    Args:
+        pretrained_model_name_or_path (str): The path or name of the pretrained ResNet model.
+    Returns:
+        ResNetFeatureExtractor: The feature extractor module.
+    """
+    return ResNetFeatureExtractor(pretrained_model_name_or_path)
+def raw_image_collate_fn(batch):
+    """
+    Collate `(image, label)` pairs without touching the images.
+    Args:
+        batch: A non-empty sequence of `(image, label)` pairs.
+    Returns:
+        tuple: `(images, labels)` where `images` is a tuple of the raw images and `labels` is a tensor built from the labels.
+    """
+    raw_images, targets = tuple(zip(*batch))
+    return raw_images, torch.as_tensor(targets)
+class DataAdaptiveWeightEnsemblingForCLIP(
+    BaseAlgorithm,
+    CLIPClassificationMixin,
+):
+    """
+    Data-adaptive weight ensembling (DAWE) of CLIP vision models with test-time adaptation.
+    `run` builds a `DataAdaptiveWeightEnsemblingCLIPVisionModel` from the models of the pool and, unless `skip_training` is set, trains its gate by minimizing the entropy of the predictions on batches drawn from the test datasets.
+    Args:
+        merge_mode: Forwarded to the DAWE model; either "task_wise" or "layer_wise".
+        init_lambda: Forwarded to the DAWE model as `init_lambda`.
+        batch_reduce: Value of `batch_reduce` the DAWE model is created with.
+        eval_batch_reduce: If not None, assigned to `model.batch_reduce` before `run` returns the model.
+        dict_processor: Config that is instantiated into the processor used for the feature extractor.
+        dict_feature_extractor: Config that is instantiated into the feature extractor feeding the gate.
+        hidden_size: Input size of the gate. If None, it is read from `dict_feature_extractor.config.hidden_sizes[-1]`.
+        gate_hidden_layers: Forwarded to the DAWE model as `gate_hidden_layers`.
+        task_vector_dtype: Forwarded to the DAWE model as `task_vector_dtype`.
+        task_vector_sparsity: Forwarded to the DAWE model as `task_vector_sparsity`.
+        max_steps: Number of test-time adaptation steps.
+        save_interval: A checkpoint is written every `save_interval` steps.
+        learning_rate: Learning rate of the Adam optimizer used for the gate.
+        skip_training: If True, test-time adaptation is skipped.
+        resume_checkpoint_path: If not None, the model is loaded from this checkpoint after construction.
+        batch_size: Batch size of the test dataloaders.
+        num_workers: Number of workers of the test dataloaders.
+        pin_memory: `pin_memory` option of the test dataloaders.
+        **kwargs: Passed on to the parent initializer.
+    """
+    modelpool: CLIPVisionModelPool
+    _processor: CLIPProcessor
+    def __init__(
+        self,
+        # merge options
+        merge_mode: Literal["task_wise", "layer_wise"],
+        init_lambda: float,
+        batch_reduce: bool,
+        eval_batch_reduce: bool,
+        # model options
+        dict_processor: DictConfig,
+        dict_feature_extractor: DictConfig,
+        hidden_size: Optional[int],
+        gate_hidden_layers: int,
+        task_vector_dtype: Optional[str | torch.dtype],
+        task_vector_sparsity: float,
+        # training & logging args
+        max_steps: int,
+        save_interval: int,
+        learning_rate: float = 1e-5,
+        skip_training: bool = False,
+        resume_checkpoint_path: Optional[str] = None,
+        # dataloader args
+        batch_size: int = 4,
+        num_workers: int = 0,
+        pin_memory: bool = True,
+        **kwargs,
+    ):
+        # merge options
+        self.merge_mode = merge_mode
+        self.init_lambda = init_lambda
+        self.batch_reduce = batch_reduce
+        self.eval_batch_reduce = eval_batch_reduce
+        # model options
+        self._dict_processor = dict_processor
+        self._dict_feature_extractor = dict_feature_extractor
+        self.hidden_size = hidden_size
+        self.gate_hidden_layers = gate_hidden_layers
+        self.task_vector_dtype = task_vector_dtype
+        self.task_vector_sparsity = task_vector_sparsity
+        # training & logging args
+        self.max_steps = max_steps
+        self.save_interval = save_interval
+        self.learning_rate = learning_rate
+        self.skip_training = skip_training
+        self.resume_checkpoint_path = resume_checkpoint_path
+        # dataloader args
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+        self.pin_memory = pin_memory
+        super().__init__(**kwargs)
+    def load_models(self):
+        """
+        Build the DAWE model from the model pool.
+        Instantiates the processor and feature extractor of the gate, sets up the zero-shot classification heads for all tasks of the pool and, if `resume_checkpoint_path` is set, loads the model state from that checkpoint.
+        Returns:
+            DataAdaptiveWeightEnsemblingCLIPVisionModel: The constructed model.
+        """
+        modelpool = self.modelpool
+        dict_processor = instantiate(self._dict_processor)
+        clip_processor = modelpool.load_processor()
+        dict_feature_extractor: Union[PreTrainedModel, nn.Module] = instantiate(
+            self._dict_feature_extractor
+        )
+        if self.hidden_size is None:
+            # try to infer hidden size from feature extractor model
+            self.hidden_size = dict_feature_extractor.config.hidden_sizes[-1]
+        # initialize classification head
+        self.setup_zero_shot_classification_head(
+            clip_processor=clip_processor,
+            task_names=modelpool.model_names,
+        )
+        model = DataAdaptiveWeightEnsemblingCLIPVisionModel(
+            merge_mode=self.merge_mode,
+            hidden_size=self.hidden_size,
+            dict_processor=dict_processor,
+            model_processor=lambda images: clip_processor(
+                images=images, return_tensors="pt"
+            ).pixel_values,
+            # per-sample outputs are concatenated along the batch dimension
+            collate_fn=lambda outputs: torch.cat(
+                [out.pooler_output for out in outputs], dim=0
+            ),
+            dict_feature_extractor=dict_feature_extractor,
+            base_model=modelpool.load_model("_pretrained_"),
+            expert_models=list(modelpool.models()),
+            task_vector_dtype=self.task_vector_dtype,
+            task_vector_sparsity=self.task_vector_sparsity,
+            init_lambda=self.init_lambda,
+            gate_hidden_layers=self.gate_hidden_layers,
+            batch_reduce=self.batch_reduce,
+        )
+        if self.resume_checkpoint_path is not None:
+            self.fabric.load(self.resume_checkpoint_path, {"model": model})
+        return model
+    def load_datasets(self):
+        """
+        Load the test dataset of every task and create shuffled, infinitely iterable dataloaders for test-time adaptation.
+        Populates `self.test_datasets`, `self.shuffled_test_loaders` and `self.shuffled_test_loader_iters`, all keyed by task name.
+        """
+        modelpool = self.modelpool
+        self.test_datasets = {
+            task_name: CLIPDataset(
+                modelpool.load_test_dataset(task_name),
+                processor=None,  # NOTE: processor is not used in CLIPDataset because feature extractor and model may have different processors, so we want to pass the image as is
+            )
+            for task_name in modelpool.model_names
+        }
+        # setup dataloaders for test-time adaptation training
+        dataloader_kwargs = {
+            "batch_size": self.batch_size,
+            "num_workers": self.num_workers,
+            "pin_memory": self.pin_memory,
+        }
+        self.shuffled_test_loaders = {
+            task_name: self.fabric.setup_dataloaders(
+                DataLoader(
+                    test_dataset,
+                    **dataloader_kwargs,
+                    collate_fn=raw_image_collate_fn,
+                    shuffle=True,
+                )
+            )
+            for task_name, test_dataset in self.test_datasets.items()
+        }
+        self.shuffled_test_loader_iters = {
+            task_name: InfiniteDataLoader(loader)
+            for task_name, loader in self.shuffled_test_loaders.items()
+        }
+    def run(self, modelpool: CLIPVisionModelPool):
+        """
+        Build the DAWE model and optionally adapt its gate at test time.
+        Args:
+            modelpool (CLIPVisionModelPool): The pool holding the pretrained model and the expert models.
+        Returns:
+            The DAWE model, with `batch_reduce` set to `eval_batch_reduce` when that option is not None.
+        """
+        self.modelpool = modelpool
+        with timeit_context("Loading models"):
+            model = self.load_models()
+        with timeit_context("Loading dataloaders"):
+            self.load_datasets()
+        # run test-time adaptation
+        if not self.skip_training:
+            model = self.test_time_adaptation_training(modelpool, model)
+        if self.eval_batch_reduce is not None:
+            model.batch_reduce = self.eval_batch_reduce
+        return model
+    def _save_checkpoint(self, model, step_idx: int):
+        """Save `model` to `<log_dir>/checkpoints/model_<step_idx>.pt`."""
+        self.fabric.save(
+            Path(self.log_dir) / "checkpoints" / f"model_{step_idx}.pt",
+            {"model": model},
+        )
+    def test_time_adaptation_training(self, modelpool, model):
+        """
+        Train the gate of `model` by entropy minimization on the test data.
+        In every step one batch per task is drawn, the entropy losses of all tasks are summed and a single optimizer step is taken. Checkpoints are written every `save_interval` steps and once more after the last step if that step was not saved already.
+        Args:
+            modelpool: The model pool; its `model_names` define the tasks.
+            model: The DAWE model whose `gate` parameters are optimized.
+        Returns:
+            The model as returned by `fabric.setup`, after training.
+        """
+        optimizer = torch.optim.Adam(
+            [p for p in model.gate.parameters() if p.requires_grad],
+            lr=self.learning_rate,
+        )
+        model, optimizer = self.fabric.setup(model, optimizer)
+        model.train()
+        for step_idx in tqdm(
+            range(self.max_steps),
+            desc="TTA Training",
+            dynamic_ncols=True,
+        ):
+            log_metrics = {}
+            losses = 0
+            for task_idx, task_name in enumerate(modelpool.model_names):
+                # labels are used for logging acc, not involved in training
+                images, labels = next(self.shuffled_test_loader_iters[task_name])
+                logits = self.compute_logits(model, images=images, task=task_name)
+                loss = entropy_loss(logits)
+                losses += loss
+                log_metrics[f"train/{task_name}_loss"] = loss.item()
+                log_metrics[f"train/{task_name}_accuracy"] = (
+                    logits.argmax(dim=-1).eq(labels).float().mean().item()
+                )
+            optimizer.zero_grad()
+            self.fabric.backward(losses)
+            optimizer.step()
+            log_metrics["train/loss"] = losses.item()
+            self.fabric.log_dict(log_metrics, step=step_idx)
+            if (step_idx + 1) % self.save_interval == 0:
+                log.info(f"Saving model at step {step_idx}")
+                self._save_checkpoint(model, step_idx)
+        # if the last step was not saved, save it now.
+        # The loop variable is unbound when `max_steps <= 0`, so the final step is derived from `max_steps` instead of reading `step_idx`.
+        if self.max_steps > 0 and self.max_steps % self.save_interval != 0:
+            self._save_checkpoint(model, self.max_steps - 1)
+        return model

fusion_bench/method/dawe/warppers/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+"""
+This module provides the `DataAdaptiveWeightEnsemblingModel` class for data-adaptive weight ensembling.
+The DataAdaptiveWeightEnsemblingModel class is designed to perform data-adaptive weight ensembling
+for model fusion. It supports both task-wise and layer-wise merging modes and allows for the use
+of different feature extractors and processors.
+Classes:
+    DataAdaptiveWeightEnsemblingModel: A class for data-adaptive weight ensembling.
+"""
+# flake8: noqa F401
+from .dawe_model import DataAdaptiveWeightEnsemblingModel

fusion_bench/method/dawe/warppers/dawe_model.py ADDED Viewed

@@ -0,0 +1,256 @@
+import functools
+import logging
+from typing import List, Literal, Optional
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from torch.func import functional_call
+from typing_extensions import override
+from fusion_bench.method.pruning import prune_utils
+from fusion_bench.mixins import SimpleProfilerMixin
+from fusion_bench.models.utils import del_attr, get_attr
+from fusion_bench.utils.devices import get_device
+from fusion_bench.utils.dtype import parse_dtype
+from fusion_bench.utils.state_dict_arithmetic import (
+    StateDictType,
+    state_dict_weighted_sum,
+)
+log = logging.getLogger(__name__)
+class Depth_0_Gate(nn.Module):
+    """
+    Input-independent gate: a single learnable vector of size `output_dim` that is returned as-is.
+    """
+    def __init__(self, output_dim: int):
+        super().__init__()
+        # uninitialized storage; `init_weight` must be called before use
+        uninitialized = torch.empty(output_dim)
+        self.weight = nn.Parameter(uninitialized, requires_grad=True)
+    def init_weight(self, init_lambda: float):
+        """Set every entry of the gate vector to `init_lambda`."""
+        nn.init.constant_(self.weight, val=init_lambda)
+    def forward(self, *args, **kwargs) -> Tensor:
+        """Return the gate vector; all inputs are ignored."""
+        return self.weight
+class Depth_1_Gate(nn.Module):
+    """
+    Gate consisting of a single linear layer mapping `hidden_size` features to `output_dim` codings.
+    """
+    def __init__(self, hidden_size: int, output_dim: int):
+        super().__init__()
+        self.fc = nn.Linear(
+            in_features=hidden_size, out_features=output_dim, bias=True
+        )
+    def init_weight(self, init_lambda: float):
+        """Draw the weights from a normal distribution with std 0.001 and set the bias to `init_lambda`."""
+        linear = self.fc
+        nn.init.normal_(linear.weight, std=0.001)
+        nn.init.constant_(linear.bias, init_lambda)
+    def forward(self, hidden_states: Tensor) -> Tensor:
+        """Apply the linear layer to `hidden_states`."""
+        codings = self.fc(hidden_states)
+        return codings
+class Depth_2_Gate(nn.Module):
+    """
+    Gate consisting of two linear layers with a ReLU in between, mapping `hidden_size` features to `output_dim` codings.
+    """
+    def __init__(self, hidden_size: int, output_dim: int):
+        super().__init__()
+        self.fc1 = nn.Linear(
+            in_features=hidden_size, out_features=hidden_size, bias=True
+        )
+        self.fc2 = nn.Linear(
+            in_features=hidden_size, out_features=output_dim, bias=True
+        )
+    def init_weight(self, init_lambda: float):
+        """Draw both weight matrices from a normal distribution with std 0.01, zero the first bias and set the output bias to `init_lambda`."""
+        # keep the initialization order: it determines how the random number generator is consumed
+        nn.init.normal_(self.fc1.weight, std=0.01)
+        nn.init.zeros_(self.fc1.bias)
+        nn.init.normal_(self.fc2.weight, std=0.01)
+        nn.init.constant_(self.fc2.bias, init_lambda)
+    def forward(self, hidden_states: Tensor) -> Tensor:
+        """Compute `fc2(relu(fc1(hidden_states)))`."""
+        return self.fc2(F.relu(self.fc1(hidden_states)))
+def construct_dawe_gate(
+    hidden_size: int,
+    coding_size: int,
+    init_lambda: float,
+    num_hidden_layers: int = 2,
+):
+    """
+    Create and initialize the gate of a DAWE model.
+    Args:
+        hidden_size (int): Size of the features fed into the gate (unused for a gate without layers).
+        coding_size (int): Size of the coding produced by the gate.
+        init_lambda (float): Value passed to the `init_weight` method of the gate.
+        num_hidden_layers (int): Depth of the gate; 0, 1 and 2 are supported.
+    Returns:
+        nn.Module: The initialized gate; its depth is stored in the attribute `num_hidden_layers`.
+    Raises:
+        ValueError: If `num_hidden_layers` is not 0, 1 or 2.
+    """
+    if num_hidden_layers not in (0, 1, 2):
+        raise ValueError(f"Unsupported number of hidden layers: {num_hidden_layers}")
+    if num_hidden_layers == 0:
+        # the depth-0 gate does not look at its input, so it needs no input size
+        gate = Depth_0_Gate(coding_size)
+    else:
+        gate_cls = Depth_1_Gate if num_hidden_layers == 1 else Depth_2_Gate
+        gate = gate_cls(hidden_size, coding_size)
+    gate.num_hidden_layers = num_hidden_layers
+    gate.init_weight(init_lambda)
+    return gate
+class DataAdaptiveWeightEnsemblingModel(nn.Module, SimpleProfilerMixin):
+    """
+    Model whose weights are the base model weights plus a data-dependent combination of task vectors.
+    A frozen feature extractor followed by a trainable gate maps the input to coding weights. The coding weights scale the task vectors (expert weights minus base weights), and the result is added to the base model weights for the forward pass.
+    Args:
+        merge_mode: "task_wise" (one weight per expert) or "layer_wise" (one weight per expert and state-dict entry).
+        hidden_size: Size of the features fed into the gate.
+        dict_processor: Optional callable applied to the inputs before the feature extractor.
+        model_processor: Optional callable applied to the inputs before the merged model.
+        collate_fn: Combines the list of per-sample outputs when `batch_reduce` is False.
+        dict_feature_extractor: Feature extractor feeding the gate; frozen if it is an `nn.Module`.
+        base_model: The pretrained model. It is frozen.
+        expert_models: The fine-tuned models. They are converted IN PLACE into task vectors and frozen.
+        task_vector_dtype: If not None, the task vectors are converted to this dtype.
+        task_vector_sparsity: If not None and greater than 0, the weights of all `nn.Linear` modules of the task vectors are magnitude-pruned with this sparsity ratio and stored as sparse tensors.
+        init_lambda: Initial value passed to the gate initialization.
+        gate_hidden_layers: Depth of the gate (see `construct_dawe_gate`).
+        batch_reduce: If True, the codings are averaged over the batch and a single merged model is used for the whole batch.
+    Raises:
+        ValueError: If `merge_mode` is neither "task_wise" nor "layer_wise".
+    """
+    def __init__(
+        self,
+        *,
+        merge_mode: Literal["task_wise", "layer_wise"],
+        hidden_size: int,
+        dict_processor,
+        model_processor,
+        collate_fn=torch.stack,
+        dict_feature_extractor: nn.Module,
+        base_model: nn.Module,
+        expert_models: List[nn.Module],
+        task_vector_dtype: Optional[str | torch.dtype],
+        task_vector_sparsity: float,
+        init_lambda: float = 0.2,
+        gate_hidden_layers: int = 2,
+        batch_reduce: bool = False,
+    ):
+        super().__init__()
+        self.merge_mode = merge_mode
+        self.batch_reduce = batch_reduce
+        self.num_experts = len(expert_models)
+        self.collate_fn = collate_fn
+        self.dict_processor = dict_processor
+        self.model_processor = model_processor
+        # NOTE: the attribute name (sic, "exactor") is kept because it is part of the state-dict keys
+        self.dict_feature_exactor = dict_feature_extractor
+        if isinstance(self.dict_feature_exactor, nn.Module):
+            self.dict_feature_exactor.requires_grad_(False)  # fix the feature extractor
+        self.base_model = base_model
+        # compute the task vectors
+        for name, param in base_model.named_parameters():
+            if not param.requires_grad:
+                # parameters that are frozen in the base model are removed from the experts
+                for m in expert_models:
+                    del_attr(m, name.split("."))
+            else:
+                # expert parameter <- expert parameter - base parameter (in place)
+                for m in expert_models:
+                    get_attr(m, name.split(".")).data = (
+                        get_attr(m, name.split(".")) - param
+                    )
+        # fix base model and expert models
+        self.base_model = base_model.requires_grad_(False)
+        for m in expert_models:
+            m.requires_grad_(False)
+        self.task_vectors = nn.ModuleList(expert_models)
+        # "layers" are the entries of the state dict of a task vector
+        self.num_layers = len(self.task_vectors[0].state_dict())
+        if task_vector_dtype is not None:
+            log.info(f"Converting task vectors to {task_vector_dtype}")
+            self.task_vectors = self.task_vectors.to(parse_dtype(task_vector_dtype))
+        if task_vector_sparsity is not None and task_vector_sparsity > 0:
+            for module in self.task_vectors.modules():
+                if isinstance(module, nn.Linear):
+                    prune_utils.unstructured_magnitude_prune_(
+                        module.weight,
+                        metric_function_or_scores=torch.abs,
+                        sparsity_ratio=task_vector_sparsity,
+                    )
+                    module.weight = nn.Parameter(
+                        module.weight.to_sparse(),
+                        requires_grad=module.weight.requires_grad,
+                    )
+        if self.merge_mode == "task_wise":
+            self.coding_size = self.num_experts
+        elif self.merge_mode == "layer_wise":
+            self.coding_size = self.num_experts * self.num_layers
+        else:
+            raise ValueError(
+                "Invalid option of `merge_mode`, must be 'task_wise' or 'layer_wise'"
+            )
+        self.gate = construct_dawe_gate(
+            hidden_size,
+            coding_size=self.coding_size,
+            init_lambda=init_lambda,
+            num_hidden_layers=gate_hidden_layers,
+        )
+    def compute_task_vectors(self, coding_weights: Tensor):
+        """
+        Combine the task vectors of all experts into a single task vector.
+        Args:
+            coding_weights (Tensor): Coding of one sample (or of the reduced batch) with `coding_size` entries.
+        Returns:
+            dict: Maps state-dict names to the weighted sum of the corresponding task-vector entries.
+        Raises:
+            ValueError: If `merge_mode` is neither "task_wise" nor "layer_wise".
+        """
+        if self.merge_mode == "task_wise":
+            state_dict = state_dict_weighted_sum(
+                [
+                    task_vector.state_dict(keep_vars=True)
+                    for task_vector in self.task_vectors
+                ],
+                coding_weights,
+            )
+        elif self.merge_mode == "layer_wise":
+            # one row per expert, one column per state-dict entry
+            layer_weights = coding_weights.view(self.num_experts, -1)
+            state_dict = {}
+            for expert_weights, task_vector in zip(layer_weights, self.task_vectors):
+                expert_state = task_vector.state_dict(keep_vars=True)
+                for layer_idx, (name, param) in enumerate(expert_state.items()):
+                    # scale every entry by its own scalar weight; multiplying by the whole weight vector would broadcast `num_layers` values against the parameter
+                    scaled = expert_weights[layer_idx] * param
+                    if name in state_dict:
+                        state_dict[name] = state_dict[name] + scaled
+                    else:
+                        state_dict[name] = scaled
+        else:
+            raise ValueError(
+                "Invalid option of `merge_mode`, must be 'task_wise' or 'layer_wise'"
+            )
+        return state_dict
+    def merge_weights(self, task_vector: StateDictType):
+        """
+        Add a task vector to the weights of the base model.
+        Args:
+            task_vector (StateDictType): The combined task vector.
+        Returns:
+            dict: State dict of the base model in which every entry named in `task_vector` is replaced by base weight plus task vector.
+        """
+        state_dict = self.base_model.state_dict(keep_vars=True)
+        for name, param in task_vector.items():
+            state_dict[name] = state_dict[name] + param
+        return state_dict
+    def model_forward_on_single_sample(self, state_dict, sample_idx, *args, **kwargs):
+        """
+        Run the base model with the weights `state_dict` on the sample with index `sample_idx`.
+        Must be implemented by subclasses; it is only called when `batch_reduce` is False.
+        """
+        raise NotImplementedError
+    def model_forward(self, dict_codings, *args, **kwargs):
+        """
+        Merge the weights according to `dict_codings` and run the base model.
+        With `batch_reduce` the codings are averaged over dimension 0 and the merged model is called once on the full inputs. Otherwise the weights are merged separately for every coding, `model_forward_on_single_sample` is called for each of them and the outputs are combined with `collate_fn`.
+        Args:
+            dict_codings: Codings produced by the gate; dimension 0 is iterated over or averaged.
+            *args, **kwargs: Inputs of the base model.
+        Returns:
+            The output of the base model, or the collated per-sample outputs.
+        """
+        if self.batch_reduce:
+            with self.profile("merge weights"):
+                dict_codings = dict_codings.mean(dim=0)
+                task_vector = self.compute_task_vectors(dict_codings)
+                state_dict = self.merge_weights(task_vector)
+            with self.profile("model forward"):
+                return functional_call(
+                    self.base_model,
+                    state_dict,
+                    args=args,
+                    kwargs=kwargs,
+                    strict=False,  # buffer is not included in the state_dict
+                )
+        else:
+            model_outputs = []
+            for sample_idx, dict_coding in enumerate(dict_codings):
+                with self.profile("merge weights"):
+                    task_vector = self.compute_task_vectors(dict_coding)
+                    state_dict = self.merge_weights(task_vector)
+                with self.profile("model forward"):
+                    model_outputs.append(
+                        self.model_forward_on_single_sample(
+                            state_dict,
+                            sample_idx,
+                            *args,
+                            **kwargs,
+                        )
+                    )
+            model_outputs = self.collate_fn(model_outputs)
+            return model_outputs
+    def forward(self, *args, **kwargs):
+        """
+        Compute the codings of the inputs and run the merged model on them.
+        The inputs are passed through `dict_processor` (if set) and the feature extractor to obtain the gate input, and through `model_processor` (if set) to obtain the input of the merged model.
+        Returns:
+            The output of `model_forward`.
+        """
+        # compute dict codings
+        if self.dict_processor is not None:
+            inputs = self.dict_processor(*args, **kwargs)
+            if isinstance(inputs, Tensor):
+                inputs = inputs.to(get_device(self.dict_feature_exactor))
+            with self.profile("compute sparse codings"):
+                dict_features = self.dict_feature_exactor(inputs)
+        else:
+            with self.profile("compute sparse codings"):
+                dict_features = self.dict_feature_exactor(*args, **kwargs)
+        # NOTE(review): `Depth_0_Gate` ignores its input and returns a 1-D vector, while `model_forward` treats dimension 0 of the codings as the batch dimension -- confirm that a gate without layers is handled as intended.
+        dict_codings: Tensor = self.gate(dict_features)
+        if self.model_processor is not None:
+            inputs = self.model_processor(*args, **kwargs)
+            if isinstance(inputs, Tensor):
+                inputs = inputs.to(get_device(self.base_model))
+            model_outputs = self.model_forward(dict_codings, inputs)
+        else:
+            model_outputs = self.model_forward(dict_codings, *args, **kwargs)
+        return model_outputs
+class DataAdaptiveWeightEnsemblingCLIPVisionModel(DataAdaptiveWeightEnsemblingModel):
+    """
+    DAWE model whose per-sample forward pass takes a batch of images as its only model input.
+    """
+    @override
+    def model_forward_on_single_sample(self, state_dict, sample_idx, images: Tensor):
+        """Run the base model with the weights `state_dict` on the image at position `sample_idx` of `images`."""
+        # slicing instead of indexing keeps the leading dimension (of size 1)
+        single_image = images[sample_idx : sample_idx + 1]
+        return functional_call(self.base_model, state_dict, args=single_image)

fusion_bench/method/depth_upscaling/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+# flake8: noqa F401
+from .depth_upscaling import DepthUpscalingAlgorithm
+from .depth_upscaling_for_llama import DepthUpscalingForLlama

fusion_bench/method/depth_upscaling/depth_upscaling.py ADDED Viewed

@@ -0,0 +1,89 @@
+import logging
+from copy import deepcopy
+from typing import List, Mapping, Union  # noqa: F401
+import torch
+from torch import nn
+from tqdm.autonotebook import tqdm
+from fusion_bench.method import BaseAlgorithm
+from fusion_bench.modelpool import BaseModelPool
+log = logging.getLogger(__name__)
+class DepthUpscalingAlgorithm(BaseAlgorithm):
+    R"""
+    Implements the Depth Upscaling Algorithm.
+    - Kim et al. SOLAR 10.7B: Scaling Large Language Models with Simple yet Effective Depth Up-Scaling. http://arxiv.org/abs/2312.15166
+    This class extends `BaseAlgorithm` to handle depth upscaling of models.
+    It builds a new layer stack from deep copies of the layers selected by `layer_indices`; an index may occur several times, which duplicates that layer.
+    Args:
+        layer_indices (str | list): Layer indices of the new model. Each element is either an int or a string that evaluates to an iterable of ints (e.g. "range(0, 24)"). A single string is treated like a list containing that string.
+        **kwargs: Additional keyword arguments.
+    """
+    _config_mapping = BaseAlgorithm._config_mapping | {
+        "layer_indices": "layer_indices",
+    }
+    def __init__(self, layer_indices: Union[str, List[int]], **kwargs):
+        self.layer_indices = layer_indices
+        super().__init__(**kwargs)
+    @torch.no_grad()
+    def run(self, modelpool: nn.ModuleList | BaseModelPool) -> nn.ModuleList:
+        """
+        Executes the depth upscaling algorithm on a given model pool.
+        This method checks the type of the model pool, ensures that it contains only one model, and verifies that the model is an instance of `nn.ModuleList`.
+        Args:
+            modelpool (nn.ModuleList | ModelPool): The pool of models to upscale. Must contain only one model.
+        Returns:
+            nn.ModuleList: The upscaled model.
+        Raises:
+            AssertionError: If the model pool contains more than one model or if the model is not an instance of `nn.ModuleList`.
+            ValueError: If an invalid layer specification is provided in the configuration.
+        """
+        # check the modelpool type
+        if isinstance(modelpool, BaseModelPool):
+            assert len(modelpool) == 1, "DepthUpscaling only support one model"
+            model = modelpool.load_model(modelpool.model_names[0])
+            assert isinstance(
+                model, nn.ModuleList
+            ), f"The model should be a `nn.ModuleList`, but got {type(model)}"
+        elif isinstance(modelpool, nn.ModuleList):
+            model = modelpool
+        else:
+            raise AssertionError(
+                f"Invalid modelpool type: {type(modelpool)}. Expected `ModelPool` or `nn.ModuleList`."
+            )
+        # parse the layers
+        layer_indices = self.layer_indices
+        if isinstance(layer_indices, str):
+            # a bare string is ONE specification; iterating it would evaluate it character by character
+            layer_indices = [layer_indices]
+        parsed_layer_indices = []
+        for layer in layer_indices:
+            if isinstance(layer, int):
+                parsed_layer_indices.append(layer)
+            elif isinstance(layer, str):
+                # SECURITY NOTE: `eval` executes arbitrary Python code taken from the configuration; only use configurations from trusted sources.
+                parsed_layer_indices.extend(eval(layer))
+            else:
+                raise ValueError("Invalid layer specification: {}".format(layer))
+        # create a new model with the specified layers
+        # every selected layer is deep-copied so that duplicated layers do not share parameters
+        new_model = nn.ModuleList(
+            [
+                deepcopy(model[i])
+                for i in tqdm(
+                    parsed_layer_indices, desc="constructing depth-upscaled model"
+                )
+            ]
+        )
+        return new_model

fusion_bench/method/depth_upscaling/depth_upscaling_for_llama.py ADDED Viewed

@@ -0,0 +1,57 @@
+import os
+from typing import Optional
+from typing_extensions import override
+from fusion_bench.modelpool.causal_lm.causal_lm import CausalLM, CausalLMPool
+from fusion_bench.utils import timeit_context
+from .depth_upscaling import DepthUpscalingAlgorithm
+class DepthUpscalingForLlama(DepthUpscalingAlgorithm):
+    """
+    Depth upscaling for Llama-style causal language models.
+    The decoder layers of the model (`model.model.layers`) are upscaled with `DepthUpscalingAlgorithm`, and the result can optionally be written to disk together with the tokenizer.
+    Args:
+        layer_indices (list): List of layer indices to upscale.
+        model_save_path (Optional[str]): Path to save the upscaled model; a leading `~` is expanded. Nothing is saved if it is None.
+        **kwargs: Additional keyword arguments.
+    """
+    def __init__(self, layer_indices: list, model_save_path: Optional[str], **kwargs):
+        self.model_save_path = (
+            os.path.expanduser(model_save_path)
+            if isinstance(model_save_path, str)
+            else model_save_path
+        )
+        super().__init__(layer_indices, **kwargs)
+    @override
+    def run(self, modelpool: CausalLMPool):
+        """
+        Upscale the pretrained (or first) model of the pool.
+        The layers of the loaded model are replaced by the upscaled layer stack and `config.num_hidden_layers` is updated accordingly. If a save path is configured, the tokenizer and the upscaled model are saved there.
+        Args:
+            modelpool (CausalLMPool): The pool of models to upscale.
+        Returns:
+            CausalLM: The upscaled model.
+        """
+        save_path = self.model_save_path
+        should_save = save_path is not None
+        if should_save:
+            # the tokenizer is only needed for saving; it is loaded before the model
+            tokenizer = modelpool.load_tokenizer()
+        model: CausalLM = modelpool.load_pretrained_or_first_model()
+        upscaled_layers = super().run(model.model.layers)
+        model.model.layers = upscaled_layers
+        # keep the config consistent with the new depth
+        model.config.num_hidden_layers = len(upscaled_layers)
+        if not should_save:
+            return model
+        with timeit_context(f"Saving the model to {save_path}"):
+            tokenizer.save_pretrained(save_path)
+            model.save_pretrained(save_path)
+        return model