fusion_bench-0.2.9-py3-none-any.whl
This diff shows the published contents of a package version as it appears in its public registry and is provided for informational purposes only. Every file below is added against an empty base (each count is `+N -0` and each hunk starts at `@@ -0,0`), so the listing amounts to the full contents of the wheel.
- fusion_bench/__init__.py +20 -0
- fusion_bench/__main__.py +4 -0
- fusion_bench/compat/__init__.py +0 -0
- fusion_bench/compat/method/__init__.py +109 -0
- fusion_bench/compat/method/base_algorithm.py +58 -0
- fusion_bench/compat/modelpool/AutoModelForSeq2SeqLM.py +34 -0
- fusion_bench/compat/modelpool/__init__.py +116 -0
- fusion_bench/compat/modelpool/base_pool.py +328 -0
- fusion_bench/compat/modelpool/huggingface_clip_vision.py +178 -0
- fusion_bench/compat/taskpool/__init__.py +95 -0
- fusion_bench/compat/taskpool/base_pool.py +111 -0
- fusion_bench/compat/taskpool/clip_image_classification.py +210 -0
- fusion_bench/compat/taskpool/flan_t5_glue_text_generation.py +175 -0
- fusion_bench/constants/__init__.py +2 -0
- fusion_bench/constants/paths.py +18 -0
- fusion_bench/dataset/__init__.py +29 -0
- fusion_bench/dataset/arc_agi/__init__.py +6 -0
- fusion_bench/dataset/arc_agi/arc.py +308 -0
- fusion_bench/dataset/arc_agi/arc_agi.py +365 -0
- fusion_bench/dataset/arc_agi/augmenters.py +1036 -0
- fusion_bench/dataset/arc_agi/messagers.py +1355 -0
- fusion_bench/dataset/arc_agi/np_cache.py +168 -0
- fusion_bench/dataset/arc_agi/preprocess.py +298 -0
- fusion_bench/dataset/arc_agi/representers.py +1019 -0
- fusion_bench/dataset/clip_dataset.py +71 -0
- fusion_bench/dataset/fer2013.py +12 -0
- fusion_bench/dataset/gpt2_glue.py +300 -0
- fusion_bench/dataset/gsm8k.py +60 -0
- fusion_bench/dataset/image_dataset.py +55 -0
- fusion_bench/dataset/imdb.py +11 -0
- fusion_bench/dataset/llama/__init__.py +1 -0
- fusion_bench/dataset/llama/alpaca.py +232 -0
- fusion_bench/dataset/llama/collate.py +120 -0
- fusion_bench/dataset/llama/metamathqa.py +50 -0
- fusion_bench/dataset/llama/openai.py +160 -0
- fusion_bench/dataset/llama/preference_700k.py +70 -0
- fusion_bench/dataset/llama/sharegpt.py +141 -0
- fusion_bench/dataset/llama/squad.py +125 -0
- fusion_bench/dataset/llama/stanford_shp.py +90 -0
- fusion_bench/dataset/llama/ultrachat.py +58 -0
- fusion_bench/dataset/llama/utils/__init__.py +0 -0
- fusion_bench/dataset/llama/wikitext.py +89 -0
- fusion_bench/dataset/nyuv2.py +119 -0
- fusion_bench/method/__init__.py +177 -0
- fusion_bench/method/ada_svd/__init__.py +2 -0
- fusion_bench/method/ada_svd/clip_vision.py +319 -0
- fusion_bench/method/adamerging/__init__.py +6 -0
- fusion_bench/method/adamerging/clip_layer_wise_adamerging.py +46 -0
- fusion_bench/method/adamerging/clip_task_wise_adamerging.py +187 -0
- fusion_bench/method/adamerging/entropy_loss.py +25 -0
- fusion_bench/method/adamerging/flan_t5_layer_wise_adamerging.py +332 -0
- fusion_bench/method/adamerging/gpt2_layer_wise_adamerging.py +351 -0
- fusion_bench/method/adamerging/layer_wise_adamerging.py +252 -0
- fusion_bench/method/adamerging/llama_adamerging.py +335 -0
- fusion_bench/method/adamerging/min_norm_solvers.py +227 -0
- fusion_bench/method/adamerging/task_wise_adamerging.py +174 -0
- fusion_bench/method/adamerging/utils.py +15 -0
- fusion_bench/method/analysis/__init__.py +2 -0
- fusion_bench/method/analysis/task_vector_cos_similarity.py +172 -0
- fusion_bench/method/analysis/task_vector_violin_plot.py +205 -0
- fusion_bench/method/base_algorithm.py +44 -0
- fusion_bench/method/classification/__init__.py +3 -0
- fusion_bench/method/classification/clip_finetune.py +444 -0
- fusion_bench/method/classification/continual_clip_finetune.py +297 -0
- fusion_bench/method/concrete_subspace/__init__.py +6 -0
- fusion_bench/method/concrete_subspace/clip_concrete_adamerging.py +595 -0
- fusion_bench/method/concrete_subspace/clip_concrete_task_arithmetic.py +263 -0
- fusion_bench/method/dare/__init__.py +4 -0
- fusion_bench/method/dare/simple_average.py +31 -0
- fusion_bench/method/dare/task_arithmetic.py +82 -0
- fusion_bench/method/dare/ties_merging.py +100 -0
- fusion_bench/method/dare/utils.py +87 -0
- fusion_bench/method/dawe/__init__.py +2 -0
- fusion_bench/method/dawe/dawe_for_clip.py +274 -0
- fusion_bench/method/dawe/warppers/__init__.py +13 -0
- fusion_bench/method/dawe/warppers/dawe_model.py +256 -0
- fusion_bench/method/depth_upscaling/__init__.py +3 -0
- fusion_bench/method/depth_upscaling/depth_upscaling.py +89 -0
- fusion_bench/method/depth_upscaling/depth_upscaling_for_llama.py +57 -0
- fusion_bench/method/dummy.py +35 -0
- fusion_bench/method/ensemble.py +98 -0
- fusion_bench/method/fisher_merging/__init__.py +4 -0
- fusion_bench/method/fisher_merging/clip_fisher_merging.py +191 -0
- fusion_bench/method/fisher_merging/fisher_merging.py +484 -0
- fusion_bench/method/fisher_merging/gpt2_fisher_merging.py +193 -0
- fusion_bench/method/linear/__init__.py +6 -0
- fusion_bench/method/linear/expo.py +118 -0
- fusion_bench/method/linear/linear_interpolation.py +60 -0
- fusion_bench/method/linear/llama_expo.py +229 -0
- fusion_bench/method/linear/simple_average_for_llama.py +54 -0
- fusion_bench/method/linear/task_arithmetic_for_llama.py +57 -0
- fusion_bench/method/lm_finetune/__init__.py +3 -0
- fusion_bench/method/lm_finetune/bradley_terry_rm.py +432 -0
- fusion_bench/method/lm_finetune/causal_lm_pretrain.py +7 -0
- fusion_bench/method/lm_finetune/fullfinetune_sft.py +375 -0
- fusion_bench/method/lm_finetune/peftfinetune_sft.py +370 -0
- fusion_bench/method/mixture_of_experts/__init__.py +7 -0
- fusion_bench/method/mixture_of_experts/mixtral_merging.py +112 -0
- fusion_bench/method/mixture_of_experts/mixtral_upcycling.py +329 -0
- fusion_bench/method/model_recombination.py +121 -0
- fusion_bench/method/opcm/__init__.py +4 -0
- fusion_bench/method/opcm/opcm.py +277 -0
- fusion_bench/method/opcm/task_arithmetic.py +115 -0
- fusion_bench/method/opcm/ties_merging.py +156 -0
- fusion_bench/method/opcm/utils.py +73 -0
- fusion_bench/method/opcm/weight_average.py +120 -0
- fusion_bench/method/pruning/__init__.py +5 -0
- fusion_bench/method/pruning/llama_magnitude_prune.py +202 -0
- fusion_bench/method/pruning/llama_random_prune.py +143 -0
- fusion_bench/method/pruning/llama_wanda_prune.py +359 -0
- fusion_bench/method/pruning/magnitude_diff_pruning.py +180 -0
- fusion_bench/method/pruning/prune_utils.py +165 -0
- fusion_bench/method/pruning/wanda_utils/__init__.py +7 -0
- fusion_bench/method/pruning/wanda_utils/ablate.py +188 -0
- fusion_bench/method/pruning/wanda_utils/data.py +135 -0
- fusion_bench/method/pruning/wanda_utils/eval.py +245 -0
- fusion_bench/method/pruning/wanda_utils/layerwrapper.py +61 -0
- fusion_bench/method/pruning/wanda_utils/prune.py +581 -0
- fusion_bench/method/pruning/wanda_utils/prune_opt.py +539 -0
- fusion_bench/method/pruning/wanda_utils/sparsegpt.py +165 -0
- fusion_bench/method/pwe_moe/__init__.py +5 -0
- fusion_bench/method/pwe_moe/clip_pwe_moe.py +315 -0
- fusion_bench/method/pwe_moe/module.py +316 -0
- fusion_bench/method/pwe_moe/phn/__init__.py +2 -0
- fusion_bench/method/pwe_moe/phn/solvers.py +195 -0
- fusion_bench/method/pwe_moe/utils.py +43 -0
- fusion_bench/method/rankone_moe/__init__.py +3 -0
- fusion_bench/method/rankone_moe/clip_rankone_moe.py +160 -0
- fusion_bench/method/rankone_moe/rankone_moe.py +249 -0
- fusion_bench/method/regmean/__init__.py +4 -0
- fusion_bench/method/regmean/clip_regmean.py +131 -0
- fusion_bench/method/regmean/gpt2_regmean.py +147 -0
- fusion_bench/method/regmean/regmean.py +375 -0
- fusion_bench/method/simple_average.py +112 -0
- fusion_bench/method/slerp/__init__.py +2 -0
- fusion_bench/method/slerp/slerp.py +101 -0
- fusion_bench/method/slerp/slerp_utils.py +107 -0
- fusion_bench/method/smile_upscaling/__init__.py +3 -0
- fusion_bench/method/smile_upscaling/singular_projection_merging.py +198 -0
- fusion_bench/method/smile_upscaling/smile_mistral_upscaling.py +331 -0
- fusion_bench/method/smile_upscaling/smile_upscaling.py +573 -0
- fusion_bench/method/sparse_we_moe/__init__.py +2 -0
- fusion_bench/method/sparse_we_moe/sparse_clip_we_moe.py +248 -0
- fusion_bench/method/sparse_we_moe/sparse_we_moe.py +301 -0
- fusion_bench/method/sparselo/__init__.py +2 -0
- fusion_bench/method/sparselo/sparselo.py +955 -0
- fusion_bench/method/surgery/__init__.py +1 -0
- fusion_bench/method/surgery/clip_layer_wise_adamerging_surgery.py +157 -0
- fusion_bench/method/tall_mask/__init__.py +0 -0
- fusion_bench/method/tall_mask/utils.py +234 -0
- fusion_bench/method/task_arithmetic/__init__.py +2 -0
- fusion_bench/method/task_arithmetic/task_arithmetic.py +151 -0
- fusion_bench/method/task_singular_vector/TSVC.py +16 -0
- fusion_bench/method/task_singular_vector/TSVM.py +63 -0
- fusion_bench/method/task_singular_vector/__init__.py +9 -0
- fusion_bench/method/task_singular_vector/utils/TSVC_utils.py +50 -0
- fusion_bench/method/task_singular_vector/utils/TSVM_utils.py +640 -0
- fusion_bench/method/task_singular_vector/utils/__init__.py +7 -0
- fusion_bench/method/ties_merging/__init__.py +2 -0
- fusion_bench/method/ties_merging/ties_merging.py +117 -0
- fusion_bench/method/ties_merging/ties_merging_utils.py +331 -0
- fusion_bench/method/trust_region/__init__.py +2 -0
- fusion_bench/method/trust_region/clip_task_arithmetic.py +205 -0
- fusion_bench/method/trust_region/utils.py +58 -0
- fusion_bench/method/we_moe/__init__.py +2 -0
- fusion_bench/method/we_moe/clip_we_moe.py +161 -0
- fusion_bench/method/we_moe/we_moe.py +247 -0
- fusion_bench/method/weighted_average/__init__.py +3 -0
- fusion_bench/method/weighted_average/llama.py +113 -0
- fusion_bench/method/weighted_average/weighted_average.py +102 -0
- fusion_bench/metrics/__init__.py +0 -0
- fusion_bench/metrics/continual_learning/backward_transfer.py +22 -0
- fusion_bench/metrics/nyuv2/__init__.py +11 -0
- fusion_bench/metrics/nyuv2/depth.py +45 -0
- fusion_bench/metrics/nyuv2/loss.py +31 -0
- fusion_bench/metrics/nyuv2/noise.py +16 -0
- fusion_bench/metrics/nyuv2/normal.py +48 -0
- fusion_bench/metrics/nyuv2/segmentation.py +43 -0
- fusion_bench/metrics/text_to_image_generation/__init__.py +9 -0
- fusion_bench/metrics/text_to_image_generation/aesthetic_scorer.py +123 -0
- fusion_bench/metrics/text_to_image_generation/compressibility.py +49 -0
- fusion_bench/metrics/text_to_image_generation/pickscore_scorer.py +95 -0
- fusion_bench/mixins/__init__.py +28 -0
- fusion_bench/mixins/clip_classification.py +252 -0
- fusion_bench/mixins/fabric_training.py +320 -0
- fusion_bench/mixins/lightning_fabric.py +174 -0
- fusion_bench/mixins/optim/__init__.py +0 -0
- fusion_bench/mixins/optim/adamw_with_warmup.py +42 -0
- fusion_bench/mixins/rich_live.py +21 -0
- fusion_bench/mixins/serialization.py +132 -0
- fusion_bench/mixins/simple_profiler.py +79 -0
- fusion_bench/modelpool/PeftModelForSeq2SeqLM.py +49 -0
- fusion_bench/modelpool/__init__.py +42 -0
- fusion_bench/modelpool/base_pool.py +268 -0
- fusion_bench/modelpool/causal_lm/__init__.py +2 -0
- fusion_bench/modelpool/causal_lm/causal_lm.py +139 -0
- fusion_bench/modelpool/clip_vision/__init__.py +1 -0
- fusion_bench/modelpool/clip_vision/modelpool.py +145 -0
- fusion_bench/modelpool/huggingface_automodel.py +20 -0
- fusion_bench/modelpool/huggingface_gpt2_classification.py +63 -0
- fusion_bench/modelpool/nyuv2_modelpool.py +40 -0
- fusion_bench/modelpool/seq2seq_lm/__init__.py +2 -0
- fusion_bench/modelpool/seq2seq_lm/modelpool.py +65 -0
- fusion_bench/modelpool/seq_classification_lm/__init__.py +2 -0
- fusion_bench/modelpool/seq_classification_lm/reward_model.py +15 -0
- fusion_bench/modelpool/seq_classification_lm/seq_classification_lm.py +98 -0
- fusion_bench/models/__init__.py +3 -0
- fusion_bench/models/chat_templates/__init__.py +1 -0
- fusion_bench/models/chat_templates/llama_3_Instruct.py +1 -0
- fusion_bench/models/chat_templates/load_tokenizer.py +43 -0
- fusion_bench/models/hf_clip.py +199 -0
- fusion_bench/models/linearized/__init__.py +0 -0
- fusion_bench/models/linearized/linearized_model_utils.py +91 -0
- fusion_bench/models/linearized/vision_model.py +122 -0
- fusion_bench/models/llama/__init__.py +16 -0
- fusion_bench/models/llama/model_utils/__init__.py +0 -0
- fusion_bench/models/llama/model_utils/embedding.py +87 -0
- fusion_bench/models/llama/model_utils/liger_kernel.py +86 -0
- fusion_bench/models/llama/model_utils/misc.py +112 -0
- fusion_bench/models/llama/model_utils/mod.py +52 -0
- fusion_bench/models/llama/model_utils/visual.py +241 -0
- fusion_bench/models/llama/patcher.py +78 -0
- fusion_bench/models/llama/tokenizer_loader.py +153 -0
- fusion_bench/models/masks/__init__.py +2 -0
- fusion_bench/models/masks/mask_model.py +160 -0
- fusion_bench/models/modeling_losparse_llama/__init__.py +4 -0
- fusion_bench/models/modeling_losparse_llama/configuration_losparse_llama.py +205 -0
- fusion_bench/models/modeling_losparse_llama/losparse_linear.py +67 -0
- fusion_bench/models/modeling_losparse_llama/modeling_losparse_llama.py +1825 -0
- fusion_bench/models/modeling_losparse_llama/register.py +8 -0
- fusion_bench/models/modeling_losparse_llama/utils.py +60 -0
- fusion_bench/models/modeling_smile_mistral/__init__.py +48 -0
- fusion_bench/models/modeling_smile_mistral/configuration_smile_mistral.py +21 -0
- fusion_bench/models/modeling_smile_mistral/modeling_smile_mistral.py +1034 -0
- fusion_bench/models/modeling_smile_mistral/register.py +8 -0
- fusion_bench/models/nyuv2/__init__.py +0 -0
- fusion_bench/models/nyuv2/aspp.py +82 -0
- fusion_bench/models/nyuv2/lightning_module.py +176 -0
- fusion_bench/models/nyuv2/resnet.py +405 -0
- fusion_bench/models/nyuv2/resnet_dilated.py +99 -0
- fusion_bench/models/parameter_dict.py +75 -0
- fusion_bench/models/rankone_moe.py +410 -0
- fusion_bench/models/separate_io.py +105 -0
- fusion_bench/models/smile_moe/__init__.py +0 -0
- fusion_bench/models/smile_moe/linear.py +256 -0
- fusion_bench/models/sparse_we_moe.py +459 -0
- fusion_bench/models/surgery/__init__.py +1 -0
- fusion_bench/models/surgery/surgerymodelwrapper.py +158 -0
- fusion_bench/models/utils.py +80 -0
- fusion_bench/models/we_moe.py +247 -0
- fusion_bench/models/wrappers/__init__.py +0 -0
- fusion_bench/models/wrappers/ensemble.py +183 -0
- fusion_bench/models/wrappers/layer_wise_fusion.py +336 -0
- fusion_bench/models/wrappers/task_wise_fusion.py +249 -0
- fusion_bench/optim/__init__.py +2 -0
- fusion_bench/optim/exception.py +47 -0
- fusion_bench/optim/lr_scheduler/__init__.py +1 -0
- fusion_bench/optim/lr_scheduler/linear_warmup.py +222 -0
- fusion_bench/optim/lr_scheduler/utils/__init__.py +1 -0
- fusion_bench/optim/lr_scheduler/utils/visualization.py +119 -0
- fusion_bench/optim/mezo.py +118 -0
- fusion_bench/programs/__init__.py +20 -0
- fusion_bench/programs/base_program.py +9 -0
- fusion_bench/programs/fabric_fusion_program.py +299 -0
- fusion_bench/scripts/__init__.py +0 -0
- fusion_bench/scripts/cli.py +43 -0
- fusion_bench/scripts/clip/__init__.py +0 -0
- fusion_bench/scripts/clip/convert_checkpoint.py +39 -0
- fusion_bench/scripts/imgui.py +218 -0
- fusion_bench/scripts/nyuv2_mtl_train.py +137 -0
- fusion_bench/scripts/webui.py +405 -0
- fusion_bench/taskpool/__init__.py +39 -0
- fusion_bench/taskpool/base_pool.py +35 -0
- fusion_bench/taskpool/clip_vision/__init__.py +4 -0
- fusion_bench/taskpool/clip_vision/clip_rankone_moe_taskpool.py +112 -0
- fusion_bench/taskpool/clip_vision/clip_sparse_wemoe_taskpool.py +120 -0
- fusion_bench/taskpool/clip_vision/taskpool.py +392 -0
- fusion_bench/taskpool/dummy.py +58 -0
- fusion_bench/taskpool/gpt2_text_classification.py +149 -0
- fusion_bench/taskpool/llama/__init__.py +1 -0
- fusion_bench/taskpool/llama/reward_model.py +157 -0
- fusion_bench/taskpool/llama/test_generation.py +185 -0
- fusion_bench/taskpool/nyuv2_taskpool.py +65 -0
- fusion_bench/tasks/__init__.py +2 -0
- fusion_bench/tasks/base_task.py +18 -0
- fusion_bench/tasks/classification.py +75 -0
- fusion_bench/tasks/clip_classification/__init__.py +183 -0
- fusion_bench/tasks/clip_classification/cifar10.py +33 -0
- fusion_bench/tasks/clip_classification/cifar100.py +146 -0
- fusion_bench/tasks/clip_classification/clip_dataset.py +1 -0
- fusion_bench/tasks/clip_classification/cub_200_2011.py +208 -0
- fusion_bench/tasks/clip_classification/dtd.py +60 -0
- fusion_bench/tasks/clip_classification/emnist_letters.py +31 -0
- fusion_bench/tasks/clip_classification/emnist_mnist.py +5 -0
- fusion_bench/tasks/clip_classification/eurosat.py +18 -0
- fusion_bench/tasks/clip_classification/fashion_mnist.py +18 -0
- fusion_bench/tasks/clip_classification/fer2013.py +18 -0
- fusion_bench/tasks/clip_classification/flower102.py +106 -0
- fusion_bench/tasks/clip_classification/food101.py +105 -0
- fusion_bench/tasks/clip_classification/gtsrb.py +51 -0
- fusion_bench/tasks/clip_classification/imagenet.py +2103 -0
- fusion_bench/tasks/clip_classification/kmnist.py +17 -0
- fusion_bench/tasks/clip_classification/mnist.py +5 -0
- fusion_bench/tasks/clip_classification/mongo_leaf_disease.py +19 -0
- fusion_bench/tasks/clip_classification/oxford_iiit_pet.py +41 -0
- fusion_bench/tasks/clip_classification/pcam.py +5 -0
- fusion_bench/tasks/clip_classification/rendered_sst2.py +3 -0
- fusion_bench/tasks/clip_classification/resisc45.py +68 -0
- fusion_bench/tasks/clip_classification/stanford_cars.py +209 -0
- fusion_bench/tasks/clip_classification/stl10.py +17 -0
- fusion_bench/tasks/clip_classification/sun397.py +404 -0
- fusion_bench/tasks/clip_classification/svhn.py +5 -0
- fusion_bench/tasks/clip_classification/tiny_imagenet.py +208 -0
- fusion_bench/tasks/flan_t5_text_generation/__init__.py +0 -0
- fusion_bench/tasks/flan_t5_text_generation/datasets_preprocess.py +71 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_evaluation.py +132 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_load_dataset.py +64 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py +379 -0
- fusion_bench/tasks/flan_t5_text_generation/glue_prompt_templates.py +52 -0
- fusion_bench/utils/__init__.py +14 -0
- fusion_bench/utils/auto.py +31 -0
- fusion_bench/utils/cache_utils.py +58 -0
- fusion_bench/utils/data.py +165 -0
- fusion_bench/utils/devices.py +231 -0
- fusion_bench/utils/dict.py +43 -0
- fusion_bench/utils/dtype.py +146 -0
- fusion_bench/utils/expr.py +90 -0
- fusion_bench/utils/fabric.py +17 -0
- fusion_bench/utils/functools.py +37 -0
- fusion_bench/utils/hydra_utils.py +28 -0
- fusion_bench/utils/instantiate.py +450 -0
- fusion_bench/utils/json.py +93 -0
- fusion_bench/utils/lazy_imports.py +74 -0
- fusion_bench/utils/misc.py +18 -0
- fusion_bench/utils/packages.py +84 -0
- fusion_bench/utils/parameters.py +323 -0
- fusion_bench/utils/path.py +22 -0
- fusion_bench/utils/plot/__init__.py +0 -0
- fusion_bench/utils/plot/color_data.py +1726 -0
- fusion_bench/utils/plot/token.py +52 -0
- fusion_bench/utils/plot/token_notebook.py +127 -0
- fusion_bench/utils/pylogger.py +55 -0
- fusion_bench/utils/rich_utils.py +201 -0
- fusion_bench/utils/set.py +8 -0
- fusion_bench/utils/state_dict_arithmetic.py +297 -0
- fusion_bench/utils/strenum/__init__.py +326 -0
- fusion_bench/utils/strenum/_name_mangler.py +127 -0
- fusion_bench/utils/strenum/_version.py +556 -0
- fusion_bench/utils/tensorboard.py +51 -0
- fusion_bench/utils/timer.py +49 -0
- fusion_bench/utils/type.py +34 -0
- fusion_bench-0.2.9.dist-info/LICENSE +21 -0
- fusion_bench-0.2.9.dist-info/METADATA +258 -0
- fusion_bench-0.2.9.dist-info/RECORD +727 -0
- fusion_bench-0.2.9.dist-info/WHEEL +5 -0
- fusion_bench-0.2.9.dist-info/entry_points.txt +3 -0
- fusion_bench-0.2.9.dist-info/top_level.txt +1 -0
- fusion_bench_config/README.md +12 -0
- fusion_bench_config/clip-vit-base-patch32_robustness_corrupted.yaml +23 -0
- fusion_bench_config/dataset/image_classification/README.md +6 -0
- fusion_bench_config/dataset/image_classification/test/TALL14.yaml +20 -0
- fusion_bench_config/dataset/image_classification/test/TALL20.yaml +28 -0
- fusion_bench_config/dataset/image_classification/test/cifar10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/cifar100.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/cub-200-2011.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/dtd.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/emnist_letters.yaml +5 -0
- fusion_bench_config/dataset/image_classification/test/emnist_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/eurosat.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/fashion_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/fer2013.yaml +3 -0
- fusion_bench_config/dataset/image_classification/test/food101.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/gtsrb.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/kmnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/mango-leaf-disease.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/oxford-iiit-pet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/oxford_flowers102.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/pcam.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/rendered-sst2.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/resisc45.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/stanford-cars.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/stl10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/sun397.yaml +4 -0
- fusion_bench_config/dataset/image_classification/test/svhn.yaml +6 -0
- fusion_bench_config/dataset/image_classification/test/the_eight_tasks.yaml +9 -0
- fusion_bench_config/dataset/image_classification/test/tiny-imagenet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/TALL14.yaml +20 -0
- fusion_bench_config/dataset/image_classification/train/TALL20.yaml +28 -0
- fusion_bench_config/dataset/image_classification/train/cifar10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/cifar100.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/cub-200-2011.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/dtd.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/emnist_letters.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/emnist_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/eurosat.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/fashion_mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/fer2013.yaml +3 -0
- fusion_bench_config/dataset/image_classification/train/food101.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/gtsrb.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/kmnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/mango-leaf-disease.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/mnist.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/oxford-iiit-pet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/oxford_flowers102.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/pcam.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/rendered-sst2.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/resisc45.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/stanford-cars.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/stl10.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/sun397.yaml +4 -0
- fusion_bench_config/dataset/image_classification/train/svhn.yaml +6 -0
- fusion_bench_config/dataset/image_classification/train/the_eight_tasks.yaml +9 -0
- fusion_bench_config/dataset/image_classification/train/tiny-imagenet.yaml +4 -0
- fusion_bench_config/dataset/image_classification/val/dtd.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/eurosat.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/gtsrb.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/mnist.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/resisc45.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/stanford-cars.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/sun397.yaml +10 -0
- fusion_bench_config/dataset/image_classification/val/svhn.yaml +12 -0
- fusion_bench_config/dataset/image_classification/val/the_eight_tasks.yaml +9 -0
- fusion_bench_config/dataset/llm_sft/alpaca_cleaned.yaml +6 -0
- fusion_bench_config/dataset/llm_sft/ultrachat_200k.yaml +3 -0
- fusion_bench_config/dataset/question_answering/search_qa.yaml +6 -0
- fusion_bench_config/dataset/question_answering/test/search_qa.yaml +7 -0
- fusion_bench_config/dataset/question_answering/train/MetaMathQA.yaml +4 -0
- fusion_bench_config/dataset/question_answering/train/search_qa.yaml +7 -0
- fusion_bench_config/dataset/question_answering/val/search_qa.yaml +7 -0
- fusion_bench_config/dataset/summarization/test/xsum.yaml +4 -0
- fusion_bench_config/dataset/summarization/train/xsum.yaml +4 -0
- fusion_bench_config/dataset/summarization/val/xsum.yaml +4 -0
- fusion_bench_config/dataset/summarization/xsum.yaml +3 -0
- fusion_bench_config/dataset/text_generation/test/gsm-hard.yaml +4 -0
- fusion_bench_config/dataset/text_generation/test/gsm8k.yaml +5 -0
- fusion_bench_config/dataset/text_generation/test/gsm8k_question_label.yaml +3 -0
- fusion_bench_config/dataset/text_generation/train/CodeAlpaca-20k.yaml +4 -0
- fusion_bench_config/dataset/text_generation/train/gsm8k.yaml +5 -0
- fusion_bench_config/dataset/text_generation/train/gsm8k_question_label.yaml +3 -0
- fusion_bench_config/fabric/auto.yaml +16 -0
- fusion_bench_config/fabric/llama_ddp.yaml +18 -0
- fusion_bench_config/fabric/llama_fsdp.yaml +16 -0
- fusion_bench_config/fabric/llama_peft_fsdp.yaml +16 -0
- fusion_bench_config/fabric/loggers/csv_logger.yaml +11 -0
- fusion_bench_config/fabric/loggers/tensorboard_logger.yaml +11 -0
- fusion_bench_config/fabric/loggers/wandb_logger.yaml +2 -0
- fusion_bench_config/fabric/strategy/deepspeed.yaml +10 -0
- fusion_bench_config/fabric/strategy/llama_fsdp.yaml +8 -0
- fusion_bench_config/fabric/strategy/llama_peft_fsdp.yaml +9 -0
- fusion_bench_config/fabric_model_fusion.yaml +20 -0
- fusion_bench_config/hydra/default.yaml +8 -0
- fusion_bench_config/hydra/help/fusion_bench_help.yaml +47 -0
- fusion_bench_config/hydra/job_logging/rich_logging.yaml +20 -0
- fusion_bench_config/llama_full_finetune.yaml +19 -0
- fusion_bench_config/llama_magnitude_pruning.yaml +16 -0
- fusion_bench_config/llama_model_fusion.yaml +17 -0
- fusion_bench_config/method/ada_svd/clip_vision.yaml +9 -0
- fusion_bench_config/method/adamerging/clip.yaml +23 -0
- fusion_bench_config/method/adamerging/layer_wise_flan_t5.yaml +23 -0
- fusion_bench_config/method/adamerging/layer_wise_gpt2.yaml +23 -0
- fusion_bench_config/method/adamerging/llama_sft.yaml +33 -0
- fusion_bench_config/method/adamerging.yaml +23 -0
- fusion_bench_config/method/analysis/task_vector_cos_similarity.yaml +6 -0
- fusion_bench_config/method/analysis/task_vector_violin_plot.yaml +6 -0
- fusion_bench_config/method/classification/clip_continual_finetune.yaml +28 -0
- fusion_bench_config/method/classification/clip_finetune.yaml +26 -0
- fusion_bench_config/method/clip_finetune.yaml +26 -0
- fusion_bench_config/method/concrete_subspace/clip_concrete_layer_wise_adamerging.yaml +27 -0
- fusion_bench_config/method/concrete_subspace/clip_concrete_task_arithmetic.yaml +25 -0
- fusion_bench_config/method/concrete_subspace/clip_concrete_task_wise_adamerging.yaml +27 -0
- fusion_bench_config/method/dare/simple_average.yaml +5 -0
- fusion_bench_config/method/dare/task_arithmetic.yaml +6 -0
- fusion_bench_config/method/dare/ties_merging.yaml +15 -0
- fusion_bench_config/method/dawe/dawe_for_clip.yaml +32 -0
- fusion_bench_config/method/depth_upscaling.yaml +5 -0
- fusion_bench_config/method/dummy.yaml +1 -0
- fusion_bench_config/method/ensemble/max_model_predictor.yaml +1 -0
- fusion_bench_config/method/ensemble/simple_ensemble.yaml +2 -0
- fusion_bench_config/method/ensemble/weighted_ensemble.yaml +6 -0
- fusion_bench_config/method/fisher_merging/clip_fisher_merging.yaml +13 -0
- fusion_bench_config/method/fisher_merging/fisher_merging.yaml +9 -0
- fusion_bench_config/method/fisher_merging/gpt2_fisher_merging.yaml +12 -0
- fusion_bench_config/method/linear/expo.yaml +8 -0
- fusion_bench_config/method/linear/linear_interpolation.yaml +3 -0
- fusion_bench_config/method/linear/llama_expo.yaml +19 -0
- fusion_bench_config/method/linear/llama_expo_with_dare.yaml +19 -0
- fusion_bench_config/method/linear/simple_average_for_llama.yaml +5 -0
- fusion_bench_config/method/linear/task_arithmetic_for_llama.yaml +4 -0
- fusion_bench_config/method/linear/weighted_average.yaml +6 -0
- fusion_bench_config/method/linear/weighted_average_for_llama.yaml +12 -0
- fusion_bench_config/method/lm_finetune/bradley_terry_rm.yaml +47 -0
- fusion_bench_config/method/lm_finetune/fullfinetune_sft.yaml +47 -0
- fusion_bench_config/method/lm_finetune/peftfinetune_sft.yaml +63 -0
- fusion_bench_config/method/mixtral_moe_merging.yaml +4 -0
- fusion_bench_config/method/mixtral_moe_upscaling.yaml +7 -0
- fusion_bench_config/method/model_recombination.yaml +4 -0
- fusion_bench_config/method/opcm/opcm.yaml +12 -0
- fusion_bench_config/method/opcm/task_arithmetic.yaml +12 -0
- fusion_bench_config/method/opcm/ties_merging.yaml +18 -0
- fusion_bench_config/method/opcm/weight_average.yaml +10 -0
- fusion_bench_config/method/pruning/llama_magnitude_pruning.yaml +14 -0
- fusion_bench_config/method/pruning/llama_random_pruning.yaml +9 -0
- fusion_bench_config/method/pruning/llama_wanda_pruning.yaml +16 -0
- fusion_bench_config/method/pruning/magnitude_diff_pruning.yaml +5 -0
- fusion_bench_config/method/pwe_moe_ls_for_clip.yaml +22 -0
- fusion_bench_config/method/rankone_moe/rankone_moe.yaml +26 -0
- fusion_bench_config/method/regmean/clip_regmean.yaml +11 -0
- fusion_bench_config/method/regmean/gpt2_regmean.yaml +12 -0
- fusion_bench_config/method/regmean/regmean.yaml +4 -0
- fusion_bench_config/method/simple_average.yaml +1 -0
- fusion_bench_config/method/slerp/slerp.yaml +6 -0
- fusion_bench_config/method/smile_upscaling/singular_projection_merging.yaml +8 -0
- fusion_bench_config/method/smile_upscaling/smile_mistral_upscaling.yaml +10 -0
- fusion_bench_config/method/smile_upscaling/smile_upscaling.yaml +14 -0
- fusion_bench_config/method/sparselo_pruning/llama_iterative_sparselo.yaml +20 -0
- fusion_bench_config/method/sparselo_pruning/llama_pcp_sparselo.yaml +20 -0
- fusion_bench_config/method/sparselo_pruning/llama_sparselo.yaml +19 -0
- fusion_bench_config/method/surgery/adamerging_surgery.yaml +27 -0
- fusion_bench_config/method/task_arithmetic.yaml +2 -0
- fusion_bench_config/method/task_singular_vector/TaskSingularVectorMerging.yaml +2 -0
- fusion_bench_config/method/ties_merging.yaml +8 -0
- fusion_bench_config/method/trust_region/clip_task_arithmetic.yaml +7 -0
- fusion_bench_config/method/wemoe/sparse_weight_ensembling_moe.yaml +39 -0
- fusion_bench_config/method/wemoe/weight_ensembling_moe.yaml +20 -0
- fusion_bench_config/model/clip-vit/README.md +38 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_TALL14.yaml +22 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_TALL20.yaml +29 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_cifar10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_cifar100.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_dtd.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_eight_tasks.yaml +10 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_emnist_letters.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_eurosat.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_fashion_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_fer2013.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_food101.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_gtsrb.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_kmnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_oxford-iiit-pet.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_oxford_flowers102.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_pcam.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_rendered-sst2.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_resisc45.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_stanford-cars.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_stl10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_sun397.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch16_svhn.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL14.yaml +22 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_TALL20.yaml +29 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_cifar10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_cifar100.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_dtd.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_eight_tasks.yaml +11 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_emnist_letters.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_eurosat.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_fashion_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_fer2013.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_food101.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_gtsrb.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_kmnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_oxford-iiit-pet.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_oxford_flowers102.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_pcam.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_rendered-sst2.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_resisc45.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_stanford-cars.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_stl10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_sun397.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-base-patch32_svhn.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_TALL14.yaml +22 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_TALL20.yaml +29 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_cifar10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_cifar100.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_dtd.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_eight_tasks.yaml +10 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_emnist_letters.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_eurosat.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_fashion_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_fer2013.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_food101.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_gtsrb.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_kmnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_mnist.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford-iiit-pet.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_oxford_flowers102.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_pcam.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_rendered-sst2.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_resisc45.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stanford-cars.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_stl10.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_sun397.yaml +1 -0
- fusion_bench_config/model/clip-vit/clip-vit-large-patch14_svhn.yaml +1 -0
- fusion_bench_config/model/clip-vit/download_TALL20_models.sh +6 -0
- fusion_bench_config/model/clip-vit/generate_vit_model_config.sh +23 -0
- fusion_bench_config/model/flan-t5/flan-t5-base.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-cola.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-cola_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mnli.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mrpc.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-mrpc_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qnli.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qqp.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-qqp_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-rte.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-rte_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-sst2.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-sst2_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-stsb.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-base_glue-stsb_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large.yaml +3 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-cola_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-mnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-mrpc_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-qnli_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-qqp_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-rte_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-sst2_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/flan-t5-large_glue-stsb_lora-16.yaml +4 -0
- fusion_bench_config/model/flan-t5/generate_flan-t5.sh +38 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/_template.yaml +12 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8_lora.yaml +53 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TA8_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL14.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL14_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL20.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_TALL20_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_individual.yaml +19 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch16_individual_lora.yaml +14 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8.yaml +5 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8_control_task.yaml +24 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TA8_model_only.yaml +3 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL14.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL14_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL20.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_TALL20_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_generalization_exp1.yaml +24 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_generalization_exp2.yaml +24 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_individual.yaml +13 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_mtl.yaml +5 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_robustness_clean.yaml +18 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_robustness_corrupted.yaml +29 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_finetuned.yaml +5 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_single_task_projection.yaml +15 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_svhn_and_mnist.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-base-patch32_two_tasks_control_task.yaml +18 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8.yaml +8 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TA8_model_only.yaml +6 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL14.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL14_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL20.yaml +11 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_TALL20_model_only.yaml +9 -0
- fusion_bench_config/modelpool/CLIPVisionModelPool/clip-vit-large-patch14_individual.yaml +19 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_alpaca_cleaned.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_codealpaca.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_for_causallm.yaml +20 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_metamathqa.yaml +19 -0
- fusion_bench_config/modelpool/CausalLMPool/llama_ultrachat.yaml +18 -0
- fusion_bench_config/modelpool/CausalLMPool/simle_mixtral_exp_v4.yaml +21 -0
- fusion_bench_config/modelpool/CausalLMPool/single_llama_model.yaml +17 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/_template.yaml +8 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue.yaml +13 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_lora16.yaml +41 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_glue_lora16_tta.yaml +68 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-base_individual.yaml +7 -0
- fusion_bench_config/modelpool/Seq2SeqLMPool/flan-t5-large_glue_lora16.yaml +45 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/llama_preference700k.yaml +23 -0
- fusion_bench_config/modelpool/SeqenceClassificationModelPool/single_reward_model.yaml +14 -0
- fusion_bench_config/modelpool/automodelpool.yaml +12 -0
- fusion_bench_config/modelpool/gpt-2_glue.yaml +64 -0
- fusion_bench_config/modelpool/mixtral_moe_merging.yaml +14 -0
- fusion_bench_config/modelpool/mixtral_moe_upscaling.yaml +6 -0
- fusion_bench_config/modelpool/nyuv2_modelpool.yaml +26 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v1.yaml +9 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v2.yaml +9 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v3.yaml +9 -0
- fusion_bench_config/modelpool/smile_mistral_exp_v4.yaml +13 -0
- fusion_bench_config/nyuv2_config.yaml +17 -0
- fusion_bench_config/nyuv2_mtl_train.yaml +32 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/_template.yaml +31 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-base-patch32_robustness_corrupted.yaml +27 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8.yaml +11 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_B16.yaml +31 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_L14.yaml +12 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_val.yaml +12 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TA8_with_control_task.yaml +12 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TALL14.yaml +19 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-classification_TALL20.yaml +26 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_cifar10.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_cifar100.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_dtd.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_emnist_letters.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_eurosat.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_fashion_mnist.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_fer2013.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_food101.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_gtsrb.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_kmnist.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_mnist.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford-iiit-pet.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford_flowers102.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_oxford_flowers102_val.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_pcam.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_rendered-sst2.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_resisc45.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_stanford-cars.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_stl10.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_sun397.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip-vit-single-task_svhn.yaml +3 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_rankone_wemoe_clip-vit-classification_TA8.yaml +18 -0
- fusion_bench_config/taskpool/CLIPVisionModelTaskPool/clip_sparse_wemoe_clip-vit-classification_TA8.yaml +18 -0
- fusion_bench_config/taskpool/clip-vit-base-patch32_robustness_clean.yaml +24 -0
- fusion_bench_config/taskpool/clip-vit-base-patch32_robustness_corrupted.yaml +27 -0
- fusion_bench_config/taskpool/clip-vit-base-patch32_svhn_and_mnist.yaml +22 -0
- fusion_bench_config/taskpool/dummy.yaml +2 -0
- fusion_bench_config/taskpool/flan-t5_glue_text_generation.yaml +44 -0
- fusion_bench_config/taskpool/gpt-2_glue.yaml +39 -0
- fusion_bench_config/taskpool/nyuv2_taskpool.yaml +9 -0
- fusion_bench_config/taskpool/reward_model_evaluation.yaml +18 -0
fusion_bench/models/llama/model_utils/embedding.py

```diff
@@ -0,0 +1,87 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import math
+from contextlib import nullcontext
+from typing import TYPE_CHECKING
+
+import torch
+from transformers.integrations import is_deepspeed_zero3_enabled
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel, PreTrainedTokenizer
+
+
+logger = logging.getLogger(__name__)
+
+
+def _noisy_mean_initialization(
+    embed_weight: "torch.Tensor", num_new_tokens: int
+) -> None:
+    embedding_dim = embed_weight.size(1)
+    avg_weight = embed_weight[:-num_new_tokens].mean(dim=0, keepdim=True)
+    noise_weight = torch.empty_like(embed_weight[-num_new_tokens:])
+    noise_weight.normal_(mean=0, std=(1.0 / math.sqrt(embedding_dim)))
+    embed_weight[-num_new_tokens:] = avg_weight + noise_weight
+
+
+def resize_embedding_layer(
+    model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer"
+) -> None:
+    r"""
+    Resize token embeddings.
+    """
+    if is_deepspeed_zero3_enabled():
+        import deepspeed  # type: ignore
+
+        params = [model.get_input_embeddings().weight]
+        if (
+            model.get_output_embeddings() is not None
+            and not model.config.tie_word_embeddings
+        ):
+            params.append(model.get_output_embeddings().weight)
+
+        context_maybe_zero3 = deepspeed.zero.GatheredParameters(params, modifier_rank=0)
+    else:
+        context_maybe_zero3 = nullcontext()
+
+    with context_maybe_zero3:
+        current_embedding_size = model.get_input_embeddings().weight.size(0)
+
+    if len(tokenizer) > current_embedding_size:
+        if getattr(model, "quantization_method", None):
+            raise ValueError("Cannot resize embedding layers of a quantized model.")
+
+        if not isinstance(model.get_output_embeddings(), torch.nn.Linear):
+            raise ValueError(
+                "Current model does not support resizing embedding layers."
+            )
+
+        model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)
+        with context_maybe_zero3:
+            new_embedding_size = model.get_input_embeddings().weight.size(0)
+            num_new_tokens = new_embedding_size - current_embedding_size
+            _noisy_mean_initialization(
+                model.get_input_embeddings().weight.data, num_new_tokens
+            )
+            _noisy_mean_initialization(
+                model.get_output_embeddings().weight.data, num_new_tokens
+            )
+
+        logger.info(
+            "Resized token embeddings from {} to {}.".format(
+                current_embedding_size, new_embedding_size
+            )
+        )
```
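`_noisy_mean_initialization` above fills the rows of newly added tokens with the mean of the existing embedding rows plus Gaussian noise of standard deviation 1/sqrt(embedding_dim). A minimal self-contained sketch of the same idea on a plain tensor (toy sizes chosen here purely for illustration):

```python
import math

import torch

# Toy embedding table: 10 existing rows of dimension 8, plus 2 rows for new tokens.
vocab_size, embedding_dim, num_new_tokens = 10, 8, 2
embed_weight = torch.randn(vocab_size + num_new_tokens, embedding_dim)

# Mean of the pre-existing rows, broadcast over the new rows.
avg_weight = embed_weight[:-num_new_tokens].mean(dim=0, keepdim=True)

# Gaussian noise with std 1/sqrt(embedding_dim), as in _noisy_mean_initialization.
noise_weight = torch.empty_like(embed_weight[-num_new_tokens:])
noise_weight.normal_(mean=0, std=1.0 / math.sqrt(embedding_dim))

# New rows land near the average embedding instead of at arbitrary values,
# keeping the initial logits for new tokens in a sane range.
embed_weight[-num_new_tokens:] = avg_weight + noise_weight
print(embed_weight[-num_new_tokens:].shape)  # torch.Size([2, 8])
```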
fusion_bench/models/llama/model_utils/liger_kernel.py

```diff
@@ -0,0 +1,86 @@
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import logging
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from transformers import PretrainedConfig
+
+logger = logging.getLogger(__name__)
+
+
+def apply_liger_kernel(
+    config: "PretrainedConfig",
+    enable_liger_kernel: bool,
+    is_trainable: bool,
+    require_logits: bool,
+) -> None:
+    """
+    References:
+        - https://github.com/linkedin/Liger-Kernel
+    """
+    if not is_trainable or not enable_liger_kernel:
+        return
+
+    model_type = getattr(config, "model_type", None)
+    if model_type == "gemma":
+        from liger_kernel.transformers import (
+            apply_liger_kernel_to_gemma as apply_liger_kernel,
+        )
+    elif model_type == "gemma2":
+        from liger_kernel.transformers import (
+            apply_liger_kernel_to_gemma2 as apply_liger_kernel,
+        )
+    elif model_type == "llama":
+        from liger_kernel.transformers import (
+            apply_liger_kernel_to_llama as apply_liger_kernel,
+        )
+    elif model_type == "mistral":
+        from liger_kernel.transformers import (
+            apply_liger_kernel_to_mistral as apply_liger_kernel,
+        )
+    elif model_type == "mixtral":
+        from liger_kernel.transformers import (
+            apply_liger_kernel_to_mixtral as apply_liger_kernel,
+        )
+    elif model_type == "phi3":
+        from liger_kernel.transformers import (
+            apply_liger_kernel_to_phi3 as apply_liger_kernel,
+        )
+    elif model_type == "qwen2":
+        from liger_kernel.transformers import (
+            apply_liger_kernel_to_qwen2 as apply_liger_kernel,
+        )
+    elif model_type == "qwen2_vl":
+        from liger_kernel.transformers import (
+            apply_liger_kernel_to_qwen2_vl as apply_liger_kernel,
+        )
+    else:
+        logger.warning("Current model does not support liger kernel.")
+        return
+
+    if (
+        require_logits
+        and "fused_linear_cross_entropy"
+        in inspect.signature(apply_liger_kernel).parameters
+    ):
+        logger.info("Current training stage does not support chunked cross entropy.")
+        kwargs = {"fused_linear_cross_entropy": False}
+    else:
+        kwargs = {}
+
+    apply_liger_kernel(**kwargs)
+    logger.info("Liger kernel has been applied to the model.")
```
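`apply_liger_kernel` dispatches on `config.model_type`, deliberately rebinding its own name to the matching `apply_liger_kernel_to_*` patcher from the optional `liger-kernel` package and then calling it once. A hedged usage sketch, assuming `liger-kernel` and `transformers` are installed (the `LlamaConfig` here is only a convenient source of `model_type == "llama"`):

```python
from transformers import LlamaConfig

from fusion_bench.models.llama.model_utils.liger_kernel import apply_liger_kernel

# Any config whose model_type is a supported architecture works;
# unsupported types only log a warning and return.
config = LlamaConfig()

apply_liger_kernel(
    config,
    enable_liger_kernel=True,  # a no-op when False
    is_trainable=True,         # also a no-op for inference-only loads
    require_logits=False,      # True disables fused_linear_cross_entropy when supported
)
```

Note that the patchers modify the `transformers` model classes globally, so this is a process-wide switch rather than a per-model one.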
@@ -0,0 +1,112 @@
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
    from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer


logger = logging.getLogger(__name__)


def find_all_linear_modules(
    model: "PreTrainedModel", freeze_vision_tower: bool
) -> List[str]:
    r"""
    Finds all available modules to apply LoRA or GaLore.
    """
    model_type = getattr(model.config, "model_type", None)
    forbidden_modules = {"lm_head"}
    if model_type == "chatglm":
        forbidden_modules.add("output_layer")
    elif model_type == "internlm2":
        forbidden_modules.add("output")
    elif model_type in [
        "llava",
        "llava_next",
        "llava_next_video",
        "paligemma",
        "video_llava",
    ]:
        forbidden_modules.add("multi_modal_projector")
    elif model_type == "qwen2_vl":
        forbidden_modules.add("merger")

    if freeze_vision_tower:
        if model_type == "qwen2_vl":
            forbidden_modules.add("visual")
        else:
            forbidden_modules.add("vision_tower")

    module_names = set()
    for name, module in model.named_modules():
        if any(forbidden_module in name for forbidden_module in forbidden_modules):
            continue

        if (
            "Linear" in module.__class__.__name__
            and "Embedding" not in module.__class__.__name__
        ):
            module_names.add(name.split(".")[-1])

    logger.info("Found linear modules: {}".format(",".join(module_names)))
    return list(module_names)


def find_expanded_modules(
    model: "PreTrainedModel", target_modules: List[str], num_layer_trainable: int
) -> List[str]:
    r"""
    Finds the modules in the expanded blocks to apply LoRA.
    """
    num_layers = getattr(model.config, "num_hidden_layers", None)
    if not num_layers:
        raise ValueError("Model is not supported.")

    if num_layers % num_layer_trainable != 0:
        raise ValueError(
            "`num_layers` {} should be divisible by `num_layer_trainable` {}.".format(
                num_layers, num_layer_trainable
            )
        )

    stride = num_layers // num_layer_trainable
    trainable_layer_ids = range(stride - 1, num_layers + stride - 1, stride)
    trainable_layers = [".{:d}.".format(idx) for idx in trainable_layer_ids]
    module_names = []
    for name, _ in model.named_modules():
        if any(target_module in name for target_module in target_modules) and any(
            trainable_layer in name for trainable_layer in trainable_layers
        ):
            module_names.append(name)

    logger.info(
        "Apply LoRA to layers: {}".format(",".join(map(str, trainable_layer_ids)))
    )
    return module_names


def register_autoclass(
    config: "PretrainedConfig",
    model: "PreTrainedModel",
    tokenizer: "PreTrainedTokenizer",
):
    if "AutoConfig" in getattr(config, "auto_map", {}):
        config.__class__.register_for_auto_class()
    if "AutoModelForCausalLM" in getattr(config, "auto_map", {}):
        model.__class__.register_for_auto_class()
    if "AutoTokenizer" in tokenizer.init_kwargs.get("auto_map", {}):
        tokenizer.__class__.register_for_auto_class()
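A hedged usage sketch for the two helpers above: `find_all_linear_modules` collects candidate LoRA target names, and `find_expanded_modules` narrows them to evenly spaced blocks. The checkpoint name and layer counts are illustrative assumptions.

# Assumed usage sketch for the module-discovery helpers above.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

# For a Llama model this typically yields names such as
# {"q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"}.
target_modules = find_all_linear_modules(model, freeze_vision_tower=True)

# With 32 hidden layers and num_layer_trainable=8 the stride is 4, so only
# modules inside blocks 3, 7, 11, ..., 31 are returned.
expanded = find_expanded_modules(model, target_modules, num_layer_trainable=8)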
@@ -0,0 +1,52 @@
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING, Optional

import torch

if TYPE_CHECKING:
    from transformers import PretrainedConfig, PreTrainedModel

MOD_SUPPORTED_MODELS = {
    "bloom",
    "falcon",
    "gemma",
    "llama",
    "mistral",
    "mixtral",
    "phi",
    "starcoder2",
}


def load_mod_pretrained_model(**init_kwargs) -> "PreTrainedModel":
    from MoD import AutoMoDModelForCausalLM

    return AutoMoDModelForCausalLM.from_pretrained(**init_kwargs)


def convert_pretrained_model_to_mod(
    model: "PreTrainedModel",
    config: "PretrainedConfig",
    compute_dtype: Optional[torch.dtype] = None,
) -> "PreTrainedModel":
    from MoD import apply_mod_to_hf

    if getattr(config, "model_type", None) not in MOD_SUPPORTED_MODELS:
        raise ValueError("Current model is not supported by mixture-of-depth.")

    model = apply_mod_to_hf(model)
    model = model.to(compute_dtype)
    return model
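A minimal sketch of the conversion flow, assuming the third-party `MoD` package is installed. Only the call into `convert_pretrained_model_to_mod` is grounded in the code above; the checkpoint name is illustrative.

# Assumed usage sketch: convert a dense supported checkpoint to mixture-of-depths.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

# Raises ValueError if config.model_type is not in MOD_SUPPORTED_MODELS.
mod_model = convert_pretrained_model_to_mod(model, config, compute_dtype=torch.bfloat16)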
@@ -0,0 +1,241 @@
# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by the HuggingFace's Transformers library.
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava/modeling_llava.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import TYPE_CHECKING, List, Sequence, Set, Tuple, Union

import torch
import transformers.models
from transformers.activations import ACT2FN
from transformers.utils import logging as transformers_logging

if TYPE_CHECKING:
    from transformers import LlavaConfig, PretrainedConfig, PreTrainedModel


logger = logging.getLogger(__name__)
transformers_logger = transformers_logging.get_logger(__name__)


class LlavaMultiModalProjectorForYiVL(torch.nn.Module):
    def __init__(self, config: "LlavaConfig") -> None:
        super().__init__()

        self.config = config
        if config is None:
            return

        self.linear_1 = torch.nn.Linear(
            config.vision_config.hidden_size, config.text_config.hidden_size, bias=True
        )
        self.linear_2 = torch.nn.LayerNorm(config.text_config.hidden_size, bias=True)
        self.linear_3 = torch.nn.Linear(
            config.text_config.hidden_size, config.text_config.hidden_size, bias=True
        )
        self.linear_4 = torch.nn.LayerNorm(config.text_config.hidden_size, bias=True)
        self.act = ACT2FN[config.projector_hidden_act]

    def forward(self, image_features: "torch.Tensor") -> "torch.Tensor":
        hidden_states = self.linear_1(image_features)
        hidden_states = self.linear_2(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_3(hidden_states)
        hidden_states = self.linear_4(hidden_states)
        if hidden_states.dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.linear_1.weight.dtype

            transformers_logger.warning_once(
                "The hidden states seem to have been silently cast to float32."
            )
            hidden_states = hidden_states.to(target_dtype)

        return hidden_states


class LlavaMultiModalProjectorForYiVLForVLLM(LlavaMultiModalProjectorForYiVL):
    def __init__(
        self, vision_hidden_size: int, text_hidden_size: int, projector_hidden_act: str
    ) -> None:
        super().__init__(config=None)

        self.linear_1 = torch.nn.Linear(vision_hidden_size, text_hidden_size, bias=True)
        self.linear_2 = torch.nn.LayerNorm(text_hidden_size, bias=True)
        self.linear_3 = torch.nn.Linear(text_hidden_size, text_hidden_size, bias=True)
        self.linear_4 = torch.nn.LayerNorm(text_hidden_size, bias=True)
        self.act = ACT2FN[projector_hidden_act]


def autocast_projector_dtype(
    model: "PreTrainedModel", model_args: "ModelArguments"
) -> None:
    r"""
    Casts projector output to half precision for fine-tuning quantized VLMs.
    """

    def _mm_projector_forward_post_hook(
        module: "torch.nn.Module", args: Tuple["torch.Tensor"], output: "torch.Tensor"
    ) -> "torch.Tensor":
        return output.to(model_args.compute_dtype)

    if getattr(model, "quantization_method", None):
        model_type = getattr(model.config, "model_type", None)
        if model_type in [
            "llava",
            "llava_next",
            "llava_next_video",
            "paligemma",
            "video_llava",
        ]:
            mm_projector: "torch.nn.Module" = getattr(model, "multi_modal_projector")
        elif model_type == "qwen2_vl":
            mm_projector: "torch.nn.Module" = getattr(
                getattr(model, "visual"), "merger"
            )
        else:
            return

        logger.info(
            "Casting multimodal projector outputs to {}.".format(
                model_args.compute_dtype
            )
        )
        mm_projector.register_forward_hook(_mm_projector_forward_post_hook)


def configure_visual_model(config: "PretrainedConfig") -> None:
    r"""
    Patches VLMs before loading them.
    """
    model_type = getattr(config, "model_type", None)
    if model_type in [
        "llava",
        "llava_next",
        "llava_next_video",
        "paligemma",
        "video_llava",
    ]:  # required for ds zero3 and valuehead models
        setattr(config, "hidden_size", getattr(config.text_config, "hidden_size", None))

    if getattr(config, "is_yi_vl_derived_model", None):
        logger.info("Detected Yi-VL model, applying projector patch.")
        transformers.models.llava.modeling_llava.LlavaMultiModalProjector = (
            LlavaMultiModalProjectorForYiVL
        )


def get_forbidden_modules(
    config: "PretrainedConfig", finetuning_args: "FinetuningArguments"
) -> Set[str]:
    r"""
    Freezes vision tower and language model for VLM full/freeze tuning.
    """
    model_type = getattr(config, "model_type", None)
    forbidden_modules = set()
    if model_type in [
        "llava",
        "llava_next",
        "llava_next_video",
        "paligemma",
        "video_llava",
    ]:
        if finetuning_args.freeze_vision_tower:
            forbidden_modules.add("vision_tower")

        if finetuning_args.train_mm_proj_only:
            forbidden_modules.add("language_model")

    elif model_type == "qwen2_vl":
        if finetuning_args.freeze_vision_tower:
            forbidden_modules.add("visual")

        if finetuning_args.train_mm_proj_only:
            raise ValueError("Qwen2-VL models do not support `train_mm_proj_only`.")

    return forbidden_modules


def get_image_seqlen(config: "PretrainedConfig") -> int:
    r"""
    Computes the number of special tokens per image.
    """
    model_type = getattr(config, "model_type", None)
    if model_type == "llava":
        image_seqlen = (
            config.vision_config.image_size // config.vision_config.patch_size
        ) ** 2
        if (
            getattr(config, "vision_feature_select_strategy", "default") == "full"
        ):  # add [CLS] token
            image_seqlen += 1
    elif model_type == "paligemma":
        image_seqlen = config.vision_config.num_image_tokens
    else:
        image_seqlen = -1

    return image_seqlen


def get_patch_size(config: "PretrainedConfig") -> int:
    r"""
    Computes the patch size of the ViT.
    """
    patch_size = getattr(config.vision_config, "patch_size", -1)
    return patch_size


def get_vision_feature_select_strategy(config: "PretrainedConfig") -> str:
    r"""
    Gets the vision_feature_select_strategy.
    """
    vision_feature_select_strategy = getattr(
        config, "vision_feature_select_strategy", "default"
    )
    return vision_feature_select_strategy


def patch_target_modules(
    config: "PretrainedConfig",
    finetuning_args: "FinetuningArguments",
    target_modules: Sequence[str],
) -> Union[str, List[str]]:
    r"""
    Freezes vision tower for VLM LoRA tuning.
    """
    model_type = getattr(config, "model_type", None)
    if finetuning_args.freeze_vision_tower:
        if model_type in [
            "llava",
            "llava_next",
            "llava_next_video",
            "paligemma",
            "video_llava",
        ]:
            return "^(?!.*vision_tower).*(?:{}).*".format("|".join(target_modules))
        elif model_type == "qwen2_vl":
            return "^(?!.*visual).*(?:{}).*".format("|".join(target_modules))
        else:
            return target_modules
    else:
        if model_type == "qwen2_vl":
            return "^(?!.*patch_embed).*(?:{}).*".format("|".join(target_modules))
        else:
            return target_modules
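The negative-lookahead patterns returned by `patch_target_modules` deserve a brief unpacking: PEFT accepts a single regex string as `target_modules`, and `(?!.*vision_tower)` rejects any qualified module name that passes through the vision tower. A small self-contained check (module names are illustrative):

import re

# Same pattern the llava branch above builds for target_modules ["q_proj", "v_proj"].
pattern = "^(?!.*vision_tower).*(?:{}).*".format("|".join(["q_proj", "v_proj"]))

# Language-model projections match; vision-tower projections are excluded.
assert re.match(pattern, "language_model.model.layers.0.self_attn.q_proj")
assert re.match(pattern, "vision_tower.encoder.layers.0.self_attn.q_proj") is None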
@@ -0,0 +1,78 @@
"""
Modified from the Llama-Factory library.
"""

# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
from types import MethodType
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional

import torch
from peft import PeftModel
from transformers import PreTrainedTokenizerBase

from .model_utils.visual import (
    get_image_seqlen,
    get_patch_size,
    get_vision_feature_select_strategy,
)

if TYPE_CHECKING:
    from transformers import PretrainedConfig, PreTrainedTokenizer, ProcessorMixin


logger = logging.getLogger(__name__)


def patch_tokenizer_(tokenizer: "PreTrainedTokenizer") -> None:
    if "PreTrainedTokenizerBase" not in str(tokenizer._pad.__func__):
        tokenizer._pad = MethodType(PreTrainedTokenizerBase._pad, tokenizer)


def patch_processor_(
    processor: "ProcessorMixin",
    config: "PretrainedConfig",
    tokenizer: "PreTrainedTokenizer",
    image_resolution: int = 512,
    video_resolution: int = 128,
    video_fps: int = 2,
    video_maxlen: int = 64,
) -> None:
    """
    Patch a processor with additional attributes.

    Args:
        processor (ProcessorMixin): ProcessorMixin instance.
        config (PretrainedConfig): PretrainedConfig instance.
        tokenizer (PreTrainedTokenizer): PreTrainedTokenizer instance.
        image_resolution (int): Image resolution; keeps the height or width of images below this value.
        video_resolution (int): Video resolution; keeps the height or width of videos below this value.
        video_fps (int): The number of frames to sample per second for video inputs.
        video_maxlen (int): The maximum number of frames to sample from video inputs.
    """
    setattr(processor, "tokenizer", tokenizer)
    setattr(processor, "image_seqlen", get_image_seqlen(config))
    setattr(processor, "image_resolution", image_resolution)
    setattr(processor, "patch_size", get_patch_size(config))
    setattr(processor, "video_resolution", video_resolution)
    setattr(processor, "video_fps", video_fps)
    setattr(processor, "video_maxlen", video_maxlen)
    setattr(
        processor,
        "vision_feature_select_strategy",
        get_vision_feature_select_strategy(config),
    )
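A hedged sketch of wiring the two patch helpers together after loading a multimodal checkpoint; the attributes set on the processor are the ones downstream collators read. The checkpoint name and printed values are illustrative assumptions.

# Assumed usage sketch: patch tokenizer and processor after loading a VLM.
from transformers import AutoConfig, AutoProcessor, AutoTokenizer

name = "llava-hf/llava-1.5-7b-hf"
config = AutoConfig.from_pretrained(name)
tokenizer = AutoTokenizer.from_pretrained(name)
processor = AutoProcessor.from_pretrained(name)

patch_tokenizer_(tokenizer)
patch_processor_(processor, config, tokenizer, image_resolution=512)

# For llava-1.5 this is (336 // 14) ** 2 == 576 image tokens per image.
print(processor.image_seqlen, processor.vision_feature_select_strategy)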