liger-kernel 0.5.9__py3-none-any.whl → 0.6.0__py3-none-any.whl

Files changed (55)
  1. liger_kernel/chunked_loss/__init__.py +1 -0
  2. liger_kernel/chunked_loss/cosine_similarity_loss.py +127 -0
  3. liger_kernel/chunked_loss/dpo_loss.py +1 -1
  4. liger_kernel/chunked_loss/functional.py +2 -0
  5. liger_kernel/chunked_loss/fused_linear_preference.py +0 -1
  6. liger_kernel/chunked_loss/jsd_loss.py +2 -2
  7. liger_kernel/ops/dyt.py +111 -179
  8. liger_kernel/ops/fused_neighborhood_attention.py +1022 -0
  9. liger_kernel/ops/geglu.py +1 -1
  10. liger_kernel/ops/grpo_loss.py +310 -0
  11. liger_kernel/ops/multi_token_attention.py +207 -0
  12. liger_kernel/ops/rms_norm.py +265 -54
  13. liger_kernel/ops/softmax.py +201 -0
  14. liger_kernel/ops/sparsemax.py +179 -0
  15. liger_kernel/ops/swiglu.py +1 -1
  16. liger_kernel/transformers/__init__.py +8 -0
  17. liger_kernel/transformers/dyt.py +5 -3
  18. liger_kernel/transformers/fsdp.py +55 -0
  19. liger_kernel/transformers/functional.py +70 -0
  20. liger_kernel/transformers/fused_neighborhood_attention.py +234 -0
  21. liger_kernel/transformers/grpo_loss.py +98 -0
  22. liger_kernel/transformers/model/gemma.py +25 -16
  23. liger_kernel/transformers/model/gemma2.py +27 -14
  24. liger_kernel/transformers/model/gemma3.py +62 -106
  25. liger_kernel/transformers/model/glm4.py +16 -13
  26. liger_kernel/transformers/model/llama.py +81 -18
  27. liger_kernel/transformers/model/llama4.py +108 -0
  28. liger_kernel/transformers/model/llava.py +95 -132
  29. liger_kernel/transformers/model/mistral.py +13 -14
  30. liger_kernel/transformers/model/mixtral.py +16 -15
  31. liger_kernel/transformers/model/mllama.py +16 -14
  32. liger_kernel/transformers/model/olmo2.py +16 -13
  33. liger_kernel/transformers/model/paligemma.py +8 -9
  34. liger_kernel/transformers/model/phi3.py +25 -16
  35. liger_kernel/transformers/model/qwen2.py +24 -15
  36. liger_kernel/transformers/model/qwen2_5_vl.py +41 -97
  37. liger_kernel/transformers/model/qwen2_vl.py +38 -106
  38. liger_kernel/transformers/model/qwen3.py +11 -9
  39. liger_kernel/transformers/model/qwen3_moe.py +132 -0
  40. liger_kernel/transformers/monkey_patch.py +424 -81
  41. liger_kernel/transformers/multi_token_attention.py +64 -0
  42. liger_kernel/transformers/rms_norm.py +40 -4
  43. liger_kernel/transformers/softmax.py +12 -0
  44. liger_kernel/transformers/sparsemax.py +16 -0
  45. liger_kernel/transformers/swiglu.py +21 -0
  46. liger_kernel/transformers/trainer/orpo_trainer.py +1 -53
  47. liger_kernel/utils.py +11 -0
  48. {liger_kernel-0.5.9.dist-info → liger_kernel-0.6.0.dist-info}/METADATA +41 -21
  49. liger_kernel-0.6.0.dist-info/RECORD +97 -0
  50. {liger_kernel-0.5.9.dist-info → liger_kernel-0.6.0.dist-info}/WHEEL +1 -1
  51. liger_kernel/transformers/gema3_rms.py +0 -8
  52. liger_kernel-0.5.9.dist-info/RECORD +0 -84
  53. {liger_kernel-0.5.9.dist-info → liger_kernel-0.6.0.dist-info}/licenses/LICENSE +0 -0
  54. {liger_kernel-0.5.9.dist-info → liger_kernel-0.6.0.dist-info}/licenses/NOTICE +0 -0
  55. {liger_kernel-0.5.9.dist-info → liger_kernel-0.6.0.dist-info}/top_level.txt +0 -0
@@ -2,6 +2,7 @@ import inspect
  import logging

  from functools import partial
+ from types import MethodType
  from typing import Callable

  import transformers
@@ -35,6 +36,13 @@ from liger_kernel.transformers.swiglu import LigerBlockSparseTop2MLP
  from liger_kernel.transformers.swiglu import LigerPhi3SwiGLUMLP
  from liger_kernel.transformers.swiglu import LigerSwiGLUMLP

+ try:
+     import peft
+
+     PEFT_AVAILABLE = True
+ except ImportError:
+     PEFT_AVAILABLE = False
+
  transformer_version = version.parse(transformers.__version__)

  logger = logging.getLogger(__name__)
@@ -47,23 +55,72 @@ def _bind_method_to_module(module, method_name: str, new_method: Callable):
      module.__dict__[method_name] = new_method.__get__(module, module.__class__)


- def _patch_rms_norm_module(module, offset=0.0, eps=1e-6, casting_mode="llama", in_place=True):
-     module.offset = offset
-     module.casting_mode = casting_mode
-     module.variance_epsilon = getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps
-     module.in_place = in_place
-     _bind_method_to_module(module, "forward", LigerRMSNorm.forward)
-     _bind_method_to_module(module, "extra_repr", LigerRMSNorm.extra_repr)
-     module.__class__.__name__ = LigerRMSNorm.__name__
+ def _patch_rms_norm_module(module, offset=0.0, eps=1e-6, casting_mode="llama", in_place=True, row_mode=None):
+     # Check if the module is a PEFT ModulesToSaveWrapper
+     # If it is, we need to patch the modules_to_save.default and original_modules
+     if PEFT_AVAILABLE and isinstance(module, peft.utils.other.ModulesToSaveWrapper):
+         module.modules_to_save.default.offset = offset
+         module.modules_to_save.default.casting_mode = casting_mode
+         module.modules_to_save.default.variance_epsilon = (
+             getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps
+         )
+         module.modules_to_save.default.in_place = in_place
+         module.modules_to_save.default.row_mode = row_mode
+         module.original_module.offset = offset
+         module.original_module.casting_mode = casting_mode
+         module.original_module.variance_epsilon = (
+             getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps
+         )
+         module.original_module.in_place = in_place
+         module.original_module.row_mode = row_mode
+         _bind_method_to_module(module.modules_to_save.default, "forward", LigerRMSNorm.forward)
+         _bind_method_to_module(module.modules_to_save.default, "extra_repr", LigerRMSNorm.extra_repr)
+         _bind_method_to_module(module.original_module, "forward", LigerRMSNorm.forward)
+         _bind_method_to_module(module.original_module, "extra_repr", LigerRMSNorm.extra_repr)
+         module.modules_to_save.default.__class__.__name__ = LigerRMSNorm.__name__
+         module.original_module.__class__.__name__ = LigerRMSNorm.__name__
+     else:
+         module.offset = offset
+         module.casting_mode = casting_mode
+         module.variance_epsilon = getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps
+         module.in_place = in_place
+         module.row_mode = row_mode
+         _bind_method_to_module(module, "forward", LigerRMSNorm.forward)
+         _bind_method_to_module(module, "extra_repr", LigerRMSNorm.extra_repr)
+         module.__class__.__name__ = LigerRMSNorm.__name__


  def _patch_layer_norm_module(module, eps=1e-6):
-     module.variance_epsilon = getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps
-     module.hidden_size = getattr(module, "hidden_size", None) or getattr(module, "normalized_shape", None)
-
-     _bind_method_to_module(module, "forward", LigerLayerNorm.forward)
-     _bind_method_to_module(module, "extra_repr", LigerLayerNorm.extra_repr)
-     module.__class__.__name__ = LigerLayerNorm.__name__
+     # Check if the module is a PEFT ModulesToSaveWrapper
+     # If it is, we need to patch the modules_to_save.default and original_modules
+     if PEFT_AVAILABLE and isinstance(module, peft.utils.other.ModulesToSaveWrapper):
+         module.hidden_size = module.normalized_shape
+         _bind_method_to_module(module, "forward", LigerLayerNorm.forward)
+         _bind_method_to_module(module, "extra_repr", LigerLayerNorm.extra_repr)
+         module.modules_to_save.default.variance_epsilon = (
+             getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps
+         )
+         module.original_module.hidden_size = getattr(module, "hidden_size", None) or getattr(
+             module, "normalized_shape", None
+         )
+         module.original_module.variance_epsilon = (
+             getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps
+         )
+         module.original_module.hidden_size = getattr(module, "hidden_size", None) or getattr(
+             module, "normalized_shape", None
+         )
+         _bind_method_to_module(module.modules_to_save.default, "forward", LigerRMSNorm.forward)
+         _bind_method_to_module(module.modules_to_save.default, "extra_repr", LigerRMSNorm.extra_repr)
+         _bind_method_to_module(module.original_module, "forward", LigerRMSNorm.forward)
+         _bind_method_to_module(module.original_module, "extra_repr", LigerRMSNorm.extra_repr)
+         module.modules_to_save.default.__class__.__name__ = LigerLayerNorm.__name__
+         module.original_module.__class__.__name__ = LigerLayerNorm.__name__
+     else:
+         module.variance_epsilon = getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps
+         module.hidden_size = getattr(module, "hidden_size", None) or getattr(module, "normalized_shape", None)
+         _bind_method_to_module(module, "forward", LigerLayerNorm.forward)
+         _bind_method_to_module(module, "extra_repr", LigerLayerNorm.extra_repr)
+         module.__class__.__name__ = LigerLayerNorm.__name__


  def _patch_swiglu_module(module, liger_module):
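Note on the PEFT-aware branch above: when a PEFT config lists a norm layer in `modules_to_save`, PEFT wraps it in a `ModulesToSaveWrapper` that keeps both an `original_module` and a trainable copy, and 0.6.0 patches both. A minimal, hedged sketch of the scenario this targets; the checkpoint path and config values below are illustrative placeholders, not part of the package:

from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

from liger_kernel.transformers import apply_liger_kernel_to_llama

base = AutoModelForCausalLM.from_pretrained("path/to/llama")  # placeholder checkpoint path

# With modules_to_save=["norm"], the matching norm layer ends up inside a
# peft ModulesToSaveWrapper (original_module + modules_to_save["default"]).
peft_model = get_peft_model(
    base,
    LoraConfig(r=8, target_modules=["q_proj", "v_proj"], modules_to_save=["norm"]),
)

# Instance-level patching walks the existing modules, so both copies inside the
# wrapper receive LigerRMSNorm.forward, not just unwrapped layers.
apply_liger_kernel_to_llama(model=peft_model.get_base_model())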
@@ -204,10 +261,16 @@ def apply_liger_kernel_to_llama(

      if fused_linear_cross_entropy:
          if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
-             modeling_llama.LlamaForCausalLM.forward = llama_lce_forward
+             if model is not None:
+                 model.forward = MethodType(llama_lce_forward, model)
+             else:
+                 modeling_llama.LlamaForCausalLM.forward = llama_lce_forward
          else:  # if version < 4.46.1
              logger.warning(TRANSFORMER_DEPRECATION_WARNING)
-             modeling_llama.LlamaForCausalLM.forward = llama_lce_forward_deprecated
+             if model is not None:
+                 model.forward = MethodType(llama_lce_forward_deprecated, model)
+             else:
+                 modeling_llama.LlamaForCausalLM.forward = llama_lce_forward_deprecated

      if model is not None:
          # The model instance already exists, so we need to additionally patch the
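The pattern above, repeated for most models in this release, swaps a class-level forward override for a `MethodType` binding when an instance is passed. A hedged usage sketch (the checkpoint path is a placeholder):

from transformers import AutoModelForCausalLM

from liger_kernel.transformers import apply_liger_kernel_to_llama

# Class-level patching: call before the model is constructed so new instances
# pick up the Liger forward on LlamaForCausalLM itself.
apply_liger_kernel_to_llama(fused_linear_cross_entropy=True)

# Instance-level patching (new in 0.6.0): only this model's forward is rebound
# via MethodType; other LlamaForCausalLM instances in the process are untouched.
model = AutoModelForCausalLM.from_pretrained("path/to/llama")  # placeholder checkpoint path
apply_liger_kernel_to_llama(model=model)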
@@ -261,13 +324,20 @@ def apply_liger_kernel_to_llava(
              logger.warning(TRANSFORMER_DEPRECATION_WARNING)
              modeling_llava.nn.CrossEntropyLoss = LigerCrossEntropyLoss
      if fused_linear_cross_entropy:
-         if transformer_version >= version.parse("4.49.0"):
-             modeling_llava.LlavaForConditionalGeneration.forward = llava_lce_forward
+         if transformer_version >= version.parse("4.52.0"):
+             if model is not None:
+                 model.forward = MethodType(llava_lce_forward, model)
+             else:
+                 modeling_llava.LlavaForConditionalGeneration.forward = llava_lce_forward
+         elif transformer_version >= version.parse("4.49.0") and transformer_version < version.parse("4.52.0"):
+             if model is not None:
+                 model.forward = MethodType(llava_lce_forward_deprecated, model)
+             else:
+                 modeling_llava.LlavaForConditionalGeneration.forward = llava_lce_forward_deprecated
          else:  # if version < 4.49.0
              logger.warning(
-                 "Support for transformers versions < 4.49.0 will soon be discontinued due to issues with incorrect legacy processing. \n Please consider upgrading to avoid potential issues. See details: https://github.com/huggingface/transformers/pull/35526"
+                 "The latest version of Liger does not support transformers < 4.49.0 for llava. Please downgrade your liger version or upgrade your transformer version."
              )
-             modeling_llava.LlavaForConditionalGeneration.forward = llava_lce_forward_deprecated

      if model is not None:
          text_model_name, vision_model_name = model.config.text_config.model_type, model.config.vision_config.model_type
@@ -306,6 +376,92 @@ def apply_liger_kernel_to_llava(
              logger.warning(f"{vision_model_name} is not supported by Liger kernel.")


+ def apply_liger_kernel_to_llama4(
+     rope: bool = False,
+     cross_entropy: bool = False,
+     fused_linear_cross_entropy: bool = True,
+     rms_norm: bool = True,
+     swiglu: bool = True,
+     model: PreTrainedModel = None,
+     layer_norm: bool = True,
+ ) -> None:
+     """
+     Apply Liger kernels to replace original implementation in HuggingFace Llama4 models.
+
+     Args:
+         rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
+         cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+         fused_linear_cross_entropy (bool):
+             Whether to apply Liger's fused linear cross entropy loss. Default is True.
+             `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+             If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+         rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+         swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
+         model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+             loaded. Default is None.
+     """
+     assert not (cross_entropy and fused_linear_cross_entropy), (
+         "cross_entropy and fused_linear_cross_entropy cannot both be True."
+     )
+
+     from transformers.models.llama4 import modeling_llama4
+     from transformers.models.llama4.modeling_llama4 import Llama4ForCausalLM
+     from transformers.models.llama4.modeling_llama4 import Llama4ForConditionalGeneration
+     from transformers.models.llama4.modeling_llama4 import Llama4TextModel
+     from transformers.models.llama4.modeling_llama4 import Llama4VisionModel
+
+     from liger_kernel.transformers.model.llama4 import lce_forward as llama4_lce_forward
+
+     if rope:
+         raise NotImplementedError("liger_rotary_pos_emb is not available for Llama4 models.")
+     if rms_norm:
+         modeling_llama4.Llama4TextRMSNorm = LigerRMSNorm
+     if swiglu:
+         modeling_llama4.Llama4TextMLP = LigerSwiGLUMLP
+
+     if cross_entropy:
+         modeling_llama4.CrossEntropyLoss = LigerCrossEntropyLoss
+
+     if fused_linear_cross_entropy:
+         modeling_llama4.Llama4ForCausalLM.forward = llama4_lce_forward
+
+     if model is not None:
+         # The model instance already exists, so we need to additionally patch the
+         # instance variables that reference already-instantiated modules
+         if isinstance(model, Llama4ForConditionalGeneration):
+             language_model: Llama4ForCausalLM = model.language_model
+             vision_model: Llama4VisionModel = model.vision_model
+             text_model: Llama4TextModel = language_model.model
+         elif isinstance(model, Llama4ForCausalLM):
+             text_model = model.model
+             vision_model = None
+         elif isinstance(model, Llama4TextModel):
+             text_model = model
+             vision_model = None
+
+         else:
+             raise ValueError(f"Unsupported Llama4 model type: {type(model)}")
+
+         if text_model:
+             if rms_norm:
+                 _patch_rms_norm_module(text_model.norm)
+             for decoder_layer in text_model.layers:
+                 if swiglu:
+                     _patch_swiglu_module(decoder_layer.feed_forward, LigerSwiGLUMLP)
+                 if rms_norm:
+                     _patch_rms_norm_module(decoder_layer.input_layernorm)
+                     _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+
+         if vision_model:
+             _patch_layer_norm_module(vision_model.layernorm_pre)
+             _patch_layer_norm_module(vision_model.layernorm_post)
+
+             for layer in vision_model.model.layers:
+                 if layer_norm:
+                     _patch_layer_norm_module(layer.input_layernorm)
+                     _patch_layer_norm_module(layer.post_attention_layernorm)
+
+
  def apply_liger_kernel_to_mllama(
      rope: bool = True,
      cross_entropy: bool = False,
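A hedged usage sketch for the new `apply_liger_kernel_to_llama4` entry point added above; the checkpoint path is a placeholder, and the import goes through `monkey_patch` rather than assuming a package-level re-export:

from transformers import AutoModelForCausalLM

from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llama4

# Patch the Llama4 classes before instantiation. Passing rope=True raises
# NotImplementedError, since liger_rotary_pos_emb is not available for Llama4.
apply_liger_kernel_to_llama4(rms_norm=True, swiglu=True, fused_linear_cross_entropy=True)

model = AutoModelForCausalLM.from_pretrained("path/to/llama4")  # placeholder checkpoint path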
@@ -347,7 +503,7 @@ def apply_liger_kernel_to_mllama(

      if rope:
          modeling_mllama.apply_rotary_pos_emb = liger_rotary_pos_emb
-     if layer_norm:
+     if layer_norm and model is None:
          modeling_mllama.nn.LayerNorm = LigerLayerNorm
      if rms_norm:
          modeling_mllama.MllamaTextRMSNorm = LigerRMSNorm
@@ -363,10 +519,16 @@ def apply_liger_kernel_to_mllama(
          modeling_mllama.CrossEntropyLoss = LigerCrossEntropyLoss
      if fused_linear_cross_entropy:
          if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
-             modeling_mllama.MllamaForCausalLM.forward = mllama_lce_forward
+             if model is not None:
+                 model.forward = MethodType(mllama_lce_forward, model)
+             else:
+                 modeling_mllama.MllamaForCausalLM.forward = mllama_lce_forward
          else:  # if version < 4.46.1
              logger.warning(TRANSFORMER_DEPRECATION_WARNING)
-             modeling_mllama.MllamaForCausalLM.forward = mllama_lce_forward_deprecated
+             if model is not None:
+                 model.forward = MethodType(mllama_lce_forward_deprecated, model)
+             else:
+                 modeling_mllama.MllamaForCausalLM.forward = mllama_lce_forward_deprecated

      if model is not None:
          # The model instance already exists, so we need to additionally patch the
@@ -375,13 +537,17 @@ def apply_liger_kernel_to_mllama(
          if isinstance(model, MllamaForConditionalGeneration):
              language_model: MllamaForCausalLM = model.language_model
              vision_model: MllamaVisionModel = model.vision_model
-             text_model: MllamaTextModel = language_model.model
+             if isinstance(language_model, MllamaForCausalLM):
+                 text_model: MllamaTextModel = language_model.model
+             else:
+                 text_model = language_model
          elif isinstance(model, MllamaForCausalLM):
              text_model = model.model
              vision_model = None
          elif isinstance(model, MllamaTextModel):
              text_model = model
              vision_model = None
+
          else:
              raise ValueError(f"Unsupported Mllama model type: {type(model)}")

@@ -448,7 +614,17 @@ def apply_liger_kernel_to_mistral(
      if cross_entropy:
          modeling_mistral.CrossEntropyLoss = LigerCrossEntropyLoss
      if fused_linear_cross_entropy:
-         modeling_mistral.MistralForCausalLM.forward = mistral_lce_forward
+         if transformer_version >= version.parse("4.49.0"):
+             if model is not None:
+                 model.forward = MethodType(mistral_lce_forward, model)
+             else:
+                 modeling_mistral.MistralForCausalLM.forward = mistral_lce_forward
+         else:
+             logger.warning(
+                 "The latest version of Liger does not support transformers < 4.49.0 for llava. Please downgrade your liger version or upgrade your transformer version."
+             )
+             logger.warning("LigerFusedLinearCrossEntropy patch is not applied.")
+
      if swiglu:
          modeling_mistral.MistralMLP = LigerSwiGLUMLP

@@ -516,10 +692,16 @@ def apply_liger_kernel_to_mixtral(

      if fused_linear_cross_entropy:
          if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
-             modeling_mixtral.MixtralForCausalLM.forward = mixtral_lce_forward
+             if model is not None:
+                 model.forward = MethodType(mixtral_lce_forward, model)
+             else:
+                 modeling_mixtral.MixtralForCausalLM.forward = mixtral_lce_forward
          else:  # if version < 4.46.1
              logger.warning(TRANSFORMER_DEPRECATION_WARNING)
-             modeling_mixtral.MixtralForCausalLM.forward = mixtral_lce_forward_deprecated
+             if model is not None:
+                 model.forward = MethodType(mixtral_lce_forward_deprecated, model)
+             else:
+                 modeling_mixtral.MixtralForCausalLM.forward = mixtral_lce_forward_deprecated
      if swiglu:
          modeling_mixtral.MixtralBlockSparseTop2MLP = LigerBlockSparseTop2MLP

@@ -573,8 +755,8 @@ def apply_liger_kernel_to_gemma(
      from transformers.models.gemma import modeling_gemma
      from transformers.models.gemma.modeling_gemma import GemmaModel

-     # https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/gemma/modeling_gemma.py#L109
-     LigerRMSNormForGemma = partial(LigerRMSNorm, offset=1.0, init_fn="zeros", casting_mode="gemma")
+     from liger_kernel.transformers.rms_norm import LigerRMSNormForGemma
+
      _patch_rms_norm_module_for_gemma = partial(_patch_rms_norm_module, casting_mode="gemma", offset=1.0)

      if rope:
@@ -593,10 +775,16 @@ def apply_liger_kernel_to_gemma(
          modeling_gemma.GemmaMLP = LigerGEGLUMLP
      if fused_linear_cross_entropy:
          if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
-             modeling_gemma.GemmaForCausalLM.forward = gemma_lce_forward
+             if model is not None:
+                 model.forward = MethodType(gemma_lce_forward, model)
+             else:
+                 modeling_gemma.GemmaForCausalLM.forward = gemma_lce_forward
          else:  # if version < 4.46.1
              logger.warning(TRANSFORMER_DEPRECATION_WARNING)
-             modeling_gemma.GemmaForCausalLM.forward = gemma_lce_forward_deprecated
+             if model is not None:
+                 model.forward = MethodType(gemma_lce_forward_deprecated, model)
+             else:
+                 modeling_gemma.GemmaForCausalLM.forward = gemma_lce_forward_deprecated

      if model is not None:
          # The model instance already exists, so we need to additionally patch the
@@ -647,7 +835,8 @@ def apply_liger_kernel_to_gemma2(
      from transformers.models.gemma2 import modeling_gemma2
      from transformers.models.gemma2.modeling_gemma2 import Gemma2Model

-     LigerRMSNormForGemma2 = partial(LigerRMSNorm, offset=1.0, casting_mode="gemma", init_fn="zeros", in_place=False)
+     from liger_kernel.transformers.rms_norm import LigerRMSNormForGemma2
+
      _patch_rms_norm_module_for_gemma2 = partial(
          _patch_rms_norm_module, offset=1.0, casting_mode="gemma", in_place=False
      )
@@ -667,10 +856,16 @@ def apply_liger_kernel_to_gemma2(
          modeling_gemma2.CrossEntropyLoss = LigerCrossEntropyLoss
      if fused_linear_cross_entropy:
          if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
-             modeling_gemma2.Gemma2ForCausalLM.forward = gemma2_lce_forward
+             if model is not None:
+                 model.forward = MethodType(gemma2_lce_forward, model)
+             else:
+                 modeling_gemma2.Gemma2ForCausalLM.forward = gemma2_lce_forward
          else:
              logger.warning(TRANSFORMER_DEPRECATION_WARNING)
-             modeling_gemma2.Gemma2ForCausalLM.forward = gemma2_lce_forward_deprected
+             if model is not None:
+                 model.forward = MethodType(gemma2_lce_forward_deprected, model)
+             else:
+                 modeling_gemma2.Gemma2ForCausalLM.forward = gemma2_lce_forward_deprected
      if geglu:
          modeling_gemma2.Gemma2MLP = LigerGEGLUMLP

@@ -724,9 +919,10 @@ def apply_liger_kernel_to_gemma3_text(
      from transformers.models.gemma3 import modeling_gemma3
      from transformers.models.gemma3.modeling_gemma3 import Gemma3DecoderLayer
      from transformers.models.gemma3.modeling_gemma3 import Gemma3ForCausalLM
+     from transformers.models.gemma3.modeling_gemma3 import Gemma3TextModel

-     from liger_kernel.transformers.gema3_rms import LigerRMSNormForGemma3
      from liger_kernel.transformers.model.gemma3 import causal_forward
+     from liger_kernel.transformers.rms_norm import LigerRMSNormForGemma3

      _patch_rms_norm_module_for_gemma3 = partial(
          _patch_rms_norm_module, offset=1.0, casting_mode="gemma", in_place=False
@@ -748,15 +944,18 @@ def apply_liger_kernel_to_gemma3_text(
          nn.functional.cross_entropy = liger_cross_entropy

      if fused_linear_cross_entropy:
-         modeling_gemma3.Gemma3ForCausalLM.forward = causal_forward
+         if model is not None:
+             model.forward = MethodType(causal_forward, model)
+         else:
+             modeling_gemma3.Gemma3ForCausalLM.forward = causal_forward

      if model is not None:
          # The model instance already exists, so we need to additionally patch the
          # instance variables that reference already-instantiated modules

-         if isinstance(model, Gemma3ForCausalLM):
+         if isinstance(model, Gemma3ForCausalLM) or isinstance(model, Gemma3TextModel):
              # get the base model from the model instance
-             base_model = model.model
+             base_model = model.model if isinstance(model, Gemma3ForCausalLM) else model

              if rms_norm:
                  _patch_rms_norm_module_for_gemma3(base_model.norm)
@@ -818,7 +1017,7 @@ def apply_liger_kernel_to_gemma3(
          _patch_rms_norm_module, offset=1.0, casting_mode="gemma", in_place=False
      )

-     if layer_norm:
+     if layer_norm and model is None:
          modeling_siglip.nn.LayerNorm = LigerLayerNorm

      apply_liger_kernel_to_gemma3_text(
@@ -829,7 +1028,10 @@ def apply_liger_kernel_to_gemma3(
          modeling_gemma3.nn.CrossEntropyLoss = LigerCrossEntropyLoss

      if fused_linear_cross_entropy:
-         modeling_gemma3.Gemma3ForConditionalGeneration.forward = multimodal_forward
+         if model is not None:
+             model.forward = MethodType(multimodal_forward, model)
+         else:
+             modeling_gemma3.Gemma3ForConditionalGeneration.forward = multimodal_forward

      if model is not None:
          # The model instance already exists, so we need to additionally patch the
@@ -897,7 +1099,9 @@ def apply_liger_kernel_to_paligemma(
      # PaliGemma submodules are ['vision_tower', 'multi_modal_projector', 'language_model']

      from transformers.models.gemma.modeling_gemma import GemmaForCausalLM
+     from transformers.models.gemma.modeling_gemma import GemmaModel
      from transformers.models.gemma2.modeling_gemma2 import Gemma2ForCausalLM
+     from transformers.models.gemma2.modeling_gemma2 import Gemma2Model
      from transformers.models.paligemma import modeling_paligemma
      from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration
      from transformers.models.siglip import modeling_siglip
@@ -908,7 +1112,7 @@ def apply_liger_kernel_to_paligemma(
      from liger_kernel.transformers.model.paligemma import lce_forward_deprecated

      # The vision_tower is a SiglipVisionModel
-     if layer_norm:
+     if layer_norm and model is None:
          modeling_siglip.nn.LayerNorm = LigerLayerNorm

      # SiglipMLP is standard FFN so LigerGEGLUMLP is not compatible
@@ -926,10 +1130,16 @@ def apply_liger_kernel_to_paligemma(
          modeling_paligemma.nn.CrossEntropyLoss = LigerCrossEntropyLoss
      if fused_linear_cross_entropy:
          if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
-             modeling_paligemma.PaliGemmaForConditionalGeneration.forward = lce_forward
+             if model is not None:
+                 model.forward = MethodType(lce_forward, model)
+             else:
+                 modeling_paligemma.PaliGemmaForConditionalGeneration.forward = lce_forward
          else:  # if version < 4.46.1
              logger.warning(TRANSFORMER_DEPRECATION_WARNING)
-             modeling_paligemma.PaliGemmaForConditionalGeneration.forward = lce_forward_deprecated
+             if model is not None:
+                 model.forward = MethodType(lce_forward_deprecated, model)
+             else:
+                 modeling_paligemma.PaliGemmaForConditionalGeneration.forward = lce_forward_deprecated

      if model is not None:
          # The model instance already exists, so we need to additionally patch the
@@ -950,7 +1160,7 @@ def apply_liger_kernel_to_paligemma(

          language_model = model.language_model

-         if isinstance(language_model, GemmaForCausalLM):
+         if isinstance(language_model, (GemmaForCausalLM, GemmaModel)):
              apply_liger_kernel_to_gemma(
                  rope=rope,
                  cross_entropy=False,
@@ -960,7 +1170,7 @@ def apply_liger_kernel_to_paligemma(
                  model=language_model,
              )

-         elif isinstance(language_model, Gemma2ForCausalLM):
+         elif isinstance(language_model, (Gemma2ForCausalLM, Gemma2Model)):
              apply_liger_kernel_to_gemma2(
                  rope=rope,
                  cross_entropy=False,
@@ -1021,10 +1231,16 @@ def apply_liger_kernel_to_qwen2(

      if fused_linear_cross_entropy:
          if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
-             modeling_qwen2.Qwen2ForCausalLM.forward = qwen2_lce_forward
+             if model is not None:
+                 model.forward = MethodType(qwen2_lce_forward, model)
+             else:
+                 modeling_qwen2.Qwen2ForCausalLM.forward = qwen2_lce_forward
          else:  # if version < 4.46.1
              logger.warning(TRANSFORMER_DEPRECATION_WARNING)
-             modeling_qwen2.Qwen2ForCausalLM.forward = qwen2_lce_forward_deprecated
+             if model is not None:
+                 model.forward = MethodType(qwen2_lce_forward_deprecated, model)
+             else:
+                 modeling_qwen2.Qwen2ForCausalLM.forward = qwen2_lce_forward_deprecated

      if swiglu:
          modeling_qwen2.Qwen2MLP = LigerSwiGLUMLP
@@ -1080,7 +1296,10 @@ def apply_liger_kernel_to_qwen3(
          nn.functional.cross_entropy = liger_cross_entropy

      if fused_linear_cross_entropy:
-         modeling_qwen3.Qwen3ForCausalLM.forward = qwen3_lce_forward
+         if model is not None:
+             model.forward = MethodType(qwen3_lce_forward, model)
+         else:
+             modeling_qwen3.Qwen3ForCausalLM.forward = qwen3_lce_forward

      if swiglu:
          modeling_qwen3.Qwen3MLP = LigerSwiGLUMLP
@@ -1102,6 +1321,65 @@ def apply_liger_kernel_to_qwen3(
                  _patch_rms_norm_module(decoder_layer.post_attention_layernorm)


+ def apply_liger_kernel_to_qwen3_moe(
+     rope: bool = True,
+     cross_entropy: bool = False,
+     fused_linear_cross_entropy: bool = True,
+     rms_norm: bool = True,
+     swiglu: bool = True,
+     model: PreTrainedModel = None,
+ ) -> None:
+     """
+     Apply Liger kernels to replace original implementation in HuggingFace Qwen3 models.
+     """
+     assert not (cross_entropy and fused_linear_cross_entropy), (
+         "cross_entropy and fused_linear_cross_entropy cannot both be True."
+     )
+
+     from transformers.models.qwen3_moe import modeling_qwen3_moe
+     from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeModel
+
+     from liger_kernel.transformers.model.qwen3_moe import lce_forward as qwen3_lce_forward
+     from liger_kernel.transformers.swiglu import LigerQwen3MoeSwiGLUMLP
+
+     if rope:
+         modeling_qwen3_moe.apply_rotary_pos_emb = liger_rotary_pos_emb
+
+     if rms_norm:
+         modeling_qwen3_moe.Qwen3MoeRMSNorm = LigerRMSNorm
+
+     if cross_entropy:
+         from transformers.loss.loss_utils import nn
+
+         nn.functional.cross_entropy = liger_cross_entropy
+
+     if fused_linear_cross_entropy:
+         if model is not None:
+             model.forward = MethodType(qwen3_lce_forward, model)
+         else:
+             modeling_qwen3_moe.Qwen3MoeForCausalLM.forward = qwen3_lce_forward
+
+     if swiglu:
+         modeling_qwen3_moe.Qwen3MoeMLP = LigerQwen3MoeSwiGLUMLP
+
+     if model is not None:
+         # The model instance already exists, so we need to additionally patch the
+         # instance variables that reference already-instantiated modules
+
+         # get the base model from the model instance
+         base_model: Qwen3MoeModel = getattr(model, model.base_model_prefix, model)
+
+         if rms_norm:
+             _patch_rms_norm_module(base_model.norm)
+         for decoder_layer in base_model.layers:
+             if swiglu:
+                 for mlp_expert in decoder_layer.mlp.experts:
+                     _patch_swiglu_module(mlp_expert, LigerQwen3MoeSwiGLUMLP)
+             if rms_norm:
+                 _patch_rms_norm_module(decoder_layer.input_layernorm)
+                 _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+
+
  def apply_liger_kernel_to_qwen2_vl(
      rope: bool = True,
      cross_entropy: bool = False,
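A hedged sketch of the new `apply_liger_kernel_to_qwen3_moe` function defined above, applied to an already-loaded model; the checkpoint path is a placeholder:

from transformers import AutoModelForCausalLM

from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen3_moe

model = AutoModelForCausalLM.from_pretrained("path/to/qwen3-moe")  # placeholder checkpoint path

# With a model instance, RMSNorm layers are patched in place and every expert in
# decoder_layer.mlp.experts is swapped to LigerQwen3MoeSwiGLUMLP.
apply_liger_kernel_to_qwen3_moe(model=model, rms_norm=True, swiglu=True)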
@@ -1113,7 +1391,7 @@ def apply_liger_kernel_to_qwen2_vl(
  ) -> None:
      """
      Apply Liger kernels to replace original implementation in HuggingFace Qwen2-VL models.
-     NOTE: Qwen2-VL is not available in transformers<4.45.0
+     NOTE: Qwen2-VL is not supported in transformers<4.52.4

      Args:
          cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
@@ -1127,12 +1405,19 @@ def apply_liger_kernel_to_qwen2_vl(
          model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
              loaded. Default is None.
      """
+     if transformer_version < version.parse("4.52.4"):
+         logger.warning("Qwen2-VL support is only compatible with transformers >= 4.52.4")
+         return
+
      assert not (cross_entropy and fused_linear_cross_entropy), (
          "cross_entropy and fused_linear_cross_entropy cannot both be True."
      )

      from transformers.models.qwen2_vl import modeling_qwen2_vl
+     from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VisionTransformerPretrainedModel
+     from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration
      from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLModel
+     from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLTextModel

      from liger_kernel.transformers.model.qwen2_vl import lce_forward as qwen2_vl_lce_forward

@@ -1141,12 +1426,15 @@ def apply_liger_kernel_to_qwen2_vl(
      if rms_norm:
          # https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L439
          modeling_qwen2_vl.Qwen2RMSNorm = LigerRMSNorm
-     if layer_norm:
+     if layer_norm and model is None:
          modeling_qwen2_vl.LayerNorm = LigerLayerNorm
      if cross_entropy:
          modeling_qwen2_vl.CrossEntropyLoss = LigerCrossEntropyLoss
      if fused_linear_cross_entropy:
-         modeling_qwen2_vl.Qwen2VLForConditionalGeneration.forward = qwen2_vl_lce_forward
+         if model is not None:
+             model.forward = MethodType(qwen2_vl_lce_forward, model)
+         else:
+             modeling_qwen2_vl.Qwen2VLForConditionalGeneration.forward = qwen2_vl_lce_forward
      if swiglu:
          modeling_qwen2_vl.Qwen2MLP = LigerSwiGLUMLP

@@ -1154,24 +1442,38 @@ def apply_liger_kernel_to_qwen2_vl(
          # The model instance already exists, so we need to additionally patch the
          # instance variables that reference already-instantiated modules

-         # get the base model from the model instance
-         base_model: Qwen2VLModel = getattr(model, model.base_model_prefix, model)
+         if isinstance(model, (Qwen2VLForConditionalGeneration, Qwen2VLModel)):
+             # Note: language_model and visual properties can be accessed throught conditional class for BC.
+             # Not sure if it is subject to changes in the future.
+             # Reference: https://github.com/huggingface/transformers/blob/v4.52.4/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1698
+             text_model: Qwen2VLTextModel = model.language_model
+             vision_model: Qwen2VisionTransformerPretrainedModel = model.visual
+         elif isinstance(model, Qwen2VLTextModel):
+             text_model: Qwen2VLTextModel = model
+             vision_model = None
+         else:
+             # Note: Currently there's no support for patching vision model only. Feel free to raise an issue if needed.
+             raise TypeError(
+                 f"Unsupported Qwen2VL model type. `model` must be `Qwen2VLForConditionalGeneration`, `Qwen2VLModel` or `Qwen2VLTextModel`. Got: {type(model)}"
+             )

-         if hasattr(model, "visual"):
-             # Patch Qwen2VisionTransformerPretrainedModel
-             for vision_block in model.visual.blocks:
+         # Patch Qwen2VisionTransformerPretrainedModel
+         if vision_model is not None:
+             for vision_block in vision_model.blocks:
                  if layer_norm:
                      _patch_layer_norm_module(vision_block.norm1)
                      _patch_layer_norm_module(vision_block.norm2)

-         if rms_norm:
-             _patch_rms_norm_module(base_model.norm)
-         for decoder_layer in base_model.layers:
-             if swiglu:
-                 _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP)
+         # Patch Qwen2VisionTextModel
+         if text_model is not None:
              if rms_norm:
-                 _patch_rms_norm_module(decoder_layer.input_layernorm)
-                 _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+                 _patch_rms_norm_module(text_model.norm)
+             for decoder_layer in text_model.layers:
+                 if swiglu:
+                     _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP)
+                 if rms_norm:
+                     _patch_rms_norm_module(decoder_layer.input_layernorm)
+                     _patch_rms_norm_module(decoder_layer.post_attention_layernorm)


  def apply_liger_kernel_to_qwen2_5_vl(
@@ -1197,12 +1499,19 @@ def apply_liger_kernel_to_qwen2_5_vl(
          model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
              loaded. Default is None.
      """
+     if transformer_version < version.parse("4.52.4"):
+         logger.warning("Qwen2.5-VL support is only compatible with transformers >= 4.52.4")
+         return
+
      assert not (cross_entropy and fused_linear_cross_entropy), (
          "cross_entropy and fused_linear_cross_entropy cannot both be True."
      )

      from transformers.models.qwen2_5_vl import modeling_qwen2_5_vl
+     from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VisionTransformerPretrainedModel
+     from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
      from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLModel
+     from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLTextModel

      from liger_kernel.transformers.model.qwen2_5_vl import lce_forward as qwen2_5_vl_lce_forward

@@ -1213,7 +1522,10 @@ def apply_liger_kernel_to_qwen2_5_vl(
      if cross_entropy:
          modeling_qwen2_5_vl.CrossEntropyLoss = LigerCrossEntropyLoss
      if fused_linear_cross_entropy:
-         modeling_qwen2_5_vl.Qwen2_5_VLForConditionalGeneration.forward = qwen2_5_vl_lce_forward
+         if model is not None:
+             model.forward = MethodType(qwen2_5_vl_lce_forward, model)
+         else:
+             modeling_qwen2_5_vl.Qwen2_5_VLForConditionalGeneration.forward = qwen2_5_vl_lce_forward
      if swiglu:
          modeling_qwen2_5_vl.Qwen2MLP = LigerSwiGLUMLP

@@ -1221,24 +1533,37 @@ def apply_liger_kernel_to_qwen2_5_vl(
          # The model instance already exists, so we need to additionally patch the
          # instance variables that reference already-instantiated modules

-         # get the base model from the model instance
-         base_model: Qwen2_5_VLModel = getattr(model, model.base_model_prefix, model)
+         if isinstance(model, (Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLModel)):
+             # Note: language_model and visual properties can be accessed throught conditional class for BC.
+             # Not sure if it is subject to changes in the future.
+             # Reference: https://github.com/huggingface/transformers/blob/v4.52.4/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py#L1823
+             text_model: Qwen2_5_VLTextModel = model.language_model
+             vision_model: Qwen2_5_VisionTransformerPretrainedModel = model.visual
+         elif isinstance(model, Qwen2_5_VLTextModel):
+             text_model: Qwen2_5_VLTextModel = model
+             vision_model = None
+         else:
+             # Note: Currently there's no support for patching vision model only. Feel free to raise an issue if needed.
+             raise TypeError(
+                 f"Unsupported Qwen2VL model type. `model` must be `Qwen2VLForConditionalGeneration`, `Qwen2VLModel` or `Qwen2VLTextModel`. Got: {type(model)}"
+             )

-         if hasattr(model, "visual"):
+         if vision_model is not None:
              # Patch Qwen2_5_VisionTransformerPretrainedModel
              for vision_block in model.visual.blocks:
                  if rms_norm:
                      _patch_rms_norm_module(vision_block.norm1)
                      _patch_rms_norm_module(vision_block.norm2)

-         if rms_norm:
-             _patch_rms_norm_module(base_model.norm)
-         for decoder_layer in base_model.layers:
-             if swiglu:
-                 _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP)
+         if text_model is not None:
              if rms_norm:
-                 _patch_rms_norm_module(decoder_layer.input_layernorm)
-                 _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+                 _patch_rms_norm_module(text_model.norm)
+             for decoder_layer in text_model.layers:
+                 if swiglu:
+                     _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP)
+                 if rms_norm:
+                     _patch_rms_norm_module(decoder_layer.input_layernorm)
+                     _patch_rms_norm_module(decoder_layer.post_attention_layernorm)


  def apply_liger_kernel_to_phi3(
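A hedged sketch of the reworked Qwen2-VL / Qwen2.5-VL instance patching above, which now requires transformers >= 4.52.4 and resolves the text and vision sub-models via model.language_model and model.visual; the checkpoint path is a placeholder:

from transformers import Qwen2_5_VLForConditionalGeneration

from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen2_5_vl

model = Qwen2_5_VLForConditionalGeneration.from_pretrained("path/to/qwen2.5-vl")  # placeholder path

# On transformers < 4.52.4 this logs a warning and returns without patching anything;
# otherwise the text decoder (RMSNorm, SwiGLU) and the vision blocks are patched in place.
apply_liger_kernel_to_qwen2_5_vl(model=model)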
@@ -1287,10 +1612,16 @@ def apply_liger_kernel_to_phi3(
          modeling_phi3.CrossEntropyLoss = LigerCrossEntropyLoss
      if fused_linear_cross_entropy:
          if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
-             modeling_phi3.Phi3ForCausalLM.forward = phi3_lce_forward
+             if model is not None:
+                 model.forward = MethodType(phi3_lce_forward, model)
+             else:
+                 modeling_phi3.Phi3ForCausalLM.forward = phi3_lce_forward
          else:  # if version < 4.46.1
              logger.warning(TRANSFORMER_DEPRECATION_WARNING)
-             modeling_phi3.Phi3ForCausalLM.forward = phi3_lce_forward_deprecated
+             if model is not None:
+                 model.forward = MethodType(phi3_lce_forward_deprecated, model)
+             else:
+                 modeling_phi3.Phi3ForCausalLM.forward = phi3_lce_forward_deprecated

      if model is not None:
          # The model instance already exists, so we need to additionally patch the
@@ -1341,11 +1672,12 @@ def apply_liger_kernel_to_olmo2(
      from transformers.models.olmo2.modeling_olmo2 import Olmo2Model

      from liger_kernel.transformers.model.olmo2 import lce_forward as olmo2_lce_forward
+     from liger_kernel.transformers.rms_norm import LigerRMSNormForOlmo2

      if rope:
          modeling_olmo2.apply_rotary_pos_emb = liger_rotary_pos_emb
      if rms_norm:
-         modeling_olmo2.Olmo2RMSNorm = partial(LigerRMSNorm, in_place=False)
+         modeling_olmo2.Olmo2RMSNorm = LigerRMSNormForOlmo2
      if swiglu:
          modeling_olmo2.Olmo2MLP = LigerSwiGLUMLP
      if cross_entropy:
@@ -1353,7 +1685,10 @@ def apply_liger_kernel_to_olmo2(

          nn.functional.cross_entropy = liger_cross_entropy
      if fused_linear_cross_entropy:
-         modeling_olmo2.Olmo2ForCausalLM.forward = olmo2_lce_forward
+         if model is not None:
+             model.forward = MethodType(olmo2_lce_forward, model)
+         else:
+             modeling_olmo2.Olmo2ForCausalLM.forward = olmo2_lce_forward

      if model is not None:
          # The model instance already exists, so we need to additionally patch the
@@ -1404,11 +1739,12 @@ def apply_liger_kernel_to_glm4(
      from transformers.models.glm4.modeling_glm4 import Glm4Model

      from liger_kernel.transformers.model.glm4 import lce_forward as glm4_lce_forward
+     from liger_kernel.transformers.rms_norm import LigerRMSNormForGlm4

      if rope:
          raise NotImplementedError("liger_rotary_pos_emb is not available for Glm4 models.")
      if rms_norm:
-         modeling_glm4.Glm4RMSNorm = partial(LigerRMSNorm, in_place=False)
+         modeling_glm4.Glm4RMSNorm = LigerRMSNormForGlm4
      if swiglu:
          modeling_glm4.Glm4MLP = LigerPhi3SwiGLUMLP
      if cross_entropy:
@@ -1416,7 +1752,10 @@ def apply_liger_kernel_to_glm4(

          nn.functional.cross_entropy = liger_cross_entropy
      if fused_linear_cross_entropy:
-         modeling_glm4.Glm4ForCausalLM.forward = glm4_lce_forward
+         if model is not None:
+             model.forward = MethodType(glm4_lce_forward, model)
+         else:
+             modeling_glm4.Glm4ForCausalLM.forward = glm4_lce_forward

      if model is not None:
          # The model instance already exists, so we need to additionally patch the
@@ -1446,6 +1785,8 @@ MODEL_TYPE_TO_APPLY_LIGER_FN = {
      "gemma3": apply_liger_kernel_to_gemma3,
      "glm4": apply_liger_kernel_to_glm4,
      "llama": apply_liger_kernel_to_llama,
+     "llama4_text": apply_liger_kernel_to_llama4,
+     "llama4": apply_liger_kernel_to_llama4,
      "llava": apply_liger_kernel_to_llava,
      "granite": apply_liger_kernel_to_granite,
      "mllama": apply_liger_kernel_to_mllama,
@@ -1455,8 +1796,11 @@ MODEL_TYPE_TO_APPLY_LIGER_FN = {
      "olmo2": apply_liger_kernel_to_olmo2,
      "qwen2": apply_liger_kernel_to_qwen2,
      "qwen3": apply_liger_kernel_to_qwen3,
+     "qwen3_moe": apply_liger_kernel_to_qwen3_moe,
      "qwen2_vl": apply_liger_kernel_to_qwen2_vl,
+     "qwen2_vl_text": apply_liger_kernel_to_qwen2_vl,
      "qwen2_5_vl": apply_liger_kernel_to_qwen2_5_vl,
+     "qwen2_5_vl_text": apply_liger_kernel_to_qwen2_5_vl,
      "phi3": apply_liger_kernel_to_phi3,
      "paligemma": apply_liger_kernel_to_paligemma,
  }
@@ -1516,7 +1860,6 @@ def _apply_liger_kernel_to_instance(model: PreTrainedModel, **kwargs) -> None:
          return

      apply_fn = MODEL_TYPE_TO_APPLY_LIGER_FN[model_type]
-
      apply_fn_signature = inspect.signature(apply_fn)

      # Filter out the keyword arguments that are not supported by the apply function
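A hedged sketch of the dispatch path that uses the expanded MODEL_TYPE_TO_APPLY_LIGER_FN map above; the checkpoint path is a placeholder:

from transformers import AutoModelForCausalLM

from liger_kernel.transformers.monkey_patch import _apply_liger_kernel_to_instance

model = AutoModelForCausalLM.from_pretrained("path/to/any-supported-model")  # placeholder path

# Looks up model.config.model_type (e.g. "llama4", "qwen3_moe", "qwen2_vl_text") in
# MODEL_TYPE_TO_APPLY_LIGER_FN and forwards only the kwargs the matching apply_* accepts.
_apply_liger_kernel_to_instance(model, rms_norm=True, fused_linear_cross_entropy=True)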