PyPI - liger-kernel-nightly - Versions diffs - 0.5.10.dev20250629005644__py3-none-any.whl → 0.5.10.dev20250630172023__py3-none-any.whl - Mend

liger-kernel-nightly 0.5.10.dev20250629005644__py3-none-any.whl → 0.5.10.dev20250630172023__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

liger_kernel/transformers/monkey_patch.py CHANGED Viewed

@@ -2,6 +2,7 @@ import inspect
 import logging
 from functools import partial
+from types import MethodType
 from typing import Callable
 import transformers
@@ -260,10 +261,16 @@ def apply_liger_kernel_to_llama(
     if fused_linear_cross_entropy:
         if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
-            modeling_llama.LlamaForCausalLM.forward = llama_lce_forward
+            if model is not None:
+                model.forward = MethodType(llama_lce_forward, model)
+            else:
+                modeling_llama.LlamaForCausalLM.forward = llama_lce_forward
         else:  # if version < 4.46.1
             logger.warning(TRANSFORMER_DEPRECATION_WARNING)
-            modeling_llama.LlamaForCausalLM.forward = llama_lce_forward_deprecated
+            if model is not None:
+                model.forward = MethodType(llama_lce_forward_deprecated, model)
+            else:
+                modeling_llama.LlamaForCausalLM.forward = llama_lce_forward_deprecated
     if model is not None:
         # The model instance already exists, so we need to additionally patch the
@@ -318,9 +325,15 @@ def apply_liger_kernel_to_llava(
         modeling_llava.nn.CrossEntropyLoss = LigerCrossEntropyLoss
     if fused_linear_cross_entropy:
         if transformer_version >= version.parse("4.52.0"):
-            modeling_llava.LlavaForConditionalGeneration.forward = llava_lce_forward
+            if model is not None:
+                model.forward = MethodType(llava_lce_forward, model)
+            else:
+                modeling_llava.LlavaForConditionalGeneration.forward = llava_lce_forward
         elif transformer_version >= version.parse("4.49.0") and transformer_version < version.parse("4.52.0"):
-            modeling_llava.LlavaForConditionalGeneration.forward = llava_lce_forward_deprecated
+            if model is not None:
+                model.forward = MethodType(llava_lce_forward_deprecated, model)
+            else:
+                modeling_llava.LlavaForConditionalGeneration.forward = llava_lce_forward_deprecated
         else:  # if version < 4.49.0
             logger.warning(
                 "The latest version of Liger does not support transformers < 4.49.0 for llava. Please downgrade your liger version or upgrade your transformer version."
@@ -490,7 +503,7 @@ def apply_liger_kernel_to_mllama(
     if rope:
         modeling_mllama.apply_rotary_pos_emb = liger_rotary_pos_emb
-    if layer_norm:
+    if layer_norm and model is None:
         modeling_mllama.nn.LayerNorm = LigerLayerNorm
     if rms_norm:
         modeling_mllama.MllamaTextRMSNorm = LigerRMSNorm
@@ -506,10 +519,16 @@ def apply_liger_kernel_to_mllama(
             modeling_mllama.CrossEntropyLoss = LigerCrossEntropyLoss
     if fused_linear_cross_entropy:
         if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
-            modeling_mllama.MllamaForCausalLM.forward = mllama_lce_forward
+            if model is not None:
+                model.forward = MethodType(mllama_lce_forward, model)
+            else:
+                modeling_mllama.MllamaForCausalLM.forward = mllama_lce_forward
         else:  # if version < 4.46.1
             logger.warning(TRANSFORMER_DEPRECATION_WARNING)
-            modeling_mllama.MllamaForCausalLM.forward = mllama_lce_forward_deprecated
+            if model is not None:
+                model.forward = MethodType(mllama_lce_forward_deprecated, model)
+            else:
+                modeling_mllama.MllamaForCausalLM.forward = mllama_lce_forward_deprecated
     if model is not None:
         # The model instance already exists, so we need to additionally patch the
@@ -592,7 +611,10 @@ def apply_liger_kernel_to_mistral(
     if cross_entropy:
         modeling_mistral.CrossEntropyLoss = LigerCrossEntropyLoss
     if fused_linear_cross_entropy:
-        modeling_mistral.MistralForCausalLM.forward = mistral_lce_forward
+        if model is not None:
+            model.forward = MethodType(mistral_lce_forward, model)
+        else:
+            modeling_mistral.MistralForCausalLM.forward = mistral_lce_forward
     if swiglu:
         modeling_mistral.MistralMLP = LigerSwiGLUMLP
@@ -660,10 +682,16 @@ def apply_liger_kernel_to_mixtral(
     if fused_linear_cross_entropy:
         if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
-            modeling_mixtral.MixtralForCausalLM.forward = mixtral_lce_forward
+            if model is not None:
+                model.forward = MethodType(mixtral_lce_forward, model)
+            else:
+                modeling_mixtral.MixtralForCausalLM.forward = mixtral_lce_forward
         else:  # if version < 4.46.1
             logger.warning(TRANSFORMER_DEPRECATION_WARNING)
-            modeling_mixtral.MixtralForCausalLM.forward = mixtral_lce_forward_deprecated
+            if model is not None:
+                model.forward = MethodType(mixtral_lce_forward_deprecated, model)
+            else:
+                modeling_mixtral.MixtralForCausalLM.forward = mixtral_lce_forward_deprecated
     if swiglu:
         modeling_mixtral.MixtralBlockSparseTop2MLP = LigerBlockSparseTop2MLP
@@ -737,10 +765,16 @@ def apply_liger_kernel_to_gemma(
         modeling_gemma.GemmaMLP = LigerGEGLUMLP
     if fused_linear_cross_entropy:
         if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
-            modeling_gemma.GemmaForCausalLM.forward = gemma_lce_forward
+            if model is not None:
+                model.forward = MethodType(gemma_lce_forward, model)
+            else:
+                modeling_gemma.GemmaForCausalLM.forward = gemma_lce_forward
         else:  # if version < 4.46.1
             logger.warning(TRANSFORMER_DEPRECATION_WARNING)
-            modeling_gemma.GemmaForCausalLM.forward = gemma_lce_forward_deprecated
+            if model is not None:
+                model.forward = MethodType(gemma_lce_forward_deprecated, model)
+            else:
+                modeling_gemma.GemmaForCausalLM.forward = gemma_lce_forward_deprecated
     if model is not None:
         # The model instance already exists, so we need to additionally patch the
@@ -812,10 +846,16 @@ def apply_liger_kernel_to_gemma2(
             modeling_gemma2.CrossEntropyLoss = LigerCrossEntropyLoss
     if fused_linear_cross_entropy:
         if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
-            modeling_gemma2.Gemma2ForCausalLM.forward = gemma2_lce_forward
+            if model is not None:
+                model.forward = MethodType(gemma2_lce_forward, model)
+            else:
+                modeling_gemma2.Gemma2ForCausalLM.forward = gemma2_lce_forward
         else:
             logger.warning(TRANSFORMER_DEPRECATION_WARNING)
-            modeling_gemma2.Gemma2ForCausalLM.forward = gemma2_lce_forward_deprected
+            if model is not None:
+                model.forward = MethodType(gemma2_lce_forward_deprected, model)
+            else:
+                modeling_gemma2.Gemma2ForCausalLM.forward = gemma2_lce_forward_deprected
     if geglu:
         modeling_gemma2.Gemma2MLP = LigerGEGLUMLP
@@ -894,7 +934,10 @@ def apply_liger_kernel_to_gemma3_text(
         nn.functional.cross_entropy = liger_cross_entropy
     if fused_linear_cross_entropy:
-        modeling_gemma3.Gemma3ForCausalLM.forward = causal_forward
+        if model is not None:
+            model.forward = MethodType(causal_forward, model)
+        else:
+            modeling_gemma3.Gemma3ForCausalLM.forward = causal_forward
     if model is not None:
         # The model instance already exists, so we need to additionally patch the
@@ -964,7 +1007,7 @@ def apply_liger_kernel_to_gemma3(
         _patch_rms_norm_module, offset=1.0, casting_mode="gemma", in_place=False
     )
-    if layer_norm:
+    if layer_norm and model is None:
         modeling_siglip.nn.LayerNorm = LigerLayerNorm
     apply_liger_kernel_to_gemma3_text(
@@ -975,7 +1018,10 @@ def apply_liger_kernel_to_gemma3(
         modeling_gemma3.nn.CrossEntropyLoss = LigerCrossEntropyLoss
     if fused_linear_cross_entropy:
-        modeling_gemma3.Gemma3ForConditionalGeneration.forward = multimodal_forward
+        if model is not None:
+            model.forward = MethodType(multimodal_forward, model)
+        else:
+            modeling_gemma3.Gemma3ForConditionalGeneration.forward = multimodal_forward
     if model is not None:
         # The model instance already exists, so we need to additionally patch the
@@ -1054,7 +1100,7 @@ def apply_liger_kernel_to_paligemma(
     from liger_kernel.transformers.model.paligemma import lce_forward_deprecated
     # The vision_tower is a SiglipVisionModel
-    if layer_norm:
+    if layer_norm and model is None:
         modeling_siglip.nn.LayerNorm = LigerLayerNorm
     # SiglipMLP is standard FFN so LigerGEGLUMLP is not compatible
@@ -1072,10 +1118,16 @@ def apply_liger_kernel_to_paligemma(
         modeling_paligemma.nn.CrossEntropyLoss = LigerCrossEntropyLoss
     if fused_linear_cross_entropy:
         if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
-            modeling_paligemma.PaliGemmaForConditionalGeneration.forward = lce_forward
+            if model is not None:
+                model.forward = MethodType(lce_forward, model)
+            else:
+                modeling_paligemma.PaliGemmaForConditionalGeneration.forward = lce_forward
         else:  # if version < 4.46.1
             logger.warning(TRANSFORMER_DEPRECATION_WARNING)
-            modeling_paligemma.PaliGemmaForConditionalGeneration.forward = lce_forward_deprecated
+            if model is not None:
+                model.forward = MethodType(lce_forward_deprecated, model)
+            else:
+                modeling_paligemma.PaliGemmaForConditionalGeneration.forward = lce_forward_deprecated
     if model is not None:
         # The model instance already exists, so we need to additionally patch the
@@ -1167,10 +1219,16 @@ def apply_liger_kernel_to_qwen2(
     if fused_linear_cross_entropy:
         if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
-            modeling_qwen2.Qwen2ForCausalLM.forward = qwen2_lce_forward
+            if model is not None:
+                model.forward = MethodType(qwen2_lce_forward, model)
+            else:
+                modeling_qwen2.Qwen2ForCausalLM.forward = qwen2_lce_forward
         else:  # if version < 4.46.1
             logger.warning(TRANSFORMER_DEPRECATION_WARNING)
-            modeling_qwen2.Qwen2ForCausalLM.forward = qwen2_lce_forward_deprecated
+            if model is not None:
+                model.forward = MethodType(qwen2_lce_forward_deprecated, model)
+            else:
+                modeling_qwen2.Qwen2ForCausalLM.forward = qwen2_lce_forward_deprecated
     if swiglu:
         modeling_qwen2.Qwen2MLP = LigerSwiGLUMLP
@@ -1226,7 +1284,10 @@ def apply_liger_kernel_to_qwen3(
         nn.functional.cross_entropy = liger_cross_entropy
     if fused_linear_cross_entropy:
-        modeling_qwen3.Qwen3ForCausalLM.forward = qwen3_lce_forward
+        if model is not None:
+            model.forward = MethodType(qwen3_lce_forward, model)
+        else:
+            modeling_qwen3.Qwen3ForCausalLM.forward = qwen3_lce_forward
     if swiglu:
         modeling_qwen3.Qwen3MLP = LigerSwiGLUMLP
@@ -1281,7 +1342,10 @@ def apply_liger_kernel_to_qwen3_moe(
         nn.functional.cross_entropy = liger_cross_entropy
     if fused_linear_cross_entropy:
-        modeling_qwen3_moe.Qwen3MoeForCausalLM.forward = qwen3_lce_forward
+        if model is not None:
+            model.forward = MethodType(qwen3_lce_forward, model)
+        else:
+            modeling_qwen3_moe.Qwen3MoeForCausalLM.forward = qwen3_lce_forward
     if swiglu:
         modeling_qwen3_moe.Qwen3MoeMLP = LigerQwen3MoeSwiGLUMLP
@@ -1350,12 +1414,15 @@ def apply_liger_kernel_to_qwen2_vl(
     if rms_norm:
         # https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L439
         modeling_qwen2_vl.Qwen2RMSNorm = LigerRMSNorm
-    if layer_norm:
+    if layer_norm and model is None:
         modeling_qwen2_vl.LayerNorm = LigerLayerNorm
     if cross_entropy:
         modeling_qwen2_vl.CrossEntropyLoss = LigerCrossEntropyLoss
     if fused_linear_cross_entropy:
-        modeling_qwen2_vl.Qwen2VLForConditionalGeneration.forward = qwen2_vl_lce_forward
+        if model is not None:
+            model.forward = MethodType(qwen2_vl_lce_forward, model)
+        else:
+            modeling_qwen2_vl.Qwen2VLForConditionalGeneration.forward = qwen2_vl_lce_forward
     if swiglu:
         modeling_qwen2_vl.Qwen2MLP = LigerSwiGLUMLP
@@ -1443,7 +1510,10 @@ def apply_liger_kernel_to_qwen2_5_vl(
     if cross_entropy:
         modeling_qwen2_5_vl.CrossEntropyLoss = LigerCrossEntropyLoss
     if fused_linear_cross_entropy:
-        modeling_qwen2_5_vl.Qwen2_5_VLForConditionalGeneration.forward = qwen2_5_vl_lce_forward
+        if model is not None:
+            model.forward = MethodType(qwen2_5_vl_lce_forward, model)
+        else:
+            modeling_qwen2_5_vl.Qwen2_5_VLForConditionalGeneration.forward = qwen2_5_vl_lce_forward
     if swiglu:
         modeling_qwen2_5_vl.Qwen2MLP = LigerSwiGLUMLP
@@ -1530,10 +1600,16 @@ def apply_liger_kernel_to_phi3(
             modeling_phi3.CrossEntropyLoss = LigerCrossEntropyLoss
     if fused_linear_cross_entropy:
         if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
-            modeling_phi3.Phi3ForCausalLM.forward = phi3_lce_forward
+            if model is not None:
+                model.forward = MethodType(phi3_lce_forward, model)
+            else:
+                modeling_phi3.Phi3ForCausalLM.forward = phi3_lce_forward
         else:  # if version < 4.46.1
             logger.warning(TRANSFORMER_DEPRECATION_WARNING)
-            modeling_phi3.Phi3ForCausalLM.forward = phi3_lce_forward_deprecated
+            if model is not None:
+                model.forward = MethodType(phi3_lce_forward_deprecated, model)
+            else:
+                modeling_phi3.Phi3ForCausalLM.forward = phi3_lce_forward_deprecated
     if model is not None:
         # The model instance already exists, so we need to additionally patch the
@@ -1597,7 +1673,10 @@ def apply_liger_kernel_to_olmo2(
         nn.functional.cross_entropy = liger_cross_entropy
     if fused_linear_cross_entropy:
-        modeling_olmo2.Olmo2ForCausalLM.forward = olmo2_lce_forward
+        if model is not None:
+            model.forward = MethodType(olmo2_lce_forward, model)
+        else:
+            modeling_olmo2.Olmo2ForCausalLM.forward = olmo2_lce_forward
     if model is not None:
         # The model instance already exists, so we need to additionally patch the
@@ -1661,7 +1740,10 @@ def apply_liger_kernel_to_glm4(
         nn.functional.cross_entropy = liger_cross_entropy
     if fused_linear_cross_entropy:
-        modeling_glm4.Glm4ForCausalLM.forward = glm4_lce_forward
+        if model is not None:
+            model.forward = MethodType(glm4_lce_forward, model)
+        else:
+            modeling_glm4.Glm4ForCausalLM.forward = glm4_lce_forward
     if model is not None:
         # The model instance already exists, so we need to additionally patch the

{liger_kernel_nightly-0.5.10.dev20250629005644.dist-info → liger_kernel_nightly-0.5.10.dev20250630172023.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.10.dev20250629005644
+Version: 0.5.10.dev20250630172023
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.5.10.dev20250629005644.dist-info → liger_kernel_nightly-0.5.10.dev20250630172023.dist-info}/RECORD RENAMED Viewed

@@ -53,7 +53,7 @@ liger_kernel/transformers/grpo_loss.py,sha256=uAkUNKSnUGEOqa82L9w2e6AI1kcmG8K45-
 liger_kernel/transformers/jsd.py,sha256=DGqRnxIZxsvxo0_tbbxX3b-sDbDjC_yKufyRIHCcScY,2979
 liger_kernel/transformers/kl_div.py,sha256=WLffFbh1EExD2Eb1F7lN11fo9JJC-0751WJjZAF1Fj8,409
 liger_kernel/transformers/layer_norm.py,sha256=c9pk3PEasOKYR0rhe5e5nNrnYKVCEW4VC8S6LpCq9EQ,906
-liger_kernel/transformers/monkey_patch.py,sha256=3KqEl_-WlXgUoEAEYgGs-SPolASshGem2ISFemzQAIc,81705
+liger_kernel/transformers/monkey_patch.py,sha256=YkX0LT6lISg3UTqFjjt9kTr36WgiHvYTQObAS1_Bmi4,85172
 liger_kernel/transformers/multi_token_attention.py,sha256=l9VDICK0dfmifUDW668hGscP8AHq2rYcM2oGUa3baRQ,1751
 liger_kernel/transformers/qwen2vl_mrope.py,sha256=5EwSqrMdsL9MYspeBMXBsNJKvH0MOmRrtJXAJlnnlOI,1047
 liger_kernel/transformers/rms_norm.py,sha256=vkekcvTeWY8vL4H6hg3t0XeY0Ew_3OFMPHuzqlxPPVw,2719
@@ -88,9 +88,9 @@ liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7H
 liger_kernel/transformers/trainer/orpo_trainer.py,sha256=tX0h63aOFe3rNqTmk6JpMf75UPo981yzEa6TghnjS0Q,5370
 liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
 liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
-liger_kernel_nightly-0.5.10.dev20250629005644.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
-liger_kernel_nightly-0.5.10.dev20250629005644.dist-info/METADATA,sha256=FMeKbXVH-02gQ_G0kVMIc6ftN9rv5WeQZ94Br45A9ek,24536
-liger_kernel_nightly-0.5.10.dev20250629005644.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
-liger_kernel_nightly-0.5.10.dev20250629005644.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
-liger_kernel_nightly-0.5.10.dev20250629005644.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
-liger_kernel_nightly-0.5.10.dev20250629005644.dist-info/RECORD,,
+liger_kernel_nightly-0.5.10.dev20250630172023.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel_nightly-0.5.10.dev20250630172023.dist-info/METADATA,sha256=R9S054XUfsyrq9HECn8SHjNLRdXF6KxS6vP1w_fuqjI,24536
+liger_kernel_nightly-0.5.10.dev20250630172023.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel_nightly-0.5.10.dev20250630172023.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+liger_kernel_nightly-0.5.10.dev20250630172023.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel_nightly-0.5.10.dev20250630172023.dist-info/RECORD,,

{liger_kernel_nightly-0.5.10.dev20250629005644.dist-info → liger_kernel_nightly-0.5.10.dev20250630172023.dist-info}/LICENSE RENAMED Viewed

File without changes

{liger_kernel_nightly-0.5.10.dev20250629005644.dist-info → liger_kernel_nightly-0.5.10.dev20250630172023.dist-info}/NOTICE RENAMED Viewed

File without changes

{liger_kernel_nightly-0.5.10.dev20250629005644.dist-info → liger_kernel_nightly-0.5.10.dev20250630172023.dist-info}/WHEEL RENAMED Viewed

File without changes

{liger_kernel_nightly-0.5.10.dev20250629005644.dist-info → liger_kernel_nightly-0.5.10.dev20250630172023.dist-info}/top_level.txt RENAMED Viewed

File without changes

liger-kernel-nightly 0.5.10.dev20250629005644__py3-none-any.whl → 0.5.10.dev20250630172023__py3-none-any.whl

liger-kernel-nightly 0.5.10.dev20250629005644__py3-none-any.whl → 0.5.10.dev20250630172023__py3-none-any.whl