liger-kernel-nightly 0.5.3.dev20250221002845-py3-none-any.whl → 0.5.3.dev20250221011057-py3-none-any.whl
- liger_kernel/transformers/__init__.py +1 -0
- liger_kernel/transformers/monkey_patch.py +80 -0
- {liger_kernel_nightly-0.5.3.dev20250221002845.dist-info → liger_kernel_nightly-0.5.3.dev20250221011057.dist-info}/METADATA +3 -2
- {liger_kernel_nightly-0.5.3.dev20250221002845.dist-info → liger_kernel_nightly-0.5.3.dev20250221011057.dist-info}/RECORD +8 -8
- {liger_kernel_nightly-0.5.3.dev20250221002845.dist-info → liger_kernel_nightly-0.5.3.dev20250221011057.dist-info}/LICENSE +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221002845.dist-info → liger_kernel_nightly-0.5.3.dev20250221011057.dist-info}/NOTICE +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221002845.dist-info → liger_kernel_nightly-0.5.3.dev20250221011057.dist-info}/WHEEL +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221002845.dist-info → liger_kernel_nightly-0.5.3.dev20250221011057.dist-info}/top_level.txt +0 -0
liger_kernel/transformers/__init__.py

@@ -9,6 +9,7 @@ from liger_kernel.transformers.monkey_patch import _apply_liger_kernel  # noqa:
 from liger_kernel.transformers.monkey_patch import _apply_liger_kernel_to_instance  # noqa: F401
 from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_gemma  # noqa: F401
 from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_gemma2  # noqa: F401
+from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_granite  # noqa: F401
 from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llama  # noqa: F401
 from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_mistral  # noqa: F401
 from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_mixtral  # noqa: F401
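With this re-export, the Granite patch can be applied at the package level before a model is constructed. The sketch below is illustrative and not part of the diff; the checkpoint name is a placeholder.

```python
from transformers import AutoModelForCausalLM

from liger_kernel.transformers import apply_liger_kernel_to_granite

# Patch the Granite modeling classes *before* the model is built so that
# RoPE, RMSNorm, SwiGLU, and cross entropy are replaced by Liger kernels.
apply_liger_kernel_to_granite(
    rope=True,
    cross_entropy=True,
    fused_linear_cross_entropy=False,  # not supported for Granite (see the monkey_patch hunk below)
    rms_norm=True,
    swiglu=True,
)

# Placeholder checkpoint name; any Granite 3 causal-LM checkpoint is loaded the same way.
model = AutoModelForCausalLM.from_pretrained("ibm-granite/granite-3.0-2b-instruct")
```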
liger_kernel/transformers/monkey_patch.py

@@ -61,6 +61,85 @@ def _patch_layer_norm_module(module, eps=1e-6):
     _bind_method_to_module(module, "extra_repr", LigerLayerNorm.extra_repr)
 
 
+def apply_liger_kernel_to_granite(
+    rope: bool = True,
+    cross_entropy: bool = True,
+    fused_linear_cross_entropy: bool = False,
+    rms_norm: bool = True,
+    swiglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Granite 3 models
+
+    Args:
+        rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is True.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is False.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+            loaded. Default is None.
+
+
+    Debugging notes:
+        If LigerSwiGLUMLP is OK for Llama, it should be fine for Granite, but it's not.
+    """
+
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.granite import modeling_granite
+    from transformers.models.granite.modeling_granite import GraniteModel
+
+    if swiglu:
+        modeling_granite.GraniteMLP = LigerSwiGLUMLP
+
+    if rms_norm:
+        modeling_granite.GraniteRMSNorm = LigerRMSNorm
+
+    if rope:
+        modeling_granite.apply_rotary_pos_emb = liger_rotary_pos_emb
+
+    if cross_entropy:
+        if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
+            from transformers.loss.loss_utils import nn
+
+            nn.functional.cross_entropy = liger_cross_entropy
+        else:
+            logger.warning(TRANSFORMER_DEPRECATION_WARNING)
+            modeling_granite.CrossEntropyLoss = LigerCrossEntropyLoss
+
+    if fused_linear_cross_entropy:
+        raise NotImplementedError("LigerFusedLinearCrossEntropy is not available for Granite models.")
+        # NOTE: Granite model `GraniteForCausalLM.forward` scales logits each
+        # call, so we can't sidestep logit materialization. A bit more work
+        # would be needed to add a scaling term to the `LigerFusedLinearCrossEntropyFunction`
+        # for the logit output.
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules (e.g. GraniteRMSNorm or GraniteMLP)
+
+        # get the base model from the model instance
+        base_model: GraniteModel = getattr(model, model.base_model_prefix, model)
+
+        if rms_norm:
+            _patch_rms_norm_module(base_model.norm)
+
+        for decoder_layer in base_model.layers:
+            if swiglu:
+                _bind_method_to_module(decoder_layer.mlp, "forward", LigerSwiGLUMLP.forward)
+            if rms_norm:
+                _patch_rms_norm_module(decoder_layer.input_layernorm)
+                _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+
+
 def apply_liger_kernel_to_llama(
     rope: bool = True,
     cross_entropy: bool = False,
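The `model is not None` branch above covers models that are already instantiated: instead of swapping class attributes on `modeling_granite`, it patches the live modules in place. A minimal sketch of that path, assuming an already-loaded Granite checkpoint (the name is a placeholder):

```python
from transformers import AutoModelForCausalLM

from liger_kernel.transformers import apply_liger_kernel_to_granite

# Placeholder checkpoint name. Because the model already exists, class-level
# monkey patching alone would not touch its instantiated layers.
model = AutoModelForCausalLM.from_pretrained("ibm-granite/granite-3.0-2b-instruct")

# The instance path walks base_model.norm and every decoder layer, rebinding
# the SwiGLU forward and patching the RMSNorm modules in place.
apply_liger_kernel_to_granite(model=model, rope=True, rms_norm=True, swiglu=True)
```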
liger_kernel/transformers/monkey_patch.py

@@ -740,6 +819,7 @@ MODEL_TYPE_TO_APPLY_LIGER_FN = {
     "gemma": apply_liger_kernel_to_gemma,
     "gemma2": apply_liger_kernel_to_gemma2,
     "llama": apply_liger_kernel_to_llama,
+    "granite": apply_liger_kernel_to_granite,
     "mllama": apply_liger_kernel_to_mllama,
     "mllama_text_model": apply_liger_kernel_to_mllama,
     "mistral": apply_liger_kernel_to_mistral,
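Registering `"granite"` here is what lets config-driven application pick up the new function. A rough sketch of the dispatch, based only on the registry above (how `_apply_liger_kernel` uses it internally is an assumption here, and the checkpoint name is a placeholder):

```python
from transformers import AutoConfig

from liger_kernel.transformers.monkey_patch import MODEL_TYPE_TO_APPLY_LIGER_FN

# Placeholder checkpoint; Granite 3 configs report model_type == "granite".
config = AutoConfig.from_pretrained("ibm-granite/granite-3.0-2b-instruct")

apply_fn = MODEL_TYPE_TO_APPLY_LIGER_FN.get(config.model_type)
if apply_fn is not None:
    # With the new entry, Granite checkpoints are routed to
    # apply_liger_kernel_to_granite alongside llama, gemma, mistral, etc.
    apply_fn()
```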
{liger_kernel_nightly-0.5.3.dev20250221002845.dist-info → liger_kernel_nightly-0.5.3.dev20250221011057.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.3.dev20250221002845
+Version: 0.5.3.dev20250221011057
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation
@@ -126,7 +126,7 @@ Requires-Dist: mkdocs-material; extra == "dev"
 
 **Liger Kernel** is a collection of Triton kernels designed specifically for LLM training. It can effectively increase multi-GPU **training throughput by 20%** and reduces **memory usage by 60%**. We have implemented **Hugging Face Compatible** `RMSNorm`, `RoPE`, `SwiGLU`, `CrossEntropy`, `FusedLinearCrossEntropy`, and more to come. The kernel works out of the box with [Flash Attention](https://github.com/Dao-AILab/flash-attention), [PyTorch FSDP](https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html), and [Microsoft DeepSpeed](https://github.com/microsoft/DeepSpeed). We welcome contributions from the community to gather the best kernels for LLM training.
 
-We've also added optimized Post-Training kernels that deliver **up to 80% memory savings** for alignment and distillation tasks. We support losses like DPO, CPO, ORPO, SimPO, JSD, and many more. Check out [how we optimize the memory](https://x.com/hsu_byron/status/1866577403918917655).
+We've also added optimized Post-Training kernels that deliver **up to 80% memory savings** for alignment and distillation tasks. We support losses like DPO, CPO, ORPO, SimPO, KTO, JSD, and many more. Check out [how we optimize the memory](https://x.com/hsu_byron/status/1866577403918917655).
 
 ## Supercharge Your Model with Liger Kernel
 
@@ -341,6 +341,7 @@ loss.backward()
 | Fused Linear DPO Loss | `liger_kernel.chunked_loss.LigerFusedLinearDPOLoss` |
 | Fused Linear ORPO Loss | `liger_kernel.chunked_loss.LigerFusedLinearORPOLoss` |
 | Fused Linear SimPO Loss | `liger_kernel.chunked_loss.LigerFusedLinearSimPOLoss` |
+| Fused Linear KTO Loss | `liger_kernel.chunked_loss.LigerFusedLinearKTOLoss` |
 
 ### Distillation Kernels
 
{liger_kernel_nightly-0.5.3.dev20250221002845.dist-info → liger_kernel_nightly-0.5.3.dev20250221011057.dist-info}/RECORD

@@ -32,7 +32,7 @@ liger_kernel/ops/tvd.py,sha256=9wVCijj2vBtgiLeUHhl7hy_LAiJ3liPIYOGMSU3P1ro,6407
 liger_kernel/ops/utils.py,sha256=uoFKQqo-34N2TWQNvXMFywqGiOMMXNEVBxVojzlUAa0,3836
 liger_kernel/ops/experimental/embedding.py,sha256=tolj3tItkzpSb30zWqDN2_yX4ectflaQ8HMyKyFIQc8,4172
 liger_kernel/ops/experimental/mm_int8int2.py,sha256=TrS9lpwekrik_w5qE7AhMJD1bcq-OidjtbsW80oZ6IM,13314
-liger_kernel/transformers/__init__.py,sha256=
+liger_kernel/transformers/__init__.py,sha256=i6GPkP5-esFBh205nF4MluNrL7KNugseGiUKdSHGW70,2172
 liger_kernel/transformers/auto_model.py,sha256=0qCTRZt280Bj_LcFdzo9hlaR-BWNazawXOGgoCZjgEg,1545
 liger_kernel/transformers/cross_entropy.py,sha256=z3KTWQnFxr_IZaVjtYt0ZNEWQdDdYThN35xWkHlDGH0,1683
 liger_kernel/transformers/functional.py,sha256=zahXVCjA2NxcVFpAgajILIRN0GO6mrbfLPgONUkTrY8,4940

@@ -43,7 +43,7 @@ liger_kernel/transformers/group_norm.py,sha256=6qMAWOprr4SzP0YhNVNGQIBpM5aUHplUD
 liger_kernel/transformers/jsd.py,sha256=DGqRnxIZxsvxo0_tbbxX3b-sDbDjC_yKufyRIHCcScY,2979
 liger_kernel/transformers/kl_div.py,sha256=WLffFbh1EExD2Eb1F7lN11fo9JJC-0751WJjZAF1Fj8,409
 liger_kernel/transformers/layer_norm.py,sha256=c9pk3PEasOKYR0rhe5e5nNrnYKVCEW4VC8S6LpCq9EQ,906
-liger_kernel/transformers/monkey_patch.py,sha256=
+liger_kernel/transformers/monkey_patch.py,sha256=kSJE1aMLN0e4jxiePUISRPcyvhWyzhOaqIwLW6rG0Zo,41191
 liger_kernel/transformers/qwen2vl_mrope.py,sha256=5EwSqrMdsL9MYspeBMXBsNJKvH0MOmRrtJXAJlnnlOI,1047
 liger_kernel/transformers/rms_norm.py,sha256=GqCEJuGt0YdqqlMcToE0Wp4A8YFquDa4UUSyH2uFW2A,1191
 liger_kernel/transformers/rope.py,sha256=ZTrTORSAyfcFIKjk6XEeYmk4ROH7xXED9L4g2NFntlE,999
@@ -65,9 +65,9 @@ liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7H
 liger_kernel/transformers/trainer/orpo_trainer.py,sha256=pdekW7l6Qg_aqa5SYKYlSWUF8m3lkOFvFLcIMEHrz9s,8338
 liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
 liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
-liger_kernel_nightly-0.5.3.
-liger_kernel_nightly-0.5.3.
-liger_kernel_nightly-0.5.3.
-liger_kernel_nightly-0.5.3.
-liger_kernel_nightly-0.5.3.
-liger_kernel_nightly-0.5.3.
+liger_kernel_nightly-0.5.3.dev20250221011057.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel_nightly-0.5.3.dev20250221011057.dist-info/METADATA,sha256=eOu7vQd8uhDyA4a1jUXnZRPAH7GL3eB4ASMYVb9oDZA,21963
+liger_kernel_nightly-0.5.3.dev20250221011057.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel_nightly-0.5.3.dev20250221011057.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+liger_kernel_nightly-0.5.3.dev20250221011057.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel_nightly-0.5.3.dev20250221011057.dist-info/RECORD,,