liger-kernel 0.5.3__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- liger_kernel/chunked_loss/__init__.py +1 -0
- liger_kernel/chunked_loss/fused_linear_rlhf.py +213 -0
- liger_kernel/chunked_loss/grpo_loss.py +160 -0
- liger_kernel/chunked_loss/kto_loss.py +9 -9
- liger_kernel/ops/cross_entropy.py +3 -3
- liger_kernel/ops/fused_linear_cross_entropy.py +3 -3
- liger_kernel/ops/fused_linear_jsd.py +3 -3
- liger_kernel/ops/jsd.py +3 -3
- liger_kernel/ops/layer_norm.py +20 -7
- liger_kernel/ops/tvd.py +207 -0
- liger_kernel/ops/utils.py +1 -2
- liger_kernel/transformers/__init__.py +3 -0
- liger_kernel/transformers/cross_entropy.py +3 -3
- liger_kernel/transformers/functional.py +17 -0
- liger_kernel/transformers/fused_linear_cross_entropy.py +3 -3
- liger_kernel/transformers/group_norm.py +6 -6
- liger_kernel/transformers/model/olmo2.py +124 -0
- liger_kernel/transformers/monkey_patch.py +171 -27
- liger_kernel/transformers/tvd.py +13 -0
- liger_kernel/utils.py +49 -0
- {liger_kernel-0.5.3.dist-info → liger_kernel-0.5.4.dist-info}/METADATA +17 -3
- {liger_kernel-0.5.3.dist-info → liger_kernel-0.5.4.dist-info}/RECORD +26 -21
- {liger_kernel-0.5.3.dist-info → liger_kernel-0.5.4.dist-info}/LICENSE +0 -0
- {liger_kernel-0.5.3.dist-info → liger_kernel-0.5.4.dist-info}/NOTICE +0 -0
- {liger_kernel-0.5.3.dist-info → liger_kernel-0.5.4.dist-info}/WHEEL +0 -0
- {liger_kernel-0.5.3.dist-info → liger_kernel-0.5.4.dist-info}/top_level.txt +0 -0
liger_kernel/transformers/monkey_patch.py
CHANGED
@@ -61,6 +61,85 @@ def _patch_layer_norm_module(module, eps=1e-6):
     _bind_method_to_module(module, "extra_repr", LigerLayerNorm.extra_repr)


+def apply_liger_kernel_to_granite(
+    rope: bool = True,
+    cross_entropy: bool = True,
+    fused_linear_cross_entropy: bool = False,
+    rms_norm: bool = True,
+    swiglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Granite 3 models
+
+    Args:
+        rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is True.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is False.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+
+
+
+    Debugging notes:
+        If LigerSwiGLUMLP is OK for Llama, it should be fine for Granite, but it's not.
+    """
+
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.granite import modeling_granite
+    from transformers.models.granite.modeling_granite import GraniteModel
+
+    if swiglu:
+        modeling_granite.GraniteMLP = LigerSwiGLUMLP
+
+    if rms_norm:
+        modeling_granite.GraniteRMSNorm = LigerRMSNorm
+
+    if rope:
+        modeling_granite.apply_rotary_pos_emb = liger_rotary_pos_emb
+
+    if cross_entropy:
+        if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
+            from transformers.loss.loss_utils import nn
+
+            nn.functional.cross_entropy = liger_cross_entropy
+        else:
+            logger.warning(TRANSFORMER_DEPRECATION_WARNING)
+            modeling_granite.CrossEntropyLoss = LigerCrossEntropyLoss
+
+    if fused_linear_cross_entropy:
+        raise NotImplementedError("LigerFusedLinearCrossEntropy is not available for Granite models.")
+        # NOTE: Granite model `GraniteForCausalLM.forward` scales logits each
+        # call, so we can't sidestep logit materialization. A bit more work
+        # would be needed to add a scaling term to the `LigerFusedLinearCrossEntropyFunction`
+        # for the logit output.
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules (e.g. GraniteRMSNorm or GraniteMLP)
+
+        # get the base model from the model instance
+        base_model: GraniteModel = getattr(model, model.base_model_prefix, model)
+
+        if rms_norm:
+            _patch_rms_norm_module(base_model.norm)
+
+        for decoder_layer in base_model.layers:
+            if swiglu:
+                _bind_method_to_module(decoder_layer.mlp, "forward", LigerSwiGLUMLP.forward)
+            if rms_norm:
+                _patch_rms_norm_module(decoder_layer.input_layernorm)
+                _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+
+
 def apply_liger_kernel_to_llama(
     rope: bool = True,
     cross_entropy: bool = False,
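For orientation, a minimal usage sketch of the new Granite entry point (not part of the package diff; the checkpoint id below is only an example):

```python
# Hedged sketch: patch the Granite 3 module classes before the model is instantiated.
from transformers import AutoModelForCausalLM

from liger_kernel.transformers import apply_liger_kernel_to_granite

# fused_linear_cross_entropy is not supported for Granite and raises NotImplementedError.
apply_liger_kernel_to_granite(rope=True, rms_norm=True, swiglu=True)

model = AutoModelForCausalLM.from_pretrained("ibm-granite/granite-3.1-2b-instruct")  # example checkpoint
```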
@@ -85,9 +164,9 @@ def apply_liger_kernel_to_llama(
         loaded. Default is None.
     """

-    assert not (
-        cross_entropy and fused_linear_cross_entropy
-    )
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )

     from transformers.models.llama import modeling_llama
     from transformers.models.llama.modeling_llama import LlamaModel
@@ -159,9 +238,9 @@ def apply_liger_kernel_to_mllama(
         loaded. Default is None.
     """

-    assert not (
-        cross_entropy and fused_linear_cross_entropy
-    )
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )

     from transformers.models.mllama import modeling_mllama
     from transformers.models.mllama.modeling_mllama import MllamaForCausalLM
@@ -261,9 +340,9 @@ def apply_liger_kernel_to_mistral(
     model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
         loaded. Default is None.
     """
-    assert not (
-        cross_entropy and fused_linear_cross_entropy
-    )
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )

     from transformers.models.mistral import modeling_mistral
     from transformers.models.mistral.modeling_mistral import MistralModel
@@ -321,9 +400,9 @@ def apply_liger_kernel_to_mixtral(
         loaded. Default is None.
     """

-    assert not (
-        cross_entropy and fused_linear_cross_entropy
-    )
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )

     from transformers.models.mixtral import modeling_mixtral
     from transformers.models.mixtral.modeling_mixtral import MixtralModel
@@ -393,9 +472,9 @@ def apply_liger_kernel_to_gemma(
     model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
         loaded. Default is None.
     """
-    assert not (
-        cross_entropy and fused_linear_cross_entropy
-    )
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )

     from transformers.models.gemma import modeling_gemma
     from transformers.models.gemma.modeling_gemma import GemmaModel
@@ -467,9 +546,9 @@ def apply_liger_kernel_to_gemma2(
     model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
         loaded. Default is None.
     """
-    assert not (
-        cross_entropy and fused_linear_cross_entropy
-    )
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )

     from transformers.models.gemma2 import modeling_gemma2
     from transformers.models.gemma2.modeling_gemma2 import Gemma2Model
@@ -544,9 +623,9 @@ def apply_liger_kernel_to_qwen2(
     model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
         loaded. Default is None.
     """
-    assert not (
-        cross_entropy and fused_linear_cross_entropy
-    )
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )

     from transformers.models.qwen2 import modeling_qwen2
     from transformers.models.qwen2.modeling_qwen2 import Qwen2Model
@@ -619,9 +698,9 @@ def apply_liger_kernel_to_qwen2_vl(
     model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
         loaded. Default is None.
     """
-    assert not (
-        cross_entropy and fused_linear_cross_entropy
-    )
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )

     from transformers.models.qwen2_vl import modeling_qwen2_vl
     from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLModel
@@ -689,9 +768,9 @@ def apply_liger_kernel_to_phi3(
     model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
         loaded. Default is None.
     """
-    assert not (
-        cross_entropy and fused_linear_cross_entropy
-    )
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )

     from transformers.models.phi3 import modeling_phi3
     from transformers.models.phi3.modeling_phi3 import Phi3Model
@@ -735,15 +814,80 @@ def apply_liger_kernel_to_phi3(
                 _patch_rms_norm_module(decoder_layer.post_attention_layernorm)


+def apply_liger_kernel_to_olmo2(
+    rope: bool = True,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    swiglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace OLMO2 models.
+
+    Args:
+        rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        swiglu (bool): Whether to apply Liger's SwiGLU Olmo2MLP. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.olmo2 import modeling_olmo2
+    from transformers.models.olmo2.modeling_olmo2 import Olmo2Model
+
+    from liger_kernel.transformers.model.olmo2 import lce_forward as olmo2_lce_forward
+
+    if rope:
+        modeling_olmo2.apply_rotary_pos_emb = liger_rotary_pos_emb
+    if rms_norm:
+        modeling_olmo2.Olmo2RMSNorm = partial(LigerRMSNorm, in_place=False)
+    if swiglu:
+        modeling_olmo2.Olmo2MLP = LigerSwiGLUMLP
+    if cross_entropy:
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+    if fused_linear_cross_entropy:
+        modeling_olmo2.Olmo2ForCausalLM.forward = olmo2_lce_forward
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+
+        # get the base model from the model instance
+        base_model: Olmo2Model = getattr(model, model.base_model_prefix, model)
+
+        if rms_norm:
+            _patch_rms_norm_module(base_model.norm)
+
+        for decoder_layer in base_model.layers:
+            if swiglu:
+                _bind_method_to_module(decoder_layer.mlp, "forward", LigerSwiGLUMLP.forward)
+            if rms_norm:
+                _patch_rms_norm_module(decoder_layer.post_attention_layernorm, in_place=False)
+                _patch_rms_norm_module(decoder_layer.post_feedforward_layernorm, in_place=False)
+
+
 # Model type corresponds to the keys defined in transformers/models/auto/modeling_auto.py
 MODEL_TYPE_TO_APPLY_LIGER_FN = {
     "gemma": apply_liger_kernel_to_gemma,
     "gemma2": apply_liger_kernel_to_gemma2,
     "llama": apply_liger_kernel_to_llama,
+    "granite": apply_liger_kernel_to_granite,
     "mllama": apply_liger_kernel_to_mllama,
     "mllama_text_model": apply_liger_kernel_to_mllama,
     "mistral": apply_liger_kernel_to_mistral,
     "mixtral": apply_liger_kernel_to_mixtral,
+    "olmo2": apply_liger_kernel_to_olmo2,
     "qwen2": apply_liger_kernel_to_qwen2,
     "qwen2_vl": apply_liger_kernel_to_qwen2_vl,
     "phi3": apply_liger_kernel_to_phi3,
liger_kernel/transformers/tvd.py
ADDED
@@ -0,0 +1,13 @@
+import torch.nn as nn
+
+from liger_kernel.ops.tvd import LigerTVDLossFunction
+
+
+class LigerTVDLoss(nn.Module):
+    def __init__(self, reduction="batchmean", ignore_index: int = -100):
+        super(LigerTVDLoss, self).__init__()
+        self.reduction = reduction
+        self.ignore_index = ignore_index
+
+    def forward(self, p, q, shift_labels=None):
+        return LigerTVDLossFunction.apply(p, q, shift_labels, self.reduction, self.ignore_index)
liger_kernel/utils.py
CHANGED
@@ -9,5 +9,54 @@ def infer_device():
         return "cuda"
     elif torch.xpu.is_available():
         return "xpu"
+    elif torch.hip.is_available():
+        return "hip"
     else:
         return "cpu"
+
+
+def transformers_version_dispatch(
+    required_version: str,
+    before_fn,
+    after_fn,
+    before_args: tuple = (),
+    after_args: tuple = (),
+    before_kwargs: dict = None,
+    after_kwargs: dict = None,
+):
+    """
+    Dispatches to different functions based on package version comparison.
+
+    Args:
+        required_version: Version to compare against (e.g. "4.48.0")
+        before_fn: Function to call if package_version < required_version
+        after_fn: Function to call if package_version >= required_version
+        before_args: Positional arguments for before_fn
+        after_args: Positional arguments for after_fn
+        before_kwargs: Keyword arguments for before_fn
+        after_kwargs: Keyword arguments for after_fn
+
+    Returns:
+        Result from either before_fn or after_fn
+
+    Example:
+        >>> rotary_emb = transformers_version_dispatch(
+        ...     "4.48.0",
+        ...     LlamaRotaryEmbedding,
+        ...     LlamaRotaryEmbedding,
+        ...     before_args=(head_dim,),
+        ...     after_args=(LlamaConfig(head_dim=head_dim),),
+        ...     before_kwargs={'device': device},
+        ...     after_kwargs={'device': device}
+        ... )
+    """
+    from packaging import version
+    from transformers import __version__ as transformers_version
+
+    before_kwargs = before_kwargs or {}
+    after_kwargs = after_kwargs or {}
+
+    if version.parse(transformers_version) < version.parse(required_version):
+        return before_fn(*before_args, **before_kwargs)
+    else:
+        return after_fn(*after_args, **after_kwargs)
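A self-contained, hedged sketch of the new `transformers_version_dispatch` helper using plain callables; the docstring's own example targets `LlamaRotaryEmbedding` construction across transformers versions:

```python
# Hedged sketch: route to one of two callables based on the installed transformers version.
from liger_kernel.utils import transformers_version_dispatch

result = transformers_version_dispatch(
    "4.48.0",
    before_fn=lambda name: f"pre-4.48 path for {name}",  # hypothetical callable
    after_fn=lambda name: f"4.48+ path for {name}",      # hypothetical callable
    before_args=("rope",),
    after_args=("rope",),
)
print(result)
```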
{liger_kernel-0.5.3.dist-info → liger_kernel-0.5.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: liger_kernel
-Version: 0.5.3
+Version: 0.5.4
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation
@@ -97,6 +97,11 @@ Dynamic: requires-dist
         <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/amd-ci.yml/badge.svg?event=schedule" alt="Build">
       </a>
     </div>
+    <div style="display: block;">
+      <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/amd-ci.yml">
+        <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/intel-ci.yml/badge.svg?event=schedule" alt="Build">
+      </a>
+    </div>
   </td>
 </tr>
 </table>
@@ -123,7 +128,7 @@ Dynamic: requires-dist

 **Liger Kernel** is a collection of Triton kernels designed specifically for LLM training. It can effectively increase multi-GPU **training throughput by 20%** and reduces **memory usage by 60%**. We have implemented **Hugging Face Compatible** `RMSNorm`, `RoPE`, `SwiGLU`, `CrossEntropy`, `FusedLinearCrossEntropy`, and more to come. The kernel works out of the box with [Flash Attention](https://github.com/Dao-AILab/flash-attention), [PyTorch FSDP](https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html), and [Microsoft DeepSpeed](https://github.com/microsoft/DeepSpeed). We welcome contributions from the community to gather the best kernels for LLM training.

-We've also added optimized Post-Training kernels that deliver **up to 80% memory savings** for alignment and distillation tasks. We support losses like DPO, CPO, ORPO, SimPO, JSD, and many more. Check out [how we optimize the memory](https://x.com/hsu_byron/status/1866577403918917655).
+We've also added optimized Post-Training kernels that deliver **up to 80% memory savings** for alignment and distillation tasks. We support losses like DPO, CPO, ORPO, SimPO, KTO, JSD, and many more. Check out [how we optimize the memory](https://x.com/hsu_byron/status/1866577403918917655).

 ## Supercharge Your Model with Liger Kernel

@@ -188,6 +193,11 @@ y = orpo_loss(lm_head.weight, x, target)
 - `torch >= 2.5.0` Install according to the instruction in Pytorch official webpage.
 - `triton >= 3.0.0` Install from pypi. (e.g. `pip install triton==3.0.0`)

+```bash
+# Need to pass the url when installing
+pip install -e .[dev] --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2
+```
+
 ### Optional Dependencies

 - `transformers >= 4.x`: Required if you plan to use the transformers models patching APIs. The specific model you are working will dictate the minimum version of transformers.
@@ -305,6 +315,8 @@ loss.backward()
 | Qwen2, Qwen2.5, & QwQ | `liger_kernel.transformers.apply_liger_kernel_to_qwen2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Qwen2-VL, & QVQ | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_vl` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
+| Granite 3.0 & 3.1 | `liger_kernel.transformers.apply_liger_kernel_to_granite` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
+| OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |


 ## Low-level APIs
@@ -333,6 +345,7 @@ loss.backward()
 | Fused Linear DPO Loss | `liger_kernel.chunked_loss.LigerFusedLinearDPOLoss` |
 | Fused Linear ORPO Loss | `liger_kernel.chunked_loss.LigerFusedLinearORPOLoss` |
 | Fused Linear SimPO Loss | `liger_kernel.chunked_loss.LigerFusedLinearSimPOLoss` |
+| Fused Linear KTO Loss | `liger_kernel.chunked_loss.LigerFusedLinearKTOLoss` |

 ### Distillation Kernels

@@ -341,6 +354,7 @@ loss.backward()
 | KLDivergence | `liger_kernel.transformers.LigerKLDIVLoss` |
 | JSD | `liger_kernel.transformers.LigerJSD` |
 | Fused Linear JSD | `liger_kernel.transformers.LigerFusedLinearJSD` |
+| TVD | `liger_kernel.transformers.LigerTVDLoss` |

 ### Experimental Kernels

@@ -372,7 +386,7 @@ loss.backward()

 - For issues, create a Github ticket in this repository
 - For open discussion, join [our discord channel](https://discord.gg/gpumode)
-- For formal collaboration, send an email to
+- For formal collaboration, send an email to yannchen@linkedin.com

 ## Cite this work


{liger_kernel-0.5.3.dist-info → liger_kernel-0.5.4.dist-info}/RECORD
CHANGED
@@ -1,51 +1,55 @@
 liger_kernel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 liger_kernel/env_report.py,sha256=uhdEC8OydxoZlb7B6YYcAaBF3crGFdIck-4cxaW4NJY,1728
-liger_kernel/utils.py,sha256=
+liger_kernel/utils.py,sha256=FtVUkCGBT1UNasTl6HMNycWwiwHayK6tx-ZDdA-sNX4,1884
 liger_kernel/chunked_loss/README.md,sha256=0FmkFC3hKBqyoDT5uTlIYmrvRkF-EOCR1y-EBU1LpWU,2248
-liger_kernel/chunked_loss/__init__.py,sha256=
+liger_kernel/chunked_loss/__init__.py,sha256=ATu-xX5Fc49Cr6yBOGBRNTo593ZrU5ZCsIuvoIbJWw4,603
 liger_kernel/chunked_loss/cpo_loss.py,sha256=OdBR8WYdHTKpLI_c9DcuwqKSWPeAAeTyREz46Vu_cAY,3682
 liger_kernel/chunked_loss/dpo_loss.py,sha256=wgjnwzLfrMUwV5mXgrq6G1YfQKWnbiFJegaP48BGJHY,4509
 liger_kernel/chunked_loss/functional.py,sha256=THWWpCnRVhTVfnPnyvQjdBvo1JDtxhwLmtZE_yiBBqM,817
 liger_kernel/chunked_loss/fused_linear_distillation.py,sha256=5V8rdva89WyHVbmJ8JOmC4DYNOR6ByXfx3qlUieOZkI,11002
 liger_kernel/chunked_loss/fused_linear_preference.py,sha256=idK9V9NivoVITqVpiG0fEGUHSvinYWkn9-EYXZjR-KQ,18356
+liger_kernel/chunked_loss/fused_linear_rlhf.py,sha256=sAApL4GQ3YL2F-ymIAF61GCpFfBgFcWF5LB4Gzd7LgY,8044
 liger_kernel/chunked_loss/fused_linear_unpaired_preference.py,sha256=ZqYlXXhIphkJPxOS7iI70avgrr6x0skEtgpckZTYau0,9819
+liger_kernel/chunked_loss/grpo_loss.py,sha256=M5qlQR-v5Rh8N3P3dPGNhOKygDFJ4516_rJaVPzU_-c,4980
 liger_kernel/chunked_loss/jsd_loss.py,sha256=yRCQdvd3ruTWP4A_BfU8VcZ6LepSUfO0Ob7stGnueQY,6052
-liger_kernel/chunked_loss/kto_loss.py,sha256=
+liger_kernel/chunked_loss/kto_loss.py,sha256=b3ffJyk97e-6XdXd4HFrYyx8wW4A-CU4gOaJSimKLtA,5476
 liger_kernel/chunked_loss/orpo_loss.py,sha256=yjcrrbVeemLYodoSKT-FMSnaPtyKAZ3aOrvPD6tTY6Y,3617
 liger_kernel/chunked_loss/simpo_loss.py,sha256=3TTc7U79Orjgi-Wu81WZkWk5MgsdqKXIOBHgIvDazPw,3865
 liger_kernel/ops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-liger_kernel/ops/cross_entropy.py,sha256=
-liger_kernel/ops/fused_linear_cross_entropy.py,sha256=
-liger_kernel/ops/fused_linear_jsd.py,sha256=
+liger_kernel/ops/cross_entropy.py,sha256=D6vFFloiuxFXoWfjlIjmfO3tVaWOiYmztw9FKAi5vdU,18608
+liger_kernel/ops/fused_linear_cross_entropy.py,sha256=1Y3Uk_TCSjqKgoG2eot1ptnWXJXXQESqGvOmqAW1gsM,10912
+liger_kernel/ops/fused_linear_jsd.py,sha256=Seshez2qaM6HiTQ8_HEqSwhaeVruNT1SvIM4ZrAPBEU,9602
 liger_kernel/ops/geglu.py,sha256=axGvCIvlBzuluoAIrWTsp2iZM4BFKNInkPov8YVvH9E,4126
 liger_kernel/ops/group_norm.py,sha256=qD4D4lSjSgVtO52EBNLC2iTseALRgPgqXE50U2woggk,10837
-liger_kernel/ops/jsd.py,sha256=
+liger_kernel/ops/jsd.py,sha256=0jNeRxpcNI5ckxCdoCNyO5GEedLIuzx3lz6KAiksc4o,6109
 liger_kernel/ops/kl_div.py,sha256=MnfuYqqQESON1X2Swy064x1urKtMFdgeSWd60VttBXI,8420
-liger_kernel/ops/layer_norm.py,sha256=
+liger_kernel/ops/layer_norm.py,sha256=6roQjioyg-9O2qLPV8nL4U0-5UH80tdzOMTWwjvDnn8,7961
 liger_kernel/ops/qwen2vl_mrope.py,sha256=3GExhYpLgB4VUtyZyjRk8XjEur3W4EWF6HQ67ML5vBU,8481
 liger_kernel/ops/rms_norm.py,sha256=PWLJcdIKU5e-8BuYFHd9Cqlq6wmr6fUXKi9zQD4LetU,11727
 liger_kernel/ops/rope.py,sha256=ofmBOkUpZZO-Q8Z5B_LOFYYLD-YT-8WnJ4vGOrDYouI,8943
 liger_kernel/ops/swiglu.py,sha256=KmgMjaJQnbLLgZn2nEpbwHU_xpnYRweCyrLQSVvM1vA,3015
-liger_kernel/ops/
+liger_kernel/ops/tvd.py,sha256=FHJtLQI95ijqgg9UtaHpMAjSCiPxB6CduPwPMcGxelc,6405
+liger_kernel/ops/utils.py,sha256=uoFKQqo-34N2TWQNvXMFywqGiOMMXNEVBxVojzlUAa0,3836
 liger_kernel/ops/experimental/embedding.py,sha256=tolj3tItkzpSb30zWqDN2_yX4ectflaQ8HMyKyFIQc8,4172
 liger_kernel/ops/experimental/mm_int8int2.py,sha256=TrS9lpwekrik_w5qE7AhMJD1bcq-OidjtbsW80oZ6IM,13314
-liger_kernel/transformers/__init__.py,sha256=
+liger_kernel/transformers/__init__.py,sha256=6v_VcV1GQ9ISgNCd-ZxtmEg_s5GTBQ9F-s1KrFkYzPQ,2265
 liger_kernel/transformers/auto_model.py,sha256=0qCTRZt280Bj_LcFdzo9hlaR-BWNazawXOGgoCZjgEg,1545
-liger_kernel/transformers/cross_entropy.py,sha256=
-liger_kernel/transformers/functional.py,sha256=
-liger_kernel/transformers/fused_linear_cross_entropy.py,sha256=
+liger_kernel/transformers/cross_entropy.py,sha256=z3KTWQnFxr_IZaVjtYt0ZNEWQdDdYThN35xWkHlDGH0,1683
+liger_kernel/transformers/functional.py,sha256=ShLD3eb--XKNtllznCrOYTbo4f-1KVwzi0KLMICdrn4,4942
+liger_kernel/transformers/fused_linear_cross_entropy.py,sha256=09Rt7FZzLH42VOcIbQ4dlQd0o3Rlb4vk6fqiOQ7WTD8,1778
 liger_kernel/transformers/fused_linear_jsd.py,sha256=bZ4otCvWBuOnA5XdQL-FzZVItJlDt-ht9e_pG7PG93E,3999
 liger_kernel/transformers/geglu.py,sha256=mrgqzIUVd6lN7fkDKLkw5YaESDxDtFgbot430WwPVOQ,1107
-liger_kernel/transformers/group_norm.py,sha256=
+liger_kernel/transformers/group_norm.py,sha256=6qMAWOprr4SzP0YhNVNGQIBpM5aUHplUD2VuGJrMBz0,2173
 liger_kernel/transformers/jsd.py,sha256=DGqRnxIZxsvxo0_tbbxX3b-sDbDjC_yKufyRIHCcScY,2979
 liger_kernel/transformers/kl_div.py,sha256=WLffFbh1EExD2Eb1F7lN11fo9JJC-0751WJjZAF1Fj8,409
 liger_kernel/transformers/layer_norm.py,sha256=c9pk3PEasOKYR0rhe5e5nNrnYKVCEW4VC8S6LpCq9EQ,906
-liger_kernel/transformers/monkey_patch.py,sha256=
+liger_kernel/transformers/monkey_patch.py,sha256=g3i3q5McBg23A3Mnviw-Eb32le1hvN7jByzONa9ngcs,44000
 liger_kernel/transformers/qwen2vl_mrope.py,sha256=5EwSqrMdsL9MYspeBMXBsNJKvH0MOmRrtJXAJlnnlOI,1047
 liger_kernel/transformers/rms_norm.py,sha256=GqCEJuGt0YdqqlMcToE0Wp4A8YFquDa4UUSyH2uFW2A,1191
 liger_kernel/transformers/rope.py,sha256=ZTrTORSAyfcFIKjk6XEeYmk4ROH7xXED9L4g2NFntlE,999
 liger_kernel/transformers/swiglu.py,sha256=i9WTqcNRqReU4XJs391IPbl-I5X0wG4T72D4pqGFfJg,2422
 liger_kernel/transformers/trainer_integration.py,sha256=W3ON51O5GkyzNJsItz0y5rKx-uy2f2cFfveZpqbUdhw,123
+liger_kernel/transformers/tvd.py,sha256=XrRfyJIqN6HFxXk8MYyFVZM1OLz3mtSbRZvWfZ_JerQ,450
 liger_kernel/transformers/experimental/embedding.py,sha256=2P0QYdlFyFrG5OqTzTa1wcRgDSyjBMv5i1a7BrDPDQw,881
 liger_kernel/transformers/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 liger_kernel/transformers/model/gemma.py,sha256=ky89b3aWPaeTGRMC-745KgixtQIRXzNAiCORAMLn9yo,9654
@@ -54,6 +58,7 @@ liger_kernel/transformers/model/llama.py,sha256=3LJFXKFDKvEakaWPc_NicSFst4Y_hdSM
 liger_kernel/transformers/model/mistral.py,sha256=MVRksI5_j_8WJu8znOHKCdSI5jSu-S7cdFYzt9m_vIQ,5180
 liger_kernel/transformers/model/mixtral.py,sha256=jpZJkpl625Q-JHWarj2MqT5mRaSsiCtg0c9vVyvOdCY,11430
 liger_kernel/transformers/model/mllama.py,sha256=qWexBdskuN3gPJvPUwt4J0nU675tGD6W7wxgRZ9Bifg,11145
+liger_kernel/transformers/model/olmo2.py,sha256=yyksS6E4fuWd8asEW8rEDBKqZpFmP4ITCM_bjIDZaoY,5124
 liger_kernel/transformers/model/phi3.py,sha256=biRa8fph9qdnQmkD9I21t5XIjpIt1i6UKU4uk8Up8pU,10292
 liger_kernel/transformers/model/qwen2.py,sha256=14UuPjxB-tjqWn85Tn4fqBFvVhVsth5iPEt8kJSMiew,9581
 liger_kernel/transformers/model/qwen2_vl.py,sha256=yMLqsfSYcvhClUpTUjGoADiOxfLB2B8240VdrPP0c8s,9851
@@ -61,9 +66,9 @@ liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7H
 liger_kernel/transformers/trainer/orpo_trainer.py,sha256=pdekW7l6Qg_aqa5SYKYlSWUF8m3lkOFvFLcIMEHrz9s,8338
 liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
 liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
-liger_kernel-0.5.
-liger_kernel-0.5.
-liger_kernel-0.5.
-liger_kernel-0.5.
-liger_kernel-0.5.
-liger_kernel-0.5.
+liger_kernel-0.5.4.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel-0.5.4.dist-info/METADATA,sha256=Zw7n3Ey6vUed4E54H9-TzKmhuOpd9P2ZFMVL-zYUnew,22255
+liger_kernel-0.5.4.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel-0.5.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+liger_kernel-0.5.4.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel-0.5.4.dist-info/RECORD,,
{liger_kernel-0.5.3.dist-info → liger_kernel-0.5.4.dist-info}/LICENSE: file without changes
{liger_kernel-0.5.3.dist-info → liger_kernel-0.5.4.dist-info}/NOTICE: file without changes
{liger_kernel-0.5.3.dist-info → liger_kernel-0.5.4.dist-info}/WHEEL: file without changes
{liger_kernel-0.5.3.dist-info → liger_kernel-0.5.4.dist-info}/top_level.txt: file without changes