liger-kernel 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. liger_kernel/chunked_loss/README.md +25 -0
  2. liger_kernel/chunked_loss/__init__.py +3 -0
  3. liger_kernel/chunked_loss/cpo_loss.py +18 -8
  4. liger_kernel/chunked_loss/dpo_loss.py +20 -10
  5. liger_kernel/chunked_loss/functional.py +4 -0
  6. liger_kernel/chunked_loss/fused_linear_distillation.py +58 -44
  7. liger_kernel/chunked_loss/fused_linear_preference.py +108 -60
  8. liger_kernel/chunked_loss/fused_linear_rlhf.py +213 -0
  9. liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +246 -0
  10. liger_kernel/chunked_loss/grpo_loss.py +160 -0
  11. liger_kernel/chunked_loss/jsd_loss.py +154 -0
  12. liger_kernel/chunked_loss/kto_loss.py +172 -0
  13. liger_kernel/chunked_loss/orpo_loss.py +8 -9
  14. liger_kernel/chunked_loss/simpo_loss.py +22 -8
  15. liger_kernel/env_report.py +5 -12
  16. liger_kernel/ops/cross_entropy.py +102 -51
  17. liger_kernel/ops/experimental/embedding.py +1 -3
  18. liger_kernel/ops/experimental/mm_int8int2.py +3 -9
  19. liger_kernel/ops/fused_linear_cross_entropy.py +89 -55
  20. liger_kernel/ops/fused_linear_jsd.py +14 -32
  21. liger_kernel/ops/geglu.py +6 -17
  22. liger_kernel/ops/group_norm.py +11 -28
  23. liger_kernel/ops/jsd.py +5 -9
  24. liger_kernel/ops/kl_div.py +8 -11
  25. liger_kernel/ops/layer_norm.py +23 -12
  26. liger_kernel/ops/qwen2vl_mrope.py +8 -25
  27. liger_kernel/ops/rms_norm.py +14 -32
  28. liger_kernel/ops/rope.py +31 -33
  29. liger_kernel/ops/swiglu.py +4 -8
  30. liger_kernel/ops/tvd.py +207 -0
  31. liger_kernel/ops/utils.py +3 -2
  32. liger_kernel/transformers/__init__.py +19 -24
  33. liger_kernel/transformers/auto_model.py +6 -13
  34. liger_kernel/transformers/cross_entropy.py +7 -9
  35. liger_kernel/transformers/experimental/embedding.py +1 -3
  36. liger_kernel/transformers/functional.py +28 -7
  37. liger_kernel/transformers/fused_linear_cross_entropy.py +15 -10
  38. liger_kernel/transformers/geglu.py +1 -4
  39. liger_kernel/transformers/group_norm.py +9 -15
  40. liger_kernel/transformers/jsd.py +1 -3
  41. liger_kernel/transformers/kl_div.py +1 -3
  42. liger_kernel/transformers/layer_norm.py +3 -9
  43. liger_kernel/transformers/model/gemma.py +18 -40
  44. liger_kernel/transformers/model/gemma2.py +19 -41
  45. liger_kernel/transformers/model/llama.py +22 -48
  46. liger_kernel/transformers/model/mistral.py +14 -26
  47. liger_kernel/transformers/model/mixtral.py +24 -54
  48. liger_kernel/transformers/model/mllama.py +16 -36
  49. liger_kernel/transformers/model/olmo2.py +124 -0
  50. liger_kernel/transformers/model/phi3.py +18 -40
  51. liger_kernel/transformers/model/qwen2.py +18 -40
  52. liger_kernel/transformers/model/qwen2_vl.py +36 -32
  53. liger_kernel/transformers/monkey_patch.py +214 -144
  54. liger_kernel/transformers/rms_norm.py +4 -4
  55. liger_kernel/transformers/rope.py +2 -2
  56. liger_kernel/transformers/swiglu.py +2 -8
  57. liger_kernel/transformers/trainer/__init__.py +1 -3
  58. liger_kernel/transformers/trainer/orpo_trainer.py +31 -18
  59. liger_kernel/transformers/tvd.py +13 -0
  60. liger_kernel/triton/__init__.py +1 -3
  61. liger_kernel/triton/monkey_patch.py +1 -3
  62. liger_kernel/utils.py +49 -0
  63. {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.4.dist-info}/METADATA +53 -26
  64. liger_kernel-0.5.4.dist-info/RECORD +74 -0
  65. {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.4.dist-info}/WHEEL +1 -1
  66. liger_kernel-0.5.2.dist-info/RECORD +0 -65
  67. {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.4.dist-info}/LICENSE +0 -0
  68. {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.4.dist-info}/NOTICE +0 -0
  69. {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.4.dist-info}/top_level.txt +0 -0
liger_kernel/chunked_loss/kto_loss.py
@@ -0,0 +1,172 @@
+ import torch
+ import torch.nn.functional as F
+
+ from liger_kernel.chunked_loss.fused_linear_unpaired_preference import LigerFusedLinearUnpairedPreferenceBase
+
+
+ class LigerFusedLinearKTOFunction(LigerFusedLinearUnpairedPreferenceBase):
+     @staticmethod
+     def preference_loss_fn(
+         average_log_prob_chunk,
+         preference_labels_chunk,
+         full_target,
+         ref_average_log_prob_chunk=None,
+         beta=0.1,
+         kl=None,
+     ):
+         """
+         Implements the Kahneman-Tversky Optimization (KTO) loss function.
+         Paper: "KTO: Model Alignment as Prospect Theory-Guided Optimization"
+         https://arxiv.org/abs/2402.01306
+
+         KTO loss is inspired by prospect theory (https://en.wikipedia.org/wiki/Prospect_theory)
+         from behavioral economics, which models how humans make decisions under uncertainty.
+         The loss function is asymmetric, treating gains and losses differently, similar to
+         human decision-making patterns.
+
+         Formula:
+         When y is chosen:
+         L_KTO = 1 - σ(β * (log[π(x)/π₀(x)] - KL(π||π₀)_y))
+         When y is rejected:
+         L_KTO = 1 - σ(β * (KL(π||π₀)_y - log[π(x)/π₀(x)]))
+
+         Where:
+         - σ: Sigmoid function
+         - β: Temperature parameter controlling the strength of the preference signal
+         - π(x): Policy (current model)
+         - π₀(x): Reference policy (reference model)
+         - KL(π||π₀)_y: KL divergence estimated using the rejected response y
+
+         The loss encourages the model to:
+         1. Assign higher probability to chosen responses
+         2. Assign lower probability to rejected responses
+         3. Maintain reasonable distance from the reference model
+
+         Args:
+             average_log_prob_chunk: Log probabilities for the chunk (batch_size,)
+             preference_labels_chunk: Preference labels for the chunk (batch_size,)
+             full_target: Non-chunked full target tensor
+             ref_average_log_prob_chunk: Reference log probs for the chunk (batch_size,)
+             beta: Weight for the KTO loss
+             kl: KL divergence between the policy model and the reference model for the chosen responses. Shape: (batch_size,)
+         Returns:
+             - loss: The KTO loss value
+         """
+         if ref_average_log_prob_chunk is not None:
+             logratios_chunk = average_log_prob_chunk - ref_average_log_prob_chunk
+         else:
+             logratios_chunk = average_log_prob_chunk
+
+         multiplier_chunk = torch.where(preference_labels_chunk, 1, -1)
+         if kl is not None:
+             losses = 1 - F.sigmoid(beta * (logratios_chunk - kl) * multiplier_chunk)
+         else:
+             losses = 1 - F.sigmoid(beta * logratios_chunk * multiplier_chunk)
+
+         return losses.sum() / (full_target.shape[0])
+
+     @staticmethod
+     def forward(
+         ctx,
+         _input,
+         weight,
+         target,
+         preference_labels,
+         bias=None,
+         ref_input=None,
+         ref_weight=None,
+         ref_bias=None,
+         kl=None,
+         ignore_index=-100,
+         beta=0.1,
+         compiled=True,
+         use_ref_model=True,
+     ):
+         return LigerFusedLinearUnpairedPreferenceBase.forward(
+             ctx=ctx,
+             _input=_input,
+             weight=weight,
+             target=target,
+             preference_labels=preference_labels,
+             bias=bias,
+             loss_fn=LigerFusedLinearKTOFunction.preference_loss_fn,
+             ignore_index=ignore_index,
+             beta=beta,
+             compiled=compiled,
+             use_ref_model=use_ref_model,
+             ref_input=ref_input,
+             ref_weight=ref_weight,
+             ref_bias=ref_bias,
+             kl=kl,
+         )
+
+     @staticmethod
+     def backward(ctx, *grad_output):
+         grads = LigerFusedLinearUnpairedPreferenceBase.backward(ctx, grad_output)[:5]
+         return (
+             *grads,
+             None,
+             None,
+             None,
+             None,
+             None,
+             None,
+             None,
+             None,
+             None,
+             None,
+         )
+
+
+ class LigerFusedLinearKTOLoss(torch.nn.Module):
+     """
+     Fused linear layer with Kahneman-Tversky Optimization (KTO) loss.
+     """
+
+     def __init__(
+         self,
+         ignore_index: int = -100,
+         beta: float = 0.1,
+         compiled: bool = True,
+         use_ref_model: bool = False,
+     ):
+         """
+         Args:
+             ignore_index (int): Index to ignore in the loss calculation
+             beta (float): Temperature parameter for the KTO loss
+             compiled (bool): Whether to use compiled operations
+             use_ref_model (bool): Whether to use a reference model for the KTO loss.
+         """
+         super().__init__()
+         self.ignore_index = ignore_index
+         self.beta = beta
+         self.compiled = compiled
+         self.use_ref_model = use_ref_model
+
+     def forward(
+         self,
+         _input,
+         lin_weight,
+         target,
+         bias=None,
+         preference_labels=None,
+         ref_input=None,
+         ref_weight=None,
+         ref_bias=None,
+         kl=None,
+     ):
+         return LigerFusedLinearKTOFunction.apply(
+             _input,
+             lin_weight,
+             target,
+             preference_labels,
+             bias,
+             ref_input,
+             ref_weight,
+             ref_bias,
+             kl,
+             self.ignore_index,
+             self.beta,
+             self.compiled,
+             self.use_ref_model,
+         )
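As a sanity check on the formula above, here is a minimal sketch that calls the new preference_loss_fn directly with toy tensors. The numeric values and batch size are illustrative assumptions, not taken from the diff; only the function signature comes from kto_loss.py above.

```
import torch

from liger_kernel.chunked_loss.kto_loss import LigerFusedLinearKTOFunction

# Toy per-sequence average log-probs under the policy and reference models,
# and boolean labels marking chosen (True) vs. rejected (False) sequences.
# All values here are illustrative assumptions.
policy_logps = torch.tensor([-1.2, -2.5, -0.8, -3.0])
ref_logps = torch.tensor([-1.5, -2.0, -1.0, -2.8])
labels = torch.tensor([True, False, True, False])
full_target = torch.zeros(4, 16)  # only .shape[0] is used, for normalization

loss = LigerFusedLinearKTOFunction.preference_loss_fn(
    policy_logps,
    labels,
    full_target,
    ref_average_log_prob_chunk=ref_logps,
    beta=0.1,
)
# With kl=None this evaluates 1 - sigmoid(beta * multiplier * logratio),
# summed over the batch and divided by the batch size.
print(loss)
```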
liger_kernel/chunked_loss/orpo_loss.py
@@ -1,13 +1,10 @@
  import torch
  import torch.nn.functional as F
 
- from liger_kernel.chunked_loss.fused_linear_preference import (
-     LigerFusedLinearPreferenceBase,
- )
+ from liger_kernel.chunked_loss.fused_linear_preference import LigerFusedLinearPreferenceBase
 
 
  class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
-
      @staticmethod
      def preference_loss_fn(chosen_logps, rejected_logps, full_target, beta=0.1):
          """
@@ -32,11 +29,10 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
              beta (float): Weight for the odds ratio loss.
          """
          log_odds = (chosen_logps - rejected_logps) - (
-             torch.log1p(-torch.exp(chosen_logps))
-             - torch.log1p(-torch.exp(rejected_logps))
+             torch.log1p(-torch.exp(chosen_logps)) - torch.log1p(-torch.exp(rejected_logps))
          )
          ratio = F.logsigmoid(log_odds)
-         loss = beta * ratio.sum() / (full_target.shape[0] // 2)
+         loss = -beta * ratio.sum() / (full_target.shape[0] // 2)
 
          chosen_rewards = beta * chosen_logps
          rejected_rewards = beta * rejected_logps
@@ -56,6 +52,7 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
          ignore_index=-100,
          beta=0.1,
          compute_nll_loss=True,
+         nll_target=None,
          compiled=True,
      ):
          return LigerFusedLinearPreferenceBase.forward(
@@ -68,13 +65,14 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
              ignore_index=ignore_index,
              beta=beta,
              compute_nll_loss=compute_nll_loss,
+             nll_target=nll_target,
              compiled=compiled,
          )
 
      @staticmethod
      def backward(ctx, *grad_output):
          grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-         return *grads, None, None, None, None
+         return *grads, None, None, None, None, None
 
 
  class LigerFusedLinearORPOLoss(torch.nn.Module):
@@ -100,7 +98,7 @@ class LigerFusedLinearORPOLoss(torch.nn.Module):
          self.compute_nll_loss = compute_nll_loss
          self.compiled = compiled
 
-     def forward(self, lin_weight, _input, target, bias=None):
+     def forward(self, lin_weight, _input, target, bias=None, nll_target=None):
          return LigerFusedLinearORPOFunction.apply(
              _input,
              lin_weight,
@@ -109,5 +107,6 @@ class LigerFusedLinearORPOLoss(torch.nn.Module):
              self.ignore_index,
              self.beta,
              self.compute_nll_loss,
+             nll_target,
              self.compiled,
          )
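The sign flip on the odds-ratio term is the substantive change in this file: F.logsigmoid is always non-positive, so the old `beta * ratio.sum()` decreased as preferences got worse; negating it yields a proper minimization objective. A standalone sketch of the corrected term, with toy log-probabilities that are assumptions for illustration:

```
import torch
import torch.nn.functional as F

# Per-sequence average log-probs for chosen/rejected completions (toy values).
chosen_logps = torch.tensor([-0.5, -0.7])
rejected_logps = torch.tensor([-1.5, -2.0])
beta = 0.1

# Log-odds of chosen vs. rejected, exactly as in the diff above.
log_odds = (chosen_logps - rejected_logps) - (
    torch.log1p(-torch.exp(chosen_logps)) - torch.log1p(-torch.exp(rejected_logps))
)
ratio = F.logsigmoid(log_odds)  # always <= 0

# Negated so that better-separated preferences give a *smaller* loss;
# full_target stacks chosen and rejected rows, hence the // 2.
full_batch = 4
loss = -beta * ratio.sum() / (full_batch // 2)
print(loss)  # positive, shrinking as chosen pulls ahead of rejected
```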
liger_kernel/chunked_loss/simpo_loss.py
@@ -1,16 +1,18 @@
  import torch
  import torch.nn.functional as F
 
- from liger_kernel.chunked_loss.fused_linear_preference import (
-     LigerFusedLinearPreferenceBase,
- )
+ from liger_kernel.chunked_loss.fused_linear_preference import LigerFusedLinearPreferenceBase
 
 
  class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
-
      @staticmethod
      def preference_loss_fn(
-         chosen_logps, rejected_logps, full_target, beta=0.1, gamma=0.5
+         chosen_logps,
+         rejected_logps,
+         full_target,
+         beta=0.1,
+         gamma=0.5,
+         label_smoothing=0.0,
      ):
          """
          Paper: https://arxiv.org/pdf/2405.14734
@@ -33,10 +35,17 @@ class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
              full_target: Non-chunked full target tensor
              beta (float): beta weight
              gamma (float): gamma margin term
+             label_smoothing (float): Label smoothing factor; the loss reduces to the equation above as label_smoothing -> 0.
          """
          logits = beta * (chosen_logps - rejected_logps) - gamma
-         loss = F.logsigmoid(logits).sum() / (full_target.shape[0] // 2)
-         return loss
+         loss = (-F.logsigmoid(logits) * (1 - label_smoothing) - F.logsigmoid(-logits) * label_smoothing).sum() / (
+             full_target.shape[0] // 2
+         )
+
+         chosen_rewards = beta * chosen_logps
+         rejected_rewards = beta * rejected_logps
+
+         return loss, chosen_rewards, rejected_rewards
 
      @staticmethod
      def forward(
@@ -48,6 +57,7 @@ class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
          ignore_index=-100,
          beta=0.1,
          alpha=1.0,
+         label_smoothing=0.0,
          compute_nll_loss=False,
          compiled=True,
          gamma=0.5,
@@ -63,6 +73,7 @@ class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
              ignore_index=ignore_index,
              alpha=alpha,
              beta=beta,
+             label_smoothing=label_smoothing,
              compiled=compiled,
              gamma=gamma,
          )
@@ -70,7 +81,7 @@ class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
      @staticmethod
      def backward(ctx, *grad_output):
          grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-         return *grads, None, None, None, None, None, None
+         return *grads, None, None, None, None, None, None, None
 
 
  class LigerFusedLinearSimPOLoss(torch.nn.Module):
@@ -83,6 +94,7 @@ class LigerFusedLinearSimPOLoss(torch.nn.Module):
          ignore_index: int = -100,
          beta: float = 0.1,
          alpha: float = 1.0,
+         label_smoothing: float = 0.0,
          compute_nll_loss: bool = True,
          compiled: bool = True,
          gamma: float = 0.5,
@@ -96,6 +108,7 @@ class LigerFusedLinearSimPOLoss(torch.nn.Module):
          self.ignore_index = ignore_index
          self.beta = beta
          self.alpha = alpha
+         self.label_smoothing = label_smoothing
          self.compute_nll_loss = compute_nll_loss
          self.compiled = compiled
          self.gamma = gamma
@@ -109,6 +122,7 @@ class LigerFusedLinearSimPOLoss(torch.nn.Module):
              self.ignore_index,
              self.beta,
              self.alpha,
+             self.label_smoothing,
              self.compute_nll_loss,
              self.compiled,
              self.gamma,
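The reworked SimPO objective folds in label smoothing: it mixes -logsigmoid(logits) with -logsigmoid(-logits) and collapses to the original loss as label_smoothing approaches 0. A quick standalone check of that limit, using toy values that are assumptions for illustration:

```
import torch
import torch.nn.functional as F

chosen_logps = torch.tensor([-0.4, -0.9])   # toy values
rejected_logps = torch.tensor([-1.1, -1.6])
beta, gamma = 0.1, 0.5
pairs = 2  # full_target.shape[0] // 2 in the diff

logits = beta * (chosen_logps - rejected_logps) - gamma

def simpo_loss(label_smoothing):
    # Smoothed objective from the diff above.
    return (
        -F.logsigmoid(logits) * (1 - label_smoothing)
        - F.logsigmoid(-logits) * label_smoothing
    ).sum() / pairs

# With no smoothing this matches the plain -logsigmoid objective exactly.
assert torch.allclose(simpo_loss(0.0), (-F.logsigmoid(logits)).sum() / pairs)
print(simpo_loss(0.0), simpo_loss(0.1))
```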
liger_kernel/env_report.py
@@ -1,12 +1,13 @@
  import platform
  import sys
+
  from importlib.metadata import version
 
 
  def print_env_report():
      """
 
-     Prints a report of the environment. Useful for debugging and reproducibility.
+     Prints a report of the environment. Useful for debugging and reproducibility.
      Usage:
      ```
      python -m liger_kernel.env_report
@@ -27,15 +28,9 @@ def print_env_report():
          import torch
 
          print(f"PyTorch version: {torch.__version__}")
-         cuda_version = (
-             torch.version.cuda if torch.cuda.is_available() else "Not available"
-         )
+         cuda_version = torch.version.cuda if torch.cuda.is_available() else "Not available"
          print(f"CUDA version: {cuda_version}")
-         hip_version = (
-             torch.version.hip
-             if torch.cuda.is_available() and torch.version.hip
-             else "Not available"
-         )
+         hip_version = torch.version.hip if torch.cuda.is_available() and torch.version.hip else "Not available"
          print(f"HIP(ROCm) version: {hip_version}")
 
      except ImportError:
@@ -58,9 +53,7 @@ def print_env_report():
          print("Transformers: Not installed")
 
      try:
-         xpu_version = (
-             torch.version.xpu if torch.xpu.is_available() else "XPU Not Available"
-         )
+         xpu_version = torch.version.xpu if torch.xpu.is_available() else "XPU Not Available"
          print(f"XPU version: {xpu_version}")
      except ImportError:
          print("XPU version: Unable to query")