PyPI - liger-kernel-nightly - Versions diffs - 0.4.2.dev20241117192137__tar.gz → 0.4.2.dev20241119054456__tar.gz - Mend

@@ -29,6 +29,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         chunk_size=1,
         compute_nll_loss=True,
         ignore_index=-100,
+        alpha=1.0,
         beta=0.1,
         compiled=True,
     ):
@@ -45,6 +46,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             chunk_size (int): Size of a chunk (# of batches of stacked chosen and rejected inputs).
             compute_nll_loss (bool): Whether to compute NLL loss.
             ignore_index (int): Index to ignore for loss computation.
+            alpha (float): Weight for the NLL loss.
             beta (float): Weight for the odds ratio loss.
             compiled (bool): Whether to use torch compile for chunk accumulation.
         """
@@ -62,6 +64,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             LigerFusedLinearPreferenceBase._compute_loss,
             preference_loss_fn=loss_fn,
             ignore_index=ignore_index,
+            alpha=alpha,
             beta=beta,
             compute_nll_loss=compute_nll_loss,
             full_target=target,
@@ -149,6 +152,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         preference_loss_fn=None,
         full_target=None,
         ignore_index=-100,
+        alpha=1.0,
         beta=0.1,
         compute_nll_loss=True,
         **loss_kwargs,
@@ -163,6 +167,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,).
             full_target (torch.Tensor): Full target tensor. Shape: (batch_size, sequence_length).
             ignore_index (int): Index to ignore for loss computation.
+            alpha (float): Weight for the NLL loss.
             beta (float): Weight for the odds ratio loss.
             loss_kwargs (dict): Additional arguments for the loss function.
         """
@@ -202,5 +207,5 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         )
         alignment_loss = alignment_loss / (full_target.shape[0] // 2)
-        loss = chosen_nll_loss - alignment_loss
+        loss = alpha * chosen_nll_loss - alignment_loss
         return loss, (alignment_loss, chosen_logps, rejected_logps)

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.4.2.dev20241117192137
+Version: 0.4.2.dev20241119054456
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "liger_kernel_nightly"
-version = "0.4.2.dev20241117192137"
+version = "0.4.2.dev20241119054456"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }

@@ -0,0 +1,61 @@
+import torch.nn.functional as F
+from liger_kernel.chunked_loss.fused_linear_preference import (
+    LigerFusedLinearPreferenceBase,
+)
+class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
+    @staticmethod
+    def preference_loss_fn(chosen_logps, rejected_logps, beta=0.1):
+        """
+        Compute the CPO preference term: the mean log-sigmoid of the beta-scaled gap between chosen and rejected log-probabilities.
+        Args:
+            chosen_logps (torch.Tensor): Avg log probabilities of chosen tokens. Shape: (batch_size,).
+            rejected_logps (torch.Tensor): Avg log probabilities of rejected tokens. Shape: (batch_size,).
+            beta (float): Scale applied to the (chosen - rejected) log-probability difference before the log-sigmoid.
+        """
+        logits = beta * (chosen_logps - rejected_logps)
+        loss = F.logsigmoid(logits).mean()
+        return loss
+    @staticmethod
+    def forward(
+        ctx,
+        _input,
+        weight,
+        target,
+        bias=None,
+        ignore_index=-100,
+        beta=0.1,
+        alpha=1.0,
+        compute_nll_loss=True,
+        compiled=True,
+    ):
+        """
+        Fused linear layer with CPO (Contrastive Preference Optimization) loss.
+        Handles both the forward and backward pass of the final linear layer with CPO loss.
+        Inspired by LigerFusedLinearCrossEntropyFunction (https://arxiv.org/abs/2410.10989), which fuses the final linear layer and CE loss.
+        """
+        return LigerFusedLinearPreferenceBase.forward(
+            ctx,
+            _input,
+            weight,
+            target,
+            bias,
+            loss_fn=LigerFusedLinearCPOFunction.preference_loss_fn,
+            compute_nll_loss=compute_nll_loss,
+            ignore_index=ignore_index,
+            alpha=alpha,
+            beta=beta,
+            compiled=compiled,
+        )
+    @staticmethod
+    def backward(ctx, grad_output):
+        # Get gradients for _input, weight, target, and bias (forward-argument order) from the base class
+        grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
+        # Return these gradients, followed by None for the remaining inputs
+        return *grads, None, None, None, None, None

@@ -610,9 +610,7 @@ def apply_liger_kernel_to_qwen2(
             logger.warning(TRANSFORMER_DEPRECATION_WARNING)
             modeling_qwen2.CrossEntropyLoss = LigerCrossEntropyLoss
-    # import pdb; pdb.set_trace()
     if fused_linear_cross_entropy:
         if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
             modeling_qwen2.Qwen2ForCausalLM.forward = qwen2_lce_forward
         else:  # if version < 4.46.1

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.4.2.dev20241117192137
+Version: 0.4.2.dev20241119054456
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

@@ -4,6 +4,7 @@ README.md
 pyproject.toml
 src/liger_kernel/env_report.py
 src/liger_kernel/chunked_loss/__init__.py
+src/liger_kernel/chunked_loss/cpo_loss.py
 src/liger_kernel/chunked_loss/dpo_loss.py
 src/liger_kernel/chunked_loss/fused_linear_preference.py
 src/liger_kernel/chunked_loss/orpo_loss.py

liger-kernel-nightly 0.4.2.dev20241117192137__tar.gz → 0.4.2.dev20241119054456__tar.gz

liger-kernel-nightly 0.4.2.dev20241117192137tar.gz → 0.4.2.dev20241119054456tar.gz