PyPI - liger-kernel-nightly - Versions diffs - 0.4.2.dev20241119061743__tar.gz → 0.4.2.dev20241119174706__tar.gz - Mend

@@ -0,0 +1,64 @@
+import torch.nn.functional as F
+from liger_kernel.chunked_loss.fused_linear_preference import (
+    LigerFusedLinearPreferenceBase,
+)
+class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
+    @staticmethod
+    def preference_loss_fn(chosen_logps, rejected_logps, beta=0.1, gamma=0.5):
+        """
+        Compute odds-ratio loss.
+        Args:
+            chosen_logps (torch.Tensor): Avg log probabilities of chosen tokens. Shape: (batch_size,).
+            rejected_logps (torch.Tensor): Avg log probabilities of rejected tokens. Shape: (batch_size,).
+            beta (float): Weight for the odds ratio loss.
+            gamma (float): The simpo gamma, margin term.
+        """
+        logits = beta * (chosen_logps - rejected_logps) - gamma
+        loss = F.logsigmoid(logits).mean()
+        return loss
+    @staticmethod
+    def forward(
+        ctx,
+        _input,
+        weight,
+        target,
+        bias=None,
+        ignore_index=-100,
+        beta=0.1,
+        alpha=1.0,
+        compute_nll_loss=False,
+        compiled=True,
+        gamma=0.5,
+    ):
+        """
+        Fused linear layer with SimPO (Simple Preference Optimization) loss. https://arxiv.org/pdf/2405.14734
+        Handles both the forward and backward pass of the final linear layer with SimPO loss.
+        Inspired from LigerFusedLinearCrossEntropyFunction (https://arxiv.org/abs/2410.10989) which fuses final linear layer and CE loss.
+        """
+        return LigerFusedLinearPreferenceBase.forward(
+            ctx,
+            _input,
+            weight,
+            target,
+            bias,
+            loss_fn=LigerFusedLinearSimPOFunction.preference_loss_fn,
+            compute_nll_loss=compute_nll_loss,
+            ignore_index=ignore_index,
+            alpha=alpha,
+            beta=beta,
+            compiled=compiled,
+            gamma=gamma,
+        )
+    @staticmethod
+    def backward(ctx, grad_output):
+        # Get gradients for _input, weight, bias, and target from the base class
+        grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
+        # Return these gradients, followed by None for the remaining inputs
+        return *grads, None, None, None, None, None, None

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.4.2.dev20241119061743
+Version: 0.4.2.dev20241119174706
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "liger_kernel_nightly"
-version = "0.4.2.dev20241119061743"
+version = "0.4.2.dev20241119174706"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }

@@ -32,6 +32,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         alpha=1.0,
         beta=0.1,
         compiled=True,
+        **loss_kwargs,
     ):
         """
         Base class for fused linear layer with preference loss.
@@ -49,6 +50,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             alpha (float): Weight for the NLL loss.
             beta (float): Weight for the odds ratio loss.
             compiled (bool): Whether to use torch compile for chunk accumulation.
+            loss_kwargs (dict): Other possible arguments that a loss function might need
         """
         # TODO: Tune CHUNK_SIZE to fully utilize the GPU
         CHUNK_SIZE = chunk_size
@@ -68,6 +70,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             beta=beta,
             compute_nll_loss=compute_nll_loss,
             full_target=target,
+            **loss_kwargs,
         )
         def accumulate_chunk(input_chunk, target_chunk):
@@ -94,6 +97,9 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             loss_acc.add_(chunk_loss)
             return chunk_grad_input
+        if compiled:
+            accumulate_chunk = torch.compile(accumulate_chunk)
         len_chosen = target.shape[0] // 2
         _chosen_input_chunks = torch.chunk(_input[:len_chosen], chunks=chunks, dim=0)
         _chosen_target_chunks = torch.chunk(target[:len_chosen], chunks=chunks, dim=0)
@@ -116,8 +122,6 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
                 [chosen_target_chunk, rejected_target_chunk], dim=0
             )
-            if compiled:
-                accumulate_chunk = torch.compile(accumulate_chunk)
             grad_input = accumulate_chunk(input_chunk, target_chunk)
             grad_chosen_inputs.append(grad_input[: chosen_target_chunk.shape[0]])

@@ -34,7 +34,7 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
         ignore_index=-100,
         beta=0.1,
         compute_nll_loss=True,
-        compiled=True,
+        compiled=False,
     ):
         """
         Fused linear layer with ORPO (Odds-Ratio Preference Optimization) loss.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.4.2.dev20241119061743
+Version: 0.4.2.dev20241119174706
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

@@ -8,6 +8,7 @@ src/liger_kernel/chunked_loss/cpo_loss.py
 src/liger_kernel/chunked_loss/dpo_loss.py
 src/liger_kernel/chunked_loss/fused_linear_preference.py
 src/liger_kernel/chunked_loss/orpo_loss.py
+src/liger_kernel/chunked_loss/simpo_loss.py
 src/liger_kernel/ops/__init__.py
 src/liger_kernel/ops/cross_entropy.py
 src/liger_kernel/ops/fused_linear_cross_entropy.py

liger-kernel-nightly 0.4.2.dev20241119061743__tar.gz → 0.4.2.dev20241119174706__tar.gz

liger-kernel-nightly 0.4.2.dev20241119061743tar.gz → 0.4.2.dev20241119174706tar.gz