liger-kernel 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. liger_kernel/chunked_loss/README.md +25 -0
  2. liger_kernel/chunked_loss/__init__.py +3 -0
  3. liger_kernel/chunked_loss/cpo_loss.py +18 -8
  4. liger_kernel/chunked_loss/dpo_loss.py +20 -10
  5. liger_kernel/chunked_loss/functional.py +4 -0
  6. liger_kernel/chunked_loss/fused_linear_distillation.py +58 -44
  7. liger_kernel/chunked_loss/fused_linear_preference.py +108 -60
  8. liger_kernel/chunked_loss/fused_linear_rlhf.py +213 -0
  9. liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +246 -0
  10. liger_kernel/chunked_loss/grpo_loss.py +160 -0
  11. liger_kernel/chunked_loss/jsd_loss.py +154 -0
  12. liger_kernel/chunked_loss/kto_loss.py +172 -0
  13. liger_kernel/chunked_loss/orpo_loss.py +8 -9
  14. liger_kernel/chunked_loss/simpo_loss.py +22 -8
  15. liger_kernel/env_report.py +5 -12
  16. liger_kernel/ops/cross_entropy.py +102 -51
  17. liger_kernel/ops/experimental/embedding.py +1 -3
  18. liger_kernel/ops/experimental/mm_int8int2.py +3 -9
  19. liger_kernel/ops/fused_linear_cross_entropy.py +89 -55
  20. liger_kernel/ops/fused_linear_jsd.py +14 -32
  21. liger_kernel/ops/geglu.py +6 -17
  22. liger_kernel/ops/group_norm.py +11 -28
  23. liger_kernel/ops/jsd.py +5 -9
  24. liger_kernel/ops/kl_div.py +8 -11
  25. liger_kernel/ops/layer_norm.py +23 -12
  26. liger_kernel/ops/qwen2vl_mrope.py +8 -25
  27. liger_kernel/ops/rms_norm.py +14 -32
  28. liger_kernel/ops/rope.py +31 -33
  29. liger_kernel/ops/swiglu.py +4 -8
  30. liger_kernel/ops/tvd.py +207 -0
  31. liger_kernel/ops/utils.py +3 -2
  32. liger_kernel/transformers/__init__.py +19 -24
  33. liger_kernel/transformers/auto_model.py +6 -13
  34. liger_kernel/transformers/cross_entropy.py +7 -9
  35. liger_kernel/transformers/experimental/embedding.py +1 -3
  36. liger_kernel/transformers/functional.py +28 -7
  37. liger_kernel/transformers/fused_linear_cross_entropy.py +15 -10
  38. liger_kernel/transformers/geglu.py +1 -4
  39. liger_kernel/transformers/group_norm.py +9 -15
  40. liger_kernel/transformers/jsd.py +1 -3
  41. liger_kernel/transformers/kl_div.py +1 -3
  42. liger_kernel/transformers/layer_norm.py +3 -9
  43. liger_kernel/transformers/model/gemma.py +18 -40
  44. liger_kernel/transformers/model/gemma2.py +19 -41
  45. liger_kernel/transformers/model/llama.py +22 -48
  46. liger_kernel/transformers/model/mistral.py +14 -26
  47. liger_kernel/transformers/model/mixtral.py +24 -54
  48. liger_kernel/transformers/model/mllama.py +16 -36
  49. liger_kernel/transformers/model/olmo2.py +124 -0
  50. liger_kernel/transformers/model/phi3.py +18 -40
  51. liger_kernel/transformers/model/qwen2.py +18 -40
  52. liger_kernel/transformers/model/qwen2_vl.py +36 -32
  53. liger_kernel/transformers/monkey_patch.py +214 -144
  54. liger_kernel/transformers/rms_norm.py +4 -4
  55. liger_kernel/transformers/rope.py +2 -2
  56. liger_kernel/transformers/swiglu.py +2 -8
  57. liger_kernel/transformers/trainer/__init__.py +1 -3
  58. liger_kernel/transformers/trainer/orpo_trainer.py +31 -18
  59. liger_kernel/transformers/tvd.py +13 -0
  60. liger_kernel/triton/__init__.py +1 -3
  61. liger_kernel/triton/monkey_patch.py +1 -3
  62. liger_kernel/utils.py +49 -0
  63. {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.4.dist-info}/METADATA +53 -26
  64. liger_kernel-0.5.4.dist-info/RECORD +74 -0
  65. {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.4.dist-info}/WHEEL +1 -1
  66. liger_kernel-0.5.2.dist-info/RECORD +0 -65
  67. {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.4.dist-info}/LICENSE +0 -0
  68. {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.4.dist-info}/NOTICE +0 -0
  69. {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,25 @@
+ # Liger FlexChunkLoss: Alignment and Distillation loss
+
+ Liger FlexChunkLoss offers a versatile interface, delivering up to 80% memory savings and a 10% throughput boost for post-training loss functions, including alignment (DPO, ORPO, CPO, KTO) and very soon, distillation. Its flexible design supports custom losses, ensuring efficiency gains across diverse use cases.
+
+ ### User interface
+
+ FlexChunkLoss offers two flexible usage options:
+
+ 1. **Via `Liger[Custom Loss]Trainer`**
+ For example, by simply replacing the HuggingFace `ORPOTrainer` with `LigerORPOTrainer` in your code, you can leverage our optimized ORPO implementation and immediately benefit from improved performance.
+
+ 2. **Using `nn.Module` Implementations of Custom Loss Functions**
+ Explore the [LigerORPOTrainer implementation](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/transformers/orpo_trainer.py) to see how the modular design integrates custom loss functions seamlessly.
+
+ ### What's under the hood?
+
+ We employ chunking and fused kernel optimizations to enhance performance. By fusing the final linear layer with loss computation and calculating backward gradients during the forward pass, we significantly reduce the need for storing intermediate activations. All operations are implemented in PyTorch, leveraging `torch.compile` to streamline kernel execution without relying on extensive low-level optimizations. Additionally, we minimize `torch.compile` recompilations to reduce overhead and ensure consistent performance gains.
+
+ ### Extending to custom loss functions
+
+ We provide two base classes: `LigerFusedLinearPreferenceBase` for alignment use cases and `LigerFusedLinearDistillationBase` for distillation use cases. These base classes manage chunking, kernel fusions, and Torch compilation.
+
+ To implement a custom loss function, you need to create a subclass that defines the custom preference or distillation loss function, capable of processing a given input chunk. The base class will take care of the optimizations, handling most of the heavy lifting for you.
+
+ For a working example, refer to the [ORPO loss implementation](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/chunked_loss/orpo_loss.py).
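The README above describes the `nn.Module` usage path only in prose. A minimal sketch of that path for ORPO is shown below; it assumes the `(lin_weight, _input, target)` calling convention visible in the CPO and DPO modules later in this diff, and the tensor shapes and batch layout are illustrative assumptions only.

```python
# Hedged sketch (not part of the package diff): using a chunked loss module directly.
import torch

from liger_kernel.chunked_loss import LigerFusedLinearORPOLoss

B, T, H, V = 4, 128, 4096, 32000                       # assumed shapes for illustration
hidden = torch.randn(B, T, H, requires_grad=True)      # last hidden states, before lm_head
lm_head_weight = torch.randn(V, H, requires_grad=True)
target = torch.randint(0, V, (B, T))                   # label ids; ignore_index defaults to -100

orpo_loss = LigerFusedLinearORPOLoss()
# The lm_head projection is fused with the loss, so full logits are never materialized.
out = orpo_loss(lm_head_weight, hidden, target)        # loss (plus any auxiliary outputs)
```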
@@ -1,4 +1,7 @@
  from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOLoss # noqa: F401
  from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOLoss # noqa: F401
+ from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOLoss # noqa: F401
+ from liger_kernel.chunked_loss.jsd_loss import LigerFusedLinearJSDLoss # noqa: F401
+ from liger_kernel.chunked_loss.kto_loss import LigerFusedLinearKTOLoss # noqa: F401
  from liger_kernel.chunked_loss.orpo_loss import LigerFusedLinearORPOLoss # noqa: F401
  from liger_kernel.chunked_loss.simpo_loss import LigerFusedLinearSimPOLoss # noqa: F401
@@ -1,15 +1,12 @@
  import torch
  import torch.nn.functional as F

- from liger_kernel.chunked_loss.fused_linear_preference import (
- LigerFusedLinearPreferenceBase,
- )
+ from liger_kernel.chunked_loss.fused_linear_preference import LigerFusedLinearPreferenceBase


  class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
-
  @staticmethod
- def preference_loss_fn(chosen_logps, rejected_logps, full_target, beta=0.1):
+ def preference_loss_fn(chosen_logps, rejected_logps, full_target, beta=0.1, label_smoothing=0.0):
  """
  Paper: https://arxiv.org/pdf/2401.08417

@@ -30,10 +27,17 @@ class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
  rejected_logps (torch.Tensor): Avg log probabilities of rejected tokens. Shape: (batch_size,).
  full_target (torch.Tensor): Non chunked full target tensor
  beta (float): Weight for the CPO loss
+ label_smoothing (float): Label smoothing factor, will reduce to Equation above when label_smoothing -> 0.
  """
  logits = beta * (chosen_logps - rejected_logps)
- loss = F.logsigmoid(logits).sum() / (full_target.shape[0] // 2)
- return loss
+ loss = (-F.logsigmoid(logits) * (1 - label_smoothing) - F.logsigmoid(-logits) * label_smoothing).sum() / (
+ full_target.shape[0] // 2
+ )
+
+ chosen_rewards = beta * chosen_logps
+ rejected_rewards = beta * rejected_logps
+
+ return loss, chosen_rewards, rejected_rewards

  @staticmethod
  def forward(
@@ -45,6 +49,7 @@ class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
  ignore_index=-100,
  beta=0.1,
  alpha=1.0,
+ label_smoothing=0.0,
  compute_nll_loss=True,
  compiled=True,
  ):
@@ -58,14 +63,16 @@ class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
  ignore_index=ignore_index,
  alpha=alpha,
  beta=beta,
+ label_smoothing=label_smoothing,
  compute_nll_loss=compute_nll_loss,
+ average_log_prob=False,
  compiled=compiled,
  )

  @staticmethod
  def backward(ctx, *grad_output):
  grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
- return *grads, None, None, None, None, None
+ return *grads, None, None, None, None, None, None


  class LigerFusedLinearCPOLoss(torch.nn.Module):
@@ -78,6 +85,7 @@ class LigerFusedLinearCPOLoss(torch.nn.Module):
  ignore_index: int = -100,
  beta: float = 0.1,
  alpha: float = 1.0,
+ label_smoothing: float = 0.0,
  compute_nll_loss: bool = True,
  compiled: bool = True,
  ):
@@ -90,6 +98,7 @@ class LigerFusedLinearCPOLoss(torch.nn.Module):
  self.ignore_index = ignore_index
  self.beta = beta
  self.alpha = alpha
+ self.label_smoothing = label_smoothing
  self.compute_nll_loss = compute_nll_loss
  self.compiled = compiled

@@ -102,6 +111,7 @@ class LigerFusedLinearCPOLoss(torch.nn.Module):
  self.ignore_index,
  self.beta,
  self.alpha,
+ self.label_smoothing,
  self.compute_nll_loss,
  self.compiled,
  )
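As a reading aid for the new `preference_loss_fn` (restating the code above, not adding behavior): with $z = \beta\,(\log p_\theta(y_{chosen}) - \log p_\theta(y_{rejected}))$ and smoothing factor $\epsilon$ = `label_smoothing`, each chosen/rejected pair contributes

$$-(1-\epsilon)\,\log\sigma(z) \;-\; \epsilon\,\log\sigma(-z),$$

summed and divided by the number of pairs (`full_target.shape[0] // 2`). Setting $\epsilon = 0$ reduces this to the standard $-\log\sigma(z)$ CPO objective, matching the docstring note, and the rewards returned alongside the loss are simply $\beta\log p_\theta(y_{chosen})$ and $\beta\log p_\theta(y_{rejected})$.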
@@ -1,13 +1,10 @@
  import torch
  import torch.nn.functional as F

- from liger_kernel.chunked_loss.fused_linear_preference import (
- LigerFusedLinearPreferenceBase,
- )
+ from liger_kernel.chunked_loss.fused_linear_preference import LigerFusedLinearPreferenceBase


  class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
-
  @staticmethod
  def preference_loss_fn(
  chosen_logps,
@@ -48,9 +45,12 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
  chosen_logratios = chosen_logps - ref_chosen_logps
  rejected_logratios = rejected_logps - ref_rejected_logps

+ chosen_rewards = beta * chosen_logratios
+ rejected_rewards = beta * rejected_logratios
+
  logits_diff = beta * (chosen_logratios - rejected_logratios)
  loss = -F.logsigmoid(logits_diff).sum() / (full_target.shape[0] // 2)
- return loss
+ return loss, chosen_rewards, rejected_rewards

  @staticmethod
  def forward(
@@ -59,11 +59,12 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
  weight,
  target,
  bias=None,
+ ref_input=None,
  ref_weight=None,
  ref_bias=None,
  ignore_index=-100,
  beta=0.1,
- compute_nll_loss=True,
+ compute_nll_loss=False,
  compiled=True,
  use_ref_model=True,
  ):
@@ -79,6 +80,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
  compute_nll_loss=compute_nll_loss,
  compiled=compiled,
  use_ref_model=use_ref_model,
+ ref_input=ref_input,
  ref_weight=ref_weight,
  ref_bias=ref_bias,
  )
@@ -86,7 +88,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
  @staticmethod
  def backward(ctx, *grad_output):
  grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
- return *grads, None, None, None, None, None, None, None
+ return *grads, None, None, None, None, None, None, None, None


  class LigerFusedLinearDPOLoss(torch.nn.Module):
@@ -98,9 +100,9 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
  self,
  ignore_index: int = -100,
  beta: float = 0.1,
- compute_nll_loss: bool = True,
+ compute_nll_loss: bool = False,
  compiled: bool = True,
- use_ref_model: bool = False,
+ use_ref_model: bool = True,
  ):
  """
  Args:
@@ -118,13 +120,21 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
  self.use_ref_model = use_ref_model

  def forward(
- self, lin_weight, _input, target, bias=None, ref_weight=None, ref_bias=None
+ self,
+ lin_weight,
+ _input,
+ target,
+ bias=None,
+ ref_input=None,
+ ref_weight=None,
+ ref_bias=None,
  ):
  return LigerFusedLinearDPOFunction.apply(
  _input,
  lin_weight,
  target,
  bias,
+ ref_input,
  ref_weight,
  ref_bias,
  self.ignore_index,
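Since `ref_input` is a new argument in 0.5.4, a hedged usage sketch follows: the reference model's hidden states are now passed explicitly alongside its `lm_head` weight rather than re-using the policy's hidden states. The keyword names match the `forward` signature above; shapes and the exact return value are illustrative assumptions.

```python
# Hedged sketch (not part of the package diff): DPO with an explicit reference input.
import torch

from liger_kernel.chunked_loss import LigerFusedLinearDPOLoss

B, T, H, V = 4, 128, 4096, 32000                      # assumed shapes for illustration
policy_hidden = torch.randn(B, T, H, requires_grad=True)
policy_head = torch.randn(V, H, requires_grad=True)
ref_hidden = torch.randn(B, T, H)                     # hidden states from the frozen reference model
ref_head = torch.randn(V, H)
target = torch.randint(0, V, (B, T))

dpo_loss = LigerFusedLinearDPOLoss(beta=0.1, use_ref_model=True)
out = dpo_loss(
    policy_head,
    policy_hidden,
    target,
    ref_input=ref_hidden,      # new in 0.5.4: reference hidden states passed separately
    ref_weight=ref_head,
)
```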
@@ -1,9 +1,13 @@
  from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOFunction
  from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOFunction
+ from liger_kernel.chunked_loss.jsd_loss import LigerFusedLinearJSDFunction
+ from liger_kernel.chunked_loss.kto_loss import LigerFusedLinearKTOFunction
  from liger_kernel.chunked_loss.orpo_loss import LigerFusedLinearORPOFunction
  from liger_kernel.chunked_loss.simpo_loss import LigerFusedLinearSimPOFunction

  liger_fused_linear_orpo = LigerFusedLinearORPOFunction.apply
  liger_fused_linear_dpo = LigerFusedLinearDPOFunction.apply
+ liger_fused_linear_jsd = LigerFusedLinearJSDFunction.apply
  liger_fused_linear_cpo = LigerFusedLinearCPOFunction.apply
  liger_fused_linear_simpo = LigerFusedLinearSimPOFunction.apply
+ liger_fused_linear_kto = LigerFusedLinearKTOFunction.apply
@@ -2,18 +2,24 @@ from abc import abstractmethod
  from functools import partial

  import torch
+
  from torch.nn import functional as F


  class LigerFusedLinearDistillationBase(torch.autograd.Function):
-
  @abstractmethod
- def distillation_loss_fn(student_logits, teacher_logits, temperature):
+ def distillation_loss_fn(
+ student_logits,
+ teacher_logits,
+ ):
  """
  Compute distillation loss.
  Args:
- student_logits (torch.Tensor): Raw logits of student tokens. Shape: (batch_size * seq_len, vocab_size).
- teacher_logits (torch.Tensor): Raw logits of teacher tokens. Shape: (batch_size * seq_len, vocab_size).
+ student_logits (torch.Tensor): Raw (temperature-scaled) logits of student tokens. Shape: (batch_size * seq_len, vocab_size).
+ teacher_logits (torch.Tensor): Raw (temperature-scaled) logits of teacher tokens. Shape: (batch_size * seq_len, vocab_size).
+ Returns:
+ torch.Tensor: Sum of distillation losses for the chunk. The class will handle
+ converting this to mean loss by dividing by the full batch size * sequence length in _compute_loss.
  """
  raise NotImplementedError("Distillation loss function must be implemented.")

@@ -65,14 +71,14 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
  distillation_loss_fn=None,
  full_target=None,
  ignore_index=-100,
- temperature=1.0,
  weight_hard_loss=0.5,
  weight_soft_loss=0.5,
  compute_ce_loss=True,
+ temperature=1,
  **loss_kwargs,
  ):
  """
- Compute the total loss for a chunk of input and target, while using an knowleedge distillation loss function.
+ Compute the total loss for a chunk of input and target, while using an knowledge distillation loss function.
  Args:
  distillation_loss_fn (callable): Loss function to compute the loss on a chunk of input/target.
  student_input_chunk (torch.Tensor): Chunk of input tensor. Shape: (chunk_size, student_hidden_size).
@@ -82,32 +88,36 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
  target_chunk (torch.Tensor): Chunk of target tensor. Shape: (chunk_size,).
  student_bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,).
  teacher_bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,).
- full_target (torch.Tensor): Full target tensor. Shape: (chunk_size,).
+ full_target (torch.Tensor): Full target tensor. Shape: (batch_size * sequence_length,).
  ignore_index (int): Index to ignore for loss computation.
  weight_hard_loss (float): Weight for hard loss.
  weight_soft_loss (float): Weight for soft loss.
  compute_ce_loss (bool): Whether to compute CE loss.
+ temperature (float): Temperature to control the input probability distribution. Default: `1.0` (i.e. no scale)
  loss_kwargs (dict): Additional arguments for the loss function.
  """
- student_logits_chunk, teacher_logits_chunk, hard_loss = (
- LigerFusedLinearDistillationBase.chunk_forward(
- student_input_chunk,
- student_weight,
- teacher_input_chunk,
- teacher_weight,
- target_chunk,
- student_bias=student_bias,
- teacher_bias=teacher_bias,
- ignore_index=ignore_index,
- compute_ce_loss=compute_ce_loss,
- )
+ (
+ student_logits_chunk,
+ teacher_logits_chunk,
+ hard_loss,
+ ) = LigerFusedLinearDistillationBase.chunk_forward(
+ student_input_chunk,
+ student_weight,
+ teacher_input_chunk,
+ teacher_weight,
+ target_chunk,
+ student_bias=student_bias,
+ teacher_bias=teacher_bias,
+ ignore_index=ignore_index,
+ compute_ce_loss=compute_ce_loss,
  )

+ student_logits_chunk /= temperature
+ teacher_logits_chunk /= temperature
+
  hard_loss /= full_target.shape[0]

- soft_loss = distillation_loss_fn(
- student_logits_chunk, teacher_logits_chunk, temperature
- )
+ soft_loss = distillation_loss_fn(student_logits_chunk, teacher_logits_chunk)
  soft_loss /= full_target.shape[0]

  loss = weight_hard_loss * hard_loss + weight_soft_loss * soft_loss
@@ -128,6 +138,7 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
  ignore_index=-100,
  weight_hard_loss=0.5,
  weight_soft_loss=0.5,
+ beta=0.5,
  compute_ce_loss=True,
  temperature=1.0,
  compiled=True,
@@ -147,10 +158,12 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
  teacher_bias (torch.Tensor, optional): Teacher bias tensor. Shape: (vocab_size,).
  loss_fn (callable): Loss function to compute the loss on a chunk of input/target.
  chunk_size (int): Size of a chunk.
- compute_ce_loss (bool): Whether to compute CE loss.
  ignore_index (int): Index to ignore for loss computation.
  weight_hard_loss (float): Weight for hard/task loss.
  weight_soft_loss (float): Weight for soft/distillation loss.
+ beta (float): Interpolation coefficient between 0 and 1 (default: 0.5).
+ compute_ce_loss (bool): Whether to compute CE loss.
+ temperature (float): Temperature to control the input probability distribution. Default: `1.0` (i.e. no scale)
  compiled (bool): Whether to use torch compile for chunk accumulation.
  loss_kwargs (dict): Other possible arguments that a loss function might need
  """
@@ -167,6 +180,7 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
  ignore_index=ignore_index,
  weight_hard_loss=weight_hard_loss,
  weight_soft_loss=weight_soft_loss,
+ beta=beta,
  compute_ce_loss=compute_ce_loss,
  temperature=temperature,
  **loss_kwargs,
@@ -174,17 +188,18 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):

  def accumulate_chunk(student_input_chunk, teacher_input_chunk, target_chunk):
  if student_bias is not None:
- (chunk_grad_input, chunk_grad_weight, chunk_grad_bias), (
- chunk_loss,
+ (
+ (chunk_grad_input, chunk_grad_weight, chunk_grad_bias),
  (
- chunk_soft_loss,
- chunk_hard_loss,
- chunk_student_logits,
- chunk_teacher_logits,
+ chunk_loss,
+ (
+ chunk_soft_loss,
+ chunk_hard_loss,
+ chunk_student_logits,
+ chunk_teacher_logits,
+ ),
  ),
- ) = torch.func.grad_and_value(
- loss_func_to_call, argnums=(0, 1, 5), has_aux=True
- )(
+ ) = torch.func.grad_and_value(loss_func_to_call, argnums=(0, 1, 5), has_aux=True)(
  student_input_chunk,
  student_weight,
  teacher_input_chunk,
@@ -195,17 +210,18 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
  )
  grad_bias.add_(chunk_grad_bias)
  else:
- (chunk_grad_input, chunk_grad_weight), (
- chunk_loss,
+ (
+ (chunk_grad_input, chunk_grad_weight),
  (
- chunk_soft_loss,
- chunk_hard_loss,
- chunk_student_logits,
- chunk_teacher_logits,
+ chunk_loss,
+ (
+ chunk_soft_loss,
+ chunk_hard_loss,
+ chunk_student_logits,
+ chunk_teacher_logits,
+ ),
  ),
- ) = torch.func.grad_and_value(
- loss_func_to_call, argnums=(0, 1), has_aux=True
- )(
+ ) = torch.func.grad_and_value(loss_func_to_call, argnums=(0, 1), has_aux=True)(
  student_input_chunk,
  student_weight,
  teacher_input_chunk,
@@ -229,9 +245,7 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
  for student_input_chunk, teacher_input_chunk, target_chunk in zip(
  _student_input_chunks, _teacher_input_chunks, _target_chunks
  ):
- grad_input = accumulate_chunk(
- student_input_chunk, teacher_input_chunk, target_chunk
- )
+ grad_input = accumulate_chunk(student_input_chunk, teacher_input_chunk, target_chunk)
  grad_inputs.append(grad_input)

  ctx.save_for_backward(
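The restructured `accumulate_chunk` above still revolves around one pattern: `torch.func.grad_and_value(..., has_aux=True)` returns, for each chunk, the gradients for the requested `argnums` together with `(loss, aux)` in a single pass, so weight gradients can be accumulated and per-chunk input gradients collected without a separate backward over the full batch. A self-contained sketch of that pattern follows (illustrative only; a plain cross-entropy stands in for the fused distillation loss, and PyTorch 2.x is assumed for `torch.func`).

```python
# Hedged sketch (not the library code): chunked grad_and_value accumulation.
import torch
import torch.nn.functional as F


def chunk_loss(input_chunk, weight, target_chunk):
    # Fused "lm_head + loss" on one chunk; full-batch logits are never materialized.
    logits = input_chunk @ weight.t()
    loss = F.cross_entropy(logits, target_chunk, reduction="sum")
    return loss, (logits,)                     # aux outputs ride along with the loss


H, V, N, chunk = 64, 100, 8, 4
inputs = torch.randn(N, H)
weight = torch.randn(V, H)
targets = torch.randint(0, V, (N,))

grad_weight = torch.zeros_like(weight)
grad_inputs, total_loss = [], torch.zeros(())
for x_chunk, t_chunk in zip(inputs.split(chunk), targets.split(chunk)):
    (g_x, g_w), (loss, _aux) = torch.func.grad_and_value(
        chunk_loss, argnums=(0, 1), has_aux=True
    )(x_chunk, weight, t_chunk)
    grad_weight.add_(g_w)                      # accumulate weight grads across chunks
    grad_inputs.append(g_x)                    # keep per-chunk input grads
    total_loss += loss / N                     # mean over the full batch, as in _compute_loss
grad_input = torch.cat(grad_inputs, dim=0)
```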