liger-kernel-nightly 0.5.6.dev20250411201510__py3-none-any.whl → 0.5.6.dev20250411224032__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
liger_kernel/chunked_loss/fused_linear_ppo.py

@@ -32,6 +32,8 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
  epsilon_low=0.2,
  epsilon_high=0.2,
  beta=0.04,
+ loss_type="bnpo",
+ max_completion_length=None,
  temperature=1.0,
  compiled=True,
  use_ref_model=False,
@@ -57,6 +59,8 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
  epsilon_low: Lower bound for clipping the importance sampling ratio
  epsilon_high: Upper bound for clipping the importance sampling ratio
  beta: Weight for the KL penalty
+ loss_type: Type of loss calculation ("grpo", "bnpo", "dr_grpo")
+ max_completion_length: Maximum completion length required for "dr_grpo"
  temperature: Temperature for the logits
  compiled: Whether to use torch compile
  use_ref_model: Whether to use a reference model
@@ -68,6 +72,8 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
  )
  if ref_per_token_logps is not None and ref_input is not None:
  raise Warning("Both ref_per_token_logps and ref_input are provided. Using ref_per_token_logps.")
+ if loss_type == "dr_grpo":
+ assert max_completion_length is not None, "max_completion_length must be provided for loss_type 'dr_grpo'"
  # Initialize accumulators
  loss_acc = torch.zeros((), device=_input.device, dtype=torch.float32)
  grad_weight = torch.zeros_like(weight)  # [V, H]
@@ -84,6 +90,8 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
  epsilon_low=epsilon_low,
  epsilon_high=epsilon_high,
  beta=beta,
+ loss_type=loss_type,
+ max_completion_length=max_completion_length,
  temperature=temperature,
  use_ref_model=use_ref_model,
  ppo_loss_fn=cls.ppo_loss_fn,
@@ -251,6 +259,8 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
  epsilon_low=0.2,
  epsilon_high=0.2,
  beta=0.04,
+ loss_type="bnpo",
+ max_completion_length=None,
  temperature=1.0,
  use_ref_model=False,
  ppo_loss_fn=None,
@@ -280,6 +290,8 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
  epsilon_low=epsilon_low,
  epsilon_high=epsilon_high,
  beta=beta,
+ loss_type=loss_type,
+ max_completion_length=max_completion_length,
  )

  return chunk_loss, chunk_metrics
@@ -303,6 +315,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
  def backward(ctx, grad_output, *grad_metrics):
  """Backward pass for PPO loss."""
  grad_input, grad_weight, grad_bias = ctx.saved_tensors
+
  if grad_output != 1.0:
  grad_input = grad_input * grad_output
  grad_weight = grad_weight * grad_output
@@ -328,4 +341,6 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
  None, # grad_compiled
  None, # grad_use_ref_model
  None, # grad_chunk_size
+ None, # grad_loss_type
+ None, # grad_max_completion_length
  )
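The two extra None entries follow the torch.autograd.Function contract: backward must return one value per forward argument, and non-tensor arguments (here the loss_type string and the max_completion_length int) get None. A minimal sketch of that general pattern, with hypothetical names unrelated to this kernel:

    import torch

    class ScaleByFactor(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x, factor, mode="plain"):
            ctx.factor = factor
            return x * factor

        @staticmethod
        def backward(ctx, grad_output):
            # one return value per forward argument: a gradient for x,
            # None for the non-differentiable `factor` and `mode`
            return grad_output * ctx.factor, None, None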
liger_kernel/chunked_loss/grpo_loss.py

@@ -27,6 +27,8 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
  epsilon_low=0.2,
  epsilon_high=0.2,
  beta=0.04,
+ loss_type="bnpo",  # ["grpo", "bnpo", "dr_grpo"]
+ max_completion_length=None,  # Required for dr_grpo
  **kwargs,
  ):
  """GRPO Loss Function matching GRPOTrainer implementation."""
@@ -61,7 +63,21 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
  # which is consistent with the DAPO loss implementation (https://arxiv.org/html/2503.14476v1)
  # and TRL GRPO implementation
  # (https://github.com/huggingface/trl/blob/e751a16df56e70190fb94bed4a2035eec3303777/trl/trainer/grpo_trainer.py#L966)
- loss = (per_token_loss * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0)
+ if loss_type == "grpo":
+ # Average per-sequence loss
+ loss = (
+ (per_token_loss * attention_mask).sum(-1) / torch.clamp(attention_mask.sum(-1), min=1.0)
+ ).sum() / full_attention_mask.shape[0]
+ elif loss_type == "bnpo":
+ # Batch Normalized Per-token loss (original implementation)
+ loss = (per_token_loss * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0)
+ elif loss_type == "dr_grpo":
+ # Dimension-Reduced GRPO (normalize by batch_size * max_completion_length)
+ if max_completion_length is None:
+ raise ValueError("max_completion_length must be provided for loss_type 'dr_grpo'")
+ loss = (per_token_loss * attention_mask).sum() / (full_attention_mask.shape[0] * max_completion_length)
+ else:
+ raise ValueError(f"Unknown loss type: {loss_type}")

  # Calculate metrics
  metrics = []
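For reference, the three normalizations can be compared in isolation. A minimal standalone sketch, assuming per_token_loss and attention_mask are [batch, seq_len] tensors (the fused kernel computes per_token_loss chunk-wise; this only mirrors the reduction step above):

    import torch

    def grpo_style_reduction(per_token_loss, attention_mask, loss_type="bnpo", max_completion_length=None):
        if loss_type == "grpo":
            # mean over each sequence's valid tokens, then mean over the batch
            per_seq = (per_token_loss * attention_mask).sum(-1) / torch.clamp(attention_mask.sum(-1), min=1.0)
            return per_seq.sum() / attention_mask.shape[0]
        if loss_type == "bnpo":
            # one global mean over all valid tokens in the batch
            return (per_token_loss * attention_mask).sum() / torch.clamp(attention_mask.sum(), min=1.0)
        if loss_type == "dr_grpo":
            # fixed denominator: batch_size * max_completion_length
            return (per_token_loss * attention_mask).sum() / (attention_mask.shape[0] * max_completion_length)
        raise ValueError(f"Unknown loss type: {loss_type}")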
@@ -91,6 +107,8 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
  beta=0.04,
  epsilon_low=0.2,
  epsilon_high=0.2,
+ loss_type="bnpo",
+ max_completion_length=None,
  temperature=1.0,
  compiled=True,
  use_ref_model=True,
@@ -110,6 +128,8 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
  ref_weight (torch.Tensor, optional): Reference model weight tensor. Shape: (vocab_size, hidden_size)
  ref_bias (torch.Tensor, optional): Reference model bias tensor. Shape: (vocab_size,)
  beta (float): Weight for the KL penalty
+ loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
+ max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
  temperature (float): Temperature for the logits
  compiled (bool): Whether to use torch compile
  use_ref_model (bool): Whether to use a reference model
@@ -134,6 +154,8 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
  beta=beta,
  epsilon_low=epsilon_low,
  epsilon_high=epsilon_high,
+ loss_type=loss_type,
+ max_completion_length=max_completion_length,
  temperature=temperature,
  compiled=compiled,
  use_ref_model=use_ref_model,
@@ -161,6 +183,8 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
  None, # grad_beta
  None, # grad_epsilon_low
  None, # grad_epsilon_high
+ None, # grad_loss_type (string, not differentiable)
+ None, # grad_max_completion_length (int, not differentiable)
  None, # grad_temperature
  None, # grad_compiled
  None, # grad_use_ref_model
@@ -179,6 +203,8 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
  chunk_size: int = 1,
  epsilon_low: float = 0.2,
  epsilon_high: float = 0.2,
+ loss_type: str = "bnpo",
+ max_completion_length: int | None = None,
  temperature: float = 1.0,
  ):
  """
@@ -189,6 +215,8 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
  chunk_size (int): Size of chunks for processing.
  epsilon_low (float): Lower bound for the importance sampling ratio.
  epsilon_high (float): Upper bound for the importance sampling ratio.
+ loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
+ max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
  temperature (float): Temperature for the logits.
  """
  super().__init__()
@@ -198,6 +226,8 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
  self.chunk_size = chunk_size
  self.epsilon_low = epsilon_low
  self.epsilon_high = epsilon_high
+ self.loss_type = loss_type
+ self.max_completion_length = max_completion_length
  self.temperature = temperature

  def forward(
@@ -229,6 +259,8 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
  self.beta,
  self.epsilon_low,
  self.epsilon_high,
+ self.loss_type,
+ self.max_completion_length,
  self.temperature,
  self.compiled,
  self.use_ref_model,
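At the module level the new arguments are simply stored and forwarded, so selecting the dr_grpo variant is a constructor choice. A minimal construction sketch, assuming the class is imported from the module shown in the hunk headers (all other constructor defaults left untouched):

    from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOLoss

    # "dr_grpo" normalizes by batch_size * max_completion_length, so the length is required;
    # "grpo" and "bnpo" do not use max_completion_length.
    loss_fn = LigerFusedLinearGRPOLoss(
        loss_type="dr_grpo",
        max_completion_length=256,
    )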
liger_kernel/transformers/model/gemma2.py

@@ -222,7 +222,7 @@ def lce_forward(
  lm_head_weight=self.lm_head.weight,
  labels=labels,
  hidden_size=self.config.hidden_size,
- softcap=self.config.final_logit_softcapping,
+ final_logit_softcapping=self.config.final_logit_softcapping,
  **loss_kwargs,
  )
liger_kernel/transformers/model/gemma3.py

@@ -112,7 +112,7 @@ def causal_forward(
  lm_head_weight=self.lm_head.weight,
  labels=labels,
  hidden_size=self.config.hidden_size,
- softcap=self.config.final_logit_softcapping,
+ final_logit_softcapping=self.config.final_logit_softcapping,
  **loss_kwargs,
  )
liger_kernel/transformers/model/loss_utils.py

@@ -1,14 +1,18 @@
+ from typing import Optional
+
+ import torch
  import torch.nn as nn

  import liger_kernel.transformers.functional as F


  def fixed_fused_linear_cross_entropy(
- hidden_states,
- lm_head_weight,
- target,
- num_items_in_batch: int = None,
+ hidden_states: torch.Tensor,
+ lm_head_weight: torch.Tensor,
+ target: torch.Tensor,
+ num_items_in_batch: Optional[int] = None,
  ignore_index: int = -100,
+ final_logit_softcapping: Optional[float] = None,
  **kwargs,
  ):
  reduction = "sum" if num_items_in_batch is not None else "mean"
@@ -18,7 +22,7 @@ def fixed_fused_linear_cross_entropy(
  target,
  reduction=reduction,
  ignore_index=ignore_index,
- **kwargs,
+ softcap=final_logit_softcapping,
  )
  if reduction == "sum":
  loss = loss / num_items_in_batch
@@ -31,15 +35,17 @@ def LigerForCausalLMLoss(
  lm_head_weight,
  labels,
  hidden_size: int,
- num_items_in_batch: int = None,
+ num_items_in_batch: Optional[int] = None,
  ignore_index: int = -100,
+ shift_labels: Optional[torch.Tensor] = None,
+ final_logit_softcapping: Optional[float] = None,
  **kwargs,
  ):
  # Skip upcast since intermediate values for the loss are all fp32 in kernel
- labels = labels.to(hidden_states.device)
- # Shift so that token < n predict n
- labels = nn.functional.pad(labels, (0, 1), value=ignore_index)
- shift_labels = labels[..., 1:].contiguous()
+ if shift_labels is None:
+ # Shift so that token < n predict n
+ labels = nn.functional.pad(labels, (0, 1), value=ignore_index)
+ shift_labels = labels[..., 1:].contiguous()

  # Flatten the tokens
  hidden_states = hidden_states.view(-1, hidden_size)
@@ -52,6 +58,7 @@ def LigerForCausalLMLoss(
  shift_labels,
  num_items_in_batch,
  ignore_index,
+ final_logit_softcapping,
  **kwargs,
  )
  return loss
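The end-to-end effect is that the Gemma2/Gemma3 forwards now pass final_logit_softcapping by name, LigerForCausalLMLoss forwards it, and fixed_fused_linear_cross_entropy translates it into the kernel's softcap argument, while callers may also supply pre-shifted labels via shift_labels. A minimal call sketch with illustrative shapes (the fused Triton kernel itself needs a CUDA device to run):

    import torch
    from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss

    # illustrative sizes: batch=2, seq_len=8, hidden=32, vocab=64
    hidden_states = torch.randn(2, 8, 32, device="cuda")
    lm_head_weight = torch.randn(64, 32, device="cuda")
    labels = torch.randint(0, 64, (2, 8), device="cuda")

    loss = LigerForCausalLMLoss(
        hidden_states=hidden_states,
        lm_head_weight=lm_head_weight,
        labels=labels,
        hidden_size=32,
        final_logit_softcapping=30.0,  # forwarded to the kernel as softcap
    )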
METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: liger_kernel_nightly
- Version: 0.5.6.dev20250411201510
+ Version: 0.5.6.dev20250411224032
  Summary: Efficient Triton kernels for LLM Training
  License: BSD 2-CLAUSE LICENSE
  Copyright 2024 LinkedIn Corporation
RECORD

@@ -7,10 +7,10 @@ liger_kernel/chunked_loss/cpo_loss.py,sha256=Gzz1eU4kgcbdubFVRy55e8A1Cr-r45UgNic
  liger_kernel/chunked_loss/dpo_loss.py,sha256=xZwGqS04si9zXyob95SAdalC-hajZg8fWINqiqffN8k,5855
  liger_kernel/chunked_loss/functional.py,sha256=9G3nKm-Bi7uoZRFkL8wwGMl6juDl4bSzDvTa5GHZPzg,955
  liger_kernel/chunked_loss/fused_linear_distillation.py,sha256=ooR-qnZCyWJN935oHCSWLaKKKyaYERyhNczRGi1VOiw,11935
- liger_kernel/chunked_loss/fused_linear_ppo.py,sha256=-E4AuWY-y2bMo_kAmEQBgQ92UJh3L5IiCRGVcfMJOCE,12731
+ liger_kernel/chunked_loss/fused_linear_ppo.py,sha256=AA19cpv6D8mo5RbSK5GRCcZoOSnpxV_Z1eJlAsC5eic,13434
  liger_kernel/chunked_loss/fused_linear_preference.py,sha256=ojB42jYPu0c4ki96Ft-hy7Sf6fh_WikG-aWNrlZzSio,18362
  liger_kernel/chunked_loss/fused_linear_unpaired_preference.py,sha256=RiuK3UtRwH9T6jZ36sA8Urj-TVuOLOO2syLg_JOQapY,13437
- liger_kernel/chunked_loss/grpo_loss.py,sha256=6Mb4ZT6MfnOr4Xo681rMR0LKkhzJhInvQp8wp2YVMK0,8913
+ liger_kernel/chunked_loss/grpo_loss.py,sha256=eh6mErFUZsSQrgRRefuXdk-LG0gS7Rg2r-U9CtbH3eU,10834
  liger_kernel/chunked_loss/jsd_loss.py,sha256=u2ahkuHsbhpNaKcpBCz5gCMDk9ou-P04DHji592dIBo,7067
  liger_kernel/chunked_loss/kto_loss.py,sha256=llVCe6DkcpCo57seGWoMikaQVFApx764jsmSbQyqwQY,7529
  liger_kernel/chunked_loss/orpo_loss.py,sha256=nu9UYG16dcMw93lvHi4_hYs3Q0FK1KnlmMRj7OpYU8s,4872
@@ -56,11 +56,11 @@ liger_kernel/transformers/tvd.py,sha256=XrRfyJIqN6HFxXk8MYyFVZM1OLz3mtSbRZvWfZ_J
  liger_kernel/transformers/experimental/embedding.py,sha256=2P0QYdlFyFrG5OqTzTa1wcRgDSyjBMv5i1a7BrDPDQw,881
  liger_kernel/transformers/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  liger_kernel/transformers/model/gemma.py,sha256=-JoHKWjtYPpxHQa6QbCwnzX_cctRZG2ZTsaUv-dmOt4,9816
- liger_kernel/transformers/model/gemma2.py,sha256=tLl1v-O8K0NZ7BQcSf1dE3450-xV72RAk4E5oTPcu_s,10907
- liger_kernel/transformers/model/gemma3.py,sha256=PjAfFtupT9EW0sb57Hx8UJXcnvq9HFgNndeAE4EqyPw,16086
+ liger_kernel/transformers/model/gemma2.py,sha256=n4MZupFGDMvtnvkvkNhRrxXS3ZF341BVfyLjrOXp10g,10923
+ liger_kernel/transformers/model/gemma3.py,sha256=ge3JYchiKvX1G1Zp00jX2zmQK2K7ymJoZAxbb2ggslw,16102
  liger_kernel/transformers/model/llama.py,sha256=UVXQLRW7rCU5vPab54dLNS3ER37eM446peHX00Yz6eA,10493
  liger_kernel/transformers/model/llava.py,sha256=b0pEagjUbu2-eS9xegjyfl1DwIXLwZcNpff55ibaMbA,17601
- liger_kernel/transformers/model/loss_utils.py,sha256=Z-fUrf-cUDUjUIH7Tl9OL2hT8nmtx7ES3kg8syuWKy4,1476
+ liger_kernel/transformers/model/loss_utils.py,sha256=WWAMdiONPaXpIvxyOim_0igLrYh0yyOok5Q9_L9xvZw,1787
  liger_kernel/transformers/model/mistral.py,sha256=RacuKcckuDK6oSraCGD0R0bm-fE0K3q-lkYaAC56C2E,5481
  liger_kernel/transformers/model/mixtral.py,sha256=gLcqGabdv1XnuciS9b-TpkTDnGL8K32Hoq9j2vZMBRY,11502
  liger_kernel/transformers/model/mllama.py,sha256=75mxtmMsNd_q8KlKeawj2uMP6v2KjDuUi4nsUKM5jqA,11308
@@ -74,9 +74,9 @@ liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7H
  liger_kernel/transformers/trainer/orpo_trainer.py,sha256=pdekW7l6Qg_aqa5SYKYlSWUF8m3lkOFvFLcIMEHrz9s,8338
  liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
  liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
- liger_kernel_nightly-0.5.6.dev20250411201510.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
- liger_kernel_nightly-0.5.6.dev20250411201510.dist-info/METADATA,sha256=exdcHfLuKkUQ2NIene0sQ5hEn8mB98YKJ43XfirrGwM,23297
- liger_kernel_nightly-0.5.6.dev20250411201510.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
- liger_kernel_nightly-0.5.6.dev20250411201510.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
- liger_kernel_nightly-0.5.6.dev20250411201510.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
- liger_kernel_nightly-0.5.6.dev20250411201510.dist-info/RECORD,,
+ liger_kernel_nightly-0.5.6.dev20250411224032.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+ liger_kernel_nightly-0.5.6.dev20250411224032.dist-info/METADATA,sha256=cahu5M3U8t37VfXE0xLqBEH8tLSpXZzbbfwlyS86AuE,23297
+ liger_kernel_nightly-0.5.6.dev20250411224032.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+ liger_kernel_nightly-0.5.6.dev20250411224032.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+ liger_kernel_nightly-0.5.6.dev20250411224032.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+ liger_kernel_nightly-0.5.6.dev20250411224032.dist-info/RECORD,,