PyPI - liger-kernel-nightly - Versions diffs - 0.5.6.dev20250411201510__tar.gz → 0.5.6.dev20250411224032__tar.gz - Mend

liger-kernel-nightly 0.5.6.dev20250411201510.tar.gz → 0.5.6.dev20250411224032.tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (243) hide show

{liger_kernel_nightly-0.5.6.dev20250411201510 → liger_kernel_nightly-0.5.6.dev20250411224032}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.6.dev20250411201510
+Version: 0.5.6.dev20250411224032
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.5.6.dev20250411201510 → liger_kernel_nightly-0.5.6.dev20250411224032}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "liger_kernel_nightly"
-version = "0.5.6.dev20250411201510"
+version = "0.5.6.dev20250411224032"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }

{liger_kernel_nightly-0.5.6.dev20250411201510 → liger_kernel_nightly-0.5.6.dev20250411224032}/src/liger_kernel/chunked_loss/fused_linear_ppo.py RENAMED Viewed

@@ -32,6 +32,8 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
         epsilon_low=0.2,
         epsilon_high=0.2,
         beta=0.04,
+        loss_type="bnpo",
+        max_completion_length=None,
         temperature=1.0,
         compiled=True,
         use_ref_model=False,
@@ -57,6 +59,8 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
             epsilon_low: Lower bound for clipping the importance sampling ratio
             epsilon_high: Upper bound for clipping the importance sampling ratio
             beta: Weight for the KL penalty
+            loss_type: Type of loss calculation ("grpo", "bnpo", "dr_grpo")
+            max_completion_length: Maximum completion length required for "dr_grpo"
             temperature: Temperature for the logits
             compiled: Whether to use torch compile
             use_ref_model: Whether to use a reference model
@@ -68,6 +72,8 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
             )
             if ref_per_token_logps is not None and ref_input is not None:
                 raise Warning("Both ref_per_token_logps and ref_input are provided. Using ref_per_token_logps.")
+        if loss_type == "dr_grpo":
+            assert max_completion_length is not None, "max_completion_length must be provided for loss_type 'dr_grpo'"
         # Initialize accumulators
         loss_acc = torch.zeros((), device=_input.device, dtype=torch.float32)
         grad_weight = torch.zeros_like(weight)  # [V, H]
@@ -84,6 +90,8 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
             epsilon_low=epsilon_low,
             epsilon_high=epsilon_high,
             beta=beta,
+            loss_type=loss_type,
+            max_completion_length=max_completion_length,
             temperature=temperature,
             use_ref_model=use_ref_model,
             ppo_loss_fn=cls.ppo_loss_fn,
@@ -251,6 +259,8 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
         epsilon_low=0.2,
         epsilon_high=0.2,
         beta=0.04,
+        loss_type="bnpo",
+        max_completion_length=None,
         temperature=1.0,
         use_ref_model=False,
         ppo_loss_fn=None,
@@ -280,6 +290,8 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
             epsilon_low=epsilon_low,
             epsilon_high=epsilon_high,
             beta=beta,
+            loss_type=loss_type,
+            max_completion_length=max_completion_length,
         )
         return chunk_loss, chunk_metrics
@@ -303,6 +315,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
     def backward(ctx, grad_output, *grad_metrics):
         """Backward pass for PPO loss."""
         grad_input, grad_weight, grad_bias = ctx.saved_tensors
         if grad_output != 1.0:
             grad_input = grad_input * grad_output
             grad_weight = grad_weight * grad_output
@@ -328,4 +341,6 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
             None,  # grad_compiled
             None,  # grad_use_ref_model
             None,  # grad_chunk_size
+            None,  # grad_loss_type
+            None,  # grad_max_completion_length
         )

{liger_kernel_nightly-0.5.6.dev20250411201510 → liger_kernel_nightly-0.5.6.dev20250411224032}/src/liger_kernel/chunked_loss/grpo_loss.py RENAMED Viewed

@@ -27,6 +27,8 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
         epsilon_low=0.2,
         epsilon_high=0.2,
         beta=0.04,
+        loss_type="bnpo",  # ["grpo", "bnpo", "dr_grpo"]
+        max_completion_length=None,  # Required for dr_grpo
         **kwargs,
     ):
         """GRPO Loss Function matching GRPOTrainer implementation."""
@@ -61,7 +63,21 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
         # which is consistent with the DAPO loss implementation (https://arxiv.org/html/2503.14476v1)
         # and TRL GRPO implementation
         # (https://github.com/huggingface/trl/blob/e751a16df56e70190fb94bed4a2035eec3303777/trl/trainer/grpo_trainer.py#L966)
-        loss = (per_token_loss * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0)
+        if loss_type == "grpo":
+            # Average per-sequence loss
+            loss = (
+                (per_token_loss * attention_mask).sum(-1) / torch.clamp(attention_mask.sum(-1), min=1.0)
+            ).sum() / full_attention_mask.shape[0]
+        elif loss_type == "bnpo":
+            # Batch Normalized Per-token loss (original implementation)
+            loss = (per_token_loss * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0)
+        elif loss_type == "dr_grpo":
+            # Dimension-Reduced GRPO (normalize by batch_size * max_completion_length)
+            if max_completion_length is None:
+                raise ValueError("max_completion_length must be provided for loss_type 'dr_grpo'")
+            loss = (per_token_loss * attention_mask).sum() / (full_attention_mask.shape[0] * max_completion_length)
+        else:
+            raise ValueError(f"Unknown loss type: {loss_type}")
         # Calculate metrics
         metrics = []
@@ -91,6 +107,8 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
         beta=0.04,
         epsilon_low=0.2,
         epsilon_high=0.2,
+        loss_type="bnpo",
+        max_completion_length=None,
         temperature=1.0,
         compiled=True,
         use_ref_model=True,
@@ -110,6 +128,8 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
             ref_weight (torch.Tensor, optional): Reference model weight tensor. Shape: (vocab_size, hidden_size)
             ref_bias (torch.Tensor, optional): Reference model bias tensor. Shape: (vocab_size,)
             beta (float): Weight for the KL penalty
+            loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
+            max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
             temperature (float): Temperature for the logits
             compiled (bool): Whether to use torch compile
             use_ref_model (bool): Whether to use a reference model
@@ -134,6 +154,8 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
             beta=beta,
             epsilon_low=epsilon_low,
             epsilon_high=epsilon_high,
+            loss_type=loss_type,
+            max_completion_length=max_completion_length,
             temperature=temperature,
             compiled=compiled,
             use_ref_model=use_ref_model,
@@ -161,6 +183,8 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
             None,  # grad_beta
             None,  # grad_epsilon_low
             None,  # grad_epsilon_high
+            None,  # grad_loss_type (string, not differentiable)
+            None,  # grad_max_completion_length (int, not differentiable)
             None,  # grad_temperature
             None,  # grad_compiled
             None,  # grad_use_ref_model
@@ -179,6 +203,8 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
         chunk_size: int = 1,
         epsilon_low: float = 0.2,
         epsilon_high: float = 0.2,
+        loss_type: str = "bnpo",
+        max_completion_length: int | None = None,
         temperature: float = 1.0,
     ):
         """
@@ -189,6 +215,8 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
             chunk_size (int): Size of chunks for processing.
             epsilon_low (float): Lower bound for the importance sampling ratio.
             epsilon_high (float): Upper bound for the importance sampling ratio.
+            loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
+            max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
             temperature (float): Temperature for the logits.
         """
         super().__init__()
@@ -198,6 +226,8 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
         self.chunk_size = chunk_size
         self.epsilon_low = epsilon_low
         self.epsilon_high = epsilon_high
+        self.loss_type = loss_type
+        self.max_completion_length = max_completion_length
         self.temperature = temperature
     def forward(
@@ -229,6 +259,8 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
             self.beta,
             self.epsilon_low,
             self.epsilon_high,
+            self.loss_type,
+            self.max_completion_length,
             self.temperature,
             self.compiled,
             self.use_ref_model,

{liger_kernel_nightly-0.5.6.dev20250411201510 → liger_kernel_nightly-0.5.6.dev20250411224032}/src/liger_kernel/transformers/model/gemma2.py RENAMED Viewed

@@ -222,7 +222,7 @@ def lce_forward(
             lm_head_weight=self.lm_head.weight,
             labels=labels,
             hidden_size=self.config.hidden_size,
-            softcap=self.config.final_logit_softcapping,
+            final_logit_softcapping=self.config.final_logit_softcapping,
             **loss_kwargs,
         )

{liger_kernel_nightly-0.5.6.dev20250411201510 → liger_kernel_nightly-0.5.6.dev20250411224032}/src/liger_kernel/transformers/model/gemma3.py RENAMED Viewed

@@ -112,7 +112,7 @@ def causal_forward(
             lm_head_weight=self.lm_head.weight,
             labels=labels,
             hidden_size=self.config.hidden_size,
-            softcap=self.config.final_logit_softcapping,
+            final_logit_softcapping=self.config.final_logit_softcapping,
             **loss_kwargs,
         )

{liger_kernel_nightly-0.5.6.dev20250411201510 → liger_kernel_nightly-0.5.6.dev20250411224032}/src/liger_kernel/transformers/model/loss_utils.py RENAMED Viewed

@@ -1,14 +1,18 @@
+from typing import Optional
+import torch
 import torch.nn as nn
 import liger_kernel.transformers.functional as F
 def fixed_fused_linear_cross_entropy(
-    hidden_states,
-    lm_head_weight,
-    target,
-    num_items_in_batch: int = None,
+    hidden_states: torch.Tensor,
+    lm_head_weight: torch.Tensor,
+    target: torch.Tensor,
+    num_items_in_batch: Optional[int] = None,
     ignore_index: int = -100,
+    final_logit_softcapping: Optional[float] = None,
     **kwargs,
 ):
     reduction = "sum" if num_items_in_batch is not None else "mean"
@@ -18,7 +22,7 @@ def fixed_fused_linear_cross_entropy(
         target,
         reduction=reduction,
         ignore_index=ignore_index,
-        **kwargs,
+        softcap=final_logit_softcapping,
     )
     if reduction == "sum":
         loss = loss / num_items_in_batch
@@ -31,15 +35,17 @@ def LigerForCausalLMLoss(
     lm_head_weight,
     labels,
     hidden_size: int,
-    num_items_in_batch: int = None,
+    num_items_in_batch: Optional[int] = None,
     ignore_index: int = -100,
+    shift_labels: Optional[torch.Tensor] = None,
+    final_logit_softcapping: Optional[float] = None,
     **kwargs,
 ):
     # Skip upcast since intermediate values for the loss are all fp32 in kernel
-    labels = labels.to(hidden_states.device)
-    # Shift so that token < n predict n
-    labels = nn.functional.pad(labels, (0, 1), value=ignore_index)
-    shift_labels = labels[..., 1:].contiguous()
+    if shift_labels is None:
+        # Shift so that token < n predict n
+        labels = nn.functional.pad(labels, (0, 1), value=ignore_index)
+        shift_labels = labels[..., 1:].contiguous()
     # Flatten the tokens
     hidden_states = hidden_states.view(-1, hidden_size)
@@ -52,6 +58,7 @@ def LigerForCausalLMLoss(
         shift_labels,
         num_items_in_batch,
         ignore_index,
+        final_logit_softcapping,
         **kwargs,
     )
     return loss

{liger_kernel_nightly-0.5.6.dev20250411201510 → liger_kernel_nightly-0.5.6.dev20250411224032}/src/liger_kernel_nightly.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.6.dev20250411201510
+Version: 0.5.6.dev20250411224032
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.5.6.dev20250411201510 → liger_kernel_nightly-0.5.6.dev20250411224032}/test/chunked_loss/test_grpo_loss.py RENAMED Viewed

@@ -27,6 +27,8 @@ class TorchLMHeadGRPO(torch.nn.Module):
         epsilon_high: float = 0.2,
         temperature: float = 1.0,
         use_ref_model: bool = True,
+        loss_type: str = "bnpo",
+        max_completion_length: int | None = None,
     ):
         super().__init__()
         self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype)
@@ -36,6 +38,10 @@ class TorchLMHeadGRPO(torch.nn.Module):
         self.epsilon_high = epsilon_high
         self.temperature = temperature
         self.use_ref_model = use_ref_model
+        self.loss_type = loss_type
+        self.max_completion_length = max_completion_length
+        if self.loss_type == "dr_grpo":
+            assert self.max_completion_length is not None, "max_completion_length must be provided for dr_grpo"
     def forward(
         self,
@@ -89,8 +95,15 @@ class TorchLMHeadGRPO(torch.nn.Module):
             kl_div = torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1.0
             per_token_loss = per_token_loss + self.beta * kl_div
-        # Apply masking and normalize
-        loss = (per_token_loss * attention_mask).sum() / torch.clamp(attention_mask.sum(), min=1.0)
+        # Apply masking and calculate loss based on loss_type
+        if self.loss_type == "grpo":
+            loss = ((per_token_loss * attention_mask).sum(-1) / torch.clamp(attention_mask.sum(-1), min=1.0)).mean()
+        elif self.loss_type == "bnpo":
+            loss = (per_token_loss * attention_mask).sum() / torch.clamp(attention_mask.sum(), min=1.0)
+        elif self.loss_type == "dr_grpo":
+            loss = (per_token_loss * attention_mask).sum() / (per_token_loss.size(0) * self.max_completion_length)
+        else:
+            raise ValueError(f"Unknown loss type: {self.loss_type}")
         # Compute metrics
         metrics = []
@@ -115,6 +128,8 @@ class LigerLMHeadGRPO(torch.nn.Module):
         epsilon_high: float = 0.2,
         temperature: float = 1.0,
         use_ref_model: bool = True,
+        loss_type: str = "bnpo",
+        max_completion_length: int | None = None,
     ):
         super().__init__()
         self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype)
@@ -126,6 +141,8 @@ class LigerLMHeadGRPO(torch.nn.Module):
             temperature=temperature,
             use_ref_model=use_ref_model,
             compiled=True,
+            loss_type=loss_type,
+            max_completion_length=max_completion_length,
         )
     def forward(
@@ -186,6 +203,7 @@ class LigerLMHeadGRPO(torch.nn.Module):
     ],
 )
 @pytest.mark.parametrize("old_per_token_logps", [True, False])
+@pytest.mark.parametrize("loss_type", ["bnpo", "grpo", "dr_grpo"])
 def test_correctness(
     B,
     T,
@@ -203,9 +221,12 @@ def test_correctness(
     use_ref_per_token_logps,
     use_ref_model,
     old_per_token_logps,
+    loss_type,
 ):
     # Reset torch compiler cache for each parameter of the test case
     torch.compiler.reset()
+    max_completion_length = T if loss_type == "dr_grpo" else None
     torch_lm_head_grpo = TorchLMHeadGRPO(
         H=H,
         V=V,
@@ -216,6 +237,8 @@ def test_correctness(
         epsilon_high=epsilon_high,
         temperature=temperature,
         use_ref_model=use_ref_model,
+        loss_type=loss_type,
+        max_completion_length=max_completion_length,
     )
     liger_lm_head_grpo = LigerLMHeadGRPO(
         H=H,
@@ -227,6 +250,8 @@ def test_correctness(
         epsilon_high=epsilon_high,
         temperature=temperature,
         use_ref_model=use_ref_model,
+        loss_type=loss_type,
+        max_completion_length=max_completion_length,
     )
     # Initialize weights
@@ -319,7 +344,7 @@ def test_correctness(
     loss1.backward()
     loss2.backward()
-    # Check gradients match
+    # Check gradients match for loss_type
     assert_verbose_allclose(input1.grad, input2.grad, atol=atol, rtol=rtol)
     assert_verbose_allclose(
         torch_lm_head_grpo.lin.weight.grad,
@@ -351,6 +376,7 @@ def test_correctness(
     ],
 )
 @pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("loss_type", ["bnpo", "grpo", "dr_grpo"])
 def test_functional_correctness(
     B,
     T,
@@ -361,9 +387,11 @@ def test_functional_correctness(
     atol,
     rtol,
     bias,
+    loss_type,
 ):
     # Reset torch compiler cache for each parameter of the test case
     torch.compiler.reset()
+    max_completion_length = T if loss_type == "dr_grpo" else None
     _input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar
     input1 = _input.detach().clone().requires_grad_(True)
     input2 = _input.detach().clone().requires_grad_(True)
@@ -418,6 +446,8 @@ def test_functional_correctness(
         0.04,
         0.2,
         0.2,
+        loss_type,
+        max_completion_length,
         1.0,
         True,
         True,
@@ -439,6 +469,8 @@ def test_functional_correctness(
         0.04,
         0.2,
         0.2,
+        loss_type,
+        max_completion_length,
         1.0,
         True,
         True,