liger-kernel-nightly 0.5.6.dev20250408223717__py3-none-any.whl → 0.5.6.dev20250411210855__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/liger_kernel/chunked_loss/fused_linear_ppo.py
+++ b/liger_kernel/chunked_loss/fused_linear_ppo.py
@@ -32,6 +32,8 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
 epsilon_low=0.2,
 epsilon_high=0.2,
 beta=0.04,
+loss_type="bnpo",
+max_completion_length=None,
 temperature=1.0,
 compiled=True,
 use_ref_model=False,
@@ -57,6 +59,8 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
 epsilon_low: Lower bound for clipping the importance sampling ratio
 epsilon_high: Upper bound for clipping the importance sampling ratio
 beta: Weight for the KL penalty
+loss_type: Type of loss calculation ("grpo", "bnpo", "dr_grpo")
+max_completion_length: Maximum completion length required for "dr_grpo"
 temperature: Temperature for the logits
 compiled: Whether to use torch compile
 use_ref_model: Whether to use a reference model
@@ -68,6 +72,8 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
 )
 if ref_per_token_logps is not None and ref_input is not None:
     raise Warning("Both ref_per_token_logps and ref_input are provided. Using ref_per_token_logps.")
+if loss_type == "dr_grpo":
+    assert max_completion_length is not None, "max_completion_length must be provided for loss_type 'dr_grpo'"
 # Initialize accumulators
 loss_acc = torch.zeros((), device=_input.device, dtype=torch.float32)
 grad_weight = torch.zeros_like(weight)  # [V, H]
@@ -84,6 +90,8 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
 epsilon_low=epsilon_low,
 epsilon_high=epsilon_high,
 beta=beta,
+loss_type=loss_type,
+max_completion_length=max_completion_length,
 temperature=temperature,
 use_ref_model=use_ref_model,
 ppo_loss_fn=cls.ppo_loss_fn,
@@ -251,6 +259,8 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
 epsilon_low=0.2,
 epsilon_high=0.2,
 beta=0.04,
+loss_type="bnpo",
+max_completion_length=None,
 temperature=1.0,
 use_ref_model=False,
 ppo_loss_fn=None,
@@ -280,6 +290,8 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
 epsilon_low=epsilon_low,
 epsilon_high=epsilon_high,
 beta=beta,
+loss_type=loss_type,
+max_completion_length=max_completion_length,
 )

 return chunk_loss, chunk_metrics
@@ -303,6 +315,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
 def backward(ctx, grad_output, *grad_metrics):
     """Backward pass for PPO loss."""
     grad_input, grad_weight, grad_bias = ctx.saved_tensors
+
     if grad_output != 1.0:
         grad_input = grad_input * grad_output
         grad_weight = grad_weight * grad_output
@@ -328,4 +341,6 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
 None, # grad_compiled
 None, # grad_use_ref_model
 None, # grad_chunk_size
+None, # grad_loss_type
+None, # grad_max_completion_length
 )
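
The two added Nones keep backward() aligned with forward()'s argument list: a torch.autograd.Function must return one gradient per forward input, with None for non-differentiable arguments such as the new loss_type string and max_completion_length int. A minimal sketch of the pattern (the Scale class and its arguments are illustrative, not taken from this diff):

    import torch

    class Scale(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x, factor, mode="fast"):
            # x is a tensor; factor (float) and mode (str) are not differentiable.
            ctx.factor = factor
            return x * factor

        @staticmethod
        def backward(ctx, grad_output):
            # One return value per forward argument, in order:
            # the gradient for x, then None for factor and for mode.
            return grad_output * ctx.factor, None, None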
--- a/liger_kernel/chunked_loss/grpo_loss.py
+++ b/liger_kernel/chunked_loss/grpo_loss.py
@@ -27,6 +27,8 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
 epsilon_low=0.2,
 epsilon_high=0.2,
 beta=0.04,
+loss_type="bnpo",  # ["grpo", "bnpo", "dr_grpo"]
+max_completion_length=None,  # Required for dr_grpo
 **kwargs,
 ):
 """GRPO Loss Function matching GRPOTrainer implementation."""
@@ -61,7 +63,21 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
 # which is consistent with the DAPO loss implementation (https://arxiv.org/html/2503.14476v1)
 # and TRL GRPO implementation
 # (https://github.com/huggingface/trl/blob/e751a16df56e70190fb94bed4a2035eec3303777/trl/trainer/grpo_trainer.py#L966)
-loss = (per_token_loss * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0)
+if loss_type == "grpo":
+    # Average per-sequence loss
+    loss = (
+        (per_token_loss * attention_mask).sum(-1) / torch.clamp(attention_mask.sum(-1), min=1.0)
+    ).sum() / full_attention_mask.shape[0]
+elif loss_type == "bnpo":
+    # Batch Normalized Per-token loss (original implementation)
+    loss = (per_token_loss * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0)
+elif loss_type == "dr_grpo":
+    # Dimension-Reduced GRPO (normalize by batch_size * max_completion_length)
+    if max_completion_length is None:
+        raise ValueError("max_completion_length must be provided for loss_type 'dr_grpo'")
+    loss = (per_token_loss * attention_mask).sum() / (full_attention_mask.shape[0] * max_completion_length)
+else:
+    raise ValueError(f"Unknown loss type: {loss_type}")

 # Calculate metrics
 metrics = []
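
The three branches differ only in the denominator applied to the masked per-token loss sum. A standalone sketch of the arithmetic (tensor values and shapes here are illustrative; the kernel operates on its own chunked tensors):

    import torch

    # Illustrative shapes: [batch, seq_len] per-token loss and a 0/1 completion mask.
    per_token_loss = torch.rand(4, 16)
    mask = (torch.rand(4, 16) > 0.2).float()
    max_completion_length = 16

    # "grpo": mean over each sequence's own tokens, then mean over sequences.
    grpo = ((per_token_loss * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).sum() / mask.shape[0]

    # "bnpo": one global mean over every unmasked token in the batch.
    bnpo = (per_token_loss * mask).sum() / mask.sum().clamp(min=1.0)

    # "dr_grpo": fixed denominator batch_size * max_completion_length, so the
    # normalizer does not depend on how long each sampled completion happens to be.
    dr_grpo = (per_token_loss * mask).sum() / (mask.shape[0] * max_completion_length)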
@@ -91,6 +107,8 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
 beta=0.04,
 epsilon_low=0.2,
 epsilon_high=0.2,
+loss_type="bnpo",
+max_completion_length=None,
 temperature=1.0,
 compiled=True,
 use_ref_model=True,
@@ -110,6 +128,8 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
 ref_weight (torch.Tensor, optional): Reference model weight tensor. Shape: (vocab_size, hidden_size)
 ref_bias (torch.Tensor, optional): Reference model bias tensor. Shape: (vocab_size,)
 beta (float): Weight for the KL penalty
+loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
+max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
 temperature (float): Temperature for the logits
 compiled (bool): Whether to use torch compile
 use_ref_model (bool): Whether to use a reference model
@@ -134,6 +154,8 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
 beta=beta,
 epsilon_low=epsilon_low,
 epsilon_high=epsilon_high,
+loss_type=loss_type,
+max_completion_length=max_completion_length,
 temperature=temperature,
 compiled=compiled,
 use_ref_model=use_ref_model,
@@ -161,6 +183,8 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
 None, # grad_beta
 None, # grad_epsilon_low
 None, # grad_epsilon_high
+None, # grad_loss_type (string, not differentiable)
+None, # grad_max_completion_length (int, not differentiable)
 None, # grad_temperature
 None, # grad_compiled
 None, # grad_use_ref_model
@@ -179,6 +203,8 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
 chunk_size: int = 1,
 epsilon_low: float = 0.2,
 epsilon_high: float = 0.2,
+loss_type: str = "bnpo",
+max_completion_length: int | None = None,
 temperature: float = 1.0,
 ):
 """
@@ -189,6 +215,8 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
 chunk_size (int): Size of chunks for processing.
 epsilon_low (float): Lower bound for the importance sampling ratio.
 epsilon_high (float): Upper bound for the importance sampling ratio.
+loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
+max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
 temperature (float): Temperature for the logits.
 """
 super().__init__()
@@ -198,6 +226,8 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
 self.chunk_size = chunk_size
 self.epsilon_low = epsilon_low
 self.epsilon_high = epsilon_high
+self.loss_type = loss_type
+self.max_completion_length = max_completion_length
 self.temperature = temperature

 def forward(
@@ -229,6 +259,8 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
 self.beta,
 self.epsilon_low,
 self.epsilon_high,
+self.loss_type,
+self.max_completion_length,
 self.temperature,
 self.compiled,
 self.use_ref_model,
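
Taken together, the normalization is now chosen at construction time and threaded through the module's forward into the autograd function. A hedged usage sketch (only loss_type and max_completion_length are confirmed by this diff; the other constructor defaults are left implicit):

    from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOLoss

    # Default behavior is unchanged: "bnpo" batch-normalized per-token loss.
    bnpo_loss = LigerFusedLinearGRPOLoss()

    # "dr_grpo" needs the fixed normalizer; constructing without
    # max_completion_length trips the base class's forward-time assertion.
    dr_grpo_loss = LigerFusedLinearGRPOLoss(loss_type="dr_grpo", max_completion_length=256)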
--- a/liger_kernel/ops/kl_div.py
+++ b/liger_kernel/ops/kl_div.py
@@ -6,6 +6,7 @@ import triton.language as tl

 from liger_kernel.ops.utils import ensure_contiguous
 from liger_kernel.ops.utils import is_hip
+from liger_kernel.utils import infer_device


 def get_num_warps(BLOCK_SIZE):
@@ -115,9 +116,12 @@ def _kldiv_kernel_backward(

 def kldiv_forward_triton(y_pred, y_true, log_target, reduction, eps):  # [BT, V]
     BT, V = y_pred.shape
-
-    BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))
-    num_warps = get_num_warps(BLOCK_SIZE)
+    BLOCK_SIZE = (
+        min(8192, triton.next_power_of_2(V))
+        if infer_device() == "xpu"
+        else min(MAX_FUSED_SIZE, triton.next_power_of_2(V))
+    )
+    num_warps = 32 if infer_device() == "xpu" else get_num_warps(BLOCK_SIZE)

     grid = (BT,)
     reduction = _str_to_reduction_mode[reduction]
@@ -155,9 +159,12 @@ def kldiv_forward_triton(y_pred, y_true, log_target, reduction, eps):  # [BT, V]

 def kldiv_backward_triton(target, grad_output, new_grads, log_target):
     BT, V = target.shape
-
-    BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))
-    num_warps = get_num_warps(BLOCK_SIZE)
+    BLOCK_SIZE = (
+        min(8192, triton.next_power_of_2(V))
+        if infer_device() == "xpu"
+        else min(MAX_FUSED_SIZE, triton.next_power_of_2(V))
+    )
+    num_warps = 32 if infer_device() == "xpu" else get_num_warps(BLOCK_SIZE)

     grid = (BT,)
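
Both kldiv launch sites now branch on the detected device before choosing Triton launch parameters. A condensed sketch of the selection (MAX_FUSED_SIZE's value and the non-XPU warp heuristic below are assumptions, included only to make the snippet self-contained):

    import triton
    from liger_kernel.utils import infer_device

    MAX_FUSED_SIZE = 65536  # assumed value; the real constant lives in kl_div.py

    def get_num_warps(block_size):
        # Illustrative stand-in for the module's warp heuristic.
        return max(4, min(32, block_size // 256))

    def kldiv_launch_params(V):
        # XPU caps the block at 8192 elements and pins num_warps to 32;
        # other devices keep the original MAX_FUSED_SIZE / get_num_warps path.
        if infer_device() == "xpu":
            return min(8192, triton.next_power_of_2(V)), 32
        block_size = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))
        return block_size, get_num_warps(block_size)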
 
--- a/liger_kernel_nightly-0.5.6.dev20250408223717.dist-info/METADATA
+++ b/liger_kernel_nightly-0.5.6.dev20250411210855.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.6.dev20250408223717
+Version: 0.5.6.dev20250411210855
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation
--- a/liger_kernel_nightly-0.5.6.dev20250408223717.dist-info/RECORD
+++ b/liger_kernel_nightly-0.5.6.dev20250411210855.dist-info/RECORD
@@ -7,10 +7,10 @@ liger_kernel/chunked_loss/cpo_loss.py,sha256=Gzz1eU4kgcbdubFVRy55e8A1Cr-r45UgNic
 liger_kernel/chunked_loss/dpo_loss.py,sha256=xZwGqS04si9zXyob95SAdalC-hajZg8fWINqiqffN8k,5855
 liger_kernel/chunked_loss/functional.py,sha256=9G3nKm-Bi7uoZRFkL8wwGMl6juDl4bSzDvTa5GHZPzg,955
 liger_kernel/chunked_loss/fused_linear_distillation.py,sha256=ooR-qnZCyWJN935oHCSWLaKKKyaYERyhNczRGi1VOiw,11935
-liger_kernel/chunked_loss/fused_linear_ppo.py,sha256=-E4AuWY-y2bMo_kAmEQBgQ92UJh3L5IiCRGVcfMJOCE,12731
+liger_kernel/chunked_loss/fused_linear_ppo.py,sha256=AA19cpv6D8mo5RbSK5GRCcZoOSnpxV_Z1eJlAsC5eic,13434
 liger_kernel/chunked_loss/fused_linear_preference.py,sha256=ojB42jYPu0c4ki96Ft-hy7Sf6fh_WikG-aWNrlZzSio,18362
 liger_kernel/chunked_loss/fused_linear_unpaired_preference.py,sha256=RiuK3UtRwH9T6jZ36sA8Urj-TVuOLOO2syLg_JOQapY,13437
-liger_kernel/chunked_loss/grpo_loss.py,sha256=6Mb4ZT6MfnOr4Xo681rMR0LKkhzJhInvQp8wp2YVMK0,8913
+liger_kernel/chunked_loss/grpo_loss.py,sha256=eh6mErFUZsSQrgRRefuXdk-LG0gS7Rg2r-U9CtbH3eU,10834
 liger_kernel/chunked_loss/jsd_loss.py,sha256=u2ahkuHsbhpNaKcpBCz5gCMDk9ou-P04DHji592dIBo,7067
 liger_kernel/chunked_loss/kto_loss.py,sha256=llVCe6DkcpCo57seGWoMikaQVFApx764jsmSbQyqwQY,7529
 liger_kernel/chunked_loss/orpo_loss.py,sha256=nu9UYG16dcMw93lvHi4_hYs3Q0FK1KnlmMRj7OpYU8s,4872
@@ -23,7 +23,7 @@ liger_kernel/ops/fused_linear_jsd.py,sha256=CSoprxb-YcJy-YUKiTcYkxN8sb9h2kdk_iHu
 liger_kernel/ops/geglu.py,sha256=axGvCIvlBzuluoAIrWTsp2iZM4BFKNInkPov8YVvH9E,4126
 liger_kernel/ops/group_norm.py,sha256=qD4D4lSjSgVtO52EBNLC2iTseALRgPgqXE50U2woggk,10837
 liger_kernel/ops/jsd.py,sha256=onHp5T3MbvJaVz5Vup7Ww6EQp_HTaZeayTjJk6FgQMY,7042
-liger_kernel/ops/kl_div.py,sha256=NkG7D6_DnPBzr-ohhYiQbRBnq_fbGmpn5UU7y0UBKQo,8420
+liger_kernel/ops/kl_div.py,sha256=ZjGdDLKWksHT9dZ0xF_TDgAkj5cuMTwwT5tr9E-_24o,8734
 liger_kernel/ops/layer_norm.py,sha256=vWCyOm-F2GMAilB-ozJcFeUQQLCJoTE_uiXq-_0uYuI,8356
 liger_kernel/ops/qwen2vl_mrope.py,sha256=3GExhYpLgB4VUtyZyjRk8XjEur3W4EWF6HQ67ML5vBU,8481
 liger_kernel/ops/rms_norm.py,sha256=PP27OIBmV9By63i13jot9ylDowW0nuxY_JFIkaPLgL4,12078
@@ -74,9 +74,9 @@ liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7H
 liger_kernel/transformers/trainer/orpo_trainer.py,sha256=pdekW7l6Qg_aqa5SYKYlSWUF8m3lkOFvFLcIMEHrz9s,8338
 liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
 liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
-liger_kernel_nightly-0.5.6.dev20250408223717.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
-liger_kernel_nightly-0.5.6.dev20250408223717.dist-info/METADATA,sha256=ZSAGbY1ejoXoRQzTkkCjTwZd-OQxWdTV1IukEftepgU,23297
-liger_kernel_nightly-0.5.6.dev20250408223717.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
-liger_kernel_nightly-0.5.6.dev20250408223717.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
-liger_kernel_nightly-0.5.6.dev20250408223717.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
-liger_kernel_nightly-0.5.6.dev20250408223717.dist-info/RECORD,,
+liger_kernel_nightly-0.5.6.dev20250411210855.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel_nightly-0.5.6.dev20250411210855.dist-info/METADATA,sha256=mX6Na52mRBO2g2I7Qqj34QGM17tMQAZLNjE7XX0g9fA,23297
+liger_kernel_nightly-0.5.6.dev20250411210855.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel_nightly-0.5.6.dev20250411210855.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+liger_kernel_nightly-0.5.6.dev20250411210855.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel_nightly-0.5.6.dev20250411210855.dist-info/RECORD,,