PyPI - liger-kernel-nightly - Versions diffs - 0.6.2.dev20250830153353__py3-none-any.whl → 0.6.2.dev20250903164435__py3-none-any.whl - Mend

liger-kernel-nightly 0.6.2.dev20250830153353__py3-none-any.whl → 0.6.2.dev20250903164435__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

liger_kernel/chunked_loss/fused_linear_ppo.py CHANGED Viewed

@@ -34,6 +34,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
         beta=0.04,
         loss_type="bnpo",
         max_completion_length=None,
+        importance_sampling_level="token",
         temperature=1.0,
         compiled=True,
         use_ref_model=False,
@@ -92,6 +93,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
             beta=beta,
             loss_type=loss_type,
             max_completion_length=max_completion_length,
+            importance_sampling_level=importance_sampling_level,
             temperature=temperature,
             use_ref_model=use_ref_model,
             ppo_loss_fn=cls.ppo_loss_fn,
@@ -261,6 +263,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
         beta=0.04,
         loss_type="bnpo",
         max_completion_length=None,
+        importance_sampling_level="token",
         temperature=1.0,
         use_ref_model=False,
         ppo_loss_fn=None,
@@ -292,6 +295,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
             beta=beta,
             loss_type=loss_type,
             max_completion_length=max_completion_length,
+            importance_sampling_level=importance_sampling_level,
         )
         return chunk_loss, chunk_metrics

liger_kernel/chunked_loss/grpo_loss.py CHANGED Viewed

@@ -31,6 +31,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
         beta=0.04,
         loss_type="bnpo",  # ["grpo", "bnpo", "dr_grpo"]
         max_completion_length=None,  # Required for dr_grpo
+        importance_sampling_level="token",  # ["token", "sequence"] - new parameter for GSPO
         **kwargs,
     ):
         """GRPO Loss Function matching GRPOTrainer implementation."""
@@ -50,7 +51,22 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
         # Compute policy gradient loss with importance sampling ratio
         old_per_token_logps = old_per_token_logps if old_per_token_logps is not None else per_token_logps.detach()
-        coef_1 = torch.exp(per_token_logps - old_per_token_logps)
+        log_ratio = per_token_logps - old_per_token_logps
+        if importance_sampling_level == "token":
+            log_importance_weights = log_ratio
+        elif importance_sampling_level == "sequence":
+            log_importance_weights = (log_ratio * attention_mask).sum(-1) / attention_mask.sum(-1).clamp(min=1.0)
+            log_importance_weights = log_importance_weights.unsqueeze(-1)
+        else:
+            raise ValueError(
+                f"Unknown importance sampling level: {importance_sampling_level}. Possible values are 'token' "
+                "and 'sequence'."
+            )
+        # From here, log_importance_weights (and all subsequent tensors, coef_1, coef_2, etc.) shape depends on
+        # importance_sampling_level: "token" level: (B, T); "sequence" level: (B, 1)
+        coef_1 = torch.exp(log_importance_weights)
         coef_2 = clip_coef_fn(coef_1, epsilon_low, epsilon_high)
         per_token_loss1 = coef_1 * advantages.unsqueeze(1)
         per_token_loss2 = coef_2 * advantages.unsqueeze(1)
@@ -85,9 +101,19 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
         metrics = []
         if beta != 0.0:
             metrics.append(((kl_div * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0)))
-        is_clipped = ((coef_1 < 1 - epsilon_low) & (advantages.unsqueeze(1) < 0)) | (
-            (coef_1 > 1 + epsilon_high) & (advantages.unsqueeze(1) > 0)
-        )
+        # Adjust clipping metric calculation based on importance sampling level
+        if importance_sampling_level == "token":
+            is_clipped = ((coef_1 < 1 - epsilon_low) & (advantages.unsqueeze(1) < 0)) | (
+                (coef_1 > 1 + epsilon_high) & (advantages.unsqueeze(1) > 0)
+            )
+        else:  # sequence level
+            # For sequence level, coef_1 is shape (B, 1), advantages is shape (B,)
+            is_clipped = ((coef_1.squeeze(-1) < 1 - epsilon_low) & (advantages < 0)) | (
+                (coef_1.squeeze(-1) > 1 + epsilon_high) & (advantages > 0)
+            )
+            is_clipped = is_clipped.unsqueeze(1).expand_as(attention_mask)
         metrics.append((is_clipped * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0))
         return loss, metrics
@@ -111,6 +137,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
         epsilon_high=0.2,
         loss_type="bnpo",
         max_completion_length=None,
+        importance_sampling_level="token",
         temperature=1.0,
         compiled=True,
         use_ref_model=True,
@@ -132,6 +159,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
             beta (float): Weight for the KL penalty
             loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
             max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
+            importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token".
             temperature (float): Temperature for the logits
             compiled (bool): Whether to use torch compile
             use_ref_model (bool): Whether to use a reference model
@@ -162,6 +190,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
             compiled=compiled,
             use_ref_model=use_ref_model,
             chunk_size=chunk_size,
+            importance_sampling_level=importance_sampling_level,
         )
     @staticmethod
@@ -187,6 +216,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
             None,  # grad_epsilon_high
             None,  # grad_loss_type (string, not differentiable)
             None,  # grad_max_completion_length (int, not differentiable)
+            None,  # grad_importance_sampling_level (string, not differentiable)
             None,  # grad_temperature
             None,  # grad_compiled
             None,  # grad_use_ref_model
@@ -207,6 +237,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
         epsilon_high: float = 0.2,
         loss_type: str = "bnpo",
         max_completion_length: Optional[int] = None,
+        importance_sampling_level: str = "token",
         temperature: float = 1.0,
     ):
         """
@@ -219,6 +250,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
             epsilon_high (float): Upper bound for the importance sampling ratio.
             loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
             max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
+            importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token".
             temperature (float): Temperature for the logits.
         """
         super().__init__()
@@ -230,6 +262,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
         self.epsilon_high = epsilon_high
         self.loss_type = loss_type
         self.max_completion_length = max_completion_length
+        self.importance_sampling_level = importance_sampling_level
         self.temperature = temperature
     def forward(
@@ -263,6 +296,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
             self.epsilon_high,
             self.loss_type,
             self.max_completion_length,
+            self.importance_sampling_level,
             self.temperature,
             self.compiled,
             self.use_ref_model,

liger_kernel/transformers/multi_token_attention.py CHANGED Viewed

@@ -9,7 +9,7 @@ from liger_kernel.ops.multi_token_attention import LigerMultiTokenAttentionFunct
 class LigerMultiTokenAttention(nn.Module):
-    """
+    r"""
     Multi-Token Attention:
         out = mask_{0}(conv2d(softmax(mask_{-\inf}(scores))))

{liger_kernel_nightly-0.6.2.dev20250830153353.dist-info → liger_kernel_nightly-0.6.2.dev20250903164435.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.6.2.dev20250830153353
+Version: 0.6.2.dev20250903164435
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.6.2.dev20250830153353.dist-info → liger_kernel_nightly-0.6.2.dev20250903164435.dist-info}/RECORD RENAMED Viewed

@@ -8,10 +8,10 @@ liger_kernel/chunked_loss/cpo_loss.py,sha256=Gzz1eU4kgcbdubFVRy55e8A1Cr-r45UgNic
 liger_kernel/chunked_loss/dpo_loss.py,sha256=I83khNs3QQjuhr8U3NIOAACkbse6DNiBV-TulPZ0lXw,9006
 liger_kernel/chunked_loss/functional.py,sha256=-XPDbLml9dHmvoSU2VNTUrBDFehuzvuAGPikVetBMtI,1132
 liger_kernel/chunked_loss/fused_linear_distillation.py,sha256=ooR-qnZCyWJN935oHCSWLaKKKyaYERyhNczRGi1VOiw,11935
-liger_kernel/chunked_loss/fused_linear_ppo.py,sha256=AA19cpv6D8mo5RbSK5GRCcZoOSnpxV_Z1eJlAsC5eic,13434
+liger_kernel/chunked_loss/fused_linear_ppo.py,sha256=ZjpNP5VC-tXXIKb4AckkQ3iWWQeej-JoG4StJq3N0wg,13650
 liger_kernel/chunked_loss/fused_linear_preference.py,sha256=FIH85uUXAOgYx5Ax8MjFhJHVu-2pKtY7wSegd0zSyyY,18336
 liger_kernel/chunked_loss/fused_linear_unpaired_preference.py,sha256=RiuK3UtRwH9T6jZ36sA8Urj-TVuOLOO2syLg_JOQapY,13437
-liger_kernel/chunked_loss/grpo_loss.py,sha256=kuqHkYV383sUxqJN-DMsfADHi2hxHVyKx5S24TNc8bQ,10866
+liger_kernel/chunked_loss/grpo_loss.py,sha256=SkZuKoW8K94UbWR-OtfopsQkuQ8tFOr_90AGR6_Mhes,12844
 liger_kernel/chunked_loss/jsd_loss.py,sha256=gRhnmB8xwuz7FcMJi5v5eyBsq01owaCbcyyrF4rYtY0,7133
 liger_kernel/chunked_loss/kto_loss.py,sha256=llVCe6DkcpCo57seGWoMikaQVFApx764jsmSbQyqwQY,7529
 liger_kernel/chunked_loss/orpo_loss.py,sha256=nu9UYG16dcMw93lvHi4_hYs3Q0FK1KnlmMRj7OpYU8s,4872
@@ -59,7 +59,7 @@ liger_kernel/transformers/kl_div.py,sha256=WLffFbh1EExD2Eb1F7lN11fo9JJC-0751WJjZ
 liger_kernel/transformers/layer_norm.py,sha256=c9pk3PEasOKYR0rhe5e5nNrnYKVCEW4VC8S6LpCq9EQ,906
 liger_kernel/transformers/llama4_rope.py,sha256=kS6PSHEwf3dS7hD7C7p8S0geugx2EMCiP0h0F7LsUoY,3639
 liger_kernel/transformers/monkey_patch.py,sha256=pG3Yf0fMg4_0pAncc2wLtpdfXvmC5CROpNJ43-MmElM,93075
-liger_kernel/transformers/multi_token_attention.py,sha256=l9VDICK0dfmifUDW668hGscP8AHq2rYcM2oGUa3baRQ,1751
+liger_kernel/transformers/multi_token_attention.py,sha256=K3NIY9_5TPgZ4_Rahn0xnkMXxD_fmlJHK4CWGYvGQp0,1752
 liger_kernel/transformers/qwen2vl_mrope.py,sha256=5EwSqrMdsL9MYspeBMXBsNJKvH0MOmRrtJXAJlnnlOI,1047
 liger_kernel/transformers/rms_norm.py,sha256=vkekcvTeWY8vL4H6hg3t0XeY0Ew_3OFMPHuzqlxPPVw,2719
 liger_kernel/transformers/rope.py,sha256=ZTrTORSAyfcFIKjk6XEeYmk4ROH7xXED9L4g2NFntlE,999
@@ -96,9 +96,9 @@ liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7H
 liger_kernel/transformers/trainer/orpo_trainer.py,sha256=tX0h63aOFe3rNqTmk6JpMf75UPo981yzEa6TghnjS0Q,5370
 liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
 liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
-liger_kernel_nightly-0.6.2.dev20250830153353.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
-liger_kernel_nightly-0.6.2.dev20250830153353.dist-info/METADATA,sha256=pdvNhCMdDJLC-ipmXC0fO7Nw_8EP9e0oNfbnU_TCPVg,24504
-liger_kernel_nightly-0.6.2.dev20250830153353.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
-liger_kernel_nightly-0.6.2.dev20250830153353.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
-liger_kernel_nightly-0.6.2.dev20250830153353.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
-liger_kernel_nightly-0.6.2.dev20250830153353.dist-info/RECORD,,
+liger_kernel_nightly-0.6.2.dev20250903164435.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel_nightly-0.6.2.dev20250903164435.dist-info/METADATA,sha256=BgiSTSMznb0cvZyFqU68T0sEIAOBcf9hvuO6jIPCcC8,24504
+liger_kernel_nightly-0.6.2.dev20250903164435.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel_nightly-0.6.2.dev20250903164435.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+liger_kernel_nightly-0.6.2.dev20250903164435.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel_nightly-0.6.2.dev20250903164435.dist-info/RECORD,,

{liger_kernel_nightly-0.6.2.dev20250830153353.dist-info → liger_kernel_nightly-0.6.2.dev20250903164435.dist-info}/LICENSE RENAMED Viewed

File without changes

{liger_kernel_nightly-0.6.2.dev20250830153353.dist-info → liger_kernel_nightly-0.6.2.dev20250903164435.dist-info}/NOTICE RENAMED Viewed

File without changes

{liger_kernel_nightly-0.6.2.dev20250830153353.dist-info → liger_kernel_nightly-0.6.2.dev20250903164435.dist-info}/WHEEL RENAMED Viewed

File without changes

{liger_kernel_nightly-0.6.2.dev20250830153353.dist-info → liger_kernel_nightly-0.6.2.dev20250903164435.dist-info}/top_level.txt RENAMED Viewed

File without changes

liger-kernel-nightly 0.6.2.dev20250830153353__py3-none-any.whl → 0.6.2.dev20250903164435__py3-none-any.whl

liger-kernel-nightly 0.6.2.dev20250830153353__py3-none-any.whl → 0.6.2.dev20250903164435__py3-none-any.whl