liger-kernel-nightly 0.5.10.dev20250624183504__py3-none-any.whl → 0.6.4.dev20251121224847__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of liger-kernel-nightly might be problematic.
- liger_kernel/chunked_loss/__init__.py +1 -0
- liger_kernel/chunked_loss/cosine_similarity_loss.py +136 -0
- liger_kernel/chunked_loss/dpo_loss.py +54 -3
- liger_kernel/chunked_loss/functional.py +2 -0
- liger_kernel/chunked_loss/fused_linear_distillation.py +13 -2
- liger_kernel/chunked_loss/fused_linear_ppo.py +25 -5
- liger_kernel/chunked_loss/grpo_loss.py +46 -9
- liger_kernel/chunked_loss/jsd_loss.py +23 -7
- liger_kernel/ops/cross_entropy.py +118 -62
- liger_kernel/ops/fused_add_rms_norm.py +412 -0
- liger_kernel/ops/fused_linear_cross_entropy.py +113 -21
- liger_kernel/ops/geglu.py +1 -1
- liger_kernel/ops/grpo_loss.py +3 -1
- liger_kernel/ops/layer_norm.py +133 -79
- liger_kernel/ops/llama4_rope.py +225 -0
- liger_kernel/ops/poly_norm.py +386 -0
- liger_kernel/ops/rms_norm.py +2 -2
- liger_kernel/ops/rope.py +1 -1
- liger_kernel/ops/swiglu.py +1 -1
- liger_kernel/ops/tiled_mlp.py +136 -0
- liger_kernel/transformers/__init__.py +59 -0
- liger_kernel/transformers/cross_entropy.py +8 -3
- liger_kernel/transformers/experimental/__init__.py +5 -0
- liger_kernel/transformers/functional.py +38 -6
- liger_kernel/transformers/fused_add_rms_norm.py +39 -0
- liger_kernel/transformers/fused_linear_cross_entropy.py +16 -4
- liger_kernel/transformers/grpo_loss.py +56 -1
- liger_kernel/transformers/llama4_rope.py +93 -0
- liger_kernel/transformers/model/falcon_h1.py +122 -0
- liger_kernel/transformers/model/gemma.py +28 -8
- liger_kernel/transformers/model/gemma2.py +31 -8
- liger_kernel/transformers/model/gemma3.py +100 -110
- liger_kernel/transformers/model/glm4.py +18 -5
- liger_kernel/transformers/model/glm4v.py +163 -0
- liger_kernel/transformers/model/glm4v_moe.py +172 -0
- liger_kernel/transformers/model/hunyuan_v1.py +134 -0
- liger_kernel/transformers/model/internvl.py +157 -0
- liger_kernel/transformers/model/llama.py +26 -7
- liger_kernel/transformers/model/llama4.py +121 -0
- liger_kernel/transformers/model/llava.py +18 -6
- liger_kernel/transformers/model/loss_utils.py +34 -3
- liger_kernel/transformers/model/mistral.py +17 -10
- liger_kernel/transformers/model/mixtral.py +24 -9
- liger_kernel/transformers/model/mllama.py +18 -7
- liger_kernel/transformers/model/olmo2.py +18 -5
- liger_kernel/transformers/model/olmo3.py +142 -0
- liger_kernel/transformers/model/output_classes.py +147 -0
- liger_kernel/transformers/model/paligemma.py +41 -5
- liger_kernel/transformers/model/phi3.py +24 -159
- liger_kernel/transformers/model/qwen2.py +26 -4
- liger_kernel/transformers/model/qwen2_5_vl.py +21 -8
- liger_kernel/transformers/model/qwen2_vl.py +24 -7
- liger_kernel/transformers/model/qwen3.py +22 -6
- liger_kernel/transformers/model/qwen3_moe.py +27 -7
- liger_kernel/transformers/model/qwen3_next.py +146 -0
- liger_kernel/transformers/model/qwen3_vl.py +150 -0
- liger_kernel/transformers/model/qwen3_vl_moe.py +126 -0
- liger_kernel/transformers/model/smollm3.py +199 -0
- liger_kernel/transformers/model/smolvlm.py +158 -0
- liger_kernel/transformers/monkey_patch.py +1278 -116
- liger_kernel/transformers/multi_token_attention.py +1 -1
- liger_kernel/transformers/poly_norm.py +42 -0
- liger_kernel/transformers/rms_norm.py +7 -0
- liger_kernel/transformers/rope.py +43 -0
- liger_kernel/transformers/swiglu.py +17 -0
- liger_kernel/transformers/tiled_mlp.py +133 -0
- {liger_kernel_nightly-0.5.10.dev20250624183504.dist-info → liger_kernel_nightly-0.6.4.dev20251121224847.dist-info}/METADATA +29 -24
- liger_kernel_nightly-0.6.4.dev20251121224847.dist-info/RECORD +118 -0
- liger_kernel_nightly-0.5.10.dev20250624183504.dist-info/RECORD +0 -95
- {liger_kernel_nightly-0.5.10.dev20250624183504.dist-info → liger_kernel_nightly-0.6.4.dev20251121224847.dist-info}/LICENSE +0 -0
- {liger_kernel_nightly-0.5.10.dev20250624183504.dist-info → liger_kernel_nightly-0.6.4.dev20251121224847.dist-info}/NOTICE +0 -0
- {liger_kernel_nightly-0.5.10.dev20250624183504.dist-info → liger_kernel_nightly-0.6.4.dev20251121224847.dist-info}/WHEEL +0 -0
- {liger_kernel_nightly-0.5.10.dev20250624183504.dist-info → liger_kernel_nightly-0.6.4.dev20251121224847.dist-info}/top_level.txt +0 -0
liger_kernel/chunked_loss/__init__.py
@@ -1,3 +1,4 @@
+from liger_kernel.chunked_loss.cosine_similarity_loss import LigerFusedLinearCosineSimilarityLoss  # noqa:F401
 from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOLoss  # noqa: F401
 from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOLoss  # noqa: F401
 from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOLoss  # noqa: F401
liger_kernel/chunked_loss/cosine_similarity_loss.py
@@ -0,0 +1,136 @@
+from typing import Tuple
+from typing import Union
+
+import torch
+import torch.nn.functional as F
+
+from liger_kernel.chunked_loss.fused_linear_distillation import LigerFusedLinearDistillationBase
+
+
+class LigerFusedLinearCosineSimilarityFunction(LigerFusedLinearDistillationBase):
+    @staticmethod
+    def distillation_loss_fn(student_logits, teacher_logits, beta=1.0):
+        """
+        Compute Cosine loss (Cosine Similarity Loss).
+        Args:
+            student_logits (torch.Tensor): Logits of student tokens. Shape: (batch_size * seq_len,).
+            teacher_logits (torch.Tensor): Logits of teacher tokens. Shape: (batch_size * seq_len,).
+            beta: Coefficient beta of generalized Cosine Similarity in the interval [0, 1]. Default: `1.0` (float): .
+        Returns:
+            torch.Tensor: cosine similarity loss
+        """
+        student_norm = F.normalize(student_logits, p=2, dim=-1)
+        teacher_norm = F.normalize(teacher_logits, p=2, dim=-1)
+
+        cosine_sim = F.cosine_similarity(student_norm, teacher_norm, dim=-1)
+        loss = beta * (1 - cosine_sim)
+        return loss.sum()
+
+    @classmethod
+    def forward(
+        cls,
+        ctx,
+        student_input: torch.Tensor,
+        student_weight: torch.Tensor,
+        teacher_input: torch.Tensor,
+        teacher_weight: torch.Tensor,
+        true_labels: torch.LongTensor,
+        student_bias: torch.Tensor,
+        teacher_bias: torch.Tensor,
+        weight_hard_loss: float = 0.5,
+        weight_soft_loss: float = 0.5,
+        beta: float = 0.5,
+        ignore_index: int = -100,
+        temperature: float = 1.0,
+        compiled: bool = True,
+        chunk_size: int = 1024,
+        return_soft_hard_loss: bool = False,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
+        return super().forward(
+            cls=cls,
+            ctx=ctx,
+            student_input=student_input,
+            student_weight=student_weight,
+            teacher_input=teacher_input,
+            teacher_weight=teacher_weight,
+            target=true_labels,
+            student_bias=student_bias,
+            teacher_bias=teacher_bias,
+            chunk_size=chunk_size,
+            weight_hard_loss=weight_hard_loss,
+            weight_soft_loss=weight_soft_loss,
+            beta=beta,
+            ignore_index=ignore_index,
+            temperature=temperature,
+            compiled=compiled,
+            return_soft_hard_loss=return_soft_hard_loss,
+        )
+
+    @staticmethod
+    def backward(ctx, grad_output, *args):
+        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output, *args)[:6]
+
+        return (
+            *grads,
+            None,  # teacher_bias
+            None,  # weight_hard_loss
+            None,  # weight_soft_loss
+            None,  # beta
+            None,  # ignore_index
+            None,  # temperature
+            None,  # compiled
+            None,  # chunk_size
+            None,  # return_soft_hard_loss
+        )
+
+
+class LigerFusedLinearCosineSimilarityLoss(torch.nn.Module):
+    def __init__(
+        self,
+        weight_hard_loss: float = 0.5,
+        weight_soft_loss: float = 0.5,
+        beta: float = 0.5,
+        ignore_index: int = -100,
+        temperature: float = 1.0,
+        compiled: bool = True,
+        chunk_size: int = 1024,
+        return_soft_hard_loss: bool = False,
+    ):
+        super().__init__()
+        assert temperature != 0, "Temperature cannot be 0."
+        self.weight_hard_loss = weight_hard_loss
+        self.weight_soft_loss = weight_soft_loss
+        self.ignore_index = ignore_index
+        self.temperature = temperature
+        self.compiled = compiled
+        self.beta = beta
+        self.chunk_size = chunk_size
+        self.return_soft_hard_loss = return_soft_hard_loss
+
+    def forward(
+        self,
+        student_input: torch.Tensor,
+        student_weight: torch.Tensor,
+        teacher_input: torch.Tensor,
+        teacher_weight: torch.Tensor,
+        true_labels: torch.LongTensor,
+        student_bias: torch.Tensor = None,
+        teacher_bias: torch.Tensor = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
+        return LigerFusedLinearCosineSimilarityFunction.apply(
+            student_input,
+            student_weight,
+            teacher_input,
+            teacher_weight,
+            true_labels,
+            student_bias,
+            teacher_bias,
+            self.weight_hard_loss,
+            self.weight_soft_loss,
+            self.beta,
+            self.ignore_index,
+            self.temperature,
+            self.compiled,
+            self.chunk_size,
+            self.return_soft_hard_loss,
+        )
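Note: a minimal usage sketch of the new LigerFusedLinearCosineSimilarityLoss follows. The tensor shapes and hyperparameter values are illustrative assumptions, not values taken from the package's own tests.

# Hypothetical usage of the new cosine-similarity distillation loss (shapes assumed).
import torch
from liger_kernel.chunked_loss import LigerFusedLinearCosineSimilarityLoss

B_T, H, V = 8, 64, 128  # assumed (batch*seq_len, hidden, vocab) sizes
loss_fn = LigerFusedLinearCosineSimilarityLoss(weight_hard_loss=0.5, weight_soft_loss=0.5, beta=0.5)

student_input = torch.randn(B_T, H, requires_grad=True)
student_weight = torch.randn(V, H, requires_grad=True)
teacher_input = torch.randn(B_T, H)
teacher_weight = torch.randn(V, H)
true_labels = torch.randint(0, V, (B_T,))

loss = loss_fn(student_input, student_weight, teacher_input, teacher_weight, true_labels)
loss.backward()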
liger_kernel/chunked_loss/dpo_loss.py
@@ -13,6 +13,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         ref_chosen_logps=None,
         ref_rejected_logps=None,
         beta=0.1,
+        loss_type="sigmoid",
     ):
         """
         Paper: https://arxiv.org/pdf/2305.18290
@@ -48,8 +49,50 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         chosen_rewards = beta * chosen_logratios
         rejected_rewards = beta * rejected_logratios

-        logits_diff = beta * (chosen_logratios - rejected_logratios)
-        loss = -F.logsigmoid(logits_diff).sum() / (full_target.shape[0] // 2)
+        if loss_type == "sigmoid":
+            logits_diff = beta * (chosen_logratios - rejected_logratios)
+            loss = -F.logsigmoid(logits_diff).sum() / (full_target.shape[0] // 2)
+
+        elif loss_type == "apo_zero":
+            # Eqn (7) of the APO paper (https://huggingface.co/papers/2408.06266)
+            # Use this loss when you believe the chosen outputs are better than your model's default output
+            losses_chosen = 1 - F.sigmoid(beta * chosen_logratios)  # Increase chosen likelihood
+            losses_rejected = F.sigmoid(beta * rejected_logratios)
+            losses = losses_chosen + losses_rejected
+            loss = losses.sum() / (full_target.shape[0] // 2)
+
+        elif loss_type == "apo_down":
+            # Eqn (8) of the APO paper (https://huggingface.co/papers/2408.06266)
+            # Use this loss when you believe the chosen outputs are worse than your model's default output.
+            # Decrease chosen likelihood and decrease rejected likelihood more
+            losses_chosen = F.sigmoid(beta * chosen_logratios)
+            losses_rejected = 1 - F.sigmoid(beta * (chosen_logratios - rejected_logratios))
+            losses = losses_chosen + losses_rejected
+            loss = losses.sum() / (full_target.shape[0] // 2)
+
+        elif loss_type == "sppo_hard":
+            # In the paper (https://huggingface.co/papers/2405.00675), SPPO employs a soft probability approach,
+            # estimated using the PairRM score. The probability calculation is conducted outside of the trainer class.
+            # The version described here is the hard probability version, where P in Equation (4.7) of Algorithm 1 is
+            # set to 1 for the winner and 0 for the loser.
+            a = chosen_logps - ref_chosen_logps
+            b = rejected_logps - ref_rejected_logps
+            losses = (a - 0.5 / beta) ** 2 + (b + 0.5 / beta) ** 2
+            loss = losses.sum() / (full_target.shape[0] // 2)
+
+        elif loss_type == "nca_pair":
+            losses = (
+                -F.logsigmoid(chosen_rewards)
+                - 0.5 * F.logsigmoid(-chosen_rewards)
+                - 0.5 * F.logsigmoid(-rejected_rewards)
+            )
+            loss = losses.sum() / (full_target.shape[0] // 2)
+
+        else:
+            raise ValueError(
+                f"Unsupported loss_type: {loss_type}. Supported types are: sigmoid, apo_zero, apo_down, sppo_hard, nca_pair"
+            )
+
         return loss, chosen_rewards, rejected_rewards

     @classmethod
@@ -70,6 +113,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         use_ref_model=True,
         average_log_prob=False,
         chunk_size=1,
+        loss_type="sigmoid",
     ):
         """
         Fused linear layer with DPO loss.
@@ -108,12 +152,13 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
             ref_bias=ref_bias,
             average_log_prob=average_log_prob,
             chunk_size=chunk_size,
+            loss_type=loss_type,
         )

     @staticmethod
     def backward(ctx, *grad_output):
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-        return *grads, None, None, None, None, None, None, None, None, None, None
+        return *grads, None, None, None, None, None, None, None, None, None, None, None


 class LigerFusedLinearDPOLoss(torch.nn.Module):
@@ -130,6 +175,7 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
         use_ref_model: bool = True,
         average_log_prob: bool = False,
         chunk_size: int = 1,
+        loss_type: str = "sigmoid",
     ):
         """
         Args:
@@ -149,6 +195,10 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
         self.use_ref_model = use_ref_model
         self.average_log_prob = average_log_prob
         self.chunk_size = chunk_size
+        self.loss_type = loss_type
+        supported_loss_types = {"sigmoid", "apo_zero", "apo_down", "sppo_hard", "nca_pair"}
+        if self.loss_type not in supported_loss_types:
+            raise ValueError(f"Unsupported loss_type: {self.loss_type}. Supported types are: {supported_loss_types}")

     def forward(
         self,
@@ -175,4 +225,5 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
             self.use_ref_model,
             self.average_log_prob,
             self.chunk_size,
+            self.loss_type,
         )
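Note: the constructor-level validation above means an unsupported variant fails fast. A minimal selection sketch (argument values are assumptions for illustration):

# Sketch: choosing one of the new DPO loss variants; invalid names raise at construction.
from liger_kernel.chunked_loss import LigerFusedLinearDPOLoss

dpo_default = LigerFusedLinearDPOLoss(beta=0.1, loss_type="sigmoid")  # original DPO objective
dpo_apo = LigerFusedLinearDPOLoss(beta=0.1, loss_type="apo_zero")     # APO, Eqn (7)

try:
    LigerFusedLinearDPOLoss(loss_type="hinge")  # hypothetical unsupported name
except ValueError as err:
    print(err)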
liger_kernel/chunked_loss/functional.py
@@ -1,3 +1,4 @@
+from liger_kernel.chunked_loss.cosine_similarity_loss import LigerFusedLinearCosineSimilarityFunction
 from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOFunction
 from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOFunction
 from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOFunction
@@ -9,6 +10,7 @@ from liger_kernel.chunked_loss.simpo_loss import LigerFusedLinearSimPOFunction
 liger_fused_linear_orpo = LigerFusedLinearORPOFunction.apply
 liger_fused_linear_dpo = LigerFusedLinearDPOFunction.apply
 liger_fused_linear_jsd = LigerFusedLinearJSDFunction.apply
+liger_fused_linear_cosine = LigerFusedLinearCosineSimilarityFunction.apply
 liger_fused_linear_cpo = LigerFusedLinearCPOFunction.apply
 liger_fused_linear_simpo = LigerFusedLinearSimPOFunction.apply
 liger_fused_linear_kto = LigerFusedLinearKTOFunction.apply
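Note: the new functional alias mirrors its siblings. A minimal sketch of calling it directly, passing positional arguments in the order of the Function's forward shown earlier (shapes assumed):

# Sketch: invoking the raw autograd.Function entry point.
import torch
from liger_kernel.chunked_loss.functional import liger_fused_linear_cosine

B_T, H, V = 4, 32, 64  # assumed sizes
loss = liger_fused_linear_cosine(
    torch.randn(B_T, H, requires_grad=True),  # student_input
    torch.randn(V, H, requires_grad=True),    # student_weight
    torch.randn(B_T, H),                      # teacher_input
    torch.randn(V, H),                        # teacher_weight
    torch.randint(0, V, (B_T,)),              # true_labels
    None,                                     # student_bias
    None,                                     # teacher_bias
)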
liger_kernel/chunked_loss/fused_linear_distillation.py
@@ -1,5 +1,7 @@
 from abc import abstractmethod
 from functools import partial
+from typing import Tuple
+from typing import Union

 import torch

@@ -157,8 +159,9 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
         compute_ce_loss=True,
         temperature=1.0,
         compiled=True,
+        return_soft_hard_loss=False,
         **loss_kwargs,
-    ):
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
         """
         Base class for fused linear layer with distillation loss.
         Only need to compute gradients for student model.
@@ -180,6 +183,7 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
             compute_ce_loss (bool): Whether to compute CE loss.
             temperature (float): Temperature to control the input probability distribution. Default: `1.0` (i.e. no scale)
             compiled (bool): Whether to use torch compile for chunk accumulation.
+            return_soft_hard_loss (bool): Whether to return soft and hard losses separately. Default: False.
             loss_kwargs (dict): Other possible arguments that a loss function might need
         """
         CHUNK_SIZE = chunk_size
@@ -187,6 +191,8 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
         grad_inputs = []
         grad_bias = torch.zeros_like(student_bias) if student_bias is not None else None
         loss_acc = torch.zeros((), device=student_input.device)
+        soft_loss_acc = torch.zeros((), device=student_input.device) if return_soft_hard_loss else None
+        hard_loss_acc = torch.zeros((), device=student_input.device) if return_soft_hard_loss else None

         loss_func_to_call = partial(
             LigerFusedLinearDistillationBase._compute_loss,
@@ -247,6 +253,9 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
             )
             grad_weight.add_(chunk_grad_weight)
             loss_acc.add_(chunk_loss)
+            if return_soft_hard_loss:
+                soft_loss_acc.add_(chunk_soft_loss)
+                hard_loss_acc.add_(chunk_hard_loss)
             return chunk_grad_input

         if compiled:
@@ -268,10 +277,12 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
             grad_weight,
             grad_bias,
         )
+        if return_soft_hard_loss:
+            return loss_acc, soft_loss_acc, hard_loss_acc
         return loss_acc

     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx, grad_output, *args):
         grad_input, grad_weight, grad_bias = ctx.saved_tensors
         if torch.ne(grad_output, torch.tensor(1.0, device=grad_output.device)):
             grad_input = grad_input * grad_output
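Note: the backward signature changes because autograd passes one grad_output per forward output, so a forward that now optionally returns three tensors delivers extra gradients that backward must absorb. A self-contained toy illustrating the pattern (not liger code):

# Toy autograd.Function: extra detached outputs produce extra grad_outputs,
# which backward can absorb with *args, the same contract adopted above.
import torch

class MultiOut(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        loss = (x * x).sum()
        return loss, loss.detach(), loss.detach()  # stand-ins for (loss, soft, hard)

    @staticmethod
    def backward(ctx, grad_output, *args):  # *args holds grads for the extra outputs
        (x,) = ctx.saved_tensors
        return 2 * x * grad_output

x = torch.randn(3, requires_grad=True)
loss, soft, hard = MultiOut.apply(x)
loss.backward()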
liger_kernel/chunked_loss/fused_linear_ppo.py
@@ -32,8 +32,9 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
         epsilon_low=0.2,
         epsilon_high=0.2,
         beta=0.04,
-        loss_type="
+        loss_type="dapo",
         max_completion_length=None,
+        importance_sampling_level="token",
         temperature=1.0,
         compiled=True,
         use_ref_model=False,
@@ -59,7 +60,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
             epsilon_low: Lower bound for clipping the importance sampling ratio
             epsilon_high: Upper bound for clipping the importance sampling ratio
             beta: Weight for the KL penalty
-            loss_type: Type of loss calculation ("grpo", "bnpo", "dr_grpo")
+            loss_type: Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo")
             max_completion_length: Maximum completion length required for "dr_grpo"
             temperature: Temperature for the logits
             compiled: Whether to use torch compile
@@ -92,6 +93,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
             beta=beta,
             loss_type=loss_type,
             max_completion_length=max_completion_length,
+            importance_sampling_level=importance_sampling_level,
             temperature=temperature,
             use_ref_model=use_ref_model,
             ppo_loss_fn=cls.ppo_loss_fn,
@@ -242,6 +244,21 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):

         return loss_acc, tuple(final_metrics)

+    @staticmethod
+    def _compute_dapo_normalizer(attention_mask):
+        """Global active tokens averaged per process."""
+        normalizer = attention_mask.to(torch.float32).sum()
+        world_size = 1
+        if torch.distributed.is_available() and torch.distributed.is_initialized():
+            import torch.distributed as dist
+
+            normalizer = normalizer.clone()
+            dist.all_reduce(normalizer, op=dist.ReduceOp.SUM)
+            world_size = dist.get_world_size()
+
+        normalizer = normalizer / world_size
+        return torch.clamp(normalizer, min=1.0)
+
     @staticmethod
     def _compute_chunk_loss(
         input_chunk,
@@ -259,8 +276,9 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
         epsilon_low=0.2,
         epsilon_high=0.2,
         beta=0.04,
-        loss_type="
+        loss_type="dapo",
         max_completion_length=None,
+        importance_sampling_level="token",
         temperature=1.0,
         use_ref_model=False,
         ppo_loss_fn=None,
@@ -292,6 +310,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
             beta=beta,
             loss_type=loss_type,
             max_completion_length=max_completion_length,
+            importance_sampling_level=importance_sampling_level,
         )

         return chunk_loss, chunk_metrics
@@ -337,10 +356,11 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
             None,  # grad_epsilon_low
             None,  # grad_epsilon_high
             None,  # grad_beta
+            None,  # grad_loss_type
+            None,  # grad_max_completion_length
+            None,  # grad_importance_sampling_level
             None,  # grad_temperature
             None,  # grad_compiled
             None,  # grad_use_ref_model
             None,  # grad_chunk_size
-            None,  # grad_loss_type
-            None,  # grad_max_completion_length
         )
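Note: in the single-process case the DAPO normalizer above reduces to a clamped count of active tokens. A tiny numeric sketch (mask values assumed):

# Single-process arithmetic of _compute_dapo_normalizer: clamp(sum(mask), min=1).
import torch

attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])  # assumed (B, T) completion mask
normalizer = torch.clamp(attention_mask.to(torch.float32).sum(), min=1.0)
print(normalizer)  # tensor(5.)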
liger_kernel/chunked_loss/grpo_loss.py
@@ -29,8 +29,9 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
         epsilon_low=0.2,
         epsilon_high=0.2,
         beta=0.04,
-        loss_type="
+        loss_type="dapo",  # ["grpo", "bnpo", "dr_grpo", "dapo"]
         max_completion_length=None,  # Required for dr_grpo
+        importance_sampling_level="token",  # ["token", "sequence"] - new parameter for GSPO
         **kwargs,
     ):
         """GRPO Loss Function matching GRPOTrainer implementation."""
@@ -50,7 +51,22 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):

         # Compute policy gradient loss with importance sampling ratio
         old_per_token_logps = old_per_token_logps if old_per_token_logps is not None else per_token_logps.detach()
-        coef_1 = torch.exp(per_token_logps - old_per_token_logps)
+        log_ratio = per_token_logps - old_per_token_logps
+
+        if importance_sampling_level == "token":
+            log_importance_weights = log_ratio
+        elif importance_sampling_level == "sequence":
+            log_importance_weights = (log_ratio * attention_mask).sum(-1) / attention_mask.sum(-1).clamp(min=1.0)
+            log_importance_weights = log_importance_weights.unsqueeze(-1)
+        else:
+            raise ValueError(
+                f"Unknown importance sampling level: {importance_sampling_level}. Possible values are 'token' "
+                "and 'sequence'."
+            )
+
+        # From here, log_importance_weights (and all subsequent tensors, coef_1, coef_2, etc.) shape depends on
+        # importance_sampling_level: "token" level: (B, T); "sequence" level: (B, 1)
+        coef_1 = torch.exp(log_importance_weights)
         coef_2 = clip_coef_fn(coef_1, epsilon_low, epsilon_high)
         per_token_loss1 = coef_1 * advantages.unsqueeze(1)
         per_token_loss2 = coef_2 * advantages.unsqueeze(1)
@@ -78,6 +94,9 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
             if max_completion_length is None:
                 raise ValueError("max_completion_length must be provided for loss_type 'dr_grpo'")
             loss = (per_token_loss * attention_mask).sum() / (full_attention_mask.shape[0] * max_completion_length)
+        elif loss_type == "dapo":
+            loss_normalizer = LigerFusedLinearPPOBase._compute_dapo_normalizer(full_attention_mask)
+            loss = (per_token_loss * attention_mask).sum() / loss_normalizer
         else:
             raise ValueError(f"Unknown loss type: {loss_type}")

@@ -85,9 +104,19 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
         metrics = []
         if beta != 0.0:
             metrics.append(((kl_div * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0)))
-        is_clipped = ((coef_1 < 1 - epsilon_low) & (advantages.unsqueeze(1) < 0)) | (
-            (coef_1 > 1 + epsilon_high) & (advantages.unsqueeze(1) > 0)
-        )
+
+        # Adjust clipping metric calculation based on importance sampling level
+        if importance_sampling_level == "token":
+            is_clipped = ((coef_1 < 1 - epsilon_low) & (advantages.unsqueeze(1) < 0)) | (
+                (coef_1 > 1 + epsilon_high) & (advantages.unsqueeze(1) > 0)
+            )
+        else:  # sequence level
+            # For sequence level, coef_1 is shape (B, 1), advantages is shape (B,)
+            is_clipped = ((coef_1.squeeze(-1) < 1 - epsilon_low) & (advantages < 0)) | (
+                (coef_1.squeeze(-1) > 1 + epsilon_high) & (advantages > 0)
+            )
+            is_clipped = is_clipped.unsqueeze(1).expand_as(attention_mask)
+
         metrics.append((is_clipped * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0))
         return loss, metrics

@@ -109,8 +138,9 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
         beta=0.04,
         epsilon_low=0.2,
         epsilon_high=0.2,
-        loss_type="
+        loss_type="dapo",
         max_completion_length=None,
+        importance_sampling_level="token",
         temperature=1.0,
         compiled=True,
         use_ref_model=True,
@@ -130,8 +160,9 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
             ref_weight (torch.Tensor, optional): Reference model weight tensor. Shape: (vocab_size, hidden_size)
             ref_bias (torch.Tensor, optional): Reference model bias tensor. Shape: (vocab_size,)
             beta (float): Weight for the KL penalty
-            loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "
+            loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo"). Defaults to "dapo".
             max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
+            importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token".
             temperature (float): Temperature for the logits
             compiled (bool): Whether to use torch compile
             use_ref_model (bool): Whether to use a reference model
@@ -162,6 +193,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
             compiled=compiled,
             use_ref_model=use_ref_model,
             chunk_size=chunk_size,
+            importance_sampling_level=importance_sampling_level,
         )

     @staticmethod
@@ -187,6 +219,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
             None,  # grad_epsilon_high
             None,  # grad_loss_type (string, not differentiable)
             None,  # grad_max_completion_length (int, not differentiable)
+            None,  # grad_importance_sampling_level (string, not differentiable)
             None,  # grad_temperature
             None,  # grad_compiled
             None,  # grad_use_ref_model
@@ -205,8 +238,9 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
         chunk_size: int = 1,
         epsilon_low: float = 0.2,
         epsilon_high: float = 0.2,
-        loss_type: str = "
+        loss_type: str = "dapo",
         max_completion_length: Optional[int] = None,
+        importance_sampling_level: str = "token",
         temperature: float = 1.0,
     ):
         """
@@ -217,8 +251,9 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
             chunk_size (int): Size of chunks for processing.
             epsilon_low (float): Lower bound for the importance sampling ratio.
             epsilon_high (float): Upper bound for the importance sampling ratio.
-            loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "
+            loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo"). Defaults to "dapo".
             max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
+            importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token".
             temperature (float): Temperature for the logits.
         """
         super().__init__()
@@ -230,6 +265,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
         self.epsilon_high = epsilon_high
         self.loss_type = loss_type
         self.max_completion_length = max_completion_length
+        self.importance_sampling_level = importance_sampling_level
         self.temperature = temperature

     def forward(
@@ -263,6 +299,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
             self.epsilon_high,
             self.loss_type,
             self.max_completion_length,
+            self.importance_sampling_level,
             self.temperature,
             self.compiled,
             self.use_ref_model,
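Note: a standalone sketch of the two importance-sampling levels; the sequence-level path is the GSPO-style variant that averages the per-token log-ratio over valid tokens before exponentiating (toy tensors):

# Token-level vs sequence-level importance weights, mirroring the branch above.
import torch

log_ratio = torch.randn(2, 4)  # assumed (B, T) per-token log-prob ratios
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]], dtype=torch.float32)

token_weights = torch.exp(log_ratio)  # "token": shape (B, T)

seq_log_w = (log_ratio * attention_mask).sum(-1) / attention_mask.sum(-1).clamp(min=1.0)
sequence_weights = torch.exp(seq_log_w).unsqueeze(-1)  # "sequence": shape (B, 1)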
liger_kernel/chunked_loss/jsd_loss.py
@@ -1,3 +1,8 @@
+import math
+
+from typing import Tuple
+from typing import Union
+
 import torch
 import torch.nn.functional as F

@@ -25,8 +30,9 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
             jsd_loss = F.kl_div(teacher_log_probs, student_log_probs, reduction="sum", log_target=True)
         else:
             # Compute probabilities (only required for mean calculation)
-            mean_probs = (1 - beta) * student_log_probs.exp() + beta * teacher_log_probs.exp()
-            log_mean_probs = mean_probs.log()
+            log_mean_probs = torch.logsumexp(
+                torch.stack([student_log_probs + math.log(1 - beta), teacher_log_probs + math.log(beta)], dim=0), dim=0
+            )

             student_kl = F.kl_div(log_mean_probs, student_log_probs, reduction="sum", log_target=True)
             teacher_kl = F.kl_div(log_mean_probs, teacher_log_probs, reduction="sum", log_target=True)
@@ -53,6 +59,7 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
         temperature: float = 1.0,
         compiled: bool = True,
         chunk_size: int = 1024,
+        return_soft_hard_loss: bool = False,
     ):
         """
         Fused linear layer with JSD distillation loss.
@@ -69,8 +76,9 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
             temperature (float): Temperature for softening/sharpening distributions
             compiled (bool): Whether to use torch compile
             chunk_size (int): Size of chunks for processing.
+            return_soft_hard_loss (bool): Whether to return soft and hard losses separately. Default: False.
         Returns:
-            torch.Tensor: Computed loss
+            torch.Tensor: Computed loss, or tuple (loss, soft_loss, hard_loss) if return_soft_hard_loss=True
         """
         return super().forward(
             cls=cls,
@@ -89,11 +97,12 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
             ignore_index=ignore_index,
             temperature=temperature,
             compiled=compiled,
+            return_soft_hard_loss=return_soft_hard_loss,
         )

     @staticmethod
-    def backward(ctx, grad_output):
-        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output)[:6]
+    def backward(ctx, grad_output, *args):
+        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output, *args)[:6]

         return (
             *grads,
@@ -105,6 +114,7 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
             None,  # temperature
             None,  # compiled
             None,  # chunk_size
+            None,  # return_soft_hard_loss
         )


@@ -122,6 +132,7 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
         temperature: float = 1.0,
         compiled: bool = True,
         chunk_size: int = 1024,
+        return_soft_hard_loss: bool = False,
     ):
         """
         Args:
@@ -132,6 +143,7 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
             compiled (bool): Whether to use torch compile
             beta (float): Coefficient beta of generalized JSD in the interval [0, 1]. Default: `0.5`.
             chunk_size (int): Size of chunks for processing.
+            return_soft_hard_loss (bool): Whether to return soft and hard losses separately. Default: False.
         """
         super().__init__()
         assert temperature != 0, "Temperature cannot be 0."
@@ -142,6 +154,7 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
         self.compiled = compiled
         self.beta = beta
         self.chunk_size = chunk_size
+        self.return_soft_hard_loss = return_soft_hard_loss

     def forward(
         self,
@@ -152,7 +165,7 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
         true_labels: torch.LongTensor,
         student_bias: torch.Tensor = None,
         teacher_bias: torch.Tensor = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
         """
         Compute the JSD distillation loss.

@@ -164,7 +177,9 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
             true_labels (torch.LongTensor): Target labels tensor

         Returns:
-            torch.Tensor
+            torch.Tensor or Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+                If return_soft_hard_loss is False: Computed combined loss
+                If return_soft_hard_loss is True: Tuple of (combined_loss, soft_loss, hard_loss)
         """
         return LigerFusedLinearJSDFunction.apply(
             student_input,
@@ -181,4 +196,5 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
             self.temperature,
             self.compiled,
             self.chunk_size,
+            self.return_soft_hard_loss,
         )
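Note: the rewritten mixture term computes log((1 - beta) * p_student + beta * p_teacher) entirely in log space, avoiding the exp-then-log round trip. A quick numerical check of the equivalence:

# Verifies the logsumexp form matches the naive mixture on well-behaved inputs.
import math
import torch

beta = 0.5
student_log_probs = torch.log_softmax(torch.randn(3, 7), dim=-1)
teacher_log_probs = torch.log_softmax(torch.randn(3, 7), dim=-1)

stable = torch.logsumexp(
    torch.stack([student_log_probs + math.log(1 - beta), teacher_log_probs + math.log(beta)], dim=0), dim=0
)
naive = ((1 - beta) * student_log_probs.exp() + beta * teacher_log_probs.exp()).log()
assert torch.allclose(stable, naive, atol=1e-6)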