liger-kernel 0.6.2__py3-none-any.whl → 0.6.4__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (61)
  1. liger_kernel/chunked_loss/cosine_similarity_loss.py +13 -4
  2. liger_kernel/chunked_loss/fused_linear_distillation.py +13 -2
  3. liger_kernel/chunked_loss/fused_linear_ppo.py +25 -5
  4. liger_kernel/chunked_loss/grpo_loss.py +46 -9
  5. liger_kernel/chunked_loss/jsd_loss.py +23 -7
  6. liger_kernel/ops/cross_entropy.py +118 -62
  7. liger_kernel/ops/fused_linear_cross_entropy.py +97 -13
  8. liger_kernel/ops/grpo_loss.py +3 -1
  9. liger_kernel/ops/layer_norm.py +86 -69
  10. liger_kernel/ops/poly_norm.py +386 -0
  11. liger_kernel/ops/tiled_mlp.py +136 -0
  12. liger_kernel/transformers/__init__.py +36 -0
  13. liger_kernel/transformers/cross_entropy.py +8 -3
  14. liger_kernel/transformers/functional.py +31 -6
  15. liger_kernel/transformers/fused_linear_cross_entropy.py +13 -4
  16. liger_kernel/transformers/grpo_loss.py +56 -1
  17. liger_kernel/transformers/model/falcon_h1.py +122 -0
  18. liger_kernel/transformers/model/gemma.py +19 -7
  19. liger_kernel/transformers/model/gemma2.py +22 -7
  20. liger_kernel/transformers/model/gemma3.py +52 -14
  21. liger_kernel/transformers/model/glm4.py +18 -5
  22. liger_kernel/transformers/model/glm4v.py +19 -6
  23. liger_kernel/transformers/model/glm4v_moe.py +172 -0
  24. liger_kernel/transformers/model/hunyuan_v1.py +134 -0
  25. liger_kernel/transformers/model/internvl.py +157 -0
  26. liger_kernel/transformers/model/llama.py +16 -6
  27. liger_kernel/transformers/model/llama4.py +18 -5
  28. liger_kernel/transformers/model/llava.py +18 -6
  29. liger_kernel/transformers/model/loss_utils.py +32 -3
  30. liger_kernel/transformers/model/mistral.py +17 -7
  31. liger_kernel/transformers/model/mixtral.py +24 -9
  32. liger_kernel/transformers/model/mllama.py +14 -5
  33. liger_kernel/transformers/model/olmo2.py +18 -5
  34. liger_kernel/transformers/model/olmo3.py +142 -0
  35. liger_kernel/transformers/model/output_classes.py +147 -0
  36. liger_kernel/transformers/model/paligemma.py +41 -5
  37. liger_kernel/transformers/model/phi3.py +16 -8
  38. liger_kernel/transformers/model/qwen2.py +18 -4
  39. liger_kernel/transformers/model/qwen2_5_vl.py +21 -8
  40. liger_kernel/transformers/model/qwen2_vl.py +24 -7
  41. liger_kernel/transformers/model/qwen3.py +22 -6
  42. liger_kernel/transformers/model/qwen3_moe.py +27 -7
  43. liger_kernel/transformers/model/qwen3_next.py +146 -0
  44. liger_kernel/transformers/model/qwen3_vl.py +150 -0
  45. liger_kernel/transformers/model/qwen3_vl_moe.py +126 -0
  46. liger_kernel/transformers/model/smollm3.py +17 -7
  47. liger_kernel/transformers/model/smolvlm.py +158 -0
  48. liger_kernel/transformers/monkey_patch.py +830 -3
  49. liger_kernel/transformers/multi_token_attention.py +1 -1
  50. liger_kernel/transformers/poly_norm.py +42 -0
  51. liger_kernel/transformers/rms_norm.py +7 -0
  52. liger_kernel/transformers/rope.py +43 -0
  53. liger_kernel/transformers/swiglu.py +17 -0
  54. liger_kernel/transformers/tiled_mlp.py +133 -0
  55. {liger_kernel-0.6.2.dist-info → liger_kernel-0.6.4.dist-info}/METADATA +16 -10
  56. liger_kernel-0.6.4.dist-info/RECORD +118 -0
  57. liger_kernel-0.6.2.dist-info/RECORD +0 -104
  58. {liger_kernel-0.6.2.dist-info → liger_kernel-0.6.4.dist-info}/WHEEL +0 -0
  59. {liger_kernel-0.6.2.dist-info → liger_kernel-0.6.4.dist-info}/licenses/LICENSE +0 -0
  60. {liger_kernel-0.6.2.dist-info → liger_kernel-0.6.4.dist-info}/licenses/NOTICE +0 -0
  61. {liger_kernel-0.6.2.dist-info → liger_kernel-0.6.4.dist-info}/top_level.txt +0 -0
liger_kernel/chunked_loss/cosine_similarity_loss.py
@@ -1,3 +1,6 @@
+from typing import Tuple
+from typing import Union
+
 import torch
 import torch.nn.functional as F
 
@@ -41,7 +44,8 @@ class LigerFusedLinearCosineSimilarityFunction(LigerFusedLinearDistillationBase)
         temperature: float = 1.0,
         compiled: bool = True,
         chunk_size: int = 1024,
-    ):
+        return_soft_hard_loss: bool = False,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
         return super().forward(
             cls=cls,
             ctx=ctx,
@@ -59,11 +63,12 @@ class LigerFusedLinearCosineSimilarityFunction(LigerFusedLinearDistillationBase)
             ignore_index=ignore_index,
             temperature=temperature,
             compiled=compiled,
+            return_soft_hard_loss=return_soft_hard_loss,
         )
 
     @staticmethod
-    def backward(ctx, grad_output):
-        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output)[:6]
+    def backward(ctx, grad_output, *args):
+        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output, *args)[:6]
 
         return (
             *grads,
@@ -75,6 +80,7 @@ class LigerFusedLinearCosineSimilarityFunction(LigerFusedLinearDistillationBase)
             None, # temperature
             None, # compiled
             None, # chunk_size
+            None, # return_soft_hard_loss
         )
 
 
@@ -88,6 +94,7 @@ class LigerFusedLinearCosineSimilarityLoss(torch.nn.Module):
         temperature: float = 1.0,
         compiled: bool = True,
         chunk_size: int = 1024,
+        return_soft_hard_loss: bool = False,
     ):
         super().__init__()
         assert temperature != 0, "Temperature cannot be 0."
@@ -98,6 +105,7 @@ class LigerFusedLinearCosineSimilarityLoss(torch.nn.Module):
         self.compiled = compiled
         self.beta = beta
         self.chunk_size = chunk_size
+        self.return_soft_hard_loss = return_soft_hard_loss
 
     def forward(
         self,
@@ -108,7 +116,7 @@ class LigerFusedLinearCosineSimilarityLoss(torch.nn.Module):
         true_labels: torch.LongTensor,
         student_bias: torch.Tensor = None,
         teacher_bias: torch.Tensor = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
         return LigerFusedLinearCosineSimilarityFunction.apply(
             student_input,
             student_weight,
@@ -124,4 +132,5 @@ class LigerFusedLinearCosineSimilarityLoss(torch.nn.Module):
             self.temperature,
             self.compiled,
             self.chunk_size,
+            self.return_soft_hard_loss,
         )
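With return_soft_hard_loss=True, the module above returns the combined loss together with the soft (distillation) and hard (cross-entropy) components instead of a single scalar. A minimal usage sketch; the tensor sizes are made up, and the call assumes the (student_input, student_weight, teacher_input, teacher_weight, true_labels) argument order used by these distillation modules:

import torch

from liger_kernel.chunked_loss.cosine_similarity_loss import LigerFusedLinearCosineSimilarityLoss

# Hypothetical sizes: 8 flattened tokens, hidden size 16, vocab size 32.
BT, H, V = 8, 16, 32
loss_fn = LigerFusedLinearCosineSimilarityLoss(compiled=False, return_soft_hard_loss=True)

student_input = torch.randn(BT, H, requires_grad=True)
student_weight = torch.randn(V, H, requires_grad=True)
teacher_input = torch.randn(BT, H)
teacher_weight = torch.randn(V, H)
true_labels = torch.randint(0, V, (BT,))

# Returns a (loss, soft_loss, hard_loss) tuple instead of one scalar.
loss, soft_loss, hard_loss = loss_fn(
    student_input, student_weight, teacher_input, teacher_weight, true_labels
)
loss.backward()  # backward(ctx, grad_output, *args) absorbs the extra output grads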
liger_kernel/chunked_loss/fused_linear_distillation.py
@@ -1,5 +1,7 @@
 from abc import abstractmethod
 from functools import partial
+from typing import Tuple
+from typing import Union
 
 import torch
 
@@ -157,8 +159,9 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
         compute_ce_loss=True,
         temperature=1.0,
         compiled=True,
+        return_soft_hard_loss=False,
         **loss_kwargs,
-    ):
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
         """
         Base class for fused linear layer with distillation loss.
         Only need to compute gradients for student model.
@@ -180,6 +183,7 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
             compute_ce_loss (bool): Whether to compute CE loss.
             temperature (float): Temperature to control the input probability distribution. Default: `1.0` (i.e. no scale)
             compiled (bool): Whether to use torch compile for chunk accumulation.
+            return_soft_hard_loss (bool): Whether to return soft and hard losses separately. Default: False.
             loss_kwargs (dict): Other possible arguments that a loss function might need
         """
         CHUNK_SIZE = chunk_size
@@ -187,6 +191,8 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
         grad_inputs = []
         grad_bias = torch.zeros_like(student_bias) if student_bias is not None else None
         loss_acc = torch.zeros((), device=student_input.device)
+        soft_loss_acc = torch.zeros((), device=student_input.device) if return_soft_hard_loss else None
+        hard_loss_acc = torch.zeros((), device=student_input.device) if return_soft_hard_loss else None
 
         loss_func_to_call = partial(
             LigerFusedLinearDistillationBase._compute_loss,
@@ -247,6 +253,9 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
             )
             grad_weight.add_(chunk_grad_weight)
             loss_acc.add_(chunk_loss)
+            if return_soft_hard_loss:
+                soft_loss_acc.add_(chunk_soft_loss)
+                hard_loss_acc.add_(chunk_hard_loss)
             return chunk_grad_input
 
         if compiled:
@@ -268,10 +277,12 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
             grad_weight,
             grad_bias,
         )
+        if return_soft_hard_loss:
+            return loss_acc, soft_loss_acc, hard_loss_acc
         return loss_acc
 
     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx, grad_output, *args):
         grad_input, grad_weight, grad_bias = ctx.saved_tensors
         if torch.ne(grad_output, torch.tensor(1.0, device=grad_output.device)):
             grad_input = grad_input * grad_output
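The signature change to backward(ctx, grad_output, *args) is what lets the base class return the extra soft/hard loss tensors: a torch.autograd.Function receives one incoming gradient per forward output. A standalone toy illustration of that mechanism (not Liger code):

import torch

# When an autograd.Function returns several tensors, PyTorch passes one incoming
# gradient per output, so a single-output backward signature would raise a TypeError.
class TwoOutputs(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x.sum(), x.pow(2).sum()

    @staticmethod
    def backward(ctx, grad_main, *extra_grads):  # extra incoming grads are absorbed and ignored here
        (x,) = ctx.saved_tensors
        return grad_main * torch.ones_like(x)  # d(x.sum())/dx

main, aux = TwoOutputs.apply(torch.randn(4, requires_grad=True))
main.backward()  # works even though the function has two outputs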
liger_kernel/chunked_loss/fused_linear_ppo.py
@@ -32,8 +32,9 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
         epsilon_low=0.2,
         epsilon_high=0.2,
         beta=0.04,
-        loss_type="bnpo",
+        loss_type="dapo",
         max_completion_length=None,
+        importance_sampling_level="token",
         temperature=1.0,
         compiled=True,
         use_ref_model=False,
@@ -59,7 +60,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
             epsilon_low: Lower bound for clipping the importance sampling ratio
             epsilon_high: Upper bound for clipping the importance sampling ratio
             beta: Weight for the KL penalty
-            loss_type: Type of loss calculation ("grpo", "bnpo", "dr_grpo")
+            loss_type: Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo")
             max_completion_length: Maximum completion length required for "dr_grpo"
             temperature: Temperature for the logits
             compiled: Whether to use torch compile
@@ -92,6 +93,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
             beta=beta,
             loss_type=loss_type,
             max_completion_length=max_completion_length,
+            importance_sampling_level=importance_sampling_level,
             temperature=temperature,
             use_ref_model=use_ref_model,
             ppo_loss_fn=cls.ppo_loss_fn,
@@ -242,6 +244,21 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
 
         return loss_acc, tuple(final_metrics)
 
+    @staticmethod
+    def _compute_dapo_normalizer(attention_mask):
+        """Global active tokens averaged per process."""
+        normalizer = attention_mask.to(torch.float32).sum()
+        world_size = 1
+        if torch.distributed.is_available() and torch.distributed.is_initialized():
+            import torch.distributed as dist
+
+            normalizer = normalizer.clone()
+            dist.all_reduce(normalizer, op=dist.ReduceOp.SUM)
+            world_size = dist.get_world_size()
+
+        normalizer = normalizer / world_size
+        return torch.clamp(normalizer, min=1.0)
+
     @staticmethod
     def _compute_chunk_loss(
         input_chunk,
@@ -259,8 +276,9 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
         epsilon_low=0.2,
         epsilon_high=0.2,
         beta=0.04,
-        loss_type="bnpo",
+        loss_type="dapo",
         max_completion_length=None,
+        importance_sampling_level="token",
         temperature=1.0,
         use_ref_model=False,
         ppo_loss_fn=None,
@@ -292,6 +310,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
             beta=beta,
             loss_type=loss_type,
             max_completion_length=max_completion_length,
+            importance_sampling_level=importance_sampling_level,
         )
 
         return chunk_loss, chunk_metrics
@@ -337,10 +356,11 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
             None, # grad_epsilon_low
             None, # grad_epsilon_high
             None, # grad_beta
+            None, # grad_loss_type
+            None, # grad_max_completion_length
+            None, # grad_importance_sampling_level
             None, # grad_temperature
             None, # grad_compiled
             None, # grad_use_ref_model
             None, # grad_chunk_size
-            None, # grad_loss_type
-            None, # grad_max_completion_length
         )
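The new _compute_dapo_normalizer divides the summed per-token loss by the number of active completion tokens, averaged across data-parallel ranks when torch.distributed is initialized and clamped to at least 1. A single-process sketch of the arithmetic with made-up tensors (world_size stays 1 here):

import torch

attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])          # 5 active tokens
per_token_loss = torch.full((2, 4), 0.5)

normalizer = torch.clamp(attention_mask.float().sum(), min=1.0)   # -> 5.0
dapo_loss = (per_token_loss * attention_mask).sum() / normalizer  # -> 0.5
print(normalizer.item(), dapo_loss.item())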
liger_kernel/chunked_loss/grpo_loss.py
@@ -29,8 +29,9 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
         epsilon_low=0.2,
         epsilon_high=0.2,
         beta=0.04,
-        loss_type="bnpo", # ["grpo", "bnpo", "dr_grpo"]
+        loss_type="dapo", # ["grpo", "bnpo", "dr_grpo", "dapo"]
         max_completion_length=None, # Required for dr_grpo
+        importance_sampling_level="token", # ["token", "sequence"] - new parameter for GSPO
         **kwargs,
     ):
         """GRPO Loss Function matching GRPOTrainer implementation."""
@@ -50,7 +51,22 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
 
         # Compute policy gradient loss with importance sampling ratio
         old_per_token_logps = old_per_token_logps if old_per_token_logps is not None else per_token_logps.detach()
-        coef_1 = torch.exp(per_token_logps - old_per_token_logps)
+        log_ratio = per_token_logps - old_per_token_logps
+
+        if importance_sampling_level == "token":
+            log_importance_weights = log_ratio
+        elif importance_sampling_level == "sequence":
+            log_importance_weights = (log_ratio * attention_mask).sum(-1) / attention_mask.sum(-1).clamp(min=1.0)
+            log_importance_weights = log_importance_weights.unsqueeze(-1)
+        else:
+            raise ValueError(
+                f"Unknown importance sampling level: {importance_sampling_level}. Possible values are 'token' "
+                "and 'sequence'."
+            )
+
+        # From here, log_importance_weights (and all subsequent tensors, coef_1, coef_2, etc.) shape depends on
+        # importance_sampling_level: "token" level: (B, T); "sequence" level: (B, 1)
+        coef_1 = torch.exp(log_importance_weights)
         coef_2 = clip_coef_fn(coef_1, epsilon_low, epsilon_high)
         per_token_loss1 = coef_1 * advantages.unsqueeze(1)
         per_token_loss2 = coef_2 * advantages.unsqueeze(1)
@@ -78,6 +94,9 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
             if max_completion_length is None:
                 raise ValueError("max_completion_length must be provided for loss_type 'dr_grpo'")
             loss = (per_token_loss * attention_mask).sum() / (full_attention_mask.shape[0] * max_completion_length)
+        elif loss_type == "dapo":
+            loss_normalizer = LigerFusedLinearPPOBase._compute_dapo_normalizer(full_attention_mask)
+            loss = (per_token_loss * attention_mask).sum() / loss_normalizer
         else:
             raise ValueError(f"Unknown loss type: {loss_type}")
 
@@ -85,9 +104,19 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
         metrics = []
         if beta != 0.0:
             metrics.append(((kl_div * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0)))
-        is_clipped = ((coef_1 < 1 - epsilon_low) & (advantages.unsqueeze(1) < 0)) | (
-            (coef_1 > 1 + epsilon_high) & (advantages.unsqueeze(1) > 0)
-        )
+
+        # Adjust clipping metric calculation based on importance sampling level
+        if importance_sampling_level == "token":
+            is_clipped = ((coef_1 < 1 - epsilon_low) & (advantages.unsqueeze(1) < 0)) | (
+                (coef_1 > 1 + epsilon_high) & (advantages.unsqueeze(1) > 0)
+            )
+        else: # sequence level
+            # For sequence level, coef_1 is shape (B, 1), advantages is shape (B,)
+            is_clipped = ((coef_1.squeeze(-1) < 1 - epsilon_low) & (advantages < 0)) | (
+                (coef_1.squeeze(-1) > 1 + epsilon_high) & (advantages > 0)
+            )
+            is_clipped = is_clipped.unsqueeze(1).expand_as(attention_mask)
+
         metrics.append((is_clipped * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0))
         return loss, metrics
 
@@ -109,8 +138,9 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
         beta=0.04,
         epsilon_low=0.2,
         epsilon_high=0.2,
-        loss_type="bnpo",
+        loss_type="dapo",
         max_completion_length=None,
+        importance_sampling_level="token",
         temperature=1.0,
         compiled=True,
         use_ref_model=True,
@@ -130,8 +160,9 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
             ref_weight (torch.Tensor, optional): Reference model weight tensor. Shape: (vocab_size, hidden_size)
             ref_bias (torch.Tensor, optional): Reference model bias tensor. Shape: (vocab_size,)
             beta (float): Weight for the KL penalty
-            loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
+            loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo"). Defaults to "dapo".
             max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
+            importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token".
             temperature (float): Temperature for the logits
             compiled (bool): Whether to use torch compile
             use_ref_model (bool): Whether to use a reference model
@@ -162,6 +193,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
             compiled=compiled,
             use_ref_model=use_ref_model,
             chunk_size=chunk_size,
+            importance_sampling_level=importance_sampling_level,
         )
 
     @staticmethod
@@ -187,6 +219,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
             None, # grad_epsilon_high
             None, # grad_loss_type (string, not differentiable)
             None, # grad_max_completion_length (int, not differentiable)
+            None, # grad_importance_sampling_level (string, not differentiable)
             None, # grad_temperature
             None, # grad_compiled
             None, # grad_use_ref_model
@@ -205,8 +238,9 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
         chunk_size: int = 1,
         epsilon_low: float = 0.2,
         epsilon_high: float = 0.2,
-        loss_type: str = "bnpo",
+        loss_type: str = "dapo",
         max_completion_length: Optional[int] = None,
+        importance_sampling_level: str = "token",
         temperature: float = 1.0,
     ):
         """
@@ -217,8 +251,9 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
             chunk_size (int): Size of chunks for processing.
             epsilon_low (float): Lower bound for the importance sampling ratio.
             epsilon_high (float): Upper bound for the importance sampling ratio.
-            loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
+            loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo"). Defaults to "dapo".
             max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
+            importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token".
             temperature (float): Temperature for the logits.
         """
         super().__init__()
@@ -230,6 +265,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
         self.epsilon_high = epsilon_high
         self.loss_type = loss_type
         self.max_completion_length = max_completion_length
+        self.importance_sampling_level = importance_sampling_level
         self.temperature = temperature
 
     def forward(
@@ -263,6 +299,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
             self.epsilon_high,
             self.loss_type,
             self.max_completion_length,
+            self.importance_sampling_level,
             self.temperature,
             self.compiled,
             self.use_ref_model,
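importance_sampling_level switches between per-token importance weights (the existing GRPO behaviour, shape (B, T)) and a single per-sequence weight obtained from the masked mean of the token log-ratios (the GSPO-style "sequence" level, shape (B, 1)). A toy sketch of the two branches, mirroring the log-ratio handling in ppo_loss_fn with invented values:

import torch

log_ratio = torch.tensor([[0.2, -0.1, 0.3],
                          [0.0,  0.4, 0.0]])
attention_mask = torch.tensor([[1.0, 1.0, 1.0],
                               [1.0, 1.0, 0.0]])

# "token": one weight per token, shape (B, T)
token_weights = torch.exp(log_ratio)

# "sequence": masked mean of the log ratio per sequence, then one weight per sequence, shape (B, 1)
seq_log_w = (log_ratio * attention_mask).sum(-1) / attention_mask.sum(-1).clamp(min=1.0)
sequence_weights = torch.exp(seq_log_w).unsqueeze(-1)

print(token_weights.shape, sequence_weights.shape)  # torch.Size([2, 3]) torch.Size([2, 1])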
liger_kernel/chunked_loss/jsd_loss.py
@@ -1,3 +1,8 @@
+import math
+
+from typing import Tuple
+from typing import Union
+
 import torch
 import torch.nn.functional as F
 
@@ -25,8 +30,9 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
             jsd_loss = F.kl_div(teacher_log_probs, student_log_probs, reduction="sum", log_target=True)
         else:
             # Compute probabilities (only required for mean calculation)
-            mean_probs = (1 - beta) * student_log_probs.exp() + beta * teacher_log_probs.exp()
-            log_mean_probs = mean_probs.log()
+            log_mean_probs = torch.logsumexp(
+                torch.stack([student_log_probs + math.log(1 - beta), teacher_log_probs + math.log(beta)], dim=0), dim=0
+            )
 
             student_kl = F.kl_div(log_mean_probs, student_log_probs, reduction="sum", log_target=True)
             teacher_kl = F.kl_div(log_mean_probs, teacher_log_probs, reduction="sum", log_target=True)
@@ -53,6 +59,7 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
         temperature: float = 1.0,
         compiled: bool = True,
         chunk_size: int = 1024,
+        return_soft_hard_loss: bool = False,
     ):
         """
         Fused linear layer with JSD distillation loss.
@@ -69,8 +76,9 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
             temperature (float): Temperature for softening/sharpening distributions
             compiled (bool): Whether to use torch compile
             chunk_size (int): Size of chunks for processing.
+            return_soft_hard_loss (bool): Whether to return soft and hard losses separately. Default: False.
         Returns:
-            torch.Tensor: Computed loss
+            torch.Tensor: Computed loss, or tuple (loss, soft_loss, hard_loss) if return_soft_hard_loss=True
         """
         return super().forward(
             cls=cls,
@@ -89,11 +97,12 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
             ignore_index=ignore_index,
             temperature=temperature,
             compiled=compiled,
+            return_soft_hard_loss=return_soft_hard_loss,
         )
 
     @staticmethod
-    def backward(ctx, grad_output):
-        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output)[:6]
+    def backward(ctx, grad_output, *args):
+        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output, *args)[:6]
 
         return (
             *grads,
@@ -105,6 +114,7 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
             None, # temperature
             None, # compiled
             None, # chunk_size
+            None, # return_soft_hard_loss
         )
 
 
@@ -122,6 +132,7 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
         temperature: float = 1.0,
         compiled: bool = True,
         chunk_size: int = 1024,
+        return_soft_hard_loss: bool = False,
     ):
         """
         Args:
@@ -132,6 +143,7 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
             compiled (bool): Whether to use torch compile
             beta (float): Coefficient beta of generalized JSD in the interval [0, 1]. Default: `0.5`.
             chunk_size (int): Size of chunks for processing.
+            return_soft_hard_loss (bool): Whether to return soft and hard losses separately. Default: False.
         """
         super().__init__()
         assert temperature != 0, "Temperature cannot be 0."
@@ -142,6 +154,7 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
         self.compiled = compiled
         self.beta = beta
         self.chunk_size = chunk_size
+        self.return_soft_hard_loss = return_soft_hard_loss
 
     def forward(
         self,
@@ -152,7 +165,7 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
         true_labels: torch.LongTensor,
         student_bias: torch.Tensor = None,
         teacher_bias: torch.Tensor = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
         """
         Compute the JSD distillation loss.
 
@@ -164,7 +177,9 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
             true_labels (torch.LongTensor): Target labels tensor
 
         Returns:
-            torch.Tensor: Computed loss
+            torch.Tensor or Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+                If return_soft_hard_loss is False: Computed combined loss
+                If return_soft_hard_loss is True: Tuple of (combined_loss, soft_loss, hard_loss)
         """
         return LigerFusedLinearJSDFunction.apply(
             student_input,
@@ -181,4 +196,5 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
             self.temperature,
             self.compiled,
             self.chunk_size,
+            self.return_soft_hard_loss,
         )
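The logsumexp rewrite above computes the same mixture log-probability log((1 - beta) * p_student + beta * p_teacher) as the old exp-then-log code, but without materializing the probabilities, so it stays finite where they would underflow. A small equivalence check under ordinary float32 inputs:

import math
import torch

beta = 0.5
student_log_probs = torch.log_softmax(torch.randn(4, 10), dim=-1)
teacher_log_probs = torch.log_softmax(torch.randn(4, 10), dim=-1)

naive = ((1 - beta) * student_log_probs.exp() + beta * teacher_log_probs.exp()).log()
stable = torch.logsumexp(
    torch.stack([student_log_probs + math.log(1 - beta), teacher_log_probs + math.log(beta)], dim=0), dim=0
)
print(torch.allclose(naive, stable, atol=1e-6))  # True for well-behaved inputs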