liger-kernel 0.6.1__py3-none-any.whl → 0.6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. liger_kernel/chunked_loss/dpo_loss.py +54 -3
  2. liger_kernel/chunked_loss/fused_linear_ppo.py +4 -0
  3. liger_kernel/chunked_loss/grpo_loss.py +38 -4
  4. liger_kernel/chunked_loss/jsd_loss.py +5 -2
  5. liger_kernel/ops/cross_entropy.py +59 -53
  6. liger_kernel/ops/fused_linear_cross_entropy.py +83 -17
  7. liger_kernel/ops/layer_norm.py +4 -6
  8. liger_kernel/ops/llama4_rope.py +225 -0
  9. liger_kernel/ops/poly_norm.py +386 -0
  10. liger_kernel/transformers/__init__.py +32 -0
  11. liger_kernel/transformers/experimental/__init__.py +5 -0
  12. liger_kernel/transformers/functional.py +9 -0
  13. liger_kernel/transformers/fused_linear_cross_entropy.py +8 -1
  14. liger_kernel/transformers/llama4_rope.py +93 -0
  15. liger_kernel/transformers/model/falcon_h1.py +108 -0
  16. liger_kernel/transformers/model/gemma.py +2 -1
  17. liger_kernel/transformers/model/gemma2.py +8 -2
  18. liger_kernel/transformers/model/gemma3.py +27 -2
  19. liger_kernel/transformers/model/glm4.py +2 -1
  20. liger_kernel/transformers/model/glm4v.py +151 -0
  21. liger_kernel/transformers/model/glm4v_moe.py +153 -0
  22. liger_kernel/transformers/model/internvl.py +150 -0
  23. liger_kernel/transformers/model/llama.py +2 -1
  24. liger_kernel/transformers/model/llama4.py +2 -1
  25. liger_kernel/transformers/model/llava.py +6 -2
  26. liger_kernel/transformers/model/loss_utils.py +3 -0
  27. liger_kernel/transformers/model/mistral.py +2 -1
  28. liger_kernel/transformers/model/mixtral.py +8 -2
  29. liger_kernel/transformers/model/mllama.py +6 -3
  30. liger_kernel/transformers/model/olmo2.py +2 -1
  31. liger_kernel/transformers/model/paligemma.py +19 -0
  32. liger_kernel/transformers/model/phi3.py +10 -160
  33. liger_kernel/transformers/model/qwen2.py +2 -1
  34. liger_kernel/transformers/model/qwen2_5_vl.py +7 -2
  35. liger_kernel/transformers/model/qwen2_vl.py +7 -2
  36. liger_kernel/transformers/model/qwen3.py +2 -1
  37. liger_kernel/transformers/model/qwen3_moe.py +8 -2
  38. liger_kernel/transformers/model/qwen3_next.py +134 -0
  39. liger_kernel/transformers/model/smollm3.py +2 -1
  40. liger_kernel/transformers/model/smolvlm.py +158 -0
  41. liger_kernel/transformers/monkey_patch.py +552 -23
  42. liger_kernel/transformers/multi_token_attention.py +1 -1
  43. liger_kernel/transformers/poly_norm.py +42 -0
  44. liger_kernel/transformers/rms_norm.py +7 -0
  45. {liger_kernel-0.6.1.dist-info → liger_kernel-0.6.3.dist-info}/METADATA +14 -11
  46. {liger_kernel-0.6.1.dist-info → liger_kernel-0.6.3.dist-info}/RECORD +50 -39
  47. {liger_kernel-0.6.1.dist-info → liger_kernel-0.6.3.dist-info}/WHEEL +0 -0
  48. {liger_kernel-0.6.1.dist-info → liger_kernel-0.6.3.dist-info}/licenses/LICENSE +0 -0
  49. {liger_kernel-0.6.1.dist-info → liger_kernel-0.6.3.dist-info}/licenses/NOTICE +0 -0
  50. {liger_kernel-0.6.1.dist-info → liger_kernel-0.6.3.dist-info}/top_level.txt +0 -0

liger_kernel/chunked_loss/dpo_loss.py

@@ -13,6 +13,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
  ref_chosen_logps=None,
  ref_rejected_logps=None,
  beta=0.1,
+ loss_type="sigmoid",
  ):
  """
  Paper: https://arxiv.org/pdf/2305.18290
@@ -48,8 +49,50 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
  chosen_rewards = beta * chosen_logratios
  rejected_rewards = beta * rejected_logratios

- logits_diff = beta * (chosen_logratios - rejected_logratios)
- loss = -F.logsigmoid(logits_diff).sum() / (full_target.shape[0] // 2)
+ if loss_type == "sigmoid":
+ logits_diff = beta * (chosen_logratios - rejected_logratios)
+ loss = -F.logsigmoid(logits_diff).sum() / (full_target.shape[0] // 2)
+
+ elif loss_type == "apo_zero":
+ # Eqn (7) of the APO paper (https://huggingface.co/papers/2408.06266)
+ # Use this loss when you believe the chosen outputs are better than your model's default output
+ losses_chosen = 1 - F.sigmoid(beta * chosen_logratios) # Increase chosen likelihood
+ losses_rejected = F.sigmoid(beta * rejected_logratios)
+ losses = losses_chosen + losses_rejected
+ loss = losses.sum() / (full_target.shape[0] // 2)
+
+ elif loss_type == "apo_down":
+ # Eqn (8) of the APO paper (https://huggingface.co/papers/2408.06266)
+ # Use this loss when you believe the chosen outputs are worse than your model's default output.
+ # Decrease chosen likelihood and decrease rejected likelihood more
+ losses_chosen = F.sigmoid(beta * chosen_logratios)
+ losses_rejected = 1 - F.sigmoid(beta * (chosen_logratios - rejected_logratios))
+ losses = losses_chosen + losses_rejected
+ loss = losses.sum() / (full_target.shape[0] // 2)
+
+ elif loss_type == "sppo_hard":
+ # In the paper (https://huggingface.co/papers/2405.00675), SPPO employs a soft probability approach,
+ # estimated using the PairRM score. The probability calculation is conducted outside of the trainer class.
+ # The version described here is the hard probability version, where P in Equation (4.7) of Algorithm 1 is
+ # set to 1 for the winner and 0 for the loser.
+ a = chosen_logps - ref_chosen_logps
+ b = rejected_logps - ref_rejected_logps
+ losses = (a - 0.5 / beta) ** 2 + (b + 0.5 / beta) ** 2
+ loss = losses.sum() / (full_target.shape[0] // 2)
+
+ elif loss_type == "nca_pair":
+ losses = (
+ -F.logsigmoid(chosen_rewards)
+ - 0.5 * F.logsigmoid(-chosen_rewards)
+ - 0.5 * F.logsigmoid(-rejected_rewards)
+ )
+ loss = losses.sum() / (full_target.shape[0] // 2)
+
+ else:
+ raise ValueError(
+ f"Unsupported loss_type: {loss_type}. Supported types are: sigmoid, apo_zero, apo_down, sppo_hard, nca_pair"
+ )
+
  return loss, chosen_rewards, rejected_rewards

  @classmethod
@@ -70,6 +113,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
  use_ref_model=True,
  average_log_prob=False,
  chunk_size=1,
+ loss_type="sigmoid",
  ):
  """
  Fused linear layer with DPO loss.
@@ -108,12 +152,13 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
  ref_bias=ref_bias,
  average_log_prob=average_log_prob,
  chunk_size=chunk_size,
+ loss_type=loss_type,
  )

  @staticmethod
  def backward(ctx, *grad_output):
  grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
- return *grads, None, None, None, None, None, None, None, None, None, None
+ return *grads, None, None, None, None, None, None, None, None, None, None, None


  class LigerFusedLinearDPOLoss(torch.nn.Module):
@@ -130,6 +175,7 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
  use_ref_model: bool = True,
  average_log_prob: bool = False,
  chunk_size: int = 1,
+ loss_type: str = "sigmoid",
  ):
  """
  Args:
@@ -149,6 +195,10 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
  self.use_ref_model = use_ref_model
  self.average_log_prob = average_log_prob
  self.chunk_size = chunk_size
+ self.loss_type = loss_type
+ supported_loss_types = {"sigmoid", "apo_zero", "apo_down", "sppo_hard", "nca_pair"}
+ if self.loss_type not in supported_loss_types:
+ raise ValueError(f"Unsupported loss_type: {self.loss_type}. Supported types are: {supported_loss_types}")

  def forward(
  self,
@@ -175,4 +225,5 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
  self.use_ref_model,
  self.average_log_prob,
  self.chunk_size,
+ self.loss_type,
  )
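
For reference, a minimal usage sketch of the new loss_type switch. Only keyword arguments visible in this diff are used; the import path follows the package's chunked_loss layout and is an assumption here, not something this diff shows.

from liger_kernel.chunked_loss import LigerFusedLinearDPOLoss  # import path assumed

# Pick one of the newly supported variants; "sigmoid" remains the default.
dpo_loss = LigerFusedLinearDPOLoss(
    use_ref_model=True,
    average_log_prob=False,
    chunk_size=1,
    loss_type="apo_zero",  # also: "sigmoid", "apo_down", "sppo_hard", "nca_pair"
)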

liger_kernel/chunked_loss/fused_linear_ppo.py

@@ -34,6 +34,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
  beta=0.04,
  loss_type="bnpo",
  max_completion_length=None,
+ importance_sampling_level="token",
  temperature=1.0,
  compiled=True,
  use_ref_model=False,
@@ -92,6 +93,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
  beta=beta,
  loss_type=loss_type,
  max_completion_length=max_completion_length,
+ importance_sampling_level=importance_sampling_level,
  temperature=temperature,
  use_ref_model=use_ref_model,
  ppo_loss_fn=cls.ppo_loss_fn,
@@ -261,6 +263,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
  beta=0.04,
  loss_type="bnpo",
  max_completion_length=None,
+ importance_sampling_level="token",
  temperature=1.0,
  use_ref_model=False,
  ppo_loss_fn=None,
@@ -292,6 +295,7 @@ class LigerFusedLinearPPOBase(torch.autograd.Function):
  beta=beta,
  loss_type=loss_type,
  max_completion_length=max_completion_length,
+ importance_sampling_level=importance_sampling_level,
  )

  return chunk_loss, chunk_metrics

liger_kernel/chunked_loss/grpo_loss.py

@@ -31,6 +31,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
  beta=0.04,
  loss_type="bnpo", # ["grpo", "bnpo", "dr_grpo"]
  max_completion_length=None, # Required for dr_grpo
+ importance_sampling_level="token", # ["token", "sequence"] - new parameter for GSPO
  **kwargs,
  ):
  """GRPO Loss Function matching GRPOTrainer implementation."""
@@ -50,7 +51,22 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):

  # Compute policy gradient loss with importance sampling ratio
  old_per_token_logps = old_per_token_logps if old_per_token_logps is not None else per_token_logps.detach()
- coef_1 = torch.exp(per_token_logps - old_per_token_logps)
+ log_ratio = per_token_logps - old_per_token_logps
+
+ if importance_sampling_level == "token":
+ log_importance_weights = log_ratio
+ elif importance_sampling_level == "sequence":
+ log_importance_weights = (log_ratio * attention_mask).sum(-1) / attention_mask.sum(-1).clamp(min=1.0)
+ log_importance_weights = log_importance_weights.unsqueeze(-1)
+ else:
+ raise ValueError(
+ f"Unknown importance sampling level: {importance_sampling_level}. Possible values are 'token' "
+ "and 'sequence'."
+ )
+
+ # From here, log_importance_weights (and all subsequent tensors, coef_1, coef_2, etc.) shape depends on
+ # importance_sampling_level: "token" level: (B, T); "sequence" level: (B, 1)
+ coef_1 = torch.exp(log_importance_weights)
  coef_2 = clip_coef_fn(coef_1, epsilon_low, epsilon_high)
  per_token_loss1 = coef_1 * advantages.unsqueeze(1)
  per_token_loss2 = coef_2 * advantages.unsqueeze(1)
@@ -85,9 +101,19 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
  metrics = []
  if beta != 0.0:
  metrics.append(((kl_div * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0)))
- is_clipped = ((coef_1 < 1 - epsilon_low) & (advantages.unsqueeze(1) < 0)) | (
- (coef_1 > 1 + epsilon_high) & (advantages.unsqueeze(1) > 0)
- )
+
+ # Adjust clipping metric calculation based on importance sampling level
+ if importance_sampling_level == "token":
+ is_clipped = ((coef_1 < 1 - epsilon_low) & (advantages.unsqueeze(1) < 0)) | (
+ (coef_1 > 1 + epsilon_high) & (advantages.unsqueeze(1) > 0)
+ )
+ else: # sequence level
+ # For sequence level, coef_1 is shape (B, 1), advantages is shape (B,)
+ is_clipped = ((coef_1.squeeze(-1) < 1 - epsilon_low) & (advantages < 0)) | (
+ (coef_1.squeeze(-1) > 1 + epsilon_high) & (advantages > 0)
+ )
+ is_clipped = is_clipped.unsqueeze(1).expand_as(attention_mask)
+
  metrics.append((is_clipped * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0))
  return loss, metrics

@@ -111,6 +137,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
  epsilon_high=0.2,
  loss_type="bnpo",
  max_completion_length=None,
+ importance_sampling_level="token",
  temperature=1.0,
  compiled=True,
  use_ref_model=True,
@@ -132,6 +159,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
  beta (float): Weight for the KL penalty
  loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
  max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
+ importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token".
  temperature (float): Temperature for the logits
  compiled (bool): Whether to use torch compile
  use_ref_model (bool): Whether to use a reference model
@@ -162,6 +190,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
  compiled=compiled,
  use_ref_model=use_ref_model,
  chunk_size=chunk_size,
+ importance_sampling_level=importance_sampling_level,
  )

  @staticmethod
@@ -187,6 +216,7 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
  None, # grad_epsilon_high
  None, # grad_loss_type (string, not differentiable)
  None, # grad_max_completion_length (int, not differentiable)
+ None, # grad_importance_sampling_level (string, not differentiable)
  None, # grad_temperature
  None, # grad_compiled
  None, # grad_use_ref_model
@@ -207,6 +237,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
  epsilon_high: float = 0.2,
  loss_type: str = "bnpo",
  max_completion_length: Optional[int] = None,
+ importance_sampling_level: str = "token",
  temperature: float = 1.0,
  ):
  """
@@ -219,6 +250,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
  epsilon_high (float): Upper bound for the importance sampling ratio.
  loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo"). Defaults to "bnpo".
  max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None.
+ importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token".
  temperature (float): Temperature for the logits.
  """
  super().__init__()
@@ -230,6 +262,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
  self.epsilon_high = epsilon_high
  self.loss_type = loss_type
  self.max_completion_length = max_completion_length
+ self.importance_sampling_level = importance_sampling_level
  self.temperature = temperature

  def forward(
@@ -263,6 +296,7 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
  self.epsilon_high,
  self.loss_type,
  self.max_completion_length,
+ self.importance_sampling_level,
  self.temperature,
  self.compiled,
  self.use_ref_model,
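
A standalone sketch of the sequence-level ("GSPO") importance weights added above, using the same masked-mean formula as the diff; the (B, T) shapes and toy tensors are assumptions for illustration.

import torch

B, T = 2, 5  # toy batch: 2 sequences, 5 completion tokens each
per_token_logps = torch.randn(B, T)
old_per_token_logps = torch.randn(B, T)
attention_mask = torch.ones(B, T)

log_ratio = per_token_logps - old_per_token_logps
# "sequence" level: average the per-token log ratios over valid tokens,
# giving one importance weight per sequence, kept as shape (B, 1)
log_importance_weights = (log_ratio * attention_mask).sum(-1) / attention_mask.sum(-1).clamp(min=1.0)
coef_1 = torch.exp(log_importance_weights).unsqueeze(-1)  # (B, 1), vs. (B, T) for "token" level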

liger_kernel/chunked_loss/jsd_loss.py

@@ -1,3 +1,5 @@
+ import math
+
  import torch
  import torch.nn.functional as F

@@ -25,8 +27,9 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
  jsd_loss = F.kl_div(teacher_log_probs, student_log_probs, reduction="sum", log_target=True)
  else:
  # Compute probabilities (only required for mean calculation)
- mean_probs = (1 - beta) * student_log_probs.exp() + beta * teacher_log_probs.exp()
- log_mean_probs = mean_probs.log()
+ log_mean_probs = torch.logsumexp(
+ torch.stack([student_log_probs + math.log(1 - beta), teacher_log_probs + math.log(beta)], dim=0), dim=0
+ )

  student_kl = F.kl_div(log_mean_probs, student_log_probs, reduction="sum", log_target=True)
  teacher_kl = F.kl_div(log_mean_probs, teacher_log_probs, reduction="sum", log_target=True)
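
The rewritten mean computation is the usual log-sum-exp identity, log((1 - beta) * p_s + beta * p_t) = logsumexp([log p_s + log(1 - beta), log p_t + log beta]), which avoids exponentiating small probabilities and then taking the log. A small numerical check of that equivalence (toy shapes assumed):

import math

import torch

beta = 0.5
student_log_probs = torch.log_softmax(torch.randn(4, 10), dim=-1)
teacher_log_probs = torch.log_softmax(torch.randn(4, 10), dim=-1)

# log-sum-exp over the stacked, weight-shifted log-probs (as in the new code)
log_mean_probs = torch.logsumexp(
    torch.stack([student_log_probs + math.log(1 - beta), teacher_log_probs + math.log(beta)], dim=0), dim=0
)
# naive version from the old code: exponentiate, mix, take the log
naive = ((1 - beta) * student_log_probs.exp() + beta * teacher_log_probs.exp()).log()
assert torch.allclose(log_mean_probs, naive, atol=1e-5)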

liger_kernel/ops/cross_entropy.py

@@ -45,6 +45,7 @@ def liger_cross_entropy_kernel(
  BLOCK_SIZE: tl.constexpr,
  HAS_WEIGHT: tl.constexpr,
  HAS_SOFTCAPPING: tl.constexpr,
+ HAS_GRADIENTS: tl.constexpr,
  ):
  """
  This kernel computes both cross entropy loss and the gradient of the input.
@@ -72,6 +73,7 @@ def liger_cross_entropy_kernel(
  BLOCK_SIZE (int): The block size for Triton operations.
  HAS_WEIGHT (bool): The boolean value to determine whether assigning weight to each of the classes.
  HAS_SOFTCAPPING (bool): The boolean value to determine whether applying soft-capping or not.
+ HAS_GRADIENTS (bool): The boolean value to determine whether calculating gradients in forward pass.
  """

  # https://github.com/triton-lang/triton/issues/1058
@@ -155,58 +157,58 @@ def liger_cross_entropy_kernel(
  # For 'sum' reduction, no normalization is applied:
  # dx_y = softmax(x_y) - 1
  # dx_i = softmax(x_i), for i ≠ y
-
- for i in range(0, n_cols, BLOCK_SIZE):
- X_offsets = i + tl.arange(0, BLOCK_SIZE)
- X_block = tl.load(
- X_ptr + X_offsets,
- mask=X_offsets < n_cols,
- other=float("-inf"),
- # Ensure float32 precision for softmax calculation
- ).cast(tl.float32)
- if HAS_SOFTCAPPING:
- intermediate = tanh(X_block / softcap)
- X_block = softcap * intermediate
-
- if not HAS_WEIGHT:
- # softmax(x_i)
- X_block = tl.exp(X_block - m) / d
- # derivative of z-loss: 2 * lse_square_scale * lse * softmax(x_i)
- X_block += 2 * lse_square_scale * lse * X_block
- # smoothing term
- X_block += -eps
- # special handle dx_y
- X_block = tl.where(X_offsets != y, X_block, X_block - (1 - label_smoothing))
- # reduction scale
- if reduction == "mean":
- X_block = X_block / n_non_ignore
- else:
- weight_block = tl.load(weight_ptr + X_offsets, mask=X_offsets < n_cols)
- softmax_X = tl.exp(X_block - m) / d
- # derivative of original_loss
- dloss_ori = (1 - label_smoothing) * softmax_X
- # specially handle dx_y
- dloss_ori = tl.where(X_offsets != y, dloss_ori, dloss_ori - (1 - label_smoothing))
- dloss_ori = dloss_ori * weight_y
- # derivative of smooth_loss
- dloss_smooth = eps * (-weight_block + softmax_X * weight_sum)
- # derivative of z-loss
- dz_loss = 2 * lse_square_scale * lse * softmax_X
- # reduction scale
- if reduction == "mean":
- dloss_ori = dloss_ori / sum_non_ignore_weight
- dloss_smooth = dloss_smooth / sum_non_ignore_weight
- # TODO: Implement weighted z_loss. Currently, z_loss is not scaled by weight.
- dz_loss = dz_loss / n_non_ignore
- # derivative of total_loss
- X_block = dloss_ori + dloss_smooth + dz_loss
-
- # chain rule softcapping
- # d(softcap * tanh(x / softcap)) = (1 - tanh^2(x / softcap))
- if HAS_SOFTCAPPING:
- X_block = X_block * (1 - intermediate * intermediate)
-
- tl.store(X_ptr + X_offsets, X_block, mask=X_offsets < n_cols)
+ if HAS_GRADIENTS:
+ for i in range(0, n_cols, BLOCK_SIZE):
+ X_offsets = i + tl.arange(0, BLOCK_SIZE)
+ X_block = tl.load(
+ X_ptr + X_offsets,
+ mask=X_offsets < n_cols,
+ other=float("-inf"),
+ # Ensure float32 precision for softmax calculation
+ ).cast(tl.float32)
+ if HAS_SOFTCAPPING:
+ intermediate = tanh(X_block / softcap)
+ X_block = softcap * intermediate
+
+ if not HAS_WEIGHT:
+ # softmax(x_i)
+ X_block = tl.exp(X_block - m) / d
+ # derivative of z-loss: 2 * lse_square_scale * lse * softmax(x_i)
+ X_block += 2 * lse_square_scale * lse * X_block
+ # smoothing term
+ X_block += -eps
+ # special handle dx_y
+ X_block = tl.where(X_offsets != y, X_block, X_block - (1 - label_smoothing))
+ # reduction scale
+ if reduction == "mean":
+ X_block = X_block / n_non_ignore
+ else:
+ weight_block = tl.load(weight_ptr + X_offsets, mask=X_offsets < n_cols)
+ softmax_X = tl.exp(X_block - m) / d
+ # derivative of original_loss
+ dloss_ori = (1 - label_smoothing) * softmax_X
+ # specially handle dx_y
+ dloss_ori = tl.where(X_offsets != y, dloss_ori, dloss_ori - (1 - label_smoothing))
+ dloss_ori = dloss_ori * weight_y
+ # derivative of smooth_loss
+ dloss_smooth = eps * (-weight_block + softmax_X * weight_sum)
+ # derivative of z-loss
+ dz_loss = 2 * lse_square_scale * lse * softmax_X
+ # reduction scale
+ if reduction == "mean":
+ dloss_ori = dloss_ori / sum_non_ignore_weight
+ dloss_smooth = dloss_smooth / sum_non_ignore_weight
+ # TODO: Implement weighted z_loss. Currently, z_loss is not scaled by weight.
+ dz_loss = dz_loss / n_non_ignore
+ # derivative of total_loss
+ X_block = dloss_ori + dloss_smooth + dz_loss
+
+ # chain rule softcapping
+ # d(softcap * tanh(x / softcap)) = (1 - tanh^2(x / softcap))
+ if HAS_SOFTCAPPING:
+ X_block = X_block * (1 - intermediate * intermediate)
+
+ tl.store(X_ptr + X_offsets, X_block, mask=X_offsets < n_cols)

  # We need tl.debug_barrier() to ensure the new result of X_ptr is written as mentioned in
  # https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/ops/cross_entropy.py#L34
@@ -332,6 +334,7 @@ def cross_entropy_forward(
  BLOCK_SIZE=BLOCK_SIZE,
  HAS_WEIGHT=True if weight is not None else False,
  HAS_SOFTCAPPING=True if softcap is not None else False,
+ HAS_GRADIENTS=_input.requires_grad,
  # TODO: 32 seems to give the best performance
  # Performance is quite sensitive to num_warps
  num_warps=32 if not is_hip() else 16,
@@ -411,6 +414,8 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
  Returns:
  tuple: A tuple with the compouted losses with respect to loss and z loss. The elements are tensors or None.
  """
+ input_requires_grad = _input.requires_grad
+
  loss, z_loss, _input = cross_entropy_forward(
  _input,
  target,
@@ -425,7 +430,8 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
  # TODO: investigation
  # If we don't detach the _input tensor, the memory will double
  # Not sure why but seems that there will be a time both grad and value exist but in different location
- ctx.save_for_backward(_input.detach())
+ if input_requires_grad:
+ ctx.save_for_backward(_input.detach())
  ctx.return_z_loss = return_z_loss

  return loss, z_loss
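
A minimal sketch (not the library's own class) of the gating pattern introduced here: both the gradient work (HAS_GRADIENTS) and ctx.save_for_backward are skipped when the input does not require grad, so forward-only calls, e.g. evaluation, avoid the extra compute and saved memory. The toy function below only illustrates that pattern with plain PyTorch ops.

import torch
import torch.nn.functional as F

class ToyCrossEntropy(torch.autograd.Function):
    @staticmethod
    def forward(ctx, _input, target):
        input_requires_grad = _input.requires_grad
        loss = F.cross_entropy(_input.float(), target)
        if input_requires_grad:
            # pay the extra memory/compute only when a backward pass can actually run
            ctx.save_for_backward(torch.softmax(_input.detach().float(), dim=-1), target)
        return loss

    @staticmethod
    def backward(ctx, grad_output):
        probs, target = ctx.saved_tensors
        grad = probs.clone()
        grad[torch.arange(target.numel()), target] -= 1.0  # softmax - one_hot
        return grad_output * grad / target.numel(), None

logits = torch.randn(8, 16)  # requires_grad is False -> nothing is saved for backward
eval_loss = ToyCrossEntropy.apply(logits, torch.randint(0, 16, (8,)))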

liger_kernel/ops/fused_linear_cross_entropy.py

@@ -25,10 +25,14 @@ def fused_linear_cross_entropy_forward(
  reduction="mean",
  softcap=None,
  return_z_loss=False,
+ accum_dtype=None,
+ use_token_scaling=False,
  ):
  assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
  device = _input.device

+ input_requires_grad = _input.requires_grad
+
  # inputs have shape: BT x H
  # materialized activations will have shape: BT x V
  # the increase in memory = BT x V
@@ -44,10 +48,17 @@ def fused_linear_cross_entropy_forward(
  chunk_size = triton.next_power_of_2(triton.cdiv(BT, inc_factor)) # (BT + inc_factor - 1) // inc_factor
  num_chunks = triton.cdiv(BT, chunk_size) # (BT + chunk_size - 1) // chunk_size

- grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
  grad_input = torch.zeros_like(_input, device=device)
- grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
- # we use fp32 for loss accumulator
+
+ # we use fp32 for loss and gradients accumulator
+ if input_requires_grad:
+ if accum_dtype is None:
+ grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
+ grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
+ else:
+ grad_weight = torch.zeros_like(weight, dtype=accum_dtype, device=device) if weight.requires_grad else None
+ grad_bias = torch.zeros_like(bias, dtype=accum_dtype, device=device) if bias is not None else None
+
  loss_1d = torch.zeros(BT, dtype=torch.float32, device=device)
  z_loss_1d = torch.zeros(BT, dtype=_input.dtype, device=_input.device) if return_z_loss else None

@@ -82,6 +93,36 @@ def fused_linear_cross_entropy_forward(

  n_rows = logits_chunk.shape[0]

+ # Compute predicted probabilities for token scaling if needed
+ if use_token_scaling:
+ # Compute softmax probabilities for scaling
+ # We need to compute this before the cross entropy kernel modifies logits_chunk
+ logits_for_softmax = logits_chunk.detach().clone() # Detach to avoid gradient flow
+ if softcap is not None:
+ logits_for_softmax = softcap * torch.tanh(logits_for_softmax / softcap)
+
+ # Compute softmax to get predicted probabilities
+ probs = torch.softmax(logits_for_softmax, dim=-1)
+
+ # Get predicted probabilities for token scaling, handling ignored targets
+ valid_target_mask = target_chunk != ignore_index
+ valid_targets = target_chunk[valid_target_mask]
+
+ if len(valid_targets) > 0:
+ # Gather probabilities only for valid targets
+ valid_probs = probs[valid_target_mask]
+ pred_probs_valid = torch.gather(valid_probs, -1, valid_targets.unsqueeze(-1)).squeeze(-1)
+
+ # Create full tensor with zeros for ignored targets
+ pred_probs = torch.zeros_like(target_chunk, dtype=probs.dtype, device=probs.device)
+ pred_probs[valid_target_mask] = pred_probs_valid
+ else:
+ # All targets are ignored
+ pred_probs = torch.zeros_like(target_chunk, dtype=probs.dtype, device=probs.device)
+
+ # Store the scaling factors
+ scaling_factors = pred_probs.detach() # Detach to ensure no gradient flow
+
  # unreduced loss
  loss_1d_slice = loss_1d[start_idx:end_idx] # chunk_size,
  z_loss_1d_slice = z_loss_1d[start_idx:end_idx] if return_z_loss else None
@@ -112,33 +153,38 @@ def fused_linear_cross_entropy_forward(
  RETURN_Z_LOSS=return_z_loss,
  HAS_WEIGHT=True if ce_weight is not None else False,
  HAS_SOFTCAPPING=True if softcap is not None else False,
+ HAS_GRADIENTS=input_requires_grad,
  BLOCK_SIZE=BLOCK_SIZE,
  num_warps=32 if not is_hip() else 16,
  )

+ # Apply token scaling if requested
+ if use_token_scaling:
+ loss_1d_slice = loss_1d_slice * scaling_factors
+ if return_z_loss:
+ z_loss_1d_slice = z_loss_1d_slice * scaling_factors
+
  loss_1d[start_idx:end_idx] = loss_1d_slice
  if return_z_loss:
  z_loss_1d[start_idx:end_idx] = z_loss_1d_slice
  grad_logits_chunk = logits_chunk # chunk_size x V

- grad_input[start_idx:end_idx] = grad_logits_chunk @ weight
+ # Apply token scaling to gradients if requested
+ if use_token_scaling:
+ # Expand scaling factors to match gradient dimensions
+ scaling_factors_expanded = scaling_factors.unsqueeze(-1) # chunk_size x 1
+ grad_logits_chunk = grad_logits_chunk * scaling_factors_expanded

- if grad_weight is not None:
- torch.addmm(
- input=grad_weight,
- mat1=logits_chunk.t().to(
- _input_chunk.dtype
- ), # In an autocast scenario without bias, differing logits_chunk data types will cause an addmm operation error.
- mat2=_input_chunk,
- out=grad_weight,
- alpha=1.0,
- beta=1.0,
- )
+ if input_requires_grad:
+ grad_input[start_idx:end_idx] = grad_logits_chunk @ weight

- if bias is not None:
+ if grad_weight is not None and input_requires_grad:
+ grad_weight += torch.mm(grad_logits_chunk.t(), _input_chunk).float()
+
+ if bias is not None and input_requires_grad:
  torch.add(
  input=grad_bias,
- other=logits_chunk.sum(dim=0),
+ other=grad_logits_chunk.sum(dim=0),
  out=grad_bias,
  alpha=1.0,
  )
@@ -148,9 +194,18 @@ def fused_linear_cross_entropy_forward(
  # loss = loss_1d
  # z_loss = z_loss_1d if return_z_loss else None

+ if reduction == "none":
+ # Return per-token losses
+ loss = loss_1d
+ z_loss = z_loss_1d if return_z_loss else None
  else:
  loss = torch.sum(loss_1d)
  z_loss = torch.sum(z_loss_1d) if return_z_loss else None
+
+ # Cast back to original dtype
+ grad_weight = grad_weight.to(weight.dtype) if grad_weight is not None else None
+ grad_bias = grad_bias.to(bias.dtype) if grad_bias is not None else None
+
  return loss, z_loss, grad_input, grad_weight, grad_bias


@@ -217,6 +272,8 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
  reduction="mean",
  softcap=None,
  return_z_loss: bool = False,
+ accum_dtype=None,
+ use_token_scaling: bool = False,
  ):
  """
  Fusing the last linear layer with cross-entropy loss
@@ -235,6 +292,11 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
  ignore_index: the index to ignore in the target
  label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
  reduction: reduction to apply
+ accum_dtype (torch.dtype): the dtype of intermediate result buffers for weight and bias gradient accumulations.
+ Recommended to set `accum_dtype` to higher precision, e.g. `torch.float32`, if the training is unstable with original dtype. Default: `None`, performing accumulations in original dtype
+ use_token_scaling (bool): whether to scale each token's loss by its predicted probability (detached).
+ When True, each token's loss is multiplied by the model's predicted probability for that token's true class.
+ Default: False.
  """

  loss, z_loss, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
@@ -249,6 +311,8 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
  reduction=reduction,
  softcap=softcap,
  return_z_loss=return_z_loss,
+ accum_dtype=accum_dtype,
+ use_token_scaling=use_token_scaling,
  )
  # downcast to dtype and store for backward
  ctx.save_for_backward(
@@ -280,4 +344,6 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
  None,
  None,
  None,
+ None,
+ None, # use_token_scaling
  )
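
A standalone restatement of what use_token_scaling does in the chunk loop above: each token's loss (and its gradient) is scaled by the detached predicted probability of its true class, with ignored targets getting a zero scale. The toy shapes and the plain F.cross_entropy call are assumptions for illustration, not the fused kernel path.

import torch
import torch.nn.functional as F

ignore_index = -100
logits = torch.randn(6, 11)
target = torch.tensor([3, 1, ignore_index, 7, 0, 5])

# detached predicted probability of each token's true class (0 for ignored targets)
probs = torch.softmax(logits.detach(), dim=-1)
valid = target != ignore_index
scaling = torch.zeros(target.shape, dtype=probs.dtype)
scaling[valid] = probs[valid].gather(-1, target[valid].unsqueeze(-1)).squeeze(-1)

per_token_loss = F.cross_entropy(logits, target, reduction="none", ignore_index=ignore_index)
scaled_loss = per_token_loss * scaling  # mirrors loss_1d_slice * scaling_factors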

liger_kernel/ops/layer_norm.py

@@ -63,12 +63,11 @@ def _layer_norm_forward_kernel(
  X_f32 = X_row.to(tl.float32)

  # Compute statistics in fp32 for numerical stability
- n_cols_f32 = n_cols.to(tl.float32)
- mean = tl.sum(X_f32, axis=0) / n_cols_f32
+ mean = tl.sum(X_f32, axis=0) / n_cols
  X_centered = X_f32 - mean
  # Apply mask to variance calculation to exclude contributions from masked elements
  X_centered_masked = tl.where(mask, X_centered, 0.0)
- var = tl.sum(X_centered_masked * X_centered_masked, axis=0) / n_cols_f32
+ var = tl.sum(X_centered_masked * X_centered_masked, axis=0) / n_cols
  rstd = rsqrt(var + eps)

  # Store statistics (convert back to original dtype only once)
@@ -113,7 +112,6 @@ def _layer_norm_backward_kernel(
  # Pre-load weights once (same optimization as forward pass)
  w = tl.load(W_ptr + cols, mask=mask, other=0.0)
  w_f32 = w.to(tl.float32)
- n_cols_f32 = n_cols.to(tl.float32)

  # Calculate pointers for this specific row
  row_X_ptr = X_ptr + row_idx * stride_x
@@ -137,8 +135,8 @@ def _layer_norm_backward_kernel(
  # Compute backward pass for this row
  x_hat = (x_f32 - mean_f32) * rstd_f32
  wdy = w_f32 * dy_f32
- c1 = tl.sum(x_hat * wdy, axis=0) / n_cols_f32
- c2 = tl.sum(wdy, axis=0) / n_cols_f32
+ c1 = tl.sum(x_hat * wdy, axis=0) / n_cols
+ c2 = tl.sum(wdy, axis=0) / n_cols
  dx = (wdy - (x_hat * c1 + c2)) * rstd_f32

  # Store input gradient
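
For reference, the backward math in this kernel written out in plain PyTorch, per row, with c1 and c2 as the two row means computed above; the toy tensors are assumptions for illustration only.

import torch

x = torch.randn(4, 16)
w = torch.randn(16)
dy = torch.randn(4, 16)
eps = 1e-6

mean = x.mean(dim=-1, keepdim=True)
rstd = torch.rsqrt(x.var(dim=-1, unbiased=False, keepdim=True) + eps)
x_hat = (x - mean) * rstd                        # normalized input
wdy = w * dy                                     # upstream gradient scaled by the affine weight
c1 = (x_hat * wdy).mean(dim=-1, keepdim=True)    # tl.sum(x_hat * wdy) / n_cols
c2 = wdy.mean(dim=-1, keepdim=True)              # tl.sum(wdy) / n_cols
dx = (wdy - (x_hat * c1 + c2)) * rstd            # matches the kernel's dx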