liger-kernel 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (69)
  1. liger_kernel/chunked_loss/README.md +25 -0
  2. liger_kernel/chunked_loss/__init__.py +3 -0
  3. liger_kernel/chunked_loss/cpo_loss.py +18 -8
  4. liger_kernel/chunked_loss/dpo_loss.py +20 -10
  5. liger_kernel/chunked_loss/functional.py +4 -0
  6. liger_kernel/chunked_loss/fused_linear_distillation.py +58 -44
  7. liger_kernel/chunked_loss/fused_linear_preference.py +108 -60
  8. liger_kernel/chunked_loss/fused_linear_rlhf.py +213 -0
  9. liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +246 -0
  10. liger_kernel/chunked_loss/grpo_loss.py +160 -0
  11. liger_kernel/chunked_loss/jsd_loss.py +154 -0
  12. liger_kernel/chunked_loss/kto_loss.py +172 -0
  13. liger_kernel/chunked_loss/orpo_loss.py +8 -9
  14. liger_kernel/chunked_loss/simpo_loss.py +22 -8
  15. liger_kernel/env_report.py +5 -12
  16. liger_kernel/ops/cross_entropy.py +102 -51
  17. liger_kernel/ops/experimental/embedding.py +1 -3
  18. liger_kernel/ops/experimental/mm_int8int2.py +3 -9
  19. liger_kernel/ops/fused_linear_cross_entropy.py +89 -55
  20. liger_kernel/ops/fused_linear_jsd.py +14 -32
  21. liger_kernel/ops/geglu.py +6 -17
  22. liger_kernel/ops/group_norm.py +11 -28
  23. liger_kernel/ops/jsd.py +5 -9
  24. liger_kernel/ops/kl_div.py +8 -11
  25. liger_kernel/ops/layer_norm.py +23 -12
  26. liger_kernel/ops/qwen2vl_mrope.py +8 -25
  27. liger_kernel/ops/rms_norm.py +14 -32
  28. liger_kernel/ops/rope.py +31 -33
  29. liger_kernel/ops/swiglu.py +4 -8
  30. liger_kernel/ops/tvd.py +207 -0
  31. liger_kernel/ops/utils.py +3 -2
  32. liger_kernel/transformers/__init__.py +19 -24
  33. liger_kernel/transformers/auto_model.py +6 -13
  34. liger_kernel/transformers/cross_entropy.py +7 -9
  35. liger_kernel/transformers/experimental/embedding.py +1 -3
  36. liger_kernel/transformers/functional.py +28 -7
  37. liger_kernel/transformers/fused_linear_cross_entropy.py +15 -10
  38. liger_kernel/transformers/geglu.py +1 -4
  39. liger_kernel/transformers/group_norm.py +9 -15
  40. liger_kernel/transformers/jsd.py +1 -3
  41. liger_kernel/transformers/kl_div.py +1 -3
  42. liger_kernel/transformers/layer_norm.py +3 -9
  43. liger_kernel/transformers/model/gemma.py +18 -40
  44. liger_kernel/transformers/model/gemma2.py +19 -41
  45. liger_kernel/transformers/model/llama.py +22 -48
  46. liger_kernel/transformers/model/mistral.py +14 -26
  47. liger_kernel/transformers/model/mixtral.py +24 -54
  48. liger_kernel/transformers/model/mllama.py +16 -36
  49. liger_kernel/transformers/model/olmo2.py +124 -0
  50. liger_kernel/transformers/model/phi3.py +18 -40
  51. liger_kernel/transformers/model/qwen2.py +18 -40
  52. liger_kernel/transformers/model/qwen2_vl.py +36 -32
  53. liger_kernel/transformers/monkey_patch.py +214 -144
  54. liger_kernel/transformers/rms_norm.py +4 -4
  55. liger_kernel/transformers/rope.py +2 -2
  56. liger_kernel/transformers/swiglu.py +2 -8
  57. liger_kernel/transformers/trainer/__init__.py +1 -3
  58. liger_kernel/transformers/trainer/orpo_trainer.py +31 -18
  59. liger_kernel/transformers/tvd.py +13 -0
  60. liger_kernel/triton/__init__.py +1 -3
  61. liger_kernel/triton/monkey_patch.py +1 -3
  62. liger_kernel/utils.py +49 -0
  63. {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.4.dist-info}/METADATA +53 -26
  64. liger_kernel-0.5.4.dist-info/RECORD +74 -0
  65. {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.4.dist-info}/WHEEL +1 -1
  66. liger_kernel-0.5.2.dist-info/RECORD +0 -65
  67. {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.4.dist-info}/LICENSE +0 -0
  68. {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.4.dist-info}/NOTICE +0 -0
  69. {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.4.dist-info}/top_level.txt +0 -0
liger_kernel/ops/cross_entropy.py

@@ -1,11 +1,14 @@
  import operator
+
  from typing import Optional

  import torch
  import triton
  import triton.language as tl

- from liger_kernel.ops.utils import compare_version, element_mul_kernel, is_hip
+ from liger_kernel.ops.utils import compare_version
+ from liger_kernel.ops.utils import element_mul_kernel
+ from liger_kernel.ops.utils import is_hip

  if compare_version("triton", operator.ge, "3.0.0"):
  try:
@@ -17,9 +20,6 @@ if compare_version("triton", operator.ge, "3.0.0"):
  else:
  from triton.language.math import tanh

- _TRUE = tl.constexpr(1)
- _FALSE = tl.constexpr(0)
-

  @triton.jit
  def liger_cross_entropy_kernel(
@@ -27,11 +27,14 @@ def liger_cross_entropy_kernel(
  X_stride,
  Y_ptr,
  Y_stride,
+ weight_ptr,
  loss_ptr,
  z_loss_ptr,
  loss_stride,
  n_cols,
  n_non_ignore,
+ sum_non_ignore_weight,
+ weight_sum,
  ignore_index,
  lse_square_scale: tl.constexpr,
  label_smoothing: tl.constexpr,
@@ -39,6 +42,7 @@ def liger_cross_entropy_kernel(
  softcap,
  RETURN_Z_LOSS: tl.constexpr,
  BLOCK_SIZE: tl.constexpr,
+ HAS_WEIGHT: tl.constexpr,
  HAS_SOFTCAPPING: tl.constexpr,
  ):
  """
@@ -50,18 +54,22 @@ def liger_cross_entropy_kernel(
  X_stride (int): The stride of the input tensor.
  Y_ptr: Pointer to target tensor.
  Y_stride (int): The stride of the target tensor.
+ weight_ptr: Pointer to weight tensor.
  loss_ptr: Pointer to tensor to store the loss.
  z_loss_ptr: Pointer to tensor to store the z loss. No operation if RETURN_Z_LOSS is 0.
  loss_stride (int): The stride of the loss tensor.
  n_cols (int): The number of columns in the input tensor.
- n_non_ignore (int): The number of non-ignored elements in the batch.
+ n_non_ignore (flaot): The number of non-ignored elements in the batch.
+ sum_non_ignore_weight (float): The sum of non-ignored target's weights in the batch.
+ weight_sum (float): The sum of weight tensor.
  ignore_index (int): The index to ignore in the target.
  label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
  lse_square_scale (float): The scaler of (logsumexp(_input)) ^ 2 adding to the loss for the stability of training.
- RETURN_Z_LOSS (int): The boolean value to decide whether storing z loss to z_loss_ptr or not. It must be 0 or 1.
  reduction (str): The string for the reduction to apply
  softcap (float): The upper threshold for scaling logits to the range (-softcap, +softcap).
+ RETURN_Z_LOSS (int): The boolean value to decide whether storing z loss to z_loss_ptr or not. It must be 0 or 1.
  BLOCK_SIZE (int): The block size for Triton operations.
+ HAS_WEIGHT (bool): The boolean value to determine whether assigning weight to each of the classes.
  HAS_SOFTCAPPING (bool): The boolean value to determine whether applying soft-capping or not.
  """

@@ -84,7 +92,11 @@ def liger_cross_entropy_kernel(
  return

  loss_ptr += program_id * loss_stride
- z_loss_ptr += program_id * loss_stride
+ if RETURN_Z_LOSS:
+ z_loss_ptr += program_id * loss_stride
+
+ if HAS_WEIGHT:
+ weight_y = tl.load(weight_ptr + y).cast(tl.float32)

  # Online softmax: 2 loads + 1 store (compared with 3 loads + 1 store for the safe softmax)
  # Refer to Algorithm 3 in the paper: https://arxiv.org/pdf/1805.02867
@@ -92,9 +104,7 @@ def liger_cross_entropy_kernel(
  # 3. [Online softmax] first pass: find max + sum
  m = float("-inf") # m is the max value. use the notation from the paper
  d = 0.0 # d is the sum. use the notation from the paper
- ori_X_y = tl.load(X_ptr + y).cast(
- tl.float32
- ) # we need to store the original value of X_y for the loss calculation
+ ori_X_y = tl.load(X_ptr + y).cast(tl.float32) # we need to store the original value of X_y for the loss calculation
  if HAS_SOFTCAPPING:
  ori_X_y = softcap * tanh(ori_X_y / softcap)

@@ -116,7 +126,11 @@ def liger_cross_entropy_kernel(
  block_max = tl.max(X_block)
  if label_smoothing > 0:
  # scale X beforehand to avoid overflow
- scaled_x_sum += tl.sum(tl.where(X_offsets < n_cols, -eps * X_block, 0.0))
+ if HAS_WEIGHT:
+ weight_block = tl.load(weight_ptr + X_offsets, mask=X_offsets < n_cols)
+ scaled_x_sum += tl.sum(tl.where(X_offsets < n_cols, -eps * X_block * weight_block, 0.0))
+ else:
+ scaled_x_sum += tl.sum(tl.where(X_offsets < n_cols, -eps * X_block, 0.0))
  m_new = tl.maximum(m, block_max)
  d = d * tl.exp(m - m_new) + tl.sum(tl.exp(X_block - m_new))
  m = m_new
@@ -152,18 +166,41 @@ def liger_cross_entropy_kernel(
  if HAS_SOFTCAPPING:
  intermediate = tanh(X_block / softcap)
  X_block = softcap * intermediate
- # softmax(x_i)
- X_block = tl.exp(X_block - m) / d
- # derivative of z-loss: 2 * lse_square_scale * lse * softmax(x_i)
- X_block += 2 * lse_square_scale * lse * X_block
- # smoothing term
- X_block += -eps
- # special handle dx_y
- X_block = tl.where(X_offsets != y, X_block, X_block - (1 - label_smoothing))
- # reduction scale
- if reduction == "mean":
- X_block = X_block / (n_non_ignore)
- # chain rule
+
+ if not HAS_WEIGHT:
+ # softmax(x_i)
+ X_block = tl.exp(X_block - m) / d
+ # derivative of z-loss: 2 * lse_square_scale * lse * softmax(x_i)
+ X_block += 2 * lse_square_scale * lse * X_block
+ # smoothing term
+ X_block += -eps
+ # special handle dx_y
+ X_block = tl.where(X_offsets != y, X_block, X_block - (1 - label_smoothing))
+ # reduction scale
+ if reduction == "mean":
+ X_block = X_block / n_non_ignore
+ else:
+ weight_block = tl.load(weight_ptr + X_offsets, mask=X_offsets < n_cols)
+ softmax_X = tl.exp(X_block - m) / d
+ # derivative of original_loss
+ dloss_ori = (1 - label_smoothing) * softmax_X
+ # specially handle dx_y
+ dloss_ori = tl.where(X_offsets != y, dloss_ori, dloss_ori - (1 - label_smoothing))
+ dloss_ori = dloss_ori * weight_y
+ # derivative of smooth_loss
+ dloss_smooth = eps * (-weight_block + softmax_X * weight_sum)
+ # derivative of z-loss
+ dz_loss = 2 * lse_square_scale * lse * softmax_X
+ # reduction scale
+ if reduction == "mean":
+ dloss_ori = dloss_ori / sum_non_ignore_weight
+ dloss_smooth = dloss_smooth / sum_non_ignore_weight
+ # TODO: Implement weighted z_loss. Currently, z_loss is not scaled by weight.
+ dz_loss = dz_loss / n_non_ignore
+ # derivative of total_loss
+ X_block = dloss_ori + dloss_smooth + dz_loss
+
+ # chain rule softcapping
  # d(softcap * tanh(x / softcap)) = (1 - tanh^2(x / softcap))
  if HAS_SOFTCAPPING:
  X_block = X_block * (1 - intermediate * intermediate)
@@ -182,6 +219,8 @@ def liger_cross_entropy_kernel(
  # sum(e ^ (X - max(X))) must >= 1 because the max term is e ^ 0 = 1
  # So we can safely calculate log (softmax(X_y)) without overflow
  loss = lse - ori_X_y
+ if HAS_WEIGHT:
+ loss = weight_y * loss

  # Original loss = H(q, p), with label smoothing regularization = H(q', p) and (label_smoothing / V) = eps
  # H(q', p) = (1 - label_smoothing) * H(q, p) + label_smoothing * H(u, p)
@@ -192,20 +231,27 @@ def liger_cross_entropy_kernel(
  # pytorch: https://github.com/pytorch/pytorch/blob/2981534f54d49fa3a9755c9b0855e7929c2527f0/aten/src/ATen/native/LossNLL.cpp#L516
  # See full derivation at https://github.com/linkedin/Liger-Kernel/pull/198#issuecomment-2333753087
  if label_smoothing > 0:
- smooth_loss = scaled_x_sum + label_smoothing * lse
+ if HAS_WEIGHT:
+ smooth_loss = scaled_x_sum + eps * lse * weight_sum
+ else:
+ smooth_loss = scaled_x_sum + label_smoothing * lse
  loss = loss * (1 - label_smoothing) + smooth_loss

  # An auxiliary loss, z_loss
  # Refer to Page14 Loss function section in the paper PaLM: https://www.jmlr.org/papers/v24/22-1144.html
  z_loss = lse_square_scale * lse * lse
- loss += z_loss
  # Normalize the loss by the number of non-ignored elements if reduction is "mean"
  if reduction == "mean":
+ if HAS_WEIGHT:
+ loss = loss / sum_non_ignore_weight
+ else:
+ loss = loss / n_non_ignore
+ # TODO: Implement weighted z_loss. Currently, z_loss is not scaled by weight.
  z_loss = z_loss / n_non_ignore
- loss = loss / n_non_ignore
+ loss += z_loss

  tl.store(loss_ptr, loss)
- if RETURN_Z_LOSS == _TRUE:
+ if RETURN_Z_LOSS:
  tl.store(z_loss_ptr, z_loss)

@@ -215,15 +261,10 @@ def liger_cross_entropy_kernel(
  MAX_FUSED_SIZE = 65536 // 2 # the best size we found by manually tuning


- _bool_to_return_z_loss = {
- True: _TRUE.value,
- False: _FALSE.value,
- }
-
-
  def cross_entropy_forward(
  _input,
  target,
+ weight,
  ignore_index,
  lse_square_scale,
  label_smoothing,
@@ -231,15 +272,7 @@ def cross_entropy_forward(
  softcap,
  return_z_loss,
  ):
- if not isinstance(return_z_loss, int):
- assert (
- return_z_loss in _bool_to_return_z_loss
- ), f"return_z_loss must be True or False. Got: {return_z_loss}"
- return_z_loss = _bool_to_return_z_loss[return_z_loss]
- else:
- assert (
- return_z_loss in _bool_to_return_z_loss
- ), f"return_z_loss must be True or False. Got: {return_z_loss}"
+ assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"

  BT, V = _input.shape
  n_rows = BT
@@ -248,12 +281,22 @@ def cross_entropy_forward(

  # unreduced loss
  loss_1d = torch.zeros(n_rows, dtype=_input.dtype, device=_input.device)
- if return_z_loss == _TRUE.value:
- z_loss_1d = torch.zeros(n_rows, dtype=_input.dtype, device=_input.device)
- else:
- z_loss_1d = loss_1d # dummy ptr when return_z_loss == False
-
- n_non_ignore = (target != ignore_index).sum().item()
+ z_loss_1d = torch.zeros(n_rows, dtype=_input.dtype, device=_input.device) if return_z_loss else None
+
+ target_mask = target != ignore_index
+ n_non_ignore = target_mask.sum().item()
+ sum_non_ignore_weight = n_non_ignore
+ weight_sum = 0.0
+ if weight is not None:
+ assert weight.shape[0] == V, f"If given, weight has to be a Tensor of size V. Got: {weight.shape}"
+ assert torch.is_floating_point(weight), (
+ f"If given, weight has to be a Tensor of floating point dtype. Got: {weight.dtype}"
+ )
+ sum_non_ignore_weight = torch.gather(weight, dim=0, index=target.masked_select(target_mask)).sum().item()
+ weight_sum = weight.sum().item()
+ # ensure weight is contiguous
+ if weight.stride(-1) != 1:
+ weight = weight.contiguous()

  # ensure _input and target are contiguous in the last dimension
  if _input.stride(-1) != 1:
@@ -267,18 +310,22 @@ def cross_entropy_forward(
  X_stride=_input.stride(-2),
  Y_ptr=target,
  Y_stride=target.stride(-1), # always 1
+ weight_ptr=weight, # dummy if None
  loss_ptr=loss_1d,
  z_loss_ptr=z_loss_1d,
  loss_stride=loss_1d.stride(-1), # always 1
  n_cols=V,
  n_non_ignore=n_non_ignore,
+ sum_non_ignore_weight=sum_non_ignore_weight,
  ignore_index=ignore_index,
+ weight_sum=weight_sum,
  lse_square_scale=lse_square_scale,
  label_smoothing=label_smoothing,
  reduction=reduction,
- softcap=softcap if softcap is not None else 0.0,
+ softcap=softcap,
  RETURN_Z_LOSS=return_z_loss,
  BLOCK_SIZE=BLOCK_SIZE,
+ HAS_WEIGHT=True if weight is not None else False,
  HAS_SOFTCAPPING=True if softcap is not None else False,
  # TODO: 32 seems to give the best performance
  # Performance is quite sensitive to num_warps
@@ -287,10 +334,10 @@ def cross_entropy_forward(

  if reduction == "none":
  loss = loss_1d
- z_loss = z_loss_1d if return_z_loss == _TRUE.value else None
+ z_loss = z_loss_1d if return_z_loss else None
  else:
  loss = torch.sum(loss_1d)
- z_loss = torch.sum(z_loss_1d) if return_z_loss == _TRUE.value else None
+ z_loss = torch.sum(z_loss_1d) if return_z_loss else None

  return loss, z_loss, _input

@@ -330,6 +377,7 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
  ctx,
  _input: torch.Tensor,
  target: torch.Tensor,
+ weight: Optional[torch.FloatTensor],
  ignore_index: int = -100,
  lse_square_scale: float = 0.0,
  label_smoothing: float = 0.0,
@@ -344,6 +392,7 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
  ctx : The context object.
  _input (tensor): The input tensor of shape (BT, V) where B is batch size, T is sequence length, V is vocab size.
  target (tensor): The target tensor of shape (BT) where each value is in [0, V-1].
+ weight(Tensor, optional): a manual rescaling weight given to each class. If given, has to be a Tensor of size V and floating point dtype
  ignore_index (int): The index to ignore in the target.
  lse_square_scale (float): The scaler of (logsumexp(_input)) ^ 2 adding to the loss for the stability of training.
  label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
@@ -357,6 +406,7 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
  loss, z_loss, _input = cross_entropy_forward(
  _input,
  target,
+ weight,
  ignore_index,
  lse_square_scale,
  label_smoothing,
@@ -398,4 +448,5 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
  None,
  None,
  None,
+ None,
  )
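
The cross_entropy.py changes above add optional per-class weighting (`weight_ptr` / `HAS_WEIGHT`) and replace the constexpr `_TRUE`/`_FALSE` flags with a plain `return_z_loss` bool. A minimal sketch of how the updated `LigerCrossEntropyFunction` might be called, assuming the positional order of the `forward` signature shown in the diff (the trailing `reduction`/`softcap`/`return_z_loss` arguments, shapes, and values here are illustrative, and a CUDA device with Triton is assumed):

```python
import torch

from liger_kernel.ops.cross_entropy import LigerCrossEntropyFunction

# Illustrative sizes: 8 rows (B*T) over a 128-class vocabulary (V).
logits = torch.randn(8, 128, device="cuda", requires_grad=True)
target = torch.randint(0, 128, (8,), device="cuda")
class_weight = torch.rand(128, device="cuda")  # per-class rescaling weight of size V

# Positional args assumed to follow forward(ctx, _input, target, weight,
# ignore_index, lse_square_scale, label_smoothing, reduction, softcap, return_z_loss).
loss, z_loss = LigerCrossEntropyFunction.apply(
    logits, target, class_weight, -100, 0.0, 0.0, "mean", None, False
)
loss.backward()  # z_loss is None here because return_z_loss=False
```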
liger_kernel/ops/experimental/embedding.py

@@ -34,9 +34,7 @@ def embedding_forward_kernel(
  )

  output_offsets = offsets_m[:, None] * embedding_dim + offsets_n[None, :]
- tl.store(
- output_ptr + output_offsets, embeddings, mask=mask_m[:, None] & mask_n[None, :]
- )
+ tl.store(output_ptr + output_offsets, embeddings, mask=mask_m[:, None] & mask_n[None, :])


  @triton.jit
liger_kernel/ops/experimental/mm_int8int2.py

@@ -37,9 +37,7 @@ def pack_weights(intweights: torch.Tensor, bits: int = 2) -> torch.Tensor:
  else:
  packed_tensor_shape = (row_dim, *original_shape[1:])

- packed = torch.zeros(
- packed_tensor_shape, device=intweights.device, dtype=torch.uint8
- )
+ packed = torch.zeros(packed_tensor_shape, device=intweights.device, dtype=torch.uint8)
  unpacked = intweights.to(torch.uint8)

  def lshift(t: torch.Tensor, bits: int):
@@ -327,17 +325,13 @@ def matmul_kernel(


  def matmul(a, b):
- assert (
- a.shape[1] == b.shape[0] * 4
- ), "Incompatible dimensions, the weight matrix need to be packed"
+ assert a.shape[1] == b.shape[0] * 4, "Incompatible dimensions, the weight matrix need to be packed"
  assert a.is_contiguous(), "Matrix A must be contiguous"
  M, K = a.shape
  _, N = b.shape
  # c is in int32 to avoid any overflows or underflows
  c = torch.empty((M, N), device=a.device, dtype=torch.int32)
- grid = lambda META: (
- triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
- )
+ grid = lambda META: (triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),)
  matmul_kernel[grid](
  a,
  b,
liger_kernel/ops/fused_linear_cross_entropy.py

@@ -2,12 +2,10 @@ import torch
  import triton

  from liger_kernel.ops.cross_entropy import liger_cross_entropy_kernel
- from liger_kernel.ops.utils import (
- amp_custom_bwd,
- amp_custom_fwd,
- element_mul_kernel,
- is_hip,
- )
+ from liger_kernel.ops.utils import amp_custom_bwd
+ from liger_kernel.ops.utils import amp_custom_fwd
+ from liger_kernel.ops.utils import element_mul_kernel
+ from liger_kernel.ops.utils import is_hip

  # The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
  # However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
@@ -19,13 +17,16 @@ def fused_linear_cross_entropy_forward(
  _input,
  weight,
  target,
+ ce_weight=None,
  bias=None,
  ignore_index=-100,
  lse_square_scale=0.0,
  label_smoothing=0.0,
  reduction="mean",
  softcap=None,
+ return_z_loss=False,
  ):
+ assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
  device = _input.device

  # inputs have shape: BT x H
@@ -40,21 +41,32 @@ def fused_linear_cross_entropy_forward(
  BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))

  inc_factor = triton.cdiv(V, H) # (V + H - 1) // H
- chunk_size = triton.next_power_of_2(
- triton.cdiv(BT, inc_factor)
- ) # (BT + inc_factor - 1) // inc_factor
+ chunk_size = triton.next_power_of_2(triton.cdiv(BT, inc_factor)) # (BT + inc_factor - 1) // inc_factor
  num_chunks = triton.cdiv(BT, chunk_size) # (BT + chunk_size - 1) // chunk_size

- grad_weight = (
- torch.zeros_like(weight, device=device) if weight.requires_grad else None
- )
+ grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
  grad_input = torch.zeros_like(_input, device=device)
  grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
  # we use fp32 for loss accumulator
  loss_1d = torch.zeros(BT, dtype=torch.float32, device=device)
-
- # NOTE: skip .item() here to avoid CUDA synchronization
- total_n_non_ignore = (target != ignore_index).sum()
+ z_loss_1d = torch.zeros(BT, dtype=_input.dtype, device=_input.device) if return_z_loss else None
+
+ # TODO: evaluate how CUDA synchronization caused by .item() affects the speed
+ target_mask = target != ignore_index
+ total_n_non_ignore = target_mask.sum().item()
+ total_sum_non_ignore_ce_weight = total_n_non_ignore
+ ce_weight_sum = 0.0
+ if ce_weight is not None:
+ assert ce_weight.shape[0] == V, f"If given, weight has to be a Tensor of size V. Got: {ce_weight.shape}"
+ assert torch.is_floating_point(ce_weight), (
+ f"If given, weight has to be a Tensor of floating point dtype. Got: {ce_weight.dtype}"
+ )
+ total_sum_non_ignore_ce_weight = (
+ torch.gather(ce_weight, dim=0, index=target.masked_select(target_mask)).sum().item()
+ )
+ ce_weight_sum = ce_weight.sum().item()
+ if ce_weight.stride(-1) != 1:
+ ce_weight = ce_weight.contiguous()

  for chunk_id in range(num_chunks):
  start_idx = chunk_id * chunk_size
@@ -65,13 +77,14 @@ def fused_linear_cross_entropy_forward(
  logits_chunk = _input_chunk @ weight.t() # chunk_size x V
  if bias is not None:
  logits_chunk = logits_chunk + bias
+
  target_chunk = target[start_idx:end_idx] # chunk_size,

  n_rows = logits_chunk.shape[0]

  # unreduced loss
  loss_1d_slice = loss_1d[start_idx:end_idx] # chunk_size,
- n_non_ignore = (target_chunk != ignore_index).sum().item()
+ z_loss_1d_slice = z_loss_1d[start_idx:end_idx] if return_z_loss else None

  # ensure _input and target are contiguous
  logits_chunk = logits_chunk.contiguous()
@@ -83,45 +96,42 @@ def fused_linear_cross_entropy_forward(
  X_stride=logits_chunk.stride(-2),
  Y_ptr=target_chunk,
  Y_stride=target_chunk.stride(-1), # always 1
+ weight_ptr=ce_weight,
  loss_ptr=loss_1d_slice,
- z_loss_ptr=loss_1d_slice, # dummy ptr, not used
+ z_loss_ptr=z_loss_1d_slice,
  loss_stride=loss_1d_slice.stride(-1), # always 1
  n_cols=V,
- n_non_ignore=n_non_ignore,
+ n_non_ignore=total_n_non_ignore,
+ sum_non_ignore_weight=total_sum_non_ignore_ce_weight,
+ weight_sum=ce_weight_sum,
  ignore_index=ignore_index,
  lse_square_scale=lse_square_scale,
  label_smoothing=label_smoothing,
  reduction=reduction,
- softcap=softcap if softcap is not None else 0.0,
- RETURN_Z_LOSS=0, # False
+ softcap=softcap,
+ RETURN_Z_LOSS=return_z_loss,
+ HAS_WEIGHT=True if ce_weight is not None else False,
  HAS_SOFTCAPPING=True if softcap is not None else False,
  BLOCK_SIZE=BLOCK_SIZE,
  num_warps=32 if not is_hip() else 16,
  )

- # gradient of logits_chunk is computed in-place by the above triton kernel and is of shape: chunk_size x V
- # thus grad_input[start_idx: end_idx] should be of shape: chunk_size x H
- # additionally, since we are chunking the inputs, observe that the loss and gradients are calculated only
- # on `n_non_ignore` tokens. However, the gradient of the input should be calculated for all tokens.
- # Thus, we need an additional scaling factor of (n_non_ignore/total_n_non_ignore) to scale the gradients.
-
- if reduction == "mean":
- alpha = n_non_ignore / total_n_non_ignore if total_n_non_ignore > 0 else 0.0
- else:
- alpha = 1.0
-
- loss_1d[start_idx:end_idx] = loss_1d_slice * alpha
- grad_logits_chunk = logits_chunk * alpha # chunk_size x V
+ loss_1d[start_idx:end_idx] = loss_1d_slice
+ if return_z_loss:
+ z_loss_1d[start_idx:end_idx] = z_loss_1d_slice
+ grad_logits_chunk = logits_chunk # chunk_size x V

  grad_input[start_idx:end_idx] = grad_logits_chunk @ weight

  if grad_weight is not None:
  torch.addmm(
  input=grad_weight,
- mat1=logits_chunk.t(),
+ mat1=logits_chunk.t().to(
+ _input_chunk.dtype
+ ), # In an autocast scenario without bias, differing logits_chunk data types will cause an addmm operation error.
  mat2=_input_chunk,
  out=grad_weight,
- alpha=alpha,
+ alpha=1.0,
  beta=1.0,
  )

@@ -130,18 +140,22 @@ def fused_linear_cross_entropy_forward(
  input=grad_bias,
  other=logits_chunk.sum(dim=0),
  out=grad_bias,
- alpha=alpha,
+ alpha=1.0,
  )

- loss = torch.sum(loss_1d)
- return loss, grad_input, grad_weight, grad_bias
+ if reduction == "none":
+ loss = loss_1d
+ z_loss = z_loss_1d if return_z_loss else None

+ else:
+ loss = torch.sum(loss_1d)
+ z_loss = torch.sum(z_loss_1d) if return_z_loss else None
+ return loss, z_loss, grad_input, grad_weight, grad_bias

- def fused_linear_cross_entropy_backward(
- grad_output, grad_input, grad_weight, grad_bias
- ):
+
+ def fused_linear_cross_entropy_backward(grad_output, grad_input, grad_weight, grad_bias):
  # If cross entropy is the last layer, grad_output is 1.0. Skip the mul to save time
- if torch.ne(grad_output, torch.tensor(1.0, device=grad_output.device)):
+ if not torch.equal(grad_output, torch.tensor(1.0, device=grad_output.device)):
  # We use a Triton kernel instead of a PyTorch operation because modifying inputs in-place
  # for gradient storage and backward multiple times causes anomalies with PyTorch but not with Triton.
  BT, H = grad_input.shape
@@ -195,11 +209,13 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
  weight,
  target,
  bias=None,
+ ce_weight=None,
  ignore_index=-100,
  lse_square_scale=0.0,
  label_smoothing=0.0,
  reduction="mean",
  softcap=None,
+ return_z_loss: bool = False,
  ):
  """
  Fusing the last linear layer with cross-entropy loss
@@ -214,21 +230,24 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
  target: (B*T) where each value is in [0, V-1]
  weight: (V, H) where V is the number of classes
  bias: (V) where V is the number of classes
+ ce_weight: a manual rescaling weight given to each class. If given, has to be a Tensor of size V and floating point dtype
  ignore_index: the index to ignore in the target
  label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
  reduction: reduction to apply
  """

- loss, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
- _input,
- weight,
- target,
- bias,
- ignore_index,
- lse_square_scale,
- label_smoothing,
- reduction,
- softcap,
+ loss, z_loss, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
+ _input=_input,
+ weight=weight,
+ target=target,
+ bias=bias,
+ ce_weight=ce_weight,
+ ignore_index=ignore_index,
+ lse_square_scale=lse_square_scale,
+ label_smoothing=label_smoothing,
+ reduction=reduction,
+ softcap=softcap,
+ return_z_loss=return_z_loss,
  )
  # downcast to dtype and store for backward
  ctx.save_for_backward(
@@ -236,13 +255,28 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
  grad_weight.detach() if grad_weight is not None else None,
  grad_bias.detach() if bias is not None else None,
  )
- return loss
+ ctx.return_z_loss = return_z_loss
+ return loss, z_loss

  @staticmethod
  @amp_custom_bwd
- def backward(ctx, grad_output):
+ def backward(ctx, grad_output, grad_output2):
+ if ctx.return_z_loss:
+ del grad_output2 # z_loss is only for logging
  (grad_input, grad_weight, grad_bias) = ctx.saved_tensors
  grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_backward(
  grad_output, grad_input, grad_weight, grad_bias
  )
- return (grad_input, grad_weight, None, grad_bias, None, None, None, None, None)
+ return (
+ grad_input,
+ grad_weight,
+ None,
+ grad_bias,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ )
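
Taken together, the fused_linear_cross_entropy.py changes thread a new `ce_weight` tensor and a `return_z_loss` flag through `fused_linear_cross_entropy_forward`, and `forward` now returns a `(loss, z_loss)` pair with a matching extra gradient slot in `backward`. A minimal sketch of a call against the updated `LigerFusedLinearCrossEntropyFunction`, assuming the positional order of the `forward` signature shown above (shapes, values, and the helper names are illustrative; a CUDA device with Triton is assumed):

```python
import torch

from liger_kernel.ops.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyFunction

B_T, H, V = 8, 64, 128  # illustrative sizes: B*T rows, hidden size H, vocab size V
hidden = torch.randn(B_T, H, device="cuda", requires_grad=True)
lm_head_weight = torch.randn(V, H, device="cuda", requires_grad=True)
target = torch.randint(0, V, (B_T,), device="cuda")
ce_weight = torch.rand(V, device="cuda")  # per-class rescaling weight of size V

# Positional args assumed to follow forward(ctx, _input, weight, target, bias,
# ce_weight, ignore_index, lse_square_scale, label_smoothing, reduction, softcap, return_z_loss).
loss, z_loss = LigerFusedLinearCrossEntropyFunction.apply(
    hidden, lm_head_weight, target, None, ce_weight, -100, 0.0, 0.0, "mean", None, True
)
loss.backward()  # per the diff, z_loss is only for logging and receives no gradient
```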