liger-kernel-nightly 0.5.2.dev20241228022953__py3-none-any.whl → 0.5.2.dev20241229131950__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- liger_kernel/ops/cross_entropy.py +88 -19
- liger_kernel/ops/fused_linear_cross_entropy.py +54 -30
- liger_kernel/transformers/cross_entropy.py +3 -0
- liger_kernel/transformers/functional.py +3 -0
- liger_kernel/transformers/fused_linear_cross_entropy.py +3 -0
- {liger_kernel_nightly-0.5.2.dev20241228022953.dist-info → liger_kernel_nightly-0.5.2.dev20241229131950.dist-info}/METADATA +1 -1
- {liger_kernel_nightly-0.5.2.dev20241228022953.dist-info → liger_kernel_nightly-0.5.2.dev20241229131950.dist-info}/RECORD +11 -11
- {liger_kernel_nightly-0.5.2.dev20241228022953.dist-info → liger_kernel_nightly-0.5.2.dev20241229131950.dist-info}/LICENSE +0 -0
- {liger_kernel_nightly-0.5.2.dev20241228022953.dist-info → liger_kernel_nightly-0.5.2.dev20241229131950.dist-info}/NOTICE +0 -0
- {liger_kernel_nightly-0.5.2.dev20241228022953.dist-info → liger_kernel_nightly-0.5.2.dev20241229131950.dist-info}/WHEEL +0 -0
- {liger_kernel_nightly-0.5.2.dev20241228022953.dist-info → liger_kernel_nightly-0.5.2.dev20241229131950.dist-info}/top_level.txt +0 -0
liger_kernel/ops/cross_entropy.py

@@ -30,11 +30,14 @@ def liger_cross_entropy_kernel(
     X_stride,
     Y_ptr,
     Y_stride,
+    weight_ptr,
     loss_ptr,
     z_loss_ptr,
     loss_stride,
     n_cols,
     n_non_ignore,
+    sum_non_ignore_weight,
+    weight_sum,
     ignore_index,
     lse_square_scale: tl.constexpr,
     label_smoothing: tl.constexpr,
@@ -42,6 +45,7 @@ def liger_cross_entropy_kernel(
     softcap,
     RETURN_Z_LOSS: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
+    HAS_WEIGHT: tl.constexpr,
     HAS_SOFTCAPPING: tl.constexpr,
 ):
     """
@@ -53,18 +57,22 @@ def liger_cross_entropy_kernel(
         X_stride (int): The stride of the input tensor.
         Y_ptr: Pointer to target tensor.
         Y_stride (int): The stride of the target tensor.
+        weight_ptr: Pointer to weight tensor.
         loss_ptr: Pointer to tensor to store the loss.
         z_loss_ptr: Pointer to tensor to store the z loss. No operation if RETURN_Z_LOSS is 0.
         loss_stride (int): The stride of the loss tensor.
         n_cols (int): The number of columns in the input tensor.
-        n_non_ignore (
+        n_non_ignore (flaot): The number of non-ignored elements in the batch.
+        sum_non_ignore_weight (float): The sum of non-ignored target's weights in the batch.
+        weight_sum (float): The sum of weight tensor.
         ignore_index (int): The index to ignore in the target.
         label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
         lse_square_scale (float): The scaler of (logsumexp(_input)) ^ 2 adding to the loss for the stability of training.
-        RETURN_Z_LOSS (int): The boolean value to decide whether storing z loss to z_loss_ptr or not. It must be 0 or 1.
         reduction (str): The string for the reduction to apply
         softcap (float): The upper threshold for scaling logits to the range (-softcap, +softcap).
+        RETURN_Z_LOSS (int): The boolean value to decide whether storing z loss to z_loss_ptr or not. It must be 0 or 1.
         BLOCK_SIZE (int): The block size for Triton operations.
+        HAS_WEIGHT (bool): The boolean value to determine whether assigning weight to each of the classes.
         HAS_SOFTCAPPING (bool): The boolean value to determine whether applying soft-capping or not.
     """

@@ -89,6 +97,9 @@ def liger_cross_entropy_kernel(
     loss_ptr += program_id * loss_stride
     z_loss_ptr += program_id * loss_stride

+    if HAS_WEIGHT:
+        weight_y = tl.load(weight_ptr + y).cast(tl.float32)
+
     # Online softmax: 2 loads + 1 store (compared with 3 loads + 1 store for the safe softmax)
     # Refer to Algorithm 3 in the paper: https://arxiv.org/pdf/1805.02867

@@ -117,7 +128,11 @@ def liger_cross_entropy_kernel(
         block_max = tl.max(X_block)
         if label_smoothing > 0:
             # scale X beforehand to avoid overflow
-
+            if HAS_WEIGHT:
+                weight_block = tl.load(weight_ptr + X_offsets, mask=X_offsets < n_cols)
+                scaled_x_sum += tl.sum(tl.where(X_offsets < n_cols, -eps * X_block * weight_block, 0.0))
+            else:
+                scaled_x_sum += tl.sum(tl.where(X_offsets < n_cols, -eps * X_block, 0.0))
         m_new = tl.maximum(m, block_max)
         d = d * tl.exp(m - m_new) + tl.sum(tl.exp(X_block - m_new))
         m = m_new
@@ -153,18 +168,41 @@ def liger_cross_entropy_kernel(
         if HAS_SOFTCAPPING:
             intermediate = tanh(X_block / softcap)
             X_block = softcap * intermediate
-
-
-
-
-
-
-
-
-
-
-
-
+
+        if not HAS_WEIGHT:
+            # softmax(x_i)
+            X_block = tl.exp(X_block - m) / d
+            # derivative of z-loss: 2 * lse_square_scale * lse * softmax(x_i)
+            X_block += 2 * lse_square_scale * lse * X_block
+            # smoothing term
+            X_block += -eps
+            # special handle dx_y
+            X_block = tl.where(X_offsets != y, X_block, X_block - (1 - label_smoothing))
+            # reduction scale
+            if reduction == "mean":
+                X_block = X_block / n_non_ignore
+        else:
+            weight_block = tl.load(weight_ptr + X_offsets, mask=X_offsets < n_cols)
+            softmax_X = tl.exp(X_block - m) / d
+            # derivative of original_loss
+            dloss_ori = (1 - label_smoothing) * softmax_X
+            # specially handle dx_y
+            dloss_ori = tl.where(X_offsets != y, dloss_ori, dloss_ori - (1 - label_smoothing))
+            dloss_ori = dloss_ori * weight_y
+            # derivative of smooth_loss
+            dloss_smooth = eps * (-weight_block + softmax_X * weight_sum)
+            # derivative of z-loss
+            dz_loss = 2 * lse_square_scale * lse * softmax_X
+            # reduction scale
+            if reduction == "mean":
+                dloss_ori = dloss_ori / sum_non_ignore_weight
+                dloss_smooth = dloss_smooth / sum_non_ignore_weight
+                # TODO: Implement weighted z_loss. Currently, z_loss is not scaled by weight.
+                dz_loss = dz_loss / n_non_ignore
+            # derivative of total_loss
+            X_block = dloss_ori + dloss_smooth + dz_loss
+
+        # chain rule softcapping
         # d(softcap * tanh(x / softcap)) = (1 - tanh^2(x / softcap))
         if HAS_SOFTCAPPING:
             X_block = X_block * (1 - intermediate * intermediate)
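The weighted gradient branch above mirrors the gradient of PyTorch's class-weighted cross entropy. As a sanity check (my own sketch, not part of the diff), the snippet below verifies the same formula, weight_y * (softmax(x) - one_hot(y)) normalized by the summed weights of the non-ignored targets, against autograd for the plain case with no label smoothing, z-loss, or softcapping:

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
B, V = 4, 10
logits = torch.randn(B, V, requires_grad=True)
target = torch.randint(0, V, (B,))
weight = torch.rand(V)  # per-class rescaling weights

loss = F.cross_entropy(logits, target, weight=weight, reduction="mean")
loss.backward()

# dloss/dx_j = w_y * (softmax(x)_j - 1[j == y]) / sum_i w_{y_i}
p = torch.softmax(logits.detach(), dim=-1)
one_hot = F.one_hot(target, V).to(p.dtype)
manual = weight[target].unsqueeze(1) * (p - one_hot) / weight[target].sum()
print(torch.allclose(logits.grad, manual, atol=1e-6))  # True
```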
@@ -183,6 +221,8 @@ def liger_cross_entropy_kernel(
     # sum(e ^ (X - max(X))) must >= 1 because the max term is e ^ 0 = 1
     # So we can safely calculate log (softmax(X_y)) without overflow
     loss = lse - ori_X_y
+    if HAS_WEIGHT:
+        loss = weight_y * loss

     # Original loss = H(q, p), with label smoothing regularization = H(q', p) and (label_smoothing / V) = eps
     # H(q', p) = (1 - label_smoothing) * H(q, p) + label_smoothing * H(u, p)
@@ -193,17 +233,24 @@ def liger_cross_entropy_kernel(
     # pytorch: https://github.com/pytorch/pytorch/blob/2981534f54d49fa3a9755c9b0855e7929c2527f0/aten/src/ATen/native/LossNLL.cpp#L516
     # See full derivation at https://github.com/linkedin/Liger-Kernel/pull/198#issuecomment-2333753087
     if label_smoothing > 0:
-
+        if HAS_WEIGHT:
+            smooth_loss = scaled_x_sum + eps * lse * weight_sum
+        else:
+            smooth_loss = scaled_x_sum + label_smoothing * lse
         loss = loss * (1 - label_smoothing) + smooth_loss

     # An auxiliary loss, z_loss
     # Refer to Page14 Loss function section in the paper PaLM: https://www.jmlr.org/papers/v24/22-1144.html
     z_loss = lse_square_scale * lse * lse
-    loss += z_loss
     # Normalize the loss by the number of non-ignored elements if reduction is "mean"
     if reduction == "mean":
+        if HAS_WEIGHT:
+            loss = loss / sum_non_ignore_weight
+        else:
+            loss = loss / n_non_ignore
+        # TODO: Implement weighted z_loss. Currently, z_loss is not scaled by weight.
         z_loss = z_loss / n_non_ignore
-
+    loss += z_loss

     tl.store(loss_ptr, loss)
     if RETURN_Z_LOSS == _TRUE:
@@ -225,6 +272,7 @@ _bool_to_return_z_loss = {
 def cross_entropy_forward(
     _input,
     target,
+    weight,
     ignore_index,
     lse_square_scale,
     label_smoothing,
@@ -250,7 +298,20 @@ def cross_entropy_forward(
     else:
         z_loss_1d = loss_1d  # dummy ptr when return_z_loss == False

-
+    target_mask = target != ignore_index
+    n_non_ignore = target_mask.sum().item()
+    sum_non_ignore_weight = n_non_ignore
+    weight_sum = 0.0
+    if weight is not None:
+        assert weight.shape[0] == V, f"If given, weight has to be a Tensor of size V. Got: {weight.shape}"
+        assert torch.is_floating_point(
+            weight
+        ), f"If given, weight has to be a Tensor of floating point dtype. Got: {weight.dtype}"
+        sum_non_ignore_weight = torch.gather(weight, dim=0, index=target.masked_select(target_mask)).sum().item()
+        weight_sum = weight.sum().item()
+        # ensure weight is contiguous
+        if weight.stride(-1) != 1:
+            weight = weight.contiguous()

     # ensure _input and target are contiguous in the last dimension
     if _input.stride(-1) != 1:
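For reference, the torch.gather expression above is simply a lookup of each surviving target's class weight; a small standalone check (my own example, not from the diff):

```python
import torch

weight = torch.rand(6)                  # per-class weights, V = 6
target = torch.tensor([1, 3, -100, 5])  # -100 is the ignore_index
target_mask = target != -100

gathered = torch.gather(weight, dim=0, index=target.masked_select(target_mask))
assert torch.equal(gathered, weight[target[target_mask]])  # same values as fancy indexing
sum_non_ignore_weight = gathered.sum().item()
```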
@@ -264,18 +325,22 @@ def cross_entropy_forward(
         X_stride=_input.stride(-2),
         Y_ptr=target,
         Y_stride=target.stride(-1),  # always 1
+        weight_ptr=weight if weight is not None else _input,  # dummy if None
         loss_ptr=loss_1d,
         z_loss_ptr=z_loss_1d,
         loss_stride=loss_1d.stride(-1),  # always 1
         n_cols=V,
         n_non_ignore=n_non_ignore,
+        sum_non_ignore_weight=sum_non_ignore_weight,
         ignore_index=ignore_index,
+        weight_sum=weight_sum,
         lse_square_scale=lse_square_scale,
         label_smoothing=label_smoothing,
         reduction=reduction,
         softcap=softcap if softcap is not None else 0.0,
         RETURN_Z_LOSS=return_z_loss,
         BLOCK_SIZE=BLOCK_SIZE,
+        HAS_WEIGHT=True if weight is not None else False,
         HAS_SOFTCAPPING=True if softcap is not None else False,
         # TODO: 32 seems to give the best performance
         # Performance is quite sensitive to num_warps
@@ -327,6 +392,7 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
         ctx,
         _input: torch.Tensor,
         target: torch.Tensor,
+        weight: Optional[torch.FloatTensor],
         ignore_index: int = -100,
         lse_square_scale: float = 0.0,
         label_smoothing: float = 0.0,
@@ -341,6 +407,7 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
         ctx : The context object.
         _input (tensor): The input tensor of shape (BT, V) where B is batch size, T is sequence length, V is vocab size.
         target (tensor): The target tensor of shape (BT) where each value is in [0, V-1].
+        weight(Tensor, optional): a manual rescaling weight given to each class. If given, has to be a Tensor of size V and floating point dtype
         ignore_index (int): The index to ignore in the target.
         lse_square_scale (float): The scaler of (logsumexp(_input)) ^ 2 adding to the loss for the stability of training.
         label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
@@ -354,6 +421,7 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
         loss, z_loss, _input = cross_entropy_forward(
             _input,
             target,
+            weight,
             ignore_index,
             lse_square_scale,
             label_smoothing,
@@ -395,4 +463,5 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
             None,
             None,
             None,
+            None,
         )
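Taken together, the forward changes make reduction="mean" a weighted mean: each non-ignored token's loss is scaled by its target's class weight and the total is divided by sum_non_ignore_weight rather than the token count. A minimal PyTorch sketch of that semantics, assuming it matches torch.nn.functional.cross_entropy (label smoothing and z-loss ignored here):

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
BT, V = 8, 16
logits = torch.randn(BT, V)
target = torch.randint(0, V, (BT,))
target[2] = -100          # one ignored position
weight = torch.rand(V)

mask = target != -100
# per-token weighted losses: w_{y_i} * (-log softmax(x_i)_{y_i})
per_token = F.cross_entropy(logits[mask], target[mask], weight=weight, reduction="none")
weighted_mean = per_token.sum() / weight[target[mask]].sum()  # divide by sum_non_ignore_weight

ref = F.cross_entropy(logits, target, weight=weight, ignore_index=-100, reduction="mean")
print(torch.allclose(weighted_mean, ref))  # True
```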
liger_kernel/ops/fused_linear_cross_entropy.py

@@ -17,6 +17,7 @@ def fused_linear_cross_entropy_forward(
     _input,
     weight,
     target,
+    ce_weight=None,
     bias=None,
     ignore_index=-100,
     lse_square_scale=0.0,
@@ -47,8 +48,22 @@ def fused_linear_cross_entropy_forward(
     # we use fp32 for loss accumulator
     loss_1d = torch.zeros(BT, dtype=torch.float32, device=device)

-    #
-
+    # TODO: evaluate how CUDA synchronization caused by .item() affects the speed
+    target_mask = target != ignore_index
+    total_n_non_ignore = target_mask.sum().item()
+    total_sum_non_ignore_ce_weight = total_n_non_ignore
+    ce_weight_sum = 0.0
+    if ce_weight is not None:
+        assert ce_weight.shape[0] == V, f"If given, weight has to be a Tensor of size V. Got: {ce_weight.shape}"
+        assert torch.is_floating_point(
+            ce_weight
+        ), f"If given, weight has to be a Tensor of floating point dtype. Got: {ce_weight.dtype}"
+        total_sum_non_ignore_ce_weight = (
+            torch.gather(ce_weight, dim=0, index=target.masked_select(target_mask)).sum().item()
+        )
+        ce_weight_sum = ce_weight.sum().item()
+        if ce_weight.stride(-1) != 1:
+            ce_weight = ce_weight.contiguous()

     for chunk_id in range(num_chunks):
         start_idx = chunk_id * chunk_size
@@ -59,13 +74,13 @@ def fused_linear_cross_entropy_forward(
         logits_chunk = _input_chunk @ weight.t()  # chunk_size x V
         if bias is not None:
             logits_chunk = logits_chunk + bias
+
         target_chunk = target[start_idx:end_idx]  # chunk_size,

         n_rows = logits_chunk.shape[0]

         # unreduced loss
         loss_1d_slice = loss_1d[start_idx:end_idx]  # chunk_size,
-        n_non_ignore = (target_chunk != ignore_index).sum().item()

         # ensure _input and target are contiguous
         logits_chunk = logits_chunk.contiguous()
@@ -77,45 +92,40 @@ def fused_linear_cross_entropy_forward(
             X_stride=logits_chunk.stride(-2),
             Y_ptr=target_chunk,
             Y_stride=target_chunk.stride(-1),  # always 1
+            weight_ptr=ce_weight if ce_weight is not None else _input,  # dummy if None
             loss_ptr=loss_1d_slice,
             z_loss_ptr=loss_1d_slice,  # dummy ptr, not used
             loss_stride=loss_1d_slice.stride(-1),  # always 1
             n_cols=V,
-            n_non_ignore=
+            n_non_ignore=total_n_non_ignore,
+            sum_non_ignore_weight=total_sum_non_ignore_ce_weight,
+            weight_sum=ce_weight_sum,
             ignore_index=ignore_index,
             lse_square_scale=lse_square_scale,
             label_smoothing=label_smoothing,
             reduction=reduction,
             softcap=softcap if softcap is not None else 0.0,
             RETURN_Z_LOSS=0,  # False
+            HAS_WEIGHT=True if ce_weight is not None else False,
             HAS_SOFTCAPPING=True if softcap is not None else False,
             BLOCK_SIZE=BLOCK_SIZE,
             num_warps=32 if not is_hip() else 16,
         )

-
-
-        # additionally, since we are chunking the inputs, observe that the loss and gradients are calculated only
-        # on `n_non_ignore` tokens. However, the gradient of the input should be calculated for all tokens.
-        # Thus, we need an additional scaling factor of (n_non_ignore/total_n_non_ignore) to scale the gradients.
-
-        if reduction == "mean":
-            alpha = n_non_ignore / total_n_non_ignore if total_n_non_ignore > 0 else 0.0
-        else:
-            alpha = 1.0
-
-        loss_1d[start_idx:end_idx] = loss_1d_slice * alpha
-        grad_logits_chunk = logits_chunk * alpha  # chunk_size x V
+        loss_1d[start_idx:end_idx] = loss_1d_slice
+        grad_logits_chunk = logits_chunk  # chunk_size x V

         grad_input[start_idx:end_idx] = grad_logits_chunk @ weight

         if grad_weight is not None:
             torch.addmm(
                 input=grad_weight,
-                mat1=logits_chunk.t()
+                mat1=logits_chunk.t().to(
+                    _input_chunk.dtype
+                ),  # In an autocast scenario without bias, differing logits_chunk data types will cause an addmm operation error.
                 mat2=_input_chunk,
                 out=grad_weight,
-                alpha=
+                alpha=1.0,
                 beta=1.0,
             )

@@ -124,7 +134,7 @@ def fused_linear_cross_entropy_forward(
                 input=grad_bias,
                 other=logits_chunk.sum(dim=0),
                 out=grad_bias,
-                alpha=
+                alpha=1.0,
             )

     if reduction == "none":
@@ -190,6 +200,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
         weight,
         target,
         bias=None,
+        ce_weight=None,
         ignore_index=-100,
         lse_square_scale=0.0,
         label_smoothing=0.0,
@@ -209,21 +220,23 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
         target: (B*T) where each value is in [0, V-1]
         weight: (V, H) where V is the number of classes
         bias: (V) where V is the number of classes
+        ce_weight: a manual rescaling weight given to each class. If given, has to be a Tensor of size V and floating point dtype
         ignore_index: the index to ignore in the target
         label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
         reduction: reduction to apply
         """

         loss, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
-            _input,
-            weight,
-            target,
-            bias,
-
-
-
-
-
+            _input=_input,
+            weight=weight,
+            target=target,
+            bias=bias,
+            ce_weight=ce_weight,
+            ignore_index=ignore_index,
+            lse_square_scale=lse_square_scale,
+            label_smoothing=label_smoothing,
+            reduction=reduction,
+            softcap=softcap,
         )
         # downcast to dtype and store for backward
         ctx.save_for_backward(
@@ -240,4 +253,15 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
         grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_backward(
             grad_output, grad_input, grad_weight, grad_bias
         )
-        return (
+        return (
+            grad_input,
+            grad_weight,
+            None,
+            grad_bias,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
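Note that the per-chunk alpha rescaling is gone: the kernel is now launched with the global total_n_non_ignore and total_sum_non_ignore_ce_weight, so each chunk's losses and gradients already come out divided by the global denominator and simply sum to the full-batch mean. A small pure-PyTorch illustration of that bookkeeping (my own sketch, unweighted, no ignored tokens):

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
BT, V, num_chunks = 12, 32, 3
logits = torch.randn(BT, V)
target = torch.randint(0, V, (BT,))

ref = F.cross_entropy(logits, target, reduction="mean")  # global mean

total_n_non_ignore = (target != -100).sum()
total = torch.tensor(0.0)
for idx in torch.chunk(torch.arange(BT), num_chunks):
    per_token = F.cross_entropy(logits[idx], target[idx], reduction="none")
    # each chunk is normalized by the *global* count, so no per-chunk alpha is needed
    total += (per_token / total_n_non_ignore).sum()

print(torch.allclose(total, ref))  # True
```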
liger_kernel/transformers/cross_entropy.py

@@ -8,6 +8,7 @@ from liger_kernel.ops.cross_entropy import LigerCrossEntropyFunction
 class LigerCrossEntropyLoss(torch.nn.Module):
     def __init__(
         self,
+        weight: Optional[torch.FloatTensor] = None,
         ignore_index: int = -100,
         lse_square_scale: float = 0.0,
         label_smoothing: float = 0.0,
@@ -28,6 +29,7 @@ class LigerCrossEntropyLoss(torch.nn.Module):
             "none",
         }, f"reduction must be one of 'mean', 'sum', or 'none'. Got: {reduction}"
         assert softcap is None or softcap > 0, f"softcap must greater than 0.0 or None. Got: {softcap}"
+        self.weight = weight
         self.ignore_index = ignore_index
         self.lse_square_scale = lse_square_scale
         self.label_smoothing = label_smoothing
@@ -39,6 +41,7 @@ class LigerCrossEntropyLoss(torch.nn.Module):
         loss, z_loss = LigerCrossEntropyFunction.apply(
             _input,
             target,
+            self.weight,
             self.ignore_index,
             self.lse_square_scale,
             self.label_smoothing,
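Based on the constructor and apply changes above, module-level usage presumably looks like the following hypothetical sketch; it assumes a CUDA device, since the loss dispatches to a Triton kernel, and that forward returns the scalar loss by default:

```python
import torch
from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss

num_classes = 8
class_weights = torch.rand(num_classes, device="cuda")  # one rescaling weight per class
loss_fn = LigerCrossEntropyLoss(weight=class_weights, ignore_index=-100)

logits = torch.randn(4, num_classes, device="cuda", requires_grad=True)
target = torch.randint(0, num_classes, (4,), device="cuda")
loss = loss_fn(logits, target)  # assumes forward returns loss when return_z_loss is off
loss.backward()
```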
liger_kernel/transformers/functional.py

@@ -32,6 +32,7 @@ def liger_cross_entropy(
     loss, z_loss = LigerCrossEntropyFunction.apply(
         input,
         target,
+        weight,
         ignore_index,
         lse_square_scale,
         label_smoothing,
@@ -49,6 +50,7 @@ def liger_fused_linear_cross_entropy(
     weight,
     target,
     bias=None,
+    ce_weight=None,
     ignore_index: int = -100,
     lse_square_scale: float = 0.0,
     label_smoothing: float = 0.0,
@@ -60,6 +62,7 @@ def liger_fused_linear_cross_entropy(
         weight,
         target,
         bias,
+        ce_weight,
         ignore_index,
         lse_square_scale,
         label_smoothing,
liger_kernel/transformers/fused_linear_cross_entropy.py

@@ -8,6 +8,7 @@ from liger_kernel.ops.fused_linear_cross_entropy import LigerFusedLinearCrossEnt
 class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
     def __init__(
         self,
+        ce_weight: Optional[torch.FloatTensor] = None,
         ignore_index: int = -100,
         lse_square_scale: float = 0.0,
         label_smoothing: float = 0.0,
@@ -24,6 +25,7 @@ class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
             "none",
         }, f"reduction must be one of 'mean', 'sum', or 'none'. Got: {reduction}"
         assert softcap is None or softcap > 0, f"softcap must greater than 0.0 or None. Got: {softcap}"
+        self.ce_weight = ce_weight
         self.ignore_index = ignore_index
         self.lse_square_scale = lse_square_scale
         self.label_smoothing = label_smoothing
@@ -36,6 +38,7 @@ class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
             lin_weight,
             target,
             bias,
+            self.ce_weight,
             self.ignore_index,
             self.lse_square_scale,
             self.label_smoothing,
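One naming pitfall worth spelling out: in this module ce_weight is the per-class rescaling vector for the cross-entropy term, while lin_weight (passed through to the function above) is the projection matrix of the fused linear layer. A hypothetical construction sketch based on the new __init__ signature:

```python
import torch
from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss

vocab_size = 32000
ce_weight = torch.ones(vocab_size)
ce_weight[0] = 0.1  # e.g. down-weight an over-represented class
loss_fn = LigerFusedLinearCrossEntropyLoss(ce_weight=ce_weight, reduction="mean")
```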
liger_kernel_nightly-0.5.2.dev20241228022953.dist-info/RECORD → liger_kernel_nightly-0.5.2.dev20241229131950.dist-info/RECORD

@@ -11,8 +11,8 @@ liger_kernel/chunked_loss/fused_linear_preference.py,sha256=25sTgvphLKAR0jyJcrsJ
 liger_kernel/chunked_loss/orpo_loss.py,sha256=jbZxx-EjPK71A6CSyNzTOAIEQgAUjfvwSViw6R_pPXQ,3510
 liger_kernel/chunked_loss/simpo_loss.py,sha256=ZvDIjT9EQrbwzH2LNZMhv84SPsOHGi_Ywk95vgA0b_o,3736
 liger_kernel/ops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-liger_kernel/ops/cross_entropy.py,sha256=
-liger_kernel/ops/fused_linear_cross_entropy.py,sha256=
+liger_kernel/ops/cross_entropy.py,sha256=4zSPzdPl-d2tB3ZOj7uRMpzI4RzZMNLUzkh6eMkH5kU,19179
+liger_kernel/ops/fused_linear_cross_entropy.py,sha256=j7cgR95rFAwtPsWZ00PfMwis5F7dtO3EVEw0rZ1GPJk,10231
 liger_kernel/ops/fused_linear_jsd.py,sha256=eKqaADj7LgWfoYqyH03tjrmhNTfJOF1Dhx_bWzBTnTU,9600
 liger_kernel/ops/geglu.py,sha256=axGvCIvlBzuluoAIrWTsp2iZM4BFKNInkPov8YVvH9E,4126
 liger_kernel/ops/group_norm.py,sha256=qD4D4lSjSgVtO52EBNLC2iTseALRgPgqXE50U2woggk,10837
@@ -28,9 +28,9 @@ liger_kernel/ops/experimental/embedding.py,sha256=tolj3tItkzpSb30zWqDN2_yX4ectfl
 liger_kernel/ops/experimental/mm_int8int2.py,sha256=TrS9lpwekrik_w5qE7AhMJD1bcq-OidjtbsW80oZ6IM,13314
 liger_kernel/transformers/__init__.py,sha256=QPmYkL6hosBPpPqCUGqvIvAtD9XzLgvZqZxUyYMZeVk,2008
 liger_kernel/transformers/auto_model.py,sha256=0qCTRZt280Bj_LcFdzo9hlaR-BWNazawXOGgoCZjgEg,1545
-liger_kernel/transformers/cross_entropy.py,sha256=
-liger_kernel/transformers/functional.py,sha256=
-liger_kernel/transformers/fused_linear_cross_entropy.py,sha256=
+liger_kernel/transformers/cross_entropy.py,sha256=s931h9UW_tV4QMRme1HYjS_R2_C5nD6VFmZIXtjJoYo,1840
+liger_kernel/transformers/functional.py,sha256=B1wkHWLx-YNhxvXBEXB4Ch1yEwF3mjwTPCeXA5aCV_c,4490
+liger_kernel/transformers/fused_linear_cross_entropy.py,sha256=LAN8-pjUI2Erz_MnfMer-0ZmxJ0JlKxGzdZGJY-N65g,1569
 liger_kernel/transformers/fused_linear_jsd.py,sha256=bZ4otCvWBuOnA5XdQL-FzZVItJlDt-ht9e_pG7PG93E,3999
 liger_kernel/transformers/geglu.py,sha256=mrgqzIUVd6lN7fkDKLkw5YaESDxDtFgbot430WwPVOQ,1107
 liger_kernel/transformers/group_norm.py,sha256=URmjkQFsrbMffzcJiGpX7ckxWlpL95AiJS-80hwAWPk,2173
@@ -58,9 +58,9 @@ liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7H
 liger_kernel/transformers/trainer/orpo_trainer.py,sha256=MId1S_MfA3pPVQA1rkiKxp-jZDNz8VmvZzXC-Kugol4,7662
 liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
 liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
-liger_kernel_nightly-0.5.2.
-liger_kernel_nightly-0.5.2.
-liger_kernel_nightly-0.5.2.
-liger_kernel_nightly-0.5.2.
-liger_kernel_nightly-0.5.2.
-liger_kernel_nightly-0.5.2.
+liger_kernel_nightly-0.5.2.dev20241229131950.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel_nightly-0.5.2.dev20241229131950.dist-info/METADATA,sha256=iOyPsdNf1GL3Z3Ng0CS3xoOq6iiTb8eFXAMwqDT1UZM,21055
+liger_kernel_nightly-0.5.2.dev20241229131950.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel_nightly-0.5.2.dev20241229131950.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+liger_kernel_nightly-0.5.2.dev20241229131950.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel_nightly-0.5.2.dev20241229131950.dist-info/RECORD,,