liger-kernel-nightly 0.4.0.dev20241106174658__tar.gz → 0.4.0.dev20241107054539__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of liger-kernel-nightly might be problematic.
- {liger_kernel_nightly-0.4.0.dev20241106174658/src/liger_kernel_nightly.egg-info → liger_kernel_nightly-0.4.0.dev20241107054539}/PKG-INFO +1 -1
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/pyproject.toml +1 -1
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/ops/cross_entropy.py +104 -20
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/ops/fused_linear_cross_entropy.py +14 -2
- liger_kernel_nightly-0.4.0.dev20241107054539/src/liger_kernel/transformers/cross_entropy.py +43 -0
- liger_kernel_nightly-0.4.0.dev20241107054539/src/liger_kernel/transformers/fused_linear_cross_entropy.py +35 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/monkey_patch.py +24 -51
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539/src/liger_kernel_nightly.egg-info}/PKG-INFO +1 -1
- liger_kernel_nightly-0.4.0.dev20241106174658/src/liger_kernel/transformers/cross_entropy.py +0 -21
- liger_kernel_nightly-0.4.0.dev20241106174658/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -21
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/LICENSE +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/NOTICE +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/README.md +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/setup.cfg +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/env_report.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/ops/__init__.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/ops/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/ops/geglu.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/ops/jsd.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/ops/kl_div.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/ops/layer_norm.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/ops/rms_norm.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/ops/rope.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/ops/swiglu.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/ops/utils.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/__init__.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/auto_model.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/functional.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/geglu.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/jsd.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/kl_div.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/layer_norm.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/model/__init__.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/model/gemma.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/model/llama.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/model/mistral.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/model/mixtral.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/model/mllama.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/model/phi3.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/model/qwen2.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/rms_norm.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/rope.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/swiglu.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/transformers/trainer_integration.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/triton/__init__.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel/triton/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel_nightly.egg-info/SOURCES.txt +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
- {liger_kernel_nightly-0.4.0.dev20241106174658 → liger_kernel_nightly-0.4.0.dev20241107054539}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "liger_kernel_nightly"
-version = "0.4.0.dev20241106174658"
+version = "0.4.0.dev20241107054539"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }
src/liger_kernel/ops/cross_entropy.py
@@ -4,6 +4,9 @@ import triton.language as tl

 from liger_kernel.ops.utils import element_mul_kernel, is_hip

+_TRUE = tl.constexpr(1)
+_FALSE = tl.constexpr(0)
+

 @triton.jit
 def liger_cross_entropy_kernel(
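The `_TRUE`/`_FALSE` constants exist because arguments annotated `tl.constexpr` are baked into the compiled kernel, so a flag like the `RETURN_Z_LOSS` parameter added below is free at runtime. A minimal sketch of the pattern (the toy kernel and its names are illustrative, not part of the package; a CUDA device is assumed):

import torch
import triton
import triton.language as tl

@triton.jit
def square_maybe_store_aux(out_ptr, aux_ptr, x, WRITE_AUX: tl.constexpr):
    # WRITE_AUX is known at compile time; when it is 0 the branch below
    # is eliminated entirely, so that specialization does no extra work.
    tl.store(out_ptr, x * x)
    if WRITE_AUX == 1:
        tl.store(aux_ptr, x)

out = torch.zeros(1, device="cuda")
aux = torch.zeros(1, device="cuda")
# each distinct WRITE_AUX value compiles its own kernel variant, mirroring
# how RETURN_Z_LOSS=0 and RETURN_Z_LOSS=1 produce two specializations
square_maybe_store_aux[(1,)](out, aux, 3.0, WRITE_AUX=1)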
@@ -12,12 +15,15 @@ def liger_cross_entropy_kernel(
     Y_ptr,
     Y_stride,
     loss_ptr,
+    z_loss_ptr,
     loss_stride,
     n_cols,
     n_non_ignore,
     ignore_index,
+    lse_square_scale: tl.constexpr,
     label_smoothing: tl.constexpr,
     reduction: tl.constexpr,  # set it as constexpr since reduction is always known at compile time
+    RETURN_Z_LOSS: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
 ):
     """
@@ -30,11 +36,14 @@ def liger_cross_entropy_kernel(
     Y_ptr: Pointer to target tensor.
     Y_stride (int): The stride of the target tensor.
     loss_ptr: Pointer to tensor to store the loss.
+    z_loss_ptr: Pointer to tensor to store the z loss. No operation if RETURN_Z_LOSS is 0.
     loss_stride (int): The stride of the loss tensor.
     n_cols (int): The number of columns in the input tensor.
     n_non_ignore (int): The number of non-ignored elements in the batch.
     ignore_index (int): The index to ignore in the target.
     label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
+    lse_square_scale (float): The scaler of (logsumexp(_input)) ^ 2 adding to the loss for the stability of training.
+    RETURN_Z_LOSS (int): The boolean value to decide whether storing z loss to z_loss_ptr or not. It must be 0 or 1.
     reduction (str): The string for the reduction to apply
     BLOCK_SIZE (int): The block size for Triton operations.
     """
@@ -58,6 +67,7 @@ def liger_cross_entropy_kernel(
         return

     loss_ptr += program_id * loss_stride
+    z_loss_ptr += program_id * loss_stride

     # Online softmax: 2 loads + 1 store (compared with 3 loads + 1 store for the safe softmax)
     # Refer to Algorithm 3 in the paper: https://arxiv.org/pdf/1805.02867
@@ -87,32 +97,40 @@ def liger_cross_entropy_kernel(
         d = d * tl.exp(m - m_new) + tl.sum(tl.exp(X_block - m_new))
         m = m_new

+    # log (sum(e^(X_i))) = log (sum(e ^ (max(X) * e ^ (X_i - max(X)))))
+    #                    = log (e^(max(X)) * sum(e ^ (X_i - max(X))))
+    #                    = max(X) + log (sum(e ^ (X_i - max(X)))) = m + log d
+    lse = m + tl.log(d)
+
     # 4. [Online Softmax] Second pass: compute gradients
     # For 'mean' reduction, gradients are normalized by number of non-ignored elements (N)
     # dx_y = (softmax(x_y) - 1) / N
     # dx_i = softmax(x_i) / N, i != y
     # For label smoothing:
-    # dx_i = (softmax(x_y) - label_smoothing / V) / N, V = n_cols, i != y
+    # dx_i = (softmax(x_i) - label_smoothing / V) / N, V = n_cols, i != y
     # dx_y = (softmax(x_y) - label_smoothing / V - (1 - label_smoothing)) / N
     #      = dx_i - (1 - label_smoothing) / N
-    #
+    # With Z loss:
+    # dx_i = ((1 + 2 * lse_square_scale * lse) * softmax(x_i) - label_smoothing / V) / N, i != y
+    # dx_y = dx_i - (1 - label_smoothing) / N
     # For 'sum' reduction, no normalization is applied:
     # dx_y = softmax(x_y) - 1
     # dx_i = softmax(x_i), for i ≠ y
-    # For label smoothing:
-    # dx_i = (softmax(x_y) - label_smoothing / V), V = n_cols, i != y
-    # dx_y = (softmax(x_y) - label_smoothing / V - (1 - label_smoothing))
-    #      = dx_i - (1 - label_smoothing)

     for i in range(0, n_cols, BLOCK_SIZE):
         X_offsets = i + tl.arange(0, BLOCK_SIZE)
         X_block = tl.load(
             X_ptr + X_offsets, mask=X_offsets < n_cols, other=float("-inf")
         )
+        # softmax(x_i)
+        X_block = tl.exp(X_block - m) / d
+        # derivative of z-loss: 2 * lse_square_scale * lse * softmax(x_i)
+        X_block += 2 * lse_square_scale * lse * X_block
+        # smoothing term
+        X_block += -eps
+        # reduction scale
         if reduction == "mean":
-            X_block = (tl.exp(X_block - m) / d - eps) / (n_non_ignore)
-        else:
-            X_block = tl.exp(X_block - m) / d - eps
+            X_block = X_block / (n_non_ignore)

         tl.store(X_ptr + X_offsets, X_block, mask=X_offsets < n_cols)

@@ -124,9 +142,10 @@ def liger_cross_entropy_kernel(

     # loss = log (softmax(X_y)) = log ((e ^ (X_y - max(X)) / sum(e ^ (X - max(X))))
     #      = (X_y - max(X)) - log(sum(e ^ (X - max(X))))
+    #      = X_y - m - log d = X_y - lse
     # sum(e ^ (X - max(X))) must >= 1 because the max term is e ^ 0 = 1
     # So we can safely calculate log (softmax(X_y)) without overflow
-    loss = -(ori_X_y - m - tl.log(d))
+    loss = lse - ori_X_y

     # Original loss = H(q, p), with label smoothing regularization = H(q', p) and (label_smoothing / V) = eps
     # H(q', p) = (1 - label_smoothing) * H(q, p) + label_smoothing * H(u, p)
@@ -137,11 +156,16 @@ def liger_cross_entropy_kernel(
     # pytorch: https://github.com/pytorch/pytorch/blob/2981534f54d49fa3a9755c9b0855e7929c2527f0/aten/src/ATen/native/LossNLL.cpp#L516
     # See full derivation at https://github.com/linkedin/Liger-Kernel/pull/198#issuecomment-2333753087
     if label_smoothing > 0:
-        smooth_loss = scaled_x_sum + label_smoothing * (m + tl.log(d))
+        smooth_loss = scaled_x_sum + label_smoothing * lse
         loss = loss * (1 - label_smoothing) + smooth_loss

+    # An auxiliary loss, z_loss
+    # Refer to Page14 Loss function section in the paper PaLM: https://www.jmlr.org/papers/v24/22-1144.html
+    z_loss = lse_square_scale * lse * lse
+    loss += z_loss
     # Normalize the loss by the number of non-ignored elements if reduction is "mean"
     if reduction == "mean":
+        z_loss = z_loss / n_non_ignore
         loss = loss / n_non_ignore

     # 6. Specially handle the i==y case where `dx_y = (softmax(x_y) - (1 - label_smoothing) / N`
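Putting the pieces of this hunk together (label smoothing set aside, writing lse_square_scale as \kappa), the per-token objective the kernel now computes and differentiates is:

\mathrm{lse}(x) = \log\sum_{i=1}^{V} e^{x_i},
\qquad
\mathcal{L} = \underbrace{\mathrm{lse}(x) - x_y}_{\text{cross entropy}}
  + \underbrace{\kappa\,\mathrm{lse}(x)^2}_{z\text{-loss}},
\qquad
\frac{\partial \mathcal{L}}{\partial x_i}
  = \bigl(1 + 2\kappa\,\mathrm{lse}(x)\bigr)\,\mathrm{softmax}(x)_i - \mathbb{1}[i = y],

which matches the gradient comments in the second-pass loop above.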
@@ -152,6 +176,8 @@ def liger_cross_entropy_kernel(
     X_y += -(1 - label_smoothing)

     tl.store(loss_ptr, loss)
+    if RETURN_Z_LOSS == _TRUE:
+        tl.store(z_loss_ptr, z_loss)
     tl.store(X_ptr + y, X_y)

@@ -161,7 +187,31 @@ def liger_cross_entropy_kernel(
 MAX_FUSED_SIZE = 65536 // 2  # the best size we found by manually tuning


-def cross_entropy_forward(_input, target, ignore_index, label_smoothing, reduction):
+_bool_to_return_z_loss = {
+    True: _TRUE.value,
+    False: _FALSE.value,
+}
+
+
+def cross_entropy_forward(
+    _input,
+    target,
+    ignore_index,
+    lse_square_scale,
+    label_smoothing,
+    reduction,
+    return_z_loss,
+):
+    if not isinstance(return_z_loss, int):
+        assert (
+            return_z_loss in _bool_to_return_z_loss
+        ), f"return_z_loss must be True or False. Got: {return_z_loss}"
+        return_z_loss = _bool_to_return_z_loss[return_z_loss]
+    else:
+        assert (
+            return_z_loss in _bool_to_return_z_loss
+        ), f"return_z_loss must be True or False. Got: {return_z_loss}"
+
     BT, V = _input.shape
     n_rows = BT

@@ -169,6 +219,10 @@ def cross_entropy_forward(_input, target, ignore_index, label_smoothing, reduction):

     # unreduced loss
     loss_1d = torch.zeros(n_rows, dtype=_input.dtype, device=_input.device)
+    if return_z_loss == _TRUE.value:
+        z_loss_1d = torch.zeros(n_rows, dtype=_input.dtype, device=_input.device)
+    else:
+        z_loss_1d = loss_1d  # dummy ptr when return_z_loss == False

     n_non_ignore = (target != ignore_index).sum().item()

@@ -185,12 +239,15 @@ def cross_entropy_forward(_input, target, ignore_index, label_smoothing, reduction):
         Y_ptr=target,
         Y_stride=target.stride(-1),  # always 1
         loss_ptr=loss_1d,
+        z_loss_ptr=z_loss_1d,
         loss_stride=loss_1d.stride(-1),  # always 1
         n_cols=V,
         n_non_ignore=n_non_ignore,
         ignore_index=ignore_index,
+        lse_square_scale=lse_square_scale,
         label_smoothing=label_smoothing,
         reduction=reduction,
+        RETURN_Z_LOSS=return_z_loss,
         BLOCK_SIZE=BLOCK_SIZE,
         # TODO: 32 seems to give the best performance
         # Performance is quite sensitive to num_warps
@@ -198,7 +255,12 @@ def cross_entropy_forward(_input, target, ignore_index, label_smoothing, reduction):
     )

     loss = torch.sum(loss_1d)
-    return loss, _input
+    if return_z_loss == _TRUE.value:
+        z_loss = torch.sum(z_loss_1d)
+    else:
+        z_loss = None
+
+    return loss, z_loss, _input


 def cross_entropy_backward(_input, grad_output):
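As a sanity check on the semantics above, here is a hedged eager-mode PyTorch reference (my own sketch, not part of the package; ignore_index and label smoothing are omitted for brevity):

import torch

def reference_ce_with_z_loss(logits, target, lse_square_scale=0.0, reduction="mean"):
    # logits: (BT, V), target: (BT,)
    lse = torch.logsumexp(logits.float(), dim=-1)  # per-token logsumexp
    ce = lse - logits.float().gather(1, target.unsqueeze(1)).squeeze(1)
    z_loss = lse_square_scale * lse * lse  # PaLM-style auxiliary loss
    loss = ce + z_loss
    if reduction == "mean":
        return loss.sum() / target.numel(), z_loss.sum() / target.numel()
    return loss.sum(), z_loss.sum()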
@@ -233,7 +295,14 @@ class LigerCrossEntropyFunction(torch.autograd.Function):

     @staticmethod
     def forward(
-        ctx, _input, target, ignore_index=-100, label_smoothing=0.0, reduction="mean"
+        ctx,
+        _input,
+        target,
+        ignore_index=-100,
+        lse_square_scale=0.0,
+        label_smoothing=0.0,
+        reduction="mean",
+        return_z_loss=False,
     ):
         """
         The forward pass of the Liger Cross Entropy loss.
@@ -243,33 +312,46 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
         _input (tensor): The input tensor of shape (BT, V) where B is batch size, T is sequence length, V is vocab size.
         target (tensor): The target tensor of shape (BT) where each value is in [0, V-1].
         ignore_index (int): The index to ignore in the target.
+        lse_square_scale (float): The scaler of (logsumexp(_input)) ^ 2 adding to the loss for the stability of training.
         label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
         reduction (str): The reduction to apply to the output: "none" | "mean | "sum".
+        return_z_loss (bool): When `return_z_loss` is `True`, returns (loss, z_loss) instead of (loss, None). Default: `False`

         Returns:
-        tensor: The computed loss.
+        tuple: A tuple with the compouted losses with respect to loss and z loss. The elements are tensors or None.
         """
-        loss, _input = cross_entropy_forward(
-            _input, target, ignore_index, label_smoothing, reduction
+        loss, z_loss, _input = cross_entropy_forward(
+            _input,
+            target,
+            ignore_index,
+            lse_square_scale,
+            label_smoothing,
+            reduction,
+            return_z_loss,
         )
         # TODO: investigation
         # If we don't detach the _input tensor, the memory will double
         # Not sure why but seems that there will be a time both grad and value exist but in different location
         ctx.save_for_backward(_input.detach())
-        return loss
+        ctx.return_z_loss = return_z_loss
+
+        return loss, z_loss

     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx, grad_output, grad_ouput2):
         """
         The backward pass of the Liger Cross Entropy loss.

         Parameters:
         ctx : The context object with saved tensors.
         grad_output (tensor): The tensor containing the gradient of the loss with respect to the output.
-
+        grad_output2 (tenosr): No use.
         Returns:
         tuple: A tuple with the gradients with respect to the inputs. The elements are tensors or None.
         """
+        if ctx.return_z_loss:
+            del grad_ouput2  # z_loss is only for logging

         (_input,) = ctx.saved_tensors
         _input = cross_entropy_backward(_input, grad_output)
         return (
@@ -278,4 +360,6 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
             None,
             None,
             None,
+            None,
+            None,
         )
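The second gradient argument and the extra `None`s follow from `torch.autograd.Function`'s contract: `backward` receives one incoming gradient per `forward` output and must return one entry per `forward` input (`None` for non-differentiable ones). A toy illustration (class and variable names are mine):

import torch

class ScaledWithAux(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, scale):
        ctx.scale = scale
        # second output is detached and auxiliary, "for logging only"
        return x * scale, (x * x).detach()

    @staticmethod
    def backward(ctx, grad_output, grad_output2):
        del grad_output2  # the auxiliary output contributes no gradient
        # one slot per forward input: x gets a gradient, scale gets None
        return grad_output * ctx.scale, None

x = torch.randn(4, requires_grad=True)
y, aux = ScaledWithAux.apply(x, 2.0)
y.sum().backward()  # fills x.grad; aux never participates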
src/liger_kernel/ops/fused_linear_cross_entropy.py
@@ -21,6 +21,7 @@ def fused_linear_cross_entropy_forward(
     target,
     bias=None,
     ignore_index=-100,
+    lse_square_scale=0.0,
     label_smoothing=0.0,
     reduction="mean",
 ):
@@ -86,12 +87,15 @@ def fused_linear_cross_entropy_forward(
             Y_ptr=target_chunk,
             Y_stride=target_chunk.stride(-1),  # always 1
             loss_ptr=loss_1d_slice,
+            z_loss_ptr=loss_1d_slice,  # dummy ptr, not used
             loss_stride=loss_1d_slice.stride(-1),  # always 1
             n_cols=V,
             n_non_ignore=n_non_ignore,
             ignore_index=ignore_index,
+            lse_square_scale=lse_square_scale,
             label_smoothing=label_smoothing,
             reduction=reduction,
+            RETURN_Z_LOSS=0,  # False
             BLOCK_SIZE=BLOCK_SIZE,
             num_warps=32 if not is_hip() else 16,
         )
@@ -200,6 +204,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
         target,
         bias=None,
         ignore_index=-100,
+        lse_square_scale=0.0,
         label_smoothing=0.0,
         reduction="mean",
     ):
@@ -221,7 +226,14 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
         reduction: reduction to apply
         """
         loss, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
-            _input, weight, target, bias, ignore_index, label_smoothing, reduction
+            _input,
+            weight,
+            target,
+            bias,
+            ignore_index,
+            lse_square_scale,
+            label_smoothing,
+            reduction,
         )
         # downcast to dtype and store for backward
         ctx.save_for_backward(
@@ -238,4 +250,4 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
         grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_backward(
             grad_output, grad_input, grad_weight, grad_bias
         )
-        return (grad_input, grad_weight, None, grad_bias, None, None, None)
+        return (grad_input, grad_weight, None, grad_bias, None, None, None, None)
src/liger_kernel/transformers/cross_entropy.py (new file)
@@ -0,0 +1,43 @@
+import torch.nn as nn
+
+from liger_kernel.ops.cross_entropy import LigerCrossEntropyFunction
+
+
+class LigerCrossEntropyLoss(nn.Module):
+    def __init__(
+        self,
+        ignore_index=-100,
+        lse_square_scale=0.0,
+        label_smoothing=0.0,
+        reduction="mean",
+        return_z_loss=False,
+    ):
+        super().__init__()
+        self.ignore_index = ignore_index
+        self.lse_square_scale = lse_square_scale
+        self.label_smoothing = label_smoothing
+        self.reduction = reduction
+        self.return_z_loss = return_z_loss
+
+        assert (self.label_smoothing >= 0) and (
+            self.label_smoothing <= 1
+        ), f"label_smoothing must be between 0.0 and 1.0. Got: {self.label_smoothing}"
+        assert self.reduction in {
+            "mean",
+            "sum",
+            "none",
+        }, f"reduction must be one of 'mean', 'sum', or 'none'. Got: {self.reduction}"
+
+    def forward(self, _input, target):
+        loss, z_loss = LigerCrossEntropyFunction.apply(
+            _input,
+            target,
+            self.ignore_index,
+            self.lse_square_scale,
+            self.label_smoothing,
+            self.reduction,
+            self.return_z_loss,
+        )
+        if not self.return_z_loss:
+            return loss
+        return loss, z_loss
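A usage sketch for the new module (the shapes and the CUDA device are my assumptions; the underlying Triton kernel needs a GPU):

import torch
from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss

logits = torch.randn(8, 32000, device="cuda", requires_grad=True)  # (BT, V)
target = torch.randint(0, 32000, (8,), device="cuda")              # (BT,)

loss_fn = LigerCrossEntropyLoss(lse_square_scale=1e-4, return_z_loss=True)
loss, z_loss = loss_fn(logits, target)  # a plain tensor when return_z_loss=False
loss.backward()  # z_loss is already folded into loss; the return value is for logging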
src/liger_kernel/transformers/fused_linear_cross_entropy.py (new file)
@@ -0,0 +1,35 @@
+import torch.nn as nn
+
+from liger_kernel.ops.fused_linear_cross_entropy import (
+    LigerFusedLinearCrossEntropyFunction,
+)
+
+
+class LigerFusedLinearCrossEntropyLoss(nn.Module):
+    def __init__(
+        self,
+        ignore_index=-100,
+        label_smoothing=0.0,
+        reduction="mean",
+        lse_square_scale=0.0,
+    ):
+        super().__init__()
+        self.ignore_index = ignore_index
+        self.label_smoothing = label_smoothing
+        self.reduction = reduction
+        self.lse_square_scale = lse_square_scale
+        assert (self.label_smoothing >= 0) and (
+            self.label_smoothing <= 1
+        ), f"label_smoothing must be between 0.0 and 1.0. Got: {self.label_smoothing}"
+
+    def forward(self, lin_weight, _input, target, bias=None):
+        return LigerFusedLinearCrossEntropyFunction.apply(
+            _input,
+            lin_weight,
+            target,
+            bias,
+            self.ignore_index,
+            self.lse_square_scale,
+            self.label_smoothing,
+            self.reduction,
+        )
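The fused variant takes the lm_head weight and the pre-projection hidden states instead of materialized logits; note the weight-first argument order of `forward`. A sketch under assumed shapes (CUDA device assumed):

import torch
from liger_kernel.transformers.fused_linear_cross_entropy import (
    LigerFusedLinearCrossEntropyLoss,
)

hidden = torch.randn(8, 4096, device="cuda", requires_grad=True)        # (BT, H)
lm_head = torch.randn(32000, 4096, device="cuda", requires_grad=True)   # (V, H)
target = torch.randint(0, 32000, (8,), device="cuda")

loss_fn = LigerFusedLinearCrossEntropyLoss(lse_square_scale=1e-4)
loss = loss_fn(lm_head, hidden, target)  # the full (BT, V) logits are never materialized at once
loss.backward()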
src/liger_kernel/transformers/monkey_patch.py
@@ -99,6 +99,7 @@ def apply_liger_kernel_to_llama(
     ), "cross_entropy and fused_linear_cross_entropy cannot both be True."

     from transformers.models.llama import modeling_llama
+    from transformers.models.llama.modeling_llama import LlamaModel

     if rope:
         modeling_llama.apply_rotary_pos_emb = liger_rotary_pos_emb
@@ -119,15 +120,8 @@ def apply_liger_kernel_to_llama(
         # The model instance already exists, so we need to additionally patch the
         # instance variables that reference already-instantiated modules (e.g. LlamaRMSNorm or LlamaMLP)

-        if hasattr(model, "model"):
-            # The case for LlamaForCausalLM or LlamaForSequenceClassification, for example
-            base_model = model.model
-        elif hasattr(model, "transformer"):
-            # LlamaForQuestionAnswering uses "transformer" instead of "model"
-            base_model = model.transformer
-        else:
-            # Direct LlamaModel
-            base_model = model
+        # get the base model from the model instance
+        base_model: LlamaModel = getattr(model, model.base_model_prefix, model)

         if rms_norm:
             _patch_rms_norm_module(base_model.norm)
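The `getattr(model, model.base_model_prefix, model)` one-liner covers all three deleted branches: `LlamaForCausalLM` sets `base_model_prefix = "model"`, `LlamaForQuestionAnswering` sets it to `"transformer"`, and a bare `LlamaModel` has no attribute named by its own prefix, so `getattr` falls back to the model itself. A sketch (the checkpoint name is a placeholder):

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("some-org/some-llama-checkpoint")
# resolves to model.model for *ForCausalLM, model.transformer for
# *ForQuestionAnswering, and to the model itself for a bare LlamaModel
base_model = getattr(model, model.base_model_prefix, model)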
@@ -275,6 +269,7 @@ def apply_liger_kernel_to_mistral(
     ), "cross_entropy and fused_linear_cross_entropy cannot both be True."

     from transformers.models.mistral import modeling_mistral
+    from transformers.models.mistral.modeling_mistral import MistralModel

     if rope:
         modeling_mistral.apply_rotary_pos_emb = liger_rotary_pos_emb
@@ -291,12 +286,8 @@ def apply_liger_kernel_to_mistral(
         # The model instance already exists, so we need to additionally patch the
         # instance variables that reference already-instantiated modules

-        if hasattr(model, "model"):
-            # The case for MistralForCausalLM, for example
-            base_model = model.model
-        else:
-            # Direct MistralModel
-            base_model = model
+        # get the base model from the model instance
+        base_model: MistralModel = getattr(model, model.base_model_prefix, model)

         if rms_norm:
             _patch_rms_norm_module(base_model.norm)
@@ -340,6 +331,7 @@ def apply_liger_kernel_to_mixtral(
     ), "cross_entropy and fused_linear_cross_entropy cannot both be True."

     from transformers.models.mixtral import modeling_mixtral
+    from transformers.models.mixtral.modeling_mixtral import MixtralModel

     if rope:
         modeling_mixtral.apply_rotary_pos_emb = liger_rotary_pos_emb
@@ -360,12 +352,8 @@ def apply_liger_kernel_to_mixtral(
         # The model instance already exists, so we need to additionally patch the
         # instance variables that reference already-instantiated modules

-        if hasattr(model, "model"):
-            # The case for MixtralForCausalLM, for example
-            base_model = model.model
-        else:
-            # Direct MixtralModel
-            base_model = model
+        # get the base model from the model instance
+        base_model: MixtralModel = getattr(model, model.base_model_prefix, model)

         if rms_norm:
             _patch_rms_norm_module(base_model.norm)
@@ -410,6 +398,7 @@ def apply_liger_kernel_to_gemma(
     ), "cross_entropy and fused_linear_cross_entropy cannot both be True."

     from transformers.models.gemma import modeling_gemma
+    from transformers.models.gemma.modeling_gemma import GemmaModel

     # https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/gemma/modeling_gemma.py#L109
     LigerRMSNormForGemma = partial(
@@ -438,12 +427,8 @@ def apply_liger_kernel_to_gemma(
         # The model instance already exists, so we need to additionally patch the
         # instance variables that reference already-instantiated modules

-        if hasattr(model, "model"):
-            # The case for GemmaForCausalLM, for example
-            base_model = model.model
-        else:
-            # Direct GemmaModel
-            base_model = model
+        # get the base model from the model instance
+        base_model: GemmaModel = getattr(model, model.base_model_prefix, model)

         if rms_norm:
             _patch_rms_norm_module_for_gemma(base_model.norm)
@@ -478,6 +463,7 @@ def apply_liger_kernel_to_gemma2(
         loaded. Default is None.
     """
     from transformers.models.gemma2 import modeling_gemma2
+    from transformers.models.gemma2.modeling_gemma2 import Gemma2Model

     LigerRMSNormForGemma2 = partial(
         LigerRMSNorm, offset=1.0, casting_mode="gemma", init_fn="zeros"
@@ -500,12 +486,8 @@ def apply_liger_kernel_to_gemma2(
         # The model instance already exists, so we need to additionally patch the
         # instance variables that reference already-instantiated modules

-        if hasattr(model, "model"):
-            # The case for Gemma2ForCausalLM, for example
-            base_model = model.model
-        else:
-            # Direct Gemma2Model
-            base_model = model
+        # get the base model from the model instance
+        base_model: Gemma2Model = getattr(model, model.base_model_prefix, model)

         if rms_norm:
             _patch_rms_norm_module_for_gemma2(base_model.norm)
@@ -556,6 +538,7 @@ def apply_liger_kernel_to_qwen2(
     ), "cross_entropy and fused_linear_cross_entropy cannot both be True."

     from transformers.models.qwen2 import modeling_qwen2
+    from transformers.models.qwen2.modeling_qwen2 import Qwen2Model

     if rope:
         modeling_qwen2.apply_rotary_pos_emb = liger_rotary_pos_emb
@@ -580,12 +563,8 @@ def apply_liger_kernel_to_qwen2(
         # The model instance already exists, so we need to additionally patch the
         # instance variables that reference already-instantiated modules

-        if hasattr(model, "model"):
-            # The case for Qwen2ForCausalLM, for example
-            base_model = model.model
-        else:
-            # Direct Qwen2Model
-            base_model = model
+        # get the base model from the model instance
+        base_model: Qwen2Model = getattr(model, model.base_model_prefix, model)

         if rms_norm:
             _patch_rms_norm_module(base_model.norm)
@@ -630,6 +609,7 @@ def apply_liger_kernel_to_qwen2_vl(
     ), "cross_entropy and fused_linear_cross_entropy cannot both be True."

     from transformers.models.qwen2_vl import modeling_qwen2_vl
+    from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLModel

     from liger_kernel.transformers.model.qwen2_vl import (
         lce_forward as qwen2_vl_lce_forward,
@@ -653,12 +633,8 @@ def apply_liger_kernel_to_qwen2_vl(
         # The model instance already exists, so we need to additionally patch the
         # instance variables that reference already-instantiated modules

-        if hasattr(model, "model"):
-            # The case for Qwen2VLForConditionalGeneration, for example
-            base_model = model.model
-        else:
-            # Direct Qwen2VLModel
-            base_model = model
+        # get the base model from the model instance
+        base_model: Qwen2VLModel = getattr(model, model.base_model_prefix, model)

         if hasattr(model, "visual"):
             # Patch Qwen2VisionTransformerPretrainedModel
@@ -707,6 +683,7 @@ def apply_liger_kernel_to_phi3(
     ), "cross_entropy and fused_linear_cross_entropy cannot both be True."

     from transformers.models.phi3 import modeling_phi3
+    from transformers.models.phi3.modeling_phi3 import Phi3Model

     if rope:
         modeling_phi3.apply_rotary_pos_emb = liger_rotary_pos_emb  # Same as Gemma
@@ -727,12 +704,8 @@ def apply_liger_kernel_to_phi3(
         # The model instance already exists, so we need to additionally patch the
         # instance variables that reference already-instantiated modules

-        if hasattr(model, "model"):
-            # The case for Phi3ForCausalLM, for example
-            base_model = model.model
-        else:
-            # Direct Phi3Model
-            base_model = model
+        # get the base model from the model instance
+        base_model: Phi3Model = getattr(model, model.base_model_prefix, model)

         if rms_norm:
             _patch_rms_norm_module(base_model.norm)
src/liger_kernel/transformers/cross_entropy.py (old version, deleted)
@@ -1,21 +0,0 @@
-from torch.nn import CrossEntropyLoss
-
-from liger_kernel.ops.cross_entropy import LigerCrossEntropyFunction
-
-
-class LigerCrossEntropyLoss(CrossEntropyLoss):
-    def __init__(self, *args, **kwargs):
-        super(LigerCrossEntropyLoss, self).__init__(*args, **kwargs)
-        assert (self.label_smoothing >= 0) and (
-            self.label_smoothing <= 1
-        ), f"label_smoothing must be between 0.0 and 1.0. Got: {self.label_smoothing}"
-        assert self.reduction in {
-            "mean",
-            "sum",
-            "none",
-        }, f"reduction must be one of 'mean', 'sum', or 'none'. Got: {self.reduction}"
-
-    def forward(self, _input, target):
-        return LigerCrossEntropyFunction.apply(
-            _input, target, self.ignore_index, self.label_smoothing, self.reduction
-        )
src/liger_kernel/transformers/fused_linear_cross_entropy.py (old version, deleted)
@@ -1,21 +0,0 @@
-from torch.nn import CrossEntropyLoss
-
-from liger_kernel.ops.fused_linear_cross_entropy import (
-    LigerFusedLinearCrossEntropyFunction,
-)
-
-
-class LigerFusedLinearCrossEntropyLoss(CrossEntropyLoss):
-    def __init__(self, *args, **kwargs):
-        super(LigerFusedLinearCrossEntropyLoss, self).__init__(*args, **kwargs)
-
-    def forward(self, lin_weight, _input, target, bias=None):
-        return LigerFusedLinearCrossEntropyFunction.apply(
-            _input,
-            lin_weight,
-            target,
-            bias,
-            self.ignore_index,
-            self.label_smoothing,
-            self.reduction,
-        )