liger-kernel-nightly 0.4.0.dev20241108173943__tar.gz → 0.4.0.dev20241108174843__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56)
  1. {liger_kernel_nightly-0.4.0.dev20241108173943/src/liger_kernel_nightly.egg-info → liger_kernel_nightly-0.4.0.dev20241108174843}/PKG-INFO +1 -1
  2. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/pyproject.toml +1 -1
  3. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/ops/cross_entropy.py +46 -17
  4. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/ops/fused_linear_cross_entropy.py +6 -1
  5. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/cross_entropy.py +27 -17
  6. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +23 -10
  7. liger_kernel_nightly-0.4.0.dev20241108174843/src/liger_kernel/transformers/model/gemma2.py +277 -0
  8. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/monkey_patch.py +21 -3
  9. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843/src/liger_kernel_nightly.egg-info}/PKG-INFO +1 -1
  10. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel_nightly.egg-info/SOURCES.txt +1 -0
  11. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/LICENSE +0 -0
  12. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/NOTICE +0 -0
  13. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/README.md +0 -0
  14. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/setup.cfg +0 -0
  15. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/env_report.py +0 -0
  16. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/ops/__init__.py +0 -0
  17. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/ops/experimental/embedding.py +0 -0
  18. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
  19. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
  20. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/ops/geglu.py +0 -0
  21. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/ops/group_norm.py +0 -0
  22. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/ops/jsd.py +0 -0
  23. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/ops/kl_div.py +0 -0
  24. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/ops/layer_norm.py +0 -0
  25. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/ops/rms_norm.py +0 -0
  26. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/ops/rope.py +0 -0
  27. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/ops/swiglu.py +0 -0
  28. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/ops/utils.py +0 -0
  29. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/__init__.py +0 -0
  30. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/auto_model.py +0 -0
  31. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
  32. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/functional.py +0 -0
  33. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
  34. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/geglu.py +0 -0
  35. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/group_norm.py +0 -0
  36. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/jsd.py +0 -0
  37. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/kl_div.py +0 -0
  38. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/layer_norm.py +0 -0
  39. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/model/__init__.py +0 -0
  40. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/model/gemma.py +0 -0
  41. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/model/llama.py +0 -0
  42. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/model/mistral.py +0 -0
  43. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/model/mixtral.py +0 -0
  44. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/model/mllama.py +0 -0
  45. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/model/phi3.py +0 -0
  46. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/model/qwen2.py +0 -0
  47. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
  48. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/rms_norm.py +0 -0
  49. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/rope.py +0 -0
  50. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/swiglu.py +0 -0
  51. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/transformers/trainer_integration.py +0 -0
  52. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/triton/__init__.py +0 -0
  53. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel/triton/monkey_patch.py +0 -0
  54. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
  55. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
  56. {liger_kernel_nightly-0.4.0.dev20241108173943 → liger_kernel_nightly-0.4.0.dev20241108174843}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.4.0.dev20241108173943
+Version: 0.4.0.dev20241108174843
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation

pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "liger_kernel_nightly"
-version = "0.4.0.dev20241108173943"
+version = "0.4.0.dev20241108174843"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }

src/liger_kernel/ops/cross_entropy.py
@@ -1,8 +1,21 @@
+import operator
+from typing import Optional
+
 import torch
 import triton
 import triton.language as tl

-from liger_kernel.ops.utils import element_mul_kernel, is_hip
+from liger_kernel.ops.utils import compare_version, element_mul_kernel, is_hip
+
+if compare_version("triton", operator.ge, "3.0.0"):
+    try:
+        # typical import path with dispatch available
+        from triton.language.extra.libdevice import tanh
+    except ModuleNotFoundError:
+        # for working with NGC containers
+        from triton.language.extra.cuda.libdevice import tanh
+else:
+    from triton.language.math import tanh

 _TRUE = tl.constexpr(1)
 _FALSE = tl.constexpr(0)
@@ -23,8 +36,10 @@ def liger_cross_entropy_kernel(
     lse_square_scale: tl.constexpr,
     label_smoothing: tl.constexpr,
     reduction: tl.constexpr,  # set it as constexpr since reduction is always known at compile time
+    softcap,
     RETURN_Z_LOSS: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
+    HAS_SOFTCAPPING: tl.constexpr,
 ):
     """
     This kernel computes both cross entropy loss and the gradient of the input.
@@ -45,7 +60,9 @@ def liger_cross_entropy_kernel(
     lse_square_scale (float): The scaler of (logsumexp(_input)) ^ 2 adding to the loss for the stability of training.
     RETURN_Z_LOSS (int): The boolean value to decide whether storing z loss to z_loss_ptr or not. It must be 0 or 1.
     reduction (str): The string for the reduction to apply
+    softcap (float): The upper threshold for scaling logits to the range (-softcap, +softcap).
     BLOCK_SIZE (int): The block size for Triton operations.
+    HAS_SOFTCAPPING (bool): The boolean value to determine whether applying soft-capping or not.
     """

     # https://github.com/triton-lang/triton/issues/1058
@@ -78,6 +95,8 @@ def liger_cross_entropy_kernel(
     ori_X_y = tl.load(
         X_ptr + y
     )  # we need to store the original value of X_y for the loss calculation
+    if HAS_SOFTCAPPING:
+        ori_X_y = softcap * tanh(ori_X_y / softcap)

     # Label smoothing is a general case of normal cross entropy
     # See the full derivation at https://github.com/linkedin/Liger-Kernel/pull/198#issue-2503665310
@@ -89,6 +108,8 @@ def liger_cross_entropy_kernel(
         X_block = tl.load(
             X_ptr + X_offsets, mask=X_offsets < n_cols, other=float("-inf")
         )
+        if HAS_SOFTCAPPING:
+            X_block = softcap * tanh(X_block / softcap)
         block_max = tl.max(X_block)
         if label_smoothing > 0:
             # scale X beforehand to avoid overflow
@@ -122,15 +143,24 @@ def liger_cross_entropy_kernel(
         X_block = tl.load(
             X_ptr + X_offsets, mask=X_offsets < n_cols, other=float("-inf")
         )
+        if HAS_SOFTCAPPING:
+            intermediate = tanh(X_block / softcap)
+            X_block = softcap * intermediate
         # softmax(x_i)
         X_block = tl.exp(X_block - m) / d
         # derivative of z-loss: 2 * lse_square_scale * lse * softmax(x_i)
         X_block += 2 * lse_square_scale * lse * X_block
         # smoothing term
         X_block += -eps
+        # special handle dx_y
+        X_block = tl.where(X_offsets != y, X_block, X_block - (1 - label_smoothing))
         # reduction scale
         if reduction == "mean":
             X_block = X_block / (n_non_ignore)
+        # chain rule
+        # d(softcap * tanh(x / softcap)) = (1 - tanh^2(x / softcap))
+        if HAS_SOFTCAPPING:
+            X_block = X_block * (1 - intermediate * intermediate)

         tl.store(X_ptr + X_offsets, X_block, mask=X_offsets < n_cols)

@@ -151,7 +181,7 @@ def liger_cross_entropy_kernel(
     # H(q', p) = (1 - label_smoothing) * H(q, p) + label_smoothing * H(u, p)
     #          = (1 - label_smoothing) * H(q, p) + eps * sum(logsoftmax(x_i))
     # By using m (global max of xi) and d (sum of e^(xi-m)), we can simplify as:
-    #          = (1 - label_smoothing) * H(q, p) + (-sum(x_i * eps) + label_smoothing * (m + logd))
+    #          = (1 - label_smoothing) * H(q, p) + (sum(-eps * x_i) + label_smoothing * (m + logd))
     # Refer to H(q', p) in section 7 of the paper: https://arxiv.org/pdf/1512.00567
     # pytorch: https://github.com/pytorch/pytorch/blob/2981534f54d49fa3a9755c9b0855e7929c2527f0/aten/src/ATen/native/LossNLL.cpp#L516
     # See full derivation at https://github.com/linkedin/Liger-Kernel/pull/198#issuecomment-2333753087
@@ -168,17 +198,9 @@ def liger_cross_entropy_kernel(
         z_loss = z_loss / n_non_ignore
         loss = loss / n_non_ignore

-    # 6. Specially handle the i==y case where `dx_y = (softmax(x_y) - (1 - label_smoothing) / N`
-    X_y = tl.load(X_ptr + y)
-    if reduction == "mean":
-        X_y += -(1 - label_smoothing) / (n_non_ignore)
-    else:
-        X_y += -(1 - label_smoothing)
-
     tl.store(loss_ptr, loss)
     if RETURN_Z_LOSS == _TRUE:
         tl.store(z_loss_ptr, z_loss)
-    tl.store(X_ptr + y, X_y)


 # The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
@@ -200,6 +222,7 @@ def cross_entropy_forward(
     lse_square_scale,
     label_smoothing,
     reduction,
+    softcap,
     return_z_loss,
 ):
     if not isinstance(return_z_loss, int):
@@ -247,8 +270,10 @@ def cross_entropy_forward(
         lse_square_scale=lse_square_scale,
         label_smoothing=label_smoothing,
         reduction=reduction,
+        softcap=softcap if softcap is not None else 0.0,
         RETURN_Z_LOSS=return_z_loss,
         BLOCK_SIZE=BLOCK_SIZE,
+        HAS_SOFTCAPPING=True if softcap is not None else False,
         # TODO: 32 seems to give the best performance
         # Performance is quite sensitive to num_warps
         num_warps=32 if not is_hip() else 16,
@@ -296,13 +321,14 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
     @staticmethod
     def forward(
         ctx,
-        _input,
-        target,
-        ignore_index=-100,
-        lse_square_scale=0.0,
-        label_smoothing=0.0,
-        reduction="mean",
-        return_z_loss=False,
+        _input: torch.Tensor,
+        target: torch.Tensor,
+        ignore_index: int = -100,
+        lse_square_scale: float = 0.0,
+        label_smoothing: float = 0.0,
+        reduction: str = "mean",
+        softcap: Optional[float] = None,
+        return_z_loss: bool = False,
     ):
         """
         The forward pass of the Liger Cross Entropy loss.
@@ -315,6 +341,7 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
         lse_square_scale (float): The scaler of (logsumexp(_input)) ^ 2 adding to the loss for the stability of training.
         label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
         reduction (str): The reduction to apply to the output: "none" | "mean | "sum".
+        softcap (Optional[float]): The upper threshold for scaling logits to the range (-softcap, +softcap).
         return_z_loss (bool): When `return_z_loss` is `True`, returns (loss, z_loss) instead of (loss, None). Default: `False`

         Returns:
@@ -327,6 +354,7 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
             lse_square_scale,
             label_smoothing,
             reduction,
+            softcap,
             return_z_loss,
         )
         # TODO: investigation
@@ -362,4 +390,5 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
             None,
             None,
             None,
+            None,
         )
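
Note: the kernel soft-caps logits as `softcap * tanh(x / softcap)` and, per the chain-rule comment above, scales the gradient by `1 - tanh^2(x / softcap)`. A minimal PyTorch sketch (not part of the package; the cap value and shapes are illustrative) that checks this derivative against autograd:

```python
import torch


def softcap_fn(x: torch.Tensor, softcap: float) -> torch.Tensor:
    # same transform the kernel applies when HAS_SOFTCAPPING is set
    return softcap * torch.tanh(x / softcap)


softcap = 30.0  # illustrative cap value
x = torch.randn(8, dtype=torch.float64, requires_grad=True)

softcap_fn(x, softcap).sum().backward()

# analytic derivative used in the kernel's backward pass
manual_grad = 1 - torch.tanh(x / softcap) ** 2
assert torch.allclose(x.grad, manual_grad)
```
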

src/liger_kernel/ops/fused_linear_cross_entropy.py
@@ -24,6 +24,7 @@ def fused_linear_cross_entropy_forward(
     lse_square_scale=0.0,
     label_smoothing=0.0,
     reduction="mean",
+    softcap=None,
 ):
     dtype = _input.dtype
     device = _input.device
@@ -95,7 +96,9 @@ def fused_linear_cross_entropy_forward(
             lse_square_scale=lse_square_scale,
             label_smoothing=label_smoothing,
             reduction=reduction,
+            softcap=softcap if softcap is not None else 0.0,
             RETURN_Z_LOSS=0,  # False
+            HAS_SOFTCAPPING=True if softcap is not None else False,
             BLOCK_SIZE=BLOCK_SIZE,
             num_warps=32 if not is_hip() else 16,
         )
@@ -207,6 +210,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
         lse_square_scale=0.0,
         label_smoothing=0.0,
         reduction="mean",
+        softcap=None,
     ):
         """
         Fusing the last linear layer with cross-entropy loss
@@ -234,6 +238,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
             lse_square_scale,
             label_smoothing,
             reduction,
+            softcap,
         )
         # downcast to dtype and store for backward
         ctx.save_for_backward(
@@ -250,4 +255,4 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
         grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_backward(
             grad_output, grad_input, grad_weight, grad_bias
         )
-        return (grad_input, grad_weight, None, grad_bias, None, None, None, None)
+        return (grad_input, grad_weight, None, grad_bias, None, None, None, None, None)
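
With `softcap` threaded through `fused_linear_cross_entropy_forward`, the fused path is intended to match a reference that materializes the logits, soft-caps them, and applies standard cross entropy (the same fallback Gemma2 uses in its non-fused branch below). A hedged equivalence sketch, assuming a CUDA device since the loss is a Triton kernel; sizes and the tolerance are illustrative:

```python
import torch
from liger_kernel.transformers.fused_linear_cross_entropy import (
    LigerFusedLinearCrossEntropyLoss,
)

BT, H, V, softcap = 16, 64, 128, 30.0  # (batch * seq_len, hidden, vocab); illustrative
hidden = torch.randn(BT, H, device="cuda")
weight = torch.randn(V, H, device="cuda")
target = torch.randint(0, V, (BT,), device="cuda")

fused_loss = LigerFusedLinearCrossEntropyLoss(softcap=softcap)(weight, hidden, target)

# reference: materialize logits, soft-cap them, then plain cross entropy
logits = softcap * torch.tanh((hidden @ weight.T) / softcap)
ref_loss = torch.nn.functional.cross_entropy(logits, target)

assert torch.allclose(fused_loss, ref_loss, atol=1e-4)
```
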

src/liger_kernel/transformers/cross_entropy.py
@@ -1,34 +1,43 @@
-import torch.nn as nn
+from typing import Optional
+
+import torch

 from liger_kernel.ops.cross_entropy import LigerCrossEntropyFunction


-class LigerCrossEntropyLoss(nn.Module):
+class LigerCrossEntropyLoss(torch.nn.Module):
     def __init__(
         self,
-        ignore_index=-100,
-        lse_square_scale=0.0,
-        label_smoothing=0.0,
-        reduction="mean",
-        return_z_loss=False,
+        ignore_index: int = -100,
+        lse_square_scale: float = 0.0,
+        label_smoothing: float = 0.0,
+        reduction: str = "mean",
+        softcap: Optional[float] = None,
+        return_z_loss: bool = False,
     ):
         super().__init__()
+        assert (label_smoothing >= 0) and (
+            label_smoothing <= 1
+        ), f"label_smoothing must be between 0.0 and 1.0. Got: {label_smoothing}"
+        assert (label_smoothing >= 0) and (
+            label_smoothing <= 1
+        ), f"label_smoothing must be between 0.0 and 1.0. Got: {label_smoothing}"
+        assert reduction in {
+            "mean",
+            "sum",
+            "none",
+        }, f"reduction must be one of 'mean', 'sum', or 'none'. Got: {reduction}"
+        assert (
+            softcap is None or softcap > 0
+        ), f"softcap must greater than 0.0 or None. Got: {softcap}"
         self.ignore_index = ignore_index
         self.lse_square_scale = lse_square_scale
         self.label_smoothing = label_smoothing
         self.reduction = reduction
+        self.softcap = softcap
         self.return_z_loss = return_z_loss

-        assert (self.label_smoothing >= 0) and (
-            self.label_smoothing <= 1
-        ), f"label_smoothing must be between 0.0 and 1.0. Got: {self.label_smoothing}"
-        assert self.reduction in {
-            "mean",
-            "sum",
-            "none",
-        }, f"reduction must be one of 'mean', 'sum', or 'none'. Got: {self.reduction}"
-
-    def forward(self, _input, target):
+    def forward(self, _input: torch.Tensor, target: torch.Tensor):
         loss, z_loss = LigerCrossEntropyFunction.apply(
             _input,
             target,
@@ -36,6 +45,7 @@ class LigerCrossEntropyLoss(nn.Module):
             self.lse_square_scale,
             self.label_smoothing,
             self.reduction,
+            self.softcap,
             self.return_z_loss,
         )
         if not self.return_z_loss:
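
The module-level wrapper now accepts `softcap` alongside the existing options and validates it in `__init__`. A short usage sketch (values are illustrative; the underlying op is a Triton kernel and runs on GPU):

```python
import torch
from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss

ce = LigerCrossEntropyLoss(softcap=30.0, label_smoothing=0.1, reduction="mean")

logits = torch.randn(8, 128, device="cuda", requires_grad=True)  # (batch * seq_len, vocab)
target = torch.randint(0, 128, (8,), device="cuda")

loss = ce(logits, target)
loss.backward()
```
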

src/liger_kernel/transformers/fused_linear_cross_entropy.py
@@ -1,26 +1,38 @@
-import torch.nn as nn
+from typing import Optional
+
+import torch

 from liger_kernel.ops.fused_linear_cross_entropy import (
     LigerFusedLinearCrossEntropyFunction,
 )


-class LigerFusedLinearCrossEntropyLoss(nn.Module):
+class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
     def __init__(
         self,
-        ignore_index=-100,
-        label_smoothing=0.0,
-        reduction="mean",
-        lse_square_scale=0.0,
+        ignore_index: int = -100,
+        lse_square_scale: float = 0.0,
+        label_smoothing: float = 0.0,
+        reduction: str = "mean",
+        softcap: Optional[float] = None,
     ):
         super().__init__()
+        assert (label_smoothing >= 0) and (
+            label_smoothing <= 1
+        ), f"label_smoothing must be between 0.0 and 1.0. Got: {label_smoothing}"
+        assert reduction in {
+            "mean",
+            "sum",
+            "none",
+        }, f"reduction must be one of 'mean', 'sum', or 'none'. Got: {reduction}"
+        assert (
+            softcap is None or softcap > 0
+        ), f"softcap must greater than 0.0 or None. Got: {softcap}"
         self.ignore_index = ignore_index
+        self.lse_square_scale = lse_square_scale
         self.label_smoothing = label_smoothing
         self.reduction = reduction
-        self.lse_square_scale = lse_square_scale
-        assert (self.label_smoothing >= 0) and (
-            self.label_smoothing <= 1
-        ), f"label_smoothing must be between 0.0 and 1.0. Got: {self.label_smoothing}"
+        self.softcap = softcap

     def forward(self, lin_weight, _input, target, bias=None):
         return LigerFusedLinearCrossEntropyFunction.apply(
@@ -32,4 +44,5 @@ class LigerFusedLinearCrossEntropyLoss(nn.Module):
             self.lse_square_scale,
             self.label_smoothing,
             self.reduction,
+            self.softcap,
         )
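
Note the argument order on this module: the projection weight comes first, then the flattened hidden states and targets, which is how the Gemma2 forward below calls it against `lm_head.weight`. A hedged sketch with illustrative sizes:

```python
import torch
from liger_kernel.transformers.fused_linear_cross_entropy import (
    LigerFusedLinearCrossEntropyLoss,
)

hidden_size, vocab_size = 64, 256  # illustrative
lm_head = torch.nn.Linear(hidden_size, vocab_size, bias=False, device="cuda")
flce = LigerFusedLinearCrossEntropyLoss(softcap=30.0)

hidden_states = torch.randn(4 * 8, hidden_size, device="cuda")  # already flattened to (B*T, H)
labels = torch.randint(0, vocab_size, (4 * 8,), device="cuda")

# weight first, then inputs and targets; the full logits tensor is never materialized
loss = flce(lm_head.weight, hidden_states, labels)
```
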

src/liger_kernel/transformers/model/gemma2.py
@@ -0,0 +1,277 @@
+import logging
+from typing import Optional, Tuple, Union
+
+import torch
+from torch.nn import CrossEntropyLoss
+from transformers.cache_utils import HybridCache
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.models.gemma2.modeling_gemma2 import (
+    _CONFIG_FOR_DOC,
+    GEMMA2_INPUTS_DOCSTRING,
+)
+from transformers.utils import (
+    add_start_docstrings_to_model_forward,
+    replace_return_docstrings,
+)
+
+from liger_kernel.transformers.fused_linear_cross_entropy import (
+    LigerFusedLinearCrossEntropyLoss,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def lce_forward_deprecated(
+    self,
+    input_ids: torch.LongTensor = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[HybridCache] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    labels: Optional[torch.LongTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+) -> Union[Tuple, CausalLMOutputWithPast]:
+    r"""
+    Args:
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+    Returns:
+
+    Example:
+
+    ```python
+    >>> from transformers import AutoTokenizer, GemmaForCausalLM
+    >>> model = GemmaForCausalLM.from_pretrained("google/gemma-2-9b")
+    >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
+    >>> prompt = "What is your favorite condiment?"
+    >>> inputs = tokenizer(prompt, return_tensors="pt")
+    >>> # Generate
+    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    "What is your favorite condiment?"
+    ```"""
+
+    if self.training and self.config._attn_implementation != "eager":
+        logger.warning_once(
+            "It is strongly recommended to train Gemma2 models with the `eager` attention implementation "
+            f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
+        )
+    output_attentions = (
+        output_attentions
+        if output_attentions is not None
+        else self.config.output_attentions
+    )
+    output_hidden_states = (
+        output_hidden_states
+        if output_hidden_states is not None
+        else self.config.output_hidden_states
+    )
+    return_dict = (
+        return_dict if return_dict is not None else self.config.use_return_dict
+    )
+    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+    outputs = self.model(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        return_dict=return_dict,
+        cache_position=cache_position,
+    )
+
+    hidden_states = outputs[0]
+
+    loss = None
+    logits = None
+
+    if self.training and (labels is not None):
+        shift_hidden_states = hidden_states[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+
+        # flatten
+
+        shift_hidden_states = shift_hidden_states.view(-1, self.config.hidden_size)
+        shift_labels = shift_labels.view(-1)
+
+        lce = LigerFusedLinearCrossEntropyLoss(
+            softcap=self.config.final_logit_softcapping
+        )
+        loss = lce(self.lm_head.weight, shift_hidden_states, shift_labels)
+
+    else:
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        logits = self.lm_head(hidden_states)
+        if self.config.final_logit_softcapping is not None:
+            logits = logits / self.config.final_logit_softcapping
+            logits = torch.tanh(logits)
+            logits = logits * self.config.final_logit_softcapping
+
+        loss = None
+        if labels is not None:
+            # Upcast to float if we need to compute the loss to avoid potential precision issues
+            logits = logits.float()
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        return (loss,) + output if loss is not None else output
+
+    return CausalLMOutputWithPast(
+        loss=loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+        attentions=outputs.attentions,
+    )
+
+
+@add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
+@replace_return_docstrings(
+    output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
+)
+def lce_forward(
+    self,
+    input_ids: torch.LongTensor = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[HybridCache] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    labels: Optional[torch.LongTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+    num_logits_to_keep: int = 0,
+    **loss_kwargs,
+) -> Union[Tuple, CausalLMOutputWithPast]:
+    r"""
+    Args:
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        num_logits_to_keep (`int`, *optional*):
+            Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
+            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+
+    Returns:
+
+    Example:
+
+    ```python
+    >>> from transformers import AutoTokenizer, GemmaForCausalLM
+
+    >>> model = GemmaForCausalLM.from_pretrained("google/gemma-2-9b")
+    >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
+
+    >>> prompt = "What is your favorite condiment?"
+    >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+    >>> # Generate
+    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    "What is your favorite condiment?"
+    ```"""
+
+    if self.training and self.config._attn_implementation != "eager":
+        logger.warning_once(
+            "It is strongly recommended to train Gemma2 models with the `eager` attention implementation "
+            f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
+        )
+    output_attentions = (
+        output_attentions
+        if output_attentions is not None
+        else self.config.output_attentions
+    )
+    output_hidden_states = (
+        output_hidden_states
+        if output_hidden_states is not None
+        else self.config.output_hidden_states
+    )
+    return_dict = (
+        return_dict if return_dict is not None else self.config.use_return_dict
+    )
+    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+    outputs = self.model(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        return_dict=return_dict,
+        cache_position=cache_position,
+    )
+
+    hidden_states = outputs[0]
+
+    logits = None
+    loss = None
+    # if in training mode, don't materialize logits
+    if self.training and (labels is not None):
+        # We do the same thing as ForCausalLMLoss but using Liger FLCE
+
+        shift_hidden_states = hidden_states[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+
+        # flatten tokens
+        shift_hidden_states = shift_hidden_states.view(-1, self.config.hidden_size)
+        shift_labels = shift_labels.view(-1)
+
+        reduction = "sum" if "num_items_in_batch" in loss_kwargs else "mean"
+        lce = LigerFusedLinearCrossEntropyLoss(
+            softcap=self.config.final_logit_softcapping,
+            reduction=reduction,
+        )
+
+        loss = lce(self.lm_head.weight, shift_hidden_states, shift_labels)
+        if reduction == "sum":
+            loss /= loss_kwargs["num_items_in_batch"]
+
+    else:  # if in inference mode materialize logits
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
+        if self.config.final_logit_softcapping is not None:
+            logits = logits / self.config.final_logit_softcapping
+            logits = torch.tanh(logits)
+            logits = logits * self.config.final_logit_softcapping
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
+
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        return (loss,) + output if loss is not None else output
+
+    return CausalLMOutputWithPast(
+        loss=loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+        attentions=outputs.attentions,
+    )
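
One detail worth calling out in the new `lce_forward`: when the Trainer supplies `num_items_in_batch` (gradient-accumulation-aware loss in recent transformers versions), the loss is computed with `reduction="sum"` and divided by that global token count, so every token is weighted equally across micro-batches instead of averaging per micro-batch. A small pure-PyTorch sketch of the difference (numbers are illustrative):

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
vocab = 16
# two micro-batches with different numbers of tokens
logits_a, labels_a = torch.randn(3, vocab), torch.randint(0, vocab, (3,))
logits_b, labels_b = torch.randn(5, vocab), torch.randint(0, vocab, (5,))
num_items_in_batch = 3 + 5

# per-micro-batch mean, then averaged: weights both micro-batches equally
naive = (F.cross_entropy(logits_a, labels_a) + F.cross_entropy(logits_b, labels_b)) / 2

# sum per micro-batch, divided by the global token count:
# every token carries the same weight, which is what the "sum" path reproduces
corrected = (
    F.cross_entropy(logits_a, labels_a, reduction="sum")
    + F.cross_entropy(logits_b, labels_b, reduction="sum")
) / num_items_in_batch

print(naive, corrected)  # generally differ when the token counts differ
```
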

src/liger_kernel/transformers/monkey_patch.py
@@ -14,6 +14,10 @@ from liger_kernel.transformers.model.gemma import lce_forward as gemma_lce_forwa
 from liger_kernel.transformers.model.gemma import (
     lce_forward_deprecated as gemma_lce_forward_deprecated,
 )
+from liger_kernel.transformers.model.gemma2 import lce_forward as gemma2_lce_forward
+from liger_kernel.transformers.model.gemma2 import (
+    lce_forward_deprecated as gemma2_lce_forward_deprected,
+)
 from liger_kernel.transformers.model.llama import lce_forward as llama_lce_forward
 from liger_kernel.transformers.model.llama import (
     lce_forward_deprecated as llama_lce_forward_deprecated,
@@ -252,7 +256,7 @@ def apply_liger_kernel_to_mistral(
     Apply Liger kernels to replace original implementation in HuggingFace Mistral models

     Args:
-        rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
+        rope (bool): Whether to apply Liger's rotary position embedding. Default is False.
         cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is True.
         fused_linear_cross_entropy (bool):
             Whether to apply Liger's fused linear cross entropy loss. Default is True.
@@ -445,7 +449,8 @@

 def apply_liger_kernel_to_gemma2(
     rope: bool = True,
-    cross_entropy: bool = True,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
     rms_norm: bool = True,
     geglu: bool = True,
     model: PreTrainedModel = None,
@@ -456,12 +461,19 @@

     Args:
         rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
-        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is True.
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
         rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
         geglu (bool): Whether to apply Liger's GeGLU MLP. Default is True.
         model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
            loaded. Default is None.
     """
+    assert not (
+        cross_entropy and fused_linear_cross_entropy
+    ), "cross_entropy and fused_linear_cross_entropy cannot both be True."
     from transformers.models.gemma2 import modeling_gemma2
     from transformers.models.gemma2.modeling_gemma2 import Gemma2Model

@@ -479,6 +491,12 @@
         modeling_gemma2.Gemma2RMSNorm = LigerRMSNormForGemma2
     if cross_entropy:
         modeling_gemma2.CrossEntropyLoss = LigerCrossEntropyLoss
+    if fused_linear_cross_entropy:
+        if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
+            modeling_gemma2.Gemma2ForCausalLM.forward = gemma2_lce_forward
+        else:
+            logger.warning(TRANSFORMER_DEPRECATION_WARNING)
+            modeling_gemma2.Gemma2ForCausalLM.forward = gemma2_lce_forward_deprected
     if geglu:
         modeling_gemma2.Gemma2MLP = LigerGEGLUMLP

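
With the new flag, the Gemma2 patch defaults to the fused linear cross entropy path and rejects enabling both `cross_entropy` and `fused_linear_cross_entropy` at once. A hedged usage sketch (the checkpoint name is taken from the docstring example above; the patch should be applied before the model is instantiated, or an already-loaded instance can be passed via `model=`):

```python
from transformers import AutoModelForCausalLM
from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_gemma2

# patch the Gemma2 modeling classes before creating the model
apply_liger_kernel_to_gemma2(
    rope=True,
    cross_entropy=False,              # default; mutually exclusive with the fused path
    fused_linear_cross_entropy=True,  # default; logits are not materialized during training
    rms_norm=True,
    geglu=True,
)

model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b",
    attn_implementation="eager",  # recommended for Gemma2 training per the warning in lce_forward
)
```
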

src/liger_kernel_nightly.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.4.0.dev20241108173943
+Version: 0.4.0.dev20241108174843
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation

src/liger_kernel_nightly.egg-info/SOURCES.txt
@@ -37,6 +37,7 @@ src/liger_kernel/transformers/trainer_integration.py
 src/liger_kernel/transformers/experimental/embedding.py
 src/liger_kernel/transformers/model/__init__.py
 src/liger_kernel/transformers/model/gemma.py
+src/liger_kernel/transformers/model/gemma2.py
 src/liger_kernel/transformers/model/llama.py
 src/liger_kernel/transformers/model/mistral.py
 src/liger_kernel/transformers/model/mixtral.py