liger-kernel 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
liger_kernel/env_report.py ADDED
@@ -0,0 +1,46 @@
+ import platform
+ import sys
+
+
+ def print_env_report():
+     """
+     Prints a report of the environment. Useful for debugging and reproducibility.
+     Usage:
+     ```
+     python -m liger_kernel.env_report
+     ```
+     """
+     print("Environment Report:")
+     print("-------------------")
+     print(f"Operating System: {platform.platform()}")
+     print(f"Python version: {sys.version.split()[0]}")
+
+     try:
+         import torch
+
+         print(f"PyTorch version: {torch.__version__}")
+         cuda_version = (
+             torch.version.cuda if torch.cuda.is_available() else "Not available"
+         )
+         print(f"CUDA version: {cuda_version}")
+     except ImportError:
+         print("PyTorch: Not installed")
+         print("CUDA version: Unable to query")
+
+     try:
+         import triton
+
+         print(f"Triton version: {triton.__version__}")
+     except ImportError:
+         print("Triton: Not installed")
+
+     try:
+         import transformers
+
+         print(f"Transformers version: {transformers.__version__}")
+     except ImportError:
+         print("Transformers: Not installed")
+
+
+ if __name__ == "__main__":
+     print_env_report()
@@ -56,7 +56,7 @@ def liger_cross_entropy_kernel(
  # Online softmax: 2 loads + 1 store (compared with 3 loads + 1 store for the safe softmax)
  # Refer to Algorithm 3 in the paper: https://arxiv.org/pdf/1805.02867

- # 3. [Oneline softmax] first pass: find max + sum
+ # 3. [Online softmax] first pass: find max + sum
  m = float("-inf") # m is the max value. use the notation from the paper
  d = 0.0 # d is the sum. use the notation from the paper
  ori_X_y = tl.load(
@@ -73,10 +73,10 @@ def liger_cross_entropy_kernel(
  d = d * tl.exp(m - m_new) + tl.sum(tl.exp(X_block - m_new))
  m = m_new

- # 4. [Oneline softmax] second pass: calculate the gradients
+ # 4. [Online softmax] second pass: calculate the gradients
  # dx_y = (softmax(x_y) - 1) / N
  # dx_i = softmax(x_i) / N, i != y
- # N is the number of non ingored elements in the batch
+ # N is the number of non ignored elements in the batch
  for i in range(0, n_cols, BLOCK_SIZE):
  X_offsets = i + tl.arange(0, BLOCK_SIZE)
  X_block = tl.load(
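The comments above describe the online-softmax trick the kernel relies on: a single sweep maintains a running max `m` and running sum `d`, and a second pass turns the stored logits into gradients. A small plain-Python sketch of the first-pass recurrence (illustrative only, not part of the package):

```
import math


def online_max_and_sum(x, block_size):
    """One sweep over x in blocks, keeping a running max m and a running
    sum d of exp(x_i - m), as in Algorithm 3 of the linked paper."""
    m, d = float("-inf"), 0.0
    for i in range(0, len(x), block_size):
        block = x[i : i + block_size]
        m_new = max(m, max(block))
        # rescale the old sum to the new max, then add this block's contribution
        d = d * math.exp(m - m_new) + sum(math.exp(v - m_new) for v in block)
        m = m_new
    return m, d  # softmax(x_i) = exp(x_i - m) / d


# softmax of [1.0, 2.0, 3.0, 4.0] processed two elements at a time
m, d = online_max_and_sum([1.0, 2.0, 3.0, 4.0], block_size=2)
```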
@@ -86,7 +86,7 @@ def liger_cross_entropy_kernel(
  tl.store(X_ptr + X_offsets, X_block, mask=X_offsets < n_cols)

  # We need tl.debug_barrier() to ensure the new result of X_ptr is written as mentioned in
- # ttps://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/ops/cross_entropy.py#L34
+ # https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/ops/cross_entropy.py#L34
  tl.debug_barrier()

  # 5. Calculate the loss
@@ -196,7 +196,7 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
  ignore_index=ignore_index,
  BLOCK_SIZE=BLOCK_SIZE,
  # TODO: 32 seems to give the best performance
- # Performance is quite sentitive to num_warps
+ # Performance is quite sensitive to num_warps
  num_warps=32,
  )

@@ -11,7 +11,7 @@ MAX_FUSED_SIZE = 65536 // 2

  class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
  @staticmethod
- def forward(ctx, _input, linear, target, ignore_index):
+ def forward(ctx, _input, weight, target, bias=None, ignore_index=-100):
  """
  Fusing the last linear layer with cross-entropy loss
  Reference: https://github.com/mgmalek/efficient_cross_entropy
@@ -23,7 +23,8 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):

  _input: (B*T, H) where B is batch size, T is sequence length, H is hidden dimension.
  target: (B*T) where each value is in [0, V-1]
- linear: linear projection matrix of shape V x H.
+ weight: (V, H) where V is the number of classes
+ bias: (V) where V is the number of classes
  ignore_index: the index to ignore in the target
  """
  dtype = (
@@ -36,12 +37,12 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
  # inputs have shape: BT x H
  # materialized activations will have shape: BT x V
  # the increase in memory = BT x V
- # reduction can be achieved by paritioning the number of tokens BT into smaller chunks.
+ # reduction can be achieved by partitioning the number of tokens BT into smaller chunks.
  # for ex: if we were to achieve the same memory consumption as BT x H, then the chunk size should be:
  # inc_factor = (V+H-1)//H, chunk_size = (BT + inc_factor - 1)//inc_factor
  # for ex: BT = 4096*4, V = 32000, H = 4096 ==> inc_factor = 8, chunk_size = 2048
  BT, H = _input.shape
- V = linear.shape[0]
+ V = weight.shape[0]
  BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))

  inc_factor = triton.cdiv(V, H) # (V + H - 1) // H
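The chunking arithmetic in the comments above can be checked directly; a standalone sketch (plain Python, reusing the example figures from the comment):

```
def chunking_plan(BT, V, H):
    # ceiling divisions, matching triton.cdiv
    inc_factor = (V + H - 1) // H  # how much larger BT x V is than BT x H
    chunk_size = (BT + inc_factor - 1) // inc_factor
    num_chunks = (BT + chunk_size - 1) // chunk_size
    return inc_factor, chunk_size, num_chunks


# Example from the comment: BT = 4096 * 4, V = 32000, H = 4096
print(chunking_plan(4096 * 4, 32000, 4096))  # (8, 2048, 8)
```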
@@ -50,9 +51,9 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
  ) # (BT + inc_factor - 1) // inc_factor
  num_chunks = triton.cdiv(BT, chunk_size) # (BT + chunk_size - 1) // chunk_size

- grad_linear = torch.zeros_like(linear, device=device)
+ grad_weight = torch.zeros_like(weight, device=device)
  grad_input = torch.zeros_like(_input, device=device)
-
+ grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
  # we use fp32 for loss accumulator
  loss_1d = torch.zeros(BT, dtype=torch.float32, device=device)

@@ -64,7 +65,9 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
  _input_chunk = _input[start_idx:end_idx] # chunk_size x H

  # when doing matmul, use the original precision
- logits_chunk = _input_chunk @ linear.t() # chunk_size x V
+ logits_chunk = _input_chunk @ weight.t() # chunk_size x V
+ if bias is not None:
+ logits_chunk = logits_chunk + bias
  target_chunk = target[start_idx:end_idx] # chunk_size,

  n_rows = logits_chunk.shape[0]
@@ -95,39 +98,52 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
  num_warps=32,
  )

- # gradient of logits_chunk is computed inplace by the above triton kernel.
+ # gradient of logits_chunk is computed in-place by the above triton kernel.
  # Following HuggingFace model source code, we do the forward and backward
  # w.r.t. logits in fp32 for numerical stability especially as the num classes (vocab size) is huge.
  # (reference: https://github.com/huggingface/transformers/blob/v4.42.4/src/transformers/models/llama/modeling_llama.py#L1194)
  # Propagating to lm_head's backward, we'll switch back to the original dtype.
  logits_chunk = logits_chunk.to(dtype)

- # gradient of logits_chunk is computed inplace by the above triton kernel and is of shape: chunk_size x V
+ # gradient of logits_chunk is computed in-place by the above triton kernel and is of shape: chunk_size x V
  # thus grad_input[start_idx: end_idx] should be of shape: chunk_size x H
  # additionally, since we are chunking the inputs, observe that the loss and gradients are calculated only
  # on `n_non_ignore` tokens. However, the gradient of the input should be calculated for all tokens.
  # Thus, we need an additional scaling factor of (n_non_ignore/total_n_non_ignore) to scale the gradients.
- grad_logits_chunk = logits_chunk * (n_non_ignore / total_n_non_ignore)
- grad_input[start_idx:end_idx] = grad_logits_chunk @ linear
-
+ grad_logits_chunk = logits_chunk * (
+ n_non_ignore / total_n_non_ignore
+ ) # chunk_size x V
+ grad_input[start_idx:end_idx] = grad_logits_chunk @ weight
  torch.addmm(
- input=grad_linear,
+ input=grad_weight,
  mat1=logits_chunk.t(),
  mat2=_input_chunk,
- out=grad_linear,
+ out=grad_weight,
  alpha=n_non_ignore / total_n_non_ignore,
  beta=1.0,
  )

+ if bias is not None:
+ torch.add(
+ input=grad_bias,
+ other=logits_chunk.sum(dim=0),
+ out=grad_bias,
+ alpha=n_non_ignore / total_n_non_ignore,
+ )
+
  loss = torch.sum(loss_1d) / total_n_non_ignore

  # downcast to dtype and store for backward
- ctx.save_for_backward(grad_input.detach(), grad_linear.detach())
+ ctx.save_for_backward(
+ grad_input.detach(),
+ grad_weight.detach(),
+ grad_bias.detach() if bias is not None else None,
+ )
  return loss

  @staticmethod
  def backward(ctx, grad_output):
- (grad_input, grad_linear) = ctx.saved_tensors
+ (grad_input, grad_weight, grad_bias) = ctx.saved_tensors
  # If cross entropy is the last layer, grad_output is 1.0. Skip the mul to save time
  if torch.ne(grad_output, torch.tensor(1.0, device=grad_output.device)):
  # We use a Triton kernel instead of a PyTorch operation because modifying inputs in-place
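For readability, the per-chunk accumulation that the `torch.addmm`/`torch.add` calls in the forward pass above implement boils down to the following schematic PyTorch equivalent (a sketch, not the actual execution path). Here `dlogits` stands for the in-place cross-entropy gradient of the chunk, and `n_non_ignore / total_n_non_ignore` rescales each chunk so the final gradients are averaged over all non-ignored tokens:

```
import torch


def accumulate_chunk(grad_weight, grad_bias, dlogits, input_chunk,
                     n_non_ignore, total_n_non_ignore):
    scale = n_non_ignore / total_n_non_ignore
    # grad_weight += scale * dlogits^T @ x  (what the in-place torch.addmm does)
    grad_weight.add_(dlogits.t() @ input_chunk, alpha=scale)
    if grad_bias is not None:
        # grad_bias += scale * column-sum of dlogits  (what the in-place torch.add does)
        grad_bias.add_(dlogits.sum(dim=0), alpha=scale)
```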
@@ -145,17 +161,30 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
  num_warps=32,
  )

- # handle grad_linear
- V, H = grad_linear.shape
+ # handle grad_weight
+ V, H = grad_weight.shape
  n_rows = V

  element_mul[(n_rows,)](
- grad_linear,
- grad_linear.stride(-2),
+ grad_weight,
+ grad_weight.stride(-2),
  grad_output,
  H,
  BLOCK_SIZE=BLOCK_SIZE,
  num_warps=32,
  )

- return (grad_input, grad_linear, None, None)
+ if grad_bias is not None:
+ V = grad_bias.shape[0]
+ n_rows = V
+
+ element_mul[(n_rows,)](
+ grad_bias,
+ grad_bias.stride(-1),
+ grad_output,
+ 1,
+ BLOCK_SIZE=BLOCK_SIZE,
+ num_warps=32,
+ )
+
+ return (grad_input, grad_weight, None, grad_bias, None)
liger_kernel/ops/geglu.py CHANGED
@@ -11,7 +11,12 @@ from liger_kernel.ops.utils import (
  )

  if compare_version("triton", operator.ge, "3.0.0"):
- from triton.language.extra.libdevice import tanh
+ try:
+ # typical import path with dispatch available
+ from triton.language.extra.libdevice import tanh
+ except ModuleNotFoundError:
+ # for working with NGC containers
+ from triton.language.extra.cuda.libdevice import tanh
  else:
  from triton.language.math import tanh

@@ -1,8 +1,29 @@
+ import operator
+
  import torch
  import triton
  import triton.language as tl

- from liger_kernel.ops.utils import calculate_settings, ensure_contiguous
+ from liger_kernel.ops.utils import (
+ calculate_settings,
+ compare_version,
+ ensure_contiguous,
+ )
+
+ if compare_version("triton", operator.ge, "3.0.0"):
+ try:
+ # typical import path with dispatch available
+ from triton.language.extra.libdevice import rsqrt
+ except ModuleNotFoundError:
+ # for working with NGC containers
+ from triton.language.extra.cuda.libdevice import rsqrt
+ else:
+ from triton.language.math import rsqrt
+
+
+ _CASTING_MODE_NONE = tl.constexpr(-1)
+ _CASTING_MODE_LLAMA = tl.constexpr(0)
+ _CASTING_MODE_GEMMA = tl.constexpr(1)


  @triton.jit
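The version-dependent import guard used in the two hunks above relies on `compare_version` from `liger_kernel.ops.utils`, which is not shown in this diff. As a purely illustrative stand-in (an assumption about its behavior, not the library's actual code), something with the same call shape could be:

```
import operator
from importlib.metadata import PackageNotFoundError, version

from packaging.version import Version


def compare_version(package: str, op, target: str) -> bool:
    """Return op(installed_version, target), e.g. operator.ge for "at least"."""
    try:
        installed = Version(version(package))
    except PackageNotFoundError:
        return False
    return op(installed, Version(target))


# mirrors the guard above
assert isinstance(compare_version("triton", operator.ge, "3.0.0"), bool)
```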
@@ -17,10 +38,12 @@ def _rms_norm_forward(
  r_row_stride,
  n_cols,
  eps,
+ offset,
+ casting_mode: tl.constexpr, # constexpr so the `if` blocks can be optimized out
  BLOCK_SIZE: tl.constexpr,
  ):
  """
- y_i = (x_i / (RMS)) * wi, RMS = sqrt(sum(x_i^2) / N)
+ y_i = (x_i / (RMS)) * (offset + wi), RMS = sqrt(sum(x_i^2) / N)

  Reference:
  1. https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
@@ -37,17 +60,33 @@ def _rms_norm_forward(
  r_ptr += row_idx * r_row_stride

  X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0)
+ X_row_dtype = X_row.dtype
  W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0)

+ # On Llama, only inv_rms is computed on fp32
+ if casting_mode == _CASTING_MODE_LLAMA:
+ X_row = X_row.to(tl.float32)
+
+ # Gemma computes everything on fp32, and then casts back the output to the original dtype
+ if casting_mode == _CASTING_MODE_GEMMA:
+ W_row = W_row.to(tl.float32)
+ X_row = X_row.to(tl.float32)
+
  mean_square = tl.sum(X_row * X_row, axis=0) / n_cols
- inv_rms = tl.math.rsqrt(mean_square + eps)
+ inv_rms = rsqrt(mean_square + eps)

  # We can save time by caching rms with minimal memory overhead
  # because rms is much smaller compared to X_row, as rms is for each row.
  # However, on the computation side, it can save 4 operations (*, sum, /, sqrt).
  tl.store(r_ptr, inv_rms)

- Y_row = X_row * inv_rms * W_row
+ X_row = X_row * inv_rms
+
+ # On Llama, the multiplication with the weight is done on the original dtype
+ if casting_mode == _CASTING_MODE_LLAMA:
+ X_row = X_row.to(X_row_dtype)
+
+ Y_row = X_row * (offset + W_row)

  tl.store(Y_ptr + col_offsets, Y_row, mask=mask)

@@ -66,10 +105,12 @@ def _rms_norm_backward(
  dW_row_stride,
  n_cols,
  eps,
+ offset,
+ casting_mode: tl.constexpr,
  BLOCK_SIZE: tl.constexpr,
  ):
  """
- dx = (1 / RMS) * [dy * w - (1 / N) * (1 / RMS^2) * ((dy * w) dot x) * x]. * means element-wise multiplication, whileas dot means dot product
+ dx = (1 / RMS) * [dy * (w + offset) - (1 / N) * (1 / RMS^2) * ((dy * (w + offset)) dot x) * x]. * means element-wise multiplication, whereas dot means dot product
  dw = sum(dy * (x / RMS)). summation over BxT dimension
  """

@@ -85,33 +126,95 @@ def _rms_norm_backward(
  dY_row = tl.load(dY_ptr + col_offsets, mask=mask, other=0)
  X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0)
  W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0)
+ original_x_dtype = X_row.dtype

  # Get cached rms
  inv_rms_row = tl.load(r_ptr)

- dX_row = (inv_rms_row) * (
- dY_row * W_row
- - (1 / n_cols)
- * inv_rms_row
- * inv_rms_row
- * tl.sum(dY_row * W_row * X_row, axis=0)
- * X_row
- )
- tl.store(dY_ptr + col_offsets, dX_row, mask=mask)
+ W_row = W_row + offset
+
+ # Different backward graphs for different casting modes
+ if casting_mode == _CASTING_MODE_LLAMA:
+ X_row = X_row.to(tl.float32)
+ m = (dY_row * W_row).to(tl.float32)
+ dX_row = inv_rms_row * m
+
+ dX_row += (inv_rms_row) * (
+ -(1 / n_cols)
+ * inv_rms_row
+ * inv_rms_row
+ * tl.sum(m * X_row, axis=0)
+ * X_row
+ )
+
+ if casting_mode == _CASTING_MODE_GEMMA:
+ dY_row, W_row, X_row = (
+ dY_row.to(tl.float32),
+ W_row.to(tl.float32),
+ X_row.to(tl.float32),
+ )
+ dX_row = inv_rms_row * dY_row * W_row
+
+ dX_row += (inv_rms_row) * (
+ -(1 / n_cols)
+ * inv_rms_row
+ * inv_rms_row
+ * tl.sum(dY_row * W_row * X_row, axis=0)
+ * X_row
+ )

  # calculate the gradient of W
- dW_row = dY_row * X_row * inv_rms_row
+ if casting_mode == _CASTING_MODE_LLAMA:
+ dW_row = dY_row * (X_row * inv_rms_row).to(original_x_dtype)
+ else:
+ # here X_row is already in fp32 (see previous if block)
+ dW_row = dY_row * (X_row * inv_rms_row)
+
+ tl.store(dY_ptr + col_offsets, dX_row, mask=mask)
  tl.store(dW_ptr + col_offsets, dW_row, mask=mask)


+ _str_to_casting_mode = {
+ "llama": _CASTING_MODE_LLAMA.value,
+ "gemma": _CASTING_MODE_GEMMA.value,
+ "none": _CASTING_MODE_NONE.value,
+ }
+
+
  class LigerRMSNormFunction(torch.autograd.Function):
+ """
+ Performs RMSNorm (Root Mean Square Normalization), which normalizes the input tensor `X` using the
+ weight tensor `W`, with an optional offset and casting mode.
+
+ Some models use an 'offset' to shift the weight tensor `W` by a constant value. For example, Gemma
+ uses an offset of 1.0, so the computation becomes `(X / RMS(X)) * (W + 1.0)` instead of the usual
+ `(X / RMS(X)) * W`. You can pass the offset value as an argument to the forward function.
+
+ In addition, different models cast their inputs at different places during RMSNorm computation. For
+ example, Gemma casts everything to fp32 before starting the computation, while Llama casts only the
+ inverse RMS to fp32. You can specify the casting mode using the `casting_mode` argument. We currently
+ support the following casting modes (they match HuggingFace Transformers' implementations):
+ - 'llama': matches the Llama implementation, where only the inverse RMS is computed on fp32.
+ - 'gemma': matches the Gemma implementation, where everything is cast to fp32, then computed, then cast back to the original dtype.
+ - 'none': no casting is done. The computation is done in the original dtype. This saves memory and is slightly faster, but has more error w.r.t. the original implementation.
+ """
+
  @staticmethod
  @ensure_contiguous
- def forward(ctx, X, W, eps):
+ def forward(ctx, X, W, eps, offset=0.0, casting_mode="llama"):
  """
  X: (B, T, H) or (BxT, H)
  W: (H,)
  """
+ if not isinstance(casting_mode, int):
+ assert (
+ casting_mode in _str_to_casting_mode
+ ), f"Invalid casting mode: {casting_mode}"
+ casting_mode = _str_to_casting_mode[casting_mode]
+ else:
+ assert (
+ casting_mode in _str_to_casting_mode.values()
+ ), f"Invalid casting mode: {casting_mode}"

  shape = X.shape
  dim = shape[-1]
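As a plain-PyTorch reference for the semantics described in the docstring above (a sketch of the offset and the 'llama'/'gemma'/'none' casting modes, not the Triton kernel itself; the `eps` default here is arbitrary):

```
import torch


def rms_norm_reference(X, W, eps=1e-6, offset=0.0, casting_mode="llama"):
    dtype = X.dtype
    if casting_mode == "gemma":
        # everything in fp32, cast the output back at the end
        Xf, Wf = X.float(), W.float()
        inv_rms = torch.rsqrt(Xf.pow(2).mean(-1, keepdim=True) + eps)
        return (Xf * inv_rms * (Wf + offset)).to(dtype)
    if casting_mode == "llama":
        # only the inverse RMS is computed in fp32; the weight multiply stays in the input dtype
        inv_rms = torch.rsqrt(X.float().pow(2).mean(-1, keepdim=True) + eps)
        return (X.float() * inv_rms).to(dtype) * (W + offset)
    # "none": keep the original dtype throughout
    inv_rms = torch.rsqrt(X.pow(2).mean(-1, keepdim=True) + eps)
    return X * inv_rms * (W + offset)
```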
@@ -121,7 +224,13 @@ class LigerRMSNormFunction(torch.autograd.Function):

  Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
  # r is to cache (1/rms) for each row
- r = torch.empty(n_rows, dtype=X.dtype, device=X.device)
+ # r is always computed/stored in fp32 if we are using Llama or Gemma casting mode
+ r_dtype = (
+ torch.float32
+ if casting_mode in (_CASTING_MODE_LLAMA.value, _CASTING_MODE_GEMMA.value)
+ else X.dtype
+ )
+ r = torch.empty(n_rows, dtype=r_dtype, device=X.device)

  # Check constraints.
  assert (
@@ -139,10 +248,14 @@ class LigerRMSNormFunction(torch.autograd.Function):
  r.stride(0),
  n_cols,
  eps,
+ offset,
+ casting_mode,
  BLOCK_SIZE=BLOCK_SIZE,
  num_warps=num_warps,
  )
  ctx.eps = eps
+ ctx.offset = offset
+ ctx.casting_mode = casting_mode
  ctx.BLOCK_SIZE = BLOCK_SIZE
  ctx.num_warps = num_warps

@@ -161,7 +274,14 @@ class LigerRMSNormFunction(torch.autograd.Function):
  dY = dY.view(-1, dim)
  X, W, r = ctx.saved_tensors
  n_rows, n_cols = dY.shape
- dW = torch.zeros_like(X)
+ dW = torch.empty_like(
+ X,
+ dtype=(
+ torch.float32
+ if ctx.casting_mode == _CASTING_MODE_GEMMA.value
+ else W.dtype
+ ),
+ )

  # Here we use dY to store the value of dX to save memory
  _rms_norm_backward[(n_rows,)](
@@ -177,9 +297,11 @@ class LigerRMSNormFunction(torch.autograd.Function):
  dW.stride(0),
  n_cols,
  ctx.eps,
+ ctx.offset,
+ ctx.casting_mode,
  BLOCK_SIZE=ctx.BLOCK_SIZE,
  num_warps=ctx.num_warps,
  )
  dX = dY.view(*shape)
- dW = torch.sum(dW, dim=0)
- return dX, dW, None
+ dW = torch.sum(dW, dim=0).to(W.dtype)
+ return dX, dW, None, None, None
liger_kernel/ops/rope.py CHANGED
@@ -13,8 +13,8 @@ def _triton_rope(
  cos_row_stride,
  sin,
  sin_row_stride,
+ sl,
  bs: tl.constexpr,
- sl: tl.constexpr,
  n_qh: tl.constexpr,
  n_kh: tl.constexpr,
  hd: tl.constexpr,
@@ -168,8 +168,8 @@ class LigerRopeFunction(torch.autograd.Function):
  cos.stride(-2),
  sin,
  sin.stride(-2),
- batch_size,
  seq_len,
+ batch_size,
  n_q_head,
  n_kv_head,
  head_dim,
@@ -219,8 +219,8 @@ class LigerRopeFunction(torch.autograd.Function):
  cos.stride(-2),
  sin,
  sin.stride(-2),
- batch_size,
  seq_len,
+ batch_size,
  n_q_head,
  n_kv_head,
  head_dim,
@@ -1,6 +1,12 @@
+ from liger_kernel.transformers.auto_model import ( # noqa: F401
+ AutoLigerKernelForCausalLM,
+ )
  from liger_kernel.transformers.monkey_patch import ( # noqa: F401
  apply_liger_kernel_to_gemma,
+ apply_liger_kernel_to_gemma2,
  apply_liger_kernel_to_llama,
  apply_liger_kernel_to_mistral,
  apply_liger_kernel_to_mixtral,
+ apply_liger_kernel_to_phi3,
+ apply_liger_kernel_to_qwen2,
  )
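The `apply_liger_kernel_to_*` functions exported here monkey-patch the corresponding HuggingFace model modules and are meant to be called before the model is created; a hedged usage sketch (the checkpoint path is a placeholder):

```
from transformers import AutoModelForCausalLM

from liger_kernel.transformers import apply_liger_kernel_to_llama

# Patch the HF Llama modules in place (the exact set of kernels swapped in is
# defined in liger_kernel.transformers.monkey_patch), then load the model as usual.
apply_liger_kernel_to_llama()
model = AutoModelForCausalLM.from_pretrained("path/to/llama-checkpoint")
```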
liger_kernel/transformers/auto_model.py ADDED
@@ -0,0 +1,33 @@
+ from transformers import AutoConfig, AutoModelForCausalLM
+
+ from liger_kernel.transformers.monkey_patch import _apply_liger_kernel
+
+
+ def _get_model_config(model_dir, **model_init_kwargs):
+     config = AutoConfig.from_pretrained(model_dir, **model_init_kwargs)
+     return config
+
+
+ class AutoLigerKernelForCausalLM(AutoModelForCausalLM):
+     """
+     This class is a drop-in replacement for AutoModelForCausalLM that applies the Liger Kernel to the model
+     if applicable.
+     """
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+         model_config = _get_model_config(pretrained_model_name_or_path, **kwargs)
+
+         # Determine the model type and apply the Liger Kernel if applicable
+         # Note: _apply_liger_kernel will only pass relevant kwargs to the apply_liger_kernel_to_* function
+         model_type = model_config.model_type
+         _apply_liger_kernel(model_type, **kwargs)
+
+         # Retain only the keyword args present in the model configuration
+         for k in list(kwargs.keys()):
+             if k not in model_config.__dict__:
+                 del kwargs[k]
+
+         return super().from_pretrained(
+             pretrained_model_name_or_path, *model_args, **kwargs
+         )
@@ -9,7 +9,7 @@ class LigerFusedLinearCrossEntropyLoss(CrossEntropyLoss):
  def __init__(self, *args, **kwargs):
  super(LigerFusedLinearCrossEntropyLoss, self).__init__(*args, **kwargs)

- def forward(self, lin_weight, _input, target):
+ def forward(self, lin_weight, _input, target, bias=None):
  return LigerFusedLinearCrossEntropyFunction.apply(
- _input, lin_weight, target, self.ignore_index
+ _input, lin_weight, target, bias, self.ignore_index
  )
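A minimal usage sketch for the updated signature (the import path is inferred from the class and module names rather than shown in this diff, the shapes are placeholders, and the Triton kernels underneath need a CUDA device):

```
import torch

from liger_kernel.transformers.fused_linear_cross_entropy import (
    LigerFusedLinearCrossEntropyLoss,
)

B, T, H, V = 2, 8, 64, 128
device = "cuda"
_input = torch.randn(B * T, H, device=device, requires_grad=True)  # hidden states (B*T, H)
lin_weight = torch.randn(V, H, device=device, requires_grad=True)  # lm_head weight (V, H)
bias = torch.randn(V, device=device, requires_grad=True)           # optional bias (V,), new in this version
target = torch.randint(0, V, (B * T,), device=device)              # labels in [0, V-1]

loss_fn = LigerFusedLinearCrossEntropyLoss()
loss = loss_fn(lin_weight, _input, target, bias)  # note: the weight comes before the input
loss.backward()
```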
@@ -13,8 +13,10 @@ class LigerGEGLUMLP(nn.Module):
  self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
  self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
  # TODO: support exact GELU
- if config.hidden_act not in ["gelu_pytorch_tanh"]:
- raise ValueError(f"Activation function {config.hidden_act} not supported.")
+ # Right now Gemma 1, 1.1 and 2 models are all using `gelu_pytorch_tanh`
+ # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/gemma/modeling_gemma.py#L175
+ # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/activations.py#L46
+ # So we can safely assume we use the tanh approximation form all the time

  def forward(self, x):