liger_kernel-0.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- liger_kernel/ops/__init__.py +0 -0
- liger_kernel/ops/cross_entropy.py +277 -0
- liger_kernel/ops/fused_linear_cross_entropy.py +161 -0
- liger_kernel/ops/geglu.py +129 -0
- liger_kernel/ops/rms_norm.py +167 -0
- liger_kernel/ops/rope.py +234 -0
- liger_kernel/ops/swiglu.py +113 -0
- liger_kernel/ops/utils.py +38 -0
- liger_kernel/transformers/__init__.py +5 -0
- liger_kernel/transformers/cross_entropy.py +11 -0
- liger_kernel/transformers/fused_linear_cross_entropy.py +15 -0
- liger_kernel/transformers/geglu.py +23 -0
- liger_kernel/transformers/model/__init__.py +0 -0
- liger_kernel/transformers/model/llama.py +143 -0
- liger_kernel/transformers/monkey_patch.py +103 -0
- liger_kernel/transformers/rms_norm.py +16 -0
- liger_kernel/transformers/rope.py +20 -0
- liger_kernel/transformers/swiglu.py +40 -0
- liger_kernel/triton/__init__.py +3 -0
- liger_kernel/triton/monkey_patch.py +44 -0
- liger_kernel-0.0.0.dist-info/METADATA +14 -0
- liger_kernel-0.0.0.dist-info/RECORD +24 -0
- liger_kernel-0.0.0.dist-info/WHEEL +5 -0
- liger_kernel-0.0.0.dist-info/top_level.txt +1 -0
liger_kernel/ops/rms_norm.py
ADDED
@@ -0,0 +1,167 @@
+import torch
+import triton
+import triton.language as tl
+
+from liger_kernel.ops.utils import calculate_settings, ensure_contiguous
+
+
+@triton.jit
+def _rms_norm_forward(
+    Y_ptr,
+    Y_row_stride,
+    X_ptr,
+    X_row_stride,
+    W_ptr,
+    W_row_stride,
+    r_ptr,
+    r_row_stride,
+    n_cols,
+    eps,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    References:
+    1. https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
+    2. https://github.com/unslothai/unsloth/blob/fd753fed99ed5f10ef8a9b7139588d9de9ddecfb/unsloth/kernels/rms_layernorm.py#L22
+    """
+    row_idx = tl.program_id(0)
+    col_offsets = tl.arange(0, BLOCK_SIZE)
+    mask = col_offsets < n_cols
+
+    Y_ptr += row_idx * Y_row_stride
+    X_ptr += row_idx * X_row_stride
+    r_ptr += row_idx * r_row_stride
+
+    X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0)
+    W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0)
+
+    row_var = tl.sum(X_row * X_row, axis=0) / n_cols
+    inv_var = tl.math.rsqrt(row_var + eps)
+
+    # trick: inv_var is tiny compared to X_row (just one scalar per row), so caching
+    # it saves four ops (*, sum, /, rsqrt) in the backward pass
+    tl.store(r_ptr, inv_var)
+
+    normed = X_row * inv_var
+
+    output = normed * W_row
+    tl.store(Y_ptr + col_offsets, output, mask=mask)
+
+
+@triton.jit
+def _rms_norm_backward(
+    dY_ptr,
+    dY_row_stride,
+    X_ptr,
+    X_row_stride,
+    W_ptr,
+    W_row_stride,
+    r_ptr,
+    r_row_stride,
+    dW_ptr,
+    dW_row_stride,
+    n_cols,
+    eps,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    dx = (1 / rms) * [dy * w - (1 / N) * ((dy * w) dot (x / rms)) * (x / rms)]
+    dw = sum(dy * (x / rms)), summed over all rows
+    """
+    row_idx = tl.program_id(0)
+    col_offsets = tl.arange(0, BLOCK_SIZE)
+    mask = col_offsets < n_cols
+
+    dY_ptr += row_idx * dY_row_stride
+    X_ptr += row_idx * X_row_stride
+    r_ptr += row_idx * r_row_stride
+    dW_ptr += row_idx * dW_row_stride
+
+    dY_row = tl.load(dY_ptr + col_offsets, mask=mask, other=0)
+    X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0)
+    W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0)
+
+    # get the inverse RMS cached by the forward pass
+    inv_var = tl.load(r_ptr)
+
+    normed = X_row * inv_var
+
+    dY_W = dY_row * W_row
+    dY_normed = dY_row * normed
+
+    rowsum_dY_normed = tl.sum(dY_W * normed, axis=0)
+    output = inv_var / n_cols * (n_cols * dY_W - normed * rowsum_dY_normed)
+    # dX is written in place over dY
+    tl.store(dY_ptr + col_offsets, output, mask=mask)
+
+    # this row's contribution to the gradient of W
+    tl.store(dW_ptr + col_offsets, dY_normed, mask=mask)
+
+
+class LigerRMSNormFunction(torch.autograd.Function):
+    @staticmethod
+    @ensure_contiguous
+    def forward(ctx, X, W, eps):
+        shape = X.shape
+        dim = shape[-1]
+        X = X.view(-1, dim)
+        n_rows, n_cols = X.shape
+        BLOCK_SIZE, num_warps = calculate_settings(n_cols)
+
+        Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
+        r = torch.empty(n_rows, dtype=X.dtype, device=X.device)
+
+        # Check constraints.
+        assert (
+            X.shape[1] == W.shape[0]
+        ), "Incompatible hidden size dimension between X.shape[1] and W.shape[0]"
+
+        _rms_norm_forward[(n_rows,)](
+            Y,
+            Y.stride(0),
+            X,
+            X.stride(0),
+            W,
+            W.stride(0),
+            r,
+            r.stride(0),
+            n_cols,
+            eps,
+            BLOCK_SIZE=BLOCK_SIZE,
+            num_warps=num_warps,
+        )
+        ctx.eps = eps
+        ctx.BLOCK_SIZE = BLOCK_SIZE
+        ctx.num_warps = num_warps
+
+        ctx.save_for_backward(X, W, r)
+        return Y.view(*shape)
+
+    @staticmethod
+    @ensure_contiguous
+    def backward(ctx, dY):
+        shape = dY.shape
+        dim = shape[-1]
+        dY = dY.view(-1, dim)
+        X, W, r = ctx.saved_tensors
+        n_rows, n_cols = dY.shape
+        dW = torch.zeros_like(X)
+
+        _rms_norm_backward[(n_rows,)](
+            dY,
+            dY.stride(0),
+            X,
+            X.stride(0),
+            W,
+            W.stride(0),
+            r,
+            r.stride(0),
+            dW,
+            dW.stride(0),
+            n_cols,
+            ctx.eps,
+            BLOCK_SIZE=ctx.BLOCK_SIZE,
+            num_warps=ctx.num_warps,
+        )
+        # the kernel stored dX in place over dY
+        dX = dY.view(*shape)
+        dW = torch.sum(dW, dim=0)
+        return dX, dW, None
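A minimal sketch of how this autograd function can be exercised end to end (assumes a CUDA device with Triton available; the shapes and tolerance are illustrative, not from the package):

import torch

from liger_kernel.ops.rms_norm import LigerRMSNormFunction

# activations shaped like a (batch, seq_len, hidden_size) tensor, plus a learned scale
X = torch.randn(2, 8, 64, device="cuda", requires_grad=True)
W = torch.randn(64, device="cuda", requires_grad=True)
eps = 1e-6

Y = LigerRMSNormFunction.apply(X, W, eps)

# reference RMSNorm in plain PyTorch, matching the kernel's math above
Y_ref = X * torch.rsqrt(X.pow(2).mean(-1, keepdim=True) + eps) * W
print(torch.allclose(Y, Y_ref, atol=1e-5))

Y.sum().backward()  # exercises _rms_norm_backward; populates X.grad and W.grad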
liger_kernel/ops/rope.py
ADDED
@@ -0,0 +1,234 @@
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _triton_rope(
+    q_ptr,
+    q_row_stride,
+    k_ptr,
+    k_row_stride,
+    cos,
+    cos_row_stride,
+    sin,
+    sin_row_stride,
+    bs: tl.constexpr,
+    sl: tl.constexpr,
+    n_qh: tl.constexpr,
+    n_kh: tl.constexpr,
+    hd: tl.constexpr,
+    pad_n_qh: tl.constexpr,
+    pad_n_kh: tl.constexpr,
+    pad_hd: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+    BACKWARD_PASS: tl.constexpr = False,
+):
+    # q size: (bsz, seq_len, num_q_heads, head_dim)
+    # q stride: (seq_len * num_q_heads * head_dim, num_q_heads * head_dim, head_dim, 1)
+    # k size: (bsz, seq_len, num_kv_heads, head_dim)
+    # k stride: (seq_len * num_kv_heads * head_dim, num_kv_heads * head_dim, head_dim, 1)
+
+    # cos size: (1, seq_len, head_dim)
+    # cos stride: (seq_len * head_dim, head_dim, 1)
+    pid = tl.program_id(0)
+
+    # locate start address
+    q_ptr = q_ptr + pid * q_row_stride
+    k_ptr = k_ptr + pid * k_row_stride
+
+    # ####################################################################
+    # get the cos(mθ_{i...d/2}) and sin(mθ_{i...d/2}) for the token
+    # position m of this program instance
+    # ####################################################################
+
+    # 1. program instances are laid out in a 1D vector of size bsz * seq_len, which
+    # effectively represents a 2D grid of size [bsz, seq_len] with the seq_len dimension
+    # being the fastest changing. Thus we can simply do pid // sl to get the batch index
+    # and pid % sl to get the sequence index.
+    # 2. We only need the left half of the cos and sin matrices because the right half
+    # is just a clone of the left half.
+    cos_row_idx = pid % sl
+    cos = cos + cos_row_idx * cos_row_stride
+    sin = sin + cos_row_idx * sin_row_stride
+    cos_offsets = tl.arange(0, pad_hd // 2)
+    cos_mask = cos_offsets < hd // 2
+    cos_row = tl.load(cos + cos_offsets, mask=cos_mask, other=0)
+    sin_row = tl.load(sin + cos_offsets, mask=cos_mask, other=0)
+
+    # ####################################################################
+    # Load the left and right halves of q and k for the current
+    # program instance (i.e. for the current token) separately
+    # ####################################################################
+    # left half of the head
+    first_half_q_offsets = (
+        tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+    )
+    first_half_k_offsets = (
+        tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+    )
+    first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (
+        tl.arange(0, pad_hd // 2)[None, :] < hd // 2
+    )
+    first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (
+        tl.arange(0, pad_hd // 2)[None, :] < hd // 2
+    )
+    q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(
+        sin_row.dtype
+    )
+    k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(
+        sin_row.dtype
+    )
+
+    # right half of the head
+    second_half_q_offsets = first_half_q_offsets + (hd // 2)
+    second_half_k_offsets = first_half_k_offsets + (hd // 2)
+    second_q_mask = first_q_mask
+    second_k_mask = first_k_mask
+    q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(
+        sin_row.dtype
+    )
+    k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(
+        sin_row.dtype
+    )
+
+    if not BACKWARD_PASS:
+        # y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin]
+        new_q_tile_1 = q_tile_1 * cos_row - q_tile_2 * sin_row
+        tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask)
+        new_q_tile_2 = q_tile_2 * cos_row + q_tile_1 * sin_row
+        tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask)
+
+        new_k_tile_1 = k_tile_1 * cos_row - k_tile_2 * sin_row
+        tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask)
+        new_k_tile_2 = k_tile_2 * cos_row + k_tile_1 * sin_row
+        tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask)
+    else:
+        # with some math, we can get:
+        # dy = [dx1, dx2] * [cos, cos] + [-dx2, dx1] * [-sin, -sin]
+        new_q_tile_1 = q_tile_1 * cos_row + q_tile_2 * sin_row
+        tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask)
+        new_q_tile_2 = q_tile_2 * cos_row - q_tile_1 * sin_row
+        tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask)
+
+        new_k_tile_1 = k_tile_1 * cos_row + k_tile_2 * sin_row
+        tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask)
+        new_k_tile_2 = k_tile_2 * cos_row - k_tile_1 * sin_row
+        tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask)
+
+
+class LigerRopeFunction(torch.autograd.Function):
+    """
+    Triton implementation of the Rotary Positional Embedding (RoPE) operation. Please note that
+    this implements the HuggingFace Llama & Mistral version, whose rotation matrix is slightly
+    different from the one in the original RoPE paper.
+
+    Please find the corresponding HuggingFace implementation here:
+    https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/llama/modeling_llama.py#L184
+
+    For more details about the rotation matrix used here, please refer to:
+    https://discuss.huggingface.co/t/is-llama-rotary-embedding-implementation-correct/44509/2
+    """
+
+    @staticmethod
+    def forward(ctx, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+        """
+        q size: (bsz, n_q_head, seq_len, head_dim)
+        k size: (bsz, n_kv_head, seq_len, head_dim)
+        cos size: (1, seq_len, head_dim)
+        sin size: (1, seq_len, head_dim)
+        """
+
+        # transpose back to the physical shape because Triton looks at the physical storage
+        # note: q and k are non-contiguous before the transformation and become contiguous after the transpose
+        q = q.transpose(1, 2)
+        k = k.transpose(1, 2)
+
+        batch_size, seq_len, n_q_head, head_dim = q.shape
+        n_kv_head = k.shape[2]
+        pad_hd = triton.next_power_of_2(head_dim)
+        pad_n_q_head = triton.next_power_of_2(n_q_head)
+        pad_n_kv_head = triton.next_power_of_2(n_kv_head)
+        BLOCK_SIZE = max(pad_n_q_head, pad_n_kv_head)
+
+        n_row = batch_size * seq_len
+
+        # ensure tensors passed into the kernel are contiguous; this is a no-op if they already are
+        q = q.contiguous()
+        k = k.contiguous()
+        cos = cos.contiguous()
+        sin = sin.contiguous()
+
+        _triton_rope[(n_row,)](
+            q,
+            q.stride(1),
+            k,
+            k.stride(1),
+            cos,
+            cos.stride(-2),
+            sin,
+            sin.stride(-2),
+            batch_size,
+            seq_len,
+            n_q_head,
+            n_kv_head,
+            head_dim,
+            pad_n_q_head,
+            pad_n_kv_head,
+            pad_hd,
+            BLOCK_SIZE=BLOCK_SIZE,
+            BACKWARD_PASS=False,
+        )
+
+        ctx.save_for_backward(cos, sin)
+        return q.transpose(1, 2), k.transpose(1, 2)
+
+    @staticmethod
+    def backward(ctx, dq, dk):
+        """
+        dq size: (bsz, n_q_head, seq_len, head_dim)
+        dk size: (bsz, n_kv_head, seq_len, head_dim)
+        cos size: (1, seq_len, head_dim)
+        sin size: (1, seq_len, head_dim)
+        """
+
+        cos, sin = ctx.saved_tensors
+
+        dq = dq.transpose(1, 2)
+        dk = dk.transpose(1, 2)
+
+        batch_size, seq_len, n_q_head, head_dim = dq.shape
+        n_kv_head = dk.shape[2]
+        pad_hd = triton.next_power_of_2(head_dim)
+        pad_n_q_head = triton.next_power_of_2(n_q_head)
+        pad_n_kv_head = triton.next_power_of_2(n_kv_head)
+        BLOCK_SIZE = max(pad_n_q_head, pad_n_kv_head)
+
+        n_row = batch_size * seq_len
+
+        # ensure dq and dk are contiguous
+        dq = dq.contiguous()
+        dk = dk.contiguous()
+
+        # backward is similar to forward except for swapping a few ops
+        _triton_rope[(n_row,)](
+            dq,
+            dq.stride(1),
+            dk,
+            dk.stride(1),
+            cos,
+            cos.stride(-2),
+            sin,
+            sin.stride(-2),
+            batch_size,
+            seq_len,
+            n_q_head,
+            n_kv_head,
+            head_dim,
+            pad_n_q_head,
+            pad_n_kv_head,
+            pad_hd,
+            BLOCK_SIZE=BLOCK_SIZE,
+            BACKWARD_PASS=True,
+        )
+
+        return dq.transpose(1, 2), dk.transpose(1, 2), None, None, None, None
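To see that the forward kernel matches the HuggingFace-style rotation it references, here is a sketch that builds cos/sin the way the docstring describes and compares against the rotate_half formulation (assumes a CUDA device; the base of 10000 and the shapes are illustrative):

import torch

from liger_kernel.ops.rope import LigerRopeFunction

def rotate_half(x):
    # (-x2, x1) where x1, x2 are the two halves of the head dimension
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

bsz, n_q_head, n_kv_head, seq_len, head_dim = 1, 4, 2, 16, 32
q = torch.randn(bsz, n_q_head, seq_len, head_dim, device="cuda")
k = torch.randn(bsz, n_kv_head, seq_len, head_dim, device="cuda")

inv_freq = 1.0 / (10000.0 ** (torch.arange(0, head_dim, 2, device="cuda").float() / head_dim))
freqs = torch.outer(torch.arange(seq_len, device="cuda").float(), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)  # right half clones the left half
cos, sin = emb.cos()[None], emb.sin()[None]  # (1, seq_len, head_dim)

q_out, k_out = LigerRopeFunction.apply(q, k, cos, sin)

# reference rotation, broadcasting cos/sin over the head axis
q_ref = q * cos.unsqueeze(1) + rotate_half(q) * sin.unsqueeze(1)
k_ref = k * cos.unsqueeze(1) + rotate_half(k) * sin.unsqueeze(1)
print(torch.allclose(q_out, q_ref, atol=1e-5), torch.allclose(k_out, k_ref, atol=1e-5))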
liger_kernel/ops/swiglu.py
ADDED
@@ -0,0 +1,113 @@
+import torch
+import triton
+import triton.language as tl
+
+from liger_kernel.ops.utils import calculate_settings, ensure_contiguous
+
+
+@triton.jit
+def silu(x):
+    return x * tl.sigmoid(x)
+
+
+@triton.jit
+def _swiglu_forward_kernel(
+    a, b, c, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr
+):
+    program_id = tl.program_id(0)
+
+    # locate start index
+    a += program_id * stride
+    b += program_id * stride
+    c += program_id * stride
+
+    col_offsets = tl.arange(0, BLOCK_SIZE)
+    mask = col_offsets < n_cols
+
+    # sigmoid requires float32
+    a_row = tl.load(a + col_offsets, mask=mask, other=0).to(tl.float32)
+    b_row = tl.load(b + col_offsets, mask=mask, other=0)
+    c_row = silu(a_row) * b_row
+    tl.store(c + col_offsets, c_row, mask=mask)
+
+
+@triton.jit
+def _swiglu_backward_kernel(
+    dc, a, b, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr
+):
+    program_id = tl.program_id(0)
+
+    # locate start index
+    dc += program_id * stride
+    a += program_id * stride
+    b += program_id * stride
+
+    col_offsets = tl.arange(0, BLOCK_SIZE)
+    mask = col_offsets < n_cols
+
+    dc_row = tl.load(dc + col_offsets, mask=mask, other=0)
+    # sigmoid requires float32
+    a_row = tl.load(a + col_offsets, mask=mask, other=0).to(tl.float32)
+    b_row = tl.load(b + col_offsets, mask=mask, other=0)
+
+    # recompute silu(a) instead of caching it, to save memory
+    sig_a = tl.sigmoid(a_row)
+    silu_a = a_row * sig_a
+    db_row = dc_row * silu_a
+    da_row = dc_row * (silu_a * (1 - sig_a) + sig_a) * b_row
+
+    # da and db are written in place over a and b
+    tl.store(a + col_offsets, da_row, mask=mask)
+    tl.store(b + col_offsets, db_row, mask=mask)
+
+
+class LigerSiLUMulFunction(torch.autograd.Function):
+    @staticmethod
+    @ensure_contiguous
+    def forward(ctx, a, b):
+        ori_shape = a.shape
+
+        n_cols = ori_shape[-1]
+        a = a.view(-1, n_cols)
+        b = b.view(-1, n_cols)
+        c = torch.zeros_like(a)
+        n_rows = a.shape[0]
+
+        BLOCK_SIZE, num_warps = calculate_settings(n_cols)
+
+        _swiglu_forward_kernel[(n_rows,)](
+            a,
+            b,
+            c,
+            c.stride(-2),
+            n_cols=n_cols,
+            BLOCK_SIZE=BLOCK_SIZE,
+            num_warps=num_warps,
+        )
+
+        ctx.save_for_backward(a, b)
+
+        return c.view(*ori_shape)
+
+    @staticmethod
+    @ensure_contiguous
+    def backward(ctx, dc):
+        ori_shape = dc.shape
+        n_cols = ori_shape[-1]
+        dc = dc.view(-1, n_cols)
+        a, b = ctx.saved_tensors
+        n_rows = dc.shape[0]
+
+        BLOCK_SIZE, num_warps = calculate_settings(n_cols)
+
+        _swiglu_backward_kernel[(n_rows,)](
+            dc,
+            a,
+            b,
+            dc.stride(-2),
+            n_cols=n_cols,
+            BLOCK_SIZE=BLOCK_SIZE,
+            num_warps=num_warps,
+        )
+
+        return a.view(*ori_shape), b.view(*ori_shape)
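The backward kernel relies on the SiLU derivative d/da [a * σ(a)] = σ(a) + a * σ(a) * (1 - σ(a)) = silu(a) * (1 - σ(a)) + σ(a), recomputing σ(a) rather than caching activations. A quick sketch checking the fused op against torch.nn.functional.silu (assumes a CUDA device; shapes are illustrative):

import torch
import torch.nn.functional as F

from liger_kernel.ops.swiglu import LigerSiLUMulFunction

a = torch.randn(4, 128, device="cuda", requires_grad=True)
b = torch.randn(4, 128, device="cuda", requires_grad=True)

c = LigerSiLUMulFunction.apply(a, b)
print(torch.allclose(c, F.silu(a) * b, atol=1e-5))

# gradients flow through the in-place backward kernel
c.sum().backward()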
liger_kernel/ops/utils.py
ADDED
@@ -0,0 +1,38 @@
+import functools
+
+import torch
+import triton
+
+
+def ensure_contiguous(fn):
+    @functools.wraps(fn)
+    def wrapper(ctx, *args, **kwargs):
+        def maybe_to_contiguous(x):
+            return x.contiguous() if isinstance(x, torch.Tensor) else x
+
+        args = [maybe_to_contiguous(arg) for arg in args]
+        kwargs = {k: maybe_to_contiguous(v) for k, v in kwargs.items()}
+        return fn(ctx, *args, **kwargs)
+
+    return wrapper
+
+
+def calculate_settings(n):
+    # reference: https://github.com/unslothai/unsloth/blob/fd753fed99ed5f10ef8a9b7139588d9de9ddecfb/unsloth/kernels/utils.py#L43
+
+    MAX_FUSED_SIZE = 65536
+    BLOCK_SIZE = triton.next_power_of_2(n)
+    if BLOCK_SIZE > MAX_FUSED_SIZE:
+        raise RuntimeError(
+            f"Cannot launch Triton kernel since n = {n} exceeds "
+            f"the recommended Triton block size = {MAX_FUSED_SIZE}."
+        )
+
+    num_warps = 4
+    if BLOCK_SIZE >= 32768:
+        num_warps = 32
+    elif BLOCK_SIZE >= 8192:
+        num_warps = 16
+    elif BLOCK_SIZE >= 2048:
+        num_warps = 8
+    return BLOCK_SIZE, num_warps
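The thresholds above translate directly into block/warp choices; for example (the printed pairs are (BLOCK_SIZE, num_warps) and follow from the code):

from liger_kernel.ops.utils import calculate_settings

for n in (1024, 4096, 8192, 50000):
    print(n, calculate_settings(n))
# 1024  -> (1024, 4)
# 4096  -> (4096, 8)
# 8192  -> (8192, 16)
# 50000 -> (65536, 32)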
liger_kernel/transformers/cross_entropy.py
ADDED
@@ -0,0 +1,11 @@
+from torch.nn import CrossEntropyLoss
+
+from liger_kernel.ops.cross_entropy import LigerCrossEntropyFunction
+
+
+class LigerCrossEntropyLoss(CrossEntropyLoss):
+    def __init__(self, *args, **kwargs):
+        super(LigerCrossEntropyLoss, self).__init__(*args, **kwargs)
+
+    def forward(self, _input, target):
+        return LigerCrossEntropyFunction.apply(_input, target, self.ignore_index)
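Because it subclasses torch.nn.CrossEntropyLoss, this is a drop-in replacement at call sites, although in this version only ignore_index is forwarded to the Triton function; other constructor options are accepted but have no effect. A usage sketch (assumes a CUDA device; the sizes are illustrative):

import torch

from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss

loss_fn = LigerCrossEntropyLoss(ignore_index=-100)
logits = torch.randn(8, 32000, device="cuda", requires_grad=True)
labels = torch.randint(0, 32000, (8,), device="cuda")

loss = loss_fn(logits, labels)
loss.backward()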
liger_kernel/transformers/fused_linear_cross_entropy.py
ADDED
@@ -0,0 +1,15 @@
+from torch.nn import CrossEntropyLoss
+
+from liger_kernel.ops.fused_linear_cross_entropy import (
+    LigerFusedLinearCrossEntropyFunction,
+)
+
+
+class LigerFusedLinearCrossEntropyLoss(CrossEntropyLoss):
+    def __init__(self, *args, **kwargs):
+        super(LigerFusedLinearCrossEntropyLoss, self).__init__(*args, **kwargs)
+
+    def forward(self, lin_weight, _input, target):
+        return LigerFusedLinearCrossEntropyFunction.apply(
+            _input, lin_weight, target, self.ignore_index
+        )
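Note the argument order: the module takes the lm-head weight first, then the hidden states and targets, and swaps them back for the underlying function, whose fusion is designed to avoid materializing the full logits tensor. A usage sketch (assumes a CUDA device; the weight layout (vocab_size, hidden_size) mirrors nn.Linear, and the sizes are illustrative):

import torch

from liger_kernel.transformers.fused_linear_cross_entropy import (
    LigerFusedLinearCrossEntropyLoss,
)

hidden_size, vocab_size = 512, 32000
lm_head_weight = torch.randn(vocab_size, hidden_size, device="cuda", requires_grad=True)
hidden_states = torch.randn(16, hidden_size, device="cuda", requires_grad=True)
labels = torch.randint(0, vocab_size, (16,), device="cuda")

loss_fn = LigerFusedLinearCrossEntropyLoss(ignore_index=-100)
loss = loss_fn(lm_head_weight, hidden_states, labels)
loss.backward()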
liger_kernel/transformers/geglu.py
ADDED
@@ -0,0 +1,23 @@
+import torch.nn as nn
+
+from liger_kernel.ops.geglu import LigerGELUMulFunction
+
+
+class LigerGEGLUMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        # TODO: support exact GELU
+        if config.hidden_act not in ["gelu_pytorch_tanh"]:
+            raise ValueError(f"Activation function {config.hidden_act} not supported.")
+
+    def forward(self, x):
+        return self.down_proj(
+            LigerGELUMulFunction.apply(self.gate_proj(x), self.up_proj(x))
+        )
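The module follows the HuggingFace gated-MLP layout (gate/up/down projections) and only accepts the tanh-approximated GELU. Any config object exposing the three attributes read above works; a sketch with a stand-in config (SimpleNamespace here is illustrative, and a CUDA device is assumed):

import torch
from types import SimpleNamespace

from liger_kernel.transformers.geglu import LigerGEGLUMLP

config = SimpleNamespace(
    hidden_size=64,
    intermediate_size=256,
    hidden_act="gelu_pytorch_tanh",
)
mlp = LigerGEGLUMLP(config).to("cuda")
x = torch.randn(2, 8, 64, device="cuda")
out = mlp(x)  # shape: (2, 8, 64)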