liger-kernel 0.0.0 (tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- liger_kernel-0.0.0/PKG-INFO +4 -0
- liger_kernel-0.0.0/setup.cfg +4 -0
- liger_kernel-0.0.0/setup.py +26 -0
- liger_kernel-0.0.0/src/liger_kernel/ops/__init__.py +0 -0
- liger_kernel-0.0.0/src/liger_kernel/ops/cross_entropy.py +277 -0
- liger_kernel-0.0.0/src/liger_kernel/ops/fused_linear_cross_entropy.py +161 -0
- liger_kernel-0.0.0/src/liger_kernel/ops/geglu.py +129 -0
- liger_kernel-0.0.0/src/liger_kernel/ops/rms_norm.py +167 -0
- liger_kernel-0.0.0/src/liger_kernel/ops/rope.py +234 -0
- liger_kernel-0.0.0/src/liger_kernel/ops/swiglu.py +113 -0
- liger_kernel-0.0.0/src/liger_kernel/ops/utils.py +38 -0
- liger_kernel-0.0.0/src/liger_kernel/transformers/__init__.py +5 -0
- liger_kernel-0.0.0/src/liger_kernel/transformers/cross_entropy.py +11 -0
- liger_kernel-0.0.0/src/liger_kernel/transformers/fused_linear_cross_entropy.py +15 -0
- liger_kernel-0.0.0/src/liger_kernel/transformers/geglu.py +23 -0
- liger_kernel-0.0.0/src/liger_kernel/transformers/model/__init__.py +0 -0
- liger_kernel-0.0.0/src/liger_kernel/transformers/model/llama.py +143 -0
- liger_kernel-0.0.0/src/liger_kernel/transformers/monkey_patch.py +103 -0
- liger_kernel-0.0.0/src/liger_kernel/transformers/rms_norm.py +16 -0
- liger_kernel-0.0.0/src/liger_kernel/transformers/rope.py +20 -0
- liger_kernel-0.0.0/src/liger_kernel/transformers/swiglu.py +40 -0
- liger_kernel-0.0.0/src/liger_kernel/triton/__init__.py +3 -0
- liger_kernel-0.0.0/src/liger_kernel/triton/monkey_patch.py +44 -0
- liger_kernel-0.0.0/src/liger_kernel.egg-info/PKG-INFO +4 -0
- liger_kernel-0.0.0/src/liger_kernel.egg-info/SOURCES.txt +26 -0
- liger_kernel-0.0.0/src/liger_kernel.egg-info/dependency_links.txt +1 -0
- liger_kernel-0.0.0/src/liger_kernel.egg-info/requires.txt +11 -0
- liger_kernel-0.0.0/src/liger_kernel.egg-info/top_level.txt +1 -0
liger_kernel-0.0.0/setup.py
@@ -0,0 +1,26 @@
+from setuptools import find_namespace_packages, setup
+
+__version__ = "0.0.0"
+
+setup(
+    name="liger_kernel",
+    version=__version__,
+    package_dir={"": "src"},
+    packages=find_namespace_packages(where="src"),
+    include_package_data=True,
+    install_requires=[
+        "torch>=2.1.2",
+        "triton>=2.3.0",
+        "transformers>=4.40.1",
+    ],
+    extras_require={
+        "dev": [
+            "matplotlib>=3.7.2",
+            "flake8>=4.0.1.1",
+            "black>=24.4.2",
+            "isort>=5.13.2",
+            "pre-commit>=3.7.1",
+            "torch-tb-profiler>=0.4.1",
+        ]
+    },
+)

liger_kernel-0.0.0/src/liger_kernel/ops/__init__.py
File without changes

liger_kernel-0.0.0/src/liger_kernel/ops/cross_entropy.py
@@ -0,0 +1,277 @@
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def liger_cross_entropy_kernel(
+    X_ptr,
+    X_stride,
+    Y_ptr,
+    Y_stride,
+    loss_ptr,
+    loss_stride,
+    n_cols,
+    n_non_ignore,
+    ignore_index,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    This kernel computes both the cross entropy loss and the gradient of the _input.
+    We only consider hard label + mean reduction for now. Please refer to https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html for the math.
+
+    Parameters:
+    X_ptr: Pointer to the input tensor.
+    X_stride (int): The stride of the input tensor.
+    Y_ptr: Pointer to the target tensor.
+    Y_stride (int): The stride of the target tensor.
+    loss_ptr: Pointer to the tensor that stores the loss.
+    loss_stride (int): The stride of the loss tensor.
+    n_cols (int): The number of columns in the input tensor.
+    n_non_ignore (int): The number of non-ignored elements in the batch.
+    ignore_index (int): The index to ignore in the target.
+    BLOCK_SIZE (int): The block size for Triton operations.
+    """
+
+    # https://github.com/triton-lang/triton/issues/1058
+    # Essentially, if B*T*V is too large, program_id * stride will overflow out of int32
+    program_id = tl.program_id(0).to(tl.int64)
+
+    # 1. Load Y_ptr first because if the target is ignore_index, we can return right away
+    Y_ptr += program_id * Y_stride
+    y = tl.load(Y_ptr)
+
+    # 2. locate the start index
+    X_ptr += program_id * X_stride
+
+    if y == ignore_index:
+        # set all X_ptr as 0
+        for i in range(0, n_cols, BLOCK_SIZE):
+            X_offsets = i + tl.arange(0, BLOCK_SIZE)
+            tl.store(X_ptr + X_offsets, 0.0, mask=X_offsets < n_cols)
+        return
+
+    loss_ptr += program_id * loss_stride
+
+    # Online softmax: 2 loads + 1 store (compared with 3 loads + 1 store for the safe softmax)
+    # Refer to Algorithm 3 in the paper: https://arxiv.org/pdf/1805.02867
+
+    # 3. [Online softmax] first pass: find max + sum
+    m = float("-inf")  # m is the max value. use the notation from the paper
+    d = 0.0  # d is the sum. use the notation from the paper
+    ori_X_y = tl.load(
+        X_ptr + y
+    )  # we need to store the original value of X_y for the loss calculation
+
+    for i in range(0, n_cols, BLOCK_SIZE):
+        X_offsets = i + tl.arange(0, BLOCK_SIZE)
+        X_block = tl.load(
+            X_ptr + X_offsets, mask=X_offsets < n_cols, other=float("-inf")
+        )
+        block_max = tl.max(X_block)
+        m_new = tl.maximum(m, block_max)
+        d = d * tl.exp(m - m_new) + tl.sum(tl.exp(X_block - m_new))
+        m = m_new
+
+    # 4. [Online softmax] second pass: calculate the gradients
+    # dx_y = (softmax(x_y) - 1) / N
+    # dx_i = softmax(x_i) / N, i != y
+    # N is the number of non-ignored elements in the batch
+    for i in range(0, n_cols, BLOCK_SIZE):
+        X_offsets = i + tl.arange(0, BLOCK_SIZE)
+        X_block = tl.load(
+            X_ptr + X_offsets, mask=X_offsets < n_cols, other=float("-inf")
+        )
+        X_block = (tl.exp(X_block - m) / d) / (n_non_ignore)
+        tl.store(X_ptr + X_offsets, X_block, mask=X_offsets < n_cols)
+
+    # We need tl.debug_barrier() to ensure the new result of X_ptr is written, as mentioned in
+    # https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/ops/cross_entropy.py#L34
+    tl.debug_barrier()
+
+    # 5. Calculate the loss
+    # Old approach: problematic log-softmax
+    # The smallest positive value of bfloat16 and float32 is about 1e-38, so we would have to pick a value larger than that but still small enough
+    # This underflows if X_y * n_non_ignore is too small. Even if we add a tiny epsilon, it still underflows
+    # loss = -tl.log(X_y * n_non_ignore)
+
+    # New approach: safe log-softmax
+    # Therefore, we use a safe log-softmax obtained by reordering the formula:
+    # loss = log(softmax(X_y)) = log(e^(X_y - max(X)) / sum(e^(X - max(X))))
+    #      = (X_y - max(X)) - log(sum(e^(X - max(X))))
+    # sum(e^(X - max(X))) must be >= 1 because the max term is e^0 = 1
+    # So we can safely calculate log(softmax(X_y)) without overflow
+    loss = -(ori_X_y - m - tl.log(d))
+
+    # 6. Specially handle the i == y case, where `dx_y = (softmax(x_y) - 1) / N`
+    X_y = tl.load(X_ptr + y)
+    X_y += -1 / (n_non_ignore)
+
+    tl.store(loss_ptr, loss)
+    tl.store(X_ptr + y, X_y)
+
+
+# The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576: https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
+# However, setting the limit to 65536 as in the LayerNorm tutorial is faster because of less register spilling
+# The optimal maximum block size depends on your hardware, your kernel, and your dtype
+MAX_FUSED_SIZE = 65536 // 2  # manually tuned a bit
+
+
+@triton.jit
+def element_mul(
+    X_ptr,
+    X_stride,
+    grad_output_ptr,
+    n_cols,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    This function multiplies each element of the tensor pointed to by X_ptr with the value pointed to by grad_output_ptr.
+    The multiplication is performed in-place on the tensor pointed to by X_ptr.
+
+    Parameters:
+    X_ptr: Pointer to the input tensor.
+    X_stride (int): The stride of the input tensor.
+    grad_output_ptr: Pointer to the gradient output value.
+    n_cols (int): The number of columns in the input tensor.
+    BLOCK_SIZE (int): The block size for Triton operations.
+    """
+
+    # Get the program ID and convert it to int64 to avoid overflow
+    program_id = tl.program_id(0).to(tl.int64)
+
+    # Locate the start index
+    X_ptr += program_id * X_stride
+
+    # Load the gradient output value
+    grad_output = tl.load(grad_output_ptr)
+
+    # Perform the element-wise multiplication
+    for i in range(0, n_cols, BLOCK_SIZE):
+        X_offsets = i + tl.arange(0, BLOCK_SIZE)
+        X_block = tl.load(X_ptr + X_offsets, mask=X_offsets < n_cols)
+        tl.store(X_ptr + X_offsets, X_block * grad_output, mask=X_offsets < n_cols)
+
+
+class LigerCrossEntropyFunction(torch.autograd.Function):
+    """
+    This class implements a custom autograd function for the Liger Cross Entropy loss.
+    It overrides the forward and backward methods of the torch.autograd.Function class.
+    """
+
+    @staticmethod
+    def forward(ctx, _input, target, ignore_index):
+        """
+        The forward pass of the Liger Cross Entropy loss.
+
+        Parameters:
+        ctx : The context object.
+        _input (tensor): The input tensor of shape (BT, V) where B is batch size, T is sequence length, V is vocab size.
+        target (tensor): The target tensor of shape (BT) where each value is in [0, V-1].
+        ignore_index (int): The index to ignore in the target.
+
+        Returns:
+        tensor: The computed loss.
+        """
+        BT, V = _input.shape
+        n_rows = BT
+
+        BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))
+
+        # unreduced loss
+        loss_1d = torch.zeros(n_rows, dtype=_input.dtype, device=_input.device)
+
+        n_non_ignore = (target != ignore_index).sum().item()
+
+        # ensure _input and target are contiguous in the last dimension
+        # there are examples that are NOT contiguous overall but are contiguous in the last dimension
+        ####################################################################
+        # tensor = torch.arange(1, 21).reshape(5, -1)
+        # print(tensor)
+        # tensor([[ 1,  2,  3,  4],
+        #         [ 5,  6,  7,  8],
+        #         [ 9, 10, 11, 12],
+        #         [13, 14, 15, 16],
+        #         [17, 18, 19, 20]])
+        # print(tensor.is_contiguous())
+        # True
+        # slice = tensor[::2, :]
+        # print(slice)
+        # tensor([[ 1,  2,  3,  4],
+        #         [ 9, 10, 11, 12],
+        #         [17, 18, 19, 20]])
+        # print(slice.is_contiguous())
+        # False
+        # print(slice.stride())
+        # (8, 1)
+        # slice is NOT a contiguous tensor but is contiguous in the last dimension; the CE kernel can execute because the stride is 8, and each triton program will jump by 8
+        ####################################################################
+        if _input.stride(-1) != 1:
+            _input = _input.contiguous()
+        if target.stride(-1) != 1:
+            target = target.contiguous()
+
+        # Here we use a trick to store the gradient of X_ptr in X_ptr itself so we can save memory
+        liger_cross_entropy_kernel[(n_rows,)](
+            X_ptr=_input,
+            X_stride=_input.stride(-2),
+            Y_ptr=target,
+            Y_stride=target.stride(-1),  # always 1
+            loss_ptr=loss_1d,
+            loss_stride=loss_1d.stride(-1),  # always 1
+            n_cols=V,
+            n_non_ignore=n_non_ignore,
+            ignore_index=ignore_index,
+            BLOCK_SIZE=BLOCK_SIZE,
+            # TODO: 32 seems to give the best performance
+            # Performance is quite sensitive to num_warps
+            num_warps=32,
+        )
+
+        loss = torch.sum(loss_1d) / n_non_ignore
+
+        # TODO: investigation
+        # If we don't detach the _input tensor, the memory will double
+        # Not sure why, but it seems that at some point both the grad and the value exist, in different locations
+        ctx.save_for_backward(_input.detach())
+        return loss
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """
+        The backward pass of the Liger Cross Entropy loss.
+
+        Parameters:
+        ctx : The context object with saved tensors.
+        grad_output (tensor): The tensor containing the gradient of the loss with respect to the output.
+
+        Returns:
+        tuple: A tuple with the gradients with respect to the inputs. The elements are tensors or None.
+        """
+        (_input,) = ctx.saved_tensors
+        # If cross entropy is the last layer, grad_output is 1.0. Skip the mul to save time
+        if torch.equal(grad_output, torch.tensor(1.0, device=grad_output.device)):
+            pass
+        # We use a Triton kernel instead of a PyTorch operation because modifying inputs in-place
+        # for gradient storage and calling backward multiple times causes anomalies with PyTorch but not with Triton.
+        # Although the Brew trainer should only perform backward once, it encounters this issue.
+        # https://github.com/triton-lang/triton/issues/4004
+        else:
+            BT, V = _input.shape
+            n_rows = BT
+            BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))
+
+            element_mul[(n_rows,)](
+                _input,
+                _input.stride(-2),
+                grad_output,
+                V,
+                BLOCK_SIZE=BLOCK_SIZE,
+                num_warps=32,
+            )
+
+        return (
+            _input,
+            None,
+            None,
+        )
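
As a sanity check on the hunk above, the following is a minimal usage sketch; it is not part of the released package. It assumes a CUDA device with Triton available, and compares the kernel's loss and gradient against torch.nn.functional.cross_entropy. Note that the kernel overwrites its input with the gradient, so the reference runs on a separate clone.

import torch
import torch.nn.functional as F

from liger_kernel.ops.cross_entropy import LigerCrossEntropyFunction

BT, V = 64, 32000
logits = torch.randn(BT, V, device="cuda")
target = torch.randint(0, V, (BT,), device="cuda")
target[0] = -100  # exercise the ignore_index branch

x = logits.clone().requires_grad_(True)    # overwritten in-place by the kernel
ref = logits.clone().requires_grad_(True)  # reference copy stays intact

loss = LigerCrossEntropyFunction.apply(x, target, -100)
ref_loss = F.cross_entropy(ref, target, ignore_index=-100)

loss.backward()
ref_loss.backward()

assert torch.allclose(loss, ref_loss, atol=1e-4)
assert torch.allclose(x.grad, ref.grad, atol=1e-4)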

liger_kernel-0.0.0/src/liger_kernel/ops/fused_linear_cross_entropy.py
@@ -0,0 +1,161 @@
+"""Fusing the last linear layer with cross-entropy loss
+
+Reference: https://github.com/mgmalek/efficient_cross_entropy
+"""
+
+import torch
+import triton
+
+from liger_kernel.ops.cross_entropy import element_mul, liger_cross_entropy_kernel
+
+# The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576: https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
+# However, setting the limit to 65536 as in the LayerNorm tutorial is faster because of less register spilling
+# The optimal maximum block size depends on your hardware, your kernel, and your dtype
+MAX_FUSED_SIZE = 65536 // 2  # manually tuned a bit
+
+
+class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, _input, linear, target, ignore_index):
+        """
+        Handle the forward and backward pass of the final linear layer via cross-entropy loss by avoiding
+        the materialization of the large logits tensor. Since Cross Entropy Loss is the last layer, we can
+        compute the gradient in the forward pass. By doing so, we don't have to store the _input and target
+        for the backward pass.
+
+        _input: (B*T, H) where B is batch size, T is sequence length, H is hidden dimension.
+        target: (B*T) where each value is in [0, V-1]
+        linear: linear projection matrix of shape V x H.
+        ignore_index: the index to ignore in the target
+        """
+        dtype = (
+            torch.get_autocast_gpu_dtype()
+            if torch.is_autocast_enabled()
+            else _input.dtype
+        )
+        device = _input.device
+
+        # inputs have shape: BT x H
+        # materialized activations will have shape: BT x V
+        # the increase in memory = BT x V
+        # reduction can be achieved by partitioning the number of tokens BT into smaller chunks.
+        # for ex: if we were to achieve the same memory consumption as BT x H, then the chunk size should be:
+        # inc_factor = (V+H-1)//H, chunk_size = (BT + inc_factor - 1)//inc_factor
+        # for ex: BT = 4096*4, V = 32000, H = 4096 ==> inc_factor = 8, chunk_size = 2048
+        BT, H = _input.shape
+        V = linear.shape[0]
+        BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))
+
+        inc_factor = triton.cdiv(V, H)  # (V + H - 1) // H
+        chunk_size = triton.next_power_of_2(
+            triton.cdiv(BT, inc_factor)
+        )  # (BT + inc_factor - 1) // inc_factor
+        num_chunks = triton.cdiv(BT, chunk_size)  # (BT + chunk_size - 1) // chunk_size
+
+        grad_linear = torch.zeros_like(linear, device=device)
+        grad_input = torch.zeros_like(_input, device=device)
+        loss_1d = torch.zeros(BT, dtype=torch.float32, device=device)
+
+        total_n_non_ignore = (target != ignore_index).sum().item()
+
+        for chunk_id in range(num_chunks):
+            start_idx = chunk_id * chunk_size
+            end_idx = min((chunk_id + 1) * chunk_size, BT)
+            _input_chunk = _input[start_idx:end_idx]  # chunk_size x H
+
+            # when doing matmul, use the original precision
+            logits_chunk = _input_chunk @ linear.t()  # chunk_size x V
+            target_chunk = target[start_idx:end_idx]  # chunk_size,
+
+            n_rows = logits_chunk.shape[0]
+
+            # unreduced loss
+            loss_1d_slice = loss_1d[start_idx:end_idx]  # chunk_size,
+            n_non_ignore = (target_chunk != ignore_index).sum().item()
+
+            # when doing CE, use the upcasted precision
+            logits_chunk = logits_chunk.float()
+
+            # ensure _input and target are contiguous
+            logits_chunk = logits_chunk.contiguous()
+            target_chunk = target_chunk.contiguous()
+
+            # Here we calculate the gradient of logits_chunk in place so we can save memory.
+            liger_cross_entropy_kernel[(n_rows,)](
+                X_ptr=logits_chunk,
+                X_stride=logits_chunk.stride(-2),
+                Y_ptr=target_chunk,
+                Y_stride=target_chunk.stride(-1),  # always 1
+                loss_ptr=loss_1d_slice,
+                loss_stride=loss_1d_slice.stride(-1),  # always 1
+                n_cols=V,
+                n_non_ignore=n_non_ignore,
+                ignore_index=ignore_index,
+                BLOCK_SIZE=BLOCK_SIZE,
+                num_warps=32,
+            )
+
+            # gradient of logits_chunk is computed in-place by the above triton kernel.
+            # Following the HuggingFace model source code, we do the forward and backward
+            # w.r.t. logits in fp32 for numerical stability, especially as the number of classes (vocab size) is huge.
+            # (reference: https://github.com/huggingface/transformers/blob/v4.42.4/src/transformers/models/llama/modeling_llama.py#L1194)
+            # Propagating to lm_head's backward, we'll switch back to the original dtype.
+            logits_chunk = logits_chunk.to(dtype)
+
+            # gradient of logits_chunk is computed in-place by the above triton kernel and is of shape: chunk_size x V
+            # thus grad_input[start_idx: end_idx] should be of shape: chunk_size x H
+            # additionally, since we are chunking the inputs, the kernel normalizes each chunk's gradients by the
+            # per-chunk `n_non_ignore`, while the final mean reduction is over `total_n_non_ignore` tokens.
+            # Thus, we need an additional scaling factor of (n_non_ignore/total_n_non_ignore) to scale the gradients.
+            grad_logits_chunk = logits_chunk * (n_non_ignore / total_n_non_ignore)
+            grad_input[start_idx:end_idx] = grad_logits_chunk @ linear
+
+            torch.addmm(
+                input=grad_linear,
+                mat1=logits_chunk.t(),
+                mat2=_input_chunk,
+                out=grad_linear,
+                alpha=n_non_ignore / total_n_non_ignore,
+                beta=1.0,
+            )
+
+        loss = torch.sum(loss_1d) / total_n_non_ignore
+
+        # downcast to dtype and store for backward
+        ctx.save_for_backward(grad_input.detach(), grad_linear.detach())
+        return loss
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        (grad_input, grad_linear) = ctx.saved_tensors
+        # If cross entropy is the last layer, grad_output is 1.0. Skip the mul to save time
+        if torch.ne(grad_output, torch.tensor(1.0, device=grad_output.device)):
+            # We use a Triton kernel instead of a PyTorch operation because modifying inputs in-place
+            # for gradient storage and calling backward multiple times causes anomalies with PyTorch but not with Triton.
+            BT, H = grad_input.shape
+            n_rows = BT
+            BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(H))
+
+            element_mul[(n_rows,)](
+                grad_input,
+                grad_input.stride(-2),
+                grad_output,
+                H,
+                BLOCK_SIZE=BLOCK_SIZE,
+                num_warps=32,
+            )
+
+            # handle grad_linear
+            V, H = grad_linear.shape
+            n_rows = V
+
+            element_mul[(n_rows,)](
+                grad_linear,
+                grad_linear.stride(-2),
+                grad_output,
+                H,
+                BLOCK_SIZE=BLOCK_SIZE,
+                num_warps=32,
+            )
+
+        return (grad_input, grad_linear, None, None)
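
Again a minimal sketch rather than package content, assuming a CUDA device: it checks that the fused path matches first materializing the full (BT, V) logits tensor and then applying cross entropy. The weight here plays the role of an lm_head of shape V x H; the sizes are arbitrary.

import torch
import torch.nn.functional as F

from liger_kernel.ops.fused_linear_cross_entropy import (
    LigerFusedLinearCrossEntropyFunction,
)

BT, H, V = 256, 64, 1000
x = torch.randn(BT, H, device="cuda")
w = torch.randn(V, H, device="cuda")  # lm_head weight (V x H)
y = torch.randint(0, V, (BT,), device="cuda")

# fused: never materializes all BT x V logits at once, only chunk_size x V
fused_loss = LigerFusedLinearCrossEntropyFunction.apply(x, w, y, -100)
# reference: full logits matmul followed by standard cross entropy
ref_loss = F.cross_entropy(x @ w.t(), y, ignore_index=-100)

assert torch.allclose(fused_loss, ref_loss, atol=1e-4)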

liger_kernel-0.0.0/src/liger_kernel/ops/geglu.py
@@ -0,0 +1,129 @@
+import torch
+import triton
+import triton.language as tl
+
+from liger_kernel.ops.utils import calculate_settings, ensure_contiguous
+
+
+@triton.jit
+def _geglu_tanh_forward_kernel(
+    a, b, c, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr
+):
+    program_id = tl.program_id(0)
+
+    # locate start index
+    a += program_id * stride
+    b += program_id * stride
+    c += program_id * stride
+
+    col_offsets = tl.arange(0, BLOCK_SIZE)
+    mask = col_offsets < n_cols
+    a_row = tl.load(a + col_offsets, mask=mask, other=0).to(tl.float32)
+    b_row = tl.load(b + col_offsets, mask=mask, other=0)
+
+    # tanh approximation form of GELU is computed with:
+    # 0.5 * a * (1 + tanh(sqrt(2 / pi) * (a + 0.044715 * a^3)))
+    sqrt_2_over_pi = 0.7978845608028654  # sqrt(2 / pi)
+    a_cubed = a_row * a_row * a_row
+    tanh_arg = sqrt_2_over_pi * (a_row + 0.044715 * a_cubed)
+    tanh_result = tl.math.tanh(tanh_arg)
+    geglu_a = 0.5 * a_row * (1 + tanh_result)
+    c_row = geglu_a * b_row
+    tl.store(c + col_offsets, c_row, mask=mask)
+
+
+@triton.jit
+def _geglu_tanh_backward_kernel(
+    dc, a, b, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr
+):
+    program_id = tl.program_id(0)
+
+    # locate start index
+    dc += program_id * stride
+    a += program_id * stride
+    b += program_id * stride
+
+    col_offsets = tl.arange(0, BLOCK_SIZE)
+    mask = col_offsets < n_cols
+
+    dc_row = tl.load(dc + col_offsets, mask=mask, other=0)
+    a_row = tl.load(a + col_offsets, mask=mask, other=0).to(tl.float32)
+    b_row = tl.load(b + col_offsets, mask=mask, other=0)
+
+    # recomputation to save memory
+    sqrt_2_over_pi = 0.7978845608028654  # sqrt(2 / pi)
+    a_cubed = a_row * a_row * a_row
+    tanh_arg = sqrt_2_over_pi * (a_row + 0.044715 * a_cubed)
+    tanh_result = tl.math.tanh(tanh_arg)
+    geglu_a = 0.5 * a_row * (1 + tanh_result)
+
+    db_row = dc_row * geglu_a
+
+    # Gradient w.r.t. a can be computed with:
+    # b * (0.5 * (1 + tanh(z)) + 0.5 * a * (1 - tanh(z)^2) * (sqrt(2/pi) * (1 + 3 * 0.044715 * a^2)))
+    # where z = sqrt(2/pi) * (a + 0.044715 * a^3)
+    term1 = 0.5 * (1 + tanh_result)
+    tanh_sq = tanh_result * tanh_result
+    term2 = (
+        0.5
+        * a_row
+        * (1 - tanh_sq)
+        * (sqrt_2_over_pi * (1 + 3 * 0.044715 * a_row * a_row))
+    )
+    da_row = dc_row * b_row * (term1 + term2)
+
+    tl.store(a + col_offsets, da_row, mask=mask)
+    tl.store(b + col_offsets, db_row, mask=mask)
+
+
+class LigerGELUMulFunction(torch.autograd.Function):
+    @staticmethod
+    @ensure_contiguous
+    def forward(ctx, a, b):
+        ori_shape = a.shape
+
+        n_cols = ori_shape[-1]
+        a = a.view(-1, n_cols)
+        b = b.view(-1, n_cols)
+        c = torch.zeros_like(a)
+        n_rows = a.shape[0]
+
+        BLOCK_SIZE, num_warps = calculate_settings(n_cols)
+
+        _geglu_tanh_forward_kernel[(n_rows,)](
+            a,
+            b,
+            c,
+            c.stride(-2),
+            n_cols=n_cols,
+            BLOCK_SIZE=BLOCK_SIZE,
+            num_warps=num_warps,
+        )
+
+        ctx.save_for_backward(a, b)
+
+        return c.view(*ori_shape)
+
+    @staticmethod
+    @ensure_contiguous
+    def backward(ctx, dc):
+
+        ori_shape = dc.shape
+        n_cols = ori_shape[-1]
+        dc = dc.view(-1, n_cols)
+        a, b = ctx.saved_tensors
+        n_rows = dc.shape[0]
+
+        BLOCK_SIZE, num_warps = calculate_settings(n_cols)
+
+        _geglu_tanh_backward_kernel[(n_rows,)](
+            dc,
+            a,
+            b,
+            dc.stride(-2),
+            n_cols=n_cols,
+            BLOCK_SIZE=BLOCK_SIZE,
+            num_warps=num_warps,
+        )
+
+        return a.view(*ori_shape), b.view(*ori_shape)
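
The hand-derived backward formulas in the hunk above are easy to check on CPU. A minimal sketch, not part of the package: it recomputes da and db from the same closed-form expressions the kernel uses, in plain PyTorch float64, and compares them against autograd for the tanh-approximated GELU times b.

import torch

k = 0.7978845608028654  # sqrt(2 / pi)
a = torch.randn(4, 8, dtype=torch.float64, requires_grad=True)
b = torch.randn(4, 8, dtype=torch.float64, requires_grad=True)

# forward: c = gelu_tanh(a) * b, differentiated by autograd
c = 0.5 * a * (1 + torch.tanh(k * (a + 0.044715 * a**3))) * b
dc = torch.randn_like(c)
c.backward(dc)

# closed-form gradients, mirroring _geglu_tanh_backward_kernel
with torch.no_grad():
    t = torch.tanh(k * (a + 0.044715 * a**3))
    geglu_a = 0.5 * a * (1 + t)
    db = dc * geglu_a
    da = dc * b * (
        0.5 * (1 + t)
        + 0.5 * a * (1 - t * t) * k * (1 + 3 * 0.044715 * a * a)
    )

assert torch.allclose(a.grad, da)
assert torch.allclose(b.grad, db)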