liger-kernel 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- liger_kernel/env_report.py +2 -0
- liger_kernel/ops/cross_entropy.py +143 -30
- liger_kernel/ops/fused_linear_cross_entropy.py +19 -2
- liger_kernel/ops/group_norm.py +322 -0
- liger_kernel/ops/rms_norm.py +27 -6
- liger_kernel/transformers/cross_entropy.py +44 -12
- liger_kernel/transformers/functional.py +34 -1
- liger_kernel/transformers/fused_linear_cross_entropy.py +31 -4
- liger_kernel/transformers/group_norm.py +56 -0
- liger_kernel/transformers/model/gemma2.py +277 -0
- liger_kernel/transformers/monkey_patch.py +101 -62
- liger_kernel/transformers/rms_norm.py +11 -3
- {liger_kernel-0.4.0.dist-info → liger_kernel-0.4.1.dist-info}/METADATA +5 -3
- {liger_kernel-0.4.0.dist-info → liger_kernel-0.4.1.dist-info}/RECORD +18 -15
- {liger_kernel-0.4.0.dist-info → liger_kernel-0.4.1.dist-info}/WHEEL +1 -1
- {liger_kernel-0.4.0.dist-info → liger_kernel-0.4.1.dist-info}/LICENSE +0 -0
- {liger_kernel-0.4.0.dist-info → liger_kernel-0.4.1.dist-info}/NOTICE +0 -0
- {liger_kernel-0.4.0.dist-info → liger_kernel-0.4.1.dist-info}/top_level.txt +0 -0
liger_kernel/ops/group_norm.py ADDED

```diff
@@ -0,0 +1,322 @@
+import operator
+
+import torch
+import triton
+import triton.language as tl
+
+from liger_kernel.ops.utils import compare_version, ensure_contiguous
+
+if compare_version("triton", operator.ge, "3.0.0"):
+    try:
+        # typical import path with dispatch available
+        from triton.language.extra.libdevice import rsqrt
+    except ModuleNotFoundError:
+        # for working with NGC containers
+        from triton.language.extra.cuda.libdevice import rsqrt
+else:
+    from triton.language.math import rsqrt
+
+MAX_FUSED_SIZE = 65536
+
+
+@triton.jit
+def _group_norm_forward_kernel(
+    Y_ptr,  # pointer to output, shape (n_rows, n_groups, hidden_size)
+    Y_row_stride,  # stride of each row in output
+    Y_col_stride,  # stride of each column in output
+    X_ptr,  # pointer to input, shape (n_rows, n_groups, hidden_size)
+    X_row_stride,  # stride of each row in input
+    X_col_stride,  # stride of each column in input
+    Mean_ptr,  # pointer to mean, shape (n_rows, n_groups)
+    Mean_row_stride,  # stride of each row in mean
+    Mean_col_stride,  # stride of each column in mean
+    RSTD_ptr,  # pointer to rstd, shape (n_rows, n_groups)
+    RSTD_row_stride,  # stride of each row in rstd
+    RSTD_col_stride,  # stride of each column in rstd
+    W_ptr,  # pointer to W
+    B_ptr,  # pointer to B
+    hidden_size,  # hidden size of X
+    channels_per_group,  # the number of channels per group
+    eps,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    References:
+    https://nn.labml.ai/normalization/group_norm/index.html
+    """
+    batch_idx = tl.program_id(0)
+    group_idx = tl.program_id(1)
+
+    X_ptr += batch_idx * X_row_stride + group_idx * X_col_stride
+    Y_ptr += batch_idx * Y_row_stride + group_idx * Y_col_stride
+
+    block_range = tl.arange(0, BLOCK_SIZE)
+
+    # Compute mean and variance using the online algorithm
+    s = 0.0
+    squared_sum = 0.0
+    for i in tl.range(0, hidden_size, BLOCK_SIZE):
+        hidden_size_offsets = i + block_range
+        mask = hidden_size_offsets < hidden_size
+        X = tl.load(X_ptr + hidden_size_offsets, mask=mask, other=0.0)
+        s += tl.sum(X)
+        # X**2
+        squared_sum += tl.sum(X * X)
+
+    m = s / hidden_size
+
+    # variance = E[X**2] - E[X]**2
+    variance = (squared_sum / hidden_size) - (m * m)
+
+    # 1/std
+    rstd = rsqrt(variance + eps)
+
+    # Normalize
+    hidden_size_per_channel = hidden_size // channels_per_group
+    for channel_idx in tl.range(
+        group_idx * channels_per_group, (group_idx + 1) * channels_per_group
+    ):
+        W = tl.load(W_ptr + channel_idx)
+        B = tl.load(B_ptr + channel_idx)
+        for i in range(0, hidden_size_per_channel, BLOCK_SIZE):
+            hidden_size_offsets = i + block_range
+            mask = hidden_size_offsets < hidden_size_per_channel
+            X = tl.load(X_ptr + hidden_size_offsets, mask=mask, other=m)
+            Y = (X - m) * rstd * W + B
+            tl.store(Y_ptr + hidden_size_offsets, Y, mask=mask)
+
+        X_ptr += hidden_size_per_channel
+        Y_ptr += hidden_size_per_channel
+
+    tl.store(Mean_ptr + batch_idx * Mean_row_stride + group_idx * Mean_col_stride, m)
+    tl.store(RSTD_ptr + batch_idx * RSTD_row_stride + group_idx * RSTD_col_stride, rstd)
+
+
+@triton.jit
+def _group_norm_backward_kernel(
+    X_ptr,  # pointer to input, shape (n_rows, n_channels, hidden_size)
+    X_row_stride,  # stride of each row in input
+    X_col_stride,  # stride of each column in input
+    W_ptr,  # pointer to weights, shape (n_channels)
+    Mean_ptr,  # pointer to mean, shape (n_rows, n_groups)
+    Mean_ptr_row_stride,  # stride of each row in mean
+    Mean_ptr_col_stride,  # stride of each column in mean
+    RSTD_ptr,  # pointer to rstd, shape (n_rows, n_groups)
+    DX_ptr,  # pointer to input grad, shape (n_rows, n_groups, hidden_size)
+    DW_ptr,  # pointer to weights grad, shape (n_channels)
+    DB_ptr,  # pointer to bias grad, shape (n_channels)
+    UPSTREAM_ptr,  # pointer to output grad, shape (n_rows, n_channels, hidden_size)
+    hidden_size: tl.constexpr,  # hidden size
+    channels_per_group: tl.constexpr,  # number of channels per group
+    BLOCK_SIZE: tl.constexpr,
+    dtype: tl.constexpr,
+):
+    """
+    References:
+    https://nn.labml.ai/normalization/group_norm/index.html
+    https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md
+
+    The backprop equations are the same for group_norm and layer_norm;
+    the only difference here is that we load the Mean and Rstd corresponding to the
+    group we're computing gradients for, and the mean and rstd are computed over n channels,
+    so the total number of elements we compute the mean over is num_channels_per_group * hidden_size.
+
+    We also need to load the Weights corresponding to the current channel to compute the gradients.
+    """
+    batch_idx = tl.program_id(0)
+    group_idx = tl.program_id(1)
+
+    # Move the pointers to the correct batch
+    X_ptr += batch_idx * X_row_stride
+    DX_ptr += batch_idx * X_row_stride
+    UPSTREAM_ptr += batch_idx * X_row_stride
+
+    # Mean and rstd are the same shape so have the same strides
+    mean = tl.load(
+        Mean_ptr + batch_idx * Mean_ptr_row_stride + group_idx * Mean_ptr_col_stride
+    )
+    rstd = tl.load(
+        RSTD_ptr + batch_idx * Mean_ptr_row_stride + group_idx * Mean_ptr_col_stride
+    )
+
+    c1 = 0.0
+    c2 = 0.0
+    block_range = tl.arange(0, BLOCK_SIZE)
+
+    # We need to compute the sum terms of the backprop equations across all channels in the group
+    for channel_idx in range(
+        group_idx * channels_per_group, (group_idx + 1) * channels_per_group
+    ):
+        dW = 0.0
+        dB = 0.0
+        # Move the pointers to the correct channel
+        W = tl.load(W_ptr + channel_idx)
+        for i in tl.range(0, hidden_size, BLOCK_SIZE):
+            hidden_size_offsets = i + block_range
+            mask = hidden_size_offsets < hidden_size
+            X = tl.load(
+                X_ptr + channel_idx * X_col_stride + hidden_size_offsets,
+                mask=mask,
+                other=0.0,
+            )
+            UPSTREAM_grad = tl.load(
+                UPSTREAM_ptr + channel_idx * X_col_stride + hidden_size_offsets,
+                mask=mask,
+                other=0.0,
+            )
+
+            x_hat = (X - mean) * rstd
+            dW += tl.sum(UPSTREAM_grad * x_hat)
+            dB += tl.sum(UPSTREAM_grad)
+
+            wdy = W * UPSTREAM_grad
+            c1 += tl.sum(x_hat * wdy)
+            c2 += tl.sum(wdy)
+
+        # Need to ensure additions to the same channel are atomic
+        tl.atomic_add(DW_ptr + channel_idx, dW.to(dtype))
+        tl.atomic_add(DB_ptr + channel_idx, dB.to(dtype))
+
+    N = hidden_size * channels_per_group
+    c1 = c1 / N
+    c2 = c2 / N
+
+    for channel_idx in tl.range(
+        group_idx * channels_per_group, (group_idx + 1) * channels_per_group
+    ):
+        # Move the pointers to the correct channel
+        W = tl.load(W_ptr + channel_idx)
+        for i in range(0, hidden_size, BLOCK_SIZE):
+            hidden_size_offsets = i + block_range
+            mask = hidden_size_offsets < hidden_size
+            X = tl.load(
+                X_ptr + channel_idx * X_col_stride + hidden_size_offsets,
+                mask=mask,
+                other=0.0,
+            )
+            UPSTREAM_grad = tl.load(
+                UPSTREAM_ptr + channel_idx * X_col_stride + hidden_size_offsets,
+                mask=mask,
+                other=0.0,
+            )
+
+            x_hat = (X - mean) * rstd
+            wdy = W * UPSTREAM_grad
+            dx = (wdy - (x_hat * c1 + c2)) * rstd
+            tl.store(
+                DX_ptr + channel_idx * X_col_stride + hidden_size_offsets, dx, mask=mask
+            )
+
+
+def group_norm_forward(X, num_channels, num_groups, W, B, eps):
+    shape = X.shape
+    batch_size = shape[0]
+    channels_per_group = num_channels // num_groups
+    # Reshape X so that the mean and std are computed across the groups
+    X = X.view(batch_size, num_groups, -1).contiguous()
+    hidden_size = X.shape[-1]
+    BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(hidden_size))
+    Y = torch.empty(
+        (batch_size, num_groups, hidden_size), dtype=X.dtype, device=X.device
+    )
+    Mean = torch.zeros((batch_size, num_groups), dtype=X.dtype, device=X.device)
+    RSTD = torch.zeros((batch_size, num_groups), dtype=X.dtype, device=X.device)
+
+    _group_norm_forward_kernel[(batch_size, num_groups)](
+        Y,
+        Y.stride(0),
+        Y.stride(1),
+        X,
+        X.stride(0),
+        X.stride(1),
+        Mean,
+        Mean.stride(0),
+        Mean.stride(1),
+        RSTD,
+        RSTD.stride(0),
+        RSTD.stride(1),
+        W,
+        B,
+        hidden_size,
+        channels_per_group,
+        eps,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+    # Return tensors in the original shape
+    return Y.view(*shape), X.view(*shape), Mean, RSTD, BLOCK_SIZE
+
+
+def group_norm_backward(dY, X, W, B, Mean, RSTD, num_channels, num_groups):
+    shape = dY.shape
+    batch_size = shape[0]
+    hidden_size = dY.shape[-1]
+    channels_per_group = num_channels // num_groups
+    dY = dY.view(batch_size, num_groups, -1)
+    DX = torch.empty(
+        (batch_size, num_groups, hidden_size * channels_per_group),
+        dtype=X.dtype,
+        device=X.device,
+    )
+    DW = torch.zeros((num_channels), dtype=W.dtype, device=W.device)
+    DB = torch.zeros((num_channels), dtype=B.dtype, device=B.device)
+    triton_dtype = tl.float32 if X.dtype == torch.float32 else tl.bfloat16
+
+    BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(hidden_size))
+    _group_norm_backward_kernel[(batch_size, num_groups)](
+        X,
+        X.stride(0),
+        X.stride(1),
+        W,
+        Mean,
+        Mean.stride(0),
+        Mean.stride(1),
+        RSTD,
+        DX,
+        DW,
+        DB,
+        dY,
+        hidden_size,
+        channels_per_group,
+        BLOCK_SIZE=BLOCK_SIZE,
+        dtype=triton_dtype,
+    )
+
+    # Return tensors in the original shape
+    return DX.view(*shape), DW, DB
+
+
+class LigerGroupNormFunction(torch.autograd.Function):
+    @staticmethod
+    @ensure_contiguous
+    def forward(
+        ctx,
+        X,
+        affine_scaling_weight,
+        affine_shifting_bias,
+        num_channels,
+        num_groups,
+        eps,
+    ):
+        Y, X, Mean, RSTD, BLOCK_SIZE = group_norm_forward(
+            X,
+            num_channels,
+            num_groups,
+            affine_scaling_weight,
+            affine_shifting_bias,
+            eps,
+        )
+        ctx.num_channels = num_channels
+        ctx.num_groups = num_groups
+        ctx.save_for_backward(
+            X, affine_scaling_weight, affine_shifting_bias, Mean, RSTD
+        )
+        return Y
+
+    @staticmethod
+    @ensure_contiguous
+    def backward(ctx, dY):
+        X, W, B, Mean, RSTD = ctx.saved_tensors
+        DX, DW, DB = group_norm_backward(
+            dY, X, W, B, Mean, RSTD, ctx.num_channels, ctx.num_groups
+        )
+        return DX, DW, DB, None, None, None
```
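The new op exposes group normalization as a single autograd Function: `group_norm_forward` reshapes the input to (batch, num_groups, -1), the forward kernel computes one mean/rstd pair per (batch, group) and applies the per-channel affine, and the backward kernel accumulates dW/dB with atomic adds. A minimal smoke test of the public entry point, assuming a CUDA device with Triton available and liger-kernel 0.4.1 installed (shapes and tolerances are illustrative, not part of the package):

```python
import torch
from liger_kernel.ops.group_norm import LigerGroupNormFunction

batch, channels, groups, hidden = 4, 8, 2, 256
x = torch.randn(batch, channels, hidden, device="cuda", requires_grad=True)
w = torch.ones(channels, device="cuda", requires_grad=True)   # affine scale per channel
b = torch.zeros(channels, device="cuda", requires_grad=True)  # affine shift per channel

# forward(ctx, X, affine_scaling_weight, affine_shifting_bias, num_channels, num_groups, eps)
y = LigerGroupNormFunction.apply(x, w, b, channels, groups, 1e-6)

# Reference: PyTorch's group_norm over the same grouping
y_ref = torch.nn.functional.group_norm(x, groups, weight=w, bias=b, eps=1e-6)
print(torch.allclose(y, y_ref, atol=1e-4, rtol=1e-4))
```

Since the kernel computes the variance as E[X**2] - E[X]**2, comparisons in lower precision or with very large hidden sizes may need looser tolerances.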
liger_kernel/ops/rms_norm.py CHANGED

```diff
@@ -116,6 +116,8 @@ def _rms_norm_forward_kernel(
 def _rms_norm_backward_kernel(
     dY_ptr,
     dY_row_stride,
+    dX_ptr,
+    dX_row_stride,
     X_ptr,
     X_row_stride,
     X_dtype: tl.constexpr,
@@ -146,6 +148,8 @@ def _rms_norm_backward_kernel(
     dW_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
 
     dY_ptr += row_start * dY_row_stride
+    dX_ptr += row_start * dX_row_stride
+
     X_ptr += row_start * X_row_stride
     RSTD_ptr += row_start
 
@@ -184,9 +188,10 @@ def _rms_norm_backward_kernel(
         # here X_row is already in fp32 (see previous if block)
         dW_row += dY_row * (X_row * rstd_row)
 
-        tl.store(
+        tl.store(dX_ptr + col_offsets, dX_row.to(X_dtype), mask=mask)
 
         dY_ptr += dY_row_stride
+        dX_ptr += dX_row_stride
         X_ptr += X_row_stride
         RSTD_ptr += RSTD_row_stride
 
@@ -251,7 +256,9 @@ def rms_norm_forward(X, W, eps, offset, casting_mode):
     return Y.view(*shape), X, RSTD, BLOCK_SIZE, num_warps, casting_mode
 
 
-def rms_norm_backward(
+def rms_norm_backward(
+    dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warps, in_place
+):
     shape = dY.shape
     dim = shape[-1]
     dY = dY.view(-1, dim)
@@ -265,10 +272,17 @@ def rms_norm_backward(dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warp
         raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
     rows_per_program = math.ceil(n_rows / sm_count)
     grid = (sm_count,)
-
+
+    if in_place is True:
+        dX = dY
+    else:
+        dX = torch.zeros_like(dY)
+
     _rms_norm_backward_kernel[grid](
         dY,
         dY.stride(0),
+        dX,
+        dX.stride(0),
         X,
         X.stride(0),
         torch_to_triton_dtype[X.dtype],
@@ -286,8 +300,9 @@ def rms_norm_backward(dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warp
         BLOCK_SIZE=BLOCK_SIZE,
         num_warps=num_warps,
     )
-    dX =
+    dX = dX.view(*shape)
     dW = _dW.sum(dim=0).to(W.dtype)
+
     return dX, dW
 
 
@@ -307,11 +322,15 @@ class LigerRMSNormFunction(torch.autograd.Function):
     - 'llama': matches the Llama implementation, where only the inverse RMS is computed on fp32.
     - 'gemma': matches the Gemma implementation, where everything is cast to fp32, then computed, then cast back to the original dtype.
    - 'none': no casting is done. The computation is done in the original dtype. This saves memory and is slightly faster, but has more error w.r.t. the original implementation.
+
+    The `in_place` option controls whether dY is modified in place to store dX. It defaults to `True` to save memory, but in certain cases it can produce incorrect results.
+    For example, gemma2 uses two rmsnorms sequentially with a residual in between. The residual part needs dY, so it cannot be modified in-place.
+    Therefore, for the patching of RMSNorm in gemma2, we set `in_place` to `False`.
     """
 
     @staticmethod
     @ensure_contiguous
-    def forward(ctx, X, W, eps, offset=0.0, casting_mode="llama"):
+    def forward(ctx, X, W, eps, offset=0.0, casting_mode="llama", in_place=True):
         """
         X: (B, T, H) or (BxT, H)
         W: (H,)
@@ -321,6 +340,7 @@ class LigerRMSNormFunction(torch.autograd.Function):
         )
         ctx.offset = offset
         ctx.casting_mode = casting_mode
+        ctx.in_place = in_place
         ctx.BLOCK_SIZE = BLOCK_SIZE
         ctx.num_warps = num_warps
         ctx.save_for_backward(X, W, RSTD)
@@ -342,5 +362,6 @@ class LigerRMSNormFunction(torch.autograd.Function):
             ctx.casting_mode,
             ctx.BLOCK_SIZE,
             ctx.num_warps,
+            ctx.in_place,
         )
-        return dX, dW, None, None, None
+        return dX, dW, None, None, None, None
```
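The change threads a separate dX buffer through the backward kernel so the gradient no longer has to overwrite the incoming dY. A minimal sketch of calling the Function with the new flag, assuming a CUDA device with Triton available (shapes and eps are illustrative):

```python
import torch
from liger_kernel.ops.rms_norm import LigerRMSNormFunction

x = torch.randn(2, 16, 64, device="cuda", dtype=torch.float32, requires_grad=True)
w = torch.ones(64, device="cuda", requires_grad=True)

# forward(ctx, X, W, eps, offset=0.0, casting_mode="llama", in_place=True)
# in_place=False allocates a fresh dX instead of reusing the dY buffer,
# which is what the gemma2 patch relies on (dY is still needed by the residual branch).
y = LigerRMSNormFunction.apply(x, w, 1e-6, 0.0, "llama", False)
y.sum().backward()
print(x.grad.shape, w.grad.shape)
```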
liger_kernel/transformers/cross_entropy.py CHANGED

```diff
@@ -1,21 +1,53 @@
-from
+from typing import Optional
+
+import torch
 
 from liger_kernel.ops.cross_entropy import LigerCrossEntropyFunction
 
 
-class LigerCrossEntropyLoss(
-    def __init__(
-
-
-
-
-
+class LigerCrossEntropyLoss(torch.nn.Module):
+    def __init__(
+        self,
+        ignore_index: int = -100,
+        lse_square_scale: float = 0.0,
+        label_smoothing: float = 0.0,
+        reduction: str = "mean",
+        softcap: Optional[float] = None,
+        return_z_loss: bool = False,
+    ):
+        super().__init__()
+        assert (label_smoothing >= 0) and (
+            label_smoothing <= 1
+        ), f"label_smoothing must be between 0.0 and 1.0. Got: {label_smoothing}"
+        assert (label_smoothing >= 0) and (
+            label_smoothing <= 1
+        ), f"label_smoothing must be between 0.0 and 1.0. Got: {label_smoothing}"
+        assert reduction in {
             "mean",
             "sum",
             "none",
-        }, f"reduction must be one of 'mean', 'sum', or 'none'. Got: {
+        }, f"reduction must be one of 'mean', 'sum', or 'none'. Got: {reduction}"
+        assert (
+            softcap is None or softcap > 0
+        ), f"softcap must be greater than 0.0 or None. Got: {softcap}"
+        self.ignore_index = ignore_index
+        self.lse_square_scale = lse_square_scale
+        self.label_smoothing = label_smoothing
+        self.reduction = reduction
+        self.softcap = softcap
+        self.return_z_loss = return_z_loss
 
-    def forward(self, _input, target):
-
-        _input,
+    def forward(self, _input: torch.Tensor, target: torch.Tensor):
+        loss, z_loss = LigerCrossEntropyFunction.apply(
+            _input,
+            target,
+            self.ignore_index,
+            self.lse_square_scale,
+            self.label_smoothing,
+            self.reduction,
+            self.softcap,
+            self.return_z_loss,
         )
+        if not self.return_z_loss:
+            return loss
+        return loss, z_loss
```
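Now that the module subclasses `torch.nn.Module` directly, it forwards all options to `LigerCrossEntropyFunction` and optionally returns the z-loss alongside the loss. A usage sketch, assuming a CUDA device with Triton available (vocabulary size and batch are illustrative):

```python
import torch
from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss

ce = LigerCrossEntropyLoss(label_smoothing=0.1, return_z_loss=True)

logits = torch.randn(8, 32000, device="cuda", requires_grad=True)
target = torch.randint(0, 32000, (8,), device="cuda")

loss, z_loss = ce(logits, target)  # only `loss` is returned when return_z_loss=False
loss.backward()
```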
liger_kernel/transformers/functional.py CHANGED

```diff
@@ -1,9 +1,12 @@
+from typing import Optional
+
 from liger_kernel.ops.cross_entropy import LigerCrossEntropyFunction
 from liger_kernel.ops.fused_linear_cross_entropy import (
     LigerFusedLinearCrossEntropyFunction,
 )
 from liger_kernel.ops.fused_linear_jsd import LigerFusedLinearJSDFunction
 from liger_kernel.ops.geglu import LigerGELUMulFunction
+from liger_kernel.ops.group_norm import LigerGroupNormFunction
 from liger_kernel.ops.jsd import LigerJSDFunction
 from liger_kernel.ops.kl_div import LigerKLDivLossFunction
 from liger_kernel.ops.layer_norm import LigerLayerNormFunction
@@ -12,7 +15,6 @@ from liger_kernel.ops.rope import LigerRopeFunction
 from liger_kernel.ops.swiglu import LigerSiLUMulFunction
 
 liger_swiglu = LigerSiLUMulFunction.apply
-liger_cross_entropy = LigerCrossEntropyFunction.apply
 liger_fused_linear_cross_entropy = LigerFusedLinearCrossEntropyFunction.apply
 liger_geglu = LigerGELUMulFunction.apply
 liger_rms_norm = LigerRMSNormFunction.apply
@@ -21,3 +23,34 @@ liger_layer_norm = LigerLayerNormFunction.apply
 liger_kl_div = LigerKLDivLossFunction.apply
 liger_jsd = LigerJSDFunction.apply
 liger_fused_linear_jsd = LigerFusedLinearJSDFunction.apply
+liger_group_norm = LigerGroupNormFunction.apply
+
+
+# conform to the function signature in https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html
+# `weight` and `size_average` are placeholders and not implemented yet
+def liger_cross_entropy(
+    input,
+    target,
+    weight=None,
+    size_average=None,
+    ignore_index: int = -100,
+    reduce=None,
+    reduction: str = "mean",
+    label_smoothing: float = 0.0,
+    lse_square_scale: float = 0.0,
+    softcap: Optional[float] = None,
+    return_z_loss: bool = False,
+):
+    loss, z_loss = LigerCrossEntropyFunction.apply(
+        input,
+        target,
+        ignore_index,
+        lse_square_scale,
+        label_smoothing,
+        reduction,
+        softcap,
+        return_z_loss,
+    )
+    if not return_z_loss:
+        return loss
+    return loss, z_loss
```
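`liger_cross_entropy` changes from a bare `.apply` alias to a wrapper that mirrors the `torch.nn.functional.cross_entropy` signature; `weight`, `size_average`, and `reduce` are accepted only as placeholders, per the comments above. A usage sketch, assuming a CUDA device (shapes are illustrative):

```python
import torch
from liger_kernel.transformers.functional import liger_cross_entropy

logits = torch.randn(4, 128, device="cuda", requires_grad=True)
labels = torch.randint(0, 128, (4,), device="cuda")

loss = liger_cross_entropy(logits, labels, ignore_index=-100, label_smoothing=0.0)
loss_and_z = liger_cross_entropy(logits, labels, return_z_loss=True)  # -> (loss, z_loss)
```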
liger_kernel/transformers/fused_linear_cross_entropy.py CHANGED

```diff
@@ -1,13 +1,38 @@
-from
+from typing import Optional
+
+import torch
 
 from liger_kernel.ops.fused_linear_cross_entropy import (
     LigerFusedLinearCrossEntropyFunction,
 )
 
 
-class LigerFusedLinearCrossEntropyLoss(
-    def __init__(
-
+class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
+    def __init__(
+        self,
+        ignore_index: int = -100,
+        lse_square_scale: float = 0.0,
+        label_smoothing: float = 0.0,
+        reduction: str = "mean",
+        softcap: Optional[float] = None,
+    ):
+        super().__init__()
+        assert (label_smoothing >= 0) and (
+            label_smoothing <= 1
+        ), f"label_smoothing must be between 0.0 and 1.0. Got: {label_smoothing}"
+        assert reduction in {
+            "mean",
+            "sum",
+            "none",
+        }, f"reduction must be one of 'mean', 'sum', or 'none'. Got: {reduction}"
+        assert (
+            softcap is None or softcap > 0
+        ), f"softcap must be greater than 0.0 or None. Got: {softcap}"
+        self.ignore_index = ignore_index
+        self.lse_square_scale = lse_square_scale
+        self.label_smoothing = label_smoothing
+        self.reduction = reduction
+        self.softcap = softcap
 
     def forward(self, lin_weight, _input, target, bias=None):
         return LigerFusedLinearCrossEntropyFunction.apply(
@@ -16,6 +41,8 @@ class LigerFusedLinearCrossEntropyLoss(CrossEntropyLoss):
             target,
             bias,
             self.ignore_index,
+            self.lse_square_scale,
             self.label_smoothing,
             self.reduction,
+            self.softcap,
         )
```
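The module gains `lse_square_scale` and `softcap` and passes them through to the fused kernel, whose `forward` takes the lm-head weight, the flattened hidden states, and the targets. A usage sketch, assuming a CUDA device; the fp32 dtypes and shapes here are illustrative only:

```python
import torch
from liger_kernel.transformers.fused_linear_cross_entropy import (
    LigerFusedLinearCrossEntropyLoss,
)

hidden, vocab, tokens = 512, 32000, 16
flce = LigerFusedLinearCrossEntropyLoss(lse_square_scale=1e-4, softcap=30.0)

lm_head_weight = torch.randn(vocab, hidden, device="cuda", requires_grad=True)
hidden_states = torch.randn(tokens, hidden, device="cuda", requires_grad=True)
labels = torch.randint(0, vocab, (tokens,), device="cuda")

# forward(self, lin_weight, _input, target, bias=None)
loss = flce(lm_head_weight, hidden_states, labels)
loss.backward()
```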
liger_kernel/transformers/group_norm.py ADDED

```diff
@@ -0,0 +1,56 @@
+import torch
+import torch.nn as nn
+
+from liger_kernel.ops.group_norm import LigerGroupNormFunction
+
+
+class LigerGroupNorm(nn.Module):
+    def __init__(self, num_channels, num_groups, eps=1e-6, bias=False, init_fn="ones"):
+        """
+        A Group Normalization layer.
+        Args:
+            num_channels (int): Number of channels in the input tensor.
+            num_groups (int): Number of groups to divide the channels into.
+            eps (float, optional): A value added to the denominator for numerical stability. Default: 1e-6.
+            bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``False``.
+            init_fn (str, optional): Initialization function for the learnable parameters. Default: "ones".
+        """
+        super().__init__()
+        assert init_fn in [
+            "ones",
+            "zeros",
+        ], f"init_fn must be either 'ones' or 'zeros', got {init_fn}"
+
+        assert (
+            num_channels % num_groups == 0
+        ), f"Number of channels {num_channels} must be divisible by num_groups {num_groups}"
+        self.num_channels = num_channels
+        self.num_groups = num_groups
+        self.eps = eps
+        self.weight = nn.Parameter(
+            torch.ones(num_channels) if init_fn == "ones" else torch.zeros(num_channels)
+        )
+        self.bias = nn.Parameter(
+            torch.randn(num_channels) if bias else torch.zeros(num_channels)
+        )
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        # hidden_states: (batch_size, num_channels, *)
+        assert (
+            hidden_states.dim() >= 3
+        ), f"Input must have at least 3 dimensions, got {hidden_states.dim()}"
+        assert (
+            hidden_states.size(1) == self.num_channels
+        ), f"Input tensor must have {self.num_channels} channels, got {hidden_states.size(1)}"
+        return LigerGroupNormFunction.apply(
+            hidden_states,
+            self.weight,
+            self.bias,
+            self.num_channels,
+            self.num_groups,
+            self.variance_epsilon,
+        )
+
+    def extra_repr(self):
+        return f"{self.hidden_size}, num_channels={self.num_channels}, num_groups={self.num_groups}, eps={self.eps}"
```