liger-kernel 0.5.9__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- liger_kernel/chunked_loss/__init__.py +1 -0
- liger_kernel/chunked_loss/cosine_similarity_loss.py +127 -0
- liger_kernel/chunked_loss/dpo_loss.py +1 -1
- liger_kernel/chunked_loss/functional.py +2 -0
- liger_kernel/chunked_loss/fused_linear_preference.py +0 -1
- liger_kernel/chunked_loss/jsd_loss.py +2 -2
- liger_kernel/ops/dyt.py +111 -179
- liger_kernel/ops/fused_neighborhood_attention.py +1022 -0
- liger_kernel/ops/geglu.py +1 -1
- liger_kernel/ops/grpo_loss.py +310 -0
- liger_kernel/ops/multi_token_attention.py +207 -0
- liger_kernel/ops/rms_norm.py +265 -54
- liger_kernel/ops/softmax.py +201 -0
- liger_kernel/ops/sparsemax.py +179 -0
- liger_kernel/ops/swiglu.py +1 -1
- liger_kernel/transformers/__init__.py +8 -0
- liger_kernel/transformers/dyt.py +5 -3
- liger_kernel/transformers/fsdp.py +55 -0
- liger_kernel/transformers/functional.py +70 -0
- liger_kernel/transformers/fused_neighborhood_attention.py +234 -0
- liger_kernel/transformers/grpo_loss.py +98 -0
- liger_kernel/transformers/model/gemma.py +25 -16
- liger_kernel/transformers/model/gemma2.py +27 -14
- liger_kernel/transformers/model/gemma3.py +62 -106
- liger_kernel/transformers/model/glm4.py +16 -13
- liger_kernel/transformers/model/llama.py +81 -18
- liger_kernel/transformers/model/llama4.py +108 -0
- liger_kernel/transformers/model/llava.py +95 -132
- liger_kernel/transformers/model/mistral.py +13 -14
- liger_kernel/transformers/model/mixtral.py +16 -15
- liger_kernel/transformers/model/mllama.py +16 -14
- liger_kernel/transformers/model/olmo2.py +16 -13
- liger_kernel/transformers/model/paligemma.py +8 -9
- liger_kernel/transformers/model/phi3.py +25 -16
- liger_kernel/transformers/model/qwen2.py +24 -15
- liger_kernel/transformers/model/qwen2_5_vl.py +41 -97
- liger_kernel/transformers/model/qwen2_vl.py +38 -106
- liger_kernel/transformers/model/qwen3.py +11 -9
- liger_kernel/transformers/model/qwen3_moe.py +132 -0
- liger_kernel/transformers/monkey_patch.py +424 -81
- liger_kernel/transformers/multi_token_attention.py +64 -0
- liger_kernel/transformers/rms_norm.py +40 -4
- liger_kernel/transformers/softmax.py +12 -0
- liger_kernel/transformers/sparsemax.py +16 -0
- liger_kernel/transformers/swiglu.py +21 -0
- liger_kernel/transformers/trainer/orpo_trainer.py +1 -53
- liger_kernel/utils.py +11 -0
- {liger_kernel-0.5.9.dist-info → liger_kernel-0.6.0.dist-info}/METADATA +41 -21
- liger_kernel-0.6.0.dist-info/RECORD +97 -0
- {liger_kernel-0.5.9.dist-info → liger_kernel-0.6.0.dist-info}/WHEEL +1 -1
- liger_kernel/transformers/gema3_rms.py +0 -8
- liger_kernel-0.5.9.dist-info/RECORD +0 -84
- {liger_kernel-0.5.9.dist-info → liger_kernel-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {liger_kernel-0.5.9.dist-info → liger_kernel-0.6.0.dist-info}/licenses/NOTICE +0 -0
- {liger_kernel-0.5.9.dist-info → liger_kernel-0.6.0.dist-info}/top_level.txt +0 -0
liger_kernel/chunked_loss/__init__.py
@@ -1,3 +1,4 @@
+from liger_kernel.chunked_loss.cosine_similarity_loss import LigerFusedLinearCosineSimilarityLoss  # noqa:F401
 from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOLoss  # noqa: F401
 from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOLoss  # noqa: F401
 from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOLoss  # noqa: F401
liger_kernel/chunked_loss/cosine_similarity_loss.py
@@ -0,0 +1,127 @@
+import torch
+import torch.nn.functional as F
+
+from liger_kernel.chunked_loss.fused_linear_distillation import LigerFusedLinearDistillationBase
+
+
+class LigerFusedLinearCosineSimilarityFunction(LigerFusedLinearDistillationBase):
+    @staticmethod
+    def distillation_loss_fn(student_logits, teacher_logits, beta=1.0):
+        """
+        Compute Cosine loss (Cosine Similarity Loss).
+        Args:
+            student_logits (torch.Tensor): Logits of student tokens. Shape: (batch_size * seq_len,).
+            teacher_logits (torch.Tensor): Logits of teacher tokens. Shape: (batch_size * seq_len,).
+            beta: Coefficient beta of generalized Cosine Similarity in the interval [0, 1]. Default: `1.0` (float): .
+        Returns:
+            torch.Tensor: cosine similarity loss
+        """
+        student_norm = F.normalize(student_logits, p=2, dim=-1)
+        teacher_norm = F.normalize(teacher_logits, p=2, dim=-1)
+
+        cosine_sim = F.cosine_similarity(student_norm, teacher_norm, dim=-1)
+        loss = beta * (1 - cosine_sim)
+        return loss.sum()
+
+    @classmethod
+    def forward(
+        cls,
+        ctx,
+        student_input: torch.Tensor,
+        student_weight: torch.Tensor,
+        teacher_input: torch.Tensor,
+        teacher_weight: torch.Tensor,
+        true_labels: torch.LongTensor,
+        student_bias: torch.Tensor,
+        teacher_bias: torch.Tensor,
+        weight_hard_loss: float = 0.5,
+        weight_soft_loss: float = 0.5,
+        beta: float = 0.5,
+        ignore_index: int = -100,
+        temperature: float = 1.0,
+        compiled: bool = True,
+        chunk_size: int = 1024,
+    ):
+        return super().forward(
+            cls=cls,
+            ctx=ctx,
+            student_input=student_input,
+            student_weight=student_weight,
+            teacher_input=teacher_input,
+            teacher_weight=teacher_weight,
+            target=true_labels,
+            student_bias=student_bias,
+            teacher_bias=teacher_bias,
+            chunk_size=chunk_size,
+            weight_hard_loss=weight_hard_loss,
+            weight_soft_loss=weight_soft_loss,
+            beta=beta,
+            ignore_index=ignore_index,
+            temperature=temperature,
+            compiled=compiled,
+        )
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output)[:6]
+
+        return (
+            *grads,
+            None,  # teacher_bias
+            None,  # weight_hard_loss
+            None,  # weight_soft_loss
+            None,  # beta
+            None,  # ignore_index
+            None,  # temperature
+            None,  # compiled
+            None,  # chunk_size
+        )
+
+
+class LigerFusedLinearCosineSimilarityLoss(torch.nn.Module):
+    def __init__(
+        self,
+        weight_hard_loss: float = 0.5,
+        weight_soft_loss: float = 0.5,
+        beta: float = 0.5,
+        ignore_index: int = -100,
+        temperature: float = 1.0,
+        compiled: bool = True,
+        chunk_size: int = 1024,
+    ):
+        super().__init__()
+        assert temperature != 0, "Temperature cannot be 0."
+        self.weight_hard_loss = weight_hard_loss
+        self.weight_soft_loss = weight_soft_loss
+        self.ignore_index = ignore_index
+        self.temperature = temperature
+        self.compiled = compiled
+        self.beta = beta
+        self.chunk_size = chunk_size
+
+    def forward(
+        self,
+        student_input: torch.Tensor,
+        student_weight: torch.Tensor,
+        teacher_input: torch.Tensor,
+        teacher_weight: torch.Tensor,
+        true_labels: torch.LongTensor,
+        student_bias: torch.Tensor = None,
+        teacher_bias: torch.Tensor = None,
+    ) -> torch.Tensor:
+        return LigerFusedLinearCosineSimilarityFunction.apply(
+            student_input,
+            student_weight,
+            teacher_input,
+            teacher_weight,
+            true_labels,
+            student_bias,
+            teacher_bias,
+            self.weight_hard_loss,
+            self.weight_soft_loss,
+            self.beta,
+            self.ignore_index,
+            self.temperature,
+            self.compiled,
+            self.chunk_size,
+        )
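A minimal usage sketch of the new LigerFusedLinearCosineSimilarityLoss module, assuming illustrative tensor sizes (batch, sequence length, hidden size, and vocabulary size below are hypothetical, not taken from the diff); the argument order follows the forward defined above.

import torch

from liger_kernel.chunked_loss import LigerFusedLinearCosineSimilarityLoss

# Hypothetical sizes for illustration only.
B, T, H, V = 2, 16, 64, 128
student_hidden = torch.randn(B * T, H, requires_grad=True)   # student_input
student_lm_head = torch.randn(V, H, requires_grad=True)      # student_weight
teacher_hidden = torch.randn(B * T, H)                       # teacher_input
teacher_lm_head = torch.randn(V, H)                          # teacher_weight
labels = torch.randint(0, V, (B * T,))                       # true_labels

loss_fn = LigerFusedLinearCosineSimilarityLoss(weight_hard_loss=0.5, weight_soft_loss=0.5, beta=0.5)
loss = loss_fn(student_hidden, student_lm_head, teacher_hidden, teacher_lm_head, labels)
loss.backward()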
liger_kernel/chunked_loss/dpo_loss.py
@@ -128,7 +128,7 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
         compute_nll_loss: bool = False,
         compiled: bool = True,
         use_ref_model: bool = True,
-        average_log_prob: bool =
+        average_log_prob: bool = False,
         chunk_size: int = 1,
     ):
         """
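Since the average_log_prob default changes to False in 0.6.0, callers that rely on length-normalized log-probabilities can pin the flag explicitly; a hedged sketch using only arguments visible in the hunk above.

from liger_kernel.chunked_loss import LigerFusedLinearDPOLoss

# Pass average_log_prob explicitly if your recipe expects averaged
# (length-normalized) per-token log-probabilities rather than sums.
dpo_loss = LigerFusedLinearDPOLoss(use_ref_model=True, average_log_prob=True)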
liger_kernel/chunked_loss/functional.py
@@ -1,3 +1,4 @@
+from liger_kernel.chunked_loss.cosine_similarity_loss import LigerFusedLinearCosineSimilarityFunction
 from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOFunction
 from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOFunction
 from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOFunction
@@ -9,6 +10,7 @@ from liger_kernel.chunked_loss.simpo_loss import LigerFusedLinearSimPOFunction
 liger_fused_linear_orpo = LigerFusedLinearORPOFunction.apply
 liger_fused_linear_dpo = LigerFusedLinearDPOFunction.apply
 liger_fused_linear_jsd = LigerFusedLinearJSDFunction.apply
+liger_fused_linear_cosine = LigerFusedLinearCosineSimilarityFunction.apply
 liger_fused_linear_cpo = LigerFusedLinearCPOFunction.apply
 liger_fused_linear_simpo = LigerFusedLinearSimPOFunction.apply
 liger_fused_linear_kto = LigerFusedLinearKTOFunction.apply
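The new functional alias mirrors the module API; a sketch with the same hypothetical sizes as above, passing positional arguments in the order of LigerFusedLinearCosineSimilarityFunction.forward.

import torch

from liger_kernel.chunked_loss.functional import liger_fused_linear_cosine

B, T, H, V = 2, 16, 64, 128  # hypothetical sizes
loss = liger_fused_linear_cosine(
    torch.randn(B * T, H, requires_grad=True),  # student_input
    torch.randn(V, H, requires_grad=True),      # student_weight
    torch.randn(B * T, H),                      # teacher_input
    torch.randn(V, H),                          # teacher_weight
    torch.randint(0, V, (B * T,)),              # true_labels
    None,                                       # student_bias
    None,                                       # teacher_bias
)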
liger_kernel/chunked_loss/fused_linear_preference.py
@@ -222,7 +222,6 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
            (_ref_chosen_input_chunks if use_ref_model else [None] * len(_chosen_input_chunks)),
            (_ref_rejected_input_chunks if use_ref_model else [None] * len(_rejected_input_chunks)),
            (_chosen_nll_target_chunks if nll_target is not None else [None] * len(_chosen_input_chunks)),
-            strict=False,
        ):
            input_chunk = torch.cat([chosen_input_chunk, rejected_input_chunk], dim=0)
            ref_input_chunk = (
liger_kernel/chunked_loss/jsd_loss.py
@@ -150,8 +150,8 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
         teacher_input: torch.Tensor,
         teacher_weight: torch.Tensor,
         true_labels: torch.LongTensor,
-        student_bias: torch.Tensor,
-        teacher_bias: torch.Tensor,
+        student_bias: torch.Tensor = None,
+        teacher_bias: torch.Tensor = None,
     ) -> torch.Tensor:
         """
         Compute the JSD distillation loss.
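With the bias parameters now defaulting to None, the JSD loss can be called without explicit bias tensors; a hedged sketch with hypothetical sizes (usable constructor defaults are assumed, they are not shown in this hunk).

import torch

from liger_kernel.chunked_loss.jsd_loss import LigerFusedLinearJSDLoss

B, T, H, V = 2, 16, 64, 128  # hypothetical sizes
jsd_loss = LigerFusedLinearJSDLoss()  # assumes the constructor has usable defaults
loss = jsd_loss(
    torch.randn(B * T, H, requires_grad=True),  # student_input
    torch.randn(V, H, requires_grad=True),      # student_weight
    torch.randn(B * T, H),                      # teacher_input
    torch.randn(V, H),                          # teacher_weight
    torch.randint(0, V, (B * T,)),              # true_labels
    # student_bias / teacher_bias omitted: they now default to None
)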
liger_kernel/ops/dyt.py
CHANGED
@@ -4,7 +4,6 @@ import torch
 import triton
 import triton.language as tl
 
-from liger_kernel.ops.utils import calculate_settings
 from liger_kernel.ops.utils import compare_version
 from liger_kernel.ops.utils import ensure_contiguous
 from liger_kernel.ops.utils import infer_device
@@ -20,187 +19,126 @@ else:
     from triton.language.math import tanh
 
 
+# @triton.autotune([triton.Config({"BLOCK_N":bn}, num_stages=ns, num_warps=nw)
+# for bn in [1024, 2048, 4096]
+# for ns in [1,2,4]
+# for nw in [4, 8, 16, 32]
+# ],
+# key=['N'])
 @triton.jit
-def _dyt_fwd_kernel(
-
-
-
-
-
-
-
-
-
-
-
-    Reference:
-    https://arxiv.org/abs/2503.10622
-
-    Shapes:
-        - x: (BT, C)
-        - alpha: (1)
-        - gamma: (C)
-        - beta: (C)
-    """
-    row_idx = tl.program_id(0)
-    offsets = tl.arange(0, BLOCK_SIZE)
-    mask = offsets < n_cols
-
-    x_ptr += row_idx * x_row_stride
-    y_ptr += row_idx * y_row_stride
-
-    alpha = tl.load(alpha_ptr)
-    gamma = tl.load(gamma_ptr + offsets, mask=mask)
-    beta = tl.load(beta_ptr + offsets, mask=mask)
-    x = tl.load(x_ptr + offsets, mask=mask)
-    y = gamma * tanh((alpha * x).cast(tl.float32)) + beta
-    tl.store(y_ptr + offsets, y, mask=mask)
+def _dyt_fwd_kernel(X, Y, Alpha, Gamma, Beta, HAVE_BETA: tl.constexpr, N: tl.constexpr, BLOCK_N: tl.constexpr = 1024):
+    col = tl.cast(tl.program_id(0), tl.int64) * BLOCK_N + tl.arange(0, BLOCK_N)
+    mask = col < N
+    row_id = tl.cast(tl.program_id(1), tl.int64)
+
+    X += row_id * N
+    Y += row_id * N
+    alpha = tl.load(Alpha).to(tl.float32)
+
+    gamma = tl.load(Gamma + col, mask=mask, other=0.0).to(tl.float32)
+
+    x = tl.load(X + col, mask=mask, other=0.0).to(tl.float32)
 
+    tanh_x = tanh(alpha * x)
+    y = tanh_x * gamma
+    if HAVE_BETA:
+        beta = tl.load(Beta + col, mask=mask, other=0.0).to(tl.float32)
+        y += beta
+    tl.store(Y + col, y, mask=mask)
 
+
+# @triton.autotune([triton.Config({"BLOCK_N":bn}, num_stages=ns, num_warps=nw)
+# for bn in [1024, 2048, 4096]
+# for ns in [1,2,4]
+# for nw in [4, 8, 16]
+# ],
+# key=['N'])
 @triton.jit
 def _dyt_bwd_kernel(
-
-    x_row_stride,
-    dy_ptr,
-    dy_row_stride,
-    dx_ptr,
-    dx_row_stride,
-    alpha_ptr,
-    dalpha_ptr,
-    gamma_ptr,
-    dgamma_ptr,
-    dgamma_row_stride,
-    n_cols,
-    n_rows,
-    ROWS_PER_PROGRAM: tl.constexpr,
-    BLOCK_SIZE: tl.constexpr,
+    DY, DX, DA, DG, DB, X, Alpha, Gamma, HAVE_BETA: tl.constexpr, M, N: tl.constexpr, BLOCK_N: tl.constexpr = 1024
 ):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    dalpha = 0.0
-    dgamma = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
-
-    x_ptr += row_start * x_row_stride
-    dx_ptr += row_start * dx_row_stride
-    dy_ptr += row_start * dy_row_stride
-    alpha = tl.load(alpha_ptr)
-    gamma = tl.load(gamma_ptr + offsets, mask=mask, other=0.0)
-
-    for _ in tl.range(row_start, row_end):
-        dy = tl.load(dy_ptr + offsets, mask=mask, other=0.0)
-        x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
-        tanh_ax = tanh((alpha * x).cast(tl.float32))
-        sech2_ax = 1 - tanh_ax * tanh_ax
-
-        dx = dy * gamma * sech2_ax * alpha
-        dalpha += tl.sum(dy * gamma * sech2_ax * x)
-        dgamma += dy * tanh_ax
-        tl.store(dx_ptr + offsets, dx, mask=mask)
-
-        dy_ptr += dy_row_stride
-        x_ptr += x_row_stride
-        dx_ptr += dx_row_stride
-
-    tl.store(dgamma_ptr + pid * dgamma_row_stride + offsets, dgamma, mask=mask)
-    tl.store(dalpha_ptr + pid, dalpha)
-
-    pass
+    col = tl.cast(tl.program_id(0), tl.int64) * BLOCK_N + tl.arange(0, BLOCK_N)
+    mask = col < N
+    start_row_id = tl.cast(tl.program_id(1), tl.int64)
+
+    alpha = tl.load(Alpha).to(tl.float32)
+    da = 0.0
+    gamma = tl.load(Gamma + col, mask=mask, other=0.0).to(tl.float32)
+    dg = tl.zeros((BLOCK_N,), dtype=tl.float32)
+    if HAVE_BETA:
+        db = tl.zeros((BLOCK_N,), dtype=tl.float32)
+    for row_id in range(start_row_id, M, tl.num_programs(1)):
+        x = tl.load(X + row_id * N + col, mask=mask, other=0.0).to(tl.float32)
+        dy = tl.load(DY + row_id * N + col, mask=mask, other=0.0).to(tl.float32)
+        tanh_x = tanh(alpha * x)
+        if HAVE_BETA:
+            db += dy
+        dg += dy * tanh_x
+        tmp = (1 - tanh_x * tanh_x) * dy * gamma
+        da += tl.sum(x * tmp, 0)
+        dx = alpha * tmp
+        tl.store(DX + row_id * N + col, dx, mask=mask)
+
+    tl.store(DG + start_row_id * N + col, dg, mask=mask)
+    if HAVE_BETA:
+        tl.store(DB + start_row_id * N + col, db, mask=mask)
+    tl.store(DA + start_row_id * tl.cdiv(N, 512) + tl.program_id(0), da)
 
 
 def liger_dyt_fwd(x, alpha, gamma, beta):
-
-
-
-
+    assert x.is_contiguous()
+    HAVE_BETA = True if beta is not None else False
+    input_shape = x.shape
+    x = x.view(-1, input_shape[-1])
+    M, N = x.shape
+
     y = torch.empty_like(x)
-
-
-
-
-
-
-
-
-
-
-
-
+
+    if N >= 4096:
+        kwargs = {"BLOCK_N": min(triton.next_power_of_2(N), 2048), "num_warps": 4, "num_stages": 1}
+    else:
+        kwargs = {"BLOCK_N": min(triton.next_power_of_2(N), 1024), "num_warps": 4, "num_stages": 1}
+
+    grid = lambda meta: (triton.cdiv(N, meta["BLOCK_N"]), M)
+    _dyt_fwd_kernel[(grid)](
+        x,
+        y,
+        alpha,
+        gamma,
+        beta,
+        HAVE_BETA,
+        N,
+        **kwargs,
     )
-    return y.view(
-
-
-def liger_dyt_bwd(dy, x, alpha, gamma):
-
-
-
-
-
-
-    BLOCK_SIZE, num_warps = calculate_settings(n_cols)
-    sm_count = 1
+    return y.view(input_shape)
+
+
+def liger_dyt_bwd(dy, x, alpha, gamma, beta):
+    assert dy.is_contiguous()
+    input_shape = x.shape
+    x = x.view(-1, input_shape[-1])
+    M, N = x.shape
+    HAVE_BETA = True if beta is not None else False
+
     device = infer_device()
     if device == "cuda":
-
+        NUM_SMS = torch.cuda.get_device_properties(x.device).multi_processor_count
     elif device == "xpu":
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        dy_ptr=dy,
-        dy_row_stride=dy.stride(0),
-        dx_ptr=dx,
-        dx_row_stride=dx.stride(0),
-        alpha_ptr=alpha,
-        dalpha_ptr=_dalpha,
-        gamma_ptr=gamma,
-        dgamma_ptr=_dgamma,
-        dgamma_row_stride=_dgamma.stride(0),
-        n_cols=n_cols,
-        n_rows=n_rows,
-        ROWS_PER_PROGRAM=rows_per_program,
-        BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=num_warps,
-    )
-    dalpha = _dalpha.sum(dim=0, keepdim=True).to(dtype)
-    dgamma = _dgamma.sum(dim=0).to(dtype)
-    dbeta = dy.sum(dim=0).to(dtype)
-    return dx.view(*shape), dalpha, dgamma, dbeta
+        NUM_SMS = torch.xpu.get_device_properties(x.device).gpu_subslice_count
+
+    da = torch.zeros(NUM_SMS, triton.cdiv(N, 512), dtype=torch.float32, device=x.device)
+    dg = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device)
+    db = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device) if HAVE_BETA else None
+    dx = torch.empty_like(dy)
+
+    kwargs = {"BLOCK_N": min(triton.next_power_of_2(N), 1024), "num_warps": 8, "num_stages": 2}
+    grid = lambda meta: (triton.cdiv(N, meta["BLOCK_N"]), NUM_SMS)
+    _dyt_bwd_kernel[grid](dy, dx, da, dg, db, x, alpha, gamma, HAVE_BETA, M, N, **kwargs)
+    if HAVE_BETA:
+        db = db.sum(0).to(x.dtype)
+    dg = dg.sum(0).to(gamma.dtype)
+    da = da.sum().to(x.dtype).unsqueeze(0)
+    return dx.view(input_shape), da, dg, db
 
 
 class LigerDyTFunction(torch.autograd.Function):
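For orientation, a plain-PyTorch reference of what the rewritten forward kernel computes, Dynamic Tanh (y = gamma * tanh(alpha * x), plus beta when it is provided); this is a sketch for sanity-checking the Triton path, not code from the package.

import torch

def dyt_reference(x, alpha, gamma, beta=None):
    # Eager-mode Dynamic Tanh matching _dyt_fwd_kernel:
    # y = gamma * tanh(alpha * x), with an optional additive beta.
    y = torch.tanh(alpha * x) * gamma
    if beta is not None:
        y = y + beta
    return y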
@@ -208,18 +146,12 @@ class LigerDyTFunction(torch.autograd.Function):
     @ensure_contiguous
     def forward(ctx, x, alpha, gamma, beta):
         y = liger_dyt_fwd(x, alpha, gamma, beta)
-        ctx.save_for_backward(x, alpha, gamma)
+        ctx.save_for_backward(x, alpha, gamma, beta)
         return y
 
     @staticmethod
     @ensure_contiguous
-    def backward(ctx,
-        x, alpha, gamma = ctx.saved_tensors
-        dx, dalpha, dgamma, dbeta = liger_dyt_bwd(
-
-            x,
-            alpha,
-            gamma,
-        )
-
-        return (dx, dalpha, dgamma, dbeta)
+    def backward(ctx, dy):
+        x, alpha, gamma, beta = ctx.saved_tensors
+        dx, dalpha, dgamma, dbeta = liger_dyt_bwd(dy, x, alpha, gamma, beta)
+        return dx, dalpha, dgamma, dbeta