liger-kernel 0.5.8__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. liger_kernel/chunked_loss/dpo_loss.py +8 -1
  2. liger_kernel/chunked_loss/fused_linear_preference.py +0 -1
  3. liger_kernel/chunked_loss/jsd_loss.py +2 -2
  4. liger_kernel/ops/cross_entropy.py +4 -1
  5. liger_kernel/ops/dyt.py +113 -179
  6. liger_kernel/ops/fused_linear_cross_entropy.py +4 -3
  7. liger_kernel/ops/grpo_loss.py +310 -0
  8. liger_kernel/ops/sparsemax.py +167 -0
  9. liger_kernel/transformers/__init__.py +11 -0
  10. liger_kernel/transformers/dyt.py +5 -3
  11. liger_kernel/transformers/fsdp.py +55 -0
  12. liger_kernel/transformers/functional.py +8 -0
  13. liger_kernel/transformers/fused_linear_cross_entropy.py +1 -2
  14. liger_kernel/transformers/grpo_loss.py +98 -0
  15. liger_kernel/transformers/model/gemma.py +8 -12
  16. liger_kernel/transformers/model/gemma2.py +8 -10
  17. liger_kernel/transformers/model/gemma3.py +3 -9
  18. liger_kernel/transformers/model/glm4.py +119 -0
  19. liger_kernel/transformers/model/llama.py +64 -15
  20. liger_kernel/transformers/model/llava.py +0 -8
  21. liger_kernel/transformers/model/mistral.py +8 -10
  22. liger_kernel/transformers/model/mixtral.py +8 -12
  23. liger_kernel/transformers/model/mllama.py +8 -11
  24. liger_kernel/transformers/model/olmo2.py +8 -10
  25. liger_kernel/transformers/model/paligemma.py +0 -8
  26. liger_kernel/transformers/model/phi3.py +8 -12
  27. liger_kernel/transformers/model/qwen2.py +8 -12
  28. liger_kernel/transformers/model/qwen2_5_vl.py +3 -7
  29. liger_kernel/transformers/model/qwen2_vl.py +3 -7
  30. liger_kernel/transformers/model/qwen3.py +112 -0
  31. liger_kernel/transformers/model/qwen3_moe.py +128 -0
  32. liger_kernel/transformers/monkey_patch.py +243 -13
  33. liger_kernel/transformers/sparsemax.py +16 -0
  34. liger_kernel/transformers/swiglu.py +21 -0
  35. liger_kernel/transformers/trainer/orpo_trainer.py +1 -53
  36. liger_kernel/utils.py +11 -0
  37. {liger_kernel-0.5.8.dist-info → liger_kernel-0.5.10.dist-info}/METADATA +36 -20
  38. {liger_kernel-0.5.8.dist-info → liger_kernel-0.5.10.dist-info}/RECORD +42 -34
  39. {liger_kernel-0.5.8.dist-info → liger_kernel-0.5.10.dist-info}/WHEEL +1 -1
  40. {liger_kernel-0.5.8.dist-info → liger_kernel-0.5.10.dist-info}/licenses/LICENSE +0 -0
  41. {liger_kernel-0.5.8.dist-info → liger_kernel-0.5.10.dist-info}/licenses/NOTICE +0 -0
  42. {liger_kernel-0.5.8.dist-info → liger_kernel-0.5.10.dist-info}/top_level.txt +0 -0
liger_kernel/chunked_loss/dpo_loss.py CHANGED
@@ -68,6 +68,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         compute_nll_loss=False,
         compiled=True,
         use_ref_model=True,
+        average_log_prob=False,
         chunk_size=1,
     ):
         """
@@ -85,6 +86,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
             compute_nll_loss (bool): Whether to compute the NLL loss
             compiled (bool): Whether to use torch compile
             use_ref_model (bool): Whether to use a reference model
+            average_log_prob (bool): Whether to average the log probability per non-masked token
             chunk_size (int): Size of chunks for processing.
         Returns:
             torch.Tensor: Computed loss
@@ -104,13 +106,14 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
             ref_input=ref_input,
             ref_weight=ref_weight,
             ref_bias=ref_bias,
+            average_log_prob=average_log_prob,
             chunk_size=chunk_size,
         )
 
     @staticmethod
     def backward(ctx, *grad_output):
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-        return *grads, None, None, None, None, None, None, None, None, None
+        return *grads, None, None, None, None, None, None, None, None, None, None
 
 
 class LigerFusedLinearDPOLoss(torch.nn.Module):
@@ -125,6 +128,7 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
         compute_nll_loss: bool = False,
         compiled: bool = True,
         use_ref_model: bool = True,
+        average_log_prob: bool = False,
         chunk_size: int = 1,
     ):
         """
@@ -134,6 +138,7 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
             compute_nll_loss (bool): Whether to compute the NLL loss.
             compiled (bool): Whether to use the torch compiled kernel.
             use_ref_model (bool): Whether to use a reference model for the DPO loss.
+            average_log_prob (bool): Whether to average the log probability per non-masked token.
             chunk_size (int): Size of chunks for processing.
         """
         super().__init__()
@@ -142,6 +147,7 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
         self.compute_nll_loss = compute_nll_loss
         self.compiled = compiled
         self.use_ref_model = use_ref_model
+        self.average_log_prob = average_log_prob
         self.chunk_size = chunk_size
 
     def forward(
@@ -167,5 +173,6 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
             self.compute_nll_loss,
             self.compiled,
             self.use_ref_model,
+            self.average_log_prob,
             self.chunk_size,
         )
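The new average_log_prob flag threads from the module constructor down through LigerFusedLinearDPOFunction.apply. A minimal construction sketch using only keyword arguments visible in this diff (the values chosen are illustrative, and any other constructor arguments keep their defaults):

from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOLoss

# Illustrative configuration; every keyword below appears in the diff above.
dpo_loss = LigerFusedLinearDPOLoss(
    compute_nll_loss=False,
    compiled=True,
    use_ref_model=True,
    average_log_prob=True,  # new relative to 0.5.8: average log-probs over non-masked tokens
    chunk_size=1,
)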
liger_kernel/chunked_loss/fused_linear_preference.py CHANGED
@@ -222,7 +222,6 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             (_ref_chosen_input_chunks if use_ref_model else [None] * len(_chosen_input_chunks)),
             (_ref_rejected_input_chunks if use_ref_model else [None] * len(_rejected_input_chunks)),
             (_chosen_nll_target_chunks if nll_target is not None else [None] * len(_chosen_input_chunks)),
-            strict=False,
         ):
             input_chunk = torch.cat([chosen_input_chunk, rejected_input_chunk], dim=0)
             ref_input_chunk = (
liger_kernel/chunked_loss/jsd_loss.py CHANGED
@@ -150,8 +150,8 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
         teacher_input: torch.Tensor,
         teacher_weight: torch.Tensor,
         true_labels: torch.LongTensor,
-        student_bias: torch.Tensor,
-        teacher_bias: torch.Tensor,
+        student_bias: torch.Tensor = None,
+        teacher_bias: torch.Tensor = None,
     ) -> torch.Tensor:
         """
         Compute the JSD distillation loss.
liger_kernel/ops/cross_entropy.py CHANGED
@@ -351,7 +351,10 @@ def cross_entropy_backward(_input, grad_output):
     # If cross entropy is the last layer, grad_output is 1.0. Skip the mul to save time
     if torch.equal(grad_output, torch.tensor(1.0, device=grad_output.device)):
         pass
-
+    # If reduction is 'none'
+    elif grad_output.ndim > 0:
+        _input = _input * grad_output.unsqueeze(dim=1)
+    # If reduction is ['mean', 'sum'], grad_output is just a scalar
     # We use a Triton kernel instead of a PyTorch operation because modifying inputs in-place
     # for gradient storage and backward multiple times causes anomalies with PyTorch but not with Triton.
     else:
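The new elif branch covers reduction='none', where grad_output carries one value per token rather than a scalar. A small standalone sketch of the broadcast it performs (shapes are illustrative; in the kernel, _input is the logits buffer reused to store per-token gradients):

import torch

num_tokens, vocab_size = 4, 10
_input = torch.randn(num_tokens, vocab_size)   # per-token, per-class gradients
grad_output = torch.randn(num_tokens)          # per-token upstream gradients (reduction='none')

# Same operation as the added branch: scale each row by its upstream gradient.
scaled = _input * grad_output.unsqueeze(dim=1)
assert scaled.shape == (num_tokens, vocab_size)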
liger_kernel/ops/dyt.py CHANGED
@@ -4,7 +4,8 @@ import torch
 import triton
 import triton.language as tl
 
-from liger_kernel.ops.utils import calculate_settings
+from triton.language.extra.libdevice import tanh
+
 from liger_kernel.ops.utils import compare_version
 from liger_kernel.ops.utils import ensure_contiguous
 from liger_kernel.ops.utils import infer_device
@@ -20,187 +21,126 @@ else:
     from triton.language.math import tanh
 
 
+# @triton.autotune([triton.Config({"BLOCK_N":bn}, num_stages=ns, num_warps=nw)
+#                   for bn in [1024, 2048, 4096]
+#                   for ns in [1,2,4]
+#                   for nw in [4, 8, 16, 32]
+#                   ],
+#                  key=['N'])
 @triton.jit
-def _dyt_fwd_kernel(
-    x_ptr,
-    x_row_stride,
-    alpha_ptr,
-    gamma_ptr,
-    beta_ptr,
-    y_ptr,
-    y_row_stride,
-    n_cols,
-    BLOCK_SIZE: tl.constexpr,
-):
-    """
-    Reference:
-    https://arxiv.org/abs/2503.10622
-
-    Shapes:
-        - x: (BT, C)
-        - alpha: (1)
-        - gamma: (C)
-        - beta: (C)
-    """
-    row_idx = tl.program_id(0)
-    offsets = tl.arange(0, BLOCK_SIZE)
-    mask = offsets < n_cols
-
-    x_ptr += row_idx * x_row_stride
-    y_ptr += row_idx * y_row_stride
-
-    alpha = tl.load(alpha_ptr)
-    gamma = tl.load(gamma_ptr + offsets, mask=mask)
-    beta = tl.load(beta_ptr + offsets, mask=mask)
-    x = tl.load(x_ptr + offsets, mask=mask)
-    y = gamma * tanh((alpha * x).cast(tl.float32)) + beta
-    tl.store(y_ptr + offsets, y, mask=mask)
+def _dyt_fwd_kernel(X, Y, Alpha, Gamma, Beta, HAVE_BETA: tl.constexpr, N: tl.constexpr, BLOCK_N: tl.constexpr = 1024):
+    col = tl.cast(tl.program_id(0), tl.int64) * BLOCK_N + tl.arange(0, BLOCK_N)
+    mask = col < N
+    row_id = tl.cast(tl.program_id(1), tl.int64)
+
+    X += row_id * N
+    Y += row_id * N
+    alpha = tl.load(Alpha).to(tl.float32)
+
+    gamma = tl.load(Gamma + col, mask=mask, other=0.0).to(tl.float32)
 
+    x = tl.load(X + col, mask=mask, other=0.0).to(tl.float32)
 
+    tanh_x = tanh(alpha * x)
+    y = tanh_x * gamma
+    if HAVE_BETA:
+        beta = tl.load(Beta + col, mask=mask, other=0.0).to(tl.float32)
+        y += beta
+    tl.store(Y + col, y, mask=mask)
+
+
+# @triton.autotune([triton.Config({"BLOCK_N":bn}, num_stages=ns, num_warps=nw)
+#                   for bn in [1024, 2048, 4096]
+#                   for ns in [1,2,4]
+#                   for nw in [4, 8, 16]
+#                   ],
+#                  key=['N'])
 @triton.jit
 def _dyt_bwd_kernel(
-    x_ptr,
-    x_row_stride,
-    dy_ptr,
-    dy_row_stride,
-    dx_ptr,
-    dx_row_stride,
-    alpha_ptr,
-    dalpha_ptr,
-    gamma_ptr,
-    dgamma_ptr,
-    dgamma_row_stride,
-    n_cols,
-    n_rows,
-    ROWS_PER_PROGRAM: tl.constexpr,
-    BLOCK_SIZE: tl.constexpr,
+    DY, DX, DA, DG, DB, X, Alpha, Gamma, HAVE_BETA: tl.constexpr, M, N: tl.constexpr, BLOCK_N: tl.constexpr = 1024
 ):
-    """
-    Reference:
-    https://arxiv.org/abs/2503.10622
-
-    Shapes:
-        - x: (BT, C)
-        - alpha: (1)
-        - gamma: (C)
-        - dx: (BT, C)
-        - dy: (BT, C)
-        - dgamma: (sm_count, C)
-        - dalpha: (sm_count,)
-    """
-    # d(gamma * tanh(alpha * x) + beta) / dx
-    # = gamma * (1 - tanh^2(alpha * x)) * alpha
-    # d(gamma * tanh(alpha * x) + beta) / dalpha
-    # = gamma * (1 - tanh^2(alpha * x)) * x
-    # d(gamma * tanh(alpha * x) + beta) / dgamma
-    # = tanh(alpha * x)
-    # d(gamma * tanh(alpha * x)) / dbeta = 1
-    pid = tl.program_id(0)
-
-    row_start = pid * ROWS_PER_PROGRAM
-    row_end = min((pid + 1) * ROWS_PER_PROGRAM, n_rows)
-    offsets = tl.arange(0, BLOCK_SIZE)
-    mask = offsets < n_cols
-
-    dalpha = 0.0
-    dgamma = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
-
-    x_ptr += row_start * x_row_stride
-    dx_ptr += row_start * dx_row_stride
-    dy_ptr += row_start * dy_row_stride
-    alpha = tl.load(alpha_ptr)
-    gamma = tl.load(gamma_ptr + offsets, mask=mask, other=0.0)
-
-    for _ in tl.range(row_start, row_end):
-        dy = tl.load(dy_ptr + offsets, mask=mask, other=0.0)
-        x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
-        tanh_ax = tanh((alpha * x).cast(tl.float32))
-        sech2_ax = 1 - tanh_ax * tanh_ax
-
-        dx = dy * gamma * sech2_ax * alpha
-        dalpha += tl.sum(dy * gamma * sech2_ax * x)
-        dgamma += dy * tanh_ax
-        tl.store(dx_ptr + offsets, dx, mask=mask)
-
-        dy_ptr += dy_row_stride
-        x_ptr += x_row_stride
-        dx_ptr += dx_row_stride
-
-    tl.store(dgamma_ptr + pid * dgamma_row_stride + offsets, dgamma, mask=mask)
-    tl.store(dalpha_ptr + pid, dalpha)
-
-    pass
+    col = tl.cast(tl.program_id(0), tl.int64) * BLOCK_N + tl.arange(0, BLOCK_N)
+    mask = col < N
+    start_row_id = tl.cast(tl.program_id(1), tl.int64)
+
+    alpha = tl.load(Alpha).to(tl.float32)
+    da = 0.0
+    gamma = tl.load(Gamma + col, mask=mask, other=0.0).to(tl.float32)
+    dg = tl.zeros((BLOCK_N,), dtype=tl.float32)
+    if HAVE_BETA:
+        db = tl.zeros((BLOCK_N,), dtype=tl.float32)
+    for row_id in range(start_row_id, M, tl.num_programs(1)):
+        x = tl.load(X + row_id * N + col, mask=mask, other=0.0).to(tl.float32)
+        dy = tl.load(DY + row_id * N + col, mask=mask, other=0.0).to(tl.float32)
+        tanh_x = tanh(alpha * x)
+        if HAVE_BETA:
+            db += dy
+        dg += dy * tanh_x
+        tmp = (1 - tanh_x * tanh_x) * dy * gamma
+        da += tl.sum(x * tmp, 0)
+        dx = alpha * tmp
+        tl.store(DX + row_id * N + col, dx, mask=mask)
+
+    tl.store(DG + start_row_id * N + col, dg, mask=mask)
+    if HAVE_BETA:
+        tl.store(DB + start_row_id * N + col, db, mask=mask)
+    tl.store(DA + start_row_id * tl.cdiv(N, 512) + tl.program_id(0), da)
 
 
 def liger_dyt_fwd(x, alpha, gamma, beta):
-    shape = x.shape
-    dim = shape[-1]
-    x = x.view(-1, dim)
-    n_rows, n_cols = x.shape
+    assert x.is_contiguous()
+    HAVE_BETA = True if beta is not None else False
+    input_shape = x.shape
+    x = x.view(-1, input_shape[-1])
+    M, N = x.shape
+
     y = torch.empty_like(x)
-    BLOCK_SIZE, num_warps = calculate_settings(n_cols)
-    _dyt_fwd_kernel[(n_rows,)](
-        x_ptr=x,
-        alpha_ptr=alpha,
-        gamma_ptr=gamma,
-        beta_ptr=beta,
-        y_ptr=y,
-        x_row_stride=x.stride(0),
-        y_row_stride=y.stride(0),
-        n_cols=n_cols,
-        BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=num_warps,
+
+    if N >= 4096:
+        kwargs = {"BLOCK_N": min(triton.next_power_of_2(N), 2048), "num_warps": 4, "num_stages": 1}
+    else:
+        kwargs = {"BLOCK_N": min(triton.next_power_of_2(N), 1024), "num_warps": 4, "num_stages": 1}
+
+    grid = lambda meta: (triton.cdiv(N, meta["BLOCK_N"]), M)
+    _dyt_fwd_kernel[(grid)](
+        x,
+        y,
+        alpha,
+        gamma,
+        beta,
+        HAVE_BETA,
+        N,
+        **kwargs,
     )
-    return y.view(*shape)
-
-
-def liger_dyt_bwd(dy, x, alpha, gamma):
-    shape = dy.shape
-    dtype = x.dtype
-    dim = shape[-1]
-    dy = dy.view(-1, dim)
-    x = x.view(-1, dim)
-    n_rows, n_cols = dy.shape
-    BLOCK_SIZE, num_warps = calculate_settings(n_cols)
-    sm_count = 1
+    return y.view(input_shape)
+
+
+def liger_dyt_bwd(dy, x, alpha, gamma, beta):
+    assert dy.is_contiguous()
+    input_shape = x.shape
+    x = x.view(-1, input_shape[-1])
+    M, N = x.shape
+    HAVE_BETA = True if beta is not None else False
+
     device = infer_device()
     if device == "cuda":
-        sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
+        NUM_SMS = torch.cuda.get_device_properties(x.device).multi_processor_count
     elif device == "xpu":
-        sm_count = torch.xpu.get_device_properties(x.device).gpu_subslice_count
-    if n_cols > BLOCK_SIZE:
-        raise RuntimeError(
-            f"Feature dimension {dim} exceeds maximum supported size of {BLOCK_SIZE}. Consider using a smaller feature dimension."
-        )
-
-    dx = torch.empty_like(x, dtype=torch.float32)
-    _dalpha = torch.empty((sm_count,), dtype=torch.float32, device=x.device)
-    _dgamma = torch.empty((sm_count, n_cols), dtype=torch.float32, device=x.device)
-
-    grid = (sm_count,)
-    rows_per_program = triton.cdiv(n_rows, sm_count)
-    _dyt_bwd_kernel[grid](
-        x_ptr=x,
-        x_row_stride=x.stride(0),
-        dy_ptr=dy,
-        dy_row_stride=dy.stride(0),
-        dx_ptr=dx,
-        dx_row_stride=dx.stride(0),
-        alpha_ptr=alpha,
-        dalpha_ptr=_dalpha,
-        gamma_ptr=gamma,
-        dgamma_ptr=_dgamma,
-        dgamma_row_stride=_dgamma.stride(0),
-        n_cols=n_cols,
-        n_rows=n_rows,
-        ROWS_PER_PROGRAM=rows_per_program,
-        BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=num_warps,
-    )
-    dalpha = _dalpha.sum(dim=0, keepdim=True).to(dtype)
-    dgamma = _dgamma.sum(dim=0).to(dtype)
-    dbeta = dy.sum(dim=0).to(dtype)
-    return dx.view(*shape), dalpha, dgamma, dbeta
+        NUM_SMS = torch.xpu.get_device_properties(x.device).gpu_subslice_count
+
+    da = torch.zeros(NUM_SMS, triton.cdiv(N, 512), dtype=torch.float32, device=x.device)
+    dg = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device)
+    db = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device) if HAVE_BETA else None
+    dx = torch.empty_like(dy)
+
+    kwargs = {"BLOCK_N": min(triton.next_power_of_2(N), 1024), "num_warps": 8, "num_stages": 2}
+    grid = lambda meta: (triton.cdiv(N, meta["BLOCK_N"]), NUM_SMS)
+    _dyt_bwd_kernel[grid](dy, dx, da, dg, db, x, alpha, gamma, HAVE_BETA, M, N, **kwargs)
+    if HAVE_BETA:
+        db = db.sum(0).to(x.dtype)
+    dg = dg.sum(0).to(gamma.dtype)
+    da = da.sum().to(x.dtype).unsqueeze(0)
+    return dx.view(input_shape), da, dg, db
 
 
 class LigerDyTFunction(torch.autograd.Function):
@@ -208,18 +148,12 @@ class LigerDyTFunction(torch.autograd.Function):
     @ensure_contiguous
     def forward(ctx, x, alpha, gamma, beta):
         y = liger_dyt_fwd(x, alpha, gamma, beta)
-        ctx.save_for_backward(x, alpha, gamma)
+        ctx.save_for_backward(x, alpha, gamma, beta)
         return y
 
     @staticmethod
     @ensure_contiguous
-    def backward(ctx, grad_output):
-        x, alpha, gamma = ctx.saved_tensors
-        dx, dalpha, dgamma, dbeta = liger_dyt_bwd(
-            grad_output,
-            x,
-            alpha,
-            gamma,
-        )
-
-        return (dx, dalpha, dgamma, dbeta)
+    def backward(ctx, dy):
+        x, alpha, gamma, beta = ctx.saved_tensors
+        dx, dalpha, dgamma, dbeta = liger_dyt_bwd(dy, x, alpha, gamma, beta)
+        return dx, dalpha, dgamma, dbeta
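Both the old and new kernels compute Dynamic Tanh (DyT, https://arxiv.org/abs/2503.10622): y = gamma * tanh(alpha * x) + beta, with beta now optional via the HAVE_BETA flag. A minimal eager-mode reference the Triton kernels could be checked against (the function name and shapes are illustrative, not part of the package):

import torch

def dyt_reference(x, alpha, gamma, beta=None):
    # Elementwise tanh scaled by a learnable scalar alpha, followed by an
    # affine transform over the last dimension; beta is optional, mirroring
    # the HAVE_BETA flag in the kernels above.
    y = torch.tanh(alpha * x) * gamma
    if beta is not None:
        y = y + beta
    return y

x = torch.randn(8, 256)      # (rows, hidden)
alpha = torch.tensor(0.5)    # scalar parameter
gamma = torch.ones(256)      # (hidden,)
out = dyt_reference(x, alpha, gamma, beta=torch.zeros(256))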
liger_kernel/ops/fused_linear_cross_entropy.py CHANGED
@@ -143,9 +143,10 @@ def fused_linear_cross_entropy_forward(
             alpha=1.0,
         )
 
-        if reduction == "none":
-            loss = loss_1d
-            z_loss = z_loss_1d if return_z_loss else None
+        # Need extra calculations for backward if reduction=='none'. Not supporting reduction='none' now.
+        # if reduction == "none":
+        #     loss = loss_1d
+        #     z_loss = z_loss_1d if return_z_loss else None
 
         else:
             loss = torch.sum(loss_1d)
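For orientation, the fused forward computes the same quantity as projecting hidden states through the lm_head weight and applying cross entropy, without materializing the full logits tensor; with the branch above commented out, only the scalar reductions ('mean', 'sum') remain supported. A rough unfused sketch under those assumptions (names and shapes are illustrative):

import torch
import torch.nn.functional as F

hidden = torch.randn(8, 64)             # (num_tokens, hidden_size)
weight = torch.randn(100, 64)           # (vocab_size, hidden_size) lm_head weight
target = torch.randint(0, 100, (8,))    # (num_tokens,)

# Unfused equivalent: materialize logits, then reduce. The fused kernel avoids
# allocating the (num_tokens, vocab_size) logits tensor; reduction='none' is
# not supported on the fused path after this change.
logits = hidden @ weight.T
loss = F.cross_entropy(logits, target, reduction="mean")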