PyPI - liger-kernel - Versions diffs - 0.5.9__py3-none-any.whl → 0.5.10__py3-none-any.whl - Mend

liger-kernel 0.5.9__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

liger_kernel/chunked_loss/dpo_loss.py +1 -1
liger_kernel/chunked_loss/fused_linear_preference.py +0 -1
liger_kernel/chunked_loss/jsd_loss.py +2 -2
liger_kernel/ops/dyt.py +113 -179
liger_kernel/ops/grpo_loss.py +310 -0
liger_kernel/ops/sparsemax.py +167 -0
liger_kernel/transformers/__init__.py +5 -0
liger_kernel/transformers/dyt.py +5 -3
liger_kernel/transformers/fsdp.py +55 -0
liger_kernel/transformers/functional.py +8 -0
liger_kernel/transformers/grpo_loss.py +98 -0
liger_kernel/transformers/model/gemma.py +0 -8
liger_kernel/transformers/model/gemma2.py +0 -6
liger_kernel/transformers/model/gemma3.py +0 -8
liger_kernel/transformers/model/glm4.py +0 -6
liger_kernel/transformers/model/llama.py +56 -11
liger_kernel/transformers/model/llava.py +0 -8
liger_kernel/transformers/model/mistral.py +0 -6
liger_kernel/transformers/model/mixtral.py +0 -8
liger_kernel/transformers/model/mllama.py +0 -7
liger_kernel/transformers/model/olmo2.py +0 -6
liger_kernel/transformers/model/paligemma.py +0 -8
liger_kernel/transformers/model/phi3.py +0 -8
liger_kernel/transformers/model/qwen2.py +0 -8
liger_kernel/transformers/model/qwen2_5_vl.py +0 -6
liger_kernel/transformers/model/qwen2_vl.py +0 -6
liger_kernel/transformers/model/qwen3.py +0 -6
liger_kernel/transformers/model/qwen3_moe.py +128 -0
liger_kernel/transformers/monkey_patch.py +122 -13
liger_kernel/transformers/sparsemax.py +16 -0
liger_kernel/transformers/swiglu.py +21 -0
liger_kernel/transformers/trainer/orpo_trainer.py +1 -53
liger_kernel/utils.py +11 -0
{liger_kernel-0.5.9.dist-info → liger_kernel-0.5.10.dist-info}/METADATA +34 -20
{liger_kernel-0.5.9.dist-info → liger_kernel-0.5.10.dist-info}/RECORD +39 -33
{liger_kernel-0.5.9.dist-info → liger_kernel-0.5.10.dist-info}/WHEEL +1 -1
{liger_kernel-0.5.9.dist-info → liger_kernel-0.5.10.dist-info}/licenses/LICENSE +0 -0
{liger_kernel-0.5.9.dist-info → liger_kernel-0.5.10.dist-info}/licenses/NOTICE +0 -0
{liger_kernel-0.5.9.dist-info → liger_kernel-0.5.10.dist-info}/top_level.txt +0 -0

liger_kernel/chunked_loss/dpo_loss.py CHANGED Viewed

@@ -128,7 +128,7 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
         compute_nll_loss: bool = False,
         compiled: bool = True,
         use_ref_model: bool = True,
-        average_log_prob: bool = True,
+        average_log_prob: bool = False,
         chunk_size: int = 1,
     ):
         """

liger_kernel/chunked_loss/fused_linear_preference.py CHANGED Viewed

@@ -222,7 +222,6 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             (_ref_chosen_input_chunks if use_ref_model else [None] * len(_chosen_input_chunks)),
             (_ref_rejected_input_chunks if use_ref_model else [None] * len(_rejected_input_chunks)),
             (_chosen_nll_target_chunks if nll_target is not None else [None] * len(_chosen_input_chunks)),
-            strict=False,
         ):
             input_chunk = torch.cat([chosen_input_chunk, rejected_input_chunk], dim=0)
             ref_input_chunk = (

liger_kernel/chunked_loss/jsd_loss.py CHANGED Viewed

@@ -150,8 +150,8 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
         teacher_input: torch.Tensor,
         teacher_weight: torch.Tensor,
         true_labels: torch.LongTensor,
-        student_bias: torch.Tensor,
-        teacher_bias: torch.Tensor,
+        student_bias: torch.Tensor = None,
+        teacher_bias: torch.Tensor = None,
     ) -> torch.Tensor:
         """
         Compute the JSD distillation loss.

liger_kernel/ops/dyt.py CHANGED Viewed

@@ -4,7 +4,8 @@ import torch
 import triton
 import triton.language as tl
-from liger_kernel.ops.utils import calculate_settings
+from triton.language.extra.libdevice import tanh
 from liger_kernel.ops.utils import compare_version
 from liger_kernel.ops.utils import ensure_contiguous
 from liger_kernel.ops.utils import infer_device
@@ -20,187 +21,126 @@ else:
     from triton.language.math import tanh
+# @triton.autotune([triton.Config({"BLOCK_N":bn}, num_stages=ns, num_warps=nw)
+#                   for bn in [1024, 2048, 4096]
+#                   for ns in [1,2,4]
+#                   for nw in [4, 8, 16, 32]
+#                   ],
+#                   key=['N'])
 @triton.jit
-def _dyt_fwd_kernel(
-    x_ptr,
-    x_row_stride,
-    alpha_ptr,
-    gamma_ptr,
-    beta_ptr,
-    y_ptr,
-    y_row_stride,
-    n_cols,
-    BLOCK_SIZE: tl.constexpr,
-):
-    """
-    Reference:
-    https://arxiv.org/abs/2503.10622
-    Shapes:
-        - x: (BT, C)
-        - alpha: (1)
-        - gamma: (C)
-        - beta: (C)
-    """
-    row_idx = tl.program_id(0)
-    offsets = tl.arange(0, BLOCK_SIZE)
-    mask = offsets < n_cols
-    x_ptr += row_idx * x_row_stride
-    y_ptr += row_idx * y_row_stride
-    alpha = tl.load(alpha_ptr)
-    gamma = tl.load(gamma_ptr + offsets, mask=mask)
-    beta = tl.load(beta_ptr + offsets, mask=mask)
-    x = tl.load(x_ptr + offsets, mask=mask)
-    y = gamma * tanh((alpha * x).cast(tl.float32)) + beta
-    tl.store(y_ptr + offsets, y, mask=mask)
+def _dyt_fwd_kernel(X, Y, Alpha, Gamma, Beta, HAVE_BETA: tl.constexpr, N: tl.constexpr, BLOCK_N: tl.constexpr = 1024):
+    col = tl.cast(tl.program_id(0), tl.int64) * BLOCK_N + tl.arange(0, BLOCK_N)
+    mask = col < N
+    row_id = tl.cast(tl.program_id(1), tl.int64)
+    X += row_id * N
+    Y += row_id * N
+    alpha = tl.load(Alpha).to(tl.float32)
+    gamma = tl.load(Gamma + col, mask=mask, other=0.0).to(tl.float32)
+    x = tl.load(X + col, mask=mask, other=0.0).to(tl.float32)
+    tanh_x = tanh(alpha * x)
+    y = tanh_x * gamma
+    if HAVE_BETA:
+        beta = tl.load(Beta + col, mask=mask, other=0.0).to(tl.float32)
+        y += beta
+    tl.store(Y + col, y, mask=mask)
+# @triton.autotune([triton.Config({"BLOCK_N":bn}, num_stages=ns, num_warps=nw)
+#                   for bn in [1024, 2048, 4096]
+#                   for ns in [1,2,4]
+#                   for nw in [4, 8, 16]
+#                   ],
+#                   key=['N'])
 @triton.jit
 def _dyt_bwd_kernel(
-    x_ptr,
-    x_row_stride,
-    dy_ptr,
-    dy_row_stride,
-    dx_ptr,
-    dx_row_stride,
-    alpha_ptr,
-    dalpha_ptr,
-    gamma_ptr,
-    dgamma_ptr,
-    dgamma_row_stride,
-    n_cols,
-    n_rows,
-    ROWS_PER_PROGRAM: tl.constexpr,
-    BLOCK_SIZE: tl.constexpr,
+    DY, DX, DA, DG, DB, X, Alpha, Gamma, HAVE_BETA: tl.constexpr, M, N: tl.constexpr, BLOCK_N: tl.constexpr = 1024
 ):
-    """
-    Reference:
-    https://arxiv.org/abs/2503.10622
-    Shapes:
-        - x: (BT, C)
-        - alpha: (1)
-        - gamma: (C)
-        - dx: (BT, C)
-        - dy: (BT, C)
-        - dgamma: (sm_count, C)
-        - dalpha: (sm_count,)
-    """
-    # d(gamma * tanh(alpha * x) + beta) / dx
-    # = gamma * (1 - tanh^2(alpha * x)) * alpha
-    # d(gamma * tanh(alpha * x) + beta) / dalpha
-    # = gamma * (1 - tanh^2(alpha * x)) * x
-    # d(gamma * tanh(alpha * x) + beta) / dgamma
-    # = tanh(alpha * x)
-    # d(gamma * tanh(alpha * x)) / dbeta = 1
-    pid = tl.program_id(0)
-    row_start = pid * ROWS_PER_PROGRAM
-    row_end = min((pid + 1) * ROWS_PER_PROGRAM, n_rows)
-    offsets = tl.arange(0, BLOCK_SIZE)
-    mask = offsets < n_cols
-    dalpha = 0.0
-    dgamma = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
-    x_ptr += row_start * x_row_stride
-    dx_ptr += row_start * dx_row_stride
-    dy_ptr += row_start * dy_row_stride
-    alpha = tl.load(alpha_ptr)
-    gamma = tl.load(gamma_ptr + offsets, mask=mask, other=0.0)
-    for _ in tl.range(row_start, row_end):
-        dy = tl.load(dy_ptr + offsets, mask=mask, other=0.0)
-        x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
-        tanh_ax = tanh((alpha * x).cast(tl.float32))
-        sech2_ax = 1 - tanh_ax * tanh_ax
-        dx = dy * gamma * sech2_ax * alpha
-        dalpha += tl.sum(dy * gamma * sech2_ax * x)
-        dgamma += dy * tanh_ax
-        tl.store(dx_ptr + offsets, dx, mask=mask)
-        dy_ptr += dy_row_stride
-        x_ptr += x_row_stride
-        dx_ptr += dx_row_stride
-    tl.store(dgamma_ptr + pid * dgamma_row_stride + offsets, dgamma, mask=mask)
-    tl.store(dalpha_ptr + pid, dalpha)
-    pass
+    col = tl.cast(tl.program_id(0), tl.int64) * BLOCK_N + tl.arange(0, BLOCK_N)
+    mask = col < N
+    start_row_id = tl.cast(tl.program_id(1), tl.int64)
+    alpha = tl.load(Alpha).to(tl.float32)
+    da = 0.0
+    gamma = tl.load(Gamma + col, mask=mask, other=0.0).to(tl.float32)
+    dg = tl.zeros((BLOCK_N,), dtype=tl.float32)
+    if HAVE_BETA:
+        db = tl.zeros((BLOCK_N,), dtype=tl.float32)
+    for row_id in range(start_row_id, M, tl.num_programs(1)):
+        x = tl.load(X + row_id * N + col, mask=mask, other=0.0).to(tl.float32)
+        dy = tl.load(DY + row_id * N + col, mask=mask, other=0.0).to(tl.float32)
+        tanh_x = tanh(alpha * x)
+        if HAVE_BETA:
+            db += dy
+        dg += dy * tanh_x
+        tmp = (1 - tanh_x * tanh_x) * dy * gamma
+        da += tl.sum(x * tmp, 0)
+        dx = alpha * tmp
+        tl.store(DX + row_id * N + col, dx, mask=mask)
+    tl.store(DG + start_row_id * N + col, dg, mask=mask)
+    if HAVE_BETA:
+        tl.store(DB + start_row_id * N + col, db, mask=mask)
+    tl.store(DA + start_row_id * tl.cdiv(N, 512) + tl.program_id(0), da)
 def liger_dyt_fwd(x, alpha, gamma, beta):
-    shape = x.shape
-    dim = shape[-1]
-    x = x.view(-1, dim)
-    n_rows, n_cols = x.shape
+    assert x.is_contiguous()
+    HAVE_BETA = True if beta is not None else False
+    input_shape = x.shape
+    x = x.view(-1, input_shape[-1])
+    M, N = x.shape
     y = torch.empty_like(x)
-    BLOCK_SIZE, num_warps = calculate_settings(n_cols)
-    _dyt_fwd_kernel[(n_rows,)](
-        x_ptr=x,
-        alpha_ptr=alpha,
-        gamma_ptr=gamma,
-        beta_ptr=beta,
-        y_ptr=y,
-        x_row_stride=x.stride(0),
-        y_row_stride=y.stride(0),
-        n_cols=n_cols,
-        BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=num_warps,
+    if N >= 4096:
+        kwargs = {"BLOCK_N": min(triton.next_power_of_2(N), 2048), "num_warps": 4, "num_stages": 1}
+    else:
+        kwargs = {"BLOCK_N": min(triton.next_power_of_2(N), 1024), "num_warps": 4, "num_stages": 1}
+    grid = lambda meta: (triton.cdiv(N, meta["BLOCK_N"]), M)
+    _dyt_fwd_kernel[(grid)](
+        x,
+        y,
+        alpha,
+        gamma,
+        beta,
+        HAVE_BETA,
+        N,
+        **kwargs,
     )
-    return y.view(*shape)
-def liger_dyt_bwd(dy, x, alpha, gamma):
-    shape = dy.shape
-    dtype = x.dtype
-    dim = shape[-1]
-    dy = dy.view(-1, dim)
-    x = x.view(-1, dim)
-    n_rows, n_cols = dy.shape
-    BLOCK_SIZE, num_warps = calculate_settings(n_cols)
-    sm_count = 1
+    return y.view(input_shape)
+def liger_dyt_bwd(dy, x, alpha, gamma, beta):
+    assert dy.is_contiguous()
+    input_shape = x.shape
+    x = x.view(-1, input_shape[-1])
+    M, N = x.shape
+    HAVE_BETA = True if beta is not None else False
     device = infer_device()
     if device == "cuda":
-        sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
+        NUM_SMS = torch.cuda.get_device_properties(x.device).multi_processor_count
     elif device == "xpu":
-        sm_count = torch.xpu.get_device_properties(x.device).gpu_subslice_count
-    if n_cols > BLOCK_SIZE:
-        raise RuntimeError(
-            f"Feature dimension {dim} exceeds maximum supported size of {BLOCK_SIZE}. Consider using a smaller feature dimension."
-        )
-    dx = torch.empty_like(x, dtype=torch.float32)
-    _dalpha = torch.empty((sm_count,), dtype=torch.float32, device=x.device)
-    _dgamma = torch.empty((sm_count, n_cols), dtype=torch.float32, device=x.device)
-    grid = (sm_count,)
-    rows_per_program = triton.cdiv(n_rows, sm_count)
-    _dyt_bwd_kernel[grid](
-        x_ptr=x,
-        x_row_stride=x.stride(0),
-        dy_ptr=dy,
-        dy_row_stride=dy.stride(0),
-        dx_ptr=dx,
-        dx_row_stride=dx.stride(0),
-        alpha_ptr=alpha,
-        dalpha_ptr=_dalpha,
-        gamma_ptr=gamma,
-        dgamma_ptr=_dgamma,
-        dgamma_row_stride=_dgamma.stride(0),
-        n_cols=n_cols,
-        n_rows=n_rows,
-        ROWS_PER_PROGRAM=rows_per_program,
-        BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=num_warps,
-    )
-    dalpha = _dalpha.sum(dim=0, keepdim=True).to(dtype)
-    dgamma = _dgamma.sum(dim=0).to(dtype)
-    dbeta = dy.sum(dim=0).to(dtype)
-    return dx.view(*shape), dalpha, dgamma, dbeta
+        NUM_SMS = torch.xpu.get_device_properties(x.device).gpu_subslice_count
+    da = torch.zeros(NUM_SMS, triton.cdiv(N, 512), dtype=torch.float32, device=x.device)
+    dg = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device)
+    db = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device) if HAVE_BETA else None
+    dx = torch.empty_like(dy)
+    kwargs = {"BLOCK_N": min(triton.next_power_of_2(N), 1024), "num_warps": 8, "num_stages": 2}
+    grid = lambda meta: (triton.cdiv(N, meta["BLOCK_N"]), NUM_SMS)
+    _dyt_bwd_kernel[grid](dy, dx, da, dg, db, x, alpha, gamma, HAVE_BETA, M, N, **kwargs)
+    if HAVE_BETA:
+        db = db.sum(0).to(x.dtype)
+    dg = dg.sum(0).to(gamma.dtype)
+    da = da.sum().to(x.dtype).unsqueeze(0)
+    return dx.view(input_shape), da, dg, db
 class LigerDyTFunction(torch.autograd.Function):
@@ -208,18 +148,12 @@ class LigerDyTFunction(torch.autograd.Function):
     @ensure_contiguous
     def forward(ctx, x, alpha, gamma, beta):
         y = liger_dyt_fwd(x, alpha, gamma, beta)
-        ctx.save_for_backward(x, alpha, gamma)
+        ctx.save_for_backward(x, alpha, gamma, beta)
         return y
     @staticmethod
     @ensure_contiguous
-    def backward(ctx, grad_output):
-        x, alpha, gamma = ctx.saved_tensors
-        dx, dalpha, dgamma, dbeta = liger_dyt_bwd(
-            grad_output,
-            x,
-            alpha,
-            gamma,
-        )
-        return (dx, dalpha, dgamma, dbeta)
+    def backward(ctx, dy):
+        x, alpha, gamma, beta = ctx.saved_tensors
+        dx, dalpha, dgamma, dbeta = liger_dyt_bwd(dy, x, alpha, gamma, beta)
+        return dx, dalpha, dgamma, dbeta

liger_kernel/ops/grpo_loss.py ADDED Viewed

@@ -0,0 +1,310 @@
+import torch
+import triton
+import triton.language as tl
+@triton.jit
+def _selective_log_softmax_kernel(
+    LOGITS,
+    INPUT_IDS,
+    LOG_P,
+    MASK,
+    TEMPERATURE,
+    stride_input_ids_b,
+    L: tl.constexpr,
+    N: tl.constexpr,
+    BLOCK_N: tl.constexpr = 4096,
+):
+    off_b = tl.program_id(0).cast(tl.int64)
+    off_l = tl.program_id(1).cast(tl.int64)
+    LOGITS += off_b * (L + 1) * N + off_l * N
+    INPUT_IDS += off_b * stride_input_ids_b + off_l
+    LOG_P += off_b * L + off_l
+    if MASK is not None:
+        MASK += off_b * stride_input_ids_b + off_l
+        not_skip = tl.load(MASK)
+        if not_skip == 0:
+            return
+    m_i = float("-inf")
+    l_i = 0.0
+    for start in range(0, N, BLOCK_N):
+        cols = start + tl.arange(0, BLOCK_N)
+        logits = tl.load(LOGITS + cols, mask=cols < N, other=float("-inf")).to(tl.float32) / TEMPERATURE
+        new_m_i = tl.maximum(m_i, tl.max(logits))
+        alpha = tl.exp(m_i - new_m_i)
+        l_i = l_i * alpha + tl.sum(tl.exp(logits - new_m_i))
+        m_i = new_m_i
+    lse = m_i + tl.log(l_i)
+    ids = tl.load(INPUT_IDS)
+    x = tl.load(LOGITS + ids).to(tl.float32) / TEMPERATURE
+    logp = x - lse
+    tl.store(LOG_P, logp)
+# compue old_logp and ref_logp, it reduce 10G peak Memory. it does not requires grad
+@torch.no_grad
+def fused_selective_log_softmax(logits: torch.Tensor, input_ids: torch.Tensor, temperature: float = 0.9, mask=None):
+    assert logits.is_contiguous()
+    B, L_ADD_1, N = logits.shape
+    L = L_ADD_1 - 1
+    input_ids = input_ids[:, -L:]
+    if mask is not None:
+        mask = mask[:, -L:]
+    log_p = torch.zeros(B, L, dtype=torch.float32, device=logits.device)
+    kwargs = {"BLOCK_N": 2048, "num_stages": 4, "num_warps": 1}
+    _selective_log_softmax_kernel[(B, L)](
+        logits, input_ids, log_p, mask, temperature, input_ids.stride(0), L, N, **kwargs
+    )
+    return log_p
+# @triton.autotune([triton.Config({"BLOCK_N":BLOCK_N}, num_stages=ns, num_warps=nw)
+#                   for BLOCK_N in [2048, 4096, 8192]
+#                   for ns in [1, 2, 4]
+#                   for nw in [1, 2, 4, 8, 16]],
+#                   key=['N'])
+@triton.jit
+def _grpo_loss_fwd_kernel(
+    LOGITS,
+    OLD_LOGP,
+    REF_LOGP,
+    INPUT_IDS,
+    COMPLETION_MASK,
+    ADVANTAGES,
+    LOSS,
+    LSE,
+    KL,
+    IS_CLIPPED,
+    TEMPERATURE,
+    BETA: tl.constexpr,
+    EPS_LOW,
+    EPS_HIGH,
+    L: tl.constexpr,
+    N: tl.constexpr,
+    BLOCK_N: tl.constexpr = 4096,
+):
+    off_b = tl.program_id(0).cast(tl.int64)
+    off_l = tl.program_id(1).cast(tl.int64)
+    if COMPLETION_MASK is not None:
+        COMPLETION_MASK += off_b * L + off_l
+        not_skip = tl.load(COMPLETION_MASK)
+        if not_skip == 0:
+            return
+    LOGITS += off_b * (L + 1) * N + off_l * N
+    INPUT_IDS += off_b * L + off_l
+    ADVANTAGES += off_b
+    LOSS += off_b * L + off_l
+    LSE += off_b * L + off_l
+    IS_CLIPPED += off_b * L + off_l
+    m_i = float("-inf")
+    l_i = 0.0
+    for start in range(0, N, BLOCK_N):
+        cols = start + tl.arange(0, BLOCK_N)
+        logits = tl.load(LOGITS + cols, mask=cols < N, other=float("-inf")).to(tl.float32) / TEMPERATURE
+        new_m_i = tl.maximum(m_i, tl.max(logits))
+        alpha = tl.exp(m_i - new_m_i)
+        l_i = l_i * alpha + tl.sum(tl.exp(logits - new_m_i))
+        m_i = new_m_i
+    lse = m_i + tl.log(l_i)
+    idx = tl.load(INPUT_IDS)
+    x = tl.load(LOGITS + idx).to(tl.float32) / TEMPERATURE
+    logp = x - lse
+    if OLD_LOGP is None:
+        old_logp = logp
+    else:
+        OLD_LOGP += off_b * L + off_l
+        old_logp = tl.load(OLD_LOGP).to(tl.float32)
+    coef_1 = tl.exp(logp - old_logp)
+    coef_2 = tl.clamp(coef_1, 1 - EPS_LOW, 1 + EPS_HIGH)
+    advantage = tl.load(ADVANTAGES).to(tl.float32)
+    per_token_loss1 = coef_1 * advantage
+    per_token_loss2 = coef_2 * advantage
+    per_token_loss = -tl.minimum(per_token_loss1, per_token_loss2)
+    is_clipped = per_token_loss1 < per_token_loss2
+    if BETA != 0.0:
+        REF_LOGP += off_b * L + off_l
+        KL += off_b * L + off_l
+        ref_logp = tl.load(REF_LOGP).to(tl.float32)
+        kl = tl.exp(ref_logp - logp) - (ref_logp - logp) - 1
+        per_token_loss += BETA * kl
+        tl.store(KL, kl)
+    tl.store(LOSS, per_token_loss)
+    tl.store(LSE, lse)
+    tl.store(IS_CLIPPED, is_clipped)
+# @triton.autotune([triton.Config({"BLOCK_N":BLOCK_N}, num_stages=ns, num_warps=nw)
+#                   for BLOCK_N in [2048, 4096, 8192]
+#                   for ns in [1, 2, 4]
+#                   for nw in [1, 2, 4, 8, 16]],
+#                   key=['N'])
+@triton.jit
+def _grpo_loss_bwd_kernel(
+    DLOSS,
+    DLOGITS,
+    LOGITS,
+    OLD_LOGP,
+    REF_LOGP,
+    INPUT_IDS,
+    ADVANTAGES,
+    COMPLETION_MASK,
+    LSE,
+    TEMPERATURE,
+    BETA: tl.constexpr,
+    EPS_LOW,
+    EPS_HIGH,
+    loss_stride0,
+    loss_stride1,
+    L: tl.constexpr,
+    N: tl.constexpr,
+    BLOCK_N: tl.constexpr = 4096,
+):
+    off_b = tl.program_id(0).cast(tl.int64)
+    off_l = tl.program_id(1).cast(tl.int64)
+    DLOGITS += off_b * (L + 1) * N + off_l * N
+    if COMPLETION_MASK is not None:
+        COMPLETION_MASK += off_b * L + off_l
+        not_skip = tl.load(COMPLETION_MASK)
+        if not_skip == 0:
+            for start in range(0, N, BLOCK_N):
+                cols = tl.arange(0, BLOCK_N) + start
+                tl.store(DLOGITS + cols, 0.0, mask=cols < N)
+            return
+    LOGITS += off_b * (L + 1) * N + off_l * N
+    DLOSS += off_b * loss_stride0 + off_l * loss_stride1
+    INPUT_IDS += off_b * L + off_l
+    ADVANTAGES += off_b
+    LSE += off_b * L + off_l
+    dloss = tl.load(DLOSS).to(tl.float32)
+    lse = tl.load(LSE).to(tl.float32)
+    idx = tl.load(INPUT_IDS)
+    x = tl.load(LOGITS + idx).to(tl.float32) / TEMPERATURE
+    logp = x - lse
+    if OLD_LOGP is None:
+        old_logp = logp
+    else:
+        OLD_LOGP += off_b * L + off_l
+        old_logp = tl.load(OLD_LOGP).to(tl.float32)
+    coef_1 = tl.exp(logp - old_logp)
+    coef_2 = tl.clamp(coef_1, 1 - EPS_LOW, 1 + EPS_HIGH)
+    advantage = tl.load(ADVANTAGES).to(tl.float32)
+    per_token_loss1 = coef_1 * advantage
+    per_token_loss2 = coef_2 * advantage
+    mask = per_token_loss2 >= per_token_loss1
+    dlogp = -per_token_loss1 * mask
+    if BETA != 0.0:
+        REF_LOGP += off_b * L + off_l
+        ref_logp = tl.load(REF_LOGP).to(tl.float32)
+        dlogp += BETA * (1 - tl.exp(ref_logp - logp))
+    dlogp = dlogp * dloss / TEMPERATURE
+    tl.debug_barrier()
+    for start_n in tl.range(0, N, BLOCK_N):
+        cols = start_n + tl.arange(0, BLOCK_N)
+        logits = tl.load(LOGITS + cols, mask=cols < N, other=-float("inf")).to(tl.float32) / TEMPERATURE
+        probs = tl.exp(logits - lse)
+        dlogits = tl.where(cols == idx, 1 - probs, -probs) * dlogp
+        tl.store(DLOGITS + cols, dlogits, mask=cols < N)
+class GrpoLossFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        logits,
+        old_logp,
+        ref_logp,
+        completion_ids,
+        advantages,
+        completion_mask,
+        temperature,
+        beta,
+        eps_low,
+        eps_high,
+        inplace,
+    ):
+        assert logits.is_contiguous() and completion_ids.is_contiguous()
+        assert old_logp is None or old_logp.is_contiguous()
+        assert (ref_logp is not None and ref_logp.is_contiguous()) if beta != 0.0 else True
+        B, L_ADD_1, N = logits.shape
+        L = L_ADD_1 - 1
+        if completion_mask is not None:
+            assert completion_mask.is_contiguous()
+        loss = torch.zeros(B, L, device=logits.device, dtype=torch.float32)
+        lse = torch.zeros_like(loss)
+        is_clipped = torch.zeros_like(loss)
+        kl = torch.zeros_like(loss) if beta != 0.0 else None
+        kwargs = {"BLOCK_N": 2048, "num_stages": 2, "num_warps": 1}
+        _grpo_loss_fwd_kernel[(B, L)](
+            logits,
+            old_logp,
+            ref_logp,
+            completion_ids,
+            completion_mask,
+            advantages,
+            loss,
+            lse,
+            kl,
+            is_clipped,
+            temperature,
+            beta,
+            eps_low,
+            eps_high,
+            L,
+            N,
+            **kwargs,
+        )
+        ctx.save_for_backward(logits, old_logp, ref_logp, completion_ids, advantages, completion_mask, lse)
+        ctx.infos = (temperature, beta, eps_low, eps_high, inplace)
+        # return loss
+        return loss, kl, is_clipped
+    @staticmethod
+    def backward(ctx, *args):
+        dloss = args[0]
+        # print(dloss.shape)
+        logits, old_logp, ref_logp, completion_ids, advantages, completion_mask, lse = ctx.saved_tensors
+        temperature, beta, eps_low, eps_high, inplace = ctx.infos
+        B, L_ADD_1, N = logits.shape
+        L = L_ADD_1 - 1
+        dlogits = logits.data if inplace else torch.empty_like(logits)
+        kwargs = {"BLOCK_N": 4096, "num_stages": 1, "num_warps": 16}
+        _grpo_loss_bwd_kernel[(B, L)](
+            dloss,
+            dlogits,
+            logits,
+            old_logp,
+            ref_logp,
+            completion_ids,
+            advantages,
+            completion_mask,
+            lse,
+            temperature,
+            beta,
+            eps_low,
+            eps_high,
+            *dloss.stride(),
+            L,
+            N,
+            **kwargs,
+        )
+        dlogits[:, -1, :] = 0
+        return dlogits, None, None, None, None, None, None, None, None, None, None

liger-kernel 0.5.9__py3-none-any.whl → 0.5.10__py3-none-any.whl

liger-kernel 0.5.9__py3-none-any.whl → 0.5.10__py3-none-any.whl