PyPI - liger-kernel - Versions diffs - 0.6.3__py3-none-any.whl → 0.6.4__py3-none-any.whl - Mend

liger-kernel 0.6.3__py3-none-any.whl → 0.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

liger_kernel/chunked_loss/cosine_similarity_loss.py +13 -4
liger_kernel/chunked_loss/fused_linear_distillation.py +13 -2
liger_kernel/chunked_loss/fused_linear_ppo.py +21 -5
liger_kernel/chunked_loss/grpo_loss.py +8 -5
liger_kernel/chunked_loss/jsd_loss.py +18 -5
liger_kernel/ops/cross_entropy.py +59 -9
liger_kernel/ops/fused_linear_cross_entropy.py +30 -4
liger_kernel/ops/grpo_loss.py +3 -1
liger_kernel/ops/layer_norm.py +84 -65
liger_kernel/ops/tiled_mlp.py +136 -0
liger_kernel/transformers/__init__.py +19 -0
liger_kernel/transformers/cross_entropy.py +8 -3
liger_kernel/transformers/functional.py +24 -6
liger_kernel/transformers/fused_linear_cross_entropy.py +8 -3
liger_kernel/transformers/grpo_loss.py +56 -1
liger_kernel/transformers/model/falcon_h1.py +19 -5
liger_kernel/transformers/model/gemma.py +17 -6
liger_kernel/transformers/model/gemma2.py +14 -5
liger_kernel/transformers/model/gemma3.py +25 -12
liger_kernel/transformers/model/glm4.py +16 -4
liger_kernel/transformers/model/glm4v.py +16 -4
liger_kernel/transformers/model/glm4v_moe.py +23 -4
liger_kernel/transformers/model/hunyuan_v1.py +134 -0
liger_kernel/transformers/model/internvl.py +12 -5
liger_kernel/transformers/model/llama.py +14 -5
liger_kernel/transformers/model/llama4.py +16 -4
liger_kernel/transformers/model/llava.py +12 -4
liger_kernel/transformers/model/loss_utils.py +31 -3
liger_kernel/transformers/model/mistral.py +15 -6
liger_kernel/transformers/model/mixtral.py +16 -7
liger_kernel/transformers/model/mllama.py +12 -4
liger_kernel/transformers/model/olmo2.py +16 -4
liger_kernel/transformers/model/olmo3.py +142 -0
liger_kernel/transformers/model/output_classes.py +147 -0
liger_kernel/transformers/model/paligemma.py +22 -5
liger_kernel/transformers/model/phi3.py +14 -7
liger_kernel/transformers/model/qwen2.py +16 -3
liger_kernel/transformers/model/qwen2_5_vl.py +14 -6
liger_kernel/transformers/model/qwen2_vl.py +16 -4
liger_kernel/transformers/model/qwen3.py +20 -5
liger_kernel/transformers/model/qwen3_moe.py +19 -5
liger_kernel/transformers/model/qwen3_next.py +17 -5
liger_kernel/transformers/model/qwen3_vl.py +150 -0
liger_kernel/transformers/model/qwen3_vl_moe.py +126 -0
liger_kernel/transformers/model/smollm3.py +15 -6
liger_kernel/transformers/monkey_patch.py +398 -20
liger_kernel/transformers/rope.py +43 -0
liger_kernel/transformers/swiglu.py +17 -0
liger_kernel/transformers/tiled_mlp.py +133 -0
{liger_kernel-0.6.3.dist-info → liger_kernel-0.6.4.dist-info}/METADATA +4 -1
{liger_kernel-0.6.3.dist-info → liger_kernel-0.6.4.dist-info}/RECORD +55 -48
{liger_kernel-0.6.3.dist-info → liger_kernel-0.6.4.dist-info}/WHEEL +0 -0
{liger_kernel-0.6.3.dist-info → liger_kernel-0.6.4.dist-info}/licenses/LICENSE +0 -0
{liger_kernel-0.6.3.dist-info → liger_kernel-0.6.4.dist-info}/licenses/NOTICE +0 -0
{liger_kernel-0.6.3.dist-info → liger_kernel-0.6.4.dist-info}/top_level.txt +0 -0

liger_kernel/ops/fused_linear_cross_entropy.py CHANGED Viewed

@@ -27,8 +27,12 @@ def fused_linear_cross_entropy_forward(
     return_z_loss=False,
     accum_dtype=None,
     use_token_scaling=False,
+    return_token_accuracy=False,
 ):
     assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
+    assert isinstance(return_token_accuracy, bool), (
+        f"return_token_accuracy must be True or False. Got: {return_token_accuracy}"
+    )
     device = _input.device
     input_requires_grad = _input.requires_grad
@@ -58,9 +62,13 @@ def fused_linear_cross_entropy_forward(
         else:
             grad_weight = torch.zeros_like(weight, dtype=accum_dtype, device=device) if weight.requires_grad else None
             grad_bias = torch.zeros_like(bias, dtype=accum_dtype, device=device) if bias is not None else None
+    else:
+        grad_weight = None
+        grad_bias = None
     loss_1d = torch.zeros(BT, dtype=torch.float32, device=device)
     z_loss_1d = torch.zeros(BT, dtype=_input.dtype, device=_input.device) if return_z_loss else None
+    token_accuracy_1d = torch.zeros(BT, dtype=torch.float32, device=device) if return_token_accuracy else None
     # TODO: evaluate how CUDA synchronization caused by .item() affects the speed
     target_mask = target != ignore_index
@@ -126,6 +134,7 @@ def fused_linear_cross_entropy_forward(
         # unreduced loss
         loss_1d_slice = loss_1d[start_idx:end_idx]  # chunk_size,
         z_loss_1d_slice = z_loss_1d[start_idx:end_idx] if return_z_loss else None
+        token_accuracy_1d_slice = token_accuracy_1d[start_idx:end_idx] if return_token_accuracy else None
         # ensure _input and target are contiguous
         logits_chunk = logits_chunk.contiguous()
@@ -141,6 +150,10 @@ def fused_linear_cross_entropy_forward(
             loss_ptr=loss_1d_slice,
             z_loss_ptr=z_loss_1d_slice,
             loss_stride=loss_1d_slice.stride(-1),  # always 1
+            token_accuracy_ptr=token_accuracy_1d_slice,
+            token_accuracy_stride=token_accuracy_1d_slice.stride(-1)
+            if return_token_accuracy
+            else 0,  # always 1 if accuracy is enabled
             n_cols=V,
             n_non_ignore=total_n_non_ignore,
             sum_non_ignore_weight=total_sum_non_ignore_ce_weight,
@@ -151,6 +164,7 @@ def fused_linear_cross_entropy_forward(
             reduction=reduction,
             softcap=softcap,
             RETURN_Z_LOSS=return_z_loss,
+            RETURN_TOKEN_ACCURACY=return_token_accuracy,
             HAS_WEIGHT=True if ce_weight is not None else False,
             HAS_SOFTCAPPING=True if softcap is not None else False,
             HAS_GRADIENTS=input_requires_grad,
@@ -167,6 +181,8 @@ def fused_linear_cross_entropy_forward(
         loss_1d[start_idx:end_idx] = loss_1d_slice
         if return_z_loss:
             z_loss_1d[start_idx:end_idx] = z_loss_1d_slice
+        if return_token_accuracy:
+            token_accuracy_1d[start_idx:end_idx] = token_accuracy_1d_slice
         grad_logits_chunk = logits_chunk  # chunk_size x V
         # Apply token scaling to gradients if requested
@@ -198,15 +214,18 @@ def fused_linear_cross_entropy_forward(
         # Return per-token losses
         loss = loss_1d
         z_loss = z_loss_1d if return_z_loss else None
+        token_accuracy = token_accuracy_1d if return_token_accuracy else None
     else:
         loss = torch.sum(loss_1d)
         z_loss = torch.sum(z_loss_1d) if return_z_loss else None
+        # For accuracy, we compute the mean across all non-ignored tokens
+        token_accuracy = torch.sum(token_accuracy_1d) / total_n_non_ignore if return_token_accuracy else None
     # Cast back to original dtype
     grad_weight = grad_weight.to(weight.dtype) if grad_weight is not None else None
     grad_bias = grad_bias.to(bias.dtype) if grad_bias is not None else None
-    return loss, z_loss, grad_input, grad_weight, grad_bias
+    return loss, z_loss, token_accuracy, grad_input, grad_weight, grad_bias
 def fused_linear_cross_entropy_backward(grad_output, grad_input, grad_weight, grad_bias):
@@ -274,6 +293,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
         return_z_loss: bool = False,
         accum_dtype=None,
         use_token_scaling: bool = False,
+        return_token_accuracy: bool = False,
     ):
         """
         Fusing the last linear layer with cross-entropy loss
@@ -297,9 +317,10 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
         use_token_scaling (bool): whether to scale each token's loss by its predicted probability (detached).
             When True, each token's loss is multiplied by the model's predicted probability for that token's true class.
             Default: False.
+        return_token_accuracy (bool): When `return_token_accuracy` is `True`, computes and returns per-token accuracy without materializing logits. Default: `False`
         """
-        loss, z_loss, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
+        loss, z_loss, token_accuracy, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
             _input=_input,
             weight=weight,
             target=target,
@@ -313,6 +334,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
             return_z_loss=return_z_loss,
             accum_dtype=accum_dtype,
             use_token_scaling=use_token_scaling,
+            return_token_accuracy=return_token_accuracy,
         )
         # downcast to dtype and store for backward
         ctx.save_for_backward(
@@ -321,13 +343,16 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
             grad_bias.detach() if bias is not None else None,
         )
         ctx.return_z_loss = return_z_loss
-        return loss, z_loss
+        ctx.return_token_accuracy = return_token_accuracy
+        return loss, z_loss, token_accuracy
     @staticmethod
     @amp_custom_bwd
-    def backward(ctx, grad_output, grad_output2):
+    def backward(ctx, grad_output, grad_output2, grad_output3):
         if ctx.return_z_loss:
             del grad_output2  # z_loss is only for logging
+        if ctx.return_token_accuracy:
+            del grad_output3  # token_accuracy is only for metrics
         (grad_input, grad_weight, grad_bias) = ctx.saved_tensors
         grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_backward(
             grad_output, grad_input, grad_weight, grad_bias
@@ -346,4 +371,5 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
             None,
             None,
             None,  # use_token_scaling
+            None,  # return_token_accuracy
         )

liger_kernel/ops/grpo_loss.py CHANGED Viewed

@@ -128,7 +128,9 @@ def _grpo_loss_fwd_kernel(
     per_token_loss1 = coef_1 * advantage
     per_token_loss2 = coef_2 * advantage
     per_token_loss = -tl.minimum(per_token_loss1, per_token_loss2)
-    is_clipped = per_token_loss1 < per_token_loss2
+    is_low_clipped = (coef_1 < 1 - EPS_LOW) & (advantage < 0)
+    is_high_clipped = (coef_1 > 1 + EPS_HIGH) & (advantage > 0)
+    is_clipped = is_low_clipped | is_high_clipped
     if BETA != 0.0:
         REF_LOGP += off_b * L + off_l

liger_kernel/ops/layer_norm.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import math
 import operator
 import torch
@@ -85,68 +86,87 @@ def _layer_norm_forward_kernel(
 @triton.jit
 def _layer_norm_backward_kernel(
     X_ptr,  # pointer to input, shape (n_rows, n_cols)
+    stride_x,  # stride of each row in input
     W_ptr,  # pointer to weights, shape (n_cols,)
     Mean_ptr,  # pointer to mean, shape (n_rows,)
+    stride_mean,  # stride of each row in mean
     RSTD_ptr,  # pointer to rstd, shape (n_rows,)
+    stride_rstd,  # stride of each row in rstd
     DX_ptr,  # pointer to input grad, shape (n_rows, n_cols)
+    stride_dx,  # stride of each row in input grad
     DW_ptr,  # pointer to weights grad, shape (n_cols,)
+    stride_dw,  # stride of each row in weights grad
     DB_ptr,  # pointer to bias grad, shape (n_cols,)
+    stride_db,  # stride of each row in bias grad
     DY_ptr,  # pointer to output grad, shape (n_rows, n_cols)
-    stride_x,  # stride of each row in input
-    stride_dx,  # stride of each row in input grad
     stride_dy,  # stride of each row in output grad
+    n_rows,
     n_cols,
+    rows_per_program: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
-    dtype: tl.constexpr,
-    atomic_dtype: tl.constexpr,
 ):
     """
     References:
     https://arxiv.org/abs/1607.06450
     https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md
     """
-    row_idx = tl.program_id(0).to(tl.int64)
+    row_block_id = tl.program_id(0).to(tl.int64)
+    row_start = row_block_id * rows_per_program
+    row_end = min((row_block_id + 1) * rows_per_program, n_rows)
     cols = tl.arange(0, BLOCK_SIZE)
     mask = cols < n_cols
+    dW_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
+    db_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
     # Pre-load weights once (same optimization as forward pass)
     w = tl.load(W_ptr + cols, mask=mask, other=0.0)
     w_f32 = w.to(tl.float32)
     # Calculate pointers for this specific row
-    row_X_ptr = X_ptr + row_idx * stride_x
-    row_DX_ptr = DX_ptr + row_idx * stride_dx
-    row_DY_ptr = DY_ptr + row_idx * stride_dy
-    row_Mean_ptr = Mean_ptr + row_idx
-    row_RSTD_ptr = RSTD_ptr + row_idx
-    # Load data for this row
-    x = tl.load(row_X_ptr + cols, mask=mask, other=0.0)
-    dy = tl.load(row_DY_ptr + cols, mask=mask, other=0.0)
-    mean = tl.load(row_Mean_ptr)
-    rstd = tl.load(row_RSTD_ptr)
-    # Convert to fp32 for numerical stability
-    x_f32 = x.to(tl.float32)
-    dy_f32 = dy.to(tl.float32)
-    mean_f32 = mean.to(tl.float32)
-    rstd_f32 = rstd.to(tl.float32)
-    # Compute backward pass for this row
-    x_hat = (x_f32 - mean_f32) * rstd_f32
-    wdy = w_f32 * dy_f32
-    c1 = tl.sum(x_hat * wdy, axis=0) / n_cols
-    c2 = tl.sum(wdy, axis=0) / n_cols
-    dx = (wdy - (x_hat * c1 + c2)) * rstd_f32
-    # Store input gradient
-    tl.store(row_DX_ptr + cols, dx.to(dtype), mask=mask)
-    # Accumulate weight and bias gradients using atomic operations
-    dw = dy_f32 * x_hat
-    db = dy_f32
-    tl.atomic_add(DW_ptr + cols, dw.to(atomic_dtype), mask=mask)
-    tl.atomic_add(DB_ptr + cols, db.to(atomic_dtype), mask=mask)
+    row_X_ptr = X_ptr + row_start * stride_x
+    row_DX_ptr = DX_ptr + row_start * stride_dx
+    row_DY_ptr = DY_ptr + row_start * stride_dy
+    row_Mean_ptr = Mean_ptr + row_start
+    row_RSTD_ptr = RSTD_ptr + row_start
+    for _ in range(row_start, row_end):
+        # Load data for this row
+        x = tl.load(row_X_ptr + cols, mask=mask, other=0.0)
+        dy = tl.load(row_DY_ptr + cols, mask=mask, other=0.0)
+        mean = tl.load(row_Mean_ptr)
+        rstd = tl.load(row_RSTD_ptr)
+        # Convert to fp32 for numerical stability
+        x_f32 = x.to(tl.float32)
+        dy_f32 = dy.to(tl.float32)
+        mean_f32 = mean.to(tl.float32)
+        rstd_f32 = rstd.to(tl.float32)
+        # Compute backward pass for this row
+        x_hat = (x_f32 - mean_f32) * rstd_f32
+        wdy = w_f32 * dy_f32
+        c1 = tl.sum(x_hat * wdy, axis=0) / n_cols
+        c2 = tl.sum(wdy, axis=0) / n_cols
+        dx = (wdy - (x_hat * c1 + c2)) * rstd_f32
+        # Store input gradient
+        tl.store(row_DX_ptr + cols, dx, mask=mask)
+        # Accumulate weight and bias gradients for this thread block's assigned rows
+        dw = dy_f32 * x_hat
+        db = dy_f32
+        dW_row += dw
+        db_row += db
+        row_X_ptr += stride_x
+        row_DX_ptr += stride_dx
+        row_DY_ptr += stride_dy
+        row_Mean_ptr += stride_mean
+        row_RSTD_ptr += stride_rstd
+    tl.store(DW_ptr + row_block_id * stride_dw + cols, dW_row, mask=mask)
+    tl.store(DB_ptr + row_block_id * stride_db + cols, db_row, mask=mask)
 def layer_norm_forward(X, W, B, eps):
@@ -228,31 +248,25 @@ def layer_norm_backward(dY, X, W, B, Mean, RSTD):
     dY = dY.view(-1, dim)
     n_rows, n_cols = dY.shape
-    # Allocate gradient tensors
-    DX = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
-    # Use float32 for weight/bias gradients if bfloat16 (due to atomic_add limitation)
-    grad_dtype = torch.float32 if W.dtype == torch.bfloat16 else W.dtype
-    DW = torch.zeros(n_cols, dtype=grad_dtype, device=W.device)
-    DB = torch.zeros(n_cols, dtype=grad_dtype, device=W.device)
+    sm_count = 1
+    if X.device.type == "cuda":
+        sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
+    elif X.device.type == "xpu":
+        sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
+    # fp32 for numerical stability especially.
+    _DW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)
+    _DB = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)
     # Calculate optimal block size and warp configuration
     BLOCK_SIZE, num_warps = calculate_settings(n_cols)
     if n_cols > BLOCK_SIZE:
         raise RuntimeError(f"Feature dimension {n_cols} exceeds maximum supported size of {BLOCK_SIZE}.")
+    rows_per_program = math.ceil(n_rows / sm_count)
+    grid = (sm_count,)
-    # Determine dtype for triton operations
-    triton_dtype = (
-        tl.float32
-        if X.dtype == torch.float32
-        else tl.bfloat16
-        if X.dtype == torch.bfloat16
-        else tl.float16
-        if X.dtype == torch.float16
-        else tl.float32  # fallback
-    )
-    # Use float32 for atomic operations if bfloat16 is not supported
-    atomic_dtype = tl.float32 if triton_dtype == tl.bfloat16 else triton_dtype
+    # Allocate gradient tensors
+    DX = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
     kernel_args = {"num_warps": num_warps}
     # XPU-specific optimization
@@ -260,28 +274,33 @@ def layer_norm_backward(dY, X, W, B, Mean, RSTD):
         kernel_args.update({"grf_mode": "large", "num_warps": 32, "num_stages": 4})
     # Launch kernel with one thread block per row for optimal performance
-    grid = (n_rows,)
     _layer_norm_backward_kernel[grid](
         X,
+        X.stride(0),
         W,
         Mean,
+        Mean.stride(0),
         RSTD,
+        RSTD.stride(0),
         DX,
-        DW,
-        DB,
-        dY,
-        X.stride(0),
         DX.stride(0),
+        _DW,
+        _DW.stride(0),
+        _DB,
+        _DB.stride(0),
+        dY,
         dY.stride(0),
+        n_rows,
         n_cols,
+        rows_per_program=rows_per_program,
         BLOCK_SIZE=BLOCK_SIZE,
-        dtype=triton_dtype,
-        atomic_dtype=atomic_dtype,
         **kernel_args,
     )
     DX = DX.view(*shape)
-    return DX, DW.to(W.dtype), DB.to(W.dtype)
+    DW = _DW.sum(dim=0).to(W.dtype)
+    DB = _DB.sum(dim=0).to(B.dtype)
+    return DX, DW, DB
 class LigerLayerNormFunction(torch.autograd.Function):

liger_kernel/ops/tiled_mlp.py ADDED Viewed

@@ -0,0 +1,136 @@
+import math
+from typing import Callable
+from typing import List
+from typing import Optional
+import torch
+from liger_kernel.ops.utils import ensure_contiguous
+class LigerTiledMLPFunction(torch.autograd.Function):
+    """
+    Based on DeepSpeed's TiledMLP:
+    https://github.com/deepspeedai/DeepSpeed/blob/v0.18.2/deepspeed/runtime/sequence_parallel/ulysses_sp.py#L838
+    Perform a tiled MLP computation to massively reduce memory usage needed to compute MLP
+    when using very long sequence lengths.
+    This module re-computes `forward` in the `backward`. So the `forward` occurs twice each iteration.
+    And if you're using activation checkpointing it then occurs thrice.
+    Args:
+        fn: the function to call on sharded inputs (e.g., mlp.forward)
+        mlp_module: the MLP nn.Module object
+        x: the input to MLP.forward (hidden_states)
+        shards: how many shards to use
+        compute_params: a list of weights engaged in the compute
+    Returns:
+        the computed hidden_states
+    """
+    @staticmethod
+    @ensure_contiguous
+    def forward(
+        ctx,
+        fn: Callable,
+        mlp_module: torch.nn.Module,
+        x: torch.Tensor,
+        shards: int,
+        compute_params: Optional[List[torch.nn.Parameter]] = None,
+    ) -> torch.Tensor:
+        ctx.fn = fn
+        ctx.mlp_module = mlp_module
+        ctx.shards = shards
+        ctx.save_for_backward(x)
+        # x.shape could be [bs, seqlen, hidden_size] or [seqlen, hidden_size] (moe experts)
+        x_shards = list(torch.chunk(x, chunks=shards, dim=-2))
+        with torch.no_grad():
+            output_shards = [fn(mlp_module, x_shard) for x_shard in x_shards]
+        output_unsharded = torch.cat(output_shards, dim=-2)
+        return output_unsharded
+    @staticmethod
+    @ensure_contiguous
+    def backward(ctx, *grads) -> tuple:
+        fn = ctx.fn
+        (x,) = ctx.saved_tensors
+        mlp_module = ctx.mlp_module
+        shards = ctx.shards
+        x_requires_grad = x.requires_grad
+        x = x.detach()
+        # detach() unsets x.requires_grad, so restore it
+        x.requires_grad_(x_requires_grad)
+        # x.shape could be [bs, seqlen, hidden_size] or [seqlen, hidden_size] (moe experts)
+        hidden_size = x.shape[-1]
+        x_shape_orig = x.shape
+        # flatten bs+seqlen to avoid having stride issues when narrowing into seqlen w/ bs>1
+        x = x.view(-1, hidden_size)
+        incoming_grad = grads[0].view(-1, hidden_size)
+        x_grad = torch.zeros_like(x)
+        x_shards = list(torch.chunk(x, chunks=shards, dim=0))
+        for i, x_shard in enumerate(x_shards):
+            x_shard.requires_grad_(x_requires_grad)
+            # if seqlen is not exactly divisible by shards the last step will be shorter than shard_step
+            shard_step = x_shards[i].shape[0]
+            shard_offset = i * x_shards[0].shape[0]
+            x_shard.grad = x_grad.narrow(0, shard_offset, shard_step).view_as(x_shard)
+            incoming_grad_shard = incoming_grad.narrow(0, shard_offset, shard_step).view_as(x_shard)
+            with torch.enable_grad():
+                output = fn(mlp_module, x_shard)
+            torch.autograd.backward(output, incoming_grad_shard)
+        # unflatten
+        x_grad = x_grad.view(x_shape_orig)
+        return (None, None, x_grad, None, None)
+def apply_tiled_mlp(
+    fn: Callable,
+    mlp_module: torch.nn.Module,
+    x: torch.Tensor,
+    num_shards: Optional[int] = None,
+    compute_params: Optional[List[torch.nn.Parameter]] = None,
+) -> torch.Tensor:
+    """
+    Apply tiled MLP computation for memory efficiency.
+    Args:
+        fn: the function to call on sharded inputs (e.g., lambda module, x: module(x))
+        mlp_module: the MLP nn.Module object
+        x: the input tensor with shape [bs, seqlen, hidden_size] or [seqlen, hidden_size]
+        num_shards: number of shards to use. If None, automatically calculated as ceil(seqlen / hidden_size)
+        compute_params: list of parameters for DeepSpeed ZeRO optimization
+    Returns:
+        output tensor with the same shape as input
+    """
+    if num_shards is None:
+        # x.shape could be [bs, seqlen, hidden_size] or [seqlen, hidden_size]
+        hidden_size = x.shape[-1]
+        seqlen = x.shape[-2]
+        num_shards = math.ceil(seqlen / hidden_size)
+    # Ensure num_shards is at least 1
+    num_shards = max(1, num_shards)
+    return LigerTiledMLPFunction.apply(
+        fn,
+        mlp_module,
+        x,
+        num_shards,
+        compute_params,
+    )

liger_kernel/transformers/__init__.py CHANGED Viewed

@@ -24,6 +24,8 @@ from liger_kernel.transformers.swiglu import LigerBlockSparseTop2MLP  # noqa: F4
 from liger_kernel.transformers.swiglu import LigerPhi3SwiGLUMLP  # noqa: F401
 from liger_kernel.transformers.swiglu import LigerQwen3MoeSwiGLUMLP  # noqa: F401
 from liger_kernel.transformers.swiglu import LigerSwiGLUMLP  # noqa: F401
+from liger_kernel.transformers.tiled_mlp import LigerTiledGEGLUMLP  # noqa: F401
+from liger_kernel.transformers.tiled_mlp import LigerTiledSwiGLUMLP  # noqa: F401
 from liger_kernel.transformers.tvd import LigerTVDLoss  # noqa: F401
 # Static-only imports for IDEs and type checkers
@@ -40,6 +42,8 @@ if TYPE_CHECKING:
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_glm4v  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_glm4v_moe  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_granite  # noqa: F401
+    from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_hunyuan_v1_dense  # noqa: F401
+    from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_hunyuan_v1_moe  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_internvl  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llama  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llama4  # noqa: F401
@@ -48,6 +52,7 @@ if TYPE_CHECKING:
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_mixtral  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_mllama  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_olmo2  # noqa: F401
+    from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_olmo3  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_paligemma  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_phi3  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen2  # noqa: F401
@@ -56,6 +61,8 @@ if TYPE_CHECKING:
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen3  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen3_moe  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen3_next  # noqa: F401
+    from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen3_vl  # noqa: F401
+    from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen3_vl_moe  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_smollm3  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_smolvlm  # noqa: F401
@@ -112,6 +119,7 @@ def __getattr__(name: str):
         "apply_liger_kernel_to_mixtral",
         "apply_liger_kernel_to_mllama",
         "apply_liger_kernel_to_olmo2",
+        "apply_liger_kernel_to_olmo3",
         "apply_liger_kernel_to_paligemma",
         "apply_liger_kernel_to_phi3",
         "apply_liger_kernel_to_qwen2",
@@ -120,8 +128,12 @@ def __getattr__(name: str):
         "apply_liger_kernel_to_qwen3",
         "apply_liger_kernel_to_qwen3_moe",
         "apply_liger_kernel_to_qwen3_next",
+        "apply_liger_kernel_to_qwen3_vl",
+        "apply_liger_kernel_to_qwen3_vl_moe",
         "apply_liger_kernel_to_smollm3",
         "apply_liger_kernel_to_smolvlm",
+        "apply_liger_kernel_to_hunyuan_v1_dense",
+        "apply_liger_kernel_to_hunyuan_v1_moe",
     }
     if name in monkey_patch_symbols:
@@ -151,6 +163,8 @@ __all__ = [
     "LigerPhi3SwiGLUMLP",
     "LigerQwen3MoeSwiGLUMLP",
     "LigerSwiGLUMLP",
+    "LigerTiledGEGLUMLP",
+    "LigerTiledSwiGLUMLP",
     "LigerTVDLoss",
     "LigerKLDIVLoss",
     "LigerMultiTokenAttention",
@@ -182,6 +196,7 @@ if _TRANSFORMERS_AVAILABLE:
             "apply_liger_kernel_to_mixtral",
             "apply_liger_kernel_to_mllama",
             "apply_liger_kernel_to_olmo2",
+            "apply_liger_kernel_to_olmo3",
             "apply_liger_kernel_to_paligemma",
             "apply_liger_kernel_to_phi3",
             "apply_liger_kernel_to_qwen2",
@@ -190,7 +205,11 @@ if _TRANSFORMERS_AVAILABLE:
             "apply_liger_kernel_to_qwen3",
             "apply_liger_kernel_to_qwen3_moe",
             "apply_liger_kernel_to_qwen3_next",
+            "apply_liger_kernel_to_qwen3_vl",
+            "apply_liger_kernel_to_qwen3_vl_moe",
             "apply_liger_kernel_to_smollm3",
             "apply_liger_kernel_to_smolvlm",
+            "apply_liger_kernel_to_hunyuan_v1_dense",
+            "apply_liger_kernel_to_hunyuan_v1_moe",
         ]
     )

liger_kernel/transformers/cross_entropy.py CHANGED Viewed

@@ -3,6 +3,7 @@ from typing import Optional
 import torch
 from liger_kernel.ops.cross_entropy import LigerCrossEntropyFunction
+from liger_kernel.transformers.functional import CrossEntropyOutput
 class LigerCrossEntropyLoss(torch.nn.Module):
@@ -15,6 +16,7 @@ class LigerCrossEntropyLoss(torch.nn.Module):
         reduction: str = "mean",
         softcap: Optional[float] = None,
         return_z_loss: bool = False,
+        return_token_accuracy: bool = False,
     ):
         super().__init__()
         assert (label_smoothing >= 0) and (label_smoothing <= 1), (
@@ -33,9 +35,10 @@ class LigerCrossEntropyLoss(torch.nn.Module):
         self.reduction = reduction
         self.softcap = softcap
         self.return_z_loss = return_z_loss
+        self.return_token_accuracy = return_token_accuracy
     def forward(self, _input: torch.Tensor, target: torch.Tensor):
-        loss, z_loss = LigerCrossEntropyFunction.apply(
+        loss, z_loss, token_accuracy = LigerCrossEntropyFunction.apply(
             _input,
             target,
             self.weight,
@@ -45,7 +48,9 @@ class LigerCrossEntropyLoss(torch.nn.Module):
             self.reduction,
             self.softcap,
             self.return_z_loss,
+            self.return_token_accuracy,
         )
-        if not self.return_z_loss:
+        if not self.return_z_loss and not self.return_token_accuracy:
             return loss
-        return loss, z_loss
+        return CrossEntropyOutput(loss=loss, z_loss=z_loss, token_accuracy=token_accuracy)

liger-kernel 0.6.3__py3-none-any.whl → 0.6.4__py3-none-any.whl

liger-kernel 0.6.3__py3-none-any.whl → 0.6.4__py3-none-any.whl