liger-kernel 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. liger_kernel/chunked_loss/README.md +25 -0
  2. liger_kernel/chunked_loss/__init__.py +2 -0
  3. liger_kernel/chunked_loss/cpo_loss.py +18 -8
  4. liger_kernel/chunked_loss/dpo_loss.py +20 -10
  5. liger_kernel/chunked_loss/functional.py +4 -0
  6. liger_kernel/chunked_loss/fused_linear_distillation.py +58 -44
  7. liger_kernel/chunked_loss/fused_linear_preference.py +108 -60
  8. liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +246 -0
  9. liger_kernel/chunked_loss/jsd_loss.py +154 -0
  10. liger_kernel/chunked_loss/kto_loss.py +172 -0
  11. liger_kernel/chunked_loss/orpo_loss.py +8 -9
  12. liger_kernel/chunked_loss/simpo_loss.py +22 -8
  13. liger_kernel/env_report.py +5 -12
  14. liger_kernel/ops/cross_entropy.py +102 -51
  15. liger_kernel/ops/experimental/embedding.py +1 -3
  16. liger_kernel/ops/experimental/mm_int8int2.py +3 -9
  17. liger_kernel/ops/fused_linear_cross_entropy.py +89 -55
  18. liger_kernel/ops/fused_linear_jsd.py +11 -29
  19. liger_kernel/ops/geglu.py +6 -17
  20. liger_kernel/ops/group_norm.py +11 -28
  21. liger_kernel/ops/jsd.py +2 -6
  22. liger_kernel/ops/kl_div.py +8 -11
  23. liger_kernel/ops/layer_norm.py +3 -5
  24. liger_kernel/ops/qwen2vl_mrope.py +21 -37
  25. liger_kernel/ops/rms_norm.py +14 -32
  26. liger_kernel/ops/rope.py +31 -33
  27. liger_kernel/ops/swiglu.py +4 -8
  28. liger_kernel/ops/utils.py +2 -0
  29. liger_kernel/transformers/__init__.py +16 -24
  30. liger_kernel/transformers/auto_model.py +6 -13
  31. liger_kernel/transformers/cross_entropy.py +4 -6
  32. liger_kernel/transformers/experimental/embedding.py +1 -3
  33. liger_kernel/transformers/functional.py +11 -7
  34. liger_kernel/transformers/fused_linear_cross_entropy.py +12 -7
  35. liger_kernel/transformers/geglu.py +1 -4
  36. liger_kernel/transformers/group_norm.py +3 -9
  37. liger_kernel/transformers/jsd.py +1 -3
  38. liger_kernel/transformers/kl_div.py +1 -3
  39. liger_kernel/transformers/layer_norm.py +3 -9
  40. liger_kernel/transformers/model/gemma.py +18 -40
  41. liger_kernel/transformers/model/gemma2.py +19 -41
  42. liger_kernel/transformers/model/llama.py +22 -48
  43. liger_kernel/transformers/model/mistral.py +14 -26
  44. liger_kernel/transformers/model/mixtral.py +24 -54
  45. liger_kernel/transformers/model/mllama.py +16 -36
  46. liger_kernel/transformers/model/phi3.py +18 -40
  47. liger_kernel/transformers/model/qwen2.py +18 -40
  48. liger_kernel/transformers/model/qwen2_vl.py +36 -32
  49. liger_kernel/transformers/monkey_patch.py +43 -117
  50. liger_kernel/transformers/qwen2vl_mrope.py +2 -2
  51. liger_kernel/transformers/rms_norm.py +4 -4
  52. liger_kernel/transformers/rope.py +2 -2
  53. liger_kernel/transformers/swiglu.py +2 -8
  54. liger_kernel/transformers/trainer/__init__.py +1 -3
  55. liger_kernel/transformers/trainer/orpo_trainer.py +31 -18
  56. liger_kernel/triton/__init__.py +1 -3
  57. liger_kernel/triton/monkey_patch.py +1 -3
  58. {liger_kernel-0.5.1.dist-info → liger_kernel-0.5.3.dist-info}/METADATA +38 -25
  59. liger_kernel-0.5.3.dist-info/RECORD +69 -0
  60. {liger_kernel-0.5.1.dist-info → liger_kernel-0.5.3.dist-info}/WHEEL +1 -1
  61. liger_kernel-0.5.1.dist-info/RECORD +0 -65
  62. {liger_kernel-0.5.1.dist-info → liger_kernel-0.5.3.dist-info}/LICENSE +0 -0
  63. {liger_kernel-0.5.1.dist-info → liger_kernel-0.5.3.dist-info}/NOTICE +0 -0
  64. {liger_kernel-0.5.1.dist-info → liger_kernel-0.5.3.dist-info}/top_level.txt +0 -0
liger_kernel/ops/fused_linear_jsd.py CHANGED
@@ -4,12 +4,10 @@ import torch
  import triton

  from liger_kernel.ops.jsd import _jsd_kernel
- from liger_kernel.ops.utils import (
- amp_custom_bwd,
- amp_custom_fwd,
- element_mul_kernel,
- is_hip,
- )
+ from liger_kernel.ops.utils import amp_custom_bwd
+ from liger_kernel.ops.utils import amp_custom_fwd
+ from liger_kernel.ops.utils import element_mul_kernel
+ from liger_kernel.ops.utils import is_hip

  # The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
  # However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
@@ -43,16 +41,10 @@ def fused_linear_jsd_forward(
  BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))

  inc_factor = triton.cdiv(V, H) # (V + H - 1) // H
- chunk_size = triton.next_power_of_2(
- triton.cdiv(BT, inc_factor)
- ) # (BT + inc_factor - 1) // inc_factor
+ chunk_size = triton.next_power_of_2(triton.cdiv(BT, inc_factor)) # (BT + inc_factor - 1) // inc_factor
  num_chunks = triton.cdiv(BT, chunk_size) # (BT + chunk_size - 1) // chunk_size

- grad_weight = (
- torch.zeros_like(student_weight, device=device)
- if student_weight.requires_grad
- else None
- )
+ grad_weight = torch.zeros_like(student_weight, device=device) if student_weight.requires_grad else None
  grad_input = torch.zeros_like(student_input)
  # we use fp32 for loss accumulator
  loss_1d = torch.zeros((BT, V), dtype=torch.float32, device=device)
@@ -73,12 +65,8 @@ def fused_linear_jsd_forward(
  # shape: chunk_size x V
  # For anything starting from logits to the final JSD loss, we do computation
  # in FP32 to avoid losing numerical stability.
- student_logits_chunk = (student_input_chunk @ student_weight.t()).to(
- torch.float32
- )
- teacher_logits_chunk = (teacher_input_chunk @ teacher_weight.t()).to(
- torch.float32
- )
+ student_logits_chunk = (student_input_chunk @ student_weight.t()).to(torch.float32)
+ teacher_logits_chunk = (teacher_input_chunk @ teacher_weight.t()).to(torch.float32)
  chunk_n_rows = student_logits_chunk.shape[0]

  # unreduced loss
@@ -104,9 +92,7 @@ def fused_linear_jsd_forward(
  dX_ptr=student_prob_chunk,
  dX_stride=student_prob_chunk.stride(-2),
  label_ptr=(
- shift_labels[start_idx:end_idx]
- if has_label
- else torch.empty(1, device=device)
+ shift_labels[start_idx:end_idx] if has_label else torch.empty(1, device=device)
  ), # dummy ptr if no label
  beta=jsd_beta,
  n_non_ignore=n_non_ignore,
@@ -121,9 +107,7 @@ def fused_linear_jsd_forward(
  student_logits_chunk = (
  student_prob_chunk
  - torch.softmax(student_logits_chunk, dim=-1)
- * student_prob_chunk.sum(dim=-1, keepdim=True).broadcast_to(
- student_prob_chunk.shape
- )
+ * student_prob_chunk.sum(dim=-1, keepdim=True).broadcast_to(student_prob_chunk.shape)
  ) / temperature
  # now we traverse back to grad w.r.t. input to `lm_head` and grad
  # w.r.t. `lm_head` which should be computed in original dtype
@@ -239,7 +223,5 @@ class LigerFusedLinearJSDFunction(torch.autograd.Function):
  @amp_custom_bwd
  def backward(ctx, grad_output):
  (grad_input, grad_weight) = ctx.saved_tensors
- grad_input, grad_weight = fused_linear_jsd_backward(
- grad_output, grad_input, grad_weight
- )
+ grad_input, grad_weight = fused_linear_jsd_backward(grad_output, grad_input, grad_weight)
  return (grad_input, grad_weight, None, None, None, None, None, None)
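The reshaped `fused_linear_jsd_forward` hunks keep the same chunking scheme: the vocab-to-hidden ratio `inc_factor` decides how many chunks the BT token rows are split into, so the materialized logits stay roughly the size of the input chunk. A minimal sketch of that arithmetic, using illustrative sizes (the concrete BT/H/V values below are assumptions, not taken from the diff):

```python
# Mirrors the comments in the hunk above: cdiv(a, b) is (a + b - 1) // b.
def cdiv(a, b):  # same result as triton.cdiv for positive ints
    return (a + b - 1) // b

def next_power_of_2(n):  # same result as triton.next_power_of_2 for positive ints
    return 1 << (n - 1).bit_length()

BT, H, V = 4096, 4096, 128256             # tokens (batch*seq), hidden size, vocab size (assumed)
inc_factor = cdiv(V, H)                    # 32
chunk_size = next_power_of_2(cdiv(BT, inc_factor))  # cdiv(4096, 32) = 128 -> 128
num_chunks = cdiv(BT, chunk_size)          # 32 chunks of at most 128 rows each
print(inc_factor, chunk_size, num_chunks)  # 32 128 32
```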
liger_kernel/ops/geglu.py CHANGED
@@ -4,11 +4,9 @@ import torch
  import triton
  import triton.language as tl

- from liger_kernel.ops.utils import (
- calculate_settings,
- compare_version,
- ensure_contiguous,
- )
+ from liger_kernel.ops.utils import calculate_settings
+ from liger_kernel.ops.utils import compare_version
+ from liger_kernel.ops.utils import ensure_contiguous

  if compare_version("triton", operator.ge, "3.0.0"):
  try:
@@ -22,9 +20,7 @@ else:


  @triton.jit
- def _geglu_tanh_forward_kernel(
- a, b, c, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr
- ):
+ def _geglu_tanh_forward_kernel(a, b, c, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr):
  program_id = tl.program_id(0).to(tl.int64)

  # locate start index
@@ -49,9 +45,7 @@ def _geglu_tanh_forward_kernel(


  @triton.jit
- def _geglu_tanh_backward_kernel(
- dc, a, b, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr
- ):
+ def _geglu_tanh_backward_kernel(dc, a, b, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr):
  program_id = tl.program_id(0).to(tl.int64)

  # locate start index
@@ -80,12 +74,7 @@ def _geglu_tanh_backward_kernel(
  # where z = sqrt(2/pi) * (a + 0.044715 * a^3)
  term1 = 0.5 * (1 + tanh_result)
  tanh_sq = tanh_result * tanh_result
- term2 = (
- 0.5
- * a_row
- * (1 - tanh_sq)
- * (sqrt_2_over_pi * (1 + 3 * 0.044715 * a_row * a_row))
- )
+ term2 = 0.5 * a_row * (1 - tanh_sq) * (sqrt_2_over_pi * (1 + 3 * 0.044715 * a_row * a_row))
  da_row = dc_row * b_row * (term1 + term2)

  tl.store(a + col_offsets, da_row, mask=mask)
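The collapsed `term2` line above is the derivative of the tanh-approximated GELU used by the GeGLU gate. A plain-PyTorch sketch of the same per-element math, for readability only (not the Triton kernel itself):

```python
import math

import torch

def geglu_tanh_grad_a(dc, a, b):
    # Gradient w.r.t. the gate input `a` for c = b * gelu_tanh(a),
    # following the term1/term2 formulation in the kernel above.
    sqrt_2_over_pi = math.sqrt(2.0 / math.pi)
    z = sqrt_2_over_pi * (a + 0.044715 * a * a * a)
    tanh_z = torch.tanh(z)
    term1 = 0.5 * (1 + tanh_z)
    term2 = 0.5 * a * (1 - tanh_z * tanh_z) * (sqrt_2_over_pi * (1 + 3 * 0.044715 * a * a))
    return dc * b * (term1 + term2)
```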
liger_kernel/ops/group_norm.py CHANGED
@@ -4,7 +4,8 @@ import torch
  import triton
  import triton.language as tl

- from liger_kernel.ops.utils import compare_version, ensure_contiguous
+ from liger_kernel.ops.utils import compare_version
+ from liger_kernel.ops.utils import ensure_contiguous

  if compare_version("triton", operator.ge, "3.0.0"):
  try:
@@ -73,9 +74,7 @@ def _group_norm_forward_kernel(

  # Normalize
  hidden_size_per_channel = hidden_size // channels_per_group
- for channel_idx in tl.range(
- group_idx * channels_per_group, (group_idx + 1) * channels_per_group
- ):
+ for channel_idx in tl.range(group_idx * channels_per_group, (group_idx + 1) * channels_per_group):
  W = tl.load(W_ptr + channel_idx)
  B = tl.load(B_ptr + channel_idx)
  for i in range(0, hidden_size_per_channel, BLOCK_SIZE):
@@ -132,21 +131,15 @@ def _group_norm_backward_kernel(
  UPSTREAM_ptr += batch_idx * X_row_stride

  # Mean and rstd are the same shape so have the same strides
- mean = tl.load(
- Mean_ptr + batch_idx * Mean_ptr_row_stride + group_idx * Mean_ptr_col_stride
- )
- rstd = tl.load(
- RSTD_ptr + batch_idx * Mean_ptr_row_stride + group_idx * Mean_ptr_col_stride
- )
+ mean = tl.load(Mean_ptr + batch_idx * Mean_ptr_row_stride + group_idx * Mean_ptr_col_stride)
+ rstd = tl.load(RSTD_ptr + batch_idx * Mean_ptr_row_stride + group_idx * Mean_ptr_col_stride)

  c1 = 0.0
  c2 = 0.0
  block_range = tl.arange(0, BLOCK_SIZE)

  # We need to compute the sum terms of the backprop equations across all channels in the group
- for channel_idx in range(
- group_idx * channels_per_group, (group_idx + 1) * channels_per_group
- ):
+ for channel_idx in range(group_idx * channels_per_group, (group_idx + 1) * channels_per_group):
  dW = 0.0
  dB = 0.0
  # Move the pointers to the correct channel
@@ -181,9 +174,7 @@ def _group_norm_backward_kernel(
  c1 = c1 / N
  c2 = c2 / N

- for channel_idx in tl.range(
- group_idx * channels_per_group, (group_idx + 1) * channels_per_group
- ):
+ for channel_idx in tl.range(group_idx * channels_per_group, (group_idx + 1) * channels_per_group):
  # Move the pointers to the correct channel
  W = tl.load(W_ptr + channel_idx)
  for i in range(0, hidden_size, BLOCK_SIZE):
@@ -203,9 +194,7 @@ def _group_norm_backward_kernel(
  x_hat = (X - mean) * rstd
  wdy = W * UPSTREAM_grad
  dx = (wdy - (x_hat * c1 + c2)) * rstd
- tl.store(
- DX_ptr + channel_idx * X_col_stride + hidden_size_offsets, dx, mask=mask
- )
+ tl.store(DX_ptr + channel_idx * X_col_stride + hidden_size_offsets, dx, mask=mask)


  def group_norm_forward(X, num_channels, num_groups, W, B, eps):
@@ -216,9 +205,7 @@ def group_norm_forward(X, num_channels, num_groups, W, B, eps):
  X = X.view(batch_size, num_groups, -1).contiguous()
  hidden_size = X.shape[-1]
  BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(hidden_size))
- Y = torch.empty(
- (batch_size, num_groups, hidden_size), dtype=X.dtype, device=X.device
- )
+ Y = torch.empty((batch_size, num_groups, hidden_size), dtype=X.dtype, device=X.device)
  Mean = torch.zeros((batch_size, num_groups), dtype=X.dtype, device=X.device)
  RSTD = torch.zeros((batch_size, num_groups), dtype=X.dtype, device=X.device)

@@ -307,16 +294,12 @@ class LigerGroupNormFunction(torch.autograd.Function):
  )
  ctx.num_channels = num_channels
  ctx.num_groups = num_groups
- ctx.save_for_backward(
- X, affine_scaling_weight, affine_shifting_bias, Mean, RSTD
- )
+ ctx.save_for_backward(X, affine_scaling_weight, affine_shifting_bias, Mean, RSTD)
  return Y

  @staticmethod
  @ensure_contiguous
  def backward(ctx, dY):
  X, W, B, Mean, RSTD = ctx.saved_tensors
- DX, DW, DB = group_norm_backward(
- dY, X, W, B, Mean, RSTD, ctx.num_channels, ctx.num_groups
- )
+ DX, DW, DB = group_norm_backward(dY, X, W, B, Mean, RSTD, ctx.num_channels, ctx.num_groups)
  return DX, DW, DB, None, None, None
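For context on the `group_norm_forward` lines above: the input is viewed as (batch, groups, everything else), and Mean/RSTD hold one statistic per (batch, group) pair. A small shape sketch with assumed example sizes:

```python
import torch

# Assumed example sizes: 2 samples, 32 channels, 16x16 spatial, 8 groups.
X = torch.randn(2, 32, 16, 16)
num_channels, num_groups = 32, 8

batch_size = X.shape[0]
Xg = X.view(batch_size, num_groups, -1).contiguous()  # (2, 8, 4 * 16 * 16) = (2, 8, 1024)
hidden_size = Xg.shape[-1]                            # 1024 values normalized per (batch, group)
Mean = torch.zeros((batch_size, num_groups))          # one mean per (batch, group)
RSTD = torch.zeros((batch_size, num_groups))          # one 1/std per (batch, group)
print(Xg.shape, Mean.shape, RSTD.shape)               # (2, 8, 1024) (2, 8) (2, 8)
```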
liger_kernel/ops/jsd.py CHANGED
@@ -98,9 +98,7 @@ def jsd_forward(_input, target, shift_labels, beta, ignore_index, has_label):
  loss_stride=loss.stride(-2),
  dX_ptr=dX,
  dX_stride=dX.stride(-2),
- label_ptr=(
- shift_labels if has_label else torch.empty(1, device=_input.device)
- ), # dummy ptr if no label
+ label_ptr=(shift_labels if has_label else torch.empty(1, device=_input.device)), # dummy ptr if no label
  beta=beta,
  n_non_ignore=n_non_ignore,
  ignore_index=ignore_index,
@@ -165,9 +163,7 @@ class LigerJSDFunction(torch.autograd.Function):
  shift_labels = shift_labels.contiguous()
  has_label = True

- loss, dX = jsd_forward(
- _input, target, shift_labels, beta, ignore_index, has_label
- )
+ loss, dX = jsd_forward(_input, target, shift_labels, beta, ignore_index, has_label)
  ctx.save_for_backward(dX)
  return loss
liger_kernel/ops/kl_div.py CHANGED
@@ -4,7 +4,8 @@ import torch
  import triton
  import triton.language as tl

- from liger_kernel.ops.utils import ensure_contiguous, is_hip
+ from liger_kernel.ops.utils import ensure_contiguous
+ from liger_kernel.ops.utils import is_hip


  def get_num_warps(BLOCK_SIZE):
@@ -23,10 +24,10 @@ MAX_FUSED_SIZE = 65536 // 4 # 65536 // 4 or 8 works the best

  REDUCTION_LITERAL = Literal["none", "sum", "mean", "batchmean"]

- _REDUCTION_MODE_NONE = tl.constexpr(0)
- _REDUCTION_MODE_SUM = tl.constexpr(1)
- _REDUCTION_MODE_MEAN = tl.constexpr(2)
- _REDUCTION_MODE_BATCHMEAN = tl.constexpr(3)
+ _REDUCTION_MODE_NONE: tl.constexpr = tl.constexpr(0)
+ _REDUCTION_MODE_SUM: tl.constexpr = tl.constexpr(1)
+ _REDUCTION_MODE_MEAN: tl.constexpr = tl.constexpr(2)
+ _REDUCTION_MODE_BATCHMEAN: tl.constexpr = tl.constexpr(3)

  _str_to_reduction_mode = {
  "none": _REDUCTION_MODE_NONE.value,
@@ -218,9 +219,7 @@ class LigerKLDivLossFunction(torch.autograd.Function):
  ctx.save_for_backward(y_true)
  ctx.reduction = reduction
  ctx.log_target = log_target
- return kldiv_forward_triton(
- y_pred, y_true, log_target=log_target, reduction=reduction, eps=eps
- )
+ return kldiv_forward_triton(y_pred, y_true, log_target=log_target, reduction=reduction, eps=eps)

  @staticmethod
  @ensure_contiguous
@@ -238,9 +237,7 @@ class LigerKLDivLossFunction(torch.autograd.Function):

  new_grads = torch.empty_like(y_true)

- derivative = kldiv_backward_triton(
- y_true, grad_output, new_grads, ctx.log_target
- )
+ derivative = kldiv_backward_triton(y_true, grad_output, new_grads, ctx.log_target)

  if ctx.reduction == "batchmean":
  derivative = derivative / y_true.shape[0]
liger_kernel/ops/layer_norm.py CHANGED
@@ -5,11 +5,9 @@ import torch
  import triton
  import triton.language as tl

- from liger_kernel.ops.utils import (
- calculate_settings,
- compare_version,
- ensure_contiguous,
- )
+ from liger_kernel.ops.utils import calculate_settings
+ from liger_kernel.ops.utils import compare_version
+ from liger_kernel.ops.utils import ensure_contiguous

  if compare_version("triton", operator.ge, "3.0.0"):
  try:
liger_kernel/ops/qwen2vl_mrope.py CHANGED
@@ -10,6 +10,7 @@ def _triton_qwen2vl_mrope(
  cos,
  sin,
  sl,
+ bs: tl.constexpr,
  n_qh: tl.constexpr,
  n_kh: tl.constexpr,
  hd: tl.constexpr,
@@ -41,13 +42,12 @@ def _triton_qwen2vl_mrope(
  t_end = mrope_section_t
  h_end = t_end + mrope_section_h

- cos_row_idx = pid % sl
- t_cos = cos + cos_row_idx * hd
- h_cos = t_cos + sl * hd
- w_cos = h_cos + sl * hd
- t_sin = sin + cos_row_idx * hd
- h_sin = t_sin + sl * hd
- w_sin = h_sin + sl * hd
+ t_cos = cos + pid * hd
+ h_cos = t_cos + bs * sl * hd
+ w_cos = h_cos + bs * sl * hd
+ t_sin = sin + pid * hd
+ h_sin = t_sin + bs * sl * hd
+ w_sin = h_sin + bs * sl * hd

  cos_offsets = tl.arange(0, pad_hd // 2)
  t_mask = cos_offsets < t_end
@@ -67,36 +67,20 @@ def _triton_qwen2vl_mrope(
  # program instance (i.e. for the current token) separately
  # ####################################################################
  # left half of the head
- first_half_q_offsets = (
- tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
- )
- first_half_k_offsets = (
- tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
- )
- first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (
- tl.arange(0, pad_hd // 2)[None, :] < hd // 2
- )
- first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (
- tl.arange(0, pad_hd // 2)[None, :] < hd // 2
- )
- q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(
- sin_row.dtype
- )
- k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(
- sin_row.dtype
- )
+ first_half_q_offsets = tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+ first_half_k_offsets = tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+ first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (tl.arange(0, pad_hd // 2)[None, :] < hd // 2)
+ first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (tl.arange(0, pad_hd // 2)[None, :] < hd // 2)
+ q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(sin_row.dtype)
+ k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(sin_row.dtype)

  # right half of the head
  second_half_q_offsets = first_half_q_offsets + (hd // 2)
  second_half_k_offsets = first_half_k_offsets + (hd // 2)
  second_q_mask = first_q_mask
  second_k_mask = first_k_mask
- q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(
- sin_row.dtype
- )
- k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(
- sin_row.dtype
- )
+ q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(sin_row.dtype)
+ k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(sin_row.dtype)

  if not BACKWARD_PASS:
  # y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin]
@@ -124,7 +108,6 @@ def _triton_qwen2vl_mrope(


  def qwen2vl_mrope_forward(q, k, cos, sin, mrope_section):
-
  # transpose it back to the physical shape because Triton looks at the physical storage
  # note: q and k are incontiguous before the transformation and will become contiguous after transpose
  q = q.transpose(1, 2)
@@ -151,6 +134,7 @@ def qwen2vl_mrope_forward(q, k, cos, sin, mrope_section):
  cos,
  sin,
  seq_len,
+ batch_size,
  n_q_head,
  n_kv_head,
  head_dim,
@@ -189,6 +173,7 @@ def qwen2vl_mrope_backward(dq, dk, cos, sin, mrope_section):
  cos,
  sin,
  seq_len,
+ batch_size,
  n_q_head,
  n_kv_head,
  head_dim,
@@ -216,8 +201,8 @@ class LigerQwen2VLMRopeFunction(torch.autograd.Function):
  """
  q size: (bsz, n_q_head, seq_len, head_dim)
  k size: (bsz, n_kv_head, seq_len, head_dim)
- cos size: (3, 1, seq_len, head_dim)
- sin size: (3, 1, seq_len, head_dim)
+ cos size: (3, bsz, seq_len, head_dim)
+ sin size: (3, bsz, seq_len, head_dim)
  """
  q, k, cos, sin = qwen2vl_mrope_forward(q, k, cos, sin, mrope_section)
  ctx.save_for_backward(cos, sin)
@@ -228,10 +213,9 @@ class LigerQwen2VLMRopeFunction(torch.autograd.Function):
  """
  dq size: (bsz, n_q_head, seq_len, head_dim)
  dk size: (bsz, n_kv_head, seq_len, head_dim)
- cos size: (3, 1, seq_len, head_dim)
- sin size: (3, 1, seq_len, head_dim)
+ cos size: (3, bsz, seq_len, head_dim)
+ sin size: (3, bsz, seq_len, head_dim)
  """
-
  cos, sin = ctx.saved_tensors
  mrope_section = ctx.mrope_section
  dq, dk = qwen2vl_mrope_backward(dq, dk, cos, sin, mrope_section)
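The mrope hunks above change how the kernel indexes the rotary tables: each program id now addresses a (batch, position) row directly, so cos/sin are laid out as (3, bsz, seq_len, head_dim) instead of being broadcast from a batch of 1. A hedged usage sketch; the head counts, head_dim, dtype and mrope_section below are assumed Qwen2-VL-style values, and the fused kernel needs a CUDA device:

```python
import torch

from liger_kernel.ops.qwen2vl_mrope import qwen2vl_mrope_forward

bsz, seq_len, n_q_head, n_kv_head, head_dim = 2, 128, 28, 4, 128
mrope_section = [16, 24, 24]  # assumed t/h/w split; sums to head_dim // 2

q = torch.randn(bsz, n_q_head, seq_len, head_dim, device="cuda", dtype=torch.bfloat16)
k = torch.randn(bsz, n_kv_head, seq_len, head_dim, device="cuda", dtype=torch.bfloat16)
# Per-batch position tables, one set each for the t/h/w sections.
cos = torch.randn(3, bsz, seq_len, head_dim, device="cuda", dtype=torch.bfloat16)
sin = torch.randn(3, bsz, seq_len, head_dim, device="cuda", dtype=torch.bfloat16)

q_rot, k_rot, cos, sin = qwen2vl_mrope_forward(q, k, cos, sin, mrope_section)
print(q_rot.shape, k_rot.shape)  # rotated q and k
```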
liger_kernel/ops/rms_norm.py CHANGED
@@ -17,12 +17,10 @@ import torch
  import triton
  import triton.language as tl

- from liger_kernel.ops.utils import (
- calculate_settings,
- compare_version,
- ensure_contiguous,
- torch_to_triton_dtype,
- )
+ from liger_kernel.ops.utils import calculate_settings
+ from liger_kernel.ops.utils import compare_version
+ from liger_kernel.ops.utils import ensure_contiguous
+ from liger_kernel.ops.utils import torch_to_triton_dtype

  if compare_version("triton", operator.ge, "3.0.0"):
  try:
@@ -35,9 +33,9 @@ else:
  from triton.language.math import rsqrt


- _CASTING_MODE_NONE = tl.constexpr(-1)
- _CASTING_MODE_LLAMA = tl.constexpr(0)
- _CASTING_MODE_GEMMA = tl.constexpr(1)
+ _CASTING_MODE_NONE: tl.constexpr = tl.constexpr(-1)
+ _CASTING_MODE_LLAMA: tl.constexpr = tl.constexpr(0)
+ _CASTING_MODE_GEMMA: tl.constexpr = tl.constexpr(1)


  @triton.jit
@@ -177,9 +175,7 @@ def _rms_norm_backward_kernel(

  dX_row = rstd_row * m

- dX_row += (rstd_row) * (
- -(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row
- )
+ dX_row += (rstd_row) * (-(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row)

  # calculate the gradient of W
  if casting_mode == _CASTING_MODE_LLAMA:
@@ -207,14 +203,10 @@ _str_to_casting_mode = {

  def rms_norm_forward(X, W, eps, offset, casting_mode):
  if not isinstance(casting_mode, int):
- assert (
- casting_mode in _str_to_casting_mode
- ), f"Invalid casting mode: {casting_mode}"
+ assert casting_mode in _str_to_casting_mode, f"Invalid casting mode: {casting_mode}"
  casting_mode = _str_to_casting_mode[casting_mode]
  else:
- assert (
- casting_mode in _str_to_casting_mode.values()
- ), f"Invalid casting mode: {casting_mode}"
+ assert casting_mode in _str_to_casting_mode.values(), f"Invalid casting mode: {casting_mode}"

  shape = X.shape
  dim = shape[-1]
@@ -225,17 +217,11 @@ def rms_norm_forward(X, W, eps, offset, casting_mode):
  Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
  # RSTD is to cache rstd for each row
  # RSTD is always computed/stored in fp32 if we are using Llama or Gemma casting mode
- rstd_dtype = (
- torch.float32
- if casting_mode in (_CASTING_MODE_LLAMA.value, _CASTING_MODE_GEMMA.value)
- else X.dtype
- )
+ rstd_dtype = torch.float32 if casting_mode in (_CASTING_MODE_LLAMA.value, _CASTING_MODE_GEMMA.value) else X.dtype
  RSTD = torch.empty(n_rows, dtype=rstd_dtype, device=X.device)

  # Check constraints.
- assert (
- X.shape[1] == W.shape[0]
- ), "Incompatible hidden size dimension between tensor1.shape[1] and tensor2.shape[0]"
+ assert X.shape[1] == W.shape[0], "Incompatible hidden size dimension between tensor1.shape[1] and tensor2.shape[0]"

  _rms_norm_forward_kernel[(n_rows,)](
  Y,
@@ -256,9 +242,7 @@ def rms_norm_forward(X, W, eps, offset, casting_mode):
  return Y.view(*shape), X, RSTD, BLOCK_SIZE, num_warps, casting_mode


- def rms_norm_backward(
- dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warps, in_place
- ):
+ def rms_norm_backward(dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warps, in_place):
  shape = dY.shape
  dim = shape[-1]
  dY = dY.view(-1, dim)
@@ -340,9 +324,7 @@ class LigerRMSNormFunction(torch.autograd.Function):
  X: (B, T, H) or (BxT, H)
  W: (H,)
  """
- Y, X, RSTD, BLOCK_SIZE, num_warps, casting_mode = rms_norm_forward(
- X, W, eps, offset, casting_mode
- )
+ Y, X, RSTD, BLOCK_SIZE, num_warps, casting_mode = rms_norm_forward(X, W, eps, offset, casting_mode)
  ctx.offset = offset
  ctx.casting_mode = casting_mode
  ctx.in_place = in_place
liger_kernel/ops/rope.py CHANGED
@@ -15,6 +15,7 @@ def _triton_rope(
  sin_row_stride,
  sl,
  bs: tl.constexpr,
+ cos_bs: tl.constexpr,
  n_qh: tl.constexpr,
  n_kh: tl.constexpr,
  hd: tl.constexpr,
@@ -29,7 +30,7 @@ def _triton_rope(
  # k size: (bsz, seq_len, num_kv_heads, head_dim)
  # k stride: (seq_len * num_kv_heads * head_dim, num_kv_heads * head_dim, head_dim, 1)

- # cos size: (1, seq_len, head_dim)
+ # cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
  # stride: (seq_len * head_dim, head_dim, 1)
  pid = tl.program_id(0)

@@ -48,9 +49,19 @@ def _triton_rope(
  # and pid % sl to get the sequence index.
  # 2. We only need the left half of cos and sin matrix because the right half is just
  # a clone of the left half.
- cos_row_idx = pid % (sl)
- cos = cos + cos_row_idx * cos_row_stride
- sin = sin + cos_row_idx * sin_row_stride
+ batch_idx = pid // sl
+ cos_row_idx = pid % sl
+ cos = cos + tl.where(
+ cos_bs == 1,
+ cos_row_idx * cos_row_stride,
+ batch_idx * (sl * cos_row_stride) + cos_row_idx * cos_row_stride,
+ )
+ sin = sin + tl.where(
+ cos_bs == 1,
+ cos_row_idx * sin_row_stride,
+ batch_idx * (sl * sin_row_stride) + cos_row_idx * sin_row_stride,
+ )
+
  cos_offsets = tl.arange(0, pad_hd // 2)
  cos_mask = cos_offsets < hd // 2
  cos_row = tl.load(cos + cos_offsets, mask=cos_mask, other=0)
@@ -61,36 +72,20 @@ def _triton_rope(
  # program instance (i.e. for the current token) separately
  # ####################################################################
  # left half of the head
- first_half_q_offsets = (
- tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
- )
- first_half_k_offsets = (
- tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
- )
- first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (
- tl.arange(0, pad_hd // 2)[None, :] < hd // 2
- )
- first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (
- tl.arange(0, pad_hd // 2)[None, :] < hd // 2
- )
- q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(
- sin_row.dtype
- )
- k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(
- sin_row.dtype
- )
+ first_half_q_offsets = tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+ first_half_k_offsets = tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+ first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (tl.arange(0, pad_hd // 2)[None, :] < hd // 2)
+ first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (tl.arange(0, pad_hd // 2)[None, :] < hd // 2)
+ q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(sin_row.dtype)
+ k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(sin_row.dtype)

  # right half of the head
  second_half_q_offsets = first_half_q_offsets + (hd // 2)
  second_half_k_offsets = first_half_k_offsets + (hd // 2)
  second_q_mask = first_q_mask
  second_k_mask = first_k_mask
- q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(
- sin_row.dtype
- )
- k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(
- sin_row.dtype
- )
+ q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(sin_row.dtype)
+ k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(sin_row.dtype)

  if not BACKWARD_PASS:
  # y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin]
@@ -118,7 +113,6 @@ def _triton_rope(


  def rope_forward(q, k, cos, sin):
-
  # transpose it back to the physical shape because Triton looks at the physical storage
  # note: q and k are incontiguous before the transformation and will become contiguous after transpose
  q = q.transpose(1, 2)
@@ -138,6 +132,7 @@ def rope_forward(q, k, cos, sin):
  k = k.contiguous()
  cos = cos.contiguous()
  sin = sin.contiguous()
+ cos_batch_size = cos.shape[0]

  _triton_rope[(n_row,)](
  q,
@@ -150,6 +145,7 @@ def rope_forward(q, k, cos, sin):
  sin.stride(-2),
  seq_len,
  batch_size,
+ cos_batch_size,
  n_q_head,
  n_kv_head,
  head_dim,
@@ -167,6 +163,7 @@ def rope_backward(dq, dk, cos, sin):
  dk = dk.transpose(1, 2)

  batch_size, seq_len, n_q_head, head_dim = dq.shape
+ cos_batch_size = cos.shape[0]
  n_kv_head = dk.shape[2]
  pad_hd = triton.next_power_of_2(head_dim)
  pad_n_q_head = triton.next_power_of_2(n_q_head)
@@ -191,6 +188,7 @@ def rope_backward(dq, dk, cos, sin):
  sin.stride(-2),
  seq_len,
  batch_size,
+ cos_batch_size,
  n_q_head,
  n_kv_head,
  head_dim,
@@ -221,8 +219,8 @@ class LigerRopeFunction(torch.autograd.Function):
  """
  q size: (bsz, n_q_head, seq_len, head_dim)
  k size: (bsz, n_kv_head, seq_len, head_dim)
- cos size: (1, seq_len, head_dim)
- sin size: (1, seq_len, head_dim)
+ cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+ sin size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
  """
  q, k, cos, sin = rope_forward(q, k, cos, sin)
  ctx.save_for_backward(cos, sin)
@@ -232,8 +230,8 @@ class LigerRopeFunction(torch.autograd.Function):
  """
  dq size: (bsz, n_q_head, seq_len, head_dim)
  dk size: (bsz, n_kv_head, seq_len, head_dim)
- cos size: (1, seq_len, head_dim)
- sin size: (1, seq_len, head_dim)
+ cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+ sin size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
  """

  cos, sin = ctx.saved_tensors
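To make the new `cos_bs` handling in `_triton_rope` concrete, here is a pure-Python restatement of the offset selection that the `tl.where` above performs (illustrative seq_len and stride values only):

```python
def cos_row_offset(pid, sl, cos_row_stride, cos_bs):
    # pid walks batch * seq_len rows; the batch offset is applied only when the
    # cos/sin tables actually carry a batch dimension (cos_bs > 1).
    batch_idx = pid // sl
    cos_row_idx = pid % sl
    if cos_bs == 1:
        return cos_row_idx * cos_row_stride  # shared table, broadcast over the batch
    return batch_idx * (sl * cos_row_stride) + cos_row_idx * cos_row_stride  # per-batch row

# Example: seq_len=4, row stride=8, token pid=6 (batch 1, position 2).
print(cos_row_offset(6, 4, 8, cos_bs=1))  # 16 -> same row reused for every batch
print(cos_row_offset(6, 4, 8, cos_bs=2))  # 48 -> batch 1's own row
```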