liger-kernel-nightly 0.5.2.dev20241223032015__py3-none-any.whl → 0.5.2.dev20241223042135__py3-none-any.whl
- liger_kernel/chunked_loss/cpo_loss.py +5 -11
- liger_kernel/chunked_loss/dpo_loss.py +1 -4
- liger_kernel/chunked_loss/fused_linear_distillation.py +37 -37
- liger_kernel/chunked_loss/fused_linear_preference.py +40 -64
- liger_kernel/chunked_loss/orpo_loss.py +2 -6
- liger_kernel/chunked_loss/simpo_loss.py +4 -8
- liger_kernel/env_report.py +4 -11
- liger_kernel/ops/cross_entropy.py +7 -10
- liger_kernel/ops/experimental/embedding.py +1 -3
- liger_kernel/ops/experimental/mm_int8int2.py +3 -9
- liger_kernel/ops/fused_linear_cross_entropy.py +7 -15
- liger_kernel/ops/fused_linear_jsd.py +11 -29
- liger_kernel/ops/geglu.py +6 -17
- liger_kernel/ops/group_norm.py +11 -28
- liger_kernel/ops/jsd.py +2 -6
- liger_kernel/ops/kl_div.py +4 -7
- liger_kernel/ops/layer_norm.py +3 -5
- liger_kernel/ops/qwen2vl_mrope.py +8 -25
- liger_kernel/ops/rms_norm.py +11 -29
- liger_kernel/ops/rope.py +31 -33
- liger_kernel/ops/swiglu.py +4 -8
- liger_kernel/ops/utils.py +2 -0
- liger_kernel/transformers/__init__.py +16 -24
- liger_kernel/transformers/auto_model.py +6 -13
- liger_kernel/transformers/cross_entropy.py +1 -3
- liger_kernel/transformers/experimental/embedding.py +1 -3
- liger_kernel/transformers/functional.py +2 -6
- liger_kernel/transformers/fused_linear_cross_entropy.py +2 -6
- liger_kernel/transformers/geglu.py +1 -4
- liger_kernel/transformers/group_norm.py +3 -9
- liger_kernel/transformers/jsd.py +1 -3
- liger_kernel/transformers/kl_div.py +1 -3
- liger_kernel/transformers/layer_norm.py +3 -9
- liger_kernel/transformers/model/gemma.py +18 -40
- liger_kernel/transformers/model/gemma2.py +19 -41
- liger_kernel/transformers/model/llama.py +22 -48
- liger_kernel/transformers/model/mistral.py +14 -26
- liger_kernel/transformers/model/mixtral.py +23 -53
- liger_kernel/transformers/model/mllama.py +16 -36
- liger_kernel/transformers/model/phi3.py +18 -40
- liger_kernel/transformers/model/qwen2.py +18 -40
- liger_kernel/transformers/model/qwen2_vl.py +16 -30
- liger_kernel/transformers/monkey_patch.py +43 -117
- liger_kernel/transformers/rms_norm.py +4 -4
- liger_kernel/transformers/rope.py +2 -2
- liger_kernel/transformers/swiglu.py +2 -8
- liger_kernel/transformers/trainer/__init__.py +1 -3
- liger_kernel/transformers/trainer/orpo_trainer.py +13 -16
- liger_kernel/triton/__init__.py +1 -3
- liger_kernel/triton/monkey_patch.py +1 -3
- {liger_kernel_nightly-0.5.2.dev20241223032015.dist-info → liger_kernel_nightly-0.5.2.dev20241223042135.dist-info}/METADATA +1 -1
- liger_kernel_nightly-0.5.2.dev20241223042135.dist-info/RECORD +66 -0
- liger_kernel_nightly-0.5.2.dev20241223032015.dist-info/RECORD +0 -66
- {liger_kernel_nightly-0.5.2.dev20241223032015.dist-info → liger_kernel_nightly-0.5.2.dev20241223042135.dist-info}/LICENSE +0 -0
- {liger_kernel_nightly-0.5.2.dev20241223032015.dist-info → liger_kernel_nightly-0.5.2.dev20241223042135.dist-info}/NOTICE +0 -0
- {liger_kernel_nightly-0.5.2.dev20241223032015.dist-info → liger_kernel_nightly-0.5.2.dev20241223042135.dist-info}/WHEEL +0 -0
- {liger_kernel_nightly-0.5.2.dev20241223032015.dist-info → liger_kernel_nightly-0.5.2.dev20241223042135.dist-info}/top_level.txt +0 -0
liger_kernel/ops/geglu.py
CHANGED
@@ -4,11 +4,9 @@ import torch
 import triton
 import triton.language as tl
 
-from liger_kernel.ops.utils import (
-    calculate_settings,
-    compare_version,
-    ensure_contiguous,
-)
+from liger_kernel.ops.utils import calculate_settings
+from liger_kernel.ops.utils import compare_version
+from liger_kernel.ops.utils import ensure_contiguous
 
 if compare_version("triton", operator.ge, "3.0.0"):
     try:
@@ -22,9 +20,7 @@ else:
 
 
 @triton.jit
-def _geglu_tanh_forward_kernel(
-    a, b, c, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr
-):
+def _geglu_tanh_forward_kernel(a, b, c, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr):
     program_id = tl.program_id(0).to(tl.int64)
 
     # locate start index
@@ -49,9 +45,7 @@ def _geglu_tanh_forward_kernel(
 
 
 @triton.jit
-def _geglu_tanh_backward_kernel(
-    dc, a, b, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr
-):
+def _geglu_tanh_backward_kernel(dc, a, b, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr):
     program_id = tl.program_id(0).to(tl.int64)
 
     # locate start index
@@ -80,12 +74,7 @@ def _geglu_tanh_backward_kernel(
     # where z = sqrt(2/pi) * (a + 0.044715 * a^3)
     term1 = 0.5 * (1 + tanh_result)
     tanh_sq = tanh_result * tanh_result
-    term2 = (
-        0.5
-        * a_row
-        * (1 - tanh_sq)
-        * (sqrt_2_over_pi * (1 + 3 * 0.044715 * a_row * a_row))
-    )
+    term2 = 0.5 * a_row * (1 - tanh_sq) * (sqrt_2_over_pi * (1 + 3 * 0.044715 * a_row * a_row))
     da_row = dc_row * b_row * (term1 + term2)
 
     tl.store(a + col_offsets, da_row, mask=mask)
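
The collapsed term1/term2 lines above are the derivative of the tanh-approximated GELU that gates GEGLU (c = b * gelu_tanh(a)). As a sanity reference, here is a minimal PyTorch sketch of the same per-row math; geglu_tanh_backward_reference is an illustrative name, not part of the library.

import math

import torch


def geglu_tanh_backward_reference(dc: torch.Tensor, a: torch.Tensor, b: torch.Tensor):
    # Sketch only: GEGLU computes c = b * gelu_tanh(a), with
    # gelu_tanh(a) = 0.5 * a * (1 + tanh(z)) and z = sqrt(2/pi) * (a + 0.044715 * a^3),
    # using the same constants that appear in the Triton kernel above.
    sqrt_2_over_pi = math.sqrt(2.0 / math.pi)
    tanh_result = torch.tanh(sqrt_2_over_pi * (a + 0.044715 * a * a * a))
    term1 = 0.5 * (1 + tanh_result)  # derivative contribution of the 0.5 * a factor
    term2 = 0.5 * a * (1 - tanh_result * tanh_result) * (sqrt_2_over_pi * (1 + 3 * 0.044715 * a * a))
    da = dc * b * (term1 + term2)        # matches da_row = dc_row * b_row * (term1 + term2)
    db = dc * 0.5 * a * (1 + tanh_result)  # gradient w.r.t. the linear branch, dc * gelu_tanh(a)
    return da, db
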
liger_kernel/ops/group_norm.py
CHANGED
@@ -4,7 +4,8 @@ import torch
 import triton
 import triton.language as tl
 
-from liger_kernel.ops.utils import compare_version, ensure_contiguous
+from liger_kernel.ops.utils import compare_version
+from liger_kernel.ops.utils import ensure_contiguous
 
 if compare_version("triton", operator.ge, "3.0.0"):
     try:
@@ -73,9 +74,7 @@ def _group_norm_forward_kernel(
 
     # Normalize
     hidden_size_per_channel = hidden_size // channels_per_group
-    for channel_idx in tl.range(
-        group_idx * channels_per_group, (group_idx + 1) * channels_per_group
-    ):
+    for channel_idx in tl.range(group_idx * channels_per_group, (group_idx + 1) * channels_per_group):
         W = tl.load(W_ptr + channel_idx)
         B = tl.load(B_ptr + channel_idx)
         for i in range(0, hidden_size_per_channel, BLOCK_SIZE):
@@ -132,21 +131,15 @@ def _group_norm_backward_kernel(
     UPSTREAM_ptr += batch_idx * X_row_stride
 
     # Mean and rstd are the same shape so have the same strides
-    mean = tl.load(
-        Mean_ptr + batch_idx * Mean_ptr_row_stride + group_idx * Mean_ptr_col_stride
-    )
-    rstd = tl.load(
-        RSTD_ptr + batch_idx * Mean_ptr_row_stride + group_idx * Mean_ptr_col_stride
-    )
+    mean = tl.load(Mean_ptr + batch_idx * Mean_ptr_row_stride + group_idx * Mean_ptr_col_stride)
+    rstd = tl.load(RSTD_ptr + batch_idx * Mean_ptr_row_stride + group_idx * Mean_ptr_col_stride)
 
     c1 = 0.0
     c2 = 0.0
     block_range = tl.arange(0, BLOCK_SIZE)
 
     # We need to compute the sum terms of the backprop equations across all channels in the group
-    for channel_idx in range(
-        group_idx * channels_per_group, (group_idx + 1) * channels_per_group
-    ):
+    for channel_idx in range(group_idx * channels_per_group, (group_idx + 1) * channels_per_group):
         dW = 0.0
         dB = 0.0
         # Move the pointers to the correct channel
@@ -181,9 +174,7 @@ def _group_norm_backward_kernel(
     c1 = c1 / N
     c2 = c2 / N
 
-    for channel_idx in tl.range(
-        group_idx * channels_per_group, (group_idx + 1) * channels_per_group
-    ):
+    for channel_idx in tl.range(group_idx * channels_per_group, (group_idx + 1) * channels_per_group):
         # Move the pointers to the correct channel
         W = tl.load(W_ptr + channel_idx)
         for i in range(0, hidden_size, BLOCK_SIZE):
@@ -203,9 +194,7 @@ def _group_norm_backward_kernel(
             x_hat = (X - mean) * rstd
            wdy = W * UPSTREAM_grad
             dx = (wdy - (x_hat * c1 + c2)) * rstd
-            tl.store(
-                DX_ptr + channel_idx * X_col_stride + hidden_size_offsets, dx, mask=mask
-            )
+            tl.store(DX_ptr + channel_idx * X_col_stride + hidden_size_offsets, dx, mask=mask)
 
 
 def group_norm_forward(X, num_channels, num_groups, W, B, eps):
@@ -216,9 +205,7 @@ def group_norm_forward(X, num_channels, num_groups, W, B, eps):
     X = X.view(batch_size, num_groups, -1).contiguous()
     hidden_size = X.shape[-1]
     BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(hidden_size))
-    Y = torch.empty(
-        (batch_size, num_groups, hidden_size), dtype=X.dtype, device=X.device
-    )
+    Y = torch.empty((batch_size, num_groups, hidden_size), dtype=X.dtype, device=X.device)
     Mean = torch.zeros((batch_size, num_groups), dtype=X.dtype, device=X.device)
     RSTD = torch.zeros((batch_size, num_groups), dtype=X.dtype, device=X.device)
 
@@ -307,16 +294,12 @@ class LigerGroupNormFunction(torch.autograd.Function):
         )
         ctx.num_channels = num_channels
         ctx.num_groups = num_groups
-        ctx.save_for_backward(
-            X, affine_scaling_weight, affine_shifting_bias, Mean, RSTD
-        )
+        ctx.save_for_backward(X, affine_scaling_weight, affine_shifting_bias, Mean, RSTD)
         return Y
 
     @staticmethod
     @ensure_contiguous
     def backward(ctx, dY):
         X, W, B, Mean, RSTD = ctx.saved_tensors
-        DX, DW, DB = group_norm_backward(
-            dY, X, W, B, Mean, RSTD, ctx.num_channels, ctx.num_groups
-        )
+        DX, DW, DB = group_norm_backward(dY, X, W, B, Mean, RSTD, ctx.num_channels, ctx.num_groups)
         return DX, DW, DB, None, None, None
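
For readers checking the backward math above (x_hat = (X - mean) * rstd, dx = (wdy - (x_hat * c1 + c2)) * rstd), a plain-PyTorch sketch of the forward pass these kernels fuse may help. It mirrors the (batch_size, num_groups, hidden_size) view used in group_norm_forward; group_norm_reference is a made-up name, not the library's API.

import torch


def group_norm_reference(X: torch.Tensor, W: torch.Tensor, B: torch.Tensor, num_groups: int, eps: float = 1e-6):
    # X: (batch_size, num_channels, *spatial); W, B: (num_channels,) affine scale/shift.
    batch_size, num_channels = X.shape[0], X.shape[1]
    x = X.reshape(batch_size, num_groups, -1)              # same view as group_norm_forward
    mean = x.mean(dim=-1, keepdim=True)                    # per-(batch, group) statistics
    rstd = torch.rsqrt(x.var(dim=-1, unbiased=False, keepdim=True) + eps)
    x_hat = (x - mean) * rstd                              # matches x_hat in the backward kernel
    y = x_hat.reshape(batch_size, num_channels, -1)
    return (y * W.view(1, -1, 1) + B.view(1, -1, 1)).reshape(X.shape)
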
liger_kernel/ops/jsd.py
CHANGED
@@ -98,9 +98,7 @@ def jsd_forward(_input, target, shift_labels, beta, ignore_index, has_label):
         loss_stride=loss.stride(-2),
         dX_ptr=dX,
         dX_stride=dX.stride(-2),
-        label_ptr=(
-            shift_labels if has_label else torch.empty(1, device=_input.device)
-        ),  # dummy ptr if no label
+        label_ptr=(shift_labels if has_label else torch.empty(1, device=_input.device)),  # dummy ptr if no label
         beta=beta,
         n_non_ignore=n_non_ignore,
         ignore_index=ignore_index,
@@ -165,9 +163,7 @@ class LigerJSDFunction(torch.autograd.Function):
             shift_labels = shift_labels.contiguous()
             has_label = True
 
-        loss, dX = jsd_forward(
-            _input, target, shift_labels, beta, ignore_index, has_label
-        )
+        loss, dX = jsd_forward(_input, target, shift_labels, beta, ignore_index, has_label)
         ctx.save_for_backward(dX)
         return loss
 
liger_kernel/ops/kl_div.py
CHANGED
@@ -4,7 +4,8 @@ import torch
 import triton
 import triton.language as tl
 
-from liger_kernel.ops.utils import ensure_contiguous, is_hip
+from liger_kernel.ops.utils import ensure_contiguous
+from liger_kernel.ops.utils import is_hip
 
 
 def get_num_warps(BLOCK_SIZE):
@@ -218,9 +219,7 @@ class LigerKLDivLossFunction(torch.autograd.Function):
         ctx.save_for_backward(y_true)
         ctx.reduction = reduction
         ctx.log_target = log_target
-        return kldiv_forward_triton(
-            y_pred, y_true, log_target=log_target, reduction=reduction, eps=eps
-        )
+        return kldiv_forward_triton(y_pred, y_true, log_target=log_target, reduction=reduction, eps=eps)
 
     @staticmethod
     @ensure_contiguous
@@ -238,9 +237,7 @@ class LigerKLDivLossFunction(torch.autograd.Function):
 
         new_grads = torch.empty_like(y_true)
 
-        derivative = kldiv_backward_triton(
-            y_true, grad_output, new_grads, ctx.log_target
-        )
+        derivative = kldiv_backward_triton(y_true, grad_output, new_grads, ctx.log_target)
 
         if ctx.reduction == "batchmean":
             derivative = derivative / y_true.shape[0]
liger_kernel/ops/layer_norm.py
CHANGED
@@ -5,11 +5,9 @@ import torch
 import triton
 import triton.language as tl
 
-from liger_kernel.ops.utils import (
-    calculate_settings,
-    compare_version,
-    ensure_contiguous,
-)
+from liger_kernel.ops.utils import calculate_settings
+from liger_kernel.ops.utils import compare_version
+from liger_kernel.ops.utils import ensure_contiguous
 
 if compare_version("triton", operator.ge, "3.0.0"):
     try:
liger_kernel/ops/qwen2vl_mrope.py
CHANGED
@@ -67,36 +67,20 @@ def _triton_qwen2vl_mrope(
     # program instance (i.e. for the current token) separately
     # ####################################################################
     # left half of the head
-    first_half_q_offsets = (
-        tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
-    )
-    first_half_k_offsets = (
-        tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
-    )
-    first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (
-        tl.arange(0, pad_hd // 2)[None, :] < hd // 2
-    )
-    first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (
-        tl.arange(0, pad_hd // 2)[None, :] < hd // 2
-    )
-    q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(
-        sin_row.dtype
-    )
-    k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(
-        sin_row.dtype
-    )
+    first_half_q_offsets = tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+    first_half_k_offsets = tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+    first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (tl.arange(0, pad_hd // 2)[None, :] < hd // 2)
+    first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (tl.arange(0, pad_hd // 2)[None, :] < hd // 2)
+    q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(sin_row.dtype)
+    k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(sin_row.dtype)
 
     # right half of the head
     second_half_q_offsets = first_half_q_offsets + (hd // 2)
     second_half_k_offsets = first_half_k_offsets + (hd // 2)
     second_q_mask = first_q_mask
     second_k_mask = first_k_mask
-    q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(
-        sin_row.dtype
-    )
-    k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(
-        sin_row.dtype
-    )
+    q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(sin_row.dtype)
+    k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(sin_row.dtype)
 
     if not BACKWARD_PASS:
         # y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin]
@@ -124,7 +108,6 @@ def _triton_qwen2vl_mrope(
 
 
 def qwen2vl_mrope_forward(q, k, cos, sin, mrope_section):
-
     # transpose it back to the physical shape because Triton looks at the physical storage
     # note: q and k are incontiguous before the transformation and will become contiguous after transpose
     q = q.transpose(1, 2)
liger_kernel/ops/rms_norm.py
CHANGED
@@ -17,12 +17,10 @@ import torch
 import triton
 import triton.language as tl
 
-from liger_kernel.ops.utils import (
-    calculate_settings,
-    compare_version,
-    ensure_contiguous,
-    torch_to_triton_dtype,
-)
+from liger_kernel.ops.utils import calculate_settings
+from liger_kernel.ops.utils import compare_version
+from liger_kernel.ops.utils import ensure_contiguous
+from liger_kernel.ops.utils import torch_to_triton_dtype
 
 if compare_version("triton", operator.ge, "3.0.0"):
     try:
@@ -177,9 +175,7 @@ def _rms_norm_backward_kernel(
 
         dX_row = rstd_row * m
 
-        dX_row += (rstd_row) * (
-            -(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row
-        )
+        dX_row += (rstd_row) * (-(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row)
 
         # calculate the gradient of W
         if casting_mode == _CASTING_MODE_LLAMA:
@@ -207,14 +203,10 @@ _str_to_casting_mode = {
 
 def rms_norm_forward(X, W, eps, offset, casting_mode):
     if not isinstance(casting_mode, int):
-        assert (
-            casting_mode in _str_to_casting_mode
-        ), f"Invalid casting mode: {casting_mode}"
+        assert casting_mode in _str_to_casting_mode, f"Invalid casting mode: {casting_mode}"
         casting_mode = _str_to_casting_mode[casting_mode]
     else:
-        assert (
-            casting_mode in _str_to_casting_mode.values()
-        ), f"Invalid casting mode: {casting_mode}"
+        assert casting_mode in _str_to_casting_mode.values(), f"Invalid casting mode: {casting_mode}"
 
     shape = X.shape
     dim = shape[-1]
@@ -225,17 +217,11 @@ def rms_norm_forward(X, W, eps, offset, casting_mode):
     Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
     # RSTD is to cache rstd for each row
     # RSTD is always computed/stored in fp32 if we are using Llama or Gemma casting mode
-    rstd_dtype = (
-        torch.float32
-        if casting_mode in (_CASTING_MODE_LLAMA.value, _CASTING_MODE_GEMMA.value)
-        else X.dtype
-    )
+    rstd_dtype = torch.float32 if casting_mode in (_CASTING_MODE_LLAMA.value, _CASTING_MODE_GEMMA.value) else X.dtype
     RSTD = torch.empty(n_rows, dtype=rstd_dtype, device=X.device)
 
     # Check constraints.
-    assert (
-        X.shape[1] == W.shape[0]
-    ), "Incompatible hidden size dimension between tensor1.shape[1] and tensor2.shape[0]"
+    assert X.shape[1] == W.shape[0], "Incompatible hidden size dimension between tensor1.shape[1] and tensor2.shape[0]"
 
     _rms_norm_forward_kernel[(n_rows,)](
         Y,
@@ -256,9 +242,7 @@ def rms_norm_forward(X, W, eps, offset, casting_mode):
     return Y.view(*shape), X, RSTD, BLOCK_SIZE, num_warps, casting_mode
 
 
-def rms_norm_backward(
-    dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warps, in_place
-):
+def rms_norm_backward(dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warps, in_place):
     shape = dY.shape
     dim = shape[-1]
     dY = dY.view(-1, dim)
@@ -340,9 +324,7 @@ class LigerRMSNormFunction(torch.autograd.Function):
         X: (B, T, H) or (BxT, H)
         W: (H,)
         """
-        Y, X, RSTD, BLOCK_SIZE, num_warps, casting_mode = rms_norm_forward(
-            X, W, eps, offset, casting_mode
-        )
+        Y, X, RSTD, BLOCK_SIZE, num_warps, casting_mode = rms_norm_forward(X, W, eps, offset, casting_mode)
         ctx.offset = offset
         ctx.casting_mode = casting_mode
         ctx.in_place = in_place
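
The reshuffled lines above concern bookkeeping (which dtype the cached per-row statistic RSTD uses for the Llama/Gemma casting modes, and the offset/casting_mode arguments); the underlying math is plain RMSNorm. A minimal sketch of that math, assuming the common "(offset + weight)" scaling used for Gemma-style norms and ignoring the dtype casting details; rms_norm_reference is an illustrative name only.

import torch


def rms_norm_reference(X: torch.Tensor, W: torch.Tensor, eps: float = 1e-6, offset: float = 0.0) -> torch.Tensor:
    # rstd is the per-row statistic the Triton kernel caches as RSTD.
    rstd = torch.rsqrt(X.pow(2).mean(dim=-1, keepdim=True) + eps)
    return X * rstd * (offset + W)  # offset=1.0 reproduces the "1 + weight" style of scaling
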
liger_kernel/ops/rope.py
CHANGED
@@ -15,6 +15,7 @@ def _triton_rope(
     sin_row_stride,
     sl,
     bs: tl.constexpr,
+    cos_bs: tl.constexpr,
     n_qh: tl.constexpr,
     n_kh: tl.constexpr,
     hd: tl.constexpr,
@@ -29,7 +30,7 @@ def _triton_rope(
     # k size: (bsz, seq_len, num_kv_heads, head_dim)
     # k stride: (seq_len * num_kv_heads * head_dim, num_kv_heads * head_dim, head_dim, 1)
 
-    # cos size: (1, seq_len, head_dim)
+    # cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
     # stride: (seq_len * head_dim, head_dim, 1)
     pid = tl.program_id(0)
 
@@ -48,9 +49,19 @@ def _triton_rope(
     # and pid % sl to get the sequence index.
     # 2. We only need the left half of cos and sin matrix because the right half is just
     # a clone of the left half.
-    cos_row_idx = pid % sl
-    cos = cos + cos_row_idx * cos_row_stride
-    sin = sin + cos_row_idx * sin_row_stride
+    batch_idx = pid // sl
+    cos_row_idx = pid % sl
+    cos = cos + tl.where(
+        cos_bs == 1,
+        cos_row_idx * cos_row_stride,
+        batch_idx * (sl * cos_row_stride) + cos_row_idx * cos_row_stride,
+    )
+    sin = sin + tl.where(
+        cos_bs == 1,
+        cos_row_idx * sin_row_stride,
+        batch_idx * (sl * sin_row_stride) + cos_row_idx * sin_row_stride,
+    )
+
     cos_offsets = tl.arange(0, pad_hd // 2)
     cos_mask = cos_offsets < hd // 2
     cos_row = tl.load(cos + cos_offsets, mask=cos_mask, other=0)
@@ -61,36 +72,20 @@ def _triton_rope(
     # program instance (i.e. for the current token) separately
     # ####################################################################
     # left half of the head
-    first_half_q_offsets = (
-        tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
-    )
-    first_half_k_offsets = (
-        tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
-    )
-    first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (
-        tl.arange(0, pad_hd // 2)[None, :] < hd // 2
-    )
-    first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (
-        tl.arange(0, pad_hd // 2)[None, :] < hd // 2
-    )
-    q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(
-        sin_row.dtype
-    )
-    k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(
-        sin_row.dtype
-    )
+    first_half_q_offsets = tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+    first_half_k_offsets = tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+    first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (tl.arange(0, pad_hd // 2)[None, :] < hd // 2)
+    first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (tl.arange(0, pad_hd // 2)[None, :] < hd // 2)
+    q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(sin_row.dtype)
+    k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(sin_row.dtype)
 
     # right half of the head
     second_half_q_offsets = first_half_q_offsets + (hd // 2)
     second_half_k_offsets = first_half_k_offsets + (hd // 2)
     second_q_mask = first_q_mask
     second_k_mask = first_k_mask
-    q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(
-        sin_row.dtype
-    )
-    k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(
-        sin_row.dtype
-    )
+    q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(sin_row.dtype)
+    k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(sin_row.dtype)
 
     if not BACKWARD_PASS:
         # y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin]
@@ -118,7 +113,6 @@ def _triton_rope(
 
 
 def rope_forward(q, k, cos, sin):
-
     # transpose it back to the physical shape because Triton looks at the physical storage
     # note: q and k are incontiguous before the transformation and will become contiguous after transpose
     q = q.transpose(1, 2)
@@ -138,6 +132,7 @@ def rope_forward(q, k, cos, sin):
     k = k.contiguous()
     cos = cos.contiguous()
     sin = sin.contiguous()
+    cos_batch_size = cos.shape[0]
 
     _triton_rope[(n_row,)](
         q,
@@ -150,6 +145,7 @@ def rope_forward(q, k, cos, sin):
         sin.stride(-2),
         seq_len,
         batch_size,
+        cos_batch_size,
         n_q_head,
         n_kv_head,
         head_dim,
@@ -167,6 +163,7 @@ def rope_backward(dq, dk, cos, sin):
     dk = dk.transpose(1, 2)
 
     batch_size, seq_len, n_q_head, head_dim = dq.shape
+    cos_batch_size = cos.shape[0]
     n_kv_head = dk.shape[2]
     pad_hd = triton.next_power_of_2(head_dim)
     pad_n_q_head = triton.next_power_of_2(n_q_head)
@@ -191,6 +188,7 @@ def rope_backward(dq, dk, cos, sin):
         sin.stride(-2),
         seq_len,
         batch_size,
+        cos_batch_size,
         n_q_head,
         n_kv_head,
         head_dim,
@@ -221,8 +219,8 @@ class LigerRopeFunction(torch.autograd.Function):
         """
         q size: (bsz, n_q_head, seq_len, head_dim)
        k size: (bsz, n_kv_head, seq_len, head_dim)
-        cos size: (1, seq_len, head_dim)
-        sin size: (1, seq_len, head_dim)
+        cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+        sin size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
        """
         q, k, cos, sin = rope_forward(q, k, cos, sin)
         ctx.save_for_backward(cos, sin)
@@ -232,8 +230,8 @@ class LigerRopeFunction(torch.autograd.Function):
         """
         dq size: (bsz, n_q_head, seq_len, head_dim)
        dk size: (bsz, n_kv_head, seq_len, head_dim)
-        cos size: (1, seq_len, head_dim)
-        sin size: (1, seq_len, head_dim)
+        cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+        sin size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
        """
 
        cos, sin = ctx.saved_tensors
liger_kernel/ops/swiglu.py
CHANGED
@@ -2,7 +2,8 @@ import torch
 import triton
 import triton.language as tl
 
-from liger_kernel.ops.utils import calculate_settings, ensure_contiguous
+from liger_kernel.ops.utils import calculate_settings
+from liger_kernel.ops.utils import ensure_contiguous
 
 
 @triton.jit
@@ -11,9 +12,7 @@ def silu(x):
 
 
 @triton.jit
-def _swiglu_forward_kernel(
-    a_ptr, b_ptr, c_ptr, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr
-):
+def _swiglu_forward_kernel(a_ptr, b_ptr, c_ptr, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr):
     program_id = tl.program_id(0).to(tl.int64)
 
     # locate start index
@@ -32,9 +31,7 @@ def _swiglu_forward_kernel(
 
 
 @triton.jit
-def _swiglu_backward_kernel(
-    dc_ptr, a_ptr, b_ptr, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr
-):
+def _swiglu_backward_kernel(dc_ptr, a_ptr, b_ptr, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr):
     program_id = tl.program_id(0).to(tl.int64)
 
     # locate start index
@@ -84,7 +81,6 @@ def swiglu_forward(a, b):
 
 
 def swiglu_backward(a, b, dc):
-
     ori_shape = dc.shape
     n_cols = ori_shape[-1]
     dc = dc.view(-1, n_cols)
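
For reference, the fused _swiglu_forward_kernel computes the standard SwiGLU combination of a SiLU-gated branch with a linear branch; a one-line PyTorch equivalent (a sketch, not the kernel itself):

import torch
import torch.nn.functional as F


def swiglu_reference(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    # silu(a) * b: the gate path times the up-projection path.
    return F.silu(a) * b
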
liger_kernel/ops/utils.py
CHANGED
@@ -13,11 +13,13 @@ Modifications made by Yanning Chen, 2024.
 import functools
 import importlib
 import operator
+
 from typing import Callable
 
 import torch
 import triton
 import triton.language as tl
+
 from packaging.version import Version
 
 from liger_kernel.utils import infer_device
liger_kernel/transformers/__init__.py
CHANGED
@@ -1,31 +1,23 @@
-from liger_kernel.transformers.auto_model import (
-    AutoLigerKernelForCausalLM,
-)
+from liger_kernel.transformers.auto_model import AutoLigerKernelForCausalLM  # noqa: F401
 from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss  # noqa: F401
-from liger_kernel.transformers.fused_linear_cross_entropy import (
-    LigerFusedLinearCrossEntropyLoss,
-)
+from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss  # noqa: F401
 from liger_kernel.transformers.fused_linear_jsd import LigerFusedLinearJSD  # noqa: F401
 from liger_kernel.transformers.geglu import LigerGEGLUMLP  # noqa: F401
 from liger_kernel.transformers.jsd import LigerJSD  # noqa: F401
 from liger_kernel.transformers.layer_norm import LigerLayerNorm  # noqa: F401
-from liger_kernel.transformers.monkey_patch import (
-    _apply_liger_kernel,
-    _apply_liger_kernel_to_instance,
-    apply_liger_kernel_to_gemma,
-    apply_liger_kernel_to_gemma2,
-    apply_liger_kernel_to_llama,
-    apply_liger_kernel_to_mistral,
-    apply_liger_kernel_to_mixtral,
-    apply_liger_kernel_to_mllama,
-    apply_liger_kernel_to_phi3,
-    apply_liger_kernel_to_qwen2,
-    apply_liger_kernel_to_qwen2_vl,
-)
+from liger_kernel.transformers.monkey_patch import _apply_liger_kernel  # noqa: F401
+from liger_kernel.transformers.monkey_patch import _apply_liger_kernel_to_instance  # noqa: F401
+from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_gemma  # noqa: F401
+from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_gemma2  # noqa: F401
+from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llama  # noqa: F401
+from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_mistral  # noqa: F401
+from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_mixtral  # noqa: F401
+from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_mllama  # noqa: F401
+from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_phi3  # noqa: F401
+from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen2  # noqa: F401
+from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen2_vl  # noqa: F401
 from liger_kernel.transformers.rms_norm import LigerRMSNorm  # noqa: F401
 from liger_kernel.transformers.rope import liger_rotary_pos_emb  # noqa: F401
-from liger_kernel.transformers.swiglu import (
-    LigerBlockSparseTop2MLP,
-    LigerPhi3SwiGLUMLP,
-    LigerSwiGLUMLP,
-)
+from liger_kernel.transformers.swiglu import LigerBlockSparseTop2MLP  # noqa: F401
+from liger_kernel.transformers.swiglu import LigerPhi3SwiGLUMLP  # noqa: F401
+from liger_kernel.transformers.swiglu import LigerSwiGLUMLP  # noqa: F401
liger_kernel/transformers/auto_model.py
CHANGED
@@ -1,11 +1,10 @@
 import inspect
 
-from transformers import AutoConfig, AutoModelForCausalLM
+from transformers import AutoConfig
+from transformers import AutoModelForCausalLM
 
-from liger_kernel.transformers.monkey_patch import (
-    MODEL_TYPE_TO_APPLY_LIGER_FN,
-    _apply_liger_kernel,
-)
+from liger_kernel.transformers.monkey_patch import MODEL_TYPE_TO_APPLY_LIGER_FN
+from liger_kernel.transformers.monkey_patch import _apply_liger_kernel
 
 
 def _get_model_config(model_dir, **model_init_kwargs):
@@ -34,12 +33,6 @@ class AutoLigerKernelForCausalLM(AutoModelForCausalLM):
         apply_fn = MODEL_TYPE_TO_APPLY_LIGER_FN[model_type]
         apply_fn_signature = inspect.signature(apply_fn)
 
-        applicable_kwargs = {
-            key: value
-            for key, value in kwargs.items()
-            if key not in apply_fn_signature.parameters
-        }
+        applicable_kwargs = {key: value for key, value in kwargs.items() if key not in apply_fn_signature.parameters}
 
-        return super().from_pretrained(
-            pretrained_model_name_or_path, *model_args, **applicable_kwargs
-        )
+        return super().from_pretrained(pretrained_model_name_or_path, *model_args, **applicable_kwargs)
liger_kernel/transformers/cross_entropy.py
CHANGED
@@ -27,9 +27,7 @@ class LigerCrossEntropyLoss(torch.nn.Module):
             "sum",
             "none",
         }, f"reduction must be one of 'mean', 'sum', or 'none'. Got: {reduction}"
-        assert (
-            softcap is None or softcap > 0
-        ), f"softcap must greater than 0.0 or None. Got: {softcap}"
+        assert softcap is None or softcap > 0, f"softcap must greater than 0.0 or None. Got: {softcap}"
         self.ignore_index = ignore_index
         self.lse_square_scale = lse_square_scale
         self.label_smoothing = label_smoothing
liger_kernel/transformers/experimental/embedding.py
CHANGED
@@ -7,9 +7,7 @@ from liger_kernel.ops.experimental.embedding import LigerEmbeddingFunction
 
 
 class LigerEmbedding(nn.Module):
-    def __init__(
-        self, num_embeddings, embedding_dim, padding_idx: Optional[int] = None
-    ):
+    def __init__(self, num_embeddings, embedding_dim, padding_idx: Optional[int] = None):
         super().__init__()
         self.num_embeddings = num_embeddings
         self.embedding_dim = embedding_dim