liger-kernel 0.6.4__py3-none-any.whl → 0.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. liger_kernel/chunked_loss/cosine_similarity_loss.py +7 -1
  2. liger_kernel/chunked_loss/fused_linear_distillation.py +10 -3
  3. liger_kernel/chunked_loss/jsd_loss.py +21 -6
  4. liger_kernel/ops/__init__.py +141 -0
  5. liger_kernel/ops/backends/README.md +151 -0
  6. liger_kernel/ops/backends/__init__.py +13 -0
  7. liger_kernel/ops/backends/_ascend/__init__.py +5 -0
  8. liger_kernel/ops/backends/_ascend/ascend-ub-manager-design.md +492 -0
  9. liger_kernel/ops/backends/_ascend/ops/__init__.py +61 -0
  10. liger_kernel/ops/backends/_ascend/ops/embedding.py +214 -0
  11. liger_kernel/ops/backends/_ascend/ops/geglu.py +191 -0
  12. liger_kernel/ops/backends/_ascend/ops/llama4_rope.py +298 -0
  13. liger_kernel/ops/backends/_ascend/ops/qwen2vl_mrope.py +275 -0
  14. liger_kernel/ops/backends/_ascend/ops/rope.py +265 -0
  15. liger_kernel/ops/backends/_ascend/ops/swiglu.py +142 -0
  16. liger_kernel/ops/backends/_ascend/ops/tvd.py +223 -0
  17. liger_kernel/ops/backends/_ascend/ub_manager.py +367 -0
  18. liger_kernel/ops/backends/registry.py +61 -0
  19. liger_kernel/ops/cross_entropy.py +14 -4
  20. liger_kernel/ops/dyt.py +5 -2
  21. liger_kernel/ops/fused_add_rms_norm.py +21 -23
  22. liger_kernel/ops/fused_linear_cross_entropy.py +2 -1
  23. liger_kernel/ops/geglu.py +5 -3
  24. liger_kernel/ops/group_norm.py +12 -8
  25. liger_kernel/ops/kl_div.py +8 -11
  26. liger_kernel/ops/layer_norm.py +17 -16
  27. liger_kernel/ops/poly_norm.py +19 -21
  28. liger_kernel/ops/rms_norm.py +149 -71
  29. liger_kernel/ops/utils.py +25 -0
  30. liger_kernel/transformers/__init__.py +6 -0
  31. liger_kernel/transformers/auto_model.py +21 -0
  32. liger_kernel/transformers/cross_entropy.py +1 -1
  33. liger_kernel/transformers/dyt.py +1 -1
  34. liger_kernel/transformers/experimental/embedding.py +1 -1
  35. liger_kernel/transformers/functional.py +20 -20
  36. liger_kernel/transformers/fused_add_rms_norm.py +1 -1
  37. liger_kernel/transformers/fused_linear_cross_entropy.py +1 -1
  38. liger_kernel/transformers/fused_linear_jsd.py +1 -1
  39. liger_kernel/transformers/fused_neighborhood_attention.py +1 -1
  40. liger_kernel/transformers/geglu.py +1 -1
  41. liger_kernel/transformers/group_norm.py +1 -1
  42. liger_kernel/transformers/grpo_loss.py +1 -1
  43. liger_kernel/transformers/jsd.py +1 -1
  44. liger_kernel/transformers/kl_div.py +1 -1
  45. liger_kernel/transformers/layer_norm.py +1 -1
  46. liger_kernel/transformers/llama4_rope.py +1 -1
  47. liger_kernel/transformers/model/exaone4.py +136 -0
  48. liger_kernel/transformers/model/gemma2.py +3 -3
  49. liger_kernel/transformers/model/gemma3.py +11 -5
  50. liger_kernel/transformers/model/gpt_oss.py +211 -0
  51. liger_kernel/transformers/model/loss_utils.py +6 -0
  52. liger_kernel/transformers/model/paligemma.py +1 -0
  53. liger_kernel/transformers/monkey_patch.py +196 -39
  54. liger_kernel/transformers/multi_token_attention.py +1 -1
  55. liger_kernel/transformers/poly_norm.py +1 -1
  56. liger_kernel/transformers/qwen2vl_mrope.py +1 -1
  57. liger_kernel/transformers/rms_norm.py +8 -3
  58. liger_kernel/transformers/rope.py +28 -27
  59. liger_kernel/transformers/softmax.py +1 -1
  60. liger_kernel/transformers/sparsemax.py +1 -1
  61. liger_kernel/transformers/swiglu.py +1 -1
  62. liger_kernel/transformers/tiled_mlp.py +5 -13
  63. liger_kernel/transformers/tvd.py +1 -1
  64. liger_kernel/utils.py +54 -0
  65. {liger_kernel-0.6.4.dist-info → liger_kernel-0.6.5.dist-info}/METADATA +11 -4
  66. liger_kernel-0.6.5.dist-info/RECORD +134 -0
  67. {liger_kernel-0.6.4.dist-info → liger_kernel-0.6.5.dist-info}/WHEEL +1 -1
  68. liger_kernel-0.6.4.dist-info/RECORD +0 -118
  69. {liger_kernel-0.6.4.dist-info → liger_kernel-0.6.5.dist-info}/licenses/LICENSE +0 -0
  70. {liger_kernel-0.6.4.dist-info → liger_kernel-0.6.5.dist-info}/licenses/NOTICE +0 -0
  71. {liger_kernel-0.6.4.dist-info → liger_kernel-0.6.5.dist-info}/top_level.txt +0 -0
liger_kernel/ops/backends/_ascend/ops/qwen2vl_mrope.py (new file)
@@ -0,0 +1,275 @@
+ import torch
+ import triton
+ import triton.language as tl
+
+ from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy
+ from liger_kernel.ops.utils import get_npu_core_count
+
+
+ @triton.jit
+ def _triton_qwen2vl_mrope_npu(
+     q_ptr,
+     q_row_stride,
+     k_ptr,
+     k_row_stride,
+     cos,
+     sin,
+     sl,
+     bs: tl.constexpr,
+     total_rows: tl.constexpr,
+     n_qh: tl.constexpr,
+     n_kh: tl.constexpr,
+     hd: tl.constexpr,
+     mrope_section_t: tl.constexpr,
+     mrope_section_h: tl.constexpr,
+     BLOCK_Q: tl.constexpr,
+     BLOCK_K: tl.constexpr,
+     NUM_STAGES: tl.constexpr,
+     BACKWARD_PASS: tl.constexpr = False,
+ ):
+     program_id = tl.program_id(0)
+     num_programs = tl.num_programs(0)
+
+     rows_per_program = (total_rows + num_programs - 1) // num_programs
+     start_row = program_id * rows_per_program
+     actual_rows = tl.minimum(rows_per_program, total_rows - start_row)
+
+     for row_offset in tl.range(0, actual_rows, num_stages=NUM_STAGES):
+         pid = start_row + row_offset
+
+         t_end = mrope_section_t
+         h_end = t_end + mrope_section_h
+
+         t_cos = cos + pid * hd
+         h_cos = t_cos + bs * sl * hd
+         w_cos = h_cos + bs * sl * hd
+         t_sin = sin + pid * hd
+         h_sin = t_sin + bs * sl * hd
+         w_sin = h_sin + bs * sl * hd
+
+         q_base = q_ptr + pid * q_row_stride
+         k_base = k_ptr + pid * k_row_stride
+
+         d_idx = tl.arange(0, hd // 2)
+         d_mask = d_idx < (hd // 2)
+
+         pos_mask_t = d_idx < t_end
+         pos_mask_h = (d_idx >= t_end) & (d_idx < h_end)
+
+         text_cos_vals = tl.load(t_cos + d_idx, mask=d_mask, other=0)
+         text_sin_vals = tl.load(t_sin + d_idx, mask=d_mask, other=0)
+         height_cos_vals = tl.load(h_cos + d_idx, mask=d_mask, other=0)
+         height_sin_vals = tl.load(h_sin + d_idx, mask=d_mask, other=0)
+         width_cos_vals = tl.load(w_cos + d_idx, mask=d_mask, other=0)
+         width_sin_vals = tl.load(w_sin + d_idx, mask=d_mask, other=0)
+
+         cos_vals = tl.where(pos_mask_t, text_cos_vals, tl.where(pos_mask_h, height_cos_vals, width_cos_vals))
+         sin_vals = tl.where(pos_mask_t, text_sin_vals, tl.where(pos_mask_h, height_sin_vals, width_sin_vals))
+
+         # Process q heads in chunks to prevent UB overflow
+         for qh_block in range(0, n_qh, BLOCK_Q):
+             qh_idx = tl.arange(0, BLOCK_Q) + qh_block
+             qh_mask = qh_idx < n_qh
+
+             block_mask = qh_mask[:, None] & d_mask[None, :]
+             offsets = qh_idx[:, None] * hd + d_idx[None, :]
+
+             q_left = tl.load(q_base + offsets, mask=block_mask, other=0)
+             q_right = tl.load(q_base + offsets + (hd // 2), mask=block_mask, other=0)
+
+             if not BACKWARD_PASS:
+                 new_left = q_left * cos_vals - q_right * sin_vals
+                 new_right = q_right * cos_vals + q_left * sin_vals
+             else:
+                 new_left = q_left * cos_vals + q_right * sin_vals
+                 new_right = q_right * cos_vals - q_left * sin_vals
+
+             tl.store(q_base + offsets, new_left, mask=block_mask)
+             tl.store(q_base + offsets + (hd // 2), new_right, mask=block_mask)
+
+         # Process k heads in chunks to prevent UB overflow
+         for kh_block in range(0, n_kh, BLOCK_K):
+             kh_idx = tl.arange(0, BLOCK_K) + kh_block
+             kh_mask = kh_idx < n_kh
+
+             block_mask = kh_mask[:, None] & d_mask[None, :]
+             offsets = kh_idx[:, None] * hd + d_idx[None, :]
+
+             k_left = tl.load(k_base + offsets, mask=block_mask, other=0)
+             k_right = tl.load(k_base + offsets + (hd // 2), mask=block_mask, other=0)
+
+             if not BACKWARD_PASS:
+                 new_left = k_left * cos_vals - k_right * sin_vals
+                 new_right = k_right * cos_vals + k_left * sin_vals
+             else:
+                 new_left = k_left * cos_vals + k_right * sin_vals
+                 new_right = k_right * cos_vals - k_left * sin_vals
+
+             tl.store(k_base + offsets, new_left, mask=block_mask)
+             tl.store(k_base + offsets + (hd // 2), new_right, mask=block_mask)
+
+
+ def get_optimal_block_size_mrope(pad_n_q_head, pad_n_kv_head, pad_hd, dtype_size):
+     # MROPE forward tiling strategy:
+     # - cos_vals and sin_vals (include text, height and width) are loaded once outside loops (shared): (pad_hd // 2) * 6 = 3 * pad_hd elements each
+     # - In q heads loop (peak memory):
+     #   * q_left: BLOCK_Q * (pad_hd // 2) elements
+     #   * q_right: BLOCK_Q * (pad_hd // 2) elements
+     #   * new_left: BLOCK_Q * (pad_hd // 2) elements (intermediate result)
+     #   * new_right: BLOCK_Q * (pad_hd // 2) elements (intermediate result)
+     #   * Total: 4 * BLOCK_Q * (pad_hd // 2) = 2 * BLOCK_Q * pad_hd elements
+     # - In k heads loop (peak memory):
+     #   * k_left: BLOCK_K * (pad_hd // 2) elements
+     #   * k_right: BLOCK_K * (pad_hd // 2) elements
+     #   * new_left: BLOCK_K * (pad_hd // 2) elements (intermediate result)
+     #   * new_right: BLOCK_K * (pad_hd // 2) elements (intermediate result)
+     #   * Total: 4 * BLOCK_K * (pad_hd // 2) = 2 * BLOCK_K * pad_hd elements
+     # - Since q and k are processed separately, peak memory is max(BLOCK_Q, BLOCK_K) case
+     # - Plus shared cos/sin: 6 * (pad_hd // 2) = 3 * pad_hd elements
+     # - Conservative estimate: (2 * BLOCK_SIZE * pad_hd + 3 * pad_hd) * dtype_size * 8 bits
+     # - Simplified: (2 * BLOCK_SIZE + 3) * pad_hd * dtype_size * 8 bits
+     # - For safety, use: memory_multiplier=3.0 * BLOCK_SIZE * pad_hd * dtype_size * 8 bits
+     # - shapes: ((pad_n_q_head, pad_hd), (pad_n_kv_head, pad_hd))
+     # - tiling_dims: (0, 0) means first dimension of each shape can be tiled
+     # - Returns: ((block_size_q, pad_hd), (block_size_kv, pad_hd))
+     shapes = ((pad_n_q_head, pad_hd), (pad_n_kv_head, pad_hd))
+     tile_shapes = compute_default_tiling_strategy(
+         safety_margin=0.90,
+         dtype_size=dtype_size,
+         memory_multiplier=3.0,
+         shapes=shapes,
+         tiling_dims=(0, 0),
+     )
+
+     if tile_shapes is not None and len(tile_shapes) == len(shapes):
+         # Strategy returns ((block_size_q, pad_hd), (block_size_kv, pad_hd))
+         q_tile_shape, k_tile_shape = tile_shapes
+         BLOCK_Q, _ = q_tile_shape
+         BLOCK_K, _ = k_tile_shape
+     else:
+         # Fallback to conservative defaults
+         BLOCK_Q = 2048
+         BLOCK_K = 2048
+
+     return BLOCK_Q, BLOCK_K
+
+
+ def qwen2vl_mrope_forward(q, k, cos, sin, mrope_section):
+     # transpose it back to the physical shape because Triton looks at the physical storage
+     q = q.transpose(1, 2)
+     k = k.transpose(1, 2)
+
+     batch_size, seq_len, n_q_head, head_dim = q.shape
+     n_kv_head = k.shape[2]
+     pad_hd = triton.next_power_of_2(head_dim)
+     pad_n_q_head = triton.next_power_of_2(n_q_head)
+     pad_n_kv_head = triton.next_power_of_2(n_kv_head)
+
+     n_row = batch_size * seq_len
+
+     # ensure tensors passed into the kernel are contiguous
+     q = q.contiguous()
+     k = k.contiguous()
+     cos = cos.contiguous()
+     sin = sin.contiguous()
+
+     dtype_size = q.element_size()
+     BLOCK_Q, BLOCK_K = get_optimal_block_size_mrope(pad_n_q_head, pad_n_kv_head, pad_hd, dtype_size)
+
+     num_cores = get_npu_core_count()
+     grid_size = min(num_cores, n_row)
+
+     _triton_qwen2vl_mrope_npu[(grid_size,)](
+         q,
+         q.stride(1),
+         k,
+         k.stride(1),
+         cos,
+         sin,
+         seq_len,
+         batch_size,
+         n_row,
+         n_q_head,
+         n_kv_head,
+         head_dim,
+         mrope_section[0],
+         mrope_section[1],
+         BLOCK_Q,
+         BLOCK_K,
+         NUM_STAGES=3,
+         BACKWARD_PASS=False,
+     )
+     return q.transpose(1, 2), k.transpose(1, 2), cos, sin
+
+
+ def qwen2vl_mrope_backward(dq, dk, cos, sin, mrope_section):
+     dq = dq.transpose(1, 2)
+     dk = dk.transpose(1, 2)
+
+     batch_size, seq_len, n_q_head, head_dim = dq.shape
+     n_kv_head = dk.shape[2]
+     pad_hd = triton.next_power_of_2(head_dim)
+     pad_n_q_head = triton.next_power_of_2(n_q_head)
+     pad_n_kv_head = triton.next_power_of_2(n_kv_head)
+
+     n_row = batch_size * seq_len
+
+     # ensure dq and dk are contiguous
+     dq = dq.contiguous()
+     dk = dk.contiguous()
+
+     dtype_size = dq.element_size()
+     BLOCK_Q, BLOCK_K = get_optimal_block_size_mrope(pad_n_q_head, pad_n_kv_head, pad_hd, dtype_size)
+
+     num_cores = get_npu_core_count()
+     grid_size = min(num_cores, n_row)
+
+     _triton_qwen2vl_mrope_npu[(grid_size,)](
+         dq,
+         dq.stride(1),
+         dk,
+         dk.stride(1),
+         cos,
+         sin,
+         seq_len,
+         batch_size,
+         n_row,
+         n_q_head,
+         n_kv_head,
+         head_dim,
+         mrope_section[0],
+         mrope_section[1],
+         BLOCK_Q,
+         BLOCK_K,
+         NUM_STAGES=3,
+         BACKWARD_PASS=True,
+     )
+     return dq.transpose(1, 2), dk.transpose(1, 2)
+
+
+ class LigerQwen2VLMRopeFunction(torch.autograd.Function):
+     @staticmethod
+     def forward(ctx, q, k, cos, sin, mrope_section, unsqueeze_dim=1):
+         """
+         q size: (bsz, n_q_head, seq_len, head_dim)
+         k size: (bsz, n_kv_head, seq_len, head_dim)
+         cos size: (3, bsz, seq_len, head_dim)
+         sin size: (3, bsz, seq_len, head_dim)
+         """
+         q, k, cos, sin = qwen2vl_mrope_forward(q, k, cos, sin, mrope_section)
+         ctx.save_for_backward(cos, sin)
+         ctx.mrope_section = mrope_section
+         return q, k
+
+     @staticmethod
+     def backward(ctx, dq, dk):
+         """
+         dq size: (bsz, n_q_head, seq_len, head_dim)
+         dk size: (bsz, n_kv_head, seq_len, head_dim)
+         cos size: (3, bsz, seq_len, head_dim)
+         sin size: (3, bsz, seq_len, head_dim)
+         """
+         cos, sin = ctx.saved_tensors
+         mrope_section = ctx.mrope_section
+         dq, dk = qwen2vl_mrope_backward(dq, dk, cos, sin, mrope_section)
+         return dq, dk, None, None, None, None
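
For orientation only, a minimal sketch of how the autograd function above could be driven. It assumes a torch_npu environment that exposes the device as "npu" and an illustrative mrope_section of (16, 24, 24), which sums to head_dim // 2; neither value is fixed by this file, and the shapes simply follow the docstring.

import torch

from liger_kernel.ops.backends._ascend.ops.qwen2vl_mrope import LigerQwen2VLMRopeFunction

# Illustrative shapes: 2 sequences, 8 query heads, 2 kv heads, seq_len 128, head_dim 128.
bsz, n_q_head, n_kv_head, seq_len, head_dim = 2, 8, 2, 128, 128
mrope_section = (16, 24, 24)  # assumed t/h/w split of head_dim // 2; the model config defines the real value

q = torch.randn(bsz, n_q_head, seq_len, head_dim, device="npu", requires_grad=True)
k = torch.randn(bsz, n_kv_head, seq_len, head_dim, device="npu", requires_grad=True)
# cos/sin carry one plane per (t, h, w) section, matching the docstring layout above.
cos = torch.randn(3, bsz, seq_len, head_dim, device="npu")
sin = torch.randn(3, bsz, seq_len, head_dim, device="npu")

q_rot, k_rot = LigerQwen2VLMRopeFunction.apply(q, k, cos, sin, mrope_section)
(q_rot.sum() + k_rot.sum()).backward()  # drives qwen2vl_mrope_backward through the same kernel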
liger_kernel/ops/backends/_ascend/ops/rope.py (new file)
@@ -0,0 +1,265 @@
+ import torch
+ import triton
+ import triton.language as tl
+
+ from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy
+ from liger_kernel.ops.utils import get_npu_core_count
+
+
+ @triton.jit
+ def _triton_rope_npu(
+     q_ptr,
+     q_row_stride,
+     k_ptr,
+     k_row_stride,
+     cos,
+     cos_row_stride,
+     sin,
+     sin_row_stride,
+     sl,
+     total_rows: tl.constexpr,
+     cos_bs: tl.constexpr,
+     n_qh: tl.constexpr,
+     n_kh: tl.constexpr,
+     hd: tl.constexpr,
+     BLOCK_Q: tl.constexpr,
+     BLOCK_K: tl.constexpr,
+     NUM_STAGES: tl.constexpr,
+     BACKWARD_PASS: tl.constexpr = False,
+ ):
+     program_id = tl.program_id(0)
+     num_programs = tl.num_programs(0)
+
+     rows_per_program = (total_rows + num_programs - 1) // num_programs
+     start_row = program_id * rows_per_program
+     actual_rows = tl.minimum(rows_per_program, total_rows - start_row)
+
+     for row_offset in tl.range(0, actual_rows, num_stages=NUM_STAGES):
+         pid = start_row + row_offset
+
+         row_idx = pid % sl
+         cos_ptr = cos + tl.where(cos_bs == 1, row_idx * cos_row_stride, pid * cos_row_stride)
+         sin_ptr = sin + tl.where(cos_bs == 1, row_idx * sin_row_stride, pid * sin_row_stride)
+
+         # Pre-compute d_idx and cos/sin values outside loops (they don't depend on heads)
+         d_idx = tl.arange(0, hd // 2)
+         d_mask = d_idx < (hd // 2)  # Always True, but kept for clarity
+         cos_vals = tl.load(cos_ptr + d_idx, mask=d_mask, other=0)
+         sin_vals = tl.load(sin_ptr + d_idx, mask=d_mask, other=0)
+
+         # Process q heads in chunks to prevent UB overflow
+         for qh_block in range(0, n_qh, BLOCK_Q):
+             qh_idx = tl.arange(0, BLOCK_Q) + qh_block
+             qh_mask = qh_idx < n_qh
+
+             # block_mask: qh_mask broadcasted over d_idx dimension
+             block_mask = qh_mask[:, None]
+
+             offsets = qh_idx[:, None] * hd + d_idx[None, :]
+             q_base = q_ptr + pid * q_row_stride
+
+             q_left = tl.load(q_base + offsets, mask=block_mask, other=0)
+             q_right = tl.load(q_base + offsets + (hd // 2), mask=block_mask, other=0)
+
+             if not BACKWARD_PASS:
+                 new_left = q_left * cos_vals - q_right * sin_vals
+                 new_right = q_right * cos_vals + q_left * sin_vals
+             else:
+                 new_left = q_left * cos_vals + q_right * sin_vals
+                 new_right = q_right * cos_vals - q_left * sin_vals
+
+             tl.store(q_base + offsets, new_left, mask=block_mask)
+             tl.store(q_base + offsets + (hd // 2), new_right, mask=block_mask)
+
+         # Process k heads in chunks to prevent UB overflow
+         for kh_block in range(0, n_kh, BLOCK_K):
+             kh_idx = tl.arange(0, BLOCK_K) + kh_block
+             kh_mask = kh_idx < n_kh
+
+             # block_mask: kh_mask broadcasted over d_idx dimension
+             block_mask = kh_mask[:, None]
+
+             offsets = kh_idx[:, None] * hd + d_idx[None, :]
+             k_base = k_ptr + pid * k_row_stride
+
+             k_left = tl.load(k_base + offsets, mask=block_mask, other=0)
+             k_right = tl.load(k_base + offsets + (hd // 2), mask=block_mask, other=0)
+
+             if not BACKWARD_PASS:
+                 new_left = k_left * cos_vals - k_right * sin_vals
+                 new_right = k_right * cos_vals + k_left * sin_vals
+             else:
+                 new_left = k_left * cos_vals + k_right * sin_vals
+                 new_right = k_right * cos_vals - k_left * sin_vals
+
+             tl.store(k_base + offsets, new_left, mask=block_mask)
+             tl.store(k_base + offsets + (hd // 2), new_right, mask=block_mask)
+
+
+ def get_optimal_block_size(pad_n_q_head, pad_n_kv_head, pad_hd, dtype_size):
+     # Compute tiling strategy based on UB capacity
+     # ROPE forward tiling strategy (based on optimized ROPE kernel):
+     # - cos_vals and sin_vals are loaded once outside loops (shared): pad_hd // 2 elements each
+     # - In q heads loop (peak memory):
+     #   * q_left: BLOCK_Q * (pad_hd // 2) elements
+     #   * q_right: BLOCK_Q * (pad_hd // 2) elements
+     #   * new_left: BLOCK_Q * (pad_hd // 2) elements (intermediate result)
+     #   * new_right: BLOCK_Q * (pad_hd // 2) elements (intermediate result)
+     #   * Total: 4 * BLOCK_Q * (pad_hd // 2) = 2 * BLOCK_Q * pad_hd elements
+     # - In k heads loop (peak memory):
+     #   * k_left: BLOCK_K * (pad_hd // 2) elements
+     #   * k_right: BLOCK_K * (pad_hd // 2) elements
+     #   * new_left: BLOCK_K * (pad_hd // 2) elements (intermediate result)
+     #   * new_right: BLOCK_K * (pad_hd // 2) elements (intermediate result)
+     #   * Total: 4 * BLOCK_K * (pad_hd // 2) = 2 * BLOCK_K * pad_hd elements
+     # - Since q and k are processed separately, peak memory is max(BLOCK_Q, BLOCK_K) case
+     # - Plus shared cos/sin: 2 * (pad_hd // 2) = pad_hd elements
+     # - Conservative estimate: (2 * BLOCK_SIZE * pad_hd + pad_hd) * dtype_size * 8 bits
+     # - Simplified: (2 * BLOCK_SIZE + 1) * pad_hd * dtype_size * 8 bits
+     # - For safety, use: memory_multiplier=3.0 * BLOCK_SIZE * pad_hd * dtype_size * 8 bits
+     # - shapes: ((pad_n_q_head, pad_hd), (pad_n_kv_head, pad_hd))
+     # - tiling_dims: (0, 0) means first dimension of each shape can be tiled
+     # - Returns: ((block_size_q, pad_hd), (block_size_kv, pad_hd))
+     shapes = ((pad_n_q_head, pad_hd), (pad_n_kv_head, pad_hd))
+     tile_shapes = compute_default_tiling_strategy(
+         safety_margin=0.90,
+         dtype_size=dtype_size,
+         memory_multiplier=3.0,
+         shapes=shapes,
+         tiling_dims=(0, 0),
+     )
+
+     if tile_shapes is not None and len(tile_shapes) == len(shapes):
+         # Strategy returns ((block_size_q, pad_hd), (block_size_kv, pad_hd))
+         q_tile_shape, k_tile_shape = tile_shapes
+         BLOCK_Q, _ = q_tile_shape
+         BLOCK_K, _ = k_tile_shape
+     else:
+         # Fallback to conservative defaults
+         BLOCK_Q = 2048
+         BLOCK_K = 2048
+
+     return BLOCK_Q, BLOCK_K
+
+
+ def rope_forward(q, k, cos, sin):
+     # transpose it back to the physical shape because Triton looks at the physical storage
+     # note: q and k are non-contiguous before the transformation and will become contiguous after transpose
+     q = q.transpose(1, 2)
+     k = k.transpose(1, 2)
+
+     batch_size, seq_len, n_q_head, head_dim = q.shape
+     n_kv_head = k.shape[2]
+     pad_hd = triton.next_power_of_2(head_dim)
+     pad_n_q_head = triton.next_power_of_2(n_q_head)
+     pad_n_kv_head = triton.next_power_of_2(n_kv_head)
+
+     n_row = batch_size * seq_len
+
+     # ensure tensors passed into the kernel are contiguous. It will be a no-op if they are already contiguous
+     q = q.contiguous()
+     k = k.contiguous()
+     cos = cos.contiguous()
+     sin = sin.contiguous()
+     cos_batch_size = cos.shape[0]
+
+     dtype_size = q.element_size()
+     BLOCK_Q, BLOCK_K = get_optimal_block_size(pad_n_q_head, pad_n_kv_head, pad_hd, dtype_size)
+
+     num_cores = get_npu_core_count()
+     grid_size = min(num_cores, n_row)
+
+     _triton_rope_npu[(grid_size,)](
+         q,
+         q.stride(1),
+         k,
+         k.stride(1),
+         cos,
+         cos.stride(-2),
+         sin,
+         sin.stride(-2),
+         seq_len,
+         n_row,
+         cos_batch_size,
+         n_q_head,
+         n_kv_head,
+         head_dim,
+         BLOCK_Q,
+         BLOCK_K,
+         NUM_STAGES=3,
+         BACKWARD_PASS=False,
+     )
+     return q.transpose(1, 2), k.transpose(1, 2), cos, sin
+
+
+ def rope_backward(dq, dk, cos, sin):
+     dq = dq.transpose(1, 2)
+     dk = dk.transpose(1, 2)
+
+     batch_size, seq_len, n_q_head, head_dim = dq.shape
+     cos_batch_size = cos.shape[0]
+     n_kv_head = dk.shape[2]
+     pad_hd = triton.next_power_of_2(head_dim)
+     pad_n_q_head = triton.next_power_of_2(n_q_head)
+     pad_n_kv_head = triton.next_power_of_2(n_kv_head)
+
+     n_row = batch_size * seq_len
+
+     # ensure dq and dk are contiguous
+     dq = dq.contiguous()
+     dk = dk.contiguous()
+
+     dtype_size = dq.element_size()
+     BLOCK_Q, BLOCK_K = get_optimal_block_size(pad_n_q_head, pad_n_kv_head, pad_hd, dtype_size)
+
+     num_cores = get_npu_core_count()
+     grid_size = min(num_cores, n_row)
+
+     _triton_rope_npu[(grid_size,)](
+         dq,
+         dq.stride(1),
+         dk,
+         dk.stride(1),
+         cos,
+         cos.stride(-2),
+         sin,
+         sin.stride(-2),
+         seq_len,
+         n_row,
+         cos_batch_size,
+         n_q_head,
+         n_kv_head,
+         head_dim,
+         BLOCK_Q,
+         BLOCK_K,
+         NUM_STAGES=3,
+         BACKWARD_PASS=True,
+     )
+     return dq.transpose(1, 2), dk.transpose(1, 2)
+
+
+ class LigerRopeFunction(torch.autograd.Function):
+     @staticmethod
+     def forward(ctx, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+         """
+         q size: (bsz, n_q_head, seq_len, head_dim)
+         k size: (bsz, n_kv_head, seq_len, head_dim)
+         cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+         sin size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+         """
+         q, k, cos, sin = rope_forward(q, k, cos, sin)
+         ctx.save_for_backward(cos, sin)
+         return q, k
+
+     @staticmethod
+     def backward(ctx, dq, dk):
+         """
+         dq size: (bsz, n_q_head, seq_len, head_dim)
+         dk size: (bsz, n_kv_head, seq_len, head_dim)
+         cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+         sin size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+         """
+
+         cos, sin = ctx.saved_tensors
+         dq, dk = rope_backward(dq, dk, cos, sin)
+         return dq, dk, None, None, None, None
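
To make the unified-buffer budgeting in the get_optimal_block_size comment above concrete, here is a back-of-envelope sketch of the arithmetic. It assumes the 192KB 910B UB figure quoted in the SwiGLU helper below; the actual splitting decision is made by compute_default_tiling_strategy in ub_manager.py, so this only illustrates the order of magnitude, not the library routine.

# Illustrative arithmetic only -- mirrors the comment block above, not ub_manager.py.
UB_BYTES = 192 * 1024          # assumed 910B unified buffer size (see the SwiGLU helper's comment)
SAFETY_MARGIN = 0.90           # as passed to compute_default_tiling_strategy above
MEMORY_MULTIPLIER = 3.0        # conservative per-element factor used by the ROPE kernels
dtype_size = 2                 # e.g. bf16 / fp16
pad_hd = 128                   # padded head_dim, illustrative

budget = UB_BYTES * SAFETY_MARGIN
# Peak usage is modeled as MEMORY_MULTIPLIER * BLOCK * pad_hd * dtype_size bytes per head loop,
# so the largest head tile that fits is roughly:
block = int(budget // (MEMORY_MULTIPLIER * pad_hd * dtype_size))
print(block)  # ~230 heads per tile under these assumptions, well above typical head counts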
liger_kernel/ops/backends/_ascend/ops/swiglu.py (new file)
@@ -0,0 +1,142 @@
+ import torch
+ import triton
+ import triton.language as tl
+
+ from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy
+ from liger_kernel.ops.utils import get_npu_core_count
+
+ # -----------------------------------------------------------------------------
+ # Kernels (High-performance 1D Flatten Implementation)
+ # -----------------------------------------------------------------------------
+
+
+ @triton.jit
+ def _swiglu_forward_kernel_flat(
+     a_ptr, b_ptr, c_ptr, total_elements, BLOCK_SIZE: tl.constexpr, NUM_STAGES: tl.constexpr
+ ):
+     pid = tl.program_id(0)
+     num_progs = tl.num_programs(0)
+
+     # Grid-Stride Loop
+     start_idx = pid * BLOCK_SIZE
+     stride = num_progs * BLOCK_SIZE
+
+     for idx in tl.range(start_idx, total_elements, stride, num_stages=NUM_STAGES):
+         offsets = idx + tl.arange(0, BLOCK_SIZE)
+         mask = offsets < total_elements
+
+         a_val = tl.load(a_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+         b_val = tl.load(b_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+         res = (a_val * tl.sigmoid(a_val)) * b_val
+         tl.store(c_ptr + offsets, res, mask=mask)
+
+
+ @triton.jit
+ def _swiglu_backward_kernel_flat(
+     dc_ptr, a_ptr, b_ptr, da_ptr, db_ptr, total_elements, BLOCK_SIZE: tl.constexpr, NUM_STAGES: tl.constexpr
+ ):
+     pid = tl.program_id(0)
+     num_progs = tl.num_programs(0)
+     start_idx = pid * BLOCK_SIZE
+     stride = num_progs * BLOCK_SIZE
+
+     for idx in tl.range(start_idx, total_elements, stride, num_stages=NUM_STAGES):
+         offsets = idx + tl.arange(0, BLOCK_SIZE)
+         mask = offsets < total_elements
+
+         dc = tl.load(dc_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+         a = tl.load(a_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+         b = tl.load(b_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+
+         sig_a = tl.sigmoid(a)
+         silu_a = a * sig_a
+         term1 = silu_a * (1.0 - sig_a) + sig_a
+
+         db = dc * silu_a
+         da = dc * b * term1
+
+         tl.store(da_ptr + offsets, da, mask=mask)
+         tl.store(db_ptr + offsets, db, mask=mask)
+
+
+ # -----------------------------------------------------------------------------
+ # Helper: Call compute_default_tiling_strategy
+ # -----------------------------------------------------------------------------
+
+
+ def get_optimal_block_size(total_elements, is_backward=False):
+     """
+     Calculate optimal Block Size using compute_default_tiling_strategy
+     """
+     # 1. Set Memory Multiplier
+     # Forward is lighter, Backward requires more memory for intermediate variables
+     # 8.0 and 12.0 are empirical values based on 910B UB (192KB)
+     multiplier = 12.0 if is_backward else 8.0
+
+     # 2. Call calculation function
+     # Treat input as 1D (total_elements,), only tiling on dim 0
+     tile_shapes = compute_default_tiling_strategy(
+         safety_margin=0.9, dtype_size=4, memory_multiplier=multiplier, shapes=((total_elements,),), tiling_dims=(0,)
+     )
+
+     # 3. Parse result
+     if tile_shapes and len(tile_shapes) > 0:
+         block_size = tile_shapes[0][0]
+         return max(256, block_size)
+     else:
+         return 2048
+
+
+ def swiglu_forward(a, b):
+     if not a.is_contiguous():
+         a = a.contiguous()
+     if not b.is_contiguous():
+         b = b.contiguous()
+
+     total_elements = a.numel()
+     c = torch.empty_like(a)
+
+     block_size = get_optimal_block_size(total_elements, is_backward=False)
+
+     num_cores = get_npu_core_count()
+     grid_size = min(num_cores, (total_elements + block_size - 1) // block_size)
+
+     _swiglu_forward_kernel_flat[(grid_size,)](a, b, c, total_elements, BLOCK_SIZE=block_size, NUM_STAGES=3, num_warps=4)
+     return c
+
+
+ def swiglu_backward(a, b, dc):
+     if not dc.is_contiguous():
+         dc = dc.contiguous()
+     if not a.is_contiguous():
+         a = a.contiguous()
+     if not b.is_contiguous():
+         b = b.contiguous()
+
+     total_elements = dc.numel()
+     grad_a = torch.empty_like(a)
+     grad_b = torch.empty_like(b)
+
+     block_size = get_optimal_block_size(total_elements, is_backward=True)
+
+     num_cores = get_npu_core_count()
+     grid_size = min(num_cores, (total_elements + block_size - 1) // block_size)
+
+     _swiglu_backward_kernel_flat[(grid_size,)](
+         dc, a, b, grad_a, grad_b, total_elements, BLOCK_SIZE=block_size, NUM_STAGES=3, num_warps=4
+     )
+     return grad_a, grad_b
+
+
+ class LigerSiLUMulFunction(torch.autograd.Function):
+     @staticmethod
+     def forward(ctx, a, b):
+         c = swiglu_forward(a, b)
+         ctx.save_for_backward(a, b)
+         return c
+
+     @staticmethod
+     def backward(ctx, dc):
+         a, b = ctx.saved_tensors
+         grad_a, grad_b = swiglu_backward(a, b, dc)
+         return grad_a, grad_b
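
A small usage sketch for the flattened SwiGLU above, assuming a torch_npu environment ("npu" device) and illustrative shapes; the eager reference silu(a) * b matches the kernel's math, while the tolerances are only indicative.

import torch
import torch.nn.functional as F

from liger_kernel.ops.backends._ascend.ops.swiglu import LigerSiLUMulFunction

# Illustrative gate/up projections of an MLP block.
a = torch.randn(4, 512, 2816, device="npu", dtype=torch.float32, requires_grad=True)
b = torch.randn(4, 512, 2816, device="npu", dtype=torch.float32, requires_grad=True)

out = LigerSiLUMulFunction.apply(a, b)   # fused flat kernel: (a * sigmoid(a)) * b
ref = F.silu(a) * b                      # eager reference

torch.testing.assert_close(out, ref, rtol=1e-4, atol=1e-4)
out.sum().backward()                     # exercises _swiglu_backward_kernel_flat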