liger-kernel 0.6.4__py3-none-any.whl → 0.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. liger_kernel/chunked_loss/cosine_similarity_loss.py +7 -1
  2. liger_kernel/chunked_loss/fused_linear_distillation.py +10 -3
  3. liger_kernel/chunked_loss/jsd_loss.py +21 -6
  4. liger_kernel/ops/__init__.py +141 -0
  5. liger_kernel/ops/backends/README.md +151 -0
  6. liger_kernel/ops/backends/__init__.py +13 -0
  7. liger_kernel/ops/backends/_ascend/__init__.py +5 -0
  8. liger_kernel/ops/backends/_ascend/ascend-ub-manager-design.md +492 -0
  9. liger_kernel/ops/backends/_ascend/ops/__init__.py +61 -0
  10. liger_kernel/ops/backends/_ascend/ops/embedding.py +214 -0
  11. liger_kernel/ops/backends/_ascend/ops/geglu.py +191 -0
  12. liger_kernel/ops/backends/_ascend/ops/llama4_rope.py +298 -0
  13. liger_kernel/ops/backends/_ascend/ops/qwen2vl_mrope.py +275 -0
  14. liger_kernel/ops/backends/_ascend/ops/rope.py +265 -0
  15. liger_kernel/ops/backends/_ascend/ops/swiglu.py +142 -0
  16. liger_kernel/ops/backends/_ascend/ops/tvd.py +223 -0
  17. liger_kernel/ops/backends/_ascend/ub_manager.py +367 -0
  18. liger_kernel/ops/backends/registry.py +61 -0
  19. liger_kernel/ops/cross_entropy.py +14 -4
  20. liger_kernel/ops/dyt.py +5 -2
  21. liger_kernel/ops/fused_add_rms_norm.py +21 -23
  22. liger_kernel/ops/fused_linear_cross_entropy.py +2 -1
  23. liger_kernel/ops/geglu.py +5 -3
  24. liger_kernel/ops/group_norm.py +12 -8
  25. liger_kernel/ops/kl_div.py +8 -11
  26. liger_kernel/ops/layer_norm.py +17 -16
  27. liger_kernel/ops/poly_norm.py +19 -21
  28. liger_kernel/ops/rms_norm.py +149 -71
  29. liger_kernel/ops/utils.py +25 -0
  30. liger_kernel/transformers/__init__.py +6 -0
  31. liger_kernel/transformers/auto_model.py +21 -0
  32. liger_kernel/transformers/cross_entropy.py +1 -1
  33. liger_kernel/transformers/dyt.py +1 -1
  34. liger_kernel/transformers/experimental/embedding.py +1 -1
  35. liger_kernel/transformers/functional.py +20 -20
  36. liger_kernel/transformers/fused_add_rms_norm.py +1 -1
  37. liger_kernel/transformers/fused_linear_cross_entropy.py +1 -1
  38. liger_kernel/transformers/fused_linear_jsd.py +1 -1
  39. liger_kernel/transformers/fused_neighborhood_attention.py +1 -1
  40. liger_kernel/transformers/geglu.py +1 -1
  41. liger_kernel/transformers/group_norm.py +1 -1
  42. liger_kernel/transformers/grpo_loss.py +1 -1
  43. liger_kernel/transformers/jsd.py +1 -1
  44. liger_kernel/transformers/kl_div.py +1 -1
  45. liger_kernel/transformers/layer_norm.py +1 -1
  46. liger_kernel/transformers/llama4_rope.py +1 -1
  47. liger_kernel/transformers/model/exaone4.py +136 -0
  48. liger_kernel/transformers/model/gemma2.py +3 -3
  49. liger_kernel/transformers/model/gemma3.py +11 -5
  50. liger_kernel/transformers/model/gpt_oss.py +211 -0
  51. liger_kernel/transformers/model/loss_utils.py +6 -0
  52. liger_kernel/transformers/model/paligemma.py +1 -0
  53. liger_kernel/transformers/monkey_patch.py +196 -39
  54. liger_kernel/transformers/multi_token_attention.py +1 -1
  55. liger_kernel/transformers/poly_norm.py +1 -1
  56. liger_kernel/transformers/qwen2vl_mrope.py +1 -1
  57. liger_kernel/transformers/rms_norm.py +8 -3
  58. liger_kernel/transformers/rope.py +28 -27
  59. liger_kernel/transformers/softmax.py +1 -1
  60. liger_kernel/transformers/sparsemax.py +1 -1
  61. liger_kernel/transformers/swiglu.py +1 -1
  62. liger_kernel/transformers/tiled_mlp.py +5 -13
  63. liger_kernel/transformers/tvd.py +1 -1
  64. liger_kernel/utils.py +54 -0
  65. {liger_kernel-0.6.4.dist-info → liger_kernel-0.6.5.dist-info}/METADATA +11 -4
  66. liger_kernel-0.6.5.dist-info/RECORD +134 -0
  67. {liger_kernel-0.6.4.dist-info → liger_kernel-0.6.5.dist-info}/WHEEL +1 -1
  68. liger_kernel-0.6.4.dist-info/RECORD +0 -118
  69. {liger_kernel-0.6.4.dist-info → liger_kernel-0.6.5.dist-info}/licenses/LICENSE +0 -0
  70. {liger_kernel-0.6.4.dist-info → liger_kernel-0.6.5.dist-info}/licenses/NOTICE +0 -0
  71. {liger_kernel-0.6.4.dist-info → liger_kernel-0.6.5.dist-info}/top_level.txt +0 -0
liger_kernel/ops/backends/_ascend/ops/embedding.py
@@ -0,0 +1,214 @@
+ import torch
+ import triton
+ import triton.language as tl
+
+ from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy
+ from liger_kernel.ops.utils import ensure_contiguous
+ from liger_kernel.ops.utils import get_npu_core_count
+
+
+ @triton.jit
+ def embedding_forward_kernel(
+     embeddings_ptr,
+     indices_ptr,
+     output_ptr,
+     n_elements,
+     embedding_dim: tl.constexpr,
+     BLOCK_SIZE_M: tl.constexpr,
+     BLOCK_SIZE_N: tl.constexpr,
+     NUM_STAGES: tl.constexpr,
+ ):
+     pid = tl.program_id(0)
+     num_progs = tl.num_programs(0)
+
+     grid_m = tl.cdiv(n_elements, BLOCK_SIZE_M)
+     grid_n = tl.cdiv(embedding_dim, BLOCK_SIZE_N)
+     total_2d_blocks = grid_m * grid_n
+
+     for block_idx in tl.range(pid, total_2d_blocks, num_progs, num_stages=NUM_STAGES):
+         block_m = block_idx // grid_n
+         block_n = block_idx % grid_n
+
+         start_m = block_m * BLOCK_SIZE_M
+         start_n = block_n * BLOCK_SIZE_N
+
+         offsets_m = start_m + tl.arange(0, BLOCK_SIZE_M)
+         mask_m = offsets_m < n_elements
+
+         indices = tl.load(indices_ptr + offsets_m, mask=mask_m, other=0)
+
+         offsets_n = start_n + tl.arange(0, BLOCK_SIZE_N)
+         mask_n = offsets_n < embedding_dim
+
+         block_mask = mask_m[:, None] & mask_n[None, :]
+
+         embedding_offsets = indices[:, None] * embedding_dim + offsets_n[None, :]
+         embeddings = tl.load(
+             embeddings_ptr + embedding_offsets,
+             mask=block_mask,
+             other=0.0,
+         )
+
+         output_offsets = offsets_m[:, None] * embedding_dim + offsets_n[None, :]
+         tl.store(
+             output_ptr + output_offsets,
+             embeddings,
+             mask=block_mask,
+         )
+
+
+ @triton.jit
+ def embedding_backward_kernel(
+     grad_output_ptr,
+     grad_weight_ptr,
+     indices_ptr,
+     n_elements,
+     embedding_dim: tl.constexpr,
+     BLOCK_SIZE_M: tl.constexpr,
+     BLOCK_SIZE_N: tl.constexpr,
+     NUM_STAGES: tl.constexpr,
+ ):
+     pid = tl.program_id(0)
+     num_progs = tl.num_programs(0)
+
+     grid_m = tl.cdiv(n_elements, BLOCK_SIZE_M)
+     grid_n = tl.cdiv(embedding_dim, BLOCK_SIZE_N)
+     total_2d_blocks = grid_m * grid_n
+
+     for block_idx in tl.range(pid, total_2d_blocks, num_progs, num_stages=NUM_STAGES):
+         block_m = block_idx // grid_n
+         block_n = block_idx % grid_n
+
+         start_m = block_m * BLOCK_SIZE_M
+         start_n = block_n * BLOCK_SIZE_N
+
+         offsets_m = start_m + tl.arange(0, BLOCK_SIZE_M)
+         mask_m = offsets_m < n_elements
+
+         indices = tl.load(indices_ptr + offsets_m, mask=mask_m, other=0)
+
+         offsets_n = start_n + tl.arange(0, BLOCK_SIZE_N)
+         mask_n = offsets_n < embedding_dim
+
+         block_mask = mask_m[:, None] & mask_n[None, :]
+
+         grad_output_offsets = offsets_m[:, None] * embedding_dim + offsets_n[None, :]
+         grad_output = tl.load(
+             grad_output_ptr + grad_output_offsets,
+             mask=block_mask,
+             other=0.0,
+         )
+
+         grad_weight_offsets = indices[:, None] * embedding_dim + offsets_n[None, :]
+         tl.atomic_add(
+             grad_weight_ptr + grad_weight_offsets,
+             grad_output,
+             mask=block_mask,
+         )
+
+
+ def get_optimal_block_size(total_elements, dtype_size, BLOCK_SIZE_N: tl.constexpr):
+     # 1. Set the memory multiplier.
+     # 3.0 is an empirical value based on the 910B UB (192KB):
+     # the two 2D tiles (embedding_offsets and output_offsets, or their backward counterparts)
+     # each take BLOCK_SIZE_M * BLOCK_SIZE_N (2 * BLOCK_SIZE_M * BLOCK_SIZE_N in total),
+     # and one more unit is reserved for the remaining one-dimensional UB buffers,
+     # so a conservative estimate of the total UB usage is 3 * BLOCK_SIZE_M * BLOCK_SIZE_N.
+     multiplier = 3.0
+
+     # 2. Call the tiling calculation.
+     # Treat the input as 1D (total_elements,) and tile only on dim 0.
+     tile_shapes = compute_default_tiling_strategy(
+         safety_margin=0.9,
+         dtype_size=dtype_size,
+         memory_multiplier=multiplier,
+         shapes=((total_elements, BLOCK_SIZE_N),),
+         tiling_dims=(0,),
+     )
+
+     # 3. Parse the result.
+     if tile_shapes and len(tile_shapes) > 0:
+         block_size = tile_shapes[0][0]
+         return block_size
+     else:
+         return triton.next_power_of_2(min(128, total_elements))
+
+
+ def embedding_forward(embeddings, indices):
+     ori_shape = indices.shape
+     indices = indices.view(-1)
+
+     n_elements = indices.numel()
+     embedding_dim = embeddings.shape[1]
+     output = torch.empty(
+         indices.shape[0],
+         embeddings.shape[1],
+         device=indices.device,
+         dtype=embeddings.dtype,
+     )
+
+     # With two-dimensional tiling, the block_m and block_n tile sizes compete for UB space.
+     # Since embedding_dim is usually the smaller dimension, BLOCK_SIZE_N is fixed first
+     # and then the largest possible BLOCK_SIZE_M is chosen.
+     BLOCK_SIZE_N = triton.next_power_of_2(min(128, embedding_dim))
+     BLOCK_SIZE_M = get_optimal_block_size(n_elements, embeddings.element_size(), BLOCK_SIZE_N)
+     num_cores = get_npu_core_count()
+     total_blocks = triton.cdiv(n_elements, BLOCK_SIZE_M) * triton.cdiv(embedding_dim, BLOCK_SIZE_N)
+     grid = min(num_cores, total_blocks)
+
+     embedding_forward_kernel[(grid,)](
+         embeddings,
+         indices,
+         output,
+         n_elements,
+         embedding_dim=embedding_dim,
+         BLOCK_SIZE_M=BLOCK_SIZE_M,
+         BLOCK_SIZE_N=BLOCK_SIZE_N,
+         NUM_STAGES=3,
+     )
+
+     return output.view(*ori_shape, -1)
+
+
+ def embedding_backward(embeddings, indices, grad_output):
+     grad_output = grad_output.contiguous().view(-1, embeddings.shape[1])
+
+     grad_weight = torch.zeros_like(embeddings)
+
+     n_elements = indices.numel()
+     embedding_dim = embeddings.shape[1]
+     BLOCK_SIZE_N = triton.next_power_of_2(min(128, embedding_dim))
+     BLOCK_SIZE_M = get_optimal_block_size(n_elements, embeddings.element_size(), BLOCK_SIZE_N)
+     num_cores = get_npu_core_count()
+     total_blocks = triton.cdiv(n_elements, BLOCK_SIZE_M) * triton.cdiv(embedding_dim, BLOCK_SIZE_N)
+     grid = min(num_cores, total_blocks)
+
+     embedding_backward_kernel[(grid,)](
+         grad_output,
+         grad_weight,
+         indices,
+         n_elements,
+         embedding_dim=embedding_dim,
+         BLOCK_SIZE_M=BLOCK_SIZE_M,
+         BLOCK_SIZE_N=BLOCK_SIZE_N,
+         NUM_STAGES=3,
+     )
+
+     return grad_weight
+
+
+ class LigerEmbeddingFunction(torch.autograd.Function):
+     @staticmethod
+     @ensure_contiguous
+     def forward(ctx, embeddings: torch.Tensor, indices: torch.Tensor):
+         output = embedding_forward(embeddings, indices)
+         ctx.save_for_backward(indices, embeddings)
+         return output
+
+     @staticmethod
+     @ensure_contiguous
+     def backward(ctx, grad_output: torch.Tensor):
+         indices, embeddings = ctx.saved_tensors
+         grad_weight = embedding_backward(embeddings, indices, grad_output)
+
+         return grad_weight, None
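
The new Ascend embedding op is a plain row gather in the forward kernel (`indices[:, None] * embedding_dim + offsets_n[None, :]`) and a scatter-add of `grad_output` rows into `grad_weight` via `tl.atomic_add` in the backward kernel; the atomics are what make repeated indices accumulate correctly. Below is a minimal plain-PyTorch reference for that gather/scatter-add semantics, useful as a CPU sanity check; it is not part of the diff and the shapes are illustrative.

import torch

# Illustrative shapes: a (vocab, dim) table and a (batch, seq) index tensor.
weight = torch.randn(1000, 64)
indices = torch.randint(0, 1000, (8, 16))

# Forward reference: gather rows, then restore the index shape,
# mirroring output.view(*ori_shape, -1) in embedding_forward.
out_ref = weight[indices.view(-1)].view(*indices.shape, -1)

# Backward reference: scatter-add grad_output rows into the selected table rows,
# mirroring the tl.atomic_add in embedding_backward_kernel.
grad_out = torch.randn_like(out_ref)
grad_weight_ref = torch.zeros_like(weight)
grad_weight_ref.index_add_(0, indices.view(-1), grad_out.view(-1, weight.shape[1]))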
liger_kernel/ops/backends/_ascend/ops/geglu.py
@@ -0,0 +1,191 @@
+ import torch
+ import triton
+ import triton.language as tl
+
+ from triton.language.math import tanh
+
+ from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy
+ from liger_kernel.ops.utils import ensure_contiguous
+ from liger_kernel.ops.utils import get_npu_core_count
+
+
+ @triton.jit
+ def _geglu_forward_kernel_flat(a_ptr, b_ptr, c_ptr, total_elements, BLOCK_SIZE: tl.constexpr, NUM_STAGES: tl.constexpr):
+     """
+     High-performance GEGLU forward kernel using a flattened 1D approach.
+
+     Uses a grid-stride loop pattern for optimal performance on NPU.
+     """
+     pid = tl.program_id(0)
+     num_progs = tl.num_programs(0)
+
+     # Grid-stride loop
+     start_idx = pid * BLOCK_SIZE
+     stride = num_progs * BLOCK_SIZE
+
+     # Constants for the GELU tanh approximation
+     sqrt_2_over_pi = 0.7978845608028654  # sqrt(2 / pi)
+     gelu_coeff = 0.044715
+
+     for idx in tl.range(start_idx, total_elements, stride, num_stages=NUM_STAGES):
+         offsets = idx + tl.arange(0, BLOCK_SIZE)
+         mask = offsets < total_elements
+
+         a_val = tl.load(a_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+         b_val = tl.load(b_ptr + offsets, mask=mask, other=0.0)
+
+         # The tanh approximation form of GELU is computed as:
+         # 0.5 * a * (1 + tanh(sqrt(2 / pi) * (a + 0.044715 * a^3)))
+         a_cubed = a_val * a_val * a_val
+         tanh_arg = sqrt_2_over_pi * (a_val + gelu_coeff * a_cubed)
+         tanh_result = tanh(tanh_arg)
+         geglu_a = 0.5 * a_val * (1.0 + tanh_result)
+         c_row = geglu_a.cast(b_val.dtype) * b_val
+         tl.store(c_ptr + offsets, c_row, mask=mask)
+
+
+ @triton.jit
+ def _geglu_backward_kernel_flat(
+     dc_ptr, a_ptr, b_ptr, da_ptr, db_ptr, total_elements, BLOCK_SIZE: tl.constexpr, NUM_STAGES: tl.constexpr
+ ):
+     """
+     High-performance GEGLU backward kernel using a flattened 1D approach.
+
+     Uses a grid-stride loop pattern for optimal performance on NPU.
+     """
+     pid = tl.program_id(0)
+     num_progs = tl.num_programs(0)
+     start_idx = pid * BLOCK_SIZE
+     stride = num_progs * BLOCK_SIZE
+
+     # Constants for the GELU tanh approximation
+     sqrt_2_over_pi = 0.7978845608028654  # sqrt(2 / pi)
+     gelu_coeff = 0.044715
+
+     for idx in tl.range(start_idx, total_elements, stride, num_stages=NUM_STAGES):
+         offsets = idx + tl.arange(0, BLOCK_SIZE)
+         mask = offsets < total_elements
+
+         dc = tl.load(dc_ptr + offsets, mask=mask, other=0.0)
+         a = tl.load(a_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+         b = tl.load(b_ptr + offsets, mask=mask, other=0.0)
+
+         # Recompute the forward intermediate to save memory
+         a_cubed = a * a * a
+         tanh_arg = sqrt_2_over_pi * (a + gelu_coeff * a_cubed)
+         tanh_result = tanh(tanh_arg)
+         geglu_a = 0.5 * a * (1 + tanh_result)
+         geglu_a = geglu_a.to(dc.dtype).to(tl.float32)
+
+         db = dc.cast(tl.float32) * geglu_a
+
+         # The gradient w.r.t. a can be computed as:
+         # b * (0.5 * (1 + tanh(z)) + 0.5 * a * (1 - tanh(z)^2) * (sqrt(2/pi) * (1 + 3 * 0.044715 * a^2)))
+         # where z = sqrt(2/pi) * (a + 0.044715 * a^3)
+         term1 = 0.5 * (1.0 + tanh_result)
+         tanh_sq = tanh_result * tanh_result
+         a_sq = a * a
+         term2 = 0.5 * a * (1.0 - tanh_sq) * (sqrt_2_over_pi * (1.0 + 3.0 * gelu_coeff * a_sq))
+         da = dc * b * (term1 + term2)
+
+         tl.store(da_ptr + offsets, da, mask=mask)
+         tl.store(db_ptr + offsets, db.to(dc.dtype), mask=mask)
+
+
+ def get_optimal_block_size(total_elements, is_backward=False):
+     """
+     Calculate the optimal block size using compute_default_tiling_strategy.
+
+     Args:
+         total_elements: Total number of elements to process
+         is_backward: Whether this is for the backward pass (which requires more memory)
+
+     Returns:
+         Optimal block size for the kernel
+     """
+     # Memory multiplier based on peak memory usage analysis
+     if is_backward:
+         memory_multiplier = 6.0
+     else:
+         memory_multiplier = 3.0
+     # Call the tiling calculation.
+     # Treat the input as 1D (total_elements,) and tile only on dim 0.
+     tile_shapes = compute_default_tiling_strategy(
+         safety_margin=0.9,
+         dtype_size=4,
+         memory_multiplier=memory_multiplier,
+         shapes=((total_elements,),),
+         tiling_dims=(0,),
+     )
+
+     # Parse the result
+     if tile_shapes and len(tile_shapes) > 0:
+         block_size = tile_shapes[0][0]
+         return max(256, block_size)
+     else:
+         return 2048
+
+
+ def geglu_forward(a, b):
+     """
+     High-performance GEGLU forward pass for NPU using a flattened 1D approach.
+     """
+     if not a.is_contiguous():
+         a = a.contiguous()
+     if not b.is_contiguous():
+         b = b.contiguous()
+
+     total_elements = a.numel()
+     c = torch.empty_like(a)
+
+     block_size = get_optimal_block_size(total_elements, is_backward=False)
+
+     num_cores = get_npu_core_count()
+     grid_size = min(num_cores, (total_elements + block_size - 1) // block_size)
+
+     _geglu_forward_kernel_flat[(grid_size,)](a, b, c, total_elements, BLOCK_SIZE=block_size, NUM_STAGES=3, num_warps=4)
+     return c
+
+
+ def geglu_backward(a, b, dc):
+     """
+     High-performance GEGLU backward pass for NPU using a flattened 1D approach.
+     """
+     if not dc.is_contiguous():
+         dc = dc.contiguous()
+     if not a.is_contiguous():
+         a = a.contiguous()
+     if not b.is_contiguous():
+         b = b.contiguous()
+
+     total_elements = dc.numel()
+     grad_a = torch.empty_like(a)
+     grad_b = torch.empty_like(b)
+
+     block_size = get_optimal_block_size(total_elements, is_backward=True)
+
+     num_cores = get_npu_core_count()
+     grid_size = min(num_cores, (total_elements + block_size - 1) // block_size)
+
+     _geglu_backward_kernel_flat[(grid_size,)](
+         dc, a, b, grad_a, grad_b, total_elements, BLOCK_SIZE=block_size, NUM_STAGES=3, num_warps=4
+     )
+     return grad_a, grad_b
+
+
+ class LigerGELUMulFunction(torch.autograd.Function):
+     """High-performance GEGLU function for Ascend NPU."""
+
+     @staticmethod
+     @ensure_contiguous
+     def forward(ctx, a, b):
+         c = geglu_forward(a, b)
+         ctx.save_for_backward(a, b)
+         return c
+
+     @staticmethod
+     @ensure_contiguous
+     def backward(ctx, dc):
+         a, b = ctx.saved_tensors
+         grad_a, grad_b = geglu_backward(a, b, dc)
+         return grad_a, grad_b
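
The GEGLU kernels evaluate GELU with the tanh approximation, 0.5 * a * (1 + tanh(sqrt(2/pi) * (a + 0.044715 * a^3))), and multiply it by the gate b; the backward kernel recomputes this intermediate rather than saving it. A minimal CPU reference for the forward math, compared against torch.nn.functional.gelu with approximate="tanh" (not part of the diff; the function name and shapes are illustrative):

import math

import torch
import torch.nn.functional as F


def geglu_tanh_reference(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    # Same closed form the Triton kernel evaluates element-wise.
    sqrt_2_over_pi = math.sqrt(2.0 / math.pi)
    gelu_a = 0.5 * a * (1.0 + torch.tanh(sqrt_2_over_pi * (a + 0.044715 * a.pow(3))))
    return gelu_a * b


a = torch.randn(4, 128)
b = torch.randn(4, 128)
# PyTorch's tanh-approximate GELU uses the same formula, so the two should agree.
torch.testing.assert_close(geglu_tanh_reference(a, b), F.gelu(a, approximate="tanh") * b)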
liger_kernel/ops/backends/_ascend/ops/llama4_rope.py
@@ -0,0 +1,298 @@
+ import torch
+ import triton
+ import triton.language as tl
+
+ from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy
+
+
+ def _prepare_freqs(freqs_cis: torch.Tensor, seq_len: int, head_dim_half: int):
+     """
+     Canonicalize freqs to (seq_len, head_dim_half) real/imag tensors.
+
+     Supports:
+     - complex freqs: (..., head_dim_half) complex -> real/imag
+     - packed freqs: (..., 2*head_dim_half) real -> split into real/imag
+     """
+     if freqs_cis.is_complex():
+         freqs_real = freqs_cis.real
+         freqs_imag = freqs_cis.imag
+     else:
+         if freqs_cis.shape[-1] == 2 * head_dim_half:
+             freqs_real = freqs_cis[..., :head_dim_half]
+             freqs_imag = freqs_cis[..., head_dim_half:]
+         else:
+             raise ValueError(
+                 f"Unexpected freqs_cis shape for non-complex input: {freqs_cis.shape}, "
+                 f"expected last dim = {2 * head_dim_half}"
+             )
+
+     if freqs_real.shape[-1] != head_dim_half:
+         raise ValueError(f"Unexpected last dim for freqs: {freqs_real.shape[-1]} (expected {head_dim_half})")
+
+     # Flatten leading dims -> (N, head_dim_half)
+     freqs_real = freqs_real.reshape(-1, head_dim_half)
+     freqs_imag = freqs_imag.reshape(-1, head_dim_half)
+
+     # Broadcast/slice to (seq_len, head_dim_half)
+     if freqs_real.shape[0] < seq_len:
+         if freqs_real.shape[0] == 1:
+             freqs_real = freqs_real.expand(seq_len, -1)
+             freqs_imag = freqs_imag.expand(seq_len, -1)
+         else:
+             raise ValueError(f"Insufficient rows in freqs: {freqs_real.shape[0]} < seq_len={seq_len}")
+     elif freqs_real.shape[0] > seq_len:
+         freqs_real = freqs_real[:seq_len]
+         freqs_imag = freqs_imag[:seq_len]
+
+     return freqs_real, freqs_imag
+
+
+ def _cast_and_contiguous(q, k, freqs_real, freqs_imag):
+     # Align dtype: fp32 only when q is fp32; otherwise keep q dtype for perf
+     compute_dtype = torch.float32 if q.dtype == torch.float32 else q.dtype
+
+     if k.dtype != q.dtype:
+         k = k.to(q.dtype)
+
+     q = q.to(compute_dtype).contiguous()
+     k = k.to(compute_dtype).contiguous()
+     freqs_real = freqs_real.to(compute_dtype).contiguous()
+     freqs_imag = freqs_imag.to(compute_dtype).contiguous()
+     return q, k, freqs_real, freqs_imag, compute_dtype
+
+
+ @triton.jit
+ def _triton_llama4_rope_npu(
+     q_ptr,
+     k_ptr,
+     freqs_real_ptr,
+     freqs_imag_ptr,
+     q_row_stride,
+     k_row_stride,
+     q_head_stride,
+     k_head_stride,
+     freqs_row_stride,
+     sl,
+     bs: tl.constexpr,
+     n_qh: tl.constexpr,
+     n_kh: tl.constexpr,
+     hd: tl.constexpr,
+     BLOCK_Q: tl.constexpr,
+     BLOCK_K: tl.constexpr,
+     imag_sign: tl.constexpr,
+ ):
+     """
+     Llama4 RoPE on Ascend NPU for interleaved complex layout:
+     - q/k shape: (bs, sl, n_heads, hd)
+     - last dim layout: [real0, imag0, real1, imag1, ...]
+     - freqs_real/imag: (sl, hd//2)
+     """
+     pid = tl.program_id(0).to(tl.int64)
+     batch_idx = pid // sl
+     seq_idx = pid % sl
+
+     if batch_idx >= bs:
+         return
+
+     q_base = q_ptr + pid * q_row_stride
+     k_base = k_ptr + pid * k_row_stride
+
+     freq_base = seq_idx * freqs_row_stride
+     hd_idx = tl.arange(0, hd)
+     hd_mask = hd_idx < (hd)
+
+     freq_idx = tl.arange(0, hd // 2)
+     freq_mask = freq_idx < (hd // 2)
+
+     freqs_real = tl.load(freqs_real_ptr + freq_base + freq_idx, mask=freq_mask, other=0.0)
+     freqs_imag = tl.load(freqs_imag_ptr + freq_base + freq_idx, mask=freq_mask, other=0.0) * imag_sign
+
+     # Q heads (chunked for UB)
+     for qh_block in range(0, n_qh, BLOCK_Q):
+         qh_idx = tl.arange(0, BLOCK_Q) + qh_block
+         qh_mask = qh_idx < n_qh
+         block_mask = qh_mask[:, None] & hd_mask[None, :]
+
+         head_ptr = q_base + qh_idx[:, None] * q_head_stride
+
+         q_pair = tl.load(
+             head_ptr + hd_idx[None, :],
+             mask=block_mask,
+             other=0.0,
+         )
+         q_pair = q_pair.reshape(BLOCK_Q, hd // 2, 2, can_reorder=True)
+         q_real, q_imag = tl.split(q_pair)
+
+         new_real = tl.math.fma(q_real, freqs_real, -(q_imag * freqs_imag))
+         new_imag = tl.math.fma(q_real, freqs_imag, q_imag * freqs_real)
+         new_q_pair = tl.interleave(new_real, new_imag)
+
+         tl.store(head_ptr + hd_idx[None, :], new_q_pair, mask=block_mask)
+
+     # K heads (chunked for UB)
+     for kh_block in range(0, n_kh, BLOCK_K):
+         kh_idx = tl.arange(0, BLOCK_K) + kh_block
+         kh_mask = kh_idx < n_kh
+         block_mask = kh_mask[:, None] & hd_mask[None, :]
+
+         head_ptr = k_base + kh_idx[:, None] * k_head_stride
+
+         k_pair = tl.load(
+             head_ptr + hd_idx[None, :],
+             mask=block_mask,
+             other=0.0,
+         )
+
+         k_pair = k_pair.reshape(BLOCK_K, hd // 2, 2, can_reorder=True)
+         k_real, k_imag = tl.split(k_pair)
+
+         new_real = tl.math.fma(k_real, freqs_real, -(k_imag * freqs_imag))
+         new_imag = tl.math.fma(k_real, freqs_imag, k_imag * freqs_real)
+         new_k_pair = tl.interleave(new_real, new_imag)
+
+         tl.store(head_ptr + hd_idx[None, :], new_k_pair, mask=block_mask)
+
+
+ def llama4_rope_forward(q, k, freqs_cis):
+     """
+     Ascend NPU implementation of Llama4 RoPE.
+
+     q/k: (bs, sl, n_heads, hd) with interleaved complex last-dim layout.
+     freqs_cis: complex (..., hd//2) OR packed (..., 2*(hd//2)).
+     """
+     original_dtype = q.dtype
+
+     bs, sl, n_qh, hd = q.shape
+     _, _, n_kh, _ = k.shape
+     if hd % 2 != 0:
+         raise ValueError(f"head_dim must be even for interleaved complex layout, got {hd}")
+     hd_half = hd // 2
+
+     freqs_real, freqs_imag = _prepare_freqs(freqs_cis, sl, hd_half)
+     q, k, freqs_real, freqs_imag, compute_dtype = _cast_and_contiguous(q, k, freqs_real, freqs_imag)
+
+     # UB tiling strategy: tile heads dimension only
+     dtype_size = q.element_size()
+     shapes = ((n_qh, hd), (n_kh, hd))
+     tile_shapes = compute_default_tiling_strategy(
+         safety_margin=0.90,
+         dtype_size=dtype_size,
+         memory_multiplier=12.0,
+         shapes=shapes,
+         tiling_dims=(0, 0),
+     )
+
+     if tile_shapes is not None and len(tile_shapes) == len(shapes):
+         q_tile_shape, k_tile_shape = tile_shapes
+         BLOCK_Q, _ = q_tile_shape
+         BLOCK_K, _ = k_tile_shape
+     else:
+         BLOCK_Q = triton.next_power_of_2(n_qh)
+         BLOCK_K = triton.next_power_of_2(n_kh)
+
+     n_row = bs * sl
+
+     _triton_llama4_rope_npu[(n_row,)](
+         q,
+         k,
+         freqs_real,
+         freqs_imag,
+         q.stride(1),
+         k.stride(1),
+         q.stride(2),
+         k.stride(2),
+         freqs_real.stride(0),
+         sl,
+         bs,
+         n_qh,
+         n_kh,
+         hd,
+         BLOCK_Q,
+         BLOCK_K,
+         imag_sign=1.0,
+     )
+
+     if compute_dtype != original_dtype:
+         q = q.to(original_dtype)
+         k = k.to(original_dtype)
+     return q, k
+
+
+ def llama4_rope_backward(dq, dk, freqs_cis):
+     """
+     Ascend NPU implementation of the Llama4 RoPE backward pass (conjugate rotation via imag_sign=-1).
+
+     dq/dk: (bs, sl, n_heads, hd) with interleaved complex last-dim layout.
+     freqs_cis: complex (..., hd//2) OR packed (..., 2*(hd//2)).
+     """
+     original_dtype = dq.dtype
+
+     bs, sl, n_qh, hd = dq.shape
+     _, _, n_kh, _ = dk.shape
+     if hd % 2 != 0:
+         raise ValueError(f"head_dim must be even for interleaved complex layout, got {hd}")
+     hd_half = hd // 2
+
+     freqs_real, freqs_imag = _prepare_freqs(freqs_cis, sl, hd_half)
+     dq, dk, freqs_real, freqs_imag, compute_dtype = _cast_and_contiguous(dq, dk, freqs_real, freqs_imag)
+
+     # UB tiling strategy: tile heads dimension only
+     dtype_size = dq.element_size()
+     shapes = ((n_qh, hd), (n_kh, hd))
+     tile_shapes = compute_default_tiling_strategy(
+         safety_margin=0.90,
+         dtype_size=dtype_size,
+         memory_multiplier=12.0,
+         shapes=shapes,
+         tiling_dims=(0, 0),
+     )
+
+     if tile_shapes is not None and len(tile_shapes) == len(shapes):
+         q_tile_shape, k_tile_shape = tile_shapes
+         BLOCK_Q, _ = q_tile_shape
+         BLOCK_K, _ = k_tile_shape
+     else:
+         BLOCK_Q = triton.next_power_of_2(n_qh)
+         BLOCK_K = triton.next_power_of_2(n_kh)
+
+     n_row = bs * sl
+
+     _triton_llama4_rope_npu[(n_row,)](
+         dq,
+         dk,
+         freqs_real,
+         freqs_imag,
+         dq.stride(1),
+         dk.stride(1),
+         dq.stride(2),
+         dk.stride(2),
+         freqs_real.stride(0),
+         sl,
+         bs,
+         n_qh,
+         n_kh,
+         hd,
+         BLOCK_Q,
+         BLOCK_K,
+         imag_sign=-1.0,
+     )
+
+     if compute_dtype != original_dtype:
+         dq = dq.to(original_dtype)
+         dk = dk.to(original_dtype)
+     return dq, dk
+
+
+ class LigerLlama4RopeFunction(torch.autograd.Function):
+     @staticmethod
+     def forward(ctx, q, k, freqs_cis, BLOCK_SIZE: int = None):
+         # BLOCK_SIZE is ignored for Ascend (heads are auto-tiled by UB); kept for API compatibility
+         q_out, k_out = llama4_rope_forward(q, k, freqs_cis)
+         ctx.save_for_backward(freqs_cis.detach() if isinstance(freqs_cis, torch.Tensor) else freqs_cis)
+         return q_out, k_out
+
+     @staticmethod
+     def backward(ctx, dq, dk):
+         (freqs_cis,) = ctx.saved_tensors
+         dq_out, dk_out = llama4_rope_backward(dq, dk, freqs_cis)
+         return dq_out, dk_out, None, None
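
The RoPE kernel treats each consecutive [real, imag] pair of the head dimension as a complex number and rotates it by the per-position frequency: new_real = q_real * f_real - q_imag * f_imag and new_imag = q_real * f_imag + q_imag * f_real, with imag_sign=-1.0 selecting the conjugate rotation that the backward pass applies to dq/dk. A minimal sketch of the equivalent computation using torch complex views (not part of the diff; the function name and shapes are illustrative):

import torch


def rope_interleaved_reference(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
    # x: (bs, sl, n_heads, hd) with interleaved [real, imag] pairs on the last dim.
    # freqs_cis: complex tensor of shape (sl, hd // 2).
    x_complex = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
    rotated = x_complex * freqs_cis[None, :, None, :]  # broadcast over batch and heads
    return torch.view_as_real(rotated).flatten(-2).to(x.dtype)


bs, sl, n_heads, hd = 2, 16, 8, 64
q = torch.randn(bs, sl, n_heads, hd)
# Unit-magnitude complex frequencies, one per (position, pair).
freqs_cis = torch.polar(torch.ones(sl, hd // 2), torch.randn(sl, hd // 2))
q_rotated = rope_interleaved_reference(q, freqs_cis)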