liger-kernel 0.6.3__py3-none-any.whl → 0.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. liger_kernel/chunked_loss/cosine_similarity_loss.py +20 -5
  2. liger_kernel/chunked_loss/fused_linear_distillation.py +23 -5
  3. liger_kernel/chunked_loss/fused_linear_ppo.py +21 -5
  4. liger_kernel/chunked_loss/grpo_loss.py +8 -5
  5. liger_kernel/chunked_loss/jsd_loss.py +39 -11
  6. liger_kernel/ops/__init__.py +141 -0
  7. liger_kernel/ops/backends/README.md +151 -0
  8. liger_kernel/ops/backends/__init__.py +13 -0
  9. liger_kernel/ops/backends/_ascend/__init__.py +5 -0
  10. liger_kernel/ops/backends/_ascend/ascend-ub-manager-design.md +492 -0
  11. liger_kernel/ops/backends/_ascend/ops/__init__.py +61 -0
  12. liger_kernel/ops/backends/_ascend/ops/embedding.py +214 -0
  13. liger_kernel/ops/backends/_ascend/ops/geglu.py +191 -0
  14. liger_kernel/ops/backends/_ascend/ops/llama4_rope.py +298 -0
  15. liger_kernel/ops/backends/_ascend/ops/qwen2vl_mrope.py +275 -0
  16. liger_kernel/ops/backends/_ascend/ops/rope.py +265 -0
  17. liger_kernel/ops/backends/_ascend/ops/swiglu.py +142 -0
  18. liger_kernel/ops/backends/_ascend/ops/tvd.py +223 -0
  19. liger_kernel/ops/backends/_ascend/ub_manager.py +367 -0
  20. liger_kernel/ops/backends/registry.py +61 -0
  21. liger_kernel/ops/cross_entropy.py +71 -11
  22. liger_kernel/ops/dyt.py +5 -2
  23. liger_kernel/ops/fused_add_rms_norm.py +21 -23
  24. liger_kernel/ops/fused_linear_cross_entropy.py +32 -5
  25. liger_kernel/ops/geglu.py +5 -3
  26. liger_kernel/ops/group_norm.py +12 -8
  27. liger_kernel/ops/grpo_loss.py +3 -1
  28. liger_kernel/ops/kl_div.py +8 -11
  29. liger_kernel/ops/layer_norm.py +89 -69
  30. liger_kernel/ops/poly_norm.py +19 -21
  31. liger_kernel/ops/rms_norm.py +149 -71
  32. liger_kernel/ops/tiled_mlp.py +136 -0
  33. liger_kernel/ops/utils.py +25 -0
  34. liger_kernel/transformers/__init__.py +25 -0
  35. liger_kernel/transformers/auto_model.py +21 -0
  36. liger_kernel/transformers/cross_entropy.py +9 -4
  37. liger_kernel/transformers/dyt.py +1 -1
  38. liger_kernel/transformers/experimental/embedding.py +1 -1
  39. liger_kernel/transformers/functional.py +44 -26
  40. liger_kernel/transformers/fused_add_rms_norm.py +1 -1
  41. liger_kernel/transformers/fused_linear_cross_entropy.py +9 -4
  42. liger_kernel/transformers/fused_linear_jsd.py +1 -1
  43. liger_kernel/transformers/fused_neighborhood_attention.py +1 -1
  44. liger_kernel/transformers/geglu.py +1 -1
  45. liger_kernel/transformers/group_norm.py +1 -1
  46. liger_kernel/transformers/grpo_loss.py +57 -2
  47. liger_kernel/transformers/jsd.py +1 -1
  48. liger_kernel/transformers/kl_div.py +1 -1
  49. liger_kernel/transformers/layer_norm.py +1 -1
  50. liger_kernel/transformers/llama4_rope.py +1 -1
  51. liger_kernel/transformers/model/exaone4.py +136 -0
  52. liger_kernel/transformers/model/falcon_h1.py +19 -5
  53. liger_kernel/transformers/model/gemma.py +17 -6
  54. liger_kernel/transformers/model/gemma2.py +17 -8
  55. liger_kernel/transformers/model/gemma3.py +35 -16
  56. liger_kernel/transformers/model/glm4.py +16 -4
  57. liger_kernel/transformers/model/glm4v.py +16 -4
  58. liger_kernel/transformers/model/glm4v_moe.py +23 -4
  59. liger_kernel/transformers/model/gpt_oss.py +211 -0
  60. liger_kernel/transformers/model/hunyuan_v1.py +134 -0
  61. liger_kernel/transformers/model/internvl.py +12 -5
  62. liger_kernel/transformers/model/llama.py +14 -5
  63. liger_kernel/transformers/model/llama4.py +16 -4
  64. liger_kernel/transformers/model/llava.py +12 -4
  65. liger_kernel/transformers/model/loss_utils.py +37 -3
  66. liger_kernel/transformers/model/mistral.py +15 -6
  67. liger_kernel/transformers/model/mixtral.py +16 -7
  68. liger_kernel/transformers/model/mllama.py +12 -4
  69. liger_kernel/transformers/model/olmo2.py +16 -4
  70. liger_kernel/transformers/model/olmo3.py +142 -0
  71. liger_kernel/transformers/model/output_classes.py +147 -0
  72. liger_kernel/transformers/model/paligemma.py +23 -5
  73. liger_kernel/transformers/model/phi3.py +14 -7
  74. liger_kernel/transformers/model/qwen2.py +16 -3
  75. liger_kernel/transformers/model/qwen2_5_vl.py +14 -6
  76. liger_kernel/transformers/model/qwen2_vl.py +16 -4
  77. liger_kernel/transformers/model/qwen3.py +20 -5
  78. liger_kernel/transformers/model/qwen3_moe.py +19 -5
  79. liger_kernel/transformers/model/qwen3_next.py +17 -5
  80. liger_kernel/transformers/model/qwen3_vl.py +150 -0
  81. liger_kernel/transformers/model/qwen3_vl_moe.py +126 -0
  82. liger_kernel/transformers/model/smollm3.py +15 -6
  83. liger_kernel/transformers/monkey_patch.py +584 -49
  84. liger_kernel/transformers/multi_token_attention.py +1 -1
  85. liger_kernel/transformers/poly_norm.py +1 -1
  86. liger_kernel/transformers/qwen2vl_mrope.py +1 -1
  87. liger_kernel/transformers/rms_norm.py +8 -3
  88. liger_kernel/transformers/rope.py +45 -1
  89. liger_kernel/transformers/softmax.py +1 -1
  90. liger_kernel/transformers/sparsemax.py +1 -1
  91. liger_kernel/transformers/swiglu.py +18 -1
  92. liger_kernel/transformers/tiled_mlp.py +125 -0
  93. liger_kernel/transformers/tvd.py +1 -1
  94. liger_kernel/utils.py +54 -0
  95. {liger_kernel-0.6.3.dist-info → liger_kernel-0.6.5.dist-info}/METADATA +14 -4
  96. liger_kernel-0.6.5.dist-info/RECORD +134 -0
  97. {liger_kernel-0.6.3.dist-info → liger_kernel-0.6.5.dist-info}/WHEEL +1 -1
  98. liger_kernel-0.6.3.dist-info/RECORD +0 -111
  99. {liger_kernel-0.6.3.dist-info → liger_kernel-0.6.5.dist-info}/licenses/LICENSE +0 -0
  100. {liger_kernel-0.6.3.dist-info → liger_kernel-0.6.5.dist-info}/licenses/NOTICE +0 -0
  101. {liger_kernel-0.6.3.dist-info → liger_kernel-0.6.5.dist-info}/top_level.txt +0 -0
liger_kernel/ops/backends/_ascend/ops/rope.py
@@ -0,0 +1,265 @@
+ import torch
+ import triton
+ import triton.language as tl
+
+ from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy
+ from liger_kernel.ops.utils import get_npu_core_count
+
+
+ @triton.jit
+ def _triton_rope_npu(
+     q_ptr,
+     q_row_stride,
+     k_ptr,
+     k_row_stride,
+     cos,
+     cos_row_stride,
+     sin,
+     sin_row_stride,
+     sl,
+     total_rows: tl.constexpr,
+     cos_bs: tl.constexpr,
+     n_qh: tl.constexpr,
+     n_kh: tl.constexpr,
+     hd: tl.constexpr,
+     BLOCK_Q: tl.constexpr,
+     BLOCK_K: tl.constexpr,
+     NUM_STAGES: tl.constexpr,
+     BACKWARD_PASS: tl.constexpr = False,
+ ):
+     program_id = tl.program_id(0)
+     num_programs = tl.num_programs(0)
+
+     rows_per_program = (total_rows + num_programs - 1) // num_programs
+     start_row = program_id * rows_per_program
+     actual_rows = tl.minimum(rows_per_program, total_rows - start_row)
+
+     for row_offset in tl.range(0, actual_rows, num_stages=NUM_STAGES):
+         pid = start_row + row_offset
+
+         row_idx = pid % sl
+         cos_ptr = cos + tl.where(cos_bs == 1, row_idx * cos_row_stride, pid * cos_row_stride)
+         sin_ptr = sin + tl.where(cos_bs == 1, row_idx * sin_row_stride, pid * sin_row_stride)
+
+         # Pre-compute d_idx and cos/sin values outside the head loops (they don't depend on heads)
+         d_idx = tl.arange(0, hd // 2)
+         d_mask = d_idx < (hd // 2)  # Always True, but kept for clarity
+         cos_vals = tl.load(cos_ptr + d_idx, mask=d_mask, other=0)
+         sin_vals = tl.load(sin_ptr + d_idx, mask=d_mask, other=0)
+
+         # Process q heads in chunks to prevent UB overflow
+         for qh_block in range(0, n_qh, BLOCK_Q):
+             qh_idx = tl.arange(0, BLOCK_Q) + qh_block
+             qh_mask = qh_idx < n_qh
+
+             # block_mask: qh_mask broadcasted over d_idx dimension
+             block_mask = qh_mask[:, None]
+
+             offsets = qh_idx[:, None] * hd + d_idx[None, :]
+             q_base = q_ptr + pid * q_row_stride
+
+             q_left = tl.load(q_base + offsets, mask=block_mask, other=0)
+             q_right = tl.load(q_base + offsets + (hd // 2), mask=block_mask, other=0)
+
+             if not BACKWARD_PASS:
+                 new_left = q_left * cos_vals - q_right * sin_vals
+                 new_right = q_right * cos_vals + q_left * sin_vals
+             else:
+                 new_left = q_left * cos_vals + q_right * sin_vals
+                 new_right = q_right * cos_vals - q_left * sin_vals
+
+             tl.store(q_base + offsets, new_left, mask=block_mask)
+             tl.store(q_base + offsets + (hd // 2), new_right, mask=block_mask)
+
+         # Process k heads in chunks to prevent UB overflow
+         for kh_block in range(0, n_kh, BLOCK_K):
+             kh_idx = tl.arange(0, BLOCK_K) + kh_block
+             kh_mask = kh_idx < n_kh
+
+             # block_mask: kh_mask broadcasted over d_idx dimension
+             block_mask = kh_mask[:, None]
+
+             offsets = kh_idx[:, None] * hd + d_idx[None, :]
+             k_base = k_ptr + pid * k_row_stride
+
+             k_left = tl.load(k_base + offsets, mask=block_mask, other=0)
+             k_right = tl.load(k_base + offsets + (hd // 2), mask=block_mask, other=0)
+
+             if not BACKWARD_PASS:
+                 new_left = k_left * cos_vals - k_right * sin_vals
+                 new_right = k_right * cos_vals + k_left * sin_vals
+             else:
+                 new_left = k_left * cos_vals + k_right * sin_vals
+                 new_right = k_right * cos_vals - k_left * sin_vals
+
+             tl.store(k_base + offsets, new_left, mask=block_mask)
+             tl.store(k_base + offsets + (hd // 2), new_right, mask=block_mask)
+
+
+ def get_optimal_block_size(pad_n_q_head, pad_n_kv_head, pad_hd, dtype_size):
+     # Compute the tiling strategy based on UB capacity.
+     # RoPE forward tiling strategy (based on the optimized RoPE kernel):
+     # - cos_vals and sin_vals are loaded once outside the loops (shared): pad_hd // 2 elements each
+     # - In the q-heads loop (peak memory):
+     #   * q_left: BLOCK_Q * (pad_hd // 2) elements
+     #   * q_right: BLOCK_Q * (pad_hd // 2) elements
+     #   * new_left: BLOCK_Q * (pad_hd // 2) elements (intermediate result)
+     #   * new_right: BLOCK_Q * (pad_hd // 2) elements (intermediate result)
+     #   * Total: 4 * BLOCK_Q * (pad_hd // 2) = 2 * BLOCK_Q * pad_hd elements
+     # - In the k-heads loop (peak memory):
+     #   * k_left: BLOCK_K * (pad_hd // 2) elements
+     #   * k_right: BLOCK_K * (pad_hd // 2) elements
+     #   * new_left: BLOCK_K * (pad_hd // 2) elements (intermediate result)
+     #   * new_right: BLOCK_K * (pad_hd // 2) elements (intermediate result)
+     #   * Total: 4 * BLOCK_K * (pad_hd // 2) = 2 * BLOCK_K * pad_hd elements
+     # - Since q and k are processed separately, peak memory is the max(BLOCK_Q, BLOCK_K) case
+     # - Plus shared cos/sin: 2 * (pad_hd // 2) = pad_hd elements
+     # - Conservative estimate: (2 * BLOCK_SIZE * pad_hd + pad_hd) * dtype_size * 8 bits
+     # - Simplified: (2 * BLOCK_SIZE + 1) * pad_hd * dtype_size * 8 bits
+     # - For safety, use memory_multiplier=3.0, i.e. 3.0 * BLOCK_SIZE * pad_hd * dtype_size * 8 bits
+     # - shapes: ((pad_n_q_head, pad_hd), (pad_n_kv_head, pad_hd))
+     # - tiling_dims: (0, 0) means the first dimension of each shape can be tiled
+     # - Returns: ((block_size_q, pad_hd), (block_size_kv, pad_hd))
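+     # Illustrative arithmetic (hypothetical numbers, only to show how the estimate is used;
+     # the actual UB budget and rounding live in compute_default_tiling_strategy):
+     # with pad_hd = 128 and dtype_size = 2 (bf16), one tiled row costs about
+     # 3.0 * 128 * 2 * 8 = 6144 bits, so a 192 KB UB with safety_margin = 0.90 leaves room for
+     # roughly 0.90 * 192 * 1024 * 8 / 6144 ≈ 230 rows, and BLOCK_Q / BLOCK_K are chosen to fit
+     # within that bound.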
+     shapes = ((pad_n_q_head, pad_hd), (pad_n_kv_head, pad_hd))
+     tile_shapes = compute_default_tiling_strategy(
+         safety_margin=0.90,
+         dtype_size=dtype_size,
+         memory_multiplier=3.0,
+         shapes=shapes,
+         tiling_dims=(0, 0),
+     )
+
+     if tile_shapes is not None and len(tile_shapes) == len(shapes):
+         # Strategy returns ((block_size_q, pad_hd), (block_size_kv, pad_hd))
+         q_tile_shape, k_tile_shape = tile_shapes
+         BLOCK_Q, _ = q_tile_shape
+         BLOCK_K, _ = k_tile_shape
+     else:
+         # Fallback to conservative defaults
+         BLOCK_Q = 2048
+         BLOCK_K = 2048
+
+     return BLOCK_Q, BLOCK_K
+
+
+ def rope_forward(q, k, cos, sin):
+     # Transpose back to the physical shape because Triton looks at the physical storage.
+     # Note: q and k are non-contiguous before the transformation and become contiguous after the transpose.
+     q = q.transpose(1, 2)
+     k = k.transpose(1, 2)
+
+     batch_size, seq_len, n_q_head, head_dim = q.shape
+     n_kv_head = k.shape[2]
+     pad_hd = triton.next_power_of_2(head_dim)
+     pad_n_q_head = triton.next_power_of_2(n_q_head)
+     pad_n_kv_head = triton.next_power_of_2(n_kv_head)
+
+     n_row = batch_size * seq_len
+
+     # Ensure tensors passed into the kernel are contiguous; this is a no-op if they already are.
+     q = q.contiguous()
+     k = k.contiguous()
+     cos = cos.contiguous()
+     sin = sin.contiguous()
+     cos_batch_size = cos.shape[0]
+
+     dtype_size = q.element_size()
+     BLOCK_Q, BLOCK_K = get_optimal_block_size(pad_n_q_head, pad_n_kv_head, pad_hd, dtype_size)
+
+     num_cores = get_npu_core_count()
+     grid_size = min(num_cores, n_row)
+
+     _triton_rope_npu[(grid_size,)](
+         q,
+         q.stride(1),
+         k,
+         k.stride(1),
+         cos,
+         cos.stride(-2),
+         sin,
+         sin.stride(-2),
+         seq_len,
+         n_row,
+         cos_batch_size,
+         n_q_head,
+         n_kv_head,
+         head_dim,
+         BLOCK_Q,
+         BLOCK_K,
+         NUM_STAGES=3,
+         BACKWARD_PASS=False,
+     )
+     return q.transpose(1, 2), k.transpose(1, 2), cos, sin
+
+
+ def rope_backward(dq, dk, cos, sin):
+     dq = dq.transpose(1, 2)
+     dk = dk.transpose(1, 2)
+
+     batch_size, seq_len, n_q_head, head_dim = dq.shape
+     cos_batch_size = cos.shape[0]
+     n_kv_head = dk.shape[2]
+     pad_hd = triton.next_power_of_2(head_dim)
+     pad_n_q_head = triton.next_power_of_2(n_q_head)
+     pad_n_kv_head = triton.next_power_of_2(n_kv_head)
+
+     n_row = batch_size * seq_len
+
+     # ensure dq and dk are contiguous
+     dq = dq.contiguous()
+     dk = dk.contiguous()
+
+     dtype_size = dq.element_size()
+     BLOCK_Q, BLOCK_K = get_optimal_block_size(pad_n_q_head, pad_n_kv_head, pad_hd, dtype_size)
+
+     num_cores = get_npu_core_count()
+     grid_size = min(num_cores, n_row)
+
+     _triton_rope_npu[(grid_size,)](
+         dq,
+         dq.stride(1),
+         dk,
+         dk.stride(1),
+         cos,
+         cos.stride(-2),
+         sin,
+         sin.stride(-2),
+         seq_len,
+         n_row,
+         cos_batch_size,
+         n_q_head,
+         n_kv_head,
+         head_dim,
+         BLOCK_Q,
+         BLOCK_K,
+         NUM_STAGES=3,
+         BACKWARD_PASS=True,
+     )
+     return dq.transpose(1, 2), dk.transpose(1, 2)
+
+
+ class LigerRopeFunction(torch.autograd.Function):
+     @staticmethod
+     def forward(ctx, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+         """
+         q size: (bsz, n_q_head, seq_len, head_dim)
+         k size: (bsz, n_kv_head, seq_len, head_dim)
+         cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+         sin size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+         """
+         q, k, cos, sin = rope_forward(q, k, cos, sin)
+         ctx.save_for_backward(cos, sin)
+         return q, k
+
+     @staticmethod
+     def backward(ctx, dq, dk):
+         """
+         dq size: (bsz, n_q_head, seq_len, head_dim)
+         dk size: (bsz, n_kv_head, seq_len, head_dim)
+         cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+         sin size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+         """
+
+         cos, sin = ctx.saved_tensors
+         dq, dk = rope_backward(dq, dk, cos, sin)
+         return dq, dk, None, None, None, None
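+
+ # Usage sketch (hypothetical shapes; requires an Ascend NPU with the Triton backend available):
+ #   q = torch.randn(2, 32, 128, 64, dtype=torch.bfloat16, device="npu")  # (bsz, n_q_head, seq_len, head_dim)
+ #   k = torch.randn(2, 8, 128, 64, dtype=torch.bfloat16, device="npu")   # (bsz, n_kv_head, seq_len, head_dim)
+ #   cos = torch.randn(1, 128, 64, dtype=torch.bfloat16, device="npu")    # (1, seq_len, head_dim)
+ #   sin = torch.randn(1, 128, 64, dtype=torch.bfloat16, device="npu")
+ #   q_rot, k_rot = LigerRopeFunction.apply(q, k, cos, sin)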
liger_kernel/ops/backends/_ascend/ops/swiglu.py
@@ -0,0 +1,142 @@
+ import torch
+ import triton
+ import triton.language as tl
+
+ from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy
+ from liger_kernel.ops.utils import get_npu_core_count
+
+ # -----------------------------------------------------------------------------
+ # Kernels (High-performance 1D Flatten Implementation)
+ # -----------------------------------------------------------------------------
+
+
+ @triton.jit
+ def _swiglu_forward_kernel_flat(
+     a_ptr, b_ptr, c_ptr, total_elements, BLOCK_SIZE: tl.constexpr, NUM_STAGES: tl.constexpr
+ ):
+     pid = tl.program_id(0)
+     num_progs = tl.num_programs(0)
+
+     # Grid-Stride Loop
+     start_idx = pid * BLOCK_SIZE
+     stride = num_progs * BLOCK_SIZE
+
+     for idx in tl.range(start_idx, total_elements, stride, num_stages=NUM_STAGES):
+         offsets = idx + tl.arange(0, BLOCK_SIZE)
+         mask = offsets < total_elements
+
+         a_val = tl.load(a_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+         b_val = tl.load(b_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+         res = (a_val * tl.sigmoid(a_val)) * b_val
+         tl.store(c_ptr + offsets, res, mask=mask)
+
+
+ @triton.jit
+ def _swiglu_backward_kernel_flat(
+     dc_ptr, a_ptr, b_ptr, da_ptr, db_ptr, total_elements, BLOCK_SIZE: tl.constexpr, NUM_STAGES: tl.constexpr
+ ):
+     pid = tl.program_id(0)
+     num_progs = tl.num_programs(0)
+     start_idx = pid * BLOCK_SIZE
+     stride = num_progs * BLOCK_SIZE
+
+     for idx in tl.range(start_idx, total_elements, stride, num_stages=NUM_STAGES):
+         offsets = idx + tl.arange(0, BLOCK_SIZE)
+         mask = offsets < total_elements
+
+         dc = tl.load(dc_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+         a = tl.load(a_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+         b = tl.load(b_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+
+         sig_a = tl.sigmoid(a)
+         silu_a = a * sig_a
+         term1 = silu_a * (1.0 - sig_a) + sig_a
+
+         db = dc * silu_a
+         da = dc * b * term1
+
+         tl.store(da_ptr + offsets, da, mask=mask)
+         tl.store(db_ptr + offsets, db, mask=mask)
+
+
+ # -----------------------------------------------------------------------------
+ # Helper: Call compute_default_tiling_strategy
+ # -----------------------------------------------------------------------------
+
+
+ def get_optimal_block_size(total_elements, is_backward=False):
+     """
+     Calculate optimal Block Size using compute_default_tiling_strategy
+     """
+     # 1. Set Memory Multiplier
+     # Forward is lighter; Backward requires more memory for intermediate variables.
+     # 8.0 and 12.0 are empirical values based on 910B UB (192KB).
+     multiplier = 12.0 if is_backward else 8.0
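+     # Illustrative arithmetic (hypothetical; the real budget comes from the UB manager):
+     # the backward pass keeps roughly dc, a, b, sig_a, silu_a, term1, da and db live at once
+     # (8 float32 blocks), so the empirical multiplier of 12.0 leaves headroom. With it, a block
+     # costs about 12.0 * BLOCK_SIZE * 4 * 8 = 384 * BLOCK_SIZE bits, and a 192 KB UB at
+     # safety_margin = 0.9 allows a BLOCK_SIZE of roughly 0.9 * 192 * 1024 * 8 / 384 ≈ 3686.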
+
+     # 2. Call calculation function
+     # Treat input as 1D (total_elements,), only tiling on dim 0
+     tile_shapes = compute_default_tiling_strategy(
+         safety_margin=0.9, dtype_size=4, memory_multiplier=multiplier, shapes=((total_elements,),), tiling_dims=(0,)
+     )
+
+     # 3. Parse result
+     if tile_shapes and len(tile_shapes) > 0:
+         block_size = tile_shapes[0][0]
+         return max(256, block_size)
+     else:
+         return 2048
+
+
+ def swiglu_forward(a, b):
+     if not a.is_contiguous():
+         a = a.contiguous()
+     if not b.is_contiguous():
+         b = b.contiguous()
+
+     total_elements = a.numel()
+     c = torch.empty_like(a)
+
+     block_size = get_optimal_block_size(total_elements, is_backward=False)
+
+     num_cores = get_npu_core_count()
+     grid_size = min(num_cores, (total_elements + block_size - 1) // block_size)
+
+     _swiglu_forward_kernel_flat[(grid_size,)](a, b, c, total_elements, BLOCK_SIZE=block_size, NUM_STAGES=3, num_warps=4)
+     return c
+
+
+ def swiglu_backward(a, b, dc):
+     if not dc.is_contiguous():
+         dc = dc.contiguous()
+     if not a.is_contiguous():
+         a = a.contiguous()
+     if not b.is_contiguous():
+         b = b.contiguous()
+
+     total_elements = dc.numel()
+     grad_a = torch.empty_like(a)
+     grad_b = torch.empty_like(b)
+
+     block_size = get_optimal_block_size(total_elements, is_backward=True)
+
+     num_cores = get_npu_core_count()
+     grid_size = min(num_cores, (total_elements + block_size - 1) // block_size)
+
+     _swiglu_backward_kernel_flat[(grid_size,)](
+         dc, a, b, grad_a, grad_b, total_elements, BLOCK_SIZE=block_size, NUM_STAGES=3, num_warps=4
+     )
+     return grad_a, grad_b
+
+
+ class LigerSiLUMulFunction(torch.autograd.Function):
+     @staticmethod
+     def forward(ctx, a, b):
+         c = swiglu_forward(a, b)
+         ctx.save_for_backward(a, b)
+         return c
+
+     @staticmethod
+     def backward(ctx, dc):
+         a, b = ctx.saved_tensors
+         grad_a, grad_b = swiglu_backward(a, b, dc)
+         return grad_a, grad_b
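+
+ # Usage sketch (hypothetical shapes; runs on an Ascend NPU with the Triton backend):
+ #   gate = torch.randn(4, 2048, 11008, dtype=torch.bfloat16, device="npu", requires_grad=True)
+ #   up = torch.randn(4, 2048, 11008, dtype=torch.bfloat16, device="npu", requires_grad=True)
+ #   out = LigerSiLUMulFunction.apply(gate, up)  # silu(gate) * up
+ #   out.sum().backward()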
liger_kernel/ops/backends/_ascend/ops/tvd.py
@@ -0,0 +1,223 @@
+ from typing import Literal
+ from typing import Optional
+
+ import torch
+ import triton
+ import triton.language as tl
+
+ from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy
+ from liger_kernel.ops.utils import ensure_contiguous
+ from liger_kernel.ops.utils import get_npu_core_count
+
+ MAX_FUSED_SIZE = 65536 // 4
+
+ REDUCTION_LITERAL = Literal["none", "sum", "mean", "batchmean"]
+
+
+ @triton.jit
+ def _tv_distance_kernel(
+     p_ptr,
+     p_stride,
+     q_ptr,
+     q_stride,
+     loss_ptr,
+     loss_stride,
+     grads_ptr,
+     grads_stride,
+     label_ptr,
+     ignore_index: tl.constexpr,
+     n_cols,  # V
+     total_rows: tl.constexpr,  # BT
+     BLOCK_SIZE: tl.constexpr,
+     HAS_LABEL: tl.constexpr,
+     NUM_STAGES: tl.constexpr,
+     reduction: tl.constexpr = "batchmean",
+ ):
+     thread_id = tl.program_id(0)
+     num_threads = tl.num_programs(0)
+
+     for pid in tl.range(thread_id, total_rows, num_threads, num_stages=NUM_STAGES):
+         p_row_ptr = p_ptr + pid * p_stride
+         q_row_ptr = q_ptr + pid * q_stride
+         loss_row_ptr = loss_ptr + pid * loss_stride
+         grads_row_ptr = grads_ptr + pid * grads_stride
+         label_row_ptr = label_ptr + pid
+
+         base_offsets = tl.arange(0, BLOCK_SIZE)
+
+         should_skip = False
+         if HAS_LABEL:
+             label = tl.load(label_row_ptr)
+             if label == ignore_index:
+                 should_skip = True
+
+         if should_skip:
+             for i in range(0, n_cols, BLOCK_SIZE):
+                 offsets = i + base_offsets
+                 mask = offsets < n_cols
+                 tl.store(grads_row_ptr + offsets, 0.0, mask=mask)
+                 if reduction == "none":
+                     tl.store(loss_row_ptr + offsets, 0.0, mask=mask)
+         else:
+             loss_sum = 0.0
+             for i in range(0, n_cols, BLOCK_SIZE):
+                 offsets = i + base_offsets
+                 mask = offsets < n_cols
+
+                 p = tl.load(p_row_ptr + offsets, mask=mask, other=0.0)
+                 q = tl.load(q_row_ptr + offsets, mask=mask, other=0.0)
+
+                 # TVD(P || Q) = 0.5 * |P - Q|
+                 tv_loss = 0.5 * tl.abs(p - q)
+                 grad_res = tl.where(p > q, 0.5, -0.5)
+
+                 tl.store(grads_row_ptr + offsets, grad_res, mask=mask)
+
+                 if reduction == "none":
+                     tl.store(loss_row_ptr + offsets, tv_loss, mask=mask)
+                 else:
+                     loss_sum += tl.sum(tv_loss, axis=0)
+
+             if reduction != "none":
+                 tl.store(loss_row_ptr, loss_sum)
+
+
+ def tv_distance_forward_triton(p, q, shift_labels, reduction, ignore_index, has_label):
+     BT, V = p.shape
+
+     # TVD forward tiling strategy
+     # - In the main loop (loss and grad calculation):
+     #   * p: BLOCK_SIZE elements
+     #   * q: BLOCK_SIZE elements
+     #   * tv_loss: BLOCK_SIZE elements
+     #   * grad_res: BLOCK_SIZE elements
+     #   * loss_sum: BLOCK_SIZE elements (when reduction != "none")
+     #   * Total: 4 * BLOCK_SIZE elements, or 5 * BLOCK_SIZE elements when reduction != "none"
+     # - loss_sum is not live in every iteration, but other shared buffers and the potential
+     #   memory use of the HAS_LABEL path are also accounted for.
+     # - Conservative estimate: 5 * BLOCK_SIZE * dtype_size * 8 bits, hence memory_multiplier=5.0
+     # - shapes: ((V,),)
+     # - tiling_dims: (0,) means the first dimension of each shape can be tiled
+     # - Returns: ((block_size,),)
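+     # Illustrative example (hypothetical sizes): with V = 32000 and a computed block size of 8192,
+     # each row is swept in ceil(32000 / 8192) = 4 chunks of the column loop above.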
+     shapes = ((V,),)
+     tile_shapes = compute_default_tiling_strategy(
+         safety_margin=0.80,
+         # In the TVD calculation much of the data is implicitly converted to float32, so the float32 size is used directly.
+         dtype_size=4,
+         memory_multiplier=5.0,
+         shapes=shapes,
+         tiling_dims=(0,),
+     )
+
+     if tile_shapes is not None and len(tile_shapes) > 0 and len(tile_shapes[0]) > 0:
+         # Strategy returns ((block_size,),)
+         BLOCK_SIZE = tile_shapes[0][0]
+     else:
+         # Fall back to the desired block size if no tiling strategy was found (no tiling needed)
+         BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))
+
+     num_cores = get_npu_core_count()
+     grid = (min(num_cores, BT),)
+
+     out_size = (BT, V) if reduction == "none" else (BT,)
+
+     # Loss and gradient accumulation in BF16 on the NPU suffers precision errors, so accumulate in float32.
+     output_tensor = torch.zeros(out_size, device=p.device, dtype=torch.float32)
+     grads = torch.empty_like(p, dtype=torch.float32)
+
+     n_non_ignore = (shift_labels != ignore_index).sum().item() if has_label else BT
+
+     _tv_distance_kernel[grid](
+         p,
+         p.stride(0),
+         q,
+         q.stride(0),
+         output_tensor,
+         output_tensor.stride(0),
+         grads,
+         grads.stride(0),
+         shift_labels if has_label else torch.empty(1, device=p.device),
+         ignore_index,
+         V,
+         BT,
+         BLOCK_SIZE=BLOCK_SIZE,
+         HAS_LABEL=has_label,
+         NUM_STAGES=3 if BT < 4096 else 4,
+         reduction=reduction,
+     )
+
+     if reduction == "batchmean":
+         return output_tensor.sum() / n_non_ignore, grads / n_non_ignore
+     elif reduction == "sum":
+         return output_tensor.sum(dim=0), grads
+     elif reduction == "mean":
+         return output_tensor.sum() / (n_non_ignore * V), grads / (n_non_ignore * V)
+     else:
+         return output_tensor, grads
+
+
+ def tvd_backward_triton(grad_output, grads):
+     # If this is the last layer, grad_output is 1.0. Skip the mul then.
+     if torch.equal(grad_output, torch.tensor(1.0, device=grad_output.device)):
+         return grads
+
+     return grads * grad_output
+
+
+ class LigerTVDLossFunction(torch.autograd.Function):
+     """
+     Class implementing the forward and backward pass for the Total Variation Distance Loss using Triton.
+     """
+
+     @staticmethod
+     @ensure_contiguous
+     def forward(
+         ctx,
+         p: torch.Tensor,
+         q: torch.Tensor,
+         shift_labels: Optional[torch.Tensor] = None,
+         reduction: REDUCTION_LITERAL = "batchmean",
+         ignore_index: int = -100,
+     ) -> torch.Tensor:
+         """A forward pass for the Total Variation Distance Loss.
+
+         Args:
+             ctx: Torch autograd context
+             p (torch.Tensor): A tensor of shape (BT, V) containing the first distribution.
+             q (torch.Tensor): A tensor of shape (BT, V) containing the second distribution.
+             shift_labels (Optional[torch.Tensor]): A tensor of shape (BT,) containing the labels.
+             reduction (REDUCTION_LITERAL, optional): The reduction method to be applied. Defaults to "batchmean".
+             ignore_index (int, optional): The index to ignore during loss calculation. Defaults to -100.
+
+         Returns:
+             torch.Tensor: The computed Total Variation Distance Loss.
+         """
+         has_label = False
+         if shift_labels is not None:
+             assert shift_labels.shape == (p.shape[0],), (
+                 f"the shape of shift_labels must be (BT,). Got: {shift_labels.shape}"
+             )
+             shift_labels = shift_labels.contiguous()
+             has_label = True
+
+         loss, grads = tv_distance_forward_triton(p, q, shift_labels, reduction, ignore_index, has_label)
+         ctx.save_for_backward(grads)
+         return loss
+
+     @staticmethod
+     @ensure_contiguous
+     def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor:
+         """A backward pass for the Total Variation Distance Loss.
+
+         Args:
+             ctx: Torch autograd context
+             grad_output (torch.Tensor): The gradient of the loss with respect to the output.
+
+         Returns:
+             tuple[torch.Tensor, None, None, None, None]: The gradient of the loss with respect to the inputs.
+         """
+         (grads,) = ctx.saved_tensors
+         grads = tvd_backward_triton(grad_output, grads)
+
+         return grads, None, None, None, None
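+
+ # Usage sketch (hypothetical shapes; p and q must be probability distributions over V):
+ #   logits_student = torch.randn(8, 32000, device="npu", requires_grad=True)
+ #   logits_teacher = torch.randn(8, 32000, device="npu")
+ #   p = torch.softmax(logits_student, dim=-1)
+ #   q = torch.softmax(logits_teacher, dim=-1)
+ #   loss = LigerTVDLossFunction.apply(p, q)  # reduction defaults to "batchmean"
+ #   loss.backward()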