PyPI - liger-kernel-nightly - Versions diffs - 0.6.3.dev20251121202601.tar.gz → 0.6.4.dev20251208235806.tar.gz - Mend

liger-kernel-nightly 0.6.3.dev20251121202601.tar.gz → 0.6.4.dev20251208235806.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of liger-kernel-nightly might be problematic. Click here for more details.

Files changed (306)

{liger_kernel_nightly-0.6.3.dev20251121202601 → liger_kernel_nightly-0.6.4.dev20251208235806}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.6.3.dev20251121202601
+Version: 0.6.4.dev20251208235806
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation
@@ -113,6 +113,8 @@ We've also added optimized Post-Training kernels that deliver **up to 80% memory
 You can view the documentation site for additional installation, usage examples, and API references:https://linkedin.github.io/Liger-Kernel/
+You can view the Liger Kernel Technical Report: https://openreview.net/forum?id=36SjAIT42G
 ## Supercharge Your Model with Liger Kernel
 ![Banner](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/banner.GIF)
@@ -312,6 +314,7 @@ loss.backward()
 | OLMo2   | `liger_kernel.transformers.apply_liger_kernel_to_olmo2`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Olmo3   | `liger_kernel.transformers.apply_liger_kernel_to_olmo3`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | GLM-4   | `liger_kernel.transformers.apply_liger_kernel_to_glm4`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
+| GPT-OSS   | `liger_kernel.transformers.apply_liger_kernel_to_gpt_oss`     | RoPE, RMSNorm, CrossEntropyLoss, FusedLinearCrossEntropy |
 | InternVL3   | `liger_kernel.transformers.apply_liger_kernel_to_internvl`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | HunyuanV1   | `liger_kernel.transformers.apply_liger_kernel_to_hunyuan_v1_dense`    |  RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy       |
 | HunyuanV1 MoE | `liger_kernel.transformers.apply_liger_kernel_to_hunyuan_v1_moe` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy       |
@@ -441,3 +444,4 @@ url={https://openreview.net/forum?id=36SjAIT42G}
         ↑ Back to Top ↑
     </a>
 </p>

{liger_kernel_nightly-0.6.3.dev20251121202601 → liger_kernel_nightly-0.6.4.dev20251208235806}/README.md RENAMED Viewed

@@ -65,6 +65,8 @@ We've also added optimized Post-Training kernels that deliver **up to 80% memory
 You can view the documentation site for additional installation, usage examples, and API references:https://linkedin.github.io/Liger-Kernel/
+You can view the Liger Kernel Technical Report: https://openreview.net/forum?id=36SjAIT42G
 ## Supercharge Your Model with Liger Kernel
 ![Banner](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/banner.GIF)
@@ -264,6 +266,7 @@ loss.backward()
 | OLMo2   | `liger_kernel.transformers.apply_liger_kernel_to_olmo2`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Olmo3   | `liger_kernel.transformers.apply_liger_kernel_to_olmo3`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | GLM-4   | `liger_kernel.transformers.apply_liger_kernel_to_glm4`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
+| GPT-OSS   | `liger_kernel.transformers.apply_liger_kernel_to_gpt_oss`     | RoPE, RMSNorm, CrossEntropyLoss, FusedLinearCrossEntropy |
 | InternVL3   | `liger_kernel.transformers.apply_liger_kernel_to_internvl`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | HunyuanV1   | `liger_kernel.transformers.apply_liger_kernel_to_hunyuan_v1_dense`    |  RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy       |
 | HunyuanV1 MoE | `liger_kernel.transformers.apply_liger_kernel_to_hunyuan_v1_moe` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy       |
@@ -393,3 +396,4 @@ url={https://openreview.net/forum?id=36SjAIT42G}
         ↑ Back to Top ↑
     </a>
 </p>

{liger_kernel_nightly-0.6.3.dev20251121202601 → liger_kernel_nightly-0.6.4.dev20251208235806}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "liger_kernel_nightly"
-version = "0.6.3.dev20251121202601"
+version = "0.6.4.dev20251208235806"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }

{liger_kernel_nightly-0.6.3.dev20251121202601 → liger_kernel_nightly-0.6.4.dev20251208235806}/setup.py RENAMED Viewed

@@ -24,6 +24,8 @@ def get_default_dependencies():
         return [
             "torch>=2.6.0",
         ]
+    elif platform == "npu":
+        return ["torch_npu==2.6.0", "triton-ascend"]
 def get_optional_dependencies():
@@ -67,7 +69,21 @@ def is_xpu_available():
     return False
-def get_platform() -> Literal["cuda", "rocm", "cpu", "xpu"]:
+def is_ascend_available() -> bool:
+    """Best-effort Ascend detection.
+    Checks for common Ascend environment variables and a possible `npu-smi`
+    utility if present.
+    """
+    try:
+        subprocess.run(["npu-smi", "info"], check=True)
+        return True
+    except (subprocess.SubprocessError, FileNotFoundError):
+        pass
+    return False
+def get_platform() -> Literal["cuda", "rocm", "cpu", "xpu", "npu"]:
     """
     Detect whether the system has NVIDIA or AMD GPU without torch dependency.
     """
@@ -86,6 +102,9 @@ def get_platform() -> Literal["cuda", "rocm", "cpu", "xpu"]:
             if is_xpu_available():
                 print("Intel GPU detected")
                 return "xpu"
+            elif is_ascend_available():
+                print("Ascend NPU detected")
+                return "npu"
             else:
                 print("No GPU detected")
                 return "cpu"

{liger_kernel_nightly-0.6.3.dev20251121202601 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/cross_entropy.py RENAMED Viewed

@@ -10,8 +10,9 @@ from liger_kernel.ops.utils import compare_version
 from liger_kernel.ops.utils import element_mul_kernel
 from liger_kernel.ops.utils import is_hip
 from liger_kernel.utils import infer_device
+from liger_kernel.utils import is_npu_available
-if compare_version("triton", operator.ge, "3.0.0"):
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import tanh

{liger_kernel_nightly-0.6.3.dev20251121202601 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/dyt.py RENAMED Viewed

@@ -7,8 +7,10 @@ import triton.language as tl
 from liger_kernel.ops.utils import compare_version
 from liger_kernel.ops.utils import ensure_contiguous
 from liger_kernel.ops.utils import infer_device
+from liger_kernel.utils import get_npu_multi_processor_count
+from liger_kernel.utils import is_npu_available
-if compare_version("triton", operator.ge, "3.0.0"):
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import tanh
@@ -125,7 +127,8 @@ def liger_dyt_bwd(dy, x, alpha, gamma, beta):
         NUM_SMS = torch.cuda.get_device_properties(x.device).multi_processor_count
     elif device == "xpu":
         NUM_SMS = torch.xpu.get_device_properties(x.device).gpu_subslice_count
+    elif device == "npu":
+        NUM_SMS = get_npu_multi_processor_count()
     da = torch.zeros(NUM_SMS, triton.cdiv(N, 512), dtype=torch.float32, device=x.device)
     dg = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device)
     db = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device) if HAVE_BETA else None

{liger_kernel_nightly-0.6.3.dev20251121202601 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/fused_add_rms_norm.py RENAMED Viewed

@@ -9,8 +9,10 @@ from liger_kernel.ops.utils import calculate_settings
 from liger_kernel.ops.utils import compare_version
 from liger_kernel.ops.utils import ensure_contiguous
 from liger_kernel.ops.utils import torch_to_triton_dtype
+from liger_kernel.utils import get_npu_multi_processor_count
+from liger_kernel.utils import is_npu_available
-if compare_version("triton", operator.ge, "3.0.0"):
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import rsqrt
@@ -293,6 +295,8 @@ def fused_add_rms_norm_backward(dY, dS_out, S, W, RSTD, offset, casting_mode, BL
         sm_count = torch.cuda.get_device_properties(S.device).multi_processor_count
     elif S.device.type == "xpu":
         sm_count = torch.xpu.get_device_properties(S.device).gpu_eu_count
+    elif S.device.type == "npu":
+        sm_count = get_npu_multi_processor_count()
     # fp32 for numerical stability especially.
     _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)

{liger_kernel_nightly-0.6.3.dev20251121202601 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/geglu.py RENAMED Viewed

@@ -7,8 +7,9 @@ import triton.language as tl
 from liger_kernel.ops.utils import calculate_settings
 from liger_kernel.ops.utils import compare_version
 from liger_kernel.ops.utils import ensure_contiguous
+from liger_kernel.utils import is_npu_available
-if compare_version("triton", operator.ge, "3.0.0"):
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import tanh

{liger_kernel_nightly-0.6.3.dev20251121202601 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/group_norm.py RENAMED Viewed

@@ -6,8 +6,9 @@ import triton.language as tl
 from liger_kernel.ops.utils import compare_version
 from liger_kernel.ops.utils import ensure_contiguous
+from liger_kernel.utils import is_npu_available
-if compare_version("triton", operator.ge, "3.0.0"):
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import rsqrt

{liger_kernel_nightly-0.6.3.dev20251121202601 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/layer_norm.py RENAMED Viewed

@@ -1,3 +1,4 @@
+import math
 import operator
 import torch
@@ -7,8 +8,9 @@ import triton.language as tl
 from liger_kernel.ops.utils import calculate_settings
 from liger_kernel.ops.utils import compare_version
 from liger_kernel.ops.utils import ensure_contiguous
+from liger_kernel.utils import is_npu_available
-if compare_version("triton", operator.ge, "3.0.0"):
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import rsqrt
@@ -85,68 +87,87 @@ def _layer_norm_forward_kernel(
 @triton.jit
 def _layer_norm_backward_kernel(
     X_ptr,  # pointer to input, shape (n_rows, n_cols)
+    stride_x,  # stride of each row in input
     W_ptr,  # pointer to weights, shape (n_cols,)
     Mean_ptr,  # pointer to mean, shape (n_rows,)
+    stride_mean,  # stride of each row in mean
     RSTD_ptr,  # pointer to rstd, shape (n_rows,)
+    stride_rstd,  # stride of each row in rstd
     DX_ptr,  # pointer to input grad, shape (n_rows, n_cols)
+    stride_dx,  # stride of each row in input grad
     DW_ptr,  # pointer to weights grad, shape (n_cols,)
+    stride_dw,  # stride of each row in weights grad
     DB_ptr,  # pointer to bias grad, shape (n_cols,)
+    stride_db,  # stride of each row in bias grad
     DY_ptr,  # pointer to output grad, shape (n_rows, n_cols)
-    stride_x,  # stride of each row in input
-    stride_dx,  # stride of each row in input grad
     stride_dy,  # stride of each row in output grad
+    n_rows,
     n_cols,
+    rows_per_program: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
-    dtype: tl.constexpr,
-    atomic_dtype: tl.constexpr,
 ):
     """
     References:
     https://arxiv.org/abs/1607.06450
     https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md
     """
-    row_idx = tl.program_id(0).to(tl.int64)
+    row_block_id = tl.program_id(0).to(tl.int64)
+    row_start = row_block_id * rows_per_program
+    row_end = min((row_block_id + 1) * rows_per_program, n_rows)
     cols = tl.arange(0, BLOCK_SIZE)
     mask = cols < n_cols
+    dW_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
+    db_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
     # Pre-load weights once (same optimization as forward pass)
     w = tl.load(W_ptr + cols, mask=mask, other=0.0)
     w_f32 = w.to(tl.float32)
     # Calculate pointers for this specific row
-    row_X_ptr = X_ptr + row_idx * stride_x
-    row_DX_ptr = DX_ptr + row_idx * stride_dx
-    row_DY_ptr = DY_ptr + row_idx * stride_dy
-    row_Mean_ptr = Mean_ptr + row_idx
-    row_RSTD_ptr = RSTD_ptr + row_idx
-    # Load data for this row
-    x = tl.load(row_X_ptr + cols, mask=mask, other=0.0)
-    dy = tl.load(row_DY_ptr + cols, mask=mask, other=0.0)
-    mean = tl.load(row_Mean_ptr)
-    rstd = tl.load(row_RSTD_ptr)
-    # Convert to fp32 for numerical stability
-    x_f32 = x.to(tl.float32)
-    dy_f32 = dy.to(tl.float32)
-    mean_f32 = mean.to(tl.float32)
-    rstd_f32 = rstd.to(tl.float32)
-    # Compute backward pass for this row
-    x_hat = (x_f32 - mean_f32) * rstd_f32
-    wdy = w_f32 * dy_f32
-    c1 = tl.sum(x_hat * wdy, axis=0) / n_cols
-    c2 = tl.sum(wdy, axis=0) / n_cols
-    dx = (wdy - (x_hat * c1 + c2)) * rstd_f32
-    # Store input gradient
-    tl.store(row_DX_ptr + cols, dx.to(dtype), mask=mask)
-    # Accumulate weight and bias gradients using atomic operations
-    dw = dy_f32 * x_hat
-    db = dy_f32
-    tl.atomic_add(DW_ptr + cols, dw.to(atomic_dtype), mask=mask)
-    tl.atomic_add(DB_ptr + cols, db.to(atomic_dtype), mask=mask)
+    row_X_ptr = X_ptr + row_start * stride_x
+    row_DX_ptr = DX_ptr + row_start * stride_dx
+    row_DY_ptr = DY_ptr + row_start * stride_dy
+    row_Mean_ptr = Mean_ptr + row_start
+    row_RSTD_ptr = RSTD_ptr + row_start
+    for _ in range(row_start, row_end):
+        # Load data for this row
+        x = tl.load(row_X_ptr + cols, mask=mask, other=0.0)
+        dy = tl.load(row_DY_ptr + cols, mask=mask, other=0.0)
+        mean = tl.load(row_Mean_ptr)
+        rstd = tl.load(row_RSTD_ptr)
+        # Convert to fp32 for numerical stability
+        x_f32 = x.to(tl.float32)
+        dy_f32 = dy.to(tl.float32)
+        mean_f32 = mean.to(tl.float32)
+        rstd_f32 = rstd.to(tl.float32)
+        # Compute backward pass for this row
+        x_hat = (x_f32 - mean_f32) * rstd_f32
+        wdy = w_f32 * dy_f32
+        c1 = tl.sum(x_hat * wdy, axis=0) / n_cols
+        c2 = tl.sum(wdy, axis=0) / n_cols
+        dx = (wdy - (x_hat * c1 + c2)) * rstd_f32
+        # Store input gradient
+        tl.store(row_DX_ptr + cols, dx, mask=mask)
+        # Accumulate weight and bias gradients for this thread block's assigned rows
+        dw = dy_f32 * x_hat
+        db = dy_f32
+        dW_row += dw
+        db_row += db
+        row_X_ptr += stride_x
+        row_DX_ptr += stride_dx
+        row_DY_ptr += stride_dy
+        row_Mean_ptr += stride_mean
+        row_RSTD_ptr += stride_rstd
+    tl.store(DW_ptr + row_block_id * stride_dw + cols, dW_row, mask=mask)
+    tl.store(DB_ptr + row_block_id * stride_db + cols, db_row, mask=mask)
 def layer_norm_forward(X, W, B, eps):
@@ -228,31 +249,25 @@ def layer_norm_backward(dY, X, W, B, Mean, RSTD):
     dY = dY.view(-1, dim)
     n_rows, n_cols = dY.shape
-    # Allocate gradient tensors
-    DX = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
-    # Use float32 for weight/bias gradients if bfloat16 (due to atomic_add limitation)
-    grad_dtype = torch.float32 if W.dtype == torch.bfloat16 else W.dtype
-    DW = torch.zeros(n_cols, dtype=grad_dtype, device=W.device)
-    DB = torch.zeros(n_cols, dtype=grad_dtype, device=W.device)
+    sm_count = 1
+    if X.device.type == "cuda":
+        sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
+    elif X.device.type == "xpu":
+        sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
+    # fp32 for numerical stability especially.
+    _DW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)
+    _DB = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)
     # Calculate optimal block size and warp configuration
     BLOCK_SIZE, num_warps = calculate_settings(n_cols)
     if n_cols > BLOCK_SIZE:
         raise RuntimeError(f"Feature dimension {n_cols} exceeds maximum supported size of {BLOCK_SIZE}.")
+    rows_per_program = math.ceil(n_rows / sm_count)
+    grid = (sm_count,)
-    # Determine dtype for triton operations
-    triton_dtype = (
-        tl.float32
-        if X.dtype == torch.float32
-        else tl.bfloat16
-        if X.dtype == torch.bfloat16
-        else tl.float16
-        if X.dtype == torch.float16
-        else tl.float32  # fallback
-    )
-    # Use float32 for atomic operations if bfloat16 is not supported
-    atomic_dtype = tl.float32 if triton_dtype == tl.bfloat16 else triton_dtype
+    # Allocate gradient tensors
+    DX = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
     kernel_args = {"num_warps": num_warps}
     # XPU-specific optimization
@@ -260,28 +275,33 @@ def layer_norm_backward(dY, X, W, B, Mean, RSTD):
         kernel_args.update({"grf_mode": "large", "num_warps": 32, "num_stages": 4})
     # Launch kernel with one thread block per row for optimal performance
-    grid = (n_rows,)
     _layer_norm_backward_kernel[grid](
         X,
+        X.stride(0),
         W,
         Mean,
+        Mean.stride(0),
         RSTD,
+        RSTD.stride(0),
         DX,
-        DW,
-        DB,
-        dY,
-        X.stride(0),
         DX.stride(0),
+        _DW,
+        _DW.stride(0),
+        _DB,
+        _DB.stride(0),
+        dY,
         dY.stride(0),
+        n_rows,
         n_cols,
+        rows_per_program=rows_per_program,
         BLOCK_SIZE=BLOCK_SIZE,
-        dtype=triton_dtype,
-        atomic_dtype=atomic_dtype,
         **kernel_args,
     )
     DX = DX.view(*shape)
-    return DX, DW.to(W.dtype), DB.to(W.dtype)
+    DW = _DW.sum(dim=0).to(W.dtype)
+    DB = _DB.sum(dim=0).to(B.dtype)
+    return DX, DW, DB
 class LigerLayerNormFunction(torch.autograd.Function):

{liger_kernel_nightly-0.6.3.dev20251121202601 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/poly_norm.py RENAMED Viewed

@@ -7,8 +7,10 @@ import triton.language as tl
 from liger_kernel.ops.utils import calculate_settings
 from liger_kernel.ops.utils import compare_version
 from liger_kernel.ops.utils import ensure_contiguous
+from liger_kernel.utils import get_npu_multi_processor_count
+from liger_kernel.utils import is_npu_available
-if compare_version("triton", operator.ge, "3.0.0"):
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         from triton.language.extra.libdevice import rsqrt
     except ModuleNotFoundError:
@@ -290,6 +292,8 @@ def poly_norm_backward(dY, X, W, RSTD, BLOCK_SIZE, num_warps, in_place):
         sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
     elif X.device.type == "xpu":
         sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
+    elif X.device.type == "npu":
+        sm_count = get_npu_multi_processor_count()
     # Allocate or reuse gradients
     if in_place is True:

{liger_kernel_nightly-0.6.3.dev20251121202601 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/rms_norm.py RENAMED Viewed

@@ -21,8 +21,10 @@ from liger_kernel.ops.utils import calculate_settings
 from liger_kernel.ops.utils import compare_version
 from liger_kernel.ops.utils import ensure_contiguous
 from liger_kernel.ops.utils import torch_to_triton_dtype
+from liger_kernel.utils import get_npu_multi_processor_count
+from liger_kernel.utils import is_npu_available
-if compare_version("triton", operator.ge, "3.0.0"):
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import rsqrt
@@ -349,7 +351,8 @@ def _block_rms_norm_backward_kernel(
         # calculate the gradient of W
         if casting_mode == _CASTING_MODE_LLAMA:
-            dW_row += tl.sum(dY_row * (X_row * rstd_row[:, None]).to(X_dtype), 0)
+            # TODO(tcc): use tl.sum(..., dtype=tl.float32) once we upgrade to triton>=3.3.0
+            dW_row += tl.sum((dY_row * (X_row * rstd_row[:, None]).to(X_dtype)).to(tl.float32), 0)
         else:
             # here X_row is already in fp32 (see previous if block)
             dW_row += tl.sum(dY_row * (X_row * rstd_row[:, None]), 0)
@@ -449,6 +452,8 @@ def rms_norm_backward(dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warp
         sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
     elif X.device.type == "xpu":
         sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
+    elif X.device.type == "npu":
+        sm_count = get_npu_multi_processor_count()
     # fp32 for numerical stability especially.
     _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)

{liger_kernel_nightly-0.6.3.dev20251121202601 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/ops/utils.py RENAMED Viewed

@@ -78,6 +78,8 @@ def get_amp_custom_fwd_bwd() -> Callable:
             functools.partial(torch.amp.custom_fwd, device_type=device),
             functools.partial(torch.amp.custom_bwd, device_type=device),
         )
+    if hasattr(torch, "npu") and getattr(torch.npu, "amp", None) is not None:
+        return torch.npu.amp.custom_fwd, torch.npu.amp.custom_bwd
     return torch.cuda.amp.custom_fwd, torch.cuda.amp.custom_bwd

{liger_kernel_nightly-0.6.3.dev20251121202601 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/__init__.py RENAMED Viewed

@@ -41,6 +41,7 @@ if TYPE_CHECKING:
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_glm4  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_glm4v  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_glm4v_moe  # noqa: F401
+    from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_gpt_oss  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_granite  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_hunyuan_v1_dense  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_hunyuan_v1_moe  # noqa: F401
@@ -110,6 +111,7 @@ def __getattr__(name: str):
         "apply_liger_kernel_to_glm4",
         "apply_liger_kernel_to_glm4v",
         "apply_liger_kernel_to_glm4v_moe",
+        "apply_liger_kernel_to_gpt_oss",
         "apply_liger_kernel_to_granite",
         "apply_liger_kernel_to_internvl",
         "apply_liger_kernel_to_llama",
@@ -187,6 +189,7 @@ if _TRANSFORMERS_AVAILABLE:
             "apply_liger_kernel_to_glm4",
             "apply_liger_kernel_to_glm4v",
             "apply_liger_kernel_to_glm4v_moe",
+            "apply_liger_kernel_to_gpt_oss",
             "apply_liger_kernel_to_granite",
             "apply_liger_kernel_to_internvl",
             "apply_liger_kernel_to_llama",

{liger_kernel_nightly-0.6.3.dev20251121202601 → liger_kernel_nightly-0.6.4.dev20251208235806}/src/liger_kernel/transformers/model/gemma3.py RENAMED Viewed

@@ -235,6 +235,7 @@ def multimodal_forward(
         **lm_kwargs,
     )
+    shift_labels = lm_kwargs.pop("shift_labels", None)
     hidden_states = outputs[0]
     slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep

liger-kernel-nightly 0.6.3.dev20251121202601.tar.gz → 0.6.4.dev20251208235806.tar.gz

Potentially problematic release.

liger-kernel-nightly 0.6.3.dev20251121202601.tar.gz → 0.6.4.dev20251208235806.tar.gz