cuequivariance-ops-cu12 0.6.0__py3-none-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cuequivariance-ops-cu12 might be problematic.

Files changed (37)
  1. cuequivariance_ops/VERSION +1 -0
  2. cuequivariance_ops/__init__.py +42 -0
  3. cuequivariance_ops/_version.py +20 -0
  4. cuequivariance_ops/common/common.hpp +98 -0
  5. cuequivariance_ops/common/nvtx.hpp +29 -0
  6. cuequivariance_ops/equivariance/batch_dimension.hh +15 -0
  7. cuequivariance_ops/equivariance/dtypes.hh +65 -0
  8. cuequivariance_ops/equivariance/fused_tensor_product.cuh +297 -0
  9. cuequivariance_ops/equivariance/indexed_linear.hh +36 -0
  10. cuequivariance_ops/equivariance/run_fmha.h +192 -0
  11. cuequivariance_ops/equivariance/run_fmha_cudafree.h +77 -0
  12. cuequivariance_ops/equivariance/segmented_transpose.cuh +40 -0
  13. cuequivariance_ops/equivariance/tensor_product_uniform_1d_jit.hh +38 -0
  14. cuequivariance_ops/lib/libcue_ops.so +0 -0
  15. cuequivariance_ops/sleep.hh +18 -0
  16. cuequivariance_ops/triton/__init__.py +66 -0
  17. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.10.0.json +37192 -0
  18. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.8.0.json +37133 -0
  19. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.8.6.json +37133 -0
  20. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.8.9.json +37132 -0
  21. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.9.0.json +74262 -0
  22. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.10.0.json +48482 -0
  23. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.8.0.json +55693 -0
  24. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.8.6.json +55692 -0
  25. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.8.9.json +55693 -0
  26. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.9.0.json +111382 -0
  27. cuequivariance_ops/triton/cache_manager.py +259 -0
  28. cuequivariance_ops/triton/fused_layer_norm_triton.py +518 -0
  29. cuequivariance_ops/triton/gated_gemm_triton.py +380 -0
  30. cuequivariance_ops/triton/pair_bias.py +324 -0
  31. cuequivariance_ops/triton/tuning_decorator.py +177 -0
  32. cuequivariance_ops/triton/utils.py +28 -0
  33. cuequivariance_ops_cu12-0.6.0.dist-info/METADATA +182 -0
  34. cuequivariance_ops_cu12-0.6.0.dist-info/RECORD +37 -0
  35. cuequivariance_ops_cu12-0.6.0.dist-info/WHEEL +6 -0
  36. cuequivariance_ops_cu12-0.6.0.dist-info/licenses/LICENSE +142 -0
  37. cuequivariance_ops_cu12-0.6.0.dist-info/licenses/Third_party_attr.txt +24 -0
cuequivariance_ops/triton/fused_layer_norm_triton.py
@@ -0,0 +1,518 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+ #
+ # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ # property and proprietary rights in and to this material, related
+ # documentation and any modifications thereto. Any use, reproduction,
+ # disclosure or distribution of this material and related documentation
+ # without an express license agreement from NVIDIA CORPORATION or
+ # its affiliates is strictly prohibited.
+
+ import enum
+
+ import triton
+ import triton.language as tl
+
+
+ class Layout(enum.IntEnum):
+     BND_BND = 0
+     BDN_BND = 1
+     BND_BDN = 2
+     DBN_BND = 3
+     BND_DBN = 4
+
+
+ @triton.jit
+ def layer_norm_transpose_forward_single_pass_kernel(
+     # inputs:
+     x_ptr,
+     w_ptr,
+     b_ptr,
+     # outputs: (order matters for jax_triton)
+     out_ptr,
+     mean_ptr,
+     rstd_ptr,
+     B,
+     N,
+     D: tl.constexpr,
+     EPS: tl.constexpr,
+     TILE_N: tl.constexpr,
+     TILE_D: tl.constexpr,
+     ELEMENTWISE_AFFINE: tl.constexpr,
+     LAYOUT: tl.constexpr,
+ ):
+     pid_n = tl.program_id(0)
+     pid_b = tl.program_id(1)
+
+     offs_n = pid_n * TILE_N + tl.arange(0, TILE_N)
+     offs_d = tl.arange(0, TILE_D)
+
+     if LAYOUT == 0:  # bnd->bnd
+         x_ptrs = x_ptr + pid_b * N * D + offs_n[:, None] * D + offs_d[None, :]
+     elif LAYOUT == 1:  # bdn->bnd
+         x_ptrs = x_ptr + pid_b * D * N + offs_d[None, :] * N + offs_n[:, None]
+     elif LAYOUT == 2:  # bnd->bdn
+         x_ptrs = x_ptr + pid_b * N * D + offs_n[:, None] * D + offs_d[None, :]
+     elif LAYOUT == 3:  # dbn->bnd
+         x_ptrs = x_ptr + offs_d[None, :] * B * N + pid_b * N + offs_n[:, None]
+     elif LAYOUT == 4:  # bnd->dbn
+         x_ptrs = x_ptr + pid_b * N * D + offs_n[:, None] * D + offs_d[None, :]
+
+     mean_ptrs = mean_ptr + pid_b * N + offs_n
+     rstd_ptrs = rstd_ptr + pid_b * N + offs_n
+     mask_n = offs_n < N
+
+     x = tl.load(x_ptrs, mask=mask_n[:, None], other=0.0).to(tl.float32)
+     mean = tl.sum(x, axis=1) / D
+     x_centered = x - mean[:, None]
+     var = tl.sum(x_centered * x_centered, axis=1) / D
+     rstd = tl.rsqrt(var + EPS)
+
+     tl.store(mean_ptrs, mean, mask=mask_n)
+     tl.store(rstd_ptrs, rstd, mask=mask_n)
+
+     x_hat = x_centered * rstd[:, None]
+
+     if ELEMENTWISE_AFFINE:
+         w_ptrs = w_ptr + offs_d
+         b_ptrs = b_ptr + offs_d
+         w = tl.load(w_ptrs).to(tl.float32)
+         b = tl.load(b_ptrs).to(tl.float32)
+         y = x_hat * w[None, :] + b[None, :]
+     else:
+         y = x_hat
+
+     if LAYOUT == 0:  # bnd->bnd
+         out_ptrs = out_ptr + pid_b * N * D + offs_n[:, None] * D + offs_d[None, :]
+     elif LAYOUT == 1:  # bdn->bnd
+         out_ptrs = out_ptr + pid_b * N * D + offs_n[:, None] * D + offs_d[None, :]
+     elif LAYOUT == 2:  # bnd->bdn
+         out_ptrs = out_ptr + pid_b * N * D + offs_d[None, :] * N + offs_n[:, None]
+     elif LAYOUT == 3:  # dbn->bnd
+         out_ptrs = out_ptr + pid_b * N * D + offs_n[:, None] * D + offs_d[None, :]
+     elif LAYOUT == 4:  # bnd->dbn
+         out_ptrs = out_ptr + offs_d[None, :] * B * N + pid_b * N + offs_n[:, None]
+
+     tl.store(out_ptrs, y, mask=mask_n[:, None])
+
+
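The Layout constants name the input->output memory layouts this single-pass kernel fuses with the normalization: b is the batch axis, n indexes the rows being normalized, and d is the feature axis the statistics are taken over, so BDN_BND reads a (B, D, N) tensor and writes the normalized result as (B, N, D). A rough PyTorch reference for the same computation (a sketch for orientation only; the function name and the b/n/d axis reading are inferred from the kernel's indexing, not taken from the package):

import torch
import torch.nn.functional as F

def layer_norm_transpose_reference(x, w, b, layout, eps=1e-5):
    # Bring d to the last axis, normalize over it, then emit the output layout.
    if layout == 1:            # BDN_BND: (B, D, N) -> (B, N, D)
        x = x.transpose(1, 2)
    elif layout == 3:          # DBN_BND: (D, B, N) -> (B, N, D)
        x = x.permute(1, 2, 0)
    y = F.layer_norm(x, x.shape[-1:], w, b, eps)  # mean/rstd over d, as in the kernel
    if layout == 2:            # BND_BDN: (B, N, D) -> (B, D, N)
        y = y.transpose(1, 2)
    elif layout == 4:          # BND_DBN: (B, N, D) -> (D, B, N)
        y = y.permute(2, 0, 1)
    return y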
+ @triton.jit
+ def layer_norm_transpose_forward_kernel(
+     # inputs:
+     x_ptr,
+     w_ptr,
+     b_ptr,
+     # outputs: (order matters for jax_triton)
+     out_ptr,
+     mean_ptr,
+     rstd_ptr,
+     B,
+     N,
+     D: tl.constexpr,
+     EPS: tl.constexpr,
+     TILE_N: tl.constexpr,
+     TILE_D: tl.constexpr,
+     ELEMENTWISE_AFFINE: tl.constexpr,
+     LAYOUT: tl.constexpr,
+ ):
+     pid_n = tl.program_id(0)
+     pid_b = tl.program_id(1)
+
+     num_tiles_d = tl.cdiv(D, TILE_D)
+     D_CEIL = num_tiles_d * TILE_D
+
+     offs_n = pid_n * TILE_N + tl.arange(0, TILE_N)
+     offs_d = tl.arange(0, TILE_D)
+     mask_n = offs_n < N
+
+     if LAYOUT == 0:  # bnd->bnd
+         x_ptrs = x_ptr + pid_b * N * D + offs_n[:, None] * D + offs_d[None, :]
+     elif LAYOUT == 1:  # bdn->bnd
+         x_ptrs = x_ptr + pid_b * D * N + offs_d[None, :] * N + offs_n[:, None]
+     elif LAYOUT == 2:  # bnd->bdn
+         x_ptrs = x_ptr + pid_b * N * D + offs_n[:, None] * D + offs_d[None, :]
+     elif LAYOUT == 3:  # dbn->bnd
+         x_ptrs = x_ptr + offs_d[None, :] * B * N + pid_b * N + offs_n[:, None]
+     elif LAYOUT == 4:  # bnd->dbn
+         x_ptrs = x_ptr + pid_b * N * D + offs_n[:, None] * D + offs_d[None, :]
+
+     mean_ptrs = mean_ptr + pid_b * N + offs_n
+     rstd_ptrs = rstd_ptr + pid_b * N + offs_n
+
+     _mean = tl.zeros([TILE_N, TILE_D], dtype=tl.float32)
+     for di in range(0, num_tiles_d):
+         mask_d = offs_d < (D - di * TILE_D)
+         mask_nd = mask_n[:, None] & mask_d[None, :]
+
+         x = tl.load(x_ptrs, mask=mask_nd, other=0.0).to(tl.float32)
+         _mean += x
+
+         if LAYOUT == 0:  # bnd->bnd
+             x_ptrs += TILE_D
+         elif LAYOUT == 1:  # bdn->bnd
+             x_ptrs += TILE_D * N
+         elif LAYOUT == 2:  # bnd->bdn
+             x_ptrs += TILE_D
+         elif LAYOUT == 3:  # dbn->bnd
+             x_ptrs += TILE_D * B * N
+         elif LAYOUT == 4:  # bnd->dbn
+             x_ptrs += TILE_D
+
+     mean = tl.sum(_mean, axis=1) / D
+     tl.store(mean_ptrs, mean, mask=mask_n)
+
+     if LAYOUT == 0:  # bnd->bnd
+         x_ptrs -= D_CEIL
+     elif LAYOUT == 1:  # bdn->bnd
+         x_ptrs -= D_CEIL * N
+     elif LAYOUT == 2:  # bnd->bdn
+         x_ptrs -= D_CEIL
+     elif LAYOUT == 3:  # dbn->bnd
+         x_ptrs -= D_CEIL * B * N
+     elif LAYOUT == 4:  # bnd->dbn
+         x_ptrs -= D_CEIL
+
+     _var = tl.zeros([TILE_N, TILE_D], dtype=tl.float32)
+     for di in range(0, num_tiles_d):
+         mask_d = offs_d < (D - di * TILE_D)
+         mask_nd = mask_n[:, None] & mask_d[None, :]
+
+         x = tl.load(x_ptrs, mask=mask_nd, other=mean[:, None]).to(tl.float32)
+         x = x - mean[:, None]
+         _var += x * x
+
+         if LAYOUT == 0:  # bnd->bnd
+             x_ptrs += TILE_D
+         elif LAYOUT == 1:  # bdn->bnd
+             x_ptrs += TILE_D * N
+         elif LAYOUT == 2:  # bnd->bdn
+             x_ptrs += TILE_D
+         elif LAYOUT == 3:  # dbn->bnd
+             x_ptrs += TILE_D * B * N
+         elif LAYOUT == 4:  # bnd->dbn
+             x_ptrs += TILE_D
+
+     var = tl.sum(_var, axis=1) / D
+     rstd = tl.rsqrt(var + EPS)
+     tl.store(rstd_ptrs, rstd, mask=mask_n)
+
+     if LAYOUT == 0:  # bnd->bnd
+         x_ptrs -= D_CEIL
+         out_ptrs = out_ptr + pid_b * N * D + offs_n[:, None] * D + offs_d[None, :]
+     elif LAYOUT == 1:  # bdn->bnd
+         x_ptrs -= D_CEIL * N
+         out_ptrs = out_ptr + pid_b * N * D + offs_n[:, None] * D + offs_d[None, :]
+     elif LAYOUT == 2:  # bnd->bdn
+         x_ptrs -= D_CEIL
+         out_ptrs = out_ptr + pid_b * N * D + offs_d[None, :] * N + offs_n[:, None]
+     elif LAYOUT == 3:  # dbn->bnd
+         x_ptrs -= D_CEIL * B * N
+         out_ptrs = out_ptr + pid_b * N * D + offs_n[:, None] * D + offs_d[None, :]
+     elif LAYOUT == 4:  # bnd->dbn
+         x_ptrs -= D_CEIL
+         out_ptrs = out_ptr + offs_d[None, :] * B * N + pid_b * N + offs_n[:, None]
+
+     if ELEMENTWISE_AFFINE:
+         w_ptrs = w_ptr + offs_d
+         b_ptrs = b_ptr + offs_d
+
+     for di in range(0, num_tiles_d):
+         mask_d = offs_d < (D - di * TILE_D)
+         mask_nd = mask_n[:, None] & mask_d[None, :]
+
+         if ELEMENTWISE_AFFINE:
+             w = tl.load(w_ptrs, mask=mask_d, other=0.0).to(tl.float32)
+             b = tl.load(b_ptrs, mask=mask_d, other=0.0).to(tl.float32)
+         else:
+             w = 1.0
+             b = 0.0
+
+         x = tl.load(x_ptrs, mask=mask_nd, other=0.0).to(tl.float32)
+         x_hat = (x - mean[:, None]) * rstd[:, None]
+         y = x_hat * w[None, :] + b[None, :]
+         tl.store(out_ptrs, y, mask=mask_nd)
+
+         if LAYOUT == 0:  # bnd->bnd
+             x_ptrs += TILE_D
+             out_ptrs += TILE_D
+         elif LAYOUT == 1:  # bdn->bnd
+             x_ptrs += TILE_D * N
+             out_ptrs += TILE_D
+         elif LAYOUT == 2:  # bnd->bdn
+             x_ptrs += TILE_D
+             out_ptrs += TILE_D * N
+         elif LAYOUT == 3:  # dbn->bnd
+             x_ptrs += TILE_D * B * N
+             out_ptrs += TILE_D
+         elif LAYOUT == 4:  # bnd->dbn
+             x_ptrs += TILE_D
+             out_ptrs += TILE_D * B * N
+
+         if ELEMENTWISE_AFFINE:
+             w_ptrs += TILE_D
+             b_ptrs += TILE_D
+
+
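Unlike the single-pass variant, this kernel streams over d in TILE_D chunks and makes three passes: one accumulating the mean, one the variance, and one normalizing and storing, rewinding the x pointers by D_CEIL = num_tiles_d * TILE_D (times each layout's d-stride) between passes to undo the increments. The running sums stay exact because masked lanes load other=0.0 in the mean pass and other=mean (so x - mean is 0 there) in the variance pass. A small NumPy check of that tiling arithmetic (illustrative only; the variable names are mine):

import numpy as np

D, TILE_D = 100, 32
num_tiles_d = -(-D // TILE_D)        # ceiling division, i.e. tl.cdiv(D, TILE_D) -> 4
D_CEIL = num_tiles_d * TILE_D        # 128; how far the pointers advance per pass

x = np.random.randn(D)

# Mean pass: out-of-range lanes load `other=0.0`, contributing nothing to the sum.
x_pad = np.concatenate([x, np.zeros(D_CEIL - D)])
mean = x_pad.reshape(num_tiles_d, TILE_D).sum() / D

# Variance pass: out-of-range lanes load `other=mean`, so (x - mean) is 0 there.
x_pad = np.concatenate([x, np.full(D_CEIL - D, mean)])
var = ((x_pad.reshape(num_tiles_d, TILE_D) - mean) ** 2).sum() / D

assert np.isclose(mean, x.mean()) and np.isclose(var, x.var())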
+ @triton.jit
+ def layer_norm_transpose_backward_single_pass_kernel(
+     # inputs:
+     grad_out_ptr,
+     x_ptr,
+     w_ptr,
+     mean_ptr,
+     rstd_ptr,
+     # outputs: (order matters for jax_triton)
+     grad_x_ptr,
+     grad_w_ptr,
+     grad_b_ptr,
+     B,
+     N,
+     D: tl.constexpr,
+     TILE_N: tl.constexpr,
+     TILE_D: tl.constexpr,
+     ELEMENTWISE_AFFINE: tl.constexpr,
+     LAYOUT: tl.constexpr,
+ ):
+     pid_n = tl.program_id(0)
+     pid_b = tl.program_id(1)
+
+     num_tiles_n = tl.cdiv(N, TILE_N)
+
+     offs_d = tl.arange(0, TILE_D)
+     offs_n = pid_n * TILE_N + tl.arange(0, TILE_N)
+     mask_n = offs_n < N
+
+     mean_ptrs = mean_ptr + pid_b * N + offs_n
+     rstd_ptrs = rstd_ptr + pid_b * N + offs_n
+     mean = tl.load(mean_ptrs, mask=mask_n, other=0.0).to(tl.float32)
+     rstd = tl.load(rstd_ptrs, mask=mask_n, other=0.0).to(tl.float32)
+
+     if LAYOUT == 0:  # bnd->bnd
+         x_base_ptrs = x_ptr + pid_b * N * D + offs_n[:, None] * D
+         x_ptrs = x_base_ptrs + offs_d[None, :]
+         grad_x_base_ptrs = grad_x_ptr + pid_b * N * D + offs_n[:, None] * D
+         grad_x_ptrs = grad_x_base_ptrs + offs_d[None, :]
+         grad_out_base_ptrs = grad_out_ptr + pid_b * N * D + offs_n[:, None] * D
+         grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :]
+     elif LAYOUT == 1:  # bdn->bnd
+         x_base_ptrs = x_ptr + pid_b * D * N + offs_n[:, None]
+         x_ptrs = x_base_ptrs + offs_d[None, :] * N
+         grad_x_base_ptrs = grad_x_ptr + pid_b * D * N + offs_n[:, None]
+         grad_x_ptrs = grad_x_base_ptrs + offs_d[None, :] * N
+         grad_out_base_ptrs = grad_out_ptr + pid_b * N * D + offs_n[:, None] * D
+         grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :]
+     elif LAYOUT == 2:  # bnd->bdn
+         x_base_ptrs = x_ptr + pid_b * N * D + offs_n[:, None] * D
+         x_ptrs = x_base_ptrs + offs_d[None, :]
+         grad_x_base_ptrs = grad_x_ptr + pid_b * N * D + offs_n[:, None] * D
+         grad_x_ptrs = grad_x_base_ptrs + offs_d[None, :]
+         grad_out_base_ptrs = grad_out_ptr + pid_b * N * D + offs_n[:, None]
+         grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :] * N
+     elif LAYOUT == 3:  # dbn->bnd
+         x_base_ptrs = x_ptr + pid_b * N + offs_n[:, None]
+         x_ptrs = x_base_ptrs + offs_d[None, :] * B * N
+         grad_x_base_ptrs = grad_x_ptr + pid_b * N + offs_n[:, None]
+         grad_x_ptrs = grad_x_base_ptrs + offs_d[None, :] * B * N
+         grad_out_base_ptrs = grad_out_ptr + pid_b * N * D + offs_n[:, None] * D
+         grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :]
+     elif LAYOUT == 4:  # bnd->dbn
+         x_base_ptrs = x_ptr + pid_b * N * D + offs_n[:, None] * D
+         x_ptrs = x_base_ptrs + offs_d[None, :]
+         grad_x_base_ptrs = grad_x_ptr + pid_b * N * D + offs_n[:, None] * D
+         grad_x_ptrs = grad_x_base_ptrs + offs_d[None, :]
+         grad_out_base_ptrs = grad_out_ptr + pid_b * N + offs_n[:, None]
+         grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :] * B * N
+
+     grad_w_base_ptrs = grad_w_ptr + pid_b * num_tiles_n * D + pid_n * D
+     grad_b_base_ptrs = grad_b_ptr + pid_b * num_tiles_n * D + pid_n * D
+
+     x = tl.load(x_ptrs, mask=mask_n[:, None], other=0.0).to(tl.float32)
+     grad_out = tl.load(grad_out_ptrs, mask=mask_n[:, None], other=0.0).to(tl.float32)
+
+     xhat = (x - mean[:, None]) * rstd[:, None]
+
+     if ELEMENTWISE_AFFINE:
+         grad_b = grad_out
+         grad_b = tl.sum(grad_b, axis=0)
+         grad_b_ptrs = grad_b_base_ptrs + offs_d
+         tl.store(grad_b_ptrs, grad_b)
+
+         grad_w = grad_out * xhat
+         grad_w = tl.sum(grad_w, axis=0)
+         grad_w_ptrs = grad_w_base_ptrs + offs_d
+         tl.store(grad_w_ptrs, grad_w)
+
+         w_ptrs = w_ptr + offs_d
+         w = tl.load(w_ptrs).to(tl.float32)
+         wdo = w * grad_out
+
+     else:
+         wdo = grad_out
+
+     c1 = xhat * wdo
+     c2 = wdo
+
+     c1_dot = tl.sum(c1, axis=1) / D
+     c2_dot = tl.sum(c2, axis=1) / D
+
+     dx = (wdo - (xhat * c1_dot[:, None] + c2_dot[:, None])) * rstd[:, None]
+     tl.store(grad_x_ptrs, dx, mask=mask_n[:, None])
+
+
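Note that grad_w_ptr and grad_b_ptr do not receive the final weight gradients: each program stores its tile's column sums at offset pid_b * num_tiles_n * D + pid_n * D, so the buffers hold one partial length-D row per (batch, n-tile) program and still need a reduction over those rows. A hedged sketch of that finishing step (the host-side wrapper is not in this hunk; the buffer and variable names here are mine):

import torch

B, N, D, TILE_N = 4, 1000, 128, 64
num_tiles_n = (N + TILE_N - 1) // TILE_N          # one partial row per n-tile

# Partial buffers as laid out by the kernel: row (b * num_tiles_n + n_tile)
# holds that program's column sums over its TILE_N rows.
grad_w_partial = torch.zeros(B * num_tiles_n, D)
grad_b_partial = torch.zeros(B * num_tiles_n, D)
# ... kernel launch fills the partials ...
grad_w = grad_w_partial.sum(dim=0)                # final (D,) gradient of the scale
grad_b = grad_b_partial.sum(dim=0)                # final (D,) gradient of the bias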
+ @triton.jit
+ def layer_norm_transpose_backward_kernel(
+     # inputs:
+     grad_out_ptr,
+     x_ptr,
+     w_ptr,
+     mean_ptr,
+     rstd_ptr,
+     # outputs: (order matters for jax_triton)
+     grad_x_ptr,
+     grad_w_ptr,
+     grad_b_ptr,
+     B,
+     N,
+     D: tl.constexpr,
+     TILE_N: tl.constexpr,
+     TILE_D: tl.constexpr,
+     ELEMENTWISE_AFFINE: tl.constexpr,
+     LAYOUT: tl.constexpr,
+ ):
+     pid_n = tl.program_id(0)
+     pid_b = tl.program_id(1)
+
+     num_tiles_d = tl.cdiv(D, TILE_D)
+     num_tiles_n = tl.cdiv(N, TILE_N)
+
+     offs_d = tl.arange(0, TILE_D)
+     offs_n = pid_n * TILE_N + tl.arange(0, TILE_N)
+     mask_n = offs_n < N
+
+     mean_ptrs = mean_ptr + pid_b * N + offs_n
+     rstd_ptrs = rstd_ptr + pid_b * N + offs_n
+     mean = tl.load(mean_ptrs, mask=mask_n, other=0.0).to(tl.float32)
+     rstd = tl.load(rstd_ptrs, mask=mask_n, other=0.0).to(tl.float32)
+
+     if LAYOUT == 0:  # bnd->bnd
+         x_base_ptrs = x_ptr + pid_b * N * D + offs_n[:, None] * D
+         grad_x_base_ptrs = grad_x_ptr + pid_b * N * D + offs_n[:, None] * D
+         grad_out_base_ptrs = grad_out_ptr + pid_b * N * D + offs_n[:, None] * D
+     elif LAYOUT == 1:  # bdn->bnd
+         x_base_ptrs = x_ptr + pid_b * D * N + offs_n[:, None]
+         grad_x_base_ptrs = grad_x_ptr + pid_b * D * N + offs_n[:, None]
+         grad_out_base_ptrs = grad_out_ptr + pid_b * N * D + offs_n[:, None] * D
+     elif LAYOUT == 2:  # bnd->bdn
+         x_base_ptrs = x_ptr + pid_b * N * D + offs_n[:, None] * D
+         grad_x_base_ptrs = grad_x_ptr + pid_b * N * D + offs_n[:, None] * D
+         grad_out_base_ptrs = grad_out_ptr + pid_b * N * D + offs_n[:, None]
+     elif LAYOUT == 3:  # dbn->bnd
+         x_base_ptrs = x_ptr + pid_b * N + offs_n[:, None]
+         grad_x_base_ptrs = grad_x_ptr + pid_b * N + offs_n[:, None]
+         grad_out_base_ptrs = grad_out_ptr + pid_b * N * D + offs_n[:, None] * D
+     elif LAYOUT == 4:  # bnd->dbn
+         x_base_ptrs = x_ptr + pid_b * N * D + offs_n[:, None] * D
+         grad_x_base_ptrs = grad_x_ptr + pid_b * N * D + offs_n[:, None] * D
+         grad_out_base_ptrs = grad_out_ptr + pid_b * N + offs_n[:, None]
+
+     grad_w_base_ptrs = grad_w_ptr + pid_b * num_tiles_n * D + pid_n * D
+     grad_b_base_ptrs = grad_b_ptr + pid_b * num_tiles_n * D + pid_n * D
+
+     c1 = tl.zeros([TILE_N, TILE_D], dtype=tl.float32)
+     c2 = tl.zeros([TILE_N, TILE_D], dtype=tl.float32)
+
+     for di in range(num_tiles_d):
+         mask_d = offs_d < D
+         mask_nd = mask_n[:, None] & mask_d[None, :]
+
+         if ELEMENTWISE_AFFINE:
+             w_ptrs = w_ptr + offs_d
+             w = tl.load(w_ptrs, mask=mask_d, other=1.0).to(tl.float32)
+         else:
+             w = 1.0
+
+         if LAYOUT == 0:  # bnd->bnd
+             x_ptrs = x_base_ptrs + offs_d[None, :]
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :]
+         elif LAYOUT == 1:  # bdn->bnd
+             x_ptrs = x_base_ptrs + offs_d[None, :] * N
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :]
+         elif LAYOUT == 2:  # bnd->bdn
+             x_ptrs = x_base_ptrs + offs_d[None, :]
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :] * N
+         elif LAYOUT == 3:  # dbn->bnd
+             x_ptrs = x_base_ptrs + offs_d[None, :] * B * N
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :]
+         elif LAYOUT == 4:  # bnd->dbn
+             x_ptrs = x_base_ptrs + offs_d[None, :]
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :] * B * N
+
+         x = tl.load(x_ptrs, mask=mask_nd, other=mean[:, None]).to(tl.float32)
+         grad_out = tl.load(grad_out_ptrs, mask=mask_nd, other=0.0).to(tl.float32)
+
+         xhat = (x - mean[:, None]) * rstd[:, None]
+         wdo = w * grad_out
+
+         c1 += xhat * wdo
+         c2 += wdo
+
+         offs_d += TILE_D
+
+     c1_dot = tl.sum(c1, axis=1) / D
+     c2_dot = tl.sum(c2, axis=1) / D
+
+     offs_d -= TILE_D * num_tiles_d
+
+     for di in range(num_tiles_d):
+         mask_d = offs_d < D
+         mask_nd = mask_n[:, None] & mask_d[None, :]
+
+         if ELEMENTWISE_AFFINE:
+             w_ptrs = w_ptr + offs_d
+             w = tl.load(w_ptrs, mask=mask_d, other=0.0).to(tl.float32)
+         else:
+             w = 1.0
+
+         if LAYOUT == 0:  # bnd->bnd
+             x_ptrs = x_base_ptrs + offs_d[None, :]
+             grad_x_ptrs = grad_x_base_ptrs + offs_d[None, :]
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :]
+         elif LAYOUT == 1:  # bdn->bnd
+             x_ptrs = x_base_ptrs + offs_d[None, :] * N
+             grad_x_ptrs = grad_x_base_ptrs + offs_d[None, :] * N
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :]
+         elif LAYOUT == 2:  # bnd->bdn
+             x_ptrs = x_base_ptrs + offs_d[None, :]
+             grad_x_ptrs = grad_x_base_ptrs + offs_d[None, :]
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :] * N
+         elif LAYOUT == 3:  # dbn->bnd
+             x_ptrs = x_base_ptrs + offs_d[None, :] * B * N
+             grad_x_ptrs = grad_x_base_ptrs + offs_d[None, :] * B * N
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :]
+         elif LAYOUT == 4:  # bnd->dbn
+             x_ptrs = x_base_ptrs + offs_d[None, :]
+             grad_x_ptrs = grad_x_base_ptrs + offs_d[None, :]
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :] * B * N
+
+         x = tl.load(x_ptrs, mask=mask_nd, other=mean[:, None]).to(tl.float32)
+         grad_out = tl.load(grad_out_ptrs, mask=mask_nd, other=0.0).to(tl.float32)
+
+         xhat = (x - mean[:, None]) * rstd[:, None]
+
+         if ELEMENTWISE_AFFINE:
+             grad_b = grad_out
+             grad_b = tl.sum(grad_b, axis=0)
+             grad_b_ptrs = grad_b_base_ptrs + offs_d
+             tl.store(grad_b_ptrs, grad_b, mask=mask_d)
+
+             grad_w = grad_out * xhat
+             grad_w = tl.sum(grad_w, axis=0)
+             grad_w_ptrs = grad_w_base_ptrs + offs_d
+             tl.store(grad_w_ptrs, grad_w, mask=mask_d)
+
+         wdo = w * grad_out
+
+         dx = (wdo - (xhat * c1_dot[:, None] + c2_dot[:, None])) * rstd[:, None]
+         tl.store(grad_x_ptrs, dx, mask=mask_nd)
+
+         offs_d += TILE_D
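The host-side wrappers that pick tile sizes and launch these kernels live elsewhere in fused_layer_norm_triton.py and are not shown in this hunk. For orientation, a minimal launch sketch for the single-pass forward kernel, assuming PyTorch tensors, Layout.BDN_BND, and a D that is a power of two (the single-pass kernel applies no mask along d, so TILE_D must equal D exactly); the wrapper name and TILE_N choice here are mine, not the package's:

import torch
import triton

def layer_norm_bdn_to_bnd(x, w, b, eps=1e-5):
    # Sketch only, not the package's wrapper: normalize a (B, D, N) tensor
    # over d and write the result as (B, N, D).
    B, D, N = x.shape
    assert D & (D - 1) == 0, "single-pass kernel: TILE_D == D must be a power of 2"
    out = torch.empty(B, N, D, device=x.device, dtype=x.dtype)
    mean = torch.empty(B, N, device=x.device, dtype=torch.float32)
    rstd = torch.empty(B, N, device=x.device, dtype=torch.float32)
    TILE_N = 32
    grid = (triton.cdiv(N, TILE_N), B)  # axis 0 tiles n (pid_n), axis 1 is batch (pid_b)
    layer_norm_transpose_forward_single_pass_kernel[grid](
        x, w, b, out, mean, rstd, B, N,
        D=D, EPS=eps, TILE_N=TILE_N, TILE_D=D,
        ELEMENTWISE_AFFINE=True, LAYOUT=int(Layout.BDN_BND),
    )
    return out, mean, rstd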