cuequivariance-ops-cu12 0.4.0__py3-none-manylinux_2_39_aarch64.whl → 0.5.1__py3-none-manylinux_2_39_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cuequivariance-ops-cu12 might be problematic.

Files changed (28)
  1. cuequivariance_ops/VERSION +1 -1
  2. cuequivariance_ops/__init__.py +3 -2
  3. cuequivariance_ops/equivariance/dtypes.hh +21 -0
  4. cuequivariance_ops/equivariance/indexed_linear.hh +36 -0
  5. cuequivariance_ops/equivariance/run_fmha.h +192 -0
  6. cuequivariance_ops/equivariance/run_fmha_cudafree.h +77 -0
  7. cuequivariance_ops/equivariance/tensor_product_uniform_1d_jit.hh +17 -35
  8. cuequivariance_ops/lib/libcue_ops.so +0 -0
  9. cuequivariance_ops/triton/__init__.py +29 -0
  10. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.10.0.json +37192 -0
  11. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.8.0.json +37133 -0
  12. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.8.6.json +37133 -0
  13. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.8.9.json +37132 -0
  14. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.9.0.json +74262 -0
  15. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.10.0.json +48482 -0
  16. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.8.0.json +55693 -0
  17. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.8.6.json +55692 -0
  18. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.8.9.json +55693 -0
  19. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.9.0.json +111382 -0
  20. cuequivariance_ops/triton/cache_manager.py +244 -0
  21. cuequivariance_ops/triton/fused_layer_norm_triton.py +324 -0
  22. cuequivariance_ops/triton/gated_gemm_triton.py +340 -0
  23. cuequivariance_ops/triton/tuning_decorator.py +272 -0
  24. {cuequivariance_ops_cu12-0.4.0.dist-info → cuequivariance_ops_cu12-0.5.1.dist-info}/METADATA +5 -1
  25. cuequivariance_ops_cu12-0.5.1.dist-info/RECORD +32 -0
  26. {cuequivariance_ops_cu12-0.4.0.dist-info → cuequivariance_ops_cu12-0.5.1.dist-info}/WHEEL +1 -1
  27. cuequivariance_ops_cu12-0.4.0.dist-info/RECORD +0 -13
  28. {cuequivariance_ops_cu12-0.4.0.dist-info → cuequivariance_ops_cu12-0.5.1.dist-info}/licenses/LICENSE +0 -0
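
The two largest code additions, cuequivariance_ops/triton/cache_manager.py and cuequivariance_ops/triton/fused_layer_norm_triton.py, are reproduced in full below. The cache manager is controlled by three environment variables that appear in its source: CUEQ_TRITON_TUNING ("AOT" or "ONDEMAND"), CUEQ_TRITON_CACHE_DIR, and CUEQ_TRITON_IGNORE_EXISTING_CACHE. A minimal sketch of how a user might set them before import (the module path follows from the file list above; the values are illustrative, and an NVIDIA driver must be present because the module queries the GPU through pynvml at import time):

    import os

    # Illustrative values; cache_manager.py reads these when it is imported.
    os.environ["CUEQ_TRITON_TUNING"] = "ONDEMAND"            # or "AOT"; any other value logs an error
    os.environ["CUEQ_TRITON_CACHE_DIR"] = "/tmp/cueq-cache"  # overrides the platformdirs default

    from cuequivariance_ops.triton.cache_manager import get_cache_manager

    mgr = get_cache_manager()
    print(mgr.json_path)  # directory that will hold the per-kernel JSON tuning files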
cuequivariance_ops/triton/cache_manager.py
@@ -0,0 +1,244 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+ #
+ # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ # property and proprietary rights in and to this material, related
+ # documentation and any modifications thereto. Any use, reproduction,
+ # disclosure or distribution of this material and related documentation
+ # without an express license agreement from NVIDIA CORPORATION or
+ # its affiliates is strictly prohibited.
+
+ import hashlib
+ import json
+ import logging
+ import math
+ import os
+ from multiprocessing import Lock
+ from pathlib import Path
+ from typing import Any
+
+ import pynvml
+ from platformdirs import user_cache_dir
+
+ # Configure logging
+ logger = logging.getLogger(__name__)
+
+ FILE_LOCK = Lock()
+
+
+ def get_triton_tuning_mode():
+     cueq_at = os.getenv("CUEQ_TRITON_TUNING")
+     if cueq_at is not None and cueq_at not in ["AOT", "ONDEMAND"]:
+         logger.error(f"CUEQ_TRITON_TUNING setting not recognized: {cueq_at}.\n")
+     return cueq_at
+
+
+ def is_docker():
+     cgroup = Path("/proc/self/cgroup")
+     return Path("/.dockerenv").is_file() or (
+         cgroup.is_file() and "docker" in cgroup.read_text()
+     )
+
+
+ def overridden_cache_dir():
+     return os.getenv("CUEQ_TRITON_CACHE_DIR")
+
+
+ def get_triton_cache_dir() -> Path:
+     cache_dir = overridden_cache_dir()
+     if cache_dir is None:
+         cache_dir = user_cache_dir(appname="cuequivariance-triton", ensure_exists=False)
+     cache_dir = Path(cache_dir)
+     if cache_dir.exists():
+         return cache_dir
+     cache_dir.mkdir(parents=True, exist_ok=True)
+     return cache_dir
+
+
+ def get_gpu_information():
+     pynvml.nvmlInit()
+     # Note: non-uniform multi-GPU setups are not supported
+     handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+     name = pynvml.nvmlDeviceGetName(handle)
+     # pci_info = pynvml.nvmlDeviceGetPciInfo(handle)
+     # device_id = pci_info.pciDeviceId
+     # sub_device_id = pci_info.pciSubSystemId
+     power_limit = pynvml.nvmlDeviceGetPowerManagementLimit(handle)
+     max_clock_rate = pynvml.nvmlDeviceGetMaxClockInfo(
+         handle, pynvml.NVML_CLOCK_GRAPHICS
+     )
+     mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+     gpu_core_count = pynvml.nvmlDeviceGetNumGpuCores(handle)
+     major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
+
+     pynvml.nvmlShutdown()
+     return {
+         "name": name,
+         # "device_id": device_id,
+         # "sub_device_id": sub_device_id,
+         "total_memory": math.ceil(mem_info.total / (1024**3)),
+         "multi_processor_count": gpu_core_count // 128,
+         "power_limit": power_limit // 1000,
+         "clock_rate": max_clock_rate,
+         "major": major,
+         "minor": minor,
+     }
+
+
+ def gpu_information_to_key(information: dict) -> str:
+     information.pop("name", None)
+     key_string = "_".join(f"{value}" for value in information.values()).replace(
+         " ", "_"
+     )
+     hash_object = hashlib.sha256(key_string.encode())
+     hash_str = hash_object.hexdigest()
+     return hash_str
+
+
+ class CacheManager:
+     """Singleton managing the cache"""
+
+     def __init__(self):
+         self.gpu_cache = {}
+         self.gpu_information = get_gpu_information()
+         self.gpu_key = gpu_information_to_key(self.gpu_information)
+         self.site_json_path = os.path.join(os.path.dirname(__file__), "cache")
+         self.json_path = get_triton_cache_dir()
+         self.aot_mode = get_triton_tuning_mode()
+         self.dirty = {}
+
+         if os.getenv("CUEQ_TRITON_IGNORE_EXISTING_CACHE") == "1":
+             logger.warning(
+                 "\n!!!!!! CUEQ_TRITON_IGNORE_EXISTING_CACHE is ON - previously saved settings will be ignored !!!!!!\n"
+                 f"CUEQ_TRITON_TUNING is set to {self.aot_mode}\n"
+                 f"The tuning changes will be written to {self.json_path}"
+             )
+
+         if (
+             self.aot_mode is not None
+             and is_docker()
+             and os.getenv("HOME") == "/root"
+             and not overridden_cache_dir()
+         ):
+             logger.warning(
+                 f"\n!!!!!! CUEQ_TRITON_TUNING is set to {self.aot_mode} and you are running as root in a Docker container. !!!!!!\n"
+                 f"The tuning changes will be written to {self.json_path}\n"
+                 "Please remember to commit the container - otherwise any tuning changes will be lost on container restart."
+             )
+
+     def load_cache(self, fn_key: str) -> dict:
+         # Load the JSON file and store it in the cache dict.
+         # If the file does not exist, create an empty dict for the specified function.
+         fn_cache = {}
+         gpu_cache = {}
+         best_key = None
+         major, minor = self.gpu_information["major"], self.gpu_information["minor"]
+         basename = f"{fn_key}.{major}.{minor}.json"
+         json_file = os.path.join(self.json_path, basename)
+
+         def result(self, gpu_cache):
+             # empty cache or fuzzy match: update for a possible save
+             if best_key or not gpu_cache:
+                 gpu_cache["gpu_information"] = self.gpu_information
+             self.gpu_cache[fn_key] = gpu_cache
+             return gpu_cache
+
+         if os.getenv("CUEQ_TRITON_IGNORE_EXISTING_CACHE"):
+             return result(self, gpu_cache)
+         try:
+             with FILE_LOCK, open(json_file, "rb") as f:
+                 fn_cache = json.load(f)
+         except Exception as e0:
+             site_json_file = os.path.join(self.site_json_path, basename)
+             try:
+                 with FILE_LOCK, open(site_json_file, "rb") as f:
+                     fn_cache = json.load(f)
+             except Exception as e:
+                 logger.warning(
+                     f"Error reading system-wide Triton tuning cache file: {site_json_file}\n{e}\n"
+                     f"Error reading user's Triton tuning cache file {json_file}:\n{e0}"
+                 )
+         if fn_cache:
+             gpu_cache = fn_cache.get(self.gpu_key)
+             if gpu_cache is None:
+                 # do a fuzzy match of the configuration:
+                 def within_10_percent(a, b, key):
+                     # NB: the 0.2 threshold on |a - b| / (a + b) is looser than the
+                     # name suggests; it admits up to a 1.5x ratio between a and b.
+                     a = int(a[key])
+                     b = int(b[key])
+                     return abs(a - b) / (a + b) < 0.2
+
+                 def full_match(a, b):
+                     # matching clock & memory
+                     return (
+                         a["total_memory"] == b["total_memory"]
+                         and a["clock_rate"] == b["clock_rate"]
+                     )
+
+                 def partial_match(a, b):
+                     # either clock or memory roughly matches
+                     return within_10_percent(a, b, "total_memory") or within_10_percent(
+                         a, b, "clock_rate"
+                     )
+
+                 for key in fn_cache:
+                     conf = fn_cache[key].get("gpu_information")
+                     if conf:
+                         if full_match(conf, self.gpu_information):
+                             best_key = key
+                             break
+                         elif partial_match(conf, self.gpu_information):
+                             best_key = key
+                 if best_key is None:
+                     # just pick the first entry there
+                     best_key = next(iter(fn_cache))
+                 gpu_cache = fn_cache[best_key]
+
+         return result(self, gpu_cache)
+
+     def save_cache(self, fn_key: str) -> None:
+         # save the cache dict to a JSON file
+         major, minor = self.gpu_information["major"], self.gpu_information["minor"]
+         basename = f"{fn_key}.{major}.{minor}.json"
+         json_file = os.path.join(self.json_path, basename)
+         # Load existing data from the file if it exists
+         if os.path.exists(json_file):
+             with FILE_LOCK, open(json_file, "rb") as f:
+                 existing_data = json.load(f)
+         else:
+             existing_data = {}
+         # Update the entry for our GPU key with our data
+         existing_data.setdefault(self.gpu_key, {}).update(self.gpu_cache[fn_key])
+         self.gpu_cache[fn_key] = existing_data[self.gpu_key]
+         merged_data = existing_data
+         temp_file = f"{json_file}.{os.getpid()}.tmp"
+         try:
+             # Save the merged data back to the file
+             with FILE_LOCK:
+                 with open(temp_file, "w") as f:
+                     json.dump(merged_data, f, indent=4)
+                 os.replace(temp_file, json_file)
+         except Exception as e:
+             logger.warning(f"Warning: Failed to write autotune cache: {e}")
+
+         # Clear the dirty flag
+         del self.dirty[fn_key]
+
+     def get(self, fn_key: str, inp_key: str) -> Any:
+         # Get a value from the cache, loading the JSON file first if necessary.
+         gpu_cache = self.gpu_cache.get(fn_key) or self.load_cache(fn_key)
+         # check if fn_key and inp_key exist in the cache
+         return gpu_cache.get(inp_key)
+
+     def set(self, fn_key: str, inp_key: str, value: Any) -> None:
+         # Write a value to the cache dict and mark it dirty for a later save.
+         self.gpu_cache[fn_key][inp_key] = value
+         self.dirty[fn_key] = 1
+
+
+ cache_manager = CacheManager()
+
+
+ def get_cache_manager():
+     return cache_manager
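
For orientation, here is a short, hypothetical round trip through the CacheManager API defined above. The fn_key/inp_key values are made up for illustration; in the package they come from the tuning machinery in tuning_decorator.py, which is not shown in this hunk:

    from cuequivariance_ops.triton.cache_manager import get_cache_manager

    mgr = get_cache_manager()
    fn_key = "my_kernel_wrapper"  # hypothetical function key
    inp_key = "M128_N256_fp16"    # hypothetical input-shape key

    cfg = mgr.get(fn_key, inp_key)     # first call loads <cache_dir>/my_kernel_wrapper.<major>.<minor>.json
    if cfg is None:
        cfg = {"TILE_N": 64}           # placeholder for a freshly measured best configuration
        mgr.set(fn_key, inp_key, cfg)  # updates the in-memory dict and marks fn_key dirty
        mgr.save_cache(fn_key)         # merges and rewrites the JSON atomically (tmp file + os.replace)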
cuequivariance_ops/triton/fused_layer_norm_triton.py
@@ -0,0 +1,324 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+ #
+ # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ # property and proprietary rights in and to this material, related
+ # documentation and any modifications thereto. Any use, reproduction,
+ # disclosure or distribution of this material and related documentation
+ # without an express license agreement from NVIDIA CORPORATION or
+ # its affiliates is strictly prohibited.
+
+ import enum
+
+ import triton
+ import triton.language as tl
+
+
+ class Layout(enum.IntEnum):
+     BND_BND = 0
+     BDN_BND = 1
+     BND_BDN = 2
+     DBN_BND = 3
+     BND_DBN = 4
+
+
+ @triton.jit
+ def layer_norm_transpose_forward_kernel(
+     x_ptr,
+     out_ptr,
+     w_ptr,
+     b_ptr,
+     mean_ptr,
+     rstd_ptr,
+     B,
+     N,
+     D: tl.constexpr,
+     EPS: tl.constexpr,
+     TILE_N: tl.constexpr,
+     TILE_D: tl.constexpr,
+     ELEMENTWISE_AFFINE: tl.constexpr,
+     LAYOUT: tl.constexpr,
+ ):
+     pid_n = tl.program_id(0)
+     pid_b = tl.program_id(1)
+
+     offs_n = pid_n * TILE_N + tl.arange(0, TILE_N)
+     offs_d = tl.arange(0, TILE_D)
+
+     if LAYOUT == 0:  # bnd->bnd
+         x_ptrs = x_ptr + pid_b * N * D + offs_n[:, None] * D + offs_d[None, :]
+     elif LAYOUT == 1:  # bdn->bnd
+         x_ptrs = x_ptr + pid_b * D * N + offs_d[None, :] * N + offs_n[:, None]
+     elif LAYOUT == 2:  # bnd->bdn
+         x_ptrs = x_ptr + pid_b * N * D + offs_n[:, None] * D + offs_d[None, :]
+     elif LAYOUT == 3:  # dbn->bnd
+         x_ptrs = x_ptr + offs_d[None, :] * B * N + pid_b * N + offs_n[:, None]
+     elif LAYOUT == 4:  # bnd->dbn
+         x_ptrs = x_ptr + pid_b * N * D + offs_n[:, None] * D + offs_d[None, :]
+
+     mean_ptrs = mean_ptr + pid_b * N + offs_n
+     rstd_ptrs = rstd_ptr + pid_b * N + offs_n
+     mask_n = offs_n < N
+
+     num_tiles = D // TILE_D
+
+     _mean = tl.zeros([TILE_N, TILE_D], dtype=tl.float32)
+     for _ in range(0, num_tiles):
+         x = tl.load(x_ptrs, mask=mask_n[:, None], other=0.0).to(tl.float32)
+         _mean += x
+
+         if LAYOUT == 0:  # bnd->bnd
+             x_ptrs += TILE_D
+         elif LAYOUT == 1:  # bdn->bnd
+             x_ptrs += TILE_D * N
+         elif LAYOUT == 2:  # bnd->bdn
+             x_ptrs += TILE_D
+         elif LAYOUT == 3:  # dbn->bnd
+             x_ptrs += TILE_D * B * N
+         elif LAYOUT == 4:  # bnd->dbn
+             x_ptrs += TILE_D
+
+     mean = tl.sum(_mean, axis=1) / D
+     tl.store(mean_ptrs, mean, mask=mask_n)
+
+     if LAYOUT == 0:  # bnd->bnd
+         x_ptrs -= D
+     elif LAYOUT == 1:  # bdn->bnd
+         x_ptrs -= D * N
+     elif LAYOUT == 2:  # bnd->bdn
+         x_ptrs -= D
+     elif LAYOUT == 3:  # dbn->bnd
+         x_ptrs -= D * B * N
+     elif LAYOUT == 4:  # bnd->dbn
+         x_ptrs -= D
+
+     _var = tl.zeros([TILE_N, TILE_D], dtype=tl.float32)
+     for _ in range(0, num_tiles):
+         x = tl.load(x_ptrs, mask=mask_n[:, None], other=0.0).to(tl.float32)
+         x = x - mean[:, None]
+         _var += x * x
+
+         if LAYOUT == 0:  # bnd->bnd
+             x_ptrs += TILE_D
+         elif LAYOUT == 1:  # bdn->bnd
+             x_ptrs += TILE_D * N
+         elif LAYOUT == 2:  # bnd->bdn
+             x_ptrs += TILE_D
+         elif LAYOUT == 3:  # dbn->bnd
+             x_ptrs += TILE_D * B * N
+         elif LAYOUT == 4:  # bnd->dbn
+             x_ptrs += TILE_D
+
+     var = tl.sum(_var, axis=1) / D
+     rstd = 1.0 / tl.sqrt(var + EPS)
+     tl.store(rstd_ptrs, rstd, mask=mask_n)
+
+     if LAYOUT == 0:  # bnd->bnd
+         x_ptrs -= D
+         out_ptrs = out_ptr + pid_b * N * D + offs_n[:, None] * D + offs_d[None, :]
+     elif LAYOUT == 1:  # bdn->bnd
+         x_ptrs -= D * N
+         out_ptrs = out_ptr + pid_b * N * D + offs_n[:, None] * D + offs_d[None, :]
+     elif LAYOUT == 2:  # bnd->bdn
+         x_ptrs -= D
+         out_ptrs = out_ptr + pid_b * N * D + offs_d[None, :] * N + offs_n[:, None]
+     elif LAYOUT == 3:  # dbn->bnd
+         x_ptrs -= D * B * N
+         out_ptrs = out_ptr + pid_b * N * D + offs_n[:, None] * D + offs_d[None, :]
+     elif LAYOUT == 4:  # bnd->dbn
+         x_ptrs -= D
+         out_ptrs = out_ptr + offs_d[None, :] * B * N + pid_b * N + offs_n[:, None]
+
+     if ELEMENTWISE_AFFINE:
+         w_ptrs = w_ptr + offs_d
+         b_ptrs = b_ptr + offs_d
+
+     for _ in range(0, num_tiles):
+         if ELEMENTWISE_AFFINE:
+             w = tl.load(w_ptrs)
+             b = tl.load(b_ptrs)
+         else:
+             w = 1.0
+             b = 0.0
+
+         x = tl.load(x_ptrs, mask=mask_n[:, None], other=0.0).to(tl.float32)
+         x_hat = (x - mean[:, None]) * rstd[:, None]
+         y = x_hat * w[None, :] + b[None, :]
+         tl.store(out_ptrs, y, mask=mask_n[:, None])
+
+         if LAYOUT == 0:  # bnd->bnd
+             x_ptrs += TILE_D
+             out_ptrs += TILE_D
+         elif LAYOUT == 1:  # bdn->bnd
+             x_ptrs += TILE_D * N
+             out_ptrs += TILE_D
+         elif LAYOUT == 2:  # bnd->bdn
+             x_ptrs += TILE_D
+             out_ptrs += TILE_D * N
+         elif LAYOUT == 3:  # dbn->bnd
+             x_ptrs += TILE_D * B * N
+             out_ptrs += TILE_D
+         elif LAYOUT == 4:  # bnd->dbn
+             x_ptrs += TILE_D
+             out_ptrs += TILE_D * B * N
+
+         if ELEMENTWISE_AFFINE:
+             w_ptrs += TILE_D
+             b_ptrs += TILE_D
+
+
+ @triton.jit
+ def layer_norm_transpose_backward_kernel(
+     grad_out_ptr,
+     grad_x_ptr,
+     grad_w_ptr,
+     grad_b_ptr,
+     x_ptr,
+     w_ptr,
+     mean_ptr,
+     rstd_ptr,
+     B,
+     N,
+     D: tl.constexpr,
+     TILE_N: tl.constexpr,
+     TILE_D: tl.constexpr,
+     ELEMENTWISE_AFFINE: tl.constexpr,
+     LAYOUT: tl.constexpr,
+ ):
+     pid_n = tl.program_id(0)
+     pid_b = tl.program_id(1)
+
+     num_tiles = D // TILE_D
+     num_tiles_n = tl.cdiv(N, TILE_N)
+
+     offs_d = tl.arange(0, TILE_D)
+     offs_n = pid_n * TILE_N + tl.arange(0, TILE_N)
+     mask_n = offs_n < N
+
+     mean_ptrs = mean_ptr + pid_b * N + offs_n
+     rstd_ptrs = rstd_ptr + pid_b * N + offs_n
+     mean = tl.load(mean_ptrs, mask=mask_n, other=0.0).to(tl.float32)
+     rstd = tl.load(rstd_ptrs, mask=mask_n, other=0.0).to(tl.float32)
+
+     if LAYOUT == 0:  # bnd->bnd
+         x_base_ptrs = x_ptr + pid_b * N * D + offs_n[:, None] * D
+         grad_x_base_ptrs = grad_x_ptr + pid_b * N * D + offs_n[:, None] * D
+         grad_out_base_ptrs = grad_out_ptr + pid_b * N * D + offs_n[:, None] * D
+     elif LAYOUT == 1:  # bdn->bnd
+         x_base_ptrs = x_ptr + pid_b * D * N + offs_n[:, None]
+         grad_x_base_ptrs = grad_x_ptr + pid_b * D * N + offs_n[:, None]
+         grad_out_base_ptrs = grad_out_ptr + pid_b * N * D + offs_n[:, None] * D
+     elif LAYOUT == 2:  # bnd->bdn
+         x_base_ptrs = x_ptr + pid_b * N * D + offs_n[:, None] * D
+         grad_x_base_ptrs = grad_x_ptr + pid_b * N * D + offs_n[:, None] * D
+         grad_out_base_ptrs = grad_out_ptr + pid_b * N * D + offs_n[:, None]
+     elif LAYOUT == 3:  # dbn->bnd
+         x_base_ptrs = x_ptr + pid_b * N + offs_n[:, None]
+         grad_x_base_ptrs = grad_x_ptr + pid_b * N + offs_n[:, None]
+         grad_out_base_ptrs = grad_out_ptr + pid_b * N * D + offs_n[:, None] * D
+     elif LAYOUT == 4:  # bnd->dbn
+         x_base_ptrs = x_ptr + pid_b * N * D + offs_n[:, None] * D
+         grad_x_base_ptrs = grad_x_ptr + pid_b * N * D + offs_n[:, None] * D
+         grad_out_base_ptrs = grad_out_ptr + pid_b * N + offs_n[:, None]
+
+     grad_w_base_ptrs = grad_w_ptr + pid_b * num_tiles_n * D + pid_n * D
+     grad_b_base_ptrs = grad_b_ptr + pid_b * num_tiles_n * D + pid_n * D
+
+     c1 = tl.zeros([TILE_N, TILE_D], dtype=tl.float32)
+     c2 = tl.zeros([TILE_N, TILE_D], dtype=tl.float32)
+
+     for _ in range(num_tiles):
+         if ELEMENTWISE_AFFINE:
+             w_ptrs = w_ptr + offs_d
+             w = tl.load(w_ptrs).to(tl.float32)
+         else:
+             w = 1.0
+
+         if LAYOUT == 0:  # bnd->bnd
+             x_ptrs = x_base_ptrs + offs_d[None, :]
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :]
+         elif LAYOUT == 1:  # bdn->bnd
+             x_ptrs = x_base_ptrs + offs_d[None, :] * N
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :]
+         elif LAYOUT == 2:  # bnd->bdn
+             x_ptrs = x_base_ptrs + offs_d[None, :]
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :] * N
+         elif LAYOUT == 3:  # dbn->bnd
+             x_ptrs = x_base_ptrs + offs_d[None, :] * B * N
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :]
+         elif LAYOUT == 4:  # bnd->dbn
+             x_ptrs = x_base_ptrs + offs_d[None, :]
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :] * B * N
+
+         x = tl.load(x_ptrs, mask=mask_n[:, None], other=0.0).to(tl.float32)
+         grad_out = tl.load(grad_out_ptrs, mask=mask_n[:, None], other=0.0).to(
+             tl.float32
+         )
+
+         xhat = (x - mean[:, None]) * rstd[:, None]
+
+         if ELEMENTWISE_AFFINE:
+             grad_b = grad_out
+             grad_w = grad_out * xhat
+
+             grad_b = tl.sum(grad_b, axis=0)
+             grad_w = tl.sum(grad_w, axis=0)
+
+             grad_w_ptrs = grad_w_base_ptrs + offs_d
+             grad_b_ptrs = grad_b_base_ptrs + offs_d
+
+             tl.store(grad_w_ptrs, grad_w)
+             tl.store(grad_b_ptrs, grad_b)
+
+         wdo = w * grad_out
+
+         c1 += xhat * wdo
+         c2 += wdo
+
+         offs_d += TILE_D
+
+     c1_dot = tl.sum(c1, axis=1) / D
+     c2_dot = tl.sum(c2, axis=1) / D
+
+     offs_d -= TILE_D * num_tiles
+
+     for _ in range(num_tiles):
+         if ELEMENTWISE_AFFINE:
+             w_ptrs = w_ptr + offs_d
+             w = tl.load(w_ptrs).to(tl.float32)
+         else:
+             w = 1.0
+
+         if LAYOUT == 0:  # bnd->bnd
+             x_ptrs = x_base_ptrs + offs_d[None, :]
+             grad_x_ptrs = grad_x_base_ptrs + offs_d[None, :]
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :]
+         elif LAYOUT == 1:  # bdn->bnd
+             x_ptrs = x_base_ptrs + offs_d[None, :] * N
+             grad_x_ptrs = grad_x_base_ptrs + offs_d[None, :] * N
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :]
+         elif LAYOUT == 2:  # bnd->bdn
+             x_ptrs = x_base_ptrs + offs_d[None, :]
+             grad_x_ptrs = grad_x_base_ptrs + offs_d[None, :]
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :] * N
+         elif LAYOUT == 3:  # dbn->bnd
+             x_ptrs = x_base_ptrs + offs_d[None, :] * B * N
+             grad_x_ptrs = grad_x_base_ptrs + offs_d[None, :] * B * N
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :]
+         elif LAYOUT == 4:  # bnd->dbn
+             x_ptrs = x_base_ptrs + offs_d[None, :]
+             grad_x_ptrs = grad_x_base_ptrs + offs_d[None, :]
+             grad_out_ptrs = grad_out_base_ptrs + offs_d[None, :] * B * N
+
+         x = tl.load(x_ptrs, mask=mask_n[:, None], other=0.0).to(tl.float32)
+         grad_out = tl.load(grad_out_ptrs, mask=mask_n[:, None], other=0.0).to(
+             tl.float32
+         )
+
+         xhat = (x - mean[:, None]) * rstd[:, None]
+         wdo = w * grad_out
+
+         dx = (wdo - (xhat * c1_dot[:, None] + c2_dot[:, None])) * rstd[:, None]
+         tl.store(grad_x_ptrs, dx, mask=mask_n[:, None])
+
+         offs_d += TILE_D
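
To make the kernel interface above concrete, here is a hedged launch sketch for the forward kernel. The grid shape is inferred from pid_n = tl.program_id(0) and pid_b = tl.program_id(1), and TILE_D must divide D because the kernel computes num_tiles = D // TILE_D. The package's own Python wrapper lives elsewhere in fused_layer_norm_triton.py and is not shown in this hunk, so this call is illustrative only:

    import torch
    import triton

    from cuequivariance_ops.triton.fused_layer_norm_triton import (
        Layout,
        layer_norm_transpose_forward_kernel,
    )

    B, N, D = 4, 1000, 256
    TILE_N, TILE_D = 64, 128  # TILE_D must divide D

    x = torch.randn(B, N, D, device="cuda")
    w = torch.ones(D, device="cuda")
    b = torch.zeros(D, device="cuda")
    out = torch.empty(B, N, D, device="cuda")
    mean = torch.empty(B, N, device="cuda", dtype=torch.float32)
    rstd = torch.empty(B, N, device="cuda", dtype=torch.float32)

    grid = (triton.cdiv(N, TILE_N), B)  # axis 0 tiles the N dimension, axis 1 walks batches
    layer_norm_transpose_forward_kernel[grid](
        x, out, w, b, mean, rstd, B, N,
        D=D, EPS=1e-5, TILE_N=TILE_N, TILE_D=TILE_D,
        ELEMENTWISE_AFFINE=True, LAYOUT=int(Layout.BND_BND),
    )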