cuequivariance-ops-cu12 0.6.0__py3-none-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of cuequivariance-ops-cu12 has been flagged as a potentially problematic release.

Files changed (37)
  1. cuequivariance_ops/VERSION +1 -0
  2. cuequivariance_ops/__init__.py +42 -0
  3. cuequivariance_ops/_version.py +20 -0
  4. cuequivariance_ops/common/common.hpp +98 -0
  5. cuequivariance_ops/common/nvtx.hpp +29 -0
  6. cuequivariance_ops/equivariance/batch_dimension.hh +15 -0
  7. cuequivariance_ops/equivariance/dtypes.hh +65 -0
  8. cuequivariance_ops/equivariance/fused_tensor_product.cuh +297 -0
  9. cuequivariance_ops/equivariance/indexed_linear.hh +36 -0
  10. cuequivariance_ops/equivariance/run_fmha.h +192 -0
  11. cuequivariance_ops/equivariance/run_fmha_cudafree.h +77 -0
  12. cuequivariance_ops/equivariance/segmented_transpose.cuh +40 -0
  13. cuequivariance_ops/equivariance/tensor_product_uniform_1d_jit.hh +38 -0
  14. cuequivariance_ops/lib/libcue_ops.so +0 -0
  15. cuequivariance_ops/sleep.hh +18 -0
  16. cuequivariance_ops/triton/__init__.py +66 -0
  17. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.10.0.json +37192 -0
  18. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.8.0.json +37133 -0
  19. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.8.6.json +37133 -0
  20. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.8.9.json +37132 -0
  21. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel_wrapper.9.0.json +74262 -0
  22. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.10.0.json +48482 -0
  23. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.8.0.json +55693 -0
  24. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.8.6.json +55692 -0
  25. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.8.9.json +55693 -0
  26. cuequivariance_ops/triton/cache/fused_sigmoid_gated_dual_gemm_forward_kernel_wrapper.9.0.json +111382 -0
  27. cuequivariance_ops/triton/cache_manager.py +259 -0
  28. cuequivariance_ops/triton/fused_layer_norm_triton.py +518 -0
  29. cuequivariance_ops/triton/gated_gemm_triton.py +380 -0
  30. cuequivariance_ops/triton/pair_bias.py +324 -0
  31. cuequivariance_ops/triton/tuning_decorator.py +177 -0
  32. cuequivariance_ops/triton/utils.py +28 -0
  33. cuequivariance_ops_cu12-0.6.0.dist-info/METADATA +182 -0
  34. cuequivariance_ops_cu12-0.6.0.dist-info/RECORD +37 -0
  35. cuequivariance_ops_cu12-0.6.0.dist-info/WHEEL +6 -0
  36. cuequivariance_ops_cu12-0.6.0.dist-info/licenses/LICENSE +142 -0
  37. cuequivariance_ops_cu12-0.6.0.dist-info/licenses/Third_party_attr.txt +24 -0
cuequivariance_ops/triton/gated_gemm_triton.py
@@ -0,0 +1,380 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+ #
+ # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ # property and proprietary rights in and to this material, related
+ # documentation and any modifications thereto. Any use, reproduction,
+ # disclosure or distribution of this material and related documentation
+ # without an express license agreement from NVIDIA CORPORATION or
+ # its affiliates is strictly prohibited.
+
+
+ import triton
+ import triton.language as tl
+
+ from cuequivariance_ops.triton.utils import cvt_tf32_rn
+
+
+ @triton.jit
+ def fused_sigmoid_gated_dual_gemm_forward_kernel(
+     # inputs
+     x1_ptr,
+     x2_ptr,
+     w1_ptr,
+     w2_ptr,
+     b1_ptr,
+     b2_ptr,
+     mask_ptr,
+     M,
+     N,
+     K,
+     # outputs
+     o_ptr,
+     TILE_M: tl.constexpr,
+     TILE_N: tl.constexpr,
+     TILE_K: tl.constexpr,
+     PRECISION: tl.constexpr,
+     APPLY_MASK: tl.constexpr,
+     TRANSPOSE_OUT: tl.constexpr,
+     TWO_INPUTS: tl.constexpr,
+     HAS_B1: tl.constexpr,
+     HAS_B2: tl.constexpr,
+ ):
+     # fully gated GEMM kernel with optional mask at the end
+     pid_m = tl.program_id(axis=0)
+     pid_n = tl.program_id(axis=1)
+
+     start_m = pid_m * TILE_M
+     start_n = pid_n * TILE_N
+
+     offs_xm = start_m + tl.arange(0, TILE_M)
+     offs_wn = start_n + tl.arange(0, TILE_N)
+     offs_k = tl.arange(0, TILE_K)
+
+     x1_ptrs = x1_ptr + (offs_xm[:, None] * K + offs_k[None, :])
+     if TWO_INPUTS:
+         x2_ptrs = x2_ptr + (offs_xm[:, None] * K + offs_k[None, :])
+
+     w_tile_offs = offs_wn[None, :] * K + offs_k[:, None]
+
+     acc_1 = tl.zeros((TILE_M, TILE_N), dtype=tl.float32)
+     acc_2 = tl.zeros((TILE_M, TILE_N), dtype=tl.float32)
+
+     mask_m = offs_xm < M
+
+     if TWO_INPUTS:
+         for _ in range(0, tl.cdiv(K, TILE_K)):
+             x1 = tl.load(x1_ptrs, mask=mask_m[:, None], other=0.0).to(
+                 w1_ptr.type.element_ty
+             )
+             w1_ptrs = w1_ptr + w_tile_offs
+             w1 = tl.load(w1_ptrs)
+             if PRECISION == 0:
+                 acc_1 = tl.dot(x1, w1, acc_1)
+             elif PRECISION == 1:
+                 x1 = cvt_tf32_rn(x1)
+                 w1 = cvt_tf32_rn(w1)
+                 acc_1 = tl.dot(x1, w1, acc_1, input_precision="tf32")
+             elif PRECISION == 2:
+                 acc_1 = tl.dot(x1, w1, acc_1, input_precision="tf32x3")
+             elif PRECISION == 3:
+                 acc_1 = tl.dot(x1, w1, acc_1, input_precision="ieee")
+             else:
+                 tl.static_assert(
+                     False,
+                     "PRECISION must be 0 (default), 1 (tf32), 2 (tf32x3) or 3 (ieee)",
+                 )
+
+             x1_ptrs += TILE_K
+             w1_ptr += TILE_K
+
+         for _ in range(0, tl.cdiv(K, TILE_K)):
+             x2 = tl.load(x2_ptrs, mask=mask_m[:, None], other=0.0).to(
+                 w2_ptr.type.element_ty
+             )
+             w2_ptrs = w2_ptr + w_tile_offs
+             w2 = tl.load(w2_ptrs)
+             if PRECISION == 0:
+                 acc_2 = tl.dot(x2, w2, acc_2)
+             elif PRECISION == 1:
+                 x2 = cvt_tf32_rn(x2)
+                 w2 = cvt_tf32_rn(w2)
+                 acc_2 = tl.dot(x2, w2, acc_2, input_precision="tf32")
+             elif PRECISION == 2:
+                 acc_2 = tl.dot(x2, w2, acc_2, input_precision="tf32x3")
+             elif PRECISION == 3:
+                 acc_2 = tl.dot(x2, w2, acc_2, input_precision="ieee")
+             else:
+                 tl.static_assert(
+                     False,
+                     "PRECISION must be 0 (default), 1 (tf32), 2 (tf32x3) or 3 (ieee)",
+                 )
+
+             x2_ptrs += TILE_K
+             w2_ptr += TILE_K
+
+     else:
+         for _ in range(0, tl.cdiv(K, TILE_K)):
+             x = tl.load(x1_ptrs, mask=mask_m[:, None], other=0.0).to(
+                 w1_ptr.type.element_ty
+             )
+
+             w1_ptrs = w1_ptr + w_tile_offs
+             w1 = tl.load(w1_ptrs)
+             if PRECISION == 0:
+                 acc_1 = tl.dot(x, w1, acc_1)
+             elif PRECISION == 1:
+                 x = cvt_tf32_rn(x)
+                 w1 = cvt_tf32_rn(w1)
+                 acc_1 = tl.dot(x, w1, acc_1, input_precision="tf32")
+             elif PRECISION == 2:
+                 acc_1 = tl.dot(x, w1, acc_1, input_precision="tf32x3")
+             elif PRECISION == 3:
+                 acc_1 = tl.dot(x, w1, acc_1, input_precision="ieee")
+             else:
+                 tl.static_assert(
+                     False,
+                     "PRECISION must be 0 (default), 1 (tf32), 2 (tf32x3) or 3 (ieee)",
+                 )
+
+             w2_ptrs = w2_ptr + w_tile_offs
+             w2 = tl.load(w2_ptrs)
+             if PRECISION == 0:
+                 acc_2 = tl.dot(x, w2, acc_2)
+             elif PRECISION == 1:
+                 x = cvt_tf32_rn(x)
+                 w2 = cvt_tf32_rn(w2)
+                 acc_2 = tl.dot(x, w2, acc_2, input_precision="tf32")
+             elif PRECISION == 2:
+                 acc_2 = tl.dot(x, w2, acc_2, input_precision="tf32x3")
+             elif PRECISION == 3:
+                 acc_2 = tl.dot(x, w2, acc_2, input_precision="ieee")
+             else:
+                 tl.static_assert(
+                     False,
+                     "PRECISION must be 0 (default), 1 (tf32), 2 (tf32x3) or 3 (ieee)",
+                 )
+
+             x1_ptrs += TILE_K
+             w1_ptr += TILE_K
+             w2_ptr += TILE_K
+
+     offs_om = pid_m * TILE_M + tl.arange(0, TILE_M)
+     offs_on = pid_n * TILE_N + tl.arange(0, TILE_N)
+
+     if HAS_B1:
+         b1_ptrs = b1_ptr + offs_on
+         b1_tile = tl.load(b1_ptrs).to(tl.float32)
+         acc_1 += b1_tile
+
+     if HAS_B2:
+         b2_ptrs = b2_ptr + offs_on
+         b2_tile = tl.load(b2_ptrs).to(tl.float32)
+         acc_2 += b2_tile
+
+     acc_1 = 1.0 / (1.0 + tl.exp(-acc_1))
+     acc_gated = acc_1 * acc_2
+
+     if APPLY_MASK:
+         mask = tl.load(mask_ptr + offs_om, mask=mask_m, other=0.0).to(tl.float32)
+         acc_gated = acc_gated * mask[:, None]
+
+     if TRANSPOSE_OUT:
+         o_ptrs = o_ptr + offs_on[None, :] * M + offs_om[:, None]
+     else:
+         o_ptrs = o_ptr + offs_om[:, None] * N + offs_on[None, :]
+
+     o_mask = offs_om[:, None] < M
+     tl.store(o_ptrs, acc_gated, mask=o_mask)
+
+
+ @triton.jit
+ def fused_sigmoid_gated_dual_gemm_backward_pregemm_kernel(
+     # inputs
+     grad_o_ptr,
+     x1_ptr,
+     x2_ptr,
+     w1_ptr,
+     w2_ptr,
+     b1_ptr,
+     b2_ptr,
+     mask_ptr,
+     M,
+     N,
+     K,
+     # outputs
+     grad_xw1_ptr,
+     grad_xw2_ptr,
+     grad_mask_ptr,
+     TILE_M: tl.constexpr,
+     TILE_N: tl.constexpr,
+     TILE_K: tl.constexpr,
+     PRECISION: tl.constexpr,
+     APPLY_MASK: tl.constexpr,
+     TRANSPOSE_OUT: tl.constexpr,
+     TWO_INPUTS: tl.constexpr,
+     HAS_B1: tl.constexpr,
+     HAS_B2: tl.constexpr,
+ ):
+     # fully gated GEMM kernel with optional mask at the end
+     pid_m = tl.program_id(axis=0)
+     pid_n = tl.program_id(axis=1)
+
+     start_m = pid_m * TILE_M
+     start_n = pid_n * TILE_N
+
+     offs_xm = start_m + tl.arange(0, TILE_M)
+     offs_wn = start_n + tl.arange(0, TILE_N)
+     offs_k = tl.arange(0, TILE_K)
+
+     x1_ptrs = x1_ptr + (offs_xm[:, None] * K + offs_k[None, :])
+     if TWO_INPUTS:
+         x2_ptrs = x2_ptr + (offs_xm[:, None] * K + offs_k[None, :])
+     w_tile_offs = offs_wn[None, :] * K + offs_k[:, None]
+
+     acc_1 = tl.zeros((TILE_M, TILE_N), dtype=tl.float32)
+     acc_2 = tl.zeros((TILE_M, TILE_N), dtype=tl.float32)
+
+     mask_m = offs_xm < M
+
+     if TWO_INPUTS:
+         # recompute acc1 and acc2
+         for _ in range(0, tl.cdiv(K, TILE_K)):
+             x1 = tl.load(x1_ptrs, mask=mask_m[:, None], other=0.0).to(
+                 w1_ptr.type.element_ty
+             )
+             w1_ptrs = w1_ptr + w_tile_offs
+             w1 = tl.load(w1_ptrs)
+
+             if PRECISION == 0:
+                 acc_1 = tl.dot(x1, w1, acc_1)
+             elif PRECISION == 1:
+                 x1 = cvt_tf32_rn(x1)
+                 w1 = cvt_tf32_rn(w1)
+                 acc_1 = tl.dot(x1, w1, acc_1, input_precision="tf32")
+             elif PRECISION == 2:
+                 acc_1 = tl.dot(x1, w1, acc_1, input_precision="tf32x3")
+             elif PRECISION == 3:
+                 acc_1 = tl.dot(x1, w1, acc_1, input_precision="ieee")
+             else:
+                 tl.static_assert(
+                     False,
+                     "PRECISION must be 0 (default), 1 (tf32), 2 (tf32x3) or 3 (ieee)",
+                 )
+
+             x1_ptrs += TILE_K
+             w1_ptr += TILE_K
+
+         for _ in range(0, tl.cdiv(K, TILE_K)):
+             x2 = tl.load(x2_ptrs, mask=mask_m[:, None], other=0.0).to(
+                 w2_ptr.type.element_ty
+             )
+             w2_ptrs = w2_ptr + w_tile_offs
+             w2 = tl.load(w2_ptrs)
+
+             if PRECISION == 0:
+                 acc_2 = tl.dot(x2, w2, acc_2)
+             elif PRECISION == 1:
+                 x2 = cvt_tf32_rn(x2)
+                 w2 = cvt_tf32_rn(w2)
+                 acc_2 = tl.dot(x2, w2, acc_2, input_precision="tf32")
+             elif PRECISION == 2:
+                 acc_2 = tl.dot(x2, w2, acc_2, input_precision="tf32x3")
+             elif PRECISION == 3:
+                 acc_2 = tl.dot(x2, w2, acc_2, input_precision="ieee")
+             else:
+                 tl.static_assert(
+                     False,
+                     "PRECISION must be 0 (default), 1 (tf32), 2 (tf32x3) or 3 (ieee)",
+                 )
+
+             x2_ptrs += TILE_K
+             w2_ptr += TILE_K
+
+     else:
+         # recompute acc1 and acc2
+         for _ in range(0, tl.cdiv(K, TILE_K)):
+             x = tl.load(x1_ptrs, mask=mask_m[:, None], other=0.0).to(
+                 w1_ptr.type.element_ty
+             )
+
+             w1_ptrs = w1_ptr + w_tile_offs
+             w1 = tl.load(w1_ptrs)
+             if PRECISION == 0:
+                 acc_1 = tl.dot(x, w1, acc_1)
+             elif PRECISION == 1:
+                 x = cvt_tf32_rn(x)
+                 w1 = cvt_tf32_rn(w1)
+                 acc_1 = tl.dot(x, w1, acc_1, input_precision="tf32")
+             elif PRECISION == 2:
+                 acc_1 = tl.dot(x, w1, acc_1, input_precision="tf32x3")
+             elif PRECISION == 3:
+                 acc_1 = tl.dot(x, w1, acc_1, input_precision="ieee")
+             else:
+                 tl.static_assert(
+                     False,
+                     "PRECISION must be 0 (default), 1 (tf32), 2 (tf32x3) or 3 (ieee)",
+                 )
+
+             w2_ptrs = w2_ptr + w_tile_offs
+             w2 = tl.load(w2_ptrs)
+             if PRECISION == 0:
+                 acc_2 = tl.dot(x, w2, acc_2)
+             elif PRECISION == 1:
+                 x = cvt_tf32_rn(x)
+                 w2 = cvt_tf32_rn(w2)
+                 acc_2 = tl.dot(x, w2, acc_2, input_precision="tf32")
+             elif PRECISION == 2:
+                 acc_2 = tl.dot(x, w2, acc_2, input_precision="tf32x3")
+             elif PRECISION == 3:
+                 acc_2 = tl.dot(x, w2, acc_2, input_precision="ieee")
+             else:
+                 tl.static_assert(
+                     False,
+                     "PRECISION must be 0 (default), 1 (tf32), 2 (tf32x3) or 3 (ieee)",
+                 )
+
+             x1_ptrs += TILE_K
+             w1_ptr += TILE_K
+             w2_ptr += TILE_K
+
+     offs_om = pid_m * TILE_M + tl.arange(0, TILE_M)
+     offs_on = pid_n * TILE_N + tl.arange(0, TILE_N)
+
+     if HAS_B1:
+         b1_ptrs = b1_ptr + offs_on
+         b1_tile = tl.load(b1_ptrs).to(tl.float32)
+         acc_1 += b1_tile
+
+     if HAS_B2:
+         b2_ptrs = b2_ptr + offs_on
+         b2_tile = tl.load(b2_ptrs).to(tl.float32)
+         acc_2 += b2_tile
+
+     if TRANSPOSE_OUT:
+         grad_o_ptrs = grad_o_ptr + offs_on[None, :] * M + offs_om[:, None]
+     else:
+         grad_o_ptrs = grad_o_ptr + offs_om[:, None] * N + offs_on[None, :]
+
+     grad_o = tl.load(grad_o_ptrs, mask=mask_m[:, None], other=0.0).to(tl.float32)
+
+     acc_sig = 1.0 / (1.0 + tl.exp(-acc_1))
+
+     if APPLY_MASK:
+         tmp = acc_sig * acc_2
+         grad_mask = grad_o * tmp
+         grad_mask = tl.sum(grad_mask, axis=1)
+         grad_mask_ptrs = grad_mask_ptr + pid_n * M + offs_om
+         tl.store(grad_mask_ptrs, grad_mask, mask=mask_m)
+
+         mask = tl.load(mask_ptr + offs_om, mask=mask_m, other=0.0).to(tl.float32)
+         grad_o = grad_o * mask[:, None]
+
+     grad_xw2 = grad_o * acc_sig
+     grad_xw2_ptrs = grad_xw2_ptr + offs_om[:, None] * N + offs_on[None, :]
+     tl.store(grad_xw2_ptrs, grad_xw2, mask=mask_m[:, None])
+
+     tmp = (1.0 - acc_sig) * acc_sig
+     grad_xw1 = grad_o * acc_2 * tmp
+     grad_xw1_ptrs = grad_xw1_ptr + offs_om[:, None] * N + offs_on[None, :]
+     tl.store(grad_xw1_ptrs, grad_xw1, mask=mask_m[:, None])
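
For reference, the forward kernel above computes a sigmoid-gated dual GEMM, out = sigmoid(x1 @ w1.T + b1) * (x2 @ w2.T + b2) (with x2 reused as x1 when TWO_INPUTS is off), followed by an optional per-row mask and an optional transposed store; the backward pre-GEMM kernel recomputes both accumulators and produces the gradients with respect to the two GEMM outputs and the mask. The following is a minimal PyTorch sketch of that math, not code shipped in the wheel; the function names, dense layouts (x: M x K, w: N x K, mask: M) and eager style are assumptions made here for illustration.

import torch

def fused_sigmoid_gated_dual_gemm_reference(x1, x2, w1, w2, b1=None, b2=None,
                                            mask=None, transpose_out=False):
    # Hypothetical reference mirroring the forward kernel's arguments.
    # x1, x2: (M, K); w1, w2: (N, K); b1, b2: (N,); mask: (M,)
    acc_1 = x1 @ w1.T
    acc_2 = (x2 if x2 is not None else x1) @ w2.T  # TWO_INPUTS=False reuses x1
    if b1 is not None:
        acc_1 = acc_1 + b1
    if b2 is not None:
        acc_2 = acc_2 + b2
    out = torch.sigmoid(acc_1) * acc_2             # gate the second GEMM by the sigmoid of the first
    if mask is not None:
        out = out * mask[:, None]                  # APPLY_MASK: per-row mask
    return out.T if transpose_out else out         # TRANSPOSE_OUT stores the result as (N, M)

def fused_sigmoid_gated_dual_gemm_pregemm_grads(grad_o, x1, x2, w1, w2,
                                                b1=None, b2=None, mask=None):
    # Gradients w.r.t. the two GEMM outputs, matching what the backward pre-GEMM kernel writes out.
    acc_1 = x1 @ w1.T + (b1 if b1 is not None else 0.0)
    acc_2 = (x2 if x2 is not None else x1) @ w2.T + (b2 if b2 is not None else 0.0)
    sig = torch.sigmoid(acc_1)
    grad_mask = None
    if mask is not None:
        grad_mask = (grad_o * sig * acc_2).sum(dim=1)  # the kernel stores per-N-tile partial sums instead
        grad_o = grad_o * mask[:, None]
    grad_xw2 = grad_o * sig                        # d out / d acc_2
    grad_xw1 = grad_o * acc_2 * sig * (1.0 - sig)  # d out / d acc_1 through the sigmoid
    return grad_xw1, grad_xw2, grad_mask

Downstream, grad_xw1 and grad_xw2 would presumably be contracted with the inputs and weights to obtain input and weight gradients; those GEMMs are not part of this "pregemm" kernel.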
cuequivariance_ops/triton/pair_bias.py
@@ -0,0 +1,324 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+ #
+ # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ # property and proprietary rights in and to this material, related
+ # documentation and any modifications thereto. Any use, reproduction,
+ # disclosure or distribution of this material and related documentation
+ # without an express license agreement from NVIDIA CORPORATION or
+ # its affiliates is strictly prohibited.
+
+ import triton
+ import triton.language as tl
+
+
+ @triton.jit
+ def pair_bias_norm_linear_mask_forward_kernel(
+     z_ptr,
+     mask_ptr,
+     w_proj_z_ptr,
+     b_proj_z_ptr,
+     w_ln_ptr,
+     b_ln_ptr,
+     U,
+     V,
+     multiplicity,
+     out_mask_ptr,
+     z_norm_ptr,
+     mean_ptr,
+     rstd_ptr,
+     TILE_V: tl.constexpr,
+     TILE_K: tl.constexpr,
+     NUM_HEADS: tl.constexpr,
+     NUM_HEADS_PER_BLK: tl.constexpr,
+     DIM_Z: tl.constexpr,
+     INF: tl.constexpr,
+     EPS: tl.constexpr,
+     ELEMENTWISE_AFFINE: tl.constexpr,
+     IS_TRAINING: tl.constexpr,
+     HAS_BIAS: tl.constexpr,
+     MASK_WITH_MULTIPLICITY: tl.constexpr,
+ ):
+     # prepare single mask
+     # z: B x U x V x D -> z' B x H x U x V
+     # mask: B x V
+     # out_mask = z' + (1 - mask) * inf
+
+     pid_v = tl.program_id(0)
+     pid_u = tl.program_id(1)
+     head_batch_idx = tl.program_id(2)
+     NUM_HEAD_BLKS = tl.cdiv(NUM_HEADS, NUM_HEADS_PER_BLK)
+     batch_idx = head_batch_idx // NUM_HEAD_BLKS
+     head_idx = head_batch_idx % NUM_HEAD_BLKS
+
+     stride_vz = V * DIM_Z
+     stride_uv = U * V
+     stride_uvz = U * V * DIM_Z
+
+     offs_u = pid_u
+     offs_v = pid_v * TILE_V + tl.arange(0, TILE_V)
+     offs_k = tl.arange(0, TILE_K)
+     offs_z = tl.arange(0, DIM_Z)
+     mask_v = offs_v < V
+     offs_head = head_idx * NUM_HEADS_PER_BLK + tl.arange(0, NUM_HEADS_PER_BLK)
+     mask_head = offs_head < NUM_HEADS
+
+     z_ptrs = z_ptr + batch_idx * stride_uvz + offs_u * stride_vz
+     z_ptrs += offs_v[:, None] * DIM_Z + offs_z[None, :]
+
+     z_tile_full = tl.load(z_ptrs, mask=mask_v[:, None], other=0.0).to(tl.float32)
+
+     mean = tl.sum(z_tile_full, axis=1) / DIM_Z
+     rstd = z_tile_full - mean[:, None]
+     rstd = rstd * rstd
+     rstd = tl.sum(rstd, axis=1) / DIM_Z
+     rstd = tl.rsqrt(rstd + EPS)
+
+     if IS_TRAINING:
+         mean_ptrs = mean_ptr + batch_idx * stride_uv
+         mean_ptrs += offs_u * V + offs_v
+         tl.store(mean_ptrs, mean, mask=mask_v)
+
+         rstd_ptrs = rstd_ptr + batch_idx * stride_uv
+         rstd_ptrs += offs_u * V + offs_v
+         tl.store(rstd_ptrs, rstd, mask=mask_v)
+
+     z_ptrs = z_ptr + batch_idx * stride_uvz + offs_u * stride_vz
+     z_ptrs += offs_v[:, None] * DIM_Z + offs_k[None, :]
+     w_ln_ptrs = w_ln_ptr + offs_k
+     b_ln_ptrs = b_ln_ptr + offs_k
+     w_proj_ptrs = w_proj_z_ptr + (offs_head[None, :] * DIM_Z + offs_k[:, None])
+
+     if IS_TRAINING:
+         z_norm_ptrs = z_norm_ptr + batch_idx * stride_uvz + offs_u * stride_vz
+         z_norm_ptrs += offs_v[:, None] * DIM_Z + offs_k[None, :]
+
+     num_tiles_k = DIM_Z // TILE_K
+     acc = tl.zeros((TILE_V, NUM_HEADS_PER_BLK), dtype=tl.float32)
+
+     for _ in range(0, num_tiles_k):
+         z_tile = tl.load(z_ptrs, mask=mask_v[:, None], other=0.0).to(tl.float32)
+         z_tile = (z_tile - mean[:, None]) * rstd[:, None]
+
+         if ELEMENTWISE_AFFINE:
+             w_ln_tile = tl.load(w_ln_ptrs).to(tl.float32)
+             b_ln_tile = tl.load(b_ln_ptrs).to(tl.float32)
+             z_tile = z_tile * w_ln_tile + b_ln_tile
+
+         if IS_TRAINING:
+             tl.store(z_norm_ptrs, z_tile, mask=mask_v[:, None])
+
+         w_tile = tl.load(w_proj_ptrs, mask=mask_head[None, :], other=0.0).to(tl.float32)
+
+         acc = tl.dot(z_tile, w_tile, acc, input_precision="tf32x3")
+
+         z_ptrs += TILE_K
+         w_proj_ptrs += TILE_K
+         if ELEMENTWISE_AFFINE:
+             w_ln_ptrs += TILE_K
+             b_ln_ptrs += TILE_K
+         if IS_TRAINING:
+             z_norm_ptrs += TILE_K
+
+     if HAS_BIAS:
+         b_proj_ptrs = b_proj_z_ptr + offs_head
+         b_proj_tile = tl.load(b_proj_ptrs, mask=mask_head, other=0.0).to(tl.float32)
+         acc += b_proj_tile[None, :]
+
+     offs_v = pid_v * TILE_V + tl.arange(0, TILE_V)
+     mask_v = offs_v < V
+     offs_head = head_idx * NUM_HEADS_PER_BLK + tl.arange(0, NUM_HEADS_PER_BLK)
+     mask_head = offs_head < NUM_HEADS
+
+     out_mask_ptrs = out_mask_ptr + batch_idx * multiplicity * NUM_HEADS * stride_uv
+     out_mask_ptrs += offs_u * V
+     out_mask_ptrs += offs_head[None, :] * stride_uv + offs_v[:, None]
+     mask_o = mask_head[None, :] & mask_v[:, None]
+
+     if MASK_WITH_MULTIPLICITY:
+         mask_ptrs = mask_ptr + batch_idx * multiplicity * V + offs_v
+
+         for _ in range(multiplicity):
+             mask_tile = tl.load(mask_ptrs, mask=mask_v, other=0.0).to(tl.float32)
+             out_tile = acc + (1.0 - mask_tile[:, None]) * (-INF)
+             tl.store(out_mask_ptrs, out_tile, mask=mask_o)
+
+             out_mask_ptrs += NUM_HEADS * stride_uv
+             mask_ptrs += V
+
+     else:
+         mask_ptrs = mask_ptr + batch_idx * V + offs_v
+         mask_tile = tl.load(mask_ptrs, mask=mask_v, other=0.0).to(tl.float32)
+
+         for _ in range(multiplicity):
+             out_tile = acc + (1.0 - mask_tile[:, None]) * (-INF)
+             tl.store(out_mask_ptrs, out_tile, mask=mask_o)
+
+             out_mask_ptrs += NUM_HEADS * stride_uv
+             mask_ptrs += V
+
+
+ @triton.jit
+ def pair_bias_linear_mask_forward_kernel(
+     z_ptr,
+     mask_ptr,
+     w_proj_z_ptr,
+     b_proj_z_ptr,
+     U,
+     V,
+     multiplicity,
+     out_mask_ptr,
+     TILE_V: tl.constexpr,
+     TILE_K: tl.constexpr,
+     NUM_HEADS: tl.constexpr,
+     NUM_HEADS_PER_BLK: tl.constexpr,
+     DIM_Z: tl.constexpr,
+     INF: tl.constexpr,
+     HAS_BIAS: tl.constexpr,
+     MASK_WITH_MULTIPLICITY: tl.constexpr,
+ ):
+     # prepare single mask
+     # z: B x U x V x D -> z' B x H x U x V
+     # mask: B x V
+     # out_mask = z' + (1 - mask) * inf
+
+     pid_v = tl.program_id(0)
+     pid_u = tl.program_id(1)
+     head_batch_idx = tl.program_id(2)
+     NUM_HEAD_BLKS = tl.cdiv(NUM_HEADS, NUM_HEADS_PER_BLK)
+     batch_idx = head_batch_idx // NUM_HEAD_BLKS
+     head_idx = head_batch_idx % NUM_HEAD_BLKS
+
+     stride_vz = V * DIM_Z
+     stride_uv = U * V
+     stride_uvz = U * V * DIM_Z
+
+     offs_u = pid_u
+     offs_v = pid_v * TILE_V + tl.arange(0, TILE_V)
+     offs_k = tl.arange(0, TILE_K)
+     mask_v = offs_v < V
+     offs_head = head_idx * NUM_HEADS_PER_BLK + tl.arange(0, NUM_HEADS_PER_BLK)
+     mask_head = offs_head < NUM_HEADS
+
+     z_ptrs = z_ptr + batch_idx * stride_uvz + offs_u * stride_vz
+     z_ptrs += offs_v[:, None] * DIM_Z + offs_k[None, :]
+
+     w_ptrs = w_proj_z_ptr + (offs_head[None, :] * DIM_Z + offs_k[:, None])
+
+     acc = tl.zeros((TILE_V, NUM_HEADS_PER_BLK), dtype=tl.float32)
+
+     for _ in range(0, DIM_Z // TILE_K):
+         z_tile = tl.load(z_ptrs, mask=mask_v[:, None], other=0.0).to(tl.float32)
+         w_tile = tl.load(w_ptrs, mask=mask_head[None, :], other=0.0).to(tl.float32)
+
+         acc = tl.dot(z_tile, w_tile, acc, input_precision="tf32x3")
+
+         z_ptrs += TILE_K
+         w_ptrs += TILE_K
+
+     if HAS_BIAS:
+         b_proj_ptrs = b_proj_z_ptr + offs_head
+         b_proj_tile = tl.load(b_proj_ptrs, mask=mask_head, other=0.0)
+         acc += b_proj_tile[None, :]
+
+     offs_v = pid_v * TILE_V + tl.arange(0, TILE_V)
+     mask_v = offs_v < V
+     offs_head = head_idx * NUM_HEADS_PER_BLK + tl.arange(0, NUM_HEADS_PER_BLK)
+     mask_head = offs_head < NUM_HEADS
+
+     out_mask_ptrs = out_mask_ptr + batch_idx * multiplicity * NUM_HEADS * stride_uv
+     out_mask_ptrs += offs_u * V
+     out_mask_ptrs += offs_head[None, :] * stride_uv + offs_v[:, None]
+     mask_o = mask_head[None, :] & mask_v[:, None]
+
+     if MASK_WITH_MULTIPLICITY:
+         mask_ptrs = mask_ptr + batch_idx * multiplicity * V + offs_v
+
+         for _ in range(multiplicity):
+             mask_tile = tl.load(mask_ptrs, mask=mask_v, other=0.0).to(tl.float32)
+             out_tile = acc + (1.0 - mask_tile[:, None]) * (-INF)
+             tl.store(out_mask_ptrs, out_tile, mask=mask_o)
+
+             out_mask_ptrs += NUM_HEADS * stride_uv
+             mask_ptrs += V
+
+     else:
+         mask_ptrs = mask_ptr + batch_idx * V + offs_v
+         mask_tile = tl.load(mask_ptrs, mask=mask_v, other=0.0).to(tl.float32)
+
+         for _ in range(multiplicity):
+             out_tile = acc + (1.0 - mask_tile[:, None]) * (-INF)
+             tl.store(out_mask_ptrs, out_tile, mask=mask_o)
+
+             out_mask_ptrs += NUM_HEADS * stride_uv
+             mask_ptrs += V
+
+
+ @triton.jit
+ def pair_bias_mask_forward_kernel(
+     z_ptr,
+     mask_ptr,
+     U,
+     V,
+     multiplicity,
+     out_mask_ptr,
+     TILE_V: tl.constexpr,
+     NUM_HEADS: tl.constexpr,
+     NUM_HEADS_PER_BLK: tl.constexpr,
+     INF: tl.constexpr,
+     MASK_WITH_MULTIPLICITY: tl.constexpr,
+ ):
+     # prepare single mask
+     # z: B x U x V x H -> z' B x H x U x V
+     # mask: B x V
+     # out_mask = z' + (1 - mask) * inf
+
+     pid_v = tl.program_id(0)
+     pid_u = tl.program_id(1)
+     head_batch_idx = tl.program_id(2)
+     NUM_HEAD_BLKS = tl.cdiv(NUM_HEADS, NUM_HEADS_PER_BLK)
+     batch_idx = head_batch_idx // NUM_HEAD_BLKS
+     head_idx = head_batch_idx % NUM_HEAD_BLKS
+
+     stride_nh = V * NUM_HEADS
+     stride_uv = U * V
+     stride_uvh = U * V * NUM_HEADS
+
+     offs_u = pid_u
+     offs_v = pid_v * TILE_V + tl.arange(0, TILE_V)
+     mask_v = offs_v < V
+     offs_head = head_idx * NUM_HEADS_PER_BLK + tl.arange(0, NUM_HEADS_PER_BLK)
+     mask_head = offs_head < NUM_HEADS
+
+     z_ptrs = z_ptr + batch_idx * stride_uvh + offs_u * stride_nh
+     z_ptrs += offs_v[:, None] * NUM_HEADS + offs_head[None, :]
+
+     mask_zo = mask_v[:, None] & mask_head[None, :]
+
+     z_tile = tl.load(z_ptrs, mask=mask_zo, other=0.0).to(tl.float32)
+
+     out_mask_ptrs = out_mask_ptr + batch_idx * multiplicity * NUM_HEADS * stride_uv
+     out_mask_ptrs += offs_u * V
+     out_mask_ptrs += offs_head[None, :] * stride_uv + offs_v[:, None]
+
+     if MASK_WITH_MULTIPLICITY:
+         mask_ptrs = mask_ptr + batch_idx * multiplicity * V + offs_v
+
+         for _ in range(multiplicity):
+             mask_tile = tl.load(mask_ptrs, mask=mask_v, other=0.0).to(tl.float32)
+             out_tile = z_tile + (1.0 - mask_tile[:, None]) * (-INF)
+             tl.store(out_mask_ptrs, out_tile, mask=mask_zo)
+
+             out_mask_ptrs += NUM_HEADS * stride_uv
+             mask_ptrs += V
+
+     else:
+         mask_ptrs = mask_ptr + batch_idx * V + offs_v
+         mask_tile = tl.load(mask_ptrs, mask=mask_v, other=0.0).to(tl.float32)
+
+         for _ in range(multiplicity):
+             out_tile = z_tile + (1.0 - mask_tile[:, None]) * (-INF)
+             tl.store(out_mask_ptrs, out_tile, mask=mask_zo)
+
+             out_mask_ptrs += NUM_HEADS * stride_uv
+             mask_ptrs += V
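
The three kernels above are variants of the same pair-bias mask preparation: project the pair representation z to one bias value per attention head (optionally after a LayerNorm over its last dimension, or take it as an already per-head bias), transpose it to B x H x U x V, replicate it multiplicity times, and add (1 - mask) * (-INF) so that masked V positions are suppressed in a subsequent softmax. Below is a minimal PyTorch sketch of the full norm + linear + mask variant, assuming the layouts implied by the pointer arithmetic (z: B x U x V x D, w_proj_z: H x D, mask: B x V or B x multiplicity x V, output: B x multiplicity x H x U x V); the helper name and the eps/inf defaults are assumptions made here, not part of the package.

import torch
import torch.nn.functional as F

def pair_bias_norm_linear_mask_reference(z, mask, w_proj_z, b_proj_z=None,
                                         w_ln=None, b_ln=None,
                                         multiplicity=1, eps=1e-5, inf=1e9):
    # z: (B, U, V, D); mask: (B, V) or (B, multiplicity, V); w_proj_z: (H, D)
    B, U, V, D = z.shape
    H = w_proj_z.shape[0]
    # LayerNorm over the last dim; ELEMENTWISE_AFFINE corresponds to passing w_ln/b_ln
    z_norm = F.layer_norm(z, (D,), weight=w_ln, bias=b_ln, eps=eps)
    bias = z_norm @ w_proj_z.T                     # (B, U, V, H)
    if b_proj_z is not None:                       # HAS_BIAS
        bias = bias + b_proj_z
    bias = bias.permute(0, 3, 1, 2)                # (B, H, U, V)
    bias = bias.unsqueeze(1).expand(B, multiplicity, H, U, V)
    if mask.dim() == 2:                            # one mask shared across all multiplicity copies
        mask = mask[:, None, :].expand(B, multiplicity, V)
    # additive attention mask: masked-out V positions are pushed towards -inf
    return bias + (1.0 - mask)[:, :, None, None, :] * (-inf)

pair_bias_linear_mask_forward_kernel skips the LayerNorm, and pair_bias_mask_forward_kernel skips the projection as well (its z is already B x U x V x H); in training mode the first kernel additionally writes out mean, rstd and the normalized z for use in the backward pass.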