fbgemm-gpu-genai-nightly 2025.10.20-cp312-cp312-manylinux_2_28_x86_64.whl → 2026.1.8-cp312-cp312-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fbgemm_gpu/__init__.py +3 -1
- fbgemm_gpu/config/feature_list.py +3 -0
- fbgemm_gpu/docs/target.genai.json.py +1 -1
- fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
- fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py +4 -3
- fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +11 -1
- fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +135 -172
- fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py +15 -1
- fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +75 -3
- fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +299 -61
- fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +11 -8
- fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
- fbgemm_gpu/fbgemm.so +0 -0
- fbgemm_gpu/quantize_comm.py +15 -4
- fbgemm_gpu/quantize_utils.py +54 -6
- fbgemm_gpu/sparse_ops.py +53 -0
- fbgemm_gpu/split_embedding_configs.py +34 -0
- fbgemm_gpu/split_table_batched_embeddings_ops_common.py +53 -11
- fbgemm_gpu/split_table_batched_embeddings_ops_training.py +470 -94
- fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
- fbgemm_gpu/tbe/bench/bench_runs.py +7 -0
- fbgemm_gpu/tbe/bench/tbe_data_config.py +15 -1
- fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +6 -1
- fbgemm_gpu/tbe/ssd/training.py +335 -50
- fbgemm_gpu/tbe/stats/bench_params_reporter.py +5 -2
- fbgemm_gpu/triton/quantize.py +13 -7
- fbgemm_gpu/utils/writeback_util.py +124 -0
- {fbgemm_gpu_genai_nightly-2025.10.20.dist-info → fbgemm_gpu_genai_nightly-2026.1.8.dist-info}/METADATA +2 -2
- {fbgemm_gpu_genai_nightly-2025.10.20.dist-info → fbgemm_gpu_genai_nightly-2026.1.8.dist-info}/RECORD +31 -30
- {fbgemm_gpu_genai_nightly-2025.10.20.dist-info → fbgemm_gpu_genai_nightly-2026.1.8.dist-info}/WHEEL +0 -0
- {fbgemm_gpu_genai_nightly-2025.10.20.dist-info → fbgemm_gpu_genai_nightly-2026.1.8.dist-info}/top_level.txt +0 -0
fbgemm_gpu/__init__.py
CHANGED
@@ -15,6 +15,7 @@ import torch
 # Based on the FBGEMM-PyTorch compatibility table at
 # https://docs.pytorch.org/FBGEMM/general/Releases.html#fbgemm-releases-compatibility
 _fbgemm_torch_compat_table = {
+    "1.5": "2.10",
     "1.4": "2.9",
     "1.3": "2.8",
     "1.2": "2.7",
@@ -81,7 +82,7 @@ def _load_library(filename: str, version: str, no_throw: bool = False) -> None:
         """
         )

-    elif str(torch.__version__)
+    elif not str(torch.__version__).startswith(_fbgemm_torch_compat_table[keys[0]]):
         logging.warning(
             f"""
             \033[31m
@@ -132,6 +133,7 @@ fbgemm_gpu_libraries = [
     "fbgemm_gpu_config",
     "fbgemm_gpu_tbe_utils",
     "fbgemm_gpu_tbe_index_select",
+    "fbgemm_gpu_tbe_cache",
     "fbgemm_gpu_tbe_optimizers",
     "fbgemm_gpu_tbe_inference",
     "fbgemm_gpu_tbe_training_forward",
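The hunks above add a "1.5" → "2.10" row to the compatibility table and make the loader warn whenever the installed torch version does not start with the PyTorch version mapped to the newest FBGEMM release. Below is a minimal sketch of that check; it is not the packaged code, and the assumption that `keys[0]` is the newest table entry is taken from the hunk above.

```python
# Minimal sketch of the compatibility check, assuming keys[0] is the newest
# FBGEMM release in the table (as in the hunk above); not the packaged code.
_fbgemm_torch_compat_table = {
    "1.5": "2.10",
    "1.4": "2.9",
}

def is_torch_compatible(torch_version: str) -> bool:
    keys = list(_fbgemm_torch_compat_table.keys())
    # The loader logs a red warning when this returns False.
    return torch_version.startswith(_fbgemm_torch_compat_table[keys[0]])

print(is_torch_compatible("2.10.0+cu128"))  # True
print(is_torch_compatible("2.9.1"))         # False -> warning path
```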
fbgemm_gpu/config/feature_list.py
CHANGED
@@ -63,6 +63,9 @@ class FeatureGateName(Enum):
     # Enable TBE input parameters extraction
     TBE_REPORT_INPUT_PARAMS = auto()

+    # Enable tuned max segment length per CTA for B200
+    TBE_USE_TUNED_SEGMENT_LENGTHS_CTA_B200 = auto()
+
     def is_enabled(self) -> bool:
         return FeatureGate.is_enabled(self)

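The new gate is queried through the existing `is_enabled()` path shown above. A hedged usage sketch follows; the import of `FeatureGateName` from `fbgemm_gpu.config` is an assumption based on this file's location.

```python
# Hedged sketch: querying the new feature gate. Assumes FeatureGateName is
# importable from fbgemm_gpu.config, where feature_list.py lives.
from fbgemm_gpu.config import FeatureGateName

if FeatureGateName.TBE_USE_TUNED_SEGMENT_LENGTHS_CTA_B200.is_enabled():
    # Downstream TBE code would pick the tuned max segment length per CTA
    # on B200 when this gate is on.
    pass
```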
fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so
CHANGED
Binary file
fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py
CHANGED
@@ -289,7 +289,7 @@ def triton_quantize_mx4_unpack(
         stochastic_casting (bool): Whether to use stochastic casting.

     Returns:
-        torch.Tensor: [M / 2] mx4 scaled tensor packed into
+        torch.Tensor: [M / 2] mx4 scaled tensor packed into uint8
         torch.Tensor: [M / group_size] mx4 shared exponents into int8

         eg.
@@ -1410,8 +1410,9 @@ def _kernel_nvfp4_quantize(

     # Apply scale_ to input. We do this by broadcasting scale.
     # scaled_a = a * global_scale (fp32) / local_scale (fp8)
-    scaled_a = tl.
-
+    scaled_a = tl.div_rn(
+        tl.reshape(a, [GROUP_LOAD, GROUP_SIZE]).to(tl.float32),
+        tl.reshape(scale_ / input_global_scale, [GROUP_LOAD, 1]).to(tl.float32),
     )
     # Reshape back to a flat array.
     scaled_a = tl.reshape(scaled_a, [GROUP_LOAD * GROUP_SIZE])
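The docstring fix pins the packed output dtype to uint8, and the kernel change performs the per-group scaling as a single round-to-nearest fp32 division, `a / (local_scale / global_scale)`, as described by the comment in the hunk. A plain-torch sketch of that arithmetic follows; the shapes and names mirror the kernel but are assumptions for illustration only.

```python
# Illustrative plain-torch version of the per-group scaling in
# _kernel_nvfp4_quantize; GROUP_SIZE and the tensor shapes are assumptions.
import torch

GROUP_SIZE = 16
a = torch.randn(4, GROUP_SIZE, dtype=torch.float32)  # one group per row
local_scale = torch.rand(4, 1) + 0.5                 # per-group scale (fp8 in the kernel)
input_global_scale = torch.tensor(1.5)               # global fp32 scale

# Equivalent to a * global_scale / local_scale, done as one fp32 division
scaled_a = a / (local_scale / input_global_scale)
```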
fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py
CHANGED
@@ -1212,6 +1212,8 @@ def matmul_fp8_row(
     imprecise_acc: bool = False,
     tma_persistent: bool = True,
     no_use_persistent: Optional[bool] = None,
+    # add an option to explicitly require the use of persistent process
+    use_persistent: Optional[bool] = None,
     use_warp_specialization: bool = False,
 ) -> torch.Tensor:
     """
@@ -1232,12 +1234,16 @@ def matmul_fp8_row(
     Returns:
         torch.Tensor: [M, N] Output tensor a @ b / (a_scale[:, None] * b_scale[None, :])
     """
-    if
+    if use_persistent:
+        no_use_persistent = False
+    elif no_use_persistent is None:
         # Default True for AMD and False for Nvidia.
         if torch.version.hip is not None:
             no_use_persistent = True
         else:
             no_use_persistent = False
+    # if use_persistent is explicitly requested, set o_use_persistent to False
+
     # Get datatypes and constants to use.
     pt_fp8_dtype, _, _, _ = get_fp8_constants()
     # Handle 3D+ a shape
@@ -3840,6 +3846,10 @@ _MATMUL_CONFIG_TUPLES_PINGPONG_4K_8K_16K = [
     (256, 128, 128, 1, 1, 2, 16, 1, 8, 2),
     (128, 256, 128, 2, 1, 2, 16, 2, 4, 1),
     (256, 128, 64, 2, 1, 2, 16, 1, 4, 2),
+    (128, 128, 256, 2, 1, 0, 16, 2, 8, 2),
+    (128, 64, 128, 2, 1, 2, 16, 2, 4, 2),
+    (128, 128, 64, 2, 1, 0, 16, 1, 4, 2),
+    (128, 128, 128, 1, 1, 2, 16, 1, 4, 2),
 ]

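The new `use_persistent` keyword lets callers force the persistent kernel even where the default would disable it (the default picks the non-persistent path on AMD via `no_use_persistent=True`). A hedged usage sketch follows; the `quantize_fp8_row` helper and the exact positional signature of `matmul_fp8_row` are assumptions based on this module, not something this diff guarantees.

```python
# Hedged sketch: forcing the persistent path via the new keyword.
import torch
from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
    matmul_fp8_row,
    quantize_fp8_row,  # assumed helper from the same module
)

a = torch.randn(1024, 512, device="cuda", dtype=torch.bfloat16)
b = torch.randn(2048, 512, device="cuda", dtype=torch.bfloat16)
a_fp8, a_scale = quantize_fp8_row(a)
b_fp8, b_scale = quantize_fp8_row(b)

# use_persistent=True overrides the platform default (non-persistent on AMD).
out = matmul_fp8_row(a_fp8, b_fp8, a_scale, b_scale, use_persistent=True)
```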
fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py
CHANGED
@@ -509,14 +509,13 @@ def _fbgemm_grouped_gemm_ws(
         num_tiles = num_m_tiles * NUM_N_TILES

         if USE_TMA_STORE:
-
-
-
-
-
-
-
-            )
+            c_desc_ptr = tl.make_tensor_descriptor(
+                c_ptr + M_start_offset * N,
+                shape=[m_size, N],
+                # pyre-ignore
+                strides=[N, 1],
+                block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
+            )

         # Move across tiles
         next_iterated_tiles = iterated_tiles + num_tiles
@@ -534,72 +533,59 @@ def _fbgemm_grouped_gemm_ws(
             m_offset = (M_start_offset + tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
             n_offset = (N_start_offset + tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
             for k_offset in range(0, K, BLOCK_SIZE_K):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                else:
-                    accumulator += tl.dot(a, b.T)
+                a = tl._experimental_descriptor_load(
+                    a_desc_ptr,
+                    [m_offset, k_offset],
+                    [BLOCK_SIZE_M, BLOCK_SIZE_K],
+                    dtype,
+                )
+                b = tl._experimental_descriptor_load(
+                    b_desc_ptr,
+                    [n_offset, k_offset],
+                    [BLOCK_SIZE_N, BLOCK_SIZE_K],
+                    dtype,
+                )
+                if USE_FAST_ACCUM:
+                    accumulator = tl.dot(a, b.T, accumulator)
+                else:
+                    accumulator += tl.dot(a, b.T)

             if USE_TMA_STORE:
-
-
-
-
-
-
-
-                )
+                m_offset = (tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
+                n_offset = (tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
+                # pyre-ignore
+                c_desc_ptr.store(
+                    [m_offset, n_offset],
+                    accumulator.to(c_ptr.dtype.element_ty),
+                )
             elif FUSE_SCATTER_ADD:
-
-
-
-
-                mask
-
-
-
-
-
-
-
-
-
-
-                    c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
-                    c,
-                    mask=mask[:, None],
-                    sem="relaxed",
-                )
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                mask = offs_am < m_size
+                m_offsets = tl.load(
+                    scatter_add_indices + M_start_offset + offs_am,
+                    mask=mask,
+                    cache_modifier=".ca",
+                )
+                offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                c = accumulator.to(c_ptr.dtype.element_ty)
+                tl.atomic_add(
+                    c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
+                    c,
+                    mask=mask[:, None],
+                    sem="relaxed",
+                )
             else:
-
-
-
-
-
-
-
-                c
-
-
-
-                    + offs_bn[None, :],
-                    c,
-                    mask=offs_am[:, None] < m_size,
-                    cache_modifier=".cs",
-                )
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                c = accumulator.to(c_ptr.dtype.element_ty)
+                tl.store(
+                    c_ptr
+                    + (M_start_offset + offs_am[:, None]) * N
+                    + offs_bn[None, :],
+                    c,
+                    mask=offs_am[:, None] < m_size,
+                    cache_modifier=".cs",
+                )
             tidx += NUM_SMS

         iterated_tiles += num_tiles
@@ -841,14 +827,13 @@ def _fbgemm_grouped_gemm_fp8_rowwise_ws(
         num_tiles = num_m_tiles * NUM_N_TILES

         if USE_TMA_STORE:
-
-
-
-
-
-
-
-            )
+            c_desc_ptr = tl.make_tensor_descriptor(
+                c_ptr + M_start_offset * N,
+                shape=[m_size, N],
+                # pyre-ignore
+                strides=[N, 1],
+                block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
+            )

         # Move across tiles
         next_iterated_tiles = iterated_tiles + num_tiles
@@ -867,107 +852,85 @@ def _fbgemm_grouped_gemm_fp8_rowwise_ws(
             m_offset = (M_start_offset + tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
             n_offset = (N_start_offset + tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
             for k_offset in range(0, K, BLOCK_SIZE_K):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                else:
-                    accumulator += tl.dot(a, b.T)
+                a = tl._experimental_descriptor_load(
+                    a_desc_ptr,
+                    [m_offset, k_offset],
+                    [BLOCK_SIZE_M, BLOCK_SIZE_K],
+                    dtype,
+                )
+                b = tl._experimental_descriptor_load(
+                    b_desc_ptr,
+                    [n_offset, k_offset],
+                    [BLOCK_SIZE_N, BLOCK_SIZE_K],
+                    dtype,
+                )
+                if USE_FAST_ACCUM:
+                    accumulator = tl.dot(a, b.T, accumulator)
+                else:
+                    accumulator += tl.dot(a, b.T)

             if USE_TMA_LOAD_ON_SCALES:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    mask=offs_am[:, None] < m_size,
-                    cache_modifier=".ca",
-                )
-                c = accumulator.to(tl.float32) * a_scale * b_scale[None, :]
+                b_scale = tl._experimental_descriptor_load(
+                    b_scale_desc_ptr,
+                    [n_offset],
+                    [BLOCK_SIZE_N],
+                    tl.float32,
+                )
+
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                a_scale = tl.load(
+                    a_scale_ptr + M_start_offset + offs_am[:, None],
+                    mask=offs_am[:, None] < m_size,
+                    cache_modifier=".ca",
+                )
+                c = accumulator.to(tl.float32) * a_scale * b_scale[None, :]
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-                b_scale = tl.load(
-                    b_scale_ptr + N_start_offset + offs_bn[None, :],
-                    cache_modifier=".ca",
-                )
-                c = accumulator.to(tl.float32) * a_scale * b_scale
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                a_scale = tl.load(
+                    a_scale_ptr + M_start_offset + offs_am[:, None],
+                    mask=offs_am[:, None] < m_size,
+                    cache_modifier=".ca",
+                )
+                b_scale = tl.load(
+                    b_scale_ptr + N_start_offset + offs_bn[None, :],
+                    cache_modifier=".ca",
+                )
+                c = accumulator.to(tl.float32) * a_scale * b_scale

             if USE_TMA_STORE:
-
-
-
-
-
-
-                )
+                m_offset = (tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
+                n_offset = (tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
+                # pyre-ignore
+                c_desc_ptr.store(
+                    [m_offset, n_offset], c.to(c_ptr.dtype.element_ty)
+                )
             elif FUSE_SCATTER_ADD:
-
-
-
-
-                mask
-
-
-
-
-
-
-
-
-
-                    c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
-                    c,
-                    mask=mask[:, None],
-                    sem="relaxed",
-                )
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                mask = offs_am < m_size
+                m_offsets = tl.load(
+                    scatter_add_indices + M_start_offset + offs_am,
+                    mask=mask,
+                    cache_modifier=".ca",
+                )
+                offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                tl.atomic_add(
+                    c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
+                    c,
+                    mask=mask[:, None],
+                    sem="relaxed",
+                )
             else:
-
-
-
-
-
-
-
-
-
-
-                    + offs_bn[None, :],
-                    c,
-                    mask=offs_am[:, None] < m_size,
-                    cache_modifier=".cs",
-                )
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                tl.store(
+                    c_ptr
+                    + (M_start_offset + offs_am[:, None]) * N
+                    + offs_bn[None, :],
+                    c,
+                    mask=offs_am[:, None] < m_size,
+                    cache_modifier=".cs",
+                )
             tidx += NUM_SMS

         iterated_tiles += num_tiles
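In both warp-specialized kernels the rewritten epilogue has three explicit paths: a TMA tensor-descriptor store, a fused scatter-add that accumulates each tile row into an arbitrary output row with relaxed atomics, and a plain masked store. Below is a plain-torch sketch of the scatter-add semantics only; it is illustrative, and the kernel itself does this per tile with `tl.atomic_add`.

```python
# Illustrative semantics of the FUSE_SCATTER_ADD epilogue: tile rows are
# accumulated into output rows selected by scatter_add_indices.
import torch

N = 8
c = torch.zeros(16, N)                   # output buffer (c_ptr)
tile = torch.randn(4, N)                 # one accumulator tile
m_offsets = torch.tensor([3, 7, 7, 12])  # scatter_add_indices for these rows

# Duplicate indices accumulate, matching the atomic-add behavior.
c.index_add_(0, m_offsets, tile)
```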
fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py
CHANGED
@@ -29,4 +29,18 @@ else:
     )

     from . import cutlass_blackwell_fmha_custom_op  # noqa: F401
-    from .cutlass_blackwell_fmha_interface import
+    from .cutlass_blackwell_fmha_interface import (  # noqa: F401
+        _cutlass_blackwell_fmha_forward,
+        cutlass_blackwell_fmha_decode_forward,
+        cutlass_blackwell_fmha_func,
+    )
+
+    # Note: _cutlass_blackwell_fmha_forward is an internal function (indicated by leading underscore)
+    # that is exported here specifically for testing purposes. It allows tests to access the LSE
+    # (log-sum-exp) values returned by the forward pass without modifying the public API.
+    # Production code should use cutlass_blackwell_fmha_func instead.
+    __all__ = [
+        "_cutlass_blackwell_fmha_forward",
+        "cutlass_blackwell_fmha_decode_forward",
+        "cutlass_blackwell_fmha_func",
+    ]
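With the re-exports above, the public autograd entry point can be imported directly from the subpackage. A hedged call sketch follows; the [batch, seqlen, heads, head_dim] layout and the `causal` keyword are assumptions based on the interface changes in this release, not something this hunk documents.

```python
# Hedged sketch of calling the re-exported public entry point.
import torch
from fbgemm_gpu.experimental.gen_ai.attention.cutlass_blackwell_fmha import (
    cutlass_blackwell_fmha_func,
)

q = torch.randn(2, 128, 8, 128, device="cuda", dtype=torch.bfloat16)
k = torch.randn(2, 128, 8, 128, device="cuda", dtype=torch.bfloat16)
v = torch.randn(2, 128, 8, 128, device="cuda", dtype=torch.bfloat16)

out = cutlass_blackwell_fmha_func(q, k, v, causal=True)  # assumed signature
```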
fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py
CHANGED
@@ -12,13 +12,13 @@ from torch.library import register_fake

 torch.library.define(
     "blackwell_fmha::fmha_fwd",
-    "(Tensor q, Tensor k, Tensor v, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seq_len_q, int? max_seq_len_k, float? softmax_scale, bool? causal, Tensor? seqlen_kv) -> (Tensor, Tensor)",
+    "(Tensor q, Tensor k, Tensor v, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seq_len_q, int? max_seq_len_k, float? softmax_scale, bool? causal, Tensor? seqlen_kv, Tensor? page_table, int seqlen_k=-1, int window_size_left=-1, int window_size_right=-1, bool bottom_right=True) -> (Tensor, Tensor)",
     tags=torch.Tag.pt2_compliant_tag,
 )

 torch.library.define(
     "blackwell_fmha::fmha_bwd",
-    "(Tensor dout, Tensor q, Tensor k, Tensor v, Tensor out, Tensor softmax_lse, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seq_len_q, int? max_seq_len_k, bool? causal) -> (Tensor, Tensor, Tensor)",
+    "(Tensor dout, Tensor q, Tensor k, Tensor v, Tensor out, Tensor softmax_lse, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seq_len_q, int? max_seq_len_k, float? softmax_scale, bool? causal, int window_size_left=-1, int window_size_right=-1, bool bottom_right=True, bool deterministic=False) -> (Tensor, Tensor, Tensor)",
     tags=torch.Tag.pt2_compliant_tag,
 )

@@ -35,6 +35,11 @@ def custom_op_fmha(
     softmax_scale: Optional[float] = None,
     causal: bool = False,
     seqlen_kv: Optional[torch.Tensor] = None,
+    page_table: Optional[torch.Tensor] = None,
+    seqlen_k: Optional[int] = None,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    bottom_right: bool = True,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     assert q.is_contiguous(), "q is not contiguous"
     assert k.is_contiguous(), "k is not contiguous"
@@ -42,6 +47,7 @@ def custom_op_fmha(
     assert q.is_cuda, "q must be on GPU"
     assert k.is_cuda, "k must be on GPU"
     assert v.is_cuda, "v must be on GPU"
+
     return torch.ops.fbgemm.fmha_fwd(
         q,
         k,
@@ -53,6 +59,11 @@ def custom_op_fmha(
         softmax_scale=softmax_scale,
         causal=causal,
         seqlen_kv=seqlen_kv,
+        page_table=page_table,
+        seqlen_k=seqlen_k,
+        window_size_left=window_size_left,
+        window_size_right=window_size_right,
+        bottom_right=bottom_right,
     )


@@ -68,6 +79,11 @@ def fmha_fwd_meta(
     softmax_scale: Optional[float] = None,
     causal: bool = False,
     seqlen_kv: Optional[torch.Tensor] = None,
+    page_table: Optional[torch.Tensor] = None,
+    seqlen_k: Optional[int] = None,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    bottom_right: bool = True,
 ):
     if q.dtype == torch.float16:
         out_dtype = torch.float16
@@ -122,8 +138,14 @@ def custom_op_fmha_bwd(
     cu_seqlens_k: Optional[torch.Tensor] = None,
     max_seq_len_q: Optional[int] = None,
     max_seq_len_k: Optional[int] = None,
+    softmax_scale: Optional[float] = None,
     causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    bottom_right: bool = True,
+    deterministic: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
     return torch.ops.fbgemm.fmha_bwd(
         dOutput,
         query,
@@ -135,7 +157,12 @@ def custom_op_fmha_bwd(
         cu_seqlens_k=cu_seqlens_k,
         max_seq_len_q=max_seq_len_q,
         max_seq_len_k=max_seq_len_k,
+        softmax_scale=softmax_scale,
         causal=causal,
+        window_size_left=window_size_left,
+        window_size_right=window_size_right,
+        bottom_right=bottom_right,
+        deterministic=deterministic,
     )


@@ -151,7 +178,12 @@ def fmha_bwd_meta(
     cu_seqlens_k: Optional[torch.Tensor] = None,
     max_seq_len_q: Optional[int] = None,
     max_seq_len_k: Optional[int] = None,
+    softmax_scale: Optional[float] = None,
     causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    bottom_right: bool = True,
+    deterministic: bool = False,
 ):
     return (
         torch.empty_like(query),
@@ -198,9 +230,30 @@ def _backward(ctx, *grad):
         ctx.cu_seqlens_k,
         ctx.max_seq_len_q,
         ctx.max_seq_len_k,
+        ctx.softmax_scale,
         ctx.causal,
+        ctx.window_size_left,
+        ctx.window_size_right,
+        ctx.bottom_right,
+        ctx.deterministic,
+    )
+    return (
+        dq,
+        dk,
+        dv,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
     )
-    return dq, dk, dv, None, None, None, None, None, None, None


 def _setup_context(ctx, inputs, output):
@@ -215,6 +268,11 @@ def _setup_context(ctx, inputs, output):
         softmax_scale,
         causal,
         seqlen_kv,
+        page_table,
+        seqlen_k,
+        window_size_left,
+        window_size_right,
+        bottom_right,
     ) = inputs
     (out, softmax_lse) = output
     ctx.save_for_backward(q, k, v, out, softmax_lse)
@@ -224,6 +282,10 @@ def _setup_context(ctx, inputs, output):
     ctx.max_seq_len_k = max_seq_len_k
     ctx.cu_seqlens_q = cu_seqlens_q
     ctx.cu_seqlens_k = cu_seqlens_k
+    ctx.window_size_left = window_size_left
+    ctx.window_size_right = window_size_right
+    ctx.bottom_right = bottom_right
+    ctx.deterministic = False  # Set default value
     ctx.is_gen = False


@@ -246,6 +308,11 @@ def cutlass_blackwell_fmha_custom_op(
     max_seq_len_q: int | None = None,
     max_seq_len_k: int | None = None,
     seqlen_kv: torch.Tensor | None = None,
+    page_table: torch.Tensor | None = None,
+    seqlen_k: int | None = -1,
+    window_size_left: int | None = -1,
+    window_size_right: int | None = -1,
+    bottom_right: bool | None = True,
 ):
     return torch.ops.blackwell_fmha.fmha_fwd(
         q=q,
@@ -258,4 +325,9 @@ def cutlass_blackwell_fmha_custom_op(
         softmax_scale=softmax_scale,
         causal=causal,
         seqlen_kv=seqlen_kv,
+        page_table=page_table,
+        seqlen_k=seqlen_k,
+        window_size_left=window_size_left,
+        window_size_right=window_size_right,
+        bottom_right=bottom_right,
     )[0]
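The extended schema threads paged-KV (`page_table`, `seqlen_k`) and sliding-window (`window_size_left`, `window_size_right`, `bottom_right`) parameters through the forward custom op, and the backward now also receives `softmax_scale` and `deterministic`. Below is a hedged sketch of a local-attention call through the wrapper; only arguments visible in the schema above are used, and the tensor shapes are illustrative assumptions.

```python
# Hedged sketch: causal sliding-window attention via the extended wrapper.
import torch
from fbgemm_gpu.experimental.gen_ai.attention.cutlass_blackwell_fmha.cutlass_blackwell_fmha_custom_op import (
    cutlass_blackwell_fmha_custom_op,
)

q = torch.randn(2, 256, 8, 128, device="cuda", dtype=torch.bfloat16)
k = torch.randn(2, 256, 8, 128, device="cuda", dtype=torch.bfloat16)
v = torch.randn(2, 256, 8, 128, device="cuda", dtype=torch.bfloat16)

# Each query attends to at most 128 keys on its left; window_size_right=0
# together with causal=True keeps the causal mask.
out = cutlass_blackwell_fmha_custom_op(
    q,
    k,
    v,
    causal=True,
    window_size_left=128,
    window_size_right=0,
)
```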