fbgemm-gpu-genai-nightly 2025.10.10-cp310-cp310-manylinux_2_28_x86_64.whl → 2025.11.9-cp310-cp310-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fbgemm_gpu/__init__.py +36 -18
- fbgemm_gpu/docs/target.genai.json.py +6 -0
- fbgemm_gpu/experimental/example/__init__.py +0 -4
- fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
- fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py +0 -4
- fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +4 -0
- fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +135 -172
- fbgemm_gpu/experimental/gen_ai/__init__.py +0 -4
- fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py +0 -4
- fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +186 -63
- fbgemm_gpu/experimental/gen_ai/bench/__init__.py +0 -4
- fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +2 -2
- fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
- fbgemm_gpu/experimental/gen_ai/moe/__init__.py +0 -4
- fbgemm_gpu/experimental/gen_ai/moe/layers.py +0 -4
- fbgemm_gpu/split_table_batched_embeddings_ops_common.py +43 -10
- fbgemm_gpu/split_table_batched_embeddings_ops_training.py +287 -39
- fbgemm_gpu/tbe/bench/bench_runs.py +7 -0
- fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +6 -1
- fbgemm_gpu/tbe/ssd/training.py +319 -41
- {fbgemm_gpu_genai_nightly-2025.10.10.dist-info → fbgemm_gpu_genai_nightly-2025.11.9.dist-info}/METADATA +1 -2
- {fbgemm_gpu_genai_nightly-2025.10.10.dist-info → fbgemm_gpu_genai_nightly-2025.11.9.dist-info}/RECORD +24 -24
- fbgemm_gpu/docs/version.py +0 -11
- {fbgemm_gpu_genai_nightly-2025.10.10.dist-info → fbgemm_gpu_genai_nightly-2025.11.9.dist-info}/WHEEL +0 -0
- {fbgemm_gpu_genai_nightly-2025.10.10.dist-info → fbgemm_gpu_genai_nightly-2025.11.9.dist-info}/top_level.txt +0 -0
fbgemm_gpu/__init__.py
CHANGED
@@ -5,6 +5,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import json
 import logging
 import os
 import re
@@ -26,6 +27,19 @@ _fbgemm_torch_compat_table = {
 }
 
 
+def _load_target_info(target: str) -> dict[str, str]:
+    try:
+        filepath = os.path.join(
+            os.path.dirname(__file__), "docs", f"target.{target}.json.py"
+        )
+        with open(filepath, "r") as file:
+            data = json.load(file)
+    except Exception:
+        data = {}
+
+    return data
+
+
 def _load_library(filename: str, version: str, no_throw: bool = False) -> None:
     """Load a shared library from the given filename."""
 
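The new helper reads a small JSON payload shipped with the package docs (this release adds `fbgemm_gpu/docs/target.genai.json.py`) and silently falls back to an empty dict when the file is missing or unparsable. A minimal sketch of that read path, assuming a payload with `variant` and `version` keys (the only keys the new `__init__.py` code reads; the sample values are illustrative, not taken from the wheel):

```python
import json
import os


def load_target_info(package_dir: str, target: str) -> dict[str, str]:
    """Best-effort read of the per-target metadata, mirroring _load_target_info."""
    filepath = os.path.join(package_dir, "docs", f"target.{target}.json.py")
    try:
        with open(filepath, "r") as f:
            return json.load(f)
    except Exception:
        # A missing or malformed file is treated as "this target was not built".
        return {}


# Example payload that would satisfy the keys read later in __init__.py
# (illustrative values only):
# {"variant": "cuda", "version": "2025.11.9"}
```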
@@ -98,13 +112,16 @@ open_source: bool = True
 # Trigger the manual addition of docstrings to pybind11-generated operators
 import fbgemm_gpu.docs  # noqa: F401, E402
 
+
+__targets_infos__ = {
+    target: _load_target_info(target) for target in ["default", "genai", "hstu"]
+}
+__targets_infos__ = {k: v for (k, v) in __targets_infos__.items() if v}
+
 try:
-
-
-
-        __variant__,
-        __version__,
-    )
+    __target__, __info__ = next(iter(__targets_infos__.items()))
+    __variant__ = __info__["variant"]
+    __version__ = __info__["version"]
 except Exception:
     __variant__: str = "INTERNAL"
     __version__: str = "INTERNAL"
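The selection logic probes each known target, drops the ones whose metadata could not be loaded, and takes the first surviving entry as the active target; if nothing loads, both attributes fall back to "INTERNAL". A standalone sketch of that behavior, using hypothetical in-memory data in place of the on-disk metadata files:

```python
# Hypothetical result of running _load_target_info for each target.
targets_infos = {
    "default": {},  # not built into this wheel -> filtered out
    "genai": {"variant": "cuda", "version": "2025.11.9"},
    "hstu": {},
}

# Keep only targets whose metadata actually loaded.
targets_infos = {k: v for k, v in targets_infos.items() if v}

try:
    # First surviving target wins (dict insertion order is preserved).
    target, info = next(iter(targets_infos.items()))
    variant = info["variant"]
    version = info["version"]
except Exception:
    target = variant = version = "INTERNAL"

print(target, variant, version)  # genai cuda 2025.11.9
```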
@@ -145,18 +162,19 @@ libraries_to_load = {
     "genai": fbgemm_genai_libraries,
 }
 
-for
-
-
-
-
-
-
-
-
-
-
-
+for target, info in __targets_infos__.items():
+    for library in libraries_to_load.get(target, []):
+        # NOTE: In all cases, we want to throw an error if we cannot load the
+        # library. However, this appears to break the OSS documentation build,
+        # where the Python documentation doesn't show up in the generated docs.
+        #
+        # To work around this problem, we introduce a fake build variant called
+        # `docs` and we only throw a library load error when the variant is not
+        # `docs`. For more information, see:
+        #
+        # https://github.com/pytorch/FBGEMM/pull/3477
+        # https://github.com/pytorch/FBGEMM/pull/3717
+        _load_library(f"{library}.so", info["version"], info["variant"] == "docs")
 
 try:
     # Trigger meta operator registrations
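`_load_library` already takes a `no_throw` flag (see its signature in the earlier hunk); the rewritten loop now derives that flag from the per-target variant so that only the fake `docs` variant tolerates load failures. A minimal sketch of that control flow, assuming a loader built on `torch.ops.load_library` (the real `_load_library` body is not shown in this diff):

```python
import logging

import torch

logger = logging.getLogger(__name__)


def load_library(filename: str, version: str, no_throw: bool = False) -> None:
    """Load a native .so; raise on failure unless no_throw is set."""
    try:
        torch.ops.load_library(filename)
        logger.info(f"Loaded {filename} (version {version})")
    except Exception as error:
        if no_throw:
            # The 'docs' build variant only needs the Python API surface,
            # so a missing native library is tolerated.
            logger.warning(f"Could not load {filename}: {error}")
        else:
            raise


# Per-target dispatch as in the new __init__.py loop (names illustrative):
# for target, info in targets_infos.items():
#     for library in libraries_to_load.get(target, []):
#         load_library(f"{library}.so", info["version"], info["variant"] == "docs")
```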
@@ -15,10 +15,6 @@ try:
     # pyre-ignore[21]
     # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
     from fbgemm_gpu import open_source
-
-    # pyre-ignore[21]
-    # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
-    from fbgemm_gpu.docs.version import __version__  # noqa: F401
 except Exception:
     open_source: bool = False
 
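This hunk, and the similar `__init__.py` hunks later in the diff, drop the old `from fbgemm_gpu.docs.version import __version__` fallback; the top-level package now exposes version information from the target metadata loaded above. A short usage example (assumes the wheel installed correctly):

```python
import fbgemm_gpu

# Populated from docs/target.<target>.json.py, or "INTERNAL" when no
# target metadata could be loaded.
print(fbgemm_gpu.__version__)
print(fbgemm_gpu.__variant__)
```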
Binary file
@@ -11,9 +11,5 @@ try:
     # pyre-ignore[21]
     # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
     from fbgemm_gpu import open_source
-
-    # pyre-ignore[21]
-    # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
-    from fbgemm_gpu.docs.version import __version__  # noqa: F401
 except Exception:
     open_source: bool = False
@@ -3840,6 +3840,10 @@ _MATMUL_CONFIG_TUPLES_PINGPONG_4K_8K_16K = [
     (256, 128, 128, 1, 1, 2, 16, 1, 8, 2),
     (128, 256, 128, 2, 1, 2, 16, 2, 4, 1),
     (256, 128, 64, 2, 1, 2, 16, 1, 4, 2),
+    (128, 128, 256, 2, 1, 0, 16, 2, 8, 2),
+    (128, 64, 128, 2, 1, 2, 16, 2, 4, 2),
+    (128, 128, 64, 2, 1, 0, 16, 1, 4, 2),
+    (128, 128, 128, 1, 1, 2, 16, 1, 4, 2),
 ]
 
 
@@ -509,14 +509,13 @@ def _fbgemm_grouped_gemm_ws(
         num_tiles = num_m_tiles * NUM_N_TILES
 
         if USE_TMA_STORE:
-
-
-
-
-
-
-
-            )
+            c_desc_ptr = tl.make_tensor_descriptor(
+                c_ptr + M_start_offset * N,
+                shape=[m_size, N],
+                # pyre-ignore
+                strides=[N, 1],
+                block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
+            )
 
         # Move across tiles
         next_iterated_tiles = iterated_tiles + num_tiles
@@ -534,72 +533,59 @@ def _fbgemm_grouped_gemm_ws(
             m_offset = (M_start_offset + tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
             n_offset = (N_start_offset + tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
             for k_offset in range(0, K, BLOCK_SIZE_K):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                else:
-                    accumulator += tl.dot(a, b.T)
+                a = tl._experimental_descriptor_load(
+                    a_desc_ptr,
+                    [m_offset, k_offset],
+                    [BLOCK_SIZE_M, BLOCK_SIZE_K],
+                    dtype,
+                )
+                b = tl._experimental_descriptor_load(
+                    b_desc_ptr,
+                    [n_offset, k_offset],
+                    [BLOCK_SIZE_N, BLOCK_SIZE_K],
+                    dtype,
+                )
+                if USE_FAST_ACCUM:
+                    accumulator = tl.dot(a, b.T, accumulator)
+                else:
+                    accumulator += tl.dot(a, b.T)
 
             if USE_TMA_STORE:
-
-
-
-
-
-
-
-                )
+                m_offset = (tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
+                n_offset = (tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
+                # pyre-ignore
+                c_desc_ptr.store(
+                    [m_offset, n_offset],
+                    accumulator.to(c_ptr.dtype.element_ty),
+                )
             elif FUSE_SCATTER_ADD:
-
-
-
-
-                mask
-
-
-
-
-
-
-
-
-
-
-                    c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
-                    c,
-                    mask=mask[:, None],
-                    sem="relaxed",
-                )
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                mask = offs_am < m_size
+                m_offsets = tl.load(
+                    scatter_add_indices + M_start_offset + offs_am,
+                    mask=mask,
+                    cache_modifier=".ca",
+                )
+                offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                c = accumulator.to(c_ptr.dtype.element_ty)
+                tl.atomic_add(
+                    c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
+                    c,
+                    mask=mask[:, None],
+                    sem="relaxed",
+                )
             else:
-
-
-
-
-
-
-
-                c
-
-
-
-                    + offs_bn[None, :],
-                    c,
-                    mask=offs_am[:, None] < m_size,
-                    cache_modifier=".cs",
-                )
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                c = accumulator.to(c_ptr.dtype.element_ty)
+                tl.store(
+                    c_ptr
+                    + (M_start_offset + offs_am[:, None]) * N
+                    + offs_bn[None, :],
+                    c,
+                    mask=offs_am[:, None] < m_size,
+                    cache_modifier=".cs",
+                )
             tidx += NUM_SMS
 
         iterated_tiles += num_tiles
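The rewritten loop keeps the same inner-product accumulation (`tl.dot(a, b.T)`, optionally with fast accumulation) and the same three epilogues: TMA store, fused scatter-add via `tl.atomic_add`, and a plain masked `tl.store`. For readers who do not work in Triton, a PyTorch reference for what the fused scatter-add path computes per group, as a sketch under assumed shapes and layout (per-group weights stacked along N), not the FBGEMM API:

```python
import torch


def grouped_gemm_scatter_add_ref(
    a: torch.Tensor,                    # [sum(m_sizes), K], groups concatenated along M
    b: torch.Tensor,                    # [G * N, K], per-group weights stacked along N (assumed layout)
    m_sizes: list[int],                 # rows of each group
    N: int,
    scatter_add_indices: torch.Tensor,  # [sum(m_sizes)], destination row per input row
    out: torch.Tensor,                  # [M_out, N], accumulated in place
) -> torch.Tensor:
    m_start = 0
    for g, m_size in enumerate(m_sizes):
        if m_size > 0:
            rows = slice(m_start, m_start + m_size)
            b_g = b[g * N : (g + 1) * N]             # this group's [N, K] weight
            c = a[rows] @ b_g.T                      # accumulator += tl.dot(a, b.T)
            idx = scatter_add_indices[rows]
            out.index_add_(0, idx, c.to(out.dtype))  # the FUSE_SCATTER_ADD epilogue
        m_start += m_size
    return out
```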
@@ -841,14 +827,13 @@ def _fbgemm_grouped_gemm_fp8_rowwise_ws(
         num_tiles = num_m_tiles * NUM_N_TILES
 
         if USE_TMA_STORE:
-
-
-
-
-
-
-
-            )
+            c_desc_ptr = tl.make_tensor_descriptor(
+                c_ptr + M_start_offset * N,
+                shape=[m_size, N],
+                # pyre-ignore
+                strides=[N, 1],
+                block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
+            )
 
         # Move across tiles
         next_iterated_tiles = iterated_tiles + num_tiles
@@ -867,107 +852,85 @@ def _fbgemm_grouped_gemm_fp8_rowwise_ws(
             m_offset = (M_start_offset + tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
             n_offset = (N_start_offset + tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
             for k_offset in range(0, K, BLOCK_SIZE_K):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                else:
-                    accumulator += tl.dot(a, b.T)
+                a = tl._experimental_descriptor_load(
+                    a_desc_ptr,
+                    [m_offset, k_offset],
+                    [BLOCK_SIZE_M, BLOCK_SIZE_K],
+                    dtype,
+                )
+                b = tl._experimental_descriptor_load(
+                    b_desc_ptr,
+                    [n_offset, k_offset],
+                    [BLOCK_SIZE_N, BLOCK_SIZE_K],
+                    dtype,
+                )
+                if USE_FAST_ACCUM:
+                    accumulator = tl.dot(a, b.T, accumulator)
+                else:
+                    accumulator += tl.dot(a, b.T)
 
             if USE_TMA_LOAD_ON_SCALES:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    mask=offs_am[:, None] < m_size,
-                    cache_modifier=".ca",
-                )
-                c = accumulator.to(tl.float32) * a_scale * b_scale[None, :]
+                b_scale = tl._experimental_descriptor_load(
+                    b_scale_desc_ptr,
+                    [n_offset],
+                    [BLOCK_SIZE_N],
+                    tl.float32,
+                )
+
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                a_scale = tl.load(
+                    a_scale_ptr + M_start_offset + offs_am[:, None],
+                    mask=offs_am[:, None] < m_size,
+                    cache_modifier=".ca",
+                )
+                c = accumulator.to(tl.float32) * a_scale * b_scale[None, :]
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-                b_scale = tl.load(
-                    b_scale_ptr + N_start_offset + offs_bn[None, :],
-                    cache_modifier=".ca",
-                )
-                c = accumulator.to(tl.float32) * a_scale * b_scale
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                a_scale = tl.load(
+                    a_scale_ptr + M_start_offset + offs_am[:, None],
+                    mask=offs_am[:, None] < m_size,
+                    cache_modifier=".ca",
+                )
+                b_scale = tl.load(
+                    b_scale_ptr + N_start_offset + offs_bn[None, :],
+                    cache_modifier=".ca",
+                )
+                c = accumulator.to(tl.float32) * a_scale * b_scale
 
             if USE_TMA_STORE:
-
-
-
-
-
-
-                )
+                m_offset = (tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
+                n_offset = (tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
+                # pyre-ignore
+                c_desc_ptr.store(
+                    [m_offset, n_offset], c.to(c_ptr.dtype.element_ty)
+                )
             elif FUSE_SCATTER_ADD:
-
-
-
-
-                mask
-
-
-
-
-
-
-
-
-
-                    c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
-                    c,
-                    mask=mask[:, None],
-                    sem="relaxed",
-                )
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                mask = offs_am < m_size
+                m_offsets = tl.load(
+                    scatter_add_indices + M_start_offset + offs_am,
+                    mask=mask,
+                    cache_modifier=".ca",
+                )
+                offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                tl.atomic_add(
+                    c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
+                    c,
+                    mask=mask[:, None],
+                    sem="relaxed",
+                )
             else:
-
-
-
-
-
-
-
-
-
-
-                    + offs_bn[None, :],
-                    c,
-                    mask=offs_am[:, None] < m_size,
-                    cache_modifier=".cs",
-                )
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                tl.store(
+                    c_ptr
+                    + (M_start_offset + offs_am[:, None]) * N
+                    + offs_bn[None, :],
+                    c,
+                    mask=offs_am[:, None] < m_size,
+                    cache_modifier=".cs",
+                )
             tidx += NUM_SMS
 
         iterated_tiles += num_tiles
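In both scale-loading branches, the fp8 rowwise kernel dequantizes the raw accumulator with a per-row scale for A and a per-column scale for B before the store (`c = accumulator.to(tl.float32) * a_scale * b_scale[None, :]`). In plain PyTorch terms, per group, this is equivalent to the following sketch (a reference for the arithmetic, not the FBGEMM op):

```python
import torch


def fp8_rowwise_dequant_ref(
    acc: torch.Tensor,      # [m_size, N] raw matmul accumulator for one group
    a_scale: torch.Tensor,  # [m_size] per-row scale of A
    b_scale: torch.Tensor,  # [N] per-column scale of B (this group's slice)
) -> torch.Tensor:
    # Matches: c = accumulator.to(tl.float32) * a_scale * b_scale[None, :]
    return acc.to(torch.float32) * a_scale[:, None] * b_scale[None, :]
```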
@@ -15,10 +15,6 @@ try:
     # pyre-ignore[21]
     # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
     from fbgemm_gpu import open_source
-
-    # pyre-ignore[21]
-    # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
-    from fbgemm_gpu.docs.version import __version__  # noqa: F401
 except Exception:
     open_source: bool = False
 
@@ -10,10 +10,6 @@ try:
     # pyre-ignore[21]
     # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
     from fbgemm_gpu import open_source
-
-    # pyre-ignore[21]
-    # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
-    from fbgemm_gpu.docs.version import __version__  # noqa: F401
 except Exception:
     open_source: bool = False
 