fbgemm-gpu-genai-nightly 2025.12.23__cp311-cp311-manylinux_2_28_x86_64.whl → 2026.1.14__cp311-cp311-manylinux_2_28_x86_64.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those package versions.
Files changed (46)
  1. fbgemm_gpu/batched_unary_embeddings_ops.py +0 -1
  2. fbgemm_gpu/config/feature_list.py +3 -0
  3. fbgemm_gpu/docs/target.genai.json.py +1 -1
  4. fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
  5. fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +0 -4
  6. fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +0 -3
  7. fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py +0 -1
  8. fbgemm_gpu/experimental/gemm/triton_gemm/utils.py +0 -1
  9. fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py +0 -2
  10. fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py +0 -3
  11. fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +0 -3
  12. fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
  13. fbgemm_gpu/experimental/gen_ai/moe/__init__.py +0 -1
  14. fbgemm_gpu/experimental/gen_ai/moe/activation.py +0 -1
  15. fbgemm_gpu/experimental/gen_ai/moe/gather_scatter.py +0 -1
  16. fbgemm_gpu/experimental/gen_ai/moe/layers.py +0 -2
  17. fbgemm_gpu/experimental/gen_ai/quantize.py +0 -1
  18. fbgemm_gpu/fbgemm.so +0 -0
  19. fbgemm_gpu/permute_pooled_embedding_modules.py +0 -1
  20. fbgemm_gpu/quantize/quantize_ops.py +0 -1
  21. fbgemm_gpu/quantize_comm.py +8 -13
  22. fbgemm_gpu/quantize_utils.py +61 -7
  23. fbgemm_gpu/sll/__init__.py +0 -1
  24. fbgemm_gpu/sll/triton/__init__.py +0 -10
  25. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +0 -1
  26. fbgemm_gpu/sparse_ops.py +0 -1
  27. fbgemm_gpu/split_embedding_configs.py +0 -1
  28. fbgemm_gpu/split_embedding_inference_converter.py +0 -1
  29. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +10 -11
  30. fbgemm_gpu/tbe/bench/bench_runs.py +0 -1
  31. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +2 -4
  32. fbgemm_gpu/tbe/bench/eeg_cli.py +0 -1
  33. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +0 -1
  34. fbgemm_gpu/tbe/bench/tbe_data_config.py +5 -3
  35. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +0 -2
  36. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +0 -1
  37. fbgemm_gpu/tbe/bench/utils.py +0 -1
  38. fbgemm_gpu/tbe/ssd/common.py +0 -1
  39. fbgemm_gpu/tbe_input_multiplexer.py +0 -1
  40. fbgemm_gpu/triton/quantize.py +13 -8
  41. fbgemm_gpu/uvm.py +0 -1
  42. {fbgemm_gpu_genai_nightly-2025.12.23.dist-info → fbgemm_gpu_genai_nightly-2026.1.14.dist-info}/METADATA +1 -1
  43. {fbgemm_gpu_genai_nightly-2025.12.23.dist-info → fbgemm_gpu_genai_nightly-2026.1.14.dist-info}/RECORD +46 -46
  44. list_versions/cli_run.py +0 -2
  45. {fbgemm_gpu_genai_nightly-2025.12.23.dist-info → fbgemm_gpu_genai_nightly-2026.1.14.dist-info}/WHEEL +0 -0
  46. {fbgemm_gpu_genai_nightly-2025.12.23.dist-info → fbgemm_gpu_genai_nightly-2026.1.14.dist-info}/top_level.txt +0 -0
fbgemm_gpu/batched_unary_embeddings_ops.py CHANGED
@@ -11,7 +11,6 @@
  from math import sqrt
 
  import torch
-
  from fbgemm_gpu.utils.loader import load_torch_module
 
  try:
fbgemm_gpu/config/feature_list.py CHANGED
@@ -63,6 +63,9 @@ class FeatureGateName(Enum):
  # Enable TBE input parameters extraction
  TBE_REPORT_INPUT_PARAMS = auto()
 
+ # Enable tuned max segment length per CTA for B200
+ TBE_USE_TUNED_SEGMENT_LENGTHS_CTA_B200 = auto()
+
  def is_enabled(self) -> bool:
  return FeatureGate.is_enabled(self)
 
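A minimal sketch of how a caller could query the new gate, using only the FeatureGateName API visible in this hunk (the fbgemm_gpu.config import path is the one shown in the split_table_batched_embeddings_ops_training.py hunk later in this diff); whether the gate is enabled by default is not shown here:

    from fbgemm_gpu.config import FeatureGateName

    # is_enabled() delegates to FeatureGate.is_enabled(self), per the context lines above.
    if FeatureGateName.TBE_USE_TUNED_SEGMENT_LENGTHS_CTA_B200.is_enabled():
        # Tuned max segment lengths per CTA for B200 are in effect.
        pass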
fbgemm_gpu/docs/target.genai.json.py CHANGED
@@ -1,6 +1,6 @@
 
  {
- "version": "2025.12.23",
+ "version": "2026.1.14",
  "target": "genai",
  "variant": "cuda"
  }
fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py CHANGED
@@ -12,9 +12,7 @@ from typing import Optional, Union
 
  import torch
  import triton # @manual
-
  import triton.language as tl # @manual
-
  from fbgemm_gpu.experimental.gemm.triton_gemm.matmul_perf_model import (
  early_config_prune,
  estimate_matmul_time,
@@ -23,10 +21,8 @@ from fbgemm_gpu.experimental.gemm.triton_gemm.utils import (
  map_dtype_to_triton,
  TmaAutoTuneHelper,
  )
-
  from packaging import version
  from torch._tensor import Tensor
-
  from triton import Config # @manual
  from triton.runtime.jit import reinterpret as tl_reinterpret, TensorWrapper # @manual
 
fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py CHANGED
@@ -9,14 +9,11 @@
  import functools
  import inspect
  import warnings
-
  from typing import Optional
 
  import torch
-
  import triton
  import triton.language as tl
-
  from fbgemm_gpu.experimental.gemm.triton_gemm import utils
  from triton.runtime import driver # @manual
 
fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py CHANGED
@@ -12,7 +12,6 @@ import functools
  import heapq
 
  import torch
-
  from triton import cdiv # @manual
  from triton.runtime import driver # @manual
  from triton.testing import ( # @manual
fbgemm_gpu/experimental/gemm/triton_gemm/utils.py CHANGED
@@ -9,7 +9,6 @@ import sys
 
  import torch
  import triton # @manual
-
  import triton.language as tl # @manual
 
 
fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py CHANGED
@@ -6,7 +6,6 @@
 
 
  import argparse
-
  import os
  import tempfile
  import uuid
@@ -15,7 +14,6 @@ from pprint import pprint
 
  import fbgemm_gpu.experimental.gen_ai # noqa: F401
  import pandas as pd
-
  import torch
  import torch.distributed as dist
  import torch.distributed._symmetric_memory as symm_mem
fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py CHANGED
@@ -7,16 +7,13 @@
  import itertools
  import os
  import sys
-
  from dataclasses import dataclass
  from datetime import datetime
  from typing import Any, Optional
 
  import click
-
  import matplotlib.pyplot as plt
  import numpy as np
-
  import pandas as pd
  import seaborn as sns
  import torch
fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py CHANGED
@@ -9,10 +9,8 @@ import abc
 
  import fbgemm_gpu.experimental.gen_ai # noqa: F401
  import numpy as np
-
  import torch
  import triton # @manual=//triton:triton
-
  from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
  _to_blocked,
  calculate_group_max,
@@ -26,7 +24,6 @@ from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
  triton_scale_nvfp4_quant_rms,
  triton_scale_nvfp4_quant_silu,
  )
-
  from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
  get_fp8_constants,
  matmul_fp8_block,
fbgemm_gpu/experimental/gen_ai/moe/__init__.py CHANGED
@@ -56,7 +56,6 @@ from fbgemm_gpu.experimental.gemm.triton_gemm.grouped_gemm import ( # noqa F401
  )
 
  from .activation import silu_mul, silu_mul_quant # noqa F401
-
  from .gather_scatter import ( # noqa F401
  gather_scale_dense_tokens,
  gather_scale_quant_dense_tokens,
fbgemm_gpu/experimental/gen_ai/moe/activation.py CHANGED
@@ -11,7 +11,6 @@ from typing import Optional
  import torch
  import triton
  import triton.language as tl
-
  from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import get_fp8_constants
 
 
fbgemm_gpu/experimental/gen_ai/moe/gather_scatter.py CHANGED
@@ -11,7 +11,6 @@ from typing import Optional
  import torch
  import triton
  import triton.language as tl
-
  from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import get_fp8_constants
 
 
fbgemm_gpu/experimental/gen_ai/moe/layers.py CHANGED
@@ -12,9 +12,7 @@ from functools import cached_property
  from typing import Callable, Optional, Union
 
  import torch
-
  from fairscale.nn.model_parallel.initialize import get_model_parallel_world_size
-
  from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import triton_quantize_fp8_row
  from fbgemm_gpu.experimental.gemm.triton_gemm.grouped_gemm import (
  grouped_gemm,
fbgemm_gpu/experimental/gen_ai/quantize.py CHANGED
@@ -10,7 +10,6 @@
 
 
  import torch
-
  from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import quantize_fp8_row
 
 
fbgemm_gpu/fbgemm.so CHANGED
Binary file
fbgemm_gpu/permute_pooled_embedding_modules.py CHANGED
@@ -11,7 +11,6 @@ from itertools import accumulate
  from typing import Optional
 
  import torch
-
  from fbgemm_gpu.utils.loader import load_torch_module
 
  try:
fbgemm_gpu/quantize/quantize_ops.py CHANGED
@@ -8,7 +8,6 @@
  from typing import Union
 
  import torch
-
  from fbgemm_gpu.quantize_utils import fp32_to_mx4, mx4_to_fp32, RoundingMode
 
 
fbgemm_gpu/quantize_comm.py CHANGED
@@ -16,7 +16,6 @@ import logging
  from typing import Optional, TypeVar
 
  import torch
-
  from fbgemm_gpu.quantize_utils import (
  bf16_to_fp32,
  fp16_to_fp32,
@@ -25,12 +24,10 @@ from fbgemm_gpu.quantize_utils import (
  fp32_to_hfp8_with_clamp,
  fp32_to_mx4,
  hfp8_to_fp32,
- mx4_to_fp32,
+ mx4_to_float,
  RoundingMode,
  )
-
  from fbgemm_gpu.split_embedding_configs import SparseType
-
  from torch.autograd.profiler import record_function # usort:skip
  from dataclasses import dataclass
 
@@ -123,7 +120,7 @@ def _dequantize_tensor(
  comm_precision: SparseType,
  ctx: Optional[QuantizationContext] = None,
  is_fwd: bool = True,
- fp8_output_dtype: Optional[SparseType] = None,
+ output_dtype: Optional[SparseType] = None,
  ) -> torch.Tensor:
  if comm_precision == SparseType.FP32:
  assert quantized_tensor.dtype == torch.float
@@ -138,10 +135,8 @@ def _dequantize_tensor(
  if ctx is not None and ctx.row_dim > 0:
  row_dim_quant = ctx.row_dim_quant
  quantized_tensor_2d = quantized_tensor.view((-1, row_dim_quant))
- # use provided fp8_output_dtype or default to FP32 (0)
- output_dtype_int = (
- fp8_output_dtype.as_int() if fp8_output_dtype is not None else 0
- )
+ # use provided output_dtype or default to FP32 (0)
+ output_dtype_int = output_dtype.as_int() if output_dtype is not None else 0
  dequant_tensor = torch.ops.fbgemm.FP8RowwiseQuantizedToFloat(
  quantized_tensor_2d,
  is_fwd,
@@ -161,7 +156,7 @@ def _dequantize_tensor(
  return dequant_tensor.view(-1)
  elif comm_precision == SparseType.MX4:
  mx_group_size = ctx.mx_group_size if ctx is not None else MX_GROUP_SIZE_DEFAULT
- return mx4_to_fp32(quantized_tensor, mx_group_size)
+ return mx4_to_float(quantized_tensor, mx_group_size, output_dtype=output_dtype)
  else:
  raise ValueError(f"comm_precision={comm_precision} is not supported")
 
@@ -175,7 +170,7 @@ class QuantizedCommCodec:
  row_dim: Optional[int] = None,
  is_fwd: bool = True,
  rounding_mode: Optional[RoundingMode] = None,
- fp8_output_dtype: Optional[SparseType] = None,
+ output_dtype: Optional[SparseType] = None,
  ) -> None:
  if loss_scale is not None:
  if comm_precision not in [SparseType.FP16, SparseType.BF16]:
@@ -193,7 +188,7 @@ class QuantizedCommCodec:
  self._is_fwd = is_fwd
  self._row_dim: int = -1 if row_dim is None else row_dim
  self._rounding_mode: Optional[RoundingMode] = rounding_mode
- self._fp8_output_dtype: Optional[SparseType] = fp8_output_dtype
+ self._output_dtype: Optional[SparseType] = output_dtype
  if self._comm_precision == SparseType.MX4:
  self._row_dim = MX_GROUP_SIZE_DEFAULT if row_dim is None else row_dim
  self._rounding_mode = (
@@ -229,7 +224,7 @@ class QuantizedCommCodec:
  self._comm_precision,
  ctx,
  self._is_fwd,
- fp8_output_dtype=self._fp8_output_dtype,
+ output_dtype=self._output_dtype,
  )
  return dequantized_tensor
 
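The net effect of the quantize_comm.py hunks above is a rename of fp8_output_dtype to output_dtype plus routing the MX4 path through mx4_to_float(). A rough sketch of constructing the codec with the renamed argument follows; the comm_precision and loss_scale keywords are assumptions inferred from the __init__ body shown above, since the full constructor signature is not part of this hunk:

    from fbgemm_gpu.quantize_comm import QuantizedCommCodec
    from fbgemm_gpu.split_embedding_configs import SparseType

    codec = QuantizedCommCodec(
        comm_precision=SparseType.MX4,  # assumed keyword; referenced in the __init__ body above
        loss_scale=None,                # assumed keyword; referenced in the __init__ body above
        output_dtype=SparseType.BF16,   # renamed from fp8_output_dtype in this release
    )
    # The dequantize path now calls mx4_to_float(..., output_dtype=self._output_dtype),
    # so MX4 payloads can be dequantized directly to BF16 instead of FP32.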
fbgemm_gpu/quantize_utils.py CHANGED
@@ -13,10 +13,18 @@ from typing import Optional, Union
  import torch # isort:skip
 
  import fbgemm_gpu
-
- from fbgemm_gpu.triton import dequantize_mx4, quantize_mx4, RoundingMode
+ from fbgemm_gpu.split_embedding_configs import SparseType
+ from fbgemm_gpu.triton.common import RoundingMode
  from fbgemm_gpu.triton.quantize_ref import py_dequantize_mx4, py_quantize_mx4
 
+ try:
+ if torch.cuda.is_available():
+ from fbgemm_gpu.triton import quantize_mx4
+ from fbgemm_gpu.triton.quantize import triton_dequantize_mx4
+ except Exception:
+ pass
+
+
  try:
@@ -126,25 +134,71 @@ def mx4_to_fp32(
  ) -> torch.Tensor:
  """Dequantize an MX4 tensor to FP32 with triton or native cuda impl.
 
+ This function is kept for backward compatibility and always returns FP32.
+ For BF16 output, use mx4_to_float() with output_dtype=SparseType.BF16.
+ """
+ return mx4_to_float(
+ tensor,
+ group_size,
+ use_triton,
+ ebits,
+ mbits,
+ output_dtype=None, # None = FP32 default for backward compatibility
+ )
+
+
+ def mx4_to_float(
+ tensor: torch.Tensor,
+ group_size: int = 32,
+ use_triton: bool = True,
+ ebits: int = 2,
+ mbits: int = 1,
+ output_dtype: Optional[SparseType] = None,
+ ) -> torch.Tensor:
+ """Dequantize an MX4 tensor to FP32 or BF16 with triton or native cuda impl.
+
  Args:
  tensor (torch.Tensor): MX4 packed tensor with total elements (M / 2 + M / groupsize)
  group_size (int): Compute scale in chunks of group_size.
  use_triton (bool): If set, use triton quantization, otherwise cuda.
  ebits (int): Number of exponent bits in target mx4 format.
  mbits (int): Number of mantissa bits in target mx4 format.
+ output_dtype (Optional[SparseType]): Output dtype (FP32 or BF16).
+ Defaults to None (FP32) for backward compatibility.
 
  Return:
- output: FP32 tensor with total elements (M).
+ output: Tensor with dtype matching output_dtype and total elements (M).
  """
+ # Validate output_dtype
+ supported_dtypes = {SparseType.FP32, SparseType.BF16}
+ if output_dtype is not None and output_dtype not in supported_dtypes:
+ raise ValueError(
+ f"output_dtype must be one of {supported_dtypes}, got {output_dtype}. "
+ f"FP16 is not supported due to potential overflow/underflow with MX4's wide exponent range. "
+ f"Use BF16 for memory savings with same dynamic range as FP32."
+ )
+
+ target_dtype = (
+ output_dtype.as_dtype() if output_dtype is not None else torch.float32
+ )
+
  # Accelerated MX4 dequantize is only available on cuda, if input is on cpu, use python.
  if not tensor.is_cuda and not tensor.is_mtia:
- return py_dequantize_mx4(tensor, group_size, ebits=ebits, mbits=mbits)
+ result = py_dequantize_mx4(tensor, group_size, ebits=ebits, mbits=mbits)
+ return result.to(target_dtype) if output_dtype is not None else result
  if use_triton:
  if tensor.is_mtia:
- return mtia_dequantize_mx4(tensor, group_size, ebits=ebits, mbits=mbits)
- return dequantize_mx4(tensor, group_size, ebits=ebits, mbits=mbits)
+ return mtia_dequantize_mx4(
+ tensor, group_size, ebits=ebits, mbits=mbits, output_dtype=target_dtype
+ )
+ return triton_dequantize_mx4(
+ tensor, group_size, ebits=ebits, mbits=mbits, output_dtype=target_dtype
+ )
  else:
- return torch.ops.fbgemm.dequantize_mx_cuda(tensor.flatten(), group_size)
+ output_dtype_int = output_dtype.as_int() if output_dtype is not None else 0
+ return torch.ops.fbgemm.dequantize_mx_cuda(
+ tensor.flatten(), group_size, output_dtype_int
+ )
 
 
  def fp32_to_fp16_with_clamp(tensor: torch.Tensor) -> torch.Tensor:
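As a usage sketch of the API added above (illustrative tensors and group size; mx4_to_fp32 keeps its old FP32-only behavior by delegating with output_dtype=None):

    import torch
    from fbgemm_gpu.quantize_utils import fp32_to_mx4, mx4_to_float, mx4_to_fp32
    from fbgemm_gpu.split_embedding_configs import SparseType

    x = torch.randn(1024, device="cuda")      # illustrative input
    packed = fp32_to_mx4(x, group_size=32)    # packing path is unchanged by this diff

    out_fp32 = mx4_to_fp32(packed, group_size=32)  # backward-compatible, always FP32
    out_bf16 = mx4_to_float(packed, group_size=32, output_dtype=SparseType.BF16)
    # Passing SparseType.FP16 raises ValueError, per the validation added above.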
fbgemm_gpu/sll/__init__.py CHANGED
@@ -8,7 +8,6 @@
  # pyre-strict
 
  import torch
-
  from fbgemm_gpu.sll.cpu import op_registrations as sll_cpu_registrations
  from fbgemm_gpu.sll.meta import op_registrations as sll_meta_registrations
  from fbgemm_gpu.utils import TorchLibraryFragment
fbgemm_gpu/sll/triton/__init__.py CHANGED
@@ -10,19 +10,16 @@
  from fbgemm_gpu.sll.triton.triton_dense_jagged_cat_jagged_out import (
  dense_jagged_cat_jagged_out,
  )
-
  from fbgemm_gpu.sll.triton.triton_jagged2_to_padded_dense import ( # noqa F401
  jagged2_to_padded_dense,
  Jagged2ToPaddedDense, # noqa F401
  )
-
  from fbgemm_gpu.sll.triton.triton_jagged_bmm import ( # noqa F401
  jagged_dense_bmm,
  jagged_jagged_bmm,
  JaggedDenseBmm, # noqa F401
  JaggedJaggedBmm, # noqa F401
  )
-
  from fbgemm_gpu.sll.triton.triton_jagged_bmm_jagged_out import ( # noqa F401
  array_jagged_bmm_jagged_out,
  ArrayJaggedBmmNopadding, # noqa F401
@@ -31,38 +28,31 @@ from fbgemm_gpu.sll.triton.triton_jagged_bmm_jagged_out import ( # noqa F401
  triton_array_jagged_bmm_jagged_out, # noqa F401
  triton_jagged_jagged_bmm_jagged_out, # noqa F401
  )
-
  from fbgemm_gpu.sll.triton.triton_jagged_dense_elementwise_add import ( # noqa F401
  jagged_dense_elementwise_add,
  JaggedDenseAdd, # noqa F401
  )
-
  from fbgemm_gpu.sll.triton.triton_jagged_dense_elementwise_mul_jagged_out import ( # noqa F401
  jagged_dense_elementwise_mul_jagged_out,
  JaggedDenseElementwiseMul, # noqa F401
  )
-
  from fbgemm_gpu.sll.triton.triton_jagged_dense_flash_attention import ( # noqa F401
  jagged_dense_flash_attention,
  JaggedDenseFlashAttention, # noqa F401
  )
-
  from fbgemm_gpu.sll.triton.triton_jagged_flash_attention_basic import ( # noqa F401
  jagged_flash_attention_basic,
  JaggedFlashAttentionBasic, # noqa F401
  )
-
  from fbgemm_gpu.sll.triton.triton_jagged_self_substraction_jagged_out import (
  triton_jagged_self_substraction_jagged_out,
  )
-
  from fbgemm_gpu.sll.triton.triton_jagged_softmax import ( # noqa F401
  jagged2_softmax,
  Jagged2Softmax, # noqa F401
  jagged_softmax,
  JaggedSoftmax, # noqa F401
  )
-
  from fbgemm_gpu.sll.triton.triton_multi_head_jagged_flash_attention import ( # noqa F401
  multi_head_jagged_flash_attention,
  MultiHeadJaggedFlashAttention, # noqa F401
fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py CHANGED
@@ -7,7 +7,6 @@
  # pyre-unsafe
 
  import torch
-
  from fbgemm_gpu.triton.jagged.triton_jagged_tensor_ops import (
  dense_to_jagged,
  jagged_to_dense,
fbgemm_gpu/sparse_ops.py CHANGED
@@ -11,7 +11,6 @@ from collections.abc import Sequence
  from typing import Callable, Optional
 
  import torch
-
  from fbgemm_gpu.split_embedding_configs import SparseType
  from fbgemm_gpu.split_table_batched_embeddings_ops_common import PoolingMode
  from fbgemm_gpu.utils.loader import load_torch_module
fbgemm_gpu/split_embedding_configs.py CHANGED
@@ -12,7 +12,6 @@ import itertools
  from typing import Any, Dict # noqa: F401
 
  import torch
-
  from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
  EmbeddingLocation,
  SplitState,
fbgemm_gpu/split_embedding_inference_converter.py CHANGED
@@ -13,7 +13,6 @@ import math
  from typing import cast, Optional
 
  import torch
-
  from fbgemm_gpu.split_embedding_configs import (
  FP8QuantizationConfig,
  QuantizationConfig,
fbgemm_gpu/split_table_batched_embeddings_ops_training.py CHANGED
@@ -26,7 +26,6 @@ from torch.autograd.profiler import record_function # usort:skip
 
  # @manual=//deeplearning/fbgemm/fbgemm_gpu/codegen:split_embedding_codegen_lookup_invokers
  import fbgemm_gpu.split_embedding_codegen_lookup_invokers as invokers
-
  from fbgemm_gpu.config import FeatureGate, FeatureGateName
  from fbgemm_gpu.runtime_monitor import (
  AsyncSeriesTimer,
@@ -59,7 +58,6 @@ from fbgemm_gpu.tbe_input_multiplexer import (
  TBEInputMultiplexer,
  TBEInputMultiplexerConfig,
  )
-
  from fbgemm_gpu.utils.loader import load_torch_module, load_torch_module_bc
  from fbgemm_gpu.utils.writeback_util import writeback_gradient
 
@@ -2764,20 +2762,21 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
  self.prefetch_stream != forward_stream
  ), "prefetch_stream and forward_stream should not be the same stream"
 
- indices, offsets, _, vbe_metadata = self.prepare_inputs(
- indices,
- offsets,
- per_sample_weights=None,
- batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
- force_cast_input_types=False,
- prefetch_pipeline=self.prefetch_pipeline,
- )
-
  with self._recording_to_timer(
  self.prefetch_duration_timer,
  context=self.step,
  stream=torch.cuda.current_stream(),
  ):
+
+ indices, offsets, _, vbe_metadata = self.prepare_inputs(
+ indices,
+ offsets,
+ per_sample_weights=None,
+ batch_size_per_feature_per_rank=batch_size_per_feature_per_rank,
+ force_cast_input_types=False,
+ prefetch_pipeline=self.prefetch_pipeline,
+ )
+
  self._prefetch(
  indices,
  offsets,
fbgemm_gpu/tbe/bench/bench_runs.py CHANGED
@@ -15,7 +15,6 @@ from subprocess import Popen
  from typing import Callable, Optional
 
  import torch
-
  from fbgemm_gpu.tbe.utils import b_indices, TBERequest
  from fbgemm_gpu.tbe.utils.common import get_device
 
fbgemm_gpu/tbe/bench/benchmark_click_interface.py CHANGED
@@ -8,12 +8,10 @@
  # pyre-strict
 
  import click
-
  from fbgemm_gpu.split_embedding_configs import SparseType
  from fbgemm_gpu.split_table_batched_embeddings_ops_common import BoundsCheckMode
-
- from .bench_config import TBEBenchmarkingHelperText
- from .tbe_data_config_loader import TBEDataConfigHelperText
+ from .bench_config import TBEBenchmarkingHelperText # usort:skip
+ from .tbe_data_config_loader import TBEDataConfigHelperText # usort:skip
 
 
  class TbeBenchClickInterface:
fbgemm_gpu/tbe/bench/eeg_cli.py CHANGED
@@ -9,7 +9,6 @@
 
  import click
  import torch
-
  from fbgemm_gpu.tbe.bench import IndicesParams
 
 
fbgemm_gpu/tbe/bench/embedding_ops_common_config.py CHANGED
@@ -12,7 +12,6 @@ from typing import Any, Optional
 
  import click
  import torch
-
  from fbgemm_gpu.split_embedding_configs import SparseType
  from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
  BoundsCheckMode,
fbgemm_gpu/tbe/bench/tbe_data_config.py CHANGED
@@ -13,10 +13,12 @@ import logging
  from typing import Any, Optional
 
  import torch
-
  from fbgemm_gpu.tbe.utils.common import get_device
-
- from .tbe_data_config_param_models import BatchParams, IndicesParams, PoolingParams
+ from .tbe_data_config_param_models import (
+ BatchParams,
+ IndicesParams,
+ PoolingParams,
+ ) # usort:skip
 
  try:
  torch.ops.load_library(
fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py CHANGED
@@ -11,10 +11,8 @@ from typing import Optional
 
  import numpy as np
  import torch
-
  from fbgemm_gpu.tbe.bench.tbe_data_config import TBEDataConfig
  from fbgemm_gpu.tbe.utils.common import get_device, round_up
-
  from fbgemm_gpu.tbe.utils.requests import (
  generate_batch_sizes_from_stats,
  generate_pooling_factors_from_stats,
fbgemm_gpu/tbe/bench/tbe_data_config_loader.py CHANGED
@@ -13,7 +13,6 @@ from enum import Enum
  import click
  import torch
  import yaml
-
  from fbgemm_gpu.tbe.bench.tbe_data_config import (
  BatchParams,
  IndicesParams,
fbgemm_gpu/tbe/bench/utils.py CHANGED
@@ -10,7 +10,6 @@ import logging
 
  import numpy as np
  import torch
-
  from fbgemm_gpu.split_embedding_configs import SparseType
 
  logging.basicConfig(level=logging.DEBUG)
fbgemm_gpu/tbe/ssd/common.py CHANGED
@@ -9,7 +9,6 @@
  # pyre-ignore-all-errors[56]
 
  import torch
-
  from fbgemm_gpu.utils.loader import load_torch_module
 
  try:
fbgemm_gpu/tbe_input_multiplexer.py CHANGED
@@ -8,7 +8,6 @@
  # pyre-unsafe
 
  import abc
-
  from dataclasses import dataclass
  from typing import Optional
 
fbgemm_gpu/triton/quantize.py CHANGED
@@ -11,7 +11,6 @@ from typing import Union
  import torch
  import triton # @manual
-
  import triton.language as tl # @manual
 
  from .common import get_mx4_exp_bias, get_mx4_lookup_table, RoundingMode
 
@@ -575,7 +574,7 @@ def _kernel_dequantize_mx4(
  # Write final outputs.
  tl.store(
  out + output_offset,
- scaled_fp32,
+ scaled_fp32.to(out.dtype.element_ty),
  # Mask values that are out of this chunk or the main array.
  mask=(output_offset < OUTPUT_SIZE)
  & (output_offset < OUTPUT_CHUNK_SIZE * (pid + 1)),
@@ -588,10 +587,14 @@
 
 
  def triton_dequantize_mx4(
- a: torch.Tensor, group_size: int = 32, ebits: int = 2, mbits: int = 1
+ a: torch.Tensor,
+ group_size: int = 32,
+ ebits: int = 2,
+ mbits: int = 1,
+ output_dtype: torch.dtype = torch.float32,
  ) -> torch.Tensor:
  """
- Dequantize a tensor from mx4 format to fp32.
+ Dequantize a tensor from mx4 format to fp32 or bf16.
 
  Args:
  a (Tensor): [M / 2 + M / group_size] MX4 tensor packed into int8 values
@@ -599,13 +602,15 @@ def triton_dequantize_mx4(
  group_size (int): Size of chunks that use the same shared exponent.
  ebits (int): Number of bits to use for exponent in target mx4 format.
  mbits (int): Number of bits to use for mantissa in target mx4 format.
+ output_dtype (torch.dtype): Output dtype (FP32 or BF16).
+ Defaults to torch.float32 for backward compatibility.
 
  Returns:
- torch.Tensor: [M, K] dequantized fp32 tensor.
+ torch.Tensor: [M, K] dequantized tensor in the specified dtype.
  """
  # If given an empty shape, return an empty tensor.
  if a.numel() == 0:
- return torch.empty(a.shape, device=a.device, dtype=torch.float32)
+ return torch.empty(a.shape, device=a.device, dtype=output_dtype)
  # View a as 2D for simplicity.
  orig_shape = a.shape
  a = a.flatten()
@@ -622,9 +627,9 @@
  # Use a lookup table to convert
  mx4_to_fp_values = get_mx4_lookup_table(ebits, mbits, a.device)
 
- # Create output tensor.
+ # Create output tensor in target dtype.
  output_elems = num_groups * group_size
- out = torch.empty([output_elems], device=a.device, dtype=torch.float)
+ out = torch.empty([output_elems], device=a.device, dtype=output_dtype)
  # Check if we need to use int64 for indexing.
  use_int64 = num_threads * groups_per_thread * group_size > 2**31 - 1
  # Invoke triton dequantization kernel over rows.
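A short sketch of calling the updated Triton wrapper directly with the new output_dtype parameter (the packed input here is produced with fp32_to_mx4 from quantize_utils, purely for illustration):

    import torch
    from fbgemm_gpu.quantize_utils import fp32_to_mx4
    from fbgemm_gpu.triton.quantize import triton_dequantize_mx4

    packed = fp32_to_mx4(torch.randn(4096, device="cuda"), group_size=32)
    out = triton_dequantize_mx4(packed, group_size=32, output_dtype=torch.bfloat16)
    assert out.dtype == torch.bfloat16  # the kernel now stores via out.dtype.element_ty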
fbgemm_gpu/uvm.py CHANGED
@@ -11,7 +11,6 @@ from enum import Enum
  from typing import Optional
 
  import torch
-
  from fbgemm_gpu.enums import create_enums
 
  try:
{fbgemm_gpu_genai_nightly-2025.12.23.dist-info → fbgemm_gpu_genai_nightly-2026.1.14.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: fbgemm_gpu_genai_nightly
- Version: 2025.12.23
+ Version: 2026.1.14
  Home-page: https://github.com/pytorch/fbgemm
  Author: FBGEMM Team
  Author-email: packages@pytorch.org
{fbgemm_gpu_genai_nightly-2025.12.23.dist-info → fbgemm_gpu_genai_nightly-2026.1.14.dist-info}/RECORD CHANGED
@@ -1,29 +1,29 @@
  fbgemm_gpu/__init__.py,sha256=bL2dL7uYeXb1GvdjIDUTcLXLRGNfmnI4MQoE3-Gg5m8,6361
  fbgemm_gpu/asmjit.so,sha256=RxTYI8zY4PpIBRpSKT_-U7bRIVeTRohdtRFUmLNU1tQ,501728
- fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=GYeJ9pg-Wc9FokXVci_npDsL6UV18-pJXID2xzrJ9O8,2904
+ fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=Zst_OhYCBgbNMWfUADp1W1pGL1pT5t_8XX2q-QT50TI,2903
  fbgemm_gpu/enums.py,sha256=37ewGSfO1x7sO31ZkRiqV1yKuklfHXT5qZIxzeeGogo,755
- fbgemm_gpu/fbgemm.so,sha256=U864UANx-CVyFYk5ADawCd0uWRfntHaVcyl6AVty_3Q,5642616
+ fbgemm_gpu/fbgemm.so,sha256=pW04240G6WyXyaVJszQM0R7p8Jr1ZoyfblH5OJsmCyo,5675384
  fbgemm_gpu/metrics.py,sha256=TsurFLJf0nJvPDN7urWb4LMQlf5RgdWPTTTDO7S4wtI,5663
- fbgemm_gpu/permute_pooled_embedding_modules.py,sha256=vOXMYclaGnwSt0St_SOAlAe18kz6WjMyTeHnC9jLhcE,5130
+ fbgemm_gpu/permute_pooled_embedding_modules.py,sha256=dGQ8o3wN0yaLj8adx4oR6ncmkOH3PT7_zGZ8yYTnnk0,5129
  fbgemm_gpu/permute_pooled_embedding_modules_split.py,sha256=f3VJvH_kw9Ltd_DXtaf_PJPHmlmEWrQgzQ7MDkhh5Nw,2746
- fbgemm_gpu/quantize_comm.py,sha256=ZfXtRHfqpVpV6k2PDL6oTUkKYzopqAV2M6vavp_RLSM,12022
- fbgemm_gpu/quantize_utils.py,sha256=q8Aokk6nlHbXF6HcDBbhBCAGSZV4klM8uPF-MUFFtAw,8324
+ fbgemm_gpu/quantize_comm.py,sha256=yKKDJF_aMIYJG_22KG4BX1-AF_88ulgOXLvRO2a4RNI,11980
+ fbgemm_gpu/quantize_utils.py,sha256=sROgIdOrAjQT5_CmFafg40GMo0-pe4d56bAZTI57548,10243
  fbgemm_gpu/runtime_monitor.py,sha256=YXRUv6nXCsoTgh5_RzailTGvCYzwoYDb-eR4rlGwtaw,7619
- fbgemm_gpu/sparse_ops.py,sha256=_EJC1pAbNnAnVQQ5JBg4DAV2TboIj-4XQkiKMmg1vXI,50417
- fbgemm_gpu/split_embedding_configs.py,sha256=EuVFKIDrgRQpRC5mmB4Du6WftK5GXJvDue9_ezt_eBI,16575
- fbgemm_gpu/split_embedding_inference_converter.py,sha256=AghGW22MgMsdHzdwdPMPYDjgas5AE_estckY8rMgXVU,7056
+ fbgemm_gpu/sparse_ops.py,sha256=uCmtitnCJnDAIq1TCYvk24COyUnbvjIHVob37JgSDkg,50416
+ fbgemm_gpu/split_embedding_configs.py,sha256=awc9gAhCsRulXmQM089gxJwW0G3PeIw48gUesf13AKc,16574
+ fbgemm_gpu/split_embedding_inference_converter.py,sha256=rKILaM_C5Y-4Ypl1uHG4pZfiMZ-XlzjMwgik4X-wWeU,7055
  fbgemm_gpu/split_embedding_optimizer_ops.py,sha256=wXuGazClBMk62yL_r9udUIKaPgQP7SlkSb5ugB75wrQ,711
  fbgemm_gpu/split_embedding_utils.py,sha256=Gb40ZKeATxIKEKI3aVQMgDDBanNpKMc53Z43mnzdR_I,851
  fbgemm_gpu/split_table_batched_embeddings_ops.py,sha256=_MIp6uHYHLn4GxGdrGsfddfSsZ2Z9mjsYIrih3ncI1I,2339
  fbgemm_gpu/split_table_batched_embeddings_ops_common.py,sha256=eFxb_bDfBV8G76pmd-SxDXXXnqgbuGYOS4pSU8JS5dg,19295
  fbgemm_gpu/split_table_batched_embeddings_ops_inference.py,sha256=dGC85xjQiRUrequBibSf9oMAVHT5Q49zsVo2zW4n_88,81679
- fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=rNGMELM_xFIsdS_340PB7bsn9h_VjONq_JJG1SjHyvQ,188992
+ fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=kzTVo_o7ouCdPuGdziPSz3LZbEi3jI0aTLp4u7fuWRs,189023
  fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py,sha256=jofAN2UB_iSk53Id6MBvn9Bi3Qxw67IL0_VE_EHlw_Q,7593
  fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py,sha256=7qGkO8FARku38mFYl4Bc4qL8dS1wrfyorS9l1m5ZAVA,718
- fbgemm_gpu/tbe_input_multiplexer.py,sha256=TQjwkJ2JkOaQsMYuRdk9RbNa9759EPEtx8bYclChtZY,3063
- fbgemm_gpu/uvm.py,sha256=guNK8ZzR80jmv-CyRgEhxhVYhjz3R9d6tB8Hu1uWDUo,1047
+ fbgemm_gpu/tbe_input_multiplexer.py,sha256=MbZF8aZdm_kV-JRMaooeZrqlh6Pn5IuNkSXBXODp-LE,3062
+ fbgemm_gpu/uvm.py,sha256=V6LvMN7_Oc0YifB6AgwD37ymZzyZO9ydDWany1FoDf0,1046
  fbgemm_gpu/config/__init__.py,sha256=yN0KAneCICgF2BTfOYGsd0qU1PvZX_6msC6YHHZKLMg,292
- fbgemm_gpu/config/feature_list.py,sha256=iDOGr9nwTqUhWsqOefRIqIo1jwLSeSII4jGnLeU01kg,2359
+ fbgemm_gpu/config/feature_list.py,sha256=hhDNkkafd-Oetvuqv9ylBVTNM-lKPi029mpRqq-JZCA,2467
  fbgemm_gpu/docs/__init__.py,sha256=DR6hMSQrsZALfH2AnuJQ4Zq2CfBUUhMN8YjD6APjiAE,523
  fbgemm_gpu/docs/common.py,sha256=8ipXTwVb222X-aZ71O6n8fhxHCHPNhJEHMFiO7epcIs,273
  fbgemm_gpu/docs/examples.py,sha256=ZMN_6sL74LH_hrp2bF_hmg8gi29GhcgvwV3kCMjxkoE,2377
@@ -32,47 +32,47 @@ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65
  fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
  fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
  fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
- fbgemm_gpu/docs/target.genai.json.py,sha256=_wCZZFTZPnoCRTnmtXfpjGrdRZuWl7T171wr-JhtC-Y,79
+ fbgemm_gpu/docs/target.genai.json.py,sha256=ruseG1ciUe-WwzabUt1S-x9bEycf4pNFzFtE7-nSnuk,78
  fbgemm_gpu/experimental/example/__init__.py,sha256=OvJHZgWnycL1gWKyCXFJCTKuys3KAqx4iadjx3R-tBQ,723
- fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=ebm5zEzVjAj-j6DP1W41ZD2_UB4DrV-3xEq9iIAkCqg,190656
+ fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so,sha256=nZblYdz9XBtJD3YAP1GZkCLIryI1DDh5ri9rb0pR90Y,358592
  fbgemm_gpu/experimental/example/utils.py,sha256=Je__VkMlBMLOhh7NXOocOdvaa2gz9kl9Dkqeu25tpFA,562
  fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py,sha256=1CqUfzlYyXTvU-BNaUq4RZpLV-2lKAVCAHeJzSIZFWw,419
  fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py,sha256=R4VNZdPSgmRmwDfTt2CShED2SGUF6dCXSUW2C4LISgE,215713
- fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py,sha256=KrI-wZeIf4AqcjXo5XoxAUWzOeM5MHTvhKBKzbQ-Hc0,153178
- fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py,sha256=5ClZ-GDrx6q0uaqWOOmKGVANBQfAd1KFBt0LneFeZDY,42364
- fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py,sha256=SltbY_dsit5e7B8lDIB_VYPrEq0t9kckthj9mQaVNfA,7571
- fbgemm_gpu/experimental/gemm/triton_gemm/utils.py,sha256=rULXIpVaaRS3GKUZ1RHcWUrUyy0xMVREwS1SFShGgcw,4302
+ fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py,sha256=8I3qGh9lzio3Wt67X0Vt0aZvkqcecyO5mpktHRrl8jc,153174
+ fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py,sha256=OXvsVGtULWPYIyWXqdvRf_v-ZgeG5qiDCdmjbvmR2nE,42361
+ fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py,sha256=_7qbOVZAPvaTBAEFA5lvQIFtcgd-iCXAZ4KWlwEkcAE,7570
+ fbgemm_gpu/experimental/gemm/triton_gemm/utils.py,sha256=HR4sVGYswh_h3aSGUoZrN76WX01mTYCGDVMdCXt9Ruc,4301
  fbgemm_gpu/experimental/gen_ai/__init__.py,sha256=r3NlNCXuIh0pfKwKU5v14y6AZkpoIkKWbtzxSprgeKA,1713
- fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=eCZ54iUjb6Z4A1IJcGwiZVm2uwjF6yDSHl2ZEWlokC8,65238760
- fbgemm_gpu/experimental/gen_ai/quantize.py,sha256=KAljWSdN-1_c5DWfT-3MDxWLMULK49Yu36t6TmQI9Tw,12599
+ fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so,sha256=NBnThgRqEGzgO0DMhmF2fQcOGqQyEp8sAScSL-rFafs,229525416
+ fbgemm_gpu/experimental/gen_ai/quantize.py,sha256=EOfTJI2efb37hivgJd__xe8-YdWRzCBbGpXd4rSu-ck,12598
  fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py,sha256=-R_LxyHpdXMILU9TNuYoRisBCkfK0_VLyixefaeZf4g,1463
  fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py,sha256=gbhNU3mDTKJb3yt3inIDbiUjX_SG1oZfzgDygtHvMpk,10101
  fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py,sha256=fD39_WH7TfNCiP5Vl46ToX6PsLMLUFLhizT26Qe7TWg,17282
  fbgemm_gpu/experimental/gen_ai/bench/__init__.py,sha256=XpAK_eyqDSKeFC5J9KpnKtbZG07mrDh9d2j1LFKzr-8,404
- fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py,sha256=ApEyJOf_rdIo8V_EgvhZXBGNov8ITC_dnB95v8szulI,8515
+ fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py,sha256=aQbX9JzNeC_7Ka2EjJhShBWCgOmDg3bDYXWHhipYjps,8513
  fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py,sha256=K9Nib6D7xJbw1QwEVuCJrVyI1qs988moo3cieVKYuFY,12057
- fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py,sha256=BWl6t-4acbuRSEX2aVNDlFrSWZkqMWK2sI3VONaMd3Q,24047
- fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py,sha256=Kq4zSfxrzmSL75RWWdhPSTWq3AxClu_RO3onn5vzx8s,104983
+ fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py,sha256=7OiaaOvVIQJLNgxEeqW6t8ZkFtXRd7js-6ZAJ29zuRs,24044
+ fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py,sha256=bl23zzD4LanvTinNHMrlcCOxBvsRiTCnxBpB-Ed4yO0,104980
  fbgemm_gpu/experimental/gen_ai/moe/README.md,sha256=z9ybHmv4KFJ1drj5OByuFaOY0tRQwwiIW3Q22TB_2-k,904
- fbgemm_gpu/experimental/gen_ai/moe/__init__.py,sha256=lwSvff07yEav024B1XyfgW8r8hwNe--aEDywcO7rnbM,1905
- fbgemm_gpu/experimental/gen_ai/moe/activation.py,sha256=NiXhWyCNagI3P9N3N89iSX7xKuShdkq9DxEUAzoV6y0,7892
- fbgemm_gpu/experimental/gen_ai/moe/gather_scatter.py,sha256=8inrE4dkpfO9NFkrmXyXOCM262LMcTA3SQldxPoosT8,21044
- fbgemm_gpu/experimental/gen_ai/moe/layers.py,sha256=QLwoKjyYUHT5vXAvp_maRSxyruwGXaNURgtW8ataVyg,42693
+ fbgemm_gpu/experimental/gen_ai/moe/__init__.py,sha256=SeASfWgbuYq4p6_YIax-8KhRFaqyL5933dadUKRJNgo,1904
+ fbgemm_gpu/experimental/gen_ai/moe/activation.py,sha256=GeIcBKXpfvJWSn1P0nlbMqzuLYvlyyaZ8pQsSf1GHT0,7891
+ fbgemm_gpu/experimental/gen_ai/moe/gather_scatter.py,sha256=I_pNn2-VUD9_tiEvLlJMJJFP3orOepnfQ--75ytnfIo,21043
+ fbgemm_gpu/experimental/gen_ai/moe/layers.py,sha256=fuCPsRzM62zrfW5lHkjtpbyRi9YP3ytp0GgYmwweHa8,42691
  fbgemm_gpu/experimental/gen_ai/moe/shuffling.py,sha256=VDGEUdLZyj6mblJkAIReLICxU5BGnvmUjgZDP0VVqt8,11077
  fbgemm_gpu/quantize/__init__.py,sha256=pftciXHE7csekDFkl7Ui1AWglVMMnSrOO04mREnUdb0,921
- fbgemm_gpu/quantize/quantize_ops.py,sha256=25AIOv9n2UoxamMUaI6EK1Ur4gSHxbZIReHBtgOjjCs,2228
- fbgemm_gpu/sll/__init__.py,sha256=rgXh35-OFUE54E9gGBq3NGxouGvgMv2ccY2bWUTxONY,4191
+ fbgemm_gpu/quantize/quantize_ops.py,sha256=BhOS3PPKJ6-UFyKFYBB3qtRESSDmHo0UKl2zlXKeKhQ,2227
+ fbgemm_gpu/sll/__init__.py,sha256=dvFBTqA7Rw8bvZclAAH-l1eMxD9-haQ9lKYUnZXCmIM,4190
  fbgemm_gpu/sll/cpu/__init__.py,sha256=glsukNpXtf47VRIdBktILD-4CmVcf4621SGB55lT_ho,2692
  fbgemm_gpu/sll/cpu/cpu_sll.py,sha256=3zRsDZKCFPly1EZWl4LNB3ABJVy4JM4RVwmDuUeJZzc,27870
  fbgemm_gpu/sll/meta/__init__.py,sha256=2sMcD67XGsweBZ-UV2AEJmM4ELPsHeRAYED6kqfgAd4,1077
  fbgemm_gpu/sll/meta/meta_sll.py,sha256=Jk14EOW9VPFwawD7Bwky0R0A5rmbcLWMo52oH8J6Koc,8305
- fbgemm_gpu/sll/triton/__init__.py,sha256=dW_cEW0R8635sKLozsL88SP0Cch5QnBGvfnAmoqWMic,4109
+ fbgemm_gpu/sll/triton/__init__.py,sha256=ndvZ5OO81KP65HopJql91R9y_5fC88WnNIGYxCAVKwM,4099
  fbgemm_gpu/sll/triton/common.py,sha256=hISlX4Y-7FtGof-Xx4_B8-2vlF27F9t4p2qyLMUnJ8A,798
  fbgemm_gpu/sll/triton/triton_dense_jagged_cat_jagged_out.py,sha256=J9qOqjNJ72LUBqs-pGI9wrFzzzBpsZ5fzYjgfKc2YhY,1885
  fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py,sha256=M_AMJfW9D67xa4ezhmBViKsrt_n9EiX-Ki_drI5K3Bo,5925
  fbgemm_gpu/sll/triton/triton_jagged_bmm.py,sha256=QFhaIQc8g-TRHr7wjm-Wd-atNJS1fDDkImHXXB3v-gU,11789
  fbgemm_gpu/sll/triton/triton_jagged_bmm_jagged_out.py,sha256=hccLxsKoSZKiWid5P_yl-IVdBSXw1Rt0WeiRsjLD2Iw,13864
- fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py,sha256=_0hke_aaAdKQJpGUYX20NLss1_cXDIKxqblX4QQb7Io,1592
+ fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py,sha256=HpSn4BPFHAODTmXAsZUibAppL1x7qI50vpQhA_p98OE,1591
  fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_mul_jagged_out.py,sha256=9R7BOOe8SJiko1PgbiuHlFyPKtGaaCFSlZ1RaEQyICE,4198
  fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py,sha256=nebxJ7-1muDn-1oEuE46NbYbr6BcsPcuTOsQ49nCchI,22783
  fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py,sha256=po9Nx4uAGVu_YIZ9CWvrmzSwxDsnDuNAtnk9VR7-Ems,17750
@@ -82,22 +82,22 @@ fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py,sha256=nEo5I-b
  fbgemm_gpu/tbe/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
  fbgemm_gpu/tbe/bench/__init__.py,sha256=wgPBmxtQMmbA39cbQ2nO4PGAk5lXjFGjG8-9FoAXg34,1589
  fbgemm_gpu/tbe/bench/bench_config.py,sha256=xgtlGLCeZVW6jBYwkKsiQeCslCrWDgJbV2NLLwCRSn4,5452
- fbgemm_gpu/tbe/bench/bench_runs.py,sha256=vCblxjwvpzZ5oBxd6Z9fYy2KYmI--ySYlqRw_PLPX3k,23507
- fbgemm_gpu/tbe/bench/benchmark_click_interface.py,sha256=Ey-3Rx4jfzam4QnYs-pNIe-UJvgmoeeM0zZ4C5j5ZuU,6891
- fbgemm_gpu/tbe/bench/eeg_cli.py,sha256=DuF0pjy1wjrGaqsf1Bo9IP_q5nNx237cv9j80pG5aCk,3569
- fbgemm_gpu/tbe/bench/embedding_ops_common_config.py,sha256=CXwupJIhtDQiOedqSYhJyXbiMOikML5torrXb5hqt2Y,4967
+ fbgemm_gpu/tbe/bench/bench_runs.py,sha256=CBxO3Jad091cuD3ARr3UxRdGWrpWJkop44Tc17_OaeM,23506
+ fbgemm_gpu/tbe/bench/benchmark_click_interface.py,sha256=_e86jTLSWxYSkj8aiHm53kVzPJirDZWRl4S_Zd5FuOo,6917
+ fbgemm_gpu/tbe/bench/eeg_cli.py,sha256=n7_9L2dbb2F65BSABH50HRzRQFgujnPESjzuHSVjG_U,3568
+ fbgemm_gpu/tbe/bench/embedding_ops_common_config.py,sha256=WvoPvw-pY7gHQuJZlcU5RL87-pDcKKdMPH5wwUUOmAc,4966
  fbgemm_gpu/tbe/bench/eval_compression.py,sha256=ulFMaNZF2g_vfkXLWZSh02ibotg1zpTz3swVU484mzU,3486
  fbgemm_gpu/tbe/bench/reporter.py,sha256=ZK5RFolUmZEcsEaife270_iOdXAQD5EjTUkuxctnAbY,804
- fbgemm_gpu/tbe/bench/tbe_data_config.py,sha256=M0lK6m3S7Kl34prQcC3z8POr93FgX1oEUZ6MdVXZq5M,4794
- fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py,sha256=tgNB_3qWqWpjR86BhgRSU74bdW_ilRjtG61Cxmy1_Vk,10923
- fbgemm_gpu/tbe/bench/tbe_data_config_loader.py,sha256=MNddYzoRlu0mNhnsVVG57JN7pBAepfaRL7UCEzS2KoI,10007
+ fbgemm_gpu/tbe/bench/tbe_data_config.py,sha256=zV8gzA9wcpDqh8y9JC9mUCEt-_6IxrcJn3SlvpqMBo4,4823
+ fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py,sha256=A-iFmpRZkMMH2qgJuAoRplH5CyT1MUFTvgSDf1n6e4A,10921
+ fbgemm_gpu/tbe/bench/tbe_data_config_loader.py,sha256=2pz1HBhQ4UP6dHtxECdxWUhEb05wv6ZkG1u33Sy1EJA,10006
  fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py,sha256=sptdqcNE9JlgyIJ17neZaMxagKG469_ynX0mVx_JKBY,6090
- fbgemm_gpu/tbe/bench/utils.py,sha256=cq_6FJHlgZ5femAK6XKpj7nJ9jc03qXI16N1ht1CcLg,1721
+ fbgemm_gpu/tbe/bench/utils.py,sha256=kxc3mqsZKq_tjlCN65TPevuKt6JUvwZs9LN8lu8Pfds,1720
  fbgemm_gpu/tbe/cache/__init__.py,sha256=lrYwhvqX2eWN0vAPe89HYgMW_O1vccoOcoFHJ9cyM-s,398
  fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py,sha256=VmG9EennGcq2By8Tj8VkFsJG0oOCGw8EhlPo8-t--Fk,14604
  fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py,sha256=vZHj7KIe1DoJDy5eft29XtGg6I-tRx60tjKOcTHRAYI,1321
  fbgemm_gpu/tbe/ssd/__init__.py,sha256=wzfMT10cp_dqK2lrebC449hOdexBnizcf_98lA1NyHs,483
- fbgemm_gpu/tbe/ssd/common.py,sha256=1J8K7sTQswgCYWaVwF-ZdCJj7mNN6O9GI70AaZWzJGE,1044
+ fbgemm_gpu/tbe/ssd/common.py,sha256=QP9Cz2t3dxzSQ2P4x0R2ekQY2Dk1TzijqXBdMJ-uLkQ,1043
  fbgemm_gpu/tbe/ssd/inference.py,sha256=B_uX66ajGA9YKGlFa5TmGWs7b-b1RFigzwxmENZ9Oio,22816
  fbgemm_gpu/tbe/ssd/training.py,sha256=C6M3H_f8oWWRkC4R-BJED73au-Gl9SUVllxOoFSiDkI,212234
  fbgemm_gpu/tbe/ssd/utils/__init__.py,sha256=5DgmR2HA6NtmYh2ddkUgpDsZ6a7hF0DPedA1gMpdh18,250
@@ -111,7 +111,7 @@ fbgemm_gpu/tbe/utils/quantize.py,sha256=icN2MXnl5rNqtKhGKkjpelx5pYBMYUv-6CrghxeV
  fbgemm_gpu/tbe/utils/requests.py,sha256=rQkEoaUUWEYCQM-1K_Lxg1wPcyIVw8sbdaGFTpsaE5I,18040
  fbgemm_gpu/triton/__init__.py,sha256=kPn_Ye6J9DAzWtqi76KYGwfKSqw0IhqG3Bir5aUpkWM,658
  fbgemm_gpu/triton/common.py,sha256=wnkLd2a8fKpefymLL-LjNKEL4hDVSxFiF5g3aF8mzsw,2131
- fbgemm_gpu/triton/quantize.py,sha256=z3y74-DCbGcQDsO70b2jK_HQDIYC0UJ7IEG2vvMu0_Y,26816
+ fbgemm_gpu/triton/quantize.py,sha256=I0pxyfIx04zyq55x4Pvj-28Cb2ZeF-SGtFhAymFagkg,27073
  fbgemm_gpu/triton/quantize_ref.py,sha256=q4RBmFaqPVPELU52lbSgB0n26Aun7apeK7bRF2MWS80,11553
  fbgemm_gpu/triton/jagged/__init__.py,sha256=om0yhjuzKuE1UQakFMWHsXN4WNb8mvNkZtYofQ8hdn4,246
  fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py,sha256=F2eQWjkWMR5RWQ48oIr-8OU_CRZyLazDpT7DFrDWS6g,29871
@@ -121,8 +121,8 @@ fbgemm_gpu/utils/loader.py,sha256=1hCEhNvkflniH46fGcrguLeP1z-6uyOu2QFwqKU5CIM,99
  fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg6YE4,4193
  fbgemm_gpu/utils/writeback_util.py,sha256=PyVbHp1EuF-GKrJv_CTP6B50Z0oBblXKucf7Rhd6KKY,4614
  list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
- list_versions/cli_run.py,sha256=CChZoXQ-tiKaWboXAYlPVJ5w8K5zAKiKcncA087I1sc,4508
- fbgemm_gpu_genai_nightly-2025.12.23.dist-info/METADATA,sha256=f_OA5iQSJM23ogn2epFKT0jnDM5ggh7Xg6c7FtDT0ag,2657
- fbgemm_gpu_genai_nightly-2025.12.23.dist-info/WHEEL,sha256=V2Q6mQKbouIadCxoRjt9FQ9oKfi45-uZUcoc77zzs0M,108
- fbgemm_gpu_genai_nightly-2025.12.23.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
- fbgemm_gpu_genai_nightly-2025.12.23.dist-info/RECORD,,
+ list_versions/cli_run.py,sha256=BCRaJvjVFBFmD5WPdjC_yJwlLv1w_TYOe3eYlf_9ZMo,4506
+ fbgemm_gpu_genai_nightly-2026.1.14.dist-info/METADATA,sha256=oPv8amMA9l2QQ5sKkeeABPCccKVbYAZkVJX7314f1kY,2656
+ fbgemm_gpu_genai_nightly-2026.1.14.dist-info/WHEEL,sha256=V2Q6mQKbouIadCxoRjt9FQ9oKfi45-uZUcoc77zzs0M,108
+ fbgemm_gpu_genai_nightly-2026.1.14.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
+ fbgemm_gpu_genai_nightly-2026.1.14.dist-info/RECORD,,
list_versions/cli_run.py CHANGED
@@ -13,9 +13,7 @@ from datetime import datetime
  from typing import Union
 
  import click
-
  import pandas as pd
-
  import torch
 