fbgemm-gpu-genai-nightly 2025.8.2-cp313-cp313-manylinux_2_28_x86_64.whl → 2025.11.4-cp313-cp313-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. fbgemm_gpu/__init__.py +106 -19
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +2 -3
  4. fbgemm_gpu/config/feature_list.py +4 -1
  5. fbgemm_gpu/docs/sparse_ops.py +118 -0
  6. fbgemm_gpu/docs/target.genai.json.py +6 -0
  7. fbgemm_gpu/enums.py +3 -4
  8. fbgemm_gpu/experimental/example/__init__.py +0 -4
  9. fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
  10. fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py +0 -4
  11. fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py +277 -218
  12. fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +509 -433
  13. fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +157 -102
  14. fbgemm_gpu/experimental/gen_ai/__init__.py +12 -7
  15. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py +32 -0
  16. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +261 -0
  17. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +344 -0
  18. fbgemm_gpu/experimental/gen_ai/bench/__init__.py +0 -4
  19. fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py +1 -2
  20. fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py +15 -16
  21. fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py +250 -190
  22. fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +721 -129
  23. fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
  24. fbgemm_gpu/experimental/gen_ai/moe/__init__.py +0 -4
  25. fbgemm_gpu/experimental/gen_ai/moe/activation.py +2 -2
  26. fbgemm_gpu/experimental/gen_ai/moe/gather_scatter.py +24 -17
  27. fbgemm_gpu/experimental/gen_ai/moe/layers.py +6 -9
  28. fbgemm_gpu/experimental/gen_ai/moe/shuffling.py +82 -67
  29. fbgemm_gpu/experimental/gen_ai/quantize.py +6 -7
  30. fbgemm_gpu/fbgemm.so +0 -0
  31. fbgemm_gpu/permute_pooled_embedding_modules.py +4 -4
  32. fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
  33. fbgemm_gpu/quantize_comm.py +13 -6
  34. fbgemm_gpu/quantize_utils.py +29 -3
  35. fbgemm_gpu/runtime_monitor.py +9 -5
  36. fbgemm_gpu/sll/cpu/cpu_sll.py +6 -6
  37. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +1 -2
  38. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +3 -4
  39. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
  40. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
  41. fbgemm_gpu/sparse_ops.py +93 -53
  42. fbgemm_gpu/split_embedding_configs.py +98 -48
  43. fbgemm_gpu/split_embedding_inference_converter.py +4 -4
  44. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +101 -23
  45. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +37 -37
  46. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +528 -71
  47. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +2 -2
  48. fbgemm_gpu/tbe/bench/__init__.py +1 -0
  49. fbgemm_gpu/tbe/bench/bench_config.py +14 -3
  50. fbgemm_gpu/tbe/bench/bench_runs.py +155 -14
  51. fbgemm_gpu/tbe/bench/eeg_cli.py +2 -3
  52. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +2 -2
  53. fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
  54. fbgemm_gpu/tbe/bench/tbe_data_config.py +30 -185
  55. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +323 -0
  56. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +55 -3
  57. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +13 -8
  58. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +22 -19
  59. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -4
  60. fbgemm_gpu/tbe/ssd/inference.py +13 -13
  61. fbgemm_gpu/tbe/ssd/training.py +812 -174
  62. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +2 -2
  63. fbgemm_gpu/tbe/stats/bench_params_reporter.py +187 -44
  64. fbgemm_gpu/tbe/utils/offsets.py +3 -3
  65. fbgemm_gpu/tbe/utils/quantize.py +2 -2
  66. fbgemm_gpu/tbe/utils/requests.py +14 -14
  67. fbgemm_gpu/tbe_input_multiplexer.py +10 -10
  68. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
  69. fbgemm_gpu/utils/filestore.py +6 -2
  70. fbgemm_gpu/utils/torch_library.py +2 -2
  71. {fbgemm_gpu_genai_nightly-2025.8.2.dist-info → fbgemm_gpu_genai_nightly-2025.11.4.dist-info}/METADATA +1 -1
  72. fbgemm_gpu_genai_nightly-2025.11.4.dist-info/RECORD +127 -0
  73. list_versions/cli_run.py +5 -6
  74. fbgemm_gpu/docs/version.py +0 -11
  75. fbgemm_gpu/experimental/gen_ai/bench/ck_bf16_bench.py +0 -168
  76. fbgemm_gpu_genai_nightly-2025.8.2.dist-info/RECORD +0 -124
  77. {fbgemm_gpu_genai_nightly-2025.8.2.dist-info → fbgemm_gpu_genai_nightly-2025.11.4.dist-info}/WHEEL +0 -0
  78. {fbgemm_gpu_genai_nightly-2025.8.2.dist-info → fbgemm_gpu_genai_nightly-2025.11.4.dist-info}/top_level.txt +0 -0
fbgemm_gpu/__init__.py CHANGED
@@ -5,17 +5,100 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import json
 import logging
 import os
+import re
 
 import torch
 
+# Based on the FBGEMM-PyTorch compatibility table at
+# https://docs.pytorch.org/FBGEMM/general/Releases.html#fbgemm-releases-compatibility
+_fbgemm_torch_compat_table = {
+    "1.3": "2.8",
+    "1.2": "2.7",
+    "1.1": "2.6",
+    "1.0": "2.5",
+    "0.8": "2.4",
+    "0.7": "2.3",
+    "0.6": "2.2",
+    "0.5": "2.1",
+    "0.4": "2.0",
+}
+
+
+def _load_target_info(target: str) -> dict[str, str]:
+    try:
+        filepath = os.path.join(
+            os.path.dirname(__file__), "docs", f"target.{target}.json.py"
+        )
+        with open(filepath, "r") as file:
+            data = json.load(file)
+    except Exception:
+        data = {}
+
+    return data
+
 
-def _load_library(filename: str, no_throw: bool = False) -> None:
+def _load_library(filename: str, version: str, no_throw: bool = False) -> None:
     """Load a shared library from the given filename."""
+
+    # Check if the version of PyTorch is compatible with the version of FBGEMM
+    # that we are trying to load, and print a loud warning if not. This is
+    # useful for the OSS build, where we have a single FBGEMM library that is
+    # compatible with multiple versions of PyTorch.
+    #
+    # Based on: https://github.com/pytorch/ao/blob/main/torchao/__init__.py#L30
+
+    keys = [
+        key
+        for key in _fbgemm_torch_compat_table.keys()
+        if version.startswith(f"{key}.")
+    ]
+
+    if version == "INTERNAL" or "+git" in version:
+        # if FBGEMM version has "+git", assume it's locally built and we don't
+        # know anything about the PyTorch version used to build it
+        logging.info(
+            "FBGEMM version is INTERNAL or local, ignoring version compatibility check with PyTorch"
+        )
+
+    elif re.match(r"^\d{4}\.\d{1,2}\.\d{1,2}.*$", version):
+        # if FBGEMM version is a date, assume it's a nightly build and that we
+        # know what we're doing
+        logging.info(
+            "FBGEMM version is a nightly version, ignoring version compatibility check with PyTorch"
+        )
+
+    elif not keys:
+        logging.warning(
+            f"""
+            \033[33m
+            _fbgemm_torch_compat_table has no entry for {version} of FBGEMM;
+            cannot determine compatibility with PyTorch {torch.__version__}
+            \033[0m
+            """
+        )
+
+    elif str(torch.__version__) != _fbgemm_torch_compat_table[keys[0]]:
+        logging.warning(
+            f"""
+            \033[31m
+            FBGEMM_GPU version is {version}, which is not guaranteed to be
+            compatible with PyTorch {torch.__version__}; library loading might
+            crash!
+
+            Please refer to
+            https://docs.pytorch.org/FBGEMM/general/Releases.html#fbgemm-releases-compatibility
+            for the FBGEMM-PyTorch compatibility table.
+            \033[0m
+            """
+        )
+
     try:
         torch.ops.load_library(os.path.join(os.path.dirname(__file__), filename))
         logging.info(f"Successfully loaded: '{filename}'")
+
     except Exception as error:
         logging.error(f"Could not load the library '{filename}'!\n\n\n{error}\n\n\n")
         if not no_throw:
@@ -29,13 +112,16 @@ open_source: bool = True
 # Trigger the manual addition of docstrings to pybind11-generated operators
 import fbgemm_gpu.docs  # noqa: F401, E402
 
+
+__targets_infos__ = {
+    target: _load_target_info(target) for target in ["default", "genai", "hstu"]
+}
+__targets_infos__ = {k: v for (k, v) in __targets_infos__.items() if v}
+
 try:
-    # Export the version string from the version file auto-generated by setup.py
-    from fbgemm_gpu.docs.version import (  # noqa: F401, E402
-        __target__,
-        __variant__,
-        __version__,
-    )
+    __target__, __info__ = next(iter(__targets_infos__.items()))
+    __variant__ = __info__["variant"]
+    __version__ = __info__["version"]
 except Exception:
     __variant__: str = "INTERNAL"
     __version__: str = "INTERNAL"
@@ -76,18 +162,19 @@ libraries_to_load = {
     "genai": fbgemm_genai_libraries,
 }
 
-for library in libraries_to_load.get(__target__, []):
-    # NOTE: In all cases, we want to throw an error if we cannot load the
-    # library. However, this appears to break the OSS documentation build,
-    # where the Python documentation doesn't show up in the generated docs.
-    #
-    # To work around this problem, we introduce a fake build variant called
-    # `docs` and we only throw a library load error when the variant is not
-    # `docs`. For more information, see:
-    #
-    #   https://github.com/pytorch/FBGEMM/pull/3477
-    #   https://github.com/pytorch/FBGEMM/pull/3717
-    _load_library(f"{library}.so", __variant__ == "docs")
+for target, info in __targets_infos__.items():
+    for library in libraries_to_load.get(target, []):
+        # NOTE: In all cases, we want to throw an error if we cannot load the
+        # library. However, this appears to break the OSS documentation build,
+        # where the Python documentation doesn't show up in the generated docs.
+        #
+        # To work around this problem, we introduce a fake build variant called
+        # `docs` and we only throw a library load error when the variant is not
+        # `docs`. For more information, see:
+        #
+        #   https://github.com/pytorch/FBGEMM/pull/3477
+        #   https://github.com/pytorch/FBGEMM/pull/3717
+        _load_library(f"{library}.so", info["version"], info["variant"] == "docs")
 
 try:
     # Trigger meta operator registrations
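
For illustration, here is a minimal standalone sketch of how this new version gate classifies FBGEMM version strings. The `classify` helper and the abbreviated table are hypothetical; in the package, the checks run inline in `_load_library` above.

    import re

    # Abbreviated copy of _fbgemm_torch_compat_table for the sketch.
    compat_table = {"1.3": "2.8", "1.2": "2.7"}

    def classify(version: str) -> str:
        # Hypothetical helper mirroring the branch order in _load_library.
        if version == "INTERNAL" or "+git" in version:
            return "internal/local build: check skipped"
        if re.match(r"^\d{4}\.\d{1,2}\.\d{1,2}.*$", version):
            return "date-based nightly: check skipped"
        keys = [k for k in compat_table if version.startswith(f"{k}.")]
        if not keys:
            return "unknown release: compatibility undetermined"
        return f"release: expects PyTorch {compat_table[keys[0]]}"

    assert classify("2025.11.4") == "date-based nightly: check skipped"
    assert classify("1.3.0") == "release: expects PyTorch 2.8"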
fbgemm_gpu/asmjit.so CHANGED
Binary file
fbgemm_gpu/batched_unary_embeddings_ops.py CHANGED
@@ -9,7 +9,6 @@
 
 
 from math import sqrt
-from typing import List
 
 import torch
 
@@ -22,7 +21,7 @@ except Exception:
     load_torch_module("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
 
 
-def wrap_weight_to_parameter(weights: List[torch.Tensor]) -> List[torch.Tensor]:
+def wrap_weight_to_parameter(weights: list[torch.Tensor]) -> list[torch.Tensor]:
     for i, v in enumerate(weights):
         if not isinstance(v, torch.nn.Parameter):
             weights[i] = torch.nn.Parameter(v)
@@ -31,7 +30,7 @@ def wrap_weight_to_parameter(weights: List[torch.Tensor]) -> List[torch.Tensor]:
 
 class BatchedUnaryEmbeddingBag(torch.nn.Module):
     # pyre-fixme[3]: Return type must be annotated.
-    def __init__(self, num_tasks: int, hash_sizes: List[int], long_index: bool = False):
+    def __init__(self, num_tasks: int, hash_sizes: list[int], long_index: bool = False):
        super().__init__()
        self.num_tasks = num_tasks
        self.hash_sizes = hash_sizes
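
The annotation changes in this file (and in enums.py further down) replace `typing.List`/`typing.Tuple` with the builtin generics standardized by PEP 585, available since Python 3.9. A quick sketch of the runtime equivalence:

    import typing
    from typing import List

    def old_style(xs: List[int]) -> int:
        return sum(xs)

    def new_style(xs: list[int]) -> int:
        return sum(xs)

    # Both spellings resolve to the same origin type, so behavior is unchanged.
    assert typing.get_origin(List[int]) is list
    assert typing.get_origin(list[int]) is list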
fbgemm_gpu/config/feature_list.py CHANGED
@@ -11,7 +11,7 @@ from enum import auto, Enum
 import torch
 
 try:
-    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:config_cpp")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:config_cpp_torch_op")
 except Exception:
     import fbgemm_gpu  # noqa F401
 
@@ -60,6 +60,9 @@ class FeatureGateName(Enum):
     # Enable bounds_check_indices_v2
     BOUNDS_CHECK_INDICES_V2 = auto()
 
+    # Enable TBE input parameters extraction
+    TBE_REPORT_INPUT_PARAMS = auto()
+
     def is_enabled(self) -> bool:
         return FeatureGate.is_enabled(self)
 
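
A minimal usage sketch for the new gate, following the `is_enabled` accessor shown above; it assumes `FeatureGateName` is importable from `fbgemm_gpu.config`, as with the existing gates:

    from fbgemm_gpu.config import FeatureGateName

    # True only when the TBE input-parameter reporting gate is enabled.
    if FeatureGateName.TBE_REPORT_INPUT_PARAMS.is_enabled():
        ...  # extract and report TBE input parameters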
fbgemm_gpu/docs/sparse_ops.py CHANGED
@@ -496,3 +496,121 @@ Return:
         None)
     """,
 )
+
+add_docs(
+    torch.ops.fbgemm.block_bucketize_sparse_features_2d_weights,
+    """
+block_bucketize_sparse_features_2d_weights(lengths, indices, bucketize_pos, sequence, block_sizes, my_size, weights, weights_dim=1, batch_size_per_feature=None, max_B=-1, block_bucketize_pos=None, keep_orig_idx=False, total_num_blocks=None, keep_orig_idx_per_feature=None) -> Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor], Optional[Tensor]]
+
+Preprocess sparse features by partitioning them into multiple buckets, with
+support for 2D weights. Every feature is split into the same number of
+buckets, but the bucket sizes (widths) can differ across features, and the
+bucket sizes within a single feature can also be non-uniform.
+
+This function is similar to block_bucketize_sparse_features but supports 2D
+weights, where each index can have multiple weight values associated with it.
+
+Args:
+    lengths (Tensor): The lengths of the sparse features. The tensor contains
+        the lengths of each sample in a batch and each feature. Shape is `B *
+        T` where `B` is the batch size and `T` is the number of features
+
+    indices (Tensor): The sparse data. Only integer types are supported. Shape
+        is the sum of `lengths`
+
+    bucketize_pos (bool): If True, return the original relative indices within
+        a sample. For example, `indices = [9, 8, 2, 1, 0, 8, 9]` and `lengths =
+        [3, 4]`. The original relative indices within a sample for the indices
+        are `[0, 1, 2, 0, 1, 2, 3]`
+
+    sequence (bool): If True, return the new index positions in the original
+        index positions (the tensor is called `unbucketize_permute_data`)
+
+    block_sizes (Tensor): This tensor is used for the case where the bucket
+        size within a feature is uniform (i.e., when
+        `block_bucketize_pos=None`). The tensor contains bucket sizes (i.e.,
+        bucket widths) for each feature. `block_sizes[t]` represents the
+        bucket size of feature `t`. Shape is the number of features
+
+    my_size (int): The number of buckets for each feature. Note that every
+        feature has the same number of buckets
+
+    weights (Tensor): A float tensor that will be bucketized the same way as
+        `indices`. This tensor must have shape `[indices.size(0), weights_dim]`
+        where `weights_dim` is the dimension of the weight values for each
+        index
+
+    weights_dim (int = 1): The dimension of the weight values for each index.
+        This parameter is only used when `weights` is not None
+
+    batch_size_per_feature (Optional[Tensor] = None): An optional tensor that
+        contains batch sizes for different features. If not None, batch sizes
+        are not uniform among features. Otherwise, the operator will assume
+        that the batch size is uniform and infer it from the `lengths` and
+        `block_sizes` tensors
+
+    max_B (int = -1): The max batch size. Must be set if
+        `batch_size_per_feature` is not None
+
+    block_bucketize_pos (Optional[List[Tensor]] = None): The input is used for
+        non-uniform bucket sizes within a feature. `block_bucketize_pos` is a
+        list of tensors. Each tensor contains the range offsets of buckets for
+        each feature. These range offsets are equivalent to the complete
+        cumulative sum of the bucket sizes. For example, `[0, 4, 20]`
+        represents two buckets. The first bucket size is `(4 - 0) = 4`, and
+        the second bucket size is `(20 - 4) = 16`. The length of
+        `block_bucketize_pos` must be equal to the number of features
+
+    keep_orig_idx (bool = False): If True, return original indices instead of
+        the relative indices within each bucket
+
+    total_num_blocks (Optional[Tensor] = None): An optional tensor that
+        contains the number of logical buckets (aka blocks) within a given
+        feature. This is useful for applications where the number of buckets
+        is larger than the number of physical GPUs, which is common in cases
+        where we scale the number of GPUs up or down but want to maintain the
+        same numerical behavior
+
+    keep_orig_idx_per_feature (Optional[Tensor] = None): An optional tensor
+        that indicates, per feature, whether to keep original indices. If not
+        None, the operator uses this tensor to determine whether to keep
+        original indices for each feature; if None, it falls back to
+        `keep_orig_idx`
+
+Return:
+    A tuple of tensors containing
+
+    (1) Bucketized lengths. Shape is `lengths.numel() * my_size`.
+
+    (2) Bucketized indices. Same shape as `indices`.
+
+    (3) Bucketized weights or None if `weights` is None. Shape is
+        `[indices.size(0), weights_dim]`.
+
+    (4) Bucketized positions or None if `bucketize_pos=False`. Same shape as
+        `indices`.
+
+    (5) `unbucketize_permute` or None if `sequence=False`. Same shape as
+        `indices`
+
+**Example**:
+
+    >>> # Generate input example. Batch size = 2. Number of features = 4
+    >>> lengths = torch.tensor([0, 2, 1, 3, 2, 3, 3, 1], dtype=torch.int, device="cuda")
+    >>> indices = torch.tensor([3, 4, 15, 11, 28, 29, 1, 10, 11, 12, 13, 11, 22, 20, 20], dtype=torch.int, device="cuda")
+    >>> block_sizes = torch.tensor([[5, 15, 10, 20]], dtype=torch.int, device="cuda")
+    >>> my_size = 2  # Number of buckets
+    >>> weights_dim = 3  # Dimension of weight values for each index
+    >>> weights = torch.randn(indices.size(0), weights_dim, dtype=torch.float, device="cuda")
+    >>> # Invoke with keep_orig_idx=False, bucketize_pos=False, and
+    >>> # sequence=False
+    >>> torch.ops.fbgemm.block_bucketize_sparse_features_2d_weights(
+    >>>     lengths,
+    >>>     indices,
+    >>>     bucketize_pos=False,
+    >>>     sequence=False,
+    >>>     block_sizes=block_sizes,
+    >>>     my_size=my_size,
+    >>>     weights=weights,
+    >>>     weights_dim=weights_dim,
+    >>>     keep_orig_idx=False)
+    """,
+)
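
Reading the docstring's example as a worked case: `lengths` has `B * T = 2 * 4 = 8` elements and `my_size = 2`, so the returned bucketized lengths tensor has `8 * 2 = 16` elements; the bucketized indices keep the input's 15 elements; and, since `weights_dim = 3`, the bucketized weights come back with shape `[15, 3]`.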
fbgemm_gpu/docs/target.genai.json.py ADDED
@@ -0,0 +1,6 @@
+
+{
+    "version": "2025.11.4",
+    "target": "genai",
+    "variant": "cuda"
+}
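
Despite the `.json.py` suffix, the payload is plain JSON, which is why `_load_target_info` in `__init__.py` above can read it with `json.load`. A hypothetical standalone reader along the same lines (the path and helper name are illustrative):

    import json
    import os

    def read_target_info(pkg_dir: str, target: str) -> dict[str, str]:
        # Mirrors _load_target_info: a missing or unparsable file yields {}.
        filepath = os.path.join(pkg_dir, "docs", f"target.{target}.json.py")
        try:
            with open(filepath, "r") as file:
                return json.load(file)
        except Exception:
            return {}

    info = read_target_info("/path/to/site-packages/fbgemm_gpu", "genai")
    # With the file above, info would be:
    #   {"version": "2025.11.4", "target": "genai", "variant": "cuda"}
    # from which __version__ and __variant__ are then taken.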
fbgemm_gpu/enums.py CHANGED
@@ -8,14 +8,13 @@
 # pyre-strict
 
 import enum
-import typing
-from typing import Any, Callable, List, Tuple
+from typing import Any, Callable
 
 
 # Create enums in given namespace with information from query_op
 def create_enums(
-    namespace: typing.Dict[str, Any],
-    query_op: Callable[[], List[Tuple[str, List[Tuple[str, int]]]]],
+    namespace: dict[str, Any],
+    query_op: Callable[[], list[tuple[str, list[tuple[str, int]]]]],
 ) -> None:
     for enum_name, items in query_op():
         # Create matching python enumeration
fbgemm_gpu/experimental/example/__init__.py CHANGED
@@ -15,10 +15,6 @@ try:
     # pyre-ignore[21]
     # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
     from fbgemm_gpu import open_source
-
-    # pyre-ignore[21]
-    # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
-    from fbgemm_gpu.docs.version import __version__  # noqa: F401
 except Exception:
     open_source: bool = False
 
fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py CHANGED
@@ -11,9 +11,5 @@ try:
     # pyre-ignore[21]
     # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
     from fbgemm_gpu import open_source
-
-    # pyre-ignore[21]
-    # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
-    from fbgemm_gpu.docs.version import __version__  # noqa: F401
 except Exception:
     open_source: bool = False