fbgemm-gpu-genai-nightly 2025.12.19-cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (127)
  1. fbgemm_gpu/__init__.py +186 -0
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +87 -0
  4. fbgemm_gpu/config/__init__.py +9 -0
  5. fbgemm_gpu/config/feature_list.py +88 -0
  6. fbgemm_gpu/docs/__init__.py +18 -0
  7. fbgemm_gpu/docs/common.py +9 -0
  8. fbgemm_gpu/docs/examples.py +73 -0
  9. fbgemm_gpu/docs/jagged_tensor_ops.py +259 -0
  10. fbgemm_gpu/docs/merge_pooled_embedding_ops.py +36 -0
  11. fbgemm_gpu/docs/permute_pooled_embedding_ops.py +108 -0
  12. fbgemm_gpu/docs/quantize_ops.py +41 -0
  13. fbgemm_gpu/docs/sparse_ops.py +616 -0
  14. fbgemm_gpu/docs/target.genai.json.py +6 -0
  15. fbgemm_gpu/enums.py +24 -0
  16. fbgemm_gpu/experimental/example/__init__.py +29 -0
  17. fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
  18. fbgemm_gpu/experimental/example/utils.py +20 -0
  19. fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py +15 -0
  20. fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py +5654 -0
  21. fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +4422 -0
  22. fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +1192 -0
  23. fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py +232 -0
  24. fbgemm_gpu/experimental/gemm/triton_gemm/utils.py +130 -0
  25. fbgemm_gpu/experimental/gen_ai/__init__.py +56 -0
  26. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py +46 -0
  27. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +333 -0
  28. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +552 -0
  29. fbgemm_gpu/experimental/gen_ai/bench/__init__.py +13 -0
  30. fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py +257 -0
  31. fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py +348 -0
  32. fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py +707 -0
  33. fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +3483 -0
  34. fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
  35. fbgemm_gpu/experimental/gen_ai/moe/README.md +15 -0
  36. fbgemm_gpu/experimental/gen_ai/moe/__init__.py +66 -0
  37. fbgemm_gpu/experimental/gen_ai/moe/activation.py +292 -0
  38. fbgemm_gpu/experimental/gen_ai/moe/gather_scatter.py +740 -0
  39. fbgemm_gpu/experimental/gen_ai/moe/layers.py +1272 -0
  40. fbgemm_gpu/experimental/gen_ai/moe/shuffling.py +421 -0
  41. fbgemm_gpu/experimental/gen_ai/quantize.py +307 -0
  42. fbgemm_gpu/fbgemm.so +0 -0
  43. fbgemm_gpu/metrics.py +160 -0
  44. fbgemm_gpu/permute_pooled_embedding_modules.py +142 -0
  45. fbgemm_gpu/permute_pooled_embedding_modules_split.py +85 -0
  46. fbgemm_gpu/quantize/__init__.py +43 -0
  47. fbgemm_gpu/quantize/quantize_ops.py +64 -0
  48. fbgemm_gpu/quantize_comm.py +315 -0
  49. fbgemm_gpu/quantize_utils.py +246 -0
  50. fbgemm_gpu/runtime_monitor.py +237 -0
  51. fbgemm_gpu/sll/__init__.py +189 -0
  52. fbgemm_gpu/sll/cpu/__init__.py +80 -0
  53. fbgemm_gpu/sll/cpu/cpu_sll.py +1001 -0
  54. fbgemm_gpu/sll/meta/__init__.py +35 -0
  55. fbgemm_gpu/sll/meta/meta_sll.py +337 -0
  56. fbgemm_gpu/sll/triton/__init__.py +127 -0
  57. fbgemm_gpu/sll/triton/common.py +38 -0
  58. fbgemm_gpu/sll/triton/triton_dense_jagged_cat_jagged_out.py +72 -0
  59. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +221 -0
  60. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +418 -0
  61. fbgemm_gpu/sll/triton/triton_jagged_bmm_jagged_out.py +553 -0
  62. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +52 -0
  63. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_mul_jagged_out.py +175 -0
  64. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +861 -0
  65. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +667 -0
  66. fbgemm_gpu/sll/triton/triton_jagged_self_substraction_jagged_out.py +73 -0
  67. fbgemm_gpu/sll/triton/triton_jagged_softmax.py +463 -0
  68. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +751 -0
  69. fbgemm_gpu/sparse_ops.py +1455 -0
  70. fbgemm_gpu/split_embedding_configs.py +452 -0
  71. fbgemm_gpu/split_embedding_inference_converter.py +175 -0
  72. fbgemm_gpu/split_embedding_optimizer_ops.py +21 -0
  73. fbgemm_gpu/split_embedding_utils.py +29 -0
  74. fbgemm_gpu/split_table_batched_embeddings_ops.py +73 -0
  75. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +484 -0
  76. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +2042 -0
  77. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +4600 -0
  78. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +146 -0
  79. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +26 -0
  80. fbgemm_gpu/tbe/__init__.py +6 -0
  81. fbgemm_gpu/tbe/bench/__init__.py +55 -0
  82. fbgemm_gpu/tbe/bench/bench_config.py +156 -0
  83. fbgemm_gpu/tbe/bench/bench_runs.py +709 -0
  84. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +187 -0
  85. fbgemm_gpu/tbe/bench/eeg_cli.py +137 -0
  86. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +149 -0
  87. fbgemm_gpu/tbe/bench/eval_compression.py +119 -0
  88. fbgemm_gpu/tbe/bench/reporter.py +35 -0
  89. fbgemm_gpu/tbe/bench/tbe_data_config.py +137 -0
  90. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +323 -0
  91. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +289 -0
  92. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +170 -0
  93. fbgemm_gpu/tbe/bench/utils.py +48 -0
  94. fbgemm_gpu/tbe/cache/__init__.py +11 -0
  95. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
  96. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +48 -0
  97. fbgemm_gpu/tbe/ssd/__init__.py +15 -0
  98. fbgemm_gpu/tbe/ssd/common.py +46 -0
  99. fbgemm_gpu/tbe/ssd/inference.py +586 -0
  100. fbgemm_gpu/tbe/ssd/training.py +4908 -0
  101. fbgemm_gpu/tbe/ssd/utils/__init__.py +7 -0
  102. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +273 -0
  103. fbgemm_gpu/tbe/stats/__init__.py +10 -0
  104. fbgemm_gpu/tbe/stats/bench_params_reporter.py +339 -0
  105. fbgemm_gpu/tbe/utils/__init__.py +13 -0
  106. fbgemm_gpu/tbe/utils/common.py +42 -0
  107. fbgemm_gpu/tbe/utils/offsets.py +65 -0
  108. fbgemm_gpu/tbe/utils/quantize.py +251 -0
  109. fbgemm_gpu/tbe/utils/requests.py +556 -0
  110. fbgemm_gpu/tbe_input_multiplexer.py +108 -0
  111. fbgemm_gpu/triton/__init__.py +22 -0
  112. fbgemm_gpu/triton/common.py +77 -0
  113. fbgemm_gpu/triton/jagged/__init__.py +8 -0
  114. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +824 -0
  115. fbgemm_gpu/triton/quantize.py +647 -0
  116. fbgemm_gpu/triton/quantize_ref.py +286 -0
  117. fbgemm_gpu/utils/__init__.py +11 -0
  118. fbgemm_gpu/utils/filestore.py +211 -0
  119. fbgemm_gpu/utils/loader.py +36 -0
  120. fbgemm_gpu/utils/torch_library.py +132 -0
  121. fbgemm_gpu/uvm.py +40 -0
  122. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/METADATA +62 -0
  123. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/RECORD +127 -0
  124. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/WHEEL +5 -0
  125. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/top_level.txt +2 -0
  126. list_versions/__init__.py +12 -0
  127. list_versions/cli_run.py +163 -0
fbgemm_gpu/docs/sparse_ops.py ADDED
@@ -0,0 +1,616 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import torch
+
+ from .common import add_docs
+
+ add_docs(
+ torch.ops.fbgemm.permute_2D_sparse_data,
+ """
+ permute_2D_sparse_data(permute, lengths, values, weights=None, permuted_lengths_sum=None) -> Tuple[Tensor, Tensor, Optional[Tensor]]
+
+ Permute 2D sparse data along the first dimension (dim 0). Note that 2D
+ refers to the number of dense dimensions. The input data is actually 3D
+ where the first two dimensions are dense and the last dimension is
+ jagged (sparse). The permutation can cover fewer or more rows than the
+ input, with or without repetitions.
+
+ Args:
+ permute (Tensor): A 1D-tensor that describes how data is permuted along dim
+ 0. `permute[i]` indicates that data at position `permute[i]` is moved
+ to position `i`. The length of this tensor is the total amount of data
+ in dim 0 to be permuted. The values in `permute` must be >= 0 and <
+ `lengths.shape[0]`
+
+ lengths (Tensor): A 2D-tensor that contains jagged shapes corresponding to
+ the other two dense dimensions. For example, in the case of the
+ embedding input, the 3D shape is (num features, batch size, bag size).
+ `lengths[t][b]` represents the bag size of feature `t` and sample `b`.
+
+ values (Tensor): A 1D-input-tensor to be permuted. The length of this
+ tensor must be equal to `lengths.sum()`. This tensor can be of any data
+ type.
+
+ weights (Optional[Tensor] = None): An optional 1D-float-tensor. It must
+ have the same length as `values`. It will be permuted the same way as
+ `values`.
+
+ permuted_lengths_sum (Optional[int] = None): An optional value that
+ represents the total number of elements in the permuted data (output
+ shape). If not provided, the operator will compute this data, which may
+ cause a device-host synchronization (if using GPU). Thus, it is
+ recommended to supply this value to avoid the synchronization.
+
+ Returns:
+ A tuple of permuted lengths, permuted indices and permuted weights
+
+ **Example:**
+
+ >>> permute = torch.tensor([1, 0, 2], dtype=torch.int32, device="cuda")
+ >>> lengths = torch.tensor([[2, 3, 4, 5], [1, 2, 4, 8], [0, 3, 2, 3]], dtype=torch.int64, device="cuda")
+ >>> values = torch.randint(low=0, high=100, size=(lengths.sum().item(),), dtype=torch.int64, device="cuda")
+ >>> print(values)
+ tensor([29, 12, 61, 98, 56, 94, 5, 89, 65, 48, 71, 54, 40, 33, 78, 68, 42, 21,
+ 60, 51, 15, 47, 48, 68, 52, 19, 38, 30, 38, 97, 97, 98, 18, 40, 42, 89,
+ 66], device='cuda:0')
+ >>> torch.ops.fbgemm.permute_2D_sparse_data(permute, lengths, values)
+ (tensor([[1, 2, 4, 8],
+ [2, 3, 4, 5],
+ [0, 3, 2, 3]], device='cuda:0'),
+ tensor([78, 68, 42, 21, 60, 51, 15, 47, 48, 68, 52, 19, 38, 30, 38, 29, 12, 61,
+ 98, 56, 94, 5, 89, 65, 48, 71, 54, 40, 33, 97, 97, 98, 18, 40, 42, 89,
+ 66], device='cuda:0'),
+ None)
+ """,
+ )
+
+ add_docs(
+ torch.ops.fbgemm.permute_1D_sparse_data,
+ """
+ permute_1D_sparse_data(permute, lengths, values, weights=None, permuted_lengths_sum=None) -> Tuple[Tensor, Tensor, Optional[Tensor]]
+
+ Permute 1D sparse data. Note that 1D refers to the number of dense dimensions.
+ The input data is actually 2D where the first dimension is dense and the second
+ dimension is jagged (sparse). The permutation can cover fewer or more rows than
+ the input, with or without repetitions.
+
+ Args:
+ permute (Tensor): A 1D-tensor that describes how data is permuted along dim
+ 0. `permute[i]` indicates that data at position `permute[i]` is moved
+ to position `i`. The length of this tensor is the total amount of data
+ in dim 0 to be permuted. The values in `permute` must be >= 0 and <
+ `lengths.numel()`
+
+ lengths (Tensor): A 1D-tensor that contains jagged shapes corresponding to
+ the other dense dimension. `lengths[i]` represents the jagged shape of
+ data at position `i` in dim 0
+
+ values (Tensor): A 1D-input-tensor to be permuted. The length of this
+ tensor must be equal to `lengths.sum()`. This tensor can be of any data
+ type.
+
+ weights (Optional[Tensor] = None): An optional 1D-float-tensor. It must
+ have the same length as `values`. It will be permuted the same way as
+ `values`.
+
+ permuted_lengths_sum (Optional[int] = None): An optional value that
+ represents the total number of elements in the permuted data (output
+ shape). If not provided, the operator will compute this data, which may
+ cause a device-host synchronization (if using GPU). Thus, it is
+ recommended to supply this value to avoid the synchronization.
+
+ Returns:
+ A tuple of permuted lengths, permuted indices and permuted weights
+
+ **Example:**
+ >>> permute = torch.tensor([1, 0, 3, 0], dtype=torch.int32, device="cuda")
+ >>> lengths = torch.tensor([2, 3, 4, 5], dtype=torch.int64, device="cuda")
+ >>> values = torch.randint(low=0, high=100, size=(lengths.sum().item(),), dtype=torch.int64, device="cuda")
+ >>> print(values)
+ tensor([ 1, 76, 24, 84, 94, 25, 15, 23, 31, 46, 9, 23, 34, 3],
+ device='cuda:0')
+ >>> torch.ops.fbgemm.permute_1D_sparse_data(permute, lengths, values)
+ (tensor([3, 2, 5, 2], device='cuda:0'),
+ tensor([24, 84, 94, 1, 76, 46, 9, 23, 34, 3, 1, 76], device='cuda:0'),
+ None)
+ """,
+ )
+
+ add_docs(
+ torch.ops.fbgemm.expand_into_jagged_permute,
+ """
+ expand_into_jagged_permute(permute, input_offset, output_offset, output_size) -> Tensor
+
+ Expand the sparse data permute index from the feature dimension to the batch
+ dimension, for cases where the sparse features have different batch sizes
+ across ranks.
+
+ The op expands the permute from feature level to batch level by contiguously
+ mapping each bag of its corresponding features to the position the batch sits
+ on after feature permute. The op automatically derives the feature and batch
+ offset arrays needed to compute the output permute.
+
+ Args:
+ permute (Tensor): The feature level permute index.
+
+ input_offset (Tensor): The exclusive offsets of feature-level length.
+
+ output_offsets (Tensor): The exclusive offsets of feature-level permuted
+ length.
+
+ output_size (int): The number of elements in the output tensor
+
+ Returns:
+ The output permute, which follows the formula
+
+ >>> output_permute[feature_offset[permute[feature]] + batch] <- bag_offset[batch]
+ """,
+ )
+
+ add_docs(
+ torch.ops.fbgemm.asynchronous_complete_cumsum,
+ """
+ asynchronous_complete_cumsum(t_in) -> Tensor
+
+ Compute the complete cumulative sum. The GPU operator is non-blocking and
+ asynchronous; the CPU operator is blocking.
+
+ Args:
+ t_in (Tensor): An input tensor
+
+ Returns:
+ The complete cumulative sum of `t_in`. Shape is `t_in.numel() + 1`
+
+ **Example:**
+
+ >>> t_in = torch.tensor([7, 8, 2, 1, 0, 9, 4], dtype=torch.int64, device="cuda")
+ >>> torch.ops.fbgemm.asynchronous_complete_cumsum(t_in)
+ tensor([ 0, 7, 15, 17, 18, 18, 27, 31], device='cuda:0')
+ """,
+ )
+
+ add_docs(
+ torch.ops.fbgemm.offsets_range,
+ """
+ offsets_range(offsets, range_size) -> Tensor
+
+ Generate an integer sequence from 0 up to `(offsets[i+1] - offsets[i])` for
+ every `i`, where `0 <= i < offsets.numel()`
+
+ Args:
+ offsets (Tensor): The offsets (complete cumulative sum values)
+
+ range_size (int): The output size (the total sum)
+
+ Returns:
+ A tensor that contains the offsets range
+
+ **Example:**
+ >>> # Generate example inputs
+ >>> lengths = torch.tensor([3, 4, 1, 9, 3, 7], dtype=torch.int64, device="cuda")
+ >>> offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
+ >>> range_size = offsets[-1].item()
+ >>> print(range_size)
+ 27
+ >>> offsets = offsets[:-1]
+ >>> print(offsets)
+ tensor([ 0, 3, 7, 8, 17, 20], device='cuda:0')
+ >>> # Invoke
+ >>> torch.ops.fbgemm.offsets_range(offsets, range_size)
+ tensor([0, 1, 2, 0, 1, 2, 3, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 0, 1, 2, 3,
+ 4, 5, 6], device='cuda:0')
+ """,
+ )
+
+ add_docs(
+ torch.ops.fbgemm.segment_sum_csr,
+ """
+ segment_sum_csr(batch_size, csr_seg, values) -> Tensor
+
+ Sum values within each segment of the given CSR data, where each row has the
+ same number of non-zero elements.
+
+ Args:
+ batch_size (int): The row stride (number of non-zero elements in each row)
+
+ csr_seg (Tensor): The complete cumulative sum of segment lengths. A segment
+ length is the number of rows within each segment. The shape of the
+ `csr_seg` tensor is `num_segments + 1` where `num_segments` is the
+ number of segments.
+
+ values (Tensor): The values tensor to be segment summed. The number of
+ elements in the tensor must be a multiple of `batch_size`
+
+ Returns:
+ A tensor containing the segment sum results. Shape is the number of
+ segments.
+
+ **Example:**
+
+ >>> batch_size = 2
+ >>> # Randomize inputs
+ >>> lengths = torch.tensor([3, 4, 1], dtype=torch.int, device="cuda")
+ >>> offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
+ >>> print(offsets)
+ tensor([0, 3, 7, 8], device='cuda:0', dtype=torch.int32)
+ >>> values = torch.randn(lengths.sum().item() * batch_size, dtype=torch.float32, device="cuda")
+ >>> print(values)
+ tensor([-2.8642e-01, 1.6451e+00, 1.1322e-01, 1.7335e+00, -8.4700e-02,
+ -1.2756e+00, 1.1206e+00, 9.6385e-01, 6.2122e-02, 1.3104e-03,
+ 2.2667e-01, 2.3113e+00, -1.1948e+00, -1.5463e-01, -1.0031e+00,
+ -3.5531e-01], device='cuda:0')
+ >>> # Invoke
+ >>> torch.ops.fbgemm.segment_sum_csr(batch_size, offsets, values)
+ tensor([ 1.8451, 3.3365, -1.3584], device='cuda:0')
+ """,
+ )
+
+ add_docs(
+ torch.ops.fbgemm.keyed_jagged_index_select_dim1,
+ """
+ keyed_jagged_index_select_dim1(values, lengths, offsets, indices, batch_size, weights=None, selected_lengths_sum=None) -> List[Tensor]
+
+ Perform an index select operation on the batch dimension (dim 1) of the given
+ keyed jagged tensor (KJT) input. The same samples in the batch of every key
+ will be selected. Note that each KJT has 3 dimensions: (`num_keys`, `batch_size`,
+ jagged dim), where `num_keys` is the number of keys, and `batch_size` is the
+ batch size. This operator is similar to a permute operator.
+
+ Args:
+ values (Tensor): The KJT values tensor which contains concatenated data of
+ every key
+
+ lengths (Tensor): The KJT lengths tensor which contains the jagged shapes
+ of every key (dim 0) and sample (dim 1). Shape is `num_keys *
+ batch_size`
+
+ offsets (Tensor): The KJT offsets tensor which is the complete cumulative
+ sum of `lengths`. Shape is `num_keys * batch_size + 1`
+
+ indices (Tensor): The indices to select, i.e., samples in the batch to
+ select. The values of `indices` must be >= 0 and < `batch_size`
+
+ batch_size (int): The batch size (dim 1 of KJT)
+
+ weights (Optional[Tensor] = None): An optional float tensor which will be
+ selected the same way as `values`. Thus, it must have the same shape as
+ `values`
+
+ selected_lengths_sum (Optional[int] = None): An optional value that
+ represents the total number of elements in the index select data
+ (output shape). If not provided, the operator will compute this data,
+ which may cause a device-host synchronization (if using GPU). Thus, it
+ is recommended to supply this value to avoid the synchronization.
+
+ Returns:
+ The index-selected KJT (as a list of values, lengths, and weights if
+ `weights` is not None)
+
+ **Example:**
+
+ >>> num_keys = 2
+ >>> batch_size = 4
+ >>> output_size = 3
+ >>> # Randomize inputs
+ >>> lengths = torch.randint(low=0, high=10, size=(batch_size * num_keys,), dtype=torch.int64, device="cuda")
+ >>> print(lengths)
+ tensor([8, 5, 1, 4, 2, 7, 5, 9], device='cuda:0')
+ >>> offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
+ >>> print(offsets)
+ tensor([ 0, 8, 13, 14, 18, 20, 27, 32, 41], device='cuda:0')
+ >>> indices = torch.randint(low=0, high=batch_size, size=(output_size,), dtype=torch.int64, device="cuda")
+ >>> print(indices)
+ tensor([3, 3, 1], device='cuda:0')
+ >>> # Use torch.arange instead of torch.randn to simplify the example
+ >>> values = torch.arange(lengths.sum().item(), dtype=torch.float32, device="cuda")
+ >>> print(values)
+ tensor([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13.,
+ 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27.,
+ 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40.],
+ device='cuda:0')
+ >>> # Invoke. Output = (output, lengths)
+ >>> torch.ops.fbgemm.keyed_jagged_index_select_dim1(values, lengths, offsets, indices, batch_size)
+ [tensor([14., 15., 16., 17., 14., 15., 16., 17., 8., 9., 10., 11., 12., 32.,
+ 33., 34., 35., 36., 37., 38., 39., 40., 32., 33., 34., 35., 36., 37.,
+ 38., 39., 40., 20., 21., 22., 23., 24., 25., 26.], device='cuda:0'),
+ tensor([4, 4, 5, 9, 9, 7], device='cuda:0')]
+ """,
+ )
+
+ add_docs(
+ torch.ops.fbgemm.block_bucketize_sparse_features,
+ """
+ block_bucketize_sparse_features(lengths, indices, bucketize_pos, sequence, block_sizes, my_size, weights=None, batch_size_per_feature=None, max_B=-1, block_bucketize_pos=None, keep_orig_idx=False, total_num_blocks=None, keep_orig_idx_per_feature=None) -> Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor], Optional[Tensor]]
+
+ Preprocess sparse features by partitioning them into multiple buckets. Every
+ feature is split into the same number of buckets, but the bucket sizes
+ (widths) for the different features can be different. Moreover, the bucket
+ sizes within each feature can be different.
+
+ Args:
+ lengths (Tensor): The lengths of the sparse features. The tensor contains
+ the lengths of each sample in a batch and each feature. Shape is `B *
+ T` where `B` is the batch size and `T` is the number of features
+
+ indices (Tensor): The sparse data. Only integer types are supported. Shape
+ is the sum of `lengths`
+
+ bucketize_pos (bool): If True, return the original relative indices within
+ a sample. For example, `indices = [9, 8, 2, 1, 0, 8, 9]` and `lengths =
+ [3, 4]`. The original relative indices within a sample for the indices
+ are `[0, 1, 2, 0, 1, 2, 3]`
+
+ sequence (bool): If True, return the new indices positions in the original
+ indices positions (the tensor is called `unbucketize_permute_data`).
+
+ block_sizes (Tensor): This tensor is used for the case where the bucket
+ size within a feature is uniform (i.e., when
+ `block_bucketize_pos=None`). The tensor contains bucket sizes (i.e.,
+ bucket widths) for each feature. `block_sizes[t]` represents the
+ bucket size of feature `t`. Shape is the number of features.
+
+ my_size (int): The number of buckets for each feature. Note that every
+ feature has the same number of buckets.
+
+ weights (Optional[Tensor] = None): An optional float tensor that will be
+ bucketized the same way as `indices`. This tensor must have the same
+ shape as `indices`
+
+ batch_size_per_feature (Optional[Tensor] = None): An optional tensor that
+ contains batch sizes for different features. If not None, batch sizes
+ are not uniform among features. Otherwise, the operator will assume
+ that the batch size is uniform and infer it from the `lengths` and
+ `block_sizes` tensors
+
+ max_B (int = -1): The max batch size. Must be set if
+ `batch_size_per_feature` is not None
+
+ block_bucketize_pos (Optional[List[Tensor]] = None): The input is used for
+ non-uniform bucket sizes within a feature. `block_bucketize_pos` is a
+ list of tensors. Each tensor contains the range offsets of buckets for
+ each feature. These range offsets are equivalent to the complete
+ cumulative sum of the bucket sizes. For example, `[0, 4, 20]` represents
+ two buckets. The first bucket size is `(4 - 0) = 4`, and the second
+ bucket size is `(20 - 4) = 16`. The length of `block_bucketize_pos`
+ must be equal to the number of features.
+
+ keep_orig_idx (bool = False): If True, return original indices instead of
+ the relative indices within each bucket
+
+ total_num_blocks (Optional[torch.Tensor] = None): An optional tensor that
+ contains the number of logical buckets (aka blocks) within a given
+ feature. This is useful for applications where the number of buckets
+ is more than the number of physical GPUs, which is common in cases
+ where we scale up/down the number of GPUs but want to maintain the
+ same numerical behavior.
+
+ keep_orig_idx_per_feature (Optional[Tensor] = None): An optional tensor that
+ contains whether to keep original indices for each feature. If not None,
+ the operator will use this tensor to determine whether to keep original
+ indices for each feature. If None, the operator falls back to
+ `keep_orig_idx`
+
+ Returns:
+ A tuple of tensors containing
+
+ (1) Bucketized lengths. Shape is `lengths.numel() * my_size`.
+
+ (2) Bucketized indices. Same shape as `indices`.
+
+ (3) Bucketized weights or None if `weights` is None. Same shape as
+ `indices`.
+
+ (4) Bucketized positions or None if `bucketize_pos=False`. Same shape as
+ `indices`.
+
+ (5) `unbucketize_permute` or None if `sequence=False`. Same shape as
+ `indices`
+
+ **Example**:
+
+ >>> # Generate input example. Batch size = 2. Number of features = 4
+ >>> lengths = torch.tensor([0, 2, 1, 3, 2, 3, 3, 1], dtype=torch.int, device="cuda")
+ >>> indices = torch.tensor([3, 4, 15, 11, 28, 29, 1, 10, 11, 12, 13, 11, 22, 20, 20], dtype=torch.int, device="cuda")
+ >>> block_sizes = torch.tensor([[5, 15, 10, 20]], dtype=torch.int, device="cuda")
+ >>> my_size = 2 # Number of buckets
+ >>> # Invoke with keep_orig_idx=False, bucketize_pos=False, and
+ >>> # sequence=False
+ >>> torch.ops.fbgemm.block_bucketize_sparse_features(
+ >>> lengths,
+ >>> indices,
+ >>> bucketize_pos=False,
+ >>> sequence=False,
+ >>> block_sizes=block_sizes,
+ >>> my_size=my_size,
+ >>> keep_orig_idx=False)
+ >>> # The first 8 values in the returned lengths are the lengths for bucket
+ >>> # 0 and the rest are the lengths for bucket 1
+ (tensor([0, 2, 0, 1, 1, 0, 1, 0, 0, 0, 1, 2, 1, 3, 2, 1], device='cuda:0',
+ dtype=torch.int32),
+ tensor([ 3, 4, 11, 1, 11, 0, 13, 14, 0, 1, 2, 3, 2, 0, 0],
+ device='cuda:0', dtype=torch.int32),
+ None,
+ None,
+ None)
+ >>> # Invoke with keep_orig_idx=True, bucketize_pos=True, and
+ >>> # sequence=True
+ >>> torch.ops.fbgemm.block_bucketize_sparse_features(
+ >>> lengths,
+ >>> indices,
+ >>> bucketize_pos=True,
+ >>> sequence=True,
+ >>> block_sizes=block_sizes,
+ >>> my_size=my_size,
+ >>> keep_orig_idx=True)
+ (tensor([0, 2, 0, 1, 1, 0, 1, 0, 0, 0, 1, 2, 1, 3, 2, 1], device='cuda:0',
+ dtype=torch.int32),
+ tensor([ 3, 4, 11, 1, 11, 15, 28, 29, 10, 11, 12, 13, 22, 20, 20],
+ device='cuda:0', dtype=torch.int32),
+ None,
+ tensor([0, 1, 0, 0, 0, 0, 1, 2, 1, 0, 1, 2, 1, 2, 0], device='cuda:0',
+ dtype=torch.int32),
+ tensor([ 0, 1, 5, 2, 6, 7, 3, 8, 9, 10, 11, 4, 12, 13, 14],
+ device='cuda:0', dtype=torch.int32))
+ >>> # Invoke with keep_orig_idx_per_feature
+ >>> keep_orig_idx_per_feature = torch.tensor([False, True, False, True], dtype=torch.bool)
+ >>> torch.ops.fbgemm.block_bucketize_sparse_features(
+ >>> lengths,
+ >>> indices,
+ >>> bucketize_pos=False,
+ >>> sequence=False,
+ >>> block_sizes=block_sizes,
+ >>> my_size=my_size,
+ >>> keep_orig_idx=False,
+ >>> keep_orig_idx_per_feature=keep_orig_idx_per_feature)
+ (tensor([0, 0, 0, 1, 1, 1, 2, 1, 0, 2, 1, 2, 1, 2, 1, 0], device='cuda:0',
+ dtype=torch.int32),
+ tensor([ 3, 4, 11, 1, 11, 15, 28, 29, 0, 1, 2, 3, 22, 20, 20],
+ device='cuda:0', dtype=torch.int32),
+ None,
+ None,
+ None)
+ >>> # Invoke with block_bucketize_pos
+ >>> block_bucketize_pos = [
+ >>> torch.tensor([0, 2, 8], dtype=torch.int),
+ >>> torch.tensor([0, 5, 10], dtype=torch.int),
+ >>> torch.tensor([0, 7, 12], dtype=torch.int),
+ >>> torch.tensor([0, 2, 16], dtype=torch.int),
+ >>> ]
+ >>> torch.ops.fbgemm.block_bucketize_sparse_features(
+ >>> lengths,
+ >>> indices,
+ >>> bucketize_pos=False,
+ >>> sequence=False,
+ >>> block_sizes=block_sizes,
+ >>> my_size=my_size,
+ >>> block_bucketize_pos=block_bucketize_pos,
+ >>> keep_orig_idx=False)
+ (tensor([0, 0, 0, 1, 1, 1, 2, 1, 0, 2, 1, 2, 1, 2, 1, 0], device='cuda:0',
+ dtype=torch.int32),
+ tensor([14, 1, 6, 11, 10, 10, 1, 2, 7, 5, 14, 3, 4, 6, 9],
+ device='cuda:0', dtype=torch.int32),
+ None,
+ None,
+ None)
+ """,
+ )
+
+ add_docs(
+ torch.ops.fbgemm.block_bucketize_sparse_features_2d_weights,
+ """
+ block_bucketize_sparse_features_2d_weights(lengths, indices, bucketize_pos, sequence, block_sizes, my_size, weights, weights_dim=1, batch_size_per_feature=None, max_B=-1, block_bucketize_pos=None, keep_orig_idx=False, total_num_blocks=None, keep_orig_idx_per_feature=None) -> Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor], Optional[Tensor]]
+
+ Preprocess sparse features by partitioning them into multiple buckets, with
+ support for 2D weights. Every feature is split into the same number of
+ buckets, but the bucket sizes (widths) for the different features can be
+ different. Moreover, the bucket sizes within each feature can be different.
+
+ This function is similar to block_bucketize_sparse_features but supports 2D weights,
+ where each index can have multiple weight values associated with it.
+
+ Args:
+ lengths (Tensor): The lengths of the sparse features. The tensor contains
+ the lengths of each sample in a batch and each feature. Shape is `B *
+ T` where `B` is the batch size and `T` is the number of features
+
+ indices (Tensor): The sparse data. Only integer types are supported. Shape
+ is the sum of `lengths`
+
+ bucketize_pos (bool): If True, return the original relative indices within
+ a sample. For example, `indices = [9, 8, 2, 1, 0, 8, 9]` and `lengths =
+ [3, 4]`. The original relative indices within a sample for the indices
+ are `[0, 1, 2, 0, 1, 2, 3]`
+
+ sequence (bool): If True, return the new indices positions in the original
+ indices positions (the tensor is called `unbucketize_permute_data`).
+
+ block_sizes (Tensor): This tensor is used for the case where the bucket
+ size within a feature is uniform (i.e., when
+ `block_bucketize_pos=None`). The tensor contains bucket sizes (i.e.,
+ bucket widths) for each feature. `block_sizes[t]` represents the
+ bucket size of feature `t`. Shape is the number of features.
+
+ my_size (int): The number of buckets for each feature. Note that every
+ feature has the same number of buckets.
+
+ weights (Tensor): A float tensor that will be bucketized the same way as
+ `indices`. This tensor must have shape `[indices.size(0), weights_dim]`
+ where `weights_dim` is the dimension of the weight values for each index.
+
+ weights_dim (int = 1): The dimension of the weight values for each index.
+ This parameter is only used when `weights` is not None.
+
+ batch_size_per_feature (Optional[Tensor] = None): An optional tensor that
+ contains batch sizes for different features. If not None, batch sizes
+ are not uniform among features. Otherwise, the operator will assume
+ that the batch size is uniform and infer it from the `lengths` and
+ `block_sizes` tensors
+
+ max_B (int = -1): The max batch size. Must be set if
+ `batch_size_per_feature` is not None
+
+ block_bucketize_pos (Optional[List[Tensor]] = None): The input is used for
+ non-uniform bucket sizes within a feature. `block_bucketize_pos` is a
+ list of tensors. Each tensor contains the range offsets of buckets for
+ each feature. These range offsets are equivalent to the complete
+ cumulative sum of the bucket sizes. For example, `[0, 4, 20]` represents
+ two buckets. The first bucket size is `(4 - 0) = 4`, and the second
+ bucket size is `(20 - 4) = 16`. The length of `block_bucketize_pos`
+ must be equal to the number of features.
+
+ keep_orig_idx (bool = False): If True, return original indices instead of
+ the relative indices within each bucket
+
+ total_num_blocks (Optional[torch.Tensor] = None): An optional tensor that
+ contains the number of logical buckets (aka blocks) within a given
+ feature. This is useful for applications where the number of buckets
+ is more than the number of physical GPUs, which is common in cases
+ where we scale up/down the number of GPUs but want to maintain the
+ same numerical behavior.
+
+ keep_orig_idx_per_feature (Optional[Tensor] = None): An optional tensor that
+ contains whether to keep original indices for each feature. If not None,
+ the operator will use this tensor to determine whether to keep original
+ indices for each feature. If None, the operator falls back to
+ `keep_orig_idx`
+
+ Returns:
+ A tuple of tensors containing
+
+ (1) Bucketized lengths. Shape is `lengths.numel() * my_size`.
+
+ (2) Bucketized indices. Same shape as `indices`.
+
+ (3) Bucketized weights or None if `weights` is None. Shape is
+ `[indices.size(0), weights_dim]`.
+
+ (4) Bucketized positions or None if `bucketize_pos=False`. Same shape as
+ `indices`.
+
+ (5) `unbucketize_permute` or None if `sequence=False`. Same shape as
+ `indices`
+
+ **Example**:
+
+ >>> # Generate input example. Batch size = 2. Number of features = 4
+ >>> lengths = torch.tensor([0, 2, 1, 3, 2, 3, 3, 1], dtype=torch.int, device="cuda")
+ >>> indices = torch.tensor([3, 4, 15, 11, 28, 29, 1, 10, 11, 12, 13, 11, 22, 20, 20], dtype=torch.int, device="cuda")
+ >>> block_sizes = torch.tensor([[5, 15, 10, 20]], dtype=torch.int, device="cuda")
+ >>> my_size = 2 # Number of buckets
+ >>> weights_dim = 3 # Dimension of weight values for each index
+ >>> weights = torch.randn(indices.size(0), weights_dim, dtype=torch.float, device="cuda")
+ >>> # Invoke with keep_orig_idx=False, bucketize_pos=False, and
+ >>> # sequence=False
+ >>> torch.ops.fbgemm.block_bucketize_sparse_features_2d_weights(
+ >>> lengths,
+ >>> indices,
+ >>> bucketize_pos=False,
+ >>> sequence=False,
+ >>> block_sizes=block_sizes,
+ >>> my_size=my_size,
+ >>> weights=weights,
+ >>> weights_dim=weights_dim,
+ >>> keep_orig_idx=False)
+ """,
+ )
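The `expand_into_jagged_permute` docstring above is the only one in this file without a worked example. Below is a minimal illustrative sketch of how its inputs might be constructed, using the `asynchronous_complete_cumsum` operator documented in the same file. The tensor values, dtypes, and variable names are assumptions for illustration only; they are not part of the packaged file.

import torch

# Feature-level permutation (illustrative): feature 2 first, then 0, then 1.
permute = torch.tensor([2, 0, 1], dtype=torch.int64, device="cuda")

# Per-feature batch sizes; these can differ across features/ranks.
batch_sizes = torch.tensor([2, 3, 4], dtype=torch.int64, device="cuda")

# Exclusive offsets of the feature-level lengths before and after the permute.
input_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(batch_sizes)
output_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(batch_sizes[permute])

# Total number of bags in the expanded (batch-level) permute.
output_size = int(batch_sizes.sum().item())

# Expand the feature-level permute into a bag-level (jagged) permute.
jagged_permute = torch.ops.fbgemm.expand_into_jagged_permute(
    permute, input_offsets, output_offsets, output_size
)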
fbgemm_gpu/docs/target.genai.json.py ADDED
@@ -0,0 +1,6 @@
+
+ {
+ "version": "2025.12.19",
+ "target": "genai",
+ "variant": "cuda"
+ }
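The six-line file above is plain JSON build metadata (a leading blank line followed by a JSON object). As a hedged illustration only, such a blob can be read back with the standard library; the path below is the in-package location from the file list, and how fbgemm_gpu itself consumes this file is not shown in this diff.

import json
from pathlib import Path

# Hypothetical read of the metadata shown above; json.loads tolerates the
# leading blank line.
metadata = json.loads(Path("fbgemm_gpu/docs/target.genai.json.py").read_text())

assert metadata["target"] == "genai"
print(metadata["version"], metadata["variant"])  # 2025.12.19 cuda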
fbgemm_gpu/enums.py ADDED
@@ -0,0 +1,24 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import enum
+ from typing import Any, Callable
+
+
+ # Create enums in given namespace with information from query_op
+ def create_enums(
+     namespace: dict[str, Any],
+     query_op: Callable[[], list[tuple[str, list[tuple[str, int]]]]],
+ ) -> None:
+     for enum_name, items in query_op():
+         # Create matching python enumeration
+         # pyre-fixme[19]: Expected 1 positional argument.
+         new_enum = enum.Enum(enum_name, items)
+         # and store it in the module
+         namespace[enum_name] = new_enum
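`create_enums` above has no usage example. The sketch below shows one plausible way to call it, matching the `query_op` signature declared in the file; the enum name and members are made up for illustration, and the real query ops are provided by the compiled fbgemm_gpu extensions.

from typing import Any

from fbgemm_gpu.enums import create_enums


def fake_query_op() -> list[tuple[str, list[tuple[str, int]]]]:
    # Each entry is (enum class name, [(member name, member value), ...]),
    # matching the Callable annotation on create_enums.
    return [("SparseType", [("FP32", 0), ("FP16", 1), ("INT8", 2)])]


namespace: dict[str, Any] = {}
create_enums(namespace, fake_query_op)
print(namespace["SparseType"].FP16.value)  # 1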
fbgemm_gpu/experimental/example/__init__.py ADDED
@@ -0,0 +1,29 @@
+ #!/usr/bin/env python3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+
+ import os
+
+ import torch
+
+ try:
+     # pyre-ignore[21]
+     # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
+     from fbgemm_gpu import open_source
+ except Exception:
+     open_source: bool = False
+
+ # pyre-ignore[16]
+ if open_source:
+     torch.ops.load_library(
+         os.path.join(os.path.dirname(__file__), "fbgemm_gpu_experimental_example_py.so")
+     )
+ else:
+     torch.ops.load_library(
+         "//deeplearning/fbgemm/fbgemm_gpu/experimental/example:example_ops_cuda"
+     )
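The try/except and `torch.ops.load_library` pattern above is how the wheel wires its compiled extensions into `torch.ops` (the top-level `fbgemm_gpu/__init__.py` presumably does the same for `fbgemm.so` and the gen_ai library). As a rough usage sketch, and assuming this wheel is installed with a working PyTorch, importing the package is enough to make the operators documented earlier callable:

import torch

import fbgemm_gpu  # noqa: F401  # importing the package loads the bundled .so files

# asynchronous_complete_cumsum is documented in fbgemm_gpu/docs/sparse_ops.py above;
# the CPU variant is blocking, so no CUDA device is needed for this check.
t = torch.tensor([7, 8, 2, 1], dtype=torch.int64)
print(torch.ops.fbgemm.asynchronous_complete_cumsum(t))  # tensor([ 0,  7, 15, 17, 18])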
+ )