fbgemm-gpu-genai-nightly 2025.12.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fbgemm-gpu-genai-nightly might be problematic. Click here for more details.

Files changed (127) hide show
  1. fbgemm_gpu/__init__.py +186 -0
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +87 -0
  4. fbgemm_gpu/config/__init__.py +9 -0
  5. fbgemm_gpu/config/feature_list.py +88 -0
  6. fbgemm_gpu/docs/__init__.py +18 -0
  7. fbgemm_gpu/docs/common.py +9 -0
  8. fbgemm_gpu/docs/examples.py +73 -0
  9. fbgemm_gpu/docs/jagged_tensor_ops.py +259 -0
  10. fbgemm_gpu/docs/merge_pooled_embedding_ops.py +36 -0
  11. fbgemm_gpu/docs/permute_pooled_embedding_ops.py +108 -0
  12. fbgemm_gpu/docs/quantize_ops.py +41 -0
  13. fbgemm_gpu/docs/sparse_ops.py +616 -0
  14. fbgemm_gpu/docs/target.genai.json.py +6 -0
  15. fbgemm_gpu/enums.py +24 -0
  16. fbgemm_gpu/experimental/example/__init__.py +29 -0
  17. fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
  18. fbgemm_gpu/experimental/example/utils.py +20 -0
  19. fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py +15 -0
  20. fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py +5654 -0
  21. fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +4422 -0
  22. fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +1192 -0
  23. fbgemm_gpu/experimental/gemm/triton_gemm/matmul_perf_model.py +232 -0
  24. fbgemm_gpu/experimental/gemm/triton_gemm/utils.py +130 -0
  25. fbgemm_gpu/experimental/gen_ai/__init__.py +56 -0
  26. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py +46 -0
  27. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +333 -0
  28. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +552 -0
  29. fbgemm_gpu/experimental/gen_ai/bench/__init__.py +13 -0
  30. fbgemm_gpu/experimental/gen_ai/bench/comm_bench.py +257 -0
  31. fbgemm_gpu/experimental/gen_ai/bench/gather_scatter_bench.py +348 -0
  32. fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py +707 -0
  33. fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +3483 -0
  34. fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
  35. fbgemm_gpu/experimental/gen_ai/moe/README.md +15 -0
  36. fbgemm_gpu/experimental/gen_ai/moe/__init__.py +66 -0
  37. fbgemm_gpu/experimental/gen_ai/moe/activation.py +292 -0
  38. fbgemm_gpu/experimental/gen_ai/moe/gather_scatter.py +740 -0
  39. fbgemm_gpu/experimental/gen_ai/moe/layers.py +1272 -0
  40. fbgemm_gpu/experimental/gen_ai/moe/shuffling.py +421 -0
  41. fbgemm_gpu/experimental/gen_ai/quantize.py +307 -0
  42. fbgemm_gpu/fbgemm.so +0 -0
  43. fbgemm_gpu/metrics.py +160 -0
  44. fbgemm_gpu/permute_pooled_embedding_modules.py +142 -0
  45. fbgemm_gpu/permute_pooled_embedding_modules_split.py +85 -0
  46. fbgemm_gpu/quantize/__init__.py +43 -0
  47. fbgemm_gpu/quantize/quantize_ops.py +64 -0
  48. fbgemm_gpu/quantize_comm.py +315 -0
  49. fbgemm_gpu/quantize_utils.py +246 -0
  50. fbgemm_gpu/runtime_monitor.py +237 -0
  51. fbgemm_gpu/sll/__init__.py +189 -0
  52. fbgemm_gpu/sll/cpu/__init__.py +80 -0
  53. fbgemm_gpu/sll/cpu/cpu_sll.py +1001 -0
  54. fbgemm_gpu/sll/meta/__init__.py +35 -0
  55. fbgemm_gpu/sll/meta/meta_sll.py +337 -0
  56. fbgemm_gpu/sll/triton/__init__.py +127 -0
  57. fbgemm_gpu/sll/triton/common.py +38 -0
  58. fbgemm_gpu/sll/triton/triton_dense_jagged_cat_jagged_out.py +72 -0
  59. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +221 -0
  60. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +418 -0
  61. fbgemm_gpu/sll/triton/triton_jagged_bmm_jagged_out.py +553 -0
  62. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +52 -0
  63. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_mul_jagged_out.py +175 -0
  64. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +861 -0
  65. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +667 -0
  66. fbgemm_gpu/sll/triton/triton_jagged_self_substraction_jagged_out.py +73 -0
  67. fbgemm_gpu/sll/triton/triton_jagged_softmax.py +463 -0
  68. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +751 -0
  69. fbgemm_gpu/sparse_ops.py +1455 -0
  70. fbgemm_gpu/split_embedding_configs.py +452 -0
  71. fbgemm_gpu/split_embedding_inference_converter.py +175 -0
  72. fbgemm_gpu/split_embedding_optimizer_ops.py +21 -0
  73. fbgemm_gpu/split_embedding_utils.py +29 -0
  74. fbgemm_gpu/split_table_batched_embeddings_ops.py +73 -0
  75. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +484 -0
  76. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +2042 -0
  77. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +4600 -0
  78. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +146 -0
  79. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +26 -0
  80. fbgemm_gpu/tbe/__init__.py +6 -0
  81. fbgemm_gpu/tbe/bench/__init__.py +55 -0
  82. fbgemm_gpu/tbe/bench/bench_config.py +156 -0
  83. fbgemm_gpu/tbe/bench/bench_runs.py +709 -0
  84. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +187 -0
  85. fbgemm_gpu/tbe/bench/eeg_cli.py +137 -0
  86. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +149 -0
  87. fbgemm_gpu/tbe/bench/eval_compression.py +119 -0
  88. fbgemm_gpu/tbe/bench/reporter.py +35 -0
  89. fbgemm_gpu/tbe/bench/tbe_data_config.py +137 -0
  90. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +323 -0
  91. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +289 -0
  92. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +170 -0
  93. fbgemm_gpu/tbe/bench/utils.py +48 -0
  94. fbgemm_gpu/tbe/cache/__init__.py +11 -0
  95. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +385 -0
  96. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +48 -0
  97. fbgemm_gpu/tbe/ssd/__init__.py +15 -0
  98. fbgemm_gpu/tbe/ssd/common.py +46 -0
  99. fbgemm_gpu/tbe/ssd/inference.py +586 -0
  100. fbgemm_gpu/tbe/ssd/training.py +4908 -0
  101. fbgemm_gpu/tbe/ssd/utils/__init__.py +7 -0
  102. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +273 -0
  103. fbgemm_gpu/tbe/stats/__init__.py +10 -0
  104. fbgemm_gpu/tbe/stats/bench_params_reporter.py +339 -0
  105. fbgemm_gpu/tbe/utils/__init__.py +13 -0
  106. fbgemm_gpu/tbe/utils/common.py +42 -0
  107. fbgemm_gpu/tbe/utils/offsets.py +65 -0
  108. fbgemm_gpu/tbe/utils/quantize.py +251 -0
  109. fbgemm_gpu/tbe/utils/requests.py +556 -0
  110. fbgemm_gpu/tbe_input_multiplexer.py +108 -0
  111. fbgemm_gpu/triton/__init__.py +22 -0
  112. fbgemm_gpu/triton/common.py +77 -0
  113. fbgemm_gpu/triton/jagged/__init__.py +8 -0
  114. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +824 -0
  115. fbgemm_gpu/triton/quantize.py +647 -0
  116. fbgemm_gpu/triton/quantize_ref.py +286 -0
  117. fbgemm_gpu/utils/__init__.py +11 -0
  118. fbgemm_gpu/utils/filestore.py +211 -0
  119. fbgemm_gpu/utils/loader.py +36 -0
  120. fbgemm_gpu/utils/torch_library.py +132 -0
  121. fbgemm_gpu/uvm.py +40 -0
  122. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/METADATA +62 -0
  123. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/RECORD +127 -0
  124. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/WHEEL +5 -0
  125. fbgemm_gpu_genai_nightly-2025.12.19.dist-info/top_level.txt +2 -0
  126. list_versions/__init__.py +12 -0
  127. list_versions/cli_run.py +163 -0
@@ -0,0 +1,232 @@
1
+ # Source: https://github.com/triton-lang/kernels/blob/8821ef322394ee2d3c58a859780ee1e2e10b5c79/kernels/matmul_perf_model.py
2
+
3
+ # This file is taken from the upstream triton-lang/kernels repo.
4
+ # Currently that repo does not have a license file, so disabling
5
+ # the license lint for now:
6
+ # @lint-ignore-every LICENSELINT
7
+
8
+ # flake8: noqa
9
+ # pyre-ignore-all-errors
10
+
11
+ import functools
12
+ import heapq
13
+
14
+ import torch
15
+
16
+ from triton import cdiv # @manual
17
+ from triton.runtime import driver # @manual
18
+ from triton.testing import ( # @manual
19
+ get_dram_gbps,
20
+ get_max_simd_tflops,
21
+ get_max_tensorcore_tflops,
22
+ nvsmi,
23
+ )
24
+
25
+
26
@functools.lru_cache()
def get_clock_rate_in_khz():
    """Return the maximum SM clock, scaled by 1e3 from the queried value.

    The value is first requested through ``nvsmi`` (``clocks.max.sm``). If
    that raises ``FileNotFoundError`` (presumably because the ``nvidia-smi``
    executable is not installed -- TODO confirm), the clock is read through
    ``pynvml`` for device index 0 instead.

    The result is memoized by ``functools.lru_cache``, so the query runs at
    most once per process.
    """
    try:
        max_sm_clock = nvsmi(["clocks.max.sm"])[0]
    except FileNotFoundError:
        import pynvml  # @manual=fbsource//third-party/pypi/pynvml:pynvml

        pynvml.nvmlInit()
        # NOTE(review): always queries device 0, regardless of current device.
        first_gpu = pynvml.nvmlDeviceGetHandleByIndex(0)
        max_sm_clock = pynvml.nvmlDeviceGetMaxClockInfo(
            first_gpu, pynvml.NVML_CLOCK_SM
        )
    return max_sm_clock * 1e3
36
+
37
+
38
def get_tensorcore_tflops(device, num_ctas, num_warps, dtype):
    """Return the estimated tensor-core throughput (in TOPS) for a launch.

    The device peak reported by ``get_max_tensorcore_tflops`` is scaled by the
    fraction of sub-cores the launch can occupy, where each CTA contributes at
    most 4 warps and each multiprocessor is assumed to have 4 sub-cores.

    Args:
        device: device index passed to the driver and the peak-rate helper.
        num_ctas: number of CTAs launched.
        num_warps: warps per CTA.
        dtype: element dtype used to look up the peak rate.
    """
    busy_warps = num_ctas * min(num_warps, 4)
    sm_count = driver.active.utils.get_device_properties(device)[
        "multiprocessor_count"
    ]
    subcore_count = sm_count * 4  # on recent GPUs
    occupancy = min(subcore_count, busy_warps) / subcore_count
    return occupancy * get_max_tensorcore_tflops(
        dtype, get_clock_rate_in_khz(), device
    )
50
+
51
+
52
def get_simd_tflops(device, num_ctas, num_warps, dtype):
    """Return the estimated SIMD (non-tensor-core) throughput in TOPS.

    The device peak reported by ``get_max_simd_tflops`` is scaled by the
    fraction of sub-cores the launch can occupy, where each CTA contributes at
    most 4 warps and each multiprocessor is assumed to have 4 sub-cores.

    Args:
        device: device index passed to the driver and the peak-rate helper.
        num_ctas: number of CTAs launched.
        num_warps: warps per CTA.
        dtype: element dtype used to look up the peak rate.
    """
    busy_warps = num_ctas * min(num_warps, 4)
    sm_count = driver.active.utils.get_device_properties(device)[
        "multiprocessor_count"
    ]
    subcore_count = sm_count * 4  # on recent GPUs
    occupancy = min(subcore_count, busy_warps) / subcore_count
    return occupancy * get_max_simd_tflops(dtype, get_clock_rate_in_khz(), device)
64
+
65
+
66
def get_tflops(device, num_ctas, num_warps, dtype):
    """Return the throughput estimate appropriate for ``device`` and ``dtype``.

    float32 on devices with compute-capability major version below 8 uses the
    SIMD estimate; every other combination uses the tensor-core estimate.
    """
    major = torch.cuda.get_device_capability(device)[0]
    if major < 8 and dtype == torch.float32:
        estimator = get_simd_tflops
    else:
        estimator = get_tensorcore_tflops
    return estimator(device, num_ctas, num_warps, dtype)
71
+
72
+
73
def estimate_matmul_time(
    # backend, device,
    num_warps,
    num_stages,
    A,
    B,
    C,
    M,
    N,
    K,
    BLOCK_M,
    BLOCK_N,
    BLOCK_K,
    SPLIT_K,
    debug=False,
    **kwargs,
):
    """Return the estimated running time of a tiled matmul in ms.

    The model is ``max(compute, loading) + store``:

    * compute: total operations divided by the throughput from ``get_tflops``;
    * loading: DRAM and L2 traffic for A and B, assuming 80% of repeated
      loads hit L2;
    * store: writing C (once per split-K slice), plus a zero-fill pass when
      ``SPLIT_K != 1``.

    Args:
        num_warps: warps per CTA.
        num_stages: unused by the model.
        A: left operand; only ``dtype`` and ``element_size()`` are read.
        B, C: unused by the model.
        M, N, K: problem sizes.
        BLOCK_M, BLOCK_N: tile sizes used to count CTAs.
        BLOCK_K: unused by the model.
        SPLIT_K: number of split-K slices.
        debug: when true, print the time breakdown.
        **kwargs: ignored.
    """
    mib = 1024 * 1024

    device = torch.cuda.current_device()
    dtype = A.dtype
    dtsize = A.element_size()

    # Launch grid: one CTA per (M tile, N tile, split-K slice).
    num_cta_m = cdiv(M, BLOCK_M)
    num_cta_n = cdiv(N, BLOCK_N)
    num_ctas = num_cta_m * num_cta_n * SPLIT_K

    # If the input is smaller than the block size, a full block is still used.
    M = max(M, BLOCK_M)
    N = max(N, BLOCK_N)

    # --- compute time ---
    total_ops = 2 * M * N * K / (1024 * 1024 * 1024)  # GOPS
    compute_ms = total_ops / get_tflops(device, num_ctas, num_warps, dtype)

    # --- load time ---
    num_sm = driver.active.utils.get_device_properties(device)["multiprocessor_count"]
    active_cta_ratio = min(1, num_ctas / num_sm)
    # 32 active CTAs are enough to reach 95% of the bandwidth ...
    bw_share_first = min(1, num_ctas / 32)
    # ... and CTAs 32-108 contribute the remaining 5%.
    bw_share_rest = max(min(1, (num_ctas - 32) / (108 - 32)), 0)
    dram_bw = get_dram_gbps(device) * (
        bw_share_first * 0.95 + bw_share_rest * 0.05
    )  # in GB/s
    l2_bw = dram_bw * 4  # rough estimation (should be 4.7 for A100?)

    # Assume 80% of the (repeated) loads are served from L2 cache.
    a_bytes = M * K * dtsize
    b_bytes = N * K * dtsize
    load_a_dram = a_bytes * (1 + 0.2 * (num_cta_n - 1))
    load_a_l2 = a_bytes * 0.8 * (num_cta_n - 1)
    load_b_dram = b_bytes * (1 + 0.2 * (num_cta_m - 1))
    load_b_l2 = b_bytes * 0.8 * (num_cta_m - 1)
    total_dram = (load_a_dram + load_b_dram) / mib  # MB
    total_l2 = (load_a_l2 + load_b_l2) / mib
    load_ms = total_dram / dram_bw + total_l2 / l2_bw

    # --- store time ---
    store_bw = dram_bw * 0.6  # :o
    store_c_dram = M * N * dtsize * SPLIT_K / mib  # MB
    store_ms = store_c_dram / store_bw
    if SPLIT_K != 1:
        # Split-K reduces into C, which must first be zeroed (c.zero_()).
        store_ms += M * N * 2 / mib / store_bw

    total_time_ms = max(compute_ms, load_ms) + store_ms
    if debug:
        print(
            f"Total time: {total_time_ms}ms, compute time: {compute_ms}ms, "
            f"loading time: {load_ms}ms, store time: {store_ms}ms, "
            f"Activate CTAs: {active_cta_ratio*100}%"
        )
    return total_time_ms
153
+
154
+
155
def early_config_prune(configs, named_args, **kwargs):
    """Cheaply discard autotuning configs before any of them is benchmarked.

    Pruning happens in three steps:

    1. Drop configs whose tiles need more shared memory than the device has,
       using ``(BLOCK_M + BLOCK_N) * BLOCK_K * num_stages * element_size``.
    2. For dtypes other than float16/float32, keep only ``SPLIT_K == 1``
       configs (some dtypes do not allow atomic_add).
    3. Group the survivors by ``(BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K,
       num_warps)`` and, per group, keep a small number of ``num_stages``
       variants (see the inline comments).

    Args:
        configs: iterable of config objects exposing ``kwargs``,
            ``num_warps`` and ``num_stages``.
        named_args: kernel arguments by name; only ``named_args["A"]`` is
            read (its ``element_size()`` and ``dtype``).
        **kwargs: ignored.

    Returns:
        list: the retained config objects.

    NOTE: on devices with compute-capability major version below 8, the kept
    config object is modified in place (``num_stages`` is set to 2).
    """
    device = torch.cuda.current_device()
    capability = torch.cuda.get_device_capability()
    dtsize = named_args["A"].element_size()
    dtype = named_args["A"].dtype

    # 1. make sure we have enough smem.
    # The device limit does not depend on the config, so query it once rather
    # than once per candidate.
    max_shared_memory = driver.active.utils.get_device_properties(device)[
        "max_shared_mem"
    ]
    configs = [
        config
        for config in configs
        if (config.kwargs["BLOCK_M"] + config.kwargs["BLOCK_N"])
        * config.kwargs["BLOCK_K"]
        * config.num_stages
        * dtsize
        <= max_shared_memory
    ]

    # Some dtypes do not allow atomic_add
    if dtype not in [torch.float16, torch.float32]:
        configs = [config for config in configs if config.kwargs["SPLIT_K"] == 1]

    # group configs by (BLOCK_M,_N,_K, SPLIT_K, num_warps)
    configs_map = {}
    for config in configs:
        kw = config.kwargs
        key = (
            kw["BLOCK_M"],
            kw["BLOCK_N"],
            kw["BLOCK_K"],
            kw["SPLIT_K"],
            config.num_warps,
        )
        configs_map.setdefault(key, []).append((config, config.num_stages))

    pruned_configs = []
    for key, candidates in configs_map.items():
        BLOCK_M, BLOCK_N, BLOCK_K, _, num_warps = key
        if capability[0] >= 8:
            # compute cycles (only works for ampere GPUs)
            mmas = BLOCK_M * BLOCK_N * BLOCK_K / (16 * 8 * 16)
            mma_cycles = mmas / min(4, num_warps) * 8

            ldgsts_latency = 300  # Does this matter?
            optimal_num_stages = ldgsts_latency / mma_cycles

            # nearest stages, prefer large #stages: candidates below the
            # optimum are penalized by 10 so that those at or above it win.
            nearest = heapq.nsmallest(
                2,
                candidates,
                key=lambda x: (
                    10 + abs(x[1] - optimal_num_stages)
                    if (x[1] - optimal_num_stages) < 0
                    else x[1] - optimal_num_stages
                ),
            )
            pruned_configs.extend(config for config, _ in nearest)
        else:  # Volta & Turing only supports num_stages <= 2
            # Keep one representative per group and force it to 2 stages.
            random_config = candidates[0][0]
            random_config.num_stages = 2
            pruned_configs.append(random_config)
    return pruned_configs
@@ -0,0 +1,130 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-unsafe
8
+ import sys
9
+
10
+ import torch
11
+ import triton # @manual
12
+
13
+ import triton.language as tl # @manual
14
+
15
+
16
def map_dtype_to_triton(dtype: torch.dtype) -> tl.dtype:
    """
    Translate a torch dtype into the corresponding triton dtype.

    Args:
        dtype (torch.dtype): input dtype.

    Returns:
        tl.dtype: triton dtype.

    Raises:
        ValueError: if ``dtype`` has no supported triton counterpart. This
            includes ``torch.float8_e4m3fn`` when ``torch.version.hip`` is
            set.
    """
    direct_matches = (
        (torch.float16, tl.float16),
        (torch.bfloat16, tl.bfloat16),
        (torch.float32, tl.float32),
        (torch.int32, tl.int32),
    )
    for torch_dtype, triton_dtype in direct_matches:
        if dtype == torch_dtype:
            return triton_dtype

    # float8 e4m3 maps to the "nv" flavor only when torch.version.hip is None.
    if dtype == torch.float8_e4m3fn and torch.version.hip is None:
        return tl.float8e4nv

    raise ValueError(f"Unsupported dtype {dtype}")
38
+
39
+
40
# check if we have the TMA version in Triton PR #4498 (https://github.com/triton-lang/triton/pull/4498).
HAS_TMA_DESC = "nv_tma_desc_type" in dir(tl)

# Report at import time (on stderr) which descriptor flavor will be used.
print(
    (
        "TMA benchmarks will be running with experimental grid constant TMA descriptor."
        if HAS_TMA_DESC
        else "TMA benchmarks will be running without grid constant TMA descriptor."
    ),
    file=sys.stderr,
)
53
+
54
+
55
class TmaAutoTuneHelper:
    """Owns named TMA descriptor buffers and fills them via the triton driver.

    Depending on the module-level ``HAS_TMA_DESC`` flag, descriptors are kept
    either as CPU tensors (``self.descriptors``) or as CUDA tensors
    (``self.cuda_descriptors``). Each descriptor is a ``TMA_SIZE``-element
    int8 tensor.
    """

    # duck typing wrapper to implement the same interface as TmaDescKernelParam in Triton PR #4498
    class KernelParamWrapper:
        """Wraps a descriptor tensor and exposes its address."""

        def __init__(self, desc):
            self.desc = desc

        def tma_desc_cpu_ptr(self):
            """Return ``data_ptr()`` of the wrapped descriptor tensor."""
            return self.desc.data_ptr()

    # Number of int8 elements allocated per descriptor.
    TMA_SIZE = 128

    def __init__(self):
        driver_utils = triton.runtime.driver.active.utils
        self.fill_1d_tma_descriptor_inner = driver_utils.fill_1d_tma_descriptor
        self.fill_2d_tma_descriptor_inner = driver_utils.fill_2d_tma_descriptor
        # Only the store matching the active descriptor flavor is created.
        if HAS_TMA_DESC:
            self.descriptors = {}
        else:
            self.cuda_descriptors = {}

    # Call this method outside of the lambda function for grid size
    def init_tma_descriptor(self, name):
        """Allocate an uninitialized descriptor buffer and register it as ``name``."""
        if HAS_TMA_DESC:
            device = "cpu"
        else:
            device = "cuda"
        buffer = torch.empty(
            TmaAutoTuneHelper.TMA_SIZE, device=device, dtype=torch.int8
        )
        if HAS_TMA_DESC:
            self.descriptors[name] = buffer
        else:
            self.cuda_descriptors[name] = buffer

    # Call this method inside the lambda function for grid size
    def fill_1d_tma_descriptor(self, name, ptr, dim, block_dim, element_size):
        """Fill the 1D descriptor registered as ``name``.

        With ``HAS_TMA_DESC`` the CPU buffer is filled in place (its address
        must be 64-byte aligned). Otherwise a pinned CPU staging buffer is
        filled and copied to the CUDA buffer with ``non_blocking=True``.
        """
        if not HAS_TMA_DESC:
            device_desc = self.cuda_descriptors[name]
            staging = torch.empty_like(device_desc, device="cpu", pin_memory=True)
            self.fill_1d_tma_descriptor_inner(
                ptr, dim, block_dim, element_size, staging.data_ptr()
            )
            device_desc.copy_(staging, non_blocking=True)
            return

        host_desc = self.descriptors[name]
        assert host_desc.data_ptr() % 64 == 0
        self.fill_1d_tma_descriptor_inner(
            ptr, dim, block_dim, element_size, host_desc.data_ptr()
        )

    # Call this method inside the lambda function for grid size
    def fill_2d_tma_descriptor(
        self, name, ptr, dim1, dim0, block_dim1, block_dim0, element_size
    ):
        """Fill the 2D descriptor registered as ``name``.

        With ``HAS_TMA_DESC`` the CPU buffer is filled in place (its address
        must be 64-byte aligned). Otherwise a pinned CPU staging buffer is
        filled and copied to the CUDA buffer with ``non_blocking=True``.
        """
        if not HAS_TMA_DESC:
            device_desc = self.cuda_descriptors[name]
            staging = torch.empty_like(device_desc, device="cpu", pin_memory=True)
            self.fill_2d_tma_descriptor_inner(
                ptr, dim1, dim0, block_dim1, block_dim0, element_size, staging.data_ptr()
            )
            device_desc.copy_(staging, non_blocking=True)
            return

        host_desc = self.descriptors[name]
        assert host_desc.data_ptr() % 64 == 0
        self.fill_2d_tma_descriptor_inner(
            ptr, dim1, dim0, block_dim1, block_dim0, element_size, host_desc.data_ptr()
        )

    def get_tma_descriptor_kernel_param(self, name):
        """Return the kernel argument for the descriptor registered as ``name``.

        With ``HAS_TMA_DESC`` this is a ``KernelParamWrapper`` around the CPU
        buffer; otherwise it is the CUDA buffer itself.
        """
        if HAS_TMA_DESC:
            host_desc = self.descriptors[name]
            assert host_desc is not None
            return self.KernelParamWrapper(host_desc)

        device_desc = self.cuda_descriptors[name]
        assert device_desc is not None
        return device_desc
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ # pyre-strict
9
+
10
+ import os
11
+
12
+ import torch
13
+
14
try:
    # pyre-ignore[21]
    # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
    from fbgemm_gpu import open_source
except Exception:
    open_source: bool = False

# pyre-ignore[16]
if open_source:
    # The open-source build ships one shared library next to this file; it is
    # registered with both torch.ops and torch.classes.
    _gen_ai_library = os.path.join(
        os.path.dirname(__file__), "fbgemm_gpu_experimental_gen_ai.so"
    )
    torch.ops.load_library(_gen_ai_library)
    torch.classes.load_library(_gen_ai_library)
else:
    # Otherwise each operator group is loaded from its own build target.
    for _target in (
        "//deeplearning/fbgemm/fbgemm_gpu/experimental/gen_ai:attention_ops",
        "//deeplearning/fbgemm/fbgemm_gpu/experimental/gen_ai:comm_ops",
        "//deeplearning/fbgemm/fbgemm_gpu/experimental/gen_ai:quantize_ops",
        "//deeplearning/fbgemm/fbgemm_gpu/experimental/gen_ai:kv_cache_ops",
        "//deeplearning/fbgemm/fbgemm_gpu/experimental/gen_ai:gather_scatter_ops",
    ):
        torch.ops.load_library(_target)

    gemm_ops = [
        "//deeplearning/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions:cutlass_bf16bf16bf16_grouped_grad",
        "//deeplearning/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions:cutlass_bf16bf16bf16_grouped_wgrad",
    ]
    # These libraries are loaded best-effort: an OSError from load_library is
    # ignored and the next one is tried.
    for op in gemm_ops:
        try:
            torch.ops.load_library(op)
        except OSError:
            continue
@@ -0,0 +1,46 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+
9
try:
    # pyre-ignore[21]
    # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
    from fbgemm_gpu import open_source
except Exception:
    open_source: bool = False

if not open_source:
    torch.ops.load_library(
        "//deeplearning/fbgemm/fbgemm_gpu/experimental/gen_ai:blackwell_attention_ops_gpu"
    )
else:
    import os

    # The shared library lives two package levels above this file: go up to
    # the parent package directory, then one more level via "..".
    _parent_package_dir = os.path.dirname(os.path.dirname(__file__))
    torch.ops.load_library(
        os.path.join(_parent_package_dir, "..", "fbgemm_gpu_experimental_gen_ai.so")
    )

from . import cutlass_blackwell_fmha_custom_op  # noqa: F401
from .cutlass_blackwell_fmha_interface import (  # noqa: F401
    _cutlass_blackwell_fmha_forward,
    cutlass_blackwell_fmha_decode_forward,
    cutlass_blackwell_fmha_func,
)

# Note: _cutlass_blackwell_fmha_forward is an internal function (as its leading
# underscore indicates) that is exported here only for tests. It lets tests read
# the LSE (log-sum-exp) values returned by the forward pass without changing the
# public API. Production code should use cutlass_blackwell_fmha_func instead.
__all__ = [
    "_cutlass_blackwell_fmha_forward",
    "cutlass_blackwell_fmha_decode_forward",
    "cutlass_blackwell_fmha_func",
]