mslk-cuda-nightly 2026.1.19__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. mslk/__init__.py +56 -0
  2. mslk/attention/__init__.py +7 -0
  3. mslk/attention/cutlass_blackwell_fmha/__init__.py +30 -0
  4. mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py +332 -0
  5. mslk/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +533 -0
  6. mslk/attention/flash_attn/__init__.py +22 -0
  7. mslk/attention/flash_attn/ampere_helpers.py +104 -0
  8. mslk/attention/flash_attn/barrier.py +72 -0
  9. mslk/attention/flash_attn/benchmark.py +269 -0
  10. mslk/attention/flash_attn/blackwell_helpers.py +754 -0
  11. mslk/attention/flash_attn/block_info.py +109 -0
  12. mslk/attention/flash_attn/block_sparse_utils.py +1452 -0
  13. mslk/attention/flash_attn/block_sparsity.py +219 -0
  14. mslk/attention/flash_attn/compute_block_sparsity.py +378 -0
  15. mslk/attention/flash_attn/copy_utils.py +341 -0
  16. mslk/attention/flash_attn/cute_dsl_utils.py +135 -0
  17. mslk/attention/flash_attn/fast_math.py +22 -0
  18. mslk/attention/flash_attn/flash_bwd.py +1262 -0
  19. mslk/attention/flash_attn/flash_bwd_postprocess.py +464 -0
  20. mslk/attention/flash_attn/flash_bwd_preprocess.py +366 -0
  21. mslk/attention/flash_attn/flash_bwd_sm100.py +2951 -0
  22. mslk/attention/flash_attn/flash_bwd_sm90.py +1703 -0
  23. mslk/attention/flash_attn/flash_fwd.py +2471 -0
  24. mslk/attention/flash_attn/flash_fwd_combine.py +705 -0
  25. mslk/attention/flash_attn/flash_fwd_sm100.py +2727 -0
  26. mslk/attention/flash_attn/hopper_helpers.py +102 -0
  27. mslk/attention/flash_attn/interface.py +1771 -0
  28. mslk/attention/flash_attn/mask.py +610 -0
  29. mslk/attention/flash_attn/mma_sm100_desc.py +292 -0
  30. mslk/attention/flash_attn/named_barrier.py +32 -0
  31. mslk/attention/flash_attn/pack_gqa.py +165 -0
  32. mslk/attention/flash_attn/paged_kv.py +176 -0
  33. mslk/attention/flash_attn/pipeline.py +273 -0
  34. mslk/attention/flash_attn/seqlen_info.py +139 -0
  35. mslk/attention/flash_attn/softmax.py +583 -0
  36. mslk/attention/flash_attn/testing.py +424 -0
  37. mslk/attention/flash_attn/tile_scheduler.py +720 -0
  38. mslk/attention/flash_attn/utils.py +860 -0
  39. mslk/attention/fmha/__init__.py +967 -0
  40. mslk/attention/fmha/_triton/__init__.py +6 -0
  41. mslk/attention/fmha/_triton/available.py +50 -0
  42. mslk/attention/fmha/_triton/splitk_kernels.py +1534 -0
  43. mslk/attention/fmha/_triton/vararg_kernel.py +262 -0
  44. mslk/attention/fmha/attn_bias.py +2186 -0
  45. mslk/attention/fmha/attn_bias_utils.py +536 -0
  46. mslk/attention/fmha/ck.py +508 -0
  47. mslk/attention/fmha/ck_decoder.py +141 -0
  48. mslk/attention/fmha/ck_splitk.py +204 -0
  49. mslk/attention/fmha/common.py +598 -0
  50. mslk/attention/fmha/cutlass.py +461 -0
  51. mslk/attention/fmha/cutlass_blackwell.py +560 -0
  52. mslk/attention/fmha/dispatch.py +224 -0
  53. mslk/attention/fmha/flash.py +862 -0
  54. mslk/attention/fmha/flash3.py +858 -0
  55. mslk/attention/fmha/flash_mtia.py +245 -0
  56. mslk/attention/fmha/merge_training.py +192 -0
  57. mslk/attention/fmha/split_blocks_fairinternal.py +329 -0
  58. mslk/attention/fmha/torch_attention_compat.py +154 -0
  59. mslk/attention/fmha/tree_attention.py +718 -0
  60. mslk/attention/fmha/triton_splitk.py +1378 -0
  61. mslk/attention/fmha/unbind.py +130 -0
  62. mslk/attention/fmha/utils/__init__.py +6 -0
  63. mslk/attention/fmha/utils/bench.py +74 -0
  64. mslk/attention/fmha/utils/cpp_lib.py +148 -0
  65. mslk/attention/fmha/utils/op_common.py +65 -0
  66. mslk/attention/gqa_attn_splitk/__init__.py +11 -0
  67. mslk/bench/comm/__init__.py +7 -0
  68. mslk/bench/comm/comm_bench.py +255 -0
  69. mslk/bench/common/__init__.py +5 -0
  70. mslk/bench/common/utils.py +148 -0
  71. mslk/bench/conv/__init__.py +7 -0
  72. mslk/bench/conv/conv_bench.py +551 -0
  73. mslk/bench/conv/conv_ops.py +213 -0
  74. mslk/bench/gemm/__init__.py +7 -0
  75. mslk/bench/gemm/gemm_bench.py +859 -0
  76. mslk/bench/gemm/gemm_ops.py +3342 -0
  77. mslk/bench/gemm/grouped_gemm_bias_scale_benchmark.py +177 -0
  78. mslk/bench/moe/__init__.py +7 -0
  79. mslk/bench/moe/gather_scatter_bench.py +356 -0
  80. mslk/bench/quantize/quantize_bench.py +345 -0
  81. mslk/bench/quantize/quantize_ops.py +266 -0
  82. mslk/comm/__init__.py +11 -0
  83. mslk/conv/__init__.py +11 -0
  84. mslk/gemm/__init__.py +18 -0
  85. mslk/gemm/triton/__init__.py +7 -0
  86. mslk/gemm/triton/fp8_gemm.py +2702 -0
  87. mslk/gemm/triton/grouped_gemm.py +1132 -0
  88. mslk/gemm/triton/matmul_perf_model.py +237 -0
  89. mslk/gemm/triton/utils.py +128 -0
  90. mslk/kv_cache/__init__.py +11 -0
  91. mslk/moe/__init__.py +26 -0
  92. mslk/moe/activation.py +291 -0
  93. mslk/moe/gather_scatter.py +739 -0
  94. mslk/moe/layers.py +1240 -0
  95. mslk/moe/shuffling.py +421 -0
  96. mslk/mslk.so +0 -0
  97. mslk/quantize/__init__.py +11 -0
  98. mslk/quantize/shuffle.py +306 -0
  99. mslk/quantize/triton/__init__.py +7 -0
  100. mslk/quantize/triton/fp4_quantize.py +5942 -0
  101. mslk/quantize/triton/fp8_quantize.py +1902 -0
  102. mslk/testing/__init__.py +7 -0
  103. mslk/testing/attributes.py +60 -0
  104. mslk/testing/rocm.py +91 -0
  105. mslk/utils/__init__.py +7 -0
  106. mslk/utils/torch/__init__.py +7 -0
  107. mslk/utils/torch/library.py +150 -0
  108. mslk/utils/triton/__init__.py +7 -0
  109. mslk/utils/triton/fp8_utils.py +72 -0
  110. mslk/utils/triton/utils.py +128 -0
  111. mslk/version.py +11 -0
  112. mslk_cuda_nightly-2026.1.19.dist-info/METADATA +102 -0
  113. mslk_cuda_nightly-2026.1.19.dist-info/RECORD +116 -0
  114. mslk_cuda_nightly-2026.1.19.dist-info/WHEEL +5 -0
  115. mslk_cuda_nightly-2026.1.19.dist-info/licenses/LICENSE +30 -0
  116. mslk_cuda_nightly-2026.1.19.dist-info/top_level.txt +1 -0
@@ -0,0 +1,237 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Source: https://github.com/triton-lang/kernels/blob/8821ef322394ee2d3c58a859780ee1e2e10b5c79/kernels/matmul_perf_model.py
8
+
9
+ # This file is taken from the upstream triton-lang/kernels repo.
10
+ # Currently that repo does not have a license file, so disabling
11
+ # the license lint for now:
12
+ # @lint-ignore-every LICENSELINT
13
+
14
+ # flake8: noqa
15
+ # pyre-ignore-all-errors
16
+
17
+ import functools
18
+ import heapq
19
+
20
+ import torch
21
+ from triton import cdiv # @manual
22
+ from triton.runtime import driver # @manual
23
+ from triton.testing import ( # @manual
24
+ get_dram_gbps,
25
+ get_max_simd_tflops,
26
+ get_max_tensorcore_tflops,
27
+ nvsmi,
28
+ )
29
+
30
+
@functools.lru_cache()
def get_clock_rate_in_khz():
    """Return the maximum SM clock reading scaled by 1e3 (cached after first call).

    The value is read through ``nvsmi`` first.  If that raises
    ``FileNotFoundError`` the reading is taken from ``pynvml`` for device
    index 0 instead.
    """
    try:
        max_sm_clock = nvsmi(["clocks.max.sm"])[0]
    except FileNotFoundError:
        import pynvml  # @manual=fbsource//third-party/pypi/pynvml:pynvml

        pynvml.nvmlInit()
        max_sm_clock = pynvml.nvmlDeviceGetMaxClockInfo(
            pynvml.nvmlDeviceGetHandleByIndex(0), pynvml.NVML_CLOCK_SM
        )
    return max_sm_clock * 1e3
41
+
42
+
def get_tensorcore_tflops(device, num_ctas, num_warps, dtype):
    """return compute throughput in TOPS

    The peak tensor-core figure is scaled by the fraction of sub-cores that
    the launch can keep busy: at most 4 warps per CTA are counted, and each
    multiprocessor is assumed to have 4 sub-cores (true on recent GPUs).
    """
    sm_count = driver.active.utils.get_device_properties(device)[
        "multiprocessor_count"
    ]
    subcore_count = sm_count * 4  # on recent GPUs
    busy_warps = num_ctas * min(num_warps, 4)
    occupancy = min(subcore_count, busy_warps) / subcore_count
    return occupancy * get_max_tensorcore_tflops(
        dtype, get_clock_rate_in_khz(), device
    )
55
+
56
+
def get_simd_tflops(device, num_ctas, num_warps, dtype):
    """return compute throughput in TOPS

    The peak SIMD figure is scaled by the fraction of sub-cores that the
    launch can keep busy: at most 4 warps per CTA are counted, and each
    multiprocessor is assumed to have 4 sub-cores (true on recent GPUs).
    """
    sm_count = driver.active.utils.get_device_properties(device)[
        "multiprocessor_count"
    ]
    subcore_count = sm_count * 4  # on recent GPUs
    busy_warps = num_ctas * min(num_warps, 4)
    occupancy = min(subcore_count, busy_warps) / subcore_count
    return occupancy * get_max_simd_tflops(dtype, get_clock_rate_in_khz(), device)
69
+
70
+
def get_tflops(device, num_ctas, num_warps, dtype):
    """Pick the throughput model that applies to ``dtype`` on ``device``.

    float32 on devices with compute-capability major version below 8 uses
    the SIMD estimate; every other combination uses the tensor-core estimate.
    """
    major = torch.cuda.get_device_capability(device)[0]
    use_simd = dtype == torch.float32 and major < 8
    estimator = get_simd_tflops if use_simd else get_tensorcore_tflops
    return estimator(device, num_ctas, num_warps, dtype)
76
+
77
+
def estimate_matmul_time(
    # backend, device,
    num_warps,
    num_stages,
    A,
    B,
    C,
    M,
    N,
    K,
    BLOCK_M,
    BLOCK_N,
    BLOCK_K,
    SPLIT_K,
    debug=False,
    **kwargs,
):
    """return estimated running time in ms
    = max(compute, loading) + store

    Only ``A`` is inspected (for its dtype and element size); ``B``, ``C``,
    ``num_stages``, ``BLOCK_K`` and ``**kwargs`` are accepted but unused.
    The estimate is made for the current CUDA device.  When ``debug`` is
    true the individual terms are printed.
    """
    mib = 1024 * 1024
    device = torch.cuda.current_device()
    elem_dtype = A.dtype
    elem_bytes = A.element_size()

    # Launch geometry: one CTA per (M-tile, N-tile, K-split).
    ctas_m = cdiv(M, BLOCK_M)
    ctas_n = cdiv(N, BLOCK_N)
    num_ctas = ctas_m * ctas_n * SPLIT_K

    # If the input is smaller than the block size, a whole block is still processed.
    M = max(M, BLOCK_M)
    N = max(N, BLOCK_N)

    # --- compute time ---
    giga_ops = 2 * M * N * K / (1024 * 1024 * 1024)  # GOPS
    compute_ms = giga_ops / get_tflops(device, num_ctas, num_warps, elem_dtype)

    # --- load time ---
    sm_count = driver.active.utils.get_device_properties(device)["multiprocessor_count"]
    active_cta_ratio = min(1, num_ctas / sm_count)
    # 32 active CTAs are taken to reach 95% of DRAM bandwidth ...
    bw_ratio_first = min(1, num_ctas / 32)
    # ... and CTAs 32..108 contribute the remaining 5%.
    bw_ratio_rest = max(min(1, (num_ctas - 32) / (108 - 32)), 0)
    dram_bw = get_dram_gbps(device) * (
        bw_ratio_first * 0.95 + bw_ratio_rest * 0.05
    )  # in GB/s
    l2_bw = dram_bw * 4  # rough estimation (should be 4.7 for A100?)

    # The first read of each operand comes from DRAM; 80% of the repeated
    # reads (one per CTA along the other axis) are assumed to hit L2.
    a_bytes = M * K * elem_bytes
    b_bytes = N * K * elem_bytes
    load_a_dram = a_bytes * (1 + 0.2 * (ctas_n - 1))
    load_a_l2 = a_bytes * 0.8 * (ctas_n - 1)
    load_b_dram = b_bytes * (1 + 0.2 * (ctas_m - 1))
    load_b_l2 = b_bytes * 0.8 * (ctas_m - 1)
    total_dram = (load_a_dram + load_b_dram) / mib  # MB
    total_l2 = (load_a_l2 + load_b_l2) / mib
    load_ms = total_dram / dram_bw + total_l2 / l2_bw

    # --- store time ---
    store_bw = dram_bw * 0.6  # :o
    store_c_dram = M * N * elem_bytes * SPLIT_K / mib  # MB
    store_ms = store_c_dram / store_bw
    if SPLIT_K != 1:
        # Split-K additionally pays for c.zero_() before the reduction.
        store_ms += M * N * 2 / mib / store_bw

    total_time_ms = max(compute_ms, load_ms) + store_ms
    if debug:
        print(
            f"Total time: {total_time_ms}ms, compute time: {compute_ms}ms, "
            f"loading time: {load_ms}ms, store time: {store_ms}ms, "
            f"Activate CTAs: {active_cta_ratio * 100}%"
        )
    return total_time_ms
158
+
159
+
def early_config_prune(configs, named_args, **kwargs):
    """Drop autotune configs that cannot run or are unlikely to be competitive.

    Args:
        configs: iterable of config objects exposing ``kwargs`` (with
            ``BLOCK_M``, ``BLOCK_N``, ``BLOCK_K`` and ``SPLIT_K`` keys),
            ``num_warps`` and ``num_stages``.
        named_args: mapping holding the kernel arguments; only ``"A"`` is
            read, for its dtype and element size.
        **kwargs: accepted for interface compatibility; unused.

    Returns:
        list of the surviving config objects.

    Pruning steps, in order:
      1. configs whose staged A/B tiles exceed the device's shared memory
         are removed;
      2. for dtypes other than float16/float32 only ``SPLIT_K == 1`` is kept
         (some dtypes do not allow atomic_add);
      3. configs are grouped by (BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K,
         num_warps); on compute capability >= 8 the two entries whose
         ``num_stages`` is nearest the estimated optimum are kept per group,
         otherwise one entry is kept with ``num_stages`` forced to 2.
    """
    device = torch.cuda.current_device()
    capability = torch.cuda.get_device_capability()
    # BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, num_warps, num_stages
    dtsize = named_args["A"].element_size()
    dtype = named_args["A"].dtype

    # The shared-memory limit does not depend on the config, so it is queried
    # once here rather than once per config.
    max_shared_memory = driver.active.utils.get_device_properties(device)[
        "max_shared_mem"
    ]

    def _fits_in_shared_memory(config):
        kw = config.kwargs
        required_shared_memory = (
            (kw["BLOCK_M"] + kw["BLOCK_N"])
            * kw["BLOCK_K"]
            * config.num_stages
            * dtsize
        )
        return required_shared_memory <= max_shared_memory

    # 1. make sure we have enough smem
    configs = [config for config in configs if _fits_in_shared_memory(config)]

    # Some dtypes do not allow atomic_add
    if dtype not in [torch.float16, torch.float32]:
        configs = [config for config in configs if config.kwargs["SPLIT_K"] == 1]

    # group configs by (BLOCK_M,_N,_K, SPLIT_K, num_warps)
    configs_map = {}
    for config in configs:
        kw = config.kwargs
        key = (
            kw["BLOCK_M"],
            kw["BLOCK_N"],
            kw["BLOCK_K"],
            kw["SPLIT_K"],
            config.num_warps,
        )
        configs_map.setdefault(key, []).append((config, config.num_stages))

    pruned_configs = []
    for key, candidates in configs_map.items():
        BLOCK_M, BLOCK_N, BLOCK_K, _split_k, num_warps = key
        if capability[0] >= 8:
            # compute cycles (only works for ampere GPUs)
            mmas = BLOCK_M * BLOCK_N * BLOCK_K / (16 * 8 * 16)
            mma_cycles = mmas / min(4, num_warps) * 8

            ldgsts_latency = 300  # Does this matter?
            optimal_num_stages = ldgsts_latency / mma_cycles

            def _stage_distance(entry):
                # Entries below the optimum get a +10 penalty so that larger
                # stage counts are preferred.
                delta = entry[1] - optimal_num_stages
                return 10 + abs(delta) if delta < 0 else delta

            # nearest stages, prefer large #stages
            nearest = heapq.nsmallest(2, candidates, key=_stage_distance)
            pruned_configs.extend(entry[0] for entry in nearest)
        else:  # Volta & Turing only supports num_stages <= 2
            # NOTE: this mutates the caller's config object in place.
            random_config = candidates[0][0]
            random_config.num_stages = 2
            pruned_configs.append(random_config)
    return pruned_configs
@@ -0,0 +1,128 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-unsafe
8
+ import sys
9
+
10
+ import torch
11
+ import triton # @manual
12
+ import triton.language as tl # @manual
13
+
14
+
def map_dtype_to_triton(dtype: torch.dtype) -> tl.dtype:
    """
    Translate a torch dtype into the matching triton dtype.

    Args:
        dtype (torch.dtype): input dtype.

    Returns:
        tl.dtype: triton dtype.

    Raises:
        ValueError: if no mapping exists.  ``torch.float8_e4m3fn`` is only
            mapped when ``torch.version.hip`` is ``None``.
    """
    direct_mappings = (
        (torch.float16, tl.float16),
        (torch.bfloat16, tl.bfloat16),
        (torch.float32, tl.float32),
        (torch.int32, tl.int32),
    )
    for torch_dtype, triton_dtype in direct_mappings:
        if dtype == torch_dtype:
            return triton_dtype

    # FP8 is looked up last so the attribute is only touched when needed.
    if dtype == torch.float8_e4m3fn and torch.version.hip is None:
        return tl.float8e4nv

    raise ValueError(f"Unsupported dtype {dtype}")
37
+
38
+
39
+ # check if we have the TMA version in Triton PR #4498 (https://github.com/triton-lang/triton/pull/4498).
40
+ HAS_TMA_DESC = "nv_tma_desc_type" in dir(tl)
41
+
42
+ if HAS_TMA_DESC:
43
+ print(
44
+ "TMA benchmarks will be running with experimental grid constant TMA descriptor.",
45
+ file=sys.stderr,
46
+ )
47
+ else:
48
+ print(
49
+ "TMA benchmarks will be running without grid constant TMA descriptor.",
50
+ file=sys.stderr,
51
+ )
52
+
53
+
54
+ class TmaAutoTuneHelper:
55
+ # duck typing wrapper to implement the same interface as TmaDescKernelParam in Triton PR #4498
56
+ class KernelParamWrapper:
57
+ def __init__(self, desc):
58
+ self.desc = desc
59
+
60
+ def tma_desc_cpu_ptr(self):
61
+ return self.desc.data_ptr()
62
+
63
+ TMA_SIZE = 128
64
+
65
+ def __init__(self):
66
+ self.fill_1d_tma_descriptor_inner = (
67
+ triton.runtime.driver.active.utils.fill_1d_tma_descriptor
68
+ )
69
+ self.fill_2d_tma_descriptor_inner = (
70
+ triton.runtime.driver.active.utils.fill_2d_tma_descriptor
71
+ )
72
+ if HAS_TMA_DESC:
73
+ self.descriptors = {}
74
+ else:
75
+ self.cuda_descriptors = {}
76
+
77
+ # Call this method outside of the lambda function for grid size
78
+ def init_tma_descriptor(self, name):
79
+ if HAS_TMA_DESC:
80
+ self.descriptors[name] = torch.empty(
81
+ TmaAutoTuneHelper.TMA_SIZE, device="cpu", dtype=torch.int8
82
+ )
83
+ else:
84
+ self.cuda_descriptors[name] = torch.empty(
85
+ TmaAutoTuneHelper.TMA_SIZE, device="cuda", dtype=torch.int8
86
+ )
87
+
88
+ # Call this method inside the lambda function for grid size
89
+ def fill_1d_tma_descriptor(self, name, ptr, dim, block_dim, element_size):
90
+ if HAS_TMA_DESC:
91
+ desc_x = self.descriptors[name]
92
+ assert desc_x.data_ptr() % 64 == 0
93
+ self.fill_1d_tma_descriptor_inner(
94
+ ptr, dim, block_dim, element_size, desc_x.data_ptr()
95
+ )
96
+ else:
97
+ desc_x = self.cuda_descriptors[name]
98
+ buf_x = torch.empty_like(desc_x, device="cpu", pin_memory=True)
99
+ self.fill_1d_tma_descriptor_inner(
100
+ ptr, dim, block_dim, element_size, buf_x.data_ptr()
101
+ )
102
+ desc_x.copy_(buf_x, non_blocking=True)
103
+
104
+ # Call this method inside the lambda function for grid size
105
+ def fill_2d_tma_descriptor(
106
+ self, name, ptr, dim1, dim0, block_dim1, block_dim0, element_size
107
+ ):
108
+ if HAS_TMA_DESC:
109
+ desc_x = self.descriptors[name]
110
+ assert desc_x.data_ptr() % 64 == 0
111
+ self.fill_2d_tma_descriptor_inner(
112
+ ptr, dim1, dim0, block_dim1, block_dim0, element_size, desc_x.data_ptr()
113
+ )
114
+ else:
115
+ desc_x = self.cuda_descriptors[name]
116
+ buf_x = torch.empty_like(desc_x, device="cpu", pin_memory=True)
117
+ self.fill_2d_tma_descriptor_inner(
118
+ ptr, dim1, dim0, block_dim1, block_dim0, element_size, buf_x.data_ptr()
119
+ )
120
+ desc_x.copy_(buf_x, non_blocking=True)
121
+
122
+ def get_tma_descriptor_kernel_param(self, name):
123
+ if HAS_TMA_DESC:
124
+ assert self.descriptors[name] is not None
125
+ return self.KernelParamWrapper(self.descriptors[name])
126
+ else:
127
+ assert self.cuda_descriptors[name] is not None
128
+ return self.cuda_descriptors[name]
@@ -0,0 +1,11 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-strict
8
+
9
+ from mslk.utils.torch.library import load_library_buck
10
+
11
+ load_library_buck("//mslk/csrc/kv_cache:kv_cache_ops")
mslk/moe/__init__.py ADDED
@@ -0,0 +1,26 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-unsafe
8
+
9
+ import mslk # noqa F401
10
+ import torch
11
+ from mslk.utils.torch.library import load_library_buck
12
+
13
+ load_library_buck("//mslk/csrc/moe:moe_ops")
14
+
# The index_shuffling op is only looked up when CUDA is available; otherwise
# the name is bound to None.
index_shuffling = (
    torch.ops.mslk.index_shuffling  # noqa F401
    if torch.cuda.is_available()
    else None
)
18
+
19
+ from .activation import silu_mul, silu_mul_quant # noqa F401
20
+ from .gather_scatter import ( # noqa F401
21
+ gather_scale_dense_tokens,
22
+ gather_scale_quant_dense_tokens,
23
+ scatter_add_dense_tokens,
24
+ scatter_add_padded_tokens,
25
+ )
26
+ from .shuffling import combine_shuffling, split_shuffling # noqa F401
mslk/moe/activation.py ADDED
@@ -0,0 +1,291 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-unsafe
8
+
9
+ from typing import Optional
10
+
11
+ import torch
12
+ import triton
13
+ import triton.language as tl
14
+ from mslk.utils.triton.fp8_utils import get_fp8_constants
15
+
16
+
17
+ # Function APIs
def silu_mul(
    x0: torch.Tensor,
    x1: torch.Tensor,
    valid_token_count: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """
    Fused silu and mul operations.

    y = x0 * sigmoid(x0) * x1

    Args:
        x0: input tensor of shape (T, D), contiguous along the last dim.
        x1: input tensor of shape (T, D), contiguous along the last dim,
            with the same dtype as x0.
        valid_token_count: tensor of shape (1,) to indicate the number of valid tokens.
            Rows at or beyond this count are skipped by the kernel, so the
            corresponding output rows are left uninitialized.

    Returns:
        output tensor of shape (T, D), allocated on x0's device with x0's dtype.

    Raises:
        AssertionError: if the layout/shape/dtype requirements are not met, or
            if D is not a multiple of the block size picked for this T
            (1024 when T >= number of SMs, 512 otherwise).
    """

    assert x0.ndim == 2 and x0.stride(1) == 1
    assert x1.ndim == 2 and x1.stride(1) == 1
    assert x0.shape == x1.shape
    assert x0.dtype == x1.dtype

    T, D = x0.shape
    stride_0 = x0.stride(0)
    stride_1 = x1.stride(0)

    # Allocate next to the inputs rather than on whatever the default CUDA
    # device happens to be.
    out = torch.empty((T, D), device=x0.device, dtype=x0.dtype)
    if T == 0:
        # Nothing to compute; mirrors the early return in silu_mul_quant.
        return out

    NUM_SMS = torch.cuda.get_device_properties(x0.device).multi_processor_count
    if T >= NUM_SMS:
        # Enough rows to occupy every SM: one program handles a whole row.
        BLOCK_D_OUTER = D
        BLOCK_D_INNER = 1024
        assert D % BLOCK_D_INNER == 0
    else:
        # Few rows: also split each row across programs along D.
        BLOCK_D_OUTER = 512
        BLOCK_D_INNER = 256
        assert D % BLOCK_D_OUTER == 0
    grid = (T, D // BLOCK_D_OUTER)
    _mslk_silu_mul[grid](
        out,
        x0,
        x1,
        stride_0,
        stride_1,
        valid_token_count,
        D,  # pyre-ignore
        BLOCK_D_OUTER,  # pyre-ignore
        BLOCK_D_INNER,  # pyre-ignore
    )
    return out
70
+
71
+
def silu_mul_quant(
    x0: torch.Tensor,
    x1: torch.Tensor,
    scale_ub: Optional[torch.Tensor] = None,
    valid_token_count: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Fused silu, mul, and FP8 rowwise quantization operations.

    y, y_scale = quantize(x0 * sigmoid(x0) * x1)

    Args:
        x0: input tensor of shape (T, D), contiguous along the last dim.
        x1: input tensor of shape (T, D), contiguous along the last dim,
            with the same dtype as x0.
        scale_ub: tensor of shape (1,) to indicate the upper bound of the scale.
        valid_token_count: tensor of shape (1,) to indicate the number of valid tokens.

    Returns:
        output quantized tensor of shape (T, D) and its inverse scale of shape (T,).
        Both are allocated on x0's device; the inverse scale is float32.

    Raises:
        AssertionError: if the layout/shape/dtype requirements are not met.
    """

    assert x0.ndim == 2 and x0.stride(1) == 1
    assert x1.ndim == 2 and x1.stride(1) == 1
    assert x0.shape == x1.shape
    assert x0.dtype == x1.dtype

    pt_dtype, tl_dtype, max_fp8, eps = get_fp8_constants()

    T, D = x0.shape
    stride_0 = x0.stride(0)
    stride_1 = x1.stride(0)

    # Allocate next to the inputs rather than on whatever the default CUDA
    # device happens to be.
    out = torch.empty((T, D), device=x0.device, dtype=pt_dtype)
    out_inv_scale = torch.empty((T,), device=x0.device, dtype=torch.float32)
    if T == 0:
        return out, out_inv_scale

    # Spread the rows over at most NUM_SMS programs, BLOCK_T rows each.
    NUM_SMS = torch.cuda.get_device_properties(x0.device).multi_processor_count
    BLOCK_T = triton.cdiv(T, NUM_SMS)

    NUM_CTAS = triton.cdiv(T, BLOCK_T)

    grid = (NUM_CTAS,)
    _mslk_silu_mul_quant[grid](
        out,
        out_inv_scale,
        x0,
        x1,
        scale_ub,
        stride_0,
        stride_1,
        valid_token_count,
        T,
        D,  # pyre-ignore
        BLOCK_T,
        TL_FP8_DTYPE=tl_dtype,  # pyre-ignore
        MAX_FP8=max_fp8,  # pyre-ignore
        EPS=eps,  # pyre-ignore
        CLAMP_MAX=scale_ub is not None,  # pyre-ignore
    )
    return out, out_inv_scale
133
+
134
+
135
+ # Torch Custom Op Registrations
136
+ _SILU_MUL_OP_NAME = "mslk::silu_mul"
137
+
138
+ torch.library.define(
139
+ "mslk::silu_mul",
140
+ "(Tensor x0, Tensor x1, Tensor? valid_token_count=None) -> Tensor",
141
+ )
142
+
143
+
144
+ @torch.library.impl(_SILU_MUL_OP_NAME, "Meta")
145
+ def silu_mul_meta(x0, x1, valid_token_count):
146
+ return x0.new_empty(x0.shape)
147
+
148
+
149
+ @torch.library.impl(_SILU_MUL_OP_NAME, "CUDA")
150
+ def silu_mul_cuda(x0, x1, valid_token_count):
151
+ return silu_mul(x0, x1, valid_token_count)
152
+
153
+
# Qualified name of the fused silu*mul + FP8 rowwise quantization custom op.
_SILU_MUL_OP_QUANT_NAME = "mslk::silu_mul_quant"

torch.library.define(
    "mslk::silu_mul_quant",
    "(Tensor x0, Tensor x1, Tensor? scale_ub=None, Tensor? valid_token_count=None) -> (Tensor, Tensor)",
)


@torch.library.impl(_SILU_MUL_OP_QUANT_NAME, "Meta")
def silu_mul_quant_meta(x0, x1, scale_ub, valid_token_count):
    """Meta kernel: produce empty outputs matching the declared schema.

    The schema returns two tensors, so both are produced here: the quantized
    output with x0's shape in the FP8 dtype, and the float32 per-row inverse
    scale of shape (x0.shape[0],), as returned by silu_mul_quant.
    """
    pt_dtype, _tl_dtype, _max_fp8, _eps = get_fp8_constants()
    out = torch.empty(x0.shape, device=x0.device, dtype=pt_dtype)
    out_inv_scale = torch.empty(
        (x0.shape[0],), device=x0.device, dtype=torch.float32
    )
    return out, out_inv_scale


@torch.library.impl(_SILU_MUL_OP_QUANT_NAME, "CUDA")
def silu_mul_quant_cuda(x0, x1, scale_ub=None, valid_token_count=None):
    """CUDA kernel: forward to the Python-level silu_mul_quant implementation."""
    return silu_mul_quant(x0, x1, scale_ub, valid_token_count)
171
+
172
+
173
+ # Kernel Implementations
@triton.jit
def _mslk_silu_mul(
    y_ptr,
    x0_ptr,
    x1_ptr,
    stride_0,
    stride_1,
    valid_token_count,
    D: tl.constexpr,
    BLOCK_D_OUTER: tl.constexpr,
    BLOCK_D_INNER: tl.constexpr,
) -> None:
    """Compute y = x0 * sigmoid(x0) * x1 for one row and one BLOCK_D_OUTER span.

    Program axis 0 selects the row; axis 1 selects which BLOCK_D_OUTER-wide
    span of the row is processed, in chunks of BLOCK_D_INNER.  x0/x1 rows are
    addressed with stride_0/stride_1; output rows are addressed with stride D.
    Loads and stores are unmasked.
    """
    row = tl.program_id(0)
    cols = tl.program_id(1) * BLOCK_D_OUTER + tl.arange(0, BLOCK_D_INNER)

    if valid_token_count is not None:
        num_valid = tl.load(valid_token_count, eviction_policy="evict_last")
        # Rows at or past the valid count are left untouched.
        if row >= num_valid:
            return

    x0_row_ptr = x0_ptr + row * stride_0
    x1_row_ptr = x1_ptr + row * stride_1
    y_row_ptr = y_ptr + row * D

    for _ in tl.range(0, BLOCK_D_OUTER // BLOCK_D_INNER, num_stages=3):
        # Math is done in float32 regardless of the input dtype.
        a = tl.load(x0_row_ptr + cols, eviction_policy="evict_first").to(tl.float32)
        b = tl.load(x1_row_ptr + cols, eviction_policy="evict_first").to(tl.float32)

        tl.store(y_row_ptr + cols, a * tl.sigmoid(a) * b)

        # Advance to the next inner chunk of this span.
        cols += BLOCK_D_INNER
216
+
217
+
@triton.jit
def _mslk_silu_mul_quant(
    y_ptr,
    y_inv_scale_ptr,
    x0_ptr,
    x1_ptr,
    scale_ub_ptr,
    stride_0,
    stride_1,
    valid_token_count,
    T,
    D: tl.constexpr,
    BLOCK_T: tl.constexpr,
    TL_FP8_DTYPE: tl.constexpr,
    MAX_FP8: tl.constexpr,
    EPS: tl.constexpr,
    CLAMP_MAX: tl.constexpr,
) -> None:
    """Compute x0 * sigmoid(x0) * x1 and quantize each row to FP8.

    Each program handles rows [pid * BLOCK_T, min(pid * BLOCK_T + BLOCK_T, T)).
    For every row the absolute maximum is taken, floored at EPS (and capped at
    the value loaded from scale_ub_ptr when CLAMP_MAX is set); the row is then
    scaled by MAX_FP8 / row_max, clamped to [-MAX_FP8, MAX_FP8] and stored as
    TL_FP8_DTYPE.  The reciprocal of that scale is written to y_inv_scale_ptr.
    A program returns early when its first row is at or past valid_token_count.
    """
    # Columns are padded up to a power of two and masked back down to D.
    PADDED_D: tl.constexpr = triton.next_power_of_2(D)  # pyre-ignore

    pid = tl.program_id(0)
    first_row = pid * BLOCK_T
    last_row = tl.minimum(first_row + BLOCK_T, T)

    if valid_token_count is not None:
        num_valid = tl.load(valid_token_count, eviction_policy="evict_last")
        if first_row >= num_valid:
            return

    cols = tl.arange(0, PADDED_D)
    in_bounds = cols < D

    if CLAMP_MAX:
        ub = tl.load(scale_ub_ptr, eviction_policy="evict_last")
    else:
        ub = float("inf")

    for row in tl.range(first_row, last_row, 1, num_stages=2):
        a = tl.load(
            x0_ptr + row * stride_0 + cols,
            in_bounds,
            eviction_policy="evict_first",
        ).to(tl.float32)
        b = tl.load(
            x1_ptr + row * stride_1 + cols,
            in_bounds,
            eviction_policy="evict_first",
        ).to(tl.float32)

        y = a * tl.sigmoid(a) * b

        # Masked values are set to 0.0.
        row_max = tl.max(tl.where(in_bounds, tl.abs(y), 0.0))
        if CLAMP_MAX:
            row_max = tl.clamp(row_max, EPS, ub)
        else:
            row_max = tl.maximum(row_max, EPS)

        y_scale = MAX_FP8 / row_max
        tl.store(y_inv_scale_ptr + row, 1.0 / y_scale)

        y = y * y_scale
        # Clamp A to fp8 range to make sure there's no overflow.
        # This is required for AMD. Nvidia's default saturation
        # handles it, but it's nice to have anyway.
        y_fp8 = tl.clamp(y, -MAX_FP8, MAX_FP8).to(TL_FP8_DTYPE)

        tl.store(y_ptr + row * D + cols, y_fp8, in_bounds)