fbgemm-gpu-genai-nightly 2025.10.10-cp310-cp310-manylinux_2_28_x86_64.whl → 2025.11.9-cp310-cp310-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fbgemm_gpu/__init__.py +36 -18
- fbgemm_gpu/docs/target.genai.json.py +6 -0
- fbgemm_gpu/experimental/example/__init__.py +0 -4
- fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
- fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py +0 -4
- fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +4 -0
- fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +135 -172
- fbgemm_gpu/experimental/gen_ai/__init__.py +0 -4
- fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py +0 -4
- fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +186 -63
- fbgemm_gpu/experimental/gen_ai/bench/__init__.py +0 -4
- fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +2 -2
- fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
- fbgemm_gpu/experimental/gen_ai/moe/__init__.py +0 -4
- fbgemm_gpu/experimental/gen_ai/moe/layers.py +0 -4
- fbgemm_gpu/split_table_batched_embeddings_ops_common.py +43 -10
- fbgemm_gpu/split_table_batched_embeddings_ops_training.py +287 -39
- fbgemm_gpu/tbe/bench/bench_runs.py +7 -0
- fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +6 -1
- fbgemm_gpu/tbe/ssd/training.py +319 -41
- {fbgemm_gpu_genai_nightly-2025.10.10.dist-info → fbgemm_gpu_genai_nightly-2025.11.9.dist-info}/METADATA +1 -2
- {fbgemm_gpu_genai_nightly-2025.10.10.dist-info → fbgemm_gpu_genai_nightly-2025.11.9.dist-info}/RECORD +24 -24
- fbgemm_gpu/docs/version.py +0 -11
- {fbgemm_gpu_genai_nightly-2025.10.10.dist-info → fbgemm_gpu_genai_nightly-2025.11.9.dist-info}/WHEEL +0 -0
- {fbgemm_gpu_genai_nightly-2025.10.10.dist-info → fbgemm_gpu_genai_nightly-2025.11.9.dist-info}/top_level.txt +0 -0
fbgemm_gpu/__init__.py
CHANGED
@@ -5,6 +5,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import json
 import logging
 import os
 import re
@@ -26,6 +27,19 @@ _fbgemm_torch_compat_table = {
 }
 
 
+def _load_target_info(target: str) -> dict[str, str]:
+    try:
+        filepath = os.path.join(
+            os.path.dirname(__file__), "docs", f"target.{target}.json.py"
+        )
+        with open(filepath, "r") as file:
+            data = json.load(file)
+    except Exception:
+        data = {}
+
+    return data
+
+
 def _load_library(filename: str, version: str, no_throw: bool = False) -> None:
     """Load a shared library from the given filename."""
 
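The new helper reads a small JSON payload shipped with the package docs (this release adds `fbgemm_gpu/docs/target.genai.json.py`) and silently falls back to an empty dict when the file is missing or unparsable. A minimal sketch of that read path, assuming a payload with `variant` and `version` keys (the only keys the new `__init__.py` code reads; the sample values are illustrative, not taken from the wheel):

```python
import json
import os


def load_target_info(package_dir: str, target: str) -> dict[str, str]:
    """Best-effort read of the per-target metadata, mirroring _load_target_info."""
    filepath = os.path.join(package_dir, "docs", f"target.{target}.json.py")
    try:
        with open(filepath, "r") as f:
            return json.load(f)
    except Exception:
        # A missing or malformed file is treated as "this target was not built".
        return {}


# Example payload that would satisfy the keys read later in __init__.py
# (illustrative values only):
# {"variant": "cuda", "version": "2025.11.9"}
```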
@@ -98,13 +112,16 @@ open_source: bool = True
 # Trigger the manual addition of docstrings to pybind11-generated operators
 import fbgemm_gpu.docs  # noqa: F401, E402
 
+
+__targets_infos__ = {
+    target: _load_target_info(target) for target in ["default", "genai", "hstu"]
+}
+__targets_infos__ = {k: v for (k, v) in __targets_infos__.items() if v}
+
 try:
-
-
-
-        __variant__,
-        __version__,
-    )
+    __target__, __info__ = next(iter(__targets_infos__.items()))
+    __variant__ = __info__["variant"]
+    __version__ = __info__["version"]
 except Exception:
     __variant__: str = "INTERNAL"
     __version__: str = "INTERNAL"
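The selection logic probes each known target, drops the ones whose metadata could not be loaded, and takes the first surviving entry as the active target; if nothing loads, both attributes fall back to "INTERNAL". A standalone sketch of that behavior, using hypothetical in-memory data in place of the on-disk metadata files:

```python
# Hypothetical result of running _load_target_info for each target.
targets_infos = {
    "default": {},  # not built into this wheel -> filtered out
    "genai": {"variant": "cuda", "version": "2025.11.9"},
    "hstu": {},
}

# Keep only targets whose metadata actually loaded.
targets_infos = {k: v for k, v in targets_infos.items() if v}

try:
    # First surviving target wins (dict insertion order is preserved).
    target, info = next(iter(targets_infos.items()))
    variant = info["variant"]
    version = info["version"]
except Exception:
    target = variant = version = "INTERNAL"

print(target, variant, version)  # genai cuda 2025.11.9
```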
@@ -145,18 +162,19 @@ libraries_to_load = {
     "genai": fbgemm_genai_libraries,
 }
 
-for
-
-
-
-
-
-
-
-
-
-
-
+for target, info in __targets_infos__.items():
+    for library in libraries_to_load.get(target, []):
+        # NOTE: In all cases, we want to throw an error if we cannot load the
+        # library. However, this appears to break the OSS documentation build,
+        # where the Python documentation doesn't show up in the generated docs.
+        #
+        # To work around this problem, we introduce a fake build variant called
+        # `docs` and we only throw a library load error when the variant is not
+        # `docs`. For more information, see:
+        #
+        # https://github.com/pytorch/FBGEMM/pull/3477
+        # https://github.com/pytorch/FBGEMM/pull/3717
+        _load_library(f"{library}.so", info["version"], info["variant"] == "docs")
 
 try:
     # Trigger meta operator registrations
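`_load_library` already takes a `no_throw` flag (see its signature in the earlier hunk); the rewritten loop now derives that flag from the per-target variant so that only the fake `docs` variant tolerates load failures. A minimal sketch of that control flow, assuming a loader built on `torch.ops.load_library` (the real `_load_library` body is not shown in this diff):

```python
import logging

import torch

logger = logging.getLogger(__name__)


def load_library(filename: str, version: str, no_throw: bool = False) -> None:
    """Load a native .so; raise on failure unless no_throw is set."""
    try:
        torch.ops.load_library(filename)
        logger.info(f"Loaded {filename} (version {version})")
    except Exception as error:
        if no_throw:
            # The 'docs' build variant only needs the Python API surface,
            # so a missing native library is tolerated.
            logger.warning(f"Could not load {filename}: {error}")
        else:
            raise


# Per-target dispatch as in the new __init__.py loop (names illustrative):
# for target, info in targets_infos.items():
#     for library in libraries_to_load.get(target, []):
#         load_library(f"{library}.so", info["version"], info["variant"] == "docs")
```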
@@ -15,10 +15,6 @@ try:
     # pyre-ignore[21]
     # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
     from fbgemm_gpu import open_source
-
-    # pyre-ignore[21]
-    # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
-    from fbgemm_gpu.docs.version import __version__  # noqa: F401
 except Exception:
     open_source: bool = False
 
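This hunk, and the similar `__init__.py` hunks later in the diff, drop the old `from fbgemm_gpu.docs.version import __version__` fallback; the top-level package now exposes version information from the target metadata loaded above. A short usage example (assumes the wheel installed correctly):

```python
import fbgemm_gpu

# Populated from docs/target.<target>.json.py, or "INTERNAL" when no
# target metadata could be loaded.
print(fbgemm_gpu.__version__)
print(fbgemm_gpu.__variant__)
```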
Binary file
@@ -11,9 +11,5 @@ try:
     # pyre-ignore[21]
     # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
     from fbgemm_gpu import open_source
-
-    # pyre-ignore[21]
-    # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
-    from fbgemm_gpu.docs.version import __version__  # noqa: F401
 except Exception:
     open_source: bool = False
@@ -3840,6 +3840,10 @@ _MATMUL_CONFIG_TUPLES_PINGPONG_4K_8K_16K = [
     (256, 128, 128, 1, 1, 2, 16, 1, 8, 2),
     (128, 256, 128, 2, 1, 2, 16, 2, 4, 1),
     (256, 128, 64, 2, 1, 2, 16, 1, 4, 2),
+    (128, 128, 256, 2, 1, 0, 16, 2, 8, 2),
+    (128, 64, 128, 2, 1, 2, 16, 2, 4, 2),
+    (128, 128, 64, 2, 1, 0, 16, 1, 4, 2),
+    (128, 128, 128, 1, 1, 2, 16, 1, 4, 2),
 ]
 
 
@@ -509,14 +509,13 @@ def _fbgemm_grouped_gemm_ws(
         num_tiles = num_m_tiles * NUM_N_TILES
 
         if USE_TMA_STORE:
-
-
-
-
-
-
-
-            )
+            c_desc_ptr = tl.make_tensor_descriptor(
+                c_ptr + M_start_offset * N,
+                shape=[m_size, N],
+                # pyre-ignore
+                strides=[N, 1],
+                block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
+            )
 
         # Move across tiles
         next_iterated_tiles = iterated_tiles + num_tiles
@@ -534,72 +533,59 @@ def _fbgemm_grouped_gemm_ws(
             m_offset = (M_start_offset + tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
             n_offset = (N_start_offset + tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
             for k_offset in range(0, K, BLOCK_SIZE_K):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                else:
-                    accumulator += tl.dot(a, b.T)
+                a = tl._experimental_descriptor_load(
+                    a_desc_ptr,
+                    [m_offset, k_offset],
+                    [BLOCK_SIZE_M, BLOCK_SIZE_K],
+                    dtype,
+                )
+                b = tl._experimental_descriptor_load(
+                    b_desc_ptr,
+                    [n_offset, k_offset],
+                    [BLOCK_SIZE_N, BLOCK_SIZE_K],
+                    dtype,
+                )
+                if USE_FAST_ACCUM:
+                    accumulator = tl.dot(a, b.T, accumulator)
+                else:
+                    accumulator += tl.dot(a, b.T)
 
             if USE_TMA_STORE:
-
-
-
-
-
-
-
-                )
+                m_offset = (tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
+                n_offset = (tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
+                # pyre-ignore
+                c_desc_ptr.store(
+                    [m_offset, n_offset],
+                    accumulator.to(c_ptr.dtype.element_ty),
+                )
             elif FUSE_SCATTER_ADD:
-
-
-
-
-                mask
-
-
-
-
-
-
-
-
-
-
-                    c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
-                    c,
-                    mask=mask[:, None],
-                    sem="relaxed",
-                )
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                mask = offs_am < m_size
+                m_offsets = tl.load(
+                    scatter_add_indices + M_start_offset + offs_am,
+                    mask=mask,
+                    cache_modifier=".ca",
+                )
+                offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                c = accumulator.to(c_ptr.dtype.element_ty)
+                tl.atomic_add(
+                    c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
+                    c,
+                    mask=mask[:, None],
+                    sem="relaxed",
+                )
             else:
-
-
-
-
-
-
-
-                c
-
-
-
-                    + offs_bn[None, :],
-                    c,
-                    mask=offs_am[:, None] < m_size,
-                    cache_modifier=".cs",
-                )
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                c = accumulator.to(c_ptr.dtype.element_ty)
+                tl.store(
+                    c_ptr
+                    + (M_start_offset + offs_am[:, None]) * N
+                    + offs_bn[None, :],
+                    c,
+                    mask=offs_am[:, None] < m_size,
+                    cache_modifier=".cs",
+                )
             tidx += NUM_SMS
 
         iterated_tiles += num_tiles
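The rewritten loop keeps the same inner-product accumulation (`tl.dot(a, b.T)`, optionally with fast accumulation) and the same three epilogues: TMA store, fused scatter-add via `tl.atomic_add`, and a plain masked `tl.store`. For readers who do not work in Triton, a PyTorch reference for what the fused scatter-add path computes per group, as a sketch under assumed shapes and layout (per-group weights stacked along N), not the FBGEMM API:

```python
import torch


def grouped_gemm_scatter_add_ref(
    a: torch.Tensor,                    # [sum(m_sizes), K], groups concatenated along M
    b: torch.Tensor,                    # [G * N, K], per-group weights stacked along N (assumed layout)
    m_sizes: list[int],                 # rows of each group
    N: int,
    scatter_add_indices: torch.Tensor,  # [sum(m_sizes)], destination row per input row
    out: torch.Tensor,                  # [M_out, N], accumulated in place
) -> torch.Tensor:
    m_start = 0
    for g, m_size in enumerate(m_sizes):
        if m_size > 0:
            rows = slice(m_start, m_start + m_size)
            b_g = b[g * N : (g + 1) * N]             # this group's [N, K] weight
            c = a[rows] @ b_g.T                      # accumulator += tl.dot(a, b.T)
            idx = scatter_add_indices[rows]
            out.index_add_(0, idx, c.to(out.dtype))  # the FUSE_SCATTER_ADD epilogue
        m_start += m_size
    return out
```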
@@ -841,14 +827,13 @@ def _fbgemm_grouped_gemm_fp8_rowwise_ws(
         num_tiles = num_m_tiles * NUM_N_TILES
 
         if USE_TMA_STORE:
-
-
-
-
-
-
-
-            )
+            c_desc_ptr = tl.make_tensor_descriptor(
+                c_ptr + M_start_offset * N,
+                shape=[m_size, N],
+                # pyre-ignore
+                strides=[N, 1],
+                block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
+            )
 
         # Move across tiles
         next_iterated_tiles = iterated_tiles + num_tiles
@@ -867,107 +852,85 @@ def _fbgemm_grouped_gemm_fp8_rowwise_ws(
             m_offset = (M_start_offset + tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
             n_offset = (N_start_offset + tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
             for k_offset in range(0, K, BLOCK_SIZE_K):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                else:
-                    accumulator += tl.dot(a, b.T)
+                a = tl._experimental_descriptor_load(
+                    a_desc_ptr,
+                    [m_offset, k_offset],
+                    [BLOCK_SIZE_M, BLOCK_SIZE_K],
+                    dtype,
+                )
+                b = tl._experimental_descriptor_load(
+                    b_desc_ptr,
+                    [n_offset, k_offset],
+                    [BLOCK_SIZE_N, BLOCK_SIZE_K],
+                    dtype,
+                )
+                if USE_FAST_ACCUM:
+                    accumulator = tl.dot(a, b.T, accumulator)
+                else:
+                    accumulator += tl.dot(a, b.T)
 
             if USE_TMA_LOAD_ON_SCALES:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    mask=offs_am[:, None] < m_size,
-                    cache_modifier=".ca",
-                )
-                c = accumulator.to(tl.float32) * a_scale * b_scale[None, :]
+                b_scale = tl._experimental_descriptor_load(
+                    b_scale_desc_ptr,
+                    [n_offset],
+                    [BLOCK_SIZE_N],
+                    tl.float32,
+                )
+
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                a_scale = tl.load(
+                    a_scale_ptr + M_start_offset + offs_am[:, None],
+                    mask=offs_am[:, None] < m_size,
+                    cache_modifier=".ca",
+                )
+                c = accumulator.to(tl.float32) * a_scale * b_scale[None, :]
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-                b_scale = tl.load(
-                    b_scale_ptr + N_start_offset + offs_bn[None, :],
-                    cache_modifier=".ca",
-                )
-                c = accumulator.to(tl.float32) * a_scale * b_scale
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                a_scale = tl.load(
+                    a_scale_ptr + M_start_offset + offs_am[:, None],
+                    mask=offs_am[:, None] < m_size,
+                    cache_modifier=".ca",
+                )
+                b_scale = tl.load(
+                    b_scale_ptr + N_start_offset + offs_bn[None, :],
+                    cache_modifier=".ca",
+                )
+                c = accumulator.to(tl.float32) * a_scale * b_scale
 
             if USE_TMA_STORE:
-
-
-
-
-
-
-                )
+                m_offset = (tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
+                n_offset = (tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
+                # pyre-ignore
+                c_desc_ptr.store(
+                    [m_offset, n_offset], c.to(c_ptr.dtype.element_ty)
+                )
             elif FUSE_SCATTER_ADD:
-
-
-
-
-                mask
-
-
-
-
-
-
-
-
-
-                    c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
-                    c,
-                    mask=mask[:, None],
-                    sem="relaxed",
-                )
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                mask = offs_am < m_size
+                m_offsets = tl.load(
+                    scatter_add_indices + M_start_offset + offs_am,
+                    mask=mask,
+                    cache_modifier=".ca",
+                )
+                offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                tl.atomic_add(
+                    c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
+                    c,
+                    mask=mask[:, None],
+                    sem="relaxed",
+                )
             else:
-
-
-
-
-
-
-
-
-
-
-                    + offs_bn[None, :],
-                    c,
-                    mask=offs_am[:, None] < m_size,
-                    cache_modifier=".cs",
-                )
+                offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+                offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+                tl.store(
+                    c_ptr
+                    + (M_start_offset + offs_am[:, None]) * N
+                    + offs_bn[None, :],
+                    c,
+                    mask=offs_am[:, None] < m_size,
+                    cache_modifier=".cs",
+                )
             tidx += NUM_SMS
 
         iterated_tiles += num_tiles
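In both scale-loading branches, the fp8 rowwise kernel dequantizes the raw accumulator with a per-row scale for A and a per-column scale for B before the store (`c = accumulator.to(tl.float32) * a_scale * b_scale[None, :]`). In plain PyTorch terms, per group, this is equivalent to the following sketch (a reference for the arithmetic, not the FBGEMM op):

```python
import torch


def fp8_rowwise_dequant_ref(
    acc: torch.Tensor,      # [m_size, N] raw matmul accumulator for one group
    a_scale: torch.Tensor,  # [m_size] per-row scale of A
    b_scale: torch.Tensor,  # [N] per-column scale of B (this group's slice)
) -> torch.Tensor:
    # Matches: c = accumulator.to(tl.float32) * a_scale * b_scale[None, :]
    return acc.to(torch.float32) * a_scale[:, None] * b_scale[None, :]
```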
@@ -15,10 +15,6 @@ try:
     # pyre-ignore[21]
     # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
     from fbgemm_gpu import open_source
-
-    # pyre-ignore[21]
-    # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
-    from fbgemm_gpu.docs.version import __version__  # noqa: F401
 except Exception:
     open_source: bool = False
 
@@ -10,10 +10,6 @@ try:
     # pyre-ignore[21]
     # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
     from fbgemm_gpu import open_source
-
-    # pyre-ignore[21]
-    # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
-    from fbgemm_gpu.docs.version import __version__  # noqa: F401
 except Exception:
     open_source: bool = False
 