fbgemm-gpu-genai-nightly 2025.10.10__cp310-cp310-manylinux_2_28_x86_64.whl → 2025.11.9__cp310-cp310-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fbgemm-gpu-genai-nightly might be problematic. Click here for more details.

Files changed (25)
  1. fbgemm_gpu/__init__.py +36 -18
  2. fbgemm_gpu/docs/target.genai.json.py +6 -0
  3. fbgemm_gpu/experimental/example/__init__.py +0 -4
  4. fbgemm_gpu/experimental/example/fbgemm_gpu_experimental_example_py.so +0 -0
  5. fbgemm_gpu/experimental/gemm/triton_gemm/__init__.py +0 -4
  6. fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +4 -0
  7. fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py +135 -172
  8. fbgemm_gpu/experimental/gen_ai/__init__.py +0 -4
  9. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/__init__.py +0 -4
  10. fbgemm_gpu/experimental/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py +186 -63
  11. fbgemm_gpu/experimental/gen_ai/bench/__init__.py +0 -4
  12. fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py +2 -2
  13. fbgemm_gpu/experimental/gen_ai/fbgemm_gpu_experimental_gen_ai.so +0 -0
  14. fbgemm_gpu/experimental/gen_ai/moe/__init__.py +0 -4
  15. fbgemm_gpu/experimental/gen_ai/moe/layers.py +0 -4
  16. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +43 -10
  17. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +287 -39
  18. fbgemm_gpu/tbe/bench/bench_runs.py +7 -0
  19. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +6 -1
  20. fbgemm_gpu/tbe/ssd/training.py +319 -41
  21. {fbgemm_gpu_genai_nightly-2025.10.10.dist-info → fbgemm_gpu_genai_nightly-2025.11.9.dist-info}/METADATA +1 -2
  22. {fbgemm_gpu_genai_nightly-2025.10.10.dist-info → fbgemm_gpu_genai_nightly-2025.11.9.dist-info}/RECORD +24 -24
  23. fbgemm_gpu/docs/version.py +0 -11
  24. {fbgemm_gpu_genai_nightly-2025.10.10.dist-info → fbgemm_gpu_genai_nightly-2025.11.9.dist-info}/WHEEL +0 -0
  25. {fbgemm_gpu_genai_nightly-2025.10.10.dist-info → fbgemm_gpu_genai_nightly-2025.11.9.dist-info}/top_level.txt +0 -0
fbgemm_gpu/__init__.py CHANGED
@@ -5,6 +5,7 @@
5
5
  # This source code is licensed under the BSD-style license found in the
6
6
  # LICENSE file in the root directory of this source tree.
7
7
 
8
+ import json
8
9
  import logging
9
10
  import os
10
11
  import re
@@ -26,6 +27,19 @@ _fbgemm_torch_compat_table = {
26
27
  }
27
28
 
28
29
 
30
+ def _load_target_info(target: str) -> dict[str, str]:
31
+ try:
32
+ filepath = os.path.join(
33
+ os.path.dirname(__file__), "docs", f"target.{target}.json.py"
34
+ )
35
+ with open(filepath, "r") as file:
36
+ data = json.load(file)
37
+ except Exception:
38
+ data = {}
39
+
40
+ return data
41
+
42
+
29
43
  def _load_library(filename: str, version: str, no_throw: bool = False) -> None:
30
44
  """Load a shared library from the given filename."""
31
45
 
@@ -98,13 +112,16 @@ open_source: bool = True
98
112
  # Trigger the manual addition of docstrings to pybind11-generated operators
99
113
  import fbgemm_gpu.docs # noqa: F401, E402
100
114
 
115
+
116
+ __targets_infos__ = {
117
+ target: _load_target_info(target) for target in ["default", "genai", "hstu"]
118
+ }
119
+ __targets_infos__ = {k: v for (k, v) in __targets_infos__.items() if v}
120
+
101
121
  try:
102
- # Export the version string from the version file auto-generated by setup.py
103
- from fbgemm_gpu.docs.version import ( # noqa: F401, E402
104
- __target__,
105
- __variant__,
106
- __version__,
107
- )
122
+ __target__, __info__ = next(iter(__targets_infos__.items()))
123
+ __variant__ = __info__["variant"]
124
+ __version__ = __info__["version"]
108
125
  except Exception:
109
126
  __variant__: str = "INTERNAL"
110
127
  __version__: str = "INTERNAL"
@@ -145,18 +162,19 @@ libraries_to_load = {
145
162
  "genai": fbgemm_genai_libraries,
146
163
  }
147
164
 
148
- for library in libraries_to_load.get(__target__, []):
149
- # NOTE: In all cases, we want to throw an error if we cannot load the
150
- # library. However, this appears to break the OSS documentation build,
151
- # where the Python documentation doesn't show up in the generated docs.
152
- #
153
- # To work around this problem, we introduce a fake build variant called
154
- # `docs` and we only throw a library load error when the variant is not
155
- # `docs`. For more information, see:
156
- #
157
- # https://github.com/pytorch/FBGEMM/pull/3477
158
- # https://github.com/pytorch/FBGEMM/pull/3717
159
- _load_library(f"{library}.so", __version__, __variant__ == "docs")
165
+ for target, info in __targets_infos__.items():
166
+ for library in libraries_to_load.get(target, []):
167
+ # NOTE: In all cases, we want to throw an error if we cannot load the
168
+ # library. However, this appears to break the OSS documentation build,
169
+ # where the Python documentation doesn't show up in the generated docs.
170
+ #
171
+ # To work around this problem, we introduce a fake build variant called
172
+ # `docs` and we only throw a library load error when the variant is not
173
+ # `docs`. For more information, see:
174
+ #
175
+ # https://github.com/pytorch/FBGEMM/pull/3477
176
+ # https://github.com/pytorch/FBGEMM/pull/3717
177
+ _load_library(f"{library}.so", info["version"], info["variant"] == "docs")
160
178
 
161
179
  try:
162
180
  # Trigger meta operator registrations
@@ -0,0 +1,6 @@
1
+
2
+ {
3
+ "version": "2025.11.9",
4
+ "target": "genai",
5
+ "variant": "cuda"
6
+ }
@@ -15,10 +15,6 @@ try:
15
15
  # pyre-ignore[21]
16
16
  # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
17
17
  from fbgemm_gpu import open_source
18
-
19
- # pyre-ignore[21]
20
- # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
21
- from fbgemm_gpu.docs.version import __version__ # noqa: F401
22
18
  except Exception:
23
19
  open_source: bool = False
24
20
 
@@ -11,9 +11,5 @@ try:
11
11
  # pyre-ignore[21]
12
12
  # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
13
13
  from fbgemm_gpu import open_source
14
-
15
- # pyre-ignore[21]
16
- # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
17
- from fbgemm_gpu.docs.version import __version__ # noqa: F401
18
14
  except Exception:
19
15
  open_source: bool = False
@@ -3840,6 +3840,10 @@ _MATMUL_CONFIG_TUPLES_PINGPONG_4K_8K_16K = [
3840
3840
  (256, 128, 128, 1, 1, 2, 16, 1, 8, 2),
3841
3841
  (128, 256, 128, 2, 1, 2, 16, 2, 4, 1),
3842
3842
  (256, 128, 64, 2, 1, 2, 16, 1, 4, 2),
3843
+ (128, 128, 256, 2, 1, 0, 16, 2, 8, 2),
3844
+ (128, 64, 128, 2, 1, 2, 16, 2, 4, 2),
3845
+ (128, 128, 64, 2, 1, 0, 16, 1, 4, 2),
3846
+ (128, 128, 128, 1, 1, 2, 16, 1, 4, 2),
3843
3847
  ]
3844
3848
 
3845
3849
 
@@ -509,14 +509,13 @@ def _fbgemm_grouped_gemm_ws(
509
509
  num_tiles = num_m_tiles * NUM_N_TILES
510
510
 
511
511
  if USE_TMA_STORE:
512
- with tl.async_task([0]):
513
- c_desc_ptr = tl.make_tensor_descriptor(
514
- c_ptr + M_start_offset * N,
515
- shape=[m_size, N],
516
- # pyre-ignore
517
- strides=[N, 1],
518
- block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
519
- )
512
+ c_desc_ptr = tl.make_tensor_descriptor(
513
+ c_ptr + M_start_offset * N,
514
+ shape=[m_size, N],
515
+ # pyre-ignore
516
+ strides=[N, 1],
517
+ block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
518
+ )
520
519
 
521
520
  # Move across tiles
522
521
  next_iterated_tiles = iterated_tiles + num_tiles
@@ -534,72 +533,59 @@ def _fbgemm_grouped_gemm_ws(
534
533
  m_offset = (M_start_offset + tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
535
534
  n_offset = (N_start_offset + tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
536
535
  for k_offset in range(0, K, BLOCK_SIZE_K):
537
- with tl.async_task([0]):
538
- a = tl._experimental_descriptor_load(
539
- a_desc_ptr,
540
- [m_offset, k_offset],
541
- [BLOCK_SIZE_M, BLOCK_SIZE_K],
542
- dtype,
543
- )
544
- b = tl._experimental_descriptor_load(
545
- b_desc_ptr,
546
- [n_offset, k_offset],
547
- [BLOCK_SIZE_N, BLOCK_SIZE_K],
548
- dtype,
549
- )
550
- with tl.async_task([1, NUM_CONSUMER_GROUPS]):
551
- if USE_FAST_ACCUM:
552
- accumulator = tl.dot(a, b.T, accumulator)
553
- else:
554
- accumulator += tl.dot(a, b.T)
536
+ a = tl._experimental_descriptor_load(
537
+ a_desc_ptr,
538
+ [m_offset, k_offset],
539
+ [BLOCK_SIZE_M, BLOCK_SIZE_K],
540
+ dtype,
541
+ )
542
+ b = tl._experimental_descriptor_load(
543
+ b_desc_ptr,
544
+ [n_offset, k_offset],
545
+ [BLOCK_SIZE_N, BLOCK_SIZE_K],
546
+ dtype,
547
+ )
548
+ if USE_FAST_ACCUM:
549
+ accumulator = tl.dot(a, b.T, accumulator)
550
+ else:
551
+ accumulator += tl.dot(a, b.T)
555
552
 
556
553
  if USE_TMA_STORE:
557
- with tl.async_task([1, NUM_CONSUMER_GROUPS]):
558
- m_offset = (tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
559
- n_offset = (tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
560
- # pyre-ignore
561
- c_desc_ptr.store(
562
- [m_offset, n_offset],
563
- accumulator.to(c_ptr.dtype.element_ty),
564
- )
554
+ m_offset = (tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
555
+ n_offset = (tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
556
+ # pyre-ignore
557
+ c_desc_ptr.store(
558
+ [m_offset, n_offset],
559
+ accumulator.to(c_ptr.dtype.element_ty),
560
+ )
565
561
  elif FUSE_SCATTER_ADD:
566
- with tl.async_task([1, NUM_CONSUMER_GROUPS]):
567
- offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(
568
- 0, BLOCK_SIZE_M
569
- )
570
- mask = offs_am < m_size
571
- m_offsets = tl.load(
572
- scatter_add_indices + M_start_offset + offs_am,
573
- mask=mask,
574
- cache_modifier=".ca",
575
- )
576
- offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(
577
- 0, BLOCK_SIZE_N
578
- )
579
- c = accumulator.to(c_ptr.dtype.element_ty)
580
- tl.atomic_add(
581
- c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
582
- c,
583
- mask=mask[:, None],
584
- sem="relaxed",
585
- )
562
+ offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
563
+ mask = offs_am < m_size
564
+ m_offsets = tl.load(
565
+ scatter_add_indices + M_start_offset + offs_am,
566
+ mask=mask,
567
+ cache_modifier=".ca",
568
+ )
569
+ offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
570
+ c = accumulator.to(c_ptr.dtype.element_ty)
571
+ tl.atomic_add(
572
+ c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
573
+ c,
574
+ mask=mask[:, None],
575
+ sem="relaxed",
576
+ )
586
577
  else:
587
- with tl.async_task([1, NUM_CONSUMER_GROUPS]):
588
- offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(
589
- 0, BLOCK_SIZE_M
590
- )
591
- offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(
592
- 0, BLOCK_SIZE_N
593
- )
594
- c = accumulator.to(c_ptr.dtype.element_ty)
595
- tl.store(
596
- c_ptr
597
- + (M_start_offset + offs_am[:, None]) * N
598
- + offs_bn[None, :],
599
- c,
600
- mask=offs_am[:, None] < m_size,
601
- cache_modifier=".cs",
602
- )
578
+ offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
579
+ offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
580
+ c = accumulator.to(c_ptr.dtype.element_ty)
581
+ tl.store(
582
+ c_ptr
583
+ + (M_start_offset + offs_am[:, None]) * N
584
+ + offs_bn[None, :],
585
+ c,
586
+ mask=offs_am[:, None] < m_size,
587
+ cache_modifier=".cs",
588
+ )
603
589
  tidx += NUM_SMS
604
590
 
605
591
  iterated_tiles += num_tiles
@@ -841,14 +827,13 @@ def _fbgemm_grouped_gemm_fp8_rowwise_ws(
841
827
  num_tiles = num_m_tiles * NUM_N_TILES
842
828
 
843
829
  if USE_TMA_STORE:
844
- with tl.async_task([0]):
845
- c_desc_ptr = tl.make_tensor_descriptor(
846
- c_ptr + M_start_offset * N,
847
- shape=[m_size, N],
848
- # pyre-ignore
849
- strides=[N, 1],
850
- block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
851
- )
830
+ c_desc_ptr = tl.make_tensor_descriptor(
831
+ c_ptr + M_start_offset * N,
832
+ shape=[m_size, N],
833
+ # pyre-ignore
834
+ strides=[N, 1],
835
+ block_shape=[BLOCK_SIZE_M, BLOCK_SIZE_N],
836
+ )
852
837
 
853
838
  # Move across tiles
854
839
  next_iterated_tiles = iterated_tiles + num_tiles
@@ -867,107 +852,85 @@ def _fbgemm_grouped_gemm_fp8_rowwise_ws(
867
852
  m_offset = (M_start_offset + tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
868
853
  n_offset = (N_start_offset + tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
869
854
  for k_offset in range(0, K, BLOCK_SIZE_K):
870
- with tl.async_task([0]):
871
- a = tl._experimental_descriptor_load(
872
- a_desc_ptr,
873
- [m_offset, k_offset],
874
- [BLOCK_SIZE_M, BLOCK_SIZE_K],
875
- dtype,
876
- )
877
- b = tl._experimental_descriptor_load(
878
- b_desc_ptr,
879
- [n_offset, k_offset],
880
- [BLOCK_SIZE_N, BLOCK_SIZE_K],
881
- dtype,
882
- )
883
- with tl.async_task([1, NUM_CONSUMER_GROUPS]):
884
- if USE_FAST_ACCUM:
885
- accumulator = tl.dot(a, b.T, accumulator)
886
- else:
887
- accumulator += tl.dot(a, b.T)
855
+ a = tl._experimental_descriptor_load(
856
+ a_desc_ptr,
857
+ [m_offset, k_offset],
858
+ [BLOCK_SIZE_M, BLOCK_SIZE_K],
859
+ dtype,
860
+ )
861
+ b = tl._experimental_descriptor_load(
862
+ b_desc_ptr,
863
+ [n_offset, k_offset],
864
+ [BLOCK_SIZE_N, BLOCK_SIZE_K],
865
+ dtype,
866
+ )
867
+ if USE_FAST_ACCUM:
868
+ accumulator = tl.dot(a, b.T, accumulator)
869
+ else:
870
+ accumulator += tl.dot(a, b.T)
888
871
 
889
872
  if USE_TMA_LOAD_ON_SCALES:
890
- with tl.async_task([0]):
891
- b_scale = tl._experimental_descriptor_load(
892
- b_scale_desc_ptr,
893
- [n_offset],
894
- [BLOCK_SIZE_N],
895
- tl.float32,
896
- )
897
-
898
- with tl.async_task([1, NUM_CONSUMER_GROUPS]):
899
- offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(
900
- 0, BLOCK_SIZE_M
901
- )
902
- a_scale = tl.load(
903
- a_scale_ptr + M_start_offset + offs_am[:, None],
904
- mask=offs_am[:, None] < m_size,
905
- cache_modifier=".ca",
906
- )
907
- c = accumulator.to(tl.float32) * a_scale * b_scale[None, :]
873
+ b_scale = tl._experimental_descriptor_load(
874
+ b_scale_desc_ptr,
875
+ [n_offset],
876
+ [BLOCK_SIZE_N],
877
+ tl.float32,
878
+ )
879
+
880
+ offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
881
+ a_scale = tl.load(
882
+ a_scale_ptr + M_start_offset + offs_am[:, None],
883
+ mask=offs_am[:, None] < m_size,
884
+ cache_modifier=".ca",
885
+ )
886
+ c = accumulator.to(tl.float32) * a_scale * b_scale[None, :]
908
887
  else:
909
- with tl.async_task([1, NUM_CONSUMER_GROUPS]):
910
- offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(
911
- 0, BLOCK_SIZE_M
912
- )
913
- offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(
914
- 0, BLOCK_SIZE_N
915
- )
916
- a_scale = tl.load(
917
- a_scale_ptr + M_start_offset + offs_am[:, None],
918
- mask=offs_am[:, None] < m_size,
919
- cache_modifier=".ca",
920
- )
921
- b_scale = tl.load(
922
- b_scale_ptr + N_start_offset + offs_bn[None, :],
923
- cache_modifier=".ca",
924
- )
925
- c = accumulator.to(tl.float32) * a_scale * b_scale
888
+ offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
889
+ offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
890
+ a_scale = tl.load(
891
+ a_scale_ptr + M_start_offset + offs_am[:, None],
892
+ mask=offs_am[:, None] < m_size,
893
+ cache_modifier=".ca",
894
+ )
895
+ b_scale = tl.load(
896
+ b_scale_ptr + N_start_offset + offs_bn[None, :],
897
+ cache_modifier=".ca",
898
+ )
899
+ c = accumulator.to(tl.float32) * a_scale * b_scale
926
900
 
927
901
  if USE_TMA_STORE:
928
- with tl.async_task([1, NUM_CONSUMER_GROUPS]):
929
- m_offset = (tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
930
- n_offset = (tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
931
- # pyre-ignore
932
- c_desc_ptr.store(
933
- [m_offset, n_offset], c.to(c_ptr.dtype.element_ty)
934
- )
902
+ m_offset = (tile_m_idx * BLOCK_SIZE_M).to(tl.int32)
903
+ n_offset = (tile_n_idx * BLOCK_SIZE_N).to(tl.int32)
904
+ # pyre-ignore
905
+ c_desc_ptr.store(
906
+ [m_offset, n_offset], c.to(c_ptr.dtype.element_ty)
907
+ )
935
908
  elif FUSE_SCATTER_ADD:
936
- with tl.async_task([1, NUM_CONSUMER_GROUPS]):
937
- offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(
938
- 0, BLOCK_SIZE_M
939
- )
940
- mask = offs_am < m_size
941
- m_offsets = tl.load(
942
- scatter_add_indices + M_start_offset + offs_am,
943
- mask=mask,
944
- cache_modifier=".ca",
945
- )
946
- offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(
947
- 0, BLOCK_SIZE_N
948
- )
949
- tl.atomic_add(
950
- c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
951
- c,
952
- mask=mask[:, None],
953
- sem="relaxed",
954
- )
909
+ offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
910
+ mask = offs_am < m_size
911
+ m_offsets = tl.load(
912
+ scatter_add_indices + M_start_offset + offs_am,
913
+ mask=mask,
914
+ cache_modifier=".ca",
915
+ )
916
+ offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
917
+ tl.atomic_add(
918
+ c_ptr + m_offsets[:, None] * N + offs_bn[None, :],
919
+ c,
920
+ mask=mask[:, None],
921
+ sem="relaxed",
922
+ )
955
923
  else:
956
- with tl.async_task([1, NUM_CONSUMER_GROUPS]):
957
- offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(
958
- 0, BLOCK_SIZE_M
959
- )
960
- offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(
961
- 0, BLOCK_SIZE_N
962
- )
963
- tl.store(
964
- c_ptr
965
- + (M_start_offset + offs_am[:, None]) * N
966
- + offs_bn[None, :],
967
- c,
968
- mask=offs_am[:, None] < m_size,
969
- cache_modifier=".cs",
970
- )
924
+ offs_am = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
925
+ offs_bn = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
926
+ tl.store(
927
+ c_ptr
928
+ + (M_start_offset + offs_am[:, None]) * N
929
+ + offs_bn[None, :],
930
+ c,
931
+ mask=offs_am[:, None] < m_size,
932
+ cache_modifier=".cs",
933
+ )
971
934
  tidx += NUM_SMS
972
935
 
973
936
  iterated_tiles += num_tiles
@@ -15,10 +15,6 @@ try:
15
15
  # pyre-ignore[21]
16
16
  # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
17
17
  from fbgemm_gpu import open_source
18
-
19
- # pyre-ignore[21]
20
- # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
21
- from fbgemm_gpu.docs.version import __version__ # noqa: F401
22
18
  except Exception:
23
19
  open_source: bool = False
24
20
 
@@ -10,10 +10,6 @@ try:
10
10
  # pyre-ignore[21]
11
11
  # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
12
12
  from fbgemm_gpu import open_source
13
-
14
- # pyre-ignore[21]
15
- # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
16
- from fbgemm_gpu.docs.version import __version__ # noqa: F401
17
13
  except Exception:
18
14
  open_source: bool = False
19
15