fbgemm-gpu-nightly-cpu 2025.7.19__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. fbgemm_gpu/__init__.py +112 -19
  2. fbgemm_gpu/asmjit.so +0 -0
  3. fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
  4. fbgemm_gpu/config/feature_list.py +7 -1
  5. fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
  6. fbgemm_gpu/docs/sparse_ops.py +118 -0
  7. fbgemm_gpu/docs/target.default.json.py +6 -0
  8. fbgemm_gpu/enums.py +3 -4
  9. fbgemm_gpu/fbgemm.so +0 -0
  10. fbgemm_gpu/fbgemm_gpu_config.so +0 -0
  11. fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
  12. fbgemm_gpu/fbgemm_gpu_py.so +0 -0
  13. fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
  14. fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
  15. fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
  16. fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
  17. fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
  18. fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
  19. fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
  20. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
  21. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
  22. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
  23. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
  24. fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
  25. fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
  26. fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
  27. fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
  28. fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
  29. fbgemm_gpu/quantize/__init__.py +2 -0
  30. fbgemm_gpu/quantize/quantize_ops.py +1 -0
  31. fbgemm_gpu/quantize_comm.py +29 -12
  32. fbgemm_gpu/quantize_utils.py +88 -8
  33. fbgemm_gpu/runtime_monitor.py +9 -5
  34. fbgemm_gpu/sll/__init__.py +3 -0
  35. fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
  36. fbgemm_gpu/sll/triton/__init__.py +0 -10
  37. fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
  38. fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
  39. fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
  40. fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
  41. fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
  42. fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
  43. fbgemm_gpu/sparse_ops.py +190 -54
  44. fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +12 -0
  45. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +12 -5
  46. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +14 -7
  47. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +2 -0
  48. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +2 -0
  49. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +12 -5
  50. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +12 -5
  51. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +12 -5
  52. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +12 -5
  53. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +12 -5
  54. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +12 -5
  55. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +12 -5
  56. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +12 -5
  57. fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +12 -5
  58. fbgemm_gpu/split_embedding_configs.py +134 -37
  59. fbgemm_gpu/split_embedding_inference_converter.py +7 -6
  60. fbgemm_gpu/split_table_batched_embeddings_ops_common.py +117 -24
  61. fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +37 -37
  62. fbgemm_gpu/split_table_batched_embeddings_ops_training.py +764 -123
  63. fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
  64. fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
  65. fbgemm_gpu/tbe/bench/__init__.py +6 -1
  66. fbgemm_gpu/tbe/bench/bench_config.py +14 -3
  67. fbgemm_gpu/tbe/bench/bench_runs.py +163 -14
  68. fbgemm_gpu/tbe/bench/benchmark_click_interface.py +5 -2
  69. fbgemm_gpu/tbe/bench/eeg_cli.py +3 -3
  70. fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +3 -2
  71. fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
  72. fbgemm_gpu/tbe/bench/tbe_data_config.py +115 -197
  73. fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
  74. fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +108 -8
  75. fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +15 -8
  76. fbgemm_gpu/tbe/bench/utils.py +129 -5
  77. fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +22 -19
  78. fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -4
  79. fbgemm_gpu/tbe/ssd/common.py +1 -0
  80. fbgemm_gpu/tbe/ssd/inference.py +15 -15
  81. fbgemm_gpu/tbe/ssd/training.py +1292 -267
  82. fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +2 -3
  83. fbgemm_gpu/tbe/stats/bench_params_reporter.py +198 -42
  84. fbgemm_gpu/tbe/utils/offsets.py +6 -6
  85. fbgemm_gpu/tbe/utils/quantize.py +8 -8
  86. fbgemm_gpu/tbe/utils/requests.py +15 -15
  87. fbgemm_gpu/tbe_input_multiplexer.py +10 -11
  88. fbgemm_gpu/triton/common.py +0 -1
  89. fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
  90. fbgemm_gpu/triton/quantize.py +14 -9
  91. fbgemm_gpu/utils/filestore.py +6 -2
  92. fbgemm_gpu/utils/torch_library.py +2 -2
  93. fbgemm_gpu/utils/writeback_util.py +124 -0
  94. fbgemm_gpu/uvm.py +1 -0
  95. {fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +2 -2
  96. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
  97. fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
  98. fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -4
  99. list_versions/cli_run.py +161 -0
  100. fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/RECORD +0 -131
  101. fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/top_level.txt +0 -1
  102. {fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
@@ -9,7 +9,7 @@
9
9
 
10
10
  # pyre-ignore-all-errors[6]
11
11
 
12
- from typing import List, Optional, Tuple, Union
12
+ from typing import Optional, Union
13
13
 
14
14
  import torch
15
15
  import triton # @manual
@@ -472,7 +472,7 @@ def triton_jagged_to_dense_optimization_2d(
472
472
  # In FBGEMM it was computed by GPU but in triton currently has some compilation issue so we use CUP computation method as workaround
473
473
  # However in real-world case if we only dealing with 2d jagged tensor we don't need to use this function at all
474
474
  def _jagged_offsets_to_dense_indice(
475
- offsets: List[torch.Tensor], dense_strides: List[int], dense_sizes: List[int]
475
+ offsets: list[torch.Tensor], dense_strides: list[int], dense_sizes: list[int]
476
476
  ) -> torch.Tensor:
477
477
 
478
478
  output_offset = torch.zeros(len(offsets[-1]) - 1, device="cpu", dtype=torch.int32)
@@ -532,8 +532,8 @@ def _jagged_offsets_to_dense_indice(
532
532
  # not be affected at all
533
533
  def jagged_to_dense(
534
534
  jagged_values: torch.Tensor,
535
- jagged_offsets: List[torch.Tensor],
536
- jagged_max_lengths: List[int],
535
+ jagged_offsets: list[torch.Tensor],
536
+ jagged_max_lengths: list[int],
537
537
  padding_value: float = 0.0, # padding value currently use 0.0 as default value
538
538
  operation_function: Union[
539
539
  str, None
@@ -720,10 +720,10 @@ def triton_dense_to_jagged(
720
720
 
721
721
  def dense_to_jagged(
722
722
  dense: torch.Tensor,
723
- jagged_offsets: List[torch.Tensor],
723
+ jagged_offsets: list[torch.Tensor],
724
724
  operation_function: Union[str, None] = None,
725
725
  operation_jagged_values: Union[torch.Tensor, None] = None,
726
- ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
726
+ ) -> tuple[torch.Tensor, list[torch.Tensor]]:
727
727
 
728
728
  thread_block_row_size = 32
729
729
  thread_block_col_size = 32
@@ -780,7 +780,7 @@ def dense_to_jagged(
780
780
  # jagged_tensor + dense -> dense
781
781
  def jagged_dense_elementwise_add_dense_output(
782
782
  jagged_values: Tensor,
783
- jagged_offsets: List[Tensor],
783
+ jagged_offsets: list[Tensor],
784
784
  # pyre-fixme[2]: Parameter must be annotated.
785
785
  dense,
786
786
  ) -> Tensor:
@@ -800,8 +800,8 @@ def jagged_dense_elementwise_add_dense_output(
800
800
 
801
801
  # jagged_tensor + dense -> jagged_tensor
802
802
  def jagged_dense_elementwise_add_jagged_output(
803
- jagged_values: Optional[Tensor], jagged_offsets: List[Tensor], dense: Tensor
804
- ) -> Tuple[Tensor, List[Tensor]]:
803
+ jagged_values: Optional[Tensor], jagged_offsets: list[Tensor], dense: Tensor
804
+ ) -> tuple[Tensor, list[Tensor]]:
805
805
 
806
806
  return dense_to_jagged(
807
807
  dense,
@@ -813,8 +813,8 @@ def jagged_dense_elementwise_add_jagged_output(
813
813
 
814
814
  # jagged_tensor * dense -> jagged_tensor
815
815
  def jagged_dense_elementwise_mul_jagged_output(
816
- jagged_values: Optional[Tensor], jagged_offsets: List[Tensor], dense: Tensor
817
- ) -> Tuple[Tensor, List[Tensor]]:
816
+ jagged_values: Optional[Tensor], jagged_offsets: list[Tensor], dense: Tensor
817
+ ) -> tuple[Tensor, list[Tensor]]:
818
818
 
819
819
  return dense_to_jagged(
820
820
  dense,
@@ -11,7 +11,6 @@ from typing import Union
11
11
 
12
12
  import torch
13
13
  import triton # @manual
14
-
15
14
  import triton.language as tl # @manual
16
15
 
17
16
  from .common import get_mx4_exp_bias, get_mx4_lookup_table, RoundingMode
@@ -238,7 +237,7 @@ def _kernel_quantize_mx4(
238
237
  # We readd fp32_exp_bias for compatibility with cuda dequant.
239
238
  tl.store(
240
239
  out + exp_offset,
241
- (group_exp + FP32_EXP_BIAS).to(tl.int8),
240
+ (group_exp + FP32_EXP_BIAS).to(tl.uint8),
242
241
  # Prevent writing outside this chunk or the main array.
243
242
  mask=(exp_offset < OUTPUT_SIZE)
244
243
  & (exp_offset < (OUTPUT_CHUNK_SIZE * (pid + 1))),
@@ -575,7 +574,7 @@ def _kernel_dequantize_mx4(
575
574
  # Write final outputs.
576
575
  tl.store(
577
576
  out + output_offset,
578
- scaled_fp32,
577
+ scaled_fp32.to(out.dtype.element_ty),
579
578
  # Mask values that are out of this chunk or the main array.
580
579
  mask=(output_offset < OUTPUT_SIZE)
581
580
  & (output_offset < OUTPUT_CHUNK_SIZE * (pid + 1)),
@@ -588,10 +587,14 @@ def _kernel_dequantize_mx4(
588
587
 
589
588
 
590
589
  def triton_dequantize_mx4(
591
- a: torch.Tensor, group_size: int = 32, ebits: int = 2, mbits: int = 1
590
+ a: torch.Tensor,
591
+ group_size: int = 32,
592
+ ebits: int = 2,
593
+ mbits: int = 1,
594
+ output_dtype: torch.dtype = torch.float32,
592
595
  ) -> torch.Tensor:
593
596
  """
594
- Dequantize a tensor from mx4 format to fp32.
597
+ Dequantize a tensor from mx4 format to fp32 or bf16.
595
598
 
596
599
  Args:
597
600
  a (Tensor): [M / 2 + M / group_size] MX4 tensor packed into int8 values
@@ -599,13 +602,15 @@ def triton_dequantize_mx4(
599
602
  group_size (int): Size of chunks that use the same shared exponent.
600
603
  ebits (int): Number of bits to use for exponent in target mx4 format.
601
604
  mbits (int): Number of bits to use for mantissa in target mx4 format.
605
+ output_dtype (torch.dtype): Output dtype (FP32 or BF16).
606
+ Defaults to torch.float32 for backward compatibility.
602
607
 
603
608
  Returns:
604
- torch.Tensor: [M, K] dequantized fp32 tensor.
609
+ torch.Tensor: [M, K] dequantized tensor in the specified dtype.
605
610
  """
606
611
  # If given an empty shape, return an empty tensor.
607
612
  if a.numel() == 0:
608
- return torch.empty(a.shape, device=a.device, dtype=torch.float32)
613
+ return torch.empty(a.shape, device=a.device, dtype=output_dtype)
609
614
  # View a as 2D for simplicity.
610
615
  orig_shape = a.shape
611
616
  a = a.flatten()
@@ -622,9 +627,9 @@ def triton_dequantize_mx4(
622
627
  # Use a lookup table to convert
623
628
  mx4_to_fp_values = get_mx4_lookup_table(ebits, mbits, a.device)
624
629
 
625
- # Create output tensor.
630
+ # Create output tensor in target dtype.
626
631
  output_elems = num_groups * group_size
627
- out = torch.empty([output_elems], device=a.device, dtype=torch.float)
632
+ out = torch.empty([output_elems], device=a.device, dtype=output_dtype)
628
633
  # Check if we need to use int64 for indexing.
629
634
  use_int64 = num_threads * groups_per_thread * group_size > 2**31 - 1
630
635
  # Invoke triton dequantization kernel over rows.
@@ -11,7 +11,6 @@
11
11
  import io
12
12
  import logging
13
13
  import os
14
- import shutil
15
14
  from dataclasses import dataclass
16
15
  from pathlib import Path
17
16
  from typing import BinaryIO, Union
@@ -76,7 +75,12 @@ class FileStore:
76
75
  elif isinstance(raw_input, Path):
77
76
  if not os.path.exists(raw_input):
78
77
  raise FileNotFoundError(f"File {raw_input} does not exist")
79
- shutil.copyfile(raw_input, filepath)
78
+ # Open the source file and destination file, and copy the contents
79
+ with open(raw_input, "rb") as src_file, open(
80
+ filepath, "wb"
81
+ ) as dst_file:
82
+ while chunk := src_file.read(4096): # Read 4 KB at a time
83
+ dst_file.write(chunk)
80
84
 
81
85
  elif isinstance(raw_input, io.BytesIO) or isinstance(raw_input, BinaryIO):
82
86
  with open(filepath, "wb") as file:
@@ -8,7 +8,7 @@
8
8
  # pyre-strict
9
9
 
10
10
  import re
11
- from typing import Callable, Dict
11
+ from typing import Callable
12
12
 
13
13
  import torch
14
14
 
@@ -112,7 +112,7 @@ class TorchLibraryFragment:
112
112
  self.lib.impl(op_name, fn, dispatch_key)
113
113
 
114
114
  # pyre-ignore[24]
115
- def register(self, op_name: str, functors: Dict[str, Callable]) -> None:
115
+ def register(self, op_name: str, functors: dict[str, Callable]) -> None:
116
116
  """
117
117
  Registers a set of dispatches for a defined operator.
118
118
 
@@ -0,0 +1,124 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+
9
+
10
+ def writeback_update_gradient(
11
+ indices: torch.Tensor,
12
+ offsets: torch.Tensor,
13
+ grad: torch.Tensor,
14
+ feature_table_map: list[int],
15
+ ) -> torch.Tensor:
16
+ """
17
+ Update gradient tensor by deduplicating indices across all features/tables.
18
+ For duplicate indices, only the first occurrence receives the gradient to achieve the assign purpose via gradient update
19
+
20
+ NOTE: This function is not supporting VBE yet
21
+
22
+ Args:
23
+ indices (torch.Tensor): Embedding indices tensor
24
+ offsets (torch.Tensor): Offsets tensor for batched embeddings
25
+ grad (torch.Tensor): Gradient tensor to be updated
26
+ feature_table_map (list[int]): Mapping from feature to table
27
+
28
+ Returns:
29
+ torch.Tensor: Updated gradient tensor with duplicates masked out
30
+ """
31
+ if indices.numel() == 0:
32
+ return grad[0]
33
+ # get num of feature to estimate batch size
34
+ num_of_tables = len(feature_table_map)
35
+ assert num_of_tables * indices.max() < torch.iinfo(indices.dtype).max
36
+ batch_size = offsets.shape[0] // num_of_tables
37
+ max_indices = indices.max()
38
+ non_empty_index = (offsets[1:] - offsets[:-1]).nonzero().flatten()
39
+ # disable dedup across different table
40
+ indices = ((offsets[non_empty_index]) // batch_size) * (1 + max_indices) + indices
41
+ grad = grad[0]
42
+ _, idx, counts = torch.unique(
43
+ indices, dim=0, sorted=True, return_inverse=True, return_counts=True
44
+ )
45
+ _, ind_sorted = torch.sort(idx, stable=True)
46
+ cum_sum = counts.cumsum(0)
47
+ cum_sum = torch.cat((torch.tensor([0]).to(indices.device), cum_sum[:-1]))
48
+ first_indicies = ind_sorted[cum_sum]
49
+ mask = torch.zeros_like(grad, device=grad.device)
50
+ original_index = non_empty_index[first_indicies]
51
+
52
+ mask[original_index] = grad[original_index]
53
+ return mask
54
+
55
+
56
+ def writeback_update_gradient_first_feature_only(
57
+ indices: torch.Tensor,
58
+ offsets: torch.Tensor,
59
+ grad: torch.Tensor,
60
+ feature_table_map: list[int],
61
+ ) -> torch.Tensor:
62
+ """
63
+ Special case of writeback_update_gradient where gradient only needs to be updated for the first feature. Other features will be forward-only
64
+
65
+ NOTE: This function is not supporting VBE yet
66
+
67
+ Args:
68
+ indices (torch.Tensor): Embedding indices tensor
69
+ offsets (torch.Tensor): Offsets tensor for batched embeddings
70
+ grad (torch.Tensor): Gradient tensor to be updated
71
+ feature_table_map (list[int]): Mapping from feature to table
72
+
73
+ Returns:
74
+ torch.Tensor: Updated gradient tensor with duplicates masked out
75
+ """
76
+ num_of_tables = len(feature_table_map)
77
+ batch_size = (offsets.shape[0] - 1) // num_of_tables
78
+ shrink_indices = indices[: offsets[batch_size]]
79
+ if shrink_indices.numel() == 0 or indices.numel() == 0:
80
+ return grad[0]
81
+ assert num_of_tables * indices.max() < torch.iinfo(indices.dtype).max
82
+
83
+ grad = grad[0]
84
+ _, idx, counts = torch.unique(
85
+ shrink_indices, dim=0, sorted=True, return_inverse=True, return_counts=True
86
+ )
87
+ _, ind_sorted = torch.sort(idx, stable=True)
88
+ cum_sum = counts.cumsum(0)
89
+ cum_sum = torch.cat((torch.tensor([0]).to(shrink_indices.device), cum_sum[:-1]))
90
+ first_indicies = ind_sorted[cum_sum]
91
+ mask = torch.zeros_like(grad, device=grad.device)
92
+
93
+ mask[first_indicies] = grad[first_indicies]
94
+ return mask
95
+
96
+
97
+ def writeback_gradient(
98
+ grad: torch.Tensor,
99
+ indices: torch.Tensor,
100
+ offsets: torch.Tensor,
101
+ feature_table_map: list[int],
102
+ writeback_first_feature_only: bool = False,
103
+ ) -> tuple[torch.Tensor]:
104
+ """
105
+ Compute deduplicated gradient for writeback operation.
106
+
107
+ Args:
108
+ grad (torch.Tensor): Gradient tensor to be updated
109
+ indices (torch.Tensor): Embedding indices tensor
110
+ offsets (torch.Tensor): Offsets tensor for batched embeddings
111
+ feature_table_map (list[int]): Mapping from feature to table
112
+ writeback_first_feature_only (bool): If True, only first feature will apply gradient update, other features will be read-only
113
+
114
+ Returns:
115
+ tuple[torch.Tensor]: Tuple containing the updated gradient tensor
116
+ """
117
+ if writeback_first_feature_only:
118
+ return (
119
+ writeback_update_gradient_first_feature_only(
120
+ indices, offsets, grad, feature_table_map
121
+ ),
122
+ )
123
+ else:
124
+ return (writeback_update_gradient(indices, offsets, grad, feature_table_map),)
fbgemm_gpu/uvm.py CHANGED
@@ -12,6 +12,7 @@ from typing import Optional
12
12
 
13
13
  import torch
14
14
 
15
+ # fmt:skip
15
16
  from fbgemm_gpu.enums import create_enums
16
17
 
17
18
  try:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fbgemm_gpu_nightly-cpu
3
- Version: 2025.7.19
3
+ Version: 2026.1.29
4
4
  Home-page: https://github.com/pytorch/fbgemm
5
5
  Author: FBGEMM Team
6
6
  Author-email: packages@pytorch.org
@@ -12,11 +12,11 @@ Classifier: Intended Audience :: Science/Research
12
12
  Classifier: License :: OSI Approved :: BSD License
13
13
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
14
  Classifier: Programming Language :: Python :: 3
15
- Classifier: Programming Language :: Python :: 3.9
16
15
  Classifier: Programming Language :: Python :: 3.10
17
16
  Classifier: Programming Language :: Python :: 3.11
18
17
  Classifier: Programming Language :: Python :: 3.12
19
18
  Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Programming Language :: Python :: 3.14
20
20
  Description-Content-Type: text/markdown
21
21
  Requires-Dist: numpy
22
22
  Dynamic: author
@@ -0,0 +1,135 @@
1
+ fbgemm_gpu/__init__.py,sha256=JrSxUgY_diRl9kXapbyq3iteiB32D02CPan3stEFiAM,6434
2
+ fbgemm_gpu/asmjit.so,sha256=j3yeBSR2egw60Od2aIs6-mcVEhCaL4OFXQUsU2h3oyk,526272
3
+ fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=pZqqUfvPIsaIo1CWX-_W087WQg-YEZuS0GNGoKFO_9c,2915
4
+ fbgemm_gpu/enums.py,sha256=37ewGSfO1x7sO31ZkRiqV1yKuklfHXT5qZIxzeeGogo,755
5
+ fbgemm_gpu/fbgemm.so,sha256=HibkS9eOXd0H4HnVQL8_sjCJhwFSwfXE8UnkXiFMMas,1378600
6
+ fbgemm_gpu/fbgemm_gpu_config.so,sha256=m9ScFJOCn8P4YQT7evcFq1g75IO9X6VB3h1Eojm1A-k,67528
7
+ fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so,sha256=nKTMQeQptvT8uKuhdwCVFunWLjVFqQ8dKwawRIyaEbY,133400
8
+ fbgemm_gpu/fbgemm_gpu_py.so,sha256=ha2K03pws-2S001OH7HyBOfpHsNlq3ekh_XasmHdEQg,4667832
9
+ fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so,sha256=MDL4JvJB-r72fFz_QDyqXhHn26eMff8XJh14cURL3Og,133200
10
+ fbgemm_gpu/fbgemm_gpu_tbe_cache.so,sha256=Qy2xM7LWHfn3Xh78AfNbrMJyHmyvfA2-z0fH25Cw26M,331352
11
+ fbgemm_gpu/fbgemm_gpu_tbe_common.so,sha256=QG3Y-63J_c62MIZQOESFYKZyiTwZpIIliNCY4UY47rk,463944
12
+ fbgemm_gpu/fbgemm_gpu_tbe_index_select.so,sha256=2QEr7JKp-rGfkDpT4AwjSi18RqtlmumOymZJv_Cu21w,330064
13
+ fbgemm_gpu/fbgemm_gpu_tbe_inference.so,sha256=FjQTkVJ_-E2zv7NiGLG43A-2MzIkmZku1LanCdwToGE,593608
14
+ fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so,sha256=PvFTSTJPOc4c0Znvub9EFh2C534elsn6Ryb7BYPic0U,67128
15
+ fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so,sha256=iOi0axu_FJL-IWEeHgIqp7dkEA1wIUU-X5sKtxlt3uk,1127800
16
+ fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so,sha256=tglT4kIO_7ii8f9j4wMo9D80iue8lrdMrYqTxQPWZSk,67128
17
+ fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so,sha256=RoWy1q3QHApZw9SIoTq-S_KrfDAmaOwGOABQfVs0weo,67128
18
+ fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so,sha256=1uU9-vaWm5jRTmCSVZ5EHxmRQ3qMohCEjSkXXP2tumc,3291352
19
+ fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so,sha256=ghqy1C0cu6s_Rp2SOP0egSgHa2y-zvZXPY9CORvoT8Q,67128
20
+ fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so,sha256=SG48X3jqnUV_H4XhmzQbHpcIpXTytALZ_oh-8UAeY_s,67128
21
+ fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so,sha256=hl-8Df6FJO86GsOCNBLMDfWLLq7VzuQLCbhYK6lHxkU,200160
22
+ fbgemm_gpu/fbgemm_gpu_tbe_utils.so,sha256=qGGsojKO4pkIS4nwfoMI6XjFOwPGNwhuZlmGzJ0zJG4,133600
23
+ fbgemm_gpu/metrics.py,sha256=TsurFLJf0nJvPDN7urWb4LMQlf5RgdWPTTTDO7S4wtI,5663
24
+ fbgemm_gpu/permute_pooled_embedding_modules.py,sha256=B4_-TufcYZq__8ek92cKGjOIkqkZO47pQMkQSDKJIWo,5141
25
+ fbgemm_gpu/permute_pooled_embedding_modules_split.py,sha256=f3VJvH_kw9Ltd_DXtaf_PJPHmlmEWrQgzQ7MDkhh5Nw,2746
26
+ fbgemm_gpu/quantize_comm.py,sha256=gtp0zWYdobAnG6Xe8vZuTu0ZWKDu2hWmsmvV1zA09UQ,11992
27
+ fbgemm_gpu/quantize_utils.py,sha256=sROgIdOrAjQT5_CmFafg40GMo0-pe4d56bAZTI57548,10243
28
+ fbgemm_gpu/runtime_monitor.py,sha256=YXRUv6nXCsoTgh5_RzailTGvCYzwoYDb-eR4rlGwtaw,7619
29
+ fbgemm_gpu/sparse_ops.py,sha256=lJ55cgpP7MoNKo6l6QTDgvfEx8ftkJQrj8kUiIHWBvY,52183
30
+ fbgemm_gpu/split_embedding_configs.py,sha256=bEFnWzCGoHFfJIfzyusmSnSSl9tTd5C8z_j176SS0w0,16584
31
+ fbgemm_gpu/split_embedding_inference_converter.py,sha256=TpGZUXLA0rYemPT37Y0zmZnMIzjHogkRcL0gIhggbM8,7063
32
+ fbgemm_gpu/split_embedding_optimizer_ops.py,sha256=wXuGazClBMk62yL_r9udUIKaPgQP7SlkSb5ugB75wrQ,711
33
+ fbgemm_gpu/split_embedding_utils.py,sha256=Gb40ZKeATxIKEKI3aVQMgDDBanNpKMc53Z43mnzdR_I,851
34
+ fbgemm_gpu/split_table_batched_embeddings_ops.py,sha256=_MIp6uHYHLn4GxGdrGsfddfSsZ2Z9mjsYIrih3ncI1I,2339
35
+ fbgemm_gpu/split_table_batched_embeddings_ops_common.py,sha256=_uUplpcyQOQuxqv8-HV94VUM5lG8e3aGWltXhOgICQc,19294
36
+ fbgemm_gpu/split_table_batched_embeddings_ops_inference.py,sha256=dGC85xjQiRUrequBibSf9oMAVHT5Q49zsVo2zW4n_88,81679
37
+ fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=uCPngWxxC5OQhJv7o6aGs8xf3WlRSrdRHbpCBlPbIuE,191511
38
+ fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py,sha256=jofAN2UB_iSk53Id6MBvn9Bi3Qxw67IL0_VE_EHlw_Q,7593
39
+ fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py,sha256=2TTKsF5yaROTaI69YdCIt8hr_v2TDEo8EraZ0QXNBxc,717
40
+ fbgemm_gpu/tbe_input_multiplexer.py,sha256=MbZF8aZdm_kV-JRMaooeZrqlh6Pn5IuNkSXBXODp-LE,3062
41
+ fbgemm_gpu/uvm.py,sha256=5kOlOauKhOmj-B8AUqpal7riMwTfmsL0HGrh1Wweb80,1058
42
+ fbgemm_gpu/config/__init__.py,sha256=yN0KAneCICgF2BTfOYGsd0qU1PvZX_6msC6YHHZKLMg,292
43
+ fbgemm_gpu/config/feature_list.py,sha256=hhDNkkafd-Oetvuqv9ylBVTNM-lKPi029mpRqq-JZCA,2467
44
+ fbgemm_gpu/docs/__init__.py,sha256=DR6hMSQrsZALfH2AnuJQ4Zq2CfBUUhMN8YjD6APjiAE,523
45
+ fbgemm_gpu/docs/common.py,sha256=8ipXTwVb222X-aZ71O6n8fhxHCHPNhJEHMFiO7epcIs,273
46
+ fbgemm_gpu/docs/examples.py,sha256=ZMN_6sL74LH_hrp2bF_hmg8gi29GhcgvwV3kCMjxkoE,2377
47
+ fbgemm_gpu/docs/jagged_tensor_ops.py,sha256=g8MA8ezTXiqingvk1DlTZJDQcmcCZPXpshuiWxS34F0,7380
48
+ fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65_3E8xSJaWSj_Jbo8,1102
49
+ fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
50
+ fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
51
+ fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
52
+ fbgemm_gpu/docs/target.default.json.py,sha256=_BcuMA1hCJ_Jtf08E7O8t-R8A5HiRXHH3Z9rpgCq66U,79
53
+ fbgemm_gpu/quantize/__init__.py,sha256=yPUCmLhNdahHFireHPQMmmiRp3g6W2dkIl5MB51M6SU,942
54
+ fbgemm_gpu/quantize/quantize_ops.py,sha256=C3SN79GcL7fczzoFkxUojm6cGkvvI4iWttkGN4LFQcM,2239
55
+ fbgemm_gpu/sll/__init__.py,sha256=nLFeTiRed6A5STRi_EgHCyNoik0zhXUk2db5kTmMUNU,4221
56
+ fbgemm_gpu/sll/cpu/__init__.py,sha256=glsukNpXtf47VRIdBktILD-4CmVcf4621SGB55lT_ho,2692
57
+ fbgemm_gpu/sll/cpu/cpu_sll.py,sha256=2XyvpZ_UgSThCzUmFDQbjUdLFbz0AvhvqPR_suUcyd8,27866
58
+ fbgemm_gpu/sll/meta/__init__.py,sha256=2sMcD67XGsweBZ-UV2AEJmM4ELPsHeRAYED6kqfgAd4,1077
59
+ fbgemm_gpu/sll/meta/meta_sll.py,sha256=Jk14EOW9VPFwawD7Bwky0R0A5rmbcLWMo52oH8J6Koc,8305
60
+ fbgemm_gpu/sll/triton/__init__.py,sha256=ndvZ5OO81KP65HopJql91R9y_5fC88WnNIGYxCAVKwM,4099
61
+ fbgemm_gpu/sll/triton/common.py,sha256=hISlX4Y-7FtGof-Xx4_B8-2vlF27F9t4p2qyLMUnJ8A,798
62
+ fbgemm_gpu/sll/triton/triton_dense_jagged_cat_jagged_out.py,sha256=J9qOqjNJ72LUBqs-pGI9wrFzzzBpsZ5fzYjgfKc2YhY,1885
63
+ fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py,sha256=lxIYe2MUde2qxLVO_aeTm34fDsMIz8ZkIjyx9Xk-YkE,5923
64
+ fbgemm_gpu/sll/triton/triton_jagged_bmm.py,sha256=bZIgk-GBdP8lPOoAOiIvO-9IE86B5Ejljmnh6-IuQeA,11785
65
+ fbgemm_gpu/sll/triton/triton_jagged_bmm_jagged_out.py,sha256=hccLxsKoSZKiWid5P_yl-IVdBSXw1Rt0WeiRsjLD2Iw,13864
66
+ fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py,sha256=FRZ7vqaTIxVWkztr50q94Uic209e2KriLgF-3PQD6QM,1603
67
+ fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_mul_jagged_out.py,sha256=9R7BOOe8SJiko1PgbiuHlFyPKtGaaCFSlZ1RaEQyICE,4198
68
+ fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py,sha256=qJvMCRUqMOwL_kxYs1fd5QvYdbjaGeoBy9ovNGpjMws,22779
69
+ fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py,sha256=po9Nx4uAGVu_YIZ9CWvrmzSwxDsnDuNAtnk9VR7-Ems,17750
70
+ fbgemm_gpu/sll/triton/triton_jagged_self_substraction_jagged_out.py,sha256=VaOIxQn4Obvna2Co5VNDGILCDfKuYwkhVxK2oCi5mPI,1754
71
+ fbgemm_gpu/sll/triton/triton_jagged_softmax.py,sha256=odN66XGPc5VWmMZ34FRBsodpUtbpEILDpOgPtpCNrY4,14225
72
+ fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py,sha256=nEo5I-bba1XlG59qoACGB18OrA1LISs-e7Lasgys1s8,19572
73
+ fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py,sha256=kHLPaFr6UcvXDtdDQuF9CP-fvRNdniOORrG5B8O8SmU,6917
74
+ fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py,sha256=V8CXfcyi5cXSP4-EbXGAq8NRXZdibZQSbPoFLHEcmo8,9733
75
+ fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py,sha256=Bm0R9a6zL6LTEavWsRgkQilPW7aWg3SBOyE-S5AV8B8,12735
76
+ fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py,sha256=k7ZvSHi8fEsZP2GjofNIEFO9mdaQbQxINIhDbPdol0U,2830
77
+ fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py,sha256=pOhpRdDutSGpOZW5CylR4IIxljYpodizlLUbrO7PoF8,2909
78
+ fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py,sha256=GyRwkpONuthj_MG0PEbDpkiTMUpQ6ffg1xo5NgGbpGU,10720
79
+ fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py,sha256=xHxf8neshHuE_6ybtAOrVFFMnwxfPQG11iuF0QIItVs,10285
80
+ fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py,sha256=7xOrQArbnUvGR2xMMRZ5gEsxoKRbDXi5ufxd-55b24g,9414
81
+ fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py,sha256=7V7BdQCUZPOp8gmxrQvTfeinULf_uQppdFe7t9POBZ8,10425
82
+ fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py,sha256=oRjJ302FMr1O9ibFvNtXqn3i-lpmNDh-3JslMstBAxY,10425
83
+ fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py,sha256=wQyjKE5xjZNDyNwQmiwuviMrgtEv2QX-MQMDZ8St2_A,10182
84
+ fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py,sha256=YE6RgV8By8FGUxnzduUrjJdNI8j2JOmdEuWcCAikLMk,11523
85
+ fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py,sha256=LZLuTucNYd2wlzfC4pU6339SmRZJiKIWlYwFDU1VFt4,12172
86
+ fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py,sha256=N32H1lUbbWModjDk_Ci8iF8P0hmlSmemmJynGQhuBGI,9195
87
+ fbgemm_gpu/split_embedding_optimizer_codegen/optimizer_args.py,sha256=xWSmk56JgoYfO8eiiK4BP9Brbhixs4tUAMeWp5TPZ30,956
88
+ fbgemm_gpu/split_embedding_optimizer_codegen/split_embedding_optimizer_rowwise_adagrad.py,sha256=bjrbKAypa-FnOIVKH-IUnWP1Jhlu0lk1SopZ0KLFVdo,6623
89
+ fbgemm_gpu/tbe/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
90
+ fbgemm_gpu/tbe/bench/__init__.py,sha256=TyUVsIH4p-RtFaXAKppYoaWbf9UTjCTUpnIV7RD_O5E,1653
91
+ fbgemm_gpu/tbe/bench/bench_config.py,sha256=xgtlGLCeZVW6jBYwkKsiQeCslCrWDgJbV2NLLwCRSn4,5452
92
+ fbgemm_gpu/tbe/bench/bench_runs.py,sha256=K4HRUcsX4BWqtrYwinZSXjnjNDFkvpoEdQmv-6rz7Tk,23518
93
+ fbgemm_gpu/tbe/bench/benchmark_click_interface.py,sha256=ofcGsiTUj3_Ml7JSsqg_LcMw3CV-6ypmlRWAUmT_cjc,6941
94
+ fbgemm_gpu/tbe/bench/eeg_cli.py,sha256=B3QOZhtycMDwHMG3dFKnlFuWOqYRCF3RCozEQfrqv8o,3580
95
+ fbgemm_gpu/tbe/bench/embedding_ops_common_config.py,sha256=zdL_ve1Ga6ziU5LjfnzJXOBOIqtCjLlhSrlGfa42H9w,4978
96
+ fbgemm_gpu/tbe/bench/eval_compression.py,sha256=ulFMaNZF2g_vfkXLWZSh02ibotg1zpTz3swVU484mzU,3486
97
+ fbgemm_gpu/tbe/bench/reporter.py,sha256=ZK5RFolUmZEcsEaife270_iOdXAQD5EjTUkuxctnAbY,804
98
+ fbgemm_gpu/tbe/bench/tbe_data_config.py,sha256=d724L4Is3Bo2D5reglgsBs7H6ezLFDrQUbTP5tsnPEQ,8509
99
+ fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py,sha256=c-IwLbx04Qbqxzfcn9N4U9Eo9QnmgbBN6HxJYAJwvMw,11311
100
+ fbgemm_gpu/tbe/bench/tbe_data_config_loader.py,sha256=fSdtEAnKu6r56mHMtMJIHo-S6m3vC4cPRyXJKKUevzc,11996
101
+ fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py,sha256=I9dozlJAW_XzuopyJapJ4gmDkLU0YSUz2znugiLZRMg,6203
102
+ fbgemm_gpu/tbe/bench/utils.py,sha256=C0GTTomJO3r9LVfbpzlkudxoA_3QyeMdM-7zM-YOAHA,6716
103
+ fbgemm_gpu/tbe/cache/__init__.py,sha256=lrYwhvqX2eWN0vAPe89HYgMW_O1vccoOcoFHJ9cyM-s,398
104
+ fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py,sha256=VmG9EennGcq2By8Tj8VkFsJG0oOCGw8EhlPo8-t--Fk,14604
105
+ fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py,sha256=vZHj7KIe1DoJDy5eft29XtGg6I-tRx60tjKOcTHRAYI,1321
106
+ fbgemm_gpu/tbe/ssd/__init__.py,sha256=wzfMT10cp_dqK2lrebC449hOdexBnizcf_98lA1NyHs,483
107
+ fbgemm_gpu/tbe/ssd/common.py,sha256=zecFfJCcQIwNYbaGoI44Q8rGCskvtmOmc1zxqYHS7Tg,1055
108
+ fbgemm_gpu/tbe/ssd/inference.py,sha256=MwSXP4l2fJUSQJRPu9-bqU08Kg9-0ux8uA5UPSabW3M,22812
109
+ fbgemm_gpu/tbe/ssd/training.py,sha256=2CFA4KmA9IfcpX14K4MlzBuSRPD9h5NM1M7TqepH6vA,212168
110
+ fbgemm_gpu/tbe/ssd/utils/__init__.py,sha256=5DgmR2HA6NtmYh2ddkUgpDsZ6a7hF0DPedA1gMpdh18,250
111
+ fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py,sha256=SFg2-29b-i49LWm-FlaWUkTz2XzXbicYi_AzVj4jKNE,7601
112
+ fbgemm_gpu/tbe/stats/__init__.py,sha256=on29iDtq7cVNh90JR9aeFNG-K9DDoYq0JryzoplL49I,322
113
+ fbgemm_gpu/tbe/stats/bench_params_reporter.py,sha256=_lA4peKXI0GCWsZHJ7IUKlUHU98CA-gVoOc-uhRfcoY,13233
114
+ fbgemm_gpu/tbe/utils/__init__.py,sha256=rlXFm-kTByFZO4SS5C5zMzANRiQmM1NT__eWBayncYg,549
115
+ fbgemm_gpu/tbe/utils/common.py,sha256=KBCyBT-7ShhTRRd1Rs5sEU4g8JggEM7Es6wQ0qhWY-o,1313
116
+ fbgemm_gpu/tbe/utils/offsets.py,sha256=DDWwGaQsVZbhaEZ_fRxxeY8ndLc7IORPZrx61eOqwJc,1904
117
+ fbgemm_gpu/tbe/utils/quantize.py,sha256=EdYh9FS_kMsvCWPuvNms4uSE9de_3cQNo_DCScGG3zI,9166
118
+ fbgemm_gpu/tbe/utils/requests.py,sha256=_lxGVt2J0tEmG7aXv24BMrvfdK6HuvMPlPZHWsF_EDI,18038
119
+ fbgemm_gpu/triton/__init__.py,sha256=kPn_Ye6J9DAzWtqi76KYGwfKSqw0IhqG3Bir5aUpkWM,658
120
+ fbgemm_gpu/triton/common.py,sha256=tsK56Dom_XSb5kXuoN0KnGAWlC5HWV7Ook--a59UHdI,2130
121
+ fbgemm_gpu/triton/quantize.py,sha256=I0pxyfIx04zyq55x4Pvj-28Cb2ZeF-SGtFhAymFagkg,27073
122
+ fbgemm_gpu/triton/quantize_ref.py,sha256=q4RBmFaqPVPELU52lbSgB0n26Aun7apeK7bRF2MWS80,11553
123
+ fbgemm_gpu/triton/jagged/__init__.py,sha256=om0yhjuzKuE1UQakFMWHsXN4WNb8mvNkZtYofQ8hdn4,246
124
+ fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py,sha256=F2eQWjkWMR5RWQ48oIr-8OU_CRZyLazDpT7DFrDWS6g,29871
125
+ fbgemm_gpu/utils/__init__.py,sha256=JQQNdcTTaEU6ptK-OW-ZQBwTFxEZZpWOtBXWwEZm39o,354
126
+ fbgemm_gpu/utils/filestore.py,sha256=oVtbKGaPQki1JgbJCkrkElukOFVyxntQpSC0lYBKgho,6455
127
+ fbgemm_gpu/utils/loader.py,sha256=1hCEhNvkflniH46fGcrguLeP1z-6uyOu2QFwqKU5CIM,990
128
+ fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg6YE4,4193
129
+ fbgemm_gpu/utils/writeback_util.py,sha256=PyVbHp1EuF-GKrJv_CTP6B50Z0oBblXKucf7Rhd6KKY,4614
130
+ list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
131
+ list_versions/cli_run.py,sha256=BCRaJvjVFBFmD5WPdjC_yJwlLv1w_TYOe3eYlf_9ZMo,4506
132
+ fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/METADATA,sha256=sqUYIVBwodRVxysq3jEToUNFX12vtC4tZenZnKnynjo,2654
133
+ fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/WHEEL,sha256=IaW-egZU3n4QvS-XsoO31KhIl6_BixcQGgBPEoTC6GI,109
134
+ fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
135
+ fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD,,
@@ -0,0 +1,2 @@
1
+ fbgemm_gpu
2
+ list_versions
@@ -1,4 +1,3 @@
1
-
2
1
  #!/usr/bin/env python3
3
2
  # Copyright (c) Meta Platforms, Inc. and affiliates.
4
3
  # All rights reserved.
@@ -6,6 +5,8 @@
6
5
  # This source code is licensed under the BSD-style license found in the
7
6
  # LICENSE file in the root directory of this source tree.
8
7
 
9
- __version__: str = "2025.7.19"
10
- __target__: str = "default"
11
- __variant__: str = "cpu"
8
+ # pyre-strict
9
+
10
+ from .cli_run import CLI, CLIOutput
11
+
12
+ __all__ = ["CLI", "CLIOutput"]
@@ -0,0 +1,161 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ # pyre-strict
9
+
10
+ import logging
11
+ import subprocess
12
+ from datetime import datetime
13
+ from typing import Union
14
+
15
+ import click
16
+ import pandas as pd
17
+ import torch
18
+
19
+
20
class CLIOutput:
    """Record of a single CLI invocation: the command line, its captured
    output, and bookkeeping metadata for the final report."""

    def __init__(
        self,
        cli: str = "",
        stdout: str = "",
        stderr: str = "",
        returncode: int = 0,
        timestamp: str = "2025-01-01T20:00:00.00000",
        visible: bool = True,
    ) -> None:
        # Stash every field under a private attribute; to_dict() re-exposes
        # them as one flat row suitable for a DataFrame.
        self._cli = cli
        self._stdout = stdout
        self._stderr = stderr
        self._returncode = returncode
        self._timestamp = timestamp
        self._visible = visible

    def to_dict(self) -> dict[str, Union[int, str]]:
        """Flatten this record into a plain dict (one report row)."""
        fields = ("cli", "stdout", "stderr", "returncode", "timestamp", "visible")
        return {name: getattr(self, f"_{name}") for name in fields}
46
+
47
+
48
class CLI:
    """Runs shell commands and accumulates their results as CLIOutput records.

    The record list is seeded with a pseudo-entry reporting the installed
    torch version, so the report always contains at least one row.
    """

    def __init__(self) -> None:
        # BUGFIX: the original line `pd.options.display.max_rows` was a
        # bare attribute read (a no-op); actually configure pandas so the
        # report DataFrame prints all rows, matching the max_colwidth call.
        pd.set_option("display.max_rows", None)
        pd.set_option("display.max_colwidth", None)
        # NOTE(review): the `cli` label below contains an en-dash and curly
        # quotes; it is never executed, only displayed, so it is kept
        # byte-identical for output compatibility.
        self._cli_outputs: list[CLIOutput] = [
            CLIOutput(
                cli="python –c “import torch; print(torch.__version__)”",
                stdout="{}".format(torch.__version__),
                stderr="",
                returncode=0,
                timestamp=datetime.now().isoformat(),
                visible=True,
            )
        ]

    def run(
        self,
        cli: Union[str, list[str]],
        visible: bool = True,
        input: str = "",
        capture_output: bool = True,
    ) -> CLIOutput:
        """Execute one command and record its outcome.

        Args:
            cli: command to run; a string is whitespace-split into argv
                (no shell is involved).
            visible: when True, append the result to the report.
            input: text fed to the process's stdin.
            capture_output: forwarded to subprocess.run.

        Returns:
            A CLIOutput; returncode is -1 when the command could not be
            launched at all (e.g. binary not installed).
        """
        if isinstance(cli, str):
            cli = cli.split()
        result = CLIOutput()
        try:
            completed = subprocess.run(
                cli, text=True, check=False, capture_output=capture_output, input=input
            )
            result = CLIOutput(
                cli=" ".join(cli),
                stdout=completed.stdout,
                stderr=completed.stderr,
                returncode=completed.returncode,
                timestamp=datetime.now().isoformat(),
                visible=visible,
            )
        except Exception as e:
            # Missing/unrunnable binary: record the failure as a row instead
            # of aborting the whole report (this tool probes optional tools).
            logging.error(f'For cli {" ".join(cli)} we got exception {e}')
            result = CLIOutput(
                cli=" ".join(cli),
                stdout="",
                stderr=str(e),
                returncode=-1,
                visible=visible,
                timestamp=datetime.now().isoformat(),
            )
        # Unified from the two duplicated append branches in the original;
        # behavior is identical on both the success and failure paths.
        if visible:
            self._cli_outputs.append(result)
        return result

    def run_piped(self, clis: list[str]) -> None:
        """Emulate a shell pipeline: each command's stdout feeds the next
        command's stdin; only the final command is recorded as visible."""
        # Robustness: an empty pipeline is a no-op rather than an IndexError.
        if not clis:
            return
        the_input = ""
        for cli in clis[:-1]:
            result = self.run(
                cli=cli, visible=False, input=the_input, capture_output=True
            )
            the_input = result._stdout
        self.run(cli=clis[-1], visible=True, input=the_input, capture_output=True)

    def to_dataframe(self) -> pd.DataFrame:
        """Return all recorded outputs as a one-row-per-command DataFrame."""
        return pd.DataFrame([output.to_dict() for output in self._cli_outputs])

    def save(self, filename: str, format: str = "csv") -> None:
        """Write the report to `filename`.

        Args:
            filename: destination path.
            format: 'csv' (default) or 'json' (JSON-lines, one record per row).

        Raises:
            ValueError: if `format` is neither 'csv' nor 'json'.
        """
        df = self.to_dataframe()
        if format == "csv":
            df.to_csv(filename, index=False)
        elif format == "json":
            df.to_json(filename, orient="records", lines=True)
        else:
            raise ValueError(f"Invalid format {format} : must be one of 'csv', 'json'")
121
+
122
+
123
@click.command()
@click.option("--json", default="")
@click.option("--csv", default="")
def cli_run(
    json: str,
    csv: str,
) -> None:
    """Collect system / AMD GPU firmware version info and print the report.

    Args:
        json: if non-empty, path where the report is saved as JSON lines.
        csv: if non-empty, path where the report is saved as CSV.
    """
    cli = CLI()

    # Pipeline over installed packages: select amdgpu dkms/kmod entries,
    # drop firmware packages, then run two sed passes (presumably trimming
    # the package-name prefix and version/arch suffix — verify against the
    # actual rpm naming scheme).
    the_rpm = "rpm -qa"
    the_grep1 = "grep -E ^amdgpu-(dkms|kmod)"
    the_grep2 = "grep -v firmware"
    the_sed1 = "sed -E s/^[^-]-[^-]-//"
    the_sed2 = "sed -E s/.[^.].[^.]$//"
    cli.run_piped([the_rpm, the_grep1, the_grep2, the_sed1, the_sed2])

    cli.run("uname -r")  # kernel release

    cli.run("fw-util all --version")  # platform firmware versions

    # amd-smi probes: GPU firmware, tool version, static hardware info.
    cli.run("amd-smi firmware")
    cli.run("amd-smi version")
    cli.run("amd-smi static")

    # Idiom fix: test path strings by truthiness instead of `len(...)`.
    if csv:
        cli.save(csv)

    if json:
        cli.save(json, format="json")

    print(cli.to_dataframe())
154
+
155
+
156
def main() -> None:
    """Console-script entry point; delegates to the click command."""
    cli_run()


if __name__ == "__main__":
    main()