fbgemm-gpu-nightly-cpu 2025.7.19__cp311-cp311-manylinux_2_28_aarch64.whl → 2026.1.29__cp311-cp311-manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fbgemm_gpu/__init__.py +112 -19
- fbgemm_gpu/asmjit.so +0 -0
- fbgemm_gpu/batched_unary_embeddings_ops.py +3 -3
- fbgemm_gpu/config/feature_list.py +7 -1
- fbgemm_gpu/docs/jagged_tensor_ops.py +0 -1
- fbgemm_gpu/docs/sparse_ops.py +118 -0
- fbgemm_gpu/docs/target.default.json.py +6 -0
- fbgemm_gpu/enums.py +3 -4
- fbgemm_gpu/fbgemm.so +0 -0
- fbgemm_gpu/fbgemm_gpu_config.so +0 -0
- fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so +0 -0
- fbgemm_gpu/fbgemm_gpu_py.so +0 -0
- fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_cache.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_common.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_index_select.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_inference.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so +0 -0
- fbgemm_gpu/fbgemm_gpu_tbe_utils.so +0 -0
- fbgemm_gpu/permute_pooled_embedding_modules.py +5 -4
- fbgemm_gpu/permute_pooled_embedding_modules_split.py +4 -4
- fbgemm_gpu/quantize/__init__.py +2 -0
- fbgemm_gpu/quantize/quantize_ops.py +1 -0
- fbgemm_gpu/quantize_comm.py +29 -12
- fbgemm_gpu/quantize_utils.py +88 -8
- fbgemm_gpu/runtime_monitor.py +9 -5
- fbgemm_gpu/sll/__init__.py +3 -0
- fbgemm_gpu/sll/cpu/cpu_sll.py +8 -8
- fbgemm_gpu/sll/triton/__init__.py +0 -10
- fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py +2 -3
- fbgemm_gpu/sll/triton/triton_jagged_bmm.py +2 -2
- fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py +1 -0
- fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py +5 -6
- fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py +1 -2
- fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py +1 -2
- fbgemm_gpu/sparse_ops.py +190 -54
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py +12 -0
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py +12 -5
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py +14 -7
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py +2 -0
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py +2 -0
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py +12 -5
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py +12 -5
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py +12 -5
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py +12 -5
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py +12 -5
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py +12 -5
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py +12 -5
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py +12 -5
- fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py +12 -5
- fbgemm_gpu/split_embedding_configs.py +134 -37
- fbgemm_gpu/split_embedding_inference_converter.py +7 -6
- fbgemm_gpu/split_table_batched_embeddings_ops_common.py +117 -24
- fbgemm_gpu/split_table_batched_embeddings_ops_inference.py +37 -37
- fbgemm_gpu/split_table_batched_embeddings_ops_training.py +764 -123
- fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py +44 -1
- fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +0 -1
- fbgemm_gpu/tbe/bench/__init__.py +6 -1
- fbgemm_gpu/tbe/bench/bench_config.py +14 -3
- fbgemm_gpu/tbe/bench/bench_runs.py +163 -14
- fbgemm_gpu/tbe/bench/benchmark_click_interface.py +5 -2
- fbgemm_gpu/tbe/bench/eeg_cli.py +3 -3
- fbgemm_gpu/tbe/bench/embedding_ops_common_config.py +3 -2
- fbgemm_gpu/tbe/bench/eval_compression.py +3 -3
- fbgemm_gpu/tbe/bench/tbe_data_config.py +115 -197
- fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py +332 -0
- fbgemm_gpu/tbe/bench/tbe_data_config_loader.py +108 -8
- fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py +15 -8
- fbgemm_gpu/tbe/bench/utils.py +129 -5
- fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py +22 -19
- fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py +4 -4
- fbgemm_gpu/tbe/ssd/common.py +1 -0
- fbgemm_gpu/tbe/ssd/inference.py +15 -15
- fbgemm_gpu/tbe/ssd/training.py +1292 -267
- fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py +2 -3
- fbgemm_gpu/tbe/stats/bench_params_reporter.py +198 -42
- fbgemm_gpu/tbe/utils/offsets.py +6 -6
- fbgemm_gpu/tbe/utils/quantize.py +8 -8
- fbgemm_gpu/tbe/utils/requests.py +15 -15
- fbgemm_gpu/tbe_input_multiplexer.py +10 -11
- fbgemm_gpu/triton/common.py +0 -1
- fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py +11 -11
- fbgemm_gpu/triton/quantize.py +14 -9
- fbgemm_gpu/utils/filestore.py +6 -2
- fbgemm_gpu/utils/torch_library.py +2 -2
- fbgemm_gpu/utils/writeback_util.py +124 -0
- fbgemm_gpu/uvm.py +1 -0
- {fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA +2 -2
- fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD +135 -0
- fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt +2 -0
- fbgemm_gpu/docs/version.py → list_versions/__init__.py +5 -4
- list_versions/cli_run.py +161 -0
- fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/RECORD +0 -131
- fbgemm_gpu_nightly_cpu-2025.7.19.dist-info/top_level.txt +0 -1
- {fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/WHEEL +0 -0
@@ -9,7 +9,7 @@
 
 # pyre-ignore-all-errors[6]
 
-from typing import
+from typing import Optional, Union
 
 import torch
 import triton  # @manual
@@ -472,7 +472,7 @@ def triton_jagged_to_dense_optimization_2d(
 # In FBGEMM it was computed by GPU but in triton currently has some compilation issue so we use CUP computation method as workaround
 # However in real-world case if we only dealing with 2d jagged tensor we don't need to use this function at all
 def _jagged_offsets_to_dense_indice(
-    offsets:
+    offsets: list[torch.Tensor], dense_strides: list[int], dense_sizes: list[int]
 ) -> torch.Tensor:
 
     output_offset = torch.zeros(len(offsets[-1]) - 1, device="cpu", dtype=torch.int32)
@@ -532,8 +532,8 @@ def _jagged_offsets_to_dense_indice(
 # not be affected at all
 def jagged_to_dense(
     jagged_values: torch.Tensor,
-    jagged_offsets:
-    jagged_max_lengths:
+    jagged_offsets: list[torch.Tensor],
+    jagged_max_lengths: list[int],
     padding_value: float = 0.0,  # padding value currently use 0.0 as default value
     operation_function: Union[
         str, None
@@ -720,10 +720,10 @@ def triton_dense_to_jagged(
 
 def dense_to_jagged(
     dense: torch.Tensor,
-    jagged_offsets:
+    jagged_offsets: list[torch.Tensor],
     operation_function: Union[str, None] = None,
     operation_jagged_values: Union[torch.Tensor, None] = None,
-) ->
+) -> tuple[torch.Tensor, list[torch.Tensor]]:
 
     thread_block_row_size = 32
     thread_block_col_size = 32
@@ -780,7 +780,7 @@ def dense_to_jagged(
 # jagged_tensor + dense -> dense
 def jagged_dense_elementwise_add_dense_output(
     jagged_values: Tensor,
-    jagged_offsets:
+    jagged_offsets: list[Tensor],
     # pyre-fixme[2]: Parameter must be annotated.
     dense,
 ) -> Tensor:
@@ -800,8 +800,8 @@ def jagged_dense_elementwise_add_dense_output(
 
 # jagged_tensor + dense -> jagged_tensor
 def jagged_dense_elementwise_add_jagged_output(
-    jagged_values: Optional[Tensor], jagged_offsets:
-) ->
+    jagged_values: Optional[Tensor], jagged_offsets: list[Tensor], dense: Tensor
+) -> tuple[Tensor, list[Tensor]]:
 
     return dense_to_jagged(
         dense,
@@ -813,8 +813,8 @@ def jagged_dense_elementwise_add_jagged_output(
 
 # jagged_tensor * dense -> jagged_tensor
 def jagged_dense_elementwise_mul_jagged_output(
-    jagged_values: Optional[Tensor], jagged_offsets:
-) ->
+    jagged_values: Optional[Tensor], jagged_offsets: list[Tensor], dense: Tensor
+) -> tuple[Tensor, list[Tensor]]:
 
     return dense_to_jagged(
         dense,
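The hunks above only tighten type annotations, but they also document the layout these helpers expect: a flat values tensor plus one offsets tensor per jagged dimension, and a max length per dimension for the padded output. As a rough orientation, here is a pure-PyTorch sketch of that values/offsets layout for the 1-D case (an illustration of the data model, not the Triton kernels in this file):

# Pure-PyTorch sketch of the jagged values/offsets layout; not the Triton code above.
import torch

values = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0])   # all rows concatenated
offsets = [torch.tensor([0, 2, 2, 5])]              # one offsets tensor per jagged dim
max_lengths = [3]                                   # padded length per jagged dim

dense = torch.zeros(len(offsets[0]) - 1, max_lengths[0])  # padding_value = 0.0
for row in range(dense.shape[0]):
    start, end = offsets[0][row].item(), offsets[0][row + 1].item()
    length = min(end - start, max_lengths[0])
    dense[row, :length] = values[start : start + length]
# dense is now [[1., 2., 0.], [0., 0., 0.], [3., 4., 5.]]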
fbgemm_gpu/triton/quantize.py
CHANGED
@@ -11,7 +11,6 @@ from typing import Union
 
 import torch
 import triton  # @manual
-
 import triton.language as tl  # @manual
 
 from .common import get_mx4_exp_bias, get_mx4_lookup_table, RoundingMode
@@ -238,7 +237,7 @@ def _kernel_quantize_mx4(
     # We readd fp32_exp_bias for compatibility with cuda dequant.
     tl.store(
         out + exp_offset,
-        (group_exp + FP32_EXP_BIAS).to(tl.
+        (group_exp + FP32_EXP_BIAS).to(tl.uint8),
         # Prevent writing outside this chunk or the main array.
         mask=(exp_offset < OUTPUT_SIZE)
         & (exp_offset < (OUTPUT_CHUNK_SIZE * (pid + 1))),
@@ -575,7 +574,7 @@ def _kernel_dequantize_mx4(
     # Write final outputs.
     tl.store(
         out + output_offset,
-        scaled_fp32,
+        scaled_fp32.to(out.dtype.element_ty),
         # Mask values that are out of this chunk or the main array.
         mask=(output_offset < OUTPUT_SIZE)
         & (output_offset < OUTPUT_CHUNK_SIZE * (pid + 1)),
@@ -588,10 +587,14 @@ def _kernel_dequantize_mx4(
 
 
 def triton_dequantize_mx4(
-    a: torch.Tensor,
+    a: torch.Tensor,
+    group_size: int = 32,
+    ebits: int = 2,
+    mbits: int = 1,
+    output_dtype: torch.dtype = torch.float32,
 ) -> torch.Tensor:
     """
-    Dequantize a tensor from mx4 format to fp32.
+    Dequantize a tensor from mx4 format to fp32 or bf16.
 
     Args:
         a (Tensor): [M / 2 + M / group_size] MX4 tensor packed into int8 values
@@ -599,13 +602,15 @@ def triton_dequantize_mx4(
         group_size (int): Size of chunks that use the same shared exponent.
         ebits (int): Number of bits to use for exponent in target mx4 format.
         mbits (int): Number of bits to use for mantissa in target mx4 format.
+        output_dtype (torch.dtype): Output dtype (FP32 or BF16).
+            Defaults to torch.float32 for backward compatibility.
 
     Returns:
-        torch.Tensor: [M, K] dequantized
+        torch.Tensor: [M, K] dequantized tensor in the specified dtype.
     """
     # If given an empty shape, return an empty tensor.
     if a.numel() == 0:
-        return torch.empty(a.shape, device=a.device, dtype=
+        return torch.empty(a.shape, device=a.device, dtype=output_dtype)
     # View a as 2D for simplicity.
     orig_shape = a.shape
     a = a.flatten()
@@ -622,9 +627,9 @@ def triton_dequantize_mx4(
     # Use a lookup table to convert
     mx4_to_fp_values = get_mx4_lookup_table(ebits, mbits, a.device)
 
-    # Create output tensor.
+    # Create output tensor in target dtype.
     output_elems = num_groups * group_size
-    out = torch.empty([output_elems], device=a.device, dtype=
+    out = torch.empty([output_elems], device=a.device, dtype=output_dtype)
     # Check if we need to use int64 for indexing.
     use_int64 = num_threads * groups_per_thread * group_size > 2**31 - 1
     # Invoke triton dequantization kernel over rows.
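Functionally, this change lets triton_dequantize_mx4 materialize its output directly in bf16 rather than always returning fp32. A minimal usage sketch, assuming a CUDA device and that the packed input comes from the matching MX4 packer in the same module (the triton_quantize_mx4 call and its arguments are assumptions, not shown in this diff):

# Hedged sketch: requires a GPU build of fbgemm_gpu; parameter values are illustrative.
import torch
from fbgemm_gpu.triton.quantize import triton_dequantize_mx4, triton_quantize_mx4

x = torch.randn(4, 64, device="cuda", dtype=torch.float32)
packed = triton_quantize_mx4(x, group_size=32, ebits=2, mbits=1)  # assumed packer

# New in this diff: pick the dtype of the dequantized output.
y_fp32 = triton_dequantize_mx4(packed, group_size=32, ebits=2, mbits=1)
y_bf16 = triton_dequantize_mx4(
    packed, group_size=32, ebits=2, mbits=1, output_dtype=torch.bfloat16
)
assert y_bf16.dtype == torch.bfloat16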
fbgemm_gpu/utils/filestore.py
CHANGED
@@ -11,7 +11,6 @@
 import io
 import logging
 import os
-import shutil
 from dataclasses import dataclass
 from pathlib import Path
 from typing import BinaryIO, Union
@@ -76,7 +75,12 @@ class FileStore:
         elif isinstance(raw_input, Path):
             if not os.path.exists(raw_input):
                 raise FileNotFoundError(f"File {raw_input} does not exist")
-
+            # Open the source file and destination file, and copy the contents
+            with open(raw_input, "rb") as src_file, open(
+                filepath, "wb"
+            ) as dst_file:
+                while chunk := src_file.read(4096):  # Read 4 KB at a time
+                    dst_file.write(chunk)
 
         elif isinstance(raw_input, io.BytesIO) or isinstance(raw_input, BinaryIO):
             with open(filepath, "wb") as file:
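The shutil import is dropped and the Path branch now streams the file in 4 KB chunks. The same pattern in isolation (a standalone sketch, not the FileStore method itself):

# Standalone sketch of the chunked-copy pattern; requires Python 3.8+ for the walrus operator.
from pathlib import Path

def copy_in_chunks(src: Path, dst: Path, chunk_size: int = 4096) -> None:
    """Stream src into dst without holding the whole file in memory."""
    with open(src, "rb") as src_file, open(dst, "wb") as dst_file:
        while chunk := src_file.read(chunk_size):  # empty bytes ends the loop
            dst_file.write(chunk)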
fbgemm_gpu/utils/torch_library.py
CHANGED
@@ -8,7 +8,7 @@
 # pyre-strict
 
 import re
-from typing import Callable
+from typing import Callable
 
 import torch
 
@@ -112,7 +112,7 @@ class TorchLibraryFragment:
         self.lib.impl(op_name, fn, dispatch_key)
 
     # pyre-ignore[24]
-    def register(self, op_name: str, functors:
+    def register(self, op_name: str, functors: dict[str, Callable]) -> None:
        """
        Registers a set of dispatches for a defined operator.
 
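The new signature documents that functors maps dispatch keys to callables, each of which ends up in self.lib.impl(op_name, fn, dispatch_key). Underneath the wrapper this is the standard torch.library flow; a rough equivalent sketch (the namespace, operator name, and schema below are made up for illustration):

# Hedged sketch of registering a dict[str, Callable] of dispatches via torch.library.
import torch
from torch.library import Library

lib = Library("my_namespace", "FRAGMENT")           # hypothetical namespace
lib.define("times_two(Tensor x) -> Tensor")         # hypothetical op schema

def times_two_cpu(x: torch.Tensor) -> torch.Tensor:
    return x * 2

functors = {"CPU": times_two_cpu, "CUDA": times_two_cpu}  # dispatch key -> callable
for dispatch_key, fn in functors.items():
    lib.impl("times_two", fn, dispatch_key)

print(torch.ops.my_namespace.times_two(torch.ones(3)))  # tensor([2., 2., 2.])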
fbgemm_gpu/utils/writeback_util.py
ADDED
@@ -0,0 +1,124 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+
+def writeback_update_gradient(
+    indices: torch.Tensor,
+    offsets: torch.Tensor,
+    grad: torch.Tensor,
+    feature_table_map: list[int],
+) -> torch.Tensor:
+    """
+    Update gradient tensor by deduplicating indices across all features/tables.
+    For duplicate indices, only the first occurrence receives the gradient to achieve the assign purpose via gradient update
+
+    NOTE: This function is not supporting VBE yet
+
+    Args:
+        indices (torch.Tensor): Embedding indices tensor
+        offsets (torch.Tensor): Offsets tensor for batched embeddings
+        grad (torch.Tensor): Gradient tensor to be updated
+        feature_table_map (list[int]): Mapping from feature to table
+
+    Returns:
+        torch.Tensor: Updated gradient tensor with duplicates masked out
+    """
+    if indices.numel() == 0:
+        return grad[0]
+    # get num of feature to estimate batch size
+    num_of_tables = len(feature_table_map)
+    assert num_of_tables * indices.max() < torch.iinfo(indices.dtype).max
+    batch_size = offsets.shape[0] // num_of_tables
+    max_indices = indices.max()
+    non_empty_index = (offsets[1:] - offsets[:-1]).nonzero().flatten()
+    # disable dedup across different table
+    indices = ((offsets[non_empty_index]) // batch_size) * (1 + max_indices) + indices
+    grad = grad[0]
+    _, idx, counts = torch.unique(
+        indices, dim=0, sorted=True, return_inverse=True, return_counts=True
+    )
+    _, ind_sorted = torch.sort(idx, stable=True)
+    cum_sum = counts.cumsum(0)
+    cum_sum = torch.cat((torch.tensor([0]).to(indices.device), cum_sum[:-1]))
+    first_indicies = ind_sorted[cum_sum]
+    mask = torch.zeros_like(grad, device=grad.device)
+    original_index = non_empty_index[first_indicies]
+
+    mask[original_index] = grad[original_index]
+    return mask
+
+
+def writeback_update_gradient_first_feature_only(
+    indices: torch.Tensor,
+    offsets: torch.Tensor,
+    grad: torch.Tensor,
+    feature_table_map: list[int],
+) -> torch.Tensor:
+    """
+    Special case of writeback_update_gradient where gradient only needs to be updated for the first feature. Other features will be forward-only
+
+    NOTE: This function is not supporting VBE yet
+
+    Args:
+        indices (torch.Tensor): Embedding indices tensor
+        offsets (torch.Tensor): Offsets tensor for batched embeddings
+        grad (torch.Tensor): Gradient tensor to be updated
+        feature_table_map (list[int]): Mapping from feature to table
+
+    Returns:
+        torch.Tensor: Updated gradient tensor with duplicates masked out
+    """
+    num_of_tables = len(feature_table_map)
+    batch_size = (offsets.shape[0] - 1) // num_of_tables
+    shrink_indices = indices[: offsets[batch_size]]
+    if shrink_indices.numel() == 0 or indices.numel() == 0:
+        return grad[0]
+    assert num_of_tables * indices.max() < torch.iinfo(indices.dtype).max
+
+    grad = grad[0]
+    _, idx, counts = torch.unique(
+        shrink_indices, dim=0, sorted=True, return_inverse=True, return_counts=True
+    )
+    _, ind_sorted = torch.sort(idx, stable=True)
+    cum_sum = counts.cumsum(0)
+    cum_sum = torch.cat((torch.tensor([0]).to(shrink_indices.device), cum_sum[:-1]))
+    first_indicies = ind_sorted[cum_sum]
+    mask = torch.zeros_like(grad, device=grad.device)
+
+    mask[first_indicies] = grad[first_indicies]
+    return mask
+
+
+def writeback_gradient(
+    grad: torch.Tensor,
+    indices: torch.Tensor,
+    offsets: torch.Tensor,
+    feature_table_map: list[int],
+    writeback_first_feature_only: bool = False,
+) -> tuple[torch.Tensor]:
+    """
+    Compute deduplicated gradient for writeback operation.
+
+    Args:
+        grad (torch.Tensor): Gradient tensor to be updated
+        indices (torch.Tensor): Embedding indices tensor
+        offsets (torch.Tensor): Offsets tensor for batched embeddings
+        feature_table_map (list[int]): Mapping from feature to table
+        writeback_first_feature_only (bool): If True, only first feature will apply gradient update, other features will be read-only
+
+    Returns:
+        tuple[torch.Tensor]: Tuple containing the updated gradient tensor
+    """
+    if writeback_first_feature_only:
+        return (
+            writeback_update_gradient_first_feature_only(
+                indices, offsets, grad, feature_table_map
+            ),
+        )
+    else:
+        return (writeback_update_gradient(indices, offsets, grad, feature_table_map),)
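Both helpers rely on the same first-occurrence selection: torch.unique with return_inverse and return_counts, a stable sort of the inverse map, and a shifted cumsum of the counts recover the position where each index value first appears, and every later duplicate is masked to zero. A standalone toy illustration of that trick (not a call into the new module):

# Toy illustration of the first-occurrence dedup used in writeback_util.
import torch

indices = torch.tensor([7, 3, 7, 5, 3, 7])
grad = torch.tensor([[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]])

_, inv, counts = torch.unique(indices, sorted=True, return_inverse=True, return_counts=True)
_, ind_sorted = torch.sort(inv, stable=True)
cum_sum = torch.cat((torch.tensor([0]), counts.cumsum(0)[:-1]))
first_occurrence = ind_sorted[cum_sum]           # positions 1, 3, 0 for values 3, 5, 7

mask = torch.zeros_like(grad)
mask[first_occurrence] = grad[first_occurrence]  # duplicates at positions 2, 4, 5 stay zero
print(mask.flatten())                            # tensor([1., 2., 0., 4., 0., 0.])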
fbgemm_gpu/uvm.py
CHANGED
{fbgemm_gpu_nightly_cpu-2025.7.19.dist-info → fbgemm_gpu_nightly_cpu-2026.1.29.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fbgemm_gpu_nightly-cpu
-Version:
+Version: 2026.1.29
 Home-page: https://github.com/pytorch/fbgemm
 Author: FBGEMM Team
 Author-email: packages@pytorch.org
@@ -12,11 +12,11 @@ Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: BSD License
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Description-Content-Type: text/markdown
 Requires-Dist: numpy
 Dynamic: author
fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD
ADDED
@@ -0,0 +1,135 @@
+fbgemm_gpu/__init__.py,sha256=JrSxUgY_diRl9kXapbyq3iteiB32D02CPan3stEFiAM,6434
+fbgemm_gpu/asmjit.so,sha256=j3yeBSR2egw60Od2aIs6-mcVEhCaL4OFXQUsU2h3oyk,526272
+fbgemm_gpu/batched_unary_embeddings_ops.py,sha256=pZqqUfvPIsaIo1CWX-_W087WQg-YEZuS0GNGoKFO_9c,2915
+fbgemm_gpu/enums.py,sha256=37ewGSfO1x7sO31ZkRiqV1yKuklfHXT5qZIxzeeGogo,755
+fbgemm_gpu/fbgemm.so,sha256=HibkS9eOXd0H4HnVQL8_sjCJhwFSwfXE8UnkXiFMMas,1378600
+fbgemm_gpu/fbgemm_gpu_config.so,sha256=m9ScFJOCn8P4YQT7evcFq1g75IO9X6VB3h1Eojm1A-k,67528
+fbgemm_gpu/fbgemm_gpu_embedding_inplace_ops.so,sha256=nKTMQeQptvT8uKuhdwCVFunWLjVFqQ8dKwawRIyaEbY,133400
+fbgemm_gpu/fbgemm_gpu_py.so,sha256=ha2K03pws-2S001OH7HyBOfpHsNlq3ekh_XasmHdEQg,4667832
+fbgemm_gpu/fbgemm_gpu_sparse_async_cumsum.so,sha256=MDL4JvJB-r72fFz_QDyqXhHn26eMff8XJh14cURL3Og,133200
+fbgemm_gpu/fbgemm_gpu_tbe_cache.so,sha256=Qy2xM7LWHfn3Xh78AfNbrMJyHmyvfA2-z0fH25Cw26M,331352
+fbgemm_gpu/fbgemm_gpu_tbe_common.so,sha256=QG3Y-63J_c62MIZQOESFYKZyiTwZpIIliNCY4UY47rk,463944
+fbgemm_gpu/fbgemm_gpu_tbe_index_select.so,sha256=2QEr7JKp-rGfkDpT4AwjSi18RqtlmumOymZJv_Cu21w,330064
+fbgemm_gpu/fbgemm_gpu_tbe_inference.so,sha256=FjQTkVJ_-E2zv7NiGLG43A-2MzIkmZku1LanCdwToGE,593608
+fbgemm_gpu/fbgemm_gpu_tbe_optimizers.so,sha256=PvFTSTJPOc4c0Znvub9EFh2C534elsn6Ryb7BYPic0U,67128
+fbgemm_gpu/fbgemm_gpu_tbe_training_backward.so,sha256=iOi0axu_FJL-IWEeHgIqp7dkEA1wIUU-X5sKtxlt3uk,1127800
+fbgemm_gpu/fbgemm_gpu_tbe_training_backward_dense.so,sha256=tglT4kIO_7ii8f9j4wMo9D80iue8lrdMrYqTxQPWZSk,67128
+fbgemm_gpu/fbgemm_gpu_tbe_training_backward_gwd.so,sha256=RoWy1q3QHApZw9SIoTq-S_KrfDAmaOwGOABQfVs0weo,67128
+fbgemm_gpu/fbgemm_gpu_tbe_training_backward_pt2.so,sha256=1uU9-vaWm5jRTmCSVZ5EHxmRQ3qMohCEjSkXXP2tumc,3291352
+fbgemm_gpu/fbgemm_gpu_tbe_training_backward_split_host.so,sha256=ghqy1C0cu6s_Rp2SOP0egSgHa2y-zvZXPY9CORvoT8Q,67128
+fbgemm_gpu/fbgemm_gpu_tbe_training_backward_vbe.so,sha256=SG48X3jqnUV_H4XhmzQbHpcIpXTytALZ_oh-8UAeY_s,67128
+fbgemm_gpu/fbgemm_gpu_tbe_training_forward.so,sha256=hl-8Df6FJO86GsOCNBLMDfWLLq7VzuQLCbhYK6lHxkU,200160
+fbgemm_gpu/fbgemm_gpu_tbe_utils.so,sha256=qGGsojKO4pkIS4nwfoMI6XjFOwPGNwhuZlmGzJ0zJG4,133600
+fbgemm_gpu/metrics.py,sha256=TsurFLJf0nJvPDN7urWb4LMQlf5RgdWPTTTDO7S4wtI,5663
+fbgemm_gpu/permute_pooled_embedding_modules.py,sha256=B4_-TufcYZq__8ek92cKGjOIkqkZO47pQMkQSDKJIWo,5141
+fbgemm_gpu/permute_pooled_embedding_modules_split.py,sha256=f3VJvH_kw9Ltd_DXtaf_PJPHmlmEWrQgzQ7MDkhh5Nw,2746
+fbgemm_gpu/quantize_comm.py,sha256=gtp0zWYdobAnG6Xe8vZuTu0ZWKDu2hWmsmvV1zA09UQ,11992
+fbgemm_gpu/quantize_utils.py,sha256=sROgIdOrAjQT5_CmFafg40GMo0-pe4d56bAZTI57548,10243
+fbgemm_gpu/runtime_monitor.py,sha256=YXRUv6nXCsoTgh5_RzailTGvCYzwoYDb-eR4rlGwtaw,7619
+fbgemm_gpu/sparse_ops.py,sha256=lJ55cgpP7MoNKo6l6QTDgvfEx8ftkJQrj8kUiIHWBvY,52183
+fbgemm_gpu/split_embedding_configs.py,sha256=bEFnWzCGoHFfJIfzyusmSnSSl9tTd5C8z_j176SS0w0,16584
+fbgemm_gpu/split_embedding_inference_converter.py,sha256=TpGZUXLA0rYemPT37Y0zmZnMIzjHogkRcL0gIhggbM8,7063
+fbgemm_gpu/split_embedding_optimizer_ops.py,sha256=wXuGazClBMk62yL_r9udUIKaPgQP7SlkSb5ugB75wrQ,711
+fbgemm_gpu/split_embedding_utils.py,sha256=Gb40ZKeATxIKEKI3aVQMgDDBanNpKMc53Z43mnzdR_I,851
+fbgemm_gpu/split_table_batched_embeddings_ops.py,sha256=_MIp6uHYHLn4GxGdrGsfddfSsZ2Z9mjsYIrih3ncI1I,2339
+fbgemm_gpu/split_table_batched_embeddings_ops_common.py,sha256=_uUplpcyQOQuxqv8-HV94VUM5lG8e3aGWltXhOgICQc,19294
+fbgemm_gpu/split_table_batched_embeddings_ops_inference.py,sha256=dGC85xjQiRUrequBibSf9oMAVHT5Q49zsVo2zW4n_88,81679
+fbgemm_gpu/split_table_batched_embeddings_ops_training.py,sha256=uCPngWxxC5OQhJv7o6aGs8xf3WlRSrdRHbpCBlPbIuE,191511
+fbgemm_gpu/split_table_batched_embeddings_ops_training_common.py,sha256=jofAN2UB_iSk53Id6MBvn9Bi3Qxw67IL0_VE_EHlw_Q,7593
+fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py,sha256=2TTKsF5yaROTaI69YdCIt8hr_v2TDEo8EraZ0QXNBxc,717
+fbgemm_gpu/tbe_input_multiplexer.py,sha256=MbZF8aZdm_kV-JRMaooeZrqlh6Pn5IuNkSXBXODp-LE,3062
+fbgemm_gpu/uvm.py,sha256=5kOlOauKhOmj-B8AUqpal7riMwTfmsL0HGrh1Wweb80,1058
+fbgemm_gpu/config/__init__.py,sha256=yN0KAneCICgF2BTfOYGsd0qU1PvZX_6msC6YHHZKLMg,292
+fbgemm_gpu/config/feature_list.py,sha256=hhDNkkafd-Oetvuqv9ylBVTNM-lKPi029mpRqq-JZCA,2467
+fbgemm_gpu/docs/__init__.py,sha256=DR6hMSQrsZALfH2AnuJQ4Zq2CfBUUhMN8YjD6APjiAE,523
+fbgemm_gpu/docs/common.py,sha256=8ipXTwVb222X-aZ71O6n8fhxHCHPNhJEHMFiO7epcIs,273
+fbgemm_gpu/docs/examples.py,sha256=ZMN_6sL74LH_hrp2bF_hmg8gi29GhcgvwV3kCMjxkoE,2377
+fbgemm_gpu/docs/jagged_tensor_ops.py,sha256=g8MA8ezTXiqingvk1DlTZJDQcmcCZPXpshuiWxS34F0,7380
+fbgemm_gpu/docs/merge_pooled_embedding_ops.py,sha256=oJLgSgZQmhsyGLbTmZTxNgQrk65_3E8xSJaWSj_Jbo8,1102
+fbgemm_gpu/docs/permute_pooled_embedding_ops.py,sha256=tZUqLVXlk5O6VAKKDA-OEMx2fCu5QPOOeoAPZA9_nLY,4454
+fbgemm_gpu/docs/quantize_ops.py,sha256=xTtOaVK1P02ymreE_i21YiyYDZCqhoZY9eWp_mEIRlo,1297
+fbgemm_gpu/docs/sparse_ops.py,sha256=gSLUFdnu8lle_6gLewFkM20wL3ek2jKLvDGMKR6POaY,27292
+fbgemm_gpu/docs/target.default.json.py,sha256=_BcuMA1hCJ_Jtf08E7O8t-R8A5HiRXHH3Z9rpgCq66U,79
+fbgemm_gpu/quantize/__init__.py,sha256=yPUCmLhNdahHFireHPQMmmiRp3g6W2dkIl5MB51M6SU,942
+fbgemm_gpu/quantize/quantize_ops.py,sha256=C3SN79GcL7fczzoFkxUojm6cGkvvI4iWttkGN4LFQcM,2239
+fbgemm_gpu/sll/__init__.py,sha256=nLFeTiRed6A5STRi_EgHCyNoik0zhXUk2db5kTmMUNU,4221
+fbgemm_gpu/sll/cpu/__init__.py,sha256=glsukNpXtf47VRIdBktILD-4CmVcf4621SGB55lT_ho,2692
+fbgemm_gpu/sll/cpu/cpu_sll.py,sha256=2XyvpZ_UgSThCzUmFDQbjUdLFbz0AvhvqPR_suUcyd8,27866
+fbgemm_gpu/sll/meta/__init__.py,sha256=2sMcD67XGsweBZ-UV2AEJmM4ELPsHeRAYED6kqfgAd4,1077
+fbgemm_gpu/sll/meta/meta_sll.py,sha256=Jk14EOW9VPFwawD7Bwky0R0A5rmbcLWMo52oH8J6Koc,8305
+fbgemm_gpu/sll/triton/__init__.py,sha256=ndvZ5OO81KP65HopJql91R9y_5fC88WnNIGYxCAVKwM,4099
+fbgemm_gpu/sll/triton/common.py,sha256=hISlX4Y-7FtGof-Xx4_B8-2vlF27F9t4p2qyLMUnJ8A,798
+fbgemm_gpu/sll/triton/triton_dense_jagged_cat_jagged_out.py,sha256=J9qOqjNJ72LUBqs-pGI9wrFzzzBpsZ5fzYjgfKc2YhY,1885
+fbgemm_gpu/sll/triton/triton_jagged2_to_padded_dense.py,sha256=lxIYe2MUde2qxLVO_aeTm34fDsMIz8ZkIjyx9Xk-YkE,5923
+fbgemm_gpu/sll/triton/triton_jagged_bmm.py,sha256=bZIgk-GBdP8lPOoAOiIvO-9IE86B5Ejljmnh6-IuQeA,11785
+fbgemm_gpu/sll/triton/triton_jagged_bmm_jagged_out.py,sha256=hccLxsKoSZKiWid5P_yl-IVdBSXw1Rt0WeiRsjLD2Iw,13864
+fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_add.py,sha256=FRZ7vqaTIxVWkztr50q94Uic209e2KriLgF-3PQD6QM,1603
+fbgemm_gpu/sll/triton/triton_jagged_dense_elementwise_mul_jagged_out.py,sha256=9R7BOOe8SJiko1PgbiuHlFyPKtGaaCFSlZ1RaEQyICE,4198
+fbgemm_gpu/sll/triton/triton_jagged_dense_flash_attention.py,sha256=qJvMCRUqMOwL_kxYs1fd5QvYdbjaGeoBy9ovNGpjMws,22779
+fbgemm_gpu/sll/triton/triton_jagged_flash_attention_basic.py,sha256=po9Nx4uAGVu_YIZ9CWvrmzSwxDsnDuNAtnk9VR7-Ems,17750
+fbgemm_gpu/sll/triton/triton_jagged_self_substraction_jagged_out.py,sha256=VaOIxQn4Obvna2Co5VNDGILCDfKuYwkhVxK2oCi5mPI,1754
+fbgemm_gpu/sll/triton/triton_jagged_softmax.py,sha256=odN66XGPc5VWmMZ34FRBsodpUtbpEILDpOgPtpCNrY4,14225
+fbgemm_gpu/sll/triton/triton_multi_head_jagged_flash_attention.py,sha256=nEo5I-bba1XlG59qoACGB18OrA1LISs-e7Lasgys1s8,19572
+fbgemm_gpu/split_embedding_codegen_lookup_invokers/__init__.py,sha256=kHLPaFr6UcvXDtdDQuF9CP-fvRNdniOORrG5B8O8SmU,6917
+fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adagrad.py,sha256=V8CXfcyi5cXSP4-EbXGAq8NRXZdibZQSbPoFLHEcmo8,9733
+fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_adam.py,sha256=Bm0R9a6zL6LTEavWsRgkQilPW7aWg3SBOyE-S5AV8B8,12735
+fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args.py,sha256=k7ZvSHi8fEsZP2GjofNIEFO9mdaQbQxINIhDbPdol0U,2830
+fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_args_ssd.py,sha256=pOhpRdDutSGpOZW5CylR4IIxljYpodizlLUbrO7PoF8,2909
+fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lamb.py,sha256=GyRwkpONuthj_MG0PEbDpkiTMUpQ6ffg1xo5NgGbpGU,10720
+fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_lars_sgd.py,sha256=xHxf8neshHuE_6ybtAOrVFFMnwxfPQG11iuF0QIItVs,10285
+fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_none.py,sha256=7xOrQArbnUvGR2xMMRZ5gEsxoKRbDXi5ufxd-55b24g,9414
+fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_adam.py,sha256=7V7BdQCUZPOp8gmxrQvTfeinULf_uQppdFe7t9POBZ8,10425
+fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_partial_rowwise_lamb.py,sha256=oRjJ302FMr1O9ibFvNtXqn3i-lpmNDh-3JslMstBAxY,10425
+fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad.py,sha256=wQyjKE5xjZNDyNwQmiwuviMrgtEv2QX-MQMDZ8St2_A,10182
+fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_ssd.py,sha256=YE6RgV8By8FGUxnzduUrjJdNI8j2JOmdEuWcCAikLMk,11523
+fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_rowwise_adagrad_with_counter.py,sha256=LZLuTucNYd2wlzfC4pU6339SmRZJiKIWlYwFDU1VFt4,12172
+fbgemm_gpu/split_embedding_codegen_lookup_invokers/lookup_sgd.py,sha256=N32H1lUbbWModjDk_Ci8iF8P0hmlSmemmJynGQhuBGI,9195
+fbgemm_gpu/split_embedding_optimizer_codegen/optimizer_args.py,sha256=xWSmk56JgoYfO8eiiK4BP9Brbhixs4tUAMeWp5TPZ30,956
+fbgemm_gpu/split_embedding_optimizer_codegen/split_embedding_optimizer_rowwise_adagrad.py,sha256=bjrbKAypa-FnOIVKH-IUnWP1Jhlu0lk1SopZ0KLFVdo,6623
+fbgemm_gpu/tbe/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
+fbgemm_gpu/tbe/bench/__init__.py,sha256=TyUVsIH4p-RtFaXAKppYoaWbf9UTjCTUpnIV7RD_O5E,1653
+fbgemm_gpu/tbe/bench/bench_config.py,sha256=xgtlGLCeZVW6jBYwkKsiQeCslCrWDgJbV2NLLwCRSn4,5452
+fbgemm_gpu/tbe/bench/bench_runs.py,sha256=K4HRUcsX4BWqtrYwinZSXjnjNDFkvpoEdQmv-6rz7Tk,23518
+fbgemm_gpu/tbe/bench/benchmark_click_interface.py,sha256=ofcGsiTUj3_Ml7JSsqg_LcMw3CV-6ypmlRWAUmT_cjc,6941
+fbgemm_gpu/tbe/bench/eeg_cli.py,sha256=B3QOZhtycMDwHMG3dFKnlFuWOqYRCF3RCozEQfrqv8o,3580
+fbgemm_gpu/tbe/bench/embedding_ops_common_config.py,sha256=zdL_ve1Ga6ziU5LjfnzJXOBOIqtCjLlhSrlGfa42H9w,4978
+fbgemm_gpu/tbe/bench/eval_compression.py,sha256=ulFMaNZF2g_vfkXLWZSh02ibotg1zpTz3swVU484mzU,3486
+fbgemm_gpu/tbe/bench/reporter.py,sha256=ZK5RFolUmZEcsEaife270_iOdXAQD5EjTUkuxctnAbY,804
+fbgemm_gpu/tbe/bench/tbe_data_config.py,sha256=d724L4Is3Bo2D5reglgsBs7H6ezLFDrQUbTP5tsnPEQ,8509
+fbgemm_gpu/tbe/bench/tbe_data_config_bench_helper.py,sha256=c-IwLbx04Qbqxzfcn9N4U9Eo9QnmgbBN6HxJYAJwvMw,11311
+fbgemm_gpu/tbe/bench/tbe_data_config_loader.py,sha256=fSdtEAnKu6r56mHMtMJIHo-S6m3vC4cPRyXJKKUevzc,11996
+fbgemm_gpu/tbe/bench/tbe_data_config_param_models.py,sha256=I9dozlJAW_XzuopyJapJ4gmDkLU0YSUz2znugiLZRMg,6203
+fbgemm_gpu/tbe/bench/utils.py,sha256=C0GTTomJO3r9LVfbpzlkudxoA_3QyeMdM-7zM-YOAHA,6716
+fbgemm_gpu/tbe/cache/__init__.py,sha256=lrYwhvqX2eWN0vAPe89HYgMW_O1vccoOcoFHJ9cyM-s,398
+fbgemm_gpu/tbe/cache/kv_embedding_ops_inference.py,sha256=VmG9EennGcq2By8Tj8VkFsJG0oOCGw8EhlPo8-t--Fk,14604
+fbgemm_gpu/tbe/cache/split_embeddings_cache_ops.py,sha256=vZHj7KIe1DoJDy5eft29XtGg6I-tRx60tjKOcTHRAYI,1321
+fbgemm_gpu/tbe/ssd/__init__.py,sha256=wzfMT10cp_dqK2lrebC449hOdexBnizcf_98lA1NyHs,483
+fbgemm_gpu/tbe/ssd/common.py,sha256=zecFfJCcQIwNYbaGoI44Q8rGCskvtmOmc1zxqYHS7Tg,1055
+fbgemm_gpu/tbe/ssd/inference.py,sha256=MwSXP4l2fJUSQJRPu9-bqU08Kg9-0ux8uA5UPSabW3M,22812
+fbgemm_gpu/tbe/ssd/training.py,sha256=2CFA4KmA9IfcpX14K4MlzBuSRPD9h5NM1M7TqepH6vA,212168
+fbgemm_gpu/tbe/ssd/utils/__init__.py,sha256=5DgmR2HA6NtmYh2ddkUgpDsZ6a7hF0DPedA1gMpdh18,250
+fbgemm_gpu/tbe/ssd/utils/partially_materialized_tensor.py,sha256=SFg2-29b-i49LWm-FlaWUkTz2XzXbicYi_AzVj4jKNE,7601
+fbgemm_gpu/tbe/stats/__init__.py,sha256=on29iDtq7cVNh90JR9aeFNG-K9DDoYq0JryzoplL49I,322
+fbgemm_gpu/tbe/stats/bench_params_reporter.py,sha256=_lA4peKXI0GCWsZHJ7IUKlUHU98CA-gVoOc-uhRfcoY,13233
+fbgemm_gpu/tbe/utils/__init__.py,sha256=rlXFm-kTByFZO4SS5C5zMzANRiQmM1NT__eWBayncYg,549
+fbgemm_gpu/tbe/utils/common.py,sha256=KBCyBT-7ShhTRRd1Rs5sEU4g8JggEM7Es6wQ0qhWY-o,1313
+fbgemm_gpu/tbe/utils/offsets.py,sha256=DDWwGaQsVZbhaEZ_fRxxeY8ndLc7IORPZrx61eOqwJc,1904
+fbgemm_gpu/tbe/utils/quantize.py,sha256=EdYh9FS_kMsvCWPuvNms4uSE9de_3cQNo_DCScGG3zI,9166
+fbgemm_gpu/tbe/utils/requests.py,sha256=_lxGVt2J0tEmG7aXv24BMrvfdK6HuvMPlPZHWsF_EDI,18038
+fbgemm_gpu/triton/__init__.py,sha256=kPn_Ye6J9DAzWtqi76KYGwfKSqw0IhqG3Bir5aUpkWM,658
+fbgemm_gpu/triton/common.py,sha256=tsK56Dom_XSb5kXuoN0KnGAWlC5HWV7Ook--a59UHdI,2130
+fbgemm_gpu/triton/quantize.py,sha256=I0pxyfIx04zyq55x4Pvj-28Cb2ZeF-SGtFhAymFagkg,27073
+fbgemm_gpu/triton/quantize_ref.py,sha256=q4RBmFaqPVPELU52lbSgB0n26Aun7apeK7bRF2MWS80,11553
+fbgemm_gpu/triton/jagged/__init__.py,sha256=om0yhjuzKuE1UQakFMWHsXN4WNb8mvNkZtYofQ8hdn4,246
+fbgemm_gpu/triton/jagged/triton_jagged_tensor_ops.py,sha256=F2eQWjkWMR5RWQ48oIr-8OU_CRZyLazDpT7DFrDWS6g,29871
+fbgemm_gpu/utils/__init__.py,sha256=JQQNdcTTaEU6ptK-OW-ZQBwTFxEZZpWOtBXWwEZm39o,354
+fbgemm_gpu/utils/filestore.py,sha256=oVtbKGaPQki1JgbJCkrkElukOFVyxntQpSC0lYBKgho,6455
+fbgemm_gpu/utils/loader.py,sha256=1hCEhNvkflniH46fGcrguLeP1z-6uyOu2QFwqKU5CIM,990
+fbgemm_gpu/utils/torch_library.py,sha256=ywsAHjbuwesj50LjEu99WkAH17FlaVgePZ9OmFg6YE4,4193
+fbgemm_gpu/utils/writeback_util.py,sha256=PyVbHp1EuF-GKrJv_CTP6B50Z0oBblXKucf7Rhd6KKY,4614
+list_versions/__init__.py,sha256=UmTeqCk-UJWFtlZQWvZao3xvui2w9E3X_JdOXVjRaNw,315
+list_versions/cli_run.py,sha256=BCRaJvjVFBFmD5WPdjC_yJwlLv1w_TYOe3eYlf_9ZMo,4506
+fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/METADATA,sha256=sqUYIVBwodRVxysq3jEToUNFX12vtC4tZenZnKnynjo,2654
+fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/WHEEL,sha256=IaW-egZU3n4QvS-XsoO31KhIl6_BixcQGgBPEoTC6GI,109
+fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/top_level.txt,sha256=_2s1Aa08r_eDn0JP4FjOhzK09Q8bVlEI7q8pMep51UY,25
+fbgemm_gpu_nightly_cpu-2026.1.29.dist-info/RECORD,,
fbgemm_gpu/docs/version.py → list_versions/__init__.py
RENAMED
@@ -1,4 +1,3 @@
-
 #!/usr/bin/env python3
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
@@ -6,6 +5,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-
-
-
+# pyre-strict
+
+from .cli_run import CLI, CLIOutput
+
+__all__ = ["CLI", "CLIOutput"]
list_versions/cli_run.py
ADDED
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import logging
+import subprocess
+from datetime import datetime
+from typing import Union
+
+import click
+import pandas as pd
+import torch
+
+
+class CLIOutput:
+    def __init__(
+        self,
+        cli: str = "",
+        stdout: str = "",
+        stderr: str = "",
+        returncode: int = 0,
+        timestamp: str = "2025-01-01T20:00:00.00000",
+        visible: bool = True,
+    ) -> None:
+        self._cli = cli
+        self._stdout = stdout
+        self._stderr = stderr
+        self._returncode = returncode
+        self._timestamp = timestamp
+        self._visible = visible
+
+    def to_dict(self) -> dict[str, Union[int, str]]:
+        return {
+            "cli": self._cli,
+            "stdout": self._stdout,
+            "stderr": self._stderr,
+            "returncode": self._returncode,
+            "timestamp": self._timestamp,
+            "visible": self._visible,
+        }
+
+
+class CLI:
+    def __init__(self) -> None:
+        pd.options.display.max_rows
+        pd.set_option("display.max_colwidth", None)
+        self._cli_outputs: list[CLIOutput] = [
+            CLIOutput(
+                cli="python –c “import torch; print(torch.__version__)”",
+                stdout="{}".format(torch.__version__),
+                stderr="",
+                returncode=0,
+                timestamp=datetime.now().isoformat(),
+                visible=True,
+            )
+        ]
+
+    def run(
+        self,
+        cli: Union[str, list[str]],
+        visible: bool = True,
+        input: str = "",
+        capture_output: bool = True,
+    ) -> CLIOutput:
+        if isinstance(cli, str):
+            cli = cli.split()
+        result = CLIOutput()
+        try:
+            completed = subprocess.run(
+                cli, text=True, check=False, capture_output=capture_output, input=input
+            )
+            result = CLIOutput(
+                cli=" ".join(cli),
+                stdout=completed.stdout,
+                stderr=completed.stderr,
+                returncode=completed.returncode,
+                timestamp=datetime.now().isoformat(),
+                visible=visible,
+            )
+            if visible:
+                self._cli_outputs.append(result)
+        except Exception as e:
+            logging.error(f'For cli {" ".join(cli)} we got exception {e}')
+            result = CLIOutput(
+                cli=" ".join(cli),
+                stdout="",
+                stderr=str(e),
+                returncode=-1,
+                visible=visible,
+                timestamp=datetime.now().isoformat(),
+            )
+            if visible:
+                self._cli_outputs.append(result)
+        return result
+
+    def run_piped(self, clis: list[str]) -> None:
+        the_input = ""
+        for cli in clis[:-1]:
+            result = self.run(
+                cli=cli, visible=False, input=the_input, capture_output=True
+            )
+            the_input = result._stdout
+        self.run(cli=clis[-1], visible=True, input=the_input, capture_output=True)
+
+    def to_dataframe(self) -> pd.DataFrame:
+        return pd.DataFrame([output.to_dict() for output in self._cli_outputs])
+
+    def save(self, filename: str, format: str = "csv") -> None:
+        df = self.to_dataframe()
+        if format == "csv":
+            df.to_csv(filename, index=False)
+        elif format == "json":
+            df.to_json(filename, orient="records", lines=True)
+        else:
+            raise ValueError(f"Invalid format {format} : must be one of 'csv', 'json'")
+
+
+@click.command()
+@click.option("--json", default="")
+@click.option("--csv", default="")
+def cli_run(
+    json: str,
+    csv: str,
+) -> None:
+    cli = CLI()
+
+    the_rpm = "rpm -qa"
+    the_grep1 = "grep -E ^amdgpu-(dkms|kmod)"
+    the_grep2 = "grep -v firmware"
+    the_sed1 = "sed -E s/^[^-]-[^-]-//"
+    the_sed2 = "sed -E s/.[^.].[^.]$//"
+    cli.run_piped([the_rpm, the_grep1, the_grep2, the_sed1, the_sed2])
+
+    cli.run("uname -r")
+
+    cli.run("fw-util all --version")
+
+    cli.run("amd-smi firmware")
+    cli.run("amd-smi version")
+    cli.run("amd-smi static")
+
+    if len(csv):
+        cli.save(csv)
+
+    if len(json):
+        cli.save(json, format="json")
+
+    print(cli.to_dataframe())
+
+
+def main() -> None:
+    cli_run()
+
+
+if __name__ == "__main__":
+    main()
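Taken together, the new top-level list_versions package is a small helper that shells out to version-reporting commands and collects the results into a pandas DataFrame, with a click entry point (cli_run) that can also dump the log as CSV or JSON. A usage sketch, assuming the wheel and its click/pandas dependencies are installed and the invoked commands exist on the host:

# Hedged usage sketch for the new list_versions package.
from list_versions import CLI

cli = CLI()                                      # seeds the log with the torch version
cli.run("uname -r")                              # each call is recorded as a CLIOutput row
cli.run_piped(["rpm -qa", "grep -v firmware"])   # pipes stdout from one command to the next

df = cli.to_dataframe()                          # columns: cli, stdout, stderr, returncode, timestamp, visible
print(df[["cli", "returncode"]])
cli.save("versions.json", format="json")         # or format="csv"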