torch-rb 0.15.0 → 0.17.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -134,7 +134,7 @@
134
134
  autogen: _new_zeros_with_same_feature_meta.out
135
135
 
136
136
  # This function compares the storage numel of self with that of other, where
137
- # storage numel is cumputed as: `other.storage().nbytes() / other.itemsize()`.
137
+ # storage numel is computed as: `other.storage().nbytes() / other.itemsize()`.
138
138
  # We create this function for composite compliance purposes. The batching rule
139
139
  # always returns true because vmapped as_strided does not support accessing
140
140
  # storage locations not indexable by the input tensor.
@@ -175,12 +175,24 @@
175
175
  CPU: _assert_async_msg_cpu
176
176
  CUDA: _assert_async_msg_cuda
177
177
 
178
+ - func: _assert_scalar(Scalar self, str assert_msg) -> ()
179
+ dispatch:
180
+ CompositeExplicitAutograd: _assert_scalar
181
+
182
+ - func: _functional_assert_scalar(Scalar self, str assert_msg, Tensor dep_token) -> Tensor
183
+ dispatch:
184
+ CompositeExplicitAutograd: _functional_assert_scalar
185
+
178
186
  - func: _functional_assert_async.msg(Tensor self, str assert_msg, Tensor dep_token) -> Tensor
179
187
  dispatch:
180
188
  CPU: _functional_assert_async_msg_cpu
181
189
 
182
190
  - func: _assert_tensor_metadata(Tensor a, SymInt[]? size=None, SymInt[]? stride=None, ScalarType? dtype=None) -> ()
183
191
 
192
+ - func: _print(str s) -> ()
193
+ dispatch:
194
+ CompositeExplicitAutograd: _print
195
+
184
196
  - func: sym_constrain_range(Scalar size, *, int? min=None, int? max=None) -> ()
185
197
  dispatch:
186
198
  CompositeExplicitAutograd: sym_constrain_range
@@ -470,6 +482,7 @@
470
482
  - func: conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
471
483
  dispatch:
472
484
  CPU, CUDA: conj_physical_out
485
+ MPS: conj_physical_out_mps
473
486
  SparseCPU, SparseCUDA: conj_physical_out_sparse
474
487
  SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr_out
475
488
  tags: pointwise
@@ -536,8 +549,8 @@
536
549
  structured_delegate: add.out
537
550
  variants: function, method
538
551
  dispatch:
539
- SparseCPU, SparseCUDA: add_sparse
540
- SparseCsrCPU, SparseCsrCUDA: add_sparse_csr
552
+ SparseCPU, SparseCUDA, SparseMeta: add_sparse
553
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr
541
554
  MkldnnCPU: mkldnn_add
542
555
  ZeroTensor: add_zerotensor
543
556
  NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor
@@ -548,8 +561,8 @@
548
561
  variants: method
549
562
  structured_delegate: add.out
550
563
  dispatch:
551
- SparseCPU, SparseCUDA: add_sparse_
552
- SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_
564
+ SparseCPU, SparseCUDA, SparseMeta: add_sparse_
565
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_
553
566
  MkldnnCPU: mkldnn_add_
554
567
  NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor
555
568
  tags: pointwise
@@ -562,10 +575,10 @@
562
575
  Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf)
563
576
  ScalarOnly: add (Bool)
564
577
  dispatch:
565
- SparseCPU: add_out_sparse_cpu
578
+ SparseCPU, SparseMeta: add_out_sparse_cpu
566
579
  SparseCUDA: add_out_sparse_cuda
567
- SparseCsrCPU: add_out_sparse_csr_cpu
568
- SparseCsrCUDA: add_out_sparse_csr_cuda
580
+ SparseCsrCPU, SparseCsrMeta: add_out_sparse_compressed_cpu
581
+ SparseCsrCUDA: add_out_sparse_compressed_cuda
569
582
  MkldnnCPU: mkldnn_add_out
570
583
  MPS: add_out_mps
571
584
  tags: pointwise
@@ -763,7 +776,7 @@
763
776
  dispatch:
764
777
  CompositeExplicitAutograd: arange
765
778
 
766
- # This operator should be named `aragne.start_out` if following the naming convention. However that
779
+ # This operator should be named `arange.start_out` if following the naming convention. However that
767
780
  # name is already taken. Disabled because of CI job failures.
768
781
  # FIXME: enable this
769
782
  #- func: arange.start_out_(Scalar start, Scalar end, *, Tensor(a!) out) -> Tensor(a!)
@@ -1220,6 +1233,13 @@
1220
1233
  CompositeExplicitAutograd: copysign_out
1221
1234
  tags: pointwise
1222
1235
 
1236
+ - func: _lazy_clone(Tensor self) -> Tensor
1237
+ # Like clone, but the copy takes place lazily, only if either the
1238
+ # input or the output are written.
1239
+ variants: function, method
1240
+ dispatch:
1241
+ CompositeExplicitAutograd: _lazy_clone
1242
+
1223
1243
  - func: logical_not(Tensor self) -> Tensor
1224
1244
  device_check: NoCheck # TensorIterator
1225
1245
  variants: function, method
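The _lazy_clone entry added above is described by its inline comment as a clone whose copy is deferred until either tensor is written. A minimal sketch of that behaviour, assuming the op is reached through the generic torch.ops.aten namespace rather than a dedicated Python or Ruby wrapper:

    import torch

    x = torch.arange(4.0)
    y = torch.ops.aten._lazy_clone(x)  # no copy yet; x and y share the data lazily
    y.add_(1.0)                        # the first write materializes the copy
    print(x)                           # tensor([0., 1., 2., 3.])  (unchanged)
    print(y)                           # tensor([1., 2., 3., 4.])
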
@@ -1621,6 +1641,7 @@
1621
1641
  - func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!)
1622
1642
  dispatch:
1623
1643
  CPU, CUDA: complex_out
1644
+ MPS: complex_out_mps
1624
1645
 
1625
1646
  - func: polar(Tensor abs, Tensor angle) -> Tensor
1626
1647
  variants: function
@@ -1729,6 +1750,7 @@
1729
1750
  - func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor
1730
1751
  variants: function
1731
1752
  dispatch:
1753
+ Meta: copy_meta
1732
1754
  CompositeExplicitAutogradNonFunctional: copy
1733
1755
  tags: core
1734
1756
 
@@ -1847,7 +1869,10 @@
1847
1869
  - func: cudnn_convolution(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor
1848
1870
  dispatch:
1849
1871
  CUDA: cudnn_convolution
1850
- autogen: cudnn_convolution.out
1872
+
1873
+ - func: cudnn_convolution.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)
1874
+ dispatch:
1875
+ CUDA: cudnn_convolution_out
1851
1876
 
1852
1877
  - func: cudnn_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor
1853
1878
  dispatch:
@@ -2346,7 +2371,7 @@
2346
2371
  Meta: empty_meta_symint
2347
2372
  MkldnnCPU: empty_mkldnn
2348
2373
  SparseCPU, SparseCUDA, SparseMeta: empty_sparse
2349
- SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed
2374
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_sparse_compressed
2350
2375
  QuantizedCPU, QuantizedCUDA, QuantizedMeta: empty_unknown_quantized
2351
2376
  tags: core
2352
2377
 
@@ -2452,7 +2477,7 @@
2452
2477
  CompositeExplicitAutograd: empty_like
2453
2478
  QuantizedCPU, QuantizedCUDA: empty_like_quantized
2454
2479
  SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo
2455
- SparseCsrCPU, SparseCsrCUDA: empty_like_sparse_csr
2480
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr
2456
2481
  NestedTensorCPU, NestedTensorCUDA: empty_like_nested
2457
2482
  autogen: empty_like.out
2458
2483
 
@@ -2954,12 +2979,14 @@
2954
2979
  dispatch:
2955
2980
  CPU: _fft_r2c_mkl
2956
2981
  CUDA: _fft_r2c_cufft
2982
+ MPS: _fft_r2c_mps
2957
2983
 
2958
2984
  - func: _fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!)
2959
2985
  variants: function
2960
2986
  dispatch:
2961
2987
  CPU: _fft_r2c_mkl_out
2962
2988
  CUDA: _fft_r2c_cufft_out
2989
+ MPS: _fft_r2c_mps_out
2963
2990
 
2964
2991
  # Complex to real inverse FFT
2965
2992
  - func: _fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor
@@ -2967,12 +2994,14 @@
2967
2994
  dispatch:
2968
2995
  CPU: _fft_c2r_mkl
2969
2996
  CUDA: _fft_c2r_cufft
2997
+ MPS: _fft_c2r_mps
2970
2998
 
2971
2999
  - func: _fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
2972
3000
  variants: function
2973
3001
  dispatch:
2974
3002
  CPU: _fft_c2r_mkl_out
2975
3003
  CUDA: _fft_c2r_cufft_out
3004
+ MPS: _fft_c2r_mps_out
2976
3005
 
2977
3006
  # Standard complex to complex FFT (forward or backward)
2978
3007
  - func: _fft_c2c(Tensor self, SymInt[] dim, int normalization, bool forward) -> Tensor
@@ -2980,12 +3009,14 @@
2980
3009
  dispatch:
2981
3010
  CPU: _fft_c2c_mkl
2982
3011
  CUDA: _fft_c2c_cufft
3012
+ MPS: _fft_c2c_mps
2983
3013
 
2984
3014
  - func: _fft_c2c.out(Tensor self, SymInt[] dim, int normalization, bool forward, *, Tensor(a!) out) -> Tensor(a!)
2985
3015
  variants: function
2986
3016
  dispatch:
2987
3017
  CPU: _fft_c2c_mkl_out
2988
3018
  CUDA: _fft_c2c_cufft_out
3019
+ MPS: _fft_c2c_mps_out
2989
3020
 
2990
3021
  - func: _validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
2991
3022
  device_check: NoCheck
@@ -3097,6 +3128,7 @@
3097
3128
  structured: True
3098
3129
  dispatch:
3099
3130
  CPU, CUDA: isin_Tensor_Tensor_out
3131
+ MPS: isin_Tensor_Tensor_out_mps
3100
3132
 
3101
3133
  - func: isin.Tensor_Tensor(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor
3102
3134
  variants: function
@@ -3238,6 +3270,8 @@
3238
3270
  autogen: native_layer_norm_backward.out
3239
3271
  tags: core
3240
3272
 
3273
+ - func: rms_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor
3274
+
3241
3275
  - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
3242
3276
  variants: function, method
3243
3277
  dispatch:
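The rms_norm declaration added in this hunk carries no dispatch section, so it is a composite op defined in terms of other ops. As a rough reference for what root-mean-square layer normalization computes (a sketch, not the actual kernel; treating eps=None as the dtype epsilon is an assumption):

    import torch

    def rms_norm_reference(input, normalized_shape, weight=None, eps=None):
        dims = tuple(range(-len(normalized_shape), 0))   # trailing dims to normalize over
        if eps is None:
            eps = torch.finfo(input.dtype).eps           # assumed default
        rms = input.pow(2).mean(dim=dims, keepdim=True).add(eps).rsqrt()
        out = input * rms
        return out * weight if weight is not None else out

    x = torch.randn(2, 8)
    print(rms_norm_reference(x, [8]).shape)              # torch.Size([2, 8])
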
@@ -3302,14 +3336,39 @@
3302
3336
  dispatch:
3303
3337
  CUDA: _cslt_compress
3304
3338
 
3305
- - func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False) -> Tensor
3339
+ - func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False, int alg_id=0) -> Tensor
3306
3340
  dispatch:
3307
3341
  CUDA: _cslt_sparse_mm
3308
3342
 
3309
- - func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None) -> Tensor
3343
+ - func: _cslt_sparse_mm_search(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False) -> int
3344
+ dispatch:
3345
+ CUDA: _cslt_sparse_mm_search
3346
+
3347
+ - func: _sparse_semi_structured_tile(Tensor input, str algorithm="", bool use_cutlass=True) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
3348
+ dispatch:
3349
+ CUDA: _sparse_semi_structured_tile
3350
+
3351
+ - func: _sparse_semi_structured_apply(Tensor input, Tensor thread_masks) -> (Tensor, Tensor)
3352
+ dispatch:
3353
+ CUDA: _sparse_semi_structured_apply
3354
+
3355
+ - func: _sparse_semi_structured_apply_dense(Tensor input, Tensor thread_masks) -> Tensor
3356
+ dispatch:
3357
+ CUDA: _sparse_semi_structured_apply_dense
3358
+
3359
+ # DEPRECATED: Use torch.__sparse_semi_structured_mm/torch._sparse_semi_structured_addmm instead
3360
+ - func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None, ScalarType? out_dtype=None) -> Tensor
3310
3361
  dispatch:
3311
3362
  CUDA: _sparse_semi_structured_linear
3312
3363
 
3364
+ - func: _sparse_semi_structured_mm(Tensor mat1, Tensor mat1_meta, Tensor mat2, *, ScalarType? out_dtype=None) -> Tensor
3365
+ dispatch:
3366
+ CUDA: _sparse_semi_structured_mm
3367
+
3368
+ - func: _sparse_semi_structured_addmm(Tensor input, Tensor mat1, Tensor mat1_meta, Tensor mat2, *, Scalar alpha=1, Scalar beta=1, ScalarType? out_dtype=None) -> Tensor
3369
+ dispatch:
3370
+ CUDA: _sparse_semi_structured_addmm
3371
+
3313
3372
  - func: _mixed_dtypes_linear(Tensor input, Tensor weight, Tensor scale, *, Tensor? bias=None, str? activation=None) -> Tensor
3314
3373
  dispatch:
3315
3374
  CUDA: _mixed_dtypes_linear
@@ -4050,20 +4109,30 @@
4050
4109
 
4051
4110
  - func: _int_mm(Tensor self, Tensor mat2) -> Tensor
4052
4111
  dispatch:
4112
+ CPU: _int_mm_cpu
4053
4113
  CUDA: _int_mm_cuda
4054
4114
 
4055
4115
  - func: _int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
4056
4116
  dispatch:
4117
+ CPU: _int_mm_out_cpu
4057
4118
  CUDA: _int_mm_out_cuda
4058
4119
 
4059
4120
  - func: _convert_weight_to_int4pack(Tensor self, int innerKTiles) -> Tensor
4060
4121
  dispatch:
4122
+ CPU: _convert_weight_to_int4pack_cpu
4061
4123
  CUDA: _convert_weight_to_int4pack_cuda
4062
4124
 
4063
4125
  - func: _weight_int4pack_mm(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros) -> Tensor
4064
4126
  dispatch:
4127
+ CPU: _weight_int4pack_mm_cpu
4128
+ MPS: _weight_int4pack_mm_mps
4065
4129
  CUDA: _weight_int4pack_mm_cuda
4066
4130
 
4131
+ - func: _weight_int8pack_mm(Tensor self, Tensor mat2, Tensor scales) -> Tensor
4132
+ dispatch:
4133
+ CPU: _weight_int8pack_mm_cpu
4134
+ MPS: _weight_int8pack_mm_mps
4135
+
4067
4136
  - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
4068
4137
  python_module: sparse
4069
4138
 
@@ -4439,7 +4508,6 @@
4439
4508
  MPS: pixel_shuffle_mps
4440
4509
  CompositeExplicitAutogradNonFunctional: math_pixel_shuffle
4441
4510
  autogen: pixel_shuffle.out
4442
- tags: core
4443
4511
 
4444
4512
  - func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor
4445
4513
  dispatch:
@@ -4810,7 +4878,7 @@
4810
4878
  device_guard: False
4811
4879
  dispatch:
4812
4880
  CompositeImplicitAutograd: reshape_symint
4813
- CompositeImplicitAutogradNestedTensor: reshape_nested
4881
+ CompositeImplicitAutogradNestedTensor: reshape_nested_symint
4814
4882
 
4815
4883
  - func: _reshape_copy(Tensor self, SymInt[] size) -> Tensor
4816
4884
  variants: function
@@ -4969,6 +5037,7 @@
4969
5037
  device_check: NoCheck # TensorIterator
4970
5038
  python_module: nn
4971
5039
  dispatch:
5040
+ QuantizedCPU: gelu_quantized_cpu_
4972
5041
  NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu_
4973
5042
 
4974
5043
  - func: gelu(Tensor self, *, str approximate='none') -> Tensor
@@ -5356,6 +5425,21 @@
5356
5425
  CompositeExplicitAutograd: slice_backward
5357
5426
  autogen: slice_backward.out
5358
5427
 
5428
+ # NB: This op exists to back the implementation of reverse view_funcs for various views (chunk,
5429
+ # slice.Tensor, split_with_sizes, et al.). Currently, these are only used during fake-ification
5430
+ # of PT2 graph input subclass instances that are views. This means:
5431
+ # * This op shouldn't really show up in eager mode (so e.g. XLA shouldn't have to implement it)
5432
+ # * This op shouldn't show up in a PT2 graph (so a PT2 backend shouldn't have to implement it)
5433
+ # * A subclass will have to implement this to work in PT2 if a subclass view is used as a graph
5434
+ # input AND the view utilizes this op in its inverse. The idea is that slice_inverse() is
5435
+ # easier to implement for a subclass than as_strided()
5436
+ - func: slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
5437
+ variants: function, method
5438
+ device_check: NoCheck
5439
+ device_guard: False
5440
+ dispatch:
5441
+ CompositeExplicitAutograd: slice_inverse_symint
5442
+
5359
5443
  - func: slice_scatter(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor
5360
5444
  variants: function, method
5361
5445
  device_check: NoCheck
@@ -5363,7 +5447,7 @@
5363
5447
  dispatch:
5364
5448
  CompositeExplicitAutogradNonFunctional: slice_scatter
5365
5449
  autogen: slice_scatter.out
5366
- tags: core
5450
+ tags: [core, view_copy]
5367
5451
 
5368
5452
  - func: select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor
5369
5453
  variants: function, method
@@ -5562,6 +5646,16 @@
5562
5646
  SparseCPU: _sspaddmm_out_cpu
5563
5647
  SparseCUDA: _sspaddmm_out_cuda
5564
5648
 
5649
+ - func: _chunk_cat(Tensor[] tensors, int dim, int num_chunks) -> Tensor
5650
+ dispatch:
5651
+ CompositeExplicitAutograd: _chunk_cat
5652
+ CUDA: _chunk_cat_cuda
5653
+
5654
+ - func: _chunk_cat.out(Tensor[] tensors, int dim, int num_chunks, *, Tensor(a!) out) -> Tensor(a!)
5655
+ dispatch:
5656
+ CompositeExplicitAutograd: _chunk_cat_out
5657
+ CUDA: _chunk_cat_out_cuda
5658
+
5565
5659
  - func: stack(Tensor[] tensors, int dim=0) -> Tensor
5566
5660
  dispatch:
5567
5661
  CompositeExplicitAutograd: stack
@@ -5626,8 +5720,8 @@
5626
5720
  variants: function, method
5627
5721
  dispatch:
5628
5722
  CompositeExplicitAutograd: sum
5629
- SparseCPU, SparseCUDA: sum_coo
5630
- SparseCsrCPU, SparseCsrCUDA: sum_csr
5723
+ SparseCPU, SparseCUDA, SparseMeta: sum_coo
5724
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr
5631
5725
  autogen: sum.out
5632
5726
 
5633
5727
  - func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
@@ -5753,6 +5847,7 @@
5753
5847
  variants: function
5754
5848
  dispatch:
5755
5849
  CPU, CUDA: std_mean
5850
+ MPS: std_mean_mps
5756
5851
  autogen: std_mean.correction_out
5757
5852
 
5758
5853
  - func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
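This hunk registers an MPS kernel for the fused std_mean.correction op (a matching var_mean change appears further down), so the single-pass variant is no longer limited to CPU and CUDA. Call-side usage is unchanged; a small sketch:

    import torch

    x = torch.randn(1000, 3)
    std, mean = torch.std_mean(x, dim=0)   # one fused reduction instead of separate std() and mean()
    print(std.shape, mean.shape)           # torch.Size([3]) torch.Size([3])
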
@@ -6008,7 +6103,6 @@
6008
6103
  CPU, MPS: roll
6009
6104
  CUDA: roll_cuda
6010
6105
  autogen: roll.out
6011
- tags: core
6012
6106
 
6013
6107
  # default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args
6014
6108
 
@@ -6091,6 +6185,58 @@
6091
6185
  CompositeExplicitAutogradNonFunctional: _nested_view_from_buffer_copy
6092
6186
  autogen: _nested_view_from_buffer_copy.out
6093
6187
 
6188
+ - func: _nested_view_from_jagged(Tensor(a) self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor(a)
6189
+ variants: function
6190
+ device_check: NoCheck
6191
+ dispatch: {}
6192
+
6193
+ - func: _nested_view_from_jagged_copy(Tensor self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor
6194
+ variants: function
6195
+ device_check: NoCheck
6196
+ tags: view_copy
6197
+ dispatch:
6198
+ CompositeExplicitAutogradNonFunctional: _nested_view_from_jagged_copy
6199
+ autogen: _nested_view_from_jagged_copy.out
6200
+
6201
+ - func: _nested_get_values(Tensor(a) self) -> Tensor(a)
6202
+ variants: function
6203
+ device_check: NoCheck
6204
+ dispatch: {}
6205
+
6206
+ - func: _nested_get_values_copy(Tensor self) -> Tensor
6207
+ variants: function
6208
+ device_check: NoCheck
6209
+ tags: view_copy
6210
+ dispatch:
6211
+ CompositeExplicitAutogradNonFunctional: _nested_get_values_copy
6212
+ autogen: _nested_get_values_copy.out
6213
+
6214
+ - func: _nested_get_offsets(Tensor self) -> Tensor
6215
+ variants: function
6216
+ device_check: NoCheck
6217
+ dispatch: {}
6218
+
6219
+ # returns undefined Tensor if no lengths present
6220
+ - func: _nested_get_lengths(Tensor self) -> Tensor
6221
+ variants: function
6222
+ device_check: NoCheck
6223
+ dispatch: {}
6224
+
6225
+ - func: _nested_get_ragged_idx(Tensor self) -> int
6226
+ variants: function
6227
+ device_check: NoCheck
6228
+ dispatch: {}
6229
+
6230
+ - func: _nested_get_jagged_dummy(Tensor any) -> Tensor
6231
+ category_override: dummy
6232
+ dispatch: {}
6233
+
6234
+ - func: _nested_compute_contiguous_strides_offsets(Tensor nested_size) -> (Tensor, Tensor)
6235
+ variants: function
6236
+ device_check: NoCheck
6237
+ dispatch:
6238
+ CPU, CUDA: _nested_compute_contiguous_strides_offsets
6239
+
6094
6240
  - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor
6095
6241
  dispatch:
6096
6242
  # calls unsqueeze
@@ -6275,6 +6421,7 @@
6275
6421
  variants: function
6276
6422
  dispatch:
6277
6423
  CPU, CUDA: var_mean
6424
+ MPS: var_mean_mps
6278
6425
  autogen: var_mean.correction_out
6279
6426
 
6280
6427
  - func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
@@ -6295,15 +6442,13 @@
6295
6442
  device_check: NoCheck # TensorIterator
6296
6443
  variants: function, method
6297
6444
  dispatch:
6298
- CPU, CUDA: where
6299
- MPS: where_mps
6445
+ CPU, CUDA, MPS: where
6300
6446
  tags: [core, pointwise]
6301
6447
 
6302
6448
  - func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
6303
6449
  device_check: NoCheck # TensorIterator
6304
6450
  dispatch:
6305
- CPU, CUDA: where_self_out
6306
- MPS: where_self_out_mps
6451
+ CPU, CUDA, MPS: where_self_out
6307
6452
 
6308
6453
  - func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor
6309
6454
  variants: function
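This hunk folds the separate MPS kernels into the shared CPU, CUDA, MPS dispatch entries for where and where.self_out; nothing changes at the call site. For reference:

    import torch

    cond = torch.tensor([True, False, True])
    a = torch.tensor([1.0, 2.0, 3.0])
    b = torch.zeros(3)
    print(torch.where(cond, a, b))   # tensor([1., 0., 3.])
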
@@ -6357,7 +6502,7 @@
6357
6502
  CPU: _efficientzerotensor
6358
6503
  CUDA: _efficientzerotensor_cuda
6359
6504
  MPS: _efficientzerotensor_mps
6360
- Meta: _efficientzerotensor_meta
6505
+ Meta: _efficientzerotensor_meta_symint
6361
6506
  autogen: _efficientzerotensor.out
6362
6507
 
6363
6508
  - func: zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -6434,6 +6579,32 @@
6434
6579
  SparseCPU, SparseCUDA: norm_sparse
6435
6580
  autogen: native_norm.ScalarOpt_dim_dtype_out
6436
6581
 
6582
+ - func: _batch_norm_with_update(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor)
6583
+ dispatch:
6584
+ CPU: _batch_norm_with_update_cpu
6585
+ CUDA: _batch_norm_with_update_cuda
6586
+ MPS: _batch_norm_with_update_mps
6587
+ MkldnnCPU: _batch_norm_with_update_mkldnn
6588
+ autogen: _batch_norm_with_update_functional
6589
+
6590
+ - func: _batch_norm_with_update.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd, Tensor(g!) reserve) -> (Tensor(d!), Tensor(e!), Tensor(f!), Tensor(g!))
6591
+ dispatch:
6592
+ CPU: _batch_norm_with_update_cpu_out
6593
+ CUDA: _batch_norm_with_update_cuda_out
6594
+ MPS: _batch_norm_with_update_mps_out
6595
+
6596
+ - func: _batch_norm_no_update(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor)
6597
+ dispatch:
6598
+ CompositeExplicitAutograd: _batch_norm_no_update
6599
+ autogen: _batch_norm_no_update.out
6600
+
6601
+ - func: batch_norm_backward(Tensor grad_out, Tensor input, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, bool update, float eps, bool[3] output_mask, Tensor reserve) -> (Tensor, Tensor, Tensor)
6602
+ dispatch:
6603
+ CPU: _new_batch_norm_backward_cpu
6604
+ CUDA: _new_batch_norm_backward_cuda
6605
+ MPS: _new_batch_norm_backward_mps
6606
+ MkldnnCPU: _new_batch_norm_backward_mkldnn
6607
+
6437
6608
  # TODO: reduce signatures down to one when optional args is available
6438
6609
  - func: _sparse_sum(Tensor self) -> Tensor
6439
6610
 
@@ -6644,7 +6815,7 @@
6644
6815
  MPS: zero_mps_
6645
6816
  Meta: zero_meta_
6646
6817
  SparseCPU, SparseCUDA, SparseMeta: zero_sparse_
6647
- SparseCsrCPU, SparseCsrCUDA: zero_sparse_csr_
6818
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: zero_sparse_csr_
6648
6819
  MkldnnCPU: mkldnn_zero_
6649
6820
  NestedTensorCPU, NestedTensorCUDA: zero_nested_
6650
6821
  autogen: zero, zero.out
@@ -6934,7 +7105,11 @@
6934
7105
  # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given
6935
7106
  # the default would never make sense.
6936
7107
 
6937
- - func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
7108
+ - func: _sparse_compressed_tensor_with_dims(int nnz, int dense_dim, int[] size, int[] blocksize, ScalarType index_dtype, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
7109
+ dispatch:
7110
+ CompositeExplicitAutograd: sparse_compressed_tensor_with_dims
7111
+
7112
+ - func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6938
7113
  dispatch:
6939
7114
  CompositeExplicitAutograd: sparse_compressed_tensor
6940
7115
 
@@ -6951,7 +7126,10 @@
6951
7126
  - func: sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6952
7127
  - func: sparse_bsc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6953
7128
 
6954
- - func: _sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
7129
+ - func: _sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
7130
+ dispatch:
7131
+ CompositeImplicitAutograd: _sparse_compressed_tensor_unsafe_symint
7132
+
6955
7133
  - func: _sparse_csr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
6956
7134
  - func: _sparse_csc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
6957
7135
  - func: _sparse_bsr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -7035,9 +7213,9 @@
7035
7213
  - func: sparse_dim(Tensor self) -> int
7036
7214
  variants: method
7037
7215
  dispatch:
7038
- CPU, CUDA: sparse_dim_strided
7039
7216
  SparseCPU, SparseCUDA, SparseMeta: sparse_dim_sparse
7040
- SparseCsrCPU, SparseCsrCUDA: sparse_dim_sparse_csr
7217
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_dim_sparse_csr
7218
+ CompositeExplicitAutograd: sparse_dim_default
7041
7219
  device_check: NoCheck
7042
7220
  device_guard: False
7043
7221
 
@@ -7052,9 +7230,9 @@
7052
7230
  - func: dense_dim(Tensor self) -> int
7053
7231
  variants: method
7054
7232
  dispatch:
7055
- CPU, CUDA: dense_dim_strided
7056
7233
  SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse
7057
- SparseCsrCPU, SparseCsrCUDA: dense_dim_sparse_csr
7234
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr
7235
+ CompositeExplicitAutograd: dense_dim_default
7058
7236
  device_check: NoCheck
7059
7237
  device_guard: False
7060
7238
 
@@ -7070,7 +7248,7 @@
7070
7248
  variants: method
7071
7249
  dispatch:
7072
7250
  SparseCPU, SparseCUDA, SparseMeta: _nnz_sparse
7073
- SparseCsrCPU, SparseCsrCUDA: _nnz_sparse_csr
7251
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _nnz_sparse_csr
7074
7252
  device_check: NoCheck
7075
7253
  device_guard: False
7076
7254
 
@@ -7133,7 +7311,7 @@
7133
7311
  variants: method
7134
7312
  dispatch:
7135
7313
  SparseCPU, SparseCUDA, SparseMeta: values_sparse
7136
- SparseCsrCPU, SparseCsrCUDA: values_sparse_csr
7314
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: values_sparse_csr
7137
7315
  NestedTensorCPU, NestedTensorCUDA: values_nested
7138
7316
  CompositeExplicitAutograd: values_default
7139
7317
  device_check: NoCheck
@@ -7142,7 +7320,7 @@
7142
7320
  - func: crow_indices(Tensor(a) self) -> Tensor(a)
7143
7321
  variants: method
7144
7322
  dispatch:
7145
- SparseCsrCPU, SparseCsrCUDA: crow_indices_sparse_csr
7323
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: crow_indices_sparse_csr
7146
7324
  CompositeExplicitAutograd: crow_indices_default
7147
7325
  device_check: NoCheck
7148
7326
  device_guard: False
@@ -7150,7 +7328,7 @@
7150
7328
  - func: col_indices(Tensor(a) self) -> Tensor(a)
7151
7329
  variants: method
7152
7330
  dispatch:
7153
- SparseCsrCPU, SparseCsrCUDA: col_indices_sparse_csr
7331
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: col_indices_sparse_csr
7154
7332
  CompositeExplicitAutograd: col_indices_default
7155
7333
  device_check: NoCheck
7156
7334
  device_guard: False
@@ -7158,7 +7336,7 @@
7158
7336
  - func: ccol_indices(Tensor(a) self) -> Tensor(a)
7159
7337
  variants: method
7160
7338
  dispatch:
7161
- SparseCsrCPU, SparseCsrCUDA: ccol_indices_sparse_csr
7339
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ccol_indices_sparse_csr
7162
7340
  CompositeExplicitAutograd: ccol_indices_default
7163
7341
  device_check: NoCheck
7164
7342
  device_guard: False
@@ -7166,7 +7344,7 @@
7166
7344
  - func: row_indices(Tensor(a) self) -> Tensor(a)
7167
7345
  variants: method
7168
7346
  dispatch:
7169
- SparseCsrCPU, SparseCsrCUDA: row_indices_sparse_csr
7347
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: row_indices_sparse_csr
7170
7348
  CompositeExplicitAutograd: row_indices_default
7171
7349
  device_check: NoCheck
7172
7350
  device_guard: False
@@ -7185,7 +7363,7 @@
7185
7363
  device_check: NoCheck # Allows copy into different device
7186
7364
  variants: function
7187
7365
  dispatch:
7188
- SparseCPU, SparseCUDA: copy_sparse_
7366
+ SparseCPU, SparseCUDA, SparseMeta: copy_sparse_
7189
7367
  autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out
7190
7368
 
7191
7369
  # By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors
@@ -7288,7 +7466,7 @@
7288
7466
  MkldnnCPU: mkldnn_reorder_conv2d_weight
7289
7467
  autogen: mkldnn_reorder_conv2d_weight.out
7290
7468
 
7291
- - func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1) -> Tensor
7469
+ - func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor
7292
7470
  variants: function
7293
7471
  python_module: nn
7294
7472
  dispatch:
@@ -7536,7 +7714,7 @@
7536
7714
 
7537
7715
  - func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType
7538
7716
 
7539
- - func: can_cast(ScalarType from, ScalarType to) -> bool
7717
+ - func: can_cast(ScalarType from_, ScalarType to) -> bool
7540
7718
  variants: function
7541
7719
 
7542
7720
  - func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType
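The can_cast change above only renames the first parameter from "from" to "from_", presumably to avoid the reserved word when the argument is passed by name; behaviour is unchanged. For reference, under PyTorch's casting rules:

    import torch

    print(torch.can_cast(torch.float64, torch.float32))   # True  (narrowing float is allowed)
    print(torch.can_cast(torch.float32, torch.int32))     # False (float -> int is not a safe cast)
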
@@ -7675,6 +7853,7 @@
7675
7853
  dispatch:
7676
7854
  CPU, CUDA, Meta, MPS: set_
7677
7855
  autogen: set.source_Storage, set.source_Storage_out
7856
+ tags: inplace_view
7678
7857
 
7679
7858
  - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)
7680
7859
  variants: method
@@ -7687,6 +7866,7 @@
7687
7866
  MPS: set_storage_mps_
7688
7867
  QuantizedCPU, QuantizedCUDA: set_storage_quantized_
7689
7868
  autogen: set.source_Storage_storage_offset, set.source_Storage_storage_offset_out
7869
+ tags: inplace_view
7690
7870
 
7691
7871
  - func: set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)
7692
7872
  variants: method
@@ -7694,6 +7874,7 @@
7694
7874
  device_guard: False
7695
7875
  dispatch:
7696
7876
  CompositeImplicitAutograd: set__symint
7877
+ tags: inplace_view
7697
7878
 
7698
7879
  - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!)
7699
7880
  variants: method
@@ -7702,6 +7883,7 @@
7702
7883
  dispatch:
7703
7884
  CPU, CUDA, Meta, MPS: set_tensor_
7704
7885
  autogen: set.source_Tensor, set.source_Tensor_out
7886
+ tags: inplace_view
7705
7887
 
7706
7888
  - func: set_(Tensor(a!) self) -> Tensor(a!)
7707
7889
  variants: method
@@ -7711,6 +7893,7 @@
7711
7893
  Meta: set_meta_
7712
7894
  MPS: set_mps_
7713
7895
  autogen: set, set.out
7896
+ tags: inplace_view
7714
7897
 
7715
7898
  # Not making it CompositeImplicitAutograd because lift
7716
7899
  # should be a primitive w.r.t. functorch
@@ -10106,18 +10289,21 @@
10106
10289
  variants: method, function
10107
10290
  dispatch:
10108
10291
  CompositeExplicitAutograd: alias
10292
+ NestedTensorCPU, NestedTensorCUDA: alias_nested
10109
10293
  tags: core
10110
10294
 
10111
10295
  - func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> ()
10112
10296
  variants: function
10113
10297
  dispatch:
10114
10298
  CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_
10299
+ CPU: _amp_foreach_non_finite_check_and_unscale_cpu_
10115
10300
  autogen: _amp_foreach_non_finite_check_and_unscale, _amp_foreach_non_finite_check_and_unscale.out
10116
10301
 
10117
10302
  - func: _amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!)
10118
10303
  variants: function
10119
10304
  dispatch:
10120
10305
  CUDA: _amp_update_scale_cuda_
10306
+ CPU: _amp_update_scale_cpu_
10121
10307
  autogen: _amp_update_scale, _amp_update_scale.out
10122
10308
 
10123
10309
  #- func: _cat(Tensor[] tensors, int dim=0) -> Tensor
@@ -10137,14 +10323,14 @@
10137
10323
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10138
10324
  variants: function
10139
10325
  dispatch:
10140
- CPU: foreach_tensor_add_scalar_kernel_slow
10326
+ CompositeExplicitAutograd: foreach_tensor_add_scalar_kernel_slow
10141
10327
  CUDA: foreach_tensor_add_scalar_kernel_cuda
10142
10328
 
10143
10329
  - func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10144
10330
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10145
10331
  variants: function
10146
10332
  dispatch:
10147
- CPU: foreach_tensor_add_scalar_kernel_slow_
10333
+ CompositeExplicitAutograd: foreach_tensor_add_scalar_kernel_slow_
10148
10334
  CUDA: foreach_tensor_add_scalar_kernel_cuda_
10149
10335
  autogen: _foreach_add.Scalar_out
10150
10336
 
@@ -10152,14 +10338,14 @@
10152
10338
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10153
10339
  variants: function
10154
10340
  dispatch:
10155
- CPU: foreach_tensor_add_list_kernel_slow
10341
+ CompositeExplicitAutograd: foreach_tensor_add_list_kernel_slow
10156
10342
  CUDA: foreach_tensor_add_list_kernel_cuda
10157
10343
 
10158
10344
  - func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
10159
10345
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10160
10346
  variants: function
10161
10347
  dispatch:
10162
- CPU: foreach_tensor_add_list_kernel_slow_
10348
+ CompositeExplicitAutograd: foreach_tensor_add_list_kernel_slow_
10163
10349
  CUDA: foreach_tensor_add_list_kernel_cuda_
10164
10350
  autogen: _foreach_add.List_out
10165
10351
 
@@ -10167,14 +10353,14 @@
10167
10353
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10168
10354
  variants: function
10169
10355
  dispatch:
10170
- CPU: foreach_tensor_add_scalarlist_kernel_slow
10356
+ CompositeExplicitAutograd: foreach_tensor_add_scalarlist_kernel_slow
10171
10357
  CUDA: foreach_tensor_add_scalarlist_kernel_cuda
10172
10358
 
10173
10359
  - func: _foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10174
10360
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10175
10361
  variants: function
10176
10362
  dispatch:
10177
- CPU: foreach_tensor_add_scalarlist_kernel_slow_
10363
+ CompositeExplicitAutograd: foreach_tensor_add_scalarlist_kernel_slow_
10178
10364
  CUDA: foreach_tensor_add_scalarlist_kernel_cuda_
10179
10365
  autogen: _foreach_add.ScalarList_out
10180
10366
 
@@ -10182,14 +10368,14 @@
10182
10368
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10183
10369
  variants: function
10184
10370
  dispatch:
10185
- CPU: foreach_tensor_add_tensor_kernel_slow
10371
+ CompositeExplicitAutograd: foreach_tensor_add_tensor_kernel_slow
10186
10372
  CUDA: foreach_tensor_add_tensor_kernel_cuda
10187
10373
 
10188
10374
  - func: _foreach_add_.Tensor(Tensor(a!)[] self, Tensor other, *, Scalar alpha=1) -> ()
10189
10375
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10190
10376
  variants: function
10191
10377
  dispatch:
10192
- CPU: foreach_tensor_add_tensor_kernel_slow_
10378
+ CompositeExplicitAutograd: foreach_tensor_add_tensor_kernel_slow_
10193
10379
  CUDA: foreach_tensor_add_tensor_kernel_cuda_
10194
10380
  autogen: _foreach_add.Tensor_out
10195
10381
 
@@ -10197,14 +10383,14 @@
10197
10383
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10198
10384
  variants: function
10199
10385
  dispatch:
10200
- CPU: foreach_tensor_sub_scalar_kernel_slow
10386
+ CompositeExplicitAutograd: foreach_tensor_sub_scalar_kernel_slow
10201
10387
  CUDA: foreach_tensor_sub_scalar_kernel_cuda
10202
10388
 
10203
10389
  - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10204
10390
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10205
10391
  variants: function
10206
10392
  dispatch:
10207
- CPU: foreach_tensor_sub_scalar_kernel_slow_
10393
+ CompositeExplicitAutograd: foreach_tensor_sub_scalar_kernel_slow_
10208
10394
  CUDA: foreach_tensor_sub_scalar_kernel_cuda_
10209
10395
  autogen: _foreach_sub.Scalar_out
10210
10396
 
@@ -10212,14 +10398,14 @@
10212
10398
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10213
10399
  variants: function
10214
10400
  dispatch:
10215
- CPU: foreach_tensor_sub_list_kernel_slow
10401
+ CompositeExplicitAutograd: foreach_tensor_sub_list_kernel_slow
10216
10402
  CUDA: foreach_tensor_sub_list_kernel_cuda
10217
10403
 
10218
10404
  - func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
10219
10405
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10220
10406
  variants: function
10221
10407
  dispatch:
10222
- CPU: foreach_tensor_sub_list_kernel_slow_
10408
+ CompositeExplicitAutograd: foreach_tensor_sub_list_kernel_slow_
10223
10409
  CUDA: foreach_tensor_sub_list_kernel_cuda_
10224
10410
  autogen: _foreach_sub.List_out
10225
10411
 
@@ -10227,14 +10413,14 @@
10227
10413
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10228
10414
  variants: function
10229
10415
  dispatch:
10230
- CPU: foreach_tensor_sub_scalarlist_kernel_slow
10416
+ CompositeExplicitAutograd: foreach_tensor_sub_scalarlist_kernel_slow
10231
10417
  CUDA: foreach_tensor_sub_scalarlist_kernel_cuda
10232
10418
 
10233
10419
  - func: _foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10234
10420
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10235
10421
  variants: function
10236
10422
  dispatch:
10237
- CPU: foreach_tensor_sub_scalarlist_kernel_slow_
10423
+ CompositeExplicitAutograd: foreach_tensor_sub_scalarlist_kernel_slow_
10238
10424
  CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_
10239
10425
  autogen: _foreach_sub.ScalarList_out
10240
10426
 
@@ -10242,14 +10428,14 @@
10242
10428
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10243
10429
  variants: function
10244
10430
  dispatch:
10245
- CPU: foreach_tensor_mul_scalar_kernel_slow
10431
+ CompositeExplicitAutograd: foreach_tensor_mul_scalar_kernel_slow
10246
10432
  CUDA: foreach_tensor_mul_scalar_kernel_cuda
10247
10433
 
10248
10434
  - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10249
10435
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10250
10436
  variants: function
10251
10437
  dispatch:
10252
- CPU: foreach_tensor_mul_scalar_kernel_slow_
10438
+ CompositeExplicitAutograd: foreach_tensor_mul_scalar_kernel_slow_
10253
10439
  CUDA: foreach_tensor_mul_scalar_kernel_cuda_
10254
10440
  autogen: _foreach_mul.Scalar_out
10255
10441
 
@@ -10257,14 +10443,14 @@
10257
10443
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10258
10444
  variants: function
10259
10445
  dispatch:
10260
- CPU: foreach_tensor_mul_list_kernel_slow
10446
+ CompositeExplicitAutograd: foreach_tensor_mul_list_kernel_slow
10261
10447
  CUDA: foreach_tensor_mul_list_kernel_cuda
10262
10448
 
10263
10449
  - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10264
10450
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10265
10451
  variants: function
10266
10452
  dispatch:
10267
- CPU: foreach_tensor_mul_list_kernel_slow_
10453
+ CompositeExplicitAutograd: foreach_tensor_mul_list_kernel_slow_
10268
10454
  CUDA: foreach_tensor_mul_list_kernel_cuda_
10269
10455
  autogen: _foreach_mul.List_out
10270
10456
 
@@ -10272,14 +10458,14 @@
10272
10458
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10273
10459
  variants: function
10274
10460
  dispatch:
10275
- CPU: foreach_tensor_mul_scalarlist_kernel_slow
10461
+ CompositeExplicitAutograd: foreach_tensor_mul_scalarlist_kernel_slow
10276
10462
  CUDA: foreach_tensor_mul_scalarlist_kernel_cuda
10277
10463
 
10278
10464
  - func: _foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10279
10465
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10280
10466
  variants: function
10281
10467
  dispatch:
10282
- CPU: foreach_tensor_mul_scalarlist_kernel_slow_
10468
+ CompositeExplicitAutograd: foreach_tensor_mul_scalarlist_kernel_slow_
10283
10469
  CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_
10284
10470
  autogen: _foreach_mul.ScalarList_out
10285
10471
 
@@ -10287,14 +10473,14 @@
10287
10473
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10288
10474
  variants: function
10289
10475
  dispatch:
10290
- CPU: foreach_tensor_mul_tensor_kernel_slow
10476
+ CompositeExplicitAutograd: foreach_tensor_mul_tensor_kernel_slow
10291
10477
  CUDA: foreach_tensor_mul_tensor_kernel_cuda
10292
10478
 
10293
10479
  - func: _foreach_mul_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
10294
10480
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10295
10481
  variants: function
10296
10482
  dispatch:
10297
- CPU: foreach_tensor_mul_tensor_kernel_slow_
10483
+ CompositeExplicitAutograd: foreach_tensor_mul_tensor_kernel_slow_
10298
10484
  CUDA: foreach_tensor_mul_tensor_kernel_cuda_
10299
10485
  autogen: _foreach_mul.Tensor_out
10300
10486
 
@@ -10302,14 +10488,14 @@
10302
10488
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10303
10489
  variants: function
10304
10490
  dispatch:
10305
- CPU: foreach_tensor_div_scalar_kernel_slow
10491
+ CompositeExplicitAutograd: foreach_tensor_div_scalar_kernel_slow
10306
10492
  CUDA: foreach_tensor_div_scalar_kernel_cuda
10307
10493
 
10308
10494
  - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10309
10495
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10310
10496
  variants: function
10311
10497
  dispatch:
10312
- CPU: foreach_tensor_div_scalar_kernel_slow_
10498
+ CompositeExplicitAutograd: foreach_tensor_div_scalar_kernel_slow_
10313
10499
  CUDA: foreach_tensor_div_scalar_kernel_cuda_
10314
10500
  autogen: _foreach_div.Scalar_out
10315
10501
 
@@ -10317,14 +10503,14 @@
10317
10503
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10318
10504
  variants: function
10319
10505
  dispatch:
10320
- CPU: foreach_tensor_div_list_kernel_slow
10506
+ CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow
10321
10507
  CUDA: foreach_tensor_div_list_kernel_cuda
10322
10508
 
10323
10509
  - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10324
10510
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10325
10511
  variants: function
10326
10512
  dispatch:
10327
- CPU: foreach_tensor_div_list_kernel_slow_
10513
+ CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow_
10328
10514
  CUDA: foreach_tensor_div_list_kernel_cuda_
10329
10515
  autogen: _foreach_div.List_out
10330
10516
 
@@ -10332,14 +10518,14 @@
10332
10518
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10333
10519
  variants: function
10334
10520
  dispatch:
10335
- CPU: foreach_tensor_div_scalarlist_kernel_slow
10521
+ CompositeExplicitAutograd: foreach_tensor_div_scalarlist_kernel_slow
10336
10522
  CUDA: foreach_tensor_div_scalarlist_kernel_cuda
10337
10523
 
10338
10524
  - func: _foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10339
10525
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10340
10526
  variants: function
10341
10527
  dispatch:
10342
- CPU: foreach_tensor_div_scalarlist_kernel_slow_
10528
+ CompositeExplicitAutograd: foreach_tensor_div_scalarlist_kernel_slow_
10343
10529
  CUDA: foreach_tensor_div_scalarlist_kernel_cuda_
10344
10530
  autogen: _foreach_div.ScalarList_out
10345
10531
 
@@ -10347,14 +10533,14 @@
10347
10533
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10348
10534
  variants: function
10349
10535
  dispatch:
10350
- CPU: foreach_tensor_div_tensor_kernel_slow
10536
+ CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow
10351
10537
  CUDA: foreach_tensor_div_tensor_kernel_cuda
10352
10538
 
10353
10539
  - func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
10354
10540
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10355
10541
  variants: function
10356
10542
  dispatch:
10357
- CPU: foreach_tensor_div_tensor_kernel_slow_
10543
+ CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow_
10358
10544
  CUDA: foreach_tensor_div_tensor_kernel_cuda_
10359
10545
  autogen: _foreach_div.Tensor_out
10360
10546
 
@@ -10362,14 +10548,14 @@
10362
10548
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10363
10549
  variants: function
10364
10550
  dispatch:
10365
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow
10551
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow
10366
10552
  CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
10367
10553
 
10368
10554
  - func: _foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10369
10555
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10370
10556
  variants: function
10371
10557
  dispatch:
10372
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
10558
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow_
10373
10559
  CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
10374
10560
  autogen: _foreach_clamp_max.Scalar_out
10375
10561
 
@@ -10377,14 +10563,14 @@
10377
10563
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10378
10564
  variants: function
10379
10565
  dispatch:
10380
- CPU: foreach_tensor_clamp_max_list_kernel_slow
10566
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow
10381
10567
  CUDA: foreach_tensor_clamp_max_list_kernel_cuda
10382
10568
 
10383
10569
  - func: _foreach_clamp_max_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10384
10570
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10385
10571
  variants: function
10386
10572
  dispatch:
10387
- CPU: foreach_tensor_clamp_max_list_kernel_slow_
10573
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow_
10388
10574
  CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
10389
10575
  autogen: _foreach_clamp_max.List_out
10390
10576
 
@@ -10392,14 +10578,14 @@
10392
10578
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10393
10579
  variants: function
10394
10580
  dispatch:
10395
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow
10581
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow
10396
10582
  CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda
10397
10583
 
10398
10584
  - func: _foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10399
10585
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10400
10586
  variants: function
10401
10587
  dispatch:
10402
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
10588
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow_
10403
10589
  CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
10404
10590
  autogen: _foreach_clamp_max.ScalarList_out
10405
10591
 
@@ -10407,14 +10593,14 @@
10407
10593
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10408
10594
  variants: function
10409
10595
  dispatch:
10410
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow
10596
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow
10411
10597
  CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
10412
10598
 
10413
10599
  - func: _foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10414
10600
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10415
10601
  variants: function
10416
10602
  dispatch:
10417
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
10603
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_
10418
10604
  CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
10419
10605
  autogen: _foreach_clamp_min.Scalar_out
10420
10606
 
@@ -10422,14 +10608,14 @@
10422
10608
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10423
10609
  variants: function
10424
10610
  dispatch:
10425
- CPU: foreach_tensor_clamp_min_list_kernel_slow
10611
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow
10426
10612
  CUDA: foreach_tensor_clamp_min_list_kernel_cuda
10427
10613
 
10428
10614
  - func: _foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10429
10615
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10430
10616
  variants: function
10431
10617
  dispatch:
10432
- CPU: foreach_tensor_clamp_min_list_kernel_slow_
10618
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow_
10433
10619
  CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
10434
10620
  autogen: _foreach_clamp_min.List_out
10435
10621
 
@@ -10437,14 +10623,14 @@
10437
10623
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10438
10624
  variants: function
10439
10625
  dispatch:
10440
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
10626
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow
10441
10627
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
10442
10628
 
10443
10629
  - func: _foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10444
10630
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10445
10631
  variants: function
10446
10632
  dispatch:
10447
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
10633
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow_
10448
10634
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
10449
10635
  autogen: _foreach_clamp_min.ScalarList_out
10450
10636
 
@@ -10453,14 +10639,14 @@
10453
10639
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10454
10640
  variants: function
10455
10641
  dispatch:
10456
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow
10642
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow
10457
10643
  CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
10458
10644
 
10459
10645
  - func: _foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10460
10646
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10461
10647
  variants: function
10462
10648
  dispatch:
10463
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
10649
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_
10464
10650
  CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
10465
10651
  autogen: _foreach_maximum.Scalar_out
10466
10652
 
@@ -10469,14 +10655,14 @@
10469
10655
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10470
10656
  variants: function
10471
10657
  dispatch:
10472
- CPU: foreach_tensor_clamp_min_list_kernel_slow
10658
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow
10473
10659
  CUDA: foreach_tensor_clamp_min_list_kernel_cuda
10474
10660
 
10475
10661
  - func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10476
10662
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10477
10663
  variants: function
10478
10664
  dispatch:
10479
- CPU: foreach_tensor_clamp_min_list_kernel_slow_
10665
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow_
10480
10666
  CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
10481
10667
  autogen: _foreach_maximum.List_out
10482
10668
 
@@ -10485,14 +10671,14 @@
10485
10671
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10486
10672
  variants: function
10487
10673
  dispatch:
10488
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
10674
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow
10489
10675
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
10490
10676
 
10491
10677
  - func: _foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10492
10678
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10493
10679
  variants: function
10494
10680
  dispatch:
10495
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
10681
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow_
10496
10682
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
10497
10683
  autogen: _foreach_maximum.ScalarList_out
10498
10684
 
@@ -10500,14 +10686,14 @@
10500
10686
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10501
10687
  variants: function
10502
10688
  dispatch:
10503
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow
10689
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow
10504
10690
  CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
10505
10691
 
10506
10692
  - func: _foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10507
10693
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10508
10694
  variants: function
10509
10695
  dispatch:
10510
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
10696
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow_
10511
10697
  CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
10512
10698
  autogen: _foreach_minimum.Scalar_out
10513
10699
 
@@ -10515,14 +10701,14 @@
10515
10701
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10516
10702
  variants: function
10517
10703
  dispatch:
10518
- CPU: foreach_tensor_clamp_max_list_kernel_slow
10704
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow
10519
10705
  CUDA: foreach_tensor_clamp_max_list_kernel_cuda
10520
10706
 
10521
10707
  - func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10522
10708
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10523
10709
  variants: function
10524
10710
  dispatch:
10525
- CPU: foreach_tensor_clamp_max_list_kernel_slow_
10711
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow_
10526
10712
  CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
10527
10713
  autogen: _foreach_minimum.List_out
10528
10714
 
@@ -10530,14 +10716,14 @@
10530
10716
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10531
10717
  variants: function
10532
10718
  dispatch:
10533
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow
10719
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow
10534
10720
  CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda
10535
10721
 
10536
10722
  - func: _foreach_minimum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10537
10723
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10538
10724
  variants: function
10539
10725
  dispatch:
10540
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
10726
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow_
10541
10727
  CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
10542
10728
  autogen: _foreach_minimum.ScalarList_out
10543
10729
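Editor's note: the hunks above move the slow (per-tensor loop) kernels for the `_foreach_maximum`/`_foreach_minimum` clamp ops from a CPU-only dispatch key to CompositeExplicitAutograd, so any backend without a fused multi-tensor kernel falls back to the same serial implementation. A minimal sketch of how these ops are reached, assuming the auto-generated private Python bindings (`torch._foreach_*`) that mirror the schemas above; the examples in these notes use the PyTorch Python API rather than torch-rb's Ruby wrappers, and the private bindings may change between releases.

```python
import torch

params = [torch.randn(4), torch.randn(8)]
floors = [torch.zeros(4), torch.zeros(8)]

# Scalar variant: clamp every tensor in the list from below at 0.0
# (maps to the _foreach_maximum_.Scalar schema above).
torch._foreach_maximum_(params, 0.0)

# List variant: elementwise maximum against a matching list of tensors
# (maps to _foreach_maximum_.List).
torch._foreach_maximum_(params, floors)
```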
 
@@ -10545,28 +10731,28 @@
10545
10731
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10546
10732
  variants: function
10547
10733
  dispatch:
10548
- CPU: foreach_tensor_addcdiv_scalar_slow
10734
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalar_slow
10549
10735
  CUDA: foreach_tensor_addcdiv_scalar_cuda
10550
10736
 
10551
10737
  - func: _foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
10552
10738
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10553
10739
  variants: function
10554
10740
  dispatch:
10555
- CPU: foreach_tensor_addcdiv_scalarlist_slow
10741
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalarlist_slow
10556
10742
  CUDA: foreach_tensor_addcdiv_scalarlist_cuda
10557
10743
 
10558
10744
  - func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
10559
10745
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10560
10746
  variants: function
10561
10747
  dispatch:
10562
- CPU: foreach_tensor_addcdiv_tensor_slow
10748
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_tensor_slow
10563
10749
  CUDA: foreach_tensor_addcdiv_tensor_cuda
10564
10750
 
10565
10751
  - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
10566
10752
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10567
10753
  variants: function
10568
10754
  dispatch:
10569
- CPU: foreach_tensor_addcdiv_scalar_slow_
10755
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalar_slow_
10570
10756
  CUDA: foreach_tensor_addcdiv_scalar_cuda_
10571
10757
  autogen: _foreach_addcdiv.Scalar_out
10572
10758
 
@@ -10574,7 +10760,7 @@
10574
10760
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10575
10761
  variants: function
10576
10762
  dispatch:
10577
- CPU: foreach_tensor_addcdiv_scalarlist_slow_
10763
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalarlist_slow_
10578
10764
  CUDA: foreach_tensor_addcdiv_scalarlist_cuda_
10579
10765
  autogen: _foreach_addcdiv.ScalarList_out
10580
10766
 
@@ -10582,7 +10768,7 @@
10582
10768
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10583
10769
  variants: function
10584
10770
  dispatch:
10585
- CPU: foreach_tensor_addcdiv_tensor_slow_
10771
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_tensor_slow_
10586
10772
  CUDA: foreach_tensor_addcdiv_tensor_cuda_
10587
10773
  autogen: _foreach_addcdiv.Tensor_out
10588
10774
 
@@ -10590,28 +10776,28 @@
10590
10776
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10591
10777
  variants: function
10592
10778
  dispatch:
10593
- CPU: foreach_tensor_addcmul_scalar_slow
10779
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalar_slow
10594
10780
  CUDA: foreach_tensor_addcmul_scalar_cuda
10595
10781
 
10596
10782
  - func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
10597
10783
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10598
10784
  variants: function
10599
10785
  dispatch:
10600
- CPU: foreach_tensor_addcmul_scalarlist_slow
10786
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalarlist_slow
10601
10787
  CUDA: foreach_tensor_addcmul_scalarlist_cuda
10602
10788
 
10603
10789
  - func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
10604
10790
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10605
10791
  variants: function
10606
10792
  dispatch:
10607
- CPU: foreach_tensor_addcmul_tensor_slow
10793
+ CompositeExplicitAutograd: foreach_tensor_addcmul_tensor_slow
10608
10794
  CUDA: foreach_tensor_addcmul_tensor_cuda
10609
10795
 
10610
10796
  - func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
10611
10797
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10612
10798
  variants: function
10613
10799
  dispatch:
10614
- CPU: foreach_tensor_addcmul_scalar_slow_
10800
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalar_slow_
10615
10801
  CUDA: foreach_tensor_addcmul_scalar_cuda_
10616
10802
  autogen: _foreach_addcmul.Scalar_out
10617
10803
 
@@ -10619,7 +10805,7 @@
10619
10805
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10620
10806
  variants: function
10621
10807
  dispatch:
10622
- CPU: foreach_tensor_addcmul_scalarlist_slow_
10808
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalarlist_slow_
10623
10809
  CUDA: foreach_tensor_addcmul_scalarlist_cuda_
10624
10810
  autogen: _foreach_addcmul.ScalarList_out
10625
10811
 
@@ -10627,7 +10813,7 @@
10627
10813
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10628
10814
  variants: function
10629
10815
  dispatch:
10630
- CPU: foreach_tensor_addcmul_tensor_slow_
10816
+ CompositeExplicitAutograd: foreach_tensor_addcmul_tensor_slow_
10631
10817
  CUDA: foreach_tensor_addcmul_tensor_cuda_
10632
10818
  autogen: _foreach_addcmul.Tensor_out
10633
10819
 
@@ -10635,14 +10821,14 @@
10635
10821
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10636
10822
  variants: function
10637
10823
  dispatch:
10638
- CPU: foreach_tensor_abs_slow
10824
+ CompositeExplicitAutograd: foreach_tensor_abs_slow
10639
10825
  CUDA: foreach_tensor_abs_cuda
10640
10826
 
10641
10827
  - func: _foreach_abs_(Tensor(a!)[] self) -> ()
10642
10828
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10643
10829
  variants: function
10644
10830
  dispatch:
10645
- CPU: foreach_tensor_abs_slow_
10831
+ CompositeExplicitAutograd: foreach_tensor_abs_slow_
10646
10832
  CUDA: foreach_tensor_abs_cuda_
10647
10833
  autogen: _foreach_abs.out
10648
10834
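Editor's note: `_foreach_abs` above is the first of a long run of unary foreach entries (acos, asin, atan, ceil, cos, …, trunc, zero_) that all get the identical change: the slow reference kernel becomes the CompositeExplicitAutograd fallback while CUDA keeps its fused multi-tensor kernel. An illustrative use of the private bindings, assuming they are generated one-to-one from the schemas:

```python
import torch

tensors = [torch.randn(2, 3), torch.randn(5)]

# Out-of-place: returns a new list of tensors (maps to _foreach_abs).
absolutes = torch._foreach_abs(tensors)

# In-place: mutates every tensor in the input list (maps to _foreach_abs_).
torch._foreach_abs_(tensors)
```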
 
@@ -10650,14 +10836,14 @@
10650
10836
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10651
10837
  variants: function
10652
10838
  dispatch:
10653
- CPU: foreach_tensor_acos_slow
10839
+ CompositeExplicitAutograd: foreach_tensor_acos_slow
10654
10840
  CUDA: foreach_tensor_acos_cuda
10655
10841
 
10656
10842
  - func: _foreach_acos_(Tensor(a!)[] self) -> ()
10657
10843
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10658
10844
  variants: function
10659
10845
  dispatch:
10660
- CPU: foreach_tensor_acos_slow_
10846
+ CompositeExplicitAutograd: foreach_tensor_acos_slow_
10661
10847
  CUDA: foreach_tensor_acos_cuda_
10662
10848
  autogen: _foreach_acos.out
10663
10849
 
@@ -10665,14 +10851,14 @@
10665
10851
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10666
10852
  variants: function
10667
10853
  dispatch:
10668
- CPU: foreach_tensor_asin_slow
10854
+ CompositeExplicitAutograd: foreach_tensor_asin_slow
10669
10855
  CUDA: foreach_tensor_asin_cuda
10670
10856
 
10671
10857
  - func: _foreach_asin_(Tensor(a!)[] self) -> ()
10672
10858
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10673
10859
  variants: function
10674
10860
  dispatch:
10675
- CPU: foreach_tensor_asin_slow_
10861
+ CompositeExplicitAutograd: foreach_tensor_asin_slow_
10676
10862
  CUDA: foreach_tensor_asin_cuda_
10677
10863
  autogen: _foreach_asin.out
10678
10864
 
@@ -10680,14 +10866,14 @@
10680
10866
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10681
10867
  variants: function
10682
10868
  dispatch:
10683
- CPU: foreach_tensor_atan_slow
10869
+ CompositeExplicitAutograd: foreach_tensor_atan_slow
10684
10870
  CUDA: foreach_tensor_atan_cuda
10685
10871
 
10686
10872
  - func: _foreach_atan_(Tensor(a!)[] self) -> ()
10687
10873
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10688
10874
  variants: function
10689
10875
  dispatch:
10690
- CPU: foreach_tensor_atan_slow_
10876
+ CompositeExplicitAutograd: foreach_tensor_atan_slow_
10691
10877
  CUDA: foreach_tensor_atan_cuda_
10692
10878
  autogen: _foreach_atan.out
10693
10879
 
@@ -10695,14 +10881,14 @@
10695
10881
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10696
10882
  variants: function
10697
10883
  dispatch:
10698
- CPU: foreach_tensor_ceil_slow
10884
+ CompositeExplicitAutograd: foreach_tensor_ceil_slow
10699
10885
  CUDA: foreach_tensor_ceil_cuda
10700
10886
 
10701
10887
  - func: _foreach_ceil_(Tensor(a!)[] self) -> ()
10702
10888
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10703
10889
  variants: function
10704
10890
  dispatch:
10705
- CPU: foreach_tensor_ceil_slow_
10891
+ CompositeExplicitAutograd: foreach_tensor_ceil_slow_
10706
10892
  CUDA: foreach_tensor_ceil_cuda_
10707
10893
  autogen: _foreach_ceil.out
10708
10894
 
@@ -10710,14 +10896,14 @@
10710
10896
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10711
10897
  variants: function
10712
10898
  dispatch:
10713
- CPU: foreach_tensor_cos_slow
10899
+ CompositeExplicitAutograd: foreach_tensor_cos_slow
10714
10900
  CUDA: foreach_tensor_cos_cuda
10715
10901
 
10716
10902
  - func: _foreach_cos_(Tensor(a!)[] self) -> ()
10717
10903
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10718
10904
  variants: function
10719
10905
  dispatch:
10720
- CPU: foreach_tensor_cos_slow_
10906
+ CompositeExplicitAutograd: foreach_tensor_cos_slow_
10721
10907
  CUDA: foreach_tensor_cos_cuda_
10722
10908
  autogen: _foreach_cos.out
10723
10909
 
@@ -10725,14 +10911,14 @@
10725
10911
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10726
10912
  variants: function
10727
10913
  dispatch:
10728
- CPU: foreach_tensor_cosh_slow
10914
+ CompositeExplicitAutograd: foreach_tensor_cosh_slow
10729
10915
  CUDA: foreach_tensor_cosh_cuda
10730
10916
 
10731
10917
  - func: _foreach_cosh_(Tensor(a!)[] self) -> ()
10732
10918
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10733
10919
  variants: function
10734
10920
  dispatch:
10735
- CPU: foreach_tensor_cosh_slow_
10921
+ CompositeExplicitAutograd: foreach_tensor_cosh_slow_
10736
10922
  CUDA: foreach_tensor_cosh_cuda_
10737
10923
  autogen: _foreach_cosh.out
10738
10924
 
@@ -10740,14 +10926,14 @@
10740
10926
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10741
10927
  variants: function
10742
10928
  dispatch:
10743
- CPU: foreach_tensor_erf_slow
10929
+ CompositeExplicitAutograd: foreach_tensor_erf_slow
10744
10930
  CUDA: foreach_tensor_erf_cuda
10745
10931
 
10746
10932
  - func: _foreach_erf_(Tensor(a!)[] self) -> ()
10747
10933
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10748
10934
  variants: function
10749
10935
  dispatch:
10750
- CPU: foreach_tensor_erf_slow_
10936
+ CompositeExplicitAutograd: foreach_tensor_erf_slow_
10751
10937
  CUDA: foreach_tensor_erf_cuda_
10752
10938
  autogen: _foreach_erf.out
10753
10939
 
@@ -10755,14 +10941,14 @@
10755
10941
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10756
10942
  variants: function
10757
10943
  dispatch:
10758
- CPU: foreach_tensor_erfc_slow
10944
+ CompositeExplicitAutograd: foreach_tensor_erfc_slow
10759
10945
  CUDA: foreach_tensor_erfc_cuda
10760
10946
 
10761
10947
  - func: _foreach_erfc_(Tensor(a!)[] self) -> ()
10762
10948
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10763
10949
  variants: function
10764
10950
  dispatch:
10765
- CPU: foreach_tensor_erfc_slow_
10951
+ CompositeExplicitAutograd: foreach_tensor_erfc_slow_
10766
10952
  CUDA: foreach_tensor_erfc_cuda_
10767
10953
  autogen: _foreach_erfc.out
10768
10954
 
@@ -10770,14 +10956,14 @@
10770
10956
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10771
10957
  variants: function
10772
10958
  dispatch:
10773
- CPU: foreach_tensor_exp_slow
10959
+ CompositeExplicitAutograd: foreach_tensor_exp_slow
10774
10960
  CUDA: foreach_tensor_exp_cuda
10775
10961
 
10776
10962
  - func: _foreach_exp_(Tensor(a!)[] self) -> ()
10777
10963
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10778
10964
  variants: function
10779
10965
  dispatch:
10780
- CPU: foreach_tensor_exp_slow_
10966
+ CompositeExplicitAutograd: foreach_tensor_exp_slow_
10781
10967
  CUDA: foreach_tensor_exp_cuda_
10782
10968
  autogen: _foreach_exp.out
10783
10969
 
@@ -10785,14 +10971,14 @@
10785
10971
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10786
10972
  variants: function
10787
10973
  dispatch:
10788
- CPU: foreach_tensor_expm1_slow
10974
+ CompositeExplicitAutograd: foreach_tensor_expm1_slow
10789
10975
  CUDA: foreach_tensor_expm1_cuda
10790
10976
 
10791
10977
  - func: _foreach_expm1_(Tensor(a!)[] self) -> ()
10792
10978
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10793
10979
  variants: function
10794
10980
  dispatch:
10795
- CPU: foreach_tensor_expm1_slow_
10981
+ CompositeExplicitAutograd: foreach_tensor_expm1_slow_
10796
10982
  CUDA: foreach_tensor_expm1_cuda_
10797
10983
  autogen: _foreach_expm1.out
10798
10984
 
@@ -10800,14 +10986,14 @@
10800
10986
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10801
10987
  variants: function
10802
10988
  dispatch:
10803
- CPU: foreach_tensor_floor_slow
10989
+ CompositeExplicitAutograd: foreach_tensor_floor_slow
10804
10990
  CUDA: foreach_tensor_floor_cuda
10805
10991
 
10806
10992
  - func: _foreach_floor_(Tensor(a!)[] self) -> ()
10807
10993
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10808
10994
  variants: function
10809
10995
  dispatch:
10810
- CPU: foreach_tensor_floor_slow_
10996
+ CompositeExplicitAutograd: foreach_tensor_floor_slow_
10811
10997
  CUDA: foreach_tensor_floor_cuda_
10812
10998
  autogen: _foreach_floor.out
10813
10999
 
@@ -10815,14 +11001,14 @@
10815
11001
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10816
11002
  variants: function
10817
11003
  dispatch:
10818
- CPU: foreach_tensor_frac_slow
11004
+ CompositeExplicitAutograd: foreach_tensor_frac_slow
10819
11005
  CUDA: foreach_tensor_frac_cuda
10820
11006
 
10821
11007
  - func: _foreach_frac_(Tensor(a!)[] self) -> ()
10822
11008
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10823
11009
  variants: function
10824
11010
  dispatch:
10825
- CPU: foreach_tensor_frac_slow_
11011
+ CompositeExplicitAutograd: foreach_tensor_frac_slow_
10826
11012
  CUDA: foreach_tensor_frac_cuda_
10827
11013
  autogen: _foreach_frac.out
10828
11014
 
@@ -10830,7 +11016,7 @@
10830
11016
  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10831
11017
  variants: function
10832
11018
  dispatch:
10833
- CPU: foreach_tensor_ternary_lerp_slow
11019
+ CompositeExplicitAutograd: foreach_tensor_ternary_lerp_slow
10834
11020
  CUDA: foreach_tensor_lerp_ternary_cuda
10835
11021
  autogen: _foreach_lerp.List_out
10836
11022
 
@@ -10838,7 +11024,7 @@
10838
11024
  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10839
11025
  variants: function
10840
11026
  dispatch:
10841
- CPU: foreach_tensor_ternary_lerp_slow_
11027
+ CompositeExplicitAutograd: foreach_tensor_ternary_lerp_slow_
10842
11028
  CUDA: foreach_tensor_lerp_ternary_cuda_
10843
11029
  autogen: _foreach_lerp.List_out
10844
11030
 
@@ -10846,7 +11032,7 @@
10846
11032
  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10847
11033
  variants: function
10848
11034
  dispatch:
10849
- CPU: foreach_tensor_lerp_list_kernel_slow
11035
+ CompositeExplicitAutograd: foreach_tensor_lerp_list_kernel_slow
10850
11036
  CUDA: foreach_tensor_lerp_list_cuda
10851
11037
  autogen: _foreach_lerp.Scalar_out
10852
11038
 
@@ -10854,7 +11040,7 @@
10854
11040
  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10855
11041
  variants: function
10856
11042
  dispatch:
10857
- CPU: foreach_tensor_lerp_list_kernel_slow_
11043
+ CompositeExplicitAutograd: foreach_tensor_lerp_list_kernel_slow_
10858
11044
  CUDA: foreach_tensor_lerp_list_cuda_
10859
11045
  autogen: _foreach_lerp.Scalar_out
10860
11046
 
@@ -10862,14 +11048,14 @@
10862
11048
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10863
11049
  variants: function
10864
11050
  dispatch:
10865
- CPU: foreach_tensor_lgamma_slow
11051
+ CompositeExplicitAutograd: foreach_tensor_lgamma_slow
10866
11052
  CUDA: foreach_tensor_lgamma_cuda
10867
11053
 
10868
11054
  - func: _foreach_lgamma_(Tensor(a!)[] self) -> ()
10869
11055
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10870
11056
  variants: function
10871
11057
  dispatch:
10872
- CPU: foreach_tensor_lgamma_slow_
11058
+ CompositeExplicitAutograd: foreach_tensor_lgamma_slow_
10873
11059
  CUDA: foreach_tensor_lgamma_cuda_
10874
11060
  autogen: _foreach_lgamma.out
10875
11061
 
@@ -10877,14 +11063,14 @@
10877
11063
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10878
11064
  variants: function
10879
11065
  dispatch:
10880
- CPU: foreach_tensor_log_slow
11066
+ CompositeExplicitAutograd: foreach_tensor_log_slow
10881
11067
  CUDA: foreach_tensor_log_cuda
10882
11068
 
10883
11069
  - func: _foreach_log_(Tensor(a!)[] self) -> ()
10884
11070
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10885
11071
  variants: function
10886
11072
  dispatch:
10887
- CPU: foreach_tensor_log_slow_
11073
+ CompositeExplicitAutograd: foreach_tensor_log_slow_
10888
11074
  CUDA: foreach_tensor_log_cuda_
10889
11075
  autogen: _foreach_log.out
10890
11076
 
@@ -10892,14 +11078,14 @@
10892
11078
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10893
11079
  variants: function
10894
11080
  dispatch:
10895
- CPU: foreach_tensor_log10_slow
11081
+ CompositeExplicitAutograd: foreach_tensor_log10_slow
10896
11082
  CUDA: foreach_tensor_log10_cuda
10897
11083
 
10898
11084
  - func: _foreach_log10_(Tensor(a!)[] self) -> ()
10899
11085
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10900
11086
  variants: function
10901
11087
  dispatch:
10902
- CPU: foreach_tensor_log10_slow_
11088
+ CompositeExplicitAutograd: foreach_tensor_log10_slow_
10903
11089
  CUDA: foreach_tensor_log10_cuda_
10904
11090
  autogen: _foreach_log10.out
10905
11091
 
@@ -10907,14 +11093,14 @@
10907
11093
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10908
11094
  variants: function
10909
11095
  dispatch:
10910
- CPU: foreach_tensor_log1p_slow
11096
+ CompositeExplicitAutograd: foreach_tensor_log1p_slow
10911
11097
  CUDA: foreach_tensor_log1p_cuda
10912
11098
 
10913
11099
  - func: _foreach_log1p_(Tensor(a!)[] self) -> ()
10914
11100
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10915
11101
  variants: function
10916
11102
  dispatch:
10917
- CPU: foreach_tensor_log1p_slow_
11103
+ CompositeExplicitAutograd: foreach_tensor_log1p_slow_
10918
11104
  CUDA: foreach_tensor_log1p_cuda_
10919
11105
  autogen: _foreach_log1p.out
10920
11106
 
@@ -10922,37 +11108,45 @@
10922
11108
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10923
11109
  variants: function
10924
11110
  dispatch:
10925
- CPU: foreach_tensor_log2_slow
11111
+ CompositeExplicitAutograd: foreach_tensor_log2_slow
10926
11112
  CUDA: foreach_tensor_log2_cuda
10927
11113
 
10928
11114
  - func: _foreach_log2_(Tensor(a!)[] self) -> ()
10929
11115
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10930
11116
  variants: function
10931
11117
  dispatch:
10932
- CPU: foreach_tensor_log2_slow_
11118
+ CompositeExplicitAutograd: foreach_tensor_log2_slow_
10933
11119
  CUDA: foreach_tensor_log2_cuda_
10934
11120
  autogen: _foreach_log2.out
10935
11121
 
11122
+ - func: _foreach_max(Tensor[] self) -> Tensor[]
11123
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11124
+ variants: function
11125
+ dispatch:
11126
+ CompositeExplicitAutograd: foreach_tensor_max_slow
11127
+ CUDA: foreach_tensor_max_cuda
11128
+ autogen: _foreach_max.out
11129
+
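Editor's note: `_foreach_max` is newly added here, with a slow CompositeExplicitAutograd fallback and a fused CUDA kernel. Per the schema above it reduces each tensor in the list to its maximum. A hedged sketch, assuming the binding is generated as `torch._foreach_max` (its exact Python exposure depends on the PyTorch version the gem was built against):

```python
import torch

tensors = [torch.tensor([1.0, 5.0, 3.0]), torch.tensor([-2.0, 0.0])]

# Expected to return one 0-dim tensor per input: [tensor(5.), tensor(0.)]
maxima = torch._foreach_max(tensors)
```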
10936
11130
  - func: _foreach_neg(Tensor[] self) -> Tensor[]
10937
11131
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10938
11132
  variants: function
10939
11133
  dispatch:
10940
- CPU: foreach_tensor_neg_slow
11134
+ CompositeExplicitAutograd: foreach_tensor_neg_slow
10941
11135
  CUDA: foreach_tensor_neg_cuda
10942
11136
 
10943
11137
  - func: _foreach_neg_(Tensor(a!)[] self) -> ()
10944
11138
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10945
11139
  variants: function
10946
11140
  dispatch:
10947
- CPU: foreach_tensor_neg_slow_
11141
+ CompositeExplicitAutograd: foreach_tensor_neg_slow_
10948
11142
  CUDA: foreach_tensor_neg_cuda_
10949
11143
  autogen: _foreach_neg.out
10950
11144
 
10951
- - func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[]
11145
+ - func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2, ScalarType? dtype=None) -> Tensor[]
10952
11146
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10953
11147
  variants: function
10954
11148
  dispatch:
10955
- CPU: foreach_tensor_norm_slow
11149
+ CompositeExplicitAutograd: foreach_tensor_norm_slow
10956
11150
  CUDA: foreach_tensor_norm_cuda
10957
11151
  autogen: _foreach_norm.Scalar_out
10958
11152
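Editor's note: `_foreach_norm.Scalar` gains an optional `dtype` argument (mirroring `torch.linalg.vector_norm`), letting the per-tensor reduction accumulate in a different type. A common public entry point that sits on top of a foreach-norm reduction is gradient clipping; a sketch, noting that the foreach fast path is an implementation detail that may vary by build, and that the `dtype` keyword is assumed from the updated schema above:

```python
import torch

model = torch.nn.Linear(16, 4)
loss = model(torch.randn(8, 16)).sum()
loss.backward()

# clip_grad_norm_ computes per-parameter norms (a _foreach_norm-style reduction
# on the fast path) and rescales all gradients in one call.
total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

# The private binding can also be called directly with an L2 norm per tensor.
grads = [p.grad for p in model.parameters()]
norms = torch._foreach_norm(grads, 2)
```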
 
@@ -10960,35 +11154,35 @@
10960
11154
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10961
11155
  variants: function
10962
11156
  dispatch:
10963
- CPU: foreach_tensor_pow_list_kernel_slow
11157
+ CompositeExplicitAutograd: foreach_tensor_pow_list_kernel_slow
10964
11158
  CUDA: foreach_tensor_pow_list_kernel_cuda
10965
11159
 
10966
11160
  - func: _foreach_pow.Scalar(Tensor[] self, Scalar exponent) -> Tensor[]
10967
11161
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10968
11162
  variants: function
10969
11163
  dispatch:
10970
- CPU: foreach_tensor_pow_scalar_kernel_slow
11164
+ CompositeExplicitAutograd: foreach_tensor_pow_scalar_kernel_slow
10971
11165
  CUDA: foreach_tensor_pow_scalar_kernel_cuda
10972
11166
 
10973
11167
  - func: _foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[]
10974
11168
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10975
11169
  variants: function
10976
11170
  dispatch:
10977
- CPU: foreach_tensor_pow_scalarlist_kernel_slow
11171
+ CompositeExplicitAutograd: foreach_tensor_pow_scalarlist_kernel_slow
10978
11172
  CUDA: foreach_tensor_pow_scalarlist_kernel_cuda
10979
11173
 
10980
11174
  - func: _foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[]
10981
11175
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10982
11176
  variants: function
10983
11177
  dispatch:
10984
- CPU: foreach_scalar_pow_list_kernel_slow
11178
+ CompositeExplicitAutograd: foreach_scalar_pow_list_kernel_slow
10985
11179
  CUDA: foreach_scalar_pow_list_kernel_cuda
10986
11180
 
10987
11181
  - func: _foreach_pow_.List(Tensor(a!)[] self, Tensor[] exponent) -> ()
10988
11182
  device_check: NoCheck
10989
11183
  variants: function
10990
11184
  dispatch:
10991
- CPU: foreach_tensor_pow_list_kernel_slow_
11185
+ CompositeExplicitAutograd: foreach_tensor_pow_list_kernel_slow_
10992
11186
  CUDA: foreach_tensor_pow_list_kernel_cuda_
10993
11187
  autogen: _foreach_pow.List_out
10994
11188
 
@@ -10996,7 +11190,7 @@
10996
11190
  device_check: NoCheck
10997
11191
  variants: function
10998
11192
  dispatch:
10999
- CPU: foreach_tensor_pow_scalar_kernel_slow_
11193
+ CompositeExplicitAutograd: foreach_tensor_pow_scalar_kernel_slow_
11000
11194
  CUDA: foreach_tensor_pow_scalar_kernel_cuda_
11001
11195
  autogen: _foreach_pow.Scalar_out
11002
11196
 
@@ -11004,7 +11198,7 @@
11004
11198
  device_check: NoCheck
11005
11199
  variants: function
11006
11200
  dispatch:
11007
- CPU: foreach_tensor_pow_scalarlist_kernel_slow_
11201
+ CompositeExplicitAutograd: foreach_tensor_pow_scalarlist_kernel_slow_
11008
11202
  CUDA: foreach_tensor_pow_scalarlist_kernel_cuda_
11009
11203
  autogen: _foreach_pow.ScalarList_out
11010
11204
 
@@ -11012,14 +11206,14 @@
11012
11206
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11013
11207
  variants: function
11014
11208
  dispatch:
11015
- CPU: foreach_tensor_reciprocal_slow
11209
+ CompositeExplicitAutograd: foreach_tensor_reciprocal_slow
11016
11210
  CUDA: foreach_tensor_reciprocal_cuda
11017
11211
 
11018
11212
  - func: _foreach_reciprocal_(Tensor(a!)[] self) -> ()
11019
11213
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11020
11214
  variants: function
11021
11215
  dispatch:
11022
- CPU: foreach_tensor_reciprocal_slow_
11216
+ CompositeExplicitAutograd: foreach_tensor_reciprocal_slow_
11023
11217
  CUDA: foreach_tensor_reciprocal_cuda_
11024
11218
  autogen: _foreach_reciprocal.out
11025
11219
 
@@ -11027,14 +11221,14 @@
11027
11221
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11028
11222
  variants: function
11029
11223
  dispatch:
11030
- CPU: foreach_tensor_round_slow
11224
+ CompositeExplicitAutograd: foreach_tensor_round_slow
11031
11225
  CUDA: foreach_tensor_round_cuda
11032
11226
 
11033
11227
  - func: _foreach_round_(Tensor(a!)[] self) -> ()
11034
11228
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11035
11229
  variants: function
11036
11230
  dispatch:
11037
- CPU: foreach_tensor_round_slow_
11231
+ CompositeExplicitAutograd: foreach_tensor_round_slow_
11038
11232
  CUDA: foreach_tensor_round_cuda_
11039
11233
  autogen: _foreach_round.out
11040
11234
 
@@ -11042,14 +11236,14 @@
11042
11236
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11043
11237
  variants: function
11044
11238
  dispatch:
11045
- CPU: foreach_tensor_sigmoid_slow
11239
+ CompositeExplicitAutograd: foreach_tensor_sigmoid_slow
11046
11240
  CUDA: foreach_tensor_sigmoid_cuda
11047
11241
 
11048
11242
  - func: _foreach_sigmoid_(Tensor(a!)[] self) -> ()
11049
11243
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11050
11244
  variants: function
11051
11245
  dispatch:
11052
- CPU: foreach_tensor_sigmoid_slow_
11246
+ CompositeExplicitAutograd: foreach_tensor_sigmoid_slow_
11053
11247
  CUDA: foreach_tensor_sigmoid_cuda_
11054
11248
  autogen: _foreach_sigmoid.out
11055
11249
 
@@ -11057,14 +11251,14 @@
11057
11251
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11058
11252
  variants: function
11059
11253
  dispatch:
11060
- CPU: foreach_tensor_sign_slow
11254
+ CompositeExplicitAutograd: foreach_tensor_sign_slow
11061
11255
  CUDA: foreach_tensor_sign_cuda
11062
11256
 
11063
11257
  - func: _foreach_sign_(Tensor(a!)[] self) -> ()
11064
11258
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11065
11259
  variants: function
11066
11260
  dispatch:
11067
- CPU: foreach_tensor_sign_slow_
11261
+ CompositeExplicitAutograd: foreach_tensor_sign_slow_
11068
11262
  CUDA: foreach_tensor_sign_cuda_
11069
11263
  autogen: _foreach_sign.out
11070
11264
 
@@ -11072,14 +11266,14 @@
11072
11266
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11073
11267
  variants: function
11074
11268
  dispatch:
11075
- CPU: foreach_tensor_sin_slow
11269
+ CompositeExplicitAutograd: foreach_tensor_sin_slow
11076
11270
  CUDA: foreach_tensor_sin_cuda
11077
11271
 
11078
11272
  - func: _foreach_sin_(Tensor(a!)[] self) -> ()
11079
11273
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11080
11274
  variants: function
11081
11275
  dispatch:
11082
- CPU: foreach_tensor_sin_slow_
11276
+ CompositeExplicitAutograd: foreach_tensor_sin_slow_
11083
11277
  CUDA: foreach_tensor_sin_cuda_
11084
11278
  autogen: _foreach_sin.out
11085
11279
 
@@ -11087,14 +11281,14 @@
11087
11281
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11088
11282
  variants: function
11089
11283
  dispatch:
11090
- CPU: foreach_tensor_sinh_slow
11284
+ CompositeExplicitAutograd: foreach_tensor_sinh_slow
11091
11285
  CUDA: foreach_tensor_sinh_cuda
11092
11286
 
11093
11287
  - func: _foreach_sinh_(Tensor(a!)[] self) -> ()
11094
11288
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11095
11289
  variants: function
11096
11290
  dispatch:
11097
- CPU: foreach_tensor_sinh_slow_
11291
+ CompositeExplicitAutograd: foreach_tensor_sinh_slow_
11098
11292
  CUDA: foreach_tensor_sinh_cuda_
11099
11293
  autogen: _foreach_sinh.out
11100
11294
 
@@ -11102,14 +11296,14 @@
11102
11296
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11103
11297
  variants: function
11104
11298
  dispatch:
11105
- CPU: foreach_tensor_sqrt_slow
11299
+ CompositeExplicitAutograd: foreach_tensor_sqrt_slow
11106
11300
  CUDA: foreach_tensor_sqrt_cuda
11107
11301
 
11108
11302
  - func: _foreach_sqrt_(Tensor(a!)[] self) -> ()
11109
11303
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11110
11304
  variants: function
11111
11305
  dispatch:
11112
- CPU: foreach_tensor_sqrt_slow_
11306
+ CompositeExplicitAutograd: foreach_tensor_sqrt_slow_
11113
11307
  CUDA: foreach_tensor_sqrt_cuda_
11114
11308
  autogen: _foreach_sqrt.out
11115
11309
 
@@ -11117,14 +11311,14 @@
11117
11311
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11118
11312
  variants: function
11119
11313
  dispatch:
11120
- CPU: foreach_tensor_tan_slow
11314
+ CompositeExplicitAutograd: foreach_tensor_tan_slow
11121
11315
  CUDA: foreach_tensor_tan_cuda
11122
11316
 
11123
11317
  - func: _foreach_tan_(Tensor(a!)[] self) -> ()
11124
11318
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11125
11319
  variants: function
11126
11320
  dispatch:
11127
- CPU: foreach_tensor_tan_slow_
11321
+ CompositeExplicitAutograd: foreach_tensor_tan_slow_
11128
11322
  CUDA: foreach_tensor_tan_cuda_
11129
11323
  autogen: _foreach_tan.out
11130
11324
 
@@ -11132,14 +11326,14 @@
11132
11326
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11133
11327
  variants: function
11134
11328
  dispatch:
11135
- CPU: foreach_tensor_tanh_slow
11329
+ CompositeExplicitAutograd: foreach_tensor_tanh_slow
11136
11330
  CUDA: foreach_tensor_tanh_cuda
11137
11331
 
11138
11332
  - func: _foreach_tanh_(Tensor(a!)[] self) -> ()
11139
11333
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11140
11334
  variants: function
11141
11335
  dispatch:
11142
- CPU: foreach_tensor_tanh_slow_
11336
+ CompositeExplicitAutograd: foreach_tensor_tanh_slow_
11143
11337
  CUDA: foreach_tensor_tanh_cuda_
11144
11338
  autogen: _foreach_tanh.out
11145
11339
 
@@ -11147,14 +11341,14 @@
11147
11341
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11148
11342
  variants: function
11149
11343
  dispatch:
11150
- CPU: foreach_tensor_trunc_slow
11344
+ CompositeExplicitAutograd: foreach_tensor_trunc_slow
11151
11345
  CUDA: foreach_tensor_trunc_cuda
11152
11346
 
11153
11347
  - func: _foreach_trunc_(Tensor(a!)[] self) -> ()
11154
11348
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11155
11349
  variants: function
11156
11350
  dispatch:
11157
- CPU: foreach_tensor_trunc_slow_
11351
+ CompositeExplicitAutograd: foreach_tensor_trunc_slow_
11158
11352
  CUDA: foreach_tensor_trunc_cuda_
11159
11353
  autogen: _foreach_trunc.out
11160
11354
 
@@ -11162,7 +11356,7 @@
11162
11356
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11163
11357
  variants: function
11164
11358
  dispatch:
11165
- CPU: foreach_tensor_zero_slow_
11359
+ CompositeExplicitAutograd: foreach_tensor_zero_slow_
11166
11360
  CUDA: foreach_tensor_zero_cuda_
11167
11361
  autogen: _foreach_zero, _foreach_zero.out
11168
11362
 
@@ -11170,9 +11364,15 @@
11170
11364
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11171
11365
  variants: function
11172
11366
  dispatch:
11173
- CPU: foreach_tensor_copy_list_kernel_slow_
11367
+ CompositeExplicitAutograd: foreach_tensor_copy_list_kernel_slow_
11174
11368
  CUDA: foreach_tensor_copy_list_kernel_cuda_
11175
- autogen: _foreach_copy, _foreach_copy.out
11369
+ autogen: _foreach_copy.out
11370
+
11371
+ - func: _foreach_copy(Tensor[] self, Tensor[] src, bool non_blocking=False) -> Tensor[] self_out
11372
+ device_check: NoCheck
11373
+ variants: function
11374
+ dispatch:
11375
+ CompositeExplicitAutograd: _foreach_copy
11176
11376
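Editor's note: `_foreach_copy_` keeps its in-place schema, but its autogen line now only produces the `.out` variant; a separate functional `_foreach_copy` that returns the copied list is registered explicitly. A sketch of the in-place form, which is the one typically used to copy a list of source tensors into a matching list of destinations (the functional variant is new in this range and assumed to follow the schema above):

```python
import torch

dst = [torch.zeros(3), torch.zeros(2, 2)]
src = [torch.ones(3), torch.full((2, 2), 7.0)]

# Copies src[i] into dst[i] for every pair (maps to _foreach_copy_ above).
torch._foreach_copy_(dst, src)
```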
 
11177
11377
  - func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
11178
11378
  dispatch:
@@ -12341,6 +12541,7 @@
12341
12541
  dispatch:
12342
12542
  CPU: upsample_linear1d_out_cpu
12343
12543
  CUDA: upsample_linear1d_out_cuda
12544
+ MPS: upsample_linear1d_out_mps
12344
12545
 
12345
12546
  - func: upsample_linear1d(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None) -> Tensor
12346
12547
  python_module: nn
@@ -12352,6 +12553,7 @@
12352
12553
  dispatch:
12353
12554
  CPU: upsample_linear1d_backward_out_cpu
12354
12555
  CUDA: upsample_linear1d_backward_out_cuda
12556
+ MPS: upsample_linear1d_backward_out_mps
12355
12557
 
12356
12558
  - func: upsample_linear1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None) -> Tensor
12357
12559
  python_module: nn
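Editor's note: the two hunks above register MPS kernels for the `upsample_linear1d` forward and backward out variants, so 1-D linear interpolation can run natively on Apple-silicon GPUs. The usual public entry point is `torch.nn.functional.interpolate`; a sketch that falls back to CPU when an MPS device is not available in the local build:

```python
import torch
import torch.nn.functional as F

device = "mps" if torch.backends.mps.is_available() else "cpu"

x = torch.randn(2, 4, 16, device=device)  # (batch, channels, length)
y = F.interpolate(x, size=32, mode="linear", align_corners=False)
print(y.shape)  # torch.Size([2, 4, 32])
```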
@@ -12824,7 +13026,7 @@
12824
13026
  SparseMeta: isinf_sparse_meta
12825
13027
  SparseCsrCPU, SparseCsrCUDA: isinf_sparse_csr
12826
13028
  autogen: isinf.out
12827
- tags: core
13029
+ tags: [core, pointwise]
12828
13030
 
12829
13031
  - func: record_stream(Tensor(a!) self, Stream s) -> ()
12830
13032
  variants: method
@@ -13750,11 +13952,18 @@
13750
13952
  dispatch:
13751
13953
  CPU, CUDA: linalg_eig_out
13752
13954
 
13955
+ - func: _linalg_eigvals(Tensor self) -> Tensor
13956
+ python_module: linalg
13957
+ dispatch:
13958
+ CPU, CUDA: _linalg_eigvals
13959
+
13753
13960
  - func: linalg_eigvals(Tensor self) -> Tensor
13754
13961
  python_module: linalg
13755
13962
 
13756
13963
  - func: linalg_eigvals.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
13757
13964
  python_module: linalg
13965
+ dispatch:
13966
+ CPU, CUDA: linalg_eigvals_out
13758
13967
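Editor's note: `linalg_eigvals.out` now dispatches directly to a CPU/CUDA kernel, and a private `_linalg_eigvals` helper is added. The public API is unchanged; a quick sketch (eigenvalues of a general real matrix come back complex, so the preallocated output must use a complex dtype):

```python
import torch

a = torch.randn(3, 3)

# Functional form.
w = torch.linalg.eigvals(a)

# Preallocated out= form, which the new linalg_eigvals_out kernel backs.
out = torch.empty(3, dtype=torch.complex64)
torch.linalg.eigvals(a, out=out)
```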
 
13759
13968
  # This function is exposes the `compute_v` flag, which is then used to implement `linalg.eigh` and
13760
13969
  # `linalg.eigvalsh` as composite functions that call this one
@@ -14058,6 +14267,12 @@
14058
14267
  # It is undocumented and should not be used outside of tests.
14059
14268
  - func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor
14060
14269
 
14270
+ # Note: for testing COW materialization within `at::parallel_for` loop function
14271
+ - func: _test_parallel_materialize(Tensor self, int num_parallel, bool skip_first=False) -> Tensor
14272
+ variants: function
14273
+ dispatch:
14274
+ CompositeExplicitAutograd: _test_parallel_materialize
14275
+
14061
14276
  # Note: this function is only for testing.
14062
14277
  - func: _test_optional_intlist(Tensor values, int[]? addends) -> Tensor
14063
14278
  python_module: nn
@@ -14392,6 +14607,7 @@
14392
14607
  variants: function
14393
14608
  dispatch:
14394
14609
  CompositeExplicitAutograd: split_with_sizes_copy_out
14610
+ CUDA: split_with_sizes_copy_out_cuda
14395
14611
 
14396
14612
  - func: view_copy(Tensor self, SymInt[] size) -> Tensor
14397
14613
  variants: function
@@ -14428,6 +14644,16 @@
14428
14644
  NestedTensorCUDA: NestedTensor_to_padded_tensor_cuda
14429
14645
  autogen: to_padded_tensor.out
14430
14646
 
14647
+ - func: _jagged_to_padded_dense_forward(Tensor values, Tensor[] offsets, SymInt[] max_lengths, float padding_value=0.0) -> Tensor
14648
+ variants: function
14649
+ dispatch:
14650
+ CUDA: _fbgemm_jagged_to_padded_dense_forward
14651
+
14652
+ - func: _padded_dense_to_jagged_forward(Tensor dense, Tensor[] offsets, SymInt? total_L=None) -> Tensor
14653
+ variants: function
14654
+ dispatch:
14655
+ CUDA: _fbgemm_dense_to_jagged_forward_symint
14656
+
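Editor's note: the two entries above add private, CUDA-only ops backed by FBGEMM kernels for converting between jagged (nested) and padded dense layouts. The public-facing workflow they accelerate is nested-tensor padding; a sketch using the stable nested API (the new `_jagged_to_padded_dense_forward` / `_padded_dense_to_jagged_forward` ops themselves are internal):

```python
import torch

# Two sequences of lengths 2 and 4 with feature size 3.
nt = torch.nested.nested_tensor([torch.randn(2, 3), torch.randn(4, 3)])

# Pad the ragged dimension so both sequences have length 4.
padded = torch.nested.to_padded_tensor(nt, 0.0)
print(padded.shape)  # torch.Size([2, 4, 3])
```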
14431
14657
  - func: _nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor
14432
14658
  dispatch:
14433
14659
  NestedTensorCPU: NestedTensor_softmax_dropout
@@ -14468,19 +14694,28 @@
14468
14694
 
14469
14695
  - func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14470
14696
  dispatch:
14471
- CPU: _scaled_dot_product_flash_attention_cpu
14472
14697
  CUDA: _scaled_dot_product_flash_attention_cuda
14473
14698
  NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda
14474
14699
  tags: nondeterministic_seeded
14475
14700
 
14701
+ - func: _scaled_dot_product_flash_attention_for_cpu(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor output, Tensor logsumexp)
14702
+ dispatch:
14703
+ CPU: _scaled_dot_product_flash_attention_cpu
14704
+ tags: nondeterministic_seeded
14705
+
14476
14706
  - func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
14477
14707
  device_check: NoCheck
14478
14708
  variants: function
14479
14709
  dispatch:
14480
- CPU: _scaled_dot_product_flash_attention_backward_cpu
14481
14710
  CUDA: _scaled_dot_product_flash_attention_backward_cuda
14482
14711
  NestedTensorCUDA: _scaled_dot_product_flash_attention_backward_nested
14483
14712
 
14713
+ - func: _scaled_dot_product_flash_attention_for_cpu_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, float dropout_p, bool is_causal, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
14714
+ device_check: NoCheck
14715
+ variants: function
14716
+ dispatch:
14717
+ CPU: _scaled_dot_product_flash_attention_cpu_backward
14718
+
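Editor's note: CPU flash attention moves out of the generic `_scaled_dot_product_flash_attention` entry into a dedicated `_scaled_dot_product_flash_attention_for_cpu` op with its own backward, and a cuDNN-backed SDPA variant is added in the next hunk. All of these are reached through the same public API, with backend selection handled automatically and depending on the build; a sketch:

```python
import torch
import torch.nn.functional as F

q = torch.randn(1, 8, 128, 64)  # (batch, heads, seq_len, head_dim)
k = torch.randn(1, 8, 128, 64)
v = torch.randn(1, 8, 128, 64)

# On CPU this can route to the dedicated flash-attention-for-cpu kernel above.
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
print(out.shape)  # torch.Size([1, 8, 128, 64])
```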
14484
14719
  - func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)
14485
14720
  dispatch:
14486
14721
  CUDA: _scaled_dot_product_efficient_attention_cuda
@@ -14493,26 +14728,36 @@
14493
14728
  CUDA: _scaled_dot_product_efficient_attention_backward_cuda
14494
14729
  tags: nondeterministic_seeded
14495
14730
 
14496
- - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14731
+ - func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14732
+ dispatch:
14733
+ CUDA: _scaled_dot_product_cudnn_attention_cuda
14734
+ tags: nondeterministic_seeded
14735
+
14736
+ - func: _scaled_dot_product_cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
14737
+ dispatch:
14738
+ CUDA: _scaled_dot_product_cudnn_attention_backward_cuda
14739
+ tags: nondeterministic_seeded
14740
+
14741
+ - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? alibi_slopes=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14497
14742
  variants: function
14498
14743
  dispatch:
14499
14744
  CUDA: _flash_attention_forward
14500
14745
  tags: nondeterministic_seeded
14501
14746
 
14502
- - func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
14747
+ - func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None) -> (Tensor, Tensor, Tensor)
14503
14748
  device_check: NoCheck
14504
14749
  variants: function
14505
14750
  dispatch:
14506
14751
  CUDA: _flash_attention_backward
14507
14752
 
14508
- # Returns ouput, logsumexp if compute_logsumexp
14509
- - func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
14753
+ # Returns output, logsumexp if compute_logsumexp
14754
+ - func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt? max_seqlen_q, SymInt? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? seqlen_k=None, int? window_size=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
14510
14755
  variants: function
14511
14756
  dispatch:
14512
14757
  CUDA: _efficient_attention_forward
14513
14758
  tags: nondeterministic_seeded
14514
14759
 
14515
- - func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor)
14760
+ - func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None, int? window_size=None, bool shared_storage_dqdkdv=False) -> (Tensor, Tensor, Tensor, Tensor)
14516
14761
  device_check: NoCheck
14517
14762
  variants: function
14518
14763
  dispatch:
@@ -15312,11 +15557,11 @@
15312
15557
  CPU: foobar
15313
15558
  autogen: _foobar.out
15314
15559
 
15315
- # Fused Optimizer CUDA kernels.
15316
15560
  - func: _fused_adam_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
15317
15561
  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
15318
15562
  variants: function
15319
15563
  dispatch:
15564
+ CPU: _fused_adam_kernel_cpu_
15320
15565
  CUDA: _fused_adam_kernel_cuda_
15321
15566
  autogen: _fused_adam, _fused_adam.out
15322
15567
 
@@ -15326,6 +15571,7 @@
15326
15571
  device_check: NoCheck
15327
15572
  variants: function
15328
15573
  dispatch:
15574
+ CPU: _fused_adam_kernel_cpu_
15329
15575
  CUDA: _fused_adam_kernel_cuda_
15330
15576
  autogen: _fused_adam.tensor_lr, _fused_adam.tensor_lr_out
15331
15577
 
@@ -15333,6 +15579,7 @@
15333
15579
  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
15334
15580
  variants: function
15335
15581
  dispatch:
15582
+ CPU: _fused_adamw_kernel_cpu_
15336
15583
  CUDA: _fused_adamw_kernel_cuda_
15337
15584
  autogen: _fused_adamw, _fused_adamw.out
15338
15585
 
@@ -15342,9 +15589,34 @@
15342
15589
  device_check: NoCheck
15343
15590
  variants: function
15344
15591
  dispatch:
15592
+ CPU: _fused_adamw_kernel_cpu_
15345
15593
  CUDA: _fused_adamw_kernel_cuda_
15346
15594
  autogen: _fused_adamw.tensor_lr, _fused_adamw.tensor_lr_out
15347
15595
 
15596
+ - func: _fused_sgd_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, float lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
15597
+ # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
15598
+ variants: function
15599
+ dispatch:
15600
+ CPU: _fused_sgd_kernel_cpu_
15601
+ CUDA: _fused_sgd_kernel_cuda_
15602
+ autogen: _fused_sgd, _fused_sgd.out
15603
+
15604
+ - func: _fused_sgd_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, Tensor lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
15605
+ # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
15606
+ # but still skip the device check as the Tensor LR can be on CPU
15607
+ device_check: NoCheck
15608
+ variants: function
15609
+ dispatch:
15610
+ CPU: _fused_sgd_kernel_cpu_
15611
+ CUDA: _fused_sgd_kernel_cuda_
15612
+ autogen: _fused_sgd.tensor_lr, _fused_sgd.tensor_lr_out
15613
+
15614
+ - func: _fused_adagrad_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
15615
+ variants: function
15616
+ dispatch:
15617
+ CPU: _fused_adagrad_kernel_cpu_
15618
+ autogen: _fused_adagrad, _fused_adagrad.out
15619
+
15348
15620
  # This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts.
15349
15621
  - func: _propagate_xla_data(Tensor input, Tensor output) -> ()
15350
15622
  variants: function