torch-rb 0.15.0 → 0.17.0

@@ -134,7 +134,7 @@
  autogen: _new_zeros_with_same_feature_meta.out

  # This function compares the storage numel of self with that of other, where
- # storage numel is cumputed as: `other.storage().nbytes() / other.itemsize()`.
+ # storage numel is computed as: `other.storage().nbytes() / other.itemsize()`.
  # We create this function for composite compliance purposes. The batching rule
  # always returns true because vmapped as_strided does not support accessing
  # storage locations not indexable by the input tensor.
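The storage-numel formula in the comment above can be checked directly from Python; a minimal illustrative sketch (not part of the diff — `untyped_storage()` and `element_size()` are the standard PyTorch accessors):

```python
import torch

base = torch.arange(10, dtype=torch.float32)  # 10 elements -> 40 bytes of storage
view = base[::2]                              # 5 visible elements, same storage

storage_numel = view.untyped_storage().nbytes() // view.element_size()
print(view.numel())    # 5
print(storage_numel)   # 10, i.e. nbytes() / itemsize() of the shared storage
```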
@@ -175,12 +175,24 @@
  CPU: _assert_async_msg_cpu
  CUDA: _assert_async_msg_cuda

+ - func: _assert_scalar(Scalar self, str assert_msg) -> ()
+ dispatch:
+ CompositeExplicitAutograd: _assert_scalar
+
+ - func: _functional_assert_scalar(Scalar self, str assert_msg, Tensor dep_token) -> Tensor
+ dispatch:
+ CompositeExplicitAutograd: _functional_assert_scalar
+
  - func: _functional_assert_async.msg(Tensor self, str assert_msg, Tensor dep_token) -> Tensor
  dispatch:
  CPU: _functional_assert_async_msg_cpu

  - func: _assert_tensor_metadata(Tensor a, SymInt[]? size=None, SymInt[]? stride=None, ScalarType? dtype=None) -> ()

+ - func: _print(str s) -> ()
+ dispatch:
+ CompositeExplicitAutograd: _print
+
  - func: sym_constrain_range(Scalar size, *, int? min=None, int? max=None) -> ()
  dispatch:
  CompositeExplicitAutograd: sym_constrain_range
@@ -470,6 +482,7 @@
  - func: conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
  CPU, CUDA: conj_physical_out
+ MPS: conj_physical_out_mps
  SparseCPU, SparseCUDA: conj_physical_out_sparse
  SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr_out
  tags: pointwise
@@ -536,8 +549,8 @@
  structured_delegate: add.out
  variants: function, method
  dispatch:
- SparseCPU, SparseCUDA: add_sparse
- SparseCsrCPU, SparseCsrCUDA: add_sparse_csr
+ SparseCPU, SparseCUDA, SparseMeta: add_sparse
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr
  MkldnnCPU: mkldnn_add
  ZeroTensor: add_zerotensor
  NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor
@@ -548,8 +561,8 @@
  variants: method
  structured_delegate: add.out
  dispatch:
- SparseCPU, SparseCUDA: add_sparse_
- SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_
+ SparseCPU, SparseCUDA, SparseMeta: add_sparse_
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_
  MkldnnCPU: mkldnn_add_
  NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor
  tags: pointwise
@@ -562,10 +575,10 @@
  Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf)
  ScalarOnly: add (Bool)
  dispatch:
- SparseCPU: add_out_sparse_cpu
+ SparseCPU, SparseMeta: add_out_sparse_cpu
  SparseCUDA: add_out_sparse_cuda
- SparseCsrCPU: add_out_sparse_csr_cpu
- SparseCsrCUDA: add_out_sparse_csr_cuda
+ SparseCsrCPU, SparseCsrMeta: add_out_sparse_compressed_cpu
+ SparseCsrCUDA: add_out_sparse_compressed_cuda
  MkldnnCPU: mkldnn_add_out
  MPS: add_out_mps
  tags: pointwise
@@ -763,7 +776,7 @@
  dispatch:
  CompositeExplicitAutograd: arange

- # This operator should be named `aragne.start_out` if following the naming convention. However that
+ # This operator should be named `arange.start_out` if following the naming convention. However that
  # name is already taken. Disabled because of CI job failures.
  # FIXME: enable this
  #- func: arange.start_out_(Scalar start, Scalar end, *, Tensor(a!) out) -> Tensor(a!)
@@ -1220,6 +1233,13 @@
  CompositeExplicitAutograd: copysign_out
  tags: pointwise

+ - func: _lazy_clone(Tensor self) -> Tensor
+ # Like clone, but the copy takes place lazily, only if either the
+ # input or the output are written.
+ variants: function, method
+ dispatch:
+ CompositeExplicitAutograd: _lazy_clone
+
  - func: logical_not(Tensor self) -> Tensor
  device_check: NoCheck # TensorIterator
  variants: function, method
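`_lazy_clone`, added above, is documented as a clone whose copy is deferred until either side is written. A hedged sketch of the observable behavior (assuming the op is reachable as `torch._lazy_clone`, as in recent PyTorch builds; the values must end up identical to a plain `clone`):

```python
import torch

x = torch.ones(3)
y = torch._lazy_clone(x)   # assumption: exposed as torch._lazy_clone

x.add_(1)                  # first write triggers the deferred copy
print(x)                   # tensor([2., 2., 2.])
print(y)                   # tensor([1., 1., 1.]) -- same result as x.clone()
```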
@@ -1621,6 +1641,7 @@
  - func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
  CPU, CUDA: complex_out
+ MPS: complex_out_mps

  - func: polar(Tensor abs, Tensor angle) -> Tensor
  variants: function
@@ -1729,6 +1750,7 @@
  - func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor
  variants: function
  dispatch:
+ Meta: copy_meta
  CompositeExplicitAutogradNonFunctional: copy
  tags: core

@@ -1847,7 +1869,10 @@
  - func: cudnn_convolution(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor
  dispatch:
  CUDA: cudnn_convolution
- autogen: cudnn_convolution.out
+
+ - func: cudnn_convolution.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)
+ dispatch:
+ CUDA: cudnn_convolution_out

  - func: cudnn_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor
  dispatch:
@@ -2346,7 +2371,7 @@
  Meta: empty_meta_symint
  MkldnnCPU: empty_mkldnn
  SparseCPU, SparseCUDA, SparseMeta: empty_sparse
- SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_sparse_compressed
  QuantizedCPU, QuantizedCUDA, QuantizedMeta: empty_unknown_quantized
  tags: core

@@ -2452,7 +2477,7 @@
  CompositeExplicitAutograd: empty_like
  QuantizedCPU, QuantizedCUDA: empty_like_quantized
  SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo
- SparseCsrCPU, SparseCsrCUDA: empty_like_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr
  NestedTensorCPU, NestedTensorCUDA: empty_like_nested
  autogen: empty_like.out

@@ -2954,12 +2979,14 @@
  dispatch:
  CPU: _fft_r2c_mkl
  CUDA: _fft_r2c_cufft
+ MPS: _fft_r2c_mps

  - func: _fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
  dispatch:
  CPU: _fft_r2c_mkl_out
  CUDA: _fft_r2c_cufft_out
+ MPS: _fft_r2c_mps_out

  # Complex to real inverse FFT
  - func: _fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor
@@ -2967,12 +2994,14 @@
  dispatch:
  CPU: _fft_c2r_mkl
  CUDA: _fft_c2r_cufft
+ MPS: _fft_c2r_mps

  - func: _fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
  dispatch:
  CPU: _fft_c2r_mkl_out
  CUDA: _fft_c2r_cufft_out
+ MPS: _fft_c2r_mps_out

  # Standard complex to complex FFT (forward or backward)
  - func: _fft_c2c(Tensor self, SymInt[] dim, int normalization, bool forward) -> Tensor
@@ -2980,12 +3009,14 @@
  dispatch:
  CPU: _fft_c2c_mkl
  CUDA: _fft_c2c_cufft
+ MPS: _fft_c2c_mps

  - func: _fft_c2c.out(Tensor self, SymInt[] dim, int normalization, bool forward, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
  dispatch:
  CPU: _fft_c2c_mkl_out
  CUDA: _fft_c2c_cufft_out
+ MPS: _fft_c2c_mps_out

  - func: _validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
  device_check: NoCheck
@@ -3097,6 +3128,7 @@
  structured: True
  dispatch:
  CPU, CUDA: isin_Tensor_Tensor_out
+ MPS: isin_Tensor_Tensor_out_mps

  - func: isin.Tensor_Tensor(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor
  variants: function
@@ -3238,6 +3270,8 @@
  autogen: native_layer_norm_backward.out
  tags: core

+ - func: rms_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor
+
  - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
  variants: function, method
  dispatch:
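The new `rms_norm` schema above follows the standard RMSNorm definition. A hedged reference sketch (normalizing over the last dimension only and using eps=1e-6 as an illustrative default; the schema itself leaves eps optional):

```python
import torch

def rms_norm_reference(x, weight=None, eps=1e-6):
    # y = x / sqrt(mean(x^2) + eps), optionally scaled elementwise by weight
    rms = x.pow(2).mean(dim=-1, keepdim=True).add(eps).sqrt()
    y = x / rms
    return y * weight if weight is not None else y

x = torch.randn(2, 8)
print(rms_norm_reference(x, torch.ones(8)).shape)  # torch.Size([2, 8])
```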
@@ -3302,14 +3336,39 @@
  dispatch:
  CUDA: _cslt_compress

- - func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False) -> Tensor
+ - func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False, int alg_id=0) -> Tensor
  dispatch:
  CUDA: _cslt_sparse_mm

- - func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None) -> Tensor
+ - func: _cslt_sparse_mm_search(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False) -> int
+ dispatch:
+ CUDA: _cslt_sparse_mm_search
+
+ - func: _sparse_semi_structured_tile(Tensor input, str algorithm="", bool use_cutlass=True) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
+ dispatch:
+ CUDA: _sparse_semi_structured_tile
+
+ - func: _sparse_semi_structured_apply(Tensor input, Tensor thread_masks) -> (Tensor, Tensor)
+ dispatch:
+ CUDA: _sparse_semi_structured_apply
+
+ - func: _sparse_semi_structured_apply_dense(Tensor input, Tensor thread_masks) -> Tensor
+ dispatch:
+ CUDA: _sparse_semi_structured_apply_dense
+
+ # DEPRECATED: Use torch.__sparse_semi_structured_mm/torch._sparse_semi_structured_addmm instead
+ - func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None, ScalarType? out_dtype=None) -> Tensor
  dispatch:
  CUDA: _sparse_semi_structured_linear

+ - func: _sparse_semi_structured_mm(Tensor mat1, Tensor mat1_meta, Tensor mat2, *, ScalarType? out_dtype=None) -> Tensor
+ dispatch:
+ CUDA: _sparse_semi_structured_mm
+
+ - func: _sparse_semi_structured_addmm(Tensor input, Tensor mat1, Tensor mat1_meta, Tensor mat2, *, Scalar alpha=1, Scalar beta=1, ScalarType? out_dtype=None) -> Tensor
+ dispatch:
+ CUDA: _sparse_semi_structured_addmm
+
  - func: _mixed_dtypes_linear(Tensor input, Tensor weight, Tensor scale, *, Tensor? bias=None, str? activation=None) -> Tensor
  dispatch:
  CUDA: _mixed_dtypes_linear
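The `_cslt_sparse_mm_search` entry above returns an `int`, which reads as an algorithm id for the new `alg_id` argument of `_cslt_sparse_mm`. A heavily hedged sketch of that pairing (private, CUDA-only cuSPARSELt ops; the input must already follow the 2:4 semi-structured pattern, and availability depends on the build):

```python
import torch

# Build a float16 CUDA matrix with a 2:4 sparsity pattern (2 nonzeros per group of 4).
A = torch.randn(64, 64, dtype=torch.float16, device="cuda")
mask = torch.tensor([1, 1, 0, 0], dtype=torch.float16, device="cuda").repeat(64, 16)
A = A * mask
B = torch.randn(64, 64, dtype=torch.float16, device="cuda")

A_compressed = torch._cslt_compress(A)                       # schema earlier in this hunk
alg_id = torch._cslt_sparse_mm_search(A_compressed, B)       # benchmark available kernels
out = torch._cslt_sparse_mm(A_compressed, B, alg_id=alg_id)  # reuse the selected algorithm
```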
@@ -4050,20 +4109,30 @@

  - func: _int_mm(Tensor self, Tensor mat2) -> Tensor
  dispatch:
+ CPU: _int_mm_cpu
  CUDA: _int_mm_cuda

  - func: _int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
+ CPU: _int_mm_out_cpu
  CUDA: _int_mm_out_cuda

  - func: _convert_weight_to_int4pack(Tensor self, int innerKTiles) -> Tensor
  dispatch:
+ CPU: _convert_weight_to_int4pack_cpu
  CUDA: _convert_weight_to_int4pack_cuda

  - func: _weight_int4pack_mm(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros) -> Tensor
  dispatch:
+ CPU: _weight_int4pack_mm_cpu
+ MPS: _weight_int4pack_mm_mps
  CUDA: _weight_int4pack_mm_cuda

+ - func: _weight_int8pack_mm(Tensor self, Tensor mat2, Tensor scales) -> Tensor
+ dispatch:
+ CPU: _weight_int8pack_mm_cpu
+ MPS: _weight_int8pack_mm_mps
+
  - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
  python_module: sparse
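`_int_mm` gains CPU kernels in the hunk above, and the int4/int8 weight-packed matmuls gain CPU/MPS kernels. A hedged sketch of the plain `_int_mm` op (int8 inputs accumulated into int32; backend-specific shape restrictions may apply):

```python
import torch

a = torch.randint(-128, 127, (32, 64), dtype=torch.int8)
b = torch.randint(-128, 127, (64, 32), dtype=torch.int8)

c = torch._int_mm(a, b)   # int8 @ int8 -> int32 accumulation
print(c.dtype, c.shape)   # torch.int32 torch.Size([32, 32])
```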

@@ -4439,7 +4508,6 @@
  MPS: pixel_shuffle_mps
  CompositeExplicitAutogradNonFunctional: math_pixel_shuffle
  autogen: pixel_shuffle.out
- tags: core

  - func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor
  dispatch:
@@ -4810,7 +4878,7 @@
  device_guard: False
  dispatch:
  CompositeImplicitAutograd: reshape_symint
- CompositeImplicitAutogradNestedTensor: reshape_nested
+ CompositeImplicitAutogradNestedTensor: reshape_nested_symint

  - func: _reshape_copy(Tensor self, SymInt[] size) -> Tensor
  variants: function
@@ -4969,6 +5037,7 @@
  device_check: NoCheck # TensorIterator
  python_module: nn
  dispatch:
+ QuantizedCPU: gelu_quantized_cpu_
  NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu_

  - func: gelu(Tensor self, *, str approximate='none') -> Tensor
@@ -5356,6 +5425,21 @@
  CompositeExplicitAutograd: slice_backward
  autogen: slice_backward.out

+ # NB: This op exists to back the implementation of reverse view_funcs for various views (chunk,
+ # slice.Tensor, split_with_sizes, et al.). Currently, these are only used during fake-ification
+ # of PT2 graph input subclass instances that are views. This means:
+ # * This op shouldn't really show up in eager mode (so e.g. XLA shouldn't have to implement it)
+ # * This op shouldn't show up in a PT2 graph (so a PT2 backend shouldn't have to implement it)
+ # * A subclass will have to implement this to work in PT2 if a subclass view is used as a graph
+ # input AND the view utilizes this op in its inverse. The idea is that slice_inverse() is
+ # easier to implement for a subclass than as_strided()
+ - func: slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
+ variants: function, method
+ device_check: NoCheck
+ device_guard: False
+ dispatch:
+ CompositeExplicitAutograd: slice_inverse_symint
+
  - func: slice_scatter(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor
  variants: function, method
  device_check: NoCheck
@@ -5363,7 +5447,7 @@
  dispatch:
  CompositeExplicitAutogradNonFunctional: slice_scatter
  autogen: slice_scatter.out
- tags: core
+ tags: [core, view_copy]

  - func: select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor
  variants: function, method
@@ -5562,6 +5646,16 @@
  SparseCPU: _sspaddmm_out_cpu
  SparseCUDA: _sspaddmm_out_cuda

+ - func: _chunk_cat(Tensor[] tensors, int dim, int num_chunks) -> Tensor
+ dispatch:
+ CompositeExplicitAutograd: _chunk_cat
+ CUDA: _chunk_cat_cuda
+
+ - func: _chunk_cat.out(Tensor[] tensors, int dim, int num_chunks, *, Tensor(a!) out) -> Tensor(a!)
+ dispatch:
+ CompositeExplicitAutograd: _chunk_cat_out
+ CUDA: _chunk_cat_out_cuda
+
  - func: stack(Tensor[] tensors, int dim=0) -> Tensor
  dispatch:
  CompositeExplicitAutograd: stack
@@ -5626,8 +5720,8 @@
  variants: function, method
  dispatch:
  CompositeExplicitAutograd: sum
- SparseCPU, SparseCUDA: sum_coo
- SparseCsrCPU, SparseCsrCUDA: sum_csr
+ SparseCPU, SparseCUDA, SparseMeta: sum_coo
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr
  autogen: sum.out

  - func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
@@ -5753,6 +5847,7 @@
  variants: function
  dispatch:
  CPU, CUDA: std_mean
+ MPS: std_mean_mps
  autogen: std_mean.correction_out

  - func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
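`std_mean` only gains an MPS kernel here; the public call is unchanged. For reference:

```python
import torch

x = torch.randn(4, 8)
std, mean = torch.std_mean(x, dim=1, correction=1)  # one fused pass instead of std() + mean()
print(std.shape, mean.shape)                        # torch.Size([4]) torch.Size([4])
```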
@@ -6008,7 +6103,6 @@
  CPU, MPS: roll
  CUDA: roll_cuda
  autogen: roll.out
- tags: core

  # default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args

@@ -6091,6 +6185,58 @@
  CompositeExplicitAutogradNonFunctional: _nested_view_from_buffer_copy
  autogen: _nested_view_from_buffer_copy.out

+ - func: _nested_view_from_jagged(Tensor(a) self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor(a)
+ variants: function
+ device_check: NoCheck
+ dispatch: {}
+
+ - func: _nested_view_from_jagged_copy(Tensor self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor
+ variants: function
+ device_check: NoCheck
+ tags: view_copy
+ dispatch:
+ CompositeExplicitAutogradNonFunctional: _nested_view_from_jagged_copy
+ autogen: _nested_view_from_jagged_copy.out
+
+ - func: _nested_get_values(Tensor(a) self) -> Tensor(a)
+ variants: function
+ device_check: NoCheck
+ dispatch: {}
+
+ - func: _nested_get_values_copy(Tensor self) -> Tensor
+ variants: function
+ device_check: NoCheck
+ tags: view_copy
+ dispatch:
+ CompositeExplicitAutogradNonFunctional: _nested_get_values_copy
+ autogen: _nested_get_values_copy.out
+
+ - func: _nested_get_offsets(Tensor self) -> Tensor
+ variants: function
+ device_check: NoCheck
+ dispatch: {}
+
+ # returns undefined Tensor if no lengths present
+ - func: _nested_get_lengths(Tensor self) -> Tensor
+ variants: function
+ device_check: NoCheck
+ dispatch: {}
+
+ - func: _nested_get_ragged_idx(Tensor self) -> int
+ variants: function
+ device_check: NoCheck
+ dispatch: {}
+
+ - func: _nested_get_jagged_dummy(Tensor any) -> Tensor
+ category_override: dummy
+ dispatch: {}
+
+ - func: _nested_compute_contiguous_strides_offsets(Tensor nested_size) -> (Tensor, Tensor)
+ variants: function
+ device_check: NoCheck
+ dispatch:
+ CPU, CUDA: _nested_compute_contiguous_strides_offsets
+
  - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor
  dispatch:
  # calls unsqueeze
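The `_nested_view_from_jagged` / `_nested_get_*` block above backs jagged-layout nested tensors; user code normally reaches it through `torch.nested`. A hedged sketch of the public-facing layer (the jagged layout exists in recent PyTorch releases; exposure through torch-rb is not stated in this diff):

```python
import torch

nt = torch.nested.nested_tensor(
    [torch.randn(2, 4), torch.randn(5, 4)],
    layout=torch.jagged,    # packed values + offsets, one ragged dimension
)
print(nt.values().shape)    # torch.Size([7, 4]) -- components packed along dim 0
print(nt.offsets())         # tensor([0, 2, 7]) -- boundaries of each component
```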
@@ -6275,6 +6421,7 @@
  variants: function
  dispatch:
  CPU, CUDA: var_mean
+ MPS: var_mean_mps
  autogen: var_mean.correction_out

  - func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
@@ -6295,15 +6442,13 @@
  device_check: NoCheck # TensorIterator
  variants: function, method
  dispatch:
- CPU, CUDA: where
- MPS: where_mps
+ CPU, CUDA, MPS: where
  tags: [core, pointwise]

  - func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck # TensorIterator
  dispatch:
- CPU, CUDA: where_self_out
- MPS: where_self_out_mps
+ CPU, CUDA, MPS: where_self_out

  - func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor
  variants: function
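The `where` entries above only fold the MPS kernel into the shared CPU/CUDA registration; behavior is unchanged. For reference:

```python
import torch

cond = torch.tensor([True, False, True])
a = torch.tensor([1, 2, 3])
b = torch.tensor([10, 20, 30])
print(torch.where(cond, a, b))  # tensor([ 1, 20,  3])
```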
@@ -6357,7 +6502,7 @@
  CPU: _efficientzerotensor
  CUDA: _efficientzerotensor_cuda
  MPS: _efficientzerotensor_mps
- Meta: _efficientzerotensor_meta
+ Meta: _efficientzerotensor_meta_symint
  autogen: _efficientzerotensor.out

  - func: zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -6434,6 +6579,32 @@
  SparseCPU, SparseCUDA: norm_sparse
  autogen: native_norm.ScalarOpt_dim_dtype_out

+ - func: _batch_norm_with_update(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor)
+ dispatch:
+ CPU: _batch_norm_with_update_cpu
+ CUDA: _batch_norm_with_update_cuda
+ MPS: _batch_norm_with_update_mps
+ MkldnnCPU: _batch_norm_with_update_mkldnn
+ autogen: _batch_norm_with_update_functional
+
+ - func: _batch_norm_with_update.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd, Tensor(g!) reserve) -> (Tensor(d!), Tensor(e!), Tensor(f!), Tensor(g!))
+ dispatch:
+ CPU: _batch_norm_with_update_cpu_out
+ CUDA: _batch_norm_with_update_cuda_out
+ MPS: _batch_norm_with_update_mps_out
+
+ - func: _batch_norm_no_update(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor)
+ dispatch:
+ CompositeExplicitAutograd: _batch_norm_no_update
+ autogen: _batch_norm_no_update.out
+
+ - func: batch_norm_backward(Tensor grad_out, Tensor input, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, bool update, float eps, bool[3] output_mask, Tensor reserve) -> (Tensor, Tensor, Tensor)
+ dispatch:
+ CPU: _new_batch_norm_backward_cpu
+ CUDA: _new_batch_norm_backward_cuda
+ MPS: _new_batch_norm_backward_mps
+ MkldnnCPU: _new_batch_norm_backward_mkldnn
+
  # TODO: reduce signatures down to one when optional args is available
  - func: _sparse_sum(Tensor self) -> Tensor
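The `_batch_norm_with_update` / `_batch_norm_no_update` pair introduced above consolidates the training path (running stats updated in place) and the inference path behind dedicated ops; the user-facing batch-norm call is unchanged. For reference (public API only; which internal op is hit is an implementation detail):

```python
import torch
import torch.nn.functional as F

x = torch.randn(8, 3, 4, 4)
running_mean = torch.zeros(3)
running_var = torch.ones(3)

# training=True corresponds to the "with update" path (running stats are mutated);
# training=False corresponds to the "no update" path.
y = F.batch_norm(x, running_mean, running_var, training=True, momentum=0.1, eps=1e-5)
print(y.shape)  # torch.Size([8, 3, 4, 4])
```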

@@ -6644,7 +6815,7 @@
  MPS: zero_mps_
  Meta: zero_meta_
  SparseCPU, SparseCUDA, SparseMeta: zero_sparse_
- SparseCsrCPU, SparseCsrCUDA: zero_sparse_csr_
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: zero_sparse_csr_
  MkldnnCPU: mkldnn_zero_
  NestedTensorCPU, NestedTensorCUDA: zero_nested_
  autogen: zero, zero.out
@@ -6934,7 +7105,11 @@
  # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given
  # the default would never make sense.

- - func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+ - func: _sparse_compressed_tensor_with_dims(int nnz, int dense_dim, int[] size, int[] blocksize, ScalarType index_dtype, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+ dispatch:
+ CompositeExplicitAutograd: sparse_compressed_tensor_with_dims
+
+ - func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
  dispatch:
  CompositeExplicitAutograd: sparse_compressed_tensor

@@ -6951,7 +7126,10 @@
  - func: sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
  - func: sparse_bsc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor

- - func: _sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+ - func: _sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+ dispatch:
+ CompositeImplicitAutograd: _sparse_compressed_tensor_unsafe_symint
+
  - func: _sparse_csr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  - func: _sparse_csc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  - func: _sparse_bsr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -7035,9 +7213,9 @@
  - func: sparse_dim(Tensor self) -> int
  variants: method
  dispatch:
- CPU, CUDA: sparse_dim_strided
  SparseCPU, SparseCUDA, SparseMeta: sparse_dim_sparse
- SparseCsrCPU, SparseCsrCUDA: sparse_dim_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_dim_sparse_csr
+ CompositeExplicitAutograd: sparse_dim_default
  device_check: NoCheck
  device_guard: False

@@ -7052,9 +7230,9 @@
  - func: dense_dim(Tensor self) -> int
  variants: method
  dispatch:
- CPU, CUDA: dense_dim_strided
  SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse
- SparseCsrCPU, SparseCsrCUDA: dense_dim_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr
+ CompositeExplicitAutograd: dense_dim_default
  device_check: NoCheck
  device_guard: False

@@ -7070,7 +7248,7 @@
  variants: method
  dispatch:
  SparseCPU, SparseCUDA, SparseMeta: _nnz_sparse
- SparseCsrCPU, SparseCsrCUDA: _nnz_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _nnz_sparse_csr
  device_check: NoCheck
  device_guard: False

@@ -7133,7 +7311,7 @@
  variants: method
  dispatch:
  SparseCPU, SparseCUDA, SparseMeta: values_sparse
- SparseCsrCPU, SparseCsrCUDA: values_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: values_sparse_csr
  NestedTensorCPU, NestedTensorCUDA: values_nested
  CompositeExplicitAutograd: values_default
  device_check: NoCheck
@@ -7142,7 +7320,7 @@
  - func: crow_indices(Tensor(a) self) -> Tensor(a)
  variants: method
  dispatch:
- SparseCsrCPU, SparseCsrCUDA: crow_indices_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: crow_indices_sparse_csr
  CompositeExplicitAutograd: crow_indices_default
  device_check: NoCheck
  device_guard: False
@@ -7150,7 +7328,7 @@
  - func: col_indices(Tensor(a) self) -> Tensor(a)
  variants: method
  dispatch:
- SparseCsrCPU, SparseCsrCUDA: col_indices_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: col_indices_sparse_csr
  CompositeExplicitAutograd: col_indices_default
  device_check: NoCheck
  device_guard: False
@@ -7158,7 +7336,7 @@
  - func: ccol_indices(Tensor(a) self) -> Tensor(a)
  variants: method
  dispatch:
- SparseCsrCPU, SparseCsrCUDA: ccol_indices_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ccol_indices_sparse_csr
  CompositeExplicitAutograd: ccol_indices_default
  device_check: NoCheck
  device_guard: False
@@ -7166,7 +7344,7 @@
  - func: row_indices(Tensor(a) self) -> Tensor(a)
  variants: method
  dispatch:
- SparseCsrCPU, SparseCsrCUDA: row_indices_sparse_csr
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: row_indices_sparse_csr
  CompositeExplicitAutograd: row_indices_default
  device_check: NoCheck
  device_guard: False
@@ -7185,7 +7363,7 @@
  device_check: NoCheck # Allows copy into different device
  variants: function
  dispatch:
- SparseCPU, SparseCUDA: copy_sparse_
+ SparseCPU, SparseCUDA, SparseMeta: copy_sparse_
  autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out

  # By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors
@@ -7288,7 +7466,7 @@
  MkldnnCPU: mkldnn_reorder_conv2d_weight
  autogen: mkldnn_reorder_conv2d_weight.out

- - func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1) -> Tensor
+ - func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor
  variants: function
  python_module: nn
  dispatch:
@@ -7536,7 +7714,7 @@

  - func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType

- - func: can_cast(ScalarType from, ScalarType to) -> bool
+ - func: can_cast(ScalarType from_, ScalarType to) -> bool
  variants: function

  - func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType
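The `can_cast` change above only renames the first parameter (`from` is a reserved word in several languages that generate bindings from these schemas); positional calls behave as before:

```python
import torch

print(torch.can_cast(torch.int32, torch.float32))  # True
print(torch.can_cast(torch.float32, torch.int32))  # False
```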
@@ -7675,6 +7853,7 @@
  dispatch:
  CPU, CUDA, Meta, MPS: set_
  autogen: set.source_Storage, set.source_Storage_out
+ tags: inplace_view

  - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)
  variants: method
@@ -7687,6 +7866,7 @@
  MPS: set_storage_mps_
  QuantizedCPU, QuantizedCUDA: set_storage_quantized_
  autogen: set.source_Storage_storage_offset, set.source_Storage_storage_offset_out
+ tags: inplace_view

  - func: set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)
  variants: method
@@ -7694,6 +7874,7 @@
  device_guard: False
  dispatch:
  CompositeImplicitAutograd: set__symint
+ tags: inplace_view

  - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!)
  variants: method
@@ -7702,6 +7883,7 @@
  dispatch:
  CPU, CUDA, Meta, MPS: set_tensor_
  autogen: set.source_Tensor, set.source_Tensor_out
+ tags: inplace_view

  - func: set_(Tensor(a!) self) -> Tensor(a!)
  variants: method
@@ -7711,6 +7893,7 @@
  Meta: set_meta_
  MPS: set_mps_
  autogen: set, set.out
+ tags: inplace_view

  # Not making it CompositeImplicitAutograd because lift
  # should be a primitive w.r.t. functorch
@@ -10106,18 +10289,21 @@
  variants: method, function
  dispatch:
  CompositeExplicitAutograd: alias
+ NestedTensorCPU, NestedTensorCUDA: alias_nested
  tags: core

  - func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> ()
  variants: function
  dispatch:
  CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_
+ CPU: _amp_foreach_non_finite_check_and_unscale_cpu_
  autogen: _amp_foreach_non_finite_check_and_unscale, _amp_foreach_non_finite_check_and_unscale.out

  - func: _amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!)
  variants: function
  dispatch:
  CUDA: _amp_update_scale_cuda_
+ CPU: _amp_update_scale_cpu_
  autogen: _amp_update_scale, _amp_update_scale.out

  #- func: _cat(Tensor[] tensors, int dim=0) -> Tensor
@@ -10137,14 +10323,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_add_scalar_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_add_scalar_kernel_slow
  CUDA: foreach_tensor_add_scalar_kernel_cuda

  - func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_add_scalar_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_add_scalar_kernel_slow_
  CUDA: foreach_tensor_add_scalar_kernel_cuda_
  autogen: _foreach_add.Scalar_out
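The long run of `_foreach_*` hunks that follows re-registers the CPU "slow" kernels under `CompositeExplicitAutograd`; the foreach API itself is unchanged. For reference:

```python
import torch

params = [torch.ones(2), torch.ones(3)]
torch._foreach_add_(params, 1.0)   # in-place: add 1.0 to every tensor in the list
print(params[0], params[1])        # tensor([2., 2.]) tensor([2., 2., 2.])
```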

@@ -10152,14 +10338,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_add_list_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_add_list_kernel_slow
  CUDA: foreach_tensor_add_list_kernel_cuda

  - func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_add_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_add_list_kernel_slow_
  CUDA: foreach_tensor_add_list_kernel_cuda_
  autogen: _foreach_add.List_out

@@ -10167,14 +10353,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_add_scalarlist_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_add_scalarlist_kernel_slow
  CUDA: foreach_tensor_add_scalarlist_kernel_cuda

  - func: _foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_add_scalarlist_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_add_scalarlist_kernel_slow_
  CUDA: foreach_tensor_add_scalarlist_kernel_cuda_
  autogen: _foreach_add.ScalarList_out

@@ -10182,14 +10368,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_add_tensor_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_add_tensor_kernel_slow
  CUDA: foreach_tensor_add_tensor_kernel_cuda

  - func: _foreach_add_.Tensor(Tensor(a!)[] self, Tensor other, *, Scalar alpha=1) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_add_tensor_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_add_tensor_kernel_slow_
  CUDA: foreach_tensor_add_tensor_kernel_cuda_
  autogen: _foreach_add.Tensor_out

@@ -10197,14 +10383,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_sub_scalar_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_sub_scalar_kernel_slow
  CUDA: foreach_tensor_sub_scalar_kernel_cuda

  - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_sub_scalar_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_sub_scalar_kernel_slow_
  CUDA: foreach_tensor_sub_scalar_kernel_cuda_
  autogen: _foreach_sub.Scalar_out

@@ -10212,14 +10398,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_sub_list_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_sub_list_kernel_slow
  CUDA: foreach_tensor_sub_list_kernel_cuda

  - func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_sub_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_sub_list_kernel_slow_
  CUDA: foreach_tensor_sub_list_kernel_cuda_
  autogen: _foreach_sub.List_out

@@ -10227,14 +10413,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_sub_scalarlist_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_sub_scalarlist_kernel_slow
  CUDA: foreach_tensor_sub_scalarlist_kernel_cuda

  - func: _foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_sub_scalarlist_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_sub_scalarlist_kernel_slow_
  CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_
  autogen: _foreach_sub.ScalarList_out

@@ -10242,14 +10428,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_mul_scalar_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_mul_scalar_kernel_slow
  CUDA: foreach_tensor_mul_scalar_kernel_cuda

  - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_mul_scalar_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_mul_scalar_kernel_slow_
  CUDA: foreach_tensor_mul_scalar_kernel_cuda_
  autogen: _foreach_mul.Scalar_out

@@ -10257,14 +10443,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_mul_list_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_mul_list_kernel_slow
  CUDA: foreach_tensor_mul_list_kernel_cuda

  - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_mul_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_mul_list_kernel_slow_
  CUDA: foreach_tensor_mul_list_kernel_cuda_
  autogen: _foreach_mul.List_out

@@ -10272,14 +10458,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_mul_scalarlist_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_mul_scalarlist_kernel_slow
  CUDA: foreach_tensor_mul_scalarlist_kernel_cuda

  - func: _foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_mul_scalarlist_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_mul_scalarlist_kernel_slow_
  CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_
  autogen: _foreach_mul.ScalarList_out

@@ -10287,14 +10473,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_mul_tensor_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_mul_tensor_kernel_slow
  CUDA: foreach_tensor_mul_tensor_kernel_cuda

  - func: _foreach_mul_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_mul_tensor_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_mul_tensor_kernel_slow_
  CUDA: foreach_tensor_mul_tensor_kernel_cuda_
  autogen: _foreach_mul.Tensor_out

@@ -10302,14 +10488,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_div_scalar_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_div_scalar_kernel_slow
  CUDA: foreach_tensor_div_scalar_kernel_cuda

  - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_div_scalar_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_div_scalar_kernel_slow_
  CUDA: foreach_tensor_div_scalar_kernel_cuda_
  autogen: _foreach_div.Scalar_out

@@ -10317,14 +10503,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_div_list_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow
  CUDA: foreach_tensor_div_list_kernel_cuda

  - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_div_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow_
  CUDA: foreach_tensor_div_list_kernel_cuda_
  autogen: _foreach_div.List_out

@@ -10332,14 +10518,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_div_scalarlist_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_div_scalarlist_kernel_slow
  CUDA: foreach_tensor_div_scalarlist_kernel_cuda

  - func: _foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_div_scalarlist_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_div_scalarlist_kernel_slow_
  CUDA: foreach_tensor_div_scalarlist_kernel_cuda_
  autogen: _foreach_div.ScalarList_out

@@ -10347,14 +10533,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_div_tensor_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow
  CUDA: foreach_tensor_div_tensor_kernel_cuda

  - func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_div_tensor_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow_
  CUDA: foreach_tensor_div_tensor_kernel_cuda_
  autogen: _foreach_div.Tensor_out

@@ -10362,14 +10548,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow
  CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda

  - func: _foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow_
  CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
  autogen: _foreach_clamp_max.Scalar_out

@@ -10377,14 +10563,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_max_list_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow
  CUDA: foreach_tensor_clamp_max_list_kernel_cuda

  - func: _foreach_clamp_max_.List(Tensor(a!)[] self, Tensor[] other) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_max_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow_
  CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
  autogen: _foreach_clamp_max.List_out

@@ -10392,14 +10578,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow
  CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda

  - func: _foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow_
  CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
  autogen: _foreach_clamp_max.ScalarList_out

@@ -10407,14 +10593,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow
  CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda

  - func: _foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_
  CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
  autogen: _foreach_clamp_min.Scalar_out

@@ -10422,14 +10608,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_min_list_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow
  CUDA: foreach_tensor_clamp_min_list_kernel_cuda

  - func: _foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_min_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow_
  CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
  autogen: _foreach_clamp_min.List_out

@@ -10437,14 +10623,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda

  - func: _foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow_
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
  autogen: _foreach_clamp_min.ScalarList_out

@@ -10453,14 +10639,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow
  CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda

  - func: _foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_
  CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
  autogen: _foreach_maximum.Scalar_out

@@ -10469,14 +10655,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_min_list_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow
  CUDA: foreach_tensor_clamp_min_list_kernel_cuda

  - func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_min_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow_
  CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
  autogen: _foreach_maximum.List_out

@@ -10485,14 +10671,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda

  - func: _foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow_
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
  autogen: _foreach_maximum.ScalarList_out

@@ -10500,14 +10686,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow
  CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda

  - func: _foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow_
  CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
  autogen: _foreach_minimum.Scalar_out

@@ -10515,14 +10701,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_max_list_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow
  CUDA: foreach_tensor_clamp_max_list_kernel_cuda

  - func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_max_list_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow_
  CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
  autogen: _foreach_minimum.List_out

@@ -10530,14 +10716,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow
  CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda

  - func: _foreach_minimum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow_
  CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
  autogen: _foreach_minimum.ScalarList_out

@@ -10545,28 +10731,28 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_addcdiv_scalar_slow
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalar_slow
  CUDA: foreach_tensor_addcdiv_scalar_cuda

  - func: _foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_addcdiv_scalarlist_slow
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalarlist_slow
  CUDA: foreach_tensor_addcdiv_scalarlist_cuda

  - func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_addcdiv_tensor_slow
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_tensor_slow
  CUDA: foreach_tensor_addcdiv_tensor_cuda

  - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_addcdiv_scalar_slow_
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalar_slow_
  CUDA: foreach_tensor_addcdiv_scalar_cuda_
  autogen: _foreach_addcdiv.Scalar_out

@@ -10574,7 +10760,7 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10575
10761
  variants: function
10576
10762
  dispatch:
10577
- CPU: foreach_tensor_addcdiv_scalarlist_slow_
10763
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalarlist_slow_
10578
10764
  CUDA: foreach_tensor_addcdiv_scalarlist_cuda_
10579
10765
  autogen: _foreach_addcdiv.ScalarList_out
10580
10766
 
@@ -10582,7 +10768,7 @@
10582
10768
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10583
10769
  variants: function
10584
10770
  dispatch:
10585
- CPU: foreach_tensor_addcdiv_tensor_slow_
10771
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_tensor_slow_
10586
10772
  CUDA: foreach_tensor_addcdiv_tensor_cuda_
10587
10773
  autogen: _foreach_addcdiv.Tensor_out
10588
10774
 
@@ -10590,28 +10776,28 @@
10590
10776
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10591
10777
  variants: function
10592
10778
  dispatch:
10593
- CPU: foreach_tensor_addcmul_scalar_slow
10779
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalar_slow
10594
10780
  CUDA: foreach_tensor_addcmul_scalar_cuda
10595
10781
 
10596
10782
  - func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
10597
10783
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10598
10784
  variants: function
10599
10785
  dispatch:
10600
- CPU: foreach_tensor_addcmul_scalarlist_slow
10786
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalarlist_slow
10601
10787
  CUDA: foreach_tensor_addcmul_scalarlist_cuda
10602
10788
 
10603
10789
  - func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
10604
10790
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10605
10791
  variants: function
10606
10792
  dispatch:
10607
- CPU: foreach_tensor_addcmul_tensor_slow
10793
+ CompositeExplicitAutograd: foreach_tensor_addcmul_tensor_slow
10608
10794
  CUDA: foreach_tensor_addcmul_tensor_cuda
10609
10795
 
10610
10796
  - func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
10611
10797
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10612
10798
  variants: function
10613
10799
  dispatch:
10614
- CPU: foreach_tensor_addcmul_scalar_slow_
10800
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalar_slow_
10615
10801
  CUDA: foreach_tensor_addcmul_scalar_cuda_
10616
10802
  autogen: _foreach_addcmul.Scalar_out
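The fused addcdiv/addcmul family follows the same pattern: the slow reference kernels move to CompositeExplicitAutograd while CUDA keeps its fused implementation. A short sketch of the in-place Scalar overload, assuming torch._foreach_addcmul_ is available in the Python frontend of your build:

    import torch

    params = [torch.zeros(4), torch.zeros(4)]
    t1 = [torch.randn(4), torch.randn(4)]
    t2 = [torch.randn(4), torch.randn(4)]
    # per tensor: params[i] += 0.5 * t1[i] * t2[i]
    torch._foreach_addcmul_(params, t1, t2, value=0.5)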
10617
10803
 
@@ -10619,7 +10805,7 @@
10619
10805
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10620
10806
  variants: function
10621
10807
  dispatch:
10622
- CPU: foreach_tensor_addcmul_scalarlist_slow_
10808
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalarlist_slow_
10623
10809
  CUDA: foreach_tensor_addcmul_scalarlist_cuda_
10624
10810
  autogen: _foreach_addcmul.ScalarList_out
10625
10811
 
@@ -10627,7 +10813,7 @@
10627
10813
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10628
10814
  variants: function
10629
10815
  dispatch:
10630
- CPU: foreach_tensor_addcmul_tensor_slow_
10816
+ CompositeExplicitAutograd: foreach_tensor_addcmul_tensor_slow_
10631
10817
  CUDA: foreach_tensor_addcmul_tensor_cuda_
10632
10818
  autogen: _foreach_addcmul.Tensor_out
10633
10819
 
@@ -10635,14 +10821,14 @@
10635
10821
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10636
10822
  variants: function
10637
10823
  dispatch:
10638
- CPU: foreach_tensor_abs_slow
10824
+ CompositeExplicitAutograd: foreach_tensor_abs_slow
10639
10825
  CUDA: foreach_tensor_abs_cuda
10640
10826
 
10641
10827
  - func: _foreach_abs_(Tensor(a!)[] self) -> ()
10642
10828
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10643
10829
  variants: function
10644
10830
  dispatch:
10645
- CPU: foreach_tensor_abs_slow_
10831
+ CompositeExplicitAutograd: foreach_tensor_abs_slow_
10646
10832
  CUDA: foreach_tensor_abs_cuda_
10647
10833
  autogen: _foreach_abs.out
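The unary foreach ops (_foreach_abs through _foreach_trunc below) get the same re-registration. Illustration with the abs pair, again assuming the underscore-prefixed names are exposed in the Python frontend:

    import torch

    grads = [torch.randn(3), torch.randn(2, 2)]
    absed = torch._foreach_abs(grads)   # new list, one |g| per input tensor
    torch._foreach_abs_(grads)          # in-place variant declared just above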
10648
10834
 
@@ -10650,14 +10836,14 @@
10650
10836
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10651
10837
  variants: function
10652
10838
  dispatch:
10653
- CPU: foreach_tensor_acos_slow
10839
+ CompositeExplicitAutograd: foreach_tensor_acos_slow
10654
10840
  CUDA: foreach_tensor_acos_cuda
10655
10841
 
10656
10842
  - func: _foreach_acos_(Tensor(a!)[] self) -> ()
10657
10843
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10658
10844
  variants: function
10659
10845
  dispatch:
10660
- CPU: foreach_tensor_acos_slow_
10846
+ CompositeExplicitAutograd: foreach_tensor_acos_slow_
10661
10847
  CUDA: foreach_tensor_acos_cuda_
10662
10848
  autogen: _foreach_acos.out
10663
10849
 
@@ -10665,14 +10851,14 @@
10665
10851
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10666
10852
  variants: function
10667
10853
  dispatch:
10668
- CPU: foreach_tensor_asin_slow
10854
+ CompositeExplicitAutograd: foreach_tensor_asin_slow
10669
10855
  CUDA: foreach_tensor_asin_cuda
10670
10856
 
10671
10857
  - func: _foreach_asin_(Tensor(a!)[] self) -> ()
10672
10858
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10673
10859
  variants: function
10674
10860
  dispatch:
10675
- CPU: foreach_tensor_asin_slow_
10861
+ CompositeExplicitAutograd: foreach_tensor_asin_slow_
10676
10862
  CUDA: foreach_tensor_asin_cuda_
10677
10863
  autogen: _foreach_asin.out
10678
10864
 
@@ -10680,14 +10866,14 @@
10680
10866
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10681
10867
  variants: function
10682
10868
  dispatch:
10683
- CPU: foreach_tensor_atan_slow
10869
+ CompositeExplicitAutograd: foreach_tensor_atan_slow
10684
10870
  CUDA: foreach_tensor_atan_cuda
10685
10871
 
10686
10872
  - func: _foreach_atan_(Tensor(a!)[] self) -> ()
10687
10873
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10688
10874
  variants: function
10689
10875
  dispatch:
10690
- CPU: foreach_tensor_atan_slow_
10876
+ CompositeExplicitAutograd: foreach_tensor_atan_slow_
10691
10877
  CUDA: foreach_tensor_atan_cuda_
10692
10878
  autogen: _foreach_atan.out
10693
10879
 
@@ -10695,14 +10881,14 @@
10695
10881
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10696
10882
  variants: function
10697
10883
  dispatch:
10698
- CPU: foreach_tensor_ceil_slow
10884
+ CompositeExplicitAutograd: foreach_tensor_ceil_slow
10699
10885
  CUDA: foreach_tensor_ceil_cuda
10700
10886
 
10701
10887
  - func: _foreach_ceil_(Tensor(a!)[] self) -> ()
10702
10888
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10703
10889
  variants: function
10704
10890
  dispatch:
10705
- CPU: foreach_tensor_ceil_slow_
10891
+ CompositeExplicitAutograd: foreach_tensor_ceil_slow_
10706
10892
  CUDA: foreach_tensor_ceil_cuda_
10707
10893
  autogen: _foreach_ceil.out
10708
10894
 
@@ -10710,14 +10896,14 @@
10710
10896
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10711
10897
  variants: function
10712
10898
  dispatch:
10713
- CPU: foreach_tensor_cos_slow
10899
+ CompositeExplicitAutograd: foreach_tensor_cos_slow
10714
10900
  CUDA: foreach_tensor_cos_cuda
10715
10901
 
10716
10902
  - func: _foreach_cos_(Tensor(a!)[] self) -> ()
10717
10903
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10718
10904
  variants: function
10719
10905
  dispatch:
10720
- CPU: foreach_tensor_cos_slow_
10906
+ CompositeExplicitAutograd: foreach_tensor_cos_slow_
10721
10907
  CUDA: foreach_tensor_cos_cuda_
10722
10908
  autogen: _foreach_cos.out
10723
10909
 
@@ -10725,14 +10911,14 @@
10725
10911
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10726
10912
  variants: function
10727
10913
  dispatch:
10728
- CPU: foreach_tensor_cosh_slow
10914
+ CompositeExplicitAutograd: foreach_tensor_cosh_slow
10729
10915
  CUDA: foreach_tensor_cosh_cuda
10730
10916
 
10731
10917
  - func: _foreach_cosh_(Tensor(a!)[] self) -> ()
10732
10918
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10733
10919
  variants: function
10734
10920
  dispatch:
10735
- CPU: foreach_tensor_cosh_slow_
10921
+ CompositeExplicitAutograd: foreach_tensor_cosh_slow_
10736
10922
  CUDA: foreach_tensor_cosh_cuda_
10737
10923
  autogen: _foreach_cosh.out
10738
10924
 
@@ -10740,14 +10926,14 @@
10740
10926
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10741
10927
  variants: function
10742
10928
  dispatch:
10743
- CPU: foreach_tensor_erf_slow
10929
+ CompositeExplicitAutograd: foreach_tensor_erf_slow
10744
10930
  CUDA: foreach_tensor_erf_cuda
10745
10931
 
10746
10932
  - func: _foreach_erf_(Tensor(a!)[] self) -> ()
10747
10933
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10748
10934
  variants: function
10749
10935
  dispatch:
10750
- CPU: foreach_tensor_erf_slow_
10936
+ CompositeExplicitAutograd: foreach_tensor_erf_slow_
10751
10937
  CUDA: foreach_tensor_erf_cuda_
10752
10938
  autogen: _foreach_erf.out
10753
10939
 
@@ -10755,14 +10941,14 @@
10755
10941
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10756
10942
  variants: function
10757
10943
  dispatch:
10758
- CPU: foreach_tensor_erfc_slow
10944
+ CompositeExplicitAutograd: foreach_tensor_erfc_slow
10759
10945
  CUDA: foreach_tensor_erfc_cuda
10760
10946
 
10761
10947
  - func: _foreach_erfc_(Tensor(a!)[] self) -> ()
10762
10948
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10763
10949
  variants: function
10764
10950
  dispatch:
10765
- CPU: foreach_tensor_erfc_slow_
10951
+ CompositeExplicitAutograd: foreach_tensor_erfc_slow_
10766
10952
  CUDA: foreach_tensor_erfc_cuda_
10767
10953
  autogen: _foreach_erfc.out
10768
10954
 
@@ -10770,14 +10956,14 @@
10770
10956
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10771
10957
  variants: function
10772
10958
  dispatch:
10773
- CPU: foreach_tensor_exp_slow
10959
+ CompositeExplicitAutograd: foreach_tensor_exp_slow
10774
10960
  CUDA: foreach_tensor_exp_cuda
10775
10961
 
10776
10962
  - func: _foreach_exp_(Tensor(a!)[] self) -> ()
10777
10963
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10778
10964
  variants: function
10779
10965
  dispatch:
10780
- CPU: foreach_tensor_exp_slow_
10966
+ CompositeExplicitAutograd: foreach_tensor_exp_slow_
10781
10967
  CUDA: foreach_tensor_exp_cuda_
10782
10968
  autogen: _foreach_exp.out
10783
10969
 
@@ -10785,14 +10971,14 @@
10785
10971
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10786
10972
  variants: function
10787
10973
  dispatch:
10788
- CPU: foreach_tensor_expm1_slow
10974
+ CompositeExplicitAutograd: foreach_tensor_expm1_slow
10789
10975
  CUDA: foreach_tensor_expm1_cuda
10790
10976
 
10791
10977
  - func: _foreach_expm1_(Tensor(a!)[] self) -> ()
10792
10978
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10793
10979
  variants: function
10794
10980
  dispatch:
10795
- CPU: foreach_tensor_expm1_slow_
10981
+ CompositeExplicitAutograd: foreach_tensor_expm1_slow_
10796
10982
  CUDA: foreach_tensor_expm1_cuda_
10797
10983
  autogen: _foreach_expm1.out
10798
10984
 
@@ -10800,14 +10986,14 @@
10800
10986
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10801
10987
  variants: function
10802
10988
  dispatch:
10803
- CPU: foreach_tensor_floor_slow
10989
+ CompositeExplicitAutograd: foreach_tensor_floor_slow
10804
10990
  CUDA: foreach_tensor_floor_cuda
10805
10991
 
10806
10992
  - func: _foreach_floor_(Tensor(a!)[] self) -> ()
10807
10993
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10808
10994
  variants: function
10809
10995
  dispatch:
10810
- CPU: foreach_tensor_floor_slow_
10996
+ CompositeExplicitAutograd: foreach_tensor_floor_slow_
10811
10997
  CUDA: foreach_tensor_floor_cuda_
10812
10998
  autogen: _foreach_floor.out
10813
10999
 
@@ -10815,14 +11001,14 @@
10815
11001
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10816
11002
  variants: function
10817
11003
  dispatch:
10818
- CPU: foreach_tensor_frac_slow
11004
+ CompositeExplicitAutograd: foreach_tensor_frac_slow
10819
11005
  CUDA: foreach_tensor_frac_cuda
10820
11006
 
10821
11007
  - func: _foreach_frac_(Tensor(a!)[] self) -> ()
10822
11008
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10823
11009
  variants: function
10824
11010
  dispatch:
10825
- CPU: foreach_tensor_frac_slow_
11011
+ CompositeExplicitAutograd: foreach_tensor_frac_slow_
10826
11012
  CUDA: foreach_tensor_frac_cuda_
10827
11013
  autogen: _foreach_frac.out
10828
11014
 
@@ -10830,7 +11016,7 @@
10830
11016
  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10831
11017
  variants: function
10832
11018
  dispatch:
10833
- CPU: foreach_tensor_ternary_lerp_slow
11019
+ CompositeExplicitAutograd: foreach_tensor_ternary_lerp_slow
10834
11020
  CUDA: foreach_tensor_lerp_ternary_cuda
10835
11021
  autogen: _foreach_lerp.List_out
10836
11022
 
@@ -10838,7 +11024,7 @@
10838
11024
  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10839
11025
  variants: function
10840
11026
  dispatch:
10841
- CPU: foreach_tensor_ternary_lerp_slow_
11027
+ CompositeExplicitAutograd: foreach_tensor_ternary_lerp_slow_
10842
11028
  CUDA: foreach_tensor_lerp_ternary_cuda_
10843
11029
  autogen: _foreach_lerp.List_out
10844
11030
 
@@ -10846,7 +11032,7 @@
10846
11032
  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10847
11033
  variants: function
10848
11034
  dispatch:
10849
- CPU: foreach_tensor_lerp_list_kernel_slow
11035
+ CompositeExplicitAutograd: foreach_tensor_lerp_list_kernel_slow
10850
11036
  CUDA: foreach_tensor_lerp_list_cuda
10851
11037
  autogen: _foreach_lerp.Scalar_out
10852
11038
 
@@ -10854,7 +11040,7 @@
10854
11040
  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10855
11041
  variants: function
10856
11042
  dispatch:
10857
- CPU: foreach_tensor_lerp_list_kernel_slow_
11043
+ CompositeExplicitAutograd: foreach_tensor_lerp_list_kernel_slow_
10858
11044
  CUDA: foreach_tensor_lerp_list_cuda_
10859
11045
  autogen: _foreach_lerp.Scalar_out
10860
11046
 
@@ -10862,14 +11048,14 @@
10862
11048
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10863
11049
  variants: function
10864
11050
  dispatch:
10865
- CPU: foreach_tensor_lgamma_slow
11051
+ CompositeExplicitAutograd: foreach_tensor_lgamma_slow
10866
11052
  CUDA: foreach_tensor_lgamma_cuda
10867
11053
 
10868
11054
  - func: _foreach_lgamma_(Tensor(a!)[] self) -> ()
10869
11055
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10870
11056
  variants: function
10871
11057
  dispatch:
10872
- CPU: foreach_tensor_lgamma_slow_
11058
+ CompositeExplicitAutograd: foreach_tensor_lgamma_slow_
10873
11059
  CUDA: foreach_tensor_lgamma_cuda_
10874
11060
  autogen: _foreach_lgamma.out
10875
11061
 
@@ -10877,14 +11063,14 @@
10877
11063
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10878
11064
  variants: function
10879
11065
  dispatch:
10880
- CPU: foreach_tensor_log_slow
11066
+ CompositeExplicitAutograd: foreach_tensor_log_slow
10881
11067
  CUDA: foreach_tensor_log_cuda
10882
11068
 
10883
11069
  - func: _foreach_log_(Tensor(a!)[] self) -> ()
10884
11070
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10885
11071
  variants: function
10886
11072
  dispatch:
10887
- CPU: foreach_tensor_log_slow_
11073
+ CompositeExplicitAutograd: foreach_tensor_log_slow_
10888
11074
  CUDA: foreach_tensor_log_cuda_
10889
11075
  autogen: _foreach_log.out
10890
11076
 
@@ -10892,14 +11078,14 @@
10892
11078
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10893
11079
  variants: function
10894
11080
  dispatch:
10895
- CPU: foreach_tensor_log10_slow
11081
+ CompositeExplicitAutograd: foreach_tensor_log10_slow
10896
11082
  CUDA: foreach_tensor_log10_cuda
10897
11083
 
10898
11084
  - func: _foreach_log10_(Tensor(a!)[] self) -> ()
10899
11085
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10900
11086
  variants: function
10901
11087
  dispatch:
10902
- CPU: foreach_tensor_log10_slow_
11088
+ CompositeExplicitAutograd: foreach_tensor_log10_slow_
10903
11089
  CUDA: foreach_tensor_log10_cuda_
10904
11090
  autogen: _foreach_log10.out
10905
11091
 
@@ -10907,14 +11093,14 @@
10907
11093
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10908
11094
  variants: function
10909
11095
  dispatch:
10910
- CPU: foreach_tensor_log1p_slow
11096
+ CompositeExplicitAutograd: foreach_tensor_log1p_slow
10911
11097
  CUDA: foreach_tensor_log1p_cuda
10912
11098
 
10913
11099
  - func: _foreach_log1p_(Tensor(a!)[] self) -> ()
10914
11100
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10915
11101
  variants: function
10916
11102
  dispatch:
10917
- CPU: foreach_tensor_log1p_slow_
11103
+ CompositeExplicitAutograd: foreach_tensor_log1p_slow_
10918
11104
  CUDA: foreach_tensor_log1p_cuda_
10919
11105
  autogen: _foreach_log1p.out
10920
11106
 
@@ -10922,37 +11108,45 @@
10922
11108
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10923
11109
  variants: function
10924
11110
  dispatch:
10925
- CPU: foreach_tensor_log2_slow
11111
+ CompositeExplicitAutograd: foreach_tensor_log2_slow
10926
11112
  CUDA: foreach_tensor_log2_cuda
10927
11113
 
10928
11114
  - func: _foreach_log2_(Tensor(a!)[] self) -> ()
10929
11115
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10930
11116
  variants: function
10931
11117
  dispatch:
10932
- CPU: foreach_tensor_log2_slow_
11118
+ CompositeExplicitAutograd: foreach_tensor_log2_slow_
10933
11119
  CUDA: foreach_tensor_log2_cuda_
10934
11120
  autogen: _foreach_log2.out
10935
11121
 
11122
+ - func: _foreach_max(Tensor[] self) -> Tensor[]
11123
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11124
+ variants: function
11125
+ dispatch:
11126
+ CompositeExplicitAutograd: foreach_tensor_max_slow
11127
+ CUDA: foreach_tensor_max_cuda
11128
+ autogen: _foreach_max.out
11129
+
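_foreach_max is a newly added op (no in-place variant in this hunk) that reduces each tensor in the list to its maximum. A sketch, assuming the build exposes it as torch._foreach_max:

    import torch

    xs = [torch.tensor([1.0, 5.0, 3.0]), torch.tensor([-2.0, 0.5])]
    maxes = torch._foreach_max(xs)   # roughly [t.max() for t in xs], as a single foreach call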
10936
11130
  - func: _foreach_neg(Tensor[] self) -> Tensor[]
10937
11131
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10938
11132
  variants: function
10939
11133
  dispatch:
10940
- CPU: foreach_tensor_neg_slow
11134
+ CompositeExplicitAutograd: foreach_tensor_neg_slow
10941
11135
  CUDA: foreach_tensor_neg_cuda
10942
11136
 
10943
11137
  - func: _foreach_neg_(Tensor(a!)[] self) -> ()
10944
11138
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10945
11139
  variants: function
10946
11140
  dispatch:
10947
- CPU: foreach_tensor_neg_slow_
11141
+ CompositeExplicitAutograd: foreach_tensor_neg_slow_
10948
11142
  CUDA: foreach_tensor_neg_cuda_
10949
11143
  autogen: _foreach_neg.out
10950
11144
 
10951
- - func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[]
11145
+ - func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2, ScalarType? dtype=None) -> Tensor[]
10952
11146
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10953
11147
  variants: function
10954
11148
  dispatch:
10955
- CPU: foreach_tensor_norm_slow
11149
+ CompositeExplicitAutograd: foreach_tensor_norm_slow
10956
11150
  CUDA: foreach_tensor_norm_cuda
10957
11151
  autogen: _foreach_norm.Scalar_out
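_foreach_norm.Scalar gains an optional dtype argument, presumably to allow accumulating per-tensor norms in a wider dtype. Sketch; the extra keyword is hedged as build-dependent and shown only as a comment:

    import torch

    grads = [torch.randn(10), torch.randn(3, 3)]
    norms = torch._foreach_norm(grads, 2)   # one 0-dim L2 norm per tensor
    # a build exposing the new argument would additionally accept, e.g.:
    # norms = torch._foreach_norm(grads, 2, dtype=torch.float64)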
10958
11152
 
@@ -10960,35 +11154,35 @@
10960
11154
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10961
11155
  variants: function
10962
11156
  dispatch:
10963
- CPU: foreach_tensor_pow_list_kernel_slow
11157
+ CompositeExplicitAutograd: foreach_tensor_pow_list_kernel_slow
10964
11158
  CUDA: foreach_tensor_pow_list_kernel_cuda
10965
11159
 
10966
11160
  - func: _foreach_pow.Scalar(Tensor[] self, Scalar exponent) -> Tensor[]
10967
11161
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10968
11162
  variants: function
10969
11163
  dispatch:
10970
- CPU: foreach_tensor_pow_scalar_kernel_slow
11164
+ CompositeExplicitAutograd: foreach_tensor_pow_scalar_kernel_slow
10971
11165
  CUDA: foreach_tensor_pow_scalar_kernel_cuda
10972
11166
 
10973
11167
  - func: _foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[]
10974
11168
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10975
11169
  variants: function
10976
11170
  dispatch:
10977
- CPU: foreach_tensor_pow_scalarlist_kernel_slow
11171
+ CompositeExplicitAutograd: foreach_tensor_pow_scalarlist_kernel_slow
10978
11172
  CUDA: foreach_tensor_pow_scalarlist_kernel_cuda
10979
11173
 
10980
11174
  - func: _foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[]
10981
11175
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10982
11176
  variants: function
10983
11177
  dispatch:
10984
- CPU: foreach_scalar_pow_list_kernel_slow
11178
+ CompositeExplicitAutograd: foreach_scalar_pow_list_kernel_slow
10985
11179
  CUDA: foreach_scalar_pow_list_kernel_cuda
10986
11180
 
10987
11181
  - func: _foreach_pow_.List(Tensor(a!)[] self, Tensor[] exponent) -> ()
10988
11182
  device_check: NoCheck
10989
11183
  variants: function
10990
11184
  dispatch:
10991
- CPU: foreach_tensor_pow_list_kernel_slow_
11185
+ CompositeExplicitAutograd: foreach_tensor_pow_list_kernel_slow_
10992
11186
  CUDA: foreach_tensor_pow_list_kernel_cuda_
10993
11187
  autogen: _foreach_pow.List_out
10994
11188
 
@@ -10996,7 +11190,7 @@
10996
11190
  device_check: NoCheck
10997
11191
  variants: function
10998
11192
  dispatch:
10999
- CPU: foreach_tensor_pow_scalar_kernel_slow_
11193
+ CompositeExplicitAutograd: foreach_tensor_pow_scalar_kernel_slow_
11000
11194
  CUDA: foreach_tensor_pow_scalar_kernel_cuda_
11001
11195
  autogen: _foreach_pow.Scalar_out
11002
11196
 
@@ -11004,7 +11198,7 @@
11004
11198
  device_check: NoCheck
11005
11199
  variants: function
11006
11200
  dispatch:
11007
- CPU: foreach_tensor_pow_scalarlist_kernel_slow_
11201
+ CompositeExplicitAutograd: foreach_tensor_pow_scalarlist_kernel_slow_
11008
11202
  CUDA: foreach_tensor_pow_scalarlist_kernel_cuda_
11009
11203
  autogen: _foreach_pow.ScalarList_out
11010
11204
 
@@ -11012,14 +11206,14 @@
11012
11206
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11013
11207
  variants: function
11014
11208
  dispatch:
11015
- CPU: foreach_tensor_reciprocal_slow
11209
+ CompositeExplicitAutograd: foreach_tensor_reciprocal_slow
11016
11210
  CUDA: foreach_tensor_reciprocal_cuda
11017
11211
 
11018
11212
  - func: _foreach_reciprocal_(Tensor(a!)[] self) -> ()
11019
11213
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11020
11214
  variants: function
11021
11215
  dispatch:
11022
- CPU: foreach_tensor_reciprocal_slow_
11216
+ CompositeExplicitAutograd: foreach_tensor_reciprocal_slow_
11023
11217
  CUDA: foreach_tensor_reciprocal_cuda_
11024
11218
  autogen: _foreach_reciprocal.out
11025
11219
 
@@ -11027,14 +11221,14 @@
11027
11221
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11028
11222
  variants: function
11029
11223
  dispatch:
11030
- CPU: foreach_tensor_round_slow
11224
+ CompositeExplicitAutograd: foreach_tensor_round_slow
11031
11225
  CUDA: foreach_tensor_round_cuda
11032
11226
 
11033
11227
  - func: _foreach_round_(Tensor(a!)[] self) -> ()
11034
11228
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11035
11229
  variants: function
11036
11230
  dispatch:
11037
- CPU: foreach_tensor_round_slow_
11231
+ CompositeExplicitAutograd: foreach_tensor_round_slow_
11038
11232
  CUDA: foreach_tensor_round_cuda_
11039
11233
  autogen: _foreach_round.out
11040
11234
 
@@ -11042,14 +11236,14 @@
11042
11236
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11043
11237
  variants: function
11044
11238
  dispatch:
11045
- CPU: foreach_tensor_sigmoid_slow
11239
+ CompositeExplicitAutograd: foreach_tensor_sigmoid_slow
11046
11240
  CUDA: foreach_tensor_sigmoid_cuda
11047
11241
 
11048
11242
  - func: _foreach_sigmoid_(Tensor(a!)[] self) -> ()
11049
11243
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11050
11244
  variants: function
11051
11245
  dispatch:
11052
- CPU: foreach_tensor_sigmoid_slow_
11246
+ CompositeExplicitAutograd: foreach_tensor_sigmoid_slow_
11053
11247
  CUDA: foreach_tensor_sigmoid_cuda_
11054
11248
  autogen: _foreach_sigmoid.out
11055
11249
 
@@ -11057,14 +11251,14 @@
11057
11251
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11058
11252
  variants: function
11059
11253
  dispatch:
11060
- CPU: foreach_tensor_sign_slow
11254
+ CompositeExplicitAutograd: foreach_tensor_sign_slow
11061
11255
  CUDA: foreach_tensor_sign_cuda
11062
11256
 
11063
11257
  - func: _foreach_sign_(Tensor(a!)[] self) -> ()
11064
11258
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11065
11259
  variants: function
11066
11260
  dispatch:
11067
- CPU: foreach_tensor_sign_slow_
11261
+ CompositeExplicitAutograd: foreach_tensor_sign_slow_
11068
11262
  CUDA: foreach_tensor_sign_cuda_
11069
11263
  autogen: _foreach_sign.out
11070
11264
 
@@ -11072,14 +11266,14 @@
11072
11266
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11073
11267
  variants: function
11074
11268
  dispatch:
11075
- CPU: foreach_tensor_sin_slow
11269
+ CompositeExplicitAutograd: foreach_tensor_sin_slow
11076
11270
  CUDA: foreach_tensor_sin_cuda
11077
11271
 
11078
11272
  - func: _foreach_sin_(Tensor(a!)[] self) -> ()
11079
11273
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11080
11274
  variants: function
11081
11275
  dispatch:
11082
- CPU: foreach_tensor_sin_slow_
11276
+ CompositeExplicitAutograd: foreach_tensor_sin_slow_
11083
11277
  CUDA: foreach_tensor_sin_cuda_
11084
11278
  autogen: _foreach_sin.out
11085
11279
 
@@ -11087,14 +11281,14 @@
11087
11281
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11088
11282
  variants: function
11089
11283
  dispatch:
11090
- CPU: foreach_tensor_sinh_slow
11284
+ CompositeExplicitAutograd: foreach_tensor_sinh_slow
11091
11285
  CUDA: foreach_tensor_sinh_cuda
11092
11286
 
11093
11287
  - func: _foreach_sinh_(Tensor(a!)[] self) -> ()
11094
11288
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11095
11289
  variants: function
11096
11290
  dispatch:
11097
- CPU: foreach_tensor_sinh_slow_
11291
+ CompositeExplicitAutograd: foreach_tensor_sinh_slow_
11098
11292
  CUDA: foreach_tensor_sinh_cuda_
11099
11293
  autogen: _foreach_sinh.out
11100
11294
 
@@ -11102,14 +11296,14 @@
11102
11296
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11103
11297
  variants: function
11104
11298
  dispatch:
11105
- CPU: foreach_tensor_sqrt_slow
11299
+ CompositeExplicitAutograd: foreach_tensor_sqrt_slow
11106
11300
  CUDA: foreach_tensor_sqrt_cuda
11107
11301
 
11108
11302
  - func: _foreach_sqrt_(Tensor(a!)[] self) -> ()
11109
11303
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11110
11304
  variants: function
11111
11305
  dispatch:
11112
- CPU: foreach_tensor_sqrt_slow_
11306
+ CompositeExplicitAutograd: foreach_tensor_sqrt_slow_
11113
11307
  CUDA: foreach_tensor_sqrt_cuda_
11114
11308
  autogen: _foreach_sqrt.out
11115
11309
 
@@ -11117,14 +11311,14 @@
11117
11311
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11118
11312
  variants: function
11119
11313
  dispatch:
11120
- CPU: foreach_tensor_tan_slow
11314
+ CompositeExplicitAutograd: foreach_tensor_tan_slow
11121
11315
  CUDA: foreach_tensor_tan_cuda
11122
11316
 
11123
11317
  - func: _foreach_tan_(Tensor(a!)[] self) -> ()
11124
11318
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11125
11319
  variants: function
11126
11320
  dispatch:
11127
- CPU: foreach_tensor_tan_slow_
11321
+ CompositeExplicitAutograd: foreach_tensor_tan_slow_
11128
11322
  CUDA: foreach_tensor_tan_cuda_
11129
11323
  autogen: _foreach_tan.out
11130
11324
 
@@ -11132,14 +11326,14 @@
11132
11326
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11133
11327
  variants: function
11134
11328
  dispatch:
11135
- CPU: foreach_tensor_tanh_slow
11329
+ CompositeExplicitAutograd: foreach_tensor_tanh_slow
11136
11330
  CUDA: foreach_tensor_tanh_cuda
11137
11331
 
11138
11332
  - func: _foreach_tanh_(Tensor(a!)[] self) -> ()
11139
11333
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11140
11334
  variants: function
11141
11335
  dispatch:
11142
- CPU: foreach_tensor_tanh_slow_
11336
+ CompositeExplicitAutograd: foreach_tensor_tanh_slow_
11143
11337
  CUDA: foreach_tensor_tanh_cuda_
11144
11338
  autogen: _foreach_tanh.out
11145
11339
 
@@ -11147,14 +11341,14 @@
11147
11341
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11148
11342
  variants: function
11149
11343
  dispatch:
11150
- CPU: foreach_tensor_trunc_slow
11344
+ CompositeExplicitAutograd: foreach_tensor_trunc_slow
11151
11345
  CUDA: foreach_tensor_trunc_cuda
11152
11346
 
11153
11347
  - func: _foreach_trunc_(Tensor(a!)[] self) -> ()
11154
11348
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11155
11349
  variants: function
11156
11350
  dispatch:
11157
- CPU: foreach_tensor_trunc_slow_
11351
+ CompositeExplicitAutograd: foreach_tensor_trunc_slow_
11158
11352
  CUDA: foreach_tensor_trunc_cuda_
11159
11353
  autogen: _foreach_trunc.out
11160
11354
 
@@ -11162,7 +11356,7 @@
11162
11356
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11163
11357
  variants: function
11164
11358
  dispatch:
11165
- CPU: foreach_tensor_zero_slow_
11359
+ CompositeExplicitAutograd: foreach_tensor_zero_slow_
11166
11360
  CUDA: foreach_tensor_zero_cuda_
11167
11361
  autogen: _foreach_zero, _foreach_zero.out
11168
11362
 
@@ -11170,9 +11364,15 @@
11170
11364
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11171
11365
  variants: function
11172
11366
  dispatch:
11173
- CPU: foreach_tensor_copy_list_kernel_slow_
11367
+ CompositeExplicitAutograd: foreach_tensor_copy_list_kernel_slow_
11174
11368
  CUDA: foreach_tensor_copy_list_kernel_cuda_
11175
- autogen: _foreach_copy, _foreach_copy.out
11369
+ autogen: _foreach_copy.out
11370
+
11371
+ - func: _foreach_copy(Tensor[] self, Tensor[] src, bool non_blocking=False) -> Tensor[] self_out
11372
+ device_check: NoCheck
11373
+ variants: function
11374
+ dispatch:
11375
+ CompositeExplicitAutograd: _foreach_copy
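The functional _foreach_copy is now declared explicitly with a named self_out return instead of being autogenerated alongside the .out variant. The in-place op, which is what user code normally reaches, is unchanged; a sketch assuming torch._foreach_copy_ is exposed:

    import torch

    dst = [torch.zeros(3), torch.zeros(2)]
    src = [torch.ones(3), torch.full((2,), 2.0)]
    torch._foreach_copy_(dst, src)   # per tensor: dst[i].copy_(src[i])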
11176
11376
 
11177
11377
  - func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
11178
11378
  dispatch:
@@ -12341,6 +12541,7 @@
12341
12541
  dispatch:
12342
12542
  CPU: upsample_linear1d_out_cpu
12343
12543
  CUDA: upsample_linear1d_out_cuda
12544
+ MPS: upsample_linear1d_out_mps
12344
12545
 
12345
12546
  - func: upsample_linear1d(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None) -> Tensor
12346
12547
  python_module: nn
@@ -12352,6 +12553,7 @@
12352
12553
  dispatch:
12353
12554
  CPU: upsample_linear1d_backward_out_cpu
12354
12555
  CUDA: upsample_linear1d_backward_out_cuda
12556
+ MPS: upsample_linear1d_backward_out_mps
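upsample_linear1d gains MPS kernels for both forward and backward. From the user side this is just F.interpolate with mode="linear"; a sketch, with the "mps" device hedged on hardware and build availability:

    import torch
    import torch.nn.functional as F

    x = torch.randn(1, 3, 16)                     # (N, C, L)
    y = F.interpolate(x, size=32, mode="linear", align_corners=False)
    # on an Apple-silicon build, the same call is expected to also work for x.to("mps")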
12355
12557
 
12356
12558
  - func: upsample_linear1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None) -> Tensor
12357
12559
  python_module: nn
@@ -12824,7 +13026,7 @@
12824
13026
  SparseMeta: isinf_sparse_meta
12825
13027
  SparseCsrCPU, SparseCsrCUDA: isinf_sparse_csr
12826
13028
  autogen: isinf.out
12827
- tags: core
13029
+ tags: [core, pointwise]
12828
13030
 
12829
13031
  - func: record_stream(Tensor(a!) self, Stream s) -> ()
12830
13032
  variants: method
@@ -13750,11 +13952,18 @@
13750
13952
  dispatch:
13751
13953
  CPU, CUDA: linalg_eig_out
13752
13954
 
13955
+ - func: _linalg_eigvals(Tensor self) -> Tensor
13956
+ python_module: linalg
13957
+ dispatch:
13958
+ CPU, CUDA: _linalg_eigvals
13959
+
13753
13960
  - func: linalg_eigvals(Tensor self) -> Tensor
13754
13961
  python_module: linalg
13755
13962
 
13756
13963
  - func: linalg_eigvals.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
13757
13964
  python_module: linalg
13965
+ dispatch:
13966
+ CPU, CUDA: linalg_eigvals_out
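This hunk adds an internal _linalg_eigvals helper and gives the out= overload of linalg_eigvals an explicit CPU/CUDA kernel. Usage is unchanged; a sketch:

    import torch

    A = torch.randn(3, 3)
    w = torch.linalg.eigvals(A)                  # complex eigenvalues of a real matrix
    out = torch.empty(3, dtype=torch.complex64)
    torch.linalg.eigvals(A, out=out)             # out= form, now backed by linalg_eigvals_out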
13758
13967
 
13759
13968
  # This function is exposes the `compute_v` flag, which is then used to implement `linalg.eigh` and
13760
13969
  # `linalg.eigvalsh` as composite functions that call this one
@@ -14058,6 +14267,12 @@
14058
14267
  # It is undocumented and should not be used outside of tests.
14059
14268
  - func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor
14060
14269
 
14270
+ # Note: for testing COW materialization within `at::parallel_for` loop function
14271
+ - func: _test_parallel_materialize(Tensor self, int num_parallel, bool skip_first=False) -> Tensor
14272
+ variants: function
14273
+ dispatch:
14274
+ CompositeExplicitAutograd: _test_parallel_materialize
14275
+
14061
14276
  # Note: this function is only for testing.
14062
14277
  - func: _test_optional_intlist(Tensor values, int[]? addends) -> Tensor
14063
14278
  python_module: nn
@@ -14392,6 +14607,7 @@
14392
14607
  variants: function
14393
14608
  dispatch:
14394
14609
  CompositeExplicitAutograd: split_with_sizes_copy_out
14610
+ CUDA: split_with_sizes_copy_out_cuda
14395
14611
 
14396
14612
  - func: view_copy(Tensor self, SymInt[] size) -> Tensor
14397
14613
  variants: function
@@ -14428,6 +14644,16 @@
14428
14644
  NestedTensorCUDA: NestedTensor_to_padded_tensor_cuda
14429
14645
  autogen: to_padded_tensor.out
14430
14646
 
14647
+ - func: _jagged_to_padded_dense_forward(Tensor values, Tensor[] offsets, SymInt[] max_lengths, float padding_value=0.0) -> Tensor
14648
+ variants: function
14649
+ dispatch:
14650
+ CUDA: _fbgemm_jagged_to_padded_dense_forward
14651
+
14652
+ - func: _padded_dense_to_jagged_forward(Tensor dense, Tensor[] offsets, SymInt? total_L=None) -> Tensor
14653
+ variants: function
14654
+ dispatch:
14655
+ CUDA: _fbgemm_dense_to_jagged_forward_symint
14656
+
14431
14657
  - func: _nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor
14432
14658
  dispatch:
14433
14659
  NestedTensorCPU: NestedTensor_softmax_dropout
@@ -14468,19 +14694,28 @@
14468
14694
 
14469
14695
  - func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14470
14696
  dispatch:
14471
- CPU: _scaled_dot_product_flash_attention_cpu
14472
14697
  CUDA: _scaled_dot_product_flash_attention_cuda
14473
14698
  NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda
14474
14699
  tags: nondeterministic_seeded
14475
14700
 
14701
+ - func: _scaled_dot_product_flash_attention_for_cpu(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor output, Tensor logsumexp)
14702
+ dispatch:
14703
+ CPU: _scaled_dot_product_flash_attention_cpu
14704
+ tags: nondeterministic_seeded
14705
+
14476
14706
  - func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
14477
14707
  device_check: NoCheck
14478
14708
  variants: function
14479
14709
  dispatch:
14480
- CPU: _scaled_dot_product_flash_attention_backward_cpu
14481
14710
  CUDA: _scaled_dot_product_flash_attention_backward_cuda
14482
14711
  NestedTensorCUDA: _scaled_dot_product_flash_attention_backward_nested
14483
14712
 
14713
+ - func: _scaled_dot_product_flash_attention_for_cpu_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, float dropout_p, bool is_causal, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
14714
+ device_check: NoCheck
14715
+ variants: function
14716
+ dispatch:
14717
+ CPU: _scaled_dot_product_flash_attention_cpu_backward
14718
+
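The CPU flash-attention kernel is split out into _scaled_dot_product_flash_attention_for_cpu (and its backward) with a smaller return signature, and the generic _scaled_dot_product_flash_attention entry drops its CPU dispatch accordingly. User code keeps calling scaled_dot_product_attention and the dispatcher picks the backend; a sketch:

    import torch
    import torch.nn.functional as F

    q = torch.randn(2, 4, 16, 8)   # (batch, heads, seq_len, head_dim)
    k = torch.randn(2, 4, 16, 8)
    v = torch.randn(2, 4, 16, 8)
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)   # CPU path now served by the *_for_cpu op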
14484
14719
  - func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)
14485
14720
  dispatch:
14486
14721
  CUDA: _scaled_dot_product_efficient_attention_cuda
@@ -14493,26 +14728,36 @@
14493
14728
  CUDA: _scaled_dot_product_efficient_attention_backward_cuda
14494
14729
  tags: nondeterministic_seeded
14495
14730
 
14496
- - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14731
+ - func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14732
+ dispatch:
14733
+ CUDA: _scaled_dot_product_cudnn_attention_cuda
14734
+ tags: nondeterministic_seeded
14735
+
14736
+ - func: _scaled_dot_product_cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
14737
+ dispatch:
14738
+ CUDA: _scaled_dot_product_cudnn_attention_backward_cuda
14739
+ tags: nondeterministic_seeded
14740
+
14741
+ - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? alibi_slopes=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14497
14742
  variants: function
14498
14743
  dispatch:
14499
14744
  CUDA: _flash_attention_forward
14500
14745
  tags: nondeterministic_seeded
14501
14746
 
14502
- - func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
14747
+ - func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None) -> (Tensor, Tensor, Tensor)
14503
14748
  device_check: NoCheck
14504
14749
  variants: function
14505
14750
  dispatch:
14506
14751
  CUDA: _flash_attention_backward
14507
14752
 
14508
- # Returns ouput, logsumexp if compute_logsumexp
14509
- - func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
14753
+ # Returns output, logsumexp if compute_logsumexp
14754
+ - func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt? max_seqlen_q, SymInt? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? seqlen_k=None, int? window_size=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
14510
14755
  variants: function
14511
14756
  dispatch:
14512
14757
  CUDA: _efficient_attention_forward
14513
14758
  tags: nondeterministic_seeded
14514
14759
 
14515
- - func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor)
14760
+ - func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None, int? window_size=None, bool shared_storage_dqdkdv=False) -> (Tensor, Tensor, Tensor, Tensor)
14516
14761
  device_check: NoCheck
14517
14762
  variants: function
14518
14763
  dispatch:
@@ -15312,11 +15557,11 @@
15312
15557
  CPU: foobar
15313
15558
  autogen: _foobar.out
15314
15559
 
15315
- # Fused Optimizer CUDA kernels.
15316
15560
  - func: _fused_adam_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
15317
15561
  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
15318
15562
  variants: function
15319
15563
  dispatch:
15564
+ CPU: _fused_adam_kernel_cpu_
15320
15565
  CUDA: _fused_adam_kernel_cuda_
15321
15566
  autogen: _fused_adam, _fused_adam.out
15322
15567
 
@@ -15326,6 +15571,7 @@
15326
15571
  device_check: NoCheck
15327
15572
  variants: function
15328
15573
  dispatch:
15574
+ CPU: _fused_adam_kernel_cpu_
15329
15575
  CUDA: _fused_adam_kernel_cuda_
15330
15576
  autogen: _fused_adam.tensor_lr, _fused_adam.tensor_lr_out
15331
15577
 
@@ -15333,6 +15579,7 @@
15333
15579
  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
15334
15580
  variants: function
15335
15581
  dispatch:
15582
+ CPU: _fused_adamw_kernel_cpu_
15336
15583
  CUDA: _fused_adamw_kernel_cuda_
15337
15584
  autogen: _fused_adamw, _fused_adamw.out
15338
15585
 
@@ -15342,9 +15589,34 @@
15342
15589
  device_check: NoCheck
15343
15590
  variants: function
15344
15591
  dispatch:
15592
+ CPU: _fused_adamw_kernel_cpu_
15345
15593
  CUDA: _fused_adamw_kernel_cuda_
15346
15594
  autogen: _fused_adamw.tensor_lr, _fused_adamw.tensor_lr_out
15347
15595
 
15596
+ - func: _fused_sgd_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, float lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
15597
+ # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
15598
+ variants: function
15599
+ dispatch:
15600
+ CPU: _fused_sgd_kernel_cpu_
15601
+ CUDA: _fused_sgd_kernel_cuda_
15602
+ autogen: _fused_sgd, _fused_sgd.out
15603
+
15604
+ - func: _fused_sgd_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, Tensor lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
15605
+ # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
15606
+ # but still skip the device check as the Tensor LR can be on CPU
15607
+ device_check: NoCheck
15608
+ variants: function
15609
+ dispatch:
15610
+ CPU: _fused_sgd_kernel_cpu_
15611
+ CUDA: _fused_sgd_kernel_cuda_
15612
+ autogen: _fused_sgd.tensor_lr, _fused_sgd.tensor_lr_out
15613
+
15614
+ - func: _fused_adagrad_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
15615
+ variants: function
15616
+ dispatch:
15617
+ CPU: _fused_adagrad_kernel_cpu_
15618
+ autogen: _fused_adagrad, _fused_adagrad.out
15619
+
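These hunks register CPU kernels for the fused Adam/AdamW steps and add fused SGD (CPU and CUDA) plus a CPU-only fused Adagrad. In the Python frontend these typically surface through the fused=True flag on the corresponding torch.optim classes; a sketch, hedged on the flag being accepted for CPU parameters in a given build:

    import torch

    model = torch.nn.Linear(4, 2)
    opt = torch.optim.Adam(model.parameters(), lr=1e-3, fused=True)  # fused step; CPU kernel per this diff
    loss = model(torch.randn(8, 4)).sum()
    loss.backward()
    opt.step()
    opt.zero_grad()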
15348
15620
  # This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts.
15349
15621
  - func: _propagate_xla_data(Tensor input, Tensor output) -> ()
15350
15622
  variants: function