torch-rb 0.16.0 → 0.17.0

@@ -549,8 +549,8 @@
  structured_delegate: add.out
  variants: function, method
  dispatch:
- SparseCPU, SparseCUDA: add_sparse
- SparseCsrCPU, SparseCsrCUDA: add_sparse_csr
+ SparseCPU, SparseCUDA, SparseMeta: add_sparse
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr
  MkldnnCPU: mkldnn_add
  ZeroTensor: add_zerotensor
  NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor
@@ -561,8 +561,8 @@
  variants: method
  structured_delegate: add.out
  dispatch:
- SparseCPU, SparseCUDA: add_sparse_
- SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_
+ SparseCPU, SparseCUDA, SparseMeta: add_sparse_
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_
  MkldnnCPU: mkldnn_add_
  NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor
  tags: pointwise
@@ -575,9 +575,9 @@
  Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf)
  ScalarOnly: add (Bool)
  dispatch:
- SparseCPU: add_out_sparse_cpu
+ SparseCPU, SparseMeta: add_out_sparse_cpu
  SparseCUDA: add_out_sparse_cuda
- SparseCsrCPU: add_out_sparse_compressed_cpu
+ SparseCsrCPU, SparseCsrMeta: add_out_sparse_compressed_cpu
  SparseCsrCUDA: add_out_sparse_compressed_cuda
  MkldnnCPU: mkldnn_add_out
  MPS: add_out_mps
@@ -1750,6 +1750,7 @@
  - func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor
  variants: function
  dispatch:
+ Meta: copy_meta
  CompositeExplicitAutogradNonFunctional: copy
  tags: core

@@ -3127,6 +3128,7 @@
  structured: True
  dispatch:
  CPU, CUDA: isin_Tensor_Tensor_out
+ MPS: isin_Tensor_Tensor_out_mps

  - func: isin.Tensor_Tensor(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor
  variants: function
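
Note: the hunk above registers a native MPS kernel for the structured isin.Tensor_Tensor_out overload. A minimal sketch of the public API that ends up dispatching to it (assumes a machine where the MPS backend is available):

    import torch

    if torch.backends.mps.is_available():
        elements = torch.tensor([1, 2, 3, 4], device="mps")
        test = torch.tensor([2, 4], device="mps")
        mask = torch.isin(elements, test)   # now runs natively on MPS
        print(mask.cpu())                   # tensor([False,  True, False,  True])
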
@@ -3268,6 +3270,8 @@
  autogen: native_layer_norm_backward.out
  tags: core

+ - func: rms_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor
+
  - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
  variants: function, method
  dispatch:
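
Note: the new rms_norm schema entry backs root-mean-square layer normalization. A minimal sketch, assuming it is surfaced as torch.nn.functional.rms_norm (as in upstream PyTorch 2.4):

    import torch
    import torch.nn.functional as F

    x = torch.randn(2, 8)
    w = torch.ones(8)
    # normalizes by the RMS over the last dimension, then scales by w
    y = F.rms_norm(x, normalized_shape=[8], weight=w, eps=1e-6)
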
@@ -3340,10 +3344,31 @@
  dispatch:
  CUDA: _cslt_sparse_mm_search

+ - func: _sparse_semi_structured_tile(Tensor input, str algorithm="", bool use_cutlass=True) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
+ dispatch:
+ CUDA: _sparse_semi_structured_tile
+
+ - func: _sparse_semi_structured_apply(Tensor input, Tensor thread_masks) -> (Tensor, Tensor)
+ dispatch:
+ CUDA: _sparse_semi_structured_apply
+
+ - func: _sparse_semi_structured_apply_dense(Tensor input, Tensor thread_masks) -> Tensor
+ dispatch:
+ CUDA: _sparse_semi_structured_apply_dense
+
+ # DEPRECATED: Use torch.__sparse_semi_structured_mm/torch._sparse_semi_structured_addmm instead
  - func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None, ScalarType? out_dtype=None) -> Tensor
  dispatch:
  CUDA: _sparse_semi_structured_linear

+ - func: _sparse_semi_structured_mm(Tensor mat1, Tensor mat1_meta, Tensor mat2, *, ScalarType? out_dtype=None) -> Tensor
+ dispatch:
+ CUDA: _sparse_semi_structured_mm
+
+ - func: _sparse_semi_structured_addmm(Tensor input, Tensor mat1, Tensor mat1_meta, Tensor mat2, *, Scalar alpha=1, Scalar beta=1, ScalarType? out_dtype=None) -> Tensor
+ dispatch:
+ CUDA: _sparse_semi_structured_addmm
+
  - func: _mixed_dtypes_linear(Tensor input, Tensor weight, Tensor scale, *, Tensor? bias=None, str? activation=None) -> Tensor
  dispatch:
  CUDA: _mixed_dtypes_linear
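
Note: this block adds the 2:4 semi-structured sparsity kernels (_sparse_semi_structured_tile/apply/mm/addmm) and deprecates _sparse_semi_structured_linear. They are private, CUDA-only ops; the usual public entry point is torch.sparse.to_sparse_semi_structured. The sketch below is illustrative only and assumes a GPU and shapes the backend accepts:

    import torch
    from torch.sparse import to_sparse_semi_structured

    # weight pruned to a 2:4 pattern (two of every four elements zeroed)
    mask = torch.tensor([1, 1, 0, 0], dtype=torch.bool, device="cuda").tile(128, 32)
    w = torch.randn(128, 128, dtype=torch.float16, device="cuda") * mask
    w_sparse = to_sparse_semi_structured(w)

    x = torch.randn(128, 128, dtype=torch.float16, device="cuda")
    y = w_sparse @ x   # should route to the semi-structured matmul kernels
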
@@ -4084,10 +4109,12 @@

  - func: _int_mm(Tensor self, Tensor mat2) -> Tensor
  dispatch:
+ CPU: _int_mm_cpu
  CUDA: _int_mm_cuda

  - func: _int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
+ CPU: _int_mm_out_cpu
  CUDA: _int_mm_out_cuda

  - func: _convert_weight_to_int4pack(Tensor self, int innerKTiles) -> Tensor
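
Note: _int_mm previously had only a CUDA kernel; this release adds _int_mm_cpu/_int_mm_out_cpu. A small sketch of the private op (shapes chosen to satisfy the kernel's size checks; not exhaustively tested):

    import torch

    a = torch.randint(-128, 127, (32, 64), dtype=torch.int8)
    b = torch.randint(-128, 127, (64, 32), dtype=torch.int8)
    c = torch._int_mm(a, b)   # int8 x int8 -> int32 accumulation, now also on CPU
    print(c.dtype)            # torch.int32
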
@@ -4098,11 +4125,13 @@
  - func: _weight_int4pack_mm(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros) -> Tensor
  dispatch:
  CPU: _weight_int4pack_mm_cpu
+ MPS: _weight_int4pack_mm_mps
  CUDA: _weight_int4pack_mm_cuda

  - func: _weight_int8pack_mm(Tensor self, Tensor mat2, Tensor scales) -> Tensor
  dispatch:
  CPU: _weight_int8pack_mm_cpu
+ MPS: _weight_int8pack_mm_mps

  - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
  python_module: sparse
@@ -5397,7 +5426,7 @@
  autogen: slice_backward.out

  # NB: This op exists to back the implementation of reverse view_funcs for various views (chunk,
- # slice.Tensor, split_with_sizes, et. al.). Currently, these are only used during fake-ification
+ # slice.Tensor, split_with_sizes, et al.). Currently, these are only used during fake-ification
  # of PT2 graph input subclass instances that are views. This means:
  # * This op shouldn't really show up in eager mode (so e.g. XLA shouldn't have to implement it)
  # * This op shouldn't show up in a PT2 graph (so a PT2 backend shouldn't have to implement it)
@@ -5620,10 +5649,12 @@
  - func: _chunk_cat(Tensor[] tensors, int dim, int num_chunks) -> Tensor
  dispatch:
  CompositeExplicitAutograd: _chunk_cat
+ CUDA: _chunk_cat_cuda

  - func: _chunk_cat.out(Tensor[] tensors, int dim, int num_chunks, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
  CompositeExplicitAutograd: _chunk_cat_out
+ CUDA: _chunk_cat_out_cuda

  - func: stack(Tensor[] tensors, int dim=0) -> Tensor
  dispatch:
@@ -5689,8 +5720,8 @@
  variants: function, method
  dispatch:
  CompositeExplicitAutograd: sum
- SparseCPU, SparseCUDA: sum_coo
- SparseCsrCPU, SparseCsrCUDA: sum_csr
+ SparseCPU, SparseCUDA, SparseMeta: sum_coo
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr
  autogen: sum.out

  - func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
@@ -6200,6 +6231,12 @@
  category_override: dummy
  dispatch: {}

+ - func: _nested_compute_contiguous_strides_offsets(Tensor nested_size) -> (Tensor, Tensor)
+ variants: function
+ device_check: NoCheck
+ dispatch:
+ CPU, CUDA: _nested_compute_contiguous_strides_offsets
+
  - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor
  dispatch:
  # calls unsqueeze
@@ -6465,7 +6502,7 @@
  CPU: _efficientzerotensor
  CUDA: _efficientzerotensor_cuda
  MPS: _efficientzerotensor_mps
- Meta: _efficientzerotensor_meta
+ Meta: _efficientzerotensor_meta_symint
  autogen: _efficientzerotensor.out

  - func: zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -6542,6 +6579,32 @@
  SparseCPU, SparseCUDA: norm_sparse
  autogen: native_norm.ScalarOpt_dim_dtype_out

+ - func: _batch_norm_with_update(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor)
+ dispatch:
+ CPU: _batch_norm_with_update_cpu
+ CUDA: _batch_norm_with_update_cuda
+ MPS: _batch_norm_with_update_mps
+ MkldnnCPU: _batch_norm_with_update_mkldnn
+ autogen: _batch_norm_with_update_functional
+
+ - func: _batch_norm_with_update.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd, Tensor(g!) reserve) -> (Tensor(d!), Tensor(e!), Tensor(f!), Tensor(g!))
+ dispatch:
+ CPU: _batch_norm_with_update_cpu_out
+ CUDA: _batch_norm_with_update_cuda_out
+ MPS: _batch_norm_with_update_mps_out
+
+ - func: _batch_norm_no_update(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor)
+ dispatch:
+ CompositeExplicitAutograd: _batch_norm_no_update
+ autogen: _batch_norm_no_update.out
+
+ - func: batch_norm_backward(Tensor grad_out, Tensor input, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, bool update, float eps, bool[3] output_mask, Tensor reserve) -> (Tensor, Tensor, Tensor)
+ dispatch:
+ CPU: _new_batch_norm_backward_cpu
+ CUDA: _new_batch_norm_backward_cuda
+ MPS: _new_batch_norm_backward_mps
+ MkldnnCPU: _new_batch_norm_backward_mkldnn
+
  # TODO: reduce signatures down to one when optional args is available
  - func: _sparse_sum(Tensor self) -> Tensor

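Note: these private ops are the new plumbing upstream PyTorch uses for batch norm (_batch_norm_with_update fuses the running-stat update into the forward). They can be exercised directly via torch.ops.aten; the argument order below simply follows the schema above and assumes the bundled libtorch provides the kernels:

    import torch

    x = torch.randn(4, 3, 8, 8)
    weight, bias = torch.ones(3), torch.zeros(3)
    running_mean, running_var = torch.zeros(3), torch.ones(3)

    # updates running_mean/running_var in place; returns (out, save_mean, save_invstd, reserve)
    out, save_mean, save_invstd, reserve = torch.ops.aten._batch_norm_with_update(
        x, weight, bias, running_mean, running_var, 0.1, 1e-5
    )
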
@@ -7042,6 +7105,10 @@
  # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given
  # the default would never make sense.

+ - func: _sparse_compressed_tensor_with_dims(int nnz, int dense_dim, int[] size, int[] blocksize, ScalarType index_dtype, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+ dispatch:
+ CompositeExplicitAutograd: sparse_compressed_tensor_with_dims
+
  - func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
  dispatch:
  CompositeExplicitAutograd: sparse_compressed_tensor
@@ -7146,9 +7213,9 @@
  - func: sparse_dim(Tensor self) -> int
  variants: method
  dispatch:
- CPU, CUDA: sparse_dim_strided
  SparseCPU, SparseCUDA, SparseMeta: sparse_dim_sparse
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_dim_sparse_csr
+ CompositeExplicitAutograd: sparse_dim_default
  device_check: NoCheck
  device_guard: False

@@ -7163,9 +7230,9 @@
  - func: dense_dim(Tensor self) -> int
  variants: method
  dispatch:
- CPU, CUDA: dense_dim_strided
  SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr
+ CompositeExplicitAutograd: dense_dim_default
  device_check: NoCheck
  device_guard: False

@@ -7296,7 +7363,7 @@
  device_check: NoCheck # Allows copy into different device
  variants: function
  dispatch:
- SparseCPU, SparseCUDA: copy_sparse_
+ SparseCPU, SparseCUDA, SparseMeta: copy_sparse_
  autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out

  # By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors
@@ -7399,7 +7466,7 @@
  MkldnnCPU: mkldnn_reorder_conv2d_weight
  autogen: mkldnn_reorder_conv2d_weight.out

- - func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1) -> Tensor
+ - func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor
  variants: function
  python_module: nn
  dispatch:
@@ -7647,7 +7714,7 @@

  - func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType

- - func: can_cast(ScalarType from, ScalarType to) -> bool
+ - func: can_cast(ScalarType from_, ScalarType to) -> bool
  variants: function

  - func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType
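
Note: can_cast only renames its first argument from `from` to `from_` (the old name collides with a Python keyword); behaviour is unchanged:

    import torch

    torch.can_cast(torch.int64, torch.float32)   # True
    torch.can_cast(torch.float32, torch.int64)   # False
    # keyword form with the renamed argument (untested sketch):
    # torch.can_cast(from_=torch.int64, to=torch.float32)
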
@@ -10222,6 +10289,7 @@
  variants: method, function
  dispatch:
  CompositeExplicitAutograd: alias
+ NestedTensorCPU, NestedTensorCUDA: alias_nested
  tags: core

  - func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> ()
@@ -10255,14 +10323,14 @@
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_add_scalar_kernel_slow
+ CompositeExplicitAutograd: foreach_tensor_add_scalar_kernel_slow
  CUDA: foreach_tensor_add_scalar_kernel_cuda

  - func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
  variants: function
  dispatch:
- CPU: foreach_tensor_add_scalar_kernel_slow_
+ CompositeExplicitAutograd: foreach_tensor_add_scalar_kernel_slow_
  CUDA: foreach_tensor_add_scalar_kernel_cuda_
  autogen: _foreach_add.Scalar_out

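Note: throughout the _foreach_* section the CPU-only slow kernels are re-registered under CompositeExplicitAutograd, so backends without a fused kernel fall back to the slow path automatically. The public foreach API is unchanged; for reference:

    import torch

    params = [torch.randn(3), torch.randn(5)]
    grads = [torch.randn(3), torch.randn(5)]
    # fused in-place update across the whole tensor list
    torch._foreach_add_(params, grads, alpha=-0.01)
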
@@ -10270,14 +10338,14 @@
10270
10338
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10271
10339
  variants: function
10272
10340
  dispatch:
10273
- CPU: foreach_tensor_add_list_kernel_slow
10341
+ CompositeExplicitAutograd: foreach_tensor_add_list_kernel_slow
10274
10342
  CUDA: foreach_tensor_add_list_kernel_cuda
10275
10343
 
10276
10344
  - func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
10277
10345
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10278
10346
  variants: function
10279
10347
  dispatch:
10280
- CPU: foreach_tensor_add_list_kernel_slow_
10348
+ CompositeExplicitAutograd: foreach_tensor_add_list_kernel_slow_
10281
10349
  CUDA: foreach_tensor_add_list_kernel_cuda_
10282
10350
  autogen: _foreach_add.List_out
10283
10351
 
@@ -10285,14 +10353,14 @@
10285
10353
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10286
10354
  variants: function
10287
10355
  dispatch:
10288
- CPU: foreach_tensor_add_scalarlist_kernel_slow
10356
+ CompositeExplicitAutograd: foreach_tensor_add_scalarlist_kernel_slow
10289
10357
  CUDA: foreach_tensor_add_scalarlist_kernel_cuda
10290
10358
 
10291
10359
  - func: _foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10292
10360
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10293
10361
  variants: function
10294
10362
  dispatch:
10295
- CPU: foreach_tensor_add_scalarlist_kernel_slow_
10363
+ CompositeExplicitAutograd: foreach_tensor_add_scalarlist_kernel_slow_
10296
10364
  CUDA: foreach_tensor_add_scalarlist_kernel_cuda_
10297
10365
  autogen: _foreach_add.ScalarList_out
10298
10366
 
@@ -10300,14 +10368,14 @@
10300
10368
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10301
10369
  variants: function
10302
10370
  dispatch:
10303
- CPU: foreach_tensor_add_tensor_kernel_slow
10371
+ CompositeExplicitAutograd: foreach_tensor_add_tensor_kernel_slow
10304
10372
  CUDA: foreach_tensor_add_tensor_kernel_cuda
10305
10373
 
10306
10374
  - func: _foreach_add_.Tensor(Tensor(a!)[] self, Tensor other, *, Scalar alpha=1) -> ()
10307
10375
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10308
10376
  variants: function
10309
10377
  dispatch:
10310
- CPU: foreach_tensor_add_tensor_kernel_slow_
10378
+ CompositeExplicitAutograd: foreach_tensor_add_tensor_kernel_slow_
10311
10379
  CUDA: foreach_tensor_add_tensor_kernel_cuda_
10312
10380
  autogen: _foreach_add.Tensor_out
10313
10381
 
@@ -10315,14 +10383,14 @@
10315
10383
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10316
10384
  variants: function
10317
10385
  dispatch:
10318
- CPU: foreach_tensor_sub_scalar_kernel_slow
10386
+ CompositeExplicitAutograd: foreach_tensor_sub_scalar_kernel_slow
10319
10387
  CUDA: foreach_tensor_sub_scalar_kernel_cuda
10320
10388
 
10321
10389
  - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10322
10390
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10323
10391
  variants: function
10324
10392
  dispatch:
10325
- CPU: foreach_tensor_sub_scalar_kernel_slow_
10393
+ CompositeExplicitAutograd: foreach_tensor_sub_scalar_kernel_slow_
10326
10394
  CUDA: foreach_tensor_sub_scalar_kernel_cuda_
10327
10395
  autogen: _foreach_sub.Scalar_out
10328
10396
 
@@ -10330,14 +10398,14 @@
10330
10398
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10331
10399
  variants: function
10332
10400
  dispatch:
10333
- CPU: foreach_tensor_sub_list_kernel_slow
10401
+ CompositeExplicitAutograd: foreach_tensor_sub_list_kernel_slow
10334
10402
  CUDA: foreach_tensor_sub_list_kernel_cuda
10335
10403
 
10336
10404
  - func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
10337
10405
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10338
10406
  variants: function
10339
10407
  dispatch:
10340
- CPU: foreach_tensor_sub_list_kernel_slow_
10408
+ CompositeExplicitAutograd: foreach_tensor_sub_list_kernel_slow_
10341
10409
  CUDA: foreach_tensor_sub_list_kernel_cuda_
10342
10410
  autogen: _foreach_sub.List_out
10343
10411
 
@@ -10345,14 +10413,14 @@
10345
10413
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10346
10414
  variants: function
10347
10415
  dispatch:
10348
- CPU: foreach_tensor_sub_scalarlist_kernel_slow
10416
+ CompositeExplicitAutograd: foreach_tensor_sub_scalarlist_kernel_slow
10349
10417
  CUDA: foreach_tensor_sub_scalarlist_kernel_cuda
10350
10418
 
10351
10419
  - func: _foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10352
10420
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10353
10421
  variants: function
10354
10422
  dispatch:
10355
- CPU: foreach_tensor_sub_scalarlist_kernel_slow_
10423
+ CompositeExplicitAutograd: foreach_tensor_sub_scalarlist_kernel_slow_
10356
10424
  CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_
10357
10425
  autogen: _foreach_sub.ScalarList_out
10358
10426
 
@@ -10360,14 +10428,14 @@
10360
10428
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10361
10429
  variants: function
10362
10430
  dispatch:
10363
- CPU: foreach_tensor_mul_scalar_kernel_slow
10431
+ CompositeExplicitAutograd: foreach_tensor_mul_scalar_kernel_slow
10364
10432
  CUDA: foreach_tensor_mul_scalar_kernel_cuda
10365
10433
 
10366
10434
  - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10367
10435
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10368
10436
  variants: function
10369
10437
  dispatch:
10370
- CPU: foreach_tensor_mul_scalar_kernel_slow_
10438
+ CompositeExplicitAutograd: foreach_tensor_mul_scalar_kernel_slow_
10371
10439
  CUDA: foreach_tensor_mul_scalar_kernel_cuda_
10372
10440
  autogen: _foreach_mul.Scalar_out
10373
10441
 
@@ -10375,14 +10443,14 @@
10375
10443
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10376
10444
  variants: function
10377
10445
  dispatch:
10378
- CPU: foreach_tensor_mul_list_kernel_slow
10446
+ CompositeExplicitAutograd: foreach_tensor_mul_list_kernel_slow
10379
10447
  CUDA: foreach_tensor_mul_list_kernel_cuda
10380
10448
 
10381
10449
  - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10382
10450
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10383
10451
  variants: function
10384
10452
  dispatch:
10385
- CPU: foreach_tensor_mul_list_kernel_slow_
10453
+ CompositeExplicitAutograd: foreach_tensor_mul_list_kernel_slow_
10386
10454
  CUDA: foreach_tensor_mul_list_kernel_cuda_
10387
10455
  autogen: _foreach_mul.List_out
10388
10456
 
@@ -10390,14 +10458,14 @@
10390
10458
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10391
10459
  variants: function
10392
10460
  dispatch:
10393
- CPU: foreach_tensor_mul_scalarlist_kernel_slow
10461
+ CompositeExplicitAutograd: foreach_tensor_mul_scalarlist_kernel_slow
10394
10462
  CUDA: foreach_tensor_mul_scalarlist_kernel_cuda
10395
10463
 
10396
10464
  - func: _foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10397
10465
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10398
10466
  variants: function
10399
10467
  dispatch:
10400
- CPU: foreach_tensor_mul_scalarlist_kernel_slow_
10468
+ CompositeExplicitAutograd: foreach_tensor_mul_scalarlist_kernel_slow_
10401
10469
  CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_
10402
10470
  autogen: _foreach_mul.ScalarList_out
10403
10471
 
@@ -10405,14 +10473,14 @@
10405
10473
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10406
10474
  variants: function
10407
10475
  dispatch:
10408
- CPU: foreach_tensor_mul_tensor_kernel_slow
10476
+ CompositeExplicitAutograd: foreach_tensor_mul_tensor_kernel_slow
10409
10477
  CUDA: foreach_tensor_mul_tensor_kernel_cuda
10410
10478
 
10411
10479
  - func: _foreach_mul_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
10412
10480
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10413
10481
  variants: function
10414
10482
  dispatch:
10415
- CPU: foreach_tensor_mul_tensor_kernel_slow_
10483
+ CompositeExplicitAutograd: foreach_tensor_mul_tensor_kernel_slow_
10416
10484
  CUDA: foreach_tensor_mul_tensor_kernel_cuda_
10417
10485
  autogen: _foreach_mul.Tensor_out
10418
10486
 
@@ -10420,14 +10488,14 @@
10420
10488
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10421
10489
  variants: function
10422
10490
  dispatch:
10423
- CPU: foreach_tensor_div_scalar_kernel_slow
10491
+ CompositeExplicitAutograd: foreach_tensor_div_scalar_kernel_slow
10424
10492
  CUDA: foreach_tensor_div_scalar_kernel_cuda
10425
10493
 
10426
10494
  - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10427
10495
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10428
10496
  variants: function
10429
10497
  dispatch:
10430
- CPU: foreach_tensor_div_scalar_kernel_slow_
10498
+ CompositeExplicitAutograd: foreach_tensor_div_scalar_kernel_slow_
10431
10499
  CUDA: foreach_tensor_div_scalar_kernel_cuda_
10432
10500
  autogen: _foreach_div.Scalar_out
10433
10501
 
@@ -10435,14 +10503,14 @@
10435
10503
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10436
10504
  variants: function
10437
10505
  dispatch:
10438
- CPU: foreach_tensor_div_list_kernel_slow
10506
+ CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow
10439
10507
  CUDA: foreach_tensor_div_list_kernel_cuda
10440
10508
 
10441
10509
  - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10442
10510
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10443
10511
  variants: function
10444
10512
  dispatch:
10445
- CPU: foreach_tensor_div_list_kernel_slow_
10513
+ CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow_
10446
10514
  CUDA: foreach_tensor_div_list_kernel_cuda_
10447
10515
  autogen: _foreach_div.List_out
10448
10516
 
@@ -10450,14 +10518,14 @@
10450
10518
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10451
10519
  variants: function
10452
10520
  dispatch:
10453
- CPU: foreach_tensor_div_scalarlist_kernel_slow
10521
+ CompositeExplicitAutograd: foreach_tensor_div_scalarlist_kernel_slow
10454
10522
  CUDA: foreach_tensor_div_scalarlist_kernel_cuda
10455
10523
 
10456
10524
  - func: _foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10457
10525
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10458
10526
  variants: function
10459
10527
  dispatch:
10460
- CPU: foreach_tensor_div_scalarlist_kernel_slow_
10528
+ CompositeExplicitAutograd: foreach_tensor_div_scalarlist_kernel_slow_
10461
10529
  CUDA: foreach_tensor_div_scalarlist_kernel_cuda_
10462
10530
  autogen: _foreach_div.ScalarList_out
10463
10531
 
@@ -10465,14 +10533,14 @@
10465
10533
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10466
10534
  variants: function
10467
10535
  dispatch:
10468
- CPU: foreach_tensor_div_tensor_kernel_slow
10536
+ CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow
10469
10537
  CUDA: foreach_tensor_div_tensor_kernel_cuda
10470
10538
 
10471
10539
  - func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
10472
10540
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10473
10541
  variants: function
10474
10542
  dispatch:
10475
- CPU: foreach_tensor_div_tensor_kernel_slow_
10543
+ CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow_
10476
10544
  CUDA: foreach_tensor_div_tensor_kernel_cuda_
10477
10545
  autogen: _foreach_div.Tensor_out
10478
10546
 
@@ -10480,14 +10548,14 @@
10480
10548
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10481
10549
  variants: function
10482
10550
  dispatch:
10483
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow
10551
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow
10484
10552
  CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
10485
10553
 
10486
10554
  - func: _foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10487
10555
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10488
10556
  variants: function
10489
10557
  dispatch:
10490
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
10558
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow_
10491
10559
  CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
10492
10560
  autogen: _foreach_clamp_max.Scalar_out
10493
10561
 
@@ -10495,14 +10563,14 @@
10495
10563
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10496
10564
  variants: function
10497
10565
  dispatch:
10498
- CPU: foreach_tensor_clamp_max_list_kernel_slow
10566
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow
10499
10567
  CUDA: foreach_tensor_clamp_max_list_kernel_cuda
10500
10568
 
10501
10569
  - func: _foreach_clamp_max_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10502
10570
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10503
10571
  variants: function
10504
10572
  dispatch:
10505
- CPU: foreach_tensor_clamp_max_list_kernel_slow_
10573
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow_
10506
10574
  CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
10507
10575
  autogen: _foreach_clamp_max.List_out
10508
10576
 
@@ -10510,14 +10578,14 @@
10510
10578
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10511
10579
  variants: function
10512
10580
  dispatch:
10513
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow
10581
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow
10514
10582
  CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda
10515
10583
 
10516
10584
  - func: _foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10517
10585
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10518
10586
  variants: function
10519
10587
  dispatch:
10520
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
10588
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow_
10521
10589
  CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
10522
10590
  autogen: _foreach_clamp_max.ScalarList_out
10523
10591
 
@@ -10525,14 +10593,14 @@
10525
10593
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10526
10594
  variants: function
10527
10595
  dispatch:
10528
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow
10596
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow
10529
10597
  CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
10530
10598
 
10531
10599
  - func: _foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10532
10600
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10533
10601
  variants: function
10534
10602
  dispatch:
10535
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
10603
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_
10536
10604
  CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
10537
10605
  autogen: _foreach_clamp_min.Scalar_out
10538
10606
 
@@ -10540,14 +10608,14 @@
10540
10608
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10541
10609
  variants: function
10542
10610
  dispatch:
10543
- CPU: foreach_tensor_clamp_min_list_kernel_slow
10611
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow
10544
10612
  CUDA: foreach_tensor_clamp_min_list_kernel_cuda
10545
10613
 
10546
10614
  - func: _foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10547
10615
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10548
10616
  variants: function
10549
10617
  dispatch:
10550
- CPU: foreach_tensor_clamp_min_list_kernel_slow_
10618
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow_
10551
10619
  CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
10552
10620
  autogen: _foreach_clamp_min.List_out
10553
10621
 
@@ -10555,14 +10623,14 @@
10555
10623
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10556
10624
  variants: function
10557
10625
  dispatch:
10558
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
10626
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow
10559
10627
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
10560
10628
 
10561
10629
  - func: _foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10562
10630
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10563
10631
  variants: function
10564
10632
  dispatch:
10565
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
10633
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow_
10566
10634
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
10567
10635
  autogen: _foreach_clamp_min.ScalarList_out
10568
10636
 
@@ -10571,14 +10639,14 @@
10571
10639
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10572
10640
  variants: function
10573
10641
  dispatch:
10574
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow
10642
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow
10575
10643
  CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
10576
10644
 
10577
10645
  - func: _foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10578
10646
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10579
10647
  variants: function
10580
10648
  dispatch:
10581
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
10649
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_
10582
10650
  CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
10583
10651
  autogen: _foreach_maximum.Scalar_out
10584
10652
 
@@ -10587,14 +10655,14 @@
10587
10655
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10588
10656
  variants: function
10589
10657
  dispatch:
10590
- CPU: foreach_tensor_clamp_min_list_kernel_slow
10658
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow
10591
10659
  CUDA: foreach_tensor_clamp_min_list_kernel_cuda
10592
10660
 
10593
10661
  - func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10594
10662
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10595
10663
  variants: function
10596
10664
  dispatch:
10597
- CPU: foreach_tensor_clamp_min_list_kernel_slow_
10665
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow_
10598
10666
  CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
10599
10667
  autogen: _foreach_maximum.List_out
10600
10668
 
@@ -10603,14 +10671,14 @@
10603
10671
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10604
10672
  variants: function
10605
10673
  dispatch:
10606
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
10674
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow
10607
10675
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
10608
10676
 
10609
10677
  - func: _foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10610
10678
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10611
10679
  variants: function
10612
10680
  dispatch:
10613
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
10681
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow_
10614
10682
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
10615
10683
  autogen: _foreach_maximum.ScalarList_out
10616
10684
 
@@ -10618,14 +10686,14 @@
10618
10686
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10619
10687
  variants: function
10620
10688
  dispatch:
10621
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow
10689
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow
10622
10690
  CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
10623
10691
 
10624
10692
  - func: _foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10625
10693
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10626
10694
  variants: function
10627
10695
  dispatch:
10628
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
10696
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow_
10629
10697
  CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
10630
10698
  autogen: _foreach_minimum.Scalar_out
10631
10699
 
@@ -10633,14 +10701,14 @@
10633
10701
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10634
10702
  variants: function
10635
10703
  dispatch:
10636
- CPU: foreach_tensor_clamp_max_list_kernel_slow
10704
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow
10637
10705
  CUDA: foreach_tensor_clamp_max_list_kernel_cuda
10638
10706
 
10639
10707
  - func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10640
10708
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10641
10709
  variants: function
10642
10710
  dispatch:
10643
- CPU: foreach_tensor_clamp_max_list_kernel_slow_
10711
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow_
10644
10712
  CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
10645
10713
  autogen: _foreach_minimum.List_out
10646
10714
 
@@ -10648,14 +10716,14 @@
10648
10716
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10649
10717
  variants: function
10650
10718
  dispatch:
10651
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow
10719
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow
10652
10720
  CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda
10653
10721
 
10654
10722
  - func: _foreach_minimum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10655
10723
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10656
10724
  variants: function
10657
10725
  dispatch:
10658
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
10726
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow_
10659
10727
  CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
10660
10728
  autogen: _foreach_minimum.ScalarList_out
10661
10729
 
@@ -10663,28 +10731,28 @@
10663
10731
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10664
10732
  variants: function
10665
10733
  dispatch:
10666
- CPU: foreach_tensor_addcdiv_scalar_slow
10734
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalar_slow
10667
10735
  CUDA: foreach_tensor_addcdiv_scalar_cuda
10668
10736
 
10669
10737
  - func: _foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
10670
10738
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10671
10739
  variants: function
10672
10740
  dispatch:
10673
- CPU: foreach_tensor_addcdiv_scalarlist_slow
10741
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalarlist_slow
10674
10742
  CUDA: foreach_tensor_addcdiv_scalarlist_cuda
10675
10743
 
10676
10744
  - func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
10677
10745
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10678
10746
  variants: function
10679
10747
  dispatch:
10680
- CPU: foreach_tensor_addcdiv_tensor_slow
10748
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_tensor_slow
10681
10749
  CUDA: foreach_tensor_addcdiv_tensor_cuda
10682
10750
 
10683
10751
  - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
10684
10752
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10685
10753
  variants: function
10686
10754
  dispatch:
10687
- CPU: foreach_tensor_addcdiv_scalar_slow_
10755
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalar_slow_
10688
10756
  CUDA: foreach_tensor_addcdiv_scalar_cuda_
10689
10757
  autogen: _foreach_addcdiv.Scalar_out
10690
10758
 
@@ -10692,7 +10760,7 @@
10692
10760
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10693
10761
  variants: function
10694
10762
  dispatch:
10695
- CPU: foreach_tensor_addcdiv_scalarlist_slow_
10763
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalarlist_slow_
10696
10764
  CUDA: foreach_tensor_addcdiv_scalarlist_cuda_
10697
10765
  autogen: _foreach_addcdiv.ScalarList_out
10698
10766
 
@@ -10700,7 +10768,7 @@
10700
10768
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10701
10769
  variants: function
10702
10770
  dispatch:
10703
- CPU: foreach_tensor_addcdiv_tensor_slow_
10771
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_tensor_slow_
10704
10772
  CUDA: foreach_tensor_addcdiv_tensor_cuda_
10705
10773
  autogen: _foreach_addcdiv.Tensor_out
10706
10774
 
@@ -10708,28 +10776,28 @@
10708
10776
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10709
10777
  variants: function
10710
10778
  dispatch:
10711
- CPU: foreach_tensor_addcmul_scalar_slow
10779
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalar_slow
10712
10780
  CUDA: foreach_tensor_addcmul_scalar_cuda
10713
10781
 
10714
10782
  - func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
10715
10783
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10716
10784
  variants: function
10717
10785
  dispatch:
10718
- CPU: foreach_tensor_addcmul_scalarlist_slow
10786
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalarlist_slow
10719
10787
  CUDA: foreach_tensor_addcmul_scalarlist_cuda
10720
10788
 
10721
10789
  - func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
10722
10790
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10723
10791
  variants: function
10724
10792
  dispatch:
10725
- CPU: foreach_tensor_addcmul_tensor_slow
10793
+ CompositeExplicitAutograd: foreach_tensor_addcmul_tensor_slow
10726
10794
  CUDA: foreach_tensor_addcmul_tensor_cuda
10727
10795
 
10728
10796
  - func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
10729
10797
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10730
10798
  variants: function
10731
10799
  dispatch:
10732
- CPU: foreach_tensor_addcmul_scalar_slow_
10800
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalar_slow_
10733
10801
  CUDA: foreach_tensor_addcmul_scalar_cuda_
10734
10802
  autogen: _foreach_addcmul.Scalar_out
10735
10803
 
@@ -10737,7 +10805,7 @@
10737
10805
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10738
10806
  variants: function
10739
10807
  dispatch:
10740
- CPU: foreach_tensor_addcmul_scalarlist_slow_
10808
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalarlist_slow_
10741
10809
  CUDA: foreach_tensor_addcmul_scalarlist_cuda_
10742
10810
  autogen: _foreach_addcmul.ScalarList_out
10743
10811
 
@@ -10745,7 +10813,7 @@
10745
10813
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10746
10814
  variants: function
10747
10815
  dispatch:
10748
- CPU: foreach_tensor_addcmul_tensor_slow_
10816
+ CompositeExplicitAutograd: foreach_tensor_addcmul_tensor_slow_
10749
10817
  CUDA: foreach_tensor_addcmul_tensor_cuda_
10750
10818
  autogen: _foreach_addcmul.Tensor_out
10751
10819
 
@@ -10753,14 +10821,14 @@
10753
10821
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10754
10822
  variants: function
10755
10823
  dispatch:
10756
- CPU: foreach_tensor_abs_slow
10824
+ CompositeExplicitAutograd: foreach_tensor_abs_slow
10757
10825
  CUDA: foreach_tensor_abs_cuda
10758
10826
 
10759
10827
  - func: _foreach_abs_(Tensor(a!)[] self) -> ()
10760
10828
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10761
10829
  variants: function
10762
10830
  dispatch:
10763
- CPU: foreach_tensor_abs_slow_
10831
+ CompositeExplicitAutograd: foreach_tensor_abs_slow_
10764
10832
  CUDA: foreach_tensor_abs_cuda_
10765
10833
  autogen: _foreach_abs.out
10766
10834
 
@@ -10768,14 +10836,14 @@
10768
10836
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10769
10837
  variants: function
10770
10838
  dispatch:
10771
- CPU: foreach_tensor_acos_slow
10839
+ CompositeExplicitAutograd: foreach_tensor_acos_slow
10772
10840
  CUDA: foreach_tensor_acos_cuda
10773
10841
 
10774
10842
  - func: _foreach_acos_(Tensor(a!)[] self) -> ()
10775
10843
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10776
10844
  variants: function
10777
10845
  dispatch:
10778
- CPU: foreach_tensor_acos_slow_
10846
+ CompositeExplicitAutograd: foreach_tensor_acos_slow_
10779
10847
  CUDA: foreach_tensor_acos_cuda_
10780
10848
  autogen: _foreach_acos.out
10781
10849
 
@@ -10783,14 +10851,14 @@
10783
10851
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10784
10852
  variants: function
10785
10853
  dispatch:
10786
- CPU: foreach_tensor_asin_slow
10854
+ CompositeExplicitAutograd: foreach_tensor_asin_slow
10787
10855
  CUDA: foreach_tensor_asin_cuda
10788
10856
 
10789
10857
  - func: _foreach_asin_(Tensor(a!)[] self) -> ()
10790
10858
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10791
10859
  variants: function
10792
10860
  dispatch:
10793
- CPU: foreach_tensor_asin_slow_
10861
+ CompositeExplicitAutograd: foreach_tensor_asin_slow_
10794
10862
  CUDA: foreach_tensor_asin_cuda_
10795
10863
  autogen: _foreach_asin.out
10796
10864
 
@@ -10798,14 +10866,14 @@
10798
10866
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10799
10867
  variants: function
10800
10868
  dispatch:
10801
- CPU: foreach_tensor_atan_slow
10869
+ CompositeExplicitAutograd: foreach_tensor_atan_slow
10802
10870
  CUDA: foreach_tensor_atan_cuda
10803
10871
 
10804
10872
  - func: _foreach_atan_(Tensor(a!)[] self) -> ()
10805
10873
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10806
10874
  variants: function
10807
10875
  dispatch:
10808
- CPU: foreach_tensor_atan_slow_
10876
+ CompositeExplicitAutograd: foreach_tensor_atan_slow_
10809
10877
  CUDA: foreach_tensor_atan_cuda_
10810
10878
  autogen: _foreach_atan.out
10811
10879
 
@@ -10813,14 +10881,14 @@
10813
10881
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10814
10882
  variants: function
10815
10883
  dispatch:
10816
- CPU: foreach_tensor_ceil_slow
10884
+ CompositeExplicitAutograd: foreach_tensor_ceil_slow
10817
10885
  CUDA: foreach_tensor_ceil_cuda
10818
10886
 
10819
10887
  - func: _foreach_ceil_(Tensor(a!)[] self) -> ()
10820
10888
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10821
10889
  variants: function
10822
10890
  dispatch:
10823
- CPU: foreach_tensor_ceil_slow_
10891
+ CompositeExplicitAutograd: foreach_tensor_ceil_slow_
10824
10892
  CUDA: foreach_tensor_ceil_cuda_
10825
10893
  autogen: _foreach_ceil.out
10826
10894
 
@@ -10828,14 +10896,14 @@
10828
10896
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10829
10897
  variants: function
10830
10898
  dispatch:
10831
- CPU: foreach_tensor_cos_slow
10899
+ CompositeExplicitAutograd: foreach_tensor_cos_slow
10832
10900
  CUDA: foreach_tensor_cos_cuda
10833
10901
 
10834
10902
  - func: _foreach_cos_(Tensor(a!)[] self) -> ()
10835
10903
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10836
10904
  variants: function
10837
10905
  dispatch:
10838
- CPU: foreach_tensor_cos_slow_
10906
+ CompositeExplicitAutograd: foreach_tensor_cos_slow_
10839
10907
  CUDA: foreach_tensor_cos_cuda_
10840
10908
  autogen: _foreach_cos.out
10841
10909
 
@@ -10843,14 +10911,14 @@
10843
10911
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10844
10912
  variants: function
10845
10913
  dispatch:
10846
- CPU: foreach_tensor_cosh_slow
10914
+ CompositeExplicitAutograd: foreach_tensor_cosh_slow
10847
10915
  CUDA: foreach_tensor_cosh_cuda
10848
10916
 
10849
10917
  - func: _foreach_cosh_(Tensor(a!)[] self) -> ()
10850
10918
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10851
10919
  variants: function
10852
10920
  dispatch:
10853
- CPU: foreach_tensor_cosh_slow_
10921
+ CompositeExplicitAutograd: foreach_tensor_cosh_slow_
10854
10922
  CUDA: foreach_tensor_cosh_cuda_
10855
10923
  autogen: _foreach_cosh.out
10856
10924
 
@@ -10858,14 +10926,14 @@
10858
10926
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10859
10927
  variants: function
10860
10928
  dispatch:
10861
- CPU: foreach_tensor_erf_slow
10929
+ CompositeExplicitAutograd: foreach_tensor_erf_slow
10862
10930
  CUDA: foreach_tensor_erf_cuda
10863
10931
 
10864
10932
  - func: _foreach_erf_(Tensor(a!)[] self) -> ()
10865
10933
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10866
10934
  variants: function
10867
10935
  dispatch:
10868
- CPU: foreach_tensor_erf_slow_
10936
+ CompositeExplicitAutograd: foreach_tensor_erf_slow_
10869
10937
  CUDA: foreach_tensor_erf_cuda_
10870
10938
  autogen: _foreach_erf.out
10871
10939
 
@@ -10873,14 +10941,14 @@
10873
10941
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10874
10942
  variants: function
10875
10943
  dispatch:
10876
- CPU: foreach_tensor_erfc_slow
10944
+ CompositeExplicitAutograd: foreach_tensor_erfc_slow
10877
10945
  CUDA: foreach_tensor_erfc_cuda
10878
10946
 
10879
10947
  - func: _foreach_erfc_(Tensor(a!)[] self) -> ()
10880
10948
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10881
10949
  variants: function
10882
10950
  dispatch:
10883
- CPU: foreach_tensor_erfc_slow_
10951
+ CompositeExplicitAutograd: foreach_tensor_erfc_slow_
10884
10952
  CUDA: foreach_tensor_erfc_cuda_
10885
10953
  autogen: _foreach_erfc.out
10886
10954
 
@@ -10888,14 +10956,14 @@
10888
10956
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10889
10957
  variants: function
10890
10958
  dispatch:
10891
- CPU: foreach_tensor_exp_slow
10959
+ CompositeExplicitAutograd: foreach_tensor_exp_slow
10892
10960
  CUDA: foreach_tensor_exp_cuda
10893
10961
 
10894
10962
  - func: _foreach_exp_(Tensor(a!)[] self) -> ()
10895
10963
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10896
10964
  variants: function
10897
10965
  dispatch:
10898
- CPU: foreach_tensor_exp_slow_
10966
+ CompositeExplicitAutograd: foreach_tensor_exp_slow_
10899
10967
  CUDA: foreach_tensor_exp_cuda_
10900
10968
  autogen: _foreach_exp.out
10901
10969
 
@@ -10903,14 +10971,14 @@
10903
10971
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10904
10972
  variants: function
10905
10973
  dispatch:
10906
- CPU: foreach_tensor_expm1_slow
10974
+ CompositeExplicitAutograd: foreach_tensor_expm1_slow
10907
10975
  CUDA: foreach_tensor_expm1_cuda
10908
10976
 
10909
10977
  - func: _foreach_expm1_(Tensor(a!)[] self) -> ()
10910
10978
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10911
10979
  variants: function
10912
10980
  dispatch:
10913
- CPU: foreach_tensor_expm1_slow_
10981
+ CompositeExplicitAutograd: foreach_tensor_expm1_slow_
10914
10982
  CUDA: foreach_tensor_expm1_cuda_
10915
10983
  autogen: _foreach_expm1.out
10916
10984
 
@@ -10918,14 +10986,14 @@
10918
10986
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10919
10987
  variants: function
10920
10988
  dispatch:
10921
- CPU: foreach_tensor_floor_slow
10989
+ CompositeExplicitAutograd: foreach_tensor_floor_slow
10922
10990
  CUDA: foreach_tensor_floor_cuda
10923
10991
 
10924
10992
  - func: _foreach_floor_(Tensor(a!)[] self) -> ()
10925
10993
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10926
10994
  variants: function
10927
10995
  dispatch:
10928
- CPU: foreach_tensor_floor_slow_
10996
+ CompositeExplicitAutograd: foreach_tensor_floor_slow_
10929
10997
  CUDA: foreach_tensor_floor_cuda_
10930
10998
  autogen: _foreach_floor.out
10931
10999
 
@@ -10933,14 +11001,14 @@
10933
11001
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10934
11002
  variants: function
10935
11003
  dispatch:
10936
- CPU: foreach_tensor_frac_slow
11004
+ CompositeExplicitAutograd: foreach_tensor_frac_slow
10937
11005
  CUDA: foreach_tensor_frac_cuda
10938
11006
 
10939
11007
  - func: _foreach_frac_(Tensor(a!)[] self) -> ()
10940
11008
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10941
11009
  variants: function
10942
11010
  dispatch:
10943
- CPU: foreach_tensor_frac_slow_
11011
+ CompositeExplicitAutograd: foreach_tensor_frac_slow_
10944
11012
  CUDA: foreach_tensor_frac_cuda_
10945
11013
  autogen: _foreach_frac.out
10946
11014
 
@@ -10948,7 +11016,7 @@
10948
11016
  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10949
11017
  variants: function
10950
11018
  dispatch:
10951
- CPU: foreach_tensor_ternary_lerp_slow
11019
+ CompositeExplicitAutograd: foreach_tensor_ternary_lerp_slow
10952
11020
  CUDA: foreach_tensor_lerp_ternary_cuda
10953
11021
  autogen: _foreach_lerp.List_out
10954
11022
 
@@ -10956,7 +11024,7 @@
10956
11024
  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10957
11025
  variants: function
10958
11026
  dispatch:
10959
- CPU: foreach_tensor_ternary_lerp_slow_
11027
+ CompositeExplicitAutograd: foreach_tensor_ternary_lerp_slow_
10960
11028
  CUDA: foreach_tensor_lerp_ternary_cuda_
10961
11029
  autogen: _foreach_lerp.List_out
10962
11030
 
@@ -10964,7 +11032,7 @@
10964
11032
  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10965
11033
  variants: function
10966
11034
  dispatch:
10967
- CPU: foreach_tensor_lerp_list_kernel_slow
11035
+ CompositeExplicitAutograd: foreach_tensor_lerp_list_kernel_slow
10968
11036
  CUDA: foreach_tensor_lerp_list_cuda
10969
11037
  autogen: _foreach_lerp.Scalar_out
10970
11038
 
@@ -10972,7 +11040,7 @@
10972
11040
  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10973
11041
  variants: function
10974
11042
  dispatch:
10975
- CPU: foreach_tensor_lerp_list_kernel_slow_
11043
+ CompositeExplicitAutograd: foreach_tensor_lerp_list_kernel_slow_
10976
11044
  CUDA: foreach_tensor_lerp_list_cuda_
10977
11045
  autogen: _foreach_lerp.Scalar_out
10978
11046
 
@@ -10980,14 +11048,14 @@
10980
11048
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10981
11049
  variants: function
10982
11050
  dispatch:
10983
- CPU: foreach_tensor_lgamma_slow
11051
+ CompositeExplicitAutograd: foreach_tensor_lgamma_slow
10984
11052
  CUDA: foreach_tensor_lgamma_cuda
10985
11053
 
10986
11054
  - func: _foreach_lgamma_(Tensor(a!)[] self) -> ()
10987
11055
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10988
11056
  variants: function
10989
11057
  dispatch:
10990
- CPU: foreach_tensor_lgamma_slow_
11058
+ CompositeExplicitAutograd: foreach_tensor_lgamma_slow_
10991
11059
  CUDA: foreach_tensor_lgamma_cuda_
10992
11060
  autogen: _foreach_lgamma.out
10993
11061
 
@@ -10995,14 +11063,14 @@
10995
11063
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10996
11064
  variants: function
10997
11065
  dispatch:
10998
- CPU: foreach_tensor_log_slow
11066
+ CompositeExplicitAutograd: foreach_tensor_log_slow
10999
11067
  CUDA: foreach_tensor_log_cuda
11000
11068
 
11001
11069
  - func: _foreach_log_(Tensor(a!)[] self) -> ()
11002
11070
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11003
11071
  variants: function
11004
11072
  dispatch:
11005
- CPU: foreach_tensor_log_slow_
11073
+ CompositeExplicitAutograd: foreach_tensor_log_slow_
11006
11074
  CUDA: foreach_tensor_log_cuda_
11007
11075
  autogen: _foreach_log.out
11008
11076
 
@@ -11010,14 +11078,14 @@
11010
11078
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11011
11079
  variants: function
11012
11080
  dispatch:
11013
- CPU: foreach_tensor_log10_slow
11081
+ CompositeExplicitAutograd: foreach_tensor_log10_slow
11014
11082
  CUDA: foreach_tensor_log10_cuda
11015
11083
 
11016
11084
  - func: _foreach_log10_(Tensor(a!)[] self) -> ()
11017
11085
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11018
11086
  variants: function
11019
11087
  dispatch:
11020
- CPU: foreach_tensor_log10_slow_
11088
+ CompositeExplicitAutograd: foreach_tensor_log10_slow_
11021
11089
  CUDA: foreach_tensor_log10_cuda_
11022
11090
  autogen: _foreach_log10.out
11023
11091
 
@@ -11025,14 +11093,14 @@
11025
11093
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11026
11094
  variants: function
11027
11095
  dispatch:
11028
- CPU: foreach_tensor_log1p_slow
11096
+ CompositeExplicitAutograd: foreach_tensor_log1p_slow
11029
11097
  CUDA: foreach_tensor_log1p_cuda
11030
11098
 
11031
11099
  - func: _foreach_log1p_(Tensor(a!)[] self) -> ()
11032
11100
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11033
11101
  variants: function
11034
11102
  dispatch:
11035
- CPU: foreach_tensor_log1p_slow_
11103
+ CompositeExplicitAutograd: foreach_tensor_log1p_slow_
11036
11104
  CUDA: foreach_tensor_log1p_cuda_
11037
11105
  autogen: _foreach_log1p.out
11038
11106
 
@@ -11040,37 +11108,45 @@
11040
11108
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11041
11109
  variants: function
11042
11110
  dispatch:
11043
- CPU: foreach_tensor_log2_slow
11111
+ CompositeExplicitAutograd: foreach_tensor_log2_slow
11044
11112
  CUDA: foreach_tensor_log2_cuda
11045
11113
 
11046
11114
  - func: _foreach_log2_(Tensor(a!)[] self) -> ()
11047
11115
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11048
11116
  variants: function
11049
11117
  dispatch:
11050
- CPU: foreach_tensor_log2_slow_
11118
+ CompositeExplicitAutograd: foreach_tensor_log2_slow_
11051
11119
  CUDA: foreach_tensor_log2_cuda_
11052
11120
  autogen: _foreach_log2.out
11053
11121
 
11122
+ - func: _foreach_max(Tensor[] self) -> Tensor[]
11123
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11124
+ variants: function
11125
+ dispatch:
11126
+ CompositeExplicitAutograd: foreach_tensor_max_slow
11127
+ CUDA: foreach_tensor_max_cuda
11128
+ autogen: _foreach_max.out
11129
+
11054
11130
  - func: _foreach_neg(Tensor[] self) -> Tensor[]
11055
11131
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11056
11132
  variants: function
11057
11133
  dispatch:
11058
- CPU: foreach_tensor_neg_slow
11134
+ CompositeExplicitAutograd: foreach_tensor_neg_slow
11059
11135
  CUDA: foreach_tensor_neg_cuda
11060
11136
 
11061
11137
  - func: _foreach_neg_(Tensor(a!)[] self) -> ()
11062
11138
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11063
11139
  variants: function
11064
11140
  dispatch:
11065
- CPU: foreach_tensor_neg_slow_
11141
+ CompositeExplicitAutograd: foreach_tensor_neg_slow_
11066
11142
  CUDA: foreach_tensor_neg_cuda_
11067
11143
  autogen: _foreach_neg.out
11068
11144
 
11069
- - func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[]
11145
+ - func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2, ScalarType? dtype=None) -> Tensor[]
11070
11146
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11071
11147
  variants: function
11072
11148
  dispatch:
11073
- CPU: foreach_tensor_norm_slow
11149
+ CompositeExplicitAutograd: foreach_tensor_norm_slow
11074
11150
  CUDA: foreach_tensor_norm_cuda
11075
11151
  autogen: _foreach_norm.Scalar_out
11076
11152
 
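Note: besides the dispatch widening, this hunk adds a _foreach_max reduction (one 0-d result per input tensor) and an optional dtype argument to _foreach_norm.Scalar for computing the per-tensor norms in a different precision. A hedged sketch against the upstream private ops, with signatures as declared above (torch-rb exposure may differ):

    import torch

    xs = [torch.randn(8), torch.randn(16)]

    maxes = torch._foreach_max(xs)                                # new op: list of 0-d tensors
    norms = torch._foreach_norm(xs, ord=2, dtype=torch.float64)   # new optional dtype keyword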
@@ -11078,35 +11154,35 @@
11078
11154
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11079
11155
  variants: function
11080
11156
  dispatch:
11081
- CPU: foreach_tensor_pow_list_kernel_slow
11157
+ CompositeExplicitAutograd: foreach_tensor_pow_list_kernel_slow
11082
11158
  CUDA: foreach_tensor_pow_list_kernel_cuda
11083
11159
 
11084
11160
  - func: _foreach_pow.Scalar(Tensor[] self, Scalar exponent) -> Tensor[]
11085
11161
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11086
11162
  variants: function
11087
11163
  dispatch:
11088
- CPU: foreach_tensor_pow_scalar_kernel_slow
11164
+ CompositeExplicitAutograd: foreach_tensor_pow_scalar_kernel_slow
11089
11165
  CUDA: foreach_tensor_pow_scalar_kernel_cuda
11090
11166
 
11091
11167
  - func: _foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[]
11092
11168
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11093
11169
  variants: function
11094
11170
  dispatch:
11095
- CPU: foreach_tensor_pow_scalarlist_kernel_slow
11171
+ CompositeExplicitAutograd: foreach_tensor_pow_scalarlist_kernel_slow
11096
11172
  CUDA: foreach_tensor_pow_scalarlist_kernel_cuda
11097
11173
 
11098
11174
  - func: _foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[]
11099
11175
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11100
11176
  variants: function
11101
11177
  dispatch:
11102
- CPU: foreach_scalar_pow_list_kernel_slow
11178
+ CompositeExplicitAutograd: foreach_scalar_pow_list_kernel_slow
11103
11179
  CUDA: foreach_scalar_pow_list_kernel_cuda
11104
11180
 
11105
11181
  - func: _foreach_pow_.List(Tensor(a!)[] self, Tensor[] exponent) -> ()
11106
11182
  device_check: NoCheck
11107
11183
  variants: function
11108
11184
  dispatch:
11109
- CPU: foreach_tensor_pow_list_kernel_slow_
11185
+ CompositeExplicitAutograd: foreach_tensor_pow_list_kernel_slow_
11110
11186
  CUDA: foreach_tensor_pow_list_kernel_cuda_
11111
11187
  autogen: _foreach_pow.List_out
11112
11188
 
@@ -11114,7 +11190,7 @@
11114
11190
  device_check: NoCheck
11115
11191
  variants: function
11116
11192
  dispatch:
11117
- CPU: foreach_tensor_pow_scalar_kernel_slow_
11193
+ CompositeExplicitAutograd: foreach_tensor_pow_scalar_kernel_slow_
11118
11194
  CUDA: foreach_tensor_pow_scalar_kernel_cuda_
11119
11195
  autogen: _foreach_pow.Scalar_out
11120
11196
 
@@ -11122,7 +11198,7 @@
11122
11198
  device_check: NoCheck
11123
11199
  variants: function
11124
11200
  dispatch:
11125
- CPU: foreach_tensor_pow_scalarlist_kernel_slow_
11201
+ CompositeExplicitAutograd: foreach_tensor_pow_scalarlist_kernel_slow_
11126
11202
  CUDA: foreach_tensor_pow_scalarlist_kernel_cuda_
11127
11203
  autogen: _foreach_pow.ScalarList_out
11128
11204
 
@@ -11130,14 +11206,14 @@
11130
11206
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11131
11207
  variants: function
11132
11208
  dispatch:
11133
- CPU: foreach_tensor_reciprocal_slow
11209
+ CompositeExplicitAutograd: foreach_tensor_reciprocal_slow
11134
11210
  CUDA: foreach_tensor_reciprocal_cuda
11135
11211
 
11136
11212
  - func: _foreach_reciprocal_(Tensor(a!)[] self) -> ()
11137
11213
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11138
11214
  variants: function
11139
11215
  dispatch:
11140
- CPU: foreach_tensor_reciprocal_slow_
11216
+ CompositeExplicitAutograd: foreach_tensor_reciprocal_slow_
11141
11217
  CUDA: foreach_tensor_reciprocal_cuda_
11142
11218
  autogen: _foreach_reciprocal.out
11143
11219
 
@@ -11145,14 +11221,14 @@
11145
11221
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11146
11222
  variants: function
11147
11223
  dispatch:
11148
- CPU: foreach_tensor_round_slow
11224
+ CompositeExplicitAutograd: foreach_tensor_round_slow
11149
11225
  CUDA: foreach_tensor_round_cuda
11150
11226
 
11151
11227
  - func: _foreach_round_(Tensor(a!)[] self) -> ()
11152
11228
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11153
11229
  variants: function
11154
11230
  dispatch:
11155
- CPU: foreach_tensor_round_slow_
11231
+ CompositeExplicitAutograd: foreach_tensor_round_slow_
11156
11232
  CUDA: foreach_tensor_round_cuda_
11157
11233
  autogen: _foreach_round.out
11158
11234
 
@@ -11160,14 +11236,14 @@
11160
11236
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11161
11237
  variants: function
11162
11238
  dispatch:
11163
- CPU: foreach_tensor_sigmoid_slow
11239
+ CompositeExplicitAutograd: foreach_tensor_sigmoid_slow
11164
11240
  CUDA: foreach_tensor_sigmoid_cuda
11165
11241
 
11166
11242
  - func: _foreach_sigmoid_(Tensor(a!)[] self) -> ()
11167
11243
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11168
11244
  variants: function
11169
11245
  dispatch:
11170
- CPU: foreach_tensor_sigmoid_slow_
11246
+ CompositeExplicitAutograd: foreach_tensor_sigmoid_slow_
11171
11247
  CUDA: foreach_tensor_sigmoid_cuda_
11172
11248
  autogen: _foreach_sigmoid.out
11173
11249
 
@@ -11175,14 +11251,14 @@
11175
11251
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11176
11252
  variants: function
11177
11253
  dispatch:
11178
- CPU: foreach_tensor_sign_slow
11254
+ CompositeExplicitAutograd: foreach_tensor_sign_slow
11179
11255
  CUDA: foreach_tensor_sign_cuda
11180
11256
 
11181
11257
  - func: _foreach_sign_(Tensor(a!)[] self) -> ()
11182
11258
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11183
11259
  variants: function
11184
11260
  dispatch:
11185
- CPU: foreach_tensor_sign_slow_
11261
+ CompositeExplicitAutograd: foreach_tensor_sign_slow_
11186
11262
  CUDA: foreach_tensor_sign_cuda_
11187
11263
  autogen: _foreach_sign.out
11188
11264
 
@@ -11190,14 +11266,14 @@
11190
11266
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11191
11267
  variants: function
11192
11268
  dispatch:
11193
- CPU: foreach_tensor_sin_slow
11269
+ CompositeExplicitAutograd: foreach_tensor_sin_slow
11194
11270
  CUDA: foreach_tensor_sin_cuda
11195
11271
 
11196
11272
  - func: _foreach_sin_(Tensor(a!)[] self) -> ()
11197
11273
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11198
11274
  variants: function
11199
11275
  dispatch:
11200
- CPU: foreach_tensor_sin_slow_
11276
+ CompositeExplicitAutograd: foreach_tensor_sin_slow_
11201
11277
  CUDA: foreach_tensor_sin_cuda_
11202
11278
  autogen: _foreach_sin.out
11203
11279
 
@@ -11205,14 +11281,14 @@
11205
11281
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11206
11282
  variants: function
11207
11283
  dispatch:
11208
- CPU: foreach_tensor_sinh_slow
11284
+ CompositeExplicitAutograd: foreach_tensor_sinh_slow
11209
11285
  CUDA: foreach_tensor_sinh_cuda
11210
11286
 
11211
11287
  - func: _foreach_sinh_(Tensor(a!)[] self) -> ()
11212
11288
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11213
11289
  variants: function
11214
11290
  dispatch:
11215
- CPU: foreach_tensor_sinh_slow_
11291
+ CompositeExplicitAutograd: foreach_tensor_sinh_slow_
11216
11292
  CUDA: foreach_tensor_sinh_cuda_
11217
11293
  autogen: _foreach_sinh.out
11218
11294
 
@@ -11220,14 +11296,14 @@
11220
11296
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11221
11297
  variants: function
11222
11298
  dispatch:
11223
- CPU: foreach_tensor_sqrt_slow
11299
+ CompositeExplicitAutograd: foreach_tensor_sqrt_slow
11224
11300
  CUDA: foreach_tensor_sqrt_cuda
11225
11301
 
11226
11302
  - func: _foreach_sqrt_(Tensor(a!)[] self) -> ()
11227
11303
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11228
11304
  variants: function
11229
11305
  dispatch:
11230
- CPU: foreach_tensor_sqrt_slow_
11306
+ CompositeExplicitAutograd: foreach_tensor_sqrt_slow_
11231
11307
  CUDA: foreach_tensor_sqrt_cuda_
11232
11308
  autogen: _foreach_sqrt.out
11233
11309
 
@@ -11235,14 +11311,14 @@
11235
11311
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11236
11312
  variants: function
11237
11313
  dispatch:
11238
- CPU: foreach_tensor_tan_slow
11314
+ CompositeExplicitAutograd: foreach_tensor_tan_slow
11239
11315
  CUDA: foreach_tensor_tan_cuda
11240
11316
 
11241
11317
  - func: _foreach_tan_(Tensor(a!)[] self) -> ()
11242
11318
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11243
11319
  variants: function
11244
11320
  dispatch:
11245
- CPU: foreach_tensor_tan_slow_
11321
+ CompositeExplicitAutograd: foreach_tensor_tan_slow_
11246
11322
  CUDA: foreach_tensor_tan_cuda_
11247
11323
  autogen: _foreach_tan.out
11248
11324
 
@@ -11250,14 +11326,14 @@
11250
11326
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11251
11327
  variants: function
11252
11328
  dispatch:
11253
- CPU: foreach_tensor_tanh_slow
11329
+ CompositeExplicitAutograd: foreach_tensor_tanh_slow
11254
11330
  CUDA: foreach_tensor_tanh_cuda
11255
11331
 
11256
11332
  - func: _foreach_tanh_(Tensor(a!)[] self) -> ()
11257
11333
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11258
11334
  variants: function
11259
11335
  dispatch:
11260
- CPU: foreach_tensor_tanh_slow_
11336
+ CompositeExplicitAutograd: foreach_tensor_tanh_slow_
11261
11337
  CUDA: foreach_tensor_tanh_cuda_
11262
11338
  autogen: _foreach_tanh.out
11263
11339
 
@@ -11265,14 +11341,14 @@
11265
11341
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11266
11342
  variants: function
11267
11343
  dispatch:
11268
- CPU: foreach_tensor_trunc_slow
11344
+ CompositeExplicitAutograd: foreach_tensor_trunc_slow
11269
11345
  CUDA: foreach_tensor_trunc_cuda
11270
11346
 
11271
11347
  - func: _foreach_trunc_(Tensor(a!)[] self) -> ()
11272
11348
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11273
11349
  variants: function
11274
11350
  dispatch:
11275
- CPU: foreach_tensor_trunc_slow_
11351
+ CompositeExplicitAutograd: foreach_tensor_trunc_slow_
11276
11352
  CUDA: foreach_tensor_trunc_cuda_
11277
11353
  autogen: _foreach_trunc.out
11278
11354
 
@@ -11280,7 +11356,7 @@
11280
11356
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11281
11357
  variants: function
11282
11358
  dispatch:
11283
- CPU: foreach_tensor_zero_slow_
11359
+ CompositeExplicitAutograd: foreach_tensor_zero_slow_
11284
11360
  CUDA: foreach_tensor_zero_cuda_
11285
11361
  autogen: _foreach_zero, _foreach_zero.out
11286
11362
 
@@ -11288,9 +11364,15 @@
11288
11364
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11289
11365
  variants: function
11290
11366
  dispatch:
11291
- CPU: foreach_tensor_copy_list_kernel_slow_
11367
+ CompositeExplicitAutograd: foreach_tensor_copy_list_kernel_slow_
11292
11368
  CUDA: foreach_tensor_copy_list_kernel_cuda_
11293
- autogen: _foreach_copy, _foreach_copy.out
11369
+ autogen: _foreach_copy.out
11370
+
11371
+ - func: _foreach_copy(Tensor[] self, Tensor[] src, bool non_blocking=False) -> Tensor[] self_out
11372
+ device_check: NoCheck
11373
+ variants: function
11374
+ dispatch:
11375
+ CompositeExplicitAutograd: _foreach_copy
11294
11376
 
11295
11377
  - func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
11296
11378
  dispatch:
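Note: _foreach_copy_ keeps its in-place schema, but the out-of-place _foreach_copy is now declared explicitly (returning the copied list and carrying the same non_blocking flag) instead of being autogenerated alongside the .out overload. Sketch, assuming the upstream private Python ops:

    import torch

    dst = [torch.zeros(4), torch.zeros(2)]
    src = [torch.ones(4), torch.full((2,), 3.0)]

    torch._foreach_copy_(dst, src)        # in-place, unchanged behaviour
    out = torch._foreach_copy(dst, src)   # functional variant from the new explicit entry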
@@ -14562,6 +14644,16 @@
14562
14644
  NestedTensorCUDA: NestedTensor_to_padded_tensor_cuda
14563
14645
  autogen: to_padded_tensor.out
14564
14646
 
14647
+ - func: _jagged_to_padded_dense_forward(Tensor values, Tensor[] offsets, SymInt[] max_lengths, float padding_value=0.0) -> Tensor
14648
+ variants: function
14649
+ dispatch:
14650
+ CUDA: _fbgemm_jagged_to_padded_dense_forward
14651
+
14652
+ - func: _padded_dense_to_jagged_forward(Tensor dense, Tensor[] offsets, SymInt? total_L=None) -> Tensor
14653
+ variants: function
14654
+ dispatch:
14655
+ CUDA: _fbgemm_dense_to_jagged_forward_symint
14656
+
14565
14657
  - func: _nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor
14566
14658
  dispatch:
14567
14659
  NestedTensorCPU: NestedTensor_softmax_dropout
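Note: the two new CUDA-only entries route conversions between jagged (nested) and padded dense layouts through FBGEMM kernels; they back the jagged-layout nested tensor path rather than being meant for direct calls. A hedged sketch using the public nested tensor API, which is expected to lower to these kernels on CUDA:

    import torch

    nt = torch.nested.nested_tensor(
        [torch.randn(2, 3), torch.randn(4, 3)],
        layout=torch.jagged, device="cuda")

    padded = torch.nested.to_padded_tensor(nt, 0.0)   # dense (2, 4, 3) tensor, zero-padded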
@@ -14636,31 +14728,36 @@
14636
14728
  CUDA: _scaled_dot_product_efficient_attention_backward_cuda
14637
14729
  tags: nondeterministic_seeded
14638
14730
 
14639
- - func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset)
14731
+ - func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14640
14732
  dispatch:
14641
14733
  CUDA: _scaled_dot_product_cudnn_attention_cuda
14642
14734
  tags: nondeterministic_seeded
14643
14735
 
14644
- - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14736
+ - func: _scaled_dot_product_cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
14737
+ dispatch:
14738
+ CUDA: _scaled_dot_product_cudnn_attention_backward_cuda
14739
+ tags: nondeterministic_seeded
14740
+
14741
+ - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? alibi_slopes=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14645
14742
  variants: function
14646
14743
  dispatch:
14647
14744
  CUDA: _flash_attention_forward
14648
14745
  tags: nondeterministic_seeded
14649
14746
 
14650
- - func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
14747
+ - func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None) -> (Tensor, Tensor, Tensor)
14651
14748
  device_check: NoCheck
14652
14749
  variants: function
14653
14750
  dispatch:
14654
14751
  CUDA: _flash_attention_backward
14655
14752
 
14656
14753
  # Returns output, logsumexp if compute_logsumexp
14657
- - func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, int? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
14754
+ - func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt? max_seqlen_q, SymInt? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? seqlen_k=None, int? window_size=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
14658
14755
  variants: function
14659
14756
  dispatch:
14660
14757
  CUDA: _efficient_attention_forward
14661
14758
  tags: nondeterministic_seeded
14662
14759
 
14663
- - func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor)
14760
+ - func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None, int? window_size=None, bool shared_storage_dqdkdv=False) -> (Tensor, Tensor, Tensor, Tensor)
14664
14761
  device_check: NoCheck
14665
14762
  variants: function
14666
14763
  dispatch:
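Note: this hunk expands the cuDNN SDPA forward signature, adds a matching _scaled_dot_product_cudnn_attention_backward, and threads optional sliding-window arguments (window_size_left / window_size_right, plus seqused_k and alibi_slopes on the flash forward) through the flash/efficient attention entry points. User code normally reaches these via scaled_dot_product_attention under a backend context; a minimal sketch assuming the upstream torch.nn.attention API and a cuDNN-capable GPU:

    import torch
    import torch.nn.functional as F
    from torch.nn.attention import SDPBackend, sdpa_kernel

    q = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16, requires_grad=True)
    k = torch.randn_like(q, requires_grad=True)
    v = torch.randn_like(q, requires_grad=True)

    # With the new backward entry, the cuDNN backend can be used for training, not just inference.
    with sdpa_kernel(SDPBackend.CUDNN_ATTENTION):
        out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
    out.sum().backward()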
@@ -15460,11 +15557,11 @@
15460
15557
  CPU: foobar
15461
15558
  autogen: _foobar.out
15462
15559
 
15463
- # Fused Optimizer CUDA kernels.
15464
15560
  - func: _fused_adam_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
15465
15561
  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
15466
15562
  variants: function
15467
15563
  dispatch:
15564
+ CPU: _fused_adam_kernel_cpu_
15468
15565
  CUDA: _fused_adam_kernel_cuda_
15469
15566
  autogen: _fused_adam, _fused_adam.out
15470
15567
 
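Note: the _fused_adam_ overloads gain a CPU dispatch entry alongside CUDA, so per the entries above the fused optimizer path no longer requires parameters to live on an accelerator. Sketch against the public torch.optim API:

    import torch

    model = torch.nn.Linear(10, 10)                       # plain CPU module
    opt = torch.optim.Adam(model.parameters(), lr=1e-3, fused=True)

    loss = model(torch.randn(4, 10)).sum()
    loss.backward()
    opt.step()                                            # dispatches to _fused_adam_kernel_cpu_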
@@ -15474,6 +15571,7 @@
15474
15571
  device_check: NoCheck
15475
15572
  variants: function
15476
15573
  dispatch:
15574
+ CPU: _fused_adam_kernel_cpu_
15477
15575
  CUDA: _fused_adam_kernel_cuda_
15478
15576
  autogen: _fused_adam.tensor_lr, _fused_adam.tensor_lr_out
15479
15577
 
@@ -15481,6 +15579,7 @@
15481
15579
  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
15482
15580
  variants: function
15483
15581
  dispatch:
15582
+ CPU: _fused_adamw_kernel_cpu_
15484
15583
  CUDA: _fused_adamw_kernel_cuda_
15485
15584
  autogen: _fused_adamw, _fused_adamw.out
15486
15585
 
@@ -15490,6 +15589,7 @@
15490
15589
  device_check: NoCheck
15491
15590
  variants: function
15492
15591
  dispatch:
15592
+ CPU: _fused_adamw_kernel_cpu_
15493
15593
  CUDA: _fused_adamw_kernel_cuda_
15494
15594
  autogen: _fused_adamw.tensor_lr, _fused_adamw.tensor_lr_out
15495
15595
 
@@ -15497,6 +15597,7 @@
15497
15597
  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
15498
15598
  variants: function
15499
15599
  dispatch:
15600
+ CPU: _fused_sgd_kernel_cpu_
15500
15601
  CUDA: _fused_sgd_kernel_cuda_
15501
15602
  autogen: _fused_sgd, _fused_sgd.out
15502
15603
 
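Note: _fused_sgd_ picks up the same CPU registration (including the tensor_lr overloads), and the hunk that follows declares a new CPU-only _fused_adagrad_ kernel. A hedged sketch via the public optimizer API, assuming the corresponding fused flags are plumbed through in this release:

    import torch

    p = torch.nn.Parameter(torch.randn(5))
    p.grad = torch.randn(5)

    torch.optim.SGD([p], lr=0.1, fused=True).step()        # CPU fused SGD kernel
    torch.optim.Adagrad([p], lr=0.1, fused=True).step()    # fused Adagrad, CPU-only per the new entry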
@@ -15506,9 +15607,16 @@
15506
15607
  device_check: NoCheck
15507
15608
  variants: function
15508
15609
  dispatch:
15610
+ CPU: _fused_sgd_kernel_cpu_
15509
15611
  CUDA: _fused_sgd_kernel_cuda_
15510
15612
  autogen: _fused_sgd.tensor_lr, _fused_sgd.tensor_lr_out
15511
15613
 
15614
+ - func: _fused_adagrad_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
15615
+ variants: function
15616
+ dispatch:
15617
+ CPU: _fused_adagrad_kernel_cpu_
15618
+ autogen: _fused_adagrad, _fused_adagrad.out
15619
+
15512
15620
  # This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts.
15513
15621
  - func: _propagate_xla_data(Tensor input, Tensor output) -> ()
15514
15622
  variants: function