torch-rb 0.16.0 → 0.17.1

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
@@ -549,8 +549,8 @@
549
549
  structured_delegate: add.out
550
550
  variants: function, method
551
551
  dispatch:
552
- SparseCPU, SparseCUDA: add_sparse
553
- SparseCsrCPU, SparseCsrCUDA: add_sparse_csr
552
+ SparseCPU, SparseCUDA, SparseMeta: add_sparse
553
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr
554
554
  MkldnnCPU: mkldnn_add
555
555
  ZeroTensor: add_zerotensor
556
556
  NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor
@@ -561,8 +561,8 @@
561
561
  variants: method
562
562
  structured_delegate: add.out
563
563
  dispatch:
564
- SparseCPU, SparseCUDA: add_sparse_
565
- SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_
564
+ SparseCPU, SparseCUDA, SparseMeta: add_sparse_
565
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_
566
566
  MkldnnCPU: mkldnn_add_
567
567
  NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor
568
568
  tags: pointwise
@@ -575,9 +575,9 @@
575
575
  Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf)
576
576
  ScalarOnly: add (Bool)
577
577
  dispatch:
578
- SparseCPU: add_out_sparse_cpu
578
+ SparseCPU, SparseMeta: add_out_sparse_cpu
579
579
  SparseCUDA: add_out_sparse_cuda
580
- SparseCsrCPU: add_out_sparse_compressed_cpu
580
+ SparseCsrCPU, SparseCsrMeta: add_out_sparse_compressed_cpu
581
581
  SparseCsrCUDA: add_out_sparse_compressed_cuda
582
582
  MkldnnCPU: mkldnn_add_out
583
583
  MPS: add_out_mps
@@ -1750,6 +1750,7 @@
1750
1750
  - func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor
1751
1751
  variants: function
1752
1752
  dispatch:
1753
+ Meta: copy_meta
1753
1754
  CompositeExplicitAutogradNonFunctional: copy
1754
1755
  tags: core
1755
1756
 
@@ -3127,6 +3128,7 @@
3127
3128
  structured: True
3128
3129
  dispatch:
3129
3130
  CPU, CUDA: isin_Tensor_Tensor_out
3131
+ MPS: isin_Tensor_Tensor_out_mps
3130
3132
 
3131
3133
  - func: isin.Tensor_Tensor(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor
3132
3134
  variants: function
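
The isin.Tensor_Tensor_out kernel gains an MPS dispatch entry in this release, so torch.isin can run natively on Apple-silicon GPUs rather than falling back to the CPU. A minimal sketch, assuming an MPS-capable build (semantics are unchanged from the CPU/CUDA path):

```python
import torch

elements = torch.tensor([1, 2, 3, 4])
test_elements = torch.tensor([2, 4])

# CPU reference: per-element membership test
print(torch.isin(elements, test_elements))  # tensor([False,  True, False,  True])

# With this release the same call can dispatch to the new MPS kernel
if torch.backends.mps.is_available():
    out = torch.isin(elements.to("mps"), test_elements.to("mps"))
    print(out.cpu())
```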
@@ -3268,6 +3270,8 @@
3268
3270
  autogen: native_layer_norm_backward.out
3269
3271
  tags: core
3270
3272
 
3273
+ - func: rms_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor
3274
+
3271
3275
  - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
3272
3276
  variants: function, method
3273
3277
  dispatch:
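
The new rms_norm entry above adds root-mean-square layer normalization as a native op. A hedged sketch of what it computes, checked against a manual reference (the torch.nn.functional.rms_norm binding is assumed to match upstream PyTorch 2.4):

```python
import torch
import torch.nn.functional as F

x = torch.randn(2, 8)
weight = torch.ones(8)
eps = 1e-6

# RMSNorm over the last dimension: x / sqrt(mean(x^2) + eps) * weight
ref = x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps) * weight

out = F.rms_norm(x, normalized_shape=(8,), weight=weight, eps=eps)
torch.testing.assert_close(out, ref)
```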
@@ -3340,10 +3344,31 @@
3340
3344
  dispatch:
3341
3345
  CUDA: _cslt_sparse_mm_search
3342
3346
 
3347
+ - func: _sparse_semi_structured_tile(Tensor input, str algorithm="", bool use_cutlass=True) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
3348
+ dispatch:
3349
+ CUDA: _sparse_semi_structured_tile
3350
+
3351
+ - func: _sparse_semi_structured_apply(Tensor input, Tensor thread_masks) -> (Tensor, Tensor)
3352
+ dispatch:
3353
+ CUDA: _sparse_semi_structured_apply
3354
+
3355
+ - func: _sparse_semi_structured_apply_dense(Tensor input, Tensor thread_masks) -> Tensor
3356
+ dispatch:
3357
+ CUDA: _sparse_semi_structured_apply_dense
3358
+
3359
+ # DEPRECATED: Use torch.__sparse_semi_structured_mm/torch._sparse_semi_structured_addmm instead
3343
3360
  - func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None, ScalarType? out_dtype=None) -> Tensor
3344
3361
  dispatch:
3345
3362
  CUDA: _sparse_semi_structured_linear
3346
3363
 
3364
+ - func: _sparse_semi_structured_mm(Tensor mat1, Tensor mat1_meta, Tensor mat2, *, ScalarType? out_dtype=None) -> Tensor
3365
+ dispatch:
3366
+ CUDA: _sparse_semi_structured_mm
3367
+
3368
+ - func: _sparse_semi_structured_addmm(Tensor input, Tensor mat1, Tensor mat1_meta, Tensor mat2, *, Scalar alpha=1, Scalar beta=1, ScalarType? out_dtype=None) -> Tensor
3369
+ dispatch:
3370
+ CUDA: _sparse_semi_structured_addmm
3371
+
3347
3372
  - func: _mixed_dtypes_linear(Tensor input, Tensor weight, Tensor scale, *, Tensor? bias=None, str? activation=None) -> Tensor
3348
3373
  dispatch:
3349
3374
  CUDA: _mixed_dtypes_linear
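
This hunk adds the low-level 2:4 ("semi-structured") sparsity kernels _sparse_semi_structured_tile/_apply/_apply_dense and the matmul variants _sparse_semi_structured_mm/_sparse_semi_structured_addmm, with _sparse_semi_structured_linear now marked deprecated in their favour. These private CUDA ops sit behind the torch.sparse helpers; a rough sketch of the high-level path they back, assuming a CUDA GPU with semi-structured-sparsity support and an fp16 weight that already follows a 2:4 pattern:

```python
import torch
from torch.sparse import to_sparse_semi_structured

# Weight whose rows follow a 2:4 pattern (two non-zeros in every group of four)
w = torch.rand(128, 128, dtype=torch.float16, device="cuda")
mask = torch.tensor([0, 0, 1, 1], dtype=torch.bool, device="cuda").tile(128, 32)
w = w * mask

w_sparse = to_sparse_semi_structured(w)  # compressed values + metadata
x = torch.rand(128, 128, dtype=torch.float16, device="cuda")

# Matmul on the compressed representation; the _sparse_semi_structured_*
# kernels above are the kind of primitive this path dispatches to.
y = w_sparse @ x
print(y.shape)  # torch.Size([128, 128])
```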
@@ -4084,10 +4109,12 @@
4084
4109
 
4085
4110
  - func: _int_mm(Tensor self, Tensor mat2) -> Tensor
4086
4111
  dispatch:
4112
+ CPU: _int_mm_cpu
4087
4113
  CUDA: _int_mm_cuda
4088
4114
 
4089
4115
  - func: _int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
4090
4116
  dispatch:
4117
+ CPU: _int_mm_out_cpu
4091
4118
  CUDA: _int_mm_out_cuda
4092
4119
 
4093
4120
  - func: _convert_weight_to_int4pack(Tensor self, int innerKTiles) -> Tensor
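
_int_mm, the private int8 × int8 → int32 matmul, gains CPU kernels (_int_mm_cpu / _int_mm_out_cpu) next to the existing CUDA ones. A small sketch of the call, assuming the CPU path accepts ordinary row-major int8 inputs of these sizes:

```python
import torch

a = torch.randint(-128, 127, (32, 64), dtype=torch.int8)
b = torch.randint(-128, 127, (64, 16), dtype=torch.int8)

# int8 inputs, int32 accumulation and output
c = torch._int_mm(a, b)
print(c.dtype, c.shape)  # torch.int32 torch.Size([32, 16])

# Reference check in int32
ref = a.to(torch.int32) @ b.to(torch.int32)
torch.testing.assert_close(c, ref)
```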
@@ -4098,11 +4125,13 @@
4098
4125
  - func: _weight_int4pack_mm(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros) -> Tensor
4099
4126
  dispatch:
4100
4127
  CPU: _weight_int4pack_mm_cpu
4128
+ MPS: _weight_int4pack_mm_mps
4101
4129
  CUDA: _weight_int4pack_mm_cuda
4102
4130
 
4103
4131
  - func: _weight_int8pack_mm(Tensor self, Tensor mat2, Tensor scales) -> Tensor
4104
4132
  dispatch:
4105
4133
  CPU: _weight_int8pack_mm_cpu
4134
+ MPS: _weight_int8pack_mm_mps
4106
4135
 
4107
4136
  - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
4108
4137
  python_module: sparse
@@ -5397,7 +5426,7 @@
5397
5426
  autogen: slice_backward.out
5398
5427
 
5399
5428
  # NB: This op exists to back the implementation of reverse view_funcs for various views (chunk,
5400
- # slice.Tensor, split_with_sizes, et. al.). Currently, these are only used during fake-ification
5429
+ # slice.Tensor, split_with_sizes, et al.). Currently, these are only used during fake-ification
5401
5430
  # of PT2 graph input subclass instances that are views. This means:
5402
5431
  # * This op shouldn't really show up in eager mode (so e.g. XLA shouldn't have to implement it)
5403
5432
  # * This op shouldn't show up in a PT2 graph (so a PT2 backend shouldn't have to implement it)
@@ -5620,10 +5649,12 @@
5620
5649
  - func: _chunk_cat(Tensor[] tensors, int dim, int num_chunks) -> Tensor
5621
5650
  dispatch:
5622
5651
  CompositeExplicitAutograd: _chunk_cat
5652
+ CUDA: _chunk_cat_cuda
5623
5653
 
5624
5654
  - func: _chunk_cat.out(Tensor[] tensors, int dim, int num_chunks, *, Tensor(a!) out) -> Tensor(a!)
5625
5655
  dispatch:
5626
5656
  CompositeExplicitAutograd: _chunk_cat_out
5657
+ CUDA: _chunk_cat_out_cuda
5627
5658
 
5628
5659
  - func: stack(Tensor[] tensors, int dim=0) -> Tensor
5629
5660
  dispatch:
@@ -5689,8 +5720,8 @@
5689
5720
  variants: function, method
5690
5721
  dispatch:
5691
5722
  CompositeExplicitAutograd: sum
5692
- SparseCPU, SparseCUDA: sum_coo
5693
- SparseCsrCPU, SparseCsrCUDA: sum_csr
5723
+ SparseCPU, SparseCUDA, SparseMeta: sum_coo
5724
+ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr
5694
5725
  autogen: sum.out
5695
5726
 
5696
5727
  - func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
@@ -6200,6 +6231,12 @@
6200
6231
  category_override: dummy
6201
6232
  dispatch: {}
6202
6233
 
6234
+ - func: _nested_compute_contiguous_strides_offsets(Tensor nested_size) -> (Tensor, Tensor)
6235
+ variants: function
6236
+ device_check: NoCheck
6237
+ dispatch:
6238
+ CPU, CUDA: _nested_compute_contiguous_strides_offsets
6239
+
6203
6240
  - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor
6204
6241
  dispatch:
6205
6242
  # calls unsqueeze
@@ -6465,7 +6502,7 @@
6465
6502
  CPU: _efficientzerotensor
6466
6503
  CUDA: _efficientzerotensor_cuda
6467
6504
  MPS: _efficientzerotensor_mps
6468
- Meta: _efficientzerotensor_meta
6505
+ Meta: _efficientzerotensor_meta_symint
6469
6506
  autogen: _efficientzerotensor.out
6470
6507
 
6471
6508
  - func: zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -6542,6 +6579,32 @@
6542
6579
  SparseCPU, SparseCUDA: norm_sparse
6543
6580
  autogen: native_norm.ScalarOpt_dim_dtype_out
6544
6581
 
6582
+ - func: _batch_norm_with_update(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor)
6583
+ dispatch:
6584
+ CPU: _batch_norm_with_update_cpu
6585
+ CUDA: _batch_norm_with_update_cuda
6586
+ MPS: _batch_norm_with_update_mps
6587
+ MkldnnCPU: _batch_norm_with_update_mkldnn
6588
+ autogen: _batch_norm_with_update_functional
6589
+
6590
+ - func: _batch_norm_with_update.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd, Tensor(g!) reserve) -> (Tensor(d!), Tensor(e!), Tensor(f!), Tensor(g!))
6591
+ dispatch:
6592
+ CPU: _batch_norm_with_update_cpu_out
6593
+ CUDA: _batch_norm_with_update_cuda_out
6594
+ MPS: _batch_norm_with_update_mps_out
6595
+
6596
+ - func: _batch_norm_no_update(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor)
6597
+ dispatch:
6598
+ CompositeExplicitAutograd: _batch_norm_no_update
6599
+ autogen: _batch_norm_no_update.out
6600
+
6601
+ - func: batch_norm_backward(Tensor grad_out, Tensor input, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, bool update, float eps, bool[3] output_mask, Tensor reserve) -> (Tensor, Tensor, Tensor)
6602
+ dispatch:
6603
+ CPU: _new_batch_norm_backward_cpu
6604
+ CUDA: _new_batch_norm_backward_cuda
6605
+ MPS: _new_batch_norm_backward_mps
6606
+ MkldnnCPU: _new_batch_norm_backward_mkldnn
6607
+
6545
6608
  # TODO: reduce signatures down to one when optional args is available
6546
6609
  - func: _sparse_sum(Tensor self) -> Tensor
6547
6610
 
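The block above introduces _batch_norm_with_update, _batch_norm_no_update and a unified batch_norm_backward, the internal ops the batch-norm path now decomposes into. They are not public API; a hedged sketch of calling the training-time op directly through torch.ops.aten, with the argument order taken from the schema above and the return value assumed to be (output, save_mean, save_invstd, reserve):

```python
import torch

x = torch.randn(4, 3, 8, 8)
weight = torch.ones(3)
bias = torch.zeros(3)
running_mean = torch.zeros(3)
running_var = torch.ones(3)

out, save_mean, save_invstd, reserve = torch.ops.aten._batch_norm_with_update(
    x, weight, bias, running_mean, running_var, 0.1, 1e-5
)

# running_mean / running_var are updated in place (momentum = 0.1)
print(out.shape, running_mean)
```
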
@@ -7042,6 +7105,10 @@
7042
7105
  # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given
7043
7106
  # the default would never make sense.
7044
7107
 
7108
+ - func: _sparse_compressed_tensor_with_dims(int nnz, int dense_dim, int[] size, int[] blocksize, ScalarType index_dtype, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
7109
+ dispatch:
7110
+ CompositeExplicitAutograd: sparse_compressed_tensor_with_dims
7111
+
7045
7112
  - func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
7046
7113
  dispatch:
7047
7114
  CompositeExplicitAutograd: sparse_compressed_tensor
@@ -7146,9 +7213,9 @@
7146
7213
  - func: sparse_dim(Tensor self) -> int
7147
7214
  variants: method
7148
7215
  dispatch:
7149
- CPU, CUDA: sparse_dim_strided
7150
7216
  SparseCPU, SparseCUDA, SparseMeta: sparse_dim_sparse
7151
7217
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_dim_sparse_csr
7218
+ CompositeExplicitAutograd: sparse_dim_default
7152
7219
  device_check: NoCheck
7153
7220
  device_guard: False
7154
7221
 
@@ -7163,9 +7230,9 @@
7163
7230
  - func: dense_dim(Tensor self) -> int
7164
7231
  variants: method
7165
7232
  dispatch:
7166
- CPU, CUDA: dense_dim_strided
7167
7233
  SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse
7168
7234
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr
7235
+ CompositeExplicitAutograd: dense_dim_default
7169
7236
  device_check: NoCheck
7170
7237
  device_guard: False
7171
7238
 
@@ -7296,7 +7363,7 @@
7296
7363
  device_check: NoCheck # Allows copy into different device
7297
7364
  variants: function
7298
7365
  dispatch:
7299
- SparseCPU, SparseCUDA: copy_sparse_
7366
+ SparseCPU, SparseCUDA, SparseMeta: copy_sparse_
7300
7367
  autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out
7301
7368
 
7302
7369
  # By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors
@@ -7399,7 +7466,7 @@
7399
7466
  MkldnnCPU: mkldnn_reorder_conv2d_weight
7400
7467
  autogen: mkldnn_reorder_conv2d_weight.out
7401
7468
 
7402
- - func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1) -> Tensor
7469
+ - func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor
7403
7470
  variants: function
7404
7471
  python_module: nn
7405
7472
  dispatch:
@@ -7647,7 +7714,7 @@
7647
7714
 
7648
7715
  - func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType
7649
7716
 
7650
- - func: can_cast(ScalarType from, ScalarType to) -> bool
7717
+ - func: can_cast(ScalarType from_, ScalarType to) -> bool
7651
7718
  variants: function
7652
7719
 
7653
7720
  - func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType
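
The first parameter of can_cast is renamed from `from` to `from_` in the schema, presumably because `from` is a reserved word in the Python binding; positional calls behave exactly as before. For reference:

```python
import torch

print(torch.can_cast(torch.int64, torch.float32))      # True: int -> float is allowed
print(torch.can_cast(torch.float32, torch.int32))      # False: would discard the fractional part
print(torch.can_cast(torch.complex64, torch.float64))  # False: complex -> real is not allowed
```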
@@ -10222,6 +10289,7 @@
10222
10289
  variants: method, function
10223
10290
  dispatch:
10224
10291
  CompositeExplicitAutograd: alias
10292
+ NestedTensorCPU, NestedTensorCUDA: alias_nested
10225
10293
  tags: core
10226
10294
 
10227
10295
  - func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> ()
@@ -10255,14 +10323,14 @@
10255
10323
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10256
10324
  variants: function
10257
10325
  dispatch:
10258
- CPU: foreach_tensor_add_scalar_kernel_slow
10326
+ CompositeExplicitAutograd: foreach_tensor_add_scalar_kernel_slow
10259
10327
  CUDA: foreach_tensor_add_scalar_kernel_cuda
10260
10328
 
10261
10329
  - func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10262
10330
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10263
10331
  variants: function
10264
10332
  dispatch:
10265
- CPU: foreach_tensor_add_scalar_kernel_slow_
10333
+ CompositeExplicitAutograd: foreach_tensor_add_scalar_kernel_slow_
10266
10334
  CUDA: foreach_tensor_add_scalar_kernel_cuda_
10267
10335
  autogen: _foreach_add.Scalar_out
10268
10336
 
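From here on, the slow reference kernels for the _foreach_* ops move from a CPU-only dispatch entry to CompositeExplicitAutograd, so backends without a fused kernel fall back to the generic per-tensor loop instead of being unsupported. The public behaviour is unchanged; for example:

```python
import torch

params = [torch.ones(3), torch.ones(2)]
grads = [torch.full((3,), 0.5), torch.full((2,), 0.25)]

# Fused-style SGD step: params[i] += (-lr) * grads[i] for every tensor in the list
lr = 0.1
torch._foreach_add_(params, grads, alpha=-lr)
print(params[0])  # tensor([0.9500, 0.9500, 0.9500])
print(params[1])  # tensor([0.9750, 0.9750])
```
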
@@ -10270,14 +10338,14 @@
10270
10338
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10271
10339
  variants: function
10272
10340
  dispatch:
10273
- CPU: foreach_tensor_add_list_kernel_slow
10341
+ CompositeExplicitAutograd: foreach_tensor_add_list_kernel_slow
10274
10342
  CUDA: foreach_tensor_add_list_kernel_cuda
10275
10343
 
10276
10344
  - func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
10277
10345
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10278
10346
  variants: function
10279
10347
  dispatch:
10280
- CPU: foreach_tensor_add_list_kernel_slow_
10348
+ CompositeExplicitAutograd: foreach_tensor_add_list_kernel_slow_
10281
10349
  CUDA: foreach_tensor_add_list_kernel_cuda_
10282
10350
  autogen: _foreach_add.List_out
10283
10351
 
@@ -10285,14 +10353,14 @@
10285
10353
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10286
10354
  variants: function
10287
10355
  dispatch:
10288
- CPU: foreach_tensor_add_scalarlist_kernel_slow
10356
+ CompositeExplicitAutograd: foreach_tensor_add_scalarlist_kernel_slow
10289
10357
  CUDA: foreach_tensor_add_scalarlist_kernel_cuda
10290
10358
 
10291
10359
  - func: _foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10292
10360
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10293
10361
  variants: function
10294
10362
  dispatch:
10295
- CPU: foreach_tensor_add_scalarlist_kernel_slow_
10363
+ CompositeExplicitAutograd: foreach_tensor_add_scalarlist_kernel_slow_
10296
10364
  CUDA: foreach_tensor_add_scalarlist_kernel_cuda_
10297
10365
  autogen: _foreach_add.ScalarList_out
10298
10366
 
@@ -10300,14 +10368,14 @@
10300
10368
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10301
10369
  variants: function
10302
10370
  dispatch:
10303
- CPU: foreach_tensor_add_tensor_kernel_slow
10371
+ CompositeExplicitAutograd: foreach_tensor_add_tensor_kernel_slow
10304
10372
  CUDA: foreach_tensor_add_tensor_kernel_cuda
10305
10373
 
10306
10374
  - func: _foreach_add_.Tensor(Tensor(a!)[] self, Tensor other, *, Scalar alpha=1) -> ()
10307
10375
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10308
10376
  variants: function
10309
10377
  dispatch:
10310
- CPU: foreach_tensor_add_tensor_kernel_slow_
10378
+ CompositeExplicitAutograd: foreach_tensor_add_tensor_kernel_slow_
10311
10379
  CUDA: foreach_tensor_add_tensor_kernel_cuda_
10312
10380
  autogen: _foreach_add.Tensor_out
10313
10381
 
@@ -10315,14 +10383,14 @@
10315
10383
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10316
10384
  variants: function
10317
10385
  dispatch:
10318
- CPU: foreach_tensor_sub_scalar_kernel_slow
10386
+ CompositeExplicitAutograd: foreach_tensor_sub_scalar_kernel_slow
10319
10387
  CUDA: foreach_tensor_sub_scalar_kernel_cuda
10320
10388
 
10321
10389
  - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10322
10390
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10323
10391
  variants: function
10324
10392
  dispatch:
10325
- CPU: foreach_tensor_sub_scalar_kernel_slow_
10393
+ CompositeExplicitAutograd: foreach_tensor_sub_scalar_kernel_slow_
10326
10394
  CUDA: foreach_tensor_sub_scalar_kernel_cuda_
10327
10395
  autogen: _foreach_sub.Scalar_out
10328
10396
 
@@ -10330,14 +10398,14 @@
10330
10398
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10331
10399
  variants: function
10332
10400
  dispatch:
10333
- CPU: foreach_tensor_sub_list_kernel_slow
10401
+ CompositeExplicitAutograd: foreach_tensor_sub_list_kernel_slow
10334
10402
  CUDA: foreach_tensor_sub_list_kernel_cuda
10335
10403
 
10336
10404
  - func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
10337
10405
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10338
10406
  variants: function
10339
10407
  dispatch:
10340
- CPU: foreach_tensor_sub_list_kernel_slow_
10408
+ CompositeExplicitAutograd: foreach_tensor_sub_list_kernel_slow_
10341
10409
  CUDA: foreach_tensor_sub_list_kernel_cuda_
10342
10410
  autogen: _foreach_sub.List_out
10343
10411
 
@@ -10345,14 +10413,14 @@
10345
10413
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10346
10414
  variants: function
10347
10415
  dispatch:
10348
- CPU: foreach_tensor_sub_scalarlist_kernel_slow
10416
+ CompositeExplicitAutograd: foreach_tensor_sub_scalarlist_kernel_slow
10349
10417
  CUDA: foreach_tensor_sub_scalarlist_kernel_cuda
10350
10418
 
10351
10419
  - func: _foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10352
10420
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10353
10421
  variants: function
10354
10422
  dispatch:
10355
- CPU: foreach_tensor_sub_scalarlist_kernel_slow_
10423
+ CompositeExplicitAutograd: foreach_tensor_sub_scalarlist_kernel_slow_
10356
10424
  CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_
10357
10425
  autogen: _foreach_sub.ScalarList_out
10358
10426
 
@@ -10360,14 +10428,14 @@
10360
10428
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10361
10429
  variants: function
10362
10430
  dispatch:
10363
- CPU: foreach_tensor_mul_scalar_kernel_slow
10431
+ CompositeExplicitAutograd: foreach_tensor_mul_scalar_kernel_slow
10364
10432
  CUDA: foreach_tensor_mul_scalar_kernel_cuda
10365
10433
 
10366
10434
  - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10367
10435
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10368
10436
  variants: function
10369
10437
  dispatch:
10370
- CPU: foreach_tensor_mul_scalar_kernel_slow_
10438
+ CompositeExplicitAutograd: foreach_tensor_mul_scalar_kernel_slow_
10371
10439
  CUDA: foreach_tensor_mul_scalar_kernel_cuda_
10372
10440
  autogen: _foreach_mul.Scalar_out
10373
10441
 
@@ -10375,14 +10443,14 @@
10375
10443
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10376
10444
  variants: function
10377
10445
  dispatch:
10378
- CPU: foreach_tensor_mul_list_kernel_slow
10446
+ CompositeExplicitAutograd: foreach_tensor_mul_list_kernel_slow
10379
10447
  CUDA: foreach_tensor_mul_list_kernel_cuda
10380
10448
 
10381
10449
  - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10382
10450
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10383
10451
  variants: function
10384
10452
  dispatch:
10385
- CPU: foreach_tensor_mul_list_kernel_slow_
10453
+ CompositeExplicitAutograd: foreach_tensor_mul_list_kernel_slow_
10386
10454
  CUDA: foreach_tensor_mul_list_kernel_cuda_
10387
10455
  autogen: _foreach_mul.List_out
10388
10456
 
@@ -10390,14 +10458,14 @@
10390
10458
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10391
10459
  variants: function
10392
10460
  dispatch:
10393
- CPU: foreach_tensor_mul_scalarlist_kernel_slow
10461
+ CompositeExplicitAutograd: foreach_tensor_mul_scalarlist_kernel_slow
10394
10462
  CUDA: foreach_tensor_mul_scalarlist_kernel_cuda
10395
10463
 
10396
10464
  - func: _foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10397
10465
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10398
10466
  variants: function
10399
10467
  dispatch:
10400
- CPU: foreach_tensor_mul_scalarlist_kernel_slow_
10468
+ CompositeExplicitAutograd: foreach_tensor_mul_scalarlist_kernel_slow_
10401
10469
  CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_
10402
10470
  autogen: _foreach_mul.ScalarList_out
10403
10471
 
@@ -10405,14 +10473,14 @@
10405
10473
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10406
10474
  variants: function
10407
10475
  dispatch:
10408
- CPU: foreach_tensor_mul_tensor_kernel_slow
10476
+ CompositeExplicitAutograd: foreach_tensor_mul_tensor_kernel_slow
10409
10477
  CUDA: foreach_tensor_mul_tensor_kernel_cuda
10410
10478
 
10411
10479
  - func: _foreach_mul_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
10412
10480
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10413
10481
  variants: function
10414
10482
  dispatch:
10415
- CPU: foreach_tensor_mul_tensor_kernel_slow_
10483
+ CompositeExplicitAutograd: foreach_tensor_mul_tensor_kernel_slow_
10416
10484
  CUDA: foreach_tensor_mul_tensor_kernel_cuda_
10417
10485
  autogen: _foreach_mul.Tensor_out
10418
10486
 
@@ -10420,14 +10488,14 @@
10420
10488
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10421
10489
  variants: function
10422
10490
  dispatch:
10423
- CPU: foreach_tensor_div_scalar_kernel_slow
10491
+ CompositeExplicitAutograd: foreach_tensor_div_scalar_kernel_slow
10424
10492
  CUDA: foreach_tensor_div_scalar_kernel_cuda
10425
10493
 
10426
10494
  - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10427
10495
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10428
10496
  variants: function
10429
10497
  dispatch:
10430
- CPU: foreach_tensor_div_scalar_kernel_slow_
10498
+ CompositeExplicitAutograd: foreach_tensor_div_scalar_kernel_slow_
10431
10499
  CUDA: foreach_tensor_div_scalar_kernel_cuda_
10432
10500
  autogen: _foreach_div.Scalar_out
10433
10501
 
@@ -10435,14 +10503,14 @@
10435
10503
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10436
10504
  variants: function
10437
10505
  dispatch:
10438
- CPU: foreach_tensor_div_list_kernel_slow
10506
+ CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow
10439
10507
  CUDA: foreach_tensor_div_list_kernel_cuda
10440
10508
 
10441
10509
  - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10442
10510
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10443
10511
  variants: function
10444
10512
  dispatch:
10445
- CPU: foreach_tensor_div_list_kernel_slow_
10513
+ CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow_
10446
10514
  CUDA: foreach_tensor_div_list_kernel_cuda_
10447
10515
  autogen: _foreach_div.List_out
10448
10516
 
@@ -10450,14 +10518,14 @@
10450
10518
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10451
10519
  variants: function
10452
10520
  dispatch:
10453
- CPU: foreach_tensor_div_scalarlist_kernel_slow
10521
+ CompositeExplicitAutograd: foreach_tensor_div_scalarlist_kernel_slow
10454
10522
  CUDA: foreach_tensor_div_scalarlist_kernel_cuda
10455
10523
 
10456
10524
  - func: _foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10457
10525
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10458
10526
  variants: function
10459
10527
  dispatch:
10460
- CPU: foreach_tensor_div_scalarlist_kernel_slow_
10528
+ CompositeExplicitAutograd: foreach_tensor_div_scalarlist_kernel_slow_
10461
10529
  CUDA: foreach_tensor_div_scalarlist_kernel_cuda_
10462
10530
  autogen: _foreach_div.ScalarList_out
10463
10531
 
@@ -10465,14 +10533,14 @@
10465
10533
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10466
10534
  variants: function
10467
10535
  dispatch:
10468
- CPU: foreach_tensor_div_tensor_kernel_slow
10536
+ CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow
10469
10537
  CUDA: foreach_tensor_div_tensor_kernel_cuda
10470
10538
 
10471
10539
  - func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
10472
10540
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10473
10541
  variants: function
10474
10542
  dispatch:
10475
- CPU: foreach_tensor_div_tensor_kernel_slow_
10543
+ CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow_
10476
10544
  CUDA: foreach_tensor_div_tensor_kernel_cuda_
10477
10545
  autogen: _foreach_div.Tensor_out
10478
10546
 
@@ -10480,14 +10548,14 @@
10480
10548
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10481
10549
  variants: function
10482
10550
  dispatch:
10483
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow
10551
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow
10484
10552
  CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
10485
10553
 
10486
10554
  - func: _foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10487
10555
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10488
10556
  variants: function
10489
10557
  dispatch:
10490
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
10558
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow_
10491
10559
  CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
10492
10560
  autogen: _foreach_clamp_max.Scalar_out
10493
10561
 
@@ -10495,14 +10563,14 @@
10495
10563
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10496
10564
  variants: function
10497
10565
  dispatch:
10498
- CPU: foreach_tensor_clamp_max_list_kernel_slow
10566
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow
10499
10567
  CUDA: foreach_tensor_clamp_max_list_kernel_cuda
10500
10568
 
10501
10569
  - func: _foreach_clamp_max_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10502
10570
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10503
10571
  variants: function
10504
10572
  dispatch:
10505
- CPU: foreach_tensor_clamp_max_list_kernel_slow_
10573
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow_
10506
10574
  CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
10507
10575
  autogen: _foreach_clamp_max.List_out
10508
10576
 
@@ -10510,14 +10578,14 @@
10510
10578
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10511
10579
  variants: function
10512
10580
  dispatch:
10513
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow
10581
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow
10514
10582
  CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda
10515
10583
 
10516
10584
  - func: _foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10517
10585
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10518
10586
  variants: function
10519
10587
  dispatch:
10520
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
10588
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow_
10521
10589
  CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
10522
10590
  autogen: _foreach_clamp_max.ScalarList_out
10523
10591
 
@@ -10525,14 +10593,14 @@
10525
10593
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10526
10594
  variants: function
10527
10595
  dispatch:
10528
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow
10596
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow
10529
10597
  CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
10530
10598
 
10531
10599
  - func: _foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10532
10600
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10533
10601
  variants: function
10534
10602
  dispatch:
10535
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
10603
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_
10536
10604
  CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
10537
10605
  autogen: _foreach_clamp_min.Scalar_out
10538
10606
 
@@ -10540,14 +10608,14 @@
10540
10608
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10541
10609
  variants: function
10542
10610
  dispatch:
10543
- CPU: foreach_tensor_clamp_min_list_kernel_slow
10611
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow
10544
10612
  CUDA: foreach_tensor_clamp_min_list_kernel_cuda
10545
10613
 
10546
10614
  - func: _foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10547
10615
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10548
10616
  variants: function
10549
10617
  dispatch:
10550
- CPU: foreach_tensor_clamp_min_list_kernel_slow_
10618
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow_
10551
10619
  CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
10552
10620
  autogen: _foreach_clamp_min.List_out
10553
10621
 
@@ -10555,14 +10623,14 @@
10555
10623
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10556
10624
  variants: function
10557
10625
  dispatch:
10558
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
10626
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow
10559
10627
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
10560
10628
 
10561
10629
  - func: _foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10562
10630
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10563
10631
  variants: function
10564
10632
  dispatch:
10565
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
10633
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow_
10566
10634
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
10567
10635
  autogen: _foreach_clamp_min.ScalarList_out
10568
10636
 
@@ -10571,14 +10639,14 @@
10571
10639
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10572
10640
  variants: function
10573
10641
  dispatch:
10574
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow
10642
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow
10575
10643
  CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
10576
10644
 
10577
10645
  - func: _foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10578
10646
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10579
10647
  variants: function
10580
10648
  dispatch:
10581
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
10649
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_
10582
10650
  CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
10583
10651
  autogen: _foreach_maximum.Scalar_out
10584
10652
 
@@ -10587,14 +10655,14 @@
10587
10655
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10588
10656
  variants: function
10589
10657
  dispatch:
10590
- CPU: foreach_tensor_clamp_min_list_kernel_slow
10658
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow
10591
10659
  CUDA: foreach_tensor_clamp_min_list_kernel_cuda
10592
10660
 
10593
10661
  - func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10594
10662
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10595
10663
  variants: function
10596
10664
  dispatch:
10597
- CPU: foreach_tensor_clamp_min_list_kernel_slow_
10665
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_list_kernel_slow_
10598
10666
  CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
10599
10667
  autogen: _foreach_maximum.List_out
10600
10668
 
@@ -10603,14 +10671,14 @@
10603
10671
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10604
10672
  variants: function
10605
10673
  dispatch:
10606
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
10674
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow
10607
10675
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
10608
10676
 
10609
10677
  - func: _foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10610
10678
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10611
10679
  variants: function
10612
10680
  dispatch:
10613
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
10681
+ CompositeExplicitAutograd: foreach_tensor_clamp_min_scalarlist_kernel_slow_
10614
10682
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
10615
10683
  autogen: _foreach_maximum.ScalarList_out
10616
10684
 
@@ -10618,14 +10686,14 @@
10618
10686
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10619
10687
  variants: function
10620
10688
  dispatch:
10621
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow
10689
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow
10622
10690
  CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
10623
10691
 
10624
10692
  - func: _foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10625
10693
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10626
10694
  variants: function
10627
10695
  dispatch:
10628
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
10696
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalar_kernel_slow_
10629
10697
  CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
10630
10698
  autogen: _foreach_minimum.Scalar_out
10631
10699
 
@@ -10633,14 +10701,14 @@
10633
10701
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10634
10702
  variants: function
10635
10703
  dispatch:
10636
- CPU: foreach_tensor_clamp_max_list_kernel_slow
10704
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow
10637
10705
  CUDA: foreach_tensor_clamp_max_list_kernel_cuda
10638
10706
 
10639
10707
  - func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10640
10708
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10641
10709
  variants: function
10642
10710
  dispatch:
10643
- CPU: foreach_tensor_clamp_max_list_kernel_slow_
10711
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_list_kernel_slow_
10644
10712
  CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
10645
10713
  autogen: _foreach_minimum.List_out
10646
10714
 
@@ -10648,14 +10716,14 @@
10648
10716
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10649
10717
  variants: function
10650
10718
  dispatch:
10651
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow
10719
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow
10652
10720
  CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda
10653
10721
 
10654
10722
  - func: _foreach_minimum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10655
10723
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10656
10724
  variants: function
10657
10725
  dispatch:
10658
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
10726
+ CompositeExplicitAutograd: foreach_tensor_clamp_max_scalarlist_kernel_slow_
10659
10727
  CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
10660
10728
  autogen: _foreach_minimum.ScalarList_out
10661
10729
 
@@ -10663,28 +10731,28 @@
10663
10731
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10664
10732
  variants: function
10665
10733
  dispatch:
10666
- CPU: foreach_tensor_addcdiv_scalar_slow
10734
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalar_slow
10667
10735
  CUDA: foreach_tensor_addcdiv_scalar_cuda
10668
10736
 
10669
10737
  - func: _foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
10670
10738
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10671
10739
  variants: function
10672
10740
  dispatch:
10673
- CPU: foreach_tensor_addcdiv_scalarlist_slow
10741
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalarlist_slow
10674
10742
  CUDA: foreach_tensor_addcdiv_scalarlist_cuda
10675
10743
 
10676
10744
  - func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
10677
10745
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10678
10746
  variants: function
10679
10747
  dispatch:
10680
- CPU: foreach_tensor_addcdiv_tensor_slow
10748
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_tensor_slow
10681
10749
  CUDA: foreach_tensor_addcdiv_tensor_cuda
10682
10750
 
10683
10751
  - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
10684
10752
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10685
10753
  variants: function
10686
10754
  dispatch:
10687
- CPU: foreach_tensor_addcdiv_scalar_slow_
10755
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalar_slow_
10688
10756
  CUDA: foreach_tensor_addcdiv_scalar_cuda_
10689
10757
  autogen: _foreach_addcdiv.Scalar_out
10690
10758
 
@@ -10692,7 +10760,7 @@
10692
10760
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10693
10761
  variants: function
10694
10762
  dispatch:
10695
- CPU: foreach_tensor_addcdiv_scalarlist_slow_
10763
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_scalarlist_slow_
10696
10764
  CUDA: foreach_tensor_addcdiv_scalarlist_cuda_
10697
10765
  autogen: _foreach_addcdiv.ScalarList_out
10698
10766
 
@@ -10700,7 +10768,7 @@
10700
10768
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10701
10769
  variants: function
10702
10770
  dispatch:
10703
- CPU: foreach_tensor_addcdiv_tensor_slow_
10771
+ CompositeExplicitAutograd: foreach_tensor_addcdiv_tensor_slow_
10704
10772
  CUDA: foreach_tensor_addcdiv_tensor_cuda_
10705
10773
  autogen: _foreach_addcdiv.Tensor_out
10706
10774
 
@@ -10708,28 +10776,28 @@
10708
10776
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10709
10777
  variants: function
10710
10778
  dispatch:
10711
- CPU: foreach_tensor_addcmul_scalar_slow
10779
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalar_slow
10712
10780
  CUDA: foreach_tensor_addcmul_scalar_cuda
10713
10781
 
10714
10782
  - func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
10715
10783
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10716
10784
  variants: function
10717
10785
  dispatch:
10718
- CPU: foreach_tensor_addcmul_scalarlist_slow
10786
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalarlist_slow
10719
10787
  CUDA: foreach_tensor_addcmul_scalarlist_cuda
10720
10788
 
10721
10789
  - func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
10722
10790
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10723
10791
  variants: function
10724
10792
  dispatch:
10725
- CPU: foreach_tensor_addcmul_tensor_slow
10793
+ CompositeExplicitAutograd: foreach_tensor_addcmul_tensor_slow
10726
10794
  CUDA: foreach_tensor_addcmul_tensor_cuda
10727
10795
 
10728
10796
  - func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
10729
10797
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10730
10798
  variants: function
10731
10799
  dispatch:
10732
- CPU: foreach_tensor_addcmul_scalar_slow_
10800
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalar_slow_
10733
10801
  CUDA: foreach_tensor_addcmul_scalar_cuda_
10734
10802
  autogen: _foreach_addcmul.Scalar_out
10735
10803
 
@@ -10737,7 +10805,7 @@
10737
10805
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10738
10806
  variants: function
10739
10807
  dispatch:
10740
- CPU: foreach_tensor_addcmul_scalarlist_slow_
10808
+ CompositeExplicitAutograd: foreach_tensor_addcmul_scalarlist_slow_
10741
10809
  CUDA: foreach_tensor_addcmul_scalarlist_cuda_
10742
10810
  autogen: _foreach_addcmul.ScalarList_out
10743
10811
 
@@ -10745,7 +10813,7 @@
10745
10813
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10746
10814
  variants: function
10747
10815
  dispatch:
10748
- CPU: foreach_tensor_addcmul_tensor_slow_
10816
+ CompositeExplicitAutograd: foreach_tensor_addcmul_tensor_slow_
10749
10817
  CUDA: foreach_tensor_addcmul_tensor_cuda_
10750
10818
  autogen: _foreach_addcmul.Tensor_out
10751
10819
 
@@ -10753,14 +10821,14 @@
10753
10821
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10754
10822
  variants: function
10755
10823
  dispatch:
10756
- CPU: foreach_tensor_abs_slow
10824
+ CompositeExplicitAutograd: foreach_tensor_abs_slow
10757
10825
  CUDA: foreach_tensor_abs_cuda
10758
10826
 
10759
10827
  - func: _foreach_abs_(Tensor(a!)[] self) -> ()
10760
10828
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10761
10829
  variants: function
10762
10830
  dispatch:
10763
- CPU: foreach_tensor_abs_slow_
10831
+ CompositeExplicitAutograd: foreach_tensor_abs_slow_
10764
10832
  CUDA: foreach_tensor_abs_cuda_
10765
10833
  autogen: _foreach_abs.out
10766
10834
 
@@ -10768,14 +10836,14 @@
10768
10836
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10769
10837
  variants: function
10770
10838
  dispatch:
10771
- CPU: foreach_tensor_acos_slow
10839
+ CompositeExplicitAutograd: foreach_tensor_acos_slow
10772
10840
  CUDA: foreach_tensor_acos_cuda
10773
10841
 
10774
10842
  - func: _foreach_acos_(Tensor(a!)[] self) -> ()
10775
10843
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10776
10844
  variants: function
10777
10845
  dispatch:
10778
- CPU: foreach_tensor_acos_slow_
10846
+ CompositeExplicitAutograd: foreach_tensor_acos_slow_
10779
10847
  CUDA: foreach_tensor_acos_cuda_
10780
10848
  autogen: _foreach_acos.out
10781
10849
 
@@ -10783,14 +10851,14 @@
10783
10851
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10784
10852
  variants: function
10785
10853
  dispatch:
10786
- CPU: foreach_tensor_asin_slow
10854
+ CompositeExplicitAutograd: foreach_tensor_asin_slow
10787
10855
  CUDA: foreach_tensor_asin_cuda
10788
10856
 
10789
10857
  - func: _foreach_asin_(Tensor(a!)[] self) -> ()
10790
10858
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10791
10859
  variants: function
10792
10860
  dispatch:
10793
- CPU: foreach_tensor_asin_slow_
10861
+ CompositeExplicitAutograd: foreach_tensor_asin_slow_
10794
10862
  CUDA: foreach_tensor_asin_cuda_
10795
10863
  autogen: _foreach_asin.out
10796
10864
 
@@ -10798,14 +10866,14 @@
10798
10866
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10799
10867
  variants: function
10800
10868
  dispatch:
10801
- CPU: foreach_tensor_atan_slow
10869
+ CompositeExplicitAutograd: foreach_tensor_atan_slow
10802
10870
  CUDA: foreach_tensor_atan_cuda
10803
10871
 
10804
10872
  - func: _foreach_atan_(Tensor(a!)[] self) -> ()
10805
10873
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10806
10874
  variants: function
10807
10875
  dispatch:
10808
- CPU: foreach_tensor_atan_slow_
10876
+ CompositeExplicitAutograd: foreach_tensor_atan_slow_
10809
10877
  CUDA: foreach_tensor_atan_cuda_
10810
10878
  autogen: _foreach_atan.out
10811
10879
 
@@ -10813,14 +10881,14 @@
10813
10881
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10814
10882
  variants: function
10815
10883
  dispatch:
10816
- CPU: foreach_tensor_ceil_slow
10884
+ CompositeExplicitAutograd: foreach_tensor_ceil_slow
10817
10885
  CUDA: foreach_tensor_ceil_cuda
10818
10886
 
10819
10887
  - func: _foreach_ceil_(Tensor(a!)[] self) -> ()
10820
10888
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10821
10889
  variants: function
10822
10890
  dispatch:
10823
- CPU: foreach_tensor_ceil_slow_
10891
+ CompositeExplicitAutograd: foreach_tensor_ceil_slow_
10824
10892
  CUDA: foreach_tensor_ceil_cuda_
10825
10893
  autogen: _foreach_ceil.out
10826
10894
 
@@ -10828,14 +10896,14 @@
10828
10896
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10829
10897
  variants: function
10830
10898
  dispatch:
10831
- CPU: foreach_tensor_cos_slow
10899
+ CompositeExplicitAutograd: foreach_tensor_cos_slow
10832
10900
  CUDA: foreach_tensor_cos_cuda
10833
10901
 
10834
10902
  - func: _foreach_cos_(Tensor(a!)[] self) -> ()
10835
10903
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10836
10904
  variants: function
10837
10905
  dispatch:
10838
- CPU: foreach_tensor_cos_slow_
10906
+ CompositeExplicitAutograd: foreach_tensor_cos_slow_
10839
10907
  CUDA: foreach_tensor_cos_cuda_
10840
10908
  autogen: _foreach_cos.out
10841
10909
 
@@ -10843,14 +10911,14 @@
10843
10911
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10844
10912
  variants: function
10845
10913
  dispatch:
10846
- CPU: foreach_tensor_cosh_slow
10914
+ CompositeExplicitAutograd: foreach_tensor_cosh_slow
10847
10915
  CUDA: foreach_tensor_cosh_cuda
10848
10916
 
10849
10917
  - func: _foreach_cosh_(Tensor(a!)[] self) -> ()
10850
10918
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10851
10919
  variants: function
10852
10920
  dispatch:
10853
- CPU: foreach_tensor_cosh_slow_
10921
+ CompositeExplicitAutograd: foreach_tensor_cosh_slow_
10854
10922
  CUDA: foreach_tensor_cosh_cuda_
10855
10923
  autogen: _foreach_cosh.out
10856
10924
 
@@ -10858,14 +10926,14 @@
10858
10926
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10859
10927
  variants: function
10860
10928
  dispatch:
10861
- CPU: foreach_tensor_erf_slow
10929
+ CompositeExplicitAutograd: foreach_tensor_erf_slow
10862
10930
  CUDA: foreach_tensor_erf_cuda
10863
10931
 
10864
10932
  - func: _foreach_erf_(Tensor(a!)[] self) -> ()
10865
10933
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10866
10934
  variants: function
10867
10935
  dispatch:
10868
- CPU: foreach_tensor_erf_slow_
10936
+ CompositeExplicitAutograd: foreach_tensor_erf_slow_
10869
10937
  CUDA: foreach_tensor_erf_cuda_
10870
10938
  autogen: _foreach_erf.out
10871
10939
 
@@ -10873,14 +10941,14 @@
10873
10941
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10874
10942
  variants: function
10875
10943
  dispatch:
10876
- CPU: foreach_tensor_erfc_slow
10944
+ CompositeExplicitAutograd: foreach_tensor_erfc_slow
10877
10945
  CUDA: foreach_tensor_erfc_cuda
10878
10946
 
10879
10947
  - func: _foreach_erfc_(Tensor(a!)[] self) -> ()
10880
10948
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10881
10949
  variants: function
10882
10950
  dispatch:
10883
- CPU: foreach_tensor_erfc_slow_
10951
+ CompositeExplicitAutograd: foreach_tensor_erfc_slow_
10884
10952
  CUDA: foreach_tensor_erfc_cuda_
10885
10953
  autogen: _foreach_erfc.out
10886
10954
 
@@ -10888,14 +10956,14 @@
10888
10956
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10889
10957
  variants: function
10890
10958
  dispatch:
10891
- CPU: foreach_tensor_exp_slow
10959
+ CompositeExplicitAutograd: foreach_tensor_exp_slow
10892
10960
  CUDA: foreach_tensor_exp_cuda
10893
10961
 
10894
10962
  - func: _foreach_exp_(Tensor(a!)[] self) -> ()
10895
10963
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10896
10964
  variants: function
10897
10965
  dispatch:
10898
- CPU: foreach_tensor_exp_slow_
10966
+ CompositeExplicitAutograd: foreach_tensor_exp_slow_
10899
10967
  CUDA: foreach_tensor_exp_cuda_
10900
10968
  autogen: _foreach_exp.out
10901
10969
 
@@ -10903,14 +10971,14 @@
10903
10971
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10904
10972
  variants: function
10905
10973
  dispatch:
10906
- CPU: foreach_tensor_expm1_slow
10974
+ CompositeExplicitAutograd: foreach_tensor_expm1_slow
10907
10975
  CUDA: foreach_tensor_expm1_cuda
10908
10976
 
10909
10977
  - func: _foreach_expm1_(Tensor(a!)[] self) -> ()
10910
10978
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10911
10979
  variants: function
10912
10980
  dispatch:
10913
- CPU: foreach_tensor_expm1_slow_
10981
+ CompositeExplicitAutograd: foreach_tensor_expm1_slow_
10914
10982
  CUDA: foreach_tensor_expm1_cuda_
10915
10983
  autogen: _foreach_expm1.out
10916
10984
 
@@ -10918,14 +10986,14 @@
10918
10986
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10919
10987
  variants: function
10920
10988
  dispatch:
10921
- CPU: foreach_tensor_floor_slow
10989
+ CompositeExplicitAutograd: foreach_tensor_floor_slow
10922
10990
  CUDA: foreach_tensor_floor_cuda
10923
10991
 
10924
10992
  - func: _foreach_floor_(Tensor(a!)[] self) -> ()
10925
10993
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10926
10994
  variants: function
10927
10995
  dispatch:
10928
- CPU: foreach_tensor_floor_slow_
10996
+ CompositeExplicitAutograd: foreach_tensor_floor_slow_
10929
10997
  CUDA: foreach_tensor_floor_cuda_
10930
10998
  autogen: _foreach_floor.out
10931
10999
 
@@ -10933,14 +11001,14 @@
10933
11001
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10934
11002
  variants: function
10935
11003
  dispatch:
10936
- CPU: foreach_tensor_frac_slow
11004
+ CompositeExplicitAutograd: foreach_tensor_frac_slow
10937
11005
  CUDA: foreach_tensor_frac_cuda
10938
11006
 
10939
11007
  - func: _foreach_frac_(Tensor(a!)[] self) -> ()
10940
11008
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10941
11009
  variants: function
10942
11010
  dispatch:
10943
- CPU: foreach_tensor_frac_slow_
11011
+ CompositeExplicitAutograd: foreach_tensor_frac_slow_
10944
11012
  CUDA: foreach_tensor_frac_cuda_
10945
11013
  autogen: _foreach_frac.out
10946
11014
 
@@ -10948,7 +11016,7 @@
10948
11016
  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10949
11017
  variants: function
10950
11018
  dispatch:
10951
- CPU: foreach_tensor_ternary_lerp_slow
11019
+ CompositeExplicitAutograd: foreach_tensor_ternary_lerp_slow
10952
11020
  CUDA: foreach_tensor_lerp_ternary_cuda
10953
11021
  autogen: _foreach_lerp.List_out
10954
11022
 
@@ -10956,7 +11024,7 @@
10956
11024
  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10957
11025
  variants: function
10958
11026
  dispatch:
10959
- CPU: foreach_tensor_ternary_lerp_slow_
11027
+ CompositeExplicitAutograd: foreach_tensor_ternary_lerp_slow_
10960
11028
  CUDA: foreach_tensor_lerp_ternary_cuda_
10961
11029
  autogen: _foreach_lerp.List_out
10962
11030
 
@@ -10964,7 +11032,7 @@
10964
11032
  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10965
11033
  variants: function
10966
11034
  dispatch:
10967
- CPU: foreach_tensor_lerp_list_kernel_slow
11035
+ CompositeExplicitAutograd: foreach_tensor_lerp_list_kernel_slow
10968
11036
  CUDA: foreach_tensor_lerp_list_cuda
10969
11037
  autogen: _foreach_lerp.Scalar_out
10970
11038
 
@@ -10972,7 +11040,7 @@
10972
11040
  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10973
11041
  variants: function
10974
11042
  dispatch:
10975
- CPU: foreach_tensor_lerp_list_kernel_slow_
11043
+ CompositeExplicitAutograd: foreach_tensor_lerp_list_kernel_slow_
10976
11044
  CUDA: foreach_tensor_lerp_list_cuda_
10977
11045
  autogen: _foreach_lerp.Scalar_out
10978
11046
 
@@ -10980,14 +11048,14 @@
10980
11048
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10981
11049
  variants: function
10982
11050
  dispatch:
10983
- CPU: foreach_tensor_lgamma_slow
11051
+ CompositeExplicitAutograd: foreach_tensor_lgamma_slow
10984
11052
  CUDA: foreach_tensor_lgamma_cuda
10985
11053
 
10986
11054
  - func: _foreach_lgamma_(Tensor(a!)[] self) -> ()
10987
11055
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10988
11056
  variants: function
10989
11057
  dispatch:
10990
- CPU: foreach_tensor_lgamma_slow_
11058
+ CompositeExplicitAutograd: foreach_tensor_lgamma_slow_
10991
11059
  CUDA: foreach_tensor_lgamma_cuda_
10992
11060
  autogen: _foreach_lgamma.out
10993
11061
 
@@ -10995,14 +11063,14 @@
10995
11063
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10996
11064
  variants: function
10997
11065
  dispatch:
10998
- CPU: foreach_tensor_log_slow
11066
+ CompositeExplicitAutograd: foreach_tensor_log_slow
10999
11067
  CUDA: foreach_tensor_log_cuda
11000
11068
 
11001
11069
  - func: _foreach_log_(Tensor(a!)[] self) -> ()
11002
11070
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11003
11071
  variants: function
11004
11072
  dispatch:
11005
- CPU: foreach_tensor_log_slow_
11073
+ CompositeExplicitAutograd: foreach_tensor_log_slow_
11006
11074
  CUDA: foreach_tensor_log_cuda_
11007
11075
  autogen: _foreach_log.out
11008
11076
 
@@ -11010,14 +11078,14 @@
11010
11078
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11011
11079
  variants: function
11012
11080
  dispatch:
11013
- CPU: foreach_tensor_log10_slow
11081
+ CompositeExplicitAutograd: foreach_tensor_log10_slow
11014
11082
  CUDA: foreach_tensor_log10_cuda
11015
11083
 
11016
11084
  - func: _foreach_log10_(Tensor(a!)[] self) -> ()
11017
11085
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11018
11086
  variants: function
11019
11087
  dispatch:
11020
- CPU: foreach_tensor_log10_slow_
11088
+ CompositeExplicitAutograd: foreach_tensor_log10_slow_
11021
11089
  CUDA: foreach_tensor_log10_cuda_
11022
11090
  autogen: _foreach_log10.out
11023
11091
 
@@ -11025,14 +11093,14 @@
11025
11093
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11026
11094
  variants: function
11027
11095
  dispatch:
11028
- CPU: foreach_tensor_log1p_slow
11096
+ CompositeExplicitAutograd: foreach_tensor_log1p_slow
11029
11097
  CUDA: foreach_tensor_log1p_cuda
11030
11098
 
11031
11099
  - func: _foreach_log1p_(Tensor(a!)[] self) -> ()
11032
11100
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11033
11101
  variants: function
11034
11102
  dispatch:
11035
- CPU: foreach_tensor_log1p_slow_
11103
+ CompositeExplicitAutograd: foreach_tensor_log1p_slow_
11036
11104
  CUDA: foreach_tensor_log1p_cuda_
11037
11105
  autogen: _foreach_log1p.out
11038
11106
 
@@ -11040,37 +11108,45 @@
11040
11108
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11041
11109
  variants: function
11042
11110
  dispatch:
11043
- CPU: foreach_tensor_log2_slow
11111
+ CompositeExplicitAutograd: foreach_tensor_log2_slow
11044
11112
  CUDA: foreach_tensor_log2_cuda
11045
11113
 
11046
11114
  - func: _foreach_log2_(Tensor(a!)[] self) -> ()
11047
11115
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11048
11116
  variants: function
11049
11117
  dispatch:
11050
- CPU: foreach_tensor_log2_slow_
11118
+ CompositeExplicitAutograd: foreach_tensor_log2_slow_
11051
11119
  CUDA: foreach_tensor_log2_cuda_
11052
11120
  autogen: _foreach_log2.out
11053
11121
 
11122
+ - func: _foreach_max(Tensor[] self) -> Tensor[]
11123
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11124
+ variants: function
11125
+ dispatch:
11126
+ CompositeExplicitAutograd: foreach_tensor_max_slow
11127
+ CUDA: foreach_tensor_max_cuda
11128
+ autogen: _foreach_max.out
11129
+
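_foreach_max is a new list-level reduction added in this release. A rough sketch, assuming it surfaces under the usual private name torch._foreach_max in the Python frontend and returns one 0-dim tensor per input:

    import torch

    grads = [torch.randn(4), torch.randn(2, 3)]
    # Expected to be equivalent to [g.max() for g in grads]; fused on CUDA,
    # a per-tensor loop on the CompositeExplicitAutograd slow path.
    maxes = torch._foreach_max(grads)
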
11054
11130
  - func: _foreach_neg(Tensor[] self) -> Tensor[]
11055
11131
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11056
11132
  variants: function
11057
11133
  dispatch:
11058
- CPU: foreach_tensor_neg_slow
11134
+ CompositeExplicitAutograd: foreach_tensor_neg_slow
11059
11135
  CUDA: foreach_tensor_neg_cuda
11060
11136
 
11061
11137
  - func: _foreach_neg_(Tensor(a!)[] self) -> ()
11062
11138
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11063
11139
  variants: function
11064
11140
  dispatch:
11065
- CPU: foreach_tensor_neg_slow_
11141
+ CompositeExplicitAutograd: foreach_tensor_neg_slow_
11066
11142
  CUDA: foreach_tensor_neg_cuda_
11067
11143
  autogen: _foreach_neg.out
11068
11144
 
11069
- - func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[]
11145
+ - func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2, ScalarType? dtype=None) -> Tensor[]
11070
11146
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11071
11147
  variants: function
11072
11148
  dispatch:
11073
- CPU: foreach_tensor_norm_slow
11149
+ CompositeExplicitAutograd: foreach_tensor_norm_slow
11074
11150
  CUDA: foreach_tensor_norm_cuda
11075
11151
  autogen: _foreach_norm.Scalar_out
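
_foreach_norm.Scalar gains an optional dtype argument. A hedged sketch (Python frontend), assuming dtype behaves like the dtype parameter of torch.linalg.vector_norm, i.e. the inputs are treated as that dtype for the reduction and the per-tensor norms are returned in it:

    import torch

    params = [torch.randn(10, dtype=torch.float16) for _ in range(3)]
    # Accumulate the per-tensor 2-norms in float32 even though the inputs are float16.
    norms = torch._foreach_norm(params, 2, dtype=torch.float32)
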
11076
11152
 
@@ -11078,35 +11154,35 @@
11078
11154
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11079
11155
  variants: function
11080
11156
  dispatch:
11081
- CPU: foreach_tensor_pow_list_kernel_slow
11157
+ CompositeExplicitAutograd: foreach_tensor_pow_list_kernel_slow
11082
11158
  CUDA: foreach_tensor_pow_list_kernel_cuda
11083
11159
 
11084
11160
  - func: _foreach_pow.Scalar(Tensor[] self, Scalar exponent) -> Tensor[]
11085
11161
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11086
11162
  variants: function
11087
11163
  dispatch:
11088
- CPU: foreach_tensor_pow_scalar_kernel_slow
11164
+ CompositeExplicitAutograd: foreach_tensor_pow_scalar_kernel_slow
11089
11165
  CUDA: foreach_tensor_pow_scalar_kernel_cuda
11090
11166
 
11091
11167
  - func: _foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[]
11092
11168
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11093
11169
  variants: function
11094
11170
  dispatch:
11095
- CPU: foreach_tensor_pow_scalarlist_kernel_slow
11171
+ CompositeExplicitAutograd: foreach_tensor_pow_scalarlist_kernel_slow
11096
11172
  CUDA: foreach_tensor_pow_scalarlist_kernel_cuda
11097
11173
 
11098
11174
  - func: _foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[]
11099
11175
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11100
11176
  variants: function
11101
11177
  dispatch:
11102
- CPU: foreach_scalar_pow_list_kernel_slow
11178
+ CompositeExplicitAutograd: foreach_scalar_pow_list_kernel_slow
11103
11179
  CUDA: foreach_scalar_pow_list_kernel_cuda
11104
11180
 
11105
11181
  - func: _foreach_pow_.List(Tensor(a!)[] self, Tensor[] exponent) -> ()
11106
11182
  device_check: NoCheck
11107
11183
  variants: function
11108
11184
  dispatch:
11109
- CPU: foreach_tensor_pow_list_kernel_slow_
11185
+ CompositeExplicitAutograd: foreach_tensor_pow_list_kernel_slow_
11110
11186
  CUDA: foreach_tensor_pow_list_kernel_cuda_
11111
11187
  autogen: _foreach_pow.List_out
11112
11188
 
@@ -11114,7 +11190,7 @@
11114
11190
  device_check: NoCheck
11115
11191
  variants: function
11116
11192
  dispatch:
11117
- CPU: foreach_tensor_pow_scalar_kernel_slow_
11193
+ CompositeExplicitAutograd: foreach_tensor_pow_scalar_kernel_slow_
11118
11194
  CUDA: foreach_tensor_pow_scalar_kernel_cuda_
11119
11195
  autogen: _foreach_pow.Scalar_out
11120
11196
 
@@ -11122,7 +11198,7 @@
11122
11198
  device_check: NoCheck
11123
11199
  variants: function
11124
11200
  dispatch:
11125
- CPU: foreach_tensor_pow_scalarlist_kernel_slow_
11201
+ CompositeExplicitAutograd: foreach_tensor_pow_scalarlist_kernel_slow_
11126
11202
  CUDA: foreach_tensor_pow_scalarlist_kernel_cuda_
11127
11203
  autogen: _foreach_pow.ScalarList_out
11128
11204
 
@@ -11130,14 +11206,14 @@
11130
11206
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11131
11207
  variants: function
11132
11208
  dispatch:
11133
- CPU: foreach_tensor_reciprocal_slow
11209
+ CompositeExplicitAutograd: foreach_tensor_reciprocal_slow
11134
11210
  CUDA: foreach_tensor_reciprocal_cuda
11135
11211
 
11136
11212
  - func: _foreach_reciprocal_(Tensor(a!)[] self) -> ()
11137
11213
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11138
11214
  variants: function
11139
11215
  dispatch:
11140
- CPU: foreach_tensor_reciprocal_slow_
11216
+ CompositeExplicitAutograd: foreach_tensor_reciprocal_slow_
11141
11217
  CUDA: foreach_tensor_reciprocal_cuda_
11142
11218
  autogen: _foreach_reciprocal.out
11143
11219
 
@@ -11145,14 +11221,14 @@
11145
11221
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11146
11222
  variants: function
11147
11223
  dispatch:
11148
- CPU: foreach_tensor_round_slow
11224
+ CompositeExplicitAutograd: foreach_tensor_round_slow
11149
11225
  CUDA: foreach_tensor_round_cuda
11150
11226
 
11151
11227
  - func: _foreach_round_(Tensor(a!)[] self) -> ()
11152
11228
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11153
11229
  variants: function
11154
11230
  dispatch:
11155
- CPU: foreach_tensor_round_slow_
11231
+ CompositeExplicitAutograd: foreach_tensor_round_slow_
11156
11232
  CUDA: foreach_tensor_round_cuda_
11157
11233
  autogen: _foreach_round.out
11158
11234
 
@@ -11160,14 +11236,14 @@
11160
11236
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11161
11237
  variants: function
11162
11238
  dispatch:
11163
- CPU: foreach_tensor_sigmoid_slow
11239
+ CompositeExplicitAutograd: foreach_tensor_sigmoid_slow
11164
11240
  CUDA: foreach_tensor_sigmoid_cuda
11165
11241
 
11166
11242
  - func: _foreach_sigmoid_(Tensor(a!)[] self) -> ()
11167
11243
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11168
11244
  variants: function
11169
11245
  dispatch:
11170
- CPU: foreach_tensor_sigmoid_slow_
11246
+ CompositeExplicitAutograd: foreach_tensor_sigmoid_slow_
11171
11247
  CUDA: foreach_tensor_sigmoid_cuda_
11172
11248
  autogen: _foreach_sigmoid.out
11173
11249
 
@@ -11175,14 +11251,14 @@
11175
11251
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11176
11252
  variants: function
11177
11253
  dispatch:
11178
- CPU: foreach_tensor_sign_slow
11254
+ CompositeExplicitAutograd: foreach_tensor_sign_slow
11179
11255
  CUDA: foreach_tensor_sign_cuda
11180
11256
 
11181
11257
  - func: _foreach_sign_(Tensor(a!)[] self) -> ()
11182
11258
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11183
11259
  variants: function
11184
11260
  dispatch:
11185
- CPU: foreach_tensor_sign_slow_
11261
+ CompositeExplicitAutograd: foreach_tensor_sign_slow_
11186
11262
  CUDA: foreach_tensor_sign_cuda_
11187
11263
  autogen: _foreach_sign.out
11188
11264
 
@@ -11190,14 +11266,14 @@
11190
11266
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11191
11267
  variants: function
11192
11268
  dispatch:
11193
- CPU: foreach_tensor_sin_slow
11269
+ CompositeExplicitAutograd: foreach_tensor_sin_slow
11194
11270
  CUDA: foreach_tensor_sin_cuda
11195
11271
 
11196
11272
  - func: _foreach_sin_(Tensor(a!)[] self) -> ()
11197
11273
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11198
11274
  variants: function
11199
11275
  dispatch:
11200
- CPU: foreach_tensor_sin_slow_
11276
+ CompositeExplicitAutograd: foreach_tensor_sin_slow_
11201
11277
  CUDA: foreach_tensor_sin_cuda_
11202
11278
  autogen: _foreach_sin.out
11203
11279
 
@@ -11205,14 +11281,14 @@
11205
11281
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11206
11282
  variants: function
11207
11283
  dispatch:
11208
- CPU: foreach_tensor_sinh_slow
11284
+ CompositeExplicitAutograd: foreach_tensor_sinh_slow
11209
11285
  CUDA: foreach_tensor_sinh_cuda
11210
11286
 
11211
11287
  - func: _foreach_sinh_(Tensor(a!)[] self) -> ()
11212
11288
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11213
11289
  variants: function
11214
11290
  dispatch:
11215
- CPU: foreach_tensor_sinh_slow_
11291
+ CompositeExplicitAutograd: foreach_tensor_sinh_slow_
11216
11292
  CUDA: foreach_tensor_sinh_cuda_
11217
11293
  autogen: _foreach_sinh.out
11218
11294
 
@@ -11220,14 +11296,14 @@
11220
11296
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11221
11297
  variants: function
11222
11298
  dispatch:
11223
- CPU: foreach_tensor_sqrt_slow
11299
+ CompositeExplicitAutograd: foreach_tensor_sqrt_slow
11224
11300
  CUDA: foreach_tensor_sqrt_cuda
11225
11301
 
11226
11302
  - func: _foreach_sqrt_(Tensor(a!)[] self) -> ()
11227
11303
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11228
11304
  variants: function
11229
11305
  dispatch:
11230
- CPU: foreach_tensor_sqrt_slow_
11306
+ CompositeExplicitAutograd: foreach_tensor_sqrt_slow_
11231
11307
  CUDA: foreach_tensor_sqrt_cuda_
11232
11308
  autogen: _foreach_sqrt.out
11233
11309
 
@@ -11235,14 +11311,14 @@
11235
11311
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11236
11312
  variants: function
11237
11313
  dispatch:
11238
- CPU: foreach_tensor_tan_slow
11314
+ CompositeExplicitAutograd: foreach_tensor_tan_slow
11239
11315
  CUDA: foreach_tensor_tan_cuda
11240
11316
 
11241
11317
  - func: _foreach_tan_(Tensor(a!)[] self) -> ()
11242
11318
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11243
11319
  variants: function
11244
11320
  dispatch:
11245
- CPU: foreach_tensor_tan_slow_
11321
+ CompositeExplicitAutograd: foreach_tensor_tan_slow_
11246
11322
  CUDA: foreach_tensor_tan_cuda_
11247
11323
  autogen: _foreach_tan.out
11248
11324
 
@@ -11250,14 +11326,14 @@
11250
11326
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11251
11327
  variants: function
11252
11328
  dispatch:
11253
- CPU: foreach_tensor_tanh_slow
11329
+ CompositeExplicitAutograd: foreach_tensor_tanh_slow
11254
11330
  CUDA: foreach_tensor_tanh_cuda
11255
11331
 
11256
11332
  - func: _foreach_tanh_(Tensor(a!)[] self) -> ()
11257
11333
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11258
11334
  variants: function
11259
11335
  dispatch:
11260
- CPU: foreach_tensor_tanh_slow_
11336
+ CompositeExplicitAutograd: foreach_tensor_tanh_slow_
11261
11337
  CUDA: foreach_tensor_tanh_cuda_
11262
11338
  autogen: _foreach_tanh.out
11263
11339
 
@@ -11265,14 +11341,14 @@
11265
11341
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11266
11342
  variants: function
11267
11343
  dispatch:
11268
- CPU: foreach_tensor_trunc_slow
11344
+ CompositeExplicitAutograd: foreach_tensor_trunc_slow
11269
11345
  CUDA: foreach_tensor_trunc_cuda
11270
11346
 
11271
11347
  - func: _foreach_trunc_(Tensor(a!)[] self) -> ()
11272
11348
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11273
11349
  variants: function
11274
11350
  dispatch:
11275
- CPU: foreach_tensor_trunc_slow_
11351
+ CompositeExplicitAutograd: foreach_tensor_trunc_slow_
11276
11352
  CUDA: foreach_tensor_trunc_cuda_
11277
11353
  autogen: _foreach_trunc.out
11278
11354
 
@@ -11280,7 +11356,7 @@
11280
11356
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11281
11357
  variants: function
11282
11358
  dispatch:
11283
- CPU: foreach_tensor_zero_slow_
11359
+ CompositeExplicitAutograd: foreach_tensor_zero_slow_
11284
11360
  CUDA: foreach_tensor_zero_cuda_
11285
11361
  autogen: _foreach_zero, _foreach_zero.out
11286
11362
 
@@ -11288,9 +11364,15 @@
11288
11364
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
11289
11365
  variants: function
11290
11366
  dispatch:
11291
- CPU: foreach_tensor_copy_list_kernel_slow_
11367
+ CompositeExplicitAutograd: foreach_tensor_copy_list_kernel_slow_
11292
11368
  CUDA: foreach_tensor_copy_list_kernel_cuda_
11293
- autogen: _foreach_copy, _foreach_copy.out
11369
+ autogen: _foreach_copy.out
11370
+
11371
+ - func: _foreach_copy(Tensor[] self, Tensor[] src, bool non_blocking=False) -> Tensor[] self_out
11372
+ device_check: NoCheck
11373
+ variants: function
11374
+ dispatch:
11375
+ CompositeExplicitAutograd: _foreach_copy
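
_foreach_copy previously existed only as the in-place _foreach_copy_, with the functional form autogenerated; this release gives the functional variant an explicit schema returning Tensor[] self_out, which the functionalization/compile stack consumes, and trims the autogen line accordingly. The in-place form is the one typically called directly; a minimal sketch (Python frontend):

    import torch

    dst = [torch.zeros(3), torch.zeros(2)]
    src = [torch.arange(3.0), torch.ones(2)]

    # Copies each src[i] into dst[i] in place, as a single list-level op.
    torch._foreach_copy_(dst, src)
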
11294
11376
 
11295
11377
  - func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
11296
11378
  dispatch:
@@ -14562,6 +14644,16 @@
14562
14644
  NestedTensorCUDA: NestedTensor_to_padded_tensor_cuda
14563
14645
  autogen: to_padded_tensor.out
14564
14646
 
14647
+ - func: _jagged_to_padded_dense_forward(Tensor values, Tensor[] offsets, SymInt[] max_lengths, float padding_value=0.0) -> Tensor
14648
+ variants: function
14649
+ dispatch:
14650
+ CUDA: _fbgemm_jagged_to_padded_dense_forward
14651
+
14652
+ - func: _padded_dense_to_jagged_forward(Tensor dense, Tensor[] offsets, SymInt? total_L=None) -> Tensor
14653
+ variants: function
14654
+ dispatch:
14655
+ CUDA: _fbgemm_dense_to_jagged_forward_symint
14656
+
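These two new private ops back jagged/padded conversion for nested tensors and dispatch to FBGEMM kernels on CUDA. They are not meant to be called directly; the public path is the nested-tensor API, sketched here with the default strided layout (the new ops appear to serve as the CUDA fast path for the jagged layout):

    import torch

    nt = torch.nested.nested_tensor([torch.randn(2, 4), torch.randn(3, 4)])
    # Pads the shorter component up to the max length: result shape (2, 3, 4).
    padded = torch.nested.to_padded_tensor(nt, 0.0)
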
14565
14657
  - func: _nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor
14566
14658
  dispatch:
14567
14659
  NestedTensorCPU: NestedTensor_softmax_dropout
@@ -14636,31 +14728,36 @@
14636
14728
  CUDA: _scaled_dot_product_efficient_attention_backward_cuda
14637
14729
  tags: nondeterministic_seeded
14638
14730
 
14639
- - func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset)
14731
+ - func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14640
14732
  dispatch:
14641
14733
  CUDA: _scaled_dot_product_cudnn_attention_cuda
14642
14734
  tags: nondeterministic_seeded
14643
14735
 
14644
- - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14736
+ - func: _scaled_dot_product_cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
14737
+ dispatch:
14738
+ CUDA: _scaled_dot_product_cudnn_attention_backward_cuda
14739
+ tags: nondeterministic_seeded
14740
+
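The cuDNN scaled-dot-product-attention forward now returns the full tuple needed for autograd (cumulative sequence lengths, max lengths, RNG state, debug mask), and a matching _scaled_dot_product_cudnn_attention_backward is registered. These private ops are reached through the public SDPA entry point; a sketch assuming a CUDA build where the cuDNN backend is available:

    import torch
    from torch.nn.attention import SDPBackend, sdpa_kernel

    q = k = v = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.float16)

    # Restrict dispatch to the cuDNN attention backend for this region.
    with sdpa_kernel(SDPBackend.CUDNN_ATTENTION):
        out = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)
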
14741
+ - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? alibi_slopes=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14645
14742
  variants: function
14646
14743
  dispatch:
14647
14744
  CUDA: _flash_attention_forward
14648
14745
  tags: nondeterministic_seeded
14649
14746
 
14650
- - func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
14747
+ - func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None) -> (Tensor, Tensor, Tensor)
14651
14748
  device_check: NoCheck
14652
14749
  variants: function
14653
14750
  dispatch:
14654
14751
  CUDA: _flash_attention_backward
14655
14752
 
14656
14753
  # Returns output, logsumexp if compute_logsumexp
14657
- - func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, int? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
14754
+ - func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt? max_seqlen_q, SymInt? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? seqlen_k=None, int? window_size=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
14658
14755
  variants: function
14659
14756
  dispatch:
14660
14757
  CUDA: _efficient_attention_forward
14661
14758
  tags: nondeterministic_seeded
14662
14759
 
14663
- - func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor)
14760
+ - func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None, int? window_size=None, bool shared_storage_dqdkdv=False) -> (Tensor, Tensor, Tensor, Tensor)
14664
14761
  device_check: NoCheck
14665
14762
  variants: function
14666
14763
  dispatch:
@@ -15460,11 +15557,11 @@
15460
15557
  CPU: foobar
15461
15558
  autogen: _foobar.out
15462
15559
 
15463
- # Fused Optimizer CUDA kernels.
15464
15560
  - func: _fused_adam_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
15465
15561
  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
15466
15562
  variants: function
15467
15563
  dispatch:
15564
+ CPU: _fused_adam_kernel_cpu_
15468
15565
  CUDA: _fused_adam_kernel_cuda_
15469
15566
  autogen: _fused_adam, _fused_adam.out
15470
15567
 
@@ -15474,6 +15571,7 @@
15474
15571
  device_check: NoCheck
15475
15572
  variants: function
15476
15573
  dispatch:
15574
+ CPU: _fused_adam_kernel_cpu_
15477
15575
  CUDA: _fused_adam_kernel_cuda_
15478
15576
  autogen: _fused_adam.tensor_lr, _fused_adam.tensor_lr_out
15479
15577
 
@@ -15481,6 +15579,7 @@
15481
15579
  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
15482
15580
  variants: function
15483
15581
  dispatch:
15582
+ CPU: _fused_adamw_kernel_cpu_
15484
15583
  CUDA: _fused_adamw_kernel_cuda_
15485
15584
  autogen: _fused_adamw, _fused_adamw.out
15486
15585
 
@@ -15490,6 +15589,7 @@
15490
15589
  device_check: NoCheck
15491
15590
  variants: function
15492
15591
  dispatch:
15592
+ CPU: _fused_adamw_kernel_cpu_
15493
15593
  CUDA: _fused_adamw_kernel_cuda_
15494
15594
  autogen: _fused_adamw.tensor_lr, _fused_adamw.tensor_lr_out
15495
15595
 
@@ -15497,6 +15597,7 @@
15497
15597
  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
15498
15598
  variants: function
15499
15599
  dispatch:
15600
+ CPU: _fused_sgd_kernel_cpu_
15500
15601
  CUDA: _fused_sgd_kernel_cuda_
15501
15602
  autogen: _fused_sgd, _fused_sgd.out
15502
15603
 
@@ -15506,9 +15607,16 @@
15506
15607
  device_check: NoCheck
15507
15608
  variants: function
15508
15609
  dispatch:
15610
+ CPU: _fused_sgd_kernel_cpu_
15509
15611
  CUDA: _fused_sgd_kernel_cuda_
15510
15612
  autogen: _fused_sgd.tensor_lr, _fused_sgd.tensor_lr_out
15511
15613
 
15614
+ - func: _fused_adagrad_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
15615
+ variants: function
15616
+ dispatch:
15617
+ CPU: _fused_adagrad_kernel_cpu_
15618
+ autogen: _fused_adagrad, _fused_adagrad.out
15619
+
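The fused optimizer kernels are no longer CUDA-only: _fused_adam_, _fused_adamw_ and _fused_sgd_ gain CPU dispatch entries, and a CPU-only _fused_adagrad_ is added. Assuming the optimizer frontends accept fused=True for CPU parameters once these kernels are registered (as in the Python frontend), a sketch:

    import torch

    model = torch.nn.Linear(16, 4)
    opt = torch.optim.AdamW(model.parameters(), lr=1e-3, fused=True)  # CPU params

    loss = model(torch.randn(8, 16)).sum()
    loss.backward()
    opt.step()
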
15512
15620
  # This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts.
15513
15621
  - func: _propagate_xla_data(Tensor input, Tensor output) -> ()
15514
15622
  variants: function