torch-rb 0.10.0 → 0.11.0

This diff shows the changes between publicly available package versions as they were released to their respective public registries. It is provided for informational purposes only.
@@ -145,6 +145,7 @@
145
145
 
146
146
  - func: rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!)
147
147
  variants: method
148
+ tags: inplace_view
148
149
 
149
150
  - func: rename(Tensor(a) self, Dimname[]? names) -> Tensor(a)
150
151
  variants: method
@@ -274,6 +275,7 @@
274
275
  device_check: NoCheck # TensorIterator
275
276
  dispatch:
276
277
  CPU, CUDA: abs_out
278
+ MPS: abs_out_mps
277
279
  SparseCPU, SparseCUDA: abs_sparse_out
278
280
  SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_out
279
281
 
@@ -328,12 +330,12 @@
328
330
  - func: view_as_real(Tensor(a) self) -> Tensor(a)
329
331
  variants: function
330
332
  dispatch:
331
- CPU, CUDA: view_as_real
333
+ CPU, CUDA, MPS, Meta: view_as_real
332
334
 
333
335
  - func: view_as_complex(Tensor(a) self) -> Tensor(a)
334
336
  variants: function
335
337
  dispatch:
336
- CPU, CUDA: view_as_complex
338
+ CPU, CUDA, Meta: view_as_complex
337
339
 
338
340
  - func: sgn(Tensor self) -> Tensor
339
341
  variants: function, method
@@ -357,6 +359,9 @@
357
359
  SparseCPU, SparseCUDA: sgn_sparse_out
358
360
  SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_out
359
361
 
362
+ - func: chalf(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor
363
+ variants: method
364
+
360
365
  - func: real(Tensor(a) self) -> Tensor(a)
361
366
  device_check: NoCheck # TensorIterator
362
367
  variants: function
@@ -422,6 +427,7 @@
422
427
  structured_inherits: TensorIteratorBase
423
428
  dispatch:
424
429
  CPU, CUDA: acos_out
430
+ MPS: acos_out_mps
425
431
 
426
432
  # arccos, alias of acos
427
433
  - func: arccos(Tensor self) -> Tensor
@@ -448,6 +454,7 @@
448
454
  SparseCsrCPU, SparseCsrCUDA: add_sparse_csr
449
455
  MkldnnCPU: mkldnn_add
450
456
  ZeroTensor: add_zerotensor
457
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor
451
458
 
452
459
  - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
453
460
  device_check: NoCheck # TensorIterator
@@ -457,18 +464,22 @@
457
464
  SparseCPU, SparseCUDA: add_sparse_
458
465
  SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_
459
466
  MkldnnCPU: mkldnn_add_
467
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor
460
468
 
461
469
  - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
462
470
  device_check: NoCheck # TensorIterator
463
471
  structured: True
464
472
  structured_inherits: TensorIteratorBase
473
+ ufunc_inner_loop:
474
+ Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf)
475
+ ScalarOnly: add (Bool)
465
476
  dispatch:
466
- CPU, CUDA: add_out
467
477
  SparseCPU: add_out_sparse_cpu
468
478
  SparseCUDA: add_out_sparse_cuda
469
479
  SparseCsrCPU: add_out_sparse_csr_cpu
470
480
  SparseCsrCUDA: add_out_sparse_csr_cuda
471
481
  MkldnnCPU: mkldnn_add_out
482
+ MPS: add_out_mps
472
483
 
473
484
  - func: _add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
474
485
  variants: function
@@ -494,6 +505,7 @@
494
505
  variants: function
495
506
  dispatch:
496
507
  CPU: add_relu_
508
+ autogen: _add_relu.Scalar_out
497
509
 
498
510
  # For C++ only, until we have conversion from C++ numbers to Tensor
499
511
  - func: add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
@@ -507,6 +519,7 @@
507
519
  variants: method
508
520
  dispatch:
509
521
  CompositeExplicitAutograd: add_
522
+ autogen: add.Scalar_out
510
523
 
511
524
  - func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor
512
525
  structured_delegate: addmv.out
@@ -521,8 +534,9 @@
521
534
  dispatch:
522
535
  CPU: addmv_out_cpu
523
536
  CUDA: addmv_out_cuda
524
- SparseCsrCPU: addmv_out_sparse_csr
525
- SparseCsrCUDA: addmv_out_sparse_csr_cuda
537
+ MPS: addmv_out_mps
538
+ SparseCsrCPU: addmv_out_sparse_compressed
539
+ SparseCsrCUDA: addmv_out_sparse_compressed_cuda
526
540
 
527
541
  - func: addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
528
542
  variants: function, method
@@ -560,6 +574,7 @@
560
574
  - dim -> int dim
561
575
  dispatch:
562
576
  CPU, CUDA: all_out
577
+ MPS: all_out_mps
563
578
 
564
579
  - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
565
580
  device_check: NoCheck # TensorIterator
@@ -583,6 +598,7 @@
583
598
  - dim -> int dim
584
599
  dispatch:
585
600
  CPU, CUDA: any_out
601
+ MPS: any_out_mps
586
602
 
587
603
  - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
588
604
  device_check: NoCheck # TensorIterator
@@ -595,6 +611,12 @@
595
611
 
596
612
  - func: arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
597
613
 
614
+ # Note [arange.start_step schema]
615
+ # We want `arange.start_step` to be grouped up with `arange.start_out`,
616
+ # But this doesn't happen automatically because the step argument
617
+ # is defaultable for .start_out but not for .start_step.
618
+ # We should probably just make "step" a defaultable param on arange.start,
619
+ # and kill arange.start_step.
598
620
  - func: arange.start_step(Scalar start, Scalar end, Scalar step, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
599
621
 
600
622
  - func: arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)
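
The note above concerns how the arange overloads are grouped for codegen. For reference, the schemas involved map onto the familiar Python-level calls roughly as follows — a minimal sketch of PyTorch's user-facing API, shown only to illustrate the overloads and not part of this file:

    import torch

    torch.arange(5)        # arange(Scalar end)                  -> tensor([0, 1, 2, 3, 4])
    torch.arange(2, 5)     # arange.start(start, end)            -> tensor([2, 3, 4])
    torch.arange(0, 5, 2)  # arange.start_step(start, end, step) -> tensor([0, 2, 4])
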
@@ -603,6 +625,7 @@
603
625
  dispatch:
604
626
  CPU, Meta: arange_out
605
627
  CUDA: arange_cuda_out
628
+ MPS: arange_mps_out
606
629
 
607
630
  # This function is a temporary hack to allow tracing of arange like constructs with dynamic
608
631
  # bounds on arange. Normal arange is not traceable because it does not take any tensor inputs;
@@ -620,6 +643,7 @@
620
643
  structured: True
621
644
  dispatch:
622
645
  CPU, CUDA: argmax_out
646
+ MPS: argmax_out_mps
623
647
 
624
648
  - func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor
625
649
  structured_delegate: argmin.out
@@ -644,6 +668,7 @@
644
668
  structured_inherits: TensorIteratorBase
645
669
  dispatch:
646
670
  CPU, CUDA: acosh_out
671
+ MPS: acosh_out_mps
647
672
 
648
673
  # arccosh, alias for acosh
649
674
  - func: arccosh(Tensor self) -> Tensor
@@ -673,6 +698,7 @@
673
698
  structured_inherits: TensorIteratorBase
674
699
  dispatch:
675
700
  CPU, CUDA: asinh_out
701
+ MPS: asinh_out_mps
676
702
  SparseCPU, SparseCUDA: asinh_sparse_out
677
703
  SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr_out
678
704
 
@@ -705,6 +731,7 @@
705
731
  structured_inherits: TensorIteratorBase
706
732
  dispatch:
707
733
  CPU, CUDA: atanh_out
734
+ MPS: atanh_out_mps
708
735
  SparseCPU, SparseCUDA: atanh_sparse_out
709
736
  SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr_out
710
737
 
@@ -721,6 +748,7 @@
721
748
  variants: function, method
722
749
  dispatch:
723
750
  ZeroTensor, CPU, CUDA, Meta: as_strided_tensorimpl
751
+ MPS: as_strided_tensorimpl_mps
724
752
  QuantizedCPU, QuantizedCUDA: as_strided_qtensorimpl
725
753
  device_check: NoCheck
726
754
  device_guard: False
@@ -756,6 +784,7 @@
756
784
  structured_inherits: TensorIteratorBase
757
785
  dispatch:
758
786
  CPU, CUDA: asin_out
787
+ MPS: asin_out_mps
759
788
  SparseCPU, SparseCUDA: asin_sparse_out
760
789
  SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr_out
761
790
 
@@ -790,6 +819,7 @@
790
819
  structured_inherits: TensorIteratorBase
791
820
  dispatch:
792
821
  CPU, CUDA: atan_out
822
+ MPS: atan_out_mps
793
823
  SparseCPU, SparseCUDA: atan_sparse_out
794
824
  SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr_out
795
825
 
@@ -833,6 +863,7 @@
833
863
  dispatch:
834
864
  CPU: baddbmm_out_cpu
835
865
  CUDA: baddbmm_out_cuda
866
+ MPS: baddbmm_out_mps
836
867
  SparseCsrCUDA: baddbmm_out_sparse_csr_cuda
837
868
 
838
869
  - func: bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -861,19 +892,26 @@
861
892
  variants: function
862
893
  dispatch:
863
894
  CPU, CUDA: bernoulli_out
895
+ MPS: bernoulli_out_mps
864
896
 
865
897
  - func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!)
866
898
  device_check: NoCheck # TensorIterator
867
899
  variants: method
868
900
  dispatch:
869
901
  CPU, CUDA: bernoulli_
902
+ MPS: bernoulli_mps_
903
+ autogen: bernoulli.Tensor_functional, bernoulli.Tensor_out
870
904
 
871
905
  - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!)
872
906
  device_check: NoCheck # TensorIterator
873
907
  variants: method
874
908
  dispatch:
875
909
  CPU, CUDA: bernoulli_
910
+ MPS: bernoulli_mps_
911
+ autogen: bernoulli.float_out
876
912
 
913
+ # Note [bernoulli.p schema]
914
+ # We should probably just fix the overload ambiguity by appending a _functional to the C++ API name (BC breaking)
877
915
  # This out-of-place version isn't used explicitly, but needed by jit.
878
916
  # There is no default valid on `p` here because it would introduce ambiguity
879
917
  # with `bernoulli(Tensor self, *, Generator? generator=None)` declaration.
@@ -890,6 +928,7 @@
890
928
  dispatch:
891
929
  CPU: binary_cross_entropy_cpu
892
930
  CUDA: binary_cross_entropy_cuda
931
+ MPS: binary_cross_entropy_mps
893
932
 
894
933
  - func: binary_cross_entropy.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
895
934
  device_check: NoCheck # TensorIterator
@@ -898,6 +937,7 @@
898
937
  dispatch:
899
938
  CPU: binary_cross_entropy_out_cpu
900
939
  CUDA: binary_cross_entropy_out_cuda
940
+ MPS: binary_cross_entropy_out_mps
901
941
 
902
942
  - func: binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor
903
943
  python_module: nn
@@ -905,6 +945,7 @@
905
945
  dispatch:
906
946
  CPU: binary_cross_entropy_backward_cpu
907
947
  CUDA: binary_cross_entropy_backward_cuda
948
+ MPS: binary_cross_entropy_backward_mps
908
949
 
909
950
  - func: binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!)
910
951
  python_module: nn
@@ -912,6 +953,7 @@
912
953
  dispatch:
913
954
  CPU: binary_cross_entropy_backward_out_cpu
914
955
  CUDA: binary_cross_entropy_backward_out_cuda
956
+ MPS: binary_cross_entropy_backward_out_mps
915
957
 
916
958
  - func: binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor
917
959
  device_check: NoCheck # TensorIterator
@@ -1061,6 +1103,7 @@
1061
1103
  dispatch:
1062
1104
  CPU: bmm_out_cpu
1063
1105
  CUDA: bmm_out_cuda
1106
+ MPS: bmm_out_mps
1064
1107
  SparseCPU: bmm_out_sparse_cpu
1065
1108
  SparseCUDA: bmm_out_sparse_cuda
1066
1109
  SparseCsrCUDA: bmm_out_sparse_csr_cuda
@@ -1078,12 +1121,20 @@
1078
1121
  SparseCPU, SparseCUDA: sparse_broadcast_to
1079
1122
 
1080
1123
  - func: cat(Tensor[] tensors, int dim=0) -> Tensor
1124
+ structured_delegate: cat.out
1081
1125
  dispatch:
1082
- CompositeExplicitAutograd: cat
1126
+ SparseCPU, SparseCUDA: cat_sparse
1127
+ QuantizedCPU: cat_quantized_cpu
1083
1128
 
1084
1129
  - func: cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
1130
+ structured: True
1131
+ precomputed:
1132
+ - dim -> int dim, int valid, bool all_contiguous, bool all_same_dtype, bool all_same_sizes_and_stride, MemoryFormat memory_format
1085
1133
  dispatch:
1086
- CompositeExplicitAutograd: cat_out
1134
+ CPU: cat_out_cpu
1135
+ CUDA: cat_out_cuda
1136
+ MPS: cat_out_mps
1137
+ QuantizedCPU: cat_out_quantized_cpu
1087
1138
 
1088
1139
  - func: cat.names(Tensor[] tensors, Dimname dim) -> Tensor
1089
1140
 
@@ -1125,6 +1176,7 @@
1125
1176
  structured_inherits: TensorIteratorBase
1126
1177
  dispatch:
1127
1178
  CPU, CUDA: ceil_out
1179
+ MPS: ceil_out_mps
1128
1180
  SparseCPU, SparseCUDA: ceil_sparse_out
1129
1181
  SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr_out
1130
1182
 
@@ -1164,8 +1216,7 @@
1164
1216
 
1165
1217
  - func: clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor
1166
1218
  variants: function, method
1167
- dispatch:
1168
- CPU, CUDA: clamp
1219
+ structured_delegate: clamp.Tensor_out
1169
1220
 
1170
1221
  - func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)
1171
1222
  device_check: NoCheck # TensorIterator
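
In the hunk above, clamp.Tensor (tensor-valued min/max) becomes a structured op that delegates to clamp.Tensor_out instead of carrying its own CPU/CUDA dispatch entry. A minimal sketch of the tensor-bound overload as seen from PyTorch's Python API, assuming a recent PyTorch build — for illustration only:

    import torch

    x = torch.randn(5)
    lo = torch.full_like(x, -0.5)
    hi = torch.full_like(x, 0.5)
    torch.clamp(x, min=lo, max=hi)  # tensor-valued bounds route through the clamp.Tensor_out kernel
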
@@ -1177,8 +1228,7 @@
1177
1228
 
1178
1229
  - func: clamp_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!)
1179
1230
  variants: function, method
1180
- dispatch:
1181
- CompositeExplicitAutograd: clamp_
1231
+ structured_delegate: clamp.Tensor_out
1182
1232
 
1183
1233
  - func: clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!)
1184
1234
  device_check: NoCheck # TensorIterator
@@ -1187,73 +1237,83 @@
1187
1237
  structured_inherits: TensorIteratorBase
1188
1238
  dispatch:
1189
1239
  CPU, CUDA: clamp_out
1240
+ MPS: clamp_out_mps
1190
1241
 
1191
1242
  - func: clamp.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!)
1192
1243
  device_check: NoCheck # TensorIterator
1244
+ structured: True
1245
+ structured_inherits: TensorIteratorBase
1193
1246
  dispatch:
1194
- CPU, CUDA: clamp_out
1247
+ CPU, CUDA: clamp_Tensor_out
1248
+ MPS: clamp_Tensor_out_mps
1195
1249
 
1196
1250
  - func: clamp_max(Tensor self, Scalar max) -> Tensor
1197
1251
  device_check: NoCheck # TensorIterator
1198
1252
  variants: function, method
1199
- dispatch:
1200
- CompositeExplicitAutograd: clamp_max
1253
+ structured_delegate: clamp_max.out
1201
1254
 
1202
1255
  - func: clamp_max.Tensor(Tensor self, Tensor max) -> Tensor
1203
1256
  variants: function, method
1204
- dispatch:
1205
- CompositeExplicitAutograd: clamp_max
1257
+ structured_delegate: clamp_max.Tensor_out
1206
1258
 
1207
1259
  - func: clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!)
1208
1260
  device_check: NoCheck # TensorIterator
1209
1261
  variants: function, method
1210
- dispatch:
1211
- CompositeExplicitAutograd: clamp_max_
1262
+ structured_delegate: clamp_max.out
1212
1263
 
1213
1264
  - func: clamp_max_.Tensor(Tensor(a!) self, Tensor max) -> Tensor(a!)
1214
1265
  variants: function, method
1215
- dispatch:
1216
- CompositeExplicitAutograd: clamp_max_
1266
+ structured_delegate: clamp_max.Tensor_out
1217
1267
 
1218
1268
  - func: clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!)
1219
1269
  device_check: NoCheck # TensorIterator
1270
+ structured: True
1271
+ structured_inherits: TensorIteratorBase
1220
1272
  dispatch:
1221
1273
  CPU, CUDA: clamp_max_out
1274
+ MPS: clamp_max_out_mps
1222
1275
 
1223
1276
  - func: clamp_max.Tensor_out(Tensor self, Tensor max, *, Tensor(a!) out) -> Tensor(a!)
1277
+ device_check: NoCheck # TensorIterator
1278
+ structured: True
1279
+ structured_inherits: TensorIteratorBase
1224
1280
  dispatch:
1225
- CPU, CUDA: clamp_max_out
1281
+ CPU, CUDA: clamp_max_Tensor_out
1282
+ MPS: clamp_max_Tensor_out_mps
1226
1283
 
1227
1284
  - func: clamp_min(Tensor self, Scalar min) -> Tensor
1228
1285
  device_check: NoCheck # TensorIterator
1229
1286
  variants: function, method
1230
- dispatch:
1231
- CompositeExplicitAutograd: clamp_min
1287
+ structured_delegate: clamp_min.out
1232
1288
 
1233
1289
  - func: clamp_min.Tensor(Tensor self, Tensor min) -> Tensor
1234
1290
  variants: function, method
1235
- dispatch:
1236
- CompositeExplicitAutograd: clamp_min
1291
+ structured_delegate: clamp_min.Tensor_out
1237
1292
 
1238
1293
  - func: clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!)
1239
1294
  device_check: NoCheck # TensorIterator
1240
1295
  variants: function, method
1241
- dispatch:
1242
- CompositeExplicitAutograd: clamp_min_
1296
+ structured_delegate: clamp_min.out
1243
1297
 
1244
1298
  - func: clamp_min_.Tensor(Tensor(a!) self, Tensor min) -> Tensor(a!)
1245
1299
  variants: function, method
1246
- dispatch:
1247
- CompositeExplicitAutograd: clamp_min_
1300
+ structured_delegate: clamp_min.Tensor_out
1248
1301
 
1249
1302
  - func: clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!)
1250
1303
  device_check: NoCheck # TensorIterator
1304
+ structured: True
1305
+ structured_inherits: TensorIteratorBase
1251
1306
  dispatch:
1252
1307
  CPU, CUDA: clamp_min_out
1308
+ MPS: clamp_min_out_mps
1253
1309
 
1254
1310
  - func: clamp_min.Tensor_out(Tensor self, Tensor min, *, Tensor(a!) out) -> Tensor(a!)
1311
+ device_check: NoCheck # TensorIterator
1312
+ structured: True
1313
+ structured_inherits: TensorIteratorBase
1255
1314
  dispatch:
1256
- CPU, CUDA: clamp_min_out
1315
+ CPU, CUDA: clamp_min_Tensor_out
1316
+ MPS: clamp_min_Tensor_out_mps
1257
1317
 
1258
1318
  # clip is an alias for clamp
1259
1319
  - func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
@@ -1360,23 +1420,29 @@
1360
1420
 
1361
1421
  - func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor
1362
1422
 
1423
+ - func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor
1424
+ variants: function
1425
+
1363
1426
  - func: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
1364
1427
  variants: method
1365
1428
  device_check: NoCheck
1366
1429
  device_guard: False
1367
1430
  dispatch:
1368
1431
  MkldnnCPU: copy_mkldnn_
1369
- SparseCPU, SparseCUDA, SparseHIP: copy_sparse_wrapper_
1432
+ SparseCPU, SparseCUDA: copy_sparse_wrapper_
1370
1433
  CompositeExplicitAutograd: copy_
1371
- SparseCsrCPU, SparseCsrCUDA: copy_sparse_csr_
1434
+ SparseCsrCPU, SparseCsrCUDA: copy_sparse_compressed_
1435
+ autogen: copy.out
1372
1436
 
1373
1437
  - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor
1374
- dispatch: {}
1438
+ dispatch:
1439
+ MPS: _copy_from_mps
1375
1440
 
1376
1441
  # We need this to be able to properly copy from a CPU to an XLA tensor with different sizes.
1377
1442
  # See https://github.com/pytorch/xla/issues/2881
1378
1443
  - func: _copy_from_and_resize(Tensor self, Tensor dst) -> Tensor
1379
- dispatch: {}
1444
+ dispatch:
1445
+ MPS: _copy_from_and_resize_mps
1380
1446
 
1381
1447
  - func: cos(Tensor self) -> Tensor
1382
1448
  device_check: NoCheck # TensorIterator
@@ -1394,6 +1460,7 @@
1394
1460
  structured_inherits: TensorIteratorBase
1395
1461
  dispatch:
1396
1462
  CPU, CUDA: cos_out
1463
+ MPS: cos_out_mps
1397
1464
 
1398
1465
  - func: cosh(Tensor self) -> Tensor
1399
1466
  device_check: NoCheck # TensorIterator
@@ -1411,6 +1478,7 @@
1411
1478
  structured_inherits: TensorIteratorBase
1412
1479
  dispatch:
1413
1480
  CPU, CUDA: cosh_out
1481
+ MPS: cosh_out_mps
1414
1482
 
1415
1483
  - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
1416
1484
 
@@ -1419,6 +1487,7 @@
1419
1487
  dispatch:
1420
1488
  CPU: count_nonzero_cpu
1421
1489
  CUDA: count_nonzero_cuda
1490
+ MPS: count_nonzero_mps
1422
1491
 
1423
1492
  - func: count_nonzero(Tensor self, int? dim=None) -> Tensor
1424
1493
  variants: function, method
@@ -1457,6 +1526,14 @@
1457
1526
  dispatch:
1458
1527
  CUDA: cudnn_convolution_transpose
1459
1528
 
1529
+ - func: _mps_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups) -> Tensor
1530
+ dispatch:
1531
+ MPS: _mps_convolution_transpose
1532
+
1533
+ - func: mps_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool[2] output_mask) -> (Tensor, Tensor)
1534
+ dispatch:
1535
+ MPS: mps_convolution_transpose_backward
1536
+
1460
1537
  - func: cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor
1461
1538
  dispatch:
1462
1539
  CUDA: cudnn_convolution_relu
@@ -1679,6 +1756,7 @@
1679
1756
  structured_inherits: TensorIteratorBase
1680
1757
  dispatch:
1681
1758
  CPU, CUDA: div_out
1759
+ MPS: div_out_mps
1682
1760
  SparseCPU, SparseCUDA: div_out_sparse_zerodim
1683
1761
 
1684
1762
  - func: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
@@ -1701,6 +1779,7 @@
1701
1779
  structured_inherits: TensorIteratorBase
1702
1780
  dispatch:
1703
1781
  CPU, CUDA: div_out_mode
1782
+ MPS: div_out_mode_mps
1704
1783
  SparseCPU, SparseCUDA: div_out_sparse_zerodim
1705
1784
 
1706
1785
  # For C++ only, until we have conversion from C++ numbers to Tensor
@@ -1715,6 +1794,7 @@
1715
1794
  variants: method
1716
1795
  dispatch:
1717
1796
  CompositeExplicitAutograd: div_
1797
+ autogen: div.Scalar_out
1718
1798
 
1719
1799
  - func: div.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor
1720
1800
  variants: function, method
@@ -1725,6 +1805,7 @@
1725
1805
  variants: method
1726
1806
  dispatch:
1727
1807
  CompositeExplicitAutograd: div_
1808
+ autogen: div.Scalar_mode_out
1728
1809
 
1729
1810
  # divide, alias for div
1730
1811
  - func: divide.Tensor(Tensor self, Tensor other) -> Tensor
@@ -1780,6 +1861,7 @@
1780
1861
  dispatch:
1781
1862
  CPU: dot
1782
1863
  CUDA: dot_cuda
1864
+ MPS: dot_mps
1783
1865
 
1784
1866
  - func: dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!)
1785
1867
  dispatch:
@@ -1800,6 +1882,7 @@
1800
1882
  - func: embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
1801
1883
  dispatch:
1802
1884
  CompositeExplicitAutograd: embedding
1885
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_embedding
1803
1886
 
1804
1887
  - func: embedding_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor
1805
1888
 
@@ -1807,11 +1890,13 @@
1807
1890
  dispatch:
1808
1891
  CPU: embedding_dense_backward_cpu
1809
1892
  CUDA: embedding_dense_backward_cuda
1893
+ MPS: embedding_dense_backward_mps
1810
1894
 
1811
1895
  - func: embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!)
1812
1896
  dispatch:
1813
1897
  CPU: embedding_renorm_cpu_
1814
1898
  CUDA: embedding_renorm_cuda_
1899
+ autogen: embedding_renorm.functional, embedding_renorm.out
1815
1900
 
1816
1901
  - func: embedding_sparse_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor
1817
1902
 
@@ -1872,10 +1957,12 @@
1872
1957
  dispatch:
1873
1958
  CPU: empty_cpu
1874
1959
  CUDA: empty_cuda
1960
+ MPS: empty_mps
1875
1961
  Meta: empty_meta
1876
1962
  MkldnnCPU: empty_mkldnn
1877
1963
  SparseCPU, SparseCUDA: empty_sparse
1878
- SparseCsrCPU, SparseCsrCUDA: empty_sparse_csr
1964
+ SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed
1965
+ QuantizedCPU, QuantizedCUDA: empty_unknown_quantized
1879
1966
 
1880
1967
  # We do not make new_empty a composite that calls into new_empty_strided, as the strided version
1881
1968
  # is significantly more difficult to implement by different backends
@@ -1920,8 +2007,20 @@
1920
2007
  dispatch:
1921
2008
  CPU, Meta: resize_
1922
2009
  CUDA: resize_cuda_
2010
+ MPS: resize_mps_
1923
2011
  QuantizedCPU: quantized_resize_cpu_
1924
2012
  SparseCsrCPU, SparseCsrCUDA: resize_sparse_csr_
2013
+ autogen: resize.functional, resize.out
2014
+
2015
+ # This is a utility function to enable users to resize out tensor while registering kernels for out variants.
2016
+ # Eventually, we can consider exposing `resize_output` as a public API to ship it with python op registration
2017
+ # to make it easy to register out variants for ops.
2018
+ - func: _resize_output_(Tensor(a!) self, int[] size, Device device) -> Tensor(a!)
2019
+ use_const_ref_for_mutable_tensors: True
2020
+ variants: function
2021
+ dispatch:
2022
+ Meta: _resize_output_
2023
+ autogen: _resize_output.functional, _resize_output.out
1925
2024
 
1926
2025
  - func: empty_quantized(int[] size, Tensor qtensor, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
1927
2026
  category_override: factory
@@ -1938,6 +2037,7 @@
1938
2037
  device_guard: False
1939
2038
  dispatch:
1940
2039
  CompositeExplicitAutograd: empty_like
2040
+ QuantizedCPU, QuantizedCUDA: empty_like_quantized
1941
2041
  SparseCPU, SparseCUDA: empty_like_sparse_coo
1942
2042
  SparseCsrCPU, SparseCsrCUDA: empty_like_sparse_csr
1943
2043
 
@@ -1945,7 +2045,9 @@
1945
2045
  dispatch:
1946
2046
  CPU: empty_strided_cpu
1947
2047
  CUDA: empty_strided_cuda
2048
+ MPS: empty_strided_mps
1948
2049
  Meta: empty_strided_meta
2050
+ QuantizedCPU, QuantizedCUDA: empty_strided_unknown_quantized
1949
2051
 
1950
2052
  - func: erf(Tensor self) -> Tensor
1951
2053
  device_check: NoCheck # TensorIterator
@@ -1969,6 +2071,7 @@
1969
2071
  structured_inherits: TensorIteratorBase
1970
2072
  dispatch:
1971
2073
  CPU, CUDA: erf_out
2074
+ MPS: erf_out_mps
1972
2075
  SparseCPU, SparseCUDA: erf_sparse_out
1973
2076
  SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr_out
1974
2077
 
@@ -2005,6 +2108,7 @@
2005
2108
  structured_inherits: TensorIteratorBase
2006
2109
  dispatch:
2007
2110
  CPU, CUDA: exp_out
2111
+ MPS: exp_out_mps
2008
2112
 
2009
2113
  - func: exp2(Tensor self) -> Tensor
2010
2114
  structured_delegate: exp2.out
@@ -2019,6 +2123,7 @@
2019
2123
  structured_inherits: TensorIteratorBase
2020
2124
  dispatch:
2021
2125
  CPU, CUDA: exp2_out
2126
+ MPS: exp2_out_mps
2022
2127
 
2023
2128
  - func: expm1(Tensor self) -> Tensor
2024
2129
  device_check: NoCheck # TensorIterator
@@ -2045,6 +2150,13 @@
2045
2150
  SparseCPU, SparseCUDA: expm1_sparse_out
2046
2151
  SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr_out
2047
2152
 
2153
+ - func: expand.SymInt(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a)
2154
+ variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too.
2155
+ device_check: NoCheck
2156
+ device_guard: False
2157
+ dispatch:
2158
+ CompositeExplicitAutograd: expand_symint
2159
+
2048
2160
  - func: expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a)
2049
2161
  variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too.
2050
2162
  device_check: NoCheck
@@ -2065,11 +2177,13 @@
2065
2177
  dispatch:
2066
2178
  CPU: eye_out_cpu
2067
2179
  CUDA: eye_out_cuda
2180
+ MPS: eye_out_mps
2068
2181
 
2069
2182
  - func: eye.m_out(int n, int m, *, Tensor(a!) out) -> Tensor(a!)
2070
2183
  dispatch:
2071
2184
  CPU: eye_out_cpu
2072
2185
  CUDA: eye_out_cuda
2186
+ MPS: eye_out_mps
2073
2187
 
2074
2188
  - func: flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a)
2075
2189
  variants: function, method
@@ -2089,21 +2203,36 @@
2089
2203
  - func: unflatten.Dimname(Tensor(a) self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor(a)
2090
2204
  variants: method
2091
2205
 
2206
+ - func: fill.Scalar(Tensor self, Scalar value) -> Tensor
2207
+ variants: function
2208
+ dispatch:
2209
+ CompositeExplicitAutograd: fill
2210
+
2211
+ - func: fill.Tensor(Tensor self, Tensor value) -> Tensor
2212
+ variants: function
2213
+ dispatch:
2214
+ CompositeExplicitAutograd: fill
2215
+
2092
2216
  - func: fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!)
2093
2217
  device_check: NoCheck # TensorIterator
2094
2218
  variants: function, method
2095
2219
  dispatch:
2096
2220
  CPU, CUDA: fill_
2221
+ MPS: fill_scalar_mps
2097
2222
  QuantizedCPU, QuantizedCUDA: fill_quantized_
2098
2223
  Meta: fill_meta_
2224
+ SparseCsrCPU, SparseCsrCUDA: fill_sparse_csr_
2225
+ autogen: fill.Scalar_out
2099
2226
 
2100
2227
  - func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!)
2101
2228
  device_check: NoCheck # TensorIterator
2102
2229
  variants: function, method
2103
2230
  dispatch:
2104
2231
  CPU, CUDA: fill_
2232
+ MPS: fill_tensor_mps_
2105
2233
  QuantizedCPU, QuantizedCUDA: fill_quantized_
2106
2234
  Meta: fill_meta_
2235
+ autogen: fill.Tensor_out
2107
2236
 
2108
2237
  - func: floor(Tensor self) -> Tensor
2109
2238
  device_check: NoCheck # TensorIterator
@@ -2129,6 +2258,7 @@
2129
2258
  structured_inherits: TensorIteratorBase
2130
2259
  dispatch:
2131
2260
  CPU, CUDA: floor_out
2261
+ MPS: floor_out_mps
2132
2262
  SparseCPU, SparseCUDA: floor_sparse_out
2133
2263
  SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr_out
2134
2264
 
@@ -2220,10 +2350,12 @@
2220
2350
  variants: function, method
2221
2351
 
2222
2352
  # NOTE [ grid_sampler Native Functions ]
2223
- # `grid_sampler` does all the shape checking and then dispatches to one of
2224
- # `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of which
2225
- # has the corresponding backward defined as native functions as well. Therefore,
2226
- # in these functions and their backwards, no more shape checking is done.
2353
+ # `grid_sampler` is _supposed to_ do all the shape checking and then dispatch to
2354
+ # one of `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of
2355
+ # which has the corresponding backward defined as native functions as well.
2356
+ # However, we do shape checking everywhere for now since each of the mentioned
2357
+ # functions can be called directly, which will lead to crashes otherwise.
2358
+ # See https://github.com/pytorch/pytorch/issues/73187 for more information.
2227
2359
  #
2228
2360
  # There is also _grid_sampler_2d_backward_cpu_fallback which is an
2229
2361
  # implementation detail of grid_sampler_2d and is only exposed here for testing
@@ -2261,7 +2393,10 @@
2261
2393
  CPU: grid_sampler_3d_cpu
2262
2394
  CUDA: grid_sampler_3d_cuda
2263
2395
 
2264
- - func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor)
2396
+ # `grid_sampler_3d_backward` takes in `output_mask` to optimize performance for
2397
+ # the case where `input` doesn't require gradient. Gradient for `grid` is always
2398
+ # computed (only `output_mask[0]` is checked by the implementations).
2399
+ - func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask) -> (Tensor, Tensor)
2265
2400
  dispatch:
2266
2401
  CPU: grid_sampler_3d_backward_cpu
2267
2402
  CUDA: grid_sampler_3d_backward_cuda
@@ -2355,15 +2490,21 @@
2355
2490
  # - Tensor Tensor::index(ArrayRef<TensorIndex> indices)
2356
2491
  # - Tensor Tensor::index(std::initializer_list<TensorIndex> indices)
2357
2492
 
2493
+ - func: index_copy.out(Tensor self, int dim, Tensor index, Tensor source, *, Tensor(a!) out) -> Tensor(a!)
2494
+ structured: True
2495
+ variants: function
2496
+ precomputed:
2497
+ - dim -> int dim
2498
+ dispatch:
2499
+ CPU, CUDA: index_copy_out
2500
+
2358
2501
  - func: index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!)
2359
2502
  variants: method
2360
- dispatch:
2361
- CompositeExplicitAutograd: index_copy_
2503
+ structured_delegate: index_copy.out
2362
2504
 
2363
2505
  - func: index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor
2364
2506
  variants: function, method
2365
- dispatch:
2366
- CompositeExplicitAutograd: index_copy
2507
+ structured_delegate: index_copy.out
2367
2508
 
2368
2509
  - func: index_copy_.dimname(Tensor(a!) self, Dimname dim, Tensor index, Tensor source) -> Tensor(a!)
2369
2510
  variants: method
@@ -2376,6 +2517,7 @@
2376
2517
  variants: function, method
2377
2518
  dispatch:
2378
2519
  CompositeExplicitAutograd: index_put_
2520
+ autogen: index_put.out
2379
2521
  # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp:
2380
2522
  # - Tensor & Tensor::index_put_(ArrayRef<TensorIndex> indices, Tensor const & rhs)
2381
2523
  # - Tensor & Tensor::index_put_(ArrayRef<TensorIndex> indices, Scalar v)
@@ -2393,6 +2535,7 @@
2393
2535
  variants: function
2394
2536
  dispatch:
2395
2537
  CPU, CUDA: _index_put_impl_
2538
+ autogen: _index_put_impl.functional, _index_put_impl.out
2396
2539
 
2397
2540
  - func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor
2398
2541
  variants: function
@@ -2444,7 +2587,7 @@
2444
2587
  device_check: NoCheck
2445
2588
  device_guard: False
2446
2589
  dispatch:
2447
- CPU, CUDA: isnan
2590
+ CPU, CUDA, MPS: isnan
2448
2591
  SparseCPU, SparseCUDA: isnan_sparse
2449
2592
  SparseCsrCPU, SparseCsrCUDA: isnan_sparse_csr
2450
2593
 
@@ -2540,17 +2683,14 @@
2540
2683
  dispatch:
2541
2684
  CPU: layer_norm_cpu
2542
2685
  CUDA: layer_norm_cuda
2686
+ MPS: layer_norm_mps
2543
2687
  CompositeImplicitAutograd: math_native_layer_norm
2544
2688
 
2545
- - func: _native_multi_head_self_attention(Tensor query, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None) -> Tensor
2546
- dispatch:
2547
- CPU: multi_head_self_attention_cpu
2548
- CUDA: multi_head_self_attention_cuda
2549
-
2550
2689
  - func: native_layer_norm_backward(Tensor grad_out, Tensor input, int[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
2551
2690
  dispatch:
2552
2691
  CPU: layer_norm_backward_cpu
2553
2692
  CUDA: layer_norm_backward_cuda
2693
+ MPS: layer_norm_backward_mps
2554
2694
 
2555
2695
  - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
2556
2696
  variants: function, method
@@ -2575,6 +2715,14 @@
2575
2715
  - func: linear.out(Tensor input, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!)
2576
2716
  python_module: nn
2577
2717
 
2718
+ # TODO: Add this function to MPS dispatch key so that we avoid declaring it in
2719
+ # native_functions.yaml
2720
+ # https://github.com/pytorch/pytorch/issues/77394
2721
+ - func: _mps_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor
2722
+ python_module: nn
2723
+ dispatch:
2724
+ MPS: _mps_linear
2725
+
2578
2726
  - func: mkldnn_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor
2579
2727
  python_module: nn
2580
2728
  dispatch:
@@ -2592,6 +2740,18 @@
2592
2740
  dispatch:
2593
2741
  MkldnnCPU: mkldnn_linear_backward
2594
2742
 
2743
+ - func: _mps_linear_backward_input(int[] input_size, Tensor grad_output, Tensor weight) -> Tensor
2744
+ dispatch:
2745
+ MPS: _mps_linear_backward_input
2746
+
2747
+ - func: _mps_linear_backward_weights(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined) -> (Tensor, Tensor)
2748
+ dispatch:
2749
+ MPS: _mps_linear_backward_weights
2750
+
2751
+ - func: mps_linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
2752
+ dispatch:
2753
+ MPS: mps_linear_backward
2754
+
2595
2755
  - func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
2596
2756
 
2597
2757
  - func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
@@ -2622,6 +2782,7 @@
2622
2782
  dispatch:
2623
2783
  CPU, Meta: linspace_out
2624
2784
  CUDA: linspace_cuda_out
2785
+ MPS: linspace_out_mps
2625
2786
 
2626
2787
  - func: log(Tensor self) -> Tensor
2627
2788
  device_check: NoCheck # TensorIterator
@@ -2639,6 +2800,7 @@
2639
2800
  structured_inherits: TensorIteratorBase
2640
2801
  dispatch:
2641
2802
  CPU, CUDA: log_out
2803
+ MPS: log_out_mps
2642
2804
 
2643
2805
  - func: log10(Tensor self) -> Tensor
2644
2806
  device_check: NoCheck # TensorIterator
@@ -2658,6 +2820,7 @@
2658
2820
  structured_inherits: TensorIteratorBase
2659
2821
  dispatch:
2660
2822
  CPU, CUDA: log10_out
2823
+ MPS: log10_out_mps
2661
2824
 
2662
2825
  - func: log1p(Tensor self) -> Tensor
2663
2826
  device_check: NoCheck # TensorIterator
@@ -2681,6 +2844,7 @@
2681
2844
  structured_inherits: TensorIteratorBase
2682
2845
  dispatch:
2683
2846
  CPU, CUDA: log1p_out
2847
+ MPS: log1p_out_mps
2684
2848
  SparseCPU, SparseCUDA: log1p_sparse_out
2685
2849
  SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr_out
2686
2850
 
@@ -2700,12 +2864,14 @@
2700
2864
  structured_inherits: TensorIteratorBase
2701
2865
  dispatch:
2702
2866
  CPU, CUDA: log2_out
2867
+ MPS: log2_out_mps
2703
2868
 
2704
2869
  - func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
2705
2870
  structured: True
2706
2871
  structured_inherits: TensorIteratorBase
2707
2872
  dispatch:
2708
2873
  CPU, CUDA: logaddexp_out
2874
+ MPS: logaddexp_out_mps
2709
2875
 
2710
2876
  - func: logaddexp(Tensor self, Tensor other) -> Tensor
2711
2877
  variants: method, function
@@ -2718,6 +2884,7 @@
2718
2884
  structured_inherits: TensorIteratorBase
2719
2885
  dispatch:
2720
2886
  CPU, CUDA: logaddexp2_out
2887
+ MPS: logaddexp2_out_mps
2721
2888
 
2722
2889
  - func: logaddexp2(Tensor self, Tensor other) -> Tensor
2723
2890
  variants: method, function
@@ -2791,6 +2958,11 @@
2791
2958
  - func: log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
2792
2959
  variants: function, method
2793
2960
 
2961
+ - func: log_softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
2962
+ variants: function
2963
+ dispatch:
2964
+ CompositeExplicitAutograd: log_softmax_out
2965
+
2794
2966
  - func: log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
2795
2967
  variants: function, method
2796
2968
 
@@ -2802,6 +2974,7 @@
2802
2974
  dispatch:
2803
2975
  CPU: log_softmax_cpu_out
2804
2976
  CUDA: log_softmax_cuda_out
2977
+ MPS: log_softmax_mps_out
2805
2978
 
2806
2979
  - func: _log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor
2807
2980
  structured_delegate: _log_softmax_backward_data.out
@@ -2811,6 +2984,7 @@
2811
2984
  dispatch:
2812
2985
  CPU: log_softmax_backward_cpu_out
2813
2986
  CUDA: log_softmax_backward_cuda_out
2987
+ MPS: log_softmax_backward_mps_out
2814
2988
 
2815
2989
  - func: _logcumsumexp(Tensor self, int dim) -> Tensor
2816
2990
  dispatch:
@@ -2922,6 +3096,7 @@
2922
3096
  - dim -> int dim
2923
3097
  dispatch:
2924
3098
  CPU, CUDA: max_out
3099
+ MPS: max_out_mps
2925
3100
 
2926
3101
  - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
2927
3102
  device_check: NoCheck # TensorIterator
@@ -2937,10 +3112,10 @@
2937
3112
 
2938
3113
  - func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
2939
3114
  variants: function, method
2940
- dispatch:
2941
- CompositeExplicitAutograd: amax
3115
+ structured_delegate: amax.out
2942
3116
 
2943
3117
  - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
3118
+ structured: True
2944
3119
  dispatch:
2945
3120
  CPU, CUDA: amax_out
2946
3121
 
@@ -2951,6 +3126,17 @@
2951
3126
 
2952
3127
  - func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
2953
3128
 
3129
+ # TODO: Add this function to MPS dispatch key so that we avoid declaring it in
3130
+ # native_functions.yaml
3131
+ # https://github.com/pytorch/pytorch/issues/77394
3132
+ - func: _mps_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
3133
+ dispatch:
3134
+ MPS: _mps_max_pool2d
3135
+
3136
+ - func: mps_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
3137
+ dispatch:
3138
+ MPS: mps_max_pool2d_backward
3139
+
2954
3140
  - func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
2955
3141
  dispatch:
2956
3142
  MkldnnCPU: mkldnn_max_pool2d
@@ -2974,6 +3160,7 @@
2974
3160
  - func: quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
2975
3161
  dispatch:
2976
3162
  QuantizedCPU: quantized_max_pool2d
3163
+ QuantizedCUDA: quantized_max_pool2d_cudnn
2977
3164
 
2978
3165
  - func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
2979
3166
 
@@ -2997,6 +3184,7 @@
2997
3184
  device_check: NoCheck # TensorIterator
2998
3185
  dispatch:
2999
3186
  CPU, CUDA: mean_out
3187
+ MPS: mean_out_mps
3000
3188
  QuantizedCPU: mean_out_quantized_cpu
3001
3189
 
3002
3190
  - func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
@@ -3069,6 +3257,7 @@
3069
3257
  - dim -> int dim
3070
3258
  dispatch:
3071
3259
  CPU, CUDA: min_out
3260
+ MPS: min_out_mps
3072
3261
 
3073
3262
  - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
3074
3263
  device_check: NoCheck # TensorIterator
@@ -3079,13 +3268,24 @@
3079
3268
 
3080
3269
  - func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
3081
3270
  variants: function, method
3082
- dispatch:
3083
- CompositeExplicitAutograd: amin
3271
+ structured_delegate: amin.out
3084
3272
 
3085
3273
  - func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
3274
+ structured: True
3086
3275
  dispatch:
3087
3276
  CPU, CUDA: amin_out
3088
3277
 
3278
+ # TODO: Add this function to MPS dispatch key so that we avoid declaring it in
3279
+ # native_functions.yaml
3280
+ # https://github.com/pytorch/pytorch/issues/77394
3281
+ - func: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor
3282
+ dispatch:
3283
+ MPS: _mps_convolution
3284
+
3285
+ - func: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
3286
+ dispatch:
3287
+ MPS: mps_convolution_backward
3288
+
3089
3289
  - func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor
3090
3290
  dispatch:
3091
3291
  CompositeExplicitAutograd: mkldnn_convolution
@@ -3130,10 +3330,12 @@
3130
3330
  dispatch:
3131
3331
  CPU: mm_out_cpu
3132
3332
  CUDA: mm_out_cuda
3333
+ MPS: mm_out_mps
3133
3334
  SparseCPU, SparseCUDA: _sparse_mm_out
3134
3335
  SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out
3135
3336
 
3136
3337
  - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
3338
+ python_module: sparse
3137
3339
 
3138
3340
  - func: _sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor
3139
3341
  dispatch:
@@ -3165,8 +3367,10 @@
3165
3367
  variants: function, method
3166
3368
  dispatch:
3167
3369
  SparseCPU, SparseCUDA: mul_sparse
3370
+ SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr
3168
3371
  MkldnnCPU: mkldnn_mul
3169
3372
  ZeroTensor: mul_zerotensor
3373
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor
3170
3374
 
3171
3375
  - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
3172
3376
  device_check: NoCheck # TensorIterator
@@ -3174,7 +3378,9 @@
3174
3378
  variants: method
3175
3379
  dispatch:
3176
3380
  SparseCPU, SparseCUDA: mul_sparse_
3381
+ SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr_
3177
3382
  MkldnnCPU: mkldnn_mul_
3383
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Tensor
3178
3384
 
3179
3385
  - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
3180
3386
  device_check: NoCheck # TensorIterator
@@ -3182,8 +3388,10 @@
3182
3388
  structured_inherits: TensorIteratorBase
3183
3389
  dispatch:
3184
3390
  CPU, CUDA: mul_out
3391
+ MPS: mul_out_mps
3185
3392
  SparseCPU: mul_out_sparse_cpu
3186
3393
  SparseCUDA: mul_out_sparse_cuda
3394
+ SparseCsrCPU, SparseCsrCUDA: mul_out_sparse_csr
3187
3395
  MkldnnCPU: mkldnn_mul_out
3188
3396
 
3189
3397
  # For C++ only, until we have conversion from C++ numbers to Tensor
@@ -3192,12 +3400,15 @@
3192
3400
  variants: function, method
3193
3401
  dispatch:
3194
3402
  CompositeExplicitAutograd: mul
3403
+ SparseCsrCPU, SparseCsrCUDA: mul_scalar_sparse_csr
3195
3404
 
3196
3405
  - func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
3197
3406
  device_check: NoCheck # TensorIterator
3198
3407
  variants: method
3199
3408
  dispatch:
3200
3409
  CompositeExplicitAutograd: mul_
3410
+ SparseCsrCPU, SparseCsrCUDA: mul__scalar_sparse_csr
3411
+ autogen: mul.Scalar_out
3201
3412
 
3202
3413
  # multiply, alias for mul
3203
3414
  - func: multiply.Tensor(Tensor self, Tensor other) -> Tensor
@@ -3246,6 +3457,12 @@
3246
3457
  CPU: narrow_copy_dense_cpu
3247
3458
  SparseCPU, SparseCUDA: narrow_copy_sparse
3248
3459
  CompositeExplicitAutograd: narrow_copy_dense
3460
+ tags: view_copy
3461
+
3462
+ - func: narrow_copy.SymInt(Tensor self, int dim, int start, SymInt length) -> Tensor
3463
+ variants: function, method
3464
+ dispatch:
3465
+ CompositeExplicitAutograd: narrow_copy_symint
3249
3466
 
3250
3467
  - func: narrow_copy.out(Tensor self, int dim, int start, int length, *, Tensor(a!) out) -> Tensor(a!)
3251
3468
  dispatch:
@@ -3265,11 +3482,13 @@
3265
3482
  dispatch:
3266
3483
  CPU: batch_norm_cpu
3267
3484
  CUDA: batch_norm_cuda
3485
+ MPS: batch_norm_mps
3268
3486
  MkldnnCPU: mkldnn_batch_norm
3269
3487
 
3270
3488
  - func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))
3271
3489
  dispatch:
3272
3490
  CUDA: batch_norm_cuda_out
3491
+ MPS: batch_norm_mps_out
3273
3492
 
3274
3493
  - func: batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor)
3275
3494
  dispatch:
@@ -3296,6 +3515,7 @@
3296
3515
  dispatch:
3297
3516
  CPU: batch_norm_backward_cpu
3298
3517
  CUDA: batch_norm_backward_cuda
3518
+ MPS: batch_norm_backward_mps
3299
3519
  MkldnnCPU: mkldnn_batch_norm_backward
3300
3520
 
3301
3521
  - func: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor)
@@ -3363,6 +3583,7 @@
3363
3583
  variants: function, method
3364
3584
  dispatch:
3365
3585
  CompositeExplicitAutograd: permute
3586
+ MPS: permute_mps
3366
3587
 
3367
3588
  - func: movedim.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a)
3368
3589
  variants: function, method
@@ -3403,8 +3624,14 @@
3403
3624
  variants: function, method
3404
3625
 
3405
3626
  - func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor
3627
+ dispatch:
3628
+ CPU: pixel_shuffle_cpu
3629
+ CompositeExplicitAutograd: math_pixel_shuffle
3406
3630
 
3407
3631
  - func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor
3632
+ dispatch:
3633
+ CPU: pixel_unshuffle_cpu
3634
+ CompositeExplicitAutograd: math_pixel_unshuffle
3408
3635
 
3409
3636
  - func: channel_shuffle(Tensor self, int groups) -> Tensor
3410
3637
  dispatch:
@@ -3420,6 +3647,7 @@
3420
3647
  variants: method
3421
3648
  dispatch:
3422
3649
  CUDA: is_pinned_cuda
3650
+ MPS: is_pinned_mps
3423
3651
  CompositeExplicitAutograd: is_pinned_default
3424
3652
 
3425
3653
  # TODO: add a copy kwarg that guarantees that the tensor is put into fresh
@@ -3431,6 +3659,7 @@
3431
3659
  - func: _pin_memory(Tensor self, Device? device=None) -> Tensor
3432
3660
  dispatch:
3433
3661
  CUDA: _pin_memory_cuda
3662
+ MPS: _pin_memory_mps
3434
3663
 
3435
3664
  - func: pinverse(Tensor self, float rcond=1e-15) -> Tensor
3436
3665
  variants: function, method
@@ -3566,6 +3795,7 @@
3566
3795
  structured_inherits: TensorIteratorBase
3567
3796
  dispatch:
3568
3797
  CPU, CUDA: reciprocal_out
3798
+ MPS: reciprocal_out_mps
3569
3799
 
3570
3800
  - func: neg(Tensor self) -> Tensor
3571
3801
  device_check: NoCheck # TensorIterator
@@ -3589,6 +3819,7 @@
3589
3819
  structured_inherits: TensorIteratorBase
3590
3820
  dispatch:
3591
3821
  CPU, CUDA: neg_out
3822
+ MPS: neg_out_mps
3592
3823
  SparseCPU, SparseCUDA: neg_out_sparse
3593
3824
  SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_out
3594
3825
 
@@ -3605,6 +3836,7 @@
3605
3836
  variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too.
3606
3837
  dispatch:
3607
3838
  CompositeExplicitAutograd: repeat
3839
+ MPS: repeat_mps
3608
3840
 
3609
3841
  - func: repeat_interleave.Tensor(Tensor repeats, *, int? output_size=None) -> Tensor
3610
3842
  variants: function
@@ -3631,7 +3863,7 @@
3631
3863
  device_check: NoCheck
3632
3864
  device_guard: False
3633
3865
  dispatch:
3634
- CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor: _reshape_alias
3866
+ CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor, MPS: _reshape_alias
3635
3867
  # We don't need to support mkldnn since this is handled explicitly by the reshape operator.
3636
3868
 
3637
3869
  - func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor
@@ -3668,6 +3900,7 @@
3668
3900
  dispatch:
3669
3901
  CPU: round_out
3670
3902
  CUDA: round_out
3903
+ MPS: round_out_mps
3671
3904
  SparseCPU, SparseCUDA: round_sparse_out
3672
3905
  SparseCsrCPU, SparseCsrCUDA: round_sparse_csr_out
3673
3906
 
@@ -3700,16 +3933,21 @@
3700
3933
  variants: function, method
3701
3934
  dispatch:
3702
3935
  CPU, CUDA: relu
3936
+ MPS: relu_mps
3703
3937
  MkldnnCPU: mkldnn_relu
3704
3938
  QuantizedCPU: relu_quantized_cpu
3939
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu
3705
3940
 
3706
3941
  - func: relu_(Tensor(a!) self) -> Tensor(a!)
3707
3942
  device_check: NoCheck # TensorIterator
3708
3943
  variants: function, method
3709
3944
  dispatch:
3710
3945
  CPU, CUDA: relu_
3946
+ MPS: relu_mps_
3711
3947
  MkldnnCPU: mkldnn_relu_
3712
3948
  QuantizedCPU: relu_quantized_cpu_
3949
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu_
3950
+ autogen: relu.out
3713
3951
 
3714
3952
  - func: relu6(Tensor self) -> Tensor
3715
3953
  python_module: nn
@@ -3720,16 +3958,18 @@
3720
3958
  - func: prelu(Tensor self, Tensor weight) -> Tensor
3721
3959
  variants: function, method
3722
3960
  dispatch:
3961
+ MkldnnCPU: mkldnn_prelu
3723
3962
  CPU: prelu_cpu
3724
3963
  CUDA: prelu_cuda
3725
3964
 
3726
3965
  - func: prelu_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor)
3727
3966
  variants: function, method
3728
3967
  dispatch:
3968
+ MkldnnCPU: mkldnn_prelu_backward
3729
3969
  CPU: prelu_backward_cpu
3730
3970
  CUDA: prelu_backward_cuda
3731
3971
 
3732
- - func: gelu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
3972
+ - func: gelu.out(Tensor self, *, str approximate='none', Tensor(a!) out) -> Tensor(a!)
3733
3973
  structured: True
3734
3974
  structured_inherits: TensorIteratorBase
3735
3975
  device_check: NoCheck # TensorIterator
@@ -3737,24 +3977,34 @@
3737
3977
  dispatch:
3738
3978
  CPU: gelu_out_cpu
3739
3979
  CUDA: gelu_out_cuda
3980
+ MPS: gelu_out_mps
3981
+
3982
+ - func: gelu_(Tensor(a!) self, *, str approximate='none') -> Tensor(a!)
3983
+ structured_delegate: gelu.out
3984
+ device_check: NoCheck # TensorIterator
3985
+ python_module: nn
3986
+ dispatch:
3987
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu_
3740
3988
 
3741
- - func: gelu(Tensor self) -> Tensor
3989
+ - func: gelu(Tensor self, *, str approximate='none') -> Tensor
3742
3990
  structured_delegate: gelu.out
3743
3991
  device_check: NoCheck # TensorIterator
3744
3992
  python_module: nn
3745
3993
  dispatch:
3746
3994
  MkldnnCPU: mkldnn_gelu
3747
3995
  QuantizedCPU: gelu_quantized_cpu
3996
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu
3748
3997
 
3749
- - func: gelu_backward.grad_input(Tensor grad, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
3998
+ - func: gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!)
3750
3999
  structured: True
3751
4000
  structured_inherits: TensorIteratorBase
3752
4001
  python_module: nn
3753
4002
  dispatch:
3754
4003
  CPU: gelu_backward_out_cpu
3755
4004
  CUDA: gelu_backward_out_cuda
4005
+ MPS: gelu_backward_out_mps
3756
4006
 
3757
- - func: gelu_backward(Tensor grad, Tensor self) -> Tensor
4007
+ - func: gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor
3758
4008
  structured_delegate: gelu_backward.grad_input
3759
4009
  python_module: nn
3760
4010
  dispatch:
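
The gelu schemas in the hunk above gain a string `approximate` argument (default 'none'), which selects the tanh approximation. A minimal Python-level sketch against a PyTorch build that ships this schema — an illustration, not part of this diff:

    import torch
    import torch.nn.functional as F

    x = torch.randn(4)
    F.gelu(x)                      # exact GELU; approximate='none' is the default
    F.gelu(x, approximate='tanh')  # tanh approximation enabled by the new argument
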
@@ -3804,6 +4054,7 @@
3804
4054
  structured_inherits: TensorIteratorBase
3805
4055
  dispatch:
3806
4056
  CPU, CUDA: rsqrt_out
4057
+ MPS: rsqrt_out_mps
3807
4058
 
3808
4059
  - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a)
3809
4060
  variants: function, method
@@ -3816,6 +4067,7 @@
3816
4067
  device_guard: False
3817
4068
  dispatch:
3818
4069
  CompositeExplicitAutograd: select
4070
+ SparseCsrCPU, SparseCsrCUDA: select_sparse_csr
3819
4071
 
3820
4072
  - func: select_backward(Tensor grad_output, int[] input_sizes, int dim, int index) -> Tensor
3821
4073
  variants: function
@@ -3839,6 +4091,7 @@
3839
4091
  device_check: NoCheck # TensorIterator
3840
4092
  dispatch:
3841
4093
  CompositeExplicitAutograd: celu_
4094
+ autogen: celu.out
3842
4095
 
3843
4096
  - func: silu(Tensor self) -> Tensor
3844
4097
  structured_delegate: silu.out
@@ -3858,6 +4111,7 @@
3858
4111
  python_module: nn
3859
4112
  dispatch:
3860
4113
  CPU, CUDA: silu_out
4114
+ MPS: silu_out_mps
3861
4115
 
3862
4116
  - func: silu_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
3863
4117
  structured: True
@@ -3865,6 +4119,7 @@
3865
4119
  python_module: nn
3866
4120
  dispatch:
3867
4121
  CPU, CUDA: silu_backward_out
4122
+ MPS: silu_backward_out_mps
3868
4123
 
3869
4124
  - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor
3870
4125
  structured_delegate: silu_backward.grad_input
@@ -3918,6 +4173,7 @@
3918
4173
  structured_inherits: TensorIteratorBase
3919
4174
  dispatch:
3920
4175
  CPU, CUDA: sigmoid_out
4176
+ MPS: sigmoid_out_mps
3921
4177
 
3922
4178
  - func: logit(Tensor self, float? eps=None) -> Tensor
3923
4179
  variants: function, method
@@ -3955,6 +4211,7 @@
3955
4211
  structured_inherits: TensorIteratorBase
3956
4212
  dispatch:
3957
4213
  CPU, CUDA: sin_out
4214
+ MPS: sin_out_mps
3958
4215
  SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr_out
3959
4216
  SparseCPU, SparseCUDA: sin_sparse_out
3960
4217
 
@@ -3994,6 +4251,7 @@
3994
4251
  structured_inherits: TensorIteratorBase
3995
4252
  dispatch:
3996
4253
  CPU, CUDA: sinh_out
4254
+ MPS: sinh_out_mps
3997
4255
  SparseCPU, SparseCUDA: sinh_sparse_out
3998
4256
  SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr_out
3999
4257
 
@@ -4080,6 +4338,11 @@
4080
4338
  - func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
4081
4339
  variants: function, method
4082
4340
 
4341
+ - func: softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
4342
+ variants: function
4343
+ dispatch:
4344
+ CompositeExplicitAutograd: softmax_out
4345
+
4083
4346
  - func: softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
4084
4347
  variants: function, method
4085
4348
 
@@ -4093,6 +4356,7 @@
4093
4356
  dispatch:
4094
4357
  CPU: softmax_cpu_out
4095
4358
  CUDA: softmax_cuda_out
4359
+ MPS: softmax_mps_out
4096
4360
 
4097
4361
  - func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor
4098
4362
  structured_delegate: _softmax_backward_data.out
@@ -4102,6 +4366,7 @@
4102
4366
  dispatch:
4103
4367
  CPU: softmax_backward_cpu_out
4104
4368
  CUDA: softmax_backward_cuda_out
4369
+ MPS: softmax_backward_mps_out
4105
4370
 
4106
4371
  - func: unsafe_split.Tensor(Tensor self, int split_size, int dim=0) -> Tensor[]
4107
4372
  variants: function, method
@@ -4117,6 +4382,10 @@
4117
4382
  dispatch:
4118
4383
  CompositeExplicitAutograd: split
4119
4384
 
4385
+ - func: split.sizes(Tensor(a -> *) self, int[] split_size, int dim=0) -> Tensor(a)[]
4386
+ variants: function, method
4387
+ device_guard: False
4388
+
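The new split.sizes overload registers the list-of-sizes form of split as a native function. The Python-level call has accepted a list for some time; a short sketch (names per the public PyTorch API):

    import torch

    t = torch.arange(10)
    even = torch.split(t, 2)            # five chunks of size 2
    uneven = torch.split(t, [3, 3, 4])  # explicit sizes, the split.sizes overload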
4120
4389
  - func: unsafe_split_with_sizes(Tensor self, int[] split_sizes, int dim=0) -> Tensor[]
4121
4390
  variants: function, method
4122
4391
  device_check: NoCheck
@@ -4154,7 +4423,7 @@
4154
4423
  device_check: NoCheck
4155
4424
  device_guard: False
4156
4425
  dispatch:
4157
- CPU, CUDA: squeeze
4426
+ CompositeExplicitAutograd: squeeze
4158
4427
  QuantizedCPU, QuantizedCUDA: squeeze_quantized
4159
4428
 
4160
4429
  - func: squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)
@@ -4162,7 +4431,7 @@
4162
4431
  device_check: NoCheck
4163
4432
  device_guard: False
4164
4433
  dispatch:
4165
- CPU, CUDA: squeeze
4434
+ CompositeExplicitAutograd: squeeze
4166
4435
  QuantizedCPU, QuantizedCUDA: squeeze_quantized
4167
4436
 
4168
4437
  - func: squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a)
@@ -4232,12 +4501,13 @@
4232
4501
 
4233
4502
  - func: dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
4234
4503
 
4235
- # The signature is designed to be consistent with librosa except that it is
4236
- # missing the `pad_mode` and `center` arguments, which are taken care of at
4237
- # `torch.functional.py`. They shall be moved here once we have mapping between
4238
- # Python strings and C++ Enum in codegen.
4504
+ # Overload without center & pad mode, needed for forward-compatibility
4239
4505
  - func: stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
4240
4506
  variants: function, method
4507
+ cpp_no_default_args: ['hop_length', 'win_length', 'window', 'normalized']
4508
+
4509
+ - func: stft.center(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, str pad_mode="reflect", bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
4510
+ variants: function, method
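Per the replaced comment, the center/pad_mode handling that used to live in torch.functional.py now has a native stft.center overload. A short usage sketch against the public Python API, with parameter names taken from the schema above:

    import torch

    x = torch.randn(1, 16000)
    window = torch.hann_window(400)
    spec = torch.stft(x, n_fft=400, hop_length=160, window=window,
                      center=True, pad_mode="reflect", return_complex=True)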
4241
4511
 
4242
4512
  - func: istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool? onesided=None, int? length=None, bool return_complex=False) -> Tensor
4243
4513
  variants: function, method
@@ -4258,6 +4528,7 @@
4258
4528
  variants: function, method
4259
4529
  dispatch:
4260
4530
  CompositeExplicitAutograd: sum
4531
+ SparseCsrCPU, SparseCsrCUDA: sum_csr
4261
4532
 
4262
4533
  - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
4263
4534
  structured_delegate: sum.IntList_out
@@ -4273,21 +4544,17 @@
4273
4544
  device_check: NoCheck # TensorIterator
4274
4545
  dispatch:
4275
4546
  CPU, CUDA: sum_out
4547
+ MPS: sum_out_mps
4276
4548
 
4277
4549
  - func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
4278
4550
  device_check: NoCheck # TensorIterator
4279
4551
 
4280
- - func: nansum(Tensor self, *, ScalarType? dtype=None) -> Tensor
4552
+ - func: nansum(Tensor self, int[1] dim=[], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
4281
4553
  variants: function, method
4282
4554
  dispatch:
4283
4555
  CPU, CUDA: nansum
4284
4556
 
4285
- - func: nansum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
4286
- variants: function, method
4287
- dispatch:
4288
- CPU, CUDA: nansum
4289
-
4290
- - func: nansum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
4557
+ - func: nansum.out(Tensor self, int[1] dim=[], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
4291
4558
  dispatch:
4292
4559
  CPU, CUDA: nansum_out
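nansum and nansum.dim_IntList are folded into a single schema whose dim defaults to the empty list (reduce over all dimensions). Call-site behaviour is unchanged; a quick sketch:

    import torch

    x = torch.tensor([[1.0, float("nan")], [2.0, 3.0]])
    torch.nansum(x)         # tensor(6.) -- NaNs treated as zero
    torch.nansum(x, dim=0)  # tensor([3., 3.]) -- per-column sums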
4293
4560
 
@@ -4318,6 +4585,7 @@
4318
4585
  structured_inherits: TensorIteratorBase
4319
4586
  dispatch:
4320
4587
  CPU, CUDA: sqrt_out
4588
+ MPS: sqrt_out_mps
4321
4589
  SparseCPU, SparseCUDA: sqrt_sparse_out
4322
4590
  SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr_out
4323
4591
 
@@ -4330,8 +4598,6 @@
4330
4598
  variants: function, method
4331
4599
 
4332
4600
  - func: square.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
4333
- dispatch:
4334
- CPU, CUDA: square_out
4335
4601
 
4336
4602
  - func: std(Tensor self, bool unbiased=True) -> Tensor
4337
4603
  device_check: NoCheck # TensorIterator
@@ -4346,6 +4612,7 @@
4346
4612
  variants: function, method
4347
4613
  dispatch:
4348
4614
  CPU, CUDA: std
4615
+ MPS: std_mps
4349
4616
 
4350
4617
  - func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
4351
4618
  device_check: NoCheck # TensorIterator
@@ -4397,6 +4664,7 @@
4397
4664
  variants: function, method
4398
4665
  dispatch:
4399
4666
  CPU, CUDA: prod
4667
+ MPS: prod_mps
4400
4668
 
4401
4669
  - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
4402
4670
  structured_delegate: prod.int_out
@@ -4408,6 +4676,7 @@
4408
4676
  device_check: NoCheck # TensorIterator
4409
4677
  dispatch:
4410
4678
  CPU, CUDA: prod_out
4679
+ MPS: prod_out_mps
4411
4680
 
4412
4681
  - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
4413
4682
  device_check: NoCheck # TensorIterator
@@ -4453,6 +4722,7 @@
4453
4722
  structured_inherits: TensorIteratorBase
4454
4723
  dispatch:
4455
4724
  CPU, CUDA: tan_out
4725
+ MPS: tan_out_mps
4456
4726
  SparseCPU, SparseCUDA: tan_sparse_out
4457
4727
  SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr_out
4458
4728
 
@@ -4481,6 +4751,7 @@
4481
4751
  structured_inherits: TensorIteratorBase
4482
4752
  dispatch:
4483
4753
  CPU, CUDA: tanh_out
4754
+ MPS: tanh_out_mps
4484
4755
  SparseCPU, SparseCUDA: tanh_sparse_out
4485
4756
  SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr_out
4486
4757
 
@@ -4511,12 +4782,14 @@
4511
4782
  structured_inherits: TensorIteratorBase
4512
4783
  dispatch:
4513
4784
  CPU, CUDA: threshold_out
4785
+ MPS: threshold_out_mps
4514
4786
 
4515
4787
  - func: threshold_backward.grad_input(Tensor grad_output, Tensor self, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!)
4516
4788
  structured: True
4517
4789
  structured_inherits: TensorIteratorBase
4518
4790
  dispatch:
4519
4791
  CPU, CUDA: threshold_backward_out
4792
+ MPS: threshold_backward_out_mps
4520
4793
 
4521
4794
  - func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor
4522
4795
  variants: function
@@ -4558,6 +4831,7 @@
4558
4831
  device_guard: False
4559
4832
  dispatch:
4560
4833
  MkldnnCPU: mkldnn_transpose_
4834
+ autogen: _mkldnn_transpose.out
4561
4835
 
4562
4836
  - func: one_hot(Tensor self, int num_classes=-1) -> Tensor
4563
4837
  python_module: nn
@@ -4595,6 +4869,28 @@
4595
4869
 
4596
4870
  - func: trapz.dx(Tensor y, *, float dx=1, int dim=-1) -> Tensor
4597
4871
 
4872
+ # Fused implementation detail for transformers. Adds in-projection bias to QKV and divides Q by sqrt(D/num_heads).
4873
+ - func: _transform_bias_rescale_qkv(Tensor qkv, Tensor qkv_bias, int num_heads) -> (Tensor, Tensor, Tensor)
4874
+ dispatch:
4875
+ CPU, NestedTensorCPU: transform_bias_rescale_qkv_cpu
4876
+ CUDA, NestedTensorCUDA: transform_bias_rescale_qkv_cuda
4877
+
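An unfused sketch of what the comment above describes, for orientation only; the exact qkv layout (q, k, v concatenated along the last dimension) is an assumption on my part, not something the schema states:

    import math
    import torch

    def transform_bias_rescale_qkv_reference(qkv, qkv_bias, num_heads):
        # Add the in-projection bias, split into q/k/v, and divide q by
        # sqrt(D / num_heads), as the comment describes.
        qkv = qkv + qkv_bias
        q, k, v = qkv.chunk(3, dim=-1)
        head_dim = q.shape[-1] // num_heads
        return q / math.sqrt(head_dim), k, v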
4878
+ - func: _nested_tensor_from_mask(Tensor t, Tensor mask) -> Tensor
4879
+ dispatch:
4880
+ CPU, CUDA: NestedTensor_nested_tensor_from_mask
4881
+
4882
+ - func: _nested_from_padded(Tensor padded, Tensor cpu_nested_shape_example, bool fuse_transform_0213=False) -> Tensor
4883
+ device_check: NoCheck # cpu_nested_shape_example will always be on CPU
4884
+ dispatch:
4885
+ CPU: nested_from_padded_generic
4886
+ CUDA: nested_from_padded_cuda
4887
+
4888
+ # _nested_from_padded is not usable from Python, so
4889
+ # _nested_from_padded_and_nested_example is available for testing.
4890
+ - func: _nested_from_padded_and_nested_example(Tensor padded, Tensor nt_example) -> Tensor
4891
+ dispatch:
4892
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example
4893
+
4598
4894
  - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor
4599
4895
  dispatch:
4600
4896
  CompositeExplicitAutograd: _trilinear
@@ -4625,6 +4921,7 @@
4625
4921
  device_check: NoCheck # TensorIterator
4626
4922
  dispatch:
4627
4923
  CPU, CUDA: trunc_out
4924
+ MPS: trunc_out_mps
4628
4925
  SparseCPU, SparseCUDA: trunc_sparse_out
4629
4926
  SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr_out
4630
4927
 
@@ -4686,7 +4983,7 @@
4686
4983
  device_check: NoCheck
4687
4984
  device_guard: False
4688
4985
  dispatch:
4689
- CPU, CUDA: unsqueeze
4986
+ CompositeExplicitAutograd: unsqueeze
4690
4987
  SparseCPU, SparseCUDA: unsqueeze_sparse
4691
4988
  QuantizedCPU, QuantizedCUDA: unsqueeze_quantized
4692
4989
 
@@ -4713,6 +5010,7 @@
4713
5010
  variants: function, method
4714
5011
  dispatch:
4715
5012
  CPU, CUDA: var
5013
+ MPS: var_mps
4716
5014
 
4717
5015
  - func: var.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
4718
5016
  device_check: NoCheck # TensorIterator
@@ -4764,12 +5062,18 @@
4764
5062
  device_check: NoCheck
4765
5063
  device_guard: False
4766
5064
 
4767
- # we define both of these because 'where' does the broadcast and '_s_where' doesn't;
4768
- # this allows us to implicitly calculate the broadcast derivative, while only dealing with the
4769
- # _s_where derivative.
4770
5065
  - func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor
4771
5066
  device_check: NoCheck # TensorIterator
4772
5067
  variants: function, method
5068
+ dispatch:
5069
+ CPU, CUDA: where
5070
+ MPS: where_mps
5071
+
5072
+ - func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
5073
+ device_check: NoCheck # TensorIterator
5074
+ dispatch:
5075
+ CPU, CUDA: where_self_out
5076
+ MPS: where_self_out_mps
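where.self now has an explicit CPU/CUDA/MPS dispatch and gains an out= overload, replacing the removed _s_where helper (broadcasting now happens inside the kernel). Sketch against the public Python API; the out= form assumes the binding exposes the new where.self_out overload:

    import torch

    cond = torch.tensor([True, False, True])
    a = torch.tensor([1.0, 2.0, 3.0])
    b = torch.zeros(3)
    torch.where(cond, a, b)            # tensor([1., 0., 3.])
    out = torch.empty(3)
    torch.where(cond, a, b, out=out)   # writes into out via where.self_out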
4773
5077
 
4774
5078
  - func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor
4775
5079
  variants: function
@@ -4784,11 +5088,6 @@
4784
5088
  device_check: NoCheck # TensorIterator
4785
5089
  variants: function
4786
5090
 
4787
- - func: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor
4788
- variants: function
4789
- dispatch:
4790
- CPU, CUDA: _s_where
4791
-
4792
5091
  - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor
4793
5092
  variants: function
4794
5093
 
@@ -4797,15 +5096,17 @@
4797
5096
  - func: _weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor
4798
5097
  variants: function
4799
5098
 
4800
- - func: _weight_norm_cuda_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor)
5099
+ - func: _weight_norm_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor)
4801
5100
  variants: function
4802
5101
  dispatch:
5102
+ CPU: weight_norm_cpu
4803
5103
  CUDA: weight_norm_cuda
4804
5104
 
4805
- - func: _weight_norm_cuda_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor)
5105
+ - func: _weight_norm_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor)
4806
5106
  variants: function
4807
5107
  dispatch:
4808
- CUDA: weight_norm_cuda_backward
5108
+ CPU: weight_norm_backward_cpu
5109
+ CUDA: weight_norm_backward_cuda
4809
5110
 
4810
5111
  - func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor)
4811
5112
  variants: function
@@ -4887,6 +5188,16 @@
4887
5188
  SparseCPU: _sparse_sum_backward_cpu
4888
5189
  SparseCUDA: _sparse_sum_backward_cuda
4889
5190
 
5191
+ - func: _sparse_csr_sum.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
5192
+ dispatch:
5193
+ SparseCsrCPU: _sparse_csr_sum_cpu
5194
+ SparseCsrCUDA: _sparse_csr_sum_cuda
5195
+
5196
+ - func: _sparse_csr_prod.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
5197
+ dispatch:
5198
+ SparseCsrCPU: _sparse_csr_prod_cpu
5199
+ SparseCsrCUDA: _sparse_csr_prod_cuda
5200
+
4890
5201
  - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
4891
5202
  python_module: sparse
4892
5203
  variants: function
@@ -4962,6 +5273,7 @@
4962
5273
  device_check: NoCheck # TensorIterator
4963
5274
  dispatch:
4964
5275
  CPU, CUDA: norm_out
5276
+ MPS: norm_out_mps
4965
5277
 
4966
5278
  # These four redispatch in their implementation, so OK to be CompositeImplicitAutograd
4967
5279
  - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
@@ -4987,24 +5299,31 @@
4987
5299
  dispatch:
4988
5300
  CPU, CUDA: frexp_out
4989
5301
 
5302
+ # Deprecated (v.1.12)
4990
5303
  - func: frobenius_norm(Tensor self) -> Tensor
4991
5304
  variants: function
4992
5305
 
5306
+ # Deprecated (v.1.12)
4993
5307
  - func: frobenius_norm.dim(Tensor self, int[1] dim, bool keepdim=False) -> Tensor
4994
5308
  variants: function
4995
5309
 
5310
+ # Deprecated (v.1.12)
4996
5311
  - func: frobenius_norm.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
4997
5312
  variants: function
4998
5313
 
5314
+ # Deprecated (v.1.12)
4999
5315
  - func: nuclear_norm(Tensor self, bool keepdim=False) -> Tensor
5000
5316
  variants: function
5001
5317
 
5318
+ # Deprecated (v.1.12)
5002
5319
  - func: nuclear_norm.out(Tensor self, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
5003
5320
  variants: function
5004
5321
 
5322
+ # Deprecated (v.1.12)
5005
5323
  - func: nuclear_norm.dim(Tensor self, int[2] dim, bool keepdim=False) -> Tensor
5006
5324
  variants: function
5007
5325
 
5326
+ # Deprecated (v.1.12)
5008
5327
  - func: nuclear_norm.dim_out(Tensor self, int[2] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
5009
5328
  variants: function
5010
5329
 
@@ -5013,7 +5332,7 @@
5013
5332
  dispatch:
5014
5333
  CompositeExplicitAutograd: clone
5015
5334
  SparseCPU, SparseCUDA: clone_sparse
5016
- SparseCsrCPU, SparseCsrCUDA: clone_sparse_csr
5335
+ SparseCsrCPU, SparseCsrCUDA: clone_sparse_compressed
5017
5336
  MkldnnCPU: mkldnn_clone
5018
5337
  QuantizedCPU, QuantizedCUDA: quantized_clone
5019
5338
 
@@ -5025,22 +5344,27 @@
5025
5344
  variants: function, method
5026
5345
  dispatch:
5027
5346
  CompositeExplicitAutograd: resize_as_
5347
+ autogen: resize_as.functional, resize_as.out
5028
5348
 
5029
5349
  - func: resize_as_sparse_(Tensor(a!) self, Tensor the_template) -> Tensor(a!)
5030
5350
  use_const_ref_for_mutable_tensors: True
5031
- variants: function
5351
+ variants: function, method
5032
5352
  dispatch:
5033
5353
  SparseCPU, SparseCUDA: resize_as_sparse_
5034
5354
  SparseCsrCPU, SparseCsrCUDA: resize_as_sparse_csr_
5355
+ autogen: resize_as_sparse.functional, resize_as_sparse.out
5035
5356
 
5036
5357
  - func: zero_(Tensor(a!) self) -> Tensor(a!)
5037
5358
  device_check: NoCheck # TensorIterator
5038
5359
  variants: method, function
5039
5360
  dispatch:
5040
5361
  CPU, CUDA: zero_
5362
+ MPS: zero_mps_
5041
5363
  Meta: zero_meta_
5042
5364
  SparseCPU, SparseCUDA: zero_sparse_
5365
+ SparseCsrCPU, SparseCsrCUDA: zero_sparse_csr_
5043
5366
  MkldnnCPU: mkldnn_zero_
5367
+ autogen: zero.functional, zero.out
5044
5368
 
5045
5369
  - func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
5046
5370
  device_check: NoCheck # TensorIterator
@@ -5048,6 +5372,7 @@
5048
5372
  structured_inherits: TensorIteratorBase
5049
5373
  dispatch:
5050
5374
  CPU, CUDA: sub_out
5375
+ MPS: sub_out_mps
5051
5376
  SparseCPU, SparseCUDA: sub_out_sparse
5052
5377
 
5053
5378
  - func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
@@ -5056,6 +5381,7 @@
5056
5381
  structured_delegate: sub.out
5057
5382
  dispatch:
5058
5383
  SparseCPU, SparseCUDA: sub_sparse
5384
+ ZeroTensor: sub_zerotensor
5059
5385
 
5060
5386
  - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
5061
5387
  device_check: NoCheck # TensorIterator
@@ -5076,6 +5402,7 @@
5076
5402
  variants: method
5077
5403
  dispatch:
5078
5404
  CompositeExplicitAutograd: sub_
5405
+ autogen: sub.Scalar_out
5079
5406
 
5080
5407
  # subtract, alias for sub
5081
5408
  - func: subtract.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
@@ -5125,7 +5452,7 @@
5125
5452
 
5126
5453
  # Functionally the same as addmm, but we give it a different derivative formula
5127
5454
  # that doesn't propagate gradients to non-present entries on sparse.
5128
- - func: _sparse_addmm(Tensor self, Tensor sparse, Tensor dense, *, Scalar beta=1, Scalar alpha=1) -> Tensor
5455
+ - func: _sparse_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
5129
5456
  python_module: sparse
5130
5457
  dispatch:
5131
5458
  CompositeExplicitAutograd: _sparse_addmm
@@ -5134,21 +5461,24 @@
5134
5461
  python_module: sparse
5135
5462
  dispatch:
5136
5463
  SparseCsrCUDA: sparse_sampled_addmm_out_sparse_csr_cuda
5464
+ SparseCsrCPU: sparse_sampled_addmm_out_sparse_csr_cpu
5137
5465
 
5138
5466
  - func: sparse_sampled_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
5139
5467
  python_module: sparse
5140
5468
  dispatch:
5141
5469
  SparseCsrCUDA: sparse_sampled_addmm_sparse_csr_cuda
5470
+ SparseCsrCPU: sparse_sampled_addmm_sparse_csr_cpu
5142
5471
 
5143
5472
  - func: addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
5144
5473
  structured: True
5145
5474
  dispatch:
5146
5475
  CPU: addmm_out_cpu
5147
5476
  CUDA: addmm_out_cuda
5477
+ MPS: addmm_out_mps
5148
5478
  SparseCPU: addmm_out_sparse_dense_cpu
5149
5479
  SparseCUDA: addmm_out_sparse_dense_cuda
5150
- SparseCsrCPU: addmm_out_sparse_csr_cpu
5151
- SparseCsrCUDA: addmm_out_sparse_csr_cuda
5480
+ SparseCsrCPU: addmm_out_sparse_compressed_cpu
5481
+ SparseCsrCUDA: addmm_out_sparse_compressed_cuda
5152
5482
 
5153
5483
  - func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
5154
5484
  structured_delegate: addmm.out
@@ -5156,7 +5486,7 @@
5156
5486
  dispatch:
5157
5487
  SparseCPU: addmm_sparse_dense_cpu
5158
5488
  SparseCUDA: addmm_sparse_dense_cuda
5159
- SparseCsrCPU, SparseCsrCUDA: addmm_sparse_csr_dense
5489
+ SparseCsrCPU, SparseCsrCUDA: addmm_sparse_compressed_dense
5160
5490
 
5161
5491
  - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
5162
5492
  structured_delegate: addmm.out
@@ -5167,6 +5497,16 @@
5167
5497
  SparseCPU: s_addmm_sparse_dense_cpu_
5168
5498
  SparseCUDA: s_addmm_sparse_dense_cuda_
5169
5499
 
5500
+ - func: _addmm_activation.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False, Tensor(a!) out) -> Tensor(a!)
5501
+ structured: True
5502
+ dispatch:
5503
+ CPU: addmm_activation_out_cpu
5504
+ CUDA: addmm_activation_out_cuda
5505
+
5506
+ - func: _addmm_activation(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False) -> Tensor
5507
+ structured_delegate: _addmm_activation.out
5508
+ variants: function, method
5509
+
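A rough, unfused reference for what _addmm_activation appears to fuse: addmm followed by an activation epilogue. GELU is used when use_gelu=True; I am assuming ReLU otherwise, which the schema does not state explicitly:

    import torch

    def addmm_activation_reference(bias, mat1, mat2, beta=1, alpha=1, use_gelu=False):
        # addmm then an activation epilogue (assumed ReLU when use_gelu is False)
        out = torch.addmm(bias, mat1, mat2, beta=beta, alpha=alpha)
        return torch.nn.functional.gelu(out) if use_gelu else torch.relu(out)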
5170
5510
  # NOTE [ Sparse: autograd and API ]
5171
5511
  #
5172
5512
  #
@@ -5278,11 +5618,23 @@
5278
5618
  # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given
5279
5619
  # the default would never make sense.
5280
5620
 
5621
+ - func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5281
5622
  - func: sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5623
+ - func: sparse_csc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5624
+ - func: sparse_bsr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5625
+ - func: sparse_bsc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5282
5626
 
5627
+ - func: sparse_compressed_tensor.comp_plain_value(Tensor compressed_indices, Tensor plain_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5283
5628
  - func: sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5629
+ - func: sparse_csc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5630
+ - func: sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5631
+ - func: sparse_bsc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5284
5632
 
5633
+ - func: _sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
5285
5634
  - func: _sparse_csr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
5635
+ - func: _sparse_csc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
5636
+ - func: _sparse_bsr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
5637
+ - func: _sparse_bsc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
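The CSR constructor now sits alongside new CSC/BSR/BSC and generic sparse_compressed_tensor entries. A minimal sketch with the long-standing CSR constructor; the newer layouts follow the same compressed/plain index pattern:

    import torch

    crow_indices = torch.tensor([0, 2, 3])     # compressed row pointers
    col_indices = torch.tensor([0, 1, 1])
    values = torch.tensor([1.0, 2.0, 3.0])
    csr = torch.sparse_csr_tensor(crow_indices, col_indices, values, size=(2, 2))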
5286
5638
 
5287
5639
  - func: sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5288
5640
 
@@ -5294,7 +5646,11 @@
5294
5646
 
5295
5647
  - func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size) -> ()
5296
5648
 
5649
+ - func: _validate_sparse_compressed_tensor_args(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, Layout layout) -> ()
5297
5650
  - func: _validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()
5651
+ - func: _validate_sparse_csc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> ()
5652
+ - func: _validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()
5653
+ - func: _validate_sparse_bsc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> ()
5298
5654
 
5299
5655
  - func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5300
5656
  dispatch:
@@ -5309,26 +5665,34 @@
5309
5665
  variants: method
5310
5666
  dispatch:
5311
5667
  SparseCPU, SparseCUDA: sparse_resize_
5668
+ autogen: sparse_resize.functional, sparse_resize.out
5312
5669
 
5313
5670
  - func: sparse_resize_and_clear_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!)
5314
5671
  use_const_ref_for_mutable_tensors: True
5315
5672
  variants: method
5316
5673
  dispatch:
5317
5674
  SparseCPU, SparseCUDA: sparse_resize_and_clear_
5675
+ autogen: sparse_resize_and_clear.functional, sparse_resize_and_clear.out
5318
5676
 
5319
5677
  - func: sparse_mask(Tensor self, Tensor mask) -> Tensor
5320
5678
  variants: method
5321
5679
  dispatch:
5322
5680
  SparseCPU: sparse_mask_cpu
5323
5681
  SparseCUDA: sparse_mask_cuda
5682
+ SparseCsrCPU, SparseCsrCUDA: sparse_mask_sparse_csr
5324
5683
 
5325
5684
  - func: _to_cpu(Tensor[] tensors) -> Tensor[]
5326
5685
  variants: function
5327
5686
 
5328
5687
  - func: to_dense(Tensor self, ScalarType? dtype=None) -> Tensor
5329
5688
  variants: method
5689
+
5690
+ # Special case of to_dense with custom derivative
5691
+ - func: _to_dense(Tensor self, ScalarType? dtype=None) -> Tensor
5692
+ variants: method
5330
5693
  dispatch:
5331
- SparseCPU, SparseCUDA, SparseCsrCPU, SparseCsrCUDA: sparse_to_dense
5694
+ SparseCPU, SparseCUDA: sparse_to_dense
5695
+ SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_dense
5332
5696
  MkldnnCPU: mkldnn_to_dense
5333
5697
 
5334
5698
  - func: to_dense_backward(Tensor grad, Tensor input) -> Tensor
@@ -5414,6 +5778,7 @@
5414
5778
  SparseCPU, SparseCUDA: _coalesced_sparse_
5415
5779
  device_check: NoCheck
5416
5780
  device_guard: False
5781
+ autogen: _coalesced.functional, _coalesced.out
5417
5782
 
5418
5783
  - func: indices(Tensor(a) self) -> Tensor(a)
5419
5784
  variants: method
@@ -5444,6 +5809,20 @@
5444
5809
  device_check: NoCheck
5445
5810
  device_guard: False
5446
5811
 
5812
+ - func: ccol_indices(Tensor(a) self) -> Tensor(a)
5813
+ variants: method
5814
+ dispatch:
5815
+ SparseCsrCPU, SparseCsrCUDA: ccol_indices_sparse_csr
5816
+ device_check: NoCheck
5817
+ device_guard: False
5818
+
5819
+ - func: row_indices(Tensor(a) self) -> Tensor(a)
5820
+ variants: method
5821
+ dispatch:
5822
+ SparseCsrCPU, SparseCsrCUDA: row_indices_sparse_csr
5823
+ device_check: NoCheck
5824
+ device_guard: False
5825
+
5447
5826
  - func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
5448
5827
  dispatch:
5449
5828
  SparseCPU: hspmm_out_sparse_cpu
@@ -5459,11 +5838,13 @@
5459
5838
  variants: function
5460
5839
  dispatch:
5461
5840
  SparseCPU, SparseCUDA: copy_sparse_
5841
+ autogen: copy_sparse_to_sparse.functional, copy_sparse_to_sparse.out
5462
5842
 
5463
5843
  - func: unbind.int(Tensor(a -> *) self, int dim=0) -> Tensor(a)[]
5464
5844
  variants: function, method
5465
5845
  dispatch:
5466
5846
  CompositeExplicitAutograd: unbind
5847
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_unbind
5467
5848
 
5468
5849
  - func: unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[]
5469
5850
  variants: function, method
@@ -5472,11 +5853,41 @@
5472
5853
  variants: method
5473
5854
  dispatch:
5474
5855
  CPU, CUDA: dense_to_sparse
5856
+ SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse
5475
5857
 
5476
5858
  - func: to_sparse(Tensor self) -> Tensor
5477
5859
  variants: method
5478
5860
  dispatch:
5479
5861
  CPU, CUDA: dense_to_sparse
5862
+ SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse
5863
+
5864
+ - func: to_sparse_csr(Tensor self) -> Tensor
5865
+ variants: method
5866
+ dispatch:
5867
+ CPU, CUDA: dense_to_sparse_csr
5868
+ SparseCPU, SparseCUDA: coo_to_sparse_csr
5869
+ SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csr
5870
+
5871
+ - func: to_sparse_csc(Tensor self) -> Tensor
5872
+ variants: method
5873
+ dispatch:
5874
+ CPU, CUDA: dense_to_sparse_csc
5875
+ SparseCPU, SparseCUDA: coo_to_sparse_csc
5876
+ SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csc
5877
+
5878
+ - func: to_sparse_bsr(Tensor self, int[2] blocksize) -> Tensor
5879
+ variants: method
5880
+ dispatch:
5881
+ CPU, CUDA: dense_to_sparse_bsr
5882
+ SparseCPU, SparseCUDA: coo_to_sparse_bsr
5883
+ SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsr
5884
+
5885
+ - func: to_sparse_bsc(Tensor self, int[2] blocksize) -> Tensor
5886
+ variants: method
5887
+ dispatch:
5888
+ CPU, CUDA: dense_to_sparse_bsc
5889
+ SparseCPU, SparseCUDA: coo_to_sparse_bsc
5890
+ SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsc
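to_sparse_csr/csc/bsr/bsc conversions are now dispatched from dense, COO, and compressed inputs alike. A small sketch of the CSR path; the CSC/BSR/BSC methods follow the same shape, with BSR/BSC additionally taking a blocksize:

    import torch

    dense = torch.tensor([[0.0, 1.0], [2.0, 0.0]])
    csr = dense.to_sparse_csr()   # dense -> CSR
    coo = dense.to_sparse()       # dense -> COO
    csr2 = coo.to_sparse_csr()    # COO -> CSR, via the new coo_to_sparse_csr kernel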
5480
5891
 
5481
5892
  - func: to_mkldnn(Tensor self, ScalarType? dtype=None) -> Tensor
5482
5893
  variants: method
@@ -5636,7 +6047,7 @@
5636
6047
  dispatch:
5637
6048
  CPU: fused_moving_avg_obs_fake_quant_cpu
5638
6049
  CUDA: fused_moving_avg_obs_fake_quant_cuda
5639
-
6050
+ autogen: _fused_moving_avg_obs_fq_helper.functional, _fused_moving_avg_obs_fq_helper.out
5640
6051
 
5641
6052
  - func: _choose_qparams_per_tensor(Tensor self, bool reduce_range=False) -> (float, int)
5642
6053
  variants: function
@@ -5722,16 +6133,33 @@
5722
6133
  dispatch:
5723
6134
  CPU: _local_scalar_dense_cpu
5724
6135
  CUDA: _local_scalar_dense_cuda
6136
+ MPS: _local_scalar_dense_mps
5725
6137
  variants: function
5726
6138
 
6139
+ # MPS LSTM implementation
6140
+
6141
+ - func: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
6142
+ dispatch:
6143
+ MPS: _lstm_mps
6144
+
6145
+ - func: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
6146
+ dispatch:
6147
+ MPS: lstm_mps_backward
6148
+
6149
+
5727
6150
  # Fused RNN kernels
5728
6151
  - func: _thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor, Tensor)
5729
6152
  dispatch:
5730
6153
  CUDA: _thnn_fused_lstm_cell_cuda
5731
6154
 
5732
- - func: _thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
6155
+ # NB: The composite version of this function below is a simple wrapper that duplicates some of the outputs
6156
+ # It is necessary to avoid triggering TensorImpl use count checks in debug mode
6157
+ # NB: this function is NOT differentiable
6158
+ - func: _thnn_fused_lstm_cell_backward_impl(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor)
5733
6159
  dispatch:
5734
- CUDA: _thnn_fused_lstm_cell_backward_cuda
6160
+ CUDA: _thnn_fused_lstm_cell_backward_impl_cuda
6161
+
6162
+ - func: _thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
5735
6163
 
5736
6164
  - func: _thnn_differentiable_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor input_gates, Tensor hidden_gates, Tensor? input_bias, Tensor? hidden_bias, Tensor cx, Tensor cy) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
5737
6165
 
@@ -5812,36 +6240,55 @@
5812
6240
  device_check: NoCheck
5813
6241
  device_guard: False
5814
6242
  dispatch:
5815
- CPU, CUDA: set_
6243
+ CPU, CUDA, Meta, MPS: set_
6244
+ autogen: set.source_Storage_functional, set.source_Storage_out
5816
6245
 
5817
6246
  - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!)
5818
6247
  variants: method
5819
6248
  device_check: NoCheck
5820
6249
  device_guard: False
5821
6250
  dispatch:
5822
- CPU: set_storage_cpu_
6251
+ CPU, Meta: set_storage_cpu_
5823
6252
  CUDA: set_storage_cuda_
6253
+ MPS: set_storage_mps_
5824
6254
  QuantizedCPU, QuantizedCUDA: set_storage_quantized_
6255
+ autogen: set.source_Storage_storage_offset_functional, set.source_Storage_storage_offset_out
6256
+
6257
+ - func: set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!)
6258
+ variants: method
6259
+ device_check: NoCheck
6260
+ device_guard: False
5825
6261
 
5826
6262
  - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!)
5827
6263
  variants: method
5828
6264
  device_check: NoCheck
5829
6265
  device_guard: False
5830
6266
  dispatch:
5831
- CPU, CUDA: set_tensor_
6267
+ CPU, CUDA, Meta, MPS: set_tensor_
6268
+ autogen: set.source_Tensor_functional, set.source_Tensor_out
5832
6269
 
5833
6270
  - func: set_(Tensor(a!) self) -> Tensor(a!)
5834
6271
  variants: method
5835
6272
  dispatch:
5836
6273
  CPU: set_cpu_
5837
6274
  CUDA: set_cuda_
6275
+ Meta: set_meta_
6276
+ MPS: set_mps_
6277
+ autogen: set.functional, set.out
6278
+
6279
+ - func: lift(Tensor self) -> Tensor
6280
+ variants: method
6281
+ dispatch:
6282
+ # Not making it CompositeImplicitAutograd because lift
6283
+ # should be a primitive w.r.t. functorch
6284
+ CompositeExplicitAutograd: lift
5838
6285
 
5839
6286
  - func: is_set_to(Tensor self, Tensor tensor) -> bool
5840
6287
  variants: method
5841
6288
  device_check: NoCheck
5842
6289
  device_guard: False
5843
6290
  dispatch:
5844
- CPU, CUDA: is_set_to
6291
+ CPU, CUDA, MPS: is_set_to
5845
6292
 
5846
6293
  - func: masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!)
5847
6294
  device_check: NoCheck # TensorIterator
@@ -5849,6 +6296,8 @@
5849
6296
  dispatch:
5850
6297
  CPU: masked_fill__cpu
5851
6298
  CUDA: masked_fill__cuda
6299
+ MPS: masked_fill__mps
6300
+ autogen: masked_fill.Scalar_out
5852
6301
 
5853
6302
  - func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor
5854
6303
  device_check: NoCheck # TensorIterator
@@ -5862,6 +6311,8 @@
5862
6311
  dispatch:
5863
6312
  CPU: masked_fill__cpu
5864
6313
  CUDA: masked_fill__cuda
6314
+ MPS: masked_fill__mps
6315
+ autogen: masked_fill.Tensor_out
5865
6316
 
5866
6317
  - func: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor
5867
6318
  device_check: NoCheck # TensorIterator
@@ -5874,23 +6325,29 @@
5874
6325
  dispatch:
5875
6326
  CPU: masked_scatter__cpu
5876
6327
  CUDA: masked_scatter__cuda
6328
+ autogen: masked_scatter.out
5877
6329
 
5878
6330
  - func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor
5879
6331
  variants: function, method
5880
6332
  dispatch:
5881
6333
  CompositeExplicitAutograd: masked_scatter
5882
6334
 
5883
- - func: _masked_softmax(Tensor self, Tensor mask) -> Tensor
6335
+ - func: _masked_softmax(Tensor self, Tensor mask, int? dim=None) -> Tensor
5884
6336
  dispatch:
5885
6337
  CUDA: masked_softmax_cuda
5886
6338
  CPU: masked_softmax_cpu
5887
6339
 
6340
+ - func: _masked_softmax_backward(Tensor grad_output, Tensor output, Tensor mask, int? dim=None) -> Tensor
6341
+ dispatch:
6342
+ CUDA: masked_softmax_backward_cuda
6343
+ CPU: masked_softmax_backward_cpu
6344
+
5888
6345
  - func: view(Tensor(a) self, int[] size) -> Tensor(a)
5889
6346
  variants: method
5890
6347
  device_check: NoCheck
5891
6348
  device_guard: False
5892
6349
  dispatch:
5893
- ZeroTensor, CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA: view
6350
+ ZeroTensor, CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, MPS: view
5894
6351
  MkldnnCPU: mkldnn_view
5895
6352
 
5896
6353
  # Warning: If you want to change the name or overload name of this
@@ -5909,7 +6366,8 @@
5909
6366
  - func: put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!)
5910
6367
  variants: method
5911
6368
  dispatch:
5912
- CPU, CUDA: put_
6369
+ CPU, CUDA, MPS: put_
6370
+ autogen: put.out
5913
6371
 
5914
6372
  - func: put(Tensor self, Tensor index, Tensor source, bool accumulate=False) -> Tensor
5915
6373
  variants: function, method
@@ -5934,12 +6392,30 @@
5934
6392
  - func: index_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor
5935
6393
  variants: function, method
5936
6394
 
6395
+ - func: index_reduce.out(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!)
6396
+ structured: True
6397
+ variants: function
6398
+ precomputed:
6399
+ - dim -> int dim
6400
+ dispatch:
6401
+ CPU: index_reduce_cpu_out
6402
+ CUDA: index_reduce_cuda_out
6403
+
6404
+ - func: index_reduce_(Tensor(a!) self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor(a!)
6405
+ structured_delegate: index_reduce.out
6406
+ variants: method
6407
+
6408
+ - func: index_reduce(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor
6409
+ structured_delegate: index_reduce.out
6410
+ variants: function, method
6411
+
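index_reduce is a new structured op: rows of source are accumulated into self at the positions named by index, using the given reduction. Sketch assuming the in-place Python method of the same name:

    import torch

    acc = torch.ones(3, 2)
    index = torch.tensor([0, 2, 0])
    source = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    # multiply rows of `source` into `acc` at the rows named by `index`
    acc.index_reduce_(0, index, source, "prod", include_self=False)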
5937
6412
  - func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!)
5938
6413
  device_check: NoCheck # TensorIterator
5939
6414
  variants: method
5940
6415
  dispatch:
5941
6416
  CPU: index_fill_
5942
6417
  CUDA: index_fill_
6418
+ autogen: index_fill.int_Scalar_out
5943
6419
 
5944
6420
  - func: index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
5945
6421
  device_check: NoCheck # TensorIterator
@@ -5952,6 +6428,7 @@
5952
6428
  variants: method
5953
6429
  dispatch:
5954
6430
  CPU, CUDA: index_fill_
6431
+ autogen: index_fill.int_Tensor_out
5955
6432
 
5956
6433
  - func: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor
5957
6434
  device_check: NoCheck # TensorIterator
@@ -5988,6 +6465,7 @@
5988
6465
  variants: function
5989
6466
  dispatch:
5990
6467
  CPU, CUDA: scatter_src_out
6468
+ MPS: scatter_src_out_mps
5991
6469
 
5992
6470
  - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
5993
6471
  structured_delegate: scatter.value_out
@@ -6002,6 +6480,7 @@
6002
6480
  variants: function
6003
6481
  dispatch:
6004
6482
  CPU, CUDA: scatter_value_out
6483
+ MPS: scatter_value_out_mps
6005
6484
 
6006
6485
  - func: scatter.reduce(Tensor self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor
6007
6486
  structured_delegate: scatter.reduce_out
@@ -6016,6 +6495,7 @@
6016
6495
  variants: function
6017
6496
  dispatch:
6018
6497
  CPU, CUDA: scatter_reduce_out
6498
+ MPS: scatter_reduce_out_mps
6019
6499
 
6020
6500
  - func: scatter.value_reduce(Tensor self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor
6021
6501
  structured_delegate: scatter.value_reduce_out
@@ -6030,6 +6510,7 @@
6030
6510
  variants: function
6031
6511
  dispatch:
6032
6512
  CPU, CUDA: scatter_value_reduce_out
6513
+ MPS: scatter_value_reduce_out_mps
6033
6514
 
6034
6515
  - func: scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor
6035
6516
  variants: function, method
@@ -6050,14 +6531,24 @@
6050
6531
  variants: function
6051
6532
  dispatch:
6052
6533
  CPU, CUDA: scatter_add
6534
+ MPS: scatter_add_mps_out
6053
6535
 
6054
6536
  - func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor
6055
6537
  variants: function, method
6056
6538
 
6057
- - func: scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor
6539
+ - func: scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor
6540
+ structured_delegate: scatter_reduce.two_out
6058
6541
  variants: function, method
6542
+
6543
+ - func: scatter_reduce_.two(Tensor(a!) self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor(a!)
6544
+ structured_delegate: scatter_reduce.two_out
6545
+ variants: method
6546
+
6547
+ - func: scatter_reduce.two_out(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!)
6548
+ structured: True
6549
+ variants: function
6059
6550
  dispatch:
6060
- CPU: scatter_reduce_two_cpu
6551
+ CPU, CUDA: scatter_reduce_two
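scatter_reduce.two changes shape in this release: it now takes a src tensor and an include_self flag, becomes structured, and gains in-place and out variants with a CUDA kernel. Sketch with the Python method:

    import torch

    base = torch.zeros(2)
    index = torch.tensor([0, 1, 0, 1])
    src = torch.tensor([1.0, 2.0, 3.0, 4.0])
    base.scatter_reduce_(0, index, src, reduce="sum", include_self=True)
    # base is now tensor([4., 6.])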
6061
6552
 
6062
6553
  - func: eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
6063
6554
  structured_delegate: eq.Scalar_out
@@ -6093,6 +6584,12 @@
6093
6584
  dispatch:
6094
6585
  CompositeExplicitAutograd: bitwise_and
6095
6586
 
6587
+ - func: bitwise_and.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
6588
+ device_check: NoCheck # TensorIterator
6589
+ variants: function
6590
+ dispatch:
6591
+ CompositeExplicitAutograd: bitwise_and
6592
+
6096
6593
  - func: bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor
6097
6594
  device_check: NoCheck # TensorIterator
6098
6595
  variants: method, function
@@ -6141,6 +6638,12 @@
6141
6638
  device_check: NoCheck # TensorIterator
6142
6639
  variants: method, function
6143
6640
 
6641
+ - func: bitwise_or.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
6642
+ device_check: NoCheck # TensorIterator
6643
+ variants: function
6644
+ dispatch:
6645
+ CompositeExplicitAutograd: bitwise_or
6646
+
6144
6647
  - func: bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor
6145
6648
  device_check: NoCheck # TensorIterator
6146
6649
  variants: method, function
@@ -6189,6 +6692,12 @@
6189
6692
  device_check: NoCheck # TensorIterator
6190
6693
  variants: method, function
6191
6694
 
6695
+ - func: bitwise_xor.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
6696
+ device_check: NoCheck # TensorIterator
6697
+ variants: function
6698
+ dispatch:
6699
+ CompositeExplicitAutograd: bitwise_xor
6700
+
6192
6701
  - func: bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor
6193
6702
  device_check: NoCheck # TensorIterator
6194
6703
  variants: method, function
@@ -6236,12 +6745,14 @@
6236
6745
  variants: method
6237
6746
  dispatch:
6238
6747
  CPU, CUDA: __ilshift__
6748
+ autogen: __lshift__.Scalar_out
6239
6749
 
6240
6750
  - func: __ilshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
6241
6751
  device_check: NoCheck # TensorIterator
6242
6752
  variants: method
6243
6753
  dispatch:
6244
6754
  CPU, CUDA: __ilshift__
6755
+ autogen: __lshift__.Tensor_out
6245
6756
 
6246
6757
  - func: bitwise_left_shift.Tensor(Tensor self, Tensor other) -> Tensor
6247
6758
  device_check: NoCheck # TensorIterator
@@ -6264,25 +6775,25 @@
6264
6775
  device_check: NoCheck # TensorIterator
6265
6776
  variants: method, function
6266
6777
  dispatch:
6267
- CPU, CUDA: bitwise_left_shift
6778
+ CompositeExplicitAutograd: bitwise_left_shift
6268
6779
 
6269
6780
  - func: bitwise_left_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
6270
6781
  device_check: NoCheck # TensorIterator
6271
6782
  variants: method
6272
6783
  dispatch:
6273
- CPU, CUDA: bitwise_left_shift_
6784
+ CompositeExplicitAutograd: bitwise_left_shift_
6274
6785
 
6275
6786
  - func: bitwise_left_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
6276
6787
  device_check: NoCheck # TensorIterator
6277
6788
  variants: function
6278
6789
  dispatch:
6279
- CPU, CUDA: bitwise_left_shift_out
6790
+ CompositeExplicitAutograd: bitwise_left_shift_out
6280
6791
 
6281
6792
  - func: bitwise_left_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
6282
6793
  device_check: NoCheck # TensorIterator
6283
6794
  variants: function
6284
6795
  dispatch:
6285
- CPU, CUDA: bitwise_left_shift
6796
+ CompositeExplicitAutograd: bitwise_left_shift
6286
6797
 
6287
6798
  - func: __rshift__.Scalar(Tensor self, Scalar other) -> Tensor
6288
6799
  device_check: NoCheck # TensorIterator
@@ -6301,12 +6812,14 @@
6301
6812
  variants: method
6302
6813
  dispatch:
6303
6814
  CPU, CUDA: __irshift__
6815
+ autogen: __rshift__.Scalar_out
6304
6816
 
6305
6817
  - func: __irshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
6306
6818
  device_check: NoCheck # TensorIterator
6307
6819
  variants: method
6308
6820
  dispatch:
6309
6821
  CPU, CUDA: __irshift__
6822
+ autogen: __rshift__.Tensor_out
6310
6823
 
6311
6824
  - func: bitwise_right_shift.Tensor(Tensor self, Tensor other) -> Tensor
6312
6825
  device_check: NoCheck # TensorIterator
@@ -6329,25 +6842,25 @@
6329
6842
  device_check: NoCheck # TensorIterator
6330
6843
  variants: method, function
6331
6844
  dispatch:
6332
- CPU, CUDA: bitwise_right_shift
6845
+ CompositeExplicitAutograd: bitwise_right_shift
6333
6846
 
6334
6847
  - func: bitwise_right_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
6335
6848
  device_check: NoCheck # TensorIterator
6336
6849
  variants: method
6337
6850
  dispatch:
6338
- CPU, CUDA: bitwise_right_shift_
6851
+ CompositeExplicitAutograd: bitwise_right_shift_
6339
6852
 
6340
6853
  - func: bitwise_right_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
6341
6854
  device_check: NoCheck # TensorIterator
6342
6855
  variants: function
6343
6856
  dispatch:
6344
- CPU, CUDA: bitwise_right_shift_out
6857
+ CompositeExplicitAutograd: bitwise_right_shift_out
6345
6858
 
6346
6859
  - func: bitwise_right_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
6347
6860
  device_check: NoCheck # TensorIterator
6348
6861
  variants: function
6349
6862
  dispatch:
6350
- CPU, CUDA: bitwise_right_shift
6863
+ CompositeExplicitAutograd: bitwise_right_shift
6351
6864
 
6352
6865
  - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
6353
6866
  structured_delegate: tril.out
@@ -6376,15 +6889,18 @@
6376
6889
  variants: method
6377
6890
  dispatch:
6378
6891
  CPU, CUDA: addbmm_
6892
+ MPS: addbmm_mps_
6379
6893
 
6380
6894
  - func: addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
6381
6895
  dispatch:
6382
6896
  CPU, CUDA: addbmm_out
6897
+ MPS: addbmm_out_mps
6383
6898
 
6384
6899
  - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
6385
6900
  variants: method, function
6386
6901
  dispatch:
6387
6902
  CPU, CUDA: addbmm
6903
+ MPS: addbmm_mps
6388
6904
 
6389
6905
  - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!)
6390
6906
  device_check: NoCheck # TensorIterator
@@ -6392,6 +6908,8 @@
6392
6908
  dispatch:
6393
6909
  CPU, CUDA: random_
6394
6910
  Meta: random_meta_
6911
+ MPS: random_mps_
6912
+ autogen: random.from_functional, random.from_out
6395
6913
 
6396
6914
  - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!)
6397
6915
  device_check: NoCheck # TensorIterator
@@ -6399,6 +6917,8 @@
6399
6917
  dispatch:
6400
6918
  CPU, CUDA: random_
6401
6919
  Meta: random_meta_
6920
+ MPS: random_mps_
6921
+ autogen: random.to_functional, random.to_out
6402
6922
 
6403
6923
  - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!)
6404
6924
  device_check: NoCheck # TensorIterator
@@ -6406,31 +6926,37 @@
6406
6926
  dispatch:
6407
6927
  CPU, CUDA: random_
6408
6928
  Meta: random_meta_
6929
+ autogen: random.functional, random.out
6409
6930
 
6410
6931
  - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!)
6411
6932
  device_check: NoCheck # TensorIterator
6412
6933
  variants: method
6413
6934
  dispatch:
6414
6935
  CPU, CUDA: uniform_
6936
+ MPS: uniform_mps_
6415
6937
  Meta: uniform_meta_
6938
+ autogen: uniform.functional, uniform.out
6416
6939
 
6417
6940
  - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!)
6418
6941
  device_check: NoCheck # TensorIterator
6419
6942
  variants: method
6420
6943
  dispatch:
6421
6944
  CPU, CUDA: cauchy_
6945
+ autogen: cauchy.functional, cauchy.out
6422
6946
 
6423
6947
  - func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!)
6424
6948
  device_check: NoCheck # TensorIterator
6425
6949
  variants: method
6426
6950
  dispatch:
6427
6951
  CPU, CUDA: log_normal_
6952
+ autogen: log_normal.functional, log_normal.out
6428
6953
 
6429
6954
  - func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!)
6430
6955
  device_check: NoCheck # TensorIterator
6431
6956
  variants: method
6432
6957
  dispatch:
6433
6958
  CPU, CUDA: exponential_
6959
+ autogen: exponential.functional, exponential.out
6434
6960
 
6435
6961
  - func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!)
6436
6962
  device_check: NoCheck # TensorIterator
@@ -6439,11 +6965,13 @@
6439
6965
  CPU, CUDA: geometric_
6440
6966
 
6441
6967
  # wrappers for TH functions
6968
+ autogen: geometric.functional, geometric.out
6442
6969
 
6443
6970
  - func: diag.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
6444
6971
  dispatch:
6445
6972
  CPU: diag_cpu_out
6446
6973
  CUDA: diag_cuda_out
6974
+ MPS: diag_mps_out
6447
6975
 
6448
6976
  - func: diag(Tensor self, int diagonal=0) -> Tensor
6449
6977
  variants: method, function
@@ -6465,6 +6993,7 @@
6465
6993
  dispatch:
6466
6994
  CPU: triu_cpu
6467
6995
  CUDA: triu_cuda
6996
+ MPS: triu_mps_out
6468
6997
 
6469
6998
  - func: triu(Tensor self, int diagonal=0) -> Tensor
6470
6999
  structured_delegate: triu.out
@@ -6475,6 +7004,7 @@
6475
7004
  dispatch:
6476
7005
  CPU: tril_cpu
6477
7006
  CUDA: tril_cuda
7007
+ MPS: tril_mps_out
6478
7008
 
6479
7009
  - func: tril(Tensor self, int diagonal=0) -> Tensor
6480
7010
  structured_delegate: tril.out
@@ -6507,6 +7037,7 @@
6507
7037
  device_check: NoCheck # TensorIterator
6508
7038
  dispatch:
6509
7039
  CPU, CUDA: ne_Scalar_out
7040
+ MPS: ne_scalar_out_mps
6510
7041
  QuantizedCPU: ne_out_quantized_cpu
6511
7042
 
6512
7043
  - func: ne.Scalar(Tensor self, Scalar other) -> Tensor
@@ -6522,6 +7053,7 @@
6522
7053
  device_check: NoCheck # TensorIterator
6523
7054
  dispatch:
6524
7055
  CPU, CUDA: ne_Tensor_out
7056
+ MPS: ne_tensor_out_mps
6525
7057
  QuantizedCPU: ne_out_quantized_cpu
6526
7058
 
6527
7059
  - func: ne.Tensor(Tensor self, Tensor other) -> Tensor
@@ -6568,6 +7100,7 @@
6568
7100
  device_check: NoCheck # TensorIterator
6569
7101
  dispatch:
6570
7102
  CPU, CUDA: eq_Scalar_out
7103
+ MPS: eq_scalar_out_mps
6571
7104
  QuantizedCPU: eq_out_quantized_cpu
6572
7105
 
6573
7106
  - func: eq.Scalar(Tensor self, Scalar other) -> Tensor
@@ -6583,6 +7116,7 @@
6583
7116
  device_check: NoCheck # TensorIterator
6584
7117
  dispatch:
6585
7118
  CPU, CUDA: eq_Tensor_out
7119
+ MPS: eq_tensor_out_mps
6586
7120
  QuantizedCPU: eq_out_quantized_cpu
6587
7121
 
6588
7122
  - func: eq.Tensor(Tensor self, Tensor other) -> Tensor
@@ -6598,6 +7132,7 @@
6598
7132
  device_check: NoCheck # TensorIterator
6599
7133
  dispatch:
6600
7134
  CPU, CUDA: ge_Scalar_out
7135
+ MPS: ge_scalar_out_mps
6601
7136
  QuantizedCPU: ge_out_quantized_cpu
6602
7137
 
6603
7138
  - func: ge.Scalar(Tensor self, Scalar other) -> Tensor
@@ -6613,6 +7148,7 @@
6613
7148
  device_check: NoCheck # TensorIterator
6614
7149
  dispatch:
6615
7150
  CPU, CUDA: ge_Tensor_out
7151
+ MPS: ge_tensor_out_mps
6616
7152
  QuantizedCPU: ge_out_quantized_cpu
6617
7153
 
6618
7154
  - func: ge.Tensor(Tensor self, Tensor other) -> Tensor
@@ -6659,6 +7195,7 @@
6659
7195
  device_check: NoCheck # TensorIterator
6660
7196
  dispatch:
6661
7197
  CPU, CUDA: le_Scalar_out
7198
+ MPS: le_scalar_out_mps
6662
7199
  QuantizedCPU: le_out_quantized_cpu
6663
7200
 
6664
7201
  - func: le.Scalar(Tensor self, Scalar other) -> Tensor
@@ -6674,6 +7211,7 @@
6674
7211
  device_check: NoCheck # TensorIterator
6675
7212
  dispatch:
6676
7213
  CPU, CUDA: le_Tensor_out
7214
+ MPS: le_tensor_out_mps
6677
7215
  QuantizedCPU: le_out_quantized_cpu
6678
7216
 
6679
7217
  - func: le.Tensor(Tensor self, Tensor other) -> Tensor
@@ -6720,6 +7258,7 @@
6720
7258
  device_check: NoCheck # TensorIterator
6721
7259
  dispatch:
6722
7260
  CPU, CUDA: gt_Scalar_out
7261
+ MPS: gt_scalar_out_mps
6723
7262
  QuantizedCPU: gt_out_quantized_cpu
6724
7263
 
6725
7264
  - func: gt.Scalar(Tensor self, Scalar other) -> Tensor
@@ -6735,6 +7274,7 @@
6735
7274
  device_check: NoCheck # TensorIterator
6736
7275
  dispatch:
6737
7276
  CPU, CUDA: gt_Tensor_out
7277
+ MPS: gt_tensor_out_mps
6738
7278
  QuantizedCPU: gt_out_quantized_cpu
6739
7279
 
6740
7280
  - func: gt.Tensor(Tensor self, Tensor other) -> Tensor
@@ -6781,6 +7321,7 @@
6781
7321
  device_check: NoCheck # TensorIterator
6782
7322
  dispatch:
6783
7323
  CPU, CUDA: lt_Scalar_out
7324
+ MPS: lt_scalar_out_mps
6784
7325
  QuantizedCPU: lt_out_quantized_cpu
6785
7326
 
6786
7327
  - func: lt.Scalar(Tensor self, Scalar other) -> Tensor
@@ -6796,6 +7337,7 @@
6796
7337
  device_check: NoCheck # TensorIterator
6797
7338
  dispatch:
6798
7339
  CPU, CUDA: lt_Tensor_out
7340
+ MPS: lt_tensor_out_mps
6799
7341
  QuantizedCPU: lt_out_quantized_cpu
6800
7342
 
6801
7343
  - func: lt.Tensor(Tensor self, Tensor other) -> Tensor
@@ -6854,15 +7396,18 @@
6854
7396
  dispatch:
6855
7397
  CPU, QuantizedCPU: index_select_out_cpu_
6856
7398
  CUDA, QuantizedCUDA: index_select_out_cuda
7399
+ MPS: index_select_out_mps
6857
7400
 
6858
7401
  - func: index_select(Tensor self, int dim, Tensor index) -> Tensor
6859
7402
  variants: method, function
6860
7403
  dispatch:
6861
7404
  CPU: index_select_cpu_
6862
7405
  QuantizedCPU: index_select_quantized_cpu_
6863
- CUDA, QuantizedCUDA: index_select_cuda
6864
- SparseCPU: index_select_sparse
6865
- SparseCUDA: index_select_sparse
7406
+ CUDA: index_select_cuda
7407
+ QuantizedCUDA: index_select_quantized_cuda
7408
+ SparseCPU: index_select_sparse_cpu
7409
+ SparseCUDA: index_select_sparse_cuda
7410
+ MPS: index_select_mps
6866
7411
 
6867
7412
  - func: index_select.dimname_out(Tensor self, Dimname dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!)
6868
7413
 
@@ -6911,6 +7456,7 @@
6911
7456
  structured: True
6912
7457
  dispatch:
6913
7458
  CPU, CUDA: gather_out
7459
+ MPS: gather_out_mps
6914
7460
 
6915
7461
  - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor
6916
7462
  variants: method, function
@@ -6934,6 +7480,7 @@
6934
7480
  device_check: NoCheck # TensorIterator
6935
7481
  dispatch:
6936
7482
  CPU, CUDA: addcmul_out
7483
+ MPS: addcmul_out_mps
6937
7484
 
6938
7485
  - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
6939
7486
  structured_delegate: addcmul.out
@@ -6951,6 +7498,7 @@
6951
7498
  device_check: NoCheck # TensorIterator
6952
7499
  dispatch:
6953
7500
  CPU, CUDA: addcdiv_out
7501
+ MPS: addcdiv_out_mps
6954
7502
 
6955
7503
  - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
6956
7504
  structured_delegate: addcdiv.out
@@ -6998,10 +7546,13 @@
6998
7546
 
6999
7547
  - func: linalg_solve_triangular(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False) -> Tensor
7000
7548
  python_module: linalg
7001
- variants: method, function
7549
+ variants: function
7002
7550
  dispatch:
7003
7551
  CPU, CUDA: linalg_solve_triangular
7004
7552
 
7553
+ - func: linalg_vander(Tensor x, *, int? N=None) -> Tensor
7554
+ python_module: linalg
7555
+
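linalg_vander is a new composite op that builds a Vandermonde matrix with increasing powers as columns. Quick sketch (N is keyword-only per the schema):

    import torch

    x = torch.tensor([1.0, 2.0, 3.0])
    torch.linalg.vander(x, N=3)
    # tensor([[1., 1., 1.],
    #         [1., 2., 4.],
    #         [1., 3., 9.]])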
7005
7556
  - func: symeig.e(Tensor self, bool eigenvectors=False, bool upper=True, *, Tensor(a!) e, Tensor(b!) V) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
7006
7557
  dispatch:
7007
7558
  CompositeExplicitAutograd: symeig_out
@@ -7079,21 +7630,6 @@
7079
7630
  CPU: _cholesky_solve_helper_cpu
7080
7631
  CUDA: _cholesky_solve_helper_cuda
7081
7632
 
7082
- - func: solve(Tensor self, Tensor A) -> (Tensor solution, Tensor LU)
7083
- variants: function, method
7084
- dispatch:
7085
- CompositeExplicitAutograd: solve
7086
-
7087
- - func: solve.solution(Tensor self, Tensor A, *, Tensor(a!) solution, Tensor(b!) lu) -> (Tensor(a!) solution, Tensor(b!) LU)
7088
- dispatch:
7089
- CompositeExplicitAutograd: solve_out
7090
-
7091
- - func: _solve_helper(Tensor self, Tensor A) -> (Tensor, Tensor)
7092
- variants: function
7093
- dispatch:
7094
- CPU: _solve_helper_cpu
7095
- CUDA: _solve_helper_cuda
7096
-
7097
7633
  - func: cholesky_inverse(Tensor self, bool upper=False) -> Tensor
7098
7634
  variants: method, function
7099
7635
  dispatch:
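
The hunk above deletes the deprecated `solve`, `solve.solution`, and `_solve_helper` entries. The documented upstream replacement is `torch.linalg.solve`, which takes the matrix first (the removed `solve` took the right-hand side first):

```python
import torch

A = torch.randn(3, 3)
b = torch.randn(3, 1)
x = torch.linalg.solve(A, b)   # replaces the removed solve(b, A)
print(torch.allclose(A @ x, b, atol=1e-5))
```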
@@ -7144,13 +7680,14 @@
7144
7680
  dispatch:
7145
7681
  CPU, CUDA: lu_solve
7146
7682
 
7683
+ # lu_unpack
7147
7684
  - func: lu_unpack(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True) -> (Tensor P, Tensor L, Tensor U)
7685
+ structured_delegate: lu_unpack.out
7148
7686
  variants: function
7149
- dispatch:
7150
- CPU, CUDA: lu_unpack
7151
7687
 
7152
7688
  - func: lu_unpack.out(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True, *, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U)
7153
7689
  variants: function
7690
+ structured: True
7154
7691
  dispatch:
7155
7692
  CPU, CUDA: lu_unpack_out
7156
7693
 
@@ -7274,6 +7811,7 @@
7274
7811
  structured_inherits: TensorIteratorBase
7275
7812
  dispatch:
7276
7813
  CPU, CUDA: sign_out
7814
+ MPS: sign_out_mps
7277
7815
  SparseCPU, SparseCUDA: sign_sparse_out
7278
7816
  SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr_out
7279
7817
 
@@ -7305,6 +7843,7 @@
7305
7843
  structured_inherits: TensorIteratorBase
7306
7844
  dispatch:
7307
7845
  CPU, CUDA: atan2_out
7846
+ MPS: atan2_mps_out
7308
7847
 
7309
7848
  - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)
7310
7849
  device_check: NoCheck # TensorIterator
@@ -7391,6 +7930,12 @@
7391
7930
  dispatch:
7392
7931
  CPU: histogramdd_cpu
7393
7932
 
7933
+ - func: histogramdd(Tensor self, int[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)
7934
+
7935
+ - func: histogramdd.int_bins(Tensor self, int bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)
7936
+
7937
+ - func: histogramdd.TensorList_bins(Tensor self, Tensor[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)
7938
+
7394
7939
  - func: fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
7395
7940
  device_check: NoCheck # TensorIterator
7396
7941
  dispatch:
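
The hunk above adds three `histogramdd` overloads (bins as a list of ints, a single int, or a list of edge tensors) alongside the existing CPU-only kernel (`CPU: histogramdd_cpu`). Roughly how this surfaces as `torch.histogramdd` upstream:

```python
import torch

points = torch.rand(1000, 2)                  # 1000 samples in 2-D, kept on CPU
hist, bin_edges = torch.histogramdd(points, bins=[5, 5], density=True)
print(hist.shape)                             # torch.Size([5, 5])
print(len(bin_edges), bin_edges[0].numel())   # 2 edge tensors, 6 edges each
```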
@@ -7528,6 +8073,7 @@
7528
8073
  variants: method, function
7529
8074
  dispatch:
7530
8075
  CPU, CUDA: min
8076
+ MPS: min_mps
7531
8077
  QuantizedCPU: min_quantized_cpu
7532
8078
 
7533
8079
  - func: fmin(Tensor self, Tensor other) -> Tensor
@@ -7547,6 +8093,7 @@
7547
8093
  variants: method, function
7548
8094
  dispatch:
7549
8095
  CPU, CUDA: max
8096
+ MPS: max_mps
7550
8097
  QuantizedCPU: max_quantized_cpu
7551
8098
 
7552
8099
  - func: fmax(Tensor self, Tensor other) -> Tensor
@@ -7572,6 +8119,7 @@
7572
8119
  device_check: NoCheck # TensorIterator
7573
8120
  dispatch:
7574
8121
  CPU, CUDA: maximum_out
8122
+ MPS: maximum_out_mps
7575
8123
 
7576
8124
  # binary max, alias of maximum
7577
8125
  # NOTE: max is not an alias for maximum, since there is also unary max
@@ -7593,6 +8141,7 @@
7593
8141
  device_check: NoCheck # TensorIterator
7594
8142
  dispatch:
7595
8143
  CPU, CUDA: minimum_out
8144
+ MPS: minimum_out_mps
7596
8145
 
7597
8146
  # binary min, alias for minimum
7598
8147
  # NOTE: min is not an alias for minimum, since there is also unary min
@@ -7626,27 +8175,23 @@
7626
8175
  - func: sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
7627
8176
  device_check: NoCheck # TensorIterator
7628
8177
  dispatch:
7629
- CPU: sort_out_cpu
7630
- CUDA: sort_out_cuda
8178
+ CompositeExplicitAutograd: sort_out
7631
8179
 
7632
8180
  - func: sort.values_stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
8181
+ structured: True
7633
8182
  dispatch:
7634
- CPU: sort_out_cpu_stable
7635
- CUDA: sort_out_stable_cuda
8183
+ CPU, CUDA: sort_stable_out
7636
8184
 
7637
8185
  - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
7638
8186
  device_check: NoCheck # TensorIterator
7639
8187
  variants: method, function
7640
8188
  dispatch:
7641
- CPU: sort_cpu
7642
- CUDA: sort_cuda
7643
- QuantizedCPU: sort_quantized_cpu
8189
+ CompositeExplicitAutograd: sort
7644
8190
 
7645
8191
  - func: sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
8192
+ structured_delegate: sort.values_stable
7646
8193
  variants: method, function
7647
8194
  dispatch:
7648
- CPU: sort_cpu_stable
7649
- CUDA: sort_stable_cuda
7650
8195
  QuantizedCPU: sort_quantized_cpu_stable
7651
8196
 
7652
8197
  - func: sort.dimname_values(Tensor self, Dimname dim, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
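
The hunk above reroutes sorting through a single structured `sort.values_stable` kernel: `sort.stable` becomes a `structured_delegate` of it, while the plain `sort` overloads turn into CompositeExplicitAutograd wrappers. User-visible behaviour is unchanged; for reference, stable sorting in the upstream Python API:

```python
import torch

x = torch.tensor([3, 1, 2, 1])
values, indices = torch.sort(x, stable=True)  # equal elements keep their order
print(values)   # tensor([1, 1, 2, 3])
print(indices)  # tensor([1, 3, 2, 0])
```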
@@ -7676,6 +8221,7 @@
7676
8221
  dispatch:
7677
8222
  CPU: topk_out_cpu
7678
8223
  CUDA: topk_out_cuda
8224
+ MPS: topk_out_mps
7679
8225
 
7680
8226
  - func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
7681
8227
  variants: method, function
@@ -7693,6 +8239,7 @@
7693
8239
  structured: True
7694
8240
  dispatch:
7695
8241
  CPU, CUDA: all_all_out
8242
+ MPS: all_all_out_mps
7696
8243
 
7697
8244
  - func: any(Tensor self) -> Tensor
7698
8245
  device_check: NoCheck # TensorIterator
@@ -7706,6 +8253,7 @@
7706
8253
  structured: True
7707
8254
  dispatch:
7708
8255
  CPU, CUDA: any_all_out
8256
+ MPS: any_all_out_mps
7709
8257
 
7710
8258
  - func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!)
7711
8259
  device_check: NoCheck # TensorIterator
@@ -7728,7 +8276,7 @@
7728
8276
  device_check: NoCheck
7729
8277
  device_guard: False
7730
8278
  dispatch:
7731
- CPU, CUDA: unfold
8279
+ CPU, CUDA, Meta: unfold
7732
8280
  QuantizedCPU, QuantizedCUDA: unfold
7733
8281
 
7734
8282
  - func: unfold_backward(Tensor grad_in, int[] input_sizes, int dim, int size, int step) -> Tensor
@@ -7749,6 +8297,7 @@
7749
8297
  structured_inherits: TensorIteratorBase
7750
8298
  dispatch:
7751
8299
  CPU, CUDA: pow_Tensor_Tensor_out
8300
+ MPS: pow_tensor_tensor_out_mps
7752
8301
 
7753
8302
  - func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor
7754
8303
  device_check: NoCheck # TensorIterator
@@ -7772,6 +8321,7 @@
7772
8321
  dispatch:
7773
8322
  CPU, CUDA: pow_Tensor_Scalar_out
7774
8323
  SparseCPU, SparseCUDA: pow_out_sparse_scalar
8324
+ MPS: pow_tensor_scalar_out_mps
7775
8325
 
7776
8326
  - func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor
7777
8327
  device_check: NoCheck # TensorIterator
@@ -7815,32 +8365,46 @@
7815
8365
  variants: method
7816
8366
  dispatch:
7817
8367
  CPU, CUDA: normal_
8368
+ MPS: normal_mps_
7818
8369
  Meta: normal_meta_
7819
8370
  SparseCsrCPU, SparseCsrCUDA: normal_sparse_csr_
8371
+ autogen: normal.functional, normal.out
7820
8372
 
7821
8373
  - func: normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
7822
8374
  dispatch:
7823
8375
  CPU, CUDA: normal_out
8376
+ MPS: normal_mps_out
8377
+ Meta: normal_out_meta
7824
8378
 
7825
8379
  - func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor
7826
8380
  dispatch:
7827
8381
  CPU, CUDA: normal
8382
+ #MPS: normal_mps
8383
+ Meta: normal_meta
7828
8384
 
7829
8385
  - func: normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
7830
8386
  dispatch:
7831
8387
  CPU, CUDA: normal_out
8388
+ Meta: normal_out_meta
8389
+ MPS: normal_mps_out
7832
8390
 
7833
8391
  - func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor
7834
8392
  dispatch:
7835
8393
  CPU, CUDA: normal
8394
+ Meta: normal_meta
8395
+ #MPS: normal_mps
7836
8396
 
7837
8397
  - func: normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
7838
8398
  dispatch:
7839
8399
  CPU, CUDA: normal_out
8400
+ Meta: normal_out_meta
8401
+ MPS: normal_mps_out
7840
8402
 
7841
8403
  - func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor
7842
8404
  dispatch:
7843
8405
  CPU, CUDA: normal
8406
+ Meta: normal_meta
8407
+ #MPS: normal_mps
7844
8408
 
7845
8409
  - func: normal.float_float(float mean, float std, int[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
7846
8410
 
@@ -7851,32 +8415,30 @@
7851
8415
  dispatch:
7852
8416
  CompositeExplicitAutograd: alias
7853
8417
 
7854
- - func: _index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!)
7855
- dispatch:
7856
- CPU: _index_copy_impl_
7857
- CUDA: _index_copy_impl_
7858
-
7859
8418
  - func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> ()
7860
8419
  variants: function
7861
8420
  dispatch:
7862
8421
  CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_
8422
+ autogen: _amp_foreach_non_finite_check_and_unscale.functional, _amp_foreach_non_finite_check_and_unscale.out
7863
8423
 
7864
8424
  - func: _amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!)
7865
8425
  variants: function
7866
8426
  dispatch:
7867
8427
  CUDA: _amp_update_scale_cuda_
8428
+ autogen: _amp_update_scale.functional, _amp_update_scale.out
7868
8429
 
7869
- - func: _cat(Tensor[] tensors, int dim=0) -> Tensor
7870
- dispatch:
7871
- CPU: _cat_cpu
7872
- CUDA: cat_cuda
7873
- QuantizedCPU: cat_quantized_cpu
8430
+ #- func: _cat(Tensor[] tensors, int dim=0) -> Tensor
8431
+ #dispatch:
8432
+ #CPU: _cat_cpu
8433
+ #CUDA: cat_cuda
8434
+ #MPS: cat_mps
8435
+ #QuantizedCPU: cat_quantized_cpu
7874
8436
 
7875
- - func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
7876
- dispatch:
7877
- CPU: _cat_out_cpu
7878
- CUDA: cat_out_cuda
7879
- QuantizedCPU: cat_out_quantized_cpu
8437
+ #- func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
8438
+ #dispatch:
8439
+ #CPU: _cat_out_cpu
8440
+ #CUDA: cat_out_cuda
8441
+ #QuantizedCPU: cat_out_quantized_cpu
7880
8442
 
7881
8443
  - func: _foreach_add.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[]
7882
8444
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -7891,6 +8453,7 @@
7891
8453
  dispatch:
7892
8454
  CPU: foreach_tensor_add_scalar_kernel_slow_
7893
8455
  CUDA: foreach_tensor_add_scalar_kernel_cuda_
8456
+ autogen: _foreach_add.Scalar_functional, _foreach_add.Scalar_out
7894
8457
 
7895
8458
  - func: _foreach_sub.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[]
7896
8459
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -7905,6 +8468,7 @@
7905
8468
  dispatch:
7906
8469
  CPU: foreach_tensor_sub_scalar_kernel_slow_
7907
8470
  CUDA: foreach_tensor_sub_scalar_kernel_cuda_
8471
+ autogen: _foreach_sub.Scalar_functional, _foreach_sub.Scalar_out
7908
8472
 
7909
8473
  - func: _foreach_mul.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[]
7910
8474
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -7919,6 +8483,7 @@
7919
8483
  dispatch:
7920
8484
  CPU: foreach_tensor_mul_scalar_kernel_slow_
7921
8485
  CUDA: foreach_tensor_mul_scalar_kernel_cuda_
8486
+ autogen: _foreach_mul.Scalar_functional, _foreach_mul.Scalar_out
7922
8487
 
7923
8488
  - func: _foreach_div.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[]
7924
8489
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -7933,6 +8498,7 @@
7933
8498
  dispatch:
7934
8499
  CPU: foreach_tensor_div_scalar_kernel_slow_
7935
8500
  CUDA: foreach_tensor_div_scalar_kernel_cuda_
8501
+ autogen: _foreach_div.Scalar_functional, _foreach_div.Scalar_out
7936
8502
 
7937
8503
  - func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[]
7938
8504
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -7947,6 +8513,7 @@
7947
8513
  dispatch:
7948
8514
  CPU: foreach_tensor_add_list_kernel_slow_
7949
8515
  CUDA: foreach_tensor_add_list_kernel_cuda_
8516
+ autogen: _foreach_add.List_functional, _foreach_add.List_out
7950
8517
 
7951
8518
  - func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[]
7952
8519
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -7961,6 +8528,7 @@
7961
8528
  dispatch:
7962
8529
  CPU: foreach_tensor_sub_list_kernel_slow_
7963
8530
  CUDA: foreach_tensor_sub_list_kernel_cuda_
8531
+ autogen: _foreach_sub.List_functional, _foreach_sub.List_out
7964
8532
 
7965
8533
  - func: _foreach_mul.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[]
7966
8534
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -7975,6 +8543,7 @@
7975
8543
  dispatch:
7976
8544
  CPU: foreach_tensor_mul_list_kernel_slow_
7977
8545
  CUDA: foreach_tensor_mul_list_kernel_cuda_
8546
+ autogen: _foreach_mul.List_functional, _foreach_mul.List_out
7978
8547
 
7979
8548
  - func: _foreach_div.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[]
7980
8549
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -7989,6 +8558,7 @@
7989
8558
  dispatch:
7990
8559
  CPU: foreach_tensor_div_list_kernel_slow_
7991
8560
  CUDA: foreach_tensor_div_list_kernel_cuda_
8561
+ autogen: _foreach_div.List_functional, _foreach_div.List_out
7992
8562
 
7993
8563
  - func: _foreach_add.ScalarList(Tensor[] tensors, Scalar[] scalars) -> Tensor[]
7994
8564
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8003,6 +8573,7 @@
8003
8573
  dispatch:
8004
8574
  CPU: foreach_tensor_add_scalarlist_kernel_slow_
8005
8575
  CUDA: foreach_tensor_add_scalarlist_kernel_cuda_
8576
+ autogen: _foreach_add.ScalarList_functional, _foreach_add.ScalarList_out
8006
8577
 
8007
8578
  - func: _foreach_sub.ScalarList(Tensor[] tensors, Scalar[] scalars) -> Tensor[]
8008
8579
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8017,6 +8588,7 @@
8017
8588
  dispatch:
8018
8589
  CPU: foreach_tensor_sub_scalarlist_kernel_slow_
8019
8590
  CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_
8591
+ autogen: _foreach_sub.ScalarList_functional, _foreach_sub.ScalarList_out
8020
8592
 
8021
8593
  - func: _foreach_div.ScalarList(Tensor[] tensors, Scalar[] scalars) -> Tensor[]
8022
8594
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8031,6 +8603,7 @@
8031
8603
  dispatch:
8032
8604
  CPU: foreach_tensor_div_scalarlist_kernel_slow_
8033
8605
  CUDA: foreach_tensor_div_scalarlist_kernel_cuda_
8606
+ autogen: _foreach_div.ScalarList_functional, _foreach_div.ScalarList_out
8034
8607
 
8035
8608
  - func: _foreach_mul.ScalarList(Tensor[] tensors, Scalar[] scalars) -> Tensor[]
8036
8609
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8045,6 +8618,7 @@
8045
8618
  dispatch:
8046
8619
  CPU: foreach_tensor_mul_scalarlist_kernel_slow_
8047
8620
  CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_
8621
+ autogen: _foreach_mul.ScalarList_functional, _foreach_mul.ScalarList_out
8048
8622
 
8049
8623
  - func: _foreach_exp(Tensor[] tensors) -> Tensor[]
8050
8624
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8059,6 +8633,7 @@
8059
8633
  dispatch:
8060
8634
  CPU: foreach_tensor_zero_slow_
8061
8635
  CUDA: foreach_tensor_zero_cuda_
8636
+ autogen: _foreach_zero.functional, _foreach_zero.out
8062
8637
 
8063
8638
  - func: _foreach_exp_(Tensor(a!)[] self) -> ()
8064
8639
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8066,6 +8641,7 @@
8066
8641
  dispatch:
8067
8642
  CPU: foreach_tensor_exp_slow_
8068
8643
  CUDA: foreach_tensor_exp_cuda_
8644
+ autogen: _foreach_exp.functional, _foreach_exp.out
8069
8645
 
8070
8646
  - func: _foreach_sqrt(Tensor[] tensors) -> Tensor[]
8071
8647
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8080,6 +8656,7 @@
8080
8656
  dispatch:
8081
8657
  CPU: foreach_tensor_sqrt_slow_
8082
8658
  CUDA: foreach_tensor_sqrt_cuda_
8659
+ autogen: _foreach_sqrt.functional, _foreach_sqrt.out
8083
8660
 
8084
8661
  - func: _foreach_abs(Tensor[] tensors) -> Tensor[]
8085
8662
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8094,6 +8671,7 @@
8094
8671
  dispatch:
8095
8672
  CPU: foreach_tensor_abs_slow_
8096
8673
  CUDA: foreach_tensor_abs_cuda_
8674
+ autogen: _foreach_abs.functional, _foreach_abs.out
8097
8675
 
8098
8676
  - func: _foreach_acos(Tensor[] tensors) -> Tensor[]
8099
8677
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8108,6 +8686,7 @@
8108
8686
  dispatch:
8109
8687
  CPU: foreach_tensor_acos_slow_
8110
8688
  CUDA: foreach_tensor_acos_cuda_
8689
+ autogen: _foreach_acos.functional, _foreach_acos.out
8111
8690
 
8112
8691
  - func: _foreach_asin(Tensor[] tensors) -> Tensor[]
8113
8692
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8122,6 +8701,7 @@
8122
8701
  dispatch:
8123
8702
  CPU: foreach_tensor_asin_slow_
8124
8703
  CUDA: foreach_tensor_asin_cuda_
8704
+ autogen: _foreach_asin.functional, _foreach_asin.out
8125
8705
 
8126
8706
  - func: _foreach_atan(Tensor[] tensors) -> Tensor[]
8127
8707
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8136,6 +8716,7 @@
8136
8716
  dispatch:
8137
8717
  CPU: foreach_tensor_atan_slow_
8138
8718
  CUDA: foreach_tensor_atan_cuda_
8719
+ autogen: _foreach_atan.functional, _foreach_atan.out
8139
8720
 
8140
8721
  - func: _foreach_ceil(Tensor[] tensors) -> Tensor[]
8141
8722
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8150,6 +8731,7 @@
8150
8731
  dispatch:
8151
8732
  CPU: foreach_tensor_ceil_slow_
8152
8733
  CUDA: foreach_tensor_ceil_cuda_
8734
+ autogen: _foreach_ceil.functional, _foreach_ceil.out
8153
8735
 
8154
8736
  - func: _foreach_cos(Tensor[] tensors) -> Tensor[]
8155
8737
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8164,6 +8746,7 @@
8164
8746
  dispatch:
8165
8747
  CPU: foreach_tensor_cos_slow_
8166
8748
  CUDA: foreach_tensor_cos_cuda_
8749
+ autogen: _foreach_cos.functional, _foreach_cos.out
8167
8750
 
8168
8751
  - func: _foreach_cosh(Tensor[] tensors) -> Tensor[]
8169
8752
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8178,6 +8761,7 @@
8178
8761
  dispatch:
8179
8762
  CPU: foreach_tensor_cosh_slow_
8180
8763
  CUDA: foreach_tensor_cosh_cuda_
8764
+ autogen: _foreach_cosh.functional, _foreach_cosh.out
8181
8765
 
8182
8766
  - func: _foreach_erf(Tensor[] tensors) -> Tensor[]
8183
8767
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8192,6 +8776,7 @@
8192
8776
  dispatch:
8193
8777
  CPU: foreach_tensor_erf_slow_
8194
8778
  CUDA: foreach_tensor_erf_cuda_
8779
+ autogen: _foreach_erf.functional, _foreach_erf.out
8195
8780
 
8196
8781
  - func: _foreach_erfc(Tensor[] tensors) -> Tensor[]
8197
8782
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8206,6 +8791,7 @@
8206
8791
  dispatch:
8207
8792
  CPU: foreach_tensor_erfc_slow_
8208
8793
  CUDA: foreach_tensor_erfc_cuda_
8794
+ autogen: _foreach_erfc.functional, _foreach_erfc.out
8209
8795
 
8210
8796
  - func: _foreach_expm1(Tensor[] tensors) -> Tensor[]
8211
8797
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8220,6 +8806,7 @@
8220
8806
  dispatch:
8221
8807
  CPU: foreach_tensor_expm1_slow_
8222
8808
  CUDA: foreach_tensor_expm1_cuda_
8809
+ autogen: _foreach_expm1.functional, _foreach_expm1.out
8223
8810
 
8224
8811
  - func: _foreach_floor(Tensor[] tensors) -> Tensor[]
8225
8812
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8234,6 +8821,7 @@
8234
8821
  dispatch:
8235
8822
  CPU: foreach_tensor_floor_slow_
8236
8823
  CUDA: foreach_tensor_floor_cuda_
8824
+ autogen: _foreach_floor.functional, _foreach_floor.out
8237
8825
 
8238
8826
  - func: _foreach_log(Tensor[] tensors) -> Tensor[]
8239
8827
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8248,6 +8836,7 @@
8248
8836
  dispatch:
8249
8837
  CPU: foreach_tensor_log_slow_
8250
8838
  CUDA: foreach_tensor_log_cuda_
8839
+ autogen: _foreach_log.functional, _foreach_log.out
8251
8840
 
8252
8841
  - func: _foreach_log10(Tensor[] tensors) -> Tensor[]
8253
8842
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8262,6 +8851,7 @@
8262
8851
  dispatch:
8263
8852
  CPU: foreach_tensor_log10_slow_
8264
8853
  CUDA: foreach_tensor_log10_cuda_
8854
+ autogen: _foreach_log10.functional, _foreach_log10.out
8265
8855
 
8266
8856
  - func: _foreach_log1p(Tensor[] tensors) -> Tensor[]
8267
8857
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8276,6 +8866,7 @@
8276
8866
  dispatch:
8277
8867
  CPU: foreach_tensor_log1p_slow_
8278
8868
  CUDA: foreach_tensor_log1p_cuda_
8869
+ autogen: _foreach_log1p.functional, _foreach_log1p.out
8279
8870
 
8280
8871
  - func: _foreach_log2(Tensor[] tensors) -> Tensor[]
8281
8872
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8290,6 +8881,7 @@
8290
8881
  dispatch:
8291
8882
  CPU: foreach_tensor_log2_slow_
8292
8883
  CUDA: foreach_tensor_log2_cuda_
8884
+ autogen: _foreach_log2.functional, _foreach_log2.out
8293
8885
 
8294
8886
  - func: _foreach_neg(Tensor[] tensors) -> Tensor[]
8295
8887
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8304,6 +8896,7 @@
8304
8896
  dispatch:
8305
8897
  CPU: foreach_tensor_neg_slow_
8306
8898
  CUDA: foreach_tensor_neg_cuda_
8899
+ autogen: _foreach_neg.functional, _foreach_neg.out
8307
8900
 
8308
8901
  - func: _foreach_tan(Tensor[] tensors) -> Tensor[]
8309
8902
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8318,6 +8911,7 @@
8318
8911
  dispatch:
8319
8912
  CPU: foreach_tensor_tan_slow_
8320
8913
  CUDA: foreach_tensor_tan_cuda_
8914
+ autogen: _foreach_tan.functional, _foreach_tan.out
8321
8915
 
8322
8916
  - func: _foreach_tanh(Tensor[] tensors) -> Tensor[]
8323
8917
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8332,6 +8926,7 @@
8332
8926
  dispatch:
8333
8927
  CPU: foreach_tensor_tanh_slow_
8334
8928
  CUDA: foreach_tensor_tanh_cuda_
8929
+ autogen: _foreach_tanh.functional, _foreach_tanh.out
8335
8930
 
8336
8931
  - func: _foreach_sin(Tensor[] tensors) -> Tensor[]
8337
8932
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8346,6 +8941,7 @@
8346
8941
  dispatch:
8347
8942
  CPU: foreach_tensor_sin_slow_
8348
8943
  CUDA: foreach_tensor_sin_cuda_
8944
+ autogen: _foreach_sin.functional, _foreach_sin.out
8349
8945
 
8350
8946
  - func: _foreach_sinh(Tensor[] tensors) -> Tensor[]
8351
8947
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8360,6 +8956,7 @@
8360
8956
  dispatch:
8361
8957
  CPU: foreach_tensor_sinh_slow_
8362
8958
  CUDA: foreach_tensor_sinh_cuda_
8959
+ autogen: _foreach_sinh.functional, _foreach_sinh.out
8363
8960
 
8364
8961
  - func: _foreach_round(Tensor[] tensors) -> Tensor[]
8365
8962
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8374,6 +8971,7 @@
8374
8971
  dispatch:
8375
8972
  CPU: foreach_tensor_round_slow_
8376
8973
  CUDA: foreach_tensor_round_cuda_
8974
+ autogen: _foreach_round.functional, _foreach_round.out
8377
8975
 
8378
8976
  - func: _foreach_lgamma(Tensor[] tensors) -> Tensor[]
8379
8977
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8388,6 +8986,7 @@
8388
8986
  dispatch:
8389
8987
  CPU: foreach_tensor_lgamma_slow_
8390
8988
  CUDA: foreach_tensor_lgamma_cuda_
8989
+ autogen: _foreach_lgamma.functional, _foreach_lgamma.out
8391
8990
 
8392
8991
  - func: _foreach_frac(Tensor[] tensors) -> Tensor[]
8393
8992
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8402,6 +9001,7 @@
8402
9001
  dispatch:
8403
9002
  CPU: foreach_tensor_frac_slow_
8404
9003
  CUDA: foreach_tensor_frac_cuda_
9004
+ autogen: _foreach_frac.functional, _foreach_frac.out
8405
9005
 
8406
9006
  - func: _foreach_reciprocal(Tensor[] tensors) -> Tensor[]
8407
9007
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8416,6 +9016,7 @@
8416
9016
  dispatch:
8417
9017
  CPU: foreach_tensor_reciprocal_slow_
8418
9018
  CUDA: foreach_tensor_reciprocal_cuda_
9019
+ autogen: _foreach_reciprocal.functional, _foreach_reciprocal.out
8419
9020
 
8420
9021
  - func: _foreach_sigmoid(Tensor[] tensors) -> Tensor[]
8421
9022
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8430,6 +9031,7 @@
8430
9031
  dispatch:
8431
9032
  CPU: foreach_tensor_sigmoid_slow_
8432
9033
  CUDA: foreach_tensor_sigmoid_cuda_
9034
+ autogen: _foreach_sigmoid.functional, _foreach_sigmoid.out
8433
9035
 
8434
9036
  - func: _foreach_trunc(Tensor[] tensors) -> Tensor[]
8435
9037
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8444,6 +9046,7 @@
8444
9046
  dispatch:
8445
9047
  CPU: foreach_tensor_trunc_slow_
8446
9048
  CUDA: foreach_tensor_trunc_cuda_
9049
+ autogen: _foreach_trunc.functional, _foreach_trunc.out
8447
9050
 
8448
9051
  - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
8449
9052
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8451,6 +9054,7 @@
8451
9054
  dispatch:
8452
9055
  CPU: foreach_tensor_addcdiv_scalar_slow_
8453
9056
  CUDA: foreach_tensor_addcdiv_scalar_cuda_
9057
+ autogen: _foreach_addcdiv.Scalar_functional, _foreach_addcdiv.Scalar_out
8454
9058
 
8455
9059
  - func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
8456
9060
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8458,6 +9062,7 @@
8458
9062
  dispatch:
8459
9063
  CPU: foreach_tensor_addcmul_scalar_slow_
8460
9064
  CUDA: foreach_tensor_addcmul_scalar_cuda_
9065
+ autogen: _foreach_addcmul.Scalar_functional, _foreach_addcmul.Scalar_out
8461
9066
 
8462
9067
  - func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
8463
9068
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8465,6 +9070,7 @@
8465
9070
  dispatch:
8466
9071
  CPU: foreach_tensor_addcdiv_scalarlist_slow_
8467
9072
  CUDA: foreach_tensor_addcdiv_scalarlist_cuda_
9073
+ autogen: _foreach_addcdiv.ScalarList_functional, _foreach_addcdiv.ScalarList_out
8468
9074
 
8469
9075
  - func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
8470
9076
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8472,6 +9078,7 @@
8472
9078
  dispatch:
8473
9079
  CPU: foreach_tensor_addcmul_scalarlist_slow_
8474
9080
  CUDA: foreach_tensor_addcmul_scalarlist_cuda_
9081
+ autogen: _foreach_addcmul.ScalarList_functional, _foreach_addcmul.ScalarList_out
8475
9082
 
8476
9083
  - func: _foreach_addcdiv.Scalar(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
8477
9084
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8584,25 +9191,29 @@
8584
9191
 
8585
9192
  - func: mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
8586
9193
  device_check: NoCheck # TensorIterator
9194
+ structured: True
9195
+ structured_inherits: TensorIteratorBase
8587
9196
  python_module: nn
8588
9197
  dispatch:
8589
9198
  CPU, CUDA: mse_loss_out
9199
+ MPS: mse_loss_out_mps
8590
9200
 
8591
9201
  - func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
8592
9202
  device_check: NoCheck # TensorIterator
9203
+ structured_delegate: mse_loss.out
8593
9204
  python_module: nn
8594
- dispatch:
8595
- CPU, CUDA: mse_loss
8596
9205
 
8597
9206
  - func: mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!)
8598
9207
  python_module: nn
8599
9208
  dispatch:
8600
9209
  CPU, CUDA: mse_loss_backward_out
9210
+ MPS: mse_loss_backward_out_mps
8601
9211
 
8602
9212
  - func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor
8603
9213
  python_module: nn
8604
9214
  dispatch:
8605
9215
  CPU, CUDA: mse_loss_backward
9216
+ MPS: mse_loss_backward_mps
8606
9217
 
8607
9218
  - func: l1_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
8608
9219
  python_module: nn
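
Above, `mse_loss.out` becomes a structured TensorIterator kernel with an MPS variant, and the `mse_loss` forward now delegates to it (`structured_delegate: mse_loss.out`). The user-facing call is unchanged; for reference:

```python
import torch
import torch.nn.functional as F

pred = torch.randn(8, 4, requires_grad=True)
target = torch.randn(8, 4)
loss = F.mse_loss(pred, target, reduction="mean")
loss.backward()
print(loss.item(), pred.grad.shape)
```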
@@ -8693,6 +9304,7 @@
8693
9304
  dispatch:
8694
9305
  CPU: nll_loss_forward_out_cpu
8695
9306
  CUDA: nll_loss_forward_out_cuda
9307
+ MPS: nll_loss_forward_out_mps
8696
9308
 
8697
9309
  - func: nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight)
8698
9310
  python_module: nn
@@ -8704,6 +9316,7 @@
8704
9316
  dispatch:
8705
9317
  CPU: nll_loss_backward_out_cpu
8706
9318
  CUDA: nll_loss_backward_out_cuda
9319
+ MPS: nll_loss_backward_out_mps
8707
9320
 
8708
9321
  - func: nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor
8709
9322
  python_module: nn
@@ -8720,24 +9333,28 @@
8720
9333
  dispatch:
8721
9334
  CPU: nll_loss2d_forward_out_cpu
8722
9335
  CUDA: nll_loss2d_forward_out_cuda
9336
+ MPS: nll_loss2d_forward_out_mps
8723
9337
 
8724
9338
  - func: nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight)
8725
9339
  python_module: nn
8726
9340
  dispatch:
8727
9341
  CPU: nll_loss2d_forward_cpu
8728
9342
  CUDA: nll_loss2d_forward_cuda
9343
+ MPS: nll_loss2d_forward_mps
8729
9344
 
8730
9345
  - func: nll_loss2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!)
8731
9346
  python_module: nn
8732
9347
  dispatch:
8733
9348
  CPU: nll_loss2d_backward_out_cpu
8734
9349
  CUDA: nll_loss2d_backward_out_cuda
9350
+ MPS: nll_loss2d_backward_out_mps
8735
9351
 
8736
9352
  - func: nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor
8737
9353
  python_module: nn
8738
9354
  dispatch:
8739
9355
  CPU: nll_loss2d_backward_cpu
8740
9356
  CUDA: nll_loss2d_backward_cuda
9357
+ MPS: nll_loss2d_backward_mps
8741
9358
 
8742
9359
  - func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!)
8743
9360
  device_check: NoCheck # TensorIterator
@@ -8746,6 +9363,7 @@
8746
9363
  python_module: nn
8747
9364
  dispatch:
8748
9365
  CPU, CUDA: smooth_l1_loss_out
9366
+ MPS: smooth_l1_loss_out_mps
8749
9367
 
8750
9368
  - func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor
8751
9369
  device_check: NoCheck # TensorIterator
@@ -8757,6 +9375,7 @@
8757
9375
  dispatch:
8758
9376
  CPU: smooth_l1_loss_backward_out
8759
9377
  CUDA: smooth_l1_loss_backward_out
9378
+ MPS: smooth_l1_loss_backward_out_mps
8760
9379
 
8761
9380
  - func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor
8762
9381
  python_module: nn
@@ -8810,6 +9429,7 @@
8810
9429
  python_module: nn
8811
9430
  dispatch:
8812
9431
  CPU, CUDA: elu_out
9432
+ MPS: elu_out_mps
8813
9433
 
8814
9434
  - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor
8815
9435
  structured_delegate: elu.out
@@ -8822,6 +9442,7 @@
8822
9442
  python_module: nn
8823
9443
  dispatch:
8824
9444
  CPU, CUDA: elu_backward_out
9445
+ MPS: elu_backward_out_mps
8825
9446
 
8826
9447
  - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor
8827
9448
  structured_delegate: elu_backward.grad_input
@@ -8858,6 +9479,16 @@
8858
9479
  CPU: glu_backward_cpu
8859
9480
  CUDA: glu_backward_cuda
8860
9481
 
9482
+ - func: glu_jvp(Tensor glu, Tensor x, Tensor dx, int dim) -> Tensor
9483
+ python_module: nn
9484
+ dispatch:
9485
+ CPU, CUDA: glu_jvp
9486
+
9487
+ - func: glu_backward_jvp(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim) -> Tensor
9488
+ python_module: nn
9489
+ dispatch:
9490
+ CPU, CUDA: glu_backward_jvp
9491
+
8861
9492
  - func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
8862
9493
  structured: True
8863
9494
  structured_inherits: TensorIteratorBase
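
The `glu_jvp` and `glu_backward_jvp` entries added above are helper kernels whose names indicate Jacobian-vector-product (forward-mode AD) support for `glu`. A hedged sketch of the kind of computation that relies on such kernels, using the public `torch.autograd.forward_ad` API:

```python
import torch
import torch.nn.functional as F
import torch.autograd.forward_ad as fwAD

x = torch.randn(4, 6)
dx = torch.randn(4, 6)               # tangent: direction to differentiate along
with fwAD.dual_level():
    dual_x = fwAD.make_dual(x, dx)
    dual_y = F.glu(dual_x, dim=-1)   # forward pass propagates the JVP
    y, jvp = fwAD.unpack_dual(dual_y)
print(y.shape, jvp.shape)            # both torch.Size([4, 3])
```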
@@ -8894,31 +9525,33 @@
8894
9525
  device_check: NoCheck # TensorIterator
8895
9526
  python_module: nn
8896
9527
  dispatch:
8897
- CPU, CUDA: hardtanh_out
9528
+ CPU, CUDA, MPS: hardtanh_out
8898
9529
  QuantizedCPU: hardtanh_out_quantized_cpu
8899
9530
 
8900
9531
  - func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor
8901
9532
  device_check: NoCheck # TensorIterator
8902
9533
  python_module: nn
8903
9534
  dispatch:
8904
- CPU, CUDA: hardtanh
9535
+ CPU, CUDA, MPS: hardtanh
8905
9536
  QuantizedCPU: hardtanh_quantized_cpu
8906
9537
 
8907
9538
  - func: hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!)
8908
9539
  python_module: nn
8909
9540
  dispatch:
8910
9541
  CPU, CUDA: hardtanh_backward_out
9542
+ MPS: hardtanh_backward_out_mps
8911
9543
 
8912
9544
  - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor
8913
9545
  python_module: nn
8914
9546
  dispatch:
8915
9547
  CPU, CUDA: hardtanh_backward
9548
+ MPS: hardtanh_backward_mps
8916
9549
 
8917
9550
  - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!)
8918
9551
  device_check: NoCheck # TensorIterator
8919
9552
  python_module: nn
8920
9553
  dispatch:
8921
- CPU, CUDA: hardtanh_
9554
+ CPU, CUDA, MPS: hardtanh_
8922
9555
  QuantizedCPU: hardtanh_quantized_cpu_
8923
9556
 
8924
9557
  - func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -8951,6 +9584,7 @@
8951
9584
  python_module: nn
8952
9585
  dispatch:
8953
9586
  CPU, CUDA: leaky_relu_out
9587
+ MPS: leaky_relu_out_mps
8954
9588
  QuantizedCPU: leaky_relu_out_quantized_cpu
8955
9589
 
8956
9590
  - func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor
@@ -8966,6 +9600,7 @@
8966
9600
  python_module: nn
8967
9601
  dispatch:
8968
9602
  CPU, CUDA: leaky_relu_backward_out
9603
+ MPS: leaky_relu_backward_out_mps
8969
9604
 
8970
9605
  - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor
8971
9606
  structured_delegate: leaky_relu_backward.grad_input
@@ -9088,6 +9723,7 @@
9088
9723
  dispatch:
9089
9724
  CPU: adaptive_avg_pool2d_out_cpu
9090
9725
  CUDA: adaptive_avg_pool2d_out_cuda
9726
+ MPS: adaptive_avg_pool2d_out_mps
9091
9727
  MkldnnCPU: mkldnn_adaptive_avg_pool2d_out
9092
9728
 
9093
9729
  - func: adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor
@@ -9105,13 +9741,16 @@
9105
9741
  dispatch:
9106
9742
  CPU: adaptive_avg_pool2d_cpu
9107
9743
  CUDA: adaptive_avg_pool2d_cuda
9744
+ MPS: adaptive_avg_pool2d_mps
9108
9745
  QuantizedCPU: adaptive_avg_pool2d_quantized_cpu
9746
+ QuantizedCUDA: adaptive_avg_pool2d_quantized_cuda
9109
9747
 
9110
9748
  - func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor
9111
9749
  python_module: nn
9112
9750
  dispatch:
9113
9751
  CPU: adaptive_avg_pool2d_backward_cpu
9114
9752
  CUDA: adaptive_avg_pool2d_backward_cuda
9753
+ MPS: adaptive_avg_pool2d_backward_mps
9115
9754
 
9116
9755
  - func: adaptive_avg_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out) -> Tensor(a!)
9117
9756
  python_module: nn
@@ -9148,6 +9787,7 @@
9148
9787
  dispatch:
9149
9788
  CPU: adaptive_max_pool2d_out_cpu
9150
9789
  CUDA: adaptive_max_pool2d_out_cuda
9790
+ MPS: adaptive_max_pool2d_out_mps
9151
9791
 
9152
9792
  # Return: (Tensor output, Tensor indices)
9153
9793
  - func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor)
@@ -9160,6 +9800,7 @@
9160
9800
  dispatch:
9161
9801
  CPU: adaptive_max_pool2d_backward_out_cpu
9162
9802
  CUDA: adaptive_max_pool2d_backward_out_cuda
9803
+ MPS: adaptive_max_pool2d_backward_out_mps
9163
9804
 
9164
9805
  - func: adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor
9165
9806
  python_module: nn
@@ -9199,6 +9840,7 @@
9199
9840
  dispatch:
9200
9841
  CPU: avg_pool2d_out_cpu
9201
9842
  CUDA: avg_pool2d_out_cuda
9843
+ MPS: avg_pool2d_out_mps
9202
9844
  MkldnnCPU: mkldnn_avg_pool2d_out
9203
9845
 
9204
9846
  - func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor
@@ -9214,6 +9856,7 @@
9214
9856
  dispatch:
9215
9857
  CPU: avg_pool2d_backward_out_cpu
9216
9858
  CUDA: avg_pool2d_backward_out_cuda
9859
+ MPS: avg_pool2d_backward_out_mps
9217
9860
  MkldnnCPU: mkldnn_avg_pool2d_backward_out
9218
9861
 
9219
9862
  - func: avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor
@@ -9282,6 +9925,7 @@
9282
9925
  precomputed:
9283
9926
  - kernel_size -> int poolSizeT, int poolSizeH, int poolSizeW
9284
9927
  - output_size -> int outputT, int outputH, int outputW
9928
+ - int numBatch, int numPlanes, int inputT, int inputH, int inputW
9285
9929
  dispatch:
9286
9930
  CPU: fractional_max_pool3d_out_cpu
9287
9931
  CUDA: fractional_max_pool3d_out_cuda
@@ -9310,6 +9954,7 @@
9310
9954
  dispatch:
9311
9955
  CPU: max_pool2d_with_indices_out_cpu
9312
9956
  CUDA: max_pool2d_with_indices_out_cuda
9957
+ MPS: max_pool2d_with_indices_out_mps
9313
9958
 
9314
9959
  # Return: (Tensor output, Tensor indices)
9315
9960
  - func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
@@ -9322,6 +9967,7 @@
9322
9967
  dispatch:
9323
9968
  CPU: max_pool2d_with_indices_backward_out_cpu
9324
9969
  CUDA: max_pool2d_with_indices_backward_out_cuda
9970
+ MPS: max_pool2d_with_indices_backward_out_mps
9325
9971
 
9326
9972
  - func: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor
9327
9973
  python_module: nn
@@ -9365,18 +10011,6 @@
9365
10011
  CPU: max_unpooling2d_forward_cpu
9366
10012
  CUDA: max_unpooling2d_forward_cuda
9367
10013
 
9368
- - func: max_unpool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) grad_input) -> Tensor(a!)
9369
- python_module: nn
9370
- dispatch:
9371
- CPU: max_unpooling2d_backward_out_cpu
9372
- CUDA: max_unpooling2d_backward_out_cuda
9373
-
9374
- - func: max_unpool2d_backward(Tensor grad_output, Tensor self, Tensor indices, int[2] output_size) -> Tensor
9375
- python_module: nn
9376
- dispatch:
9377
- CPU: max_unpooling2d_backward_cpu
9378
- CUDA: max_unpooling2d_backward_cuda
9379
-
9380
10014
  - func: max_unpool3d.out(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!)
9381
10015
  python_module: nn
9382
10016
  dispatch:
@@ -9389,30 +10023,18 @@
9389
10023
  CPU: max_unpooling3d_forward_cpu
9390
10024
  CUDA: max_unpooling3d_forward_cuda
9391
10025
 
9392
- - func: max_unpool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
9393
- python_module: nn
9394
- dispatch:
9395
- CPU: max_unpooling3d_backward_out_cpu
9396
- CUDA: max_unpooling3d_backward_out_cuda
9397
-
9398
- - func: max_unpool3d_backward(Tensor grad_output, Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor
9399
- python_module: nn
9400
- dispatch:
9401
- CPU: max_unpooling3d_backward_cpu
9402
- CUDA: max_unpooling3d_backward_cuda
9403
-
9404
10026
  - func: reflection_pad1d.out(Tensor self, int[2] padding, *, Tensor(a!) out) -> Tensor(a!)
9405
10027
  python_module: nn
9406
10028
  structured: True
9407
10029
  dispatch:
9408
- CPU, QuantizedCPU: reflection_pad1d_out_cpu
10030
+ CPU: reflection_pad1d_out_cpu
10031
+ QuantizedCPU: reflection_pad1d_out_quantized_cpu
9409
10032
  CUDA: reflection_pad1d_out_cuda
10033
+ MPS: reflection_pad1d_out_mps
9410
10034
 
9411
10035
  - func: reflection_pad1d(Tensor self, int[2] padding) -> Tensor
9412
10036
  python_module: nn
9413
10037
  structured_delegate: reflection_pad1d.out
9414
- dispatch:
9415
- QuantizedCPU: reflection_pad1d_cpu
9416
10038
 
9417
10039
  - func: reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, int[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
9418
10040
  python_module: nn
@@ -9420,6 +10042,7 @@
9420
10042
  dispatch:
9421
10043
  CPU: reflection_pad1d_backward_out_cpu
9422
10044
  CUDA: reflection_pad1d_backward_out_cuda
10045
+ MPS: reflection_pad1d_backward_out_mps
9423
10046
 
9424
10047
  - func: reflection_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor
9425
10048
  python_module: nn
@@ -9430,24 +10053,29 @@
9430
10053
  dispatch:
9431
10054
  CPU, QuantizedCPU: reflection_pad2d_out_cpu
9432
10055
  CUDA: reflection_pad2d_out_cuda
10056
+ MPS: reflection_pad2d_out_mps
9433
10057
 
9434
10058
  - func: reflection_pad2d(Tensor self, int[4] padding) -> Tensor
9435
10059
  python_module: nn
9436
10060
  dispatch:
9437
- CPU, QuantizedCPU: reflection_pad2d_cpu
10061
+ CPU: reflection_pad2d_cpu
10062
+ QuantizedCPU: reflection_pad2d_quantized_cpu
9438
10063
  CUDA: reflection_pad2d_cuda
10064
+ MPS: reflection_pad2d_mps
9439
10065
 
9440
10066
  - func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
9441
10067
  python_module: nn
9442
10068
  dispatch:
9443
10069
  CPU: reflection_pad2d_backward_out_cpu
9444
10070
  CUDA: reflection_pad2d_backward_out_cuda
10071
+ MPS: reflection_pad2d_backward_out_mps
9445
10072
 
9446
10073
  - func: reflection_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor
9447
10074
  python_module: nn
9448
10075
  dispatch:
9449
10076
  CPU: reflection_pad2d_backward_cpu
9450
10077
  CUDA: reflection_pad2d_backward_cuda
10078
+ MPS: reflection_pad2d_backward_mps
9451
10079
 
9452
10080
  - func: reflection_pad3d.out(Tensor self, int[6] padding, *, Tensor(a!) out) -> Tensor(a!)
9453
10081
  python_module: nn
@@ -9455,6 +10083,7 @@
9455
10083
  dispatch:
9456
10084
  CPU: reflection_pad3d_out_cpu
9457
10085
  CUDA: reflection_pad3d_out_cuda
10086
+ MPS: reflection_pad3d_out_mps
9458
10087
 
9459
10088
  - func: reflection_pad3d(Tensor self, int[6] padding) -> Tensor
9460
10089
  python_module: nn
@@ -9466,6 +10095,7 @@
9466
10095
  dispatch:
9467
10096
  CPU: reflection_pad3d_backward_out_cpu
9468
10097
  CUDA: reflection_pad3d_backward_out_cuda
10098
+ MPS: reflection_pad3d_backward_out_mps
9469
10099
 
9470
10100
  - func: reflection_pad3d_backward(Tensor grad_output, Tensor self, int[6] padding) -> Tensor
9471
10101
  python_module: nn
@@ -9477,6 +10107,7 @@
9477
10107
  dispatch:
9478
10108
  CPU: replication_pad1d_out_cpu
9479
10109
  CUDA: replication_pad1d_out_cuda
10110
+ MPS: replication_pad1d_out_mps
9480
10111
 
9481
10112
  - func: replication_pad1d(Tensor self, int[2] padding) -> Tensor
9482
10113
  python_module: nn
@@ -9488,6 +10119,7 @@
9488
10119
  dispatch:
9489
10120
  CPU: replication_pad1d_backward_out_cpu
9490
10121
  CUDA: replication_pad1d_backward_out_cuda
10122
+ MPS: replication_pad1d_backward_out_mps
9491
10123
 
9492
10124
  - func: replication_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor
9493
10125
  python_module: nn
@@ -9499,6 +10131,7 @@
9499
10131
  dispatch:
9500
10132
  CPU: replication_pad2d_out_cpu
9501
10133
  CUDA: replication_pad2d_out_cuda
10134
+ MPS: replication_pad2d_out_mps
9502
10135
 
9503
10136
  - func: replication_pad2d(Tensor self, int[4] padding) -> Tensor
9504
10137
  python_module: nn
@@ -9509,12 +10142,14 @@
9509
10142
  dispatch:
9510
10143
  CPU: replication_pad2d_backward_out_cpu
9511
10144
  CUDA: replication_pad2d_backward_out_cuda
10145
+ MPS: replication_pad2d_backward_out_mps
9512
10146
 
9513
10147
  - func: replication_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor
9514
10148
  python_module: nn
9515
10149
  dispatch:
9516
10150
  CPU: replication_pad2d_backward_cpu
9517
10151
  CUDA: replication_pad2d_backward_cuda
10152
+ MPS: replication_pad2d_backward_mps
9518
10153
 
9519
10154
  - func: replication_pad3d.out(Tensor self, int[6] padding, *, Tensor(a!) out) -> Tensor(a!)
9520
10155
  python_module: nn
@@ -9522,6 +10157,7 @@
9522
10157
  dispatch:
9523
10158
  CPU: replication_pad3d_out_cpu
9524
10159
  CUDA: replication_pad3d_out_cuda
10160
+ MPS: replication_pad3d_out_mps
9525
10161
 
9526
10162
  - func: replication_pad3d(Tensor self, int[6] padding) -> Tensor
9527
10163
  python_module: nn
@@ -9532,12 +10168,23 @@
9532
10168
  dispatch:
9533
10169
  CPU: replication_pad3d_backward_out_cpu
9534
10170
  CUDA: replication_pad3d_backward_out_cuda
10171
+ MPS: replication_pad3d_backward_out_mps
9535
10172
 
9536
10173
  - func: replication_pad3d_backward(Tensor grad_output, Tensor self, int[6] padding) -> Tensor
9537
10174
  python_module: nn
9538
10175
  dispatch:
9539
10176
  CPU: replication_pad3d_backward_cpu
9540
10177
  CUDA: replication_pad3d_backward_cuda
10178
+ MPS: replication_pad3d_backward_mps
10179
+
10180
+ - func: _pad_circular(Tensor self, int[] pad) -> Tensor
10181
+ python_module: nn
10182
+
10183
+ - func: _pad_enum(Tensor self, int[] pad, int mode, float? value=None) -> Tensor
10184
+ python_module: nn
10185
+
10186
+ - func: pad(Tensor self, int[] pad, str mode="constant", float? value=None) -> Tensor
10187
+ python_module: nn
9541
10188
 
9542
10189
  - func: upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
9543
10190
  python_module: nn
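
The new `_pad_circular`, `_pad_enum`, and `pad` entries above appear to give the padding helpers a native entry point; the `pad` schema matches `torch.nn.functional.pad`. For reference:

```python
import torch
import torch.nn.functional as F

x = torch.arange(6.0).reshape(2, 3)
# Padding is given per dimension, starting from the last dimension:
# (left, right, top, bottom) here.
y = F.pad(x, (1, 1, 0, 2), mode="constant", value=0.0)
print(y.shape)  # torch.Size([4, 5])
```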
@@ -9694,6 +10341,7 @@
9694
10341
  dispatch:
9695
10342
  CPU: upsample_bilinear2d_out_cpu
9696
10343
  CUDA: upsample_bilinear2d_out_cuda
10344
+ MPS: upsample_bilinear2d_out_mps
9697
10345
 
9698
10346
  - func: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
9699
10347
  python_module: nn
@@ -9707,6 +10355,7 @@
9707
10355
  dispatch:
9708
10356
  CPU: upsample_bilinear2d_backward_out_cpu
9709
10357
  CUDA: upsample_bilinear2d_backward_out_cuda
10358
+ MPS: upsample_bilinear2d_backward_out_mps
9710
10359
 
9711
10360
  - func: upsample_bilinear2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
9712
10361
  python_module: nn
@@ -9850,6 +10499,7 @@
9850
10499
  dispatch:
9851
10500
  CPU: upsample_nearest2d_out_cpu
9852
10501
  CUDA: upsample_nearest2d_out_cuda
10502
+ MPS: upsample_nearest2d_out_mps
9853
10503
 
9854
10504
  - func: _upsample_nearest_exact2d.out(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
9855
10505
  python_module: nn
@@ -9857,6 +10507,7 @@
9857
10507
  dispatch:
9858
10508
  CPU: _upsample_nearest_exact2d_out_cpu
9859
10509
  CUDA: _upsample_nearest_exact2d_out_cuda
10510
+ MPS: _upsample_nearest_exact2d_out_mps
9860
10511
 
9861
10512
  - func: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor
9862
10513
  python_module: nn
@@ -9876,6 +10527,7 @@
9876
10527
  dispatch:
9877
10528
  CPU: upsample_nearest2d_backward_out_cpu
9878
10529
  CUDA: upsample_nearest2d_backward_out_cuda
10530
+ MPS: upsample_nearest2d_backward_out_mps
9879
10531
 
9880
10532
  - func: _upsample_nearest_exact2d_backward.grad_input(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
9881
10533
  python_module: nn
@@ -9883,6 +10535,7 @@
9883
10535
  dispatch:
9884
10536
  CPU: _upsample_nearest_exact2d_backward_out_cpu
9885
10537
  CUDA: _upsample_nearest_exact2d_backward_out_cuda
10538
+ MPS: _upsample_nearest_exact2d_backward_out_mps
9886
10539
 
9887
10540
  - func: upsample_nearest2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor
9888
10541
  python_module: nn
@@ -9946,6 +10599,7 @@
9946
10599
  structured_inherits: TensorIteratorBase
9947
10600
  dispatch:
9948
10601
  CPU, CUDA: sigmoid_backward_out
10602
+ MPS: sigmoid_backward_out_mps
9949
10603
 
9950
10604
  - func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor
9951
10605
  python_module: nn
@@ -9968,6 +10622,7 @@
9968
10622
  structured_inherits: TensorIteratorBase
9969
10623
  dispatch:
9970
10624
  CPU, CUDA: tanh_backward_out
10625
+ MPS: tanh_backward_out_mps
9971
10626
 
9972
10627
  - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor
9973
10628
  python_module: nn
@@ -10233,6 +10888,19 @@
10233
10888
  dispatch:
10234
10889
  CPU, CUDA: special_ndtri_out
10235
10890
 
10891
+ - func: special_log_ndtr(Tensor self) -> Tensor
10892
+ structured_delegate: special_log_ndtr.out
10893
+ python_module: special
10894
+ variants: function
10895
+
10896
+ - func: special_log_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
10897
+ structured: True
10898
+ structured_inherits: TensorIteratorBase
10899
+ python_module: special
10900
+ variants: function
10901
+ dispatch:
10902
+ CPU, CUDA: special_log_ndtr_out
10903
+
10236
10904
  - func: special_expm1(Tensor self) -> Tensor
10237
10905
  python_module: special
10238
10906
  variants: function
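
`special_log_ndtr`, added above as a structured kernel, computes the log of the standard normal CDF and is exposed upstream as `torch.special.log_ndtr`; it stays accurate in the tails where `ndtr(x).log()` would underflow:

```python
import torch

x = torch.tensor([-10.0, 0.0, 10.0])
print(torch.special.log_ndtr(x))
# approx. [-53.23, -0.6931, -7.6e-24]; the middle value is log(0.5)
```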
@@ -10486,7 +11154,7 @@
10486
11154
 
10487
11155
  - func: special_polygamma(int n, Tensor self) -> Tensor
10488
11156
  python_module: special
10489
- variants: function, method
11157
+ variants: function
10490
11158
 
10491
11159
  - func: special_polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
10492
11160
  python_module: special
@@ -10782,11 +11450,15 @@
10782
11450
  - func: linalg_cross(Tensor self, Tensor other, *, int dim=-1) -> Tensor
10783
11451
  python_module: linalg
10784
11452
  variants: function
11453
+ structured_delegate: linalg_cross.out
10785
11454
  dispatch:
10786
- CPU, CUDA: linalg_cross
11455
+ ZeroTensor: linalg_cross_zerotensor
10787
11456
 
10788
11457
  - func: linalg_cross.out(Tensor self, Tensor other, *, int dim=-1, Tensor(a!) out) -> Tensor(a!)
10789
11458
  python_module: linalg
11459
+ structured: True
11460
+ precomputed:
11461
+ - dim -> int dim
10790
11462
  dispatch:
10791
11463
  CPU, CUDA: linalg_cross_out
10792
11464
 
@@ -10811,6 +11483,20 @@
10811
11483
  dispatch:
10812
11484
  CPU, CUDA: linalg_lu_factor_ex_out
10813
11485
 
11486
+ # linalg.lu
11487
+ - func: linalg_lu(Tensor A, *, bool pivot=True) -> (Tensor P, Tensor L, Tensor U)
11488
+ python_module: linalg
11489
+ structured_delegate: linalg_lu.out
11490
+ variants: function
11491
+
11492
+ - func: linalg_lu.out(Tensor A, *, bool pivot=True, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U)
11493
+ python_module: linalg
11494
+ variants: function
11495
+ structured: True
11496
+ dispatch:
11497
+ CPU, CUDA: linalg_lu_out
11498
+
11499
+ # linalg.det
10814
11500
  - func: linalg_det(Tensor self) -> Tensor
10815
11501
  python_module: linalg
10816
11502
  variants: function
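
The `linalg_lu` entries above add a structured kernel that returns an explicit P, L, U factorization (upstream: `torch.linalg.lu`), complementing the existing `linalg_lu_factor_ex` path. A brief sketch:

```python
import torch

A = torch.randn(4, 4)
P, L, U = torch.linalg.lu(A)                    # pivot=True by default
print(torch.allclose(P @ L @ U, A, atol=1e-5))  # A = P L U
```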
@@ -10832,6 +11518,38 @@
10832
11518
  dispatch:
10833
11519
  CPU, CUDA: _det_lu_based_helper_backward_helper
10834
11520
 
11521
+ - func: linalg_ldl_factor_ex(Tensor self, *, bool hermitian=False, bool check_errors=False) -> (Tensor LD, Tensor pivots, Tensor info)
11522
+ structured_delegate: linalg_ldl_factor_ex.out
11523
+ python_module: linalg
11524
+ variants: function
11525
+
11526
+ - func: linalg_ldl_factor_ex.out(Tensor self, *, bool hermitian=False, bool check_errors=False, Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info)
11527
+ structured: True
11528
+ python_module: linalg
11529
+ variants: function
11530
+ dispatch:
11531
+ CPU, CUDA: linalg_ldl_factor_ex_out
11532
+
11533
+ - func: linalg_ldl_factor(Tensor self, *, bool hermitian=False) -> (Tensor LD, Tensor pivots)
11534
+ python_module: linalg
11535
+ variants: function
11536
+
11537
+ - func: linalg_ldl_factor.out(Tensor self, *, bool hermitian=False, Tensor(a!) LD, Tensor(b!) pivots) -> (Tensor(a!) LD, Tensor(b!) pivots)
11538
+ python_module: linalg
11539
+ variants: function
11540
+
11541
+ - func: linalg_ldl_solve(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False) -> Tensor
11542
+ structured_delegate: linalg_ldl_solve.out
11543
+ python_module: linalg
11544
+ variants: function
11545
+
11546
+ - func: linalg_ldl_solve.out(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)
11547
+ structured: True
11548
+ python_module: linalg
11549
+ variants: function
11550
+ dispatch:
11551
+ CPU, CUDA: linalg_ldl_solve_out
11552
+
10835
11553
  - func: linalg_lstsq(Tensor self, Tensor b, float? rcond=None, *, str? driver=None) -> (Tensor solution, Tensor residuals, Tensor rank, Tensor singular_values)
10836
11554
  python_module: linalg
10837
11555
  variants: function
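
The block above introduces `linalg_ldl_factor_ex`, `linalg_ldl_factor`, and `linalg_ldl_solve`, which upstream back `torch.linalg.ldl_factor` / `torch.linalg.ldl_solve` for symmetric (or, with `hermitian=True`, Hermitian) systems. A hedged sketch:

```python
import torch

A = torch.randn(4, 4)
A = A + A.mT                            # symmetrize
b = torch.randn(4, 2)
LD, pivots = torch.linalg.ldl_factor(A)
x = torch.linalg.ldl_solve(LD, pivots, b)
print(torch.allclose(A @ x, b, atol=1e-4))
```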
@@ -10901,7 +11619,7 @@
10901
11619
  python_module: linalg
10902
11620
  variants: function
10903
11621
 
10904
- - func: linalg_eigvalsh.out(Tensor self, str UPLO='L', *, Tensor(a!) out) -> Tensor(a!)
11622
+ - func: linalg_eigvalsh.out(Tensor self, str UPLO="L", *, Tensor(a!) out) -> Tensor(a!)
10905
11623
  python_module: linalg
10906
11624
  dispatch:
10907
11625
  CPU, CUDA: linalg_eigvalsh_out
@@ -10922,6 +11640,7 @@
10922
11640
  dispatch:
10923
11641
  CPU: _linalg_inv_out_helper_cpu
10924
11642
  CUDA: _linalg_inv_out_helper_cuda
11643
+ autogen: _linalg_inv_out_helper.functional, _linalg_inv_out_helper.out
10925
11644
 
10926
11645
  - func: linalg_inv_ex(Tensor self, *, bool check_errors=False) -> (Tensor inverse, Tensor info)
10927
11646
  python_module: linalg
@@ -10978,11 +11697,11 @@
10978
11697
  - func: linalg_vector_norm(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
10979
11698
  python_module: linalg
10980
11699
  variants: function
10981
- dispatch:
10982
- CPU, CUDA: linalg_vector_norm
11700
+ structured_delegate: linalg_vector_norm.out
10983
11701
 
10984
11702
  - func: linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
10985
11703
  python_module: linalg
11704
+ structured: True
10986
11705
  dispatch:
10987
11706
  CPU, CUDA: linalg_vector_norm_out
10988
11707
 
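
linalg_vector_norm switches from a plain CPU/CUDA dispatch entry to a structured op that delegates to linalg_vector_norm.out; the public behaviour is unchanged, this only moves shape/dtype checking and output allocation into the structured-kernel machinery. For reference, a small usage sketch of the corresponding torch.linalg.vector_norm binding:

import torch

x = torch.tensor([[3.0, 4.0],
                  [0.0, 12.0]])
print(torch.linalg.vector_norm(x))                # tensor(13.) -- flattens the input, ord=2 by default
print(torch.linalg.vector_norm(x, ord=1, dim=1))  # tensor([ 7., 12.])
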
@@ -11106,13 +11825,13 @@
11106
11825
  python_module: linalg
11107
11826
  variants: function
11108
11827
 
11109
- - func: linalg_qr(Tensor self, str mode='reduced') -> (Tensor Q, Tensor R)
11828
+ - func: linalg_qr(Tensor A, str mode='reduced') -> (Tensor Q, Tensor R)
11110
11829
  python_module: linalg
11111
11830
  variants: function
11112
11831
  dispatch:
11113
11832
  CompositeExplicitAutograd: linalg_qr
11114
11833
 
11115
- - func: linalg_qr.out(Tensor self, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)
11834
+ - func: linalg_qr.out(Tensor A, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)
11116
11835
  python_module: linalg
11117
11836
  variants: function
11118
11837
  dispatch:
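
In the two linalg_qr entries the input parameter is renamed from self to A, aligning the schema with the rest of torch.linalg; the decomposition itself is untouched. A quick sketch of the unchanged behaviour (positional call shown, since the keyword spelling is a binding detail):

import torch

A = torch.randn(5, 3)
Q, R = torch.linalg.qr(A, mode='reduced')     # Q: (5, 3) with orthonormal columns, R: (3, 3) upper triangular
assert torch.allclose(Q @ R, A, atol=1e-5)
assert torch.allclose(Q.mT @ Q, torch.eye(3), atol=1e-5)
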
@@ -11232,3 +11951,447 @@
11232
11951
  - func: unflatten_dense_tensors(Tensor flat, Tensor[] tensors) -> Tensor[]
11233
11952
  variants: function
11234
11953
  python_module: nn
11954
+
11955
+ - func: nested_tensor(Tensor[] list, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
11956
+ variants: function
11957
+
11958
+ - func: _fw_primal_copy(Tensor self, int level) -> Tensor
11959
+ variants: function
11960
+ dispatch:
11961
+ CompositeExplicitAutograd: _fw_primal_copy
11962
+ tags: view_copy
11963
+
11964
+ - func: _make_dual_copy(Tensor primal, Tensor tangent, int level) -> Tensor
11965
+ variants: function
11966
+ dispatch:
11967
+ CompositeExplicitAutograd: _make_dual_copy
11968
+ tags: view_copy
11969
+
11970
+ - func: view_as_real_copy(Tensor self) -> Tensor
11971
+ variants: function
11972
+ dispatch:
11973
+ CompositeExplicitAutograd: view_as_real_copy
11974
+ tags: view_copy
11975
+
11976
+ - func: view_as_complex_copy(Tensor self) -> Tensor
11977
+ variants: function
11978
+ dispatch:
11979
+ CompositeExplicitAutograd: view_as_complex_copy
11980
+ tags: view_copy
11981
+
11982
+ - func: _conj_copy(Tensor self) -> Tensor
11983
+ variants: function
11984
+ dispatch:
11985
+ CompositeExplicitAutograd: _conj_copy
11986
+ tags: view_copy
11987
+
11988
+ - func: _neg_view_copy(Tensor self) -> Tensor
11989
+ variants: function
11990
+ dispatch:
11991
+ CompositeExplicitAutograd: _neg_view_copy
11992
+ tags: view_copy
11993
+
11994
+ - func: as_strided_copy(Tensor self, int[] size, int[] stride, int? storage_offset=None) -> Tensor
11995
+ variants: function
11996
+ dispatch:
11997
+ CompositeExplicitAutograd: as_strided_copy
11998
+ tags: view_copy
11999
+
12000
+ - func: _sparse_broadcast_to_copy(Tensor self, int[] size) -> Tensor
12001
+ variants: function
12002
+ dispatch:
12003
+ CompositeExplicitAutograd: _sparse_broadcast_to_copy
12004
+ tags: view_copy
12005
+
12006
+ - func: diagonal_copy(Tensor self, int offset=0, int dim1=0, int dim2=1) -> Tensor
12007
+ variants: function
12008
+ dispatch:
12009
+ CompositeExplicitAutograd: diagonal_copy
12010
+ tags: view_copy
12011
+
12012
+ - func: expand_copy(Tensor self, int[] size, *, bool implicit=False) -> Tensor
12013
+ variants: function
12014
+ dispatch:
12015
+ CompositeExplicitAutograd: expand_copy
12016
+ tags: view_copy
12017
+
12018
+ - func: expand_copy.SymInt(Tensor self, SymInt[] size, *, bool implicit=False) -> Tensor
12019
+ variants: function
12020
+ dispatch:
12021
+ CompositeExplicitAutograd: expand_copy_SymInt
12022
+ tags: view_copy
12023
+
12024
+ - func: permute_copy(Tensor self, int[] dims) -> Tensor
12025
+ variants: function
12026
+ dispatch:
12027
+ CompositeExplicitAutograd: permute_copy
12028
+ tags: view_copy
12029
+
12030
+ - func: _reshape_alias_copy(Tensor self, int[] size, int[] stride) -> Tensor
12031
+ variants: function
12032
+ dispatch:
12033
+ CompositeExplicitAutograd: _reshape_alias_copy
12034
+ tags: view_copy
12035
+
12036
+ - func: select_copy.int(Tensor self, int dim, int index) -> Tensor
12037
+ variants: function
12038
+ dispatch:
12039
+ CompositeExplicitAutograd: select_copy_int
12040
+ tags: view_copy
12041
+
12042
+ - func: detach_copy(Tensor self) -> Tensor
12043
+ variants: function
12044
+ dispatch:
12045
+ CompositeExplicitAutograd: detach_copy
12046
+ tags: view_copy
12047
+
12048
+ - func: slice_copy.Tensor(Tensor self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor
12049
+ variants: function
12050
+ dispatch:
12051
+ CompositeExplicitAutograd: slice_copy_Tensor
12052
+ tags: view_copy
12053
+
12054
+ - func: split_copy.Tensor(Tensor self, int split_size, int dim=0) -> Tensor[]
12055
+ variants: function
12056
+ dispatch:
12057
+ CompositeExplicitAutograd: split_copy_Tensor
12058
+ tags: view_copy
12059
+
12060
+ - func: split_with_sizes_copy(Tensor self, int[] split_sizes, int dim=0) -> Tensor[]
12061
+ variants: function
12062
+ dispatch:
12063
+ CompositeExplicitAutograd: split_with_sizes_copy
12064
+ tags: view_copy
12065
+
12066
+ - func: squeeze_copy(Tensor self) -> Tensor
12067
+ variants: function
12068
+ dispatch:
12069
+ CompositeExplicitAutograd: squeeze_copy
12070
+ tags: view_copy
12071
+
12072
+ - func: squeeze_copy.dim(Tensor self, int dim) -> Tensor
12073
+ variants: function
12074
+ dispatch:
12075
+ CompositeExplicitAutograd: squeeze_copy_dim
12076
+ tags: view_copy
12077
+
12078
+ - func: t_copy(Tensor self) -> Tensor
12079
+ variants: function
12080
+ dispatch:
12081
+ CompositeExplicitAutograd: t_copy
12082
+ tags: view_copy
12083
+
12084
+ - func: transpose_copy.int(Tensor self, int dim0, int dim1) -> Tensor
12085
+ variants: function
12086
+ dispatch:
12087
+ CompositeExplicitAutograd: transpose_copy_int
12088
+ tags: view_copy
12089
+
12090
+ - func: unsqueeze_copy(Tensor self, int dim) -> Tensor
12091
+ variants: function
12092
+ dispatch:
12093
+ CompositeExplicitAutograd: unsqueeze_copy
12094
+ tags: view_copy
12095
+
12096
+ - func: _indices_copy(Tensor self) -> Tensor
12097
+ variants: function
12098
+ dispatch:
12099
+ CompositeExplicitAutograd: _indices_copy
12100
+ tags: view_copy
12101
+
12102
+ - func: _values_copy(Tensor self) -> Tensor
12103
+ variants: function
12104
+ dispatch:
12105
+ CompositeExplicitAutograd: _values_copy
12106
+ tags: view_copy
12107
+
12108
+ - func: indices_copy(Tensor self) -> Tensor
12109
+ variants: function
12110
+ dispatch:
12111
+ CompositeExplicitAutograd: indices_copy
12112
+ tags: view_copy
12113
+
12114
+ - func: values_copy(Tensor self) -> Tensor
12115
+ variants: function
12116
+ dispatch:
12117
+ CompositeExplicitAutograd: values_copy
12118
+ tags: view_copy
12119
+
12120
+ - func: crow_indices_copy(Tensor self) -> Tensor
12121
+ variants: function
12122
+ dispatch:
12123
+ CompositeExplicitAutograd: crow_indices_copy
12124
+ tags: view_copy
12125
+
12126
+ - func: col_indices_copy(Tensor self) -> Tensor
12127
+ variants: function
12128
+ dispatch:
12129
+ CompositeExplicitAutograd: col_indices_copy
12130
+ tags: view_copy
12131
+
12132
+ - func: ccol_indices_copy(Tensor self) -> Tensor
12133
+ variants: function
12134
+ dispatch:
12135
+ CompositeExplicitAutograd: ccol_indices_copy
12136
+ tags: view_copy
12137
+
12138
+ - func: row_indices_copy(Tensor self) -> Tensor
12139
+ variants: function
12140
+ dispatch:
12141
+ CompositeExplicitAutograd: row_indices_copy
12142
+ tags: view_copy
12143
+
12144
+ - func: unbind_copy.int(Tensor self, int dim=0) -> Tensor[]
12145
+ variants: function
12146
+ dispatch:
12147
+ CompositeExplicitAutograd: unbind_copy_int
12148
+ tags: view_copy
12149
+
12150
+ - func: view_copy(Tensor self, int[] size) -> Tensor
12151
+ variants: function
12152
+ dispatch:
12153
+ CompositeExplicitAutograd: view_copy
12154
+ tags: view_copy
12155
+
12156
+ - func: view_copy.dtype(Tensor self, ScalarType dtype) -> Tensor
12157
+ variants: function
12158
+ dispatch:
12159
+ CompositeExplicitAutograd: view_copy_dtype
12160
+ tags: view_copy
12161
+
12162
+ - func: unfold_copy(Tensor self, int dimension, int size, int step) -> Tensor
12163
+ variants: function
12164
+ dispatch:
12165
+ CompositeExplicitAutograd: unfold_copy
12166
+ tags: view_copy
12167
+
12168
+ - func: alias_copy(Tensor self) -> Tensor
12169
+ variants: function
12170
+ dispatch:
12171
+ CompositeExplicitAutograd: alias_copy
12172
+ tags: view_copy
12173
+
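
Everything from _fw_primal_copy down to alias_copy mirrors a view-producing op with an out-of-place *_copy variant: same values, but materialized into fresh storage instead of aliasing the input, which is what the view_copy tag advertises to functionalization-based backends (the entries that follow add matching .out overloads writing into preallocated outputs). A hedged illustration of the difference, assuming the ops are exposed as torch.*_copy functions, which is what variants: function without a python_module normally generates:

import torch

x = torch.arange(6.0).reshape(2, 3)

v = x.transpose(0, 1)                 # classic view: shares storage with x
c = torch.transpose_copy(x, 0, 1)     # copy variant: same values, independent storage

x.mul_(10)                            # mutate x in place
print(v[1, 0].item())                 # 10.0 -- the view sees the update
print(c[1, 0].item())                 # 1.0  -- the copy keeps the old value
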
12174
+ - func: _fw_primal_copy.out(Tensor self, int level, *, Tensor(a!) out) -> Tensor(a!)
12175
+ variants: function
12176
+ dispatch:
12177
+ CompositeExplicitAutograd: _fw_primal_copy_out
12178
+
12179
+
12180
+ - func: _make_dual_copy.out(Tensor primal, Tensor tangent, int level, *, Tensor(a!) out) -> Tensor(a!)
12181
+ variants: function
12182
+ dispatch:
12183
+ CompositeExplicitAutograd: _make_dual_copy_out
12184
+
12185
+
12186
+ - func: view_as_real_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12187
+ variants: function
12188
+ dispatch:
12189
+ CompositeExplicitAutograd: view_as_real_copy_out
12190
+
12191
+
12192
+ - func: view_as_complex_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12193
+ variants: function
12194
+ dispatch:
12195
+ CompositeExplicitAutograd: view_as_complex_copy_out
12196
+
12197
+
12198
+ - func: _conj_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12199
+ variants: function
12200
+ dispatch:
12201
+ CompositeExplicitAutograd: _conj_copy_out
12202
+
12203
+
12204
+ - func: _neg_view_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12205
+ variants: function
12206
+ dispatch:
12207
+ CompositeExplicitAutograd: _neg_view_copy_out
12208
+
12209
+
12210
+ - func: as_strided_copy.out(Tensor self, int[] size, int[] stride, int? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!)
12211
+ variants: function
12212
+ dispatch:
12213
+ CompositeExplicitAutograd: as_strided_copy_out
12214
+
12215
+
12216
+ - func: _sparse_broadcast_to_copy.out(Tensor self, int[] size, *, Tensor(a!) out) -> Tensor(a!)
12217
+ variants: function
12218
+ dispatch:
12219
+ CompositeExplicitAutograd: _sparse_broadcast_to_copy_out
12220
+
12221
+
12222
+ - func: diagonal_copy.out(Tensor self, int offset=0, int dim1=0, int dim2=1, *, Tensor(a!) out) -> Tensor(a!)
12223
+ variants: function
12224
+ dispatch:
12225
+ CompositeExplicitAutograd: diagonal_copy_out
12226
+
12227
+
12228
+ - func: expand_copy.SymInt_out(Tensor self, SymInt[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!)
12229
+ variants: function
12230
+ dispatch:
12231
+ CompositeExplicitAutograd: expand_copy_SymInt_out
12232
+
12233
+
12234
+ - func: expand_copy.out(Tensor self, int[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!)
12235
+ variants: function
12236
+ dispatch:
12237
+ CompositeExplicitAutograd: expand_copy_out
12238
+
12239
+
12240
+ - func: permute_copy.out(Tensor self, int[] dims, *, Tensor(a!) out) -> Tensor(a!)
12241
+ variants: function
12242
+ dispatch:
12243
+ CompositeExplicitAutograd: permute_copy_out
12244
+
12245
+
12246
+ - func: _reshape_alias_copy.out(Tensor self, int[] size, int[] stride, *, Tensor(a!) out) -> Tensor(a!)
12247
+ variants: function
12248
+ dispatch:
12249
+ CompositeExplicitAutograd: _reshape_alias_copy_out
12250
+
12251
+
12252
+ - func: select_copy.int_out(Tensor self, int dim, int index, *, Tensor(a!) out) -> Tensor(a!)
12253
+ variants: function
12254
+ dispatch:
12255
+ CompositeExplicitAutograd: select_copy_int_out
12256
+
12257
+
12258
+ - func: detach_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12259
+ variants: function
12260
+ dispatch:
12261
+ CompositeExplicitAutograd: detach_copy_out
12262
+
12263
+
12264
+ - func: slice_copy.Tensor_out(Tensor self, int dim=0, int? start=None, int? end=None, int step=1, *, Tensor(a!) out) -> Tensor(a!)
12265
+ variants: function
12266
+ dispatch:
12267
+ CompositeExplicitAutograd: slice_copy_Tensor_out
12268
+
12269
+
12270
+ - func: split_copy.Tensor_out(Tensor self, int split_size, int dim=0, *, Tensor(a!)[] out) -> ()
12271
+ variants: function
12272
+ dispatch:
12273
+ CompositeExplicitAutograd: split_copy_Tensor_out
12274
+
12275
+
12276
+ - func: split_with_sizes_copy.out(Tensor self, int[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()
12277
+ variants: function
12278
+ dispatch:
12279
+ CompositeExplicitAutograd: split_with_sizes_copy_out
12280
+
12281
+
12282
+ - func: squeeze_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12283
+ variants: function
12284
+ dispatch:
12285
+ CompositeExplicitAutograd: squeeze_copy_out
12286
+
12287
+
12288
+ - func: squeeze_copy.dim_out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
12289
+ variants: function
12290
+ dispatch:
12291
+ CompositeExplicitAutograd: squeeze_copy_dim_out
12292
+
12293
+
12294
+ - func: t_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12295
+ variants: function
12296
+ dispatch:
12297
+ CompositeExplicitAutograd: t_copy_out
12298
+
12299
+
12300
+ - func: transpose_copy.int_out(Tensor self, int dim0, int dim1, *, Tensor(a!) out) -> Tensor(a!)
12301
+ variants: function
12302
+ dispatch:
12303
+ CompositeExplicitAutograd: transpose_copy_int_out
12304
+
12305
+
12306
+ - func: unsqueeze_copy.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
12307
+ variants: function
12308
+ dispatch:
12309
+ CompositeExplicitAutograd: unsqueeze_copy_out
12310
+
12311
+
12312
+ - func: _indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12313
+ variants: function
12314
+ dispatch:
12315
+ CompositeExplicitAutograd: _indices_copy_out
12316
+
12317
+
12318
+ - func: _values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12319
+ variants: function
12320
+ dispatch:
12321
+ CompositeExplicitAutograd: _values_copy_out
12322
+
12323
+
12324
+ - func: indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12325
+ variants: function
12326
+ dispatch:
12327
+ CompositeExplicitAutograd: indices_copy_out
12328
+
12329
+
12330
+ - func: values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12331
+ variants: function
12332
+ dispatch:
12333
+ CompositeExplicitAutograd: values_copy_out
12334
+
12335
+
12336
+ - func: crow_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12337
+ variants: function
12338
+ dispatch:
12339
+ CompositeExplicitAutograd: crow_indices_copy_out
12340
+
12341
+
12342
+ - func: col_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12343
+ variants: function
12344
+ dispatch:
12345
+ CompositeExplicitAutograd: col_indices_copy_out
12346
+
12347
+
12348
+ - func: unbind_copy.int_out(Tensor self, int dim=0, *, Tensor(a!)[] out) -> ()
12349
+ variants: function
12350
+ dispatch:
12351
+ CompositeExplicitAutograd: unbind_copy_int_out
12352
+
12353
+
12354
+ - func: view_copy.out(Tensor self, int[] size, *, Tensor(a!) out) -> Tensor(a!)
12355
+ variants: function
12356
+ dispatch:
12357
+ CompositeExplicitAutograd: view_copy_out
12358
+
12359
+
12360
+ - func: view_copy.dtype_out(Tensor self, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
12361
+ variants: function
12362
+ dispatch:
12363
+ CompositeExplicitAutograd: view_copy_dtype_out
12364
+
12365
+
12366
+ - func: unfold_copy.out(Tensor self, int dimension, int size, int step, *, Tensor(a!) out) -> Tensor(a!)
12367
+ variants: function
12368
+ dispatch:
12369
+ CompositeExplicitAutograd: unfold_copy_out
12370
+
12371
+
12372
+ - func: alias_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12373
+ variants: function
12374
+ dispatch:
12375
+ CompositeExplicitAutograd: alias_copy_out
12376
+
12377
+ - func: to_padded_tensor(Tensor self, float padding, int[]? output_size=None) -> Tensor
12378
+ variants: method
12379
+ dispatch:
12380
+ NestedTensorCPU: NestedTensor_to_padded_tensor_generic
12381
+ NestedTensorCUDA: NestedTensor_to_padded_tensor_cuda
12382
+
12383
+ - func: _nested_tensor_layer_norm(Tensor self, Tensor? weight, Tensor? bias, float eps) -> Tensor
12384
+ variants: method
12385
+ dispatch:
12386
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_layer_norm
12387
+
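
Together with the nested_tensor constructor declared near the top of this hunk, to_padded_tensor and _nested_tensor_layer_norm are the first NestedTensor surface in this schema: a batch of variable-length tensors kept unpadded until a dense tensor is explicitly requested. A hedged sketch against the 1.12-era prototype Python API (the constructor was later moved under torch.nested, so treat the exact entry point as an assumption):

import torch

# a "batch" of two sequences with different lengths, feature size 8
nt = torch.nested_tensor([torch.randn(3, 8), torch.randn(5, 8)])

padded = nt.to_padded_tensor(0.0)     # dense (2, 5, 8) tensor; the shorter sequence is zero-padded
print(padded.shape)                   # torch.Size([2, 5, 8])
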
12388
+ # Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is.
12389
+ - func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None) -> Tensor
12390
+ variants: function
12391
+ dispatch:
12392
+ CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: transformer_encoder_layer_forward
12393
+
12394
+ - func: _native_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True) -> (Tensor, Tensor)
12395
+ variants: function
12396
+ dispatch:
12397
+ CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: native_multi_head_attention
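
The two underscore-prefixed entries that close this hunk are internal fused kernels rather than user-facing API: they back the inference fast path of nn.TransformerEncoderLayer and nn.MultiheadAttention, and accept NestedTensor inputs so padded positions can be skipped. A hedged sketch of the public-module path that can route into them on a 1.12-era build, assuming an eval-mode, batch_first layer:

import torch
import torch.nn as nn

layer = nn.TransformerEncoderLayer(d_model=64, nhead=4, batch_first=True).eval()

src = torch.randn(2, 10, 64)          # (batch, seq, feature)
with torch.inference_mode():
    out = layer(src)                  # eligible for the fused _transformer_encoder_layer_fwd path
print(out.shape)                      # torch.Size([2, 10, 64])
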