torch-rb 0.10.1 → 0.11.1

@@ -145,6 +145,7 @@
 
 - func: rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!)
   variants: method
+  tags: inplace_view
 
 - func: rename(Tensor(a) self, Dimname[]? names) -> Tensor(a)
   variants: method
@@ -274,6 +275,7 @@
   device_check: NoCheck # TensorIterator
   dispatch:
     CPU, CUDA: abs_out
+    MPS: abs_out_mps
     SparseCPU, SparseCUDA: abs_sparse_out
     SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_out
 
@@ -328,12 +330,12 @@
 - func: view_as_real(Tensor(a) self) -> Tensor(a)
   variants: function
   dispatch:
-    CPU, CUDA: view_as_real
+    CPU, CUDA, MPS, Meta: view_as_real
 
 - func: view_as_complex(Tensor(a) self) -> Tensor(a)
   variants: function
   dispatch:
-    CPU, CUDA: view_as_complex
+    CPU, CUDA, Meta: view_as_complex
 
 - func: sgn(Tensor self) -> Tensor
   variants: function, method
@@ -357,6 +359,9 @@
     SparseCPU, SparseCUDA: sgn_sparse_out
     SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_out
 
+- func: chalf(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor
+  variants: method
+
 - func: real(Tensor(a) self) -> Tensor(a)
   device_check: NoCheck # TensorIterator
   variants: function
@@ -422,6 +427,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: acos_out
+    MPS: acos_out_mps
 
 # arccos, alias of acos
 - func: arccos(Tensor self) -> Tensor
@@ -448,6 +454,7 @@
     SparseCsrCPU, SparseCsrCUDA: add_sparse_csr
     MkldnnCPU: mkldnn_add
     ZeroTensor: add_zerotensor
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor
 
 - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
@@ -457,18 +464,22 @@
     SparseCPU, SparseCUDA: add_sparse_
     SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_
     MkldnnCPU: mkldnn_add_
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor
 
 - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
   structured: True
   structured_inherits: TensorIteratorBase
+  ufunc_inner_loop:
+    Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf)
+    ScalarOnly: add (Bool)
   dispatch:
-    CPU, CUDA: add_out
     SparseCPU: add_out_sparse_cpu
     SparseCUDA: add_out_sparse_cuda
     SparseCsrCPU: add_out_sparse_csr_cpu
     SparseCsrCUDA: add_out_sparse_csr_cuda
     MkldnnCPU: mkldnn_add_out
+    MPS: add_out_mps
 
 - func: _add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
   variants: function
@@ -494,6 +505,7 @@
   variants: function
   dispatch:
     CPU: add_relu_
+  autogen: _add_relu.Scalar_out
 
 # For C++ only, until we have conversion from C++ numbers to Tensor
 - func: add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
@@ -507,6 +519,7 @@
   variants: method
   dispatch:
     CompositeExplicitAutograd: add_
+  autogen: add.Scalar_out
 
 - func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor
   structured_delegate: addmv.out
@@ -521,8 +534,9 @@
   dispatch:
     CPU: addmv_out_cpu
     CUDA: addmv_out_cuda
-    SparseCsrCPU: addmv_out_sparse_csr
-    SparseCsrCUDA: addmv_out_sparse_csr_cuda
+    MPS: addmv_out_mps
+    SparseCsrCPU: addmv_out_sparse_compressed
+    SparseCsrCUDA: addmv_out_sparse_compressed_cuda
 
 - func: addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
   variants: function, method
@@ -560,6 +574,7 @@
   - dim -> int dim
   dispatch:
     CPU, CUDA: all_out
+    MPS: all_out_mps
 
 - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
   device_check: NoCheck # TensorIterator
@@ -583,6 +598,7 @@
   - dim -> int dim
   dispatch:
     CPU, CUDA: any_out
+    MPS: any_out_mps
 
 - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
   device_check: NoCheck # TensorIterator
@@ -595,6 +611,12 @@
 
 - func: arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
 
+# Note [arange.start_step schema]
+# We want `arange.start_step` to be grouped up with `arange.start_out`,
+# But this doesn't happen automatically because the step argument
+# is defaultable for .start_out but not for .start_step.
+# We should probably just make "step" a defaultable param on arange.start,
+# and kill arange.start_step.
 - func: arange.start_step(Scalar start, Scalar end, Scalar step, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
 
 - func: arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)
@@ -603,6 +625,7 @@
   dispatch:
     CPU, Meta: arange_out
     CUDA: arange_cuda_out
+    MPS: arange_mps_out
 
 # This function is a temporary hack to allow tracing of arange like constructs with dynamic
 # bounds on arange. Normal arange is not traceable because it does not take any tensor inputs;
@@ -620,6 +643,7 @@
   structured: True
   dispatch:
     CPU, CUDA: argmax_out
+    MPS: argmax_out_mps
 
 - func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor
   structured_delegate: argmin.out
@@ -644,6 +668,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: acosh_out
+    MPS: acosh_out_mps
 
 # arccosh, alias for acosh
 - func: arccosh(Tensor self) -> Tensor
@@ -673,6 +698,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: asinh_out
+    MPS: asinh_out_mps
     SparseCPU, SparseCUDA: asinh_sparse_out
     SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr_out
 
@@ -705,6 +731,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: atanh_out
+    MPS: atanh_out_mps
     SparseCPU, SparseCUDA: atanh_sparse_out
     SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr_out
 
@@ -721,6 +748,7 @@
   variants: function, method
   dispatch:
     ZeroTensor, CPU, CUDA, Meta: as_strided_tensorimpl
+    MPS: as_strided_tensorimpl_mps
     QuantizedCPU, QuantizedCUDA: as_strided_qtensorimpl
   device_check: NoCheck
   device_guard: False
@@ -756,6 +784,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: asin_out
+    MPS: asin_out_mps
     SparseCPU, SparseCUDA: asin_sparse_out
     SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr_out
 
@@ -790,6 +819,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: atan_out
+    MPS: atan_out_mps
     SparseCPU, SparseCUDA: atan_sparse_out
     SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr_out
 
@@ -833,6 +863,7 @@
   dispatch:
     CPU: baddbmm_out_cpu
     CUDA: baddbmm_out_cuda
+    MPS: baddbmm_out_mps
     SparseCsrCUDA: baddbmm_out_sparse_csr_cuda
 
 - func: bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -861,19 +892,26 @@
   variants: function
   dispatch:
     CPU, CUDA: bernoulli_out
+    MPS: bernoulli_out_mps
 
 - func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
   variants: method
   dispatch:
     CPU, CUDA: bernoulli_
+    MPS: bernoulli_mps_
+  autogen: bernoulli.Tensor_functional, bernoulli.Tensor_out
 
 - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
   variants: method
   dispatch:
     CPU, CUDA: bernoulli_
+    MPS: bernoulli_mps_
+  autogen: bernoulli.float_out
 
+# Note [bernoulli.p schema]
+# We should probably just fix the overload ambiguity by appending a _functional to the C++ API name (BC breaking)
 # This out-of-place version isn't used explicitly, but needed by jit.
 # There is no default valid on `p` here because it would introduce ambiguity
 # with `bernoulli(Tensor self, *, Generator? generator=None)` declaration.
@@ -890,6 +928,7 @@
   dispatch:
     CPU: binary_cross_entropy_cpu
     CUDA: binary_cross_entropy_cuda
+    MPS: binary_cross_entropy_mps
 
 - func: binary_cross_entropy.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
@@ -898,6 +937,7 @@
   dispatch:
     CPU: binary_cross_entropy_out_cpu
     CUDA: binary_cross_entropy_out_cuda
+    MPS: binary_cross_entropy_out_mps
 
 - func: binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor
   python_module: nn
@@ -905,6 +945,7 @@
   dispatch:
     CPU: binary_cross_entropy_backward_cpu
     CUDA: binary_cross_entropy_backward_cuda
+    MPS: binary_cross_entropy_backward_mps
 
 - func: binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn
@@ -912,6 +953,7 @@
   dispatch:
     CPU: binary_cross_entropy_backward_out_cpu
     CUDA: binary_cross_entropy_backward_out_cuda
+    MPS: binary_cross_entropy_backward_out_mps
 
 - func: binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor
   device_check: NoCheck # TensorIterator
@@ -1061,6 +1103,7 @@
   dispatch:
     CPU: bmm_out_cpu
     CUDA: bmm_out_cuda
+    MPS: bmm_out_mps
     SparseCPU: bmm_out_sparse_cpu
     SparseCUDA: bmm_out_sparse_cuda
     SparseCsrCUDA: bmm_out_sparse_csr_cuda
@@ -1078,12 +1121,20 @@
     SparseCPU, SparseCUDA: sparse_broadcast_to
 
 - func: cat(Tensor[] tensors, int dim=0) -> Tensor
+  structured_delegate: cat.out
   dispatch:
-    CompositeExplicitAutograd: cat
+    SparseCPU, SparseCUDA: cat_sparse
+    QuantizedCPU: cat_quantized_cpu
 
 - func: cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  precomputed:
+  - dim -> int dim, int valid, bool all_contiguous, bool all_same_dtype, bool all_same_sizes_and_stride, MemoryFormat memory_format
   dispatch:
-    CompositeExplicitAutograd: cat_out
+    CPU: cat_out_cpu
+    CUDA: cat_out_cuda
+    MPS: cat_out_mps
+    QuantizedCPU: cat_out_quantized_cpu
 
 - func: cat.names(Tensor[] tensors, Dimname dim) -> Tensor
 
@@ -1125,6 +1176,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: ceil_out
+    MPS: ceil_out_mps
     SparseCPU, SparseCUDA: ceil_sparse_out
     SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr_out
 
@@ -1164,8 +1216,7 @@
 
 - func: clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor
   variants: function, method
-  dispatch:
-    CPU, CUDA: clamp
+  structured_delegate: clamp.Tensor_out
 
 - func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
@@ -1177,8 +1228,7 @@
 
 - func: clamp_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!)
   variants: function, method
-  dispatch:
-    CompositeExplicitAutograd: clamp_
+  structured_delegate: clamp.Tensor_out
 
 - func: clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
@@ -1187,73 +1237,83 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: clamp_out
+    MPS: clamp_out_mps
 
 - func: clamp.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: clamp_out
+    CPU, CUDA: clamp_Tensor_out
+    MPS: clamp_Tensor_out_mps
 
 - func: clamp_max(Tensor self, Scalar max) -> Tensor
   device_check: NoCheck # TensorIterator
   variants: function, method
-  dispatch:
-    CompositeExplicitAutograd: clamp_max
+  structured_delegate: clamp_max.out
 
 - func: clamp_max.Tensor(Tensor self, Tensor max) -> Tensor
   variants: function, method
-  dispatch:
-    CompositeExplicitAutograd: clamp_max
+  structured_delegate: clamp_max.Tensor_out
 
 - func: clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
   variants: function, method
-  dispatch:
-    CompositeExplicitAutograd: clamp_max_
+  structured_delegate: clamp_max.out
 
 - func: clamp_max_.Tensor(Tensor(a!) self, Tensor max) -> Tensor(a!)
   variants: function, method
-  dispatch:
-    CompositeExplicitAutograd: clamp_max_
+  structured_delegate: clamp_max.Tensor_out
 
 - func: clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: clamp_max_out
+    MPS: clamp_max_out_mps
 
 - func: clamp_max.Tensor_out(Tensor self, Tensor max, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: clamp_max_out
+    CPU, CUDA: clamp_max_Tensor_out
+    MPS: clamp_max_Tensor_out_mps
 
 - func: clamp_min(Tensor self, Scalar min) -> Tensor
   device_check: NoCheck # TensorIterator
   variants: function, method
-  dispatch:
-    CompositeExplicitAutograd: clamp_min
+  structured_delegate: clamp_min.out
 
 - func: clamp_min.Tensor(Tensor self, Tensor min) -> Tensor
   variants: function, method
-  dispatch:
-    CompositeExplicitAutograd: clamp_min
+  structured_delegate: clamp_min.Tensor_out
 
 - func: clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
   variants: function, method
-  dispatch:
-    CompositeExplicitAutograd: clamp_min_
+  structured_delegate: clamp_min.out
 
 - func: clamp_min_.Tensor(Tensor(a!) self, Tensor min) -> Tensor(a!)
   variants: function, method
-  dispatch:
-    CompositeExplicitAutograd: clamp_min_
+  structured_delegate: clamp_min.Tensor_out
 
 - func: clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: clamp_min_out
+    MPS: clamp_min_out_mps
 
 - func: clamp_min.Tensor_out(Tensor self, Tensor min, *, Tensor(a!) out) -> Tensor(a!)
+  device_check: NoCheck # TensorIterator
+  structured: True
+  structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: clamp_min_out
+    CPU, CUDA: clamp_min_Tensor_out
+    MPS: clamp_min_Tensor_out_mps
 
 # clip is an alias for clamp
 - func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
@@ -1360,23 +1420,29 @@
 
 - func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor
 
+- func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor
+  variants: function
+
 - func: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
   variants: method
   device_check: NoCheck
   device_guard: False
   dispatch:
     MkldnnCPU: copy_mkldnn_
-    SparseCPU, SparseCUDA, SparseHIP: copy_sparse_wrapper_
+    SparseCPU, SparseCUDA: copy_sparse_wrapper_
     CompositeExplicitAutograd: copy_
-    SparseCsrCPU, SparseCsrCUDA: copy_sparse_csr_
+    SparseCsrCPU, SparseCsrCUDA: copy_sparse_compressed_
+  autogen: copy.out
 
 - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor
-  dispatch: {}
+  dispatch:
+    MPS: _copy_from_mps
 
 # We need this to be able to properly copy from a CPU to an XLA tensor with different sizes.
 # See https://github.com/pytorch/xla/issues/2881
 - func: _copy_from_and_resize(Tensor self, Tensor dst) -> Tensor
-  dispatch: {}
+  dispatch:
+    MPS: _copy_from_and_resize_mps
 
 - func: cos(Tensor self) -> Tensor
   device_check: NoCheck # TensorIterator
@@ -1394,6 +1460,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: cos_out
+    MPS: cos_out_mps
 
 - func: cosh(Tensor self) -> Tensor
   device_check: NoCheck # TensorIterator
@@ -1411,6 +1478,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: cosh_out
+    MPS: cosh_out_mps
 
 - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
 
@@ -1419,6 +1487,7 @@
   dispatch:
     CPU: count_nonzero_cpu
     CUDA: count_nonzero_cuda
+    MPS: count_nonzero_mps
 
 - func: count_nonzero(Tensor self, int? dim=None) -> Tensor
   variants: function, method
@@ -1457,6 +1526,14 @@
   dispatch:
     CUDA: cudnn_convolution_transpose
 
+- func: _mps_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups) -> Tensor
+  dispatch:
+    MPS: _mps_convolution_transpose
+
+- func: mps_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool[2] output_mask) -> (Tensor, Tensor)
+  dispatch:
+    MPS: mps_convolution_transpose_backward
+
 - func: cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor
   dispatch:
     CUDA: cudnn_convolution_relu
@@ -1679,6 +1756,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: div_out
+    MPS: div_out_mps
     SparseCPU, SparseCUDA: div_out_sparse_zerodim
 
 - func: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
@@ -1701,6 +1779,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: div_out_mode
+    MPS: div_out_mode_mps
     SparseCPU, SparseCUDA: div_out_sparse_zerodim
 
 # For C++ only, until we have conversion from C++ numbers to Tensor
@@ -1715,6 +1794,7 @@
   variants: method
   dispatch:
     CompositeExplicitAutograd: div_
+  autogen: div.Scalar_out
 
 - func: div.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor
   variants: function, method
@@ -1725,6 +1805,7 @@
   variants: method
   dispatch:
     CompositeExplicitAutograd: div_
+  autogen: div.Scalar_mode_out
 
 # divide, alias for div
 - func: divide.Tensor(Tensor self, Tensor other) -> Tensor
@@ -1780,6 +1861,7 @@
   dispatch:
     CPU: dot
     CUDA: dot_cuda
+    MPS: dot_mps
 
 - func: dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -1800,6 +1882,7 @@
 - func: embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
   dispatch:
     CompositeExplicitAutograd: embedding
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_embedding
 
 - func: embedding_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor
 
@@ -1807,11 +1890,13 @@
   dispatch:
     CPU: embedding_dense_backward_cpu
     CUDA: embedding_dense_backward_cuda
+    MPS: embedding_dense_backward_mps
 
 - func: embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!)
   dispatch:
     CPU: embedding_renorm_cpu_
     CUDA: embedding_renorm_cuda_
+  autogen: embedding_renorm.functional, embedding_renorm.out
 
 - func: embedding_sparse_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor
 
@@ -1872,10 +1957,12 @@
   dispatch:
     CPU: empty_cpu
     CUDA: empty_cuda
+    MPS: empty_mps
     Meta: empty_meta
     MkldnnCPU: empty_mkldnn
     SparseCPU, SparseCUDA: empty_sparse
-    SparseCsrCPU, SparseCsrCUDA: empty_sparse_csr
+    SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed
+    QuantizedCPU, QuantizedCUDA: empty_unknown_quantized
 
 # We do not make new_empty a composite that calls into new_empty_strided, as the strided version
 # is significantly more difficult to implement by different backends
@@ -1920,8 +2007,20 @@
   dispatch:
     CPU, Meta: resize_
     CUDA: resize_cuda_
+    MPS: resize_mps_
     QuantizedCPU: quantized_resize_cpu_
     SparseCsrCPU, SparseCsrCUDA: resize_sparse_csr_
+  autogen: resize.functional, resize.out
+
+# This is a utility function to enable users to resize out tensor while registering kernels for out variants.
+# Eventually, we can consider exposing `resize_output` as a public API to ship it with python op registration
+# to make it easy to register out variants for ops.
+- func: _resize_output_(Tensor(a!) self, int[] size, Device device) -> Tensor(a!)
+  use_const_ref_for_mutable_tensors: True
+  variants: function
+  dispatch:
+    Meta: _resize_output_
+  autogen: _resize_output.functional, _resize_output.out
 
 - func: empty_quantized(int[] size, Tensor qtensor, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
   category_override: factory
@@ -1938,6 +2037,7 @@
   device_guard: False
   dispatch:
     CompositeExplicitAutograd: empty_like
+    QuantizedCPU, QuantizedCUDA: empty_like_quantized
     SparseCPU, SparseCUDA: empty_like_sparse_coo
     SparseCsrCPU, SparseCsrCUDA: empty_like_sparse_csr
 
@@ -1945,7 +2045,9 @@
   dispatch:
     CPU: empty_strided_cpu
     CUDA: empty_strided_cuda
+    MPS: empty_strided_mps
     Meta: empty_strided_meta
+    QuantizedCPU, QuantizedCUDA: empty_strided_unknown_quantized
 
 - func: erf(Tensor self) -> Tensor
   device_check: NoCheck # TensorIterator
@@ -1969,6 +2071,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: erf_out
+    MPS: erf_out_mps
     SparseCPU, SparseCUDA: erf_sparse_out
     SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr_out
 
@@ -2005,6 +2108,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: exp_out
+    MPS: exp_out_mps
 
 - func: exp2(Tensor self) -> Tensor
   structured_delegate: exp2.out
@@ -2019,6 +2123,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: exp2_out
+    MPS: exp2_out_mps
 
 - func: expm1(Tensor self) -> Tensor
   device_check: NoCheck # TensorIterator
@@ -2045,6 +2150,13 @@
     SparseCPU, SparseCUDA: expm1_sparse_out
     SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr_out
 
+- func: expand.SymInt(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a)
+  variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too.
+  device_check: NoCheck
+  device_guard: False
+  dispatch:
+    CompositeExplicitAutograd: expand_symint
+
 - func: expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a)
   variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too.
   device_check: NoCheck
@@ -2065,11 +2177,13 @@
   dispatch:
     CPU: eye_out_cpu
     CUDA: eye_out_cuda
+    MPS: eye_out_mps
 
 - func: eye.m_out(int n, int m, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CPU: eye_out_cpu
     CUDA: eye_out_cuda
+    MPS: eye_out_mps
 
 - func: flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a)
   variants: function, method
@@ -2089,21 +2203,36 @@
 - func: unflatten.Dimname(Tensor(a) self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor(a)
   variants: method
 
+- func: fill.Scalar(Tensor self, Scalar value) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: fill
+
+- func: fill.Tensor(Tensor self, Tensor value) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: fill
+
 - func: fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
   variants: function, method
   dispatch:
     CPU, CUDA: fill_
+    MPS: fill_scalar_mps
     QuantizedCPU, QuantizedCUDA: fill_quantized_
     Meta: fill_meta_
+    SparseCsrCPU, SparseCsrCUDA: fill_sparse_csr_
+  autogen: fill.Scalar_out
 
 - func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
   variants: function, method
   dispatch:
     CPU, CUDA: fill_
+    MPS: fill_tensor_mps_
     QuantizedCPU, QuantizedCUDA: fill_quantized_
     Meta: fill_meta_
+  autogen: fill.Tensor_out
 
 - func: floor(Tensor self) -> Tensor
   device_check: NoCheck # TensorIterator
@@ -2129,6 +2258,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: floor_out
+    MPS: floor_out_mps
     SparseCPU, SparseCUDA: floor_sparse_out
     SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr_out
 
@@ -2220,10 +2350,12 @@
   variants: function, method
 
 # NOTE [ grid_sampler Native Functions ]
-# `grid_sampler` does all the shape checking and then dispatches to one of
-# `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of which
-# has the corresponding backward defined as native functions as well. Therefore,
-# in these functions and their backwards, no more shape checking is done.
+# `grid_sampler` is _supposed to_ do all the shape checking and then dispatch to
+# one of `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of
+# which has the corresponding backward defined as native functions as well.
+# However, we do shape checking everywhere for now since each of the mentioned
+# functions can be called directly, which will lead to crashes otherwise.
+# See https://github.com/pytorch/pytorch/issues/73187 for more information.
 #
 # There is also _grid_sampler_2d_backward_cpu_fallback which is an
 # implementation detail of grid_sampler_2d and is only exposed here for testing
@@ -2261,7 +2393,10 @@
     CPU: grid_sampler_3d_cpu
     CUDA: grid_sampler_3d_cuda
 
-- func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor)
+# `grid_sampler_3d_backward` takes in `output_mask` to optimize performance for
+# the case where `input` doesn't require gradient. Gradient for `grid` is always
+# computed (only `output_mask[0]` is checked by the implementations).
+- func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask) -> (Tensor, Tensor)
   dispatch:
     CPU: grid_sampler_3d_backward_cpu
     CUDA: grid_sampler_3d_backward_cuda
@@ -2355,15 +2490,21 @@
   # - Tensor Tensor::index(ArrayRef<TensorIndex> indices)
   # - Tensor Tensor::index(std::initializer_list<TensorIndex> indices)
 
+- func: index_copy.out(Tensor self, int dim, Tensor index, Tensor source, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
+  variants: function
+  precomputed:
+  - dim -> int dim
+  dispatch:
+    CPU, CUDA: index_copy_out
+
 - func: index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!)
   variants: method
-  dispatch:
-    CompositeExplicitAutograd: index_copy_
+  structured_delegate: index_copy.out
 
 - func: index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor
   variants: function, method
-  dispatch:
-    CompositeExplicitAutograd: index_copy
+  structured_delegate: index_copy.out
 
 - func: index_copy_.dimname(Tensor(a!) self, Dimname dim, Tensor index, Tensor source) -> Tensor(a!)
   variants: method
@@ -2376,6 +2517,7 @@
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: index_put_
+  autogen: index_put.out
   # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp:
   # - Tensor & Tensor::index_put_(ArrayRef<TensorIndex> indices, Tensor const & rhs)
   # - Tensor & Tensor::index_put_(ArrayRef<TensorIndex> indices, Scalar v)
@@ -2393,6 +2535,7 @@
   variants: function
   dispatch:
     CPU, CUDA: _index_put_impl_
+  autogen: _index_put_impl.functional, _index_put_impl.out
 
 - func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor
   variants: function
@@ -2444,7 +2587,7 @@
   device_check: NoCheck
   device_guard: False
   dispatch:
-    CPU, CUDA: isnan
+    CPU, CUDA, MPS: isnan
     SparseCPU, SparseCUDA: isnan_sparse
     SparseCsrCPU, SparseCsrCUDA: isnan_sparse_csr
 
@@ -2540,17 +2683,14 @@
   dispatch:
     CPU: layer_norm_cpu
     CUDA: layer_norm_cuda
+    MPS: layer_norm_mps
     CompositeImplicitAutograd: math_native_layer_norm
 
-- func: _native_multi_head_self_attention(Tensor query, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None) -> Tensor
-  dispatch:
-    CPU: multi_head_self_attention_cpu
-    CUDA: multi_head_self_attention_cuda
-
 - func: native_layer_norm_backward(Tensor grad_out, Tensor input, int[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
   dispatch:
     CPU: layer_norm_backward_cpu
     CUDA: layer_norm_backward_cuda
+    MPS: layer_norm_backward_mps
 
 - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
   variants: function, method
@@ -2575,6 +2715,14 @@
 - func: linear.out(Tensor input, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
 
+# TODO: Add this function to MPS dispatch key so that we avoid declaring it in
+# native_functions.yaml
+# https://github.com/pytorch/pytorch/issues/77394
+- func: _mps_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor
+  python_module: nn
+  dispatch:
+    MPS: _mps_linear
+
 - func: mkldnn_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor
   python_module: nn
   dispatch:
@@ -2592,6 +2740,18 @@
   dispatch:
     MkldnnCPU: mkldnn_linear_backward
 
+- func: _mps_linear_backward_input(int[] input_size, Tensor grad_output, Tensor weight) -> Tensor
+  dispatch:
+    MPS: _mps_linear_backward_input
+
+- func: _mps_linear_backward_weights(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined) -> (Tensor, Tensor)
+  dispatch:
+    MPS: _mps_linear_backward_weights
+
+- func: mps_linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    MPS: mps_linear_backward
+
 - func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
 
 - func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
@@ -2622,6 +2782,7 @@
   dispatch:
     CPU, Meta: linspace_out
     CUDA: linspace_cuda_out
+    MPS: linspace_out_mps
 
 - func: log(Tensor self) -> Tensor
   device_check: NoCheck # TensorIterator
@@ -2639,6 +2800,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: log_out
+    MPS: log_out_mps
 
 - func: log10(Tensor self) -> Tensor
   device_check: NoCheck # TensorIterator
@@ -2658,6 +2820,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: log10_out
+    MPS: log10_out_mps
 
 - func: log1p(Tensor self) -> Tensor
   device_check: NoCheck # TensorIterator
@@ -2681,6 +2844,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: log1p_out
+    MPS: log1p_out_mps
     SparseCPU, SparseCUDA: log1p_sparse_out
     SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr_out
 
@@ -2700,12 +2864,14 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: log2_out
+    MPS: log2_out_mps
 
 - func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: logaddexp_out
+    MPS: logaddexp_out_mps
 
 - func: logaddexp(Tensor self, Tensor other) -> Tensor
   variants: method, function
@@ -2718,6 +2884,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: logaddexp2_out
+    MPS: logaddexp2_out_mps
 
 - func: logaddexp2(Tensor self, Tensor other) -> Tensor
   variants: method, function
@@ -2791,6 +2958,11 @@
 - func: log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
   variants: function, method
 
+- func: log_softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: log_softmax_out
+
 - func: log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
   variants: function, method
 
@@ -2802,6 +2974,7 @@
   dispatch:
     CPU: log_softmax_cpu_out
     CUDA: log_softmax_cuda_out
+    MPS: log_softmax_mps_out
 
 - func: _log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor
   structured_delegate: _log_softmax_backward_data.out
@@ -2811,6 +2984,7 @@
   dispatch:
     CPU: log_softmax_backward_cpu_out
     CUDA: log_softmax_backward_cuda_out
+    MPS: log_softmax_backward_mps_out
 
 - func: _logcumsumexp(Tensor self, int dim) -> Tensor
   dispatch:
@@ -2922,6 +3096,7 @@
   - dim -> int dim
   dispatch:
     CPU, CUDA: max_out
+    MPS: max_out_mps
 
 - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
   device_check: NoCheck # TensorIterator
@@ -2937,10 +3112,10 @@
 
 - func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
   variants: function, method
-  dispatch:
-    CompositeExplicitAutograd: amax
+  structured_delegate: amax.out
 
 - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
   dispatch:
     CPU, CUDA: amax_out
 
@@ -2951,6 +3126,17 @@
 
 - func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
 
+# TODO: Add this function to MPS dispatch key so that we avoid declaring it in
+# native_functions.yaml
+# https://github.com/pytorch/pytorch/issues/77394
+- func: _mps_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+  dispatch:
+    MPS: _mps_max_pool2d
+
+- func: mps_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+  dispatch:
+    MPS: mps_max_pool2d_backward
+
 - func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
   dispatch:
     MkldnnCPU: mkldnn_max_pool2d
@@ -2974,6 +3160,7 @@
 - func: quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
   dispatch:
     QuantizedCPU: quantized_max_pool2d
+    QuantizedCUDA: quantized_max_pool2d_cudnn
 
 - func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
 
@@ -2997,6 +3184,7 @@
   device_check: NoCheck # TensorIterator
   dispatch:
     CPU, CUDA: mean_out
+    MPS: mean_out_mps
     QuantizedCPU: mean_out_quantized_cpu
 
 - func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
@@ -3069,6 +3257,7 @@
   - dim -> int dim
   dispatch:
     CPU, CUDA: min_out
+    MPS: min_out_mps
 
 - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
   device_check: NoCheck # TensorIterator
@@ -3079,13 +3268,24 @@
 
 - func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
   variants: function, method
-  dispatch:
-    CompositeExplicitAutograd: amin
+  structured_delegate: amin.out
 
 - func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+  structured: True
   dispatch:
     CPU, CUDA: amin_out
 
+# TODO: Add this function to MPS dispatch key so that we avoid declaring it in
+# native_functions.yaml
+# https://github.com/pytorch/pytorch/issues/77394
+- func: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor
+  dispatch:
+    MPS: _mps_convolution
+
+- func: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    MPS: mps_convolution_backward
+
 - func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor
   dispatch:
     CompositeExplicitAutograd: mkldnn_convolution
@@ -3130,10 +3330,12 @@
   dispatch:
     CPU: mm_out_cpu
     CUDA: mm_out_cuda
+    MPS: mm_out_mps
     SparseCPU, SparseCUDA: _sparse_mm_out
     SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out
 
 - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
+  python_module: sparse
 
 - func: _sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor
   dispatch:
@@ -3165,8 +3367,10 @@
   variants: function, method
   dispatch:
     SparseCPU, SparseCUDA: mul_sparse
+    SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr
     MkldnnCPU: mkldnn_mul
     ZeroTensor: mul_zerotensor
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor
 
 - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
@@ -3174,7 +3378,9 @@
   variants: method
   dispatch:
     SparseCPU, SparseCUDA: mul_sparse_
+    SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr_
     MkldnnCPU: mkldnn_mul_
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Tensor
 
 - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
@@ -3182,8 +3388,10 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: mul_out
+    MPS: mul_out_mps
     SparseCPU: mul_out_sparse_cpu
     SparseCUDA: mul_out_sparse_cuda
+    SparseCsrCPU, SparseCsrCUDA: mul_out_sparse_csr
     MkldnnCPU: mkldnn_mul_out
 
 # For C++ only, until we have conversion from C++ numbers to Tensor
@@ -3192,12 +3400,15 @@
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: mul
+    SparseCsrCPU, SparseCsrCUDA: mul_scalar_sparse_csr
 
 - func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
   variants: method
   dispatch:
     CompositeExplicitAutograd: mul_
+    SparseCsrCPU, SparseCsrCUDA: mul__scalar_sparse_csr
+  autogen: mul.Scalar_out
 
 # multiply, alias for mul
 - func: multiply.Tensor(Tensor self, Tensor other) -> Tensor
@@ -3246,6 +3457,12 @@
     CPU: narrow_copy_dense_cpu
     SparseCPU, SparseCUDA: narrow_copy_sparse
     CompositeExplicitAutograd: narrow_copy_dense
+  tags: view_copy
+
+- func: narrow_copy.SymInt(Tensor self, int dim, int start, SymInt length) -> Tensor
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: narrow_copy_symint
 
 - func: narrow_copy.out(Tensor self, int dim, int start, int length, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -3265,11 +3482,13 @@
   dispatch:
     CPU: batch_norm_cpu
     CUDA: batch_norm_cuda
+    MPS: batch_norm_mps
     MkldnnCPU: mkldnn_batch_norm
 
 - func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))
   dispatch:
     CUDA: batch_norm_cuda_out
+    MPS: batch_norm_mps_out
 
 - func: batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor)
   dispatch:
@@ -3296,6 +3515,7 @@
   dispatch:
     CPU: batch_norm_backward_cpu
     CUDA: batch_norm_backward_cuda
+    MPS: batch_norm_backward_mps
     MkldnnCPU: mkldnn_batch_norm_backward
 
 - func: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor)
@@ -3363,6 +3583,7 @@
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: permute
+    MPS: permute_mps
 
 - func: movedim.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a)
   variants: function, method
@@ -3403,8 +3624,14 @@
   variants: function, method
 
 - func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor
+  dispatch:
+    CPU: pixel_shuffle_cpu
+    CompositeExplicitAutograd: math_pixel_shuffle
 
 - func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor
+  dispatch:
+    CPU: pixel_unshuffle_cpu
+    CompositeExplicitAutograd: math_pixel_unshuffle
 
 - func: channel_shuffle(Tensor self, int groups) -> Tensor
   dispatch:
@@ -3420,6 +3647,7 @@
   variants: method
   dispatch:
     CUDA: is_pinned_cuda
+    MPS: is_pinned_mps
     CompositeExplicitAutograd: is_pinned_default
 
 # TODO: add a copy kwarg that guarantees that the tensor is put into fresh
@@ -3431,6 +3659,7 @@
 - func: _pin_memory(Tensor self, Device? device=None) -> Tensor
   dispatch:
     CUDA: _pin_memory_cuda
+    MPS: _pin_memory_mps
 
 - func: pinverse(Tensor self, float rcond=1e-15) -> Tensor
   variants: function, method
@@ -3566,6 +3795,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: reciprocal_out
+    MPS: reciprocal_out_mps
 
 - func: neg(Tensor self) -> Tensor
   device_check: NoCheck # TensorIterator
@@ -3589,6 +3819,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: neg_out
+    MPS: neg_out_mps
     SparseCPU, SparseCUDA: neg_out_sparse
     SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_out
 
@@ -3605,6 +3836,7 @@
   variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too.
   dispatch:
     CompositeExplicitAutograd: repeat
+    MPS: repeat_mps
 
 - func: repeat_interleave.Tensor(Tensor repeats, *, int? output_size=None) -> Tensor
   variants: function
@@ -3631,7 +3863,7 @@
   device_check: NoCheck
   device_guard: False
   dispatch:
-    CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor: _reshape_alias
+    CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor, MPS: _reshape_alias
   # We don't need to support mkldnn since this is handled explicitly by the reshape operator.
 
 - func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor
@@ -3668,6 +3900,7 @@
   dispatch:
     CPU: round_out
     CUDA: round_out
+    MPS: round_out_mps
     SparseCPU, SparseCUDA: round_sparse_out
     SparseCsrCPU, SparseCsrCUDA: round_sparse_csr_out
 
@@ -3700,16 +3933,21 @@
   variants: function, method
   dispatch:
     CPU, CUDA: relu
+    MPS: relu_mps
     MkldnnCPU: mkldnn_relu
     QuantizedCPU: relu_quantized_cpu
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu
 
 - func: relu_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
   variants: function, method
   dispatch:
     CPU, CUDA: relu_
+    MPS: relu_mps_
     MkldnnCPU: mkldnn_relu_
     QuantizedCPU: relu_quantized_cpu_
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu_
+  autogen: relu.out
 
 - func: relu6(Tensor self) -> Tensor
   python_module: nn
@@ -3720,16 +3958,18 @@
 - func: prelu(Tensor self, Tensor weight) -> Tensor
   variants: function, method
   dispatch:
+    MkldnnCPU: mkldnn_prelu
     CPU: prelu_cpu
     CUDA: prelu_cuda
 
 - func: prelu_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor)
   variants: function, method
   dispatch:
+    MkldnnCPU: mkldnn_prelu_backward
     CPU: prelu_backward_cpu
     CUDA: prelu_backward_cuda
 
-- func: gelu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+- func: gelu.out(Tensor self, *, str approximate='none', Tensor(a!) out) -> Tensor(a!)
   structured: True
   structured_inherits: TensorIteratorBase
   device_check: NoCheck # TensorIterator
@@ -3737,24 +3977,34 @@
   dispatch:
     CPU: gelu_out_cpu
     CUDA: gelu_out_cuda
+    MPS: gelu_out_mps
+
+- func: gelu_(Tensor(a!) self, *, str approximate='none') -> Tensor(a!)
+  structured_delegate: gelu.out
+  device_check: NoCheck # TensorIterator
+  python_module: nn
+  dispatch:
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu_
 
-- func: gelu(Tensor self) -> Tensor
+- func: gelu(Tensor self, *, str approximate='none') -> Tensor
   structured_delegate: gelu.out
  device_check: NoCheck # TensorIterator
   python_module: nn
   dispatch:
     MkldnnCPU: mkldnn_gelu
     QuantizedCPU: gelu_quantized_cpu
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu
 
-- func: gelu_backward.grad_input(Tensor grad, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
+- func: gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!)
   structured: True
   structured_inherits: TensorIteratorBase
   python_module: nn
   dispatch:
     CPU: gelu_backward_out_cpu
     CUDA: gelu_backward_out_cuda
+    MPS: gelu_backward_out_mps
 
-- func: gelu_backward(Tensor grad, Tensor self) -> Tensor
+- func: gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor
   structured_delegate: gelu_backward.grad_input
   python_module: nn
   dispatch:
@@ -3804,6 +4054,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: rsqrt_out
+    MPS: rsqrt_out_mps
 
 - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a)
   variants: function, method
@@ -3816,6 +4067,7 @@
   device_guard: False
   dispatch:
     CompositeExplicitAutograd: select
+    SparseCsrCPU, SparseCsrCUDA: select_sparse_csr
 
 - func: select_backward(Tensor grad_output, int[] input_sizes, int dim, int index) -> Tensor
   variants: function
@@ -3839,6 +4091,7 @@
   device_check: NoCheck # TensorIterator
   dispatch:
     CompositeExplicitAutograd: celu_
+  autogen: celu.out
 
 - func: silu(Tensor self) -> Tensor
   structured_delegate: silu.out
@@ -3858,6 +4111,7 @@
   python_module: nn
   dispatch:
     CPU, CUDA: silu_out
+    MPS: silu_out_mps
 
 - func: silu_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
   structured: True
@@ -3865,6 +4119,7 @@
   python_module: nn
   dispatch:
     CPU, CUDA: silu_backward_out
+    MPS: silu_backward_out_mps
 
 - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor
   structured_delegate: silu_backward.grad_input
@@ -3918,6 +4173,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: sigmoid_out
+    MPS: sigmoid_out_mps
 
 - func: logit(Tensor self, float? eps=None) -> Tensor
   variants: function, method
@@ -3955,6 +4211,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: sin_out
+    MPS: sin_out_mps
     SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr_out
     SparseCPU, SparseCUDA: sin_sparse_out
 
@@ -3994,6 +4251,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: sinh_out
+    MPS: sinh_out_mps
     SparseCPU, SparseCUDA: sinh_sparse_out
     SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr_out
 
@@ -4080,6 +4338,11 @@
 - func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
   variants: function, method
 
+- func: softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: softmax_out
+
 - func: softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
   variants: function, method
 
@@ -4093,6 +4356,7 @@
   dispatch:
     CPU: softmax_cpu_out
     CUDA: softmax_cuda_out
+    MPS: softmax_mps_out
 
 - func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor
   structured_delegate: _softmax_backward_data.out
@@ -4102,6 +4366,7 @@
   dispatch:
     CPU: softmax_backward_cpu_out
     CUDA: softmax_backward_cuda_out
+    MPS: softmax_backward_mps_out
 
 - func: unsafe_split.Tensor(Tensor self, int split_size, int dim=0) -> Tensor[]
   variants: function, method
@@ -4117,6 +4382,10 @@
   dispatch:
     CompositeExplicitAutograd: split
 
+- func: split.sizes(Tensor(a -> *) self, int[] split_size, int dim=0) -> Tensor(a)[]
+  variants: function, method
+  device_guard: False
+
 - func: unsafe_split_with_sizes(Tensor self, int[] split_sizes, int dim=0) -> Tensor[]
   variants: function, method
   device_check: NoCheck
@@ -4154,7 +4423,7 @@
   device_check: NoCheck
   device_guard: False
   dispatch:
-    CPU, CUDA: squeeze
+    CompositeExplicitAutograd: squeeze
     QuantizedCPU, QuantizedCUDA: squeeze_quantized
 
 - func: squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)
@@ -4162,7 +4431,7 @@
4162
4431
  device_check: NoCheck
4163
4432
  device_guard: False
4164
4433
  dispatch:
4165
- CPU, CUDA: squeeze
4434
+ CompositeExplicitAutograd: squeeze
4166
4435
  QuantizedCPU, QuantizedCUDA: squeeze_quantized
4167
4436
 
4168
4437
  - func: squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a)
@@ -4232,12 +4501,13 @@
4232
4501
 
4233
4502
  - func: dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
4234
4503
 
4235
- # The signature is designed to be consistent with librosa except that it is
4236
- # missing the `pad_mode` and `center` arguments, which are taken care of at
4237
- # `torch.functional.py`. They shall be moved here once we have mapping between
4238
- # Python strings and C++ Enum in codegen.
4504
+ # Overload without center & pad mode, needed for forward-compatibility
4239
4505
  - func: stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
4240
4506
  variants: function, method
4507
+ cpp_no_default_args: ['hop_length', 'win_length', 'window', 'normalized']
4508
+
4509
+ - func: stft.center(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, str pad_mode="reflect", bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
4510
+ variants: function, method
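
The `stft.center` overload above folds `center` and `pad_mode` (previously handled in `torch.functional.py`) into the native signature. A hedged usage sketch via the upstream Python API:

    import torch

    x = torch.randn(16000)
    spec = torch.stft(x, n_fft=512, hop_length=128,
                      window=torch.hann_window(512),
                      center=True, pad_mode="reflect",  # accepted by the new native overload
                      return_complex=True)
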
4241
4511
 
4242
4512
  - func: istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool? onesided=None, int? length=None, bool return_complex=False) -> Tensor
4243
4513
  variants: function, method
@@ -4258,6 +4528,7 @@
4258
4528
  variants: function, method
4259
4529
  dispatch:
4260
4530
  CompositeExplicitAutograd: sum
4531
+ SparseCsrCPU, SparseCsrCUDA: sum_csr
4261
4532
 
4262
4533
  - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
4263
4534
  structured_delegate: sum.IntList_out
@@ -4273,21 +4544,17 @@
4273
4544
  device_check: NoCheck # TensorIterator
4274
4545
  dispatch:
4275
4546
  CPU, CUDA: sum_out
4547
+ MPS: sum_out_mps
4276
4548
 
4277
4549
  - func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
4278
4550
  device_check: NoCheck # TensorIterator
4279
4551
 
4280
- - func: nansum(Tensor self, *, ScalarType? dtype=None) -> Tensor
4552
+ - func: nansum(Tensor self, int[1] dim=[], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
4281
4553
  variants: function, method
4282
4554
  dispatch:
4283
4555
  CPU, CUDA: nansum
4284
4556
 
4285
- - func: nansum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
4286
- variants: function, method
4287
- dispatch:
4288
- CPU, CUDA: nansum
4289
-
4290
- - func: nansum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
4557
+ - func: nansum.out(Tensor self, int[1] dim=[], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
4291
4558
  dispatch:
4292
4559
  CPU, CUDA: nansum_out
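
The two `nansum` overloads collapse into a single schema whose `dim` defaults to `[]` (full reduction). A small sketch of the semantics, assuming the upstream `torch.nansum` binding:

    import torch

    t = torch.tensor([[1.0, float("nan")], [2.0, 3.0]])
    torch.nansum(t)         # 6.0 -- NaNs are treated as zero; dim=[] reduces everything
    torch.nansum(t, dim=1)  # tensor([1., 5.])
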
4293
4560
 
@@ -4318,6 +4585,7 @@
4318
4585
  structured_inherits: TensorIteratorBase
4319
4586
  dispatch:
4320
4587
  CPU, CUDA: sqrt_out
4588
+ MPS: sqrt_out_mps
4321
4589
  SparseCPU, SparseCUDA: sqrt_sparse_out
4322
4590
  SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr_out
4323
4591
 
@@ -4330,8 +4598,6 @@
4330
4598
  variants: function, method
4331
4599
 
4332
4600
  - func: square.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
4333
- dispatch:
4334
- CPU, CUDA: square_out
4335
4601
 
4336
4602
  - func: std(Tensor self, bool unbiased=True) -> Tensor
4337
4603
  device_check: NoCheck # TensorIterator
@@ -4346,6 +4612,7 @@
4346
4612
  variants: function, method
4347
4613
  dispatch:
4348
4614
  CPU, CUDA: std
4615
+ MPS: std_mps
4349
4616
 
4350
4617
  - func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
4351
4618
  device_check: NoCheck # TensorIterator
@@ -4397,6 +4664,7 @@
4397
4664
  variants: function, method
4398
4665
  dispatch:
4399
4666
  CPU, CUDA: prod
4667
+ MPS: prod_mps
4400
4668
 
4401
4669
  - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
4402
4670
  structured_delegate: prod.int_out
@@ -4408,6 +4676,7 @@
4408
4676
  device_check: NoCheck # TensorIterator
4409
4677
  dispatch:
4410
4678
  CPU, CUDA: prod_out
4679
+ MPS: prod_out_mps
4411
4680
 
4412
4681
  - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
4413
4682
  device_check: NoCheck # TensorIterator
@@ -4453,6 +4722,7 @@
4453
4722
  structured_inherits: TensorIteratorBase
4454
4723
  dispatch:
4455
4724
  CPU, CUDA: tan_out
4725
+ MPS: tan_out_mps
4456
4726
  SparseCPU, SparseCUDA: tan_sparse_out
4457
4727
  SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr_out
4458
4728
 
@@ -4481,6 +4751,7 @@
4481
4751
  structured_inherits: TensorIteratorBase
4482
4752
  dispatch:
4483
4753
  CPU, CUDA: tanh_out
4754
+ MPS: tanh_out_mps
4484
4755
  SparseCPU, SparseCUDA: tanh_sparse_out
4485
4756
  SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr_out
4486
4757
 
@@ -4511,12 +4782,14 @@
4511
4782
  structured_inherits: TensorIteratorBase
4512
4783
  dispatch:
4513
4784
  CPU, CUDA: threshold_out
4785
+ MPS: threshold_out_mps
4514
4786
 
4515
4787
  - func: threshold_backward.grad_input(Tensor grad_output, Tensor self, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!)
4516
4788
  structured: True
4517
4789
  structured_inherits: TensorIteratorBase
4518
4790
  dispatch:
4519
4791
  CPU, CUDA: threshold_backward_out
4792
+ MPS: threshold_backward_out_mps
4520
4793
 
4521
4794
  - func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor
4522
4795
  variants: function
@@ -4558,6 +4831,7 @@
4558
4831
  device_guard: False
4559
4832
  dispatch:
4560
4833
  MkldnnCPU: mkldnn_transpose_
4834
+ autogen: _mkldnn_transpose.out
4561
4835
 
4562
4836
  - func: one_hot(Tensor self, int num_classes=-1) -> Tensor
4563
4837
  python_module: nn
@@ -4595,6 +4869,28 @@
4595
4869
 
4596
4870
  - func: trapz.dx(Tensor y, *, float dx=1, int dim=-1) -> Tensor
4597
4871
 
4872
+ # Fused implementation detail for transformers. Adds in-projection bias to QKV and divides Q by sqrt(D/num_heads).
4873
+ - func: _transform_bias_rescale_qkv(Tensor qkv, Tensor qkv_bias, int num_heads) -> (Tensor, Tensor, Tensor)
4874
+ dispatch:
4875
+ CPU, NestedTensorCPU: transform_bias_rescale_qkv_cpu
4876
+ CUDA, NestedTensorCUDA: transform_bias_rescale_qkv_cuda
4877
+
4878
+ - func: _nested_tensor_from_mask(Tensor t, Tensor mask) -> Tensor
4879
+ dispatch:
4880
+ CPU, CUDA: NestedTensor_nested_tensor_from_mask
4881
+
4882
+ - func: _nested_from_padded(Tensor padded, Tensor cpu_nested_shape_example, bool fuse_transform_0213=False) -> Tensor
4883
+ device_check: NoCheck # cpu_nested_shape_example will always be on CPU
4884
+ dispatch:
4885
+ CPU: nested_from_padded_generic
4886
+ CUDA: nested_from_padded_cuda
4887
+
4888
+ # _nested_from_padded is not usable from Python, so
4889
+ # _nested_from_padded_and_nested_example is available for testing.
4890
+ - func: _nested_from_padded_and_nested_example(Tensor padded, Tensor nt_example) -> Tensor
4891
+ dispatch:
4892
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example
4893
+
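
The comment on `_transform_bias_rescale_qkv` states the math: add the fused in-projection bias to QKV, then divide Q by sqrt(D / num_heads). A rough Python reference of that formula (hypothetical shapes; this is not the fused kernel):

    import math
    import torch

    B, T, D, num_heads = 2, 4, 8, 2           # hypothetical sizes
    qkv = torch.randn(B, T, 3 * D)            # fused in-projection output
    qkv_bias = torch.randn(3 * D)

    q, k, v = (qkv + qkv_bias).chunk(3, dim=-1)  # add the in-projection bias
    q = q / math.sqrt(D / num_heads)             # rescale Q by sqrt(head dim)
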
4598
4894
  - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor
4599
4895
  dispatch:
4600
4896
  CompositeExplicitAutograd: _trilinear
@@ -4625,6 +4921,7 @@
4625
4921
  device_check: NoCheck # TensorIterator
4626
4922
  dispatch:
4627
4923
  CPU, CUDA: trunc_out
4924
+ MPS: trunc_out_mps
4628
4925
  SparseCPU, SparseCUDA: trunc_sparse_out
4629
4926
  SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr_out
4630
4927
 
@@ -4686,7 +4983,7 @@
4686
4983
  device_check: NoCheck
4687
4984
  device_guard: False
4688
4985
  dispatch:
4689
- CPU, CUDA: unsqueeze
4986
+ CompositeExplicitAutograd: unsqueeze
4690
4987
  SparseCPU, SparseCUDA: unsqueeze_sparse
4691
4988
  QuantizedCPU, QuantizedCUDA: unsqueeze_quantized
4692
4989
 
@@ -4713,6 +5010,7 @@
4713
5010
  variants: function, method
4714
5011
  dispatch:
4715
5012
  CPU, CUDA: var
5013
+ MPS: var_mps
4716
5014
 
4717
5015
  - func: var.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
4718
5016
  device_check: NoCheck # TensorIterator
@@ -4764,12 +5062,18 @@
4764
5062
  device_check: NoCheck
4765
5063
  device_guard: False
4766
5064
 
4767
- # we define both of these because 'where' does the broadcast and '_s_where' doesn't;
4768
- # this allows us to implicitly calculate the broadcast derivative, while only dealing with the
4769
- # _s_where derivative.
4770
5065
  - func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor
4771
5066
  device_check: NoCheck # TensorIterator
4772
5067
  variants: function, method
5068
+ dispatch:
5069
+ CPU, CUDA: where
5070
+ MPS: where_mps
5071
+
5072
+ - func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
5073
+ device_check: NoCheck # TensorIterator
5074
+ dispatch:
5075
+ CPU, CUDA: where_self_out
5076
+ MPS: where_self_out_mps
4773
5077
 
4774
5078
  - func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor
4775
5079
  variants: function
@@ -4784,11 +5088,6 @@
4784
5088
  device_check: NoCheck # TensorIterator
4785
5089
  variants: function
4786
5090
 
4787
- - func: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor
4788
- variants: function
4789
- dispatch:
4790
- CPU, CUDA: _s_where
4791
-
4792
5091
  - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor
4793
5092
  variants: function
4794
5093
 
@@ -4797,15 +5096,17 @@
4797
5096
  - func: _weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor
4798
5097
  variants: function
4799
5098
 
4800
- - func: _weight_norm_cuda_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor)
5099
+ - func: _weight_norm_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor)
4801
5100
  variants: function
4802
5101
  dispatch:
5102
+ CPU: weight_norm_cpu
4803
5103
  CUDA: weight_norm_cuda
4804
5104
 
4805
- - func: _weight_norm_cuda_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor)
5105
+ - func: _weight_norm_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor)
4806
5106
  variants: function
4807
5107
  dispatch:
4808
- CUDA: weight_norm_cuda_backward
5108
+ CPU: weight_norm_backward_cpu
5109
+ CUDA: weight_norm_backward_cuda
4809
5110
 
4810
5111
  - func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor)
4811
5112
  variants: function
@@ -4887,6 +5188,16 @@
4887
5188
  SparseCPU: _sparse_sum_backward_cpu
4888
5189
  SparseCUDA: _sparse_sum_backward_cuda
4889
5190
 
5191
+ - func: _sparse_csr_sum.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
5192
+ dispatch:
5193
+ SparseCsrCPU: _sparse_csr_sum_cpu
5194
+ SparseCsrCUDA: _sparse_csr_sum_cuda
5195
+
5196
+ - func: _sparse_csr_prod.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
5197
+ dispatch:
5198
+ SparseCsrCPU: _sparse_csr_prod_cpu
5199
+ SparseCsrCUDA: _sparse_csr_prod_cuda
5200
+
4890
5201
  - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
4891
5202
  python_module: sparse
4892
5203
  variants: function
@@ -4962,6 +5273,7 @@
4962
5273
  device_check: NoCheck # TensorIterator
4963
5274
  dispatch:
4964
5275
  CPU, CUDA: norm_out
5276
+ MPS: norm_out_mps
4965
5277
 
4966
5278
  # These four redispatch in their implementation, so OK to be CompositeImplicitAutograd
4967
5279
  - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
@@ -4987,24 +5299,31 @@
4987
5299
  dispatch:
4988
5300
  CPU, CUDA: frexp_out
4989
5301
 
5302
+ # Deprecated (v.1.12)
4990
5303
  - func: frobenius_norm(Tensor self) -> Tensor
4991
5304
  variants: function
4992
5305
 
5306
+ # Deprecated (v.1.12)
4993
5307
  - func: frobenius_norm.dim(Tensor self, int[1] dim, bool keepdim=False) -> Tensor
4994
5308
  variants: function
4995
5309
 
5310
+ # Deprecated (v.1.12)
4996
5311
  - func: frobenius_norm.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
4997
5312
  variants: function
4998
5313
 
5314
+ # Deprecated (v.1.12)
4999
5315
  - func: nuclear_norm(Tensor self, bool keepdim=False) -> Tensor
5000
5316
  variants: function
5001
5317
 
5318
+ # Deprecated (v.1.12)
5002
5319
  - func: nuclear_norm.out(Tensor self, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
5003
5320
  variants: function
5004
5321
 
5322
+ # Deprecated (v.1.12)
5005
5323
  - func: nuclear_norm.dim(Tensor self, int[2] dim, bool keepdim=False) -> Tensor
5006
5324
  variants: function
5007
5325
 
5326
+ # Deprecated (v.1.12)
5008
5327
  - func: nuclear_norm.dim_out(Tensor self, int[2] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
5009
5328
  variants: function
5010
5329
 
@@ -5013,7 +5332,7 @@
5013
5332
  dispatch:
5014
5333
  CompositeExplicitAutograd: clone
5015
5334
  SparseCPU, SparseCUDA: clone_sparse
5016
- SparseCsrCPU, SparseCsrCUDA: clone_sparse_csr
5335
+ SparseCsrCPU, SparseCsrCUDA: clone_sparse_compressed
5017
5336
  MkldnnCPU: mkldnn_clone
5018
5337
  QuantizedCPU, QuantizedCUDA: quantized_clone
5019
5338
 
@@ -5025,22 +5344,27 @@
5025
5344
  variants: function, method
5026
5345
  dispatch:
5027
5346
  CompositeExplicitAutograd: resize_as_
5347
+ autogen: resize_as.functional, resize_as.out
5028
5348
 
5029
5349
  - func: resize_as_sparse_(Tensor(a!) self, Tensor the_template) -> Tensor(a!)
5030
5350
  use_const_ref_for_mutable_tensors: True
5031
- variants: function
5351
+ variants: function, method
5032
5352
  dispatch:
5033
5353
  SparseCPU, SparseCUDA: resize_as_sparse_
5034
5354
  SparseCsrCPU, SparseCsrCUDA: resize_as_sparse_csr_
5355
+ autogen: resize_as_sparse.functional, resize_as_sparse.out
5035
5356
 
5036
5357
  - func: zero_(Tensor(a!) self) -> Tensor(a!)
5037
5358
  device_check: NoCheck # TensorIterator
5038
5359
  variants: method, function
5039
5360
  dispatch:
5040
5361
  CPU, CUDA: zero_
5362
+ MPS: zero_mps_
5041
5363
  Meta: zero_meta_
5042
5364
  SparseCPU, SparseCUDA: zero_sparse_
5365
+ SparseCsrCPU, SparseCsrCUDA: zero_sparse_csr_
5043
5366
  MkldnnCPU: mkldnn_zero_
5367
+ autogen: zero.functional, zero.out
5044
5368
 
5045
5369
  - func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
5046
5370
  device_check: NoCheck # TensorIterator
@@ -5048,6 +5372,7 @@
5048
5372
  structured_inherits: TensorIteratorBase
5049
5373
  dispatch:
5050
5374
  CPU, CUDA: sub_out
5375
+ MPS: sub_out_mps
5051
5376
  SparseCPU, SparseCUDA: sub_out_sparse
5052
5377
 
5053
5378
  - func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
@@ -5056,6 +5381,7 @@
5056
5381
  structured_delegate: sub.out
5057
5382
  dispatch:
5058
5383
  SparseCPU, SparseCUDA: sub_sparse
5384
+ ZeroTensor: sub_zerotensor
5059
5385
 
5060
5386
  - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
5061
5387
  device_check: NoCheck # TensorIterator
@@ -5076,6 +5402,7 @@
5076
5402
  variants: method
5077
5403
  dispatch:
5078
5404
  CompositeExplicitAutograd: sub_
5405
+ autogen: sub.Scalar_out
5079
5406
 
5080
5407
  # subtract, alias for sub
5081
5408
  - func: subtract.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
@@ -5125,7 +5452,7 @@
5125
5452
 
5126
5453
  # Functionally the same as addmm, but we give it a different derivative formula
5127
5454
  # that doesn't propagate gradients to non-present entries on sparse.
5128
- - func: _sparse_addmm(Tensor self, Tensor sparse, Tensor dense, *, Scalar beta=1, Scalar alpha=1) -> Tensor
5455
+ - func: _sparse_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
5129
5456
  python_module: sparse
5130
5457
  dispatch:
5131
5458
  CompositeExplicitAutograd: _sparse_addmm
@@ -5134,21 +5461,24 @@
5134
5461
  python_module: sparse
5135
5462
  dispatch:
5136
5463
  SparseCsrCUDA: sparse_sampled_addmm_out_sparse_csr_cuda
5464
+ SparseCsrCPU: sparse_sampled_addmm_out_sparse_csr_cpu
5137
5465
 
5138
5466
  - func: sparse_sampled_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
5139
5467
  python_module: sparse
5140
5468
  dispatch:
5141
5469
  SparseCsrCUDA: sparse_sampled_addmm_sparse_csr_cuda
5470
+ SparseCsrCPU: sparse_sampled_addmm_sparse_csr_cpu
5142
5471
 
5143
5472
  - func: addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
5144
5473
  structured: True
5145
5474
  dispatch:
5146
5475
  CPU: addmm_out_cpu
5147
5476
  CUDA: addmm_out_cuda
5477
+ MPS: addmm_out_mps
5148
5478
  SparseCPU: addmm_out_sparse_dense_cpu
5149
5479
  SparseCUDA: addmm_out_sparse_dense_cuda
5150
- SparseCsrCPU: addmm_out_sparse_csr_cpu
5151
- SparseCsrCUDA: addmm_out_sparse_csr_cuda
5480
+ SparseCsrCPU: addmm_out_sparse_compressed_cpu
5481
+ SparseCsrCUDA: addmm_out_sparse_compressed_cuda
5152
5482
 
5153
5483
  - func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
5154
5484
  structured_delegate: addmm.out
@@ -5156,7 +5486,7 @@
5156
5486
  dispatch:
5157
5487
  SparseCPU: addmm_sparse_dense_cpu
5158
5488
  SparseCUDA: addmm_sparse_dense_cuda
5159
- SparseCsrCPU, SparseCsrCUDA: addmm_sparse_csr_dense
5489
+ SparseCsrCPU, SparseCsrCUDA: addmm_sparse_compressed_dense
5160
5490
 
5161
5491
  - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
5162
5492
  structured_delegate: addmm.out
@@ -5167,6 +5497,16 @@
5167
5497
  SparseCPU: s_addmm_sparse_dense_cpu_
5168
5498
  SparseCUDA: s_addmm_sparse_dense_cuda_
5169
5499
 
5500
+ - func: _addmm_activation.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False, Tensor(a!) out) -> Tensor(a!)
5501
+ structured: True
5502
+ dispatch:
5503
+ CPU: addmm_activation_out_cpu
5504
+ CUDA: addmm_activation_out_cuda
5505
+
5506
+ - func: _addmm_activation(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False) -> Tensor
5507
+ structured_delegate: _addmm_activation.out
5508
+ variants: function, method
5509
+
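
`_addmm_activation` fuses an epilogue activation into addmm, with `use_gelu` selecting GELU over ReLU. A rough unfused reference of what the schema computes, assuming standard `torch.addmm` semantics (a sketch, not the fused kernel):

    import torch
    import torch.nn.functional as F

    def addmm_activation_ref(bias, mat1, mat2, *, beta=1, alpha=1, use_gelu=False):
        out = torch.addmm(bias, mat1, mat2, beta=beta, alpha=alpha)  # beta*bias + alpha*(mat1 @ mat2)
        return F.gelu(out) if use_gelu else F.relu(out)
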
5170
5510
  # NOTE [ Sparse: autograd and API ]
5171
5511
  #
5172
5512
  #
@@ -5278,11 +5618,23 @@
5278
5618
  # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given
5279
5619
  # the default would never make sense.
5280
5620
 
5621
+ - func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5281
5622
  - func: sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5623
+ - func: sparse_csc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5624
+ - func: sparse_bsr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5625
+ - func: sparse_bsc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5282
5626
 
5627
+ - func: sparse_compressed_tensor.comp_plain_value(Tensor compressed_indices, Tensor plain_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5283
5628
  - func: sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5629
+ - func: sparse_csc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5630
+ - func: sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5631
+ - func: sparse_bsc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5284
5632
 
5633
+ - func: _sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
5285
5634
  - func: _sparse_csr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
5635
+ - func: _sparse_csc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
5636
+ - func: _sparse_bsr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
5637
+ - func: _sparse_bsc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
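
The block above generalizes the CSR factory to the whole compressed family (CSR/CSC/BSR/BSC) plus a layout-agnostic `sparse_compressed_tensor`. A small CSR example, assuming the upstream `torch.sparse_csr_tensor` binding; the CSC/BSR/BSC variants follow the same pattern with ccol/row (and block) indices:

    import torch

    crow_indices = torch.tensor([0, 2, 3])      # row i holds values [crow[i], crow[i+1])
    col_indices  = torch.tensor([0, 1, 1])
    values       = torch.tensor([1.0, 2.0, 3.0])
    a = torch.sparse_csr_tensor(crow_indices, col_indices, values, size=(2, 2))
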
5286
5638
 
5287
5639
  - func: sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5288
5640
 
@@ -5294,7 +5646,11 @@
5294
5646
 
5295
5647
  - func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size) -> ()
5296
5648
 
5649
+ - func: _validate_sparse_compressed_tensor_args(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, Layout layout) -> ()
5297
5650
  - func: _validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()
5651
+ - func: _validate_sparse_csc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> ()
5652
+ - func: _validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()
5653
+ - func: _validate_sparse_bsc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> ()
5298
5654
 
5299
5655
  - func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
5300
5656
  dispatch:
@@ -5309,26 +5665,34 @@
5309
5665
  variants: method
5310
5666
  dispatch:
5311
5667
  SparseCPU, SparseCUDA: sparse_resize_
5668
+ autogen: sparse_resize.functional, sparse_resize.out
5312
5669
 
5313
5670
  - func: sparse_resize_and_clear_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!)
5314
5671
  use_const_ref_for_mutable_tensors: True
5315
5672
  variants: method
5316
5673
  dispatch:
5317
5674
  SparseCPU, SparseCUDA: sparse_resize_and_clear_
5675
+ autogen: sparse_resize_and_clear.functional, sparse_resize_and_clear.out
5318
5676
 
5319
5677
  - func: sparse_mask(Tensor self, Tensor mask) -> Tensor
5320
5678
  variants: method
5321
5679
  dispatch:
5322
5680
  SparseCPU: sparse_mask_cpu
5323
5681
  SparseCUDA: sparse_mask_cuda
5682
+ SparseCsrCPU, SparseCsrCUDA: sparse_mask_sparse_csr
5324
5683
 
5325
5684
  - func: _to_cpu(Tensor[] tensors) -> Tensor[]
5326
5685
  variants: function
5327
5686
 
5328
5687
  - func: to_dense(Tensor self, ScalarType? dtype=None) -> Tensor
5329
5688
  variants: method
5689
+
5690
+ # Special case of to_dense with custom derivative
5691
+ - func: _to_dense(Tensor self, ScalarType? dtype=None) -> Tensor
5692
+ variants: method
5330
5693
  dispatch:
5331
- SparseCPU, SparseCUDA, SparseCsrCPU, SparseCsrCUDA: sparse_to_dense
5694
+ SparseCPU, SparseCUDA: sparse_to_dense
5695
+ SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_dense
5332
5696
  MkldnnCPU: mkldnn_to_dense
5333
5697
 
5334
5698
  - func: to_dense_backward(Tensor grad, Tensor input) -> Tensor
@@ -5414,6 +5778,7 @@
5414
5778
  SparseCPU, SparseCUDA: _coalesced_sparse_
5415
5779
  device_check: NoCheck
5416
5780
  device_guard: False
5781
+ autogen: _coalesced.functional, _coalesced.out
5417
5782
 
5418
5783
  - func: indices(Tensor(a) self) -> Tensor(a)
5419
5784
  variants: method
@@ -5444,6 +5809,20 @@
5444
5809
  device_check: NoCheck
5445
5810
  device_guard: False
5446
5811
 
5812
+ - func: ccol_indices(Tensor(a) self) -> Tensor(a)
5813
+ variants: method
5814
+ dispatch:
5815
+ SparseCsrCPU, SparseCsrCUDA: ccol_indices_sparse_csr
5816
+ device_check: NoCheck
5817
+ device_guard: False
5818
+
5819
+ - func: row_indices(Tensor(a) self) -> Tensor(a)
5820
+ variants: method
5821
+ dispatch:
5822
+ SparseCsrCPU, SparseCsrCUDA: row_indices_sparse_csr
5823
+ device_check: NoCheck
5824
+ device_guard: False
5825
+
5447
5826
  - func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
5448
5827
  dispatch:
5449
5828
  SparseCPU: hspmm_out_sparse_cpu
@@ -5459,11 +5838,13 @@
5459
5838
  variants: function
5460
5839
  dispatch:
5461
5840
  SparseCPU, SparseCUDA: copy_sparse_
5841
+ autogen: copy_sparse_to_sparse.functional, copy_sparse_to_sparse.out
5462
5842
 
5463
5843
  - func: unbind.int(Tensor(a -> *) self, int dim=0) -> Tensor(a)[]
5464
5844
  variants: function, method
5465
5845
  dispatch:
5466
5846
  CompositeExplicitAutograd: unbind
5847
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_unbind
5467
5848
 
5468
5849
  - func: unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[]
5469
5850
  variants: function, method
@@ -5472,11 +5853,41 @@
5472
5853
  variants: method
5473
5854
  dispatch:
5474
5855
  CPU, CUDA: dense_to_sparse
5856
+ SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse
5475
5857
 
5476
5858
  - func: to_sparse(Tensor self) -> Tensor
5477
5859
  variants: method
5478
5860
  dispatch:
5479
5861
  CPU, CUDA: dense_to_sparse
5862
+ SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse
5863
+
5864
+ - func: to_sparse_csr(Tensor self) -> Tensor
5865
+ variants: method
5866
+ dispatch:
5867
+ CPU, CUDA: dense_to_sparse_csr
5868
+ SparseCPU, SparseCUDA: coo_to_sparse_csr
5869
+ SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csr
5870
+
5871
+ - func: to_sparse_csc(Tensor self) -> Tensor
5872
+ variants: method
5873
+ dispatch:
5874
+ CPU, CUDA: dense_to_sparse_csc
5875
+ SparseCPU, SparseCUDA: coo_to_sparse_csc
5876
+ SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csc
5877
+
5878
+ - func: to_sparse_bsr(Tensor self, int[2] blocksize) -> Tensor
5879
+ variants: method
5880
+ dispatch:
5881
+ CPU, CUDA: dense_to_sparse_bsr
5882
+ SparseCPU, SparseCUDA: coo_to_sparse_bsr
5883
+ SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsr
5884
+
5885
+ - func: to_sparse_bsc(Tensor self, int[2] blocksize) -> Tensor
5886
+ variants: method
5887
+ dispatch:
5888
+ CPU, CUDA: dense_to_sparse_bsc
5889
+ SparseCPU, SparseCUDA: coo_to_sparse_bsc
5890
+ SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsc
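
The new `to_sparse_csr/csc/bsr/bsc` methods above convert from dense, COO, or another compressed layout; BSR/BSC additionally take a `blocksize`, per the schemas. A hedged round-trip sketch:

    import torch

    d = torch.eye(4)
    csr = d.to_sparse_csr()   # dense -> CSR (dense_to_sparse_csr above)
    coo = csr.to_sparse()     # CSR   -> COO (sparse_compressed_to_sparse above)
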
5480
5891
 
5481
5892
  - func: to_mkldnn(Tensor self, ScalarType? dtype=None) -> Tensor
5482
5893
  variants: method
@@ -5636,7 +6047,7 @@
5636
6047
  dispatch:
5637
6048
  CPU: fused_moving_avg_obs_fake_quant_cpu
5638
6049
  CUDA: fused_moving_avg_obs_fake_quant_cuda
5639
-
6050
+ autogen: _fused_moving_avg_obs_fq_helper.functional, _fused_moving_avg_obs_fq_helper.out
5640
6051
 
5641
6052
  - func: _choose_qparams_per_tensor(Tensor self, bool reduce_range=False) -> (float, int)
5642
6053
  variants: function
@@ -5722,16 +6133,33 @@
5722
6133
  dispatch:
5723
6134
  CPU: _local_scalar_dense_cpu
5724
6135
  CUDA: _local_scalar_dense_cuda
6136
+ MPS: _local_scalar_dense_mps
5725
6137
  variants: function
5726
6138
 
6139
+ # MPS LSTM implementation
6140
+
6141
+ - func: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
6142
+ dispatch:
6143
+ MPS: _lstm_mps
6144
+
6145
+ - func: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
6146
+ dispatch:
6147
+ MPS: lstm_mps_backward
6148
+
6149
+
5727
6150
  # Fused RNN kernels
5728
6151
  - func: _thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor, Tensor)
5729
6152
  dispatch:
5730
6153
  CUDA: _thnn_fused_lstm_cell_cuda
5731
6154
 
5732
- - func: _thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
6155
+ # NB: The composite version of this function below is a simple wrapper that duplicates some of the outputs
6156
+ # It is necessary to avoid triggering TensorImpl use count checks in debug mode
6157
+ # NB: this is function is NOT differentiable
6158
+ - func: _thnn_fused_lstm_cell_backward_impl(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor)
5733
6159
  dispatch:
5734
- CUDA: _thnn_fused_lstm_cell_backward_cuda
6160
+ CUDA: _thnn_fused_lstm_cell_backward_impl_cuda
6161
+
6162
+ - func: _thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
5735
6163
 
5736
6164
  - func: _thnn_differentiable_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor input_gates, Tensor hidden_gates, Tensor? input_bias, Tensor? hidden_bias, Tensor cx, Tensor cy) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
5737
6165
 
@@ -5812,36 +6240,55 @@
5812
6240
  device_check: NoCheck
5813
6241
  device_guard: False
5814
6242
  dispatch:
5815
- CPU, CUDA: set_
6243
+ CPU, CUDA, Meta, MPS: set_
6244
+ autogen: set.source_Storage_functional, set.source_Storage_out
5816
6245
 
5817
6246
  - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!)
5818
6247
  variants: method
5819
6248
  device_check: NoCheck
5820
6249
  device_guard: False
5821
6250
  dispatch:
5822
- CPU: set_storage_cpu_
6251
+ CPU, Meta: set_storage_cpu_
5823
6252
  CUDA: set_storage_cuda_
6253
+ MPS: set_storage_mps_
5824
6254
  QuantizedCPU, QuantizedCUDA: set_storage_quantized_
6255
+ autogen: set.source_Storage_storage_offset_functional, set.source_Storage_storage_offset_out
6256
+
6257
+ - func: set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!)
6258
+ variants: method
6259
+ device_check: NoCheck
6260
+ device_guard: False
5825
6261
 
5826
6262
  - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!)
5827
6263
  variants: method
5828
6264
  device_check: NoCheck
5829
6265
  device_guard: False
5830
6266
  dispatch:
5831
- CPU, CUDA: set_tensor_
6267
+ CPU, CUDA, Meta, MPS: set_tensor_
6268
+ autogen: set.source_Tensor_functional, set.source_Tensor_out
5832
6269
 
5833
6270
  - func: set_(Tensor(a!) self) -> Tensor(a!)
5834
6271
  variants: method
5835
6272
  dispatch:
5836
6273
  CPU: set_cpu_
5837
6274
  CUDA: set_cuda_
6275
+ Meta: set_meta_
6276
+ MPS: set_mps_
6277
+ autogen: set.functional, set.out
6278
+
6279
+ - func: lift(Tensor self) -> Tensor
6280
+ variants: method
6281
+ dispatch:
6282
+ # Not making it CompositeImplicitAutograd because lift
6283
+ # should be a primitive w.r.t. functorch
6284
+ CompositeExplicitAutograd: lift
5838
6285
 
5839
6286
  - func: is_set_to(Tensor self, Tensor tensor) -> bool
5840
6287
  variants: method
5841
6288
  device_check: NoCheck
5842
6289
  device_guard: False
5843
6290
  dispatch:
5844
- CPU, CUDA: is_set_to
6291
+ CPU, CUDA, MPS: is_set_to
5845
6292
 
5846
6293
  - func: masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!)
5847
6294
  device_check: NoCheck # TensorIterator
@@ -5849,6 +6296,8 @@
5849
6296
  dispatch:
5850
6297
  CPU: masked_fill__cpu
5851
6298
  CUDA: masked_fill__cuda
6299
+ MPS: masked_fill__mps
6300
+ autogen: masked_fill.Scalar_out
5852
6301
 
5853
6302
  - func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor
5854
6303
  device_check: NoCheck # TensorIterator
@@ -5862,6 +6311,8 @@
5862
6311
  dispatch:
5863
6312
  CPU: masked_fill__cpu
5864
6313
  CUDA: masked_fill__cuda
6314
+ MPS: masked_fill__mps
6315
+ autogen: masked_fill.Tensor_out
5865
6316
 
5866
6317
  - func: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor
5867
6318
  device_check: NoCheck # TensorIterator
@@ -5874,23 +6325,29 @@
5874
6325
  dispatch:
5875
6326
  CPU: masked_scatter__cpu
5876
6327
  CUDA: masked_scatter__cuda
6328
+ autogen: masked_scatter.out
5877
6329
 
5878
6330
  - func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor
5879
6331
  variants: function, method
5880
6332
  dispatch:
5881
6333
  CompositeExplicitAutograd: masked_scatter
5882
6334
 
5883
- - func: _masked_softmax(Tensor self, Tensor mask) -> Tensor
6335
+ - func: _masked_softmax(Tensor self, Tensor mask, int? dim=None) -> Tensor
5884
6336
  dispatch:
5885
6337
  CUDA: masked_softmax_cuda
5886
6338
  CPU: masked_softmax_cpu
5887
6339
 
6340
+ - func: _masked_softmax_backward(Tensor grad_output, Tensor output, Tensor mask, int? dim=None) -> Tensor
6341
+ dispatch:
6342
+ CUDA: masked_softmax_backward_cuda
6343
+ CPU: masked_softmax_backward_cpu
6344
+
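
`_masked_softmax` gains an optional `dim` and a dedicated backward. A rough reference of the semantics, assuming a boolean mask where True marks positions to exclude (illustrative only; the native kernel is fused):

    import torch

    def masked_softmax_ref(x, mask, dim=-1):
        # positions where mask is True are pushed to -inf before the softmax
        return torch.softmax(x.masked_fill(mask, float("-inf")), dim=dim)
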
5888
6345
  - func: view(Tensor(a) self, int[] size) -> Tensor(a)
5889
6346
  variants: method
5890
6347
  device_check: NoCheck
5891
6348
  device_guard: False
5892
6349
  dispatch:
5893
- ZeroTensor, CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA: view
6350
+ ZeroTensor, CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, MPS: view
5894
6351
  MkldnnCPU: mkldnn_view
5895
6352
 
5896
6353
  # Warning: If you want to change the name or overload name of this
@@ -5909,7 +6366,8 @@
5909
6366
  - func: put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!)
5910
6367
  variants: method
5911
6368
  dispatch:
5912
- CPU, CUDA: put_
6369
+ CPU, CUDA, MPS: put_
6370
+ autogen: put.out
5913
6371
 
5914
6372
  - func: put(Tensor self, Tensor index, Tensor source, bool accumulate=False) -> Tensor
5915
6373
  variants: function, method
@@ -5934,12 +6392,30 @@
5934
6392
  - func: index_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor
5935
6393
  variants: function, method
5936
6394
 
6395
+ - func: index_reduce.out(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!)
6396
+ structured: True
6397
+ variants: function
6398
+ precomputed:
6399
+ - dim -> int dim
6400
+ dispatch:
6401
+ CPU: index_reduce_cpu_out
6402
+ CUDA: index_reduce_cuda_out
6403
+
6404
+ - func: index_reduce_(Tensor(a!) self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor(a!)
6405
+ structured_delegate: index_reduce.out
6406
+ variants: method
6407
+
6408
+ - func: index_reduce(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor
6409
+ structured_delegate: index_reduce.out
6410
+ variants: function, method
6411
+
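
`index_reduce` is the structured counterpart to `index_add` with a reduction string (e.g. 'prod', 'mean', 'amax', 'amin') and an `include_self` switch. A hedged usage sketch via the method binding:

    import torch

    base   = torch.full((2, 3), 2.0)
    source = torch.arange(12.0).reshape(4, 3)
    index  = torch.tensor([0, 1, 0, 1])

    out = base.index_reduce(0, index, source, "prod", include_self=True)
    # row i of out is the elementwise product of base[i] and every source row mapped to i
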
5937
6412
  - func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!)
5938
6413
  device_check: NoCheck # TensorIterator
5939
6414
  variants: method
5940
6415
  dispatch:
5941
6416
  CPU: index_fill_
5942
6417
  CUDA: index_fill_
6418
+ autogen: index_fill.int_Scalar_out
5943
6419
 
5944
6420
  - func: index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
5945
6421
  device_check: NoCheck # TensorIterator
@@ -5952,6 +6428,7 @@
5952
6428
  variants: method
5953
6429
  dispatch:
5954
6430
  CPU, CUDA: index_fill_
6431
+ autogen: index_fill.int_Tensor_out
5955
6432
 
5956
6433
  - func: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor
5957
6434
  device_check: NoCheck # TensorIterator
@@ -5988,6 +6465,7 @@
5988
6465
  variants: function
5989
6466
  dispatch:
5990
6467
  CPU, CUDA: scatter_src_out
6468
+ MPS: scatter_src_out_mps
5991
6469
 
5992
6470
  - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
5993
6471
  structured_delegate: scatter.value_out
@@ -6002,6 +6480,7 @@
6002
6480
  variants: function
6003
6481
  dispatch:
6004
6482
  CPU, CUDA: scatter_value_out
6483
+ MPS: scatter_value_out_mps
6005
6484
 
6006
6485
  - func: scatter.reduce(Tensor self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor
6007
6486
  structured_delegate: scatter.reduce_out
@@ -6016,6 +6495,7 @@
6016
6495
  variants: function
6017
6496
  dispatch:
6018
6497
  CPU, CUDA: scatter_reduce_out
6498
+ MPS: scatter_reduce_out_mps
6019
6499
 
6020
6500
  - func: scatter.value_reduce(Tensor self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor
6021
6501
  structured_delegate: scatter.value_reduce_out
@@ -6030,6 +6510,7 @@
6030
6510
  variants: function
6031
6511
  dispatch:
6032
6512
  CPU, CUDA: scatter_value_reduce_out
6513
+ MPS: scatter_value_reduce_out_mps
6033
6514
 
6034
6515
  - func: scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor
6035
6516
  variants: function, method
@@ -6050,14 +6531,24 @@
6050
6531
  variants: function
6051
6532
  dispatch:
6052
6533
  CPU, CUDA: scatter_add
6534
+ MPS: scatter_add_mps_out
6053
6535
 
6054
6536
  - func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor
6055
6537
  variants: function, method
6056
6538
 
6057
- - func: scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor
6539
+ - func: scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor
6540
+ structured_delegate: scatter_reduce.two_out
6058
6541
  variants: function, method
6542
+
6543
+ - func: scatter_reduce_.two(Tensor(a!) self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor(a!)
6544
+ structured_delegate: scatter_reduce.two_out
6545
+ variants: method
6546
+
6547
+ - func: scatter_reduce.two_out(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!)
6548
+ structured: True
6549
+ variants: function
6059
6550
  dispatch:
6060
- CPU: scatter_reduce_two_cpu
6551
+ CPU, CUDA: scatter_reduce_two
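
`scatter_reduce.two` now takes a `src` tensor and an `include_self` flag, and is structured with a shared CPU/CUDA kernel. A small sketch, assuming the in-place method binding `Tensor.scatter_reduce_`:

    import torch

    out   = torch.zeros(3)
    src   = torch.arange(6, dtype=torch.float)
    index = torch.tensor([0, 1, 0, 1, 2, 1])

    out.scatter_reduce_(0, index, src, reduce="sum", include_self=False)
    # out[i] is the sum of the src entries whose index equals i
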
6061
6552
 
6062
6553
  - func: eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
6063
6554
  structured_delegate: eq.Scalar_out
@@ -6093,6 +6584,12 @@
6093
6584
  dispatch:
6094
6585
  CompositeExplicitAutograd: bitwise_and
6095
6586
 
6587
+ - func: bitwise_and.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
6588
+ device_check: NoCheck # TensorIterator
6589
+ variants: function
6590
+ dispatch:
6591
+ CompositeExplicitAutograd: bitwise_and
6592
+
6096
6593
  - func: bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor
6097
6594
  device_check: NoCheck # TensorIterator
6098
6595
  variants: method, function
@@ -6141,6 +6638,12 @@
6141
6638
  device_check: NoCheck # TensorIterator
6142
6639
  variants: method, function
6143
6640
 
6641
+ - func: bitwise_or.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
6642
+ device_check: NoCheck # TensorIterator
6643
+ variants: function
6644
+ dispatch:
6645
+ CompositeExplicitAutograd: bitwise_or
6646
+
6144
6647
  - func: bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor
6145
6648
  device_check: NoCheck # TensorIterator
6146
6649
  variants: method, function
@@ -6189,6 +6692,12 @@
6189
6692
  device_check: NoCheck # TensorIterator
6190
6693
  variants: method, function
6191
6694
 
6695
+ - func: bitwise_xor.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
6696
+ device_check: NoCheck # TensorIterator
6697
+ variants: function
6698
+ dispatch:
6699
+ CompositeExplicitAutograd: bitwise_xor
6700
+
6192
6701
  - func: bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor
6193
6702
  device_check: NoCheck # TensorIterator
6194
6703
  variants: method, function
@@ -6236,12 +6745,14 @@
6236
6745
  variants: method
6237
6746
  dispatch:
6238
6747
  CPU, CUDA: __ilshift__
6748
+ autogen: __lshift__.Scalar_out
6239
6749
 
6240
6750
  - func: __ilshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
6241
6751
  device_check: NoCheck # TensorIterator
6242
6752
  variants: method
6243
6753
  dispatch:
6244
6754
  CPU, CUDA: __ilshift__
6755
+ autogen: __lshift__.Tensor_out
6245
6756
 
6246
6757
  - func: bitwise_left_shift.Tensor(Tensor self, Tensor other) -> Tensor
6247
6758
  device_check: NoCheck # TensorIterator
@@ -6264,25 +6775,25 @@
6264
6775
  device_check: NoCheck # TensorIterator
6265
6776
  variants: method, function
6266
6777
  dispatch:
6267
- CPU, CUDA: bitwise_left_shift
6778
+ CompositeExplicitAutograd: bitwise_left_shift
6268
6779
 
6269
6780
  - func: bitwise_left_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
6270
6781
  device_check: NoCheck # TensorIterator
6271
6782
  variants: method
6272
6783
  dispatch:
6273
- CPU, CUDA: bitwise_left_shift_
6784
+ CompositeExplicitAutograd: bitwise_left_shift_
6274
6785
 
6275
6786
  - func: bitwise_left_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
6276
6787
  device_check: NoCheck # TensorIterator
6277
6788
  variants: function
6278
6789
  dispatch:
6279
- CPU, CUDA: bitwise_left_shift_out
6790
+ CompositeExplicitAutograd: bitwise_left_shift_out
6280
6791
 
6281
6792
  - func: bitwise_left_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
6282
6793
  device_check: NoCheck # TensorIterator
6283
6794
  variants: function
6284
6795
  dispatch:
6285
- CPU, CUDA: bitwise_left_shift
6796
+ CompositeExplicitAutograd: bitwise_left_shift
6286
6797
 
6287
6798
  - func: __rshift__.Scalar(Tensor self, Scalar other) -> Tensor
6288
6799
  device_check: NoCheck # TensorIterator
@@ -6301,12 +6812,14 @@
6301
6812
  variants: method
6302
6813
  dispatch:
6303
6814
  CPU, CUDA: __irshift__
6815
+ autogen: __rshift__.Scalar_out
6304
6816
 
6305
6817
  - func: __irshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
6306
6818
  device_check: NoCheck # TensorIterator
6307
6819
  variants: method
6308
6820
  dispatch:
6309
6821
  CPU, CUDA: __irshift__
6822
+ autogen: __rshift__.Tensor_out
6310
6823
 
6311
6824
  - func: bitwise_right_shift.Tensor(Tensor self, Tensor other) -> Tensor
6312
6825
  device_check: NoCheck # TensorIterator
@@ -6329,25 +6842,25 @@
6329
6842
  device_check: NoCheck # TensorIterator
6330
6843
  variants: method, function
6331
6844
  dispatch:
6332
- CPU, CUDA: bitwise_right_shift
6845
+ CompositeExplicitAutograd: bitwise_right_shift
6333
6846
 
6334
6847
  - func: bitwise_right_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
6335
6848
  device_check: NoCheck # TensorIterator
6336
6849
  variants: method
6337
6850
  dispatch:
6338
- CPU, CUDA: bitwise_right_shift_
6851
+ CompositeExplicitAutograd: bitwise_right_shift_
6339
6852
 
6340
6853
  - func: bitwise_right_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
6341
6854
  device_check: NoCheck # TensorIterator
6342
6855
  variants: function
6343
6856
  dispatch:
6344
- CPU, CUDA: bitwise_right_shift_out
6857
+ CompositeExplicitAutograd: bitwise_right_shift_out
6345
6858
 
6346
6859
  - func: bitwise_right_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
6347
6860
  device_check: NoCheck # TensorIterator
6348
6861
  variants: function
6349
6862
  dispatch:
6350
- CPU, CUDA: bitwise_right_shift
6863
+ CompositeExplicitAutograd: bitwise_right_shift
6351
6864
 
6352
6865
  - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
6353
6866
  structured_delegate: tril.out
@@ -6376,15 +6889,18 @@
6376
6889
  variants: method
6377
6890
  dispatch:
6378
6891
  CPU, CUDA: addbmm_
6892
+ MPS: addbmm_mps_
6379
6893
 
6380
6894
  - func: addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
6381
6895
  dispatch:
6382
6896
  CPU, CUDA: addbmm_out
6897
+ MPS: addbmm_out_mps
6383
6898
 
6384
6899
  - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
6385
6900
  variants: method, function
6386
6901
  dispatch:
6387
6902
  CPU, CUDA: addbmm
6903
+ MPS: addbmm_mps
6388
6904
 
6389
6905
  - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!)
6390
6906
  device_check: NoCheck # TensorIterator
@@ -6392,6 +6908,8 @@
6392
6908
  dispatch:
6393
6909
  CPU, CUDA: random_
6394
6910
  Meta: random_meta_
6911
+ MPS: random_mps_
6912
+ autogen: random.from_functional, random.from_out
6395
6913
 
6396
6914
  - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!)
6397
6915
  device_check: NoCheck # TensorIterator
@@ -6399,6 +6917,8 @@
6399
6917
  dispatch:
6400
6918
  CPU, CUDA: random_
6401
6919
  Meta: random_meta_
6920
+ MPS: random_mps_
6921
+ autogen: random.to_functional, random.to_out
6402
6922
 
6403
6923
  - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!)
6404
6924
  device_check: NoCheck # TensorIterator
@@ -6406,31 +6926,37 @@
6406
6926
  dispatch:
6407
6927
  CPU, CUDA: random_
6408
6928
  Meta: random_meta_
6929
+ autogen: random.functional, random.out
6409
6930
 
6410
6931
  - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!)
6411
6932
  device_check: NoCheck # TensorIterator
6412
6933
  variants: method
6413
6934
  dispatch:
6414
6935
  CPU, CUDA: uniform_
6936
+ MPS: uniform_mps_
6415
6937
  Meta: uniform_meta_
6938
+ autogen: uniform.functional, uniform.out
6416
6939
 
6417
6940
  - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!)
6418
6941
  device_check: NoCheck # TensorIterator
6419
6942
  variants: method
6420
6943
  dispatch:
6421
6944
  CPU, CUDA: cauchy_
6945
+ autogen: cauchy.functional, cauchy.out
6422
6946
 
6423
6947
  - func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!)
6424
6948
  device_check: NoCheck # TensorIterator
6425
6949
  variants: method
6426
6950
  dispatch:
6427
6951
  CPU, CUDA: log_normal_
6952
+ autogen: log_normal.functional, log_normal.out
6428
6953
 
6429
6954
  - func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!)
6430
6955
  device_check: NoCheck # TensorIterator
6431
6956
  variants: method
6432
6957
  dispatch:
6433
6958
  CPU, CUDA: exponential_
6959
+ autogen: exponential.functional, exponential.out
6434
6960
 
6435
6961
  - func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!)
6436
6962
  device_check: NoCheck # TensorIterator
@@ -6439,11 +6965,13 @@
6439
6965
  CPU, CUDA: geometric_
6440
6966
 
6441
6967
  # wrappers for TH functions
6968
+ autogen: geometric.functional, geometric.out
6442
6969
 
6443
6970
  - func: diag.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
6444
6971
  dispatch:
6445
6972
  CPU: diag_cpu_out
6446
6973
  CUDA: diag_cuda_out
6974
+ MPS: diag_mps_out
6447
6975
 
6448
6976
  - func: diag(Tensor self, int diagonal=0) -> Tensor
6449
6977
  variants: method, function
@@ -6465,6 +6993,7 @@
6465
6993
  dispatch:
6466
6994
  CPU: triu_cpu
6467
6995
  CUDA: triu_cuda
6996
+ MPS: triu_mps_out
6468
6997
 
6469
6998
  - func: triu(Tensor self, int diagonal=0) -> Tensor
6470
6999
  structured_delegate: triu.out
@@ -6475,6 +7004,7 @@
6475
7004
  dispatch:
6476
7005
  CPU: tril_cpu
6477
7006
  CUDA: tril_cuda
7007
+ MPS: tril_mps_out
6478
7008
 
6479
7009
  - func: tril(Tensor self, int diagonal=0) -> Tensor
6480
7010
  structured_delegate: tril.out
@@ -6507,6 +7037,7 @@
6507
7037
  device_check: NoCheck # TensorIterator
6508
7038
  dispatch:
6509
7039
  CPU, CUDA: ne_Scalar_out
7040
+ MPS: ne_scalar_out_mps
6510
7041
  QuantizedCPU: ne_out_quantized_cpu
6511
7042
 
6512
7043
  - func: ne.Scalar(Tensor self, Scalar other) -> Tensor
@@ -6522,6 +7053,7 @@
6522
7053
  device_check: NoCheck # TensorIterator
6523
7054
  dispatch:
6524
7055
  CPU, CUDA: ne_Tensor_out
7056
+ MPS: ne_tensor_out_mps
6525
7057
  QuantizedCPU: ne_out_quantized_cpu
6526
7058
 
6527
7059
  - func: ne.Tensor(Tensor self, Tensor other) -> Tensor
@@ -6568,6 +7100,7 @@
6568
7100
  device_check: NoCheck # TensorIterator
6569
7101
  dispatch:
6570
7102
  CPU, CUDA: eq_Scalar_out
7103
+ MPS: eq_scalar_out_mps
6571
7104
  QuantizedCPU: eq_out_quantized_cpu
6572
7105
 
6573
7106
  - func: eq.Scalar(Tensor self, Scalar other) -> Tensor
@@ -6583,6 +7116,7 @@
6583
7116
  device_check: NoCheck # TensorIterator
6584
7117
  dispatch:
6585
7118
  CPU, CUDA: eq_Tensor_out
7119
+ MPS: eq_tensor_out_mps
6586
7120
  QuantizedCPU: eq_out_quantized_cpu
6587
7121
 
6588
7122
  - func: eq.Tensor(Tensor self, Tensor other) -> Tensor
@@ -6598,6 +7132,7 @@
6598
7132
  device_check: NoCheck # TensorIterator
6599
7133
  dispatch:
6600
7134
  CPU, CUDA: ge_Scalar_out
7135
+ MPS: ge_scalar_out_mps
6601
7136
  QuantizedCPU: ge_out_quantized_cpu
6602
7137
 
6603
7138
  - func: ge.Scalar(Tensor self, Scalar other) -> Tensor
@@ -6613,6 +7148,7 @@
6613
7148
  device_check: NoCheck # TensorIterator
6614
7149
  dispatch:
6615
7150
  CPU, CUDA: ge_Tensor_out
7151
+ MPS: ge_tensor_out_mps
6616
7152
  QuantizedCPU: ge_out_quantized_cpu
6617
7153
 
6618
7154
  - func: ge.Tensor(Tensor self, Tensor other) -> Tensor
@@ -6659,6 +7195,7 @@
6659
7195
  device_check: NoCheck # TensorIterator
6660
7196
  dispatch:
6661
7197
  CPU, CUDA: le_Scalar_out
7198
+ MPS: le_scalar_out_mps
6662
7199
  QuantizedCPU: le_out_quantized_cpu
6663
7200
 
6664
7201
  - func: le.Scalar(Tensor self, Scalar other) -> Tensor
@@ -6674,6 +7211,7 @@
6674
7211
  device_check: NoCheck # TensorIterator
6675
7212
  dispatch:
6676
7213
  CPU, CUDA: le_Tensor_out
7214
+ MPS: le_tensor_out_mps
6677
7215
  QuantizedCPU: le_out_quantized_cpu
6678
7216
 
6679
7217
  - func: le.Tensor(Tensor self, Tensor other) -> Tensor
@@ -6720,6 +7258,7 @@
6720
7258
  device_check: NoCheck # TensorIterator
6721
7259
  dispatch:
6722
7260
  CPU, CUDA: gt_Scalar_out
7261
+ MPS: gt_scalar_out_mps
6723
7262
  QuantizedCPU: gt_out_quantized_cpu
6724
7263
 
6725
7264
  - func: gt.Scalar(Tensor self, Scalar other) -> Tensor
@@ -6735,6 +7274,7 @@
6735
7274
  device_check: NoCheck # TensorIterator
6736
7275
  dispatch:
6737
7276
  CPU, CUDA: gt_Tensor_out
7277
+ MPS: gt_tensor_out_mps
6738
7278
  QuantizedCPU: gt_out_quantized_cpu
6739
7279
 
6740
7280
  - func: gt.Tensor(Tensor self, Tensor other) -> Tensor
@@ -6781,6 +7321,7 @@
6781
7321
  device_check: NoCheck # TensorIterator
6782
7322
  dispatch:
6783
7323
  CPU, CUDA: lt_Scalar_out
7324
+ MPS: lt_scalar_out_mps
6784
7325
  QuantizedCPU: lt_out_quantized_cpu
6785
7326
 
6786
7327
  - func: lt.Scalar(Tensor self, Scalar other) -> Tensor
@@ -6796,6 +7337,7 @@
6796
7337
  device_check: NoCheck # TensorIterator
6797
7338
  dispatch:
6798
7339
  CPU, CUDA: lt_Tensor_out
7340
+ MPS: lt_tensor_out_mps
6799
7341
  QuantizedCPU: lt_out_quantized_cpu
6800
7342
 
6801
7343
  - func: lt.Tensor(Tensor self, Tensor other) -> Tensor
@@ -6854,15 +7396,18 @@
6854
7396
  dispatch:
6855
7397
  CPU, QuantizedCPU: index_select_out_cpu_
6856
7398
  CUDA, QuantizedCUDA: index_select_out_cuda
7399
+ MPS: index_select_out_mps
6857
7400
 
6858
7401
  - func: index_select(Tensor self, int dim, Tensor index) -> Tensor
6859
7402
  variants: method, function
6860
7403
  dispatch:
6861
7404
  CPU: index_select_cpu_
6862
7405
  QuantizedCPU: index_select_quantized_cpu_
6863
- CUDA, QuantizedCUDA: index_select_cuda
6864
- SparseCPU: index_select_sparse
6865
- SparseCUDA: index_select_sparse
7406
+ CUDA: index_select_cuda
7407
+ QuantizedCUDA: index_select_quantized_cuda
7408
+ SparseCPU: index_select_sparse_cpu
7409
+ SparseCUDA: index_select_sparse_cuda
7410
+ MPS: index_select_mps
6866
7411
 
6867
7412
  - func: index_select.dimname_out(Tensor self, Dimname dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!)
6868
7413
 
@@ -6911,6 +7456,7 @@
6911
7456
  structured: True
6912
7457
  dispatch:
6913
7458
  CPU, CUDA: gather_out
7459
+ MPS: gather_out_mps
6914
7460
 
6915
7461
  - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor
6916
7462
  variants: method, function
@@ -6934,6 +7480,7 @@
6934
7480
  device_check: NoCheck # TensorIterator
6935
7481
  dispatch:
6936
7482
  CPU, CUDA: addcmul_out
7483
+ MPS: addcmul_out_mps
6937
7484
 
6938
7485
  - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
6939
7486
  structured_delegate: addcmul.out
@@ -6951,6 +7498,7 @@
6951
7498
  device_check: NoCheck # TensorIterator
6952
7499
  dispatch:
6953
7500
  CPU, CUDA: addcdiv_out
7501
+ MPS: addcdiv_out_mps
6954
7502
 
6955
7503
  - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
6956
7504
  structured_delegate: addcdiv.out
@@ -6998,10 +7546,13 @@
6998
7546
 
6999
7547
  - func: linalg_solve_triangular(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False) -> Tensor
7000
7548
  python_module: linalg
7001
- variants: method, function
7549
+ variants: function
7002
7550
  dispatch:
7003
7551
  CPU, CUDA: linalg_solve_triangular
7004
7552
 
7553
+ - func: linalg_vander(Tensor x, *, int? N=None) -> Tensor
7554
+ python_module: linalg
7555
+
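
`linalg_vander` builds a Vandermonde matrix with increasing powers, with `N` defaulting to the input length. A short sketch, assuming the `torch.linalg.vander` Python binding:

    import torch

    x = torch.tensor([1.0, 2.0, 3.0])
    V = torch.linalg.vander(x, N=3)
    # row i is [1, x[i], x[i]**2]
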
7005
7556
  - func: symeig.e(Tensor self, bool eigenvectors=False, bool upper=True, *, Tensor(a!) e, Tensor(b!) V) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
7006
7557
  dispatch:
7007
7558
  CompositeExplicitAutograd: symeig_out
@@ -7079,21 +7630,6 @@
7079
7630
  CPU: _cholesky_solve_helper_cpu
7080
7631
  CUDA: _cholesky_solve_helper_cuda
7081
7632
 
7082
- - func: solve(Tensor self, Tensor A) -> (Tensor solution, Tensor LU)
7083
- variants: function, method
7084
- dispatch:
7085
- CompositeExplicitAutograd: solve
7086
-
7087
- - func: solve.solution(Tensor self, Tensor A, *, Tensor(a!) solution, Tensor(b!) lu) -> (Tensor(a!) solution, Tensor(b!) LU)
7088
- dispatch:
7089
- CompositeExplicitAutograd: solve_out
7090
-
7091
- - func: _solve_helper(Tensor self, Tensor A) -> (Tensor, Tensor)
7092
- variants: function
7093
- dispatch:
7094
- CPU: _solve_helper_cpu
7095
- CUDA: _solve_helper_cuda
7096
-
7097
7633
  - func: cholesky_inverse(Tensor self, bool upper=False) -> Tensor
7098
7634
  variants: method, function
7099
7635
  dispatch:
@@ -7144,13 +7680,14 @@
7144
7680
  dispatch:
7145
7681
  CPU, CUDA: lu_solve
7146
7682
 
7683
+ # lu_unpack
7147
7684
  - func: lu_unpack(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True) -> (Tensor P, Tensor L, Tensor U)
7685
+ structured_delegate: lu_unpack.out
7148
7686
  variants: function
7149
- dispatch:
7150
- CPU, CUDA: lu_unpack
7151
7687
 
7152
7688
  - func: lu_unpack.out(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True, *, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U)
7153
7689
  variants: function
7690
+ structured: True
7154
7691
  dispatch:
7155
7692
  CPU, CUDA: lu_unpack_out
7156
7693
 
@@ -7274,6 +7811,7 @@
7274
7811
  structured_inherits: TensorIteratorBase
7275
7812
  dispatch:
7276
7813
  CPU, CUDA: sign_out
7814
+ MPS: sign_out_mps
7277
7815
  SparseCPU, SparseCUDA: sign_sparse_out
7278
7816
  SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr_out
7279
7817
 
@@ -7305,6 +7843,7 @@
7305
7843
  structured_inherits: TensorIteratorBase
7306
7844
  dispatch:
7307
7845
  CPU, CUDA: atan2_out
7846
+ MPS: atan2_mps_out
7308
7847
 
7309
7848
  - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)
7310
7849
  device_check: NoCheck # TensorIterator
@@ -7391,6 +7930,12 @@
7391
7930
  dispatch:
7392
7931
  CPU: histogramdd_cpu
7393
7932
 
7933
+ - func: histogramdd(Tensor self, int[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)
7934
+
7935
+ - func: histogramdd.int_bins(Tensor self, int bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)
7936
+
7937
+ - func: histogramdd.TensorList_bins(Tensor self, Tensor[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)
7938
+
7394
7939
  - func: fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
7395
7940
  device_check: NoCheck # TensorIterator
7396
7941
  dispatch:
@@ -7528,6 +8073,7 @@
7528
8073
  variants: method, function
7529
8074
  dispatch:
7530
8075
  CPU, CUDA: min
8076
+ MPS: min_mps
7531
8077
  QuantizedCPU: min_quantized_cpu
7532
8078
 
7533
8079
  - func: fmin(Tensor self, Tensor other) -> Tensor
@@ -7547,6 +8093,7 @@
7547
8093
  variants: method, function
7548
8094
  dispatch:
7549
8095
  CPU, CUDA: max
8096
+ MPS: max_mps
7550
8097
  QuantizedCPU: max_quantized_cpu
7551
8098
 
7552
8099
  - func: fmax(Tensor self, Tensor other) -> Tensor
@@ -7572,6 +8119,7 @@
7572
8119
  device_check: NoCheck # TensorIterator
7573
8120
  dispatch:
7574
8121
  CPU, CUDA: maximum_out
8122
+ MPS: maximum_out_mps
7575
8123
 
7576
8124
  # binary max, alias of maximum
7577
8125
  # NOTE: max is not an alias for maximum, since there is also unary max
@@ -7593,6 +8141,7 @@
7593
8141
  device_check: NoCheck # TensorIterator
7594
8142
  dispatch:
7595
8143
  CPU, CUDA: minimum_out
8144
+ MPS: minimum_out_mps
7596
8145
 
7597
8146
  # binary min, alias for minimum
7598
8147
  # NOTE: min is not an alias for minimum, since there is also unary min
@@ -7626,27 +8175,23 @@
7626
8175
  - func: sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
7627
8176
  device_check: NoCheck # TensorIterator
7628
8177
  dispatch:
7629
- CPU: sort_out_cpu
7630
- CUDA: sort_out_cuda
8178
+ CompositeExplicitAutograd: sort_out
7631
8179
 
7632
8180
  - func: sort.values_stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
8181
+ structured: True
7633
8182
  dispatch:
7634
- CPU: sort_out_cpu_stable
7635
- CUDA: sort_out_stable_cuda
8183
+ CPU, CUDA: sort_stable_out
7636
8184
 
7637
8185
  - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
7638
8186
  device_check: NoCheck # TensorIterator
7639
8187
  variants: method, function
7640
8188
  dispatch:
7641
- CPU: sort_cpu
7642
- CUDA: sort_cuda
7643
- QuantizedCPU: sort_quantized_cpu
8189
+ CompositeExplicitAutograd: sort
7644
8190
 
7645
8191
  - func: sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
8192
+ structured_delegate: sort.values_stable
7646
8193
  variants: method, function
7647
8194
  dispatch:
7648
- CPU: sort_cpu_stable
7649
- CUDA: sort_stable_cuda
7650
8195
  QuantizedCPU: sort_quantized_cpu_stable
7651
8196
 
7652
8197
  - func: sort.dimname_values(Tensor self, Dimname dim, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
@@ -7676,6 +8221,7 @@
7676
8221
  dispatch:
7677
8222
  CPU: topk_out_cpu
7678
8223
  CUDA: topk_out_cuda
8224
+ MPS: topk_out_mps
7679
8225
 
7680
8226
  - func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
7681
8227
  variants: method, function
@@ -7693,6 +8239,7 @@
7693
8239
  structured: True
7694
8240
  dispatch:
7695
8241
  CPU, CUDA: all_all_out
8242
+ MPS: all_all_out_mps
7696
8243
 
7697
8244
  - func: any(Tensor self) -> Tensor
7698
8245
  device_check: NoCheck # TensorIterator
@@ -7706,6 +8253,7 @@
7706
8253
  structured: True
7707
8254
  dispatch:
7708
8255
  CPU, CUDA: any_all_out
8256
+ MPS: any_all_out_mps
7709
8257
 
7710
8258
  - func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!)
7711
8259
  device_check: NoCheck # TensorIterator
@@ -7728,7 +8276,7 @@
7728
8276
  device_check: NoCheck
7729
8277
  device_guard: False
7730
8278
  dispatch:
7731
- CPU, CUDA: unfold
8279
+ CPU, CUDA, Meta: unfold
7732
8280
  QuantizedCPU, QuantizedCUDA: unfold
7733
8281
 
7734
8282
  - func: unfold_backward(Tensor grad_in, int[] input_sizes, int dim, int size, int step) -> Tensor
@@ -7749,6 +8297,7 @@
7749
8297
  structured_inherits: TensorIteratorBase
7750
8298
  dispatch:
7751
8299
  CPU, CUDA: pow_Tensor_Tensor_out
8300
+ MPS: pow_tensor_tensor_out_mps
7752
8301
 
7753
8302
  - func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor
7754
8303
  device_check: NoCheck # TensorIterator
@@ -7772,6 +8321,7 @@
7772
8321
  dispatch:
7773
8322
  CPU, CUDA: pow_Tensor_Scalar_out
7774
8323
  SparseCPU, SparseCUDA: pow_out_sparse_scalar
8324
+ MPS: pow_tensor_scalar_out_mps
7775
8325
 
7776
8326
  - func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor
7777
8327
  device_check: NoCheck # TensorIterator
@@ -7815,32 +8365,46 @@
7815
8365
  variants: method
7816
8366
  dispatch:
7817
8367
  CPU, CUDA: normal_
8368
+ MPS: normal_mps_
7818
8369
  Meta: normal_meta_
7819
8370
  SparseCsrCPU, SparseCsrCUDA: normal_sparse_csr_
8371
+ autogen: normal.functional, normal.out
7820
8372
 
7821
8373
  - func: normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
7822
8374
  dispatch:
7823
8375
  CPU, CUDA: normal_out
8376
+ MPS: normal_mps_out
8377
+ Meta: normal_out_meta
7824
8378
 
7825
8379
  - func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor
7826
8380
  dispatch:
7827
8381
  CPU, CUDA: normal
8382
+ #MPS: normal_mps
8383
+ Meta: normal_meta
7828
8384
 
7829
8385
  - func: normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
7830
8386
  dispatch:
7831
8387
  CPU, CUDA: normal_out
8388
+ Meta: normal_out_meta
8389
+ MPS: normal_mps_out
7832
8390
 
7833
8391
  - func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor
7834
8392
  dispatch:
7835
8393
  CPU, CUDA: normal
8394
+ Meta: normal_meta
8395
+ #MPS: normal_mps
7836
8396
 
7837
8397
  - func: normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
7838
8398
  dispatch:
7839
8399
  CPU, CUDA: normal_out
8400
+ Meta: normal_out_meta
8401
+ MPS: normal_mps_out
7840
8402
 
7841
8403
  - func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor
7842
8404
  dispatch:
7843
8405
  CPU, CUDA: normal
8406
+ Meta: normal_meta
8407
+ #MPS: normal_mps
7844
8408
 
7845
8409
  - func: normal.float_float(float mean, float std, int[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
7846
8410
 
@@ -7851,32 +8415,30 @@
7851
8415
  dispatch:
7852
8416
  CompositeExplicitAutograd: alias
7853
8417
 
7854
- - func: _index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!)
7855
- dispatch:
7856
- CPU: _index_copy_impl_
7857
- CUDA: _index_copy_impl_
7858
-
7859
8418
  - func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> ()
7860
8419
  variants: function
7861
8420
  dispatch:
7862
8421
  CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_
8422
+ autogen: _amp_foreach_non_finite_check_and_unscale.functional, _amp_foreach_non_finite_check_and_unscale.out
7863
8423
 
7864
8424
  - func: _amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!)
7865
8425
  variants: function
7866
8426
  dispatch:
7867
8427
  CUDA: _amp_update_scale_cuda_
8428
+ autogen: _amp_update_scale.functional, _amp_update_scale.out
7868
8429
 
7869
- - func: _cat(Tensor[] tensors, int dim=0) -> Tensor
7870
- dispatch:
7871
- CPU: _cat_cpu
7872
- CUDA: cat_cuda
7873
- QuantizedCPU: cat_quantized_cpu
8430
+ #- func: _cat(Tensor[] tensors, int dim=0) -> Tensor
8431
+ #dispatch:
8432
+ #CPU: _cat_cpu
8433
+ #CUDA: cat_cuda
8434
+ #MPS: cat_mps
8435
+ #QuantizedCPU: cat_quantized_cpu
7874
8436
 
7875
- - func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
7876
- dispatch:
7877
- CPU: _cat_out_cpu
7878
- CUDA: cat_out_cuda
7879
- QuantizedCPU: cat_out_quantized_cpu
8437
+ #- func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
8438
+ #dispatch:
8439
+ #CPU: _cat_out_cpu
8440
+ #CUDA: cat_out_cuda
8441
+ #QuantizedCPU: cat_out_quantized_cpu
7880
8442
 
7881
8443
  - func: _foreach_add.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[]
7882
8444
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -7891,6 +8453,7 @@
7891
8453
  dispatch:
7892
8454
  CPU: foreach_tensor_add_scalar_kernel_slow_
7893
8455
  CUDA: foreach_tensor_add_scalar_kernel_cuda_
8456
+ autogen: _foreach_add.Scalar_functional, _foreach_add.Scalar_out
7894
8457
 
7895
8458
  - func: _foreach_sub.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[]
7896
8459
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -7905,6 +8468,7 @@
7905
8468
  dispatch:
7906
8469
  CPU: foreach_tensor_sub_scalar_kernel_slow_
7907
8470
  CUDA: foreach_tensor_sub_scalar_kernel_cuda_
8471
+ autogen: _foreach_sub.Scalar_functional, _foreach_sub.Scalar_out
7908
8472
 
7909
8473
  - func: _foreach_mul.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[]
7910
8474
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -7919,6 +8483,7 @@
7919
8483
  dispatch:
7920
8484
  CPU: foreach_tensor_mul_scalar_kernel_slow_
7921
8485
  CUDA: foreach_tensor_mul_scalar_kernel_cuda_
8486
+ autogen: _foreach_mul.Scalar_functional, _foreach_mul.Scalar_out
7922
8487
 
7923
8488
  - func: _foreach_div.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[]
7924
8489
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -7933,6 +8498,7 @@
7933
8498
  dispatch:
7934
8499
  CPU: foreach_tensor_div_scalar_kernel_slow_
7935
8500
  CUDA: foreach_tensor_div_scalar_kernel_cuda_
8501
+ autogen: _foreach_div.Scalar_functional, _foreach_div.Scalar_out
7936
8502
 
7937
8503
  - func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[]
7938
8504
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -7947,6 +8513,7 @@
7947
8513
  dispatch:
7948
8514
  CPU: foreach_tensor_add_list_kernel_slow_
7949
8515
  CUDA: foreach_tensor_add_list_kernel_cuda_
8516
+ autogen: _foreach_add.List_functional, _foreach_add.List_out
7950
8517
 
7951
8518
  - func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[]
7952
8519
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -7961,6 +8528,7 @@
7961
8528
  dispatch:
7962
8529
  CPU: foreach_tensor_sub_list_kernel_slow_
7963
8530
  CUDA: foreach_tensor_sub_list_kernel_cuda_
8531
+ autogen: _foreach_sub.List_functional, _foreach_sub.List_out
7964
8532
 
7965
8533
  - func: _foreach_mul.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[]
7966
8534
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -7975,6 +8543,7 @@
7975
8543
  dispatch:
7976
8544
  CPU: foreach_tensor_mul_list_kernel_slow_
7977
8545
  CUDA: foreach_tensor_mul_list_kernel_cuda_
8546
+ autogen: _foreach_mul.List_functional, _foreach_mul.List_out
7978
8547
 
7979
8548
  - func: _foreach_div.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[]
7980
8549
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -7989,6 +8558,7 @@
7989
8558
  dispatch:
7990
8559
  CPU: foreach_tensor_div_list_kernel_slow_
7991
8560
  CUDA: foreach_tensor_div_list_kernel_cuda_
8561
+ autogen: _foreach_div.List_functional, _foreach_div.List_out
7992
8562
 
7993
8563
  - func: _foreach_add.ScalarList(Tensor[] tensors, Scalar[] scalars) -> Tensor[]
7994
8564
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8003,6 +8573,7 @@
8003
8573
  dispatch:
8004
8574
  CPU: foreach_tensor_add_scalarlist_kernel_slow_
8005
8575
  CUDA: foreach_tensor_add_scalarlist_kernel_cuda_
8576
+ autogen: _foreach_add.ScalarList_functional, _foreach_add.ScalarList_out
8006
8577
 
8007
8578
  - func: _foreach_sub.ScalarList(Tensor[] tensors, Scalar[] scalars) -> Tensor[]
8008
8579
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8017,6 +8588,7 @@
8017
8588
  dispatch:
8018
8589
  CPU: foreach_tensor_sub_scalarlist_kernel_slow_
8019
8590
  CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_
8591
+ autogen: _foreach_sub.ScalarList_functional, _foreach_sub.ScalarList_out
8020
8592
 
8021
8593
  - func: _foreach_div.ScalarList(Tensor[] tensors, Scalar[] scalars) -> Tensor[]
8022
8594
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8031,6 +8603,7 @@
8031
8603
  dispatch:
8032
8604
  CPU: foreach_tensor_div_scalarlist_kernel_slow_
8033
8605
  CUDA: foreach_tensor_div_scalarlist_kernel_cuda_
8606
+ autogen: _foreach_div.ScalarList_functional, _foreach_div.ScalarList_out
8034
8607
 
8035
8608
  - func: _foreach_mul.ScalarList(Tensor[] tensors, Scalar[] scalars) -> Tensor[]
8036
8609
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8045,6 +8618,7 @@
8045
8618
  dispatch:
8046
8619
  CPU: foreach_tensor_mul_scalarlist_kernel_slow_
8047
8620
  CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_
8621
+ autogen: _foreach_mul.ScalarList_functional, _foreach_mul.ScalarList_out
8048
8622
 
8049
8623
  - func: _foreach_exp(Tensor[] tensors) -> Tensor[]
8050
8624
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8059,6 +8633,7 @@
8059
8633
  dispatch:
8060
8634
  CPU: foreach_tensor_zero_slow_
8061
8635
  CUDA: foreach_tensor_zero_cuda_
8636
+ autogen: _foreach_zero.functional, _foreach_zero.out
8062
8637
 
8063
8638
  - func: _foreach_exp_(Tensor(a!)[] self) -> ()
8064
8639
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8066,6 +8641,7 @@
8066
8641
  dispatch:
8067
8642
  CPU: foreach_tensor_exp_slow_
8068
8643
  CUDA: foreach_tensor_exp_cuda_
8644
+ autogen: _foreach_exp.functional, _foreach_exp.out
8069
8645
 
8070
8646
  - func: _foreach_sqrt(Tensor[] tensors) -> Tensor[]
8071
8647
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8080,6 +8656,7 @@
8080
8656
  dispatch:
8081
8657
  CPU: foreach_tensor_sqrt_slow_
8082
8658
  CUDA: foreach_tensor_sqrt_cuda_
8659
+ autogen: _foreach_sqrt.functional, _foreach_sqrt.out
8083
8660
 
8084
8661
  - func: _foreach_abs(Tensor[] tensors) -> Tensor[]
8085
8662
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8094,6 +8671,7 @@
8094
8671
  dispatch:
8095
8672
  CPU: foreach_tensor_abs_slow_
8096
8673
  CUDA: foreach_tensor_abs_cuda_
8674
+ autogen: _foreach_abs.functional, _foreach_abs.out
8097
8675
 
8098
8676
  - func: _foreach_acos(Tensor[] tensors) -> Tensor[]
8099
8677
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8108,6 +8686,7 @@
8108
8686
  dispatch:
8109
8687
  CPU: foreach_tensor_acos_slow_
8110
8688
  CUDA: foreach_tensor_acos_cuda_
8689
+ autogen: _foreach_acos.functional, _foreach_acos.out
8111
8690
 
8112
8691
  - func: _foreach_asin(Tensor[] tensors) -> Tensor[]
8113
8692
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8122,6 +8701,7 @@
8122
8701
  dispatch:
8123
8702
  CPU: foreach_tensor_asin_slow_
8124
8703
  CUDA: foreach_tensor_asin_cuda_
8704
+ autogen: _foreach_asin.functional, _foreach_asin.out
8125
8705
 
8126
8706
  - func: _foreach_atan(Tensor[] tensors) -> Tensor[]
8127
8707
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8136,6 +8716,7 @@
8136
8716
  dispatch:
8137
8717
  CPU: foreach_tensor_atan_slow_
8138
8718
  CUDA: foreach_tensor_atan_cuda_
8719
+ autogen: _foreach_atan.functional, _foreach_atan.out
8139
8720
 
8140
8721
  - func: _foreach_ceil(Tensor[] tensors) -> Tensor[]
8141
8722
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8150,6 +8731,7 @@
8150
8731
  dispatch:
8151
8732
  CPU: foreach_tensor_ceil_slow_
8152
8733
  CUDA: foreach_tensor_ceil_cuda_
8734
+ autogen: _foreach_ceil.functional, _foreach_ceil.out
8153
8735
 
8154
8736
  - func: _foreach_cos(Tensor[] tensors) -> Tensor[]
8155
8737
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8164,6 +8746,7 @@
8164
8746
  dispatch:
8165
8747
  CPU: foreach_tensor_cos_slow_
8166
8748
  CUDA: foreach_tensor_cos_cuda_
8749
+ autogen: _foreach_cos.functional, _foreach_cos.out
8167
8750
 
8168
8751
  - func: _foreach_cosh(Tensor[] tensors) -> Tensor[]
8169
8752
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8178,6 +8761,7 @@
8178
8761
  dispatch:
8179
8762
  CPU: foreach_tensor_cosh_slow_
8180
8763
  CUDA: foreach_tensor_cosh_cuda_
8764
+ autogen: _foreach_cosh.functional, _foreach_cosh.out
8181
8765
 
8182
8766
  - func: _foreach_erf(Tensor[] tensors) -> Tensor[]
8183
8767
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8192,6 +8776,7 @@
8192
8776
  dispatch:
8193
8777
  CPU: foreach_tensor_erf_slow_
8194
8778
  CUDA: foreach_tensor_erf_cuda_
8779
+ autogen: _foreach_erf.functional, _foreach_erf.out
8195
8780
 
8196
8781
  - func: _foreach_erfc(Tensor[] tensors) -> Tensor[]
8197
8782
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8206,6 +8791,7 @@
8206
8791
  dispatch:
8207
8792
  CPU: foreach_tensor_erfc_slow_
8208
8793
  CUDA: foreach_tensor_erfc_cuda_
8794
+ autogen: _foreach_erfc.functional, _foreach_erfc.out
8209
8795
 
8210
8796
  - func: _foreach_expm1(Tensor[] tensors) -> Tensor[]
8211
8797
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8220,6 +8806,7 @@
8220
8806
  dispatch:
8221
8807
  CPU: foreach_tensor_expm1_slow_
8222
8808
  CUDA: foreach_tensor_expm1_cuda_
8809
+ autogen: _foreach_expm1.functional, _foreach_expm1.out
8223
8810
 
8224
8811
  - func: _foreach_floor(Tensor[] tensors) -> Tensor[]
8225
8812
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8234,6 +8821,7 @@
8234
8821
  dispatch:
8235
8822
  CPU: foreach_tensor_floor_slow_
8236
8823
  CUDA: foreach_tensor_floor_cuda_
8824
+ autogen: _foreach_floor.functional, _foreach_floor.out
8237
8825
 
8238
8826
  - func: _foreach_log(Tensor[] tensors) -> Tensor[]
8239
8827
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8248,6 +8836,7 @@
8248
8836
  dispatch:
8249
8837
  CPU: foreach_tensor_log_slow_
8250
8838
  CUDA: foreach_tensor_log_cuda_
8839
+ autogen: _foreach_log.functional, _foreach_log.out
8251
8840
 
8252
8841
  - func: _foreach_log10(Tensor[] tensors) -> Tensor[]
8253
8842
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8262,6 +8851,7 @@
8262
8851
  dispatch:
8263
8852
  CPU: foreach_tensor_log10_slow_
8264
8853
  CUDA: foreach_tensor_log10_cuda_
8854
+ autogen: _foreach_log10.functional, _foreach_log10.out
8265
8855
 
8266
8856
  - func: _foreach_log1p(Tensor[] tensors) -> Tensor[]
8267
8857
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8276,6 +8866,7 @@
8276
8866
  dispatch:
8277
8867
  CPU: foreach_tensor_log1p_slow_
8278
8868
  CUDA: foreach_tensor_log1p_cuda_
8869
+ autogen: _foreach_log1p.functional, _foreach_log1p.out
8279
8870
 
8280
8871
  - func: _foreach_log2(Tensor[] tensors) -> Tensor[]
8281
8872
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8290,6 +8881,7 @@
8290
8881
  dispatch:
8291
8882
  CPU: foreach_tensor_log2_slow_
8292
8883
  CUDA: foreach_tensor_log2_cuda_
8884
+ autogen: _foreach_log2.functional, _foreach_log2.out
8293
8885
 
8294
8886
  - func: _foreach_neg(Tensor[] tensors) -> Tensor[]
8295
8887
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8304,6 +8896,7 @@
8304
8896
  dispatch:
8305
8897
  CPU: foreach_tensor_neg_slow_
8306
8898
  CUDA: foreach_tensor_neg_cuda_
8899
+ autogen: _foreach_neg.functional, _foreach_neg.out
8307
8900
 
8308
8901
  - func: _foreach_tan(Tensor[] tensors) -> Tensor[]
8309
8902
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8318,6 +8911,7 @@
8318
8911
  dispatch:
8319
8912
  CPU: foreach_tensor_tan_slow_
8320
8913
  CUDA: foreach_tensor_tan_cuda_
8914
+ autogen: _foreach_tan.functional, _foreach_tan.out
8321
8915
 
8322
8916
  - func: _foreach_tanh(Tensor[] tensors) -> Tensor[]
8323
8917
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8332,6 +8926,7 @@
8332
8926
  dispatch:
8333
8927
  CPU: foreach_tensor_tanh_slow_
8334
8928
  CUDA: foreach_tensor_tanh_cuda_
8929
+ autogen: _foreach_tanh.functional, _foreach_tanh.out
8335
8930
 
8336
8931
  - func: _foreach_sin(Tensor[] tensors) -> Tensor[]
8337
8932
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8346,6 +8941,7 @@
8346
8941
  dispatch:
8347
8942
  CPU: foreach_tensor_sin_slow_
8348
8943
  CUDA: foreach_tensor_sin_cuda_
8944
+ autogen: _foreach_sin.functional, _foreach_sin.out
8349
8945
 
8350
8946
  - func: _foreach_sinh(Tensor[] tensors) -> Tensor[]
8351
8947
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8360,6 +8956,7 @@
8360
8956
  dispatch:
8361
8957
  CPU: foreach_tensor_sinh_slow_
8362
8958
  CUDA: foreach_tensor_sinh_cuda_
8959
+ autogen: _foreach_sinh.functional, _foreach_sinh.out
8363
8960
 
8364
8961
  - func: _foreach_round(Tensor[] tensors) -> Tensor[]
8365
8962
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8374,6 +8971,7 @@
8374
8971
  dispatch:
8375
8972
  CPU: foreach_tensor_round_slow_
8376
8973
  CUDA: foreach_tensor_round_cuda_
8974
+ autogen: _foreach_round.functional, _foreach_round.out
8377
8975
 
8378
8976
  - func: _foreach_lgamma(Tensor[] tensors) -> Tensor[]
8379
8977
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8388,6 +8986,7 @@
8388
8986
  dispatch:
8389
8987
  CPU: foreach_tensor_lgamma_slow_
8390
8988
  CUDA: foreach_tensor_lgamma_cuda_
8989
+ autogen: _foreach_lgamma.functional, _foreach_lgamma.out
8391
8990
 
8392
8991
  - func: _foreach_frac(Tensor[] tensors) -> Tensor[]
8393
8992
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8402,6 +9001,7 @@
8402
9001
  dispatch:
8403
9002
  CPU: foreach_tensor_frac_slow_
8404
9003
  CUDA: foreach_tensor_frac_cuda_
9004
+ autogen: _foreach_frac.functional, _foreach_frac.out
8405
9005
 
8406
9006
  - func: _foreach_reciprocal(Tensor[] tensors) -> Tensor[]
8407
9007
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8416,6 +9016,7 @@
8416
9016
  dispatch:
8417
9017
  CPU: foreach_tensor_reciprocal_slow_
8418
9018
  CUDA: foreach_tensor_reciprocal_cuda_
9019
+ autogen: _foreach_reciprocal.functional, _foreach_reciprocal.out
8419
9020
 
8420
9021
  - func: _foreach_sigmoid(Tensor[] tensors) -> Tensor[]
8421
9022
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8430,6 +9031,7 @@
8430
9031
  dispatch:
8431
9032
  CPU: foreach_tensor_sigmoid_slow_
8432
9033
  CUDA: foreach_tensor_sigmoid_cuda_
9034
+ autogen: _foreach_sigmoid.functional, _foreach_sigmoid.out
8433
9035
 
8434
9036
  - func: _foreach_trunc(Tensor[] tensors) -> Tensor[]
8435
9037
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8444,6 +9046,7 @@
8444
9046
  dispatch:
8445
9047
  CPU: foreach_tensor_trunc_slow_
8446
9048
  CUDA: foreach_tensor_trunc_cuda_
9049
+ autogen: _foreach_trunc.functional, _foreach_trunc.out
8447
9050
 
8448
9051
  - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
8449
9052
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8451,6 +9054,7 @@
8451
9054
  dispatch:
8452
9055
  CPU: foreach_tensor_addcdiv_scalar_slow_
8453
9056
  CUDA: foreach_tensor_addcdiv_scalar_cuda_
9057
+ autogen: _foreach_addcdiv.Scalar_functional, _foreach_addcdiv.Scalar_out
8454
9058
 
8455
9059
  - func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
8456
9060
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8458,6 +9062,7 @@
8458
9062
  dispatch:
8459
9063
  CPU: foreach_tensor_addcmul_scalar_slow_
8460
9064
  CUDA: foreach_tensor_addcmul_scalar_cuda_
9065
+ autogen: _foreach_addcmul.Scalar_functional, _foreach_addcmul.Scalar_out
8461
9066
 
8462
9067
  - func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
8463
9068
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8465,6 +9070,7 @@
8465
9070
  dispatch:
8466
9071
  CPU: foreach_tensor_addcdiv_scalarlist_slow_
8467
9072
  CUDA: foreach_tensor_addcdiv_scalarlist_cuda_
9073
+ autogen: _foreach_addcdiv.ScalarList_functional, _foreach_addcdiv.ScalarList_out
8468
9074
 
8469
9075
  - func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
8470
9076
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8472,6 +9078,7 @@
8472
9078
  dispatch:
8473
9079
  CPU: foreach_tensor_addcmul_scalarlist_slow_
8474
9080
  CUDA: foreach_tensor_addcmul_scalarlist_cuda_
9081
+ autogen: _foreach_addcmul.ScalarList_functional, _foreach_addcmul.ScalarList_out
8475
9082
 
8476
9083
  - func: _foreach_addcdiv.Scalar(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
8477
9084
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -8584,25 +9191,29 @@
8584
9191
 
8585
9192
  - func: mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
8586
9193
  device_check: NoCheck # TensorIterator
9194
+ structured: True
9195
+ structured_inherits: TensorIteratorBase
8587
9196
  python_module: nn
8588
9197
  dispatch:
8589
9198
  CPU, CUDA: mse_loss_out
9199
+ MPS: mse_loss_out_mps
8590
9200
 
8591
9201
  - func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
8592
9202
  device_check: NoCheck # TensorIterator
9203
+ structured_delegate: mse_loss.out
8593
9204
  python_module: nn
8594
- dispatch:
8595
- CPU, CUDA: mse_loss
8596
9205
 
8597
9206
  - func: mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!)
8598
9207
  python_module: nn
8599
9208
  dispatch:
8600
9209
  CPU, CUDA: mse_loss_backward_out
9210
+ MPS: mse_loss_backward_out_mps
8601
9211
 
8602
9212
  - func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor
8603
9213
  python_module: nn
8604
9214
  dispatch:
8605
9215
  CPU, CUDA: mse_loss_backward
9216
+ MPS: mse_loss_backward_mps
8606
9217
 
8607
9218
  - func: l1_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
8608
9219
  python_module: nn
@@ -8693,6 +9304,7 @@
8693
9304
  dispatch:
8694
9305
  CPU: nll_loss_forward_out_cpu
8695
9306
  CUDA: nll_loss_forward_out_cuda
9307
+ MPS: nll_loss_forward_out_mps
8696
9308
 
8697
9309
  - func: nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight)
8698
9310
  python_module: nn
@@ -8704,6 +9316,7 @@
8704
9316
  dispatch:
8705
9317
  CPU: nll_loss_backward_out_cpu
8706
9318
  CUDA: nll_loss_backward_out_cuda
9319
+ MPS: nll_loss_backward_out_mps
8707
9320
 
8708
9321
  - func: nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor
8709
9322
  python_module: nn
@@ -8720,24 +9333,28 @@
8720
9333
  dispatch:
8721
9334
  CPU: nll_loss2d_forward_out_cpu
8722
9335
  CUDA: nll_loss2d_forward_out_cuda
9336
+ MPS: nll_loss2d_forward_out_mps
8723
9337
 
8724
9338
  - func: nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight)
8725
9339
  python_module: nn
8726
9340
  dispatch:
8727
9341
  CPU: nll_loss2d_forward_cpu
8728
9342
  CUDA: nll_loss2d_forward_cuda
9343
+ MPS: nll_loss2d_forward_mps
8729
9344
 
8730
9345
  - func: nll_loss2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!)
8731
9346
  python_module: nn
8732
9347
  dispatch:
8733
9348
  CPU: nll_loss2d_backward_out_cpu
8734
9349
  CUDA: nll_loss2d_backward_out_cuda
9350
+ MPS: nll_loss2d_backward_out_mps
8735
9351
 
8736
9352
  - func: nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor
8737
9353
  python_module: nn
8738
9354
  dispatch:
8739
9355
  CPU: nll_loss2d_backward_cpu
8740
9356
  CUDA: nll_loss2d_backward_cuda
9357
+ MPS: nll_loss2d_backward_mps
8741
9358
 
8742
9359
  - func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!)
8743
9360
  device_check: NoCheck # TensorIterator
@@ -8746,6 +9363,7 @@
8746
9363
  python_module: nn
8747
9364
  dispatch:
8748
9365
  CPU, CUDA: smooth_l1_loss_out
9366
+ MPS: smooth_l1_loss_out_mps
8749
9367
 
8750
9368
  - func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor
8751
9369
  device_check: NoCheck # TensorIterator
@@ -8757,6 +9375,7 @@
8757
9375
  dispatch:
8758
9376
  CPU: smooth_l1_loss_backward_out
8759
9377
  CUDA: smooth_l1_loss_backward_out
9378
+ MPS: smooth_l1_loss_backward_out_mps
8760
9379
 
8761
9380
  - func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor
8762
9381
  python_module: nn
@@ -8810,6 +9429,7 @@
8810
9429
  python_module: nn
8811
9430
  dispatch:
8812
9431
  CPU, CUDA: elu_out
9432
+ MPS: elu_out_mps
8813
9433
 
8814
9434
  - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor
8815
9435
  structured_delegate: elu.out
@@ -8822,6 +9442,7 @@
8822
9442
  python_module: nn
8823
9443
  dispatch:
8824
9444
  CPU, CUDA: elu_backward_out
9445
+ MPS: elu_backward_out_mps
8825
9446
 
8826
9447
  - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor
8827
9448
  structured_delegate: elu_backward.grad_input
@@ -8858,6 +9479,16 @@
8858
9479
  CPU: glu_backward_cpu
8859
9480
  CUDA: glu_backward_cuda
8860
9481
 
9482
+ - func: glu_jvp(Tensor glu, Tensor x, Tensor dx, int dim) -> Tensor
9483
+ python_module: nn
9484
+ dispatch:
9485
+ CPU, CUDA: glu_jvp
9486
+
9487
+ - func: glu_backward_jvp(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim) -> Tensor
9488
+ python_module: nn
9489
+ dispatch:
9490
+ CPU, CUDA: glu_backward_jvp
9491
+
8861
9492
  - func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
8862
9493
  structured: True
8863
9494
  structured_inherits: TensorIteratorBase
@@ -8894,31 +9525,33 @@
8894
9525
  device_check: NoCheck # TensorIterator
8895
9526
  python_module: nn
8896
9527
  dispatch:
8897
- CPU, CUDA: hardtanh_out
9528
+ CPU, CUDA, MPS: hardtanh_out
8898
9529
  QuantizedCPU: hardtanh_out_quantized_cpu
8899
9530
 
8900
9531
  - func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor
8901
9532
  device_check: NoCheck # TensorIterator
8902
9533
  python_module: nn
8903
9534
  dispatch:
8904
- CPU, CUDA: hardtanh
9535
+ CPU, CUDA, MPS: hardtanh
8905
9536
  QuantizedCPU: hardtanh_quantized_cpu
8906
9537
 
8907
9538
  - func: hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!)
8908
9539
  python_module: nn
8909
9540
  dispatch:
8910
9541
  CPU, CUDA: hardtanh_backward_out
9542
+ MPS: hardtanh_backward_out_mps
8911
9543
 
8912
9544
  - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor
8913
9545
  python_module: nn
8914
9546
  dispatch:
8915
9547
  CPU, CUDA: hardtanh_backward
9548
+ MPS: hardtanh_backward_mps
8916
9549
 
8917
9550
  - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!)
8918
9551
  device_check: NoCheck # TensorIterator
8919
9552
  python_module: nn
8920
9553
  dispatch:
8921
- CPU, CUDA: hardtanh_
9554
+ CPU, CUDA, MPS: hardtanh_
8922
9555
  QuantizedCPU: hardtanh_quantized_cpu_
8923
9556
 
8924
9557
  - func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -8951,6 +9584,7 @@
8951
9584
  python_module: nn
8952
9585
  dispatch:
8953
9586
  CPU, CUDA: leaky_relu_out
9587
+ MPS: leaky_relu_out_mps
8954
9588
  QuantizedCPU: leaky_relu_out_quantized_cpu
8955
9589
 
8956
9590
  - func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor
@@ -8966,6 +9600,7 @@
8966
9600
  python_module: nn
8967
9601
  dispatch:
8968
9602
  CPU, CUDA: leaky_relu_backward_out
9603
+ MPS: leaky_relu_backward_out_mps
8969
9604
 
8970
9605
  - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor
8971
9606
  structured_delegate: leaky_relu_backward.grad_input
@@ -9088,6 +9723,7 @@
9088
9723
  dispatch:
9089
9724
  CPU: adaptive_avg_pool2d_out_cpu
9090
9725
  CUDA: adaptive_avg_pool2d_out_cuda
9726
+ MPS: adaptive_avg_pool2d_out_mps
9091
9727
  MkldnnCPU: mkldnn_adaptive_avg_pool2d_out
9092
9728
 
9093
9729
  - func: adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor
@@ -9105,13 +9741,16 @@
9105
9741
  dispatch:
9106
9742
  CPU: adaptive_avg_pool2d_cpu
9107
9743
  CUDA: adaptive_avg_pool2d_cuda
9744
+ MPS: adaptive_avg_pool2d_mps
9108
9745
  QuantizedCPU: adaptive_avg_pool2d_quantized_cpu
9746
+ QuantizedCUDA: adaptive_avg_pool2d_quantized_cuda
9109
9747
 
9110
9748
  - func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor
9111
9749
  python_module: nn
9112
9750
  dispatch:
9113
9751
  CPU: adaptive_avg_pool2d_backward_cpu
9114
9752
  CUDA: adaptive_avg_pool2d_backward_cuda
9753
+ MPS: adaptive_avg_pool2d_backward_mps
9115
9754
 
9116
9755
  - func: adaptive_avg_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out) -> Tensor(a!)
9117
9756
  python_module: nn
@@ -9148,6 +9787,7 @@
9148
9787
  dispatch:
9149
9788
  CPU: adaptive_max_pool2d_out_cpu
9150
9789
  CUDA: adaptive_max_pool2d_out_cuda
9790
+ MPS: adaptive_max_pool2d_out_mps
9151
9791
 
9152
9792
  # Return: (Tensor output, Tensor indices)
9153
9793
  - func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor)
@@ -9160,6 +9800,7 @@
9160
9800
  dispatch:
9161
9801
  CPU: adaptive_max_pool2d_backward_out_cpu
9162
9802
  CUDA: adaptive_max_pool2d_backward_out_cuda
9803
+ MPS: adaptive_max_pool2d_backward_out_mps
9163
9804
 
9164
9805
  - func: adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor
9165
9806
  python_module: nn
@@ -9199,6 +9840,7 @@
9199
9840
  dispatch:
9200
9841
  CPU: avg_pool2d_out_cpu
9201
9842
  CUDA: avg_pool2d_out_cuda
9843
+ MPS: avg_pool2d_out_mps
9202
9844
  MkldnnCPU: mkldnn_avg_pool2d_out
9203
9845
 
9204
9846
  - func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor
@@ -9214,6 +9856,7 @@
9214
9856
  dispatch:
9215
9857
  CPU: avg_pool2d_backward_out_cpu
9216
9858
  CUDA: avg_pool2d_backward_out_cuda
9859
+ MPS: avg_pool2d_backward_out_mps
9217
9860
  MkldnnCPU: mkldnn_avg_pool2d_backward_out
9218
9861
 
9219
9862
  - func: avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor
@@ -9282,6 +9925,7 @@
9282
9925
  precomputed:
9283
9926
  - kernel_size -> int poolSizeT, int poolSizeH, int poolSizeW
9284
9927
  - output_size -> int outputT, int outputH, int outputW
9928
+ - int numBatch, int numPlanes, int inputT, int inputH, int inputW
9285
9929
  dispatch:
9286
9930
  CPU: fractional_max_pool3d_out_cpu
9287
9931
  CUDA: fractional_max_pool3d_out_cuda
@@ -9310,6 +9954,7 @@
9310
9954
  dispatch:
9311
9955
  CPU: max_pool2d_with_indices_out_cpu
9312
9956
  CUDA: max_pool2d_with_indices_out_cuda
9957
+ MPS: max_pool2d_with_indices_out_mps
9313
9958
 
9314
9959
  # Return: (Tensor output, Tensor indices)
9315
9960
  - func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
@@ -9322,6 +9967,7 @@
9322
9967
  dispatch:
9323
9968
  CPU: max_pool2d_with_indices_backward_out_cpu
9324
9969
  CUDA: max_pool2d_with_indices_backward_out_cuda
9970
+ MPS: max_pool2d_with_indices_backward_out_mps
9325
9971
 
9326
9972
  - func: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor
9327
9973
  python_module: nn
@@ -9365,18 +10011,6 @@
9365
10011
  CPU: max_unpooling2d_forward_cpu
9366
10012
  CUDA: max_unpooling2d_forward_cuda
9367
10013
 
9368
- - func: max_unpool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) grad_input) -> Tensor(a!)
9369
- python_module: nn
9370
- dispatch:
9371
- CPU: max_unpooling2d_backward_out_cpu
9372
- CUDA: max_unpooling2d_backward_out_cuda
9373
-
9374
- - func: max_unpool2d_backward(Tensor grad_output, Tensor self, Tensor indices, int[2] output_size) -> Tensor
9375
- python_module: nn
9376
- dispatch:
9377
- CPU: max_unpooling2d_backward_cpu
9378
- CUDA: max_unpooling2d_backward_cuda
9379
-
9380
10014
  - func: max_unpool3d.out(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!)
9381
10015
  python_module: nn
9382
10016
  dispatch:
@@ -9389,30 +10023,18 @@
9389
10023
  CPU: max_unpooling3d_forward_cpu
9390
10024
  CUDA: max_unpooling3d_forward_cuda
9391
10025
 
9392
- - func: max_unpool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
9393
- python_module: nn
9394
- dispatch:
9395
- CPU: max_unpooling3d_backward_out_cpu
9396
- CUDA: max_unpooling3d_backward_out_cuda
9397
-
9398
- - func: max_unpool3d_backward(Tensor grad_output, Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor
9399
- python_module: nn
9400
- dispatch:
9401
- CPU: max_unpooling3d_backward_cpu
9402
- CUDA: max_unpooling3d_backward_cuda
9403
-
9404
10026
  - func: reflection_pad1d.out(Tensor self, int[2] padding, *, Tensor(a!) out) -> Tensor(a!)
9405
10027
  python_module: nn
9406
10028
  structured: True
9407
10029
  dispatch:
9408
- CPU, QuantizedCPU: reflection_pad1d_out_cpu
10030
+ CPU: reflection_pad1d_out_cpu
10031
+ QuantizedCPU: reflection_pad1d_out_quantized_cpu
9409
10032
  CUDA: reflection_pad1d_out_cuda
10033
+ MPS: reflection_pad1d_out_mps
9410
10034
 
9411
10035
  - func: reflection_pad1d(Tensor self, int[2] padding) -> Tensor
9412
10036
  python_module: nn
9413
10037
  structured_delegate: reflection_pad1d.out
9414
- dispatch:
9415
- QuantizedCPU: reflection_pad1d_cpu
9416
10038
 
9417
10039
  - func: reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, int[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
9418
10040
  python_module: nn
@@ -9420,6 +10042,7 @@
9420
10042
  dispatch:
9421
10043
  CPU: reflection_pad1d_backward_out_cpu
9422
10044
  CUDA: reflection_pad1d_backward_out_cuda
10045
+ MPS: reflection_pad1d_backward_out_mps
9423
10046
 
9424
10047
  - func: reflection_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor
9425
10048
  python_module: nn
@@ -9430,24 +10053,29 @@
9430
10053
  dispatch:
9431
10054
  CPU, QuantizedCPU: reflection_pad2d_out_cpu
9432
10055
  CUDA: reflection_pad2d_out_cuda
10056
+ MPS: reflection_pad2d_out_mps
9433
10057
 
9434
10058
  - func: reflection_pad2d(Tensor self, int[4] padding) -> Tensor
9435
10059
  python_module: nn
9436
10060
  dispatch:
9437
- CPU, QuantizedCPU: reflection_pad2d_cpu
10061
+ CPU: reflection_pad2d_cpu
10062
+ QuantizedCPU: reflection_pad2d_quantized_cpu
9438
10063
  CUDA: reflection_pad2d_cuda
10064
+ MPS: reflection_pad2d_mps
9439
10065
 
9440
10066
  - func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
9441
10067
  python_module: nn
9442
10068
  dispatch:
9443
10069
  CPU: reflection_pad2d_backward_out_cpu
9444
10070
  CUDA: reflection_pad2d_backward_out_cuda
10071
+ MPS: reflection_pad2d_backward_out_mps
9445
10072
 
9446
10073
  - func: reflection_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor
9447
10074
  python_module: nn
9448
10075
  dispatch:
9449
10076
  CPU: reflection_pad2d_backward_cpu
9450
10077
  CUDA: reflection_pad2d_backward_cuda
10078
+ MPS: reflection_pad2d_backward_mps
9451
10079
 
9452
10080
  - func: reflection_pad3d.out(Tensor self, int[6] padding, *, Tensor(a!) out) -> Tensor(a!)
9453
10081
  python_module: nn
@@ -9455,6 +10083,7 @@
9455
10083
  dispatch:
9456
10084
  CPU: reflection_pad3d_out_cpu
9457
10085
  CUDA: reflection_pad3d_out_cuda
10086
+ MPS: reflection_pad3d_out_mps
9458
10087
 
9459
10088
  - func: reflection_pad3d(Tensor self, int[6] padding) -> Tensor
9460
10089
  python_module: nn
@@ -9466,6 +10095,7 @@
9466
10095
  dispatch:
9467
10096
  CPU: reflection_pad3d_backward_out_cpu
9468
10097
  CUDA: reflection_pad3d_backward_out_cuda
10098
+ MPS: reflection_pad3d_backward_out_mps
9469
10099
 
9470
10100
  - func: reflection_pad3d_backward(Tensor grad_output, Tensor self, int[6] padding) -> Tensor
9471
10101
  python_module: nn
@@ -9477,6 +10107,7 @@
9477
10107
  dispatch:
9478
10108
  CPU: replication_pad1d_out_cpu
9479
10109
  CUDA: replication_pad1d_out_cuda
10110
+ MPS: replication_pad1d_out_mps
9480
10111
 
9481
10112
  - func: replication_pad1d(Tensor self, int[2] padding) -> Tensor
9482
10113
  python_module: nn
@@ -9488,6 +10119,7 @@
9488
10119
  dispatch:
9489
10120
  CPU: replication_pad1d_backward_out_cpu
9490
10121
  CUDA: replication_pad1d_backward_out_cuda
10122
+ MPS: replication_pad1d_backward_out_mps
9491
10123
 
9492
10124
  - func: replication_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor
9493
10125
  python_module: nn
@@ -9499,6 +10131,7 @@
9499
10131
  dispatch:
9500
10132
  CPU: replication_pad2d_out_cpu
9501
10133
  CUDA: replication_pad2d_out_cuda
10134
+ MPS: replication_pad2d_out_mps
9502
10135
 
9503
10136
  - func: replication_pad2d(Tensor self, int[4] padding) -> Tensor
9504
10137
  python_module: nn
@@ -9509,12 +10142,14 @@
9509
10142
  dispatch:
9510
10143
  CPU: replication_pad2d_backward_out_cpu
9511
10144
  CUDA: replication_pad2d_backward_out_cuda
10145
+ MPS: replication_pad2d_backward_out_mps
9512
10146
 
9513
10147
  - func: replication_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor
9514
10148
  python_module: nn
9515
10149
  dispatch:
9516
10150
  CPU: replication_pad2d_backward_cpu
9517
10151
  CUDA: replication_pad2d_backward_cuda
10152
+ MPS: replication_pad2d_backward_mps
9518
10153
 
9519
10154
  - func: replication_pad3d.out(Tensor self, int[6] padding, *, Tensor(a!) out) -> Tensor(a!)
9520
10155
  python_module: nn
@@ -9522,6 +10157,7 @@
9522
10157
  dispatch:
9523
10158
  CPU: replication_pad3d_out_cpu
9524
10159
  CUDA: replication_pad3d_out_cuda
10160
+ MPS: replication_pad3d_out_mps
9525
10161
 
9526
10162
  - func: replication_pad3d(Tensor self, int[6] padding) -> Tensor
9527
10163
  python_module: nn
@@ -9532,12 +10168,23 @@
9532
10168
  dispatch:
9533
10169
  CPU: replication_pad3d_backward_out_cpu
9534
10170
  CUDA: replication_pad3d_backward_out_cuda
10171
+ MPS: replication_pad3d_backward_out_mps
9535
10172
 
9536
10173
  - func: replication_pad3d_backward(Tensor grad_output, Tensor self, int[6] padding) -> Tensor
9537
10174
  python_module: nn
9538
10175
  dispatch:
9539
10176
  CPU: replication_pad3d_backward_cpu
9540
10177
  CUDA: replication_pad3d_backward_cuda
10178
+ MPS: replication_pad3d_backward_mps
10179
+
10180
+ - func: _pad_circular(Tensor self, int[] pad) -> Tensor
10181
+ python_module: nn
10182
+
10183
+ - func: _pad_enum(Tensor self, int[] pad, int mode, float? value=None) -> Tensor
10184
+ python_module: nn
10185
+
10186
+ - func: pad(Tensor self, int[] pad, str mode="constant", float? value=None) -> Tensor
10187
+ python_module: nn
9541
10188
 
9542
10189
  - func: upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
9543
10190
  python_module: nn
@@ -9694,6 +10341,7 @@
9694
10341
  dispatch:
9695
10342
  CPU: upsample_bilinear2d_out_cpu
9696
10343
  CUDA: upsample_bilinear2d_out_cuda
10344
+ MPS: upsample_bilinear2d_out_mps
9697
10345
 
9698
10346
  - func: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
9699
10347
  python_module: nn
@@ -9707,6 +10355,7 @@
9707
10355
  dispatch:
9708
10356
  CPU: upsample_bilinear2d_backward_out_cpu
9709
10357
  CUDA: upsample_bilinear2d_backward_out_cuda
10358
+ MPS: upsample_bilinear2d_backward_out_mps
9710
10359
 
9711
10360
  - func: upsample_bilinear2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
9712
10361
  python_module: nn
@@ -9850,6 +10499,7 @@
9850
10499
  dispatch:
9851
10500
  CPU: upsample_nearest2d_out_cpu
9852
10501
  CUDA: upsample_nearest2d_out_cuda
10502
+ MPS: upsample_nearest2d_out_mps
9853
10503
 
9854
10504
  - func: _upsample_nearest_exact2d.out(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
9855
10505
  python_module: nn
@@ -9857,6 +10507,7 @@
9857
10507
  dispatch:
9858
10508
  CPU: _upsample_nearest_exact2d_out_cpu
9859
10509
  CUDA: _upsample_nearest_exact2d_out_cuda
10510
+ MPS: _upsample_nearest_exact2d_out_mps
9860
10511
 
9861
10512
  - func: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor
9862
10513
  python_module: nn
@@ -9876,6 +10527,7 @@
9876
10527
  dispatch:
9877
10528
  CPU: upsample_nearest2d_backward_out_cpu
9878
10529
  CUDA: upsample_nearest2d_backward_out_cuda
10530
+ MPS: upsample_nearest2d_backward_out_mps
9879
10531
 
9880
10532
  - func: _upsample_nearest_exact2d_backward.grad_input(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
9881
10533
  python_module: nn
@@ -9883,6 +10535,7 @@
9883
10535
  dispatch:
9884
10536
  CPU: _upsample_nearest_exact2d_backward_out_cpu
9885
10537
  CUDA: _upsample_nearest_exact2d_backward_out_cuda
10538
+ MPS: _upsample_nearest_exact2d_backward_out_mps
9886
10539
 
9887
10540
  - func: upsample_nearest2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor
9888
10541
  python_module: nn
@@ -9946,6 +10599,7 @@
9946
10599
  structured_inherits: TensorIteratorBase
9947
10600
  dispatch:
9948
10601
  CPU, CUDA: sigmoid_backward_out
10602
+ MPS: sigmoid_backward_out_mps
9949
10603
 
9950
10604
  - func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor
9951
10605
  python_module: nn
@@ -9968,6 +10622,7 @@
9968
10622
  structured_inherits: TensorIteratorBase
9969
10623
  dispatch:
9970
10624
  CPU, CUDA: tanh_backward_out
10625
+ MPS: tanh_backward_out_mps
9971
10626
 
9972
10627
  - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor
9973
10628
  python_module: nn
@@ -10233,6 +10888,19 @@
10233
10888
  dispatch:
10234
10889
  CPU, CUDA: special_ndtri_out
10235
10890
 
10891
+ - func: special_log_ndtr(Tensor self) -> Tensor
10892
+ structured_delegate: special_log_ndtr.out
10893
+ python_module: special
10894
+ variants: function
10895
+
10896
+ - func: special_log_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
10897
+ structured: True
10898
+ structured_inherits: TensorIteratorBase
10899
+ python_module: special
10900
+ variants: function
10901
+ dispatch:
10902
+ CPU, CUDA: special_log_ndtr_out
10903
+
10236
10904
  - func: special_expm1(Tensor self) -> Tensor
10237
10905
  python_module: special
10238
10906
  variants: function
@@ -10486,7 +11154,7 @@
10486
11154
 
10487
11155
  - func: special_polygamma(int n, Tensor self) -> Tensor
10488
11156
  python_module: special
10489
- variants: function, method
11157
+ variants: function
10490
11158
 
10491
11159
  - func: special_polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
10492
11160
  python_module: special
@@ -10782,11 +11450,15 @@
10782
11450
  - func: linalg_cross(Tensor self, Tensor other, *, int dim=-1) -> Tensor
10783
11451
  python_module: linalg
10784
11452
  variants: function
11453
+ structured_delegate: linalg_cross.out
10785
11454
  dispatch:
10786
- CPU, CUDA: linalg_cross
11455
+ ZeroTensor: linalg_cross_zerotensor
10787
11456
 
10788
11457
  - func: linalg_cross.out(Tensor self, Tensor other, *, int dim=-1, Tensor(a!) out) -> Tensor(a!)
10789
11458
  python_module: linalg
11459
+ structured: True
11460
+ precomputed:
11461
+ - dim -> int dim
10790
11462
  dispatch:
10791
11463
  CPU, CUDA: linalg_cross_out
10792
11464
 
@@ -10811,6 +11483,20 @@
10811
11483
  dispatch:
10812
11484
  CPU, CUDA: linalg_lu_factor_ex_out
10813
11485
 
11486
+ # linalg.lu
11487
+ - func: linalg_lu(Tensor A, *, bool pivot=True) -> (Tensor P, Tensor L, Tensor U)
11488
+ python_module: linalg
11489
+ structured_delegate: linalg_lu.out
11490
+ variants: function
11491
+
11492
+ - func: linalg_lu.out(Tensor A, *, bool pivot=True, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U)
11493
+ python_module: linalg
11494
+ variants: function
11495
+ structured: True
11496
+ dispatch:
11497
+ CPU, CUDA: linalg_lu_out
11498
+
11499
+ # linalg.det
10814
11500
  - func: linalg_det(Tensor self) -> Tensor
10815
11501
  python_module: linalg
10816
11502
  variants: function
@@ -10832,6 +11518,38 @@
10832
11518
  dispatch:
10833
11519
  CPU, CUDA: _det_lu_based_helper_backward_helper
10834
11520
 
11521
+ - func: linalg_ldl_factor_ex(Tensor self, *, bool hermitian=False, bool check_errors=False) -> (Tensor LD, Tensor pivots, Tensor info)
11522
+ structured_delegate: linalg_ldl_factor_ex.out
11523
+ python_module: linalg
11524
+ variants: function
11525
+
11526
+ - func: linalg_ldl_factor_ex.out(Tensor self, *, bool hermitian=False, bool check_errors=False, Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info)
11527
+ structured: True
11528
+ python_module: linalg
11529
+ variants: function
11530
+ dispatch:
11531
+ CPU, CUDA: linalg_ldl_factor_ex_out
11532
+
11533
+ - func: linalg_ldl_factor(Tensor self, *, bool hermitian=False) -> (Tensor LD, Tensor pivots)
11534
+ python_module: linalg
11535
+ variants: function
11536
+
11537
+ - func: linalg_ldl_factor.out(Tensor self, *, bool hermitian=False, Tensor(a!) LD, Tensor(b!) pivots) -> (Tensor(a!) LD, Tensor(b!) pivots)
11538
+ python_module: linalg
11539
+ variants: function
11540
+
11541
+ - func: linalg_ldl_solve(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False) -> Tensor
11542
+ structured_delegate: linalg_ldl_solve.out
11543
+ python_module: linalg
11544
+ variants: function
11545
+
11546
+ - func: linalg_ldl_solve.out(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)
11547
+ structured: True
11548
+ python_module: linalg
11549
+ variants: function
11550
+ dispatch:
11551
+ CPU, CUDA: linalg_ldl_solve_out
11552
+
10835
11553
  - func: linalg_lstsq(Tensor self, Tensor b, float? rcond=None, *, str? driver=None) -> (Tensor solution, Tensor residuals, Tensor rank, Tensor singular_values)
10836
11554
  python_module: linalg
10837
11555
  variants: function
@@ -10901,7 +11619,7 @@
10901
11619
  python_module: linalg
10902
11620
  variants: function
10903
11621
 
10904
- - func: linalg_eigvalsh.out(Tensor self, str UPLO='L', *, Tensor(a!) out) -> Tensor(a!)
11622
+ - func: linalg_eigvalsh.out(Tensor self, str UPLO="L", *, Tensor(a!) out) -> Tensor(a!)
10905
11623
  python_module: linalg
10906
11624
  dispatch:
10907
11625
  CPU, CUDA: linalg_eigvalsh_out
@@ -10922,6 +11640,7 @@
10922
11640
  dispatch:
10923
11641
  CPU: _linalg_inv_out_helper_cpu
10924
11642
  CUDA: _linalg_inv_out_helper_cuda
11643
+ autogen: _linalg_inv_out_helper.functional, _linalg_inv_out_helper.out
10925
11644
 
10926
11645
  - func: linalg_inv_ex(Tensor self, *, bool check_errors=False) -> (Tensor inverse, Tensor info)
10927
11646
  python_module: linalg
@@ -10978,11 +11697,11 @@
10978
11697
  - func: linalg_vector_norm(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
10979
11698
  python_module: linalg
10980
11699
  variants: function
10981
- dispatch:
10982
- CPU, CUDA: linalg_vector_norm
11700
+ structured_delegate: linalg_vector_norm.out
10983
11701
 
10984
11702
  - func: linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
10985
11703
  python_module: linalg
11704
+ structured: True
10986
11705
  dispatch:
10987
11706
  CPU, CUDA: linalg_vector_norm_out
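
linalg_vector_norm is now a structured operator: the out= kernel carries the implementation and the functional form delegates to it, so the user-facing behaviour is unchanged. For reference:

    import torch

    x = torch.arange(6, dtype=torch.float32).reshape(2, 3)

    # ord=2 over the flattened input by default; dim restricts the reduction.
    print(torch.linalg.vector_norm(x))                 # 2-norm of all six elements
    print(torch.linalg.vector_norm(x, ord=1, dim=1))   # per-row L1 norms
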
10988
11707
 
@@ -11106,13 +11825,13 @@
11106
11825
  python_module: linalg
11107
11826
  variants: function
11108
11827
 
11109
- - func: linalg_qr(Tensor self, str mode='reduced') -> (Tensor Q, Tensor R)
11828
+ - func: linalg_qr(Tensor A, str mode='reduced') -> (Tensor Q, Tensor R)
11110
11829
  python_module: linalg
11111
11830
  variants: function
11112
11831
  dispatch:
11113
11832
  CompositeExplicitAutograd: linalg_qr
11114
11833
 
11115
- - func: linalg_qr.out(Tensor self, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)
11834
+ - func: linalg_qr.out(Tensor A, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)
11116
11835
  python_module: linalg
11117
11836
  variants: function
11118
11837
  dispatch:
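
linalg_qr and linalg_qr.out only rename the first argument from self to A in the schema; the decomposition itself is unchanged. For reference, in the default reduced mode Q has orthonormal columns and Q @ R reconstructs the input:

    import torch

    A = torch.randn(5, 3)
    Q, R = torch.linalg.qr(A, mode='reduced')

    assert torch.allclose(Q @ R, A, atol=1e-5)
    assert torch.allclose(Q.T @ Q, torch.eye(3), atol=1e-5)
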
@@ -11232,3 +11951,447 @@
11232
11951
  - func: unflatten_dense_tensors(Tensor flat, Tensor[] tensors) -> Tensor[]
11233
11952
  variants: function
11234
11953
  python_module: nn
11954
+
11955
+ - func: nested_tensor(Tensor[] list, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
11956
+ variants: function
11957
+
11958
+ - func: _fw_primal_copy(Tensor self, int level) -> Tensor
11959
+ variants: function
11960
+ dispatch:
11961
+ CompositeExplicitAutograd: _fw_primal_copy
11962
+ tags: view_copy
11963
+
11964
+ - func: _make_dual_copy(Tensor primal, Tensor tangent, int level) -> Tensor
11965
+ variants: function
11966
+ dispatch:
11967
+ CompositeExplicitAutograd: _make_dual_copy
11968
+ tags: view_copy
11969
+
11970
+ - func: view_as_real_copy(Tensor self) -> Tensor
11971
+ variants: function
11972
+ dispatch:
11973
+ CompositeExplicitAutograd: view_as_real_copy
11974
+ tags: view_copy
11975
+
11976
+ - func: view_as_complex_copy(Tensor self) -> Tensor
11977
+ variants: function
11978
+ dispatch:
11979
+ CompositeExplicitAutograd: view_as_complex_copy
11980
+ tags: view_copy
11981
+
11982
+ - func: _conj_copy(Tensor self) -> Tensor
11983
+ variants: function
11984
+ dispatch:
11985
+ CompositeExplicitAutograd: _conj_copy
11986
+ tags: view_copy
11987
+
11988
+ - func: _neg_view_copy(Tensor self) -> Tensor
11989
+ variants: function
11990
+ dispatch:
11991
+ CompositeExplicitAutograd: _neg_view_copy
11992
+ tags: view_copy
11993
+
11994
+ - func: as_strided_copy(Tensor self, int[] size, int[] stride, int? storage_offset=None) -> Tensor
11995
+ variants: function
11996
+ dispatch:
11997
+ CompositeExplicitAutograd: as_strided_copy
11998
+ tags: view_copy
11999
+
12000
+ - func: _sparse_broadcast_to_copy(Tensor self, int[] size) -> Tensor
12001
+ variants: function
12002
+ dispatch:
12003
+ CompositeExplicitAutograd: _sparse_broadcast_to_copy
12004
+ tags: view_copy
12005
+
12006
+ - func: diagonal_copy(Tensor self, int offset=0, int dim1=0, int dim2=1) -> Tensor
12007
+ variants: function
12008
+ dispatch:
12009
+ CompositeExplicitAutograd: diagonal_copy
12010
+ tags: view_copy
12011
+
12012
+ - func: expand_copy(Tensor self, int[] size, *, bool implicit=False) -> Tensor
12013
+ variants: function
12014
+ dispatch:
12015
+ CompositeExplicitAutograd: expand_copy
12016
+ tags: view_copy
12017
+
12018
+ - func: expand_copy.SymInt(Tensor self, SymInt[] size, *, bool implicit=False) -> Tensor
12019
+ variants: function
12020
+ dispatch:
12021
+ CompositeExplicitAutograd: expand_copy_SymInt
12022
+ tags: view_copy
12023
+
12024
+ - func: permute_copy(Tensor self, int[] dims) -> Tensor
12025
+ variants: function
12026
+ dispatch:
12027
+ CompositeExplicitAutograd: permute_copy
12028
+ tags: view_copy
12029
+
12030
+ - func: _reshape_alias_copy(Tensor self, int[] size, int[] stride) -> Tensor
12031
+ variants: function
12032
+ dispatch:
12033
+ CompositeExplicitAutograd: _reshape_alias_copy
12034
+ tags: view_copy
12035
+
12036
+ - func: select_copy.int(Tensor self, int dim, int index) -> Tensor
12037
+ variants: function
12038
+ dispatch:
12039
+ CompositeExplicitAutograd: select_copy_int
12040
+ tags: view_copy
12041
+
12042
+ - func: detach_copy(Tensor self) -> Tensor
12043
+ variants: function
12044
+ dispatch:
12045
+ CompositeExplicitAutograd: detach_copy
12046
+ tags: view_copy
12047
+
12048
+ - func: slice_copy.Tensor(Tensor self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor
12049
+ variants: function
12050
+ dispatch:
12051
+ CompositeExplicitAutograd: slice_copy_Tensor
12052
+ tags: view_copy
12053
+
12054
+ - func: split_copy.Tensor(Tensor self, int split_size, int dim=0) -> Tensor[]
12055
+ variants: function
12056
+ dispatch:
12057
+ CompositeExplicitAutograd: split_copy_Tensor
12058
+ tags: view_copy
12059
+
12060
+ - func: split_with_sizes_copy(Tensor self, int[] split_sizes, int dim=0) -> Tensor[]
12061
+ variants: function
12062
+ dispatch:
12063
+ CompositeExplicitAutograd: split_with_sizes_copy
12064
+ tags: view_copy
12065
+
12066
+ - func: squeeze_copy(Tensor self) -> Tensor
12067
+ variants: function
12068
+ dispatch:
12069
+ CompositeExplicitAutograd: squeeze_copy
12070
+ tags: view_copy
12071
+
12072
+ - func: squeeze_copy.dim(Tensor self, int dim) -> Tensor
12073
+ variants: function
12074
+ dispatch:
12075
+ CompositeExplicitAutograd: squeeze_copy_dim
12076
+ tags: view_copy
12077
+
12078
+ - func: t_copy(Tensor self) -> Tensor
12079
+ variants: function
12080
+ dispatch:
12081
+ CompositeExplicitAutograd: t_copy
12082
+ tags: view_copy
12083
+
12084
+ - func: transpose_copy.int(Tensor self, int dim0, int dim1) -> Tensor
12085
+ variants: function
12086
+ dispatch:
12087
+ CompositeExplicitAutograd: transpose_copy_int
12088
+ tags: view_copy
12089
+
12090
+ - func: unsqueeze_copy(Tensor self, int dim) -> Tensor
12091
+ variants: function
12092
+ dispatch:
12093
+ CompositeExplicitAutograd: unsqueeze_copy
12094
+ tags: view_copy
12095
+
12096
+ - func: _indices_copy(Tensor self) -> Tensor
12097
+ variants: function
12098
+ dispatch:
12099
+ CompositeExplicitAutograd: _indices_copy
12100
+ tags: view_copy
12101
+
12102
+ - func: _values_copy(Tensor self) -> Tensor
12103
+ variants: function
12104
+ dispatch:
12105
+ CompositeExplicitAutograd: _values_copy
12106
+ tags: view_copy
12107
+
12108
+ - func: indices_copy(Tensor self) -> Tensor
12109
+ variants: function
12110
+ dispatch:
12111
+ CompositeExplicitAutograd: indices_copy
12112
+ tags: view_copy
12113
+
12114
+ - func: values_copy(Tensor self) -> Tensor
12115
+ variants: function
12116
+ dispatch:
12117
+ CompositeExplicitAutograd: values_copy
12118
+ tags: view_copy
12119
+
12120
+ - func: crow_indices_copy(Tensor self) -> Tensor
12121
+ variants: function
12122
+ dispatch:
12123
+ CompositeExplicitAutograd: crow_indices_copy
12124
+ tags: view_copy
12125
+
12126
+ - func: col_indices_copy(Tensor self) -> Tensor
12127
+ variants: function
12128
+ dispatch:
12129
+ CompositeExplicitAutograd: col_indices_copy
12130
+ tags: view_copy
12131
+
12132
+ - func: ccol_indices_copy(Tensor self) -> Tensor
12133
+ variants: function
12134
+ dispatch:
12135
+ CompositeExplicitAutograd: ccol_indices_copy
12136
+ tags: view_copy
12137
+
12138
+ - func: row_indices_copy(Tensor self) -> Tensor
12139
+ variants: function
12140
+ dispatch:
12141
+ CompositeExplicitAutograd: row_indices_copy
12142
+ tags: view_copy
12143
+
12144
+ - func: unbind_copy.int(Tensor self, int dim=0) -> Tensor[]
12145
+ variants: function
12146
+ dispatch:
12147
+ CompositeExplicitAutograd: unbind_copy_int
12148
+ tags: view_copy
12149
+
12150
+ - func: view_copy(Tensor self, int[] size) -> Tensor
12151
+ variants: function
12152
+ dispatch:
12153
+ CompositeExplicitAutograd: view_copy
12154
+ tags: view_copy
12155
+
12156
+ - func: view_copy.dtype(Tensor self, ScalarType dtype) -> Tensor
12157
+ variants: function
12158
+ dispatch:
12159
+ CompositeExplicitAutograd: view_copy_dtype
12160
+ tags: view_copy
12161
+
12162
+ - func: unfold_copy(Tensor self, int dimension, int size, int step) -> Tensor
12163
+ variants: function
12164
+ dispatch:
12165
+ CompositeExplicitAutograd: unfold_copy
12166
+ tags: view_copy
12167
+
12168
+ - func: alias_copy(Tensor self) -> Tensor
12169
+ variants: function
12170
+ dispatch:
12171
+ CompositeExplicitAutograd: alias_copy
12172
+ tags: view_copy
12173
+
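
Each *_copy entry above is the out-of-place counterpart of an existing view operation: same result values, but freshly allocated storage rather than an alias of the input, which is what functionalization and export-style backends rely on (hence the view_copy tag). These ops are mainly internal, so the sketch below illustrates the contract with a plain view plus clone instead of calling them directly:

    import torch

    x = torch.arange(6).reshape(2, 3)

    # A regular view aliases the original storage ...
    v = x.transpose(0, 1)
    assert v.data_ptr() == x.data_ptr()

    # ... while a *_copy op behaves like view-then-clone:
    # identical values, independent storage.
    c = x.transpose(0, 1).clone()
    assert torch.equal(c, v)
    assert c.data_ptr() != x.data_ptr()
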
12174
+ - func: _fw_primal_copy.out(Tensor self, int level, *, Tensor(a!) out) -> Tensor(a!)
12175
+ variants: function
12176
+ dispatch:
12177
+ CompositeExplicitAutograd: _fw_primal_copy_out
12178
+
12179
+
12180
+ - func: _make_dual_copy.out(Tensor primal, Tensor tangent, int level, *, Tensor(a!) out) -> Tensor(a!)
12181
+ variants: function
12182
+ dispatch:
12183
+ CompositeExplicitAutograd: _make_dual_copy_out
12184
+
12185
+
12186
+ - func: view_as_real_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12187
+ variants: function
12188
+ dispatch:
12189
+ CompositeExplicitAutograd: view_as_real_copy_out
12190
+
12191
+
12192
+ - func: view_as_complex_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12193
+ variants: function
12194
+ dispatch:
12195
+ CompositeExplicitAutograd: view_as_complex_copy_out
12196
+
12197
+
12198
+ - func: _conj_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12199
+ variants: function
12200
+ dispatch:
12201
+ CompositeExplicitAutograd: _conj_copy_out
12202
+
12203
+
12204
+ - func: _neg_view_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12205
+ variants: function
12206
+ dispatch:
12207
+ CompositeExplicitAutograd: _neg_view_copy_out
12208
+
12209
+
12210
+ - func: as_strided_copy.out(Tensor self, int[] size, int[] stride, int? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!)
12211
+ variants: function
12212
+ dispatch:
12213
+ CompositeExplicitAutograd: as_strided_copy_out
12214
+
12215
+
12216
+ - func: _sparse_broadcast_to_copy.out(Tensor self, int[] size, *, Tensor(a!) out) -> Tensor(a!)
12217
+ variants: function
12218
+ dispatch:
12219
+ CompositeExplicitAutograd: _sparse_broadcast_to_copy_out
12220
+
12221
+
12222
+ - func: diagonal_copy.out(Tensor self, int offset=0, int dim1=0, int dim2=1, *, Tensor(a!) out) -> Tensor(a!)
12223
+ variants: function
12224
+ dispatch:
12225
+ CompositeExplicitAutograd: diagonal_copy_out
12226
+
12227
+
12228
+ - func: expand_copy.SymInt_out(Tensor self, SymInt[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!)
12229
+ variants: function
12230
+ dispatch:
12231
+ CompositeExplicitAutograd: expand_copy_SymInt_out
12232
+
12233
+
12234
+ - func: expand_copy.out(Tensor self, int[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!)
12235
+ variants: function
12236
+ dispatch:
12237
+ CompositeExplicitAutograd: expand_copy_out
12238
+
12239
+
12240
+ - func: permute_copy.out(Tensor self, int[] dims, *, Tensor(a!) out) -> Tensor(a!)
12241
+ variants: function
12242
+ dispatch:
12243
+ CompositeExplicitAutograd: permute_copy_out
12244
+
12245
+
12246
+ - func: _reshape_alias_copy.out(Tensor self, int[] size, int[] stride, *, Tensor(a!) out) -> Tensor(a!)
12247
+ variants: function
12248
+ dispatch:
12249
+ CompositeExplicitAutograd: _reshape_alias_copy_out
12250
+
12251
+
12252
+ - func: select_copy.int_out(Tensor self, int dim, int index, *, Tensor(a!) out) -> Tensor(a!)
12253
+ variants: function
12254
+ dispatch:
12255
+ CompositeExplicitAutograd: select_copy_int_out
12256
+
12257
+
12258
+ - func: detach_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12259
+ variants: function
12260
+ dispatch:
12261
+ CompositeExplicitAutograd: detach_copy_out
12262
+
12263
+
12264
+ - func: slice_copy.Tensor_out(Tensor self, int dim=0, int? start=None, int? end=None, int step=1, *, Tensor(a!) out) -> Tensor(a!)
12265
+ variants: function
12266
+ dispatch:
12267
+ CompositeExplicitAutograd: slice_copy_Tensor_out
12268
+
12269
+
12270
+ - func: split_copy.Tensor_out(Tensor self, int split_size, int dim=0, *, Tensor(a!)[] out) -> ()
12271
+ variants: function
12272
+ dispatch:
12273
+ CompositeExplicitAutograd: split_copy_Tensor_out
12274
+
12275
+
12276
+ - func: split_with_sizes_copy.out(Tensor self, int[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()
12277
+ variants: function
12278
+ dispatch:
12279
+ CompositeExplicitAutograd: split_with_sizes_copy_out
12280
+
12281
+
12282
+ - func: squeeze_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12283
+ variants: function
12284
+ dispatch:
12285
+ CompositeExplicitAutograd: squeeze_copy_out
12286
+
12287
+
12288
+ - func: squeeze_copy.dim_out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
12289
+ variants: function
12290
+ dispatch:
12291
+ CompositeExplicitAutograd: squeeze_copy_dim_out
12292
+
12293
+
12294
+ - func: t_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12295
+ variants: function
12296
+ dispatch:
12297
+ CompositeExplicitAutograd: t_copy_out
12298
+
12299
+
12300
+ - func: transpose_copy.int_out(Tensor self, int dim0, int dim1, *, Tensor(a!) out) -> Tensor(a!)
12301
+ variants: function
12302
+ dispatch:
12303
+ CompositeExplicitAutograd: transpose_copy_int_out
12304
+
12305
+
12306
+ - func: unsqueeze_copy.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
12307
+ variants: function
12308
+ dispatch:
12309
+ CompositeExplicitAutograd: unsqueeze_copy_out
12310
+
12311
+
12312
+ - func: _indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12313
+ variants: function
12314
+ dispatch:
12315
+ CompositeExplicitAutograd: _indices_copy_out
12316
+
12317
+
12318
+ - func: _values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12319
+ variants: function
12320
+ dispatch:
12321
+ CompositeExplicitAutograd: _values_copy_out
12322
+
12323
+
12324
+ - func: indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12325
+ variants: function
12326
+ dispatch:
12327
+ CompositeExplicitAutograd: indices_copy_out
12328
+
12329
+
12330
+ - func: values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12331
+ variants: function
12332
+ dispatch:
12333
+ CompositeExplicitAutograd: values_copy_out
12334
+
12335
+
12336
+ - func: crow_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12337
+ variants: function
12338
+ dispatch:
12339
+ CompositeExplicitAutograd: crow_indices_copy_out
12340
+
12341
+
12342
+ - func: col_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12343
+ variants: function
12344
+ dispatch:
12345
+ CompositeExplicitAutograd: col_indices_copy_out
12346
+
12347
+
12348
+ - func: unbind_copy.int_out(Tensor self, int dim=0, *, Tensor(a!)[] out) -> ()
12349
+ variants: function
12350
+ dispatch:
12351
+ CompositeExplicitAutograd: unbind_copy_int_out
12352
+
12353
+
12354
+ - func: view_copy.out(Tensor self, int[] size, *, Tensor(a!) out) -> Tensor(a!)
12355
+ variants: function
12356
+ dispatch:
12357
+ CompositeExplicitAutograd: view_copy_out
12358
+
12359
+
12360
+ - func: view_copy.dtype_out(Tensor self, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
12361
+ variants: function
12362
+ dispatch:
12363
+ CompositeExplicitAutograd: view_copy_dtype_out
12364
+
12365
+
12366
+ - func: unfold_copy.out(Tensor self, int dimension, int size, int step, *, Tensor(a!) out) -> Tensor(a!)
12367
+ variants: function
12368
+ dispatch:
12369
+ CompositeExplicitAutograd: unfold_copy_out
12370
+
12371
+
12372
+ - func: alias_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
12373
+ variants: function
12374
+ dispatch:
12375
+ CompositeExplicitAutograd: alias_copy_out
12376
+
12377
+ - func: to_padded_tensor(Tensor self, float padding, int[]? output_size=None) -> Tensor
12378
+ variants: method
12379
+ dispatch:
12380
+ NestedTensorCPU: NestedTensor_to_padded_tensor_generic
12381
+ NestedTensorCUDA: NestedTensor_to_padded_tensor_cuda
12382
+
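
nested_tensor builds a single tensor from a list of variably sized tensors without padding, and to_padded_tensor materialises it back into a dense, padded layout. The Python surface for this was still a prototype at the time and has since settled under torch.nested; a sketch assuming the current torch.nested namespace:

    import torch

    # Two sequences of different lengths, packed without padding.
    a = torch.randn(2, 4)
    b = torch.randn(3, 4)
    nt = torch.nested.nested_tensor([a, b])

    # Pad the shorter sequence with zeros to get a dense (2, 3, 4) tensor.
    padded = torch.nested.to_padded_tensor(nt, 0.0)
    print(padded.shape)  # torch.Size([2, 3, 4])
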
12383
+ - func: _nested_tensor_layer_norm(Tensor self, Tensor? weight, Tensor? bias, float eps) -> Tensor
12384
+ variants: method
12385
+ dispatch:
12386
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_layer_norm
12387
+
12388
+ # Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is.
12389
+ - func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None) -> Tensor
12390
+ variants: function
12391
+ dispatch:
12392
+ CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: transformer_encoder_layer_forward
12393
+
12394
+ - func: _native_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True) -> (Tensor, Tensor)
12395
+ variants: function
12396
+ dispatch:
12397
+ CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: native_multi_head_attention
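
_transformer_encoder_layer_fwd and _native_multi_head_attention are the fused "fast path" kernels used internally by torch.nn.TransformerEncoderLayer and torch.nn.MultiheadAttention; they are not meant to be called directly. A sketch of the module-level path that can dispatch to them (inference, batch_first inputs, no attention-weight output), with the caveat that whether the fused kernel is actually taken depends on the build and the inputs:

    import torch
    import torch.nn as nn

    layer = nn.TransformerEncoderLayer(d_model=64, nhead=4, batch_first=True)
    layer.eval()

    src = torch.randn(8, 16, 64)  # (batch, seq, feature)
    with torch.no_grad():
        out = layer(src)          # may hit the fused encoder-layer kernel
    print(out.shape)              # torch.Size([8, 16, 64])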