torch-rb 0.20.0 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -288,13 +288,13 @@
288
288
  dispatch:
289
289
  CPU: native_dropout_cpu
290
290
  CUDA: native_dropout_cuda
291
- NestedTensorCPU, NestedTensorCUDA: native_dropout_nested
291
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: native_dropout_nested
292
292
  tags: [nondeterministic_seeded, core]
293
293
  autogen: native_dropout.out
294
294
 
295
295
  - func: native_dropout_backward(Tensor grad_output, Tensor mask, float scale) -> Tensor
296
296
  dispatch:
297
- CPU, NestedTensorCPU, NestedTensorCUDA: native_dropout_backward
297
+ CPU, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: native_dropout_backward
298
298
  CUDA: native_dropout_backward_cuda
299
299
  autogen: native_dropout_backward.out
300
300
  tags: pointwise
@@ -342,7 +342,7 @@
342
342
  CompositeExplicitAutograd: abs
343
343
  SparseCPU, SparseCUDA: abs_sparse
344
344
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr
345
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs
345
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs
346
346
  tags: [core, pointwise]
347
347
 
348
348
  - func: abs_(Tensor(a!) self) -> Tensor(a!)
@@ -352,13 +352,12 @@
352
352
  CompositeExplicitAutograd: abs_
353
353
  SparseCPU, SparseCUDA: abs_sparse_
354
354
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_
355
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs_
355
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs_
356
356
 
357
357
  - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
358
358
  device_check: NoCheck # TensorIterator
359
359
  dispatch:
360
- CPU, CUDA: abs_out
361
- MPS: abs_out_mps
360
+ CPU, CUDA, MPS: abs_out
362
361
  SparseCPU, SparseCUDA: abs_sparse_out
363
362
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_out
364
363
  tags: pointwise
@@ -431,7 +430,7 @@
431
430
  dispatch:
432
431
  SparseCPU, SparseCUDA: sgn_sparse
433
432
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr
434
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn
433
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn
435
434
  tags: pointwise
436
435
 
437
436
  - func: sgn_(Tensor(a!) self) -> Tensor(a!)
@@ -440,7 +439,7 @@
440
439
  dispatch:
441
440
  SparseCPU, SparseCUDA: sgn_sparse_
442
441
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_
443
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn_
442
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn_
444
443
  tags: pointwise
445
444
 
446
445
  - func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -527,8 +526,7 @@
527
526
  structured: True
528
527
  structured_inherits: TensorIteratorBase
529
528
  dispatch:
530
- CPU, CUDA: acos_out
531
- MPS: acos_out_mps
529
+ CPU, CUDA, MPS: acos_out
532
530
  tags: pointwise
533
531
 
534
532
  # arccos, alias of acos
@@ -560,7 +558,7 @@
560
558
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr
561
559
  MkldnnCPU: mkldnn_add
562
560
  ZeroTensor: add_zerotensor
563
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor
561
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_add_Tensor
564
562
  tags: [core, pointwise]
565
563
 
566
564
  - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
@@ -571,7 +569,7 @@
571
569
  SparseCPU, SparseCUDA, SparseMeta: add_sparse_
572
570
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_
573
571
  MkldnnCPU: mkldnn_add_
574
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor
572
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_add__Tensor
575
573
  tags: pointwise
576
574
 
577
575
  - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
@@ -703,7 +701,7 @@
703
701
  structured_delegate: all.out
704
702
  variants: function, method
705
703
  dispatch:
706
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_all
704
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_all
707
705
 
708
706
 
709
707
  - func: all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
@@ -942,7 +940,7 @@
942
940
  - func: as_strided(Tensor(a) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a)
943
941
  variants: function, method
944
942
  dispatch:
945
- ZeroTensor, CPU, CUDA: as_strided_tensorimpl
943
+ ZeroTensor, CPU, CUDA, MTIA: as_strided_tensorimpl
946
944
  Meta: as_strided_tensorimpl_meta_symint
947
945
  MPS: as_strided_tensorimpl_mps
948
946
  QuantizedCPU, QuantizedCUDA: as_strided_qtensorimpl
@@ -982,8 +980,7 @@
982
980
  structured: True
983
981
  structured_inherits: TensorIteratorBase
984
982
  dispatch:
985
- CPU, CUDA: asin_out
986
- MPS: asin_out_mps
983
+ CPU, CUDA, MPS: asin_out
987
984
  SparseCPU, SparseCUDA: asin_sparse_out
988
985
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_out
989
986
  tags: pointwise
@@ -1020,8 +1017,7 @@
1020
1017
  structured: True
1021
1018
  structured_inherits: TensorIteratorBase
1022
1019
  dispatch:
1023
- CPU, CUDA: atan_out
1024
- MPS: atan_out_mps
1020
+ CPU, CUDA, MPS: atan_out
1025
1021
  SparseCPU, SparseCUDA: atan_sparse_out
1026
1022
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_out
1027
1023
  tags: pointwise
@@ -1073,6 +1069,16 @@
1073
1069
  XPU: baddbmm_out_xpu
1074
1070
  SparseCsrCUDA: baddbmm_out_sparse_csr_cuda
1075
1071
 
1072
+ - func: baddbmm.dtype(Tensor self, Tensor batch1, Tensor batch2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1) -> Tensor
1073
+ variants: function
1074
+ dispatch:
1075
+ CUDA: _baddbmm_dtype_cuda
1076
+
1077
+ - func: baddbmm.dtype_out(Tensor self, Tensor batch1, Tensor batch2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
1078
+ variants: function
1079
+ dispatch:
1080
+ CUDA: _baddbmm_out_dtype_cuda
1081
+
1076
1082
  - func: bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
1077
1083
  dispatch:
1078
1084
  CompositeExplicitAutograd: bartlett_window
@@ -1185,7 +1191,7 @@
1185
1191
  CompositeExplicitAutograd: binary_cross_entropy_with_logits
1186
1192
  autogen: binary_cross_entropy_with_logits.out
1187
1193
 
1188
- - func: bincount(Tensor self, Tensor? weights=None, int minlength=0) -> Tensor
1194
+ - func: bincount(Tensor self, Tensor? weights=None, SymInt minlength=0) -> Tensor
1189
1195
  variants: function, method
1190
1196
  dispatch:
1191
1197
  CPU: _bincount_cpu
@@ -1211,8 +1217,7 @@
1211
1217
  structured: True
1212
1218
  structured_inherits: TensorIteratorBase
1213
1219
  dispatch:
1214
- CPU, CUDA: bitwise_not_out
1215
- MPS: bitwise_not_out_mps
1220
+ CPU, CUDA, MPS, MTIA: bitwise_not_out
1216
1221
  tags: pointwise
1217
1222
 
1218
1223
  - func: copysign.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -1262,7 +1267,7 @@
1262
1267
  variants: function, method
1263
1268
  dispatch:
1264
1269
  CompositeExplicitAutograd: logical_not
1265
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not
1270
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_logical_not
1266
1271
  tags: [core, pointwise]
1267
1272
 
1268
1273
  - func: logical_not_(Tensor(a!) self) -> Tensor(a!)
@@ -1270,7 +1275,7 @@
1270
1275
  variants: method
1271
1276
  dispatch:
1272
1277
  CompositeExplicitAutograd: logical_not_
1273
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not_
1278
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_logical_not_
1274
1279
  tags: pointwise
1275
1280
 
1276
1281
  - func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -1318,7 +1323,7 @@
1318
1323
  - func: logical_and.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
1319
1324
  device_check: NoCheck # TensorIterator
1320
1325
  dispatch:
1321
- CPU, CUDA: logical_and_out
1326
+ CPU, CUDA, MTIA: logical_and_out
1322
1327
  MPS: logical_and_out_mps
1323
1328
  tags: pointwise
1324
1329
 
@@ -1339,7 +1344,7 @@
1339
1344
  - func: logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
1340
1345
  device_check: NoCheck # TensorIterator
1341
1346
  dispatch:
1342
- CPU, CUDA: logical_or_out
1347
+ CPU, CUDA, MTIA: logical_or_out
1343
1348
  MPS: logical_or_out_mps
1344
1349
  tags: pointwise
1345
1350
 
@@ -1375,6 +1380,16 @@
1375
1380
  SparseCUDA: bmm_out_sparse_cuda
1376
1381
  SparseCsrCUDA: bmm_out_sparse_csr_cuda
1377
1382
 
1383
+ - func: bmm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor
1384
+ variants: function
1385
+ dispatch:
1386
+ CUDA: _bmm_dtype_cuda
1387
+
1388
+ - func: bmm.dtype_out(Tensor self, Tensor mat2, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!)
1389
+ variants: function
1390
+ dispatch:
1391
+ CUDA: _bmm_out_dtype_cuda
1392
+
1378
1393
  - func: broadcast_tensors(Tensor[] tensors) -> Tensor[]
1379
1394
  device_check: NoCheck
1380
1395
  device_guard: False
@@ -1394,7 +1409,7 @@
1394
1409
  dispatch:
1395
1410
  SparseCPU, SparseCUDA: cat_sparse
1396
1411
  QuantizedCPU: cat_quantized_cpu
1397
- NestedTensorCPU, NestedTensorCUDA: cat_nested
1412
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: cat_nested
1398
1413
  tags: core
1399
1414
 
1400
1415
  - func: cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
@@ -1482,7 +1497,7 @@
1482
1497
  device_guard: False
1483
1498
  dispatch:
1484
1499
  CompositeImplicitAutograd: chunk
1485
- NestedTensorCPU, NestedTensorCUDA: chunk_nested_tensor
1500
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: chunk_nested_tensor
1486
1501
 
1487
1502
  - func: tensor_split.sections(Tensor(a -> *) self, SymInt sections, int dim=0) -> Tensor(a)[]
1488
1503
  variants: function, method
@@ -1529,7 +1544,7 @@
1529
1544
  structured: True
1530
1545
  structured_inherits: TensorIteratorBase
1531
1546
  dispatch:
1532
- CPU, CUDA: clamp_out
1547
+ CPU, CUDA, MTIA: clamp_out
1533
1548
  MPS: clamp_out_mps
1534
1549
  tags: pointwise
1535
1550
 
@@ -1569,7 +1584,7 @@
1569
1584
  structured: True
1570
1585
  structured_inherits: TensorIteratorBase
1571
1586
  dispatch:
1572
- CPU, CUDA: clamp_max_out
1587
+ CPU, CUDA, MTIA: clamp_max_out
1573
1588
  MPS: clamp_max_out_mps
1574
1589
  tags: pointwise
1575
1590
 
@@ -1609,7 +1624,7 @@
1609
1624
  structured: True
1610
1625
  structured_inherits: TensorIteratorBase
1611
1626
  dispatch:
1612
- CPU, CUDA: clamp_min_out
1627
+ CPU, CUDA, MTIA: clamp_min_out
1613
1628
  MPS: clamp_min_out_mps
1614
1629
  tags: pointwise
1615
1630
 
@@ -1658,8 +1673,7 @@
1658
1673
 
1659
1674
  - func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!)
1660
1675
  dispatch:
1661
- CPU, CUDA: complex_out
1662
- MPS: complex_out_mps
1676
+ CPU, CUDA, MPS: complex_out
1663
1677
 
1664
1678
  - func: polar(Tensor abs, Tensor angle) -> Tensor
1665
1679
  variants: function
@@ -1668,8 +1682,7 @@
1668
1682
 
1669
1683
  - func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!)
1670
1684
  dispatch:
1671
- CPU, CUDA: polar_out
1672
- MPS: polar_out_mps
1685
+ CPU, CUDA, MPS: polar_out
1673
1686
 
1674
1687
  - func: constant_pad_nd(Tensor self, SymInt[] pad, Scalar value=0) -> Tensor
1675
1688
  variants: function
@@ -1781,7 +1794,7 @@
1781
1794
  SparseCPU, SparseCUDA: copy_sparse_wrapper_
1782
1795
  CompositeExplicitAutograd: copy_
1783
1796
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: copy_sparse_compressed_
1784
- NestedTensorCPU, NestedTensorCUDA: copy_nested_
1797
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: copy_nested_
1785
1798
  autogen: copy.out
1786
1799
 
1787
1800
  - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor
@@ -1801,7 +1814,7 @@
1801
1814
  variants: function, method
1802
1815
  structured_delegate: cos.out
1803
1816
  dispatch:
1804
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_cos
1817
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_cos
1805
1818
  tags: [core, pointwise]
1806
1819
 
1807
1820
  - func: cos_(Tensor(a!) self) -> Tensor(a!)
@@ -1815,8 +1828,7 @@
1815
1828
  structured: True
1816
1829
  structured_inherits: TensorIteratorBase
1817
1830
  dispatch:
1818
- CPU, CUDA: cos_out
1819
- MPS: cos_out_mps
1831
+ CPU, CUDA, MPS, MTIA: cos_out
1820
1832
  tags: pointwise
1821
1833
 
1822
1834
  - func: cosh(Tensor self) -> Tensor
@@ -1836,8 +1848,7 @@
1836
1848
  structured: True
1837
1849
  structured_inherits: TensorIteratorBase
1838
1850
  dispatch:
1839
- CPU, CUDA: cosh_out
1840
- MPS: cosh_out_mps
1851
+ CPU, CUDA, MPS: cosh_out
1841
1852
  tags: pointwise
1842
1853
 
1843
1854
  - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
@@ -1951,6 +1962,7 @@
1951
1962
  dispatch:
1952
1963
  CPU: cummax_helper_cpu
1953
1964
  CUDA: cummax_helper_cuda
1965
+ MPS: cummax_helper_mps
1954
1966
 
1955
1967
  - func: cummin(Tensor self, int dim) -> (Tensor values, Tensor indices)
1956
1968
  device_check: NoCheck # TensorIterator
@@ -1975,6 +1987,7 @@
1975
1987
  dispatch:
1976
1988
  CPU: cummin_helper_cpu
1977
1989
  CUDA: cummin_helper_cuda
1990
+ MPS: cummin_helper_mps
1978
1991
 
1979
1992
  - func: cummaxmin_backward(Tensor grad, Tensor input, Tensor indices, int dim) -> Tensor
1980
1993
  variants: function
@@ -2139,7 +2152,7 @@
2139
2152
  dispatch:
2140
2153
  SparseCPU, SparseCUDA: div_sparse
2141
2154
  ZeroTensor: div_zerotensor
2142
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Tensor
2155
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_div_Tensor
2143
2156
  tags: [core, pointwise]
2144
2157
 
2145
2158
  - func: div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@@ -2155,8 +2168,7 @@
2155
2168
  structured: True
2156
2169
  structured_inherits: TensorIteratorBase
2157
2170
  dispatch:
2158
- CPU, CUDA: div_out
2159
- MPS: div_out_mps
2171
+ CPU, CUDA, MPS: div_out
2160
2172
  SparseCPU, SparseCUDA: div_out_sparse_zerodim
2161
2173
  tags: pointwise
2162
2174
 
@@ -2181,8 +2193,7 @@
2181
2193
  structured: True
2182
2194
  structured_inherits: TensorIteratorBase
2183
2195
  dispatch:
2184
- CPU, CUDA: div_out_mode
2185
- MPS: div_out_mode_mps
2196
+ CPU, CUDA, MPS: div_out_mode
2186
2197
  SparseCPU, SparseCUDA: div_out_sparse_zerodim
2187
2198
  tags: pointwise
2188
2199
 
@@ -2192,7 +2203,7 @@
2192
2203
  variants: function, method
2193
2204
  dispatch:
2194
2205
  CompositeExplicitAutograd: div
2195
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Scalar
2206
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_div_Scalar
2196
2207
  tags: [core, pointwise]
2197
2208
 
2198
2209
  - func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
@@ -2292,7 +2303,7 @@
2292
2303
  - func: embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
2293
2304
  dispatch:
2294
2305
  CompositeExplicitAutograd: embedding_symint
2295
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_embedding
2306
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_embedding
2296
2307
  autogen: embedding.out
2297
2308
  tags: core
2298
2309
 
@@ -2498,7 +2509,7 @@
2498
2509
  QuantizedCPU, QuantizedCUDA: empty_like_quantized
2499
2510
  SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo
2500
2511
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr
2501
- NestedTensorCPU, NestedTensorCUDA: empty_like_nested
2512
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: empty_like_nested
2502
2513
  autogen: empty_like.out
2503
2514
 
2504
2515
  - func: empty_strided(SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -2534,8 +2545,7 @@
2534
2545
  structured: True
2535
2546
  structured_inherits: TensorIteratorBase
2536
2547
  dispatch:
2537
- CPU, CUDA: erf_out
2538
- MPS: erf_out_mps
2548
+ CPU, CUDA, MPS, MTIA: erf_out
2539
2549
  SparseCPU, SparseCUDA: erf_sparse_out
2540
2550
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_out
2541
2551
  tags: pointwise
@@ -2557,7 +2567,7 @@
2557
2567
  structured: True
2558
2568
  structured_inherits: TensorIteratorBase
2559
2569
  dispatch:
2560
- CPU, CUDA: erfc_out
2570
+ CPU, CUDA, MPS: erfc_out
2561
2571
  tags: pointwise
2562
2572
 
2563
2573
  - func: exp(Tensor self) -> Tensor
@@ -2577,7 +2587,7 @@
2577
2587
  structured: True
2578
2588
  structured_inherits: TensorIteratorBase
2579
2589
  dispatch:
2580
- CPU, CUDA, MPS: exp_out
2590
+ CPU, CUDA, MPS, MTIA: exp_out
2581
2591
  tags: pointwise
2582
2592
 
2583
2593
  - func: exp2(Tensor self) -> Tensor
@@ -2594,8 +2604,7 @@
2594
2604
  structured: True
2595
2605
  structured_inherits: TensorIteratorBase
2596
2606
  dispatch:
2597
- CPU, CUDA: exp2_out
2598
- MPS: exp2_out_mps
2607
+ CPU, CUDA, MPS: exp2_out
2599
2608
  tags: pointwise
2600
2609
 
2601
2610
  - func: expm1(Tensor self) -> Tensor
@@ -2621,8 +2630,7 @@
2621
2630
  structured: True
2622
2631
  structured_inherits: TensorIteratorBase
2623
2632
  dispatch:
2624
- CPU, CUDA: expm1_out
2625
- MPS: expm1_out_mps
2633
+ CPU, CUDA, MPS: expm1_out
2626
2634
  SparseCPU, SparseCUDA: expm1_sparse_out
2627
2635
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_out
2628
2636
  tags: pointwise
@@ -2703,7 +2711,7 @@
2703
2711
  QuantizedCPU, QuantizedCUDA: fill_quantized_
2704
2712
  Meta: fill_meta_
2705
2713
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: fill_sparse_csr_
2706
- NestedTensorCPU, NestedTensorCUDA: fill_nested_
2714
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: fill_nested_
2707
2715
  autogen: fill.Scalar_out
2708
2716
 
2709
2717
  - func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!)
@@ -2714,7 +2722,7 @@
2714
2722
  MPS: fill_tensor_mps_
2715
2723
  QuantizedCPU, QuantizedCUDA: fill_quantized_
2716
2724
  Meta: fill_meta_
2717
- NestedTensorCPU, NestedTensorCUDA: fill_nested_
2725
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: fill_nested_
2718
2726
  autogen: fill.Tensor_out
2719
2727
 
2720
2728
  - func: floor(Tensor self) -> Tensor
@@ -2749,23 +2757,20 @@
2749
2757
  device_check: NoCheck # TensorIterator
2750
2758
  variants: function, method
2751
2759
  dispatch:
2752
- CPU, CUDA: floor_divide
2753
- MPS: floor_divide_mps
2760
+ CPU, CUDA, MPS: floor_divide
2754
2761
  SparseCPU, SparseCUDA: floor_divide_sparse
2755
2762
 
2756
2763
  - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
2757
2764
  device_check: NoCheck # TensorIterator
2758
2765
  variants: method
2759
2766
  dispatch:
2760
- CPU, CUDA: floor_divide_
2761
- MPS: floor_divide_mps_
2767
+ CPU, CUDA, MPS: floor_divide_
2762
2768
  SparseCPU, SparseCUDA: floor_divide_sparse_
2763
2769
 
2764
2770
  - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
2765
2771
  device_check: NoCheck # TensorIterator
2766
2772
  dispatch:
2767
- CPU, CUDA: floor_divide_out
2768
- MPS: floor_divide_out_mps
2773
+ CPU, CUDA, MPS: floor_divide_out
2769
2774
  SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim
2770
2775
 
2771
2776
  - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
@@ -3100,6 +3105,7 @@
3100
3105
  - dim -> int dim
3101
3106
  dispatch:
3102
3107
  CPU, CUDA: index_copy_out
3108
+ MPS: index_copy_out_mps
3103
3109
 
3104
3110
  - func: index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!)
3105
3111
  variants: method
@@ -3170,7 +3176,7 @@
3170
3176
  variants: function
3171
3177
  structured: True
3172
3178
  dispatch:
3173
- CPU, CUDA: isin_Tensor_Scalar_out
3179
+ CPU, CUDA, MPS: isin_Tensor_Scalar_out
3174
3180
 
3175
3181
  - func: isin.Tensor_Scalar(Tensor elements, Scalar test_element, *, bool assume_unique=False, bool invert=False) -> Tensor
3176
3182
  variants: function
@@ -3181,6 +3187,7 @@
3181
3187
  structured: True
3182
3188
  dispatch:
3183
3189
  CPU, CUDA: isin_Scalar_Tensor_out
3190
+ MPS: isin_Scalar_Tensor_out_mps
3184
3191
 
3185
3192
  - func: isin.Scalar_Tensor(Scalar element, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor
3186
3193
  variants: function
@@ -3191,8 +3198,8 @@
3191
3198
  device_check: NoCheck
3192
3199
  device_guard: False
3193
3200
  dispatch:
3194
- CPU, CUDA, MPS: isnan
3195
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_isnan
3201
+ CPU, CUDA, MPS, MTIA: isnan
3202
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isnan
3196
3203
  SparseCPU, SparseCUDA: isnan_sparse
3197
3204
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isnan_sparse_csr
3198
3205
  autogen: isnan.out
@@ -3243,7 +3250,7 @@
3243
3250
  device_check: NoCheck
3244
3251
  device_guard: False
3245
3252
  dispatch:
3246
- NestedTensorCPU, NestedTensorCUDA: nested_is_same_size
3253
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_is_same_size
3247
3254
  CompositeExplicitAutograd: is_same_size
3248
3255
 
3249
3256
  - func: is_signed(Tensor self) -> bool
@@ -3265,20 +3272,20 @@
3265
3272
 
3266
3273
  - func: kron.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
3267
3274
 
3268
- - func: kthvalue(Tensor self, int k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
3275
+ - func: kthvalue(Tensor self, SymInt k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
3269
3276
  variants: function, method
3270
3277
  dispatch:
3271
3278
  CompositeExplicitAutograd: kthvalue
3272
3279
 
3273
- - func: kthvalue.values(Tensor self, int k, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
3280
+ - func: kthvalue.values(Tensor self, SymInt k, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
3274
3281
  dispatch:
3275
3282
  CPU: kthvalue_out_cpu
3276
3283
  CUDA: kthvalue_out_cuda
3277
3284
 
3278
- - func: kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
3285
+ - func: kthvalue.dimname(Tensor self, SymInt k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
3279
3286
  variants: function, method
3280
3287
 
3281
- - func: kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
3288
+ - func: kthvalue.dimname_out(Tensor self, SymInt k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
3282
3289
 
3283
3290
  - func: layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor
3284
3291
  dispatch:
@@ -3290,7 +3297,7 @@
3290
3297
  CUDA: layer_norm_cuda
3291
3298
  MPS: layer_norm_mps
3292
3299
  CompositeExplicitAutograd: math_native_layer_norm
3293
- NestedTensorCPU, NestedTensorCUDA: nested_layer_norm
3300
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_layer_norm
3294
3301
  autogen: native_layer_norm.out
3295
3302
  tags: core
3296
3303
 
@@ -3299,7 +3306,7 @@
3299
3306
  CPU: layer_norm_backward_cpu
3300
3307
  CUDA: layer_norm_backward_cuda
3301
3308
  MPS: layer_norm_backward_mps
3302
- NestedTensorCPU, NestedTensorCUDA: layer_norm_backward_nested
3309
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: layer_norm_backward_nested
3303
3310
  autogen: native_layer_norm_backward.out
3304
3311
  tags: core
3305
3312
 
@@ -3307,6 +3314,10 @@
3307
3314
  dispatch:
3308
3315
  CompositeImplicitAutograd: rms_norm_symint
3309
3316
 
3317
+ - func: _fused_rms_norm(Tensor input, int normalized_shape_ndim, Tensor weight, float eps) -> Tensor
3318
+ dispatch:
3319
+ MPS: _fused_rms_norm_mps
3320
+
3310
3321
  - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
3311
3322
  variants: function, method
3312
3323
  dispatch:
@@ -3323,7 +3334,7 @@
3323
3334
 
3324
3335
  - func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!)
3325
3336
  dispatch:
3326
- CPU, CUDA: nan_to_num_out
3337
+ CPU, CUDA, MTIA: nan_to_num_out
3327
3338
  MPS: nan_to_num_out_mps
3328
3339
  SparseCPU, SparseCUDA: nan_to_num_sparse_out
3329
3340
  tags: pointwise
@@ -3332,12 +3343,12 @@
3332
3343
  python_module: nn
3333
3344
  dispatch:
3334
3345
  CompositeImplicitAutograd: linear
3335
- NestedTensorCPU, NestedTensorCUDA: nested_linear
3346
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_linear
3336
3347
  MPS: _mps_linear
3337
3348
 
3338
3349
  - func: linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
3339
3350
  dispatch:
3340
- NestedTensorCPU, NestedTensorCUDA: nested_linear_backward
3351
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_linear_backward
3341
3352
  MPS: mps_linear_backward
3342
3353
  autogen: linear_backward.out
3343
3354
 
@@ -3371,7 +3382,7 @@
3371
3382
  dispatch:
3372
3383
  CUDA: _cslt_compress
3373
3384
 
3374
- - func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False, int alg_id=0, int split_k=1, bool split_k_one_kernel=True) -> Tensor
3385
+ - func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False, int alg_id=0, int split_k=1, int split_k_mode=-1) -> Tensor
3375
3386
  dispatch:
3376
3387
  CUDA: _cslt_sparse_mm
3377
3388
  tags: needs_fixed_stride_order
@@ -3496,8 +3507,7 @@
3496
3507
  structured: True
3497
3508
  structured_inherits: TensorIteratorBase
3498
3509
  dispatch:
3499
- CPU, CUDA: log_out
3500
- MPS: log_out_mps
3510
+ CPU, CUDA, MPS, MTIA: log_out
3501
3511
  tags: pointwise
3502
3512
 
3503
3513
  - func: log10(Tensor self) -> Tensor
@@ -3517,8 +3527,7 @@
3517
3527
  structured: True
3518
3528
  structured_inherits: TensorIteratorBase
3519
3529
  dispatch:
3520
- CPU, CUDA: log10_out
3521
- MPS: log10_out_mps
3530
+ CPU, CUDA, MPS: log10_out
3522
3531
  tags: pointwise
3523
3532
 
3524
3533
  - func: log1p(Tensor self) -> Tensor
@@ -3544,8 +3553,7 @@
3544
3553
  structured: True
3545
3554
  structured_inherits: TensorIteratorBase
3546
3555
  dispatch:
3547
- CPU, CUDA: log1p_out
3548
- MPS: log1p_out_mps
3556
+ CPU, CUDA, MPS: log1p_out
3549
3557
  SparseCPU, SparseCUDA: log1p_sparse_out
3550
3558
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_out
3551
3559
  tags: pointwise
@@ -3567,8 +3575,7 @@
3567
3575
  structured: True
3568
3576
  structured_inherits: TensorIteratorBase
3569
3577
  dispatch:
3570
- CPU, CUDA: log2_out
3571
- MPS: log2_out_mps
3578
+ CPU, CUDA, MPS, MTIA: log2_out
3572
3579
  tags: pointwise
3573
3580
 
3574
3581
  - func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -3715,6 +3722,7 @@
3715
3722
  dispatch:
3716
3723
  CPU: log_softmax_cpu_out
3717
3724
  CUDA: log_softmax_cuda_out
3725
+ MTIA: log_softmax_mtia_out
3718
3726
  MPS: log_softmax_mps_out
3719
3727
 
3720
3728
  - func: _log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor
@@ -3725,6 +3733,7 @@
3725
3733
  dispatch:
3726
3734
  CPU: log_softmax_backward_cpu_out
3727
3735
  CUDA: log_softmax_backward_cuda_out
3736
+ MTIA: log_softmax_backward_mtia_out
3728
3737
  MPS: log_softmax_backward_mps_out
3729
3738
 
3730
3739
  - func: _logcumsumexp(Tensor self, int dim) -> Tensor
@@ -3776,17 +3785,17 @@
3776
3785
  variants: function, method
3777
3786
  dispatch:
3778
3787
  CompositeImplicitAutograd: matmul
3779
- NestedTensorCPU, NestedTensorCUDA: matmul_nested
3788
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: matmul_nested
3780
3789
 
3781
3790
  - func: matmul_backward(Tensor grad, Tensor self, Tensor other, bool[2] mask) -> (Tensor, Tensor)
3782
3791
  dispatch:
3783
- NestedTensorCPU, NestedTensorCUDA: matmul_backward_nested
3792
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: matmul_backward_nested
3784
3793
  autogen: matmul_backward.out
3785
3794
 
3786
3795
  - func: matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
3787
3796
  dispatch:
3788
3797
  CompositeImplicitAutograd: matmul_out
3789
- NestedTensorCPU, NestedTensorCUDA: matmul_out_nested
3798
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: matmul_out_nested
3790
3799
 
3791
3800
  # Alias to linalg.matrix_power
3792
3801
  - func: matrix_power(Tensor self, int n) -> Tensor
@@ -3848,7 +3857,7 @@
3848
3857
  precomputed:
3849
3858
  - dim -> int dim
3850
3859
  dispatch:
3851
- CPU, CUDA: max_out
3860
+ CPU, CUDA, MTIA: max_out
3852
3861
  MPS: max_out_mps
3853
3862
 
3854
3863
  - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
@@ -4004,6 +4013,7 @@
4004
4013
  dispatch:
4005
4014
  CPU: nanmedian_cpu
4006
4015
  CUDA: nanmedian_cuda
4016
+ MPS: nanmedian_mps
4007
4017
  autogen: nanmedian.out
4008
4018
 
4009
4019
  - func: nanmedian.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
@@ -4015,6 +4025,7 @@
4015
4025
  dispatch:
4016
4026
  CPU: nanmedian_out_cpu
4017
4027
  CUDA: nanmedian_out_cuda
4028
+ MPS: nanmedian_out_mps
4018
4029
 
4019
4030
  - func: nanmedian.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
4020
4031
  variants: function, method
@@ -4035,7 +4046,7 @@
4035
4046
  precomputed:
4036
4047
  - dim -> int dim
4037
4048
  dispatch:
4038
- CPU, CUDA: min_out
4049
+ CPU, CUDA, MTIA: min_out
4039
4050
  MPS: min_out_mps
4040
4051
 
4041
4052
  - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
@@ -4143,11 +4154,20 @@
4143
4154
  dispatch:
4144
4155
  CPU: mm_out_cpu
4145
4156
  CUDA: mm_out_cuda
4157
+ MTIA: mm_out_mtia
4146
4158
  MPS: mm_out_mps
4147
4159
  XPU: mm_out_xpu
4148
4160
  SparseCPU, SparseCUDA: _sparse_mm_out
4149
4161
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm_out
4150
4162
 
4163
+ - func: mm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor
4164
+ dispatch:
4165
+ CUDA: _mm_dtype_cuda
4166
+
4167
+ - func: mm.dtype_out(Tensor self, Tensor mat2, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!)
4168
+ dispatch:
4169
+ CUDA: _mm_dtype_out_cuda
4170
+
4151
4171
  - func: _int_mm(Tensor self, Tensor mat2) -> Tensor
4152
4172
  dispatch:
4153
4173
  CPU: _int_mm_cpu
@@ -4168,6 +4188,10 @@
4168
4188
  MPS: _weight_int4pack_mm_mps
4169
4189
  CUDA: _weight_int4pack_mm_cuda
4170
4190
 
4191
+ - func: _weight_int4pack_mm_with_scales_and_zeros(Tensor self, Tensor mat2, int qGroupSize, Tensor qScale, Tensor qZeros) -> Tensor
4192
+ dispatch:
4193
+ XPU: _weight_int4pack_mm_xpu
4194
+
4171
4195
  # Split int4 pack weight between cpu and other devices due to
4172
4196
  # https://github.com/pytorch/ao/issues/1117#issuecomment-2451252756.
4173
4197
  - func: _convert_weight_to_int4pack_for_cpu(Tensor self, int innerKTiles) -> Tensor
@@ -4226,7 +4250,7 @@
4226
4250
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr
4227
4251
  MkldnnCPU: mkldnn_mul
4228
4252
  ZeroTensor: mul_zerotensor
4229
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor
4253
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul_Tensor
4230
4254
  tags: [core, pointwise]
4231
4255
 
4232
4256
  - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@@ -4237,7 +4261,7 @@
4237
4261
  SparseCPU, SparseCUDA: mul_sparse_
4238
4262
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr_
4239
4263
  MkldnnCPU: mkldnn_mul_
4240
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Tensor
4264
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul__Tensor
4241
4265
  tags: pointwise
4242
4266
 
4243
4267
  - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -4245,8 +4269,7 @@
4245
4269
  structured: True
4246
4270
  structured_inherits: TensorIteratorBase
4247
4271
  dispatch:
4248
- CPU, CUDA: mul_out
4249
- MPS: mul_out_mps
4272
+ CPU, CUDA, MPS: mul_out
4250
4273
  SparseCPU: mul_out_sparse_cpu
4251
4274
  SparseCUDA: mul_out_sparse_cuda
4252
4275
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_out_sparse_csr
@@ -4260,7 +4283,7 @@
4260
4283
  dispatch:
4261
4284
  CompositeExplicitAutograd: mul
4262
4285
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_scalar_sparse_csr
4263
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Scalar
4286
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul_Scalar
4264
4287
  tags: [core, pointwise]
4265
4288
 
4266
4289
  - func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
@@ -4269,7 +4292,7 @@
4269
4292
  dispatch:
4270
4293
  CompositeExplicitAutograd: mul_
4271
4294
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul__scalar_sparse_csr
4272
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Scalar
4295
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul__Scalar
4273
4296
  autogen: mul.Scalar_out
4274
4297
  tags: pointwise
4275
4298
  # multiply, alias for mul
@@ -4335,7 +4358,7 @@
4335
4358
  device_guard: False
4336
4359
  dispatch:
4337
4360
  CompositeImplicitAutograd: narrow_symint
4338
- NestedTensorCPU, NestedTensorCUDA: narrow_nested_symint
4361
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: narrow_nested_symint
4339
4362
 
4340
4363
  - func: narrow.Tensor(Tensor(a) self, int dim, Tensor start, SymInt length) -> Tensor(a)
4341
4364
  variants: function, method
@@ -4474,7 +4497,7 @@
4474
4497
  # NB: Although this composite mutates on the inside, it is
4475
4498
  # non-differentiable so NonFunctional doesn't apply
4476
4499
  CompositeExplicitAutograd: ones_like
4477
- NestedTensorCPU, NestedTensorCUDA: ones_like
4500
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: ones_like
4478
4501
  autogen: ones_like.out
4479
4502
 
4480
4503
  - func: pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor
@@ -4756,6 +4779,14 @@
4756
4779
  CompositeExplicitAutograd: randint_like
4757
4780
  autogen: randint_like.out
4758
4781
 
4782
+ - func: randint_like.Tensor(Tensor self, Tensor high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
4783
+ tags: nondeterministic_seeded
4784
+ dispatch:
4785
+ # NB: Although this composite mutates on the inside, it is
4786
+ # non-differentiable so NonFunctional doesn't apply
4787
+ CompositeExplicitAutograd: randint_like
4788
+ autogen: randint_like.Tensor_out
4789
+
4759
4790
  - func: randint_like.low_dtype(Tensor self, SymInt low, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
4760
4791
  tags: nondeterministic_seeded
4761
4792
  dispatch:
@@ -4865,7 +4896,7 @@
4865
4896
  structured: True
4866
4897
  structured_inherits: TensorIteratorBase
4867
4898
  dispatch:
4868
- CPU, CUDA: reciprocal_out
4899
+ CPU, CUDA, MTIA: reciprocal_out
4869
4900
  MPS: reciprocal_out_mps
4870
4901
  tags: pointwise
4871
4902
 
@@ -4876,7 +4907,7 @@
4876
4907
  dispatch:
4877
4908
  SparseCPU, SparseCUDA: neg_sparse
4878
4909
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr
4879
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg
4910
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg
4880
4911
  tags: [core, pointwise]
4881
4912
 
4882
4913
  - func: neg_(Tensor(a!) self) -> Tensor(a!)
@@ -4886,7 +4917,7 @@
4886
4917
  dispatch:
4887
4918
  SparseCPU, SparseCUDA: neg_sparse_
4888
4919
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_
4889
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg_
4920
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg_
4890
4921
  tags: pointwise
4891
4922
 
4892
4923
  - func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -4894,8 +4925,7 @@
4894
4925
  structured: True
4895
4926
  structured_inherits: TensorIteratorBase
4896
4927
  dispatch:
4897
- CPU, CUDA: neg_out
4898
- MPS: neg_out_mps
4928
+ CPU, CUDA, MPS, MTIA: neg_out
4899
4929
  SparseCPU, SparseCUDA: neg_out_sparse
4900
4930
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_out
4901
4931
  tags: pointwise
@@ -4957,7 +4987,7 @@
4957
4987
  device_check: NoCheck
4958
4988
  device_guard: False
4959
4989
  dispatch:
4960
- CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor, MPS: _reshape_alias
4990
+ CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor, MPS, MTIA: _reshape_alias
4961
4991
  # We don't need to support mkldnn since this is handled explicitly by the reshape operator.
4962
4992
 
4963
4993
  - func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor
@@ -5035,12 +5065,12 @@
5035
5065
  device_check: NoCheck # TensorIterator
5036
5066
  variants: function, method
5037
5067
  dispatch:
5038
- CPU, CUDA: relu
5068
+ CPU, CUDA, MTIA: relu
5039
5069
  MPS: relu_mps
5040
5070
  MkldnnCPU: mkldnn_relu
5041
5071
  QuantizedCPU: relu_quantized_cpu
5042
5072
  QuantizedCUDA: relu_quantized_cuda
5043
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu
5073
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu
5044
5074
  SparseCPU, SparseCUDA: relu_sparse
5045
5075
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr
5046
5076
  tags: [core, pointwise]
@@ -5049,12 +5079,12 @@
5049
5079
  device_check: NoCheck # TensorIterator
5050
5080
  variants: function, method
5051
5081
  dispatch:
5052
- CPU, CUDA: relu_
5082
+ CPU, CUDA, MTIA: relu_
5053
5083
  MPS: relu_mps_
5054
5084
  MkldnnCPU: mkldnn_relu_
5055
5085
  QuantizedCPU: relu_quantized_cpu_
5056
5086
  QuantizedCUDA: relu_quantized_cuda_
5057
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu_
5087
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu_
5058
5088
  SparseCPU, SparseCUDA: relu_sparse_
5059
5089
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr_
5060
5090
  autogen: relu.out
@@ -5100,7 +5130,7 @@
5100
5130
  python_module: nn
5101
5131
  dispatch:
5102
5132
  QuantizedCPU: gelu_quantized_cpu_
5103
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu_
5133
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_gelu_
5104
5134
 
5105
5135
  - func: gelu(Tensor self, *, str approximate='none') -> Tensor
5106
5136
  structured_delegate: gelu.out
@@ -5110,7 +5140,7 @@
5110
5140
  MkldnnCPU: mkldnn_gelu
5111
5141
  QuantizedCPU: gelu_quantized_cpu
5112
5142
  QuantizedCUDA: gelu_quantized_cuda
5113
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu
5143
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_gelu
5114
5144
  tags: [core, pointwise]
5115
5145
 
5116
5146
  - func: gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!)
@@ -5127,7 +5157,7 @@
5127
5157
  python_module: nn
5128
5158
  dispatch:
5129
5159
  MkldnnCPU: mkldnn_gelu_backward
5130
- NestedTensorCPU, NestedTensorCUDA: gelu_backwards_nested
5160
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: gelu_backwards_nested
5131
5161
  tags: pointwise
5132
5162
 
5133
5163
  - func: infinitely_differentiable_gelu_backward(Tensor grad, Tensor self) -> Tensor
@@ -5141,7 +5171,7 @@
5141
5171
  structured_inherits: TensorIteratorBase
5142
5172
  device_check: NoCheck # TensorIterator
5143
5173
  dispatch:
5144
- CPU, CUDA: hardshrink_out
5174
+ CPU, CUDA, MPS: hardshrink_out
5145
5175
 
5146
5176
  - func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor
5147
5177
  structured_delegate: hardshrink.out
@@ -5153,7 +5183,7 @@
5153
5183
  structured: True
5154
5184
  structured_inherits: TensorIteratorBase
5155
5185
  dispatch:
5156
- CPU, CUDA: hardshrink_backward_out
5186
+ CPU, CUDA, MPS: hardshrink_backward_out
5157
5187
 
5158
5188
  - func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor
5159
5189
  structured_delegate: hardshrink_backward.grad_input
@@ -5176,8 +5206,7 @@
5176
5206
  structured: True
5177
5207
  structured_inherits: TensorIteratorBase
5178
5208
  dispatch:
5179
- CPU, CUDA: rsqrt_out
5180
- MPS: rsqrt_out_mps
5209
+ CPU, CUDA, MPS, MTIA: rsqrt_out
5181
5210
  tags: pointwise
5182
5211
 
5183
5212
  - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a)
@@ -5192,7 +5221,7 @@
5192
5221
  dispatch:
5193
5222
  CompositeExplicitAutograd: select_symint
5194
5223
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: select_sparse_csr
5195
- NestedTensorCPU, NestedTensorCUDA: select_nested
5224
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: select_nested
5196
5225
  tags: core
5197
5226
 
5198
5227
  - func: select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index) -> Tensor
@@ -5208,7 +5237,7 @@
5208
5237
  device_check: NoCheck
5209
5238
  device_guard: False
5210
5239
  dispatch:
5211
- NestedTensorCPU, NestedTensorCUDA: _nested_select_backward_symint
5240
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _nested_select_backward_symint
5212
5241
 
5213
5242
  - func: selu(Tensor self) -> Tensor
5214
5243
  device_check: NoCheck # TensorIterator
@@ -5233,14 +5262,14 @@
5233
5262
  structured_delegate: silu.out
5234
5263
  python_module: nn
5235
5264
  dispatch:
5236
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu
5265
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_silu
5237
5266
  tags: pointwise
5238
5267
 
5239
5268
  - func: silu_(Tensor(a!) self) -> Tensor(a!)
5240
5269
  structured_delegate: silu.out
5241
5270
  python_module: nn
5242
5271
  dispatch:
5243
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu_
5272
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_silu_
5244
5273
  tags: pointwise
5245
5274
 
5246
5275
  - func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -5248,7 +5277,7 @@
5248
5277
  structured_inherits: TensorIteratorBase
5249
5278
  python_module: nn
5250
5279
  dispatch:
5251
- CPU, CUDA: silu_out
5280
+ CPU, CUDA, MTIA: silu_out
5252
5281
  MPS: silu_out_mps
5253
5282
  tags: pointwise
5254
5283
 
@@ -5266,7 +5295,7 @@
5266
5295
  python_module: nn
5267
5296
  dispatch:
5268
5297
  CompositeImplicitAutograd: math_silu_backward
5269
- NestedTensorCPU, NestedTensorCUDA: silu_backward_nested
5298
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: silu_backward_nested
5270
5299
  tags: pointwise
5271
5300
 
5272
5301
  - func: mish(Tensor self) -> Tensor
@@ -5315,14 +5344,13 @@
5315
5344
  structured: True
5316
5345
  structured_inherits: TensorIteratorBase
5317
5346
  dispatch:
5318
- CPU, CUDA: sigmoid_out
5319
- MPS: sigmoid_out_mps
5347
+ CPU, CUDA, MPS: sigmoid_out
5320
5348
  tags: pointwise
5321
5349
 
5322
5350
  - func: logit(Tensor self, float? eps=None) -> Tensor
5323
5351
  variants: function, method
5324
5352
  dispatch:
5325
- CPU, CUDA: logit
5353
+ CPU, CUDA, MTIA: logit
5326
5354
  MPS: logit_mps
5327
5355
  tags: pointwise
5328
5356
 
@@ -5345,7 +5373,7 @@
5345
5373
  dispatch:
5346
5374
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr
5347
5375
  SparseCPU, SparseCUDA: sin_sparse
5348
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_sin
5376
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sin
5349
5377
  tags: [core, pointwise]
5350
5378
 
5351
5379
  - func: sin_(Tensor(a!) self) -> Tensor(a!)
@@ -5362,8 +5390,7 @@
5362
5390
  structured: True
5363
5391
  structured_inherits: TensorIteratorBase
5364
5392
  dispatch:
5365
- CPU, CUDA: sin_out
5366
- MPS: sin_out_mps
5393
+ CPU, CUDA, MPS, MTIA: sin_out
5367
5394
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_out
5368
5395
  SparseCPU, SparseCUDA: sin_sparse_out
5369
5396
  tags: pointwise
@@ -5408,8 +5435,7 @@
5408
5435
  structured: True
5409
5436
  structured_inherits: TensorIteratorBase
5410
5437
  dispatch:
5411
- CPU, CUDA: sinh_out
5412
- MPS: sinh_out_mps
5438
+ CPU, CUDA, MPS: sinh_out
5413
5439
  SparseCPU, SparseCUDA: sinh_sparse_out
5414
5440
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_out
5415
5441
 
@@ -5429,7 +5455,7 @@
5429
5455
  variants: function, method
5430
5456
  dispatch:
5431
5457
  CompositeExplicitAutograd: detach
5432
- NestedTensorCPU, NestedTensorCUDA: detach
5458
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: detach
5433
5459
 
5434
5460
  # Like `detach()`, but modifies this `Variable` in-place. This method may
5435
5461
  # only be called on non-view `Variable`s. You can use `is_view()` to check
@@ -5559,7 +5585,7 @@
5559
5585
  structured_delegate: _softmax.out
5560
5586
  dispatch:
5561
5587
  MkldnnCPU: mkldnn_softmax
5562
- NestedTensorCPU, NestedTensorCUDA: softmax_nested
5588
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: softmax_nested
5563
5589
  tags: core
5564
5590
 
5565
5591
  - func: _softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
@@ -5572,7 +5598,7 @@
5572
5598
  - func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor
5573
5599
  structured_delegate: _softmax_backward_data.out
5574
5600
  dispatch:
5575
- NestedTensorCPU, NestedTensorCUDA: nested_softmax_backward
5601
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_softmax_backward
5576
5602
 
5577
5603
  - func: _softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) grad_input) -> Tensor(a!)
5578
5604
  structured: True
@@ -5616,7 +5642,7 @@
5616
5642
  device_guard: False
5617
5643
  dispatch:
5618
5644
  CompositeExplicitAutograd: split_with_sizes
5619
- NestedTensorCPU, NestedTensorCUDA: split_with_sizes_nested
5645
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: split_with_sizes_nested
5620
5646
  tags: core
5621
5647
 
5622
5648
  - func: hsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]
@@ -5644,7 +5670,7 @@
5644
5670
  dispatch:
5645
5671
  CompositeExplicitAutograd: squeeze
5646
5672
  QuantizedCPU, QuantizedCUDA: squeeze_quantized
5647
- NestedTensorCPU, NestedTensorCUDA: squeeze_nested
5673
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: squeeze_nested
5648
5674
 
5649
5675
  - func: squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)
5650
5676
  variants: function, method
@@ -5653,7 +5679,7 @@
5653
5679
  dispatch:
5654
5680
  CompositeExplicitAutograd: squeeze
5655
5681
  QuantizedCPU, QuantizedCUDA: squeeze_quantized
5656
- NestedTensorCPU, NestedTensorCUDA: squeeze_dim_nested
5682
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: squeeze_dim_nested
5657
5683
  tags: core
5658
5684
 
5659
5685
  - func: squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a)
@@ -5669,7 +5695,7 @@
5669
5695
  dispatch:
5670
5696
  CompositeExplicitAutograd: squeeze
5671
5697
  QuantizedCPU, QuantizedCUDA: squeeze_quantized
5672
- NestedTensorCPU, NestedTensorCUDA: squeeze_dim_nested
5698
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: squeeze_dim_nested
5673
5699
  tags: core
5674
5700
 
5675
5701
  - func: squeeze_(Tensor(a!) self) -> Tensor(a!)
@@ -5843,7 +5869,7 @@
5843
5869
  structured_delegate: sqrt.out
5844
5870
  variants: function, method
5845
5871
  dispatch:
5846
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_sqrt
5872
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sqrt
5847
5873
  SparseCPU, SparseCUDA: sqrt_sparse
5848
5874
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr
5849
5875
  tags: [core, pointwise]
@@ -5862,7 +5888,7 @@
5862
5888
  structured: True
5863
5889
  structured_inherits: TensorIteratorBase
5864
5890
  dispatch:
5865
- CPU, CUDA, MPS: sqrt_out
5891
+ CPU, CUDA, MPS, MTIA: sqrt_out
5866
5892
  SparseCPU, SparseCUDA: sqrt_sparse_out
5867
5893
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_out
5868
5894
  tags: pointwise
@@ -6019,8 +6045,7 @@
6019
6045
  structured: True
6020
6046
  structured_inherits: TensorIteratorBase
6021
6047
  dispatch:
6022
- CPU, CUDA: tan_out
6023
- MPS: tan_out_mps
6048
+ CPU, CUDA, MPS: tan_out
6024
6049
  SparseCPU, SparseCUDA: tan_sparse_out
6025
6050
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_out
6026
6051
  tags: pointwise
@@ -6034,7 +6059,7 @@
6034
6059
  MkldnnCPU: mkldnn_tanh
6035
6060
  SparseCPU, SparseCUDA: tanh_sparse
6036
6061
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr
6037
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh
6062
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh
6038
6063
  tags: [core, pointwise]
6039
6064
 
6040
6065
  - func: tanh_(Tensor(a!) self) -> Tensor(a!)
@@ -6045,7 +6070,7 @@
6045
6070
  MkldnnCPU: mkldnn_tanh_
6046
6071
  SparseCPU, SparseCUDA: tanh_sparse_
6047
6072
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_
6048
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh_
6073
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh_
6049
6074
  tags: pointwise
6050
6075
 
6051
6076
  - func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -6053,7 +6078,7 @@
6053
6078
  structured: True
6054
6079
  structured_inherits: TensorIteratorBase
6055
6080
  dispatch:
6056
- CPU, CUDA, MPS: tanh_out
6081
+ CPU, CUDA, MPS, MTIA: tanh_out
6057
6082
  SparseCPU, SparseCUDA: tanh_sparse_out
6058
6083
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_out
6059
6084
  tags: pointwise
@@ -6102,7 +6127,7 @@
6102
6127
  MkldnnCPU: mkldnn_relu_backward
6103
6128
  SparseCPU, SparseCUDA: threshold_backward_sparse
6104
6129
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: threshold_backward_sparse_compressed
6105
- NestedTensorCPU, NestedTensorCUDA: threshold_backwards_nested
6130
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: threshold_backwards_nested
6106
6131
  tags: pointwise
6107
6132
 
6108
6133
  - func: tile(Tensor self, SymInt[] dims) -> Tensor
@@ -6116,7 +6141,7 @@
6116
6141
  device_guard: False
6117
6142
  dispatch:
6118
6143
  CompositeExplicitAutograd: transpose
6119
- NestedTensorCPU, NestedTensorCUDA: transpose_nested
6144
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: transpose_nested
6120
6145
 
6121
6146
  - func: transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a)
6122
6147
  variants: function, method
@@ -6213,13 +6238,13 @@
6213
6238
  - func: _nested_tensor_size(Tensor self) -> Tensor
6214
6239
  variants: method
6215
6240
  dispatch:
6216
- NestedTensorCPU, NestedTensorCUDA: _nested_tensor_size
6241
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _nested_tensor_size
6217
6242
  autogen: _nested_tensor_size.out
6218
6243
 
6219
6244
  - func: _nested_tensor_strides(Tensor self) -> Tensor
6220
6245
  variants: method
6221
6246
  dispatch:
6222
- NestedTensorCPU, NestedTensorCUDA: _nested_tensor_strides
6247
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _nested_tensor_strides
6223
6248
  autogen: _nested_tensor_strides.out
6224
6249
 
6225
6250
  - func: _nested_tensor_storage_offsets(Tensor self) -> Tensor
@@ -6232,7 +6257,7 @@
6232
6257
  # _nested_from_padded_and_nested_example is available for testing.
6233
6258
  - func: _nested_from_padded_and_nested_example(Tensor padded, Tensor nt_example) -> Tensor
6234
6259
  dispatch:
6235
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example
6260
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example
6236
6261
  autogen: _nested_from_padded_and_nested_example.out
6237
6262
 
6238
6263
  # The input arguments' types to this functions are temporary. When nested tensors switch to using SymInts for their metadata representation
@@ -6423,7 +6448,7 @@
6423
6448
  CompositeExplicitAutograd: unsqueeze
6424
6449
  SparseCPU, SparseCUDA: unsqueeze_sparse
6425
6450
  QuantizedCPU, QuantizedCUDA: unsqueeze_quantized
6426
- NestedTensorCPU, NestedTensorCUDA: unsqueeze_nested
6451
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: unsqueeze_nested
6427
6452
  tags: core
6428
6453
 
6429
6454
  - func: unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!)
@@ -6517,15 +6542,15 @@
6517
6542
  device_check: NoCheck # TensorIterator
6518
6543
  variants: function, method
6519
6544
  dispatch:
6520
- CPU, CUDA, MPS: where
6521
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_where
6545
+ CPU, CUDA, MPS, MTIA: where
6546
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_where
6522
6547
  tags: [core, pointwise]
6523
6548
 
6524
6549
  - func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
6525
6550
  device_check: NoCheck # TensorIterator
6526
6551
  dispatch:
6527
- CPU, CUDA, MPS: where_self_out
6528
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_where_out
6552
+ CPU, CUDA, MPS, MTIA: where_self_out
6553
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_where_out
6529
6554
 
6530
6555
  - func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor
6531
6556
  variants: function
@@ -6860,7 +6885,7 @@
6860
6885
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: clone_sparse_compressed
6861
6886
  MkldnnCPU: mkldnn_clone
6862
6887
  QuantizedCPU, QuantizedCUDA: quantized_clone
6863
- NestedTensorCPU, NestedTensorCUDA: clone_nested
6888
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: clone_nested
6864
6889
  autogen: clone.out
6865
6890
  tags: [core, pointwise]
6866
6891
 
@@ -6894,7 +6919,7 @@
6894
6919
  SparseCPU, SparseCUDA, SparseMeta: zero_sparse_
6895
6920
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: zero_sparse_csr_
6896
6921
  MkldnnCPU: mkldnn_zero_
6897
- NestedTensorCPU, NestedTensorCUDA: zero_nested_
6922
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: zero_nested_
6898
6923
  autogen: zero, zero.out
6899
6924
 
6900
6925
  - func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
@@ -6914,7 +6939,7 @@
6914
6939
  dispatch:
6915
6940
  SparseCPU, SparseCUDA: sub_sparse
6916
6941
  ZeroTensor: sub_zerotensor
6917
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_sub_Tensor
6942
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sub_Tensor
6918
6943
  tags: [core, pointwise]
6919
6944
 
6920
6945
  - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
@@ -6961,7 +6986,7 @@
6961
6986
  device_check: NoCheck # TensorIterator
6962
6987
  variants: function
6963
6988
  dispatch:
6964
- CPU, CUDA: rsub
6989
+ CPU, CUDA, MPS: rsub
6965
6990
  autogen: rsub.Tensor_out
6966
6991
 
6967
6992
  - func: heaviside.out(Tensor self, Tensor values, *, Tensor(a!) out) -> Tensor(a!)
@@ -7043,6 +7068,14 @@
7043
7068
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: addmm_sparse_compressed_dense
7044
7069
  tags: core
7045
7070
 
7071
+ - func: addmm.dtype(Tensor self, Tensor mat1, Tensor mat2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1) -> Tensor
7072
+ dispatch:
7073
+ CUDA: _addmm_dtype_cuda
7074
+
7075
+ - func: addmm.dtype_out(Tensor self, Tensor mat1, Tensor mat2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
7076
+ dispatch:
7077
+ CUDA: _addmm_dtype_out_cuda
7078
+
7046
7079
  - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
7047
7080
  structured_delegate: addmm.out
7048
7081
  variants: method
@@ -7066,11 +7099,13 @@
7066
7099
  - func: _scaled_mm(Tensor self, Tensor mat2, Tensor scale_a, Tensor scale_b, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None, bool use_fast_accum=False) -> Tensor
7067
7100
  variants: function
7068
7101
  dispatch:
7102
+ CPU: _scaled_mm_cpu
7069
7103
  CUDA: _scaled_mm_cuda
7070
7104
 
7071
7105
  - func: _scaled_mm.out(Tensor self, Tensor mat2, Tensor scale_a, Tensor scale_b, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None, bool use_fast_accum=False, *, Tensor(a!) out) -> Tensor(a!)
7072
7106
  variants: function
7073
7107
  dispatch:
7108
+ CPU: _scaled_mm_out_cpu
7074
7109
  CUDA: _scaled_mm_out_cuda
7075
7110
 
7076
7111
 
@@ -7079,6 +7114,11 @@
7079
7114
  dispatch:
7080
7115
  CUDA: _scaled_grouped_mm_cuda
7081
7116
 
7117
+ - func: _grouped_mm(Tensor self, Tensor mat2, Tensor? offs=None, Tensor? bias=None, ScalarType? out_dtype=None) -> Tensor
7118
+ variants: function
7119
+ dispatch:
7120
+ CUDA: _grouped_mm_cuda
7121
+
7082
7122
  # NOTE [ Sparse: autograd and API ]
7083
7123
  #
7084
7124
  #
@@ -7233,13 +7273,13 @@
7233
7273
  dispatch:
7234
7274
  CompositeImplicitAutograd: _sparse_coo_tensor_unsafe_symint
7235
7275
 
7236
- - func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size, bool? is_coalesced=None) -> ()
7276
+ - func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size, bool? is_coalesced=None, bool? check_pinning=None) -> ()
7237
7277
 
7238
- - func: _validate_sparse_compressed_tensor_args(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, Layout layout) -> ()
7239
- - func: _validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()
7240
- - func: _validate_sparse_csc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> ()
7241
- - func: _validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()
7242
- - func: _validate_sparse_bsc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> ()
7278
+ - func: _validate_sparse_compressed_tensor_args(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, Layout layout, bool? check_pinning=None) -> ()
7279
+ - func: _validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, bool? check_pinning=None) -> ()
7280
+ - func: _validate_sparse_csc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, bool? check_pinning=None) -> ()
7281
+ - func: _validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, bool? check_pinning=None) -> ()
7282
+ - func: _validate_sparse_bsc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, bool? check_pinning=None) -> ()
7243
7283
 
7244
7284
  - func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
7245
7285
  dispatch:
@@ -7397,7 +7437,7 @@
7397
7437
  dispatch:
7398
7438
  SparseCPU, SparseCUDA, SparseMeta: values_sparse
7399
7439
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: values_sparse_csr
7400
- NestedTensorCPU, NestedTensorCUDA: values_nested
7440
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: values_nested
7401
7441
  CompositeExplicitAutograd: values_default
7402
7442
  device_check: NoCheck
7403
7443
  device_guard: False
@@ -7456,7 +7496,7 @@
7456
7496
  variants: function, method
7457
7497
  dispatch:
7458
7498
  CompositeExplicitAutograd: unbind
7459
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_unbind
7499
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_unbind
7460
7500
 
7461
7501
  - func: unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[]
7462
7502
  variants: function, method
@@ -7744,7 +7784,7 @@
7744
7784
  device_guard: False
7745
7785
  dispatch:
7746
7786
  CompositeExplicitAutograd: _to_copy
7747
- NestedTensorCPU, NestedTensorCUDA: _to_copy_nested
7787
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _to_copy_nested
7748
7788
  autogen: _to_copy.out
7749
7789
  tags: core
7750
7790
 
@@ -8030,7 +8070,7 @@
8030
8070
  variants: function, method
8031
8071
  dispatch:
8032
8072
  CompositeExplicitAutograd: masked_fill
8033
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_masked_fill
8073
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_masked_fill
8034
8074
  tags: pointwise
8035
8075
 
8036
8076
  - func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!)
@@ -8085,9 +8125,9 @@
8085
8125
  device_check: NoCheck
8086
8126
  device_guard: False
8087
8127
  dispatch:
8088
- ZeroTensor, Meta, CPU, CUDA, QuantizedCPU, QuantizedCUDA, MPS: view
8128
+ ZeroTensor, Meta, CPU, CUDA, QuantizedCPU, QuantizedCUDA, MPS, MTIA: view
8089
8129
  MkldnnCPU: mkldnn_view
8090
- NestedTensorCPU, NestedTensorCUDA: view_nested
8130
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: view_nested
8091
8131
  tags: core
8092
8132
 
8093
8133
  # Warning: If you want to change the name or overload name of this
@@ -8315,7 +8355,7 @@
8315
8355
  structured_inherits: TensorIteratorBase
8316
8356
  variants: function
8317
8357
  dispatch:
8318
- CPU, CUDA: bitwise_and_out
8358
+ CPU, CUDA, MTIA: bitwise_and_out
8319
8359
  MPS: bitwise_and_out_mps
8320
8360
  tags: pointwise
8321
8361
 
@@ -8382,7 +8422,7 @@
8382
8422
  structured_inherits: TensorIteratorBase
8383
8423
  variants: function
8384
8424
  dispatch:
8385
- CPU, CUDA: bitwise_or_out
8425
+ CPU, CUDA, MTIA: bitwise_or_out
8386
8426
  MPS: bitwise_or_out_mps
8387
8427
  tags: pointwise
8388
8428
 
@@ -8928,7 +8968,7 @@
8928
8968
  variants: method, function
8929
8969
  dispatch:
8930
8970
  QuantizedCPU: eq_quantized_cpu
8931
- NestedTensorCPU, NestedTensorCUDA: eq_scalar_nested
8971
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: eq_scalar_nested
8932
8972
  tags: [core, pointwise]
8933
8973
 
8934
8974
  - func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -8947,7 +8987,7 @@
8947
8987
  variants: method, function
8948
8988
  dispatch:
8949
8989
  QuantizedCPU: eq_quantized_cpu
8950
- NestedTensorCPU, NestedTensorCUDA: eq_tensor_nested
8990
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: eq_tensor_nested
8951
8991
  tags: [core, pointwise]
8952
8992
 
8953
8993
  - func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
@@ -8966,7 +9006,7 @@
8966
9006
  variants: method, function
8967
9007
  dispatch:
8968
9008
  QuantizedCPU: ge_quantized_cpu
8969
- NestedTensorCPU, NestedTensorCUDA: ge_scalar_nested
9009
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: ge_scalar_nested
8970
9010
  tags: [core, pointwise]
8971
9011
 
8972
9012
  - func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -9093,7 +9133,7 @@
9093
9133
  variants: method, function
9094
9134
  dispatch:
9095
9135
  QuantizedCPU: gt_quantized_cpu
9096
- NestedTensorCPU, NestedTensorCUDA: gt_scalar_nested
9136
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: gt_scalar_nested
9097
9137
  tags: [core, pointwise]
9098
9138
 
9099
9139
  - func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -9146,7 +9186,7 @@
9146
9186
  structured_inherits: TensorIteratorBase
9147
9187
  device_check: NoCheck # TensorIterator
9148
9188
  dispatch:
9149
- CPU, CUDA: lt_Scalar_out
9189
+ CPU, CUDA, MTIA: lt_Scalar_out
9150
9190
  MPS: lt_scalar_out_mps
9151
9191
  QuantizedCPU: lt_out_quantized_cpu
9152
9192
  tags: pointwise
@@ -9164,7 +9204,7 @@
9164
9204
  structured_inherits: TensorIteratorBase
9165
9205
  device_check: NoCheck # TensorIterator
9166
9206
  dispatch:
9167
- CPU, CUDA: lt_Tensor_out
9207
+ CPU, CUDA, MTIA: lt_Tensor_out
9168
9208
  MPS: lt_tensor_out_mps
9169
9209
  QuantizedCPU: lt_out_quantized_cpu
9170
9210
  tags: pointwise
@@ -9436,14 +9476,12 @@
9436
9476
 
9437
9477
  - func: cholesky.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
9438
9478
  dispatch:
9439
- CPU, CUDA: cholesky_out
9440
- MPS: cholesky_mps_out
9479
+ CPU, CUDA, MPS: cholesky_out
9441
9480
 
9442
9481
  - func: cholesky(Tensor self, bool upper=False) -> Tensor
9443
9482
  variants: method, function
9444
9483
  dispatch:
9445
- CPU, CUDA: cholesky
9446
- MPS: cholesky_mps
9484
+ CPU, CUDA, MPS: cholesky
9447
9485
 
9448
9486
  - func: cholesky_solve.out(Tensor self, Tensor input2, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
9449
9487
  dispatch:
@@ -9520,13 +9558,13 @@
9520
9558
  MPS: lu_unpack_out_mps
9521
9559
 
9522
9560
  # TODO: remove dispatch section when porting TH CUDA to ATen
9523
- - func: multinomial.out(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
9561
+ - func: multinomial.out(Tensor self, SymInt num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
9524
9562
  tags: nondeterministic_seeded
9525
9563
  dispatch:
9526
9564
  CPU, CUDA: multinomial_out
9527
9565
  MPS: multinomial_out_mps
9528
9566
 
9529
- - func: multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor
9567
+ - func: multinomial(Tensor self, SymInt num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor
9530
9568
  variants: method, function
9531
9569
  dispatch:
9532
9570
  CPU, CUDA: multinomial
@@ -9727,8 +9765,7 @@
9727
9765
  structured: True
9728
9766
  structured_inherits: TensorIteratorBase
9729
9767
  dispatch:
9730
- CPU, CUDA: lerp_Scalar
9731
- MPS: lerp_Scalar_mps
9768
+ CPU, CUDA, MPS: lerp_Scalar
9732
9769
  tags: pointwise
9733
9770
 
9734
9771
  - func: lerp.Tensor_out(Tensor self, Tensor end, Tensor weight, *, Tensor(a!) out) -> Tensor(a!)
@@ -9827,8 +9864,7 @@
9827
9864
  structured: True
9828
9865
  structured_inherits: TensorIteratorBase
9829
9866
  dispatch:
9830
- CPU, CUDA: fmod_out
9831
- MPS: fmod_mps_out
9867
+ CPU, CUDA, MPS: fmod_out
9832
9868
  tags: pointwise
9833
9869
 
9834
9870
  - func: fmod.Tensor(Tensor self, Tensor other) -> Tensor
@@ -9934,8 +9970,7 @@
9934
9970
  structured: True
9935
9971
  structured_inherits: TensorIteratorBase
9936
9972
  dispatch:
9937
- CPU, CUDA: remainder_out
9938
- MPS: remainder_out_mps
9973
+ CPU, CUDA, MPS, MTIA: remainder_out
9939
9974
  tags: pointwise
9940
9975
 
9941
9976
  - func: remainder.Tensor(Tensor self, Tensor other) -> Tensor
@@ -10019,7 +10054,7 @@
10019
10054
  structured_inherits: TensorIteratorBase
10020
10055
  device_check: NoCheck # TensorIterator
10021
10056
  dispatch:
10022
- CPU, CUDA: maximum_out
10057
+ CPU, CUDA, MTIA: maximum_out
10023
10058
  MPS: maximum_out_mps
10024
10059
  tags: pointwise
10025
10060
 
@@ -10051,7 +10086,7 @@
10051
10086
  structured_inherits: TensorIteratorBase
10052
10087
  device_check: NoCheck # TensorIterator
10053
10088
  dispatch:
10054
- CPU, CUDA: minimum_out
10089
+ CPU, CUDA, MTIA: minimum_out
10055
10090
  MPS: minimum_out_mps
10056
10091
  tags: pointwise
10057
10092
 
@@ -10203,7 +10238,7 @@
10203
10238
  device_check: NoCheck
10204
10239
  device_guard: False
10205
10240
  dispatch:
10206
- CPU, CUDA, Meta, MPS: unfold
10241
+ CPU, CUDA, Meta, MPS, MTIA: unfold
10207
10242
  QuantizedCPU, QuantizedCUDA: unfold
10208
10243
 
10209
10244
  - func: unfold_backward(Tensor grad_in, SymInt[] input_sizes, int dim, int size, int step) -> Tensor
@@ -10316,7 +10351,7 @@
10316
10351
  MPS: normal_mps_
10317
10352
  Meta: normal_meta_
10318
10353
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: normal_sparse_csr_
10319
- NestedTensorCPU, NestedTensorCUDA: normal_nested_
10354
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: normal_nested_
10320
10355
  autogen: normal.out
10321
10356
 
10322
10357
  # Only used by the functionalization pass.
@@ -10384,7 +10419,7 @@
10384
10419
  variants: method, function
10385
10420
  dispatch:
10386
10421
  CompositeExplicitAutograd: alias
10387
- NestedTensorCPU, NestedTensorCUDA: alias_nested
10422
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: alias_nested
10388
10423
  tags: core
10389
10424
 
10390
10425
  - func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> ()
@@ -10392,6 +10427,7 @@
10392
10427
  dispatch:
10393
10428
  CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_
10394
10429
  CPU: _amp_foreach_non_finite_check_and_unscale_cpu_
10430
+ MPS: _amp_foreach_non_finite_check_and_unscale_mps_
10395
10431
  autogen: _amp_foreach_non_finite_check_and_unscale, _amp_foreach_non_finite_check_and_unscale.out
10396
10432
 
10397
10433
  - func: _amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!)
@@ -10399,6 +10435,7 @@
10399
10435
  dispatch:
10400
10436
  CUDA: _amp_update_scale_cuda_
10401
10437
  CPU: _amp_update_scale_cpu_
10438
+ MPS: _amp_update_scale_mps_
10402
10439
  autogen: _amp_update_scale, _amp_update_scale.out
10403
10440
 
10404
10441
  #- func: _cat(Tensor[] tensors, int dim=0) -> Tensor
@@ -11801,7 +11838,7 @@
11801
11838
  structured_delegate: elu.out
11802
11839
  device_check: NoCheck # TensorIterator
11803
11840
  python_module: nn
11804
- tags: pointwise
11841
+ tags: [core, pointwise]
11805
11842
 
11806
11843
  - func: elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result, *, Tensor(a!) grad_input) -> Tensor(a!)
11807
11844
  structured: True
@@ -11865,8 +11902,7 @@
11865
11902
  device_check: NoCheck # TensorIterator
11866
11903
  python_module: nn
11867
11904
  dispatch:
11868
- CPU, CUDA: hardsigmoid_out
11869
- MPS: hardsigmoid_out_mps
11905
+ CPU, CUDA, MPS: hardsigmoid_out
11870
11906
  QuantizedCPU: hardsigmoid_out_quantized_cpu
11871
11907
 
11872
11908
  - func: hardsigmoid(Tensor self) -> Tensor
@@ -11887,8 +11923,7 @@
11887
11923
  structured_inherits: TensorIteratorBase
11888
11924
  python_module: nn
11889
11925
  dispatch:
11890
- CPU, CUDA: hardsigmoid_backward_out
11891
- MPS: hardsigmoid_backward_out_mps
11926
+ CPU, CUDA, MPS: hardsigmoid_backward_out
11892
11927
 
11893
11928
  - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor
11894
11929
  structured_delegate: hardsigmoid_backward.grad_input
@@ -11932,28 +11967,24 @@
11932
11967
  device_check: NoCheck # TensorIterator
11933
11968
  python_module: nn
11934
11969
  dispatch:
11935
- CPU, CUDA: hardswish_out
11936
- MPS: hardswish_out_mps
11970
+ CPU, CUDA, MPS: hardswish_out
11937
11971
 
11938
11972
  - func: hardswish(Tensor self) -> Tensor
11939
11973
  device_check: NoCheck # TensorIterator
11940
11974
  python_module: nn
11941
11975
  dispatch:
11942
- CPU, CUDA: hardswish
11943
- MPS: hardswish_mps
11976
+ CPU, CUDA, MPS: hardswish
11944
11977
 
11945
11978
  - func: hardswish_(Tensor(a!) self) -> Tensor(a!)
11946
11979
  device_check: NoCheck # TensorIterator
11947
11980
  python_module: nn
11948
11981
  dispatch:
11949
- CPU, CUDA: hardswish_
11950
- MPS: hardswish_mps_
11982
+ CPU, CUDA, MPS: hardswish_
11951
11983
 
11952
11984
  - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor
11953
11985
  python_module: nn
11954
11986
  dispatch:
11955
- CPU, CUDA: hardswish_backward
11956
- MPS: hardswish_backward_mps
11987
+ CPU, CUDA, MPS: hardswish_backward
11957
11988
  autogen: hardswish_backward.out
11958
11989
 
11959
11990
  - func: leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!)
@@ -11962,8 +11993,7 @@
11962
11993
  device_check: NoCheck # TensorIterator
11963
11994
  python_module: nn
11964
11995
  dispatch:
11965
- CPU, CUDA: leaky_relu_out
11966
- MPS: leaky_relu_out_mps
11996
+ CPU, CUDA, MPS: leaky_relu_out
11967
11997
  QuantizedCPU: leaky_relu_out_quantized_cpu
11968
11998
 
11969
11999
  - func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor
@@ -11979,8 +12009,7 @@
11979
12009
  structured_inherits: TensorIteratorBase
11980
12010
  python_module: nn
11981
12011
  dispatch:
11982
- CPU, CUDA: leaky_relu_backward_out
11983
- MPS: leaky_relu_backward_out_mps
12012
+ CPU, CUDA, MPS: leaky_relu_backward_out
11984
12013
 
11985
12014
  - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor
11986
12015
  structured_delegate: leaky_relu_backward.grad_input
@@ -12092,8 +12121,7 @@
12092
12121
  device_check: NoCheck # TensorIterator
12093
12122
  python_module: nn
12094
12123
  dispatch:
12095
- CPU, CUDA: softshrink_out
12096
- MPS: softshrink_out_mps
12124
+ CPU, CUDA, MPS: softshrink_out
12097
12125
 
12098
12126
  - func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor
12099
12127
  structured_delegate: softshrink.out
@@ -12106,8 +12134,7 @@
12106
12134
  structured_inherits: TensorIteratorBase
12107
12135
  python_module: nn
12108
12136
  dispatch:
12109
- CPU, CUDA: softshrink_backward_out
12110
- MPS: softshrink_backward_out_mps
12137
+ CPU, CUDA, MPS: softshrink_backward_out
12111
12138
 
12112
12139
  - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor
12113
12140
  structured_delegate: softshrink_backward.grad_input
@@ -12769,6 +12796,7 @@
12769
12796
  dispatch:
12770
12797
  CPU: _upsample_bicubic2d_aa_out_cpu
12771
12798
  CUDA: _upsample_bicubic2d_aa_out_cuda
12799
+ MPS: _upsample_bicubic2d_aa_out_mps
12772
12800
 
12773
12801
  - func: _upsample_bicubic2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
12774
12802
  python_module: nn
@@ -12791,6 +12819,7 @@
12791
12819
  dispatch:
12792
12820
  CPU: upsample_trilinear3d_out_cpu
12793
12821
  CUDA: upsample_trilinear3d_out_cuda
12822
+ MPS: upsample_trilinear3d_out_mps
12794
12823
 
12795
12824
  - func: upsample_trilinear3d(Tensor self, SymInt[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
12796
12825
  python_module: nn
@@ -12802,6 +12831,7 @@
12802
12831
  dispatch:
12803
12832
  CPU: upsample_trilinear3d_backward_out_cpu
12804
12833
  CUDA: upsample_trilinear3d_backward_out_cuda
12834
+ MPS: upsample_trilinear3d_backward_out_mps
12805
12835
 
12806
12836
  - func: upsample_trilinear3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
12807
12837
  python_module: nn
@@ -12913,6 +12943,7 @@
12913
12943
  dispatch:
12914
12944
  CPU: upsample_nearest3d_out_cpu
12915
12945
  CUDA: upsample_nearest3d_out_cuda
12946
+ MPS: upsample_nearest3d_out_mps
12916
12947
 
12917
12948
  - func: _upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
12918
12949
  python_module: nn
@@ -12920,6 +12951,7 @@
12920
12951
  dispatch:
12921
12952
  CPU: _upsample_nearest_exact3d_out_cpu
12922
12953
  CUDA: _upsample_nearest_exact3d_out_cuda
12954
+ MPS: _upsample_nearest_exact3d_out_mps
12923
12955
 
12924
12956
  - func: upsample_nearest3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
12925
12957
  python_module: nn
@@ -12939,6 +12971,7 @@
12939
12971
  dispatch:
12940
12972
  CPU: upsample_nearest3d_backward_out_cpu
12941
12973
  CUDA: upsample_nearest3d_backward_out_cuda
12974
+ MPS: upsample_nearest3d_backward_out_mps
12942
12975
 
12943
12976
  - func: _upsample_nearest_exact3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
12944
12977
  python_module: nn
@@ -12946,6 +12979,7 @@
12946
12979
  dispatch:
12947
12980
  CPU: _upsample_nearest_exact3d_backward_out_cpu
12948
12981
  CUDA: _upsample_nearest_exact3d_backward_out_cuda
12982
+ MPS: _upsample_nearest_exact3d_backward_out_mps
12949
12983
 
12950
12984
  - func: upsample_nearest3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
12951
12985
  python_module: nn
@@ -12988,7 +13022,7 @@
12988
13022
  structured: True
12989
13023
  structured_inherits: TensorIteratorBase
12990
13024
  dispatch:
12991
- CPU, CUDA: tanh_backward_out
13025
+ CPU, CUDA, MTIA: tanh_backward_out
12992
13026
  MPS: tanh_backward_out_mps
12993
13027
  tags: pointwise
12994
13028
 
@@ -13120,12 +13154,14 @@
13120
13154
  dispatch:
13121
13155
  CPU: col2im_out_cpu
13122
13156
  CUDA: col2im_out_cuda
13157
+ MPS: col2im_out_mps
13123
13158
 
13124
13159
  - func: col2im(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
13125
13160
  python_module: nn
13126
13161
  dispatch:
13127
13162
  CPU: col2im_cpu
13128
13163
  CUDA: col2im_cuda
13164
+ MPS: col2im_mps
13129
13165
  tags: core
13130
13166
 
13131
13167
  - func: column_stack(Tensor[] tensors) -> Tensor
@@ -13158,7 +13194,7 @@
13158
13194
  device_guard: False
13159
13195
  dispatch:
13160
13196
  CompositeExplicitAutograd: isinf
13161
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_isinf
13197
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isinf
13162
13198
  SparseCPU, SparseCUDA: isinf_sparse
13163
13199
  SparseMeta: isinf_sparse_meta
13164
13200
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isinf_sparse_csr
@@ -13174,7 +13210,7 @@
13174
13210
  variants: function, method
13175
13211
  structured_delegate: isposinf.out
13176
13212
  dispatch:
13177
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_isposinf
13213
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isposinf
13178
13214
  SparseCPU, SparseCUDA: isposinf_sparse
13179
13215
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr
13180
13216
  tags: pointwise
@@ -13192,7 +13228,7 @@
13192
13228
  variants: function, method
13193
13229
  structured_delegate: isneginf.out
13194
13230
  dispatch:
13195
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_isneginf
13231
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isneginf
13196
13232
  SparseCPU, SparseCUDA: isneginf_sparse
13197
13233
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr
13198
13234
  tags: pointwise
@@ -13500,7 +13536,7 @@
13500
13536
  structured: True
13501
13537
  structured_inherits: TensorIteratorBase
13502
13538
  dispatch:
13503
- CPU, CUDA: special_i0e_out
13539
+ CPU, CUDA, MPS: special_i0e_out
13504
13540
  tags: pointwise
13505
13541
 
13506
13542
  - func: special_i1(Tensor self) -> Tensor
@@ -13528,7 +13564,7 @@
13528
13564
  structured: True
13529
13565
  structured_inherits: TensorIteratorBase
13530
13566
  dispatch:
13531
- CPU, CUDA: special_i1e_out
13567
+ CPU, CUDA, MPS: special_i1e_out
13532
13568
  tags: pointwise
13533
13569
 
13534
13570
  - func: special_logit(Tensor self, float? eps=None) -> Tensor
@@ -13897,8 +13933,7 @@
13897
13933
  python_module: linalg
13898
13934
  structured: True
13899
13935
  dispatch:
13900
- CPU, CUDA: linalg_cholesky_ex_out
13901
- MPS: linalg_cholesky_ex_out_mps
13936
+ CPU, CUDA, MPS: linalg_cholesky_ex_out
13902
13937
 
13903
13938
  - func: linalg_cholesky(Tensor self, *, bool upper=False) -> Tensor
13904
13939
  python_module: linalg
@@ -14468,13 +14503,13 @@
14468
14503
  dispatch:
14469
14504
  # the NestedTensor keys are necessary because NestedTensor has been removed
14470
14505
  # from the CompositeExplicitAutograd keyset see Note [NestedTensor Not Included in Backend Keys]
14471
- CompositeExplicitAutograd, NestedTensorCPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_fullcoverage
14506
+ CompositeExplicitAutograd, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_fullcoverage
14472
14507
  autogen: _test_autograd_multiple_dispatch.fullcoverage_out
14473
14508
 
14474
14509
  # Note: this function is only for testing.
14475
14510
  - func: _test_autograd_multiple_dispatch.ntonly(Tensor self, bool b) -> Tensor
14476
14511
  dispatch:
14477
- CompositeImplicitAutograd, NestedTensorCPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_ntonly
14512
+ CompositeImplicitAutograd, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_ntonly
14478
14513
 
14479
14514
  # Note: this function is only for testing.
14480
14515
  - func: _test_autograd_multiple_dispatch_view(Tensor(a) self) -> Tensor(a)
@@ -14819,13 +14854,13 @@
14819
14854
  - func: _safe_softmax(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
14820
14855
  dispatch:
14821
14856
  CompositeExplicitAutograd: _safe_softmax
14822
- NestedTensorCPU, NestedTensorCUDA: _safe_softmax
14857
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _safe_softmax
14823
14858
 
14824
14859
  # Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is.
14825
14860
  - func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None) -> Tensor
14826
14861
  variants: function
14827
14862
  dispatch:
14828
- CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: transformer_encoder_layer_forward
14863
+ CPU, CUDA, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: transformer_encoder_layer_forward
14829
14864
  autogen: _transformer_encoder_layer_fwd.out
14830
14865
 
14831
14866
  - func: _native_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True, int? mask_type=None) -> (Tensor, Tensor)
@@ -14990,7 +15025,7 @@
14990
15025
 
14991
15026
  - func: special_bessel_j0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
14992
15027
  dispatch:
14993
- CPU, CUDA: special_bessel_j0_out
15028
+ CPU, CUDA, MPS: special_bessel_j0_out
14994
15029
  python_module: special
14995
15030
  structured_inherits: TensorIteratorBase
14996
15031
  structured: True
@@ -15005,7 +15040,7 @@
15005
15040
 
15006
15041
  - func: special_bessel_j1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
15007
15042
  dispatch:
15008
- CPU, CUDA: special_bessel_j1_out
15043
+ CPU, CUDA, MPS: special_bessel_j1_out
15009
15044
  python_module: special
15010
15045
  structured_inherits: TensorIteratorBase
15011
15046
  structured: True
@@ -15020,7 +15055,7 @@
15020
15055
 
15021
15056
  - func: special_bessel_y0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
15022
15057
  dispatch:
15023
- CPU, CUDA: special_bessel_y0_out
15058
+ CPU, CUDA, MPS: special_bessel_y0_out
15024
15059
  python_module: special
15025
15060
  structured_inherits: TensorIteratorBase
15026
15061
  structured: True
@@ -15035,7 +15070,7 @@
15035
15070
 
15036
15071
  - func: special_bessel_y1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
15037
15072
  dispatch:
15038
- CPU, CUDA: special_bessel_y1_out
15073
+ CPU, CUDA, MPS: special_bessel_y1_out
15039
15074
  python_module: special
15040
15075
  structured_inherits: TensorIteratorBase
15041
15076
  structured: True
@@ -15068,7 +15103,7 @@
15068
15103
  - func: special_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
15069
15104
  device_check: NoCheck
15070
15105
  dispatch:
15071
- CPU, CUDA: special_chebyshev_polynomial_t_out
15106
+ CPU, CUDA, MPS: special_chebyshev_polynomial_t_out
15072
15107
  python_module: special
15073
15108
  structured_inherits: TensorIteratorBase
15074
15109
  structured: True
@@ -15117,7 +15152,7 @@
15117
15152
  - func: special_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
15118
15153
  device_check: NoCheck
15119
15154
  dispatch:
15120
- CPU, CUDA: special_chebyshev_polynomial_u_out
15155
+ CPU, CUDA, MPS: special_chebyshev_polynomial_u_out
15121
15156
  python_module: special
15122
15157
  structured_inherits: TensorIteratorBase
15123
15158
  structured: True
@@ -15166,7 +15201,7 @@
15166
15201
  - func: special_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
15167
15202
  device_check: NoCheck
15168
15203
  dispatch:
15169
- CPU, CUDA: special_chebyshev_polynomial_v_out
15204
+ CPU, CUDA, MPS: special_chebyshev_polynomial_v_out
15170
15205
  python_module: special
15171
15206
  structured_inherits: TensorIteratorBase
15172
15207
  structured: True
@@ -15215,7 +15250,7 @@
15215
15250
  - func: special_chebyshev_polynomial_w.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
15216
15251
  device_check: NoCheck
15217
15252
  dispatch:
15218
- CPU, CUDA: special_chebyshev_polynomial_w_out
15253
+ CPU, CUDA, MPS: special_chebyshev_polynomial_w_out
15219
15254
  python_module: special
15220
15255
  structured_inherits: TensorIteratorBase
15221
15256
  structured: True
@@ -15264,7 +15299,7 @@
15264
15299
  - func: special_hermite_polynomial_h.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
15265
15300
  device_check: NoCheck
15266
15301
  dispatch:
15267
- CPU, CUDA: special_hermite_polynomial_h_out
15302
+ CPU, CUDA, MPS: special_hermite_polynomial_h_out
15268
15303
  python_module: special
15269
15304
  structured_inherits: TensorIteratorBase
15270
15305
  structured: True
@@ -15313,7 +15348,7 @@
15313
15348
  - func: special_hermite_polynomial_he.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
15314
15349
  device_check: NoCheck
15315
15350
  dispatch:
15316
- CPU, CUDA: special_hermite_polynomial_he_out
15351
+ CPU, CUDA, MPS: special_hermite_polynomial_he_out
15317
15352
  python_module: special
15318
15353
  structured_inherits: TensorIteratorBase
15319
15354
  structured: True
@@ -15442,7 +15477,7 @@
15442
15477
 
15443
15478
  - func: special_modified_bessel_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
15444
15479
  dispatch:
15445
- CPU, CUDA: special_modified_bessel_i0_out
15480
+ CPU, CUDA, MPS: special_modified_bessel_i0_out
15446
15481
  python_module: special
15447
15482
  structured_inherits: TensorIteratorBase
15448
15483
  structured: True
@@ -15457,7 +15492,7 @@
15457
15492
 
15458
15493
  - func: special_modified_bessel_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
15459
15494
  dispatch:
15460
- CPU, CUDA: special_modified_bessel_i1_out
15495
+ CPU, CUDA, MPS: special_modified_bessel_i1_out
15461
15496
  python_module: special
15462
15497
  structured_inherits: TensorIteratorBase
15463
15498
  structured: True
@@ -15472,7 +15507,7 @@
15472
15507
 
15473
15508
  - func: special_modified_bessel_k0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
15474
15509
  dispatch:
15475
- CPU, CUDA: special_modified_bessel_k0_out
15510
+ CPU, CUDA, MPS: special_modified_bessel_k0_out
15476
15511
  python_module: special
15477
15512
  structured_inherits: TensorIteratorBase
15478
15513
  structured: True
@@ -15487,7 +15522,7 @@
15487
15522
 
15488
15523
  - func: special_modified_bessel_k1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
15489
15524
  dispatch:
15490
- CPU, CUDA: special_modified_bessel_k1_out
15525
+ CPU, CUDA, MPS: special_modified_bessel_k1_out
15491
15526
  python_module: special
15492
15527
  structured_inherits: TensorIteratorBase
15493
15528
  structured: True
@@ -15502,7 +15537,7 @@
15502
15537
 
15503
15538
  - func: special_scaled_modified_bessel_k0.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
15504
15539
  dispatch:
15505
- CPU, CUDA: special_scaled_modified_bessel_k0_out
15540
+ CPU, CUDA, MPS: special_scaled_modified_bessel_k0_out
15506
15541
  python_module: special
15507
15542
  structured_inherits: TensorIteratorBase
15508
15543
  structured: True
@@ -15517,7 +15552,7 @@
15517
15552
 
15518
15553
  - func: special_scaled_modified_bessel_k1.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
15519
15554
  dispatch:
15520
- CPU, CUDA: special_scaled_modified_bessel_k1_out
15555
+ CPU, CUDA, MPS: special_scaled_modified_bessel_k1_out
15521
15556
  python_module: special
15522
15557
  structured_inherits: TensorIteratorBase
15523
15558
  structured: True
@@ -15808,6 +15843,13 @@
15808
15843
  CPU: _fused_adagrad_kernel_cpu_
15809
15844
  autogen: _fused_adagrad, _fused_adagrad.out
15810
15845
 
15846
+ - func: _fused_adagrad_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor[] state_steps, *, Tensor lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
15847
+ device_check: NoCheck
15848
+ variants: function
15849
+ dispatch:
15850
+ CPU: _fused_adagrad_kernel_cpu_
15851
+ autogen: _fused_adagrad.tensor_lr, _fused_adagrad.tensor_lr_out
15852
+
15811
15853
  # This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts.
15812
15854
  - func: _propagate_xla_data(Tensor input, Tensor output) -> ()
15813
15855
  variants: function