torch-rb 0.19.1 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -288,13 +288,13 @@
288
288
  dispatch:
289
289
  CPU: native_dropout_cpu
290
290
  CUDA: native_dropout_cuda
291
- NestedTensorCPU, NestedTensorCUDA: native_dropout_nested
291
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: native_dropout_nested
292
292
  tags: [nondeterministic_seeded, core]
293
293
  autogen: native_dropout.out
294
294
 
295
295
  - func: native_dropout_backward(Tensor grad_output, Tensor mask, float scale) -> Tensor
296
296
  dispatch:
297
- CPU, NestedTensorCPU, NestedTensorCUDA: native_dropout_backward
297
+ CPU, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: native_dropout_backward
298
298
  CUDA: native_dropout_backward_cuda
299
299
  autogen: native_dropout_backward.out
300
300
  tags: pointwise
@@ -342,7 +342,7 @@
342
342
  CompositeExplicitAutograd: abs
343
343
  SparseCPU, SparseCUDA: abs_sparse
344
344
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr
345
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs
345
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs
346
346
  tags: [core, pointwise]
347
347
 
348
348
  - func: abs_(Tensor(a!) self) -> Tensor(a!)
@@ -352,13 +352,12 @@
352
352
  CompositeExplicitAutograd: abs_
353
353
  SparseCPU, SparseCUDA: abs_sparse_
354
354
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_
355
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs_
355
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs_
356
356
 
357
357
  - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
358
358
  device_check: NoCheck # TensorIterator
359
359
  dispatch:
360
- CPU, CUDA: abs_out
361
- MPS: abs_out_mps
360
+ CPU, CUDA, MPS: abs_out
362
361
  SparseCPU, SparseCUDA: abs_sparse_out
363
362
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_out
364
363
  tags: pointwise
@@ -403,6 +402,7 @@
403
402
  variants: function, method
404
403
  dispatch:
405
404
  CPU, CUDA: angle
405
+ MPS: angle_mps
406
406
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: angle_sparse_csr
407
407
  tags: pointwise
408
408
 
@@ -410,6 +410,7 @@
410
410
  device_check: NoCheck # TensorIterator
411
411
  dispatch:
412
412
  CPU, CUDA: angle_out
413
+ MPS: angle_out_mps
413
414
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: angle_sparse_csr_out
414
415
  tags: pointwise
415
416
 
@@ -429,7 +430,7 @@
429
430
  dispatch:
430
431
  SparseCPU, SparseCUDA: sgn_sparse
431
432
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr
432
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn
433
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn
433
434
  tags: pointwise
434
435
 
435
436
  - func: sgn_(Tensor(a!) self) -> Tensor(a!)
@@ -438,7 +439,7 @@
438
439
  dispatch:
439
440
  SparseCPU, SparseCUDA: sgn_sparse_
440
441
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_
441
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn_
442
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn_
442
443
  tags: pointwise
443
444
 
444
445
  - func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -525,8 +526,7 @@
525
526
  structured: True
526
527
  structured_inherits: TensorIteratorBase
527
528
  dispatch:
528
- CPU, CUDA: acos_out
529
- MPS: acos_out_mps
529
+ CPU, CUDA, MPS: acos_out
530
530
  tags: pointwise
531
531
 
532
532
  # arccos, alias of acos
@@ -558,7 +558,7 @@
558
558
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr
559
559
  MkldnnCPU: mkldnn_add
560
560
  ZeroTensor: add_zerotensor
561
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor
561
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_add_Tensor
562
562
  tags: [core, pointwise]
563
563
 
564
564
  - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
@@ -569,7 +569,7 @@
569
569
  SparseCPU, SparseCUDA, SparseMeta: add_sparse_
570
570
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_
571
571
  MkldnnCPU: mkldnn_add_
572
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor
572
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_add__Tensor
573
573
  tags: pointwise
574
574
 
575
575
  - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
@@ -701,7 +701,7 @@
701
701
  structured_delegate: all.out
702
702
  variants: function, method
703
703
  dispatch:
704
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_all
704
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_all
705
705
 
706
706
 
707
707
  - func: all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
@@ -940,7 +940,7 @@
940
940
  - func: as_strided(Tensor(a) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a)
941
941
  variants: function, method
942
942
  dispatch:
943
- ZeroTensor, CPU, CUDA: as_strided_tensorimpl
943
+ ZeroTensor, CPU, CUDA, MTIA: as_strided_tensorimpl
944
944
  Meta: as_strided_tensorimpl_meta_symint
945
945
  MPS: as_strided_tensorimpl_mps
946
946
  QuantizedCPU, QuantizedCUDA: as_strided_qtensorimpl
@@ -980,8 +980,7 @@
980
980
  structured: True
981
981
  structured_inherits: TensorIteratorBase
982
982
  dispatch:
983
- CPU, CUDA: asin_out
984
- MPS: asin_out_mps
983
+ CPU, CUDA, MPS: asin_out
985
984
  SparseCPU, SparseCUDA: asin_sparse_out
986
985
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_out
987
986
  tags: pointwise
@@ -1018,8 +1017,7 @@
1018
1017
  structured: True
1019
1018
  structured_inherits: TensorIteratorBase
1020
1019
  dispatch:
1021
- CPU, CUDA: atan_out
1022
- MPS: atan_out_mps
1020
+ CPU, CUDA, MPS: atan_out
1023
1021
  SparseCPU, SparseCUDA: atan_sparse_out
1024
1022
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_out
1025
1023
  tags: pointwise
@@ -1071,6 +1069,16 @@
1071
1069
  XPU: baddbmm_out_xpu
1072
1070
  SparseCsrCUDA: baddbmm_out_sparse_csr_cuda
1073
1071
 
1072
+ - func: baddbmm.dtype(Tensor self, Tensor batch1, Tensor batch2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1) -> Tensor
1073
+ variants: function
1074
+ dispatch:
1075
+ CUDA: _baddbmm_dtype_cuda
1076
+
1077
+ - func: baddbmm.dtype_out(Tensor self, Tensor batch1, Tensor batch2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
1078
+ variants: function
1079
+ dispatch:
1080
+ CUDA: _baddbmm_out_dtype_cuda
1081
+
1074
1082
  - func: bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
1075
1083
  dispatch:
1076
1084
  CompositeExplicitAutograd: bartlett_window
@@ -1183,7 +1191,7 @@
1183
1191
  CompositeExplicitAutograd: binary_cross_entropy_with_logits
1184
1192
  autogen: binary_cross_entropy_with_logits.out
1185
1193
 
1186
- - func: bincount(Tensor self, Tensor? weights=None, int minlength=0) -> Tensor
1194
+ - func: bincount(Tensor self, Tensor? weights=None, SymInt minlength=0) -> Tensor
1187
1195
  variants: function, method
1188
1196
  dispatch:
1189
1197
  CPU: _bincount_cpu
@@ -1209,8 +1217,7 @@
1209
1217
  structured: True
1210
1218
  structured_inherits: TensorIteratorBase
1211
1219
  dispatch:
1212
- CPU, CUDA: bitwise_not_out
1213
- MPS: bitwise_not_out_mps
1220
+ CPU, CUDA, MPS, MTIA: bitwise_not_out
1214
1221
  tags: pointwise
1215
1222
 
1216
1223
  - func: copysign.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -1260,7 +1267,7 @@
1260
1267
  variants: function, method
1261
1268
  dispatch:
1262
1269
  CompositeExplicitAutograd: logical_not
1263
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not
1270
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_logical_not
1264
1271
  tags: [core, pointwise]
1265
1272
 
1266
1273
  - func: logical_not_(Tensor(a!) self) -> Tensor(a!)
@@ -1268,7 +1275,7 @@
1268
1275
  variants: method
1269
1276
  dispatch:
1270
1277
  CompositeExplicitAutograd: logical_not_
1271
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not_
1278
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_logical_not_
1272
1279
  tags: pointwise
1273
1280
 
1274
1281
  - func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -1316,7 +1323,7 @@
1316
1323
  - func: logical_and.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
1317
1324
  device_check: NoCheck # TensorIterator
1318
1325
  dispatch:
1319
- CPU, CUDA: logical_and_out
1326
+ CPU, CUDA, MTIA: logical_and_out
1320
1327
  MPS: logical_and_out_mps
1321
1328
  tags: pointwise
1322
1329
 
@@ -1337,7 +1344,7 @@
1337
1344
  - func: logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
1338
1345
  device_check: NoCheck # TensorIterator
1339
1346
  dispatch:
1340
- CPU, CUDA: logical_or_out
1347
+ CPU, CUDA, MTIA: logical_or_out
1341
1348
  MPS: logical_or_out_mps
1342
1349
  tags: pointwise
1343
1350
 
@@ -1373,6 +1380,16 @@
1373
1380
  SparseCUDA: bmm_out_sparse_cuda
1374
1381
  SparseCsrCUDA: bmm_out_sparse_csr_cuda
1375
1382
 
1383
+ - func: bmm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor
1384
+ variants: function
1385
+ dispatch:
1386
+ CUDA: _bmm_dtype_cuda
1387
+
1388
+ - func: bmm.dtype_out(Tensor self, Tensor mat2, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!)
1389
+ variants: function
1390
+ dispatch:
1391
+ CUDA: _bmm_out_dtype_cuda
1392
+
1376
1393
  - func: broadcast_tensors(Tensor[] tensors) -> Tensor[]
1377
1394
  device_check: NoCheck
1378
1395
  device_guard: False
@@ -1392,7 +1409,7 @@
1392
1409
  dispatch:
1393
1410
  SparseCPU, SparseCUDA: cat_sparse
1394
1411
  QuantizedCPU: cat_quantized_cpu
1395
- NestedTensorCPU, NestedTensorCUDA: cat_nested
1412
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: cat_nested
1396
1413
  tags: core
1397
1414
 
1398
1415
  - func: cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
@@ -1456,8 +1473,7 @@
1456
1473
  structured: True
1457
1474
  structured_inherits: TensorIteratorBase
1458
1475
  dispatch:
1459
- CPU, CUDA: ceil_out
1460
- MPS: ceil_out_mps
1476
+ CPU, CUDA, MPS: ceil_out
1461
1477
  SparseCPU, SparseCUDA: ceil_sparse_out
1462
1478
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_out
1463
1479
  tags: pointwise
@@ -1481,7 +1497,7 @@
1481
1497
  device_guard: False
1482
1498
  dispatch:
1483
1499
  CompositeImplicitAutograd: chunk
1484
- NestedTensorCPU, NestedTensorCUDA: chunk_nested_tensor
1500
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: chunk_nested_tensor
1485
1501
 
1486
1502
  - func: tensor_split.sections(Tensor(a -> *) self, SymInt sections, int dim=0) -> Tensor(a)[]
1487
1503
  variants: function, method
@@ -1528,7 +1544,7 @@
1528
1544
  structured: True
1529
1545
  structured_inherits: TensorIteratorBase
1530
1546
  dispatch:
1531
- CPU, CUDA: clamp_out
1547
+ CPU, CUDA, MTIA: clamp_out
1532
1548
  MPS: clamp_out_mps
1533
1549
  tags: pointwise
1534
1550
 
@@ -1568,7 +1584,7 @@
1568
1584
  structured: True
1569
1585
  structured_inherits: TensorIteratorBase
1570
1586
  dispatch:
1571
- CPU, CUDA: clamp_max_out
1587
+ CPU, CUDA, MTIA: clamp_max_out
1572
1588
  MPS: clamp_max_out_mps
1573
1589
  tags: pointwise
1574
1590
 
@@ -1608,7 +1624,7 @@
1608
1624
  structured: True
1609
1625
  structured_inherits: TensorIteratorBase
1610
1626
  dispatch:
1611
- CPU, CUDA: clamp_min_out
1627
+ CPU, CUDA, MTIA: clamp_min_out
1612
1628
  MPS: clamp_min_out_mps
1613
1629
  tags: pointwise
1614
1630
 
@@ -1657,8 +1673,7 @@
1657
1673
 
1658
1674
  - func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!)
1659
1675
  dispatch:
1660
- CPU, CUDA: complex_out
1661
- MPS: complex_out_mps
1676
+ CPU, CUDA, MPS: complex_out
1662
1677
 
1663
1678
  - func: polar(Tensor abs, Tensor angle) -> Tensor
1664
1679
  variants: function
@@ -1667,8 +1682,7 @@
1667
1682
 
1668
1683
  - func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!)
1669
1684
  dispatch:
1670
- CPU, CUDA: polar_out
1671
- MPS: polar_out_mps
1685
+ CPU, CUDA, MPS: polar_out
1672
1686
 
1673
1687
  - func: constant_pad_nd(Tensor self, SymInt[] pad, Scalar value=0) -> Tensor
1674
1688
  variants: function
@@ -1780,7 +1794,7 @@
1780
1794
  SparseCPU, SparseCUDA: copy_sparse_wrapper_
1781
1795
  CompositeExplicitAutograd: copy_
1782
1796
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: copy_sparse_compressed_
1783
- NestedTensorCPU, NestedTensorCUDA: copy_nested_
1797
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: copy_nested_
1784
1798
  autogen: copy.out
1785
1799
 
1786
1800
  - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor
@@ -1800,7 +1814,7 @@
1800
1814
  variants: function, method
1801
1815
  structured_delegate: cos.out
1802
1816
  dispatch:
1803
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_cos
1817
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_cos
1804
1818
  tags: [core, pointwise]
1805
1819
 
1806
1820
  - func: cos_(Tensor(a!) self) -> Tensor(a!)
@@ -1814,8 +1828,7 @@
1814
1828
  structured: True
1815
1829
  structured_inherits: TensorIteratorBase
1816
1830
  dispatch:
1817
- CPU, CUDA: cos_out
1818
- MPS: cos_out_mps
1831
+ CPU, CUDA, MPS, MTIA: cos_out
1819
1832
  tags: pointwise
1820
1833
 
1821
1834
  - func: cosh(Tensor self) -> Tensor
@@ -1835,8 +1848,7 @@
1835
1848
  structured: True
1836
1849
  structured_inherits: TensorIteratorBase
1837
1850
  dispatch:
1838
- CPU, CUDA: cosh_out
1839
- MPS: cosh_out_mps
1851
+ CPU, CUDA, MPS: cosh_out
1840
1852
  tags: pointwise
1841
1853
 
1842
1854
  - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
@@ -1950,6 +1962,7 @@
1950
1962
  dispatch:
1951
1963
  CPU: cummax_helper_cpu
1952
1964
  CUDA: cummax_helper_cuda
1965
+ MPS: cummax_helper_mps
1953
1966
 
1954
1967
  - func: cummin(Tensor self, int dim) -> (Tensor values, Tensor indices)
1955
1968
  device_check: NoCheck # TensorIterator
@@ -1974,6 +1987,7 @@
1974
1987
  dispatch:
1975
1988
  CPU: cummin_helper_cpu
1976
1989
  CUDA: cummin_helper_cuda
1990
+ MPS: cummin_helper_mps
1977
1991
 
1978
1992
  - func: cummaxmin_backward(Tensor grad, Tensor input, Tensor indices, int dim) -> Tensor
1979
1993
  variants: function
@@ -2138,7 +2152,7 @@
2138
2152
  dispatch:
2139
2153
  SparseCPU, SparseCUDA: div_sparse
2140
2154
  ZeroTensor: div_zerotensor
2141
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Tensor
2155
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_div_Tensor
2142
2156
  tags: [core, pointwise]
2143
2157
 
2144
2158
  - func: div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@@ -2154,8 +2168,7 @@
2154
2168
  structured: True
2155
2169
  structured_inherits: TensorIteratorBase
2156
2170
  dispatch:
2157
- CPU, CUDA: div_out
2158
- MPS: div_out_mps
2171
+ CPU, CUDA, MPS: div_out
2159
2172
  SparseCPU, SparseCUDA: div_out_sparse_zerodim
2160
2173
  tags: pointwise
2161
2174
 
@@ -2180,8 +2193,7 @@
2180
2193
  structured: True
2181
2194
  structured_inherits: TensorIteratorBase
2182
2195
  dispatch:
2183
- CPU, CUDA: div_out_mode
2184
- MPS: div_out_mode_mps
2196
+ CPU, CUDA, MPS: div_out_mode
2185
2197
  SparseCPU, SparseCUDA: div_out_sparse_zerodim
2186
2198
  tags: pointwise
2187
2199
 
@@ -2191,7 +2203,7 @@
2191
2203
  variants: function, method
2192
2204
  dispatch:
2193
2205
  CompositeExplicitAutograd: div
2194
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Scalar
2206
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_div_Scalar
2195
2207
  tags: [core, pointwise]
2196
2208
 
2197
2209
  - func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
@@ -2291,7 +2303,7 @@
2291
2303
  - func: embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
2292
2304
  dispatch:
2293
2305
  CompositeExplicitAutograd: embedding_symint
2294
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_embedding
2306
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_embedding
2295
2307
  autogen: embedding.out
2296
2308
  tags: core
2297
2309
 
@@ -2497,7 +2509,7 @@
2497
2509
  QuantizedCPU, QuantizedCUDA: empty_like_quantized
2498
2510
  SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo
2499
2511
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr
2500
- NestedTensorCPU, NestedTensorCUDA: empty_like_nested
2512
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: empty_like_nested
2501
2513
  autogen: empty_like.out
2502
2514
 
2503
2515
  - func: empty_strided(SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -2533,8 +2545,7 @@
2533
2545
  structured: True
2534
2546
  structured_inherits: TensorIteratorBase
2535
2547
  dispatch:
2536
- CPU, CUDA: erf_out
2537
- MPS: erf_out_mps
2548
+ CPU, CUDA, MPS, MTIA: erf_out
2538
2549
  SparseCPU, SparseCUDA: erf_sparse_out
2539
2550
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_out
2540
2551
  tags: pointwise
@@ -2556,7 +2567,7 @@
2556
2567
  structured: True
2557
2568
  structured_inherits: TensorIteratorBase
2558
2569
  dispatch:
2559
- CPU, CUDA: erfc_out
2570
+ CPU, CUDA, MPS: erfc_out
2560
2571
  tags: pointwise
2561
2572
 
2562
2573
  - func: exp(Tensor self) -> Tensor
@@ -2576,8 +2587,7 @@
2576
2587
  structured: True
2577
2588
  structured_inherits: TensorIteratorBase
2578
2589
  dispatch:
2579
- CPU, CUDA: exp_out
2580
- MPS: exp_out_mps
2590
+ CPU, CUDA, MPS, MTIA: exp_out
2581
2591
  tags: pointwise
2582
2592
 
2583
2593
  - func: exp2(Tensor self) -> Tensor
@@ -2594,8 +2604,7 @@
2594
2604
  structured: True
2595
2605
  structured_inherits: TensorIteratorBase
2596
2606
  dispatch:
2597
- CPU, CUDA: exp2_out
2598
- MPS: exp2_out_mps
2607
+ CPU, CUDA, MPS: exp2_out
2599
2608
  tags: pointwise
2600
2609
 
2601
2610
  - func: expm1(Tensor self) -> Tensor
@@ -2621,8 +2630,7 @@
2621
2630
  structured: True
2622
2631
  structured_inherits: TensorIteratorBase
2623
2632
  dispatch:
2624
- CPU, CUDA: expm1_out
2625
- MPS: expm1_out_mps
2633
+ CPU, CUDA, MPS: expm1_out
2626
2634
  SparseCPU, SparseCUDA: expm1_sparse_out
2627
2635
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_out
2628
2636
  tags: pointwise
@@ -2703,7 +2711,7 @@
2703
2711
  QuantizedCPU, QuantizedCUDA: fill_quantized_
2704
2712
  Meta: fill_meta_
2705
2713
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: fill_sparse_csr_
2706
- NestedTensorCPU, NestedTensorCUDA: fill_nested_
2714
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: fill_nested_
2707
2715
  autogen: fill.Scalar_out
2708
2716
 
2709
2717
  - func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!)
@@ -2714,7 +2722,7 @@
2714
2722
  MPS: fill_tensor_mps_
2715
2723
  QuantizedCPU, QuantizedCUDA: fill_quantized_
2716
2724
  Meta: fill_meta_
2717
- NestedTensorCPU, NestedTensorCUDA: fill_nested_
2725
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: fill_nested_
2718
2726
  autogen: fill.Tensor_out
2719
2727
 
2720
2728
  - func: floor(Tensor self) -> Tensor
@@ -2740,8 +2748,7 @@
2740
2748
  structured: True
2741
2749
  structured_inherits: TensorIteratorBase
2742
2750
  dispatch:
2743
- CPU, CUDA: floor_out
2744
- MPS: floor_out_mps
2751
+ CPU, CUDA, MPS: floor_out
2745
2752
  SparseCPU, SparseCUDA: floor_sparse_out
2746
2753
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_out
2747
2754
  tags: pointwise
@@ -2750,23 +2757,20 @@
2750
2757
  device_check: NoCheck # TensorIterator
2751
2758
  variants: function, method
2752
2759
  dispatch:
2753
- CPU, CUDA: floor_divide
2754
- MPS: floor_divide_mps
2760
+ CPU, CUDA, MPS: floor_divide
2755
2761
  SparseCPU, SparseCUDA: floor_divide_sparse
2756
2762
 
2757
2763
  - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
2758
2764
  device_check: NoCheck # TensorIterator
2759
2765
  variants: method
2760
2766
  dispatch:
2761
- CPU, CUDA: floor_divide_
2762
- MPS: floor_divide_mps_
2767
+ CPU, CUDA, MPS: floor_divide_
2763
2768
  SparseCPU, SparseCUDA: floor_divide_sparse_
2764
2769
 
2765
2770
  - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
2766
2771
  device_check: NoCheck # TensorIterator
2767
2772
  dispatch:
2768
- CPU, CUDA: floor_divide_out
2769
- MPS: floor_divide_out_mps
2773
+ CPU, CUDA, MPS: floor_divide_out
2770
2774
  SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim
2771
2775
 
2772
2776
  - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
@@ -3000,6 +3004,7 @@
3000
3004
  CPU: _fft_r2c_mkl
3001
3005
  CUDA: _fft_r2c_cufft
3002
3006
  MPS: _fft_r2c_mps
3007
+ tags: core
3003
3008
 
3004
3009
  - func: _fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!)
3005
3010
  variants: function
@@ -3100,6 +3105,7 @@
3100
3105
  - dim -> int dim
3101
3106
  dispatch:
3102
3107
  CPU, CUDA: index_copy_out
3108
+ MPS: index_copy_out_mps
3103
3109
 
3104
3110
  - func: index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!)
3105
3111
  variants: method
@@ -3170,7 +3176,7 @@
3170
3176
  variants: function
3171
3177
  structured: True
3172
3178
  dispatch:
3173
- CPU, CUDA: isin_Tensor_Scalar_out
3179
+ CPU, CUDA, MPS: isin_Tensor_Scalar_out
3174
3180
 
3175
3181
  - func: isin.Tensor_Scalar(Tensor elements, Scalar test_element, *, bool assume_unique=False, bool invert=False) -> Tensor
3176
3182
  variants: function
@@ -3181,6 +3187,7 @@
3181
3187
  structured: True
3182
3188
  dispatch:
3183
3189
  CPU, CUDA: isin_Scalar_Tensor_out
3190
+ MPS: isin_Scalar_Tensor_out_mps
3184
3191
 
3185
3192
  - func: isin.Scalar_Tensor(Scalar element, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor
3186
3193
  variants: function
@@ -3191,8 +3198,8 @@
3191
3198
  device_check: NoCheck
3192
3199
  device_guard: False
3193
3200
  dispatch:
3194
- CPU, CUDA, MPS: isnan
3195
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_isnan
3201
+ CPU, CUDA, MPS, MTIA: isnan
3202
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isnan
3196
3203
  SparseCPU, SparseCUDA: isnan_sparse
3197
3204
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isnan_sparse_csr
3198
3205
  autogen: isnan.out
@@ -3243,7 +3250,7 @@
3243
3250
  device_check: NoCheck
3244
3251
  device_guard: False
3245
3252
  dispatch:
3246
- NestedTensorCPU, NestedTensorCUDA: nested_is_same_size
3253
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_is_same_size
3247
3254
  CompositeExplicitAutograd: is_same_size
3248
3255
 
3249
3256
  - func: is_signed(Tensor self) -> bool
@@ -3265,20 +3272,20 @@
3265
3272
 
3266
3273
  - func: kron.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
3267
3274
 
3268
- - func: kthvalue(Tensor self, int k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
3275
+ - func: kthvalue(Tensor self, SymInt k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
3269
3276
  variants: function, method
3270
3277
  dispatch:
3271
3278
  CompositeExplicitAutograd: kthvalue
3272
3279
 
3273
- - func: kthvalue.values(Tensor self, int k, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
3280
+ - func: kthvalue.values(Tensor self, SymInt k, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
3274
3281
  dispatch:
3275
3282
  CPU: kthvalue_out_cpu
3276
3283
  CUDA: kthvalue_out_cuda
3277
3284
 
3278
- - func: kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
3285
+ - func: kthvalue.dimname(Tensor self, SymInt k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
3279
3286
  variants: function, method
3280
3287
 
3281
- - func: kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
3288
+ - func: kthvalue.dimname_out(Tensor self, SymInt k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
3282
3289
 
3283
3290
  - func: layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor
3284
3291
  dispatch:
@@ -3290,7 +3297,7 @@
3290
3297
  CUDA: layer_norm_cuda
3291
3298
  MPS: layer_norm_mps
3292
3299
  CompositeExplicitAutograd: math_native_layer_norm
3293
- NestedTensorCPU, NestedTensorCUDA: nested_layer_norm
3300
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_layer_norm
3294
3301
  autogen: native_layer_norm.out
3295
3302
  tags: core
3296
3303
 
@@ -3299,7 +3306,7 @@
3299
3306
  CPU: layer_norm_backward_cpu
3300
3307
  CUDA: layer_norm_backward_cuda
3301
3308
  MPS: layer_norm_backward_mps
3302
- NestedTensorCPU, NestedTensorCUDA: layer_norm_backward_nested
3309
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: layer_norm_backward_nested
3303
3310
  autogen: native_layer_norm_backward.out
3304
3311
  tags: core
3305
3312
 
@@ -3307,6 +3314,10 @@
3307
3314
  dispatch:
3308
3315
  CompositeImplicitAutograd: rms_norm_symint
3309
3316
 
3317
+ - func: _fused_rms_norm(Tensor input, int normalized_shape_ndim, Tensor weight, float eps) -> Tensor
3318
+ dispatch:
3319
+ MPS: _fused_rms_norm_mps
3320
+
3310
3321
  - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
3311
3322
  variants: function, method
3312
3323
  dispatch:
@@ -3323,7 +3334,7 @@
3323
3334
 
3324
3335
  - func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!)
3325
3336
  dispatch:
3326
- CPU, CUDA: nan_to_num_out
3337
+ CPU, CUDA, MTIA: nan_to_num_out
3327
3338
  MPS: nan_to_num_out_mps
3328
3339
  SparseCPU, SparseCUDA: nan_to_num_sparse_out
3329
3340
  tags: pointwise
@@ -3332,12 +3343,12 @@
3332
3343
  python_module: nn
3333
3344
  dispatch:
3334
3345
  CompositeImplicitAutograd: linear
3335
- NestedTensorCPU, NestedTensorCUDA: nested_linear
3346
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_linear
3336
3347
  MPS: _mps_linear
3337
3348
 
3338
3349
  - func: linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
3339
3350
  dispatch:
3340
- NestedTensorCPU, NestedTensorCUDA: nested_linear_backward
3351
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_linear_backward
3341
3352
  MPS: mps_linear_backward
3342
3353
  autogen: linear_backward.out
3343
3354
 
@@ -3371,7 +3382,7 @@
3371
3382
  dispatch:
3372
3383
  CUDA: _cslt_compress
3373
3384
 
3374
- - func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False, int alg_id=0, int split_k=1, bool split_k_one_kernel=True) -> Tensor
3385
+ - func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False, int alg_id=0, int split_k=1, int split_k_mode=-1) -> Tensor
3375
3386
  dispatch:
3376
3387
  CUDA: _cslt_sparse_mm
3377
3388
  tags: needs_fixed_stride_order
@@ -3496,8 +3507,7 @@
3496
3507
  structured: True
3497
3508
  structured_inherits: TensorIteratorBase
3498
3509
  dispatch:
3499
- CPU, CUDA: log_out
3500
- MPS: log_out_mps
3510
+ CPU, CUDA, MPS, MTIA: log_out
3501
3511
  tags: pointwise
3502
3512
 
3503
3513
  - func: log10(Tensor self) -> Tensor
@@ -3517,8 +3527,7 @@
3517
3527
  structured: True
3518
3528
  structured_inherits: TensorIteratorBase
3519
3529
  dispatch:
3520
- CPU, CUDA: log10_out
3521
- MPS: log10_out_mps
3530
+ CPU, CUDA, MPS: log10_out
3522
3531
  tags: pointwise
3523
3532
 
3524
3533
  - func: log1p(Tensor self) -> Tensor
@@ -3544,8 +3553,7 @@
3544
3553
  structured: True
3545
3554
  structured_inherits: TensorIteratorBase
3546
3555
  dispatch:
3547
- CPU, CUDA: log1p_out
3548
- MPS: log1p_out_mps
3556
+ CPU, CUDA, MPS: log1p_out
3549
3557
  SparseCPU, SparseCUDA: log1p_sparse_out
3550
3558
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_out
3551
3559
  tags: pointwise
@@ -3567,8 +3575,7 @@
3567
3575
  structured: True
3568
3576
  structured_inherits: TensorIteratorBase
3569
3577
  dispatch:
3570
- CPU, CUDA: log2_out
3571
- MPS: log2_out_mps
3578
+ CPU, CUDA, MPS, MTIA: log2_out
3572
3579
  tags: pointwise
3573
3580
 
3574
3581
  - func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -3715,6 +3722,7 @@
3715
3722
  dispatch:
3716
3723
  CPU: log_softmax_cpu_out
3717
3724
  CUDA: log_softmax_cuda_out
3725
+ MTIA: log_softmax_mtia_out
3718
3726
  MPS: log_softmax_mps_out
3719
3727
 
3720
3728
  - func: _log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor
@@ -3725,6 +3733,7 @@
3725
3733
  dispatch:
3726
3734
  CPU: log_softmax_backward_cpu_out
3727
3735
  CUDA: log_softmax_backward_cuda_out
3736
+ MTIA: log_softmax_backward_mtia_out
3728
3737
  MPS: log_softmax_backward_mps_out
3729
3738
 
3730
3739
  - func: _logcumsumexp(Tensor self, int dim) -> Tensor
@@ -3776,17 +3785,17 @@
3776
3785
  variants: function, method
3777
3786
  dispatch:
3778
3787
  CompositeImplicitAutograd: matmul
3779
- NestedTensorCPU, NestedTensorCUDA: matmul_nested
3788
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: matmul_nested
3780
3789
 
3781
3790
  - func: matmul_backward(Tensor grad, Tensor self, Tensor other, bool[2] mask) -> (Tensor, Tensor)
3782
3791
  dispatch:
3783
- NestedTensorCPU, NestedTensorCUDA: matmul_backward_nested
3792
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: matmul_backward_nested
3784
3793
  autogen: matmul_backward.out
3785
3794
 
3786
3795
  - func: matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
3787
3796
  dispatch:
3788
3797
  CompositeImplicitAutograd: matmul_out
3789
- NestedTensorCPU, NestedTensorCUDA: matmul_out_nested
3798
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: matmul_out_nested
3790
3799
 
3791
3800
  # Alias to linalg.matrix_power
3792
3801
  - func: matrix_power(Tensor self, int n) -> Tensor
@@ -3848,7 +3857,7 @@
3848
3857
  precomputed:
3849
3858
  - dim -> int dim
3850
3859
  dispatch:
3851
- CPU, CUDA: max_out
3860
+ CPU, CUDA, MTIA: max_out
3852
3861
  MPS: max_out_mps
3853
3862
 
3854
3863
  - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
@@ -3864,6 +3873,7 @@
3864
3873
  device_guard: False
3865
3874
  dispatch:
3866
3875
  CompositeImplicitAutograd: value_selecting_reduction_backward_symint
3876
+ NestedTensorCPU, NestedTensorCUDA: value_selecting_reduction_backward_nested_symint
3867
3877
 
3868
3878
  - func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
3869
3879
  variants: function, method
@@ -4003,6 +4013,7 @@
4003
4013
  dispatch:
4004
4014
  CPU: nanmedian_cpu
4005
4015
  CUDA: nanmedian_cuda
4016
+ MPS: nanmedian_mps
4006
4017
  autogen: nanmedian.out
4007
4018
 
4008
4019
  - func: nanmedian.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
@@ -4014,6 +4025,7 @@
4014
4025
  dispatch:
4015
4026
  CPU: nanmedian_out_cpu
4016
4027
  CUDA: nanmedian_out_cuda
4028
+ MPS: nanmedian_out_mps
4017
4029
 
4018
4030
  - func: nanmedian.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
4019
4031
  variants: function, method
@@ -4034,7 +4046,7 @@
4034
4046
  precomputed:
4035
4047
  - dim -> int dim
4036
4048
  dispatch:
4037
- CPU, CUDA: min_out
4049
+ CPU, CUDA, MTIA: min_out
4038
4050
  MPS: min_out_mps
4039
4051
 
4040
4052
  - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
@@ -4142,11 +4154,20 @@
4142
4154
  dispatch:
4143
4155
  CPU: mm_out_cpu
4144
4156
  CUDA: mm_out_cuda
4157
+ MTIA: mm_out_mtia
4145
4158
  MPS: mm_out_mps
4146
4159
  XPU: mm_out_xpu
4147
4160
  SparseCPU, SparseCUDA: _sparse_mm_out
4148
4161
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm_out
4149
4162
 
4163
+ - func: mm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor
4164
+ dispatch:
4165
+ CUDA: _mm_dtype_cuda
4166
+
4167
+ - func: mm.dtype_out(Tensor self, Tensor mat2, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!)
4168
+ dispatch:
4169
+ CUDA: _mm_dtype_out_cuda
4170
+
4150
4171
  - func: _int_mm(Tensor self, Tensor mat2) -> Tensor
4151
4172
  dispatch:
4152
4173
  CPU: _int_mm_cpu
@@ -4167,6 +4188,10 @@
4167
4188
  MPS: _weight_int4pack_mm_mps
4168
4189
  CUDA: _weight_int4pack_mm_cuda
4169
4190
 
4191
+ - func: _weight_int4pack_mm_with_scales_and_zeros(Tensor self, Tensor mat2, int qGroupSize, Tensor qScale, Tensor qZeros) -> Tensor
4192
+ dispatch:
4193
+ XPU: _weight_int4pack_mm_xpu
4194
+
4170
4195
  # Split int4 pack weight between cpu and other devices due to
4171
4196
  # https://github.com/pytorch/ao/issues/1117#issuecomment-2451252756.
4172
4197
  - func: _convert_weight_to_int4pack_for_cpu(Tensor self, int innerKTiles) -> Tensor
@@ -4177,6 +4202,14 @@
4177
4202
  dispatch:
4178
4203
  CPU: _weight_int4pack_mm_cpu
4179
4204
 
4205
+ - func: _dyn_quant_pack_4bit_weight(Tensor weights, Tensor scales_zeros, Tensor? bias, int block_size, int in_features, int out_features) -> Tensor
4206
+ dispatch:
4207
+ CPU: _dyn_quant_pack_4bit_weight_cpu
4208
+
4209
+ - func: _dyn_quant_matmul_4bit(Tensor inp, Tensor packed_weights, int block_size, int in_features, int out_features) -> Tensor
4210
+ dispatch:
4211
+ CPU: _dyn_quant_matmul_4bit_cpu
4212
+
4180
4213
  - func: _weight_int8pack_mm(Tensor self, Tensor mat2, Tensor scales) -> Tensor
4181
4214
  dispatch:
4182
4215
  CPU: _weight_int8pack_mm_cpu
@@ -4217,7 +4250,7 @@
4217
4250
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr
4218
4251
  MkldnnCPU: mkldnn_mul
4219
4252
  ZeroTensor: mul_zerotensor
4220
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor
4253
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul_Tensor
4221
4254
  tags: [core, pointwise]
4222
4255
 
4223
4256
  - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@@ -4228,7 +4261,7 @@
4228
4261
  SparseCPU, SparseCUDA: mul_sparse_
4229
4262
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr_
4230
4263
  MkldnnCPU: mkldnn_mul_
4231
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Tensor
4264
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul__Tensor
4232
4265
  tags: pointwise
4233
4266
 
4234
4267
  - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -4236,8 +4269,7 @@
4236
4269
  structured: True
4237
4270
  structured_inherits: TensorIteratorBase
4238
4271
  dispatch:
4239
- CPU, CUDA: mul_out
4240
- MPS: mul_out_mps
4272
+ CPU, CUDA, MPS: mul_out
4241
4273
  SparseCPU: mul_out_sparse_cpu
4242
4274
  SparseCUDA: mul_out_sparse_cuda
4243
4275
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_out_sparse_csr
@@ -4251,7 +4283,7 @@
4251
4283
  dispatch:
4252
4284
  CompositeExplicitAutograd: mul
4253
4285
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_scalar_sparse_csr
4254
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Scalar
4286
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul_Scalar
4255
4287
  tags: [core, pointwise]
4256
4288
 
4257
4289
  - func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
@@ -4260,7 +4292,7 @@
4260
4292
  dispatch:
4261
4293
  CompositeExplicitAutograd: mul_
4262
4294
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul__scalar_sparse_csr
4263
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Scalar
4295
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul__Scalar
4264
4296
  autogen: mul.Scalar_out
4265
4297
  tags: pointwise
4266
4298
  # multiply, alias for mul
@@ -4326,7 +4358,7 @@
4326
4358
  device_guard: False
4327
4359
  dispatch:
4328
4360
  CompositeImplicitAutograd: narrow_symint
4329
- NestedTensorCPU, NestedTensorCUDA: narrow_nested_symint
4361
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: narrow_nested_symint
4330
4362
 
4331
4363
  - func: narrow.Tensor(Tensor(a) self, int dim, Tensor start, SymInt length) -> Tensor(a)
4332
4364
  variants: function, method
@@ -4465,7 +4497,7 @@
4465
4497
  # NB: Although this composite mutates on the inside, it is
4466
4498
  # non-differentiable so NonFunctional doesn't apply
4467
4499
  CompositeExplicitAutograd: ones_like
4468
- NestedTensorCPU, NestedTensorCUDA: ones_like
4500
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: ones_like
4469
4501
  autogen: ones_like.out
4470
4502
 
4471
4503
  - func: pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor
@@ -4747,6 +4779,14 @@
4747
4779
  CompositeExplicitAutograd: randint_like
4748
4780
  autogen: randint_like.out
4749
4781
 
4782
+ - func: randint_like.Tensor(Tensor self, Tensor high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
4783
+ tags: nondeterministic_seeded
4784
+ dispatch:
4785
+ # NB: Although this composite mutates on the inside, it is
4786
+ # non-differentiable so NonFunctional doesn't apply
4787
+ CompositeExplicitAutograd: randint_like
4788
+ autogen: randint_like.Tensor_out
4789
+
4750
4790
  - func: randint_like.low_dtype(Tensor self, SymInt low, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
4751
4791
  tags: nondeterministic_seeded
4752
4792
  dispatch:
@@ -4856,7 +4896,7 @@
4856
4896
  structured: True
4857
4897
  structured_inherits: TensorIteratorBase
4858
4898
  dispatch:
4859
- CPU, CUDA: reciprocal_out
4899
+ CPU, CUDA, MTIA: reciprocal_out
4860
4900
  MPS: reciprocal_out_mps
4861
4901
  tags: pointwise
4862
4902
 
@@ -4867,7 +4907,7 @@
4867
4907
  dispatch:
4868
4908
  SparseCPU, SparseCUDA: neg_sparse
4869
4909
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr
4870
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg
4910
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg
4871
4911
  tags: [core, pointwise]
4872
4912
 
4873
4913
  - func: neg_(Tensor(a!) self) -> Tensor(a!)
@@ -4877,7 +4917,7 @@
4877
4917
  dispatch:
4878
4918
  SparseCPU, SparseCUDA: neg_sparse_
4879
4919
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_
4880
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg_
4920
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg_
4881
4921
  tags: pointwise
4882
4922
 
4883
4923
  - func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -4885,8 +4925,7 @@
4885
4925
  structured: True
4886
4926
  structured_inherits: TensorIteratorBase
4887
4927
  dispatch:
4888
- CPU, CUDA: neg_out
4889
- MPS: neg_out_mps
4928
+ CPU, CUDA, MPS, MTIA: neg_out
4890
4929
  SparseCPU, SparseCUDA: neg_out_sparse
4891
4930
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_out
4892
4931
  tags: pointwise
@@ -4948,7 +4987,7 @@
4948
4987
  device_check: NoCheck
4949
4988
  device_guard: False
4950
4989
  dispatch:
4951
- CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor, MPS: _reshape_alias
4990
+ CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor, MPS, MTIA: _reshape_alias
4952
4991
  # We don't need to support mkldnn since this is handled explicitly by the reshape operator.
4953
4992
 
4954
4993
  - func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor
@@ -4989,9 +5028,7 @@
4989
5028
  structured: True
4990
5029
  structured_inherits: TensorIteratorBase
4991
5030
  dispatch:
4992
- CPU: round_out
4993
- CUDA: round_out
4994
- MPS: round_out_mps
5031
+ CPU, CUDA, MPS: round_out
4995
5032
  SparseCPU, SparseCUDA: round_sparse_out
4996
5033
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_out
4997
5034
  tags: pointwise
@@ -5013,8 +5050,7 @@
5013
5050
  structured: True
5014
5051
  structured_inherits: TensorIteratorBase
5015
5052
  dispatch:
5016
- CPU: round_decimals_out
5017
- CUDA: round_decimals_out
5053
+ CPU, CUDA, MPS: round_decimals_out
5018
5054
  tags: pointwise
5019
5055
 
5020
5056
  - func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
@@ -5029,12 +5065,12 @@
5029
5065
  device_check: NoCheck # TensorIterator
5030
5066
  variants: function, method
5031
5067
  dispatch:
5032
- CPU, CUDA: relu
5068
+ CPU, CUDA, MTIA: relu
5033
5069
  MPS: relu_mps
5034
5070
  MkldnnCPU: mkldnn_relu
5035
5071
  QuantizedCPU: relu_quantized_cpu
5036
5072
  QuantizedCUDA: relu_quantized_cuda
5037
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu
5073
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu
5038
5074
  SparseCPU, SparseCUDA: relu_sparse
5039
5075
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr
5040
5076
  tags: [core, pointwise]
@@ -5043,12 +5079,12 @@
5043
5079
  device_check: NoCheck # TensorIterator
5044
5080
  variants: function, method
5045
5081
  dispatch:
5046
- CPU, CUDA: relu_
5082
+ CPU, CUDA, MTIA: relu_
5047
5083
  MPS: relu_mps_
5048
5084
  MkldnnCPU: mkldnn_relu_
5049
5085
  QuantizedCPU: relu_quantized_cpu_
5050
5086
  QuantizedCUDA: relu_quantized_cuda_
5051
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu_
5087
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu_
5052
5088
  SparseCPU, SparseCUDA: relu_sparse_
5053
5089
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr_
5054
5090
  autogen: relu.out
@@ -5094,7 +5130,7 @@
5094
5130
  python_module: nn
5095
5131
  dispatch:
5096
5132
  QuantizedCPU: gelu_quantized_cpu_
5097
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu_
5133
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_gelu_
5098
5134
 
5099
5135
  - func: gelu(Tensor self, *, str approximate='none') -> Tensor
5100
5136
  structured_delegate: gelu.out
@@ -5104,7 +5140,7 @@
5104
5140
  MkldnnCPU: mkldnn_gelu
5105
5141
  QuantizedCPU: gelu_quantized_cpu
5106
5142
  QuantizedCUDA: gelu_quantized_cuda
5107
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu
5143
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_gelu
5108
5144
  tags: [core, pointwise]
5109
5145
 
5110
5146
  - func: gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!)
@@ -5121,7 +5157,7 @@
5121
5157
  python_module: nn
5122
5158
  dispatch:
5123
5159
  MkldnnCPU: mkldnn_gelu_backward
5124
- NestedTensorCPU, NestedTensorCUDA: gelu_backwards_nested
5160
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: gelu_backwards_nested
5125
5161
  tags: pointwise
5126
5162
 
5127
5163
  - func: infinitely_differentiable_gelu_backward(Tensor grad, Tensor self) -> Tensor
@@ -5135,7 +5171,7 @@
5135
5171
  structured_inherits: TensorIteratorBase
5136
5172
  device_check: NoCheck # TensorIterator
5137
5173
  dispatch:
5138
- CPU, CUDA: hardshrink_out
5174
+ CPU, CUDA, MPS: hardshrink_out
5139
5175
 
5140
5176
  - func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor
5141
5177
  structured_delegate: hardshrink.out
@@ -5147,7 +5183,7 @@
5147
5183
  structured: True
5148
5184
  structured_inherits: TensorIteratorBase
5149
5185
  dispatch:
5150
- CPU, CUDA: hardshrink_backward_out
5186
+ CPU, CUDA, MPS: hardshrink_backward_out
5151
5187
 
5152
5188
  - func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor
5153
5189
  structured_delegate: hardshrink_backward.grad_input
@@ -5170,8 +5206,7 @@
5170
5206
  structured: True
5171
5207
  structured_inherits: TensorIteratorBase
5172
5208
  dispatch:
5173
- CPU, CUDA: rsqrt_out
5174
- MPS: rsqrt_out_mps
5209
+ CPU, CUDA, MPS, MTIA: rsqrt_out
5175
5210
  tags: pointwise
5176
5211
 
5177
5212
  - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a)
@@ -5186,7 +5221,7 @@
5186
5221
  dispatch:
5187
5222
  CompositeExplicitAutograd: select_symint
5188
5223
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: select_sparse_csr
5189
- NestedTensorCPU, NestedTensorCUDA: select_nested
5224
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: select_nested
5190
5225
  tags: core
5191
5226
 
5192
5227
  - func: select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index) -> Tensor
@@ -5202,7 +5237,7 @@
5202
5237
  device_check: NoCheck
5203
5238
  device_guard: False
5204
5239
  dispatch:
5205
- NestedTensorCPU, NestedTensorCUDA: _nested_select_backward_symint
5240
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _nested_select_backward_symint
5206
5241
 
5207
5242
  - func: selu(Tensor self) -> Tensor
5208
5243
  device_check: NoCheck # TensorIterator
@@ -5227,14 +5262,14 @@
5227
5262
  structured_delegate: silu.out
5228
5263
  python_module: nn
5229
5264
  dispatch:
5230
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu
5265
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_silu
5231
5266
  tags: pointwise
5232
5267
 
5233
5268
  - func: silu_(Tensor(a!) self) -> Tensor(a!)
5234
5269
  structured_delegate: silu.out
5235
5270
  python_module: nn
5236
5271
  dispatch:
5237
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu_
5272
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_silu_
5238
5273
  tags: pointwise
5239
5274
 
5240
5275
  - func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -5242,7 +5277,7 @@
5242
5277
  structured_inherits: TensorIteratorBase
5243
5278
  python_module: nn
5244
5279
  dispatch:
5245
- CPU, CUDA: silu_out
5280
+ CPU, CUDA, MTIA: silu_out
5246
5281
  MPS: silu_out_mps
5247
5282
  tags: pointwise
5248
5283
 
@@ -5260,7 +5295,7 @@
5260
5295
  python_module: nn
5261
5296
  dispatch:
5262
5297
  CompositeImplicitAutograd: math_silu_backward
5263
- NestedTensorCPU, NestedTensorCUDA: silu_backward_nested
5298
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: silu_backward_nested
5264
5299
  tags: pointwise
5265
5300
 
5266
5301
  - func: mish(Tensor self) -> Tensor
@@ -5309,14 +5344,13 @@
5309
5344
  structured: True
5310
5345
  structured_inherits: TensorIteratorBase
5311
5346
  dispatch:
5312
- CPU, CUDA: sigmoid_out
5313
- MPS: sigmoid_out_mps
5347
+ CPU, CUDA, MPS: sigmoid_out
5314
5348
  tags: pointwise
5315
5349
 
5316
5350
  - func: logit(Tensor self, float? eps=None) -> Tensor
5317
5351
  variants: function, method
5318
5352
  dispatch:
5319
- CPU, CUDA: logit
5353
+ CPU, CUDA, MTIA: logit
5320
5354
  MPS: logit_mps
5321
5355
  tags: pointwise
5322
5356
 
@@ -5339,7 +5373,7 @@
5339
5373
  dispatch:
5340
5374
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr
5341
5375
  SparseCPU, SparseCUDA: sin_sparse
5342
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_sin
5376
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sin
5343
5377
  tags: [core, pointwise]
5344
5378
 
5345
5379
  - func: sin_(Tensor(a!) self) -> Tensor(a!)
@@ -5356,8 +5390,7 @@
5356
5390
  structured: True
5357
5391
  structured_inherits: TensorIteratorBase
5358
5392
  dispatch:
5359
- CPU, CUDA: sin_out
5360
- MPS: sin_out_mps
5393
+ CPU, CUDA, MPS, MTIA: sin_out
5361
5394
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_out
5362
5395
  SparseCPU, SparseCUDA: sin_sparse_out
5363
5396
  tags: pointwise
@@ -5376,7 +5409,7 @@
5376
5409
  structured: True
5377
5410
  structured_inherits: TensorIteratorBase
5378
5411
  dispatch:
5379
- CPU, CUDA: sinc_out
5412
+ CPU, CUDA, MPS: sinc_out
5380
5413
  tags: pointwise
5381
5414
 
5382
5415
  - func: sinh(Tensor self) -> Tensor
@@ -5402,8 +5435,7 @@
5402
5435
  structured: True
5403
5436
  structured_inherits: TensorIteratorBase
5404
5437
  dispatch:
5405
- CPU, CUDA: sinh_out
5406
- MPS: sinh_out_mps
5438
+ CPU, CUDA, MPS: sinh_out
5407
5439
  SparseCPU, SparseCUDA: sinh_sparse_out
5408
5440
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_out
5409
5441
 
@@ -5423,7 +5455,7 @@
5423
5455
  variants: function, method
5424
5456
  dispatch:
5425
5457
  CompositeExplicitAutograd: detach
5426
- NestedTensorCPU, NestedTensorCUDA: detach
5458
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: detach
5427
5459
 
5428
5460
  # Like `detach()`, but modifies this `Variable` in-place. This method may
5429
5461
  # only be called on non-view `Variable`s. You can use `is_view()` to check
@@ -5553,7 +5585,7 @@
5553
5585
  structured_delegate: _softmax.out
5554
5586
  dispatch:
5555
5587
  MkldnnCPU: mkldnn_softmax
5556
- NestedTensorCPU, NestedTensorCUDA: softmax_nested
5588
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: softmax_nested
5557
5589
  tags: core
5558
5590
 
5559
5591
  - func: _softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
@@ -5566,7 +5598,7 @@
5566
5598
  - func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor
5567
5599
  structured_delegate: _softmax_backward_data.out
5568
5600
  dispatch:
5569
- NestedTensorCPU, NestedTensorCUDA: nested_softmax_backward
5601
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: nested_softmax_backward
5570
5602
 
5571
5603
  - func: _softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) grad_input) -> Tensor(a!)
5572
5604
  structured: True
@@ -5610,7 +5642,7 @@
5610
5642
  device_guard: False
5611
5643
  dispatch:
5612
5644
  CompositeExplicitAutograd: split_with_sizes
5613
- NestedTensorCPU, NestedTensorCUDA: split_with_sizes_nested
5645
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: split_with_sizes_nested
5614
5646
  tags: core
5615
5647
 
5616
5648
  - func: hsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]
@@ -5638,7 +5670,7 @@
5638
5670
  dispatch:
5639
5671
  CompositeExplicitAutograd: squeeze
5640
5672
  QuantizedCPU, QuantizedCUDA: squeeze_quantized
5641
- NestedTensorCPU, NestedTensorCUDA: squeeze_nested
5673
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: squeeze_nested
5642
5674
 
5643
5675
  - func: squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)
5644
5676
  variants: function, method
@@ -5647,7 +5679,7 @@
5647
5679
  dispatch:
5648
5680
  CompositeExplicitAutograd: squeeze
5649
5681
  QuantizedCPU, QuantizedCUDA: squeeze_quantized
5650
- NestedTensorCPU, NestedTensorCUDA: squeeze_dim_nested
5682
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: squeeze_dim_nested
5651
5683
  tags: core
5652
5684
 
5653
5685
  - func: squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a)
@@ -5663,7 +5695,7 @@
5663
5695
  dispatch:
5664
5696
  CompositeExplicitAutograd: squeeze
5665
5697
  QuantizedCPU, QuantizedCUDA: squeeze_quantized
5666
- NestedTensorCPU, NestedTensorCUDA: squeeze_dim_nested
5698
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: squeeze_dim_nested
5667
5699
  tags: core
5668
5700
 
5669
5701
  - func: squeeze_(Tensor(a!) self) -> Tensor(a!)
@@ -5747,11 +5779,11 @@
5747
5779
  - func: dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
5748
5780
 
5749
5781
  # Overload without center & pad mode, needed for forward-compatibility
5750
- - func: stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
5782
+ - func: stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool? onesided=None, bool? return_complex=None, bool? align_to_window=None) -> Tensor
5751
5783
  variants: function, method
5752
5784
  cpp_no_default_args: ['hop_length', 'win_length', 'window', 'normalized']
5753
5785
 
5754
- - func: stft.center(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, str pad_mode="reflect", bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
5786
+ - func: stft.center(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, str pad_mode="reflect", bool normalized=False, bool? onesided=None, bool? return_complex=None, bool? align_to_window=None) -> Tensor
5755
5787
  variants: function, method
5756
5788
 
5757
5789
  - func: istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool? onesided=None, int? length=None, bool return_complex=False) -> Tensor
@@ -5837,7 +5869,7 @@
5837
5869
  structured_delegate: sqrt.out
5838
5870
  variants: function, method
5839
5871
  dispatch:
5840
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_sqrt
5872
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sqrt
5841
5873
  SparseCPU, SparseCUDA: sqrt_sparse
5842
5874
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr
5843
5875
  tags: [core, pointwise]
@@ -5856,8 +5888,7 @@
5856
5888
  structured: True
5857
5889
  structured_inherits: TensorIteratorBase
5858
5890
  dispatch:
5859
- CPU, CUDA: sqrt_out
5860
- MPS: sqrt_out_mps
5891
+ CPU, CUDA, MPS, MTIA: sqrt_out
5861
5892
  SparseCPU, SparseCUDA: sqrt_sparse_out
5862
5893
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_out
5863
5894
  tags: pointwise
@@ -6014,8 +6045,7 @@
6014
6045
  structured: True
6015
6046
  structured_inherits: TensorIteratorBase
6016
6047
  dispatch:
6017
- CPU, CUDA: tan_out
6018
- MPS: tan_out_mps
6048
+ CPU, CUDA, MPS: tan_out
6019
6049
  SparseCPU, SparseCUDA: tan_sparse_out
6020
6050
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_out
6021
6051
  tags: pointwise
@@ -6029,7 +6059,7 @@
6029
6059
  MkldnnCPU: mkldnn_tanh
6030
6060
  SparseCPU, SparseCUDA: tanh_sparse
6031
6061
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr
6032
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh
6062
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh
6033
6063
  tags: [core, pointwise]
6034
6064
 
6035
6065
  - func: tanh_(Tensor(a!) self) -> Tensor(a!)
@@ -6040,7 +6070,7 @@
6040
6070
  MkldnnCPU: mkldnn_tanh_
6041
6071
  SparseCPU, SparseCUDA: tanh_sparse_
6042
6072
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_
6043
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh_
6073
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh_
6044
6074
  tags: pointwise
6045
6075
 
6046
6076
  - func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -6048,8 +6078,7 @@
6048
6078
  structured: True
6049
6079
  structured_inherits: TensorIteratorBase
6050
6080
  dispatch:
6051
- CPU, CUDA: tanh_out
6052
- MPS: tanh_out_mps
6081
+ CPU, CUDA, MPS, MTIA: tanh_out
6053
6082
  SparseCPU, SparseCUDA: tanh_sparse_out
6054
6083
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_out
6055
6084
  tags: pointwise
@@ -6098,7 +6127,7 @@
6098
6127
  MkldnnCPU: mkldnn_relu_backward
6099
6128
  SparseCPU, SparseCUDA: threshold_backward_sparse
6100
6129
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: threshold_backward_sparse_compressed
6101
- NestedTensorCPU, NestedTensorCUDA: threshold_backwards_nested
6130
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: threshold_backwards_nested
6102
6131
  tags: pointwise
6103
6132
 
6104
6133
  - func: tile(Tensor self, SymInt[] dims) -> Tensor
@@ -6112,7 +6141,7 @@
6112
6141
  device_guard: False
6113
6142
  dispatch:
6114
6143
  CompositeExplicitAutograd: transpose
6115
- NestedTensorCPU, NestedTensorCUDA: transpose_nested
6144
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: transpose_nested
6116
6145
 
6117
6146
  - func: transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a)
6118
6147
  variants: function, method
@@ -6209,13 +6238,13 @@
6209
6238
  - func: _nested_tensor_size(Tensor self) -> Tensor
6210
6239
  variants: method
6211
6240
  dispatch:
6212
- NestedTensorCPU, NestedTensorCUDA: _nested_tensor_size
6241
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _nested_tensor_size
6213
6242
  autogen: _nested_tensor_size.out
6214
6243
 
6215
6244
  - func: _nested_tensor_strides(Tensor self) -> Tensor
6216
6245
  variants: method
6217
6246
  dispatch:
6218
- NestedTensorCPU, NestedTensorCUDA: _nested_tensor_strides
6247
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _nested_tensor_strides
6219
6248
  autogen: _nested_tensor_strides.out
6220
6249
 
6221
6250
  - func: _nested_tensor_storage_offsets(Tensor self) -> Tensor
@@ -6228,7 +6257,7 @@
6228
6257
  # _nested_from_padded_and_nested_example is available for testing.
6229
6258
  - func: _nested_from_padded_and_nested_example(Tensor padded, Tensor nt_example) -> Tensor
6230
6259
  dispatch:
6231
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example
6260
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example
6232
6261
  autogen: _nested_from_padded_and_nested_example.out
6233
6262
 
6234
6263
  # The input arguments' types to this functions are temporary. When nested tensors switch to using SymInts for their metadata representation
@@ -6340,8 +6369,7 @@
6340
6369
  structured_inherits: TensorIteratorBase
6341
6370
  device_check: NoCheck # TensorIterator
6342
6371
  dispatch:
6343
- CPU, CUDA: trunc_out
6344
- MPS: trunc_out_mps
6372
+ CPU, CUDA, MPS: trunc_out
6345
6373
  SparseCPU, SparseCUDA: trunc_sparse_out
6346
6374
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_out
6347
6375
  tags: pointwise
@@ -6420,7 +6448,7 @@
6420
6448
  CompositeExplicitAutograd: unsqueeze
6421
6449
  SparseCPU, SparseCUDA: unsqueeze_sparse
6422
6450
  QuantizedCPU, QuantizedCUDA: unsqueeze_quantized
6423
- NestedTensorCPU, NestedTensorCUDA: unsqueeze_nested
6451
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: unsqueeze_nested
6424
6452
  tags: core
6425
6453
 
6426
6454
  - func: unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!)
@@ -6514,15 +6542,15 @@
6514
6542
  device_check: NoCheck # TensorIterator
6515
6543
  variants: function, method
6516
6544
  dispatch:
6517
- CPU, CUDA, MPS: where
6518
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_where
6545
+ CPU, CUDA, MPS, MTIA: where
6546
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_where
6519
6547
  tags: [core, pointwise]
6520
6548
 
6521
6549
  - func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
6522
6550
  device_check: NoCheck # TensorIterator
6523
6551
  dispatch:
6524
- CPU, CUDA, MPS: where_self_out
6525
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_where_out
6552
+ CPU, CUDA, MPS, MTIA: where_self_out
6553
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_where_out
6526
6554
 
6527
6555
  - func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor
6528
6556
  variants: function
@@ -6857,7 +6885,7 @@
6857
6885
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: clone_sparse_compressed
6858
6886
  MkldnnCPU: mkldnn_clone
6859
6887
  QuantizedCPU, QuantizedCUDA: quantized_clone
6860
- NestedTensorCPU, NestedTensorCUDA: clone_nested
6888
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: clone_nested
6861
6889
  autogen: clone.out
6862
6890
  tags: [core, pointwise]
6863
6891
 
@@ -6891,7 +6919,7 @@
6891
6919
  SparseCPU, SparseCUDA, SparseMeta: zero_sparse_
6892
6920
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: zero_sparse_csr_
6893
6921
  MkldnnCPU: mkldnn_zero_
6894
- NestedTensorCPU, NestedTensorCUDA: zero_nested_
6922
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: zero_nested_
6895
6923
  autogen: zero, zero.out
6896
6924
 
6897
6925
  - func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
@@ -6911,7 +6939,7 @@
6911
6939
  dispatch:
6912
6940
  SparseCPU, SparseCUDA: sub_sparse
6913
6941
  ZeroTensor: sub_zerotensor
6914
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_sub_Tensor
6942
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sub_Tensor
6915
6943
  tags: [core, pointwise]
6916
6944
 
6917
6945
  - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
@@ -6958,7 +6986,7 @@
6958
6986
  device_check: NoCheck # TensorIterator
6959
6987
  variants: function
6960
6988
  dispatch:
6961
- CPU, CUDA: rsub
6989
+ CPU, CUDA, MPS: rsub
6962
6990
  autogen: rsub.Tensor_out
6963
6991
 
6964
6992
  - func: heaviside.out(Tensor self, Tensor values, *, Tensor(a!) out) -> Tensor(a!)
@@ -7040,6 +7068,14 @@
7040
7068
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: addmm_sparse_compressed_dense
7041
7069
  tags: core
7042
7070
 
7071
+ - func: addmm.dtype(Tensor self, Tensor mat1, Tensor mat2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1) -> Tensor
7072
+ dispatch:
7073
+ CUDA: _addmm_dtype_cuda
7074
+
7075
+ - func: addmm.dtype_out(Tensor self, Tensor mat1, Tensor mat2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
7076
+ dispatch:
7077
+ CUDA: _addmm_dtype_out_cuda
7078
+
7043
7079
  - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
7044
7080
  structured_delegate: addmm.out
7045
7081
  variants: method
@@ -7063,13 +7099,26 @@
7063
7099
  - func: _scaled_mm(Tensor self, Tensor mat2, Tensor scale_a, Tensor scale_b, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None, bool use_fast_accum=False) -> Tensor
7064
7100
  variants: function
7065
7101
  dispatch:
7102
+ CPU: _scaled_mm_cpu
7066
7103
  CUDA: _scaled_mm_cuda
7067
7104
 
7068
7105
  - func: _scaled_mm.out(Tensor self, Tensor mat2, Tensor scale_a, Tensor scale_b, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None, bool use_fast_accum=False, *, Tensor(a!) out) -> Tensor(a!)
7069
7106
  variants: function
7070
7107
  dispatch:
7108
+ CPU: _scaled_mm_out_cpu
7071
7109
  CUDA: _scaled_mm_out_cuda
7072
7110
 
7111
+
7112
+ - func: _scaled_grouped_mm(Tensor self, Tensor mat2, Tensor scale_a, Tensor scale_b, Tensor? offs=None, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None, bool use_fast_accum=False) -> Tensor
7113
+ variants: function
7114
+ dispatch:
7115
+ CUDA: _scaled_grouped_mm_cuda
7116
+
7117
+ - func: _grouped_mm(Tensor self, Tensor mat2, Tensor? offs=None, Tensor? bias=None, ScalarType? out_dtype=None) -> Tensor
7118
+ variants: function
7119
+ dispatch:
7120
+ CUDA: _grouped_mm_cuda
7121
+
7073
7122
  # NOTE [ Sparse: autograd and API ]
7074
7123
  #
7075
7124
  #
@@ -7224,13 +7273,13 @@
7224
7273
  dispatch:
7225
7274
  CompositeImplicitAutograd: _sparse_coo_tensor_unsafe_symint
7226
7275
 
7227
- - func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size, bool? is_coalesced=None) -> ()
7276
+ - func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size, bool? is_coalesced=None, bool? check_pinning=None) -> ()
7228
7277
 
7229
- - func: _validate_sparse_compressed_tensor_args(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, Layout layout) -> ()
7230
- - func: _validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()
7231
- - func: _validate_sparse_csc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> ()
7232
- - func: _validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()
7233
- - func: _validate_sparse_bsc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> ()
7278
+ - func: _validate_sparse_compressed_tensor_args(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, Layout layout, bool? check_pinning=None) -> ()
7279
+ - func: _validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, bool? check_pinning=None) -> ()
7280
+ - func: _validate_sparse_csc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, bool? check_pinning=None) -> ()
7281
+ - func: _validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, bool? check_pinning=None) -> ()
7282
+ - func: _validate_sparse_bsc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, bool? check_pinning=None) -> ()
7234
7283
 
7235
7284
  - func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
7236
7285
  dispatch:
@@ -7388,7 +7437,7 @@
7388
7437
  dispatch:
7389
7438
  SparseCPU, SparseCUDA, SparseMeta: values_sparse
7390
7439
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: values_sparse_csr
7391
- NestedTensorCPU, NestedTensorCUDA: values_nested
7440
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: values_nested
7392
7441
  CompositeExplicitAutograd: values_default
7393
7442
  device_check: NoCheck
7394
7443
  device_guard: False
@@ -7447,7 +7496,7 @@
7447
7496
  variants: function, method
7448
7497
  dispatch:
7449
7498
  CompositeExplicitAutograd: unbind
7450
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_unbind
7499
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_unbind
7451
7500
 
7452
7501
  - func: unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[]
7453
7502
  variants: function, method
@@ -7735,7 +7784,7 @@
7735
7784
  device_guard: False
7736
7785
  dispatch:
7737
7786
  CompositeExplicitAutograd: _to_copy
7738
- NestedTensorCPU, NestedTensorCUDA: _to_copy_nested
7787
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _to_copy_nested
7739
7788
  autogen: _to_copy.out
7740
7789
  tags: core
7741
7790
 
@@ -8021,7 +8070,7 @@
8021
8070
  variants: function, method
8022
8071
  dispatch:
8023
8072
  CompositeExplicitAutograd: masked_fill
8024
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_masked_fill
8073
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_masked_fill
8025
8074
  tags: pointwise
8026
8075
 
8027
8076
  - func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!)
@@ -8076,9 +8125,9 @@
8076
8125
  device_check: NoCheck
8077
8126
  device_guard: False
8078
8127
  dispatch:
8079
- ZeroTensor, Meta, CPU, CUDA, QuantizedCPU, QuantizedCUDA, MPS: view
8128
+ ZeroTensor, Meta, CPU, CUDA, QuantizedCPU, QuantizedCUDA, MPS, MTIA: view
8080
8129
  MkldnnCPU: mkldnn_view
8081
- NestedTensorCPU, NestedTensorCUDA: view_nested
8130
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: view_nested
8082
8131
  tags: core
8083
8132
 
8084
8133
  # Warning: If you want to change the name or overload name of this
@@ -8306,7 +8355,7 @@
8306
8355
  structured_inherits: TensorIteratorBase
8307
8356
  variants: function
8308
8357
  dispatch:
8309
- CPU, CUDA: bitwise_and_out
8358
+ CPU, CUDA, MTIA: bitwise_and_out
8310
8359
  MPS: bitwise_and_out_mps
8311
8360
  tags: pointwise
8312
8361
 
@@ -8373,7 +8422,7 @@
8373
8422
  structured_inherits: TensorIteratorBase
8374
8423
  variants: function
8375
8424
  dispatch:
8376
- CPU, CUDA: bitwise_or_out
8425
+ CPU, CUDA, MTIA: bitwise_or_out
8377
8426
  MPS: bitwise_or_out_mps
8378
8427
  tags: pointwise
8379
8428
 
@@ -8919,7 +8968,7 @@
8919
8968
  variants: method, function
8920
8969
  dispatch:
8921
8970
  QuantizedCPU: eq_quantized_cpu
8922
- NestedTensorCPU, NestedTensorCUDA: eq_scalar_nested
8971
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: eq_scalar_nested
8923
8972
  tags: [core, pointwise]
8924
8973
 
8925
8974
  - func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -8938,7 +8987,7 @@
8938
8987
  variants: method, function
8939
8988
  dispatch:
8940
8989
  QuantizedCPU: eq_quantized_cpu
8941
- NestedTensorCPU, NestedTensorCUDA: eq_tensor_nested
8990
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: eq_tensor_nested
8942
8991
  tags: [core, pointwise]
8943
8992
 
8944
8993
  - func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
@@ -8957,7 +9006,7 @@
8957
9006
  variants: method, function
8958
9007
  dispatch:
8959
9008
  QuantizedCPU: ge_quantized_cpu
8960
- NestedTensorCPU, NestedTensorCUDA: ge_scalar_nested
9009
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: ge_scalar_nested
8961
9010
  tags: [core, pointwise]
8962
9011
 
8963
9012
  - func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -9084,7 +9133,7 @@
9084
9133
  variants: method, function
9085
9134
  dispatch:
9086
9135
  QuantizedCPU: gt_quantized_cpu
9087
- NestedTensorCPU, NestedTensorCUDA: gt_scalar_nested
9136
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: gt_scalar_nested
9088
9137
  tags: [core, pointwise]
9089
9138
 
9090
9139
  - func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -9137,7 +9186,7 @@
9137
9186
  structured_inherits: TensorIteratorBase
9138
9187
  device_check: NoCheck # TensorIterator
9139
9188
  dispatch:
9140
- CPU, CUDA: lt_Scalar_out
9189
+ CPU, CUDA, MTIA: lt_Scalar_out
9141
9190
  MPS: lt_scalar_out_mps
9142
9191
  QuantizedCPU: lt_out_quantized_cpu
9143
9192
  tags: pointwise
@@ -9155,7 +9204,7 @@
9155
9204
  structured_inherits: TensorIteratorBase
9156
9205
  device_check: NoCheck # TensorIterator
9157
9206
  dispatch:
9158
- CPU, CUDA: lt_Tensor_out
9207
+ CPU, CUDA, MTIA: lt_Tensor_out
9159
9208
  MPS: lt_tensor_out_mps
9160
9209
  QuantizedCPU: lt_out_quantized_cpu
9161
9210
  tags: pointwise
@@ -9274,12 +9323,12 @@
9274
9323
  MPS: nonzero_mps
9275
9324
  tags: [dynamic_output_shape, core]
9276
9325
 
9277
- - func: nonzero_static.out(Tensor self, *, int size, int fill_value=-1, Tensor(a!) out) -> Tensor(a!)
9326
+ - func: nonzero_static.out(Tensor self, *, SymInt size, int fill_value=-1, Tensor(a!) out) -> Tensor(a!)
9278
9327
  dispatch:
9279
9328
  CPU: nonzero_static_out_cpu
9280
9329
  CUDA: nonzero_static_out_cuda
9281
9330
 
9282
- - func: nonzero_static(Tensor self, *, int size, int fill_value=-1) -> Tensor
9331
+ - func: nonzero_static(Tensor self, *, SymInt size, int fill_value=-1) -> Tensor
9283
9332
  variants: method, function
9284
9333
  dispatch:
9285
9334
  CPU: nonzero_static_cpu
@@ -9427,12 +9476,12 @@
9427
9476
 
9428
9477
  - func: cholesky.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
9429
9478
  dispatch:
9430
- CPU, CUDA: cholesky_out
9479
+ CPU, CUDA, MPS: cholesky_out
9431
9480
 
9432
9481
  - func: cholesky(Tensor self, bool upper=False) -> Tensor
9433
9482
  variants: method, function
9434
9483
  dispatch:
9435
- CPU, CUDA: cholesky
9484
+ CPU, CUDA, MPS: cholesky
9436
9485
 
9437
9486
  - func: cholesky_solve.out(Tensor self, Tensor input2, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
9438
9487
  dispatch:
@@ -9506,15 +9555,16 @@
9506
9555
  structured: True
9507
9556
  dispatch:
9508
9557
  CPU, CUDA: lu_unpack_out
9558
+ MPS: lu_unpack_out_mps
9509
9559
 
9510
9560
  # TODO: remove dispatch section when porting TH CUDA to ATen
9511
- - func: multinomial.out(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
9561
+ - func: multinomial.out(Tensor self, SymInt num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
9512
9562
  tags: nondeterministic_seeded
9513
9563
  dispatch:
9514
9564
  CPU, CUDA: multinomial_out
9515
9565
  MPS: multinomial_out_mps
9516
9566
 
9517
- - func: multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor
9567
+ - func: multinomial(Tensor self, SymInt num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor
9518
9568
  variants: method, function
9519
9569
  dispatch:
9520
9570
  CPU, CUDA: multinomial
@@ -9602,8 +9652,7 @@
9602
9652
  structured: True
9603
9653
  structured_inherits: TensorIteratorBase
9604
9654
  dispatch:
9605
- CPU, CUDA: erfinv_out
9606
- MPS: erfinv_out_mps
9655
+ CPU, CUDA, MPS: erfinv_out
9607
9656
  SparseCPU, SparseCUDA: erfinv_sparse_out
9608
9657
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_out
9609
9658
  tags: pointwise
@@ -9716,8 +9765,7 @@
9716
9765
  structured: True
9717
9766
  structured_inherits: TensorIteratorBase
9718
9767
  dispatch:
9719
- CPU, CUDA: lerp_Scalar
9720
- MPS: lerp_Scalar_mps
9768
+ CPU, CUDA, MPS: lerp_Scalar
9721
9769
  tags: pointwise
9722
9770
 
9723
9771
  - func: lerp.Tensor_out(Tensor self, Tensor end, Tensor weight, *, Tensor(a!) out) -> Tensor(a!)
@@ -9816,8 +9864,7 @@
9816
9864
  structured: True
9817
9865
  structured_inherits: TensorIteratorBase
9818
9866
  dispatch:
9819
- CPU, CUDA: fmod_out
9820
- MPS: fmod_mps_out
9867
+ CPU, CUDA, MPS: fmod_out
9821
9868
  tags: pointwise
9822
9869
 
9823
9870
  - func: fmod.Tensor(Tensor self, Tensor other) -> Tensor
@@ -9923,8 +9970,7 @@
9923
9970
  structured: True
9924
9971
  structured_inherits: TensorIteratorBase
9925
9972
  dispatch:
9926
- CPU, CUDA: remainder_out
9927
- MPS: remainder_out_mps
9973
+ CPU, CUDA, MPS, MTIA: remainder_out
9928
9974
  tags: pointwise
9929
9975
 
9930
9976
  - func: remainder.Tensor(Tensor self, Tensor other) -> Tensor
@@ -10008,7 +10054,7 @@
10008
10054
  structured_inherits: TensorIteratorBase
10009
10055
  device_check: NoCheck # TensorIterator
10010
10056
  dispatch:
10011
- CPU, CUDA: maximum_out
10057
+ CPU, CUDA, MTIA: maximum_out
10012
10058
  MPS: maximum_out_mps
10013
10059
  tags: pointwise
10014
10060
 
@@ -10040,7 +10086,7 @@
10040
10086
  structured_inherits: TensorIteratorBase
10041
10087
  device_check: NoCheck # TensorIterator
10042
10088
  dispatch:
10043
- CPU, CUDA: minimum_out
10089
+ CPU, CUDA, MTIA: minimum_out
10044
10090
  MPS: minimum_out_mps
10045
10091
  tags: pointwise
10046
10092
 
@@ -10192,7 +10238,7 @@
10192
10238
  device_check: NoCheck
10193
10239
  device_guard: False
10194
10240
  dispatch:
10195
- CPU, CUDA, Meta, MPS: unfold
10241
+ CPU, CUDA, Meta, MPS, MTIA: unfold
10196
10242
  QuantizedCPU, QuantizedCUDA: unfold
10197
10243
 
10198
10244
  - func: unfold_backward(Tensor grad_in, SymInt[] input_sizes, int dim, int size, int step) -> Tensor
@@ -10305,7 +10351,7 @@
10305
10351
  MPS: normal_mps_
10306
10352
  Meta: normal_meta_
10307
10353
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: normal_sparse_csr_
10308
- NestedTensorCPU, NestedTensorCUDA: normal_nested_
10354
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: normal_nested_
10309
10355
  autogen: normal.out
10310
10356
 
10311
10357
  # Only used by the functionalization pass.
@@ -10373,7 +10419,7 @@
10373
10419
  variants: method, function
10374
10420
  dispatch:
10375
10421
  CompositeExplicitAutograd: alias
10376
- NestedTensorCPU, NestedTensorCUDA: alias_nested
10422
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: alias_nested
10377
10423
  tags: core
10378
10424
 
10379
10425
  - func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> ()
@@ -10381,6 +10427,7 @@
10381
10427
  dispatch:
10382
10428
  CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_
10383
10429
  CPU: _amp_foreach_non_finite_check_and_unscale_cpu_
10430
+ MPS: _amp_foreach_non_finite_check_and_unscale_mps_
10384
10431
  autogen: _amp_foreach_non_finite_check_and_unscale, _amp_foreach_non_finite_check_and_unscale.out
10385
10432
 
10386
10433
  - func: _amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!)
@@ -10388,6 +10435,7 @@
10388
10435
  dispatch:
10389
10436
  CUDA: _amp_update_scale_cuda_
10390
10437
  CPU: _amp_update_scale_cpu_
10438
+ MPS: _amp_update_scale_mps_
10391
10439
  autogen: _amp_update_scale, _amp_update_scale.out
10392
10440
 
10393
10441
  #- func: _cat(Tensor[] tensors, int dim=0) -> Tensor
@@ -11790,7 +11838,7 @@
11790
11838
  structured_delegate: elu.out
11791
11839
  device_check: NoCheck # TensorIterator
11792
11840
  python_module: nn
11793
- tags: pointwise
11841
+ tags: [core, pointwise]
11794
11842
 
11795
11843
  - func: elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result, *, Tensor(a!) grad_input) -> Tensor(a!)
11796
11844
  structured: True
@@ -11854,8 +11902,7 @@
11854
11902
  device_check: NoCheck # TensorIterator
11855
11903
  python_module: nn
11856
11904
  dispatch:
11857
- CPU, CUDA: hardsigmoid_out
11858
- MPS: hardsigmoid_out_mps
11905
+ CPU, CUDA, MPS: hardsigmoid_out
11859
11906
  QuantizedCPU: hardsigmoid_out_quantized_cpu
11860
11907
 
11861
11908
  - func: hardsigmoid(Tensor self) -> Tensor
@@ -11876,8 +11923,7 @@
11876
11923
  structured_inherits: TensorIteratorBase
11877
11924
  python_module: nn
11878
11925
  dispatch:
11879
- CPU, CUDA: hardsigmoid_backward_out
11880
- MPS: hardsigmoid_backward_out_mps
11926
+ CPU, CUDA, MPS: hardsigmoid_backward_out
11881
11927
 
11882
11928
  - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor
11883
11929
  structured_delegate: hardsigmoid_backward.grad_input
@@ -11921,28 +11967,24 @@
11921
11967
  device_check: NoCheck # TensorIterator
11922
11968
  python_module: nn
11923
11969
  dispatch:
11924
- CPU, CUDA: hardswish_out
11925
- MPS: hardswish_out_mps
11970
+ CPU, CUDA, MPS: hardswish_out
11926
11971
 
11927
11972
  - func: hardswish(Tensor self) -> Tensor
11928
11973
  device_check: NoCheck # TensorIterator
11929
11974
  python_module: nn
11930
11975
  dispatch:
11931
- CPU, CUDA: hardswish
11932
- MPS: hardswish_mps
11976
+ CPU, CUDA, MPS: hardswish
11933
11977
 
11934
11978
  - func: hardswish_(Tensor(a!) self) -> Tensor(a!)
11935
11979
  device_check: NoCheck # TensorIterator
11936
11980
  python_module: nn
11937
11981
  dispatch:
11938
- CPU, CUDA: hardswish_
11939
- MPS: hardswish_mps_
11982
+ CPU, CUDA, MPS: hardswish_
11940
11983
 
11941
11984
  - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor
11942
11985
  python_module: nn
11943
11986
  dispatch:
11944
- CPU, CUDA: hardswish_backward
11945
- MPS: hardswish_backward_mps
11987
+ CPU, CUDA, MPS: hardswish_backward
11946
11988
  autogen: hardswish_backward.out
11947
11989
 
11948
11990
  - func: leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!)
@@ -11951,8 +11993,7 @@
11951
11993
  device_check: NoCheck # TensorIterator
11952
11994
  python_module: nn
11953
11995
  dispatch:
11954
- CPU, CUDA: leaky_relu_out
11955
- MPS: leaky_relu_out_mps
11996
+ CPU, CUDA, MPS: leaky_relu_out
11956
11997
  QuantizedCPU: leaky_relu_out_quantized_cpu
11957
11998
 
11958
11999
  - func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor
@@ -11968,8 +12009,7 @@
11968
12009
  structured_inherits: TensorIteratorBase
11969
12010
  python_module: nn
11970
12011
  dispatch:
11971
- CPU, CUDA: leaky_relu_backward_out
11972
- MPS: leaky_relu_backward_out_mps
12012
+ CPU, CUDA, MPS: leaky_relu_backward_out
11973
12013
 
11974
12014
  - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor
11975
12015
  structured_delegate: leaky_relu_backward.grad_input
@@ -12081,8 +12121,7 @@
12081
12121
  device_check: NoCheck # TensorIterator
12082
12122
  python_module: nn
12083
12123
  dispatch:
12084
- CPU, CUDA: softshrink_out
12085
- MPS: softshrink_out_mps
12124
+ CPU, CUDA, MPS: softshrink_out
12086
12125
 
12087
12126
  - func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor
12088
12127
  structured_delegate: softshrink.out
@@ -12095,8 +12134,7 @@
12095
12134
  structured_inherits: TensorIteratorBase
12096
12135
  python_module: nn
12097
12136
  dispatch:
12098
- CPU, CUDA: softshrink_backward_out
12099
- MPS: softshrink_backward_out_mps
12137
+ CPU, CUDA, MPS: softshrink_backward_out
12100
12138
 
12101
12139
  - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor
12102
12140
  structured_delegate: softshrink_backward.grad_input
@@ -12711,6 +12749,7 @@
12711
12749
  dispatch:
12712
12750
  CPU: _upsample_bilinear2d_aa_out_cpu
12713
12751
  CUDA: _upsample_bilinear2d_aa_out_cuda
12752
+ MPS: _upsample_bilinear2d_aa_out_mps
12714
12753
 
12715
12754
  - func: _upsample_bilinear2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
12716
12755
  python_module: nn
@@ -12757,6 +12796,7 @@
12757
12796
  dispatch:
12758
12797
  CPU: _upsample_bicubic2d_aa_out_cpu
12759
12798
  CUDA: _upsample_bicubic2d_aa_out_cuda
12799
+ MPS: _upsample_bicubic2d_aa_out_mps
12760
12800
 
12761
12801
  - func: _upsample_bicubic2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
12762
12802
  python_module: nn
@@ -12779,6 +12819,7 @@
12779
12819
  dispatch:
12780
12820
  CPU: upsample_trilinear3d_out_cpu
12781
12821
  CUDA: upsample_trilinear3d_out_cuda
12822
+ MPS: upsample_trilinear3d_out_mps
12782
12823
 
12783
12824
  - func: upsample_trilinear3d(Tensor self, SymInt[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
12784
12825
  python_module: nn
@@ -12790,6 +12831,7 @@
12790
12831
  dispatch:
12791
12832
  CPU: upsample_trilinear3d_backward_out_cpu
12792
12833
  CUDA: upsample_trilinear3d_backward_out_cuda
12834
+ MPS: upsample_trilinear3d_backward_out_mps
12793
12835
 
12794
12836
  - func: upsample_trilinear3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
12795
12837
  python_module: nn
@@ -12901,6 +12943,7 @@
12901
12943
  dispatch:
12902
12944
  CPU: upsample_nearest3d_out_cpu
12903
12945
  CUDA: upsample_nearest3d_out_cuda
12946
+ MPS: upsample_nearest3d_out_mps
12904
12947
 
12905
12948
  - func: _upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
12906
12949
  python_module: nn
@@ -12908,6 +12951,7 @@
12908
12951
  dispatch:
12909
12952
  CPU: _upsample_nearest_exact3d_out_cpu
12910
12953
  CUDA: _upsample_nearest_exact3d_out_cuda
12954
+ MPS: _upsample_nearest_exact3d_out_mps
12911
12955
 
12912
12956
  - func: upsample_nearest3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
12913
12957
  python_module: nn
@@ -12927,6 +12971,7 @@
12927
12971
  dispatch:
12928
12972
  CPU: upsample_nearest3d_backward_out_cpu
12929
12973
  CUDA: upsample_nearest3d_backward_out_cuda
12974
+ MPS: upsample_nearest3d_backward_out_mps
12930
12975
 
12931
12976
  - func: _upsample_nearest_exact3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
12932
12977
  python_module: nn
@@ -12934,6 +12979,7 @@
12934
12979
  dispatch:
12935
12980
  CPU: _upsample_nearest_exact3d_backward_out_cpu
12936
12981
  CUDA: _upsample_nearest_exact3d_backward_out_cuda
12982
+ MPS: _upsample_nearest_exact3d_backward_out_mps
12937
12983
 
12938
12984
  - func: upsample_nearest3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
12939
12985
  python_module: nn
@@ -12976,7 +13022,7 @@
12976
13022
  structured: True
12977
13023
  structured_inherits: TensorIteratorBase
12978
13024
  dispatch:
12979
- CPU, CUDA: tanh_backward_out
13025
+ CPU, CUDA, MTIA: tanh_backward_out
12980
13026
  MPS: tanh_backward_out_mps
12981
13027
  tags: pointwise
12982
13028
 
@@ -13058,7 +13104,6 @@
13058
13104
  autogen: _slow_conv2d_backward.output_mask_out
13059
13105
 
13060
13106
  - func: _conv_depthwise2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation, *, Tensor(a!) out) -> Tensor(a!)
13061
- use_const_ref_for_mutable_tensors: True
13062
13107
  python_module: nn
13063
13108
  dispatch:
13064
13109
  CUDA: conv_depthwise2d_cuda_out
@@ -13109,12 +13154,14 @@
13109
13154
  dispatch:
13110
13155
  CPU: col2im_out_cpu
13111
13156
  CUDA: col2im_out_cuda
13157
+ MPS: col2im_out_mps
13112
13158
 
13113
13159
  - func: col2im(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
13114
13160
  python_module: nn
13115
13161
  dispatch:
13116
13162
  CPU: col2im_cpu
13117
13163
  CUDA: col2im_cuda
13164
+ MPS: col2im_mps
13118
13165
  tags: core
13119
13166
 
13120
13167
  - func: column_stack(Tensor[] tensors) -> Tensor
@@ -13147,7 +13194,7 @@
13147
13194
  device_guard: False
13148
13195
  dispatch:
13149
13196
  CompositeExplicitAutograd: isinf
13150
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_isinf
13197
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isinf
13151
13198
  SparseCPU, SparseCUDA: isinf_sparse
13152
13199
  SparseMeta: isinf_sparse_meta
13153
13200
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isinf_sparse_csr
@@ -13163,7 +13210,7 @@
13163
13210
  variants: function, method
13164
13211
  structured_delegate: isposinf.out
13165
13212
  dispatch:
13166
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_isposinf
13213
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isposinf
13167
13214
  SparseCPU, SparseCUDA: isposinf_sparse
13168
13215
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr
13169
13216
  tags: pointwise
@@ -13181,7 +13228,7 @@
13181
13228
  variants: function, method
13182
13229
  structured_delegate: isneginf.out
13183
13230
  dispatch:
13184
- NestedTensorCPU, NestedTensorCUDA: NestedTensor_isneginf
13231
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isneginf
13185
13232
  SparseCPU, SparseCUDA: isneginf_sparse
13186
13233
  SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr
13187
13234
  tags: pointwise
@@ -13225,7 +13272,7 @@
13225
13272
  python_module: special
13226
13273
  variants: function
13227
13274
  dispatch:
13228
- CPU, CUDA: special_entr_out
13275
+ CPU, CUDA, MPS: special_entr_out
13229
13276
  tags: pointwise
13230
13277
 
13231
13278
  - func: special_ndtri(Tensor self) -> Tensor
@@ -13372,7 +13419,7 @@
13372
13419
  python_module: special
13373
13420
  variants: function
13374
13421
  dispatch:
13375
- CPU, CUDA: special_xlog1py_out
13422
+ CPU, CUDA, MPS: special_xlog1py_out
13376
13423
  tags: pointwise
13377
13424
 
13378
13425
  - func: special_xlog1py.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -13451,7 +13498,7 @@
13451
13498
  python_module: special
13452
13499
  variants: function
13453
13500
  dispatch:
13454
- CPU, CUDA: special_zeta_out
13501
+ CPU, CUDA, MPS: special_zeta_out
13455
13502
  tags: pointwise
13456
13503
 
13457
13504
  - func: special_zeta.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -13489,7 +13536,7 @@
13489
13536
  structured: True
13490
13537
  structured_inherits: TensorIteratorBase
13491
13538
  dispatch:
13492
- CPU, CUDA: special_i0e_out
13539
+ CPU, CUDA, MPS: special_i0e_out
13493
13540
  tags: pointwise
13494
13541
 
13495
13542
  - func: special_i1(Tensor self) -> Tensor
@@ -13517,7 +13564,7 @@
13517
13564
  structured: True
13518
13565
  structured_inherits: TensorIteratorBase
13519
13566
  dispatch:
13520
- CPU, CUDA: special_i1e_out
13567
+ CPU, CUDA, MPS: special_i1e_out
13521
13568
  tags: pointwise
13522
13569
 
13523
13570
  - func: special_logit(Tensor self, float? eps=None) -> Tensor
@@ -13744,7 +13791,6 @@
13744
13791
  CompositeImplicitAutograd: fft_hfft2_symint
13745
13792
 
13746
13793
  - func: fft_hfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13747
- use_const_ref_for_mutable_tensors: True
13748
13794
  python_module: fft
13749
13795
  variants: function
13750
13796
  dispatch:
@@ -13758,7 +13804,6 @@
13758
13804
  CompositeImplicitAutograd: fft_ihfft2_symint
13759
13805
 
13760
13806
  - func: fft_ihfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13761
- use_const_ref_for_mutable_tensors: True
13762
13807
  python_module: fft
13763
13808
  variants: function
13764
13809
  dispatch:
@@ -13820,7 +13865,6 @@
13820
13865
  CompositeImplicitAutograd: fft_hfftn_symint
13821
13866
 
13822
13867
  - func: fft_hfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13823
- use_const_ref_for_mutable_tensors: True
13824
13868
  python_module: fft
13825
13869
  variants: function
13826
13870
  dispatch:
@@ -13834,7 +13878,6 @@
13834
13878
  CompositeImplicitAutograd: fft_ihfftn_symint
13835
13879
 
13836
13880
  - func: fft_ihfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13837
- use_const_ref_for_mutable_tensors: True
13838
13881
  python_module: fft
13839
13882
  variants: function
13840
13883
  dispatch:
@@ -13890,7 +13933,7 @@
13890
13933
  python_module: linalg
13891
13934
  structured: True
13892
13935
  dispatch:
13893
- CPU, CUDA: linalg_cholesky_ex_out
13936
+ CPU, CUDA, MPS: linalg_cholesky_ex_out
13894
13937
 
13895
13938
  - func: linalg_cholesky(Tensor self, *, bool upper=False) -> Tensor
13896
13939
  python_module: linalg
@@ -13937,6 +13980,7 @@
13937
13980
  structured: True
13938
13981
  dispatch:
13939
13982
  CPU, CUDA: linalg_lu_factor_ex_out
13983
+ MPS: linalg_lu_factor_ex_out_mps
13940
13984
 
13941
13985
  # linalg.lu
13942
13986
  - func: linalg_lu(Tensor A, *, bool pivot=True) -> (Tensor P, Tensor L, Tensor U)
@@ -13971,7 +14015,7 @@
13971
14015
  - func: _linalg_det.result(Tensor A, *, Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots) -> (Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots)
13972
14016
  structured: True
13973
14017
  dispatch:
13974
- CPU, CUDA: _linalg_det_out
14018
+ CPU, CUDA, MPS: _linalg_det_out
13975
14019
 
13976
14020
  - func: linalg_det(Tensor A) -> Tensor
13977
14021
  python_module: linalg
@@ -14058,7 +14102,7 @@
14058
14102
  - func: _linalg_slogdet.sign(Tensor A, *, Tensor(a!) sign, Tensor(b!) logabsdet, Tensor(c!) LU, Tensor(d!) pivots) -> (Tensor(a!) sign, Tensor(b!) logabsdet, Tensor(c!) LU, Tensor(d!) pivots)
14059
14103
  structured: True
14060
14104
  dispatch:
14061
- CPU, CUDA: _linalg_slogdet_out
14105
+ CPU, CUDA, MPS: _linalg_slogdet_out
14062
14106
 
14063
14107
  - func: linalg_slogdet(Tensor A) -> (Tensor sign, Tensor logabsdet)
14064
14108
  python_module: linalg
@@ -14300,6 +14344,7 @@
14300
14344
  structured: True
14301
14345
  dispatch:
14302
14346
  CPU, CUDA: _linalg_solve_ex_out
14347
+ MPS: _linalg_solve_ex_out_mps
14303
14348
 
14304
14349
  - func: linalg_solve_ex(Tensor A, Tensor B, *, bool left=True, bool check_errors=False) -> (Tensor result, Tensor info)
14305
14350
  python_module: linalg
@@ -14458,13 +14503,13 @@
14458
14503
  dispatch:
14459
14504
  # the NestedTensor keys are necessary because NestedTensor has been removed
14460
14505
  # from the CompositeExplicitAutograd keyset see Note [NestedTensor Not Included in Backend Keys]
14461
- CompositeExplicitAutograd, NestedTensorCPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_fullcoverage
14506
+ CompositeExplicitAutograd, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_fullcoverage
14462
14507
  autogen: _test_autograd_multiple_dispatch.fullcoverage_out
14463
14508
 
14464
14509
  # Note: this function is only for testing.
14465
14510
  - func: _test_autograd_multiple_dispatch.ntonly(Tensor self, bool b) -> Tensor
14466
14511
  dispatch:
14467
- CompositeImplicitAutograd, NestedTensorCPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_ntonly
14512
+ CompositeImplicitAutograd, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _test_autograd_multiple_dispatch_ntonly
14468
14513
 
14469
14514
  # Note: this function is only for testing.
14470
14515
  - func: _test_autograd_multiple_dispatch_view(Tensor(a) self) -> Tensor(a)
@@ -14809,13 +14854,13 @@
14809
14854
  - func: _safe_softmax(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
14810
14855
  dispatch:
14811
14856
  CompositeExplicitAutograd: _safe_softmax
14812
- NestedTensorCPU, NestedTensorCUDA: _safe_softmax
14857
+ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: _safe_softmax
14813
14858
 
14814
14859
  # Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is.
14815
14860
  - func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None) -> Tensor
14816
14861
  variants: function
14817
14862
  dispatch:
14818
- CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: transformer_encoder_layer_forward
14863
+ CPU, CUDA, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: transformer_encoder_layer_forward
14819
14864
  autogen: _transformer_encoder_layer_fwd.out
14820
14865
 
14821
14866
  - func: _native_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True, int? mask_type=None) -> (Tensor, Tensor)
@@ -14837,6 +14882,7 @@
14837
14882
  Meta: _fused_sdp_choice_meta
14838
14883
  CPU, NestedTensorCPU: _fused_sdp_choice_cpp
14839
14884
  CUDA, NestedTensorCUDA: _fused_sdp_choice_cuda
14885
+ XPU: _fused_sdp_choice_xpu
14840
14886
  tags: nondeterministic_seeded
14841
14887
 
14842
14888
  - func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None, *, float? scale=None, bool enable_gqa=False) -> (Tensor, Tensor)
@@ -14848,7 +14894,7 @@
14848
14894
  MPS: _scaled_dot_product_attention_math_mps
14849
14895
  tags: nondeterministic_seeded
14850
14896
 
14851
- - func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14897
+ - func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor rng_state, Tensor unused, Tensor debug_attn_mask)
14852
14898
  dispatch:
14853
14899
  CUDA: _scaled_dot_product_flash_attention_cuda
14854
14900
  NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda
@@ -14862,6 +14908,7 @@
14862
14908
  - func: _scaled_dot_product_fused_attention_overrideable(Tensor query, Tensor key, Tensor value, Tensor? attn_bias=None, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14863
14909
  dispatch:
14864
14910
  CompositeExplicitAutograd: _scaled_dot_product_fused_attention_overrideable
14911
+ XPU: _scaled_dot_product_fused_attention_overrideable_xpu
14865
14912
  tags: nondeterministic_seeded
14866
14913
 
14867
14914
  - func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
@@ -14898,6 +14945,7 @@
14898
14945
  - func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14899
14946
  dispatch:
14900
14947
  CUDA: _scaled_dot_product_cudnn_attention_cuda
14948
+ NestedTensorCUDA: _scaled_dot_product_cudnn_attention_nestedtensor_cuda
14901
14949
  tags: nondeterministic_seeded
14902
14950
 
14903
14951
  - func: _scaled_dot_product_cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor attn_bias, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, *, float? scale=None) -> (Tensor, Tensor, Tensor)
@@ -14905,13 +14953,13 @@
14905
14953
  CUDA: _scaled_dot_product_cudnn_attention_backward_cuda
14906
14954
  tags: nondeterministic_seeded
14907
14955
 
14908
- - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? alibi_slopes=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14956
+ - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? alibi_slopes=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor rng_state, Tensor unused, Tensor debug_attn_mask)
14909
14957
  variants: function
14910
14958
  dispatch:
14911
14959
  CUDA: _flash_attention_forward
14912
14960
  tags: nondeterministic_seeded
14913
14961
 
14914
- - func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None) -> (Tensor, Tensor, Tensor)
14962
+ - func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor rng_state, Tensor unused, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None) -> (Tensor, Tensor, Tensor)
14915
14963
  device_check: NoCheck
14916
14964
  variants: function
14917
14965
  dispatch:
@@ -14930,6 +14978,11 @@
14930
14978
  dispatch:
14931
14979
  CUDA: _efficient_attention_backward
14932
14980
 
14981
+ - func: _cudnn_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
14982
+ dispatch:
14983
+ CUDA: _cudnn_attention_forward
14984
+ tags: nondeterministic_seeded
14985
+
14933
14986
  - func: _triton_scaled_dot_attention(Tensor q, Tensor k, Tensor v, float dropout_p=0.0) -> Tensor
14934
14987
  variants: function
14935
14988
  dispatch:
@@ -14972,7 +15025,7 @@
14972
15025
 
14973
15026
  - func: special_bessel_j0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
14974
15027
  dispatch:
14975
- CPU, CUDA: special_bessel_j0_out
15028
+ CPU, CUDA, MPS: special_bessel_j0_out
14976
15029
  python_module: special
14977
15030
  structured_inherits: TensorIteratorBase
14978
15031
  structured: True
@@ -14987,7 +15040,7 @@
14987
15040
 
14988
15041
  - func: special_bessel_j1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
14989
15042
  dispatch:
14990
- CPU, CUDA: special_bessel_j1_out
15043
+ CPU, CUDA, MPS: special_bessel_j1_out
14991
15044
  python_module: special
14992
15045
  structured_inherits: TensorIteratorBase
14993
15046
  structured: True
@@ -15002,7 +15055,7 @@
15002
15055
 
15003
15056
  - func: special_bessel_y0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
15004
15057
  dispatch:
15005
- CPU, CUDA: special_bessel_y0_out
15058
+ CPU, CUDA, MPS: special_bessel_y0_out
15006
15059
  python_module: special
15007
15060
  structured_inherits: TensorIteratorBase
15008
15061
  structured: True
@@ -15017,7 +15070,7 @@
15017
15070
 
15018
15071
  - func: special_bessel_y1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
15019
15072
  dispatch:
15020
- CPU, CUDA: special_bessel_y1_out
15073
+ CPU, CUDA, MPS: special_bessel_y1_out
15021
15074
  python_module: special
15022
15075
  structured_inherits: TensorIteratorBase
15023
15076
  structured: True
@@ -15050,7 +15103,7 @@
15050
15103
  - func: special_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
15051
15104
  device_check: NoCheck
15052
15105
  dispatch:
15053
- CPU, CUDA: special_chebyshev_polynomial_t_out
15106
+ CPU, CUDA, MPS: special_chebyshev_polynomial_t_out
15054
15107
  python_module: special
15055
15108
  structured_inherits: TensorIteratorBase
15056
15109
  structured: True
@@ -15099,7 +15152,7 @@
15099
15152
  - func: special_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
15100
15153
  device_check: NoCheck
15101
15154
  dispatch:
15102
- CPU, CUDA: special_chebyshev_polynomial_u_out
15155
+ CPU, CUDA, MPS: special_chebyshev_polynomial_u_out
15103
15156
  python_module: special
15104
15157
  structured_inherits: TensorIteratorBase
15105
15158
  structured: True
@@ -15148,7 +15201,7 @@
15148
15201
  - func: special_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
15149
15202
  device_check: NoCheck
15150
15203
  dispatch:
15151
- CPU, CUDA: special_chebyshev_polynomial_v_out
15204
+ CPU, CUDA, MPS: special_chebyshev_polynomial_v_out
15152
15205
  python_module: special
15153
15206
  structured_inherits: TensorIteratorBase
15154
15207
  structured: True
@@ -15197,7 +15250,7 @@
15197
15250
  - func: special_chebyshev_polynomial_w.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
15198
15251
  device_check: NoCheck
15199
15252
  dispatch:
15200
- CPU, CUDA: special_chebyshev_polynomial_w_out
15253
+ CPU, CUDA, MPS: special_chebyshev_polynomial_w_out
15201
15254
  python_module: special
15202
15255
  structured_inherits: TensorIteratorBase
15203
15256
  structured: True
@@ -15246,7 +15299,7 @@
15246
15299
  - func: special_hermite_polynomial_h.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
15247
15300
  device_check: NoCheck
15248
15301
  dispatch:
15249
- CPU, CUDA: special_hermite_polynomial_h_out
15302
+ CPU, CUDA, MPS: special_hermite_polynomial_h_out
15250
15303
  python_module: special
15251
15304
  structured_inherits: TensorIteratorBase
15252
15305
  structured: True
@@ -15295,7 +15348,7 @@
15295
15348
  - func: special_hermite_polynomial_he.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
15296
15349
  device_check: NoCheck
15297
15350
  dispatch:
15298
- CPU, CUDA: special_hermite_polynomial_he_out
15351
+ CPU, CUDA, MPS: special_hermite_polynomial_he_out
15299
15352
  python_module: special
15300
15353
  structured_inherits: TensorIteratorBase
15301
15354
  structured: True
@@ -15424,7 +15477,7 @@
15424
15477
 
15425
15478
  - func: special_modified_bessel_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
15426
15479
  dispatch:
15427
- CPU, CUDA: special_modified_bessel_i0_out
15480
+ CPU, CUDA, MPS: special_modified_bessel_i0_out
15428
15481
  python_module: special
15429
15482
  structured_inherits: TensorIteratorBase
15430
15483
  structured: True
@@ -15439,7 +15492,7 @@
15439
15492
 
15440
15493
  - func: special_modified_bessel_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
15441
15494
  dispatch:
15442
- CPU, CUDA: special_modified_bessel_i1_out
15495
+ CPU, CUDA, MPS: special_modified_bessel_i1_out
15443
15496
  python_module: special
15444
15497
  structured_inherits: TensorIteratorBase
15445
15498
  structured: True
@@ -15454,7 +15507,7 @@
15454
15507
 
15455
15508
  - func: special_modified_bessel_k0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
15456
15509
  dispatch:
15457
- CPU, CUDA: special_modified_bessel_k0_out
15510
+ CPU, CUDA, MPS: special_modified_bessel_k0_out
15458
15511
  python_module: special
15459
15512
  structured_inherits: TensorIteratorBase
15460
15513
  structured: True
@@ -15469,7 +15522,7 @@
15469
15522
 
15470
15523
  - func: special_modified_bessel_k1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
15471
15524
  dispatch:
15472
- CPU, CUDA: special_modified_bessel_k1_out
15525
+ CPU, CUDA, MPS: special_modified_bessel_k1_out
15473
15526
  python_module: special
15474
15527
  structured_inherits: TensorIteratorBase
15475
15528
  structured: True
@@ -15484,7 +15537,7 @@
15484
15537
 
15485
15538
  - func: special_scaled_modified_bessel_k0.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
15486
15539
  dispatch:
15487
- CPU, CUDA: special_scaled_modified_bessel_k0_out
15540
+ CPU, CUDA, MPS: special_scaled_modified_bessel_k0_out
15488
15541
  python_module: special
15489
15542
  structured_inherits: TensorIteratorBase
15490
15543
  structured: True
@@ -15499,7 +15552,7 @@
15499
15552
 
15500
15553
  - func: special_scaled_modified_bessel_k1.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
15501
15554
  dispatch:
15502
- CPU, CUDA: special_scaled_modified_bessel_k1_out
15555
+ CPU, CUDA, MPS: special_scaled_modified_bessel_k1_out
15503
15556
  python_module: special
15504
15557
  structured_inherits: TensorIteratorBase
15505
15558
  structured: True
@@ -15710,7 +15763,7 @@
15710
15763
 
15711
15764
  - func: special_spherical_bessel_j0.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
15712
15765
  dispatch:
15713
- CPU, CUDA: special_spherical_bessel_j0_out
15766
+ CPU, CUDA, MPS: special_spherical_bessel_j0_out
15714
15767
  python_module: special
15715
15768
  structured_inherits: TensorIteratorBase
15716
15769
  structured: True
@@ -15790,6 +15843,13 @@
15790
15843
  CPU: _fused_adagrad_kernel_cpu_
15791
15844
  autogen: _fused_adagrad, _fused_adagrad.out
15792
15845
 
15846
+ - func: _fused_adagrad_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor[] state_steps, *, Tensor lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
15847
+ device_check: NoCheck
15848
+ variants: function
15849
+ dispatch:
15850
+ CPU: _fused_adagrad_kernel_cpu_
15851
+ autogen: _fused_adagrad.tensor_lr, _fused_adagrad.tensor_lr_out
15852
+
15793
15853
  # This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts.
15794
15854
  - func: _propagate_xla_data(Tensor input, Tensor output) -> ()
15795
15855
  variants: function