torch-rb 0.23.1 → 0.24.0

This diff shows the changes between publicly released package versions as they appear in their public registries, and is provided for informational purposes only. The hunks below modify the copy of PyTorch's `native_functions.yaml` operator schema bundled with the gem; each one adds, removes, or retargets backend dispatch entries (CPU, CUDA, MPS, XPU, MTIA, ...) for ATen operators.
@@ -418,11 +418,13 @@
   variants: function
   dispatch:
     CPU, CUDA, MPS, Meta: view_as_real
+    SparseCPU, SparseCUDA, SparseMPS: view_as_real_sparse

 - func: view_as_complex(Tensor(a) self) -> Tensor(a)
   variants: function
   dispatch:
     CPU, CUDA, MPS, Meta: view_as_complex
+    SparseCPU, SparseCUDA, SparseMPS: view_as_complex_sparse

 - func: sgn(Tensor self) -> Tensor
   variants: function, method
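The two additions above extend complex↔real viewing to sparse COO layouts. For reference, a minimal sketch of the dense semantics that the new `view_as_real_sparse`/`view_as_complex_sparse` kernels mirror:

```python
import torch

z = torch.tensor([1 + 2j, 3 - 4j])               # complex64
r = torch.view_as_real(z)                         # shape (2, 2); last dim holds (real, imag)
assert r.tolist() == [[1.0, 2.0], [3.0, -4.0]]
assert torch.equal(torch.view_as_complex(r), z)   # lossless round trip
# Per this diff, the same ops now also dispatch for SparseCPU/SparseCUDA/SparseMPS inputs.
```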
@@ -478,6 +480,7 @@
     CompositeExplicitAutograd: _conj_physical
     SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr
   autogen: _conj_physical.out
+  tags: pointwise

 - func: conj_physical(Tensor self) -> Tensor
   variants: function, method
@@ -1089,11 +1092,13 @@
   variants: function
   dispatch:
     CUDA: _baddbmm_dtype_cuda
+    XPU: _baddbmm_dtype_xpu

 - func: baddbmm.dtype_out(Tensor self, Tensor batch1, Tensor batch2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
   variants: function
   dispatch:
     CUDA: _baddbmm_out_dtype_cuda
+    XPU: _baddbmm_out_dtype_xpu

 - func: bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   dispatch:
@@ -1318,7 +1323,7 @@
 - func: logical_xor.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
   dispatch:
-    CPU, CUDA: logical_xor_out
+    CPU, CUDA, MTIA: logical_xor_out
     MPS: logical_xor_out_mps
   tags: pointwise

@@ -1403,11 +1408,13 @@
   variants: function
   dispatch:
     CUDA: _bmm_dtype_cuda
+    XPU: _bmm_dtype_xpu

 - func: bmm.dtype_out(Tensor self, Tensor mat2, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   dispatch:
     CUDA: _bmm_out_dtype_cuda
+    XPU: _bmm_out_dtype_xpu

 - func: broadcast_tensors(Tensor[] tensors) -> Tensor[]
   device_check: NoCheck
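The `baddbmm.dtype`/`bmm.dtype` overloads above were CUDA-only and now gain XPU kernels. They return a batched-matmul result in an explicitly requested dtype in one call; a hedged sketch via the dispatcher (assuming, as the schema suggests, that `out_dtype` selects only the result dtype — exact accumulation behavior is kernel-defined):

```python
import torch

a = torch.randn(4, 8, 16, device="cuda", dtype=torch.bfloat16)
b = torch.randn(4, 16, 32, device="cuda", dtype=torch.bfloat16)

# Private overload, reachable through the dispatcher on CUDA (and now XPU) builds.
out = torch.ops.aten.bmm.dtype(a, b, torch.float32)
assert out.dtype == torch.float32

# Plain-torch reference, equal up to accumulation order: upcast, multiply, compare.
ref = torch.bmm(a.float(), b.float())
assert torch.allclose(out, ref, atol=1e-2)
```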
@@ -2311,6 +2318,7 @@
   dispatch:
     CPU: vdot
     CUDA: vdot_cuda
+    MPS: vdot_mps

 - func: vdot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
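`vdot` gains an MPS kernel here. Unlike `dot`, `vdot` conjugates its first argument, which only matters for complex inputs:

```python
import torch

a = torch.tensor([1 + 2j, 3 + 4j])
b = torch.tensor([5 + 6j, 7 + 8j])
assert torch.vdot(a, b) == (a.conj() * b).sum()   # conjugate-linear in `a`
assert torch.dot(a, b) == (a * b).sum()           # `dot` does not conjugate
```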
@@ -2728,8 +2736,7 @@
   device_check: NoCheck # TensorIterator
   variants: function, method
   dispatch:
-    CPU, CUDA: fill_
-    MPS: fill_scalar_mps
+    CPU, CUDA, MPS: fill_
     QuantizedCPU, QuantizedCUDA: fill_quantized_
     Meta: fill_meta_
     SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: fill_sparse_csr_
@@ -2740,8 +2747,7 @@
   device_check: NoCheck # TensorIterator
   variants: function, method
   dispatch:
-    CPU, CUDA: fill_
-    MPS: fill_tensor_mps_
+    CPU, CUDA, MPS: fill_
     QuantizedCPU, QuantizedCUDA: fill_quantized_
     Meta: fill_meta_
     NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: fill_nested_
@@ -2870,7 +2876,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: gcd_out
+    CPU, CUDA, MPS: gcd_out
   tags: pointwise

 - func: gcd(Tensor self, Tensor other) -> Tensor
@@ -2941,6 +2947,8 @@
   autogen: _grid_sampler_2d_cpu_fallback.out

 - func: _grid_sampler_2d_cpu_fallback_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor)
+  dispatch:
+    CompositeExplicitAutograd: _grid_sampler_2d_cpu_fallback_backward

 - func: grid_sampler_3d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
   dispatch:
@@ -3043,6 +3051,7 @@
     CPU: _fft_c2r_mkl
     CUDA: _fft_c2r_cufft
     MPS: _fft_c2r_mps
+  tags: core

 - func: _fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
@@ -3288,6 +3297,42 @@
   device_guard: False
   manual_cpp_binding: True

+- func: numel(Tensor self) -> int
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  manual_cpp_binding: True
+
+- func: dim(Tensor self) -> int
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  manual_cpp_binding: True
+
+- func: get_device(Tensor self) -> int
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  manual_cpp_binding: True
+
+- func: storage_offset(Tensor self) -> int
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  manual_cpp_binding: True
+
+- func: is_contiguous(Tensor self) -> bool
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  manual_cpp_binding: True
+
+- func: is_contiguous.memory_format(Tensor self, MemoryFormat memory_format) -> bool
+  variants: method
+  device_check: NoCheck
+  device_guard: False
+  manual_cpp_binding: True
+
 - func: kl_div(Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor

 - func: kron(Tensor self, Tensor other) -> Tensor
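The new block above registers long-standing Tensor introspection methods (`numel`, `dim`, `get_device`, `storage_offset`, `is_contiguous`) as schema entries with manual C++ bindings; their Python-level behavior is unchanged:

```python
import torch

t = torch.zeros(2, 3, 4)
assert t.numel() == 24
assert t.dim() == 3
assert t.storage_offset() == 0
assert t.get_device() == -1                     # -1 denotes a CPU tensor
assert t.is_contiguous()
assert not t.permute(0, 2, 1).is_contiguous()

# The .memory_format overload checks contiguity under a specific layout:
nchw = torch.zeros(1, 3, 8, 8)
assert not nchw.is_contiguous(memory_format=torch.channels_last)
```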
@@ -3342,11 +3387,13 @@
   dispatch:
     CUDA: _fused_rms_norm_cuda
     MPS: _fused_rms_norm_mps
+    XPU: _fused_rms_norm_xpu
     CompositeImplicitAutograd: rms_norm_composite

 - func: _fused_rms_norm_backward(Tensor grad_out, Tensor input, int[] normalized_shape, Tensor rstd, Tensor? weight, bool[2] output_mask) -> (Tensor, Tensor)
   dispatch:
     CUDA: _fused_rms_norm_backward_cuda
+    XPU: _fused_rms_norm_backward_xpu

 - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
   variants: function, method
@@ -3476,13 +3523,20 @@

 - func: ldexp.Tensor(Tensor self, Tensor other) -> Tensor
   variants: function, method
+  tags: pointwise
+  dispatch:
+    CompositeExplicitAutograd: ldexp

 - func: ldexp_(Tensor(a!) self, Tensor other) -> Tensor(a!)
   variants: function, method
   tags: pointwise
+  dispatch:
+    CompositeExplicitAutograd: ldexp_

 - func: ldexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   tags: pointwise
+  dispatch:
+    CompositeExplicitAutograd: ldexp_out

 - func: linspace(Scalar start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   dispatch:
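`ldexp` and its in-place/out variants gain explicit `CompositeExplicitAutograd` kernels and `pointwise` tags above. The op composes a value from a mantissa and an integer power of two:

```python
import torch

x = torch.tensor([1.0, 2.0, 3.0])
n = torch.tensor([1, 2, 3])
assert torch.equal(torch.ldexp(x, n), x * 2.0 ** n)  # ldexp(x, n) = x * 2**n
x.ldexp_(n)                                          # in-place variant
```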
@@ -3676,8 +3730,7 @@
   structured_inherits: TensorIteratorBase
   variants: function
   dispatch:
-    CPU, CUDA: xlogy_out
-    MPS: xlogy_out_mps
+    CPU, CUDA, MPS: xlogy_out
   tags: pointwise

 - func: xlogy.OutScalar_Self(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -3871,7 +3924,7 @@
   device_check: NoCheck # TensorIterator
   structured: True
   dispatch:
-    CPU, CUDA, MTIA: aminmax_out
+    CPU, CUDA: aminmax_out
     MPS: aminmax_out_mps
   tags: reduction

@@ -3926,7 +3979,7 @@
 - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
   dispatch:
-    CPU, CUDA, MTIA: amax_out
+    CPU, CUDA: amax_out
     MPS: amax_out_mps
   tags: reduction

@@ -4115,7 +4168,7 @@
 - func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
   dispatch:
-    CPU, CUDA, MTIA: amin_out
+    CPU, CUDA: amin_out
     MPS: amin_out_mps
   tags: reduction

@@ -4193,6 +4246,27 @@
     CUDA: miopen_rnn_backward
   autogen: miopen_rnn_backward.out

+- func: _use_miopen_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool
+  device_check: NoCheck # Tensor arguments allowed to be on different devices, see also miopen_ctc_loss
+  dispatch:
+    CUDA: _use_miopen_ctc_loss
+
+- func: _use_miopen_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank) -> bool
+  device_check: NoCheck # Tensor arguments allowed to be on different devices, see also miopen_ctc_loss
+  dispatch:
+    CUDA: _use_miopen_ctc_loss_tensor
+
+- func: miopen_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)
+  device_check: NoCheck # log_probs is expected to be on CUDA while targets is expected to be on CPU
+  dispatch:
+    CUDA: miopen_ctc_loss
+  autogen: miopen_ctc_loss.out
+
+- func: miopen_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)
+  device_check: NoCheck # log_probs is expected to be on CUDA while targets is expected to be on CPU
+  dispatch:
+    CUDA: miopen_ctc_loss_tensor
+
 - func: mm(Tensor self, Tensor mat2) -> Tensor
   structured_delegate: mm.out
   variants: function, method
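The four new entries wire CTC loss through ROCm's MIOpen, mirroring the existing cuDNN pair (`_use_cudnn_ctc_loss`/`cudnn_ctc_loss`); note the schema comments about mixed devices (log-probabilities on the GPU, targets on the CPU). User code keeps calling the public API, which selects a backend internally:

```python
import torch
import torch.nn.functional as F

# (T, N, C): time steps, batch, classes; log-softmax over classes.
log_probs = torch.randn(50, 16, 20).log_softmax(2)
targets = torch.randint(1, 20, (16, 30), dtype=torch.long)
input_lengths = torch.full((16,), 50, dtype=torch.long)
target_lengths = torch.randint(10, 30, (16,), dtype=torch.long)

# On ROCm builds this can now route to MIOpen when the new
# _use_miopen_ctc_loss predicate accepts the inputs (per this diff).
loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths,
                  blank=0, zero_infinity=True)
```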
@@ -4215,10 +4289,12 @@
 - func: mm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor
   dispatch:
     CUDA: _mm_dtype_cuda
+    XPU: _mm_dtype_xpu

 - func: mm.dtype_out(Tensor self, Tensor mat2, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CUDA: _mm_dtype_out_cuda
+    XPU: _mm_dtype_out_xpu

 - func: _int_mm(Tensor self, Tensor mat2) -> Tensor
   dispatch:
@@ -4381,7 +4457,7 @@

 - func: mvlgamma.out(Tensor self, int p, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
-    CPU, CUDA: mvlgamma_out
+    CPU, CUDA, MPS: mvlgamma_out
   tags: pointwise

 - func: mvlgamma(Tensor self, int p) -> Tensor
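`mvlgamma.out` gains an MPS kernel. The multivariate log-gamma it computes reduces to a sum of ordinary `lgamma` terms plus a constant, which doubles as a correctness check:

```python
import math
import torch

x, p = torch.tensor([3.0, 4.5]), 3
# log Γ_p(x) = p(p-1)/4 · log π + Σ_{j=1..p} lgamma(x + (1-j)/2),  valid for x > (p-1)/2
manual = p * (p - 1) / 4 * math.log(math.pi) + sum(
    torch.lgamma(x + (1 - j) / 2) for j in range(1, p + 1)
)
assert torch.allclose(torch.mvlgamma(x, p), manual)
```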
@@ -4993,8 +5069,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA, MTIA: reciprocal_out
-    MPS: reciprocal_out_mps
+    CPU, CUDA, MPS, MTIA: reciprocal_out
   tags: pointwise

 - func: neg(Tensor self) -> Tensor
@@ -5344,6 +5419,7 @@

 - func: selu_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
+  tags: pointwise

 - func: celu(Tensor self, Scalar alpha=1.0) -> Tensor
   device_check: NoCheck # TensorIterator
@@ -5356,6 +5432,7 @@
   dispatch:
     CompositeExplicitAutograd: celu_
   autogen: celu.out
+  tags: pointwise

 - func: silu(Tensor self) -> Tensor
   structured_delegate: silu.out
@@ -5376,8 +5453,7 @@
   structured_inherits: TensorIteratorBase
   python_module: nn
   dispatch:
-    CPU, CUDA, MTIA: silu_out
-    MPS: silu_out_mps
+    CPU, CUDA, MPS, MTIA: silu_out
   tags: pointwise

 - func: silu_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
@@ -5385,8 +5461,7 @@
   structured_inherits: TensorIteratorBase
   python_module: nn
   dispatch:
-    CPU, CUDA: silu_backward_out
-    MPS: silu_backward_out_mps
+    CPU, CUDA, MPS: silu_backward_out
   tags: pointwise

 - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor
@@ -6532,6 +6607,7 @@
   dispatch:
     CPU: _unique_cpu
     CUDA: _unique_cuda
+    MPS: _unique_mps
   autogen: _unique.out

 - func: unique_dim(Tensor self, int dim, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor)
@@ -6618,7 +6694,6 @@
   dispatch:
     CPU, CUDA: var
     MPS: var_mps
-    MTIA: var_mtia
   tags: [core, reduction]

 - func: var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
@@ -6780,6 +6855,7 @@
   dispatch:
     CPU: _standard_gamma_grad_cpu
     CUDA: _standard_gamma_grad_cuda
+    MPS: _standard_gamma_grad_mps
   autogen: _standard_gamma_grad.out

 - func: _standard_gamma(Tensor self, Generator? generator=None) -> Tensor
@@ -6787,9 +6863,32 @@
   dispatch:
     CPU: _s_gamma_cpu
     CUDA: _s_gamma_cuda
+    MPS: _s_gamma_mps
   tags: nondeterministic_seeded
   autogen: _standard_gamma.out

+- func: _philox_key_split(Tensor key, int num_splits) -> Tensor
+  variants: function
+  dispatch:
+    CUDA: _philox_key_split_cuda
+
+- func: _philox_key_fold_in(Tensor key, int data) -> Tensor
+  variants: function
+  dispatch:
+    CUDA: _philox_key_fold_in_cuda
+
+- func: _philox_normal_(Tensor(a!) self, Tensor key, float mean=0, float std=1) -> Tensor(a!)
+  variants: function, method
+  dispatch:
+    CUDA: _philox_normal_cuda_
+  autogen: _philox_normal, _philox_normal.out
+
+- func: _philox_uniform_(Tensor(a!) self, Tensor key, float low=0, float high=1) -> Tensor(a!)
+  variants: function, method
+  dispatch:
+    CUDA: _philox_uniform_cuda_
+  autogen: _philox_uniform, _philox_uniform.out
+
 - func: _dirichlet_grad(Tensor x, Tensor alpha, Tensor total) -> Tensor
   dispatch:
     CPU: _dirichlet_grad_cpu
@@ -6978,16 +7077,14 @@
   structured: True
   device_check: NoCheck # TensorIterator
   dispatch:
-    CPU, CUDA: norm_dtype_out
-    MPS: norm_dtype_out_mps
+    CPU, CUDA, MPS: norm_dtype_out
   tags: reduction

 - func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
   device_check: NoCheck # TensorIterator
   dispatch:
-    CPU, CUDA: norm_out
-    MPS: norm_out_mps
+    CPU, CUDA, MPS: norm_out
   tags: reduction

 # These four redispatch in their implementation, so OK to be CompositeImplicitAutograd
@@ -7080,8 +7177,7 @@
   device_check: NoCheck # TensorIterator
   variants: method, function
   dispatch:
-    CPU, CUDA: zero_
-    MPS: zero_mps_
+    CPU, CUDA, MPS: zero_
     Meta: zero_meta_
     SparseCPU, SparseCUDA, SparseMPS, SparseMeta: zero_sparse_
     SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: zero_sparse_csr_
@@ -7242,10 +7338,12 @@
 - func: addmm.dtype(Tensor self, Tensor mat1, Tensor mat2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1) -> Tensor
   dispatch:
     CUDA: _addmm_dtype_cuda
+    XPU: _addmm_dtype_xpu

 - func: addmm.dtype_out(Tensor self, Tensor mat1, Tensor mat2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CUDA: _addmm_dtype_out_cuda
+    XPU: _addmm_dtype_out_xpu

 - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
   structured_delegate: addmm.out
@@ -7287,14 +7385,18 @@
 - func: _scaled_mm_v2(Tensor self, Tensor mat2, Tensor[] scale_a, int[] recipe_a, int[] swizzle_a, Tensor[] scale_b, int[] recipe_b, int[] swizzle_b, Tensor? bias, ScalarType? out_dtype, int[] contraction_dim=[], bool use_fast_accum=False) -> Tensor
   variants: function
   dispatch:
+    CPU: _scaled_mm_cpu_v2
     CUDA: _scaled_mm_cuda_v2
     XPU: _scaled_mm_xpu_v2
+  tags: needs_exact_strides

 - func: _scaled_mm_v2.out(Tensor self, Tensor mat2, Tensor[] scale_a, int[] recipe_a, int[] swizzle_a, Tensor[] scale_b, int[] recipe_b, int[] swizzle_b, Tensor? bias, ScalarType? out_dtype, int[] contraction_dim=[], bool use_fast_accum=False, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   dispatch:
+    CPU: _scaled_mm_cpu_v2_out
     CUDA: _scaled_mm_cuda_v2_out
     XPU: _scaled_mm_xpu_v2_out
+  tags: needs_exact_strides


 - func: _scaled_grouped_mm(Tensor self, Tensor mat2, Tensor scale_a, Tensor scale_b, Tensor? offs=None, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None, bool use_fast_accum=False) -> Tensor
@@ -8380,6 +8482,7 @@
   dispatch:
     CPU: index_reduce_cpu_out
     CUDA: index_reduce_cuda_out
+    MPS: index_reduce_mps_out

 - func: index_reduce_(Tensor(a!) self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor(a!)
   structured_delegate: index_reduce.out
@@ -8393,9 +8496,7 @@
   device_check: NoCheck # TensorIterator
   variants: method
   dispatch:
-    CPU: index_fill_
-    CUDA: index_fill_
-    MPS: index_fill_mps_
+    CPU, CUDA, MPS: index_fill_
   autogen: index_fill.int_Scalar_out

 - func: index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
@@ -8408,8 +8509,7 @@
   device_check: NoCheck # TensorIterator
   variants: method
   dispatch:
-    CPU, CUDA: index_fill_
-    MPS: index_fill_mps_
+    CPU, CUDA, MPS: index_fill_
   autogen: index_fill.int_Tensor_out

 - func: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor
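Both `index_fill_` overloads collapse their per-backend entries into a single `CPU, CUDA, MPS` dispatch line above; behavior is unchanged:

```python
import torch

t = torch.zeros(3, 4)
t.index_fill_(0, torch.tensor([0, 2]), 7.0)   # fill rows 0 and 2 (Scalar overload)
assert t.sum() == 7.0 * 8                     # 2 rows x 4 columns
```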
@@ -8755,14 +8855,14 @@
   device_check: NoCheck # TensorIterator
   variants: method, function
   dispatch:
-    CPU, CUDA, MPS: __lshift__
+    CPU, CUDA, MPS, MTIA: __lshift__
   tags: pointwise

 - func: __lshift__.Tensor(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck # TensorIterator
   variants: method, function
   dispatch:
-    CPU, CUDA, MPS: __lshift__
+    CPU, CUDA, MPS, MTIA: __lshift__
   tags: pointwise

 - func: __ilshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
@@ -8834,14 +8934,14 @@
   device_check: NoCheck # TensorIterator
   variants: method, function
   dispatch:
-    CPU, CUDA, MPS: __rshift__
+    CPU, CUDA, MPS, MTIA: __rshift__
   tags: pointwise

 - func: __rshift__.Tensor(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck # TensorIterator
   variants: method, function
   dispatch:
-    CPU, CUDA, MPS: __rshift__
+    CPU, CUDA, MPS, MTIA: __rshift__
   tags: pointwise

 - func: __irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
@@ -8996,6 +9096,7 @@
   tags: nondeterministic_seeded
   dispatch:
     CPU, CUDA: cauchy_
+    MPS: cauchy_mps_
   autogen: cauchy, cauchy.out

 - func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!)
@@ -9004,6 +9105,7 @@
   variants: method
   dispatch:
     CPU, CUDA: log_normal_
+    MPS: log_normal_mps_
   autogen: log_normal, log_normal.out

 - func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!)
@@ -9021,6 +9123,7 @@
   variants: method
   dispatch:
     CPU, CUDA: geometric_
+    MPS: geometric_mps_

   # wrappers for TH functions
   autogen: geometric, geometric.out
9525
9628
  dispatch:
9526
9629
  CPU: nonzero_static_out_cpu
9527
9630
  CUDA: nonzero_static_out_cuda
9631
+ MPS: nonzero_static_out_mps
9528
9632
 
9529
9633
  - func: nonzero_static(Tensor self, *, SymInt size, int fill_value=-1) -> Tensor
9530
9634
  variants: method, function
9531
9635
  dispatch:
9532
9636
  CPU: nonzero_static_cpu
9533
9637
  CUDA: nonzero_static_cuda
9638
+ MPS: nonzero_static_mps
9534
9639
 
9535
9640
  - func: nonzero_numpy(Tensor self) -> Tensor[]
9536
9641
  variants: method, function
@@ -9695,16 +9800,17 @@
9695
9800
  dispatch:
9696
9801
  CPU: _cholesky_solve_helper_cpu
9697
9802
  CUDA: _cholesky_solve_helper_cuda
9803
+ MPS: _cholesky_solve_helper_mps
9698
9804
  autogen: _cholesky_solve_helper.out
9699
9805
 
9700
9806
  - func: cholesky_inverse(Tensor self, bool upper=False) -> Tensor
9701
9807
  variants: method, function
9702
9808
  dispatch:
9703
- CPU, CUDA: cholesky_inverse
9809
+ CPU, CUDA, MPS: cholesky_inverse
9704
9810
 
9705
9811
  - func: cholesky_inverse.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
9706
9812
  dispatch:
9707
- CPU, CUDA: cholesky_inverse_out
9813
+ CPU, CUDA, MPS: cholesky_inverse_out
9708
9814
 
9709
9815
  - func: qr.Q(Tensor self, bool some=True, *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)
9710
9816
 
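`_cholesky_solve_helper` and both `cholesky_inverse` entries gain MPS support above. The identities involved, as a quick reference:

```python
import torch

A = torch.randn(4, 4)
A = A @ A.T + 4 * torch.eye(4)        # symmetric positive definite
L = torch.linalg.cholesky(A)          # A = L @ L.T

# cholesky_inverse(L) reconstructs A^-1 from the factor
assert torch.allclose(torch.cholesky_inverse(L), torch.inverse(A), atol=1e-5)

# cholesky_solve solves A x = b given the factor
b = torch.randn(4, 2)
x = torch.cholesky_solve(b, L)
assert torch.allclose(A @ x, b, atol=1e-5)
```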
@@ -9773,8 +9879,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: lgamma_out
-    MPS: lgamma_out_mps
+    CPU, CUDA, MPS: lgamma_out
   tags: pointwise

 - func: lgamma_(Tensor(a!) self) -> Tensor(a!)
@@ -9794,8 +9899,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: digamma_out
-    MPS: digamma_out_mps
+    CPU, CUDA, MPS: digamma_out
   tags: pointwise

 - func: digamma(Tensor self) -> Tensor
@@ -9809,8 +9913,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: polygamma_out
-    MPS: polygamma_out_mps
+    CPU, CUDA, MPS: polygamma_out
   tags: pointwise

 - func: polygamma(int n, Tensor self) -> Tensor
@@ -9931,8 +10034,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: atan2_out
-    MPS: atan2_out_mps
+    CPU, CUDA, MPS: atan2_out
   tags: [core, pointwise]

 - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)
@@ -9970,8 +10072,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: lerp_Tensor
-    MPS: lerp_Tensor_mps
+    CPU, CUDA, MPS: lerp_Tensor
   tags: pointwise

 - func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor
@@ -10256,8 +10357,7 @@
   structured_inherits: TensorIteratorBase
   device_check: NoCheck # TensorIterator
   dispatch:
-    CPU, CUDA, MTIA: maximum_out
-    MPS: maximum_out_mps
+    CPU, CUDA, MTIA, MPS: maximum_out
   tags: pointwise

 # binary max, alias of maximum
@@ -10289,8 +10389,7 @@
   structured_inherits: TensorIteratorBase
   device_check: NoCheck # TensorIterator
   dispatch:
-    CPU, CUDA, MTIA: minimum_out
-    MPS: minimum_out_mps
+    CPU, CUDA, MTIA, MPS: minimum_out
   tags: pointwise

 # binary min, alias for minimum
@@ -10496,9 +10595,8 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: pow_Tensor_Scalar_out
+    CPU, CUDA, MPS: pow_Tensor_Scalar_out
     SparseCPU, SparseCUDA, SparseMPS: pow_out_sparse_scalar
-    MPS: pow_tensor_scalar_out_mps
   tags: pointwise

 - func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor
@@ -11524,6 +11622,15 @@
     MTIA: foreach_tensor_norm_mtia
   autogen: _foreach_norm.Scalar_out

+# Like _foreach_norm but returns sum(|x|^ord) without the final root
+- func: _foreach_powsum.Scalar(Tensor[] self, Scalar ord=2, ScalarType? dtype=None) -> Tensor[]
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: foreach_tensor_powsum_slow
+    CUDA: foreach_tensor_powsum_cuda
+  autogen: _foreach_powsum.Scalar_out
+
 - func: _foreach_pow.List(Tensor[] self, Tensor[] exponent) -> Tensor[]
   device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
   variants: function
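The schema comment pins down the new op: per tensor, `_foreach_powsum` returns `sum(|x|^ord)`, i.e. `_foreach_norm` without the final `1/ord` root. A plain-torch sketch of that relationship (the new op itself has a CUDA fast path plus a slow composite fallback):

```python
import torch

tensors = [torch.randn(3), torch.randn(5)]
ord = 2.0
powsums = [t.abs().pow(ord).sum() for t in tensors]   # what _foreach_powsum returns
norms = torch._foreach_norm(tensors, ord)             # existing op
for ps, n in zip(powsums, norms):
    assert torch.allclose(ps ** (1.0 / ord), n)       # norm = powsum^(1/ord)
```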
@@ -11750,6 +11857,14 @@
     CUDA: foreach_tensor_zero_cuda_
   autogen: _foreach_zero, _foreach_zero.out

+- func: _foreach_clone(Tensor[] self, *, MemoryFormat? memory_format=None) -> Tensor[]
+  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: foreach_tensor_clone_slow
+    CUDA: foreach_tensor_clone_cuda
+  autogen: _foreach_clone.out
+
 - func: _foreach_copy_(Tensor(a!)[] self, Tensor[] src, bool non_blocking=False) -> ()
   device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
   variants: function
@@ -12083,6 +12198,7 @@
   structured_delegate: elu.out
   device_check: NoCheck # TensorIterator
   python_module: nn
+  tags: pointwise

 - func: glu.out(Tensor self, int dim=-1, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -12144,6 +12260,7 @@
   structured_delegate: hardsigmoid.out
   device_check: NoCheck # TensorIterator
   python_module: nn
+  tags: pointwise

 - func: hardsigmoid_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
   structured: True
@@ -12189,6 +12306,7 @@
   dispatch:
     CPU, CUDA, MPS: hardtanh_
     QuantizedCPU: hardtanh_quantized_cpu_
+  tags: pointwise

 - func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
@@ -12229,7 +12347,7 @@
   python_module: nn
   dispatch:
     QuantizedCPU: leaky_relu_quantized_cpu
-  tags: core
+  tags: [core, pointwise]

 - func: leaky_relu_backward.grad_input(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result, *, Tensor(a!) grad_input) -> Tensor(a!)
   structured: True
@@ -12248,6 +12366,7 @@
   python_module: nn
   dispatch:
     QuantizedCPU: leaky_relu_quantized_cpu_
+  tags: pointwise

 - func: log_sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
@@ -12904,6 +13023,10 @@
   python_module: nn
   autogen: _upsample_bicubic2d_aa.vec_out

+- func: _upsample_lanczos2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+  python_module: nn
+  autogen: _upsample_lanczos2d_aa.vec_out
+
 - func: upsample_nearest1d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
   python_module: nn
   autogen: upsample_nearest1d.vec_out
@@ -13050,6 +13173,26 @@
   python_module: nn
   structured_delegate: _upsample_bicubic2d_aa_backward.grad_input

+- func: _upsample_lanczos2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: _upsample_lanczos2d_aa_out_cpu
+
+- func: _upsample_lanczos2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: _upsample_lanczos2d_aa.out
+
+- func: _upsample_lanczos2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+  python_module: nn
+  structured: True
+  dispatch:
+    CPU: _upsample_lanczos2d_aa_backward_out_cpu
+
+- func: _upsample_lanczos2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+  python_module: nn
+  structured_delegate: _upsample_lanczos2d_aa_backward.grad_input
+
 - func: upsample_trilinear3d.out(Tensor self, SymInt[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
   structured: True
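This release adds a CPU-only `_upsample_lanczos2d_aa` family alongside the existing anti-aliased bilinear/bicubic ops. For background, the Lanczos reconstruction kernel such filters are built from, a windowed sinc with support `a` (commonly 3; the op's exact window choice is not shown in this diff):

```python
import math

def lanczos(x: float, a: int = 3) -> float:
    """Lanczos kernel: sinc(x) * sinc(x/a) for |x| < a, else 0."""
    if x == 0.0:
        return 1.0
    if abs(x) >= a:
        return 0.0
    px = math.pi * x
    return a * math.sin(px) * math.sin(px / a) / (px * px)
```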
@@ -13608,7 +13751,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: special_erfcx_out
+    CPU, CUDA, MPS: special_erfcx_out
   tags: pointwise

 - func: special_erfinv(Tensor self) -> Tensor
@@ -14471,8 +14614,18 @@
   python_module: linalg
   structured: True
   dispatch:
-    CPU, CUDA: linalg_vector_norm_out
-    MPS: linalg_vector_norm_out_mps
+    CPU, CUDA, MPS: linalg_vector_norm_out
+  tags: reduction
+
+# Computes sum(|x|^ord) - the "power sum" without the final root.
+# This is useful for distributed computing where partial power sums
+# can be reduced across shards before taking the final root.
+- func: linalg__powsum(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+  python_module: linalg
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: linalg__powsum_slow
+    CPU, CUDA: linalg__powsum
   tags: reduction

 - func: linalg_matrix_norm(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
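The schema comments give the motivation: per-shard power sums are additive, so a global vector norm needs only one scalar reduce before a single final root. A sketch of that pattern with plain torch ops (the private `linalg__powsum` fuses the per-shard step):

```python
import torch

shards = [torch.randn(100) for _ in range(4)]  # stand-ins for per-rank shards
ord = 2.0

# Per-shard: sum(|x|^ord) -- additive across shards, unlike the norm itself.
partial = torch.stack([s.abs().pow(ord).sum() for s in shards])
global_norm = partial.sum() ** (1.0 / ord)     # root taken once, after the reduce

full = torch.cat(shards)
assert torch.allclose(global_norm, torch.linalg.vector_norm(full, ord))
```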
@@ -14622,6 +14775,7 @@
   structured: True
   dispatch:
     CPU, CUDA: linalg_qr_out
+    MPS: linalg_qr_out_mps

 - func: linalg_matrix_power(Tensor self, int n) -> Tensor
   python_module: linalg
@@ -15122,7 +15276,7 @@
   variants: function
   tags: nondeterministic_seeded

-- func: _scaled_dot_product_attention_math_for_mps(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None, *, float? scale=None) -> (Tensor, Tensor)
+- func: _scaled_dot_product_attention_math_for_mps(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None, *, float? scale=None, bool enable_gqa=False) -> (Tensor, Tensor)
   dispatch:
     MPS: _scaled_dot_product_attention_math_mps
   tags: nondeterministic_seeded
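The MPS math fallback now accepts `enable_gqa`, matching the public SDPA entry point in recent PyTorch releases, where grouped-query attention lets several query heads share one key/value head:

```python
import torch
import torch.nn.functional as F

q = torch.randn(2, 8, 16, 64)   # 8 query heads
k = torch.randn(2, 2, 16, 64)   # 2 KV heads, each shared by 4 query heads
v = torch.randn(2, 2, 16, 64)
out = F.scaled_dot_product_attention(q, k, v, enable_gqa=True)
assert out.shape == (2, 8, 16, 64)
```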
@@ -15134,6 +15288,11 @@
     NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda
   tags: nondeterministic_seeded

+- func: _scaled_dot_product_flash_attention.quantized(Tensor query, Tensor key, Tensor value, Tensor? q_descale, Tensor? k_descale, Tensor? v_descale, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor rng_state, Tensor unused, Tensor debug_attn_mask)
+  dispatch:
+    CUDA: _scaled_dot_product_flash_attention_cuda_quantized
+  tags: nondeterministic_seeded
+
 - func: _scaled_dot_product_flash_attention_for_cpu(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor output, Tensor logsumexp)
   dispatch:
     CPU: _scaled_dot_product_flash_attention_cpu
@@ -15189,12 +15348,24 @@
     NestedTensorCUDA: _scaled_dot_product_cudnn_attention_nestedtensor_backward_cuda
   tags: nondeterministic_seeded

-- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? alibi_slopes=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor rng_state, Tensor unused, Tensor debug_attn_mask)
+- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? alibi_slopes=None, Tensor? block_table=None, int? num_splits=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor rng_state, Tensor unused, Tensor debug_attn_mask)
   variants: function
   dispatch:
     CUDA: _flash_attention_forward
   tags: nondeterministic_seeded

+- func: _flash_attention_forward_no_dropout_inplace(Tensor(a!) out, Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? alibi_slopes=None, Tensor? block_table=None, int? num_splits=None) -> Tensor softmax_logsumexp
+  variants: function
+  dispatch:
+    CUDA: _flash_attention_forward_no_dropout_inplace
+  tags: nondeterministic_seeded
+
+- func: _flash_attention_forward.quantized(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, Tensor? q_descale, Tensor? k_descale, Tensor? v_descale, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? alibi_slopes=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor rng_state, Tensor unused, Tensor debug_attn_mask)
+  variants: function
+  dispatch:
+    CUDA: _flash_attention_forward_quantized
+  tags: nondeterministic_seeded
+
 - func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor rng_state, Tensor unused, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None) -> (Tensor, Tensor, Tensor)
   device_check: NoCheck
   variants: function