torch-rb 0.13.2 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
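The hunks below modify what appears to be PyTorch's native_functions.yaml operator schema file bundled with the gem. Each entry pairs a func signature with optional per-backend dispatch kernels plus metadata such as tags, variants, autogen, and structured_delegate. As a minimal sketch of that layout, here is the first hunk's newly added _assert_async.msg entry re-indented for readability (the diff view flattens the YAML indentation):

- func: _assert_async.msg(Tensor self, str assert_msg) -> ()
  dispatch:
    CPU: _assert_async_msg_cpu
    CUDA: _assert_async_msg_cuda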
@@ -170,8 +170,36 @@
170
170
  CPU: _assert_async_cpu
171
171
  CUDA: _assert_async_cuda
172
172
 
173
+ - func: _assert_async.msg(Tensor self, str assert_msg) -> ()
174
+ dispatch:
175
+ CPU: _assert_async_msg_cpu
176
+ CUDA: _assert_async_msg_cuda
177
+
178
+ - func: _functional_assert_async.msg(Tensor self, str assert_msg, Tensor dep_token) -> Tensor
179
+ dispatch:
180
+ CPU: _functional_assert_async_msg_cpu
181
+
182
+ - func: _assert_tensor_metadata(Tensor a, SymInt[]? size=None, SymInt[]? stride=None, ScalarType? dtype=None) -> ()
183
+
184
+ - func: sym_constrain_range(Scalar size, *, int? min=None, int? max=None) -> ()
185
+ dispatch:
186
+ CompositeExplicitAutograd: sym_constrain_range
187
+
188
+ - func: sym_constrain_range_for_size(Scalar size, *, int? min, int? max) -> ()
189
+ dispatch:
190
+ CompositeExplicitAutograd: sym_constrain_range_for_size
173
191
 
174
- - func: _assert_tensor_metadata(Tensor a, int[]? size=None, int[]? stride=None, ScalarType? dtype=None) -> ()
192
+ - func: _functional_sym_constrain_range(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor
193
+ dispatch:
194
+ CompositeExplicitAutograd: _functional_sym_constrain_range
195
+
196
+ - func: _functional_sym_constrain_range_for_size(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor
197
+ dispatch:
198
+ CompositeExplicitAutograd: _functional_sym_constrain_range_for_size
199
+
200
+ - func: _make_dep_token(*, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
201
+ dispatch:
202
+ CPU: _make_dep_token_cpu
175
203
 
176
204
  - func: refine_names(Tensor(a) self, Dimname[] names) -> Tensor(a)
177
205
  variants: method
@@ -211,6 +239,7 @@
211
239
  dispatch:
212
240
  CUDA: _cudnn_rnn
213
241
  autogen: _cudnn_rnn.out
242
+ tags: nondeterministic_seeded
214
243
 
215
244
  - func: _cudnn_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[])
216
245
  dispatch:
@@ -221,6 +250,7 @@
221
250
  dispatch:
222
251
  CUDA: _cudnn_init_dropout_state
223
252
  autogen: _cudnn_init_dropout_state.out
253
+ tags: nondeterministic_seeded
224
254
 
225
255
  - func: _debug_has_internal_overlap(Tensor self) -> int
226
256
  variants: function
@@ -297,6 +327,7 @@
297
327
  CompositeExplicitAutograd: abs
298
328
  SparseCPU, SparseCUDA: abs_sparse
299
329
  SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr
330
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs
300
331
  tags: [core, pointwise]
301
332
 
302
333
  - func: abs_(Tensor(a!) self) -> Tensor(a!)
@@ -306,6 +337,7 @@
306
337
  CompositeExplicitAutograd: abs_
307
338
  SparseCPU, SparseCUDA: abs_sparse_
308
339
  SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_
340
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs_
309
341
 
310
342
  - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
311
343
  device_check: NoCheck # TensorIterator
@@ -374,7 +406,7 @@
374
406
  - func: view_as_complex(Tensor(a) self) -> Tensor(a)
375
407
  variants: function
376
408
  dispatch:
377
- CPU, CUDA, Meta: view_as_complex
409
+ CPU, CUDA, MPS, Meta: view_as_complex
378
410
 
379
411
  - func: sgn(Tensor self) -> Tensor
380
412
  variants: function, method
@@ -382,6 +414,7 @@
382
414
  dispatch:
383
415
  SparseCPU, SparseCUDA: sgn_sparse
384
416
  SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr
417
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn
385
418
  tags: pointwise
386
419
 
387
420
  - func: sgn_(Tensor(a!) self) -> Tensor(a!)
@@ -390,6 +423,7 @@
390
423
  dispatch:
391
424
  SparseCPU, SparseCUDA: sgn_sparse_
392
425
  SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_
426
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn_
393
427
  tags: pointwise
394
428
 
395
429
  - func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -488,8 +522,10 @@
488
522
  - func: arccos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
489
523
 
490
524
  - func: avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor
525
+ tags: core
491
526
 
492
527
  - func: adaptive_avg_pool1d(Tensor self, int[1] output_size) -> Tensor
528
+ tags: core
493
529
 
494
530
  # Return: (Tensor output, Tensor indices)
495
531
  - func: adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor)
@@ -610,13 +646,13 @@
610
646
  MPS: addr_out_mps
611
647
  CompositeExplicitAutograd: math_addr_out
612
648
 
613
- - func: affine_grid_generator(Tensor theta, int[] size, bool align_corners) -> Tensor
649
+ - func: affine_grid_generator(Tensor theta, SymInt[] size, bool align_corners) -> Tensor
614
650
  variants: function
615
651
  dispatch:
616
652
  CompositeExplicitAutograd: affine_grid_generator
617
653
  autogen: affine_grid_generator.out
618
654
 
619
- - func: affine_grid_generator_backward(Tensor grad, int[] size, bool align_corners) -> Tensor
655
+ - func: affine_grid_generator_backward(Tensor grad, SymInt[] size, bool align_corners) -> Tensor
620
656
  variants: function
621
657
 
622
658
  - func: _is_all_true(Tensor self) -> Tensor
@@ -633,6 +669,13 @@
633
669
  - func: _test_check_tensor(Tensor self) -> Tensor
634
670
  variants: function
635
671
 
672
+ # Note: this function is only for testing
673
+ - func: _test_functorch_fallback(Tensor self, Tensor other) -> Tensor
674
+ variants: function
675
+ dispatch:
676
+ CPU: _test_functorch_fallback
677
+ autogen: _test_functorch_fallback.out
678
+
636
679
  - func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor
637
680
  device_check: NoCheck # TensorIterator
638
681
  structured_delegate: all.out
@@ -664,6 +707,7 @@
664
707
  device_check: NoCheck # TensorIterator
665
708
  structured_delegate: any.out
666
709
  variants: function, method
710
+ tags: core
667
711
 
668
712
  - func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
669
713
  device_check: NoCheck # TensorIterator
@@ -1108,6 +1152,7 @@
1108
1152
  structured_inherits: TensorIteratorBase
1109
1153
  dispatch:
1110
1154
  CPU, CUDA: bitwise_not_out
1155
+ MPS: bitwise_not_out_mps
1111
1156
  tags: pointwise
1112
1157
 
1113
1158
  - func: copysign.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -1115,7 +1160,7 @@
1115
1160
  structured: True
1116
1161
  structured_inherits: TensorIteratorBase
1117
1162
  dispatch:
1118
- CPU, CUDA: copysign_out
1163
+ CPU, CUDA, MPS: copysign_out
1119
1164
  tags: pointwise
1120
1165
 
1121
1166
  - func: copysign.Tensor(Tensor self, Tensor other) -> Tensor
@@ -1150,6 +1195,7 @@
1150
1195
  variants: function, method
1151
1196
  dispatch:
1152
1197
  CompositeExplicitAutograd: logical_not
1198
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not
1153
1199
  tags: [core, pointwise]
1154
1200
 
1155
1201
  - func: logical_not_(Tensor(a!) self) -> Tensor(a!)
@@ -1157,6 +1203,7 @@
1157
1203
  variants: method
1158
1204
  dispatch:
1159
1205
  CompositeExplicitAutograd: logical_not_
1206
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not_
1160
1207
  tags: pointwise
1161
1208
 
1162
1209
  - func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -1171,7 +1218,7 @@
1171
1218
  variants: function, method
1172
1219
  dispatch:
1173
1220
  CompositeExplicitAutograd: logical_xor
1174
- tags: pointwise
1221
+ tags: [core, pointwise]
1175
1222
 
1176
1223
  - func: logical_xor_(Tensor(a!) self, Tensor other) -> Tensor(a!)
1177
1224
  device_check: NoCheck # TensorIterator
@@ -1326,7 +1373,7 @@
1326
1373
  dispatch:
1327
1374
  SparseCPU, SparseCUDA: ceil_sparse
1328
1375
  SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr
1329
- tags: pointwise
1376
+ tags: [core, pointwise]
1330
1377
 
1331
1378
  - func: ceil_(Tensor(a!) self) -> Tensor(a!)
1332
1379
  device_check: NoCheck # TensorIterator
@@ -1393,7 +1440,7 @@
1393
1440
  - func: clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor
1394
1441
  variants: function, method
1395
1442
  structured_delegate: clamp.Tensor_out
1396
- tags: pointwise
1443
+ tags: [core, pointwise]
1397
1444
 
1398
1445
  - func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)
1399
1446
  device_check: NoCheck # TensorIterator
@@ -1552,6 +1599,7 @@
1552
1599
  - func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!)
1553
1600
  dispatch:
1554
1601
  CPU, CUDA: polar_out
1602
+ MPS: polar_out_mps
1555
1603
 
1556
1604
  - func: constant_pad_nd(Tensor self, SymInt[] pad, Scalar value=0) -> Tensor
1557
1605
  variants: function
@@ -1598,11 +1646,17 @@
1598
1646
 
1599
1647
  - func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
1600
1648
 
1601
- - func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor
1649
+ - func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, SymInt[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor
1650
+ dispatch:
1651
+ CompositeImplicitAutograd: conv1d_symint
1602
1652
 
1603
- - func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor
1653
+ - func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor
1654
+ dispatch:
1655
+ CompositeImplicitAutograd: conv2d_symint
1604
1656
 
1605
- - func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor
1657
+ - func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor
1658
+ dispatch:
1659
+ CompositeImplicitAutograd: conv3d_symint
1606
1660
 
1607
1661
  - func: conv1d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, str padding="valid", int[1] dilation=1, int groups=1) -> Tensor
1608
1662
  cpp_no_default_args: ['bias', 'stride', 'padding']
@@ -1621,11 +1675,17 @@
1621
1675
  - func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int pad) -> (Tensor, Tensor, Tensor)
1622
1676
 
1623
1677
  # NB: we inherit the goofy argument order from PyTorch torch.nn.functional
1624
- - func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor
1678
+ - func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, SymInt[1] padding=0, SymInt[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor
1679
+ dispatch:
1680
+ CompositeImplicitAutograd: conv_transpose1d_symint
1625
1681
 
1626
- - func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int groups=1, int[2] dilation=1) -> Tensor
1682
+ - func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int groups=1, int[2] dilation=1) -> Tensor
1683
+ dispatch:
1684
+ CompositeImplicitAutograd: conv_transpose2d_symint
1627
1685
 
1628
- - func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor
1686
+ - func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor
1687
+ dispatch:
1688
+ CompositeImplicitAutograd: conv_transpose3d_symint
1629
1689
 
1630
1690
  - func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor
1631
1691
  variants: function
@@ -1850,6 +1910,7 @@
1850
1910
  device_check: NoCheck # TensorIterator
1851
1911
  dispatch:
1852
1912
  CPU, CUDA: cumprod_out
1913
+ MPS: cumprod_out_mps
1853
1914
 
1854
1915
  - func: cumprod.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
1855
1916
  device_check: NoCheck # TensorIterator
@@ -1870,6 +1931,7 @@
1870
1931
  structured_delegate: cumsum.out
1871
1932
  device_check: NoCheck # TensorIterator
1872
1933
  variants: function, method
1934
+ tags: core
1873
1935
 
1874
1936
  - func: cumsum_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!)
1875
1937
  structured_delegate: cumsum.out
@@ -2145,6 +2207,7 @@
2145
2207
  CompositeExplicitAutograd: embedding_symint
2146
2208
  NestedTensorCPU, NestedTensorCUDA: NestedTensor_embedding
2147
2209
  autogen: embedding.out
2210
+ tags: core
2148
2211
 
2149
2212
  - func: embedding_backward(Tensor grad, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor
2150
2213
  dispatch:
@@ -2202,6 +2265,7 @@
2202
2265
  CPU: _embedding_bag_cpu
2203
2266
  CUDA: _embedding_bag_cuda
2204
2267
  autogen: _embedding_bag.out
2268
+ tags: core
2205
2269
 
2206
2270
  - func: _embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
2207
2271
  dispatch:
@@ -2240,6 +2304,12 @@
2240
2304
  SparseCPU, SparseCUDA, SparseMeta: empty_sparse
2241
2305
  SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed
2242
2306
  QuantizedCPU, QuantizedCUDA, QuantizedMeta: empty_unknown_quantized
2307
+ tags: core
2308
+
2309
+ - func: empty_permuted(SymInt[] size, int[] physical_layout, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
2310
+ dispatch:
2311
+ CompositeExplicitAutograd: empty_permuted_symint
2312
+ autogen: empty_permuted.out
2243
2313
 
2244
2314
  # We do not make new_empty a composite that calls into new_empty_strided, as the strided version
2245
2315
  # is significantly more difficult to implement by different backends
@@ -2280,7 +2350,7 @@
2280
2350
  autogen: new_ones.out
2281
2351
 
2282
2352
  # other overrides are to provide a more helpful error message that dtype is required
2283
- - func: _empty_affine_quantized(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor
2353
+ - func: _empty_affine_quantized(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor
2284
2354
  dispatch:
2285
2355
  CPU: empty_affine_quantized_other_backends_stub
2286
2356
  QuantizedCPU, QuantizedCUDA: empty_affine_quantized
@@ -2288,7 +2358,7 @@
2288
2358
 
2289
2359
  # it's a factory function receiving a tensor argument, thus overriding explicitly
2290
2360
  # other overrides are to provide a more helpful error message that dtype is required
2291
- - func: _empty_per_channel_affine_quantized(int[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor
2361
+ - func: _empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor
2292
2362
  category_override: factory
2293
2363
  dispatch:
2294
2364
  CPU: empty_per_channel_affine_quantized_other_backends_stub
@@ -2313,7 +2383,7 @@
2313
2383
  # This is a utility function to enable users to resize out tensor while registering kernels for out variants.
2314
2384
  # Eventually, we can consider exposing `resize_output` as a public API to ship it with python op registration
2315
2385
  # to make it easy to register out variants for ops.
2316
- - func: _resize_output_(Tensor(a!) self, int[] size, Device device) -> Tensor(a!)
2386
+ - func: _resize_output_(Tensor(a!) self, SymInt[] size, Device device) -> Tensor(a!)
2317
2387
  use_const_ref_for_mutable_tensors: True
2318
2388
  variants: function
2319
2389
  dispatch:
@@ -2483,21 +2553,21 @@
2483
2553
  device_guard: False
2484
2554
 
2485
2555
  # decomposes to eye.m
2486
- - func: eye(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
2556
+ - func: eye(SymInt n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
2487
2557
  dispatch:
2488
2558
  CompositeExplicitAutograd: eye
2489
2559
 
2490
- - func: eye.m(int n, int m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
2560
+ - func: eye.m(SymInt n, SymInt m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
2491
2561
  dispatch:
2492
2562
  CompositeExplicitAutograd: eye
2493
2563
 
2494
- - func: eye.out(int n, *, Tensor(a!) out) -> Tensor(a!)
2564
+ - func: eye.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
2495
2565
  dispatch:
2496
2566
  CPU, Meta: eye_out_cpu
2497
2567
  CUDA: eye_out_cuda
2498
2568
  MPS: eye_out_mps
2499
2569
 
2500
- - func: eye.m_out(int n, int m, *, Tensor(a!) out) -> Tensor(a!)
2570
+ - func: eye.m_out(SymInt n, SymInt m, *, Tensor(a!) out) -> Tensor(a!)
2501
2571
  dispatch:
2502
2572
  CPU, Meta: eye_out_cpu
2503
2573
  CUDA: eye_out_cuda
@@ -2515,11 +2585,15 @@
2515
2585
  - func: flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a)
2516
2586
  variants: function, method
2517
2587
 
2518
- - func: unflatten.int(Tensor(a) self, int dim, int[] sizes) -> Tensor(a)
2588
+ - func: unflatten.int(Tensor(a) self, int dim, SymInt[] sizes) -> Tensor(a)
2519
2589
  variants: function, method
2590
+ dispatch:
2591
+ CompositeImplicitAutograd: unflatten_symint
2520
2592
 
2521
- - func: unflatten.Dimname(Tensor(a) self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor(a)
2593
+ - func: unflatten.Dimname(Tensor(a) self, Dimname dim, SymInt[] sizes, Dimname[] names) -> Tensor(a)
2522
2594
  variants: function, method
2595
+ dispatch:
2596
+ CompositeImplicitAutograd: unflatten_dimname_symint
2523
2597
 
2524
2598
  - func: fill.Scalar(Tensor self, Scalar value) -> Tensor
2525
2599
  variants: function
@@ -2839,13 +2913,13 @@
2839
2913
  CUDA: _fft_r2c_cufft_out
2840
2914
 
2841
2915
  # Complex to real inverse FFT
2842
- - func: _fft_c2r(Tensor self, int[] dim, int normalization, int last_dim_size) -> Tensor
2916
+ - func: _fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor
2843
2917
  variants: function
2844
2918
  dispatch:
2845
2919
  CPU: _fft_c2r_mkl
2846
2920
  CUDA: _fft_c2r_cufft
2847
2921
 
2848
- - func: _fft_c2r.out(Tensor self, int[] dim, int normalization, int last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
2922
+ - func: _fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
2849
2923
  variants: function
2850
2924
  dispatch:
2851
2925
  CPU: _fft_c2r_mkl_out
@@ -2871,13 +2945,13 @@
2871
2945
  CPU: _validate_compressed_sparse_indices_cpu
2872
2946
  CUDA: _validate_compressed_sparse_indices_cuda
2873
2947
 
2874
- - func: _cufft_get_plan_cache_size(int device_index) -> int
2948
+ - func: _cufft_get_plan_cache_size(DeviceIndex device_index) -> int
2875
2949
 
2876
- - func: _cufft_get_plan_cache_max_size(int device_index) -> int
2950
+ - func: _cufft_get_plan_cache_max_size(DeviceIndex device_index) -> int
2877
2951
 
2878
- - func: _cufft_set_plan_cache_max_size(int device_index, int max_size) -> ()
2952
+ - func: _cufft_set_plan_cache_max_size(DeviceIndex device_index, int max_size) -> ()
2879
2953
 
2880
- - func: _cufft_clear_plan_cache(int device_index) -> ()
2954
+ - func: _cufft_clear_plan_cache(DeviceIndex device_index) -> ()
2881
2955
 
2882
2956
  - func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
2883
2957
  device_check: NoCheck # TensorIterator
@@ -2885,7 +2959,7 @@
2885
2959
  variants: function, method
2886
2960
  dispatch:
2887
2961
  QuantizedCPU: quantized_index
2888
- tags: dynamic_output_shape
2962
+ tags: [core, dynamic_output_shape]
2889
2963
  # NB: This function is special-cased in tools/autograd/gen_variable_type.py
2890
2964
  # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp:
2891
2965
  # - Tensor Tensor::index(ArrayRef<TensorIndex> indices)
@@ -2900,6 +2974,13 @@
2900
2974
  dispatch:
2901
2975
  CPU, CUDA, MPS: index_out
2902
2976
 
2977
+ # Used by inductor to signal indexing without bounds checks
2978
+ # Note that we don't support boolean indexing, to avoid dynamic output shapes
2979
+ - func: _unsafe_index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
2980
+ variants: function
2981
+ dispatch:
2982
+ CPU, CUDA: _unsafe_index
2983
+
2903
2984
  - func: index_copy.out(Tensor self, int dim, Tensor index, Tensor source, *, Tensor(a!) out) -> Tensor(a!)
2904
2985
  structured: True
2905
2986
  variants: function
@@ -2939,6 +3020,13 @@
2939
3020
  variants: function, method
2940
3021
  dispatch:
2941
3022
  CompositeExplicitAutograd: index_put
3023
+ tags: core
3024
+
3025
+ - func: _unsafe_index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
3026
+ device_check: NoCheck # delegate to _index_put_impl_ after clone, which leverages TensorIterator
3027
+ variants: function
3028
+ dispatch:
3029
+ CompositeExplicitAutograd: _unsafe_index_put
2942
3030
 
2943
3031
  - func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!)
2944
3032
  device_check: NoCheck # TensorIterator
@@ -3097,6 +3185,7 @@
3097
3185
  CPU: layer_norm_backward_cpu
3098
3186
  CUDA: layer_norm_backward_cuda
3099
3187
  MPS: layer_norm_backward_mps
3188
+ NestedTensorCPU, NestedTensorCUDA: layer_norm_backward_nested
3100
3189
  autogen: native_layer_norm_backward.out
3101
3190
  tags: core
3102
3191
 
@@ -3160,6 +3249,18 @@
3160
3249
  MkldnnCPU: mkldnn_linear_backward
3161
3250
  autogen: mkldnn_linear_backward.out
3162
3251
 
3252
+ - func: _cslt_compress(Tensor input) -> Tensor
3253
+ dispatch:
3254
+ CUDA: _cslt_compress
3255
+
3256
+ - func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, bool transpose_result=False) -> Tensor
3257
+ dispatch:
3258
+ CUDA: _cslt_sparse_mm
3259
+
3260
+ - func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None) -> Tensor
3261
+ dispatch:
3262
+ CUDA: _sparse_semi_structured_linear
3263
+
3163
3264
  - func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
3164
3265
 
3165
3266
  - func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
@@ -3355,6 +3456,7 @@
3355
3456
  variants: function
3356
3457
  dispatch:
3357
3458
  CPU, CUDA: xlogy_out
3459
+ MPS: xlogy_out_mps
3358
3460
  tags: pointwise
3359
3461
 
3360
3462
  - func: xlogy.OutScalar_Self(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -3510,6 +3612,7 @@
3510
3612
  structured: True
3511
3613
  dispatch:
3512
3614
  CPU, CUDA: aminmax_out
3615
+ MPS: aminmax_out_mps
3513
3616
 
3514
3617
  - func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor
3515
3618
  dispatch:
@@ -3607,6 +3710,11 @@
3607
3710
  QuantizedCUDA: quantized_max_pool2d_cudnn
3608
3711
  autogen: quantized_max_pool2d.out
3609
3712
 
3713
+ - func: quantized_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
3714
+ dispatch:
3715
+ QuantizedCPU: quantized_max_pool3d
3716
+ autogen: quantized_max_pool3d.out
3717
+
3610
3718
  - func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
3611
3719
 
3612
3720
  # The CPU and GPU dispatch variants are named weirdly here because otherwise there
@@ -3616,6 +3724,7 @@
3616
3724
  variants: function, method
3617
3725
  dispatch:
3618
3726
  CompositeExplicitAutograd: mean
3727
+ tags: core
3619
3728
 
3620
3729
  # For normal naming convention this should be `mean.out`. However since we already have `mean.out` we have to rename this.
3621
3730
  # FIXME: fix CI jobs and re-enable this
@@ -3756,6 +3865,7 @@
3756
3865
  - func: mkldnn_rnn_layer(Tensor input, Tensor weight0, Tensor weight1, Tensor weight2, Tensor weight3, Tensor hx_, Tensor cx_, bool reverse, int[] batch_sizes, int mode, int hidden_size, int num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train) -> (Tensor, Tensor, Tensor, Tensor)
3757
3866
  dispatch:
3758
3867
  CPU: mkldnn_rnn_layer
3868
+ MkldnnCPU: mkldnn_rnn_layer
3759
3869
  autogen: mkldnn_rnn_layer.out
3760
3870
 
3761
3871
  - func: mkldnn_rnn_layer_backward(Tensor input, Tensor weight1, Tensor weight2, Tensor weight3, Tensor weight4, Tensor hx_, Tensor cx_tmp, Tensor output, Tensor hy_, Tensor cy_, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, bool reverse, int mode, int hidden_size, int num_layers, bool has_biases, bool train, bool bidirectional, int[] batch_sizes, bool batch_first, Tensor workspace) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)
@@ -3800,6 +3910,8 @@
3800
3910
  dispatch:
3801
3911
  CUDA: miopen_rnn
3802
3912
  autogen: miopen_rnn.out
3913
+ tags: nondeterministic_seeded
3914
+
3803
3915
 
3804
3916
  - func: miopen_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[])
3805
3917
  dispatch:
@@ -3823,6 +3935,14 @@
3823
3935
  SparseCPU, SparseCUDA: _sparse_mm_out
3824
3936
  SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out
3825
3937
 
3938
+ - func: _int_mm(Tensor self, Tensor mat2) -> Tensor
3939
+ dispatch:
3940
+ CUDA: _int_mm_cuda
3941
+
3942
+ - func: _int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
3943
+ dispatch:
3944
+ CUDA: _int_mm_out_cuda
3945
+
3826
3946
  - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
3827
3947
  python_module: sparse
3828
3948
 
@@ -3981,7 +4101,6 @@
3981
4101
  CUDA: batch_norm_cuda
3982
4102
  MPS: batch_norm_mps
3983
4103
  MkldnnCPU: mkldnn_batch_norm
3984
- tags: core
3985
4104
 
3986
4105
  - func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))
3987
4106
  dispatch:
@@ -3997,6 +4116,16 @@
3997
4116
  MPS: _batch_norm_legit_mps
3998
4117
  MkldnnCPU: _mkldnn_batch_norm_legit
3999
4118
  autogen: _native_batch_norm_legit_functional
4119
+ tags: core
4120
+
4121
+ # HACK: identical to _native_batch_norm_legit, but training is known to be False,
4122
+ # So we know that running stats will not be mutated.
4123
+ # The real fix here is batch norm consolidation.
4124
+ - func: _native_batch_norm_legit_no_training(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor)
4125
+ dispatch:
4126
+ CompositeExplicitAutograd: _batch_norm_legit_no_training
4127
+ autogen: _native_batch_norm_legit_no_training.out
4128
+ tags: core
4000
4129
 
4001
4130
  - func: _native_batch_norm_legit.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd) -> (Tensor(d!), Tensor(e!), Tensor(f!))
4002
4131
  dispatch:
@@ -4055,7 +4184,7 @@
4055
4184
  CUDA: batch_norm_backward_reduce_cuda
4056
4185
  autogen: batch_norm_backward_reduce.out
4057
4186
 
4058
- - func: batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor mean_dy, Tensor mean_dy_xmu, Tensor count) -> Tensor
4187
+ - func: batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor sum_dy, Tensor sum_dy_xmu, Tensor count) -> Tensor
4059
4188
  dispatch:
4060
4189
  CUDA: batch_norm_backward_elemt_cuda
4061
4190
  autogen: batch_norm_backward_elemt.out
@@ -4113,6 +4242,7 @@
4113
4242
  CPU, CUDA: _cdist_forward
4114
4243
  MPS: _cdist_forward_mps
4115
4244
  autogen: _cdist_forward.out
4245
+ tags: core
4116
4246
 
4117
4247
  - func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor
4118
4248
  dispatch:
@@ -4125,6 +4255,7 @@
4125
4255
  dispatch:
4126
4256
  CPU, CUDA: _pdist_forward
4127
4257
  autogen: _pdist_forward.out
4258
+ tags: core
4128
4259
 
4129
4260
  - func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor
4130
4261
  dispatch:
@@ -4185,6 +4316,7 @@
4185
4316
  CPU: pixel_shuffle_cpu
4186
4317
  CompositeExplicitAutogradNonFunctional: math_pixel_shuffle
4187
4318
  autogen: pixel_shuffle.out
4319
+ tags: core
4188
4320
 
4189
4321
  - func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor
4190
4322
  dispatch:
@@ -4194,7 +4326,7 @@
4194
4326
 
4195
4327
  - func: channel_shuffle(Tensor self, int groups) -> Tensor
4196
4328
  dispatch:
4197
- CPU: channel_shuffle
4329
+ CPU, CUDA: channel_shuffle
4198
4330
  QuantizedCPU: channel_shuffle_quantized_cpu
4199
4331
  autogen: channel_shuffle.out
4200
4332
 
@@ -4294,7 +4426,7 @@
4294
4426
  autogen: rand.generator_with_names_out
4295
4427
 
4296
4428
  - func: rand(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
4297
- tags: nondeterministic_seeded
4429
+ tags: [core, nondeterministic_seeded]
4298
4430
  dispatch:
4299
4431
  CompositeExplicitAutograd: rand
4300
4432
 
@@ -4319,47 +4451,47 @@
4319
4451
  CompositeExplicitAutograd: rand_like
4320
4452
  autogen: rand_like.out
4321
4453
 
4322
- - func: randint(int high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
4454
+ - func: randint(SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
4323
4455
  tags: nondeterministic_seeded
4324
4456
  dispatch:
4325
4457
  CompositeExplicitAutograd: randint
4326
4458
 
4327
- - func: randint.generator(int high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
4459
+ - func: randint.generator(SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
4328
4460
  tags: nondeterministic_seeded
4329
4461
  dispatch:
4330
4462
  CompositeExplicitAutograd: randint
4331
4463
 
4332
- - func: randint.low(int low, int high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
4464
+ - func: randint.low(SymInt low, SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
4333
4465
  tags: nondeterministic_seeded
4334
4466
  dispatch:
4335
4467
  CompositeExplicitAutograd: randint
4336
4468
 
4337
- - func: randint.low_generator(int low, int high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
4469
+ - func: randint.low_generator(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
4338
4470
  tags: nondeterministic_seeded
4339
4471
  dispatch:
4340
4472
  CompositeExplicitAutograd: randint
4341
4473
 
4342
- - func: randint.out(int high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
4474
+ - func: randint.out(SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
4343
4475
  tags: nondeterministic_seeded
4344
4476
  dispatch:
4345
4477
  CompositeExplicitAutograd: randint_out
4346
4478
 
4347
- - func: randint.generator_out(int high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
4479
+ - func: randint.generator_out(SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
4348
4480
  tags: nondeterministic_seeded
4349
4481
  dispatch:
4350
4482
  CompositeExplicitAutograd: randint_out
4351
4483
 
4352
- - func: randint.low_out(int low, int high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
4484
+ - func: randint.low_out(SymInt low, SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
4353
4485
  tags: nondeterministic_seeded
4354
4486
  dispatch:
4355
4487
  CompositeExplicitAutograd: randint_out
4356
4488
 
4357
- - func: randint.low_generator_out(int low, int high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
4489
+ - func: randint.low_generator_out(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
4358
4490
  tags: nondeterministic_seeded
4359
4491
  dispatch:
4360
4492
  CompositeExplicitAutograd: randint_out
4361
4493
 
4362
- - func: randint_like(Tensor self, int high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
4494
+ - func: randint_like(Tensor self, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
4363
4495
  tags: nondeterministic_seeded
4364
4496
  dispatch:
4365
4497
  # NB: Although this composite mutates on the inside, it is
@@ -4367,7 +4499,7 @@
4367
4499
  CompositeExplicitAutograd: randint_like
4368
4500
  autogen: randint_like.out
4369
4501
 
4370
- - func: randint_like.low_dtype(Tensor self, int low, int high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
4502
+ - func: randint_like.low_dtype(Tensor self, SymInt low, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
4371
4503
  tags: nondeterministic_seeded
4372
4504
  dispatch:
4373
4505
  # NB: Although this composite mutates on the inside, it is
@@ -4376,7 +4508,7 @@
4376
4508
  autogen: randint_like.low_dtype_out
4377
4509
 
4378
4510
  - func: randn(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
4379
- tags: nondeterministic_seeded
4511
+ tags: [core, nondeterministic_seeded]
4380
4512
  dispatch:
4381
4513
  CompositeExplicitAutograd: randn
4382
4514
 
@@ -4412,25 +4544,25 @@
4412
4544
  dispatch:
4413
4545
  # NB: Although this composite mutates on the inside, it is
4414
4546
  # non-differentiable so NonFunctional doesn't apply
4415
- CompositeExplicitAutograd: randn_like
4547
+ CompositeExplicitAutograd, CompositeImplicitAutogradNestedTensor: randn_like
4416
4548
  autogen: randn_like.out
4417
4549
 
4418
- - func: randperm(int n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
4419
- tags: nondeterministic_seeded
4550
+ - func: randperm(SymInt n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
4551
+ tags: [core, nondeterministic_seeded]
4420
4552
  dispatch:
4421
4553
  CompositeExplicitAutograd: randperm
4422
4554
 
4423
- - func: randperm.generator(int n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
4555
+ - func: randperm.generator(SymInt n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
4424
4556
  tags: nondeterministic_seeded
4425
4557
  dispatch:
4426
4558
  CompositeExplicitAutograd: randperm
4427
4559
 
4428
- - func: randperm.out(int n, *, Tensor(a!) out) -> Tensor(a!)
4560
+ - func: randperm.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
4429
4561
  tags: nondeterministic_seeded
4430
4562
  dispatch:
4431
4563
  CompositeExplicitAutograd: randperm_out
4432
4564
 
4433
- - func: randperm.generator_out(int n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
4565
+ - func: randperm.generator_out(SymInt n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
4434
4566
  tags: nondeterministic_seeded
4435
4567
  dispatch:
4436
4568
  CPU: randperm_out_cpu
@@ -4591,7 +4723,7 @@
4591
4723
  dispatch:
4592
4724
  SparseCPU, SparseCUDA: round_sparse
4593
4725
  SparseCsrCPU, SparseCsrCUDA: round_sparse_csr
4594
- tags: pointwise
4726
+ tags: [core, pointwise]
4595
4727
 
4596
4728
  - func: round_(Tensor(a!) self) -> Tensor(a!)
4597
4729
  device_check: NoCheck # TensorIterator
@@ -4839,10 +4971,14 @@
4839
4971
  - func: silu(Tensor self) -> Tensor
4840
4972
  structured_delegate: silu.out
4841
4973
  python_module: nn
4974
+ dispatch:
4975
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu
4842
4976
 
4843
4977
  - func: silu_(Tensor(a!) self) -> Tensor(a!)
4844
4978
  structured_delegate: silu.out
4845
4979
  python_module: nn
4980
+ dispatch:
4981
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu_
4846
4982
 
4847
4983
  - func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
4848
4984
  structured: True
@@ -4865,6 +5001,7 @@
4865
5001
  python_module: nn
4866
5002
  dispatch:
4867
5003
  CompositeImplicitAutograd: math_silu_backward
5004
+ NestedTensorCPU, NestedTensorCUDA: silu_backward_nested
4868
5005
 
4869
5006
  - func: mish(Tensor self) -> Tensor
4870
5007
  structured_delegate: mish.out
@@ -4917,6 +5054,7 @@
4917
5054
  variants: function, method
4918
5055
  dispatch:
4919
5056
  CPU, CUDA: logit
5057
+ MPS: logit_mps
4920
5058
  tags: pointwise
4921
5059
 
4922
5060
  - func: logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!)
@@ -4928,6 +5066,7 @@
4928
5066
  - func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!)
4929
5067
  dispatch:
4930
5068
  CPU, CUDA: logit_out
5069
+ MPS: logit_out_mps
4931
5070
  tags: pointwise
4932
5071
 
4933
5072
  - func: sin(Tensor self) -> Tensor
@@ -5042,6 +5181,27 @@
5042
5181
  device_check: NoCheck
5043
5182
  device_guard: False
5044
5183
 
5184
+ - func: sym_size.int(Tensor self, int dim) -> SymInt
5185
+ variants: function
5186
+ device_check: NoCheck
5187
+ device_guard: False
5188
+ tags: core
5189
+ manual_cpp_binding: True
5190
+
5191
+ - func: sym_numel(Tensor self) -> SymInt
5192
+ variants: function
5193
+ device_check: NoCheck
5194
+ device_guard: False
5195
+ tags: core
5196
+ manual_cpp_binding: True
5197
+
5198
+ - func: sym_storage_offset(Tensor self) -> SymInt
5199
+ variants: function
5200
+ device_check: NoCheck
5201
+ device_guard: False
5202
+ tags: core
5203
+ manual_cpp_binding: True
5204
+
5045
5205
  - func: slice.Tensor(Tensor(a) self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
5046
5206
  variants: function, method
5047
5207
  device_check: NoCheck
@@ -5066,7 +5226,7 @@
5066
5226
  device_check: NoCheck
5067
5227
  device_guard: False
5068
5228
  dispatch:
5069
- CompositeExplicitAutograd: slice_scatter
5229
+ CompositeExplicitAutogradNonFunctional: slice_scatter
5070
5230
  autogen: slice_scatter.out
5071
5231
  tags: core
5072
5232
 
@@ -5075,15 +5235,16 @@
5075
5235
  device_check: NoCheck
5076
5236
  device_guard: False
5077
5237
  dispatch:
5078
- CompositeExplicitAutograd: select_scatter_symint
5238
+ CompositeExplicitAutogradNonFunctional: select_scatter_symint
5079
5239
  autogen: select_scatter.out
5240
+ tags: core
5080
5241
 
5081
5242
  - func: diagonal_scatter(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1) -> Tensor
5082
5243
  variants: function, method
5083
5244
  device_check: NoCheck
5084
5245
  device_guard: False
5085
5246
  dispatch:
5086
- CompositeExplicitAutograd: diagonal_scatter
5247
+ CompositeExplicitAutogradNonFunctional: diagonal_scatter
5087
5248
  autogen: diagonal_scatter.out
5088
5249
 
5089
5250
  - func: as_strided_scatter(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor
@@ -5091,7 +5252,7 @@
5091
5252
  device_check: NoCheck
5092
5253
  device_guard: False
5093
5254
  dispatch:
5094
- CompositeExplicitAutograd: as_strided_scatter_symint
5255
+ CompositeExplicitAutogradNonFunctional: as_strided_scatter_symint
5095
5256
  autogen: as_strided_scatter.out
5096
5257
 
5097
5258
  - func: smm(Tensor self, Tensor mat2) -> Tensor
@@ -5170,6 +5331,8 @@
5170
5331
  device_guard: False
5171
5332
  dispatch:
5172
5333
  CompositeExplicitAutograd: split_with_sizes
5334
+ NestedTensorCPU, NestedTensorCUDA: split_with_sizes_nested
5335
+ tags: core
5173
5336
 
5174
5337
  - func: hsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]
5175
5338
  variants: function, method
@@ -5316,6 +5479,13 @@
5316
5479
  device_check: NoCheck
5317
5480
  device_guard: False
5318
5481
 
5482
+ - func: sym_stride.int(Tensor self, int dim) -> SymInt
5483
+ variants: function
5484
+ device_check: NoCheck
5485
+ device_guard: False
5486
+ tags: core
5487
+ manual_cpp_binding: True
5488
+
5319
5489
  - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor
5320
5490
  device_check: NoCheck # TensorIterator
5321
5491
  variants: function, method
@@ -5326,12 +5496,14 @@
5326
5496
  autogen: sum.out
5327
5497
 
5328
5498
  - func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
5499
+ # TODO: Align the signature of sum.dim_IntList and _sparse_csr_sum.dim_dtype
5329
5500
  structured_delegate: sum.IntList_out
5330
5501
  device_check: NoCheck # TensorIterator
5331
5502
  variants: function, method
5332
5503
  dispatch:
5333
5504
  NestedTensorCPU: NestedTensor_sum_dim_CPU
5334
5505
  SparseCPU, SparseCUDA: sum_sparse_coo
5506
+ SparseCsrCPU, SparseCsrCUDA: sum_sparse_compressed
5335
5507
  tags: core
5336
5508
 
5337
5509
  - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
@@ -5364,10 +5536,12 @@
5364
5536
  CPU, CUDA: nansum_out
5365
5537
  MPS: nansum_out_mps
5366
5538
 
5367
- - func: sum_to_size(Tensor self, int[] size) -> Tensor
5539
+ - func: sum_to_size(Tensor self, SymInt[] size) -> Tensor
5368
5540
  variants: method
5369
5541
  device_check: NoCheck
5370
5542
  device_guard: False
5543
+ dispatch:
5544
+ CompositeImplicitAutograd: sum_to_size_symint
5371
5545
 
5372
5546
  - func: sqrt(Tensor self) -> Tensor
5373
5547
  device_check: NoCheck # TensorIterator
@@ -5421,7 +5595,7 @@
5421
5595
  variants: function, method
5422
5596
  cpp_no_default_args: ["unbiased"]
5423
5597
 
5424
- - func: std.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> Tensor
5598
+ - func: std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
5425
5599
  device_check: NoCheck # TensorIterator
5426
5600
  variants: function, method
5427
5601
  dispatch:
@@ -5439,7 +5613,7 @@
5439
5613
  variants: function
5440
5614
  cpp_no_default_args: ["unbiased"]
5441
5615
 
5442
- - func: std_mean.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
5616
+ - func: std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
5443
5617
  device_check: NoCheck # TensorIterator
5444
5618
  variants: function
5445
5619
  dispatch:
@@ -5451,7 +5625,7 @@
5451
5625
  variants: function
5452
5626
  cpp_no_default_args: ["unbiased"]
5453
5627
 
5454
- - func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
5628
+ - func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
5455
5629
  device_check: NoCheck # TensorIterator
5456
5630
  variants: function
5457
5631
 
@@ -5459,7 +5633,7 @@
5459
5633
  device_check: NoCheck # TensorIterator
5460
5634
  cpp_no_default_args: ["unbiased"]
5461
5635
 
5462
- - func: std.correction_out(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
5636
+ - func: std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
5463
5637
  device_check: NoCheck # TensorIterator
5464
5638
  dispatch:
5465
5639
  CPU, CUDA: std_out
@@ -5474,11 +5648,11 @@
5474
5648
  device_check: NoCheck # TensorIterator
5475
5649
  cpp_no_default_args: ["unbiased"]
5476
5650
 
5477
- - func: std.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> Tensor
5651
+ - func: std.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
5478
5652
  device_check: NoCheck # TensorIterator
5479
5653
  variants: function, method
5480
5654
 
5481
- - func: std.correction_names_out(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
5655
+ - func: std.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
5482
5656
  device_check: NoCheck # TensorIterator
5483
5657
  variants: function
5484
5658
 
@@ -5489,11 +5663,13 @@
5489
5663
  CPU, CUDA: prod
5490
5664
  MPS: prod_mps
5491
5665
  autogen: prod.out
5666
+ tags: core
5492
5667
 
5493
5668
  - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
5494
5669
  structured_delegate: prod.int_out
5495
5670
  device_check: NoCheck # TensorIterator
5496
5671
  variants: function, method
5672
+ tags: core
5497
5673
 
5498
5674
  - func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
5499
5675
  structured: True
@@ -5531,7 +5707,7 @@
5531
5707
  dispatch:
5532
5708
  SparseCPU, SparseCUDA: tan_sparse
5533
5709
  SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr
5534
- tags: pointwise
5710
+ tags: [core, pointwise]
5535
5711
 
5536
5712
  - func: tan_(Tensor(a!) self) -> Tensor(a!)
5537
5713
  device_check: NoCheck # TensorIterator
@@ -5592,8 +5768,6 @@
5592
5768
 
5593
5769
  - func: tensordot.out(Tensor self, Tensor other, int[] dims_self, int[] dims_other, *, Tensor(a!) out) -> Tensor(a!)
5594
5770
  variants: function
5595
- dispatch:
5596
- CPU, CUDA: tensordot_out
5597
5771
 
5598
5772
  # TODO: namespace threshold in 'nn'
5599
5773
  - func: threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor
@@ -5635,8 +5809,10 @@
5635
5809
  NestedTensorCPU, NestedTensorCUDA: threshold_backwards_nested
5636
5810
  tags: pointwise
5637
5811
 
5638
- - func: tile(Tensor self, int[] dims) -> Tensor
5812
+ - func: tile(Tensor self, SymInt[] dims) -> Tensor
5639
5813
  variants: function, method
5814
+ dispatch:
5815
+ CompositeImplicitAutograd: tile_symint
5640
5816
 
5641
5817
  - func: transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
5642
5818
  variants: function, method
@@ -5691,12 +5867,13 @@
5691
5867
  - func: flipud(Tensor self) -> Tensor
5692
5868
  variants: function, method
5693
5869
 
5694
- - func: roll(Tensor self, int[1] shifts, int[1] dims=[]) -> Tensor
5870
+ - func: roll(Tensor self, SymInt[1] shifts, int[1] dims=[]) -> Tensor
5695
5871
  variants: function, method
5696
5872
  dispatch:
5697
- CPU: roll_cpu
5873
+ CPU, MPS: roll
5698
5874
  CUDA: roll_cuda
5699
5875
  autogen: roll.out
5876
+ tags: core
5700
5877
 
5701
5878
  # default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args
5702
5879
 
@@ -5750,10 +5927,11 @@
5750
5927
  NestedTensorCPU, NestedTensorCUDA: _nested_tensor_strides
5751
5928
  autogen: _nested_tensor_strides.out
5752
5929
 
5753
- - func: _nested_tensor_offsets(Tensor self) -> int[]
5930
+ - func: _nested_tensor_storage_offsets(Tensor self) -> Tensor
5754
5931
  variants: method
5755
5932
  dispatch:
5756
- NestedTensorCPU, NestedTensorCUDA: _nested_tensor_offsets
5933
+ NestedTensorCPU, NestedTensorCUDA, NestedTensorMeta: _nested_tensor_storage_offsets
5934
+ autogen: _nested_tensor_storage_offsets.out
5757
5935
 
5758
5936
  # _nested_from_padded is not usable from Python, so
5759
5937
  # _nested_from_padded_and_nested_example is available for testing.
@@ -5764,13 +5942,13 @@
5764
5942
 
5765
5943
  # The input arguments' types to this function are temporary. When nested tensors switch to using SymInts for their metadata representation
5766
5944
  # this will need to be updated
5767
- - func: _nested_view_from_buffer(Tensor(a) self, Tensor nested_size, Tensor nested_strides, int[] offsets) -> Tensor(a)
5945
+ - func: _nested_view_from_buffer(Tensor(a) self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor(a)
5768
5946
  variants: function
5769
5947
  device_check: NoCheck
5770
5948
  dispatch:
5771
5949
  CPU, CUDA: _nested_view_from_buffer
5772
5950
 
5773
- - func: _nested_view_from_buffer_copy(Tensor self, Tensor nested_size, Tensor nested_strides, int[] offsets) -> Tensor
5951
+ - func: _nested_view_from_buffer_copy(Tensor self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor
5774
5952
  variants: function
5775
5953
  device_check: NoCheck
5776
5954
  tags: view_copy
@@ -5913,18 +6091,19 @@
5913
6091
  tags: core
5914
6092
  cpp_no_default_args: ["unbiased"]
5915
6093
 
5916
- - func: var.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> Tensor
6094
+ - func: var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
5917
6095
  device_check: NoCheck # TensorIterator
5918
6096
  variants: function, method
5919
6097
  dispatch:
5920
6098
  CPU, CUDA: var
5921
6099
  MPS: var_mps
6100
+ tags: core
5922
6101
 
5923
6102
  - func: var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
5924
6103
  device_check: NoCheck # TensorIterator
5925
6104
  cpp_no_default_args: ["unbiased"]
5926
6105
 
5927
- - func: var.correction_out(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
6106
+ - func: var.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
5928
6107
  device_check: NoCheck # TensorIterator
5929
6108
  dispatch:
5930
6109
  CPU, CUDA: var_out
@@ -5938,11 +6117,11 @@
5938
6117
  device_check: NoCheck # TensorIterator
5939
6118
  cpp_no_default_args: ["unbiased"]
5940
6119
 
5941
- - func: var.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> Tensor
6120
+ - func: var.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
5942
6121
  device_check: NoCheck # TensorIterator
5943
6122
  variants: function, method
5944
6123
 
5945
- - func: var.correction_names_out(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
6124
+ - func: var.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
5946
6125
  device_check: NoCheck # TensorIterator
5947
6126
  variants: function
5948
6127
 
@@ -5956,7 +6135,7 @@
5956
6135
  variants: function
5957
6136
  cpp_no_default_args: ["unbiased"]
5958
6137
 
5959
- - func: var_mean.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
6138
+ - func: var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
5960
6139
  device_check: NoCheck # TensorIterator
5961
6140
  variants: function
5962
6141
  dispatch:
@@ -5968,7 +6147,7 @@
5968
6147
  variants: function
5969
6148
  cpp_no_default_args: ["unbiased"]
5970
6149
 
5971
- - func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
6150
+ - func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
5972
6151
  device_check: NoCheck # TensorIterator
5973
6152
  variants: function
5974
6153
 
@@ -6036,7 +6215,7 @@
6036
6215
  CompositeExplicitAutograd: zeros
6037
6216
  autogen: zeros.names_out
6038
6217
 
6039
- - func: _efficientzerotensor(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
6218
+ - func: _efficientzerotensor(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
6040
6219
  dispatch:
6041
6220
  CPU: _efficientzerotensor
6042
6221
  CUDA: _efficientzerotensor_cuda
@@ -6056,7 +6235,7 @@
6056
6235
  dispatch:
6057
6236
  # NB: Although this composite mutates on the inside, it is
6058
6237
  # non-differentiable so NonFunctional doesn't apply
6059
- CompositeExplicitAutograd: zeros_like
6238
+ CompositeExplicitAutograd, CompositeImplicitAutogradNestedTensor: zeros_like
6060
6239
  autogen: zeros_like.out
6061
6240
 
6062
6241
  - func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor
@@ -6297,7 +6476,7 @@
6297
6476
  QuantizedCPU, QuantizedCUDA: quantized_clone
6298
6477
  NestedTensorCPU, NestedTensorCUDA: clone_nested
6299
6478
  autogen: clone.out
6300
- tags: core
6479
+ tags: [core, pointwise]
6301
6480
 
6302
6481
  - func: positive(Tensor(a) self) -> Tensor(a)
6303
6482
  variants: function, method
@@ -6309,6 +6488,7 @@
6309
6488
  dispatch:
6310
6489
  CompositeExplicitAutograd: resize_as_
6311
6490
  autogen: resize_as, resize_as.out
6491
+ tags: inplace_view
6312
6492
 
6313
6493
  - func: resize_as_sparse_(Tensor(a!) self, Tensor the_template) -> Tensor(a!)
6314
6494
  use_const_ref_for_mutable_tensors: True
@@ -6328,6 +6508,7 @@
6328
6508
  SparseCPU, SparseCUDA, SparseMeta: zero_sparse_
6329
6509
  SparseCsrCPU, SparseCsrCUDA: zero_sparse_csr_
6330
6510
  MkldnnCPU: mkldnn_zero_
6511
+ NestedTensorCPU, NestedTensorCUDA: zero_nested_
6331
6512
  autogen: zero, zero.out
6332
6513
 
6333
6514
  - func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
@@ -6347,6 +6528,7 @@
6347
6528
  dispatch:
6348
6529
  SparseCPU, SparseCUDA: sub_sparse
6349
6530
  ZeroTensor: sub_zerotensor
6531
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_sub_Tensor
6350
6532
  tags: [core, pointwise]
6351
6533
 
6352
6534
  - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
@@ -6493,6 +6675,16 @@
6493
6675
  structured_delegate: _addmm_activation.out
6494
6676
  variants: function, method
6495
6677
 
6678
+ - func: _scaled_mm(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None) -> (Tensor, Tensor)
6679
+ variants: function
6680
+ dispatch:
6681
+ CUDA: _scaled_mm_cuda
6682
+
6683
+ - func: _scaled_mm.out(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None, Tensor(a!) out, Tensor(b!) out_amax) -> (Tensor(a!), Tensor(b!))
6684
+ variants: function
6685
+ dispatch:
6686
+ CUDA: _scaled_mm_out_cuda
6687
+
6496
6688
  # NOTE [ Sparse: autograd and API ]
6497
6689
  #
6498
6690
  #
@@ -6605,12 +6797,17 @@
6605
6797
  # the default would never make sense.
6606
6798
 
6607
6799
  - func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6800
+ dispatch:
6801
+ CompositeExplicitAutograd: sparse_compressed_tensor
6802
+
6608
6803
  - func: sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6609
6804
  - func: sparse_csc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6610
6805
  - func: sparse_bsr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6611
6806
  - func: sparse_bsc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6612
6807
 
6613
6808
  - func: sparse_compressed_tensor.comp_plain_value(Tensor compressed_indices, Tensor plain_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6809
+ dispatch:
6810
+ CompositeExplicitAutograd: sparse_compressed_tensor
6614
6811
  - func: sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6615
6812
  - func: sparse_csc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6616
6813
  - func: sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
@@ -6627,15 +6824,15 @@
6627
6824
  CompositeExplicitAutograd: sparse_coo_tensor
6628
6825
  autogen: sparse_coo_tensor.size_out
6629
6826
 
6630
- - func: sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
6827
+ - func: sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
6631
6828
 
6632
- - func: sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
6829
+ - func: sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
6633
6830
 
6634
- - func: _sparse_coo_tensor_unsafe(Tensor indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
6831
+ - func: _sparse_coo_tensor_unsafe(Tensor indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
6635
6832
  dispatch:
6636
6833
  CompositeImplicitAutograd: _sparse_coo_tensor_unsafe_symint
6637
6834
 
6638
- - func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size) -> ()
6835
+ - func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size, bool? is_coalesced=None) -> ()
6639
6836
 
6640
6837
  - func: _validate_sparse_compressed_tensor_args(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, Layout layout) -> ()
6641
6838
  - func: _validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()
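
Several of the COO factory overloads above gain an is_coalesced flag. A minimal sketch of how it can be passed through the public Python factory, assuming torch.sparse_coo_tensor forwards the keyword the way the schema suggests:

    import torch

    indices = torch.tensor([[0, 1], [0, 1]])   # sorted, no duplicate coordinates
    values = torch.tensor([1.0, 2.0])
    # is_coalesced=True asserts the invariant up front, so a later coalesce() is a no-op.
    sp = torch.sparse_coo_tensor(indices, values, (2, 2), is_coalesced=True)
    sp.is_coalesced()   # => True
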
@@ -6648,7 +6845,7 @@
6648
6845
  SparseCPU, SparseCUDA, SparseMeta, Meta: new_with_dims_sparse
6649
6846
  autogen: _sparse_coo_tensor_with_dims.out
6650
6847
 
6651
- - func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6848
+ - func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False, bool? is_coalesced=None) -> Tensor
6652
6849
  dispatch:
6653
6850
  SparseCPU, SparseCUDA, SparseMeta, Meta: new_with_dims_and_tensor_sparse_symint
6654
6851
  autogen: _sparse_coo_tensor_with_dims_and_tensors.out
@@ -6671,17 +6868,23 @@
6671
6868
  variants: method
6672
6869
  dispatch:
6673
6870
  SparseCPU, SparseCUDA: sparse_mask
6674
- SparseCsrCPU, SparseCsrCUDA: sparse_mask_sparse_csr
6871
+ SparseCsrCPU, SparseCsrCUDA: sparse_mask_sparse_compressed
6675
6872
  autogen: sparse_mask.out
6676
6873
 
6874
+ - func: _sparse_mask_projection(Tensor self, Tensor mask, bool accumulate_matches=False) -> Tensor
6875
+ variants: method
6876
+ dispatch:
6877
+ SparseCPU, SparseCUDA: sparse_mask_projection
6878
+ autogen: _sparse_mask_projection.out
6879
+
6677
6880
  - func: _to_cpu(Tensor[] tensors) -> Tensor[]
6678
6881
  variants: function
6679
6882
 
6680
- - func: to_dense(Tensor self, ScalarType? dtype=None) -> Tensor
6883
+ - func: to_dense(Tensor self, ScalarType? dtype=None, *, bool? masked_grad=None) -> Tensor
6681
6884
  variants: method
6682
6885
 
6683
6886
  # Special case of to_dense with custom derivative
6684
- - func: _to_dense(Tensor self, ScalarType? dtype=None) -> Tensor
6887
+ - func: _to_dense(Tensor self, ScalarType? dtype=None, bool? masked_grad=None) -> Tensor
6685
6888
  variants: method
6686
6889
  dispatch:
6687
6890
  SparseCPU, SparseCUDA: sparse_to_dense
@@ -6689,7 +6892,7 @@
6689
6892
  MkldnnCPU: mkldnn_to_dense
6690
6893
  autogen: _to_dense.out
6691
6894
 
6692
- - func: to_dense_backward(Tensor grad, Tensor input) -> Tensor
6895
+ - func: to_dense_backward(Tensor grad, Tensor input, bool? masked_grad=None) -> Tensor
6693
6896
 
6694
6897
  - func: sparse_dim(Tensor self) -> int
6695
6898
  variants: method
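
to_dense, _to_dense and to_dense_backward gain an optional masked_grad argument. A hedged sketch of the intended use, assuming the keyword is exposed on the Python Tensor.to_dense method: masked_grad=True keeps the historical behaviour of restricting the incoming gradient to the input's sparsity pattern, while False propagates a fully dense gradient.

    import torch

    sp = torch.tensor([[0.0, 2.0], [0.0, 0.0]]).to_sparse().requires_grad_()
    dense = sp.to_dense(masked_grad=True)   # backward masks the gradient to sp's pattern
    dense.sum().backward()
    sp.grad                                 # gradient restricted to sp's sparsity pattern
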
@@ -6859,51 +7062,80 @@
6859
7062
 
6860
7063
  - func: to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor
6861
7064
  variants: method
7065
+
7066
+ # Special case of to_sparse.sparse_dim with custom derivative
7067
+ - func: _to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor
7068
+ variants: method
6862
7069
  dispatch:
6863
7070
  CPU, CUDA: dense_to_sparse
6864
7071
  SparseCPU, SparseCUDA: sparse_coo_to_sparse
6865
7072
  SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse
6866
- autogen: to_sparse.sparse_dim_out
7073
+ autogen: _to_sparse.sparse_dim_out
6867
7074
 
6868
7075
  - func: to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor
6869
7076
  variants: method
7077
+
7078
+ # Special case of to_sparse with custom derivative
7079
+ - func: _to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor
7080
+ variants: method
6870
7081
  dispatch:
6871
7082
  CPU, CUDA: dense_to_sparse
6872
7083
  SparseCPU, SparseCUDA: sparse_coo_to_sparse
6873
7084
  SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse
6874
- autogen: to_sparse.out
7085
+ autogen: _to_sparse.out
6875
7086
 
6876
7087
  - func: to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor
6877
7088
  variants: method
7089
+
7090
+ # Special case of to_sparse_csr with custom derivative
7091
+ - func: _to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor
7092
+ variants: method
6878
7093
  dispatch:
6879
7094
  CPU, CUDA: dense_to_sparse_csr
6880
7095
  SparseCPU, SparseCUDA: coo_to_sparse_csr
6881
7096
  SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csr
6882
- autogen: to_sparse_csr.out
7097
+ autogen: _to_sparse_csr.out
6883
7098
 
6884
7099
  - func: to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor
6885
7100
  variants: method
7101
+
7102
+ # Special case of to_sparse_csc with custom derivative
7103
+ - func: _to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor
7104
+ variants: method
6886
7105
  dispatch:
6887
7106
  CPU, CUDA: dense_to_sparse_csc
6888
7107
  SparseCPU, SparseCUDA: coo_to_sparse_csc
6889
7108
  SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csc
6890
- autogen: to_sparse_csc.out
7109
+ autogen: _to_sparse_csc.out
6891
7110
 
6892
7111
  - func: to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
6893
7112
  variants: method
7113
+
7114
+ # Special case of to_sparse_bsr with custom derivative
7115
+ - func: _to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
7116
+ variants: method
6894
7117
  dispatch:
6895
7118
  CPU, CUDA: dense_to_sparse_bsr
6896
7119
  SparseCPU, SparseCUDA: coo_to_sparse_bsr
6897
7120
  SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsr
6898
- autogen: to_sparse_bsr.out
7121
+ autogen: _to_sparse_bsr.out
6899
7122
 
6900
7123
  - func: to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
6901
7124
  variants: method
7125
+
7126
+ # Special case of to_sparse_bsc with custom derivative
7127
+ - func: _to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
7128
+ variants: method
6902
7129
  dispatch:
6903
7130
  CPU, CUDA: dense_to_sparse_bsc
6904
7131
  SparseCPU, SparseCUDA: coo_to_sparse_bsc
6905
7132
  SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsc
6906
- autogen: to_sparse_bsc.out
7133
+ autogen: _to_sparse_bsc.out
7134
+
7135
+ - func: _to_sparse_semi_structured(Tensor dense) -> (Tensor, Tensor)
7136
+ variants: function
7137
+ dispatch:
7138
+ CUDA: _to_sparse_semi_structured
6907
7139
 
6908
7140
  - func: to_mkldnn(Tensor self, ScalarType? dtype=None) -> Tensor
6909
7141
  variants: method
@@ -7174,7 +7406,7 @@
7174
7406
 
7175
7407
  # NB: Does NOT check precondition that numel == 1
7176
7408
  - func: _local_scalar_dense(Tensor self) -> Scalar
7177
- tags: data_dependent_output
7409
+ tags: [core, data_dependent_output]
7178
7410
  dispatch:
7179
7411
  CPU: _local_scalar_dense_cpu
7180
7412
  CUDA: _local_scalar_dense_cuda
@@ -7187,8 +7419,9 @@
7187
7419
  dispatch:
7188
7420
  MPS: _lstm_mps
7189
7421
  autogen: _lstm_mps.out
7422
+ tags: nondeterministic_seeded
7190
7423
 
7191
- - func: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
7424
+ - func: lstm_mps_backward(Tensor? grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
7192
7425
  dispatch:
7193
7426
  MPS: lstm_mps_backward
7194
7427
  autogen: lstm_mps_backward.out
@@ -7226,20 +7459,28 @@
7226
7459
 
7227
7460
  # RNN cells and layers
7228
7461
  - func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor)
7462
+ tags: nondeterministic_seeded
7229
7463
 
7230
7464
  - func: lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor)
7465
+ tags: nondeterministic_seeded
7231
7466
 
7232
7467
  - func: gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
7468
+ tags: nondeterministic_seeded
7233
7469
 
7234
7470
  - func: gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
7471
+ tags: nondeterministic_seeded
7235
7472
 
7236
7473
  - func: rnn_tanh.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
7474
+ tags: nondeterministic_seeded
7237
7475
 
7238
7476
  - func: rnn_tanh.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
7477
+ tags: nondeterministic_seeded
7239
7478
 
7240
7479
  - func: rnn_relu.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
7480
+ tags: nondeterministic_seeded
7241
7481
 
7242
7482
  - func: rnn_relu.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
7483
+ tags: nondeterministic_seeded
7243
7484
 
7244
7485
  - func: lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor)
7245
7486
 
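
The RNN entry points above are now tagged nondeterministic_seeded, i.e. their output depends on the RNG state (through dropout in training mode). A small sketch of what that means in practice, using only the ordinary public module API, nothing introduced by this diff:

    import torch

    lstm = torch.nn.LSTM(input_size=4, hidden_size=8, num_layers=2, dropout=0.5)
    x = torch.randn(5, 3, 4)

    torch.manual_seed(0); y1, _ = lstm(x)   # dropout draws from the global RNG
    torch.manual_seed(0); y2, _ = lstm(x)   # same seed, same mask, same output
    assert torch.allclose(y1, y2)
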
@@ -7382,6 +7623,7 @@
7382
7623
  variants: function, method
7383
7624
  dispatch:
7384
7625
  CompositeExplicitAutograd: masked_fill
7626
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_masked_fill
7385
7627
  tags: pointwise
7386
7628
 
7387
7629
  - func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!)
@@ -7406,6 +7648,7 @@
7406
7648
  dispatch:
7407
7649
  CPU: masked_scatter__cpu
7408
7650
  CUDA: masked_scatter__cuda
7651
+ MPS: masked_scatter__mps
7409
7652
  autogen: masked_scatter.out
7410
7653
 
7411
7654
  - func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor
@@ -7503,6 +7746,7 @@
7503
7746
  dispatch:
7504
7747
  CPU: index_fill_
7505
7748
  CUDA: index_fill_
7749
+ MPS: index_fill_mps_
7506
7750
  autogen: index_fill.int_Scalar_out
7507
7751
 
7508
7752
  - func: index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
@@ -7516,6 +7760,7 @@
7516
7760
  variants: method
7517
7761
  dispatch:
7518
7762
  CPU, CUDA: index_fill_
7763
+ MPS: index_fill_mps_
7519
7764
  autogen: index_fill.int_Tensor_out
7520
7765
 
7521
7766
  - func: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor
@@ -7543,6 +7788,7 @@
7543
7788
  - func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor
7544
7789
  structured_delegate: scatter.src_out
7545
7790
  variants: function, method
7791
+ tags: core
7546
7792
 
7547
7793
  - func: scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!)
7548
7794
  structured_delegate: scatter.src_out
@@ -7558,6 +7804,7 @@
7558
7804
  - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
7559
7805
  structured_delegate: scatter.value_out
7560
7806
  variants: function, method
7807
+ tags: core
7561
7808
 
7562
7809
  - func: scatter_.value(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!)
7563
7810
  structured_delegate: scatter.value_out
@@ -7657,6 +7904,7 @@
7657
7904
  variants: function
7658
7905
  dispatch:
7659
7906
  CPU, CUDA: bitwise_and_out
7907
+ MPS: bitwise_and_out_mps
7660
7908
  tags: pointwise
7661
7909
 
7662
7910
  - func: bitwise_and.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
@@ -7671,7 +7919,7 @@
7671
7919
  variants: method, function
7672
7920
  dispatch:
7673
7921
  CompositeExplicitAutograd: bitwise_and
7674
- tags: pointwise
7922
+ tags: [core, pointwise]
7675
7923
 
7676
7924
  - func: bitwise_and.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
7677
7925
  device_check: NoCheck # TensorIterator
@@ -7721,6 +7969,7 @@
7721
7969
  variants: function
7722
7970
  dispatch:
7723
7971
  CPU, CUDA: bitwise_or_out
7972
+ MPS: bitwise_or_out_mps
7724
7973
  tags: pointwise
7725
7974
 
7726
7975
  - func: bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
@@ -7733,7 +7982,7 @@
7733
7982
  - func: bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor
7734
7983
  device_check: NoCheck # TensorIterator
7735
7984
  variants: method, function
7736
- tags: pointwise
7985
+ tags: [core, pointwise]
7737
7986
 
7738
7987
  - func: bitwise_or.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
7739
7988
  device_check: NoCheck # TensorIterator
@@ -7783,6 +8032,7 @@
7783
8032
  variants: function
7784
8033
  dispatch:
7785
8034
  CPU, CUDA: bitwise_xor_out
8035
+ MPS: bitwise_xor_out_mps
7786
8036
  tags: pointwise
7787
8037
 
7788
8038
  - func: bitwise_xor.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
@@ -7795,7 +8045,7 @@
7795
8045
  - func: bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor
7796
8046
  device_check: NoCheck # TensorIterator
7797
8047
  variants: method, function
7798
- tags: pointwise
8048
+ tags: [core, pointwise]
7799
8049
 
7800
8050
  - func: bitwise_xor.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
7801
8051
  device_check: NoCheck # TensorIterator
@@ -8067,6 +8317,7 @@
8067
8317
  variants: method
8068
8318
  dispatch:
8069
8319
  CPU, CUDA: random_
8320
+ MPS: random_mps_
8070
8321
  Meta: random_meta_
8071
8322
  autogen: random, random.out
8072
8323
 
@@ -8164,7 +8415,7 @@
8164
8415
  dispatch:
8165
8416
  CPU: trace_cpu
8166
8417
  CUDA: trace_cuda
8167
- MPS: trace_mps_out
8418
+ MPS: trace_mps
8168
8419
  autogen: trace.out
8169
8420
 
8170
8421
  - func: trace_backward(Tensor grad, SymInt[] sizes) -> Tensor
@@ -8604,6 +8855,15 @@
8604
8855
  MPS: nonzero_mps
8605
8856
  tags: [dynamic_output_shape, core]
8606
8857
 
8858
+ - func: nonzero_static.out(Tensor self, *, int size, int fill_value=-1, Tensor(a!) out) -> Tensor(a!)
8859
+ dispatch:
8860
+ CPU: nonzero_static_out_cpu
8861
+
8862
+ - func: nonzero_static(Tensor self, *, int size, int fill_value=-1) -> Tensor
8863
+ variants: method, function
8864
+ dispatch:
8865
+ CPU: nonzero_static_cpu
8866
+
8607
8867
  - func: nonzero_numpy(Tensor self) -> Tensor[]
8608
8868
  variants: method, function
8609
8869
 
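
nonzero_static is a new fixed-shape variant of nonzero (only a CPU kernel is registered here), aimed at export and compile paths that cannot handle the dynamic output shape of regular nonzero. A minimal sketch, assuming the op is exposed as torch.nonzero_static with the keyword-only arguments from the schema:

    import torch

    x = torch.tensor([0.0, 3.0, 0.0, 5.0])
    # size fixes the number of returned rows; unused rows are padded with fill_value.
    torch.nonzero_static(x, size=4, fill_value=-1)
    # tensor([[ 1],
    #         [ 3],
    #         [-1],
    #         [-1]])
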
@@ -8710,8 +8970,10 @@
8710
8970
  CPU, CUDA: linalg_solve_triangular
8711
8971
  MPS: linalg_solve_triangular_mps
8712
8972
 
8713
- - func: linalg_vander(Tensor x, *, int? N=None) -> Tensor
8973
+ - func: linalg_vander(Tensor x, *, SymInt? N=None) -> Tensor
8714
8974
  python_module: linalg
8975
+ dispatch:
8976
+ CompositeImplicitAutograd: linalg_vander_symint
8715
8977
 
8716
8978
  - func: svd.U(Tensor self, bool some=True, bool compute_uv=True, *, Tensor(a!) U, Tensor(b!) S, Tensor(c!) V) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) V)
8717
8979
 
@@ -8917,6 +9179,7 @@
8917
9179
  structured_inherits: TensorIteratorBase
8918
9180
  dispatch:
8919
9181
  CPU, CUDA: erfinv_out
9182
+ MPS: erfinv_out_mps
8920
9183
  SparseCPU, SparseCUDA: erfinv_sparse_out
8921
9184
  SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr_out
8922
9185
  tags: pointwise
@@ -8999,7 +9262,7 @@
8999
9262
  structured_inherits: TensorIteratorBase
9000
9263
  dispatch:
9001
9264
  CPU, CUDA: atan2_out
9002
- MPS: atan2_mps_out
9265
+ MPS: atan2_out_mps
9003
9266
  tags: pointwise
9004
9267
 
9005
9268
  - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)
@@ -9030,6 +9293,7 @@
9030
9293
  structured_inherits: TensorIteratorBase
9031
9294
  dispatch:
9032
9295
  CPU, CUDA: lerp_Scalar
9296
+ MPS: lerp_Scalar_mps
9033
9297
  tags: pointwise
9034
9298
 
9035
9299
  - func: lerp.Tensor_out(Tensor self, Tensor end, Tensor weight, *, Tensor(a!) out) -> Tensor(a!)
@@ -9038,6 +9302,7 @@
9038
9302
  structured_inherits: TensorIteratorBase
9039
9303
  dispatch:
9040
9304
  CPU, CUDA: lerp_Tensor
9305
+ MPS: lerp_Tensor_mps
9041
9306
  tags: pointwise
9042
9307
 
9043
9308
  - func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor
@@ -9054,46 +9319,46 @@
9054
9319
 
9055
9320
  - func: histc.out(Tensor self, int bins=100, Scalar min=0, Scalar max=0, *, Tensor(a!) out) -> Tensor(a!)
9056
9321
  dispatch:
9057
- CPU: histogram_histc_cpu_out
9322
+ CPU, MPS: histogram_histc_out
9058
9323
  CUDA: _histc_out_cuda
9059
9324
 
9060
9325
  - func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor
9061
9326
  variants: method, function
9062
9327
  dispatch:
9063
- CPU: histogram_histc_cpu
9328
+ CPU, MPS: histogram_histc
9064
9329
  CUDA: _histc_cuda
9065
9330
 
9066
9331
  - func: histogram.bins_tensor_out(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges)
9067
9332
  dispatch:
9068
- CPU: histogram_out_cpu
9333
+ CPU, MPS: histogram_out
9069
9334
 
9070
9335
  - func: histogram.bins_tensor(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges)
9071
9336
  variants: method, function
9072
9337
  dispatch:
9073
- CPU: histogram_cpu
9338
+ CPU, MPS: histogram
9074
9339
 
9075
9340
  - func: histogram.bin_ct_out(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges)
9076
9341
  dispatch:
9077
- CPU: histogram_out_cpu
9342
+ CPU, MPS: histogram_out
9078
9343
 
9079
9344
  - func: histogram.bin_ct(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges)
9080
9345
  variants: method, function
9081
9346
  dispatch:
9082
- CPU: histogram_cpu
9347
+ CPU, MPS: histogram
9083
9348
 
9084
9349
  - func: _histogramdd_bin_edges(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False) -> Tensor[]
9085
9350
  dispatch:
9086
- CPU: histogramdd_bin_edges_cpu
9351
+ CPU, MPS: histogramdd_bin_edges
9087
9352
  autogen: _histogramdd_bin_edges.out
9088
9353
 
9089
9354
  - func: _histogramdd_from_bin_cts(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False) -> Tensor
9090
9355
  dispatch:
9091
- CPU: histogramdd_cpu
9356
+ CPU, MPS: _histogramdd
9092
9357
  autogen: _histogramdd_from_bin_cts.out
9093
9358
 
9094
9359
  - func: _histogramdd_from_bin_tensors(Tensor self, Tensor[] bins, *, Tensor? weight=None, bool density=False) -> Tensor
9095
9360
  dispatch:
9096
- CPU: histogramdd_cpu
9361
+ CPU, MPS: _histogramdd
9097
9362
  autogen: _histogramdd_from_bin_tensors.out
9098
9363
 
9099
9364
  - func: histogramdd(Tensor self, int[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)
@@ -9113,7 +9378,7 @@
9113
9378
  variants: method, function
9114
9379
  dispatch:
9115
9380
  CompositeExplicitAutograd: fmod
9116
- tags: pointwise
9381
+ tags: [core, pointwise]
9117
9382
 
9118
9383
  - func: fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
9119
9384
  device_check: NoCheck # TensorIterator
@@ -9148,6 +9413,7 @@
9148
9413
  structured_inherits: TensorIteratorBase
9149
9414
  dispatch:
9150
9415
  CPU, CUDA: hypot_out
9416
+ MPS: hypot_out_mps
9151
9417
  tags: pointwise
9152
9418
 
9153
9419
  - func: hypot(Tensor self, Tensor other) -> Tensor
@@ -9220,7 +9486,7 @@
9220
9486
  variants: method, function
9221
9487
  dispatch:
9222
9488
  CompositeExplicitAutograd: remainder
9223
- tags: pointwise
9489
+ tags: [core, pointwise]
9224
9490
 
9225
9491
  - func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
9226
9492
  variants: method
@@ -9265,12 +9531,11 @@
9265
9531
  MPS: min_mps
9266
9532
  QuantizedCPU: min_quantized_cpu
9267
9533
 
9268
- # Not to be confused with binary op `min.out`. Commented because of failed CI
9269
- # FIXME: enable this
9270
- #- func: min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
9271
- # device_check: NoCheck # TensorIterator
9272
- # dispatch:
9273
- # CompositeExplicitAutograd: min_unary_out
9534
+ - func: min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
9535
+ device_check: NoCheck # TensorIterator
9536
+ dispatch:
9537
+ CPU, CUDA: min_unary_out
9538
+ QuantizedCPU: min_quantized_unary_out
9274
9539
 
9275
9540
  - func: fmin(Tensor self, Tensor other) -> Tensor
9276
9541
  structured_delegate: fmin.out
@@ -9283,7 +9548,7 @@
9283
9548
  structured_inherits: TensorIteratorBase
9284
9549
  device_check: NoCheck # TensorIterator
9285
9550
  dispatch:
9286
- CPU, CUDA: fmin_out
9551
+ CPU, CUDA, MPS: fmin_out
9287
9552
  tags: pointwise
9288
9553
 
9289
9554
  - func: max(Tensor self) -> Tensor
@@ -9305,7 +9570,7 @@
9305
9570
  structured_inherits: TensorIteratorBase
9306
9571
  device_check: NoCheck # TensorIterator
9307
9572
  dispatch:
9308
- CPU, CUDA: fmax_out
9573
+ CPU, CUDA, MPS: fmax_out
9309
9574
  tags: pointwise
9310
9575
 
9311
9576
  - func: maximum(Tensor self, Tensor other) -> Tensor
@@ -9402,6 +9667,7 @@
9402
9667
  variants: method, function
9403
9668
  dispatch:
9404
9669
  CompositeExplicitAutograd: sort
9670
+ tags: core
9405
9671
 
9406
9672
  - func: sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
9407
9673
  structured_delegate: sort.values_stable
@@ -9438,14 +9704,14 @@
9438
9704
  - func: argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor
9439
9705
  variants: method, function
9440
9706
 
9441
- - func: topk.values(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
9707
+ - func: topk.values(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
9442
9708
  structured: True
9443
9709
  dispatch:
9444
9710
  CPU: topk_out_cpu
9445
9711
  CUDA: topk_out_cuda
9446
9712
  MPS: topk_out_mps
9447
9713
 
9448
- - func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
9714
+ - func: topk(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
9449
9715
  variants: method, function
9450
9716
  structured_delegate: topk.values
9451
9717
  dispatch:
@@ -9470,6 +9736,7 @@
9470
9736
  variants: method, function
9471
9737
  dispatch:
9472
9738
  SparseCPU, SparseCUDA: any_sparse
9739
+ tags: core
9473
9740
 
9474
9741
  - func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
9475
9742
  device_check: NoCheck
@@ -9483,6 +9750,7 @@
9483
9750
  structured: True
9484
9751
  dispatch:
9485
9752
  CPU, CUDA: renorm_out
9753
+ MPS: renorm_out_mps
9486
9754
 
9487
9755
  - func: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor
9488
9756
  device_check: NoCheck # TensorIterator
@@ -9537,6 +9805,7 @@
9537
9805
  structured: True
9538
9806
  dispatch:
9539
9807
  CPU, CUDA: pow_Scalar_out
9808
+ MPS: pow_Scalar_out_mps
9540
9809
  tags: pointwise
9541
9810
 
9542
9811
  - func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor
@@ -9611,6 +9880,7 @@
9611
9880
  MPS: normal_mps_
9612
9881
  Meta: normal_meta_
9613
9882
  SparseCsrCPU, SparseCsrCUDA: normal_sparse_csr_
9883
+ NestedTensorCPU, NestedTensorCUDA: normal_nested_
9614
9884
  autogen: normal.out
9615
9885
 
9616
9886
  # Only used by the functionalization pass.
@@ -9720,156 +9990,155 @@
9720
9990
  CUDA: foreach_tensor_add_scalar_kernel_cuda_
9721
9991
  autogen: _foreach_add.Scalar_out
9722
9992
 
9723
- - func: _foreach_sub.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
9993
+ - func: _foreach_add.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
9724
9994
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9725
9995
  variants: function
9726
9996
  dispatch:
9727
- CPU: foreach_tensor_sub_scalar_kernel_slow
9728
- CUDA: foreach_tensor_sub_scalar_kernel_cuda
9997
+ CPU: foreach_tensor_add_list_kernel_slow
9998
+ CUDA: foreach_tensor_add_list_kernel_cuda
9729
9999
 
9730
- - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10000
+ - func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
9731
10001
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9732
10002
  variants: function
9733
10003
  dispatch:
9734
- CPU: foreach_tensor_sub_scalar_kernel_slow_
9735
- CUDA: foreach_tensor_sub_scalar_kernel_cuda_
9736
- autogen: _foreach_sub.Scalar_out
10004
+ CPU: foreach_tensor_add_list_kernel_slow_
10005
+ CUDA: foreach_tensor_add_list_kernel_cuda_
10006
+ autogen: _foreach_add.List_out
9737
10007
 
9738
- - func: _foreach_mul.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
10008
+ - func: _foreach_add.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
9739
10009
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9740
10010
  variants: function
9741
10011
  dispatch:
9742
- CPU: foreach_tensor_mul_scalar_kernel_slow
9743
- CUDA: foreach_tensor_mul_scalar_kernel_cuda
10012
+ CPU: foreach_tensor_add_scalarlist_kernel_slow
10013
+ CUDA: foreach_tensor_add_scalarlist_kernel_cuda
9744
10014
 
9745
- - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10015
+ - func: _foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
9746
10016
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9747
10017
  variants: function
9748
10018
  dispatch:
9749
- CPU: foreach_tensor_mul_scalar_kernel_slow_
9750
- CUDA: foreach_tensor_mul_scalar_kernel_cuda_
9751
- autogen: _foreach_mul.Scalar_out
10019
+ CPU: foreach_tensor_add_scalarlist_kernel_slow_
10020
+ CUDA: foreach_tensor_add_scalarlist_kernel_cuda_
10021
+ autogen: _foreach_add.ScalarList_out
9752
10022
 
9753
- - func: _foreach_div.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
10023
+ - func: _foreach_sub.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
9754
10024
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9755
10025
  variants: function
9756
10026
  dispatch:
9757
- CPU: foreach_tensor_div_scalar_kernel_slow
9758
- CUDA: foreach_tensor_div_scalar_kernel_cuda
10027
+ CPU: foreach_tensor_sub_scalar_kernel_slow
10028
+ CUDA: foreach_tensor_sub_scalar_kernel_cuda
9759
10029
 
9760
- - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10030
+ - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
9761
10031
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9762
10032
  variants: function
9763
10033
  dispatch:
9764
- CPU: foreach_tensor_div_scalar_kernel_slow_
9765
- CUDA: foreach_tensor_div_scalar_kernel_cuda_
9766
- autogen: _foreach_div.Scalar_out
10034
+ CPU: foreach_tensor_sub_scalar_kernel_slow_
10035
+ CUDA: foreach_tensor_sub_scalar_kernel_cuda_
10036
+ autogen: _foreach_sub.Scalar_out
9767
10037
 
9768
- - func: _foreach_clamp_min.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
10038
+ - func: _foreach_sub.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
9769
10039
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9770
10040
  variants: function
9771
10041
  dispatch:
9772
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow
9773
- CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
10042
+ CPU: foreach_tensor_sub_list_kernel_slow
10043
+ CUDA: foreach_tensor_sub_list_kernel_cuda
9774
10044
 
9775
- - func: _foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10045
+ - func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
9776
10046
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9777
10047
  variants: function
9778
10048
  dispatch:
9779
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
9780
- CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
9781
- autogen: _foreach_clamp_min.Scalar_out
10049
+ CPU: foreach_tensor_sub_list_kernel_slow_
10050
+ CUDA: foreach_tensor_sub_list_kernel_cuda_
10051
+ autogen: _foreach_sub.List_out
9782
10052
 
9783
- - func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
10053
+ - func: _foreach_sub.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
9784
10054
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9785
10055
  variants: function
9786
10056
  dispatch:
9787
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow
9788
- CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
10057
+ CPU: foreach_tensor_sub_scalarlist_kernel_slow
10058
+ CUDA: foreach_tensor_sub_scalarlist_kernel_cuda
9789
10059
 
9790
- - func: _foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10060
+ - func: _foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
9791
10061
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9792
10062
  variants: function
9793
10063
  dispatch:
9794
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
9795
- CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
9796
- autogen: _foreach_clamp_max.Scalar_out
10064
+ CPU: foreach_tensor_sub_scalarlist_kernel_slow_
10065
+ CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_
10066
+ autogen: _foreach_sub.ScalarList_out
9797
10067
 
9798
- # foreach_minimum/maximum dispatches to clamp_max/min
9799
- - func: _foreach_maximum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
10068
+ - func: _foreach_mul.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
9800
10069
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9801
10070
  variants: function
9802
10071
  dispatch:
9803
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow
9804
- CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
10072
+ CPU: foreach_tensor_mul_scalar_kernel_slow
10073
+ CUDA: foreach_tensor_mul_scalar_kernel_cuda
9805
10074
 
9806
- - func: _foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10075
+ - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
9807
10076
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9808
10077
  variants: function
9809
10078
  dispatch:
9810
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
9811
- CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
9812
- autogen: _foreach_maximum.Scalar_out
10079
+ CPU: foreach_tensor_mul_scalar_kernel_slow_
10080
+ CUDA: foreach_tensor_mul_scalar_kernel_cuda_
10081
+ autogen: _foreach_mul.Scalar_out
9813
10082
 
9814
- - func: _foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
10083
+ - func: _foreach_mul.List(Tensor[] self, Tensor[] other) -> Tensor[]
9815
10084
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9816
10085
  variants: function
9817
10086
  dispatch:
9818
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow
9819
- CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
10087
+ CPU: foreach_tensor_mul_list_kernel_slow
10088
+ CUDA: foreach_tensor_mul_list_kernel_cuda
9820
10089
 
9821
- - func: _foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10090
+ - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> ()
9822
10091
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9823
10092
  variants: function
9824
10093
  dispatch:
9825
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
9826
- CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
9827
- autogen: _foreach_minimum.Scalar_out
10094
+ CPU: foreach_tensor_mul_list_kernel_slow_
10095
+ CUDA: foreach_tensor_mul_list_kernel_cuda_
10096
+ autogen: _foreach_mul.List_out
9828
10097
 
9829
- - func: _foreach_add.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
10098
+ - func: _foreach_mul.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
9830
10099
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9831
10100
  variants: function
9832
10101
  dispatch:
9833
- CPU: foreach_tensor_add_list_kernel_slow
9834
- CUDA: foreach_tensor_add_list_kernel_cuda
10102
+ CPU: foreach_tensor_mul_scalarlist_kernel_slow
10103
+ CUDA: foreach_tensor_mul_scalarlist_kernel_cuda
9835
10104
 
9836
- - func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
10105
+ - func: _foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
9837
10106
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9838
10107
  variants: function
9839
10108
  dispatch:
9840
- CPU: foreach_tensor_add_list_kernel_slow_
9841
- CUDA: foreach_tensor_add_list_kernel_cuda_
9842
- autogen: _foreach_add.List_out
10109
+ CPU: foreach_tensor_mul_scalarlist_kernel_slow_
10110
+ CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_
10111
+ autogen: _foreach_mul.ScalarList_out
9843
10112
 
9844
- - func: _foreach_sub.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
10113
+ - func: _foreach_mul.Tensor(Tensor[] self, Tensor other) -> Tensor[]
9845
10114
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9846
10115
  variants: function
9847
10116
  dispatch:
9848
- CPU: foreach_tensor_sub_list_kernel_slow
9849
- CUDA: foreach_tensor_sub_list_kernel_cuda
10117
+ CPU: foreach_tensor_mul_tensor_kernel_slow
10118
+ CUDA: foreach_tensor_mul_tensor_kernel_cuda
9850
10119
 
9851
- - func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
10120
+ - func: _foreach_mul_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
9852
10121
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9853
10122
  variants: function
9854
10123
  dispatch:
9855
- CPU: foreach_tensor_sub_list_kernel_slow_
9856
- CUDA: foreach_tensor_sub_list_kernel_cuda_
9857
- autogen: _foreach_sub.List_out
10124
+ CPU: foreach_tensor_mul_tensor_kernel_slow_
10125
+ CUDA: foreach_tensor_mul_tensor_kernel_cuda_
10126
+ autogen: _foreach_mul.Tensor_out
9858
10127
 
9859
- - func: _foreach_mul.List(Tensor[] self, Tensor[] other) -> Tensor[]
10128
+ - func: _foreach_div.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
9860
10129
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9861
10130
  variants: function
9862
10131
  dispatch:
9863
- CPU: foreach_tensor_mul_list_kernel_slow
9864
- CUDA: foreach_tensor_mul_list_kernel_cuda
10132
+ CPU: foreach_tensor_div_scalar_kernel_slow
10133
+ CUDA: foreach_tensor_div_scalar_kernel_cuda
9865
10134
 
9866
- - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10135
+ - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
9867
10136
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9868
10137
  variants: function
9869
10138
  dispatch:
9870
- CPU: foreach_tensor_mul_list_kernel_slow_
9871
- CUDA: foreach_tensor_mul_list_kernel_cuda_
9872
- autogen: _foreach_mul.List_out
10139
+ CPU: foreach_tensor_div_scalar_kernel_slow_
10140
+ CUDA: foreach_tensor_div_scalar_kernel_cuda_
10141
+ autogen: _foreach_div.Scalar_out
9873
10142
 
9874
10143
  - func: _foreach_div.List(Tensor[] self, Tensor[] other) -> Tensor[]
9875
10144
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
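
Besides reordering the _foreach_ block, this hunk adds a Tensor overload to _foreach_mul (and later hunks add Tensor-valued scalars overloads to _foreach_addcdiv/_foreach_addcmul). A hedged sketch of the private Python entry point, assuming the binding accepts a single tensor on the right-hand side as the schema suggests; this lets an entire tensor list be scaled by one device-resident value:

    import torch

    params = [torch.ones(2), torch.ones(3)]
    lr = torch.tensor(0.5)                    # 0-dim tensor, e.g. a learning rate kept on device
    scaled = torch._foreach_mul(params, lr)   # new Tensor overload (assumed binding)
    torch._foreach_mul_(params, lr)           # in-place variant
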
@@ -9886,20 +10155,35 @@
9886
10155
  CUDA: foreach_tensor_div_list_kernel_cuda_
9887
10156
  autogen: _foreach_div.List_out
9888
10157
 
9889
- - func: _foreach_clamp_min.List(Tensor[] self, Tensor[] other) -> Tensor[]
10158
+ - func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
9890
10159
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9891
10160
  variants: function
9892
10161
  dispatch:
9893
- CPU: foreach_tensor_clamp_min_list_kernel_slow
9894
- CUDA: foreach_tensor_clamp_min_list_kernel_cuda
10162
+ CPU: foreach_tensor_div_scalarlist_kernel_slow
10163
+ CUDA: foreach_tensor_div_scalarlist_kernel_cuda
9895
10164
 
9896
- - func: _foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10165
+ - func: _foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
9897
10166
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9898
10167
  variants: function
9899
10168
  dispatch:
9900
- CPU: foreach_tensor_clamp_min_list_kernel_slow_
9901
- CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
9902
- autogen: _foreach_clamp_min.List_out
10169
+ CPU: foreach_tensor_div_scalarlist_kernel_slow_
10170
+ CUDA: foreach_tensor_div_scalarlist_kernel_cuda_
10171
+ autogen: _foreach_div.ScalarList_out
10172
+
10173
+ - func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
10174
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10175
+ variants: function
10176
+ dispatch:
10177
+ CPU: foreach_tensor_clamp_max_scalar_kernel_slow
10178
+ CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
10179
+
10180
+ - func: _foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10181
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10182
+ variants: function
10183
+ dispatch:
10184
+ CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
10185
+ CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
10186
+ autogen: _foreach_clamp_max.Scalar_out
9903
10187
 
9904
10188
  - func: _foreach_clamp_max.List(Tensor[] self, Tensor[] other) -> Tensor[]
9905
10189
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -9916,143 +10200,143 @@
9916
10200
  CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
9917
10201
  autogen: _foreach_clamp_max.List_out
9918
10202
 
9919
- # foreach_minimum/maximum dispatches to clamp_max/min
9920
- - func: _foreach_maximum.List(Tensor[] self, Tensor[] other) -> Tensor[]
10203
+ - func: _foreach_clamp_max.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
9921
10204
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9922
10205
  variants: function
9923
10206
  dispatch:
9924
- CPU: foreach_tensor_clamp_min_list_kernel_slow
9925
- CUDA: foreach_tensor_clamp_min_list_kernel_cuda
10207
+ CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow
10208
+ CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda
9926
10209
 
9927
- - func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10210
+ - func: _foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
9928
10211
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9929
10212
  variants: function
9930
10213
  dispatch:
9931
- CPU: foreach_tensor_clamp_min_list_kernel_slow_
9932
- CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
9933
- autogen: _foreach_maximum.List_out
10214
+ CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
10215
+ CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
10216
+ autogen: _foreach_clamp_max.ScalarList_out
9934
10217
 
9935
- - func: _foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[]
10218
+ - func: _foreach_clamp_min.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
9936
10219
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9937
10220
  variants: function
9938
10221
  dispatch:
9939
- CPU: foreach_tensor_clamp_max_list_kernel_slow
9940
- CUDA: foreach_tensor_clamp_max_list_kernel_cuda
10222
+ CPU: foreach_tensor_clamp_min_scalar_kernel_slow
10223
+ CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
9941
10224
 
9942
- - func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10225
+ - func: _foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
9943
10226
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9944
10227
  variants: function
9945
10228
  dispatch:
9946
- CPU: foreach_tensor_clamp_max_list_kernel_slow_
9947
- CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
9948
- autogen: _foreach_minimum.List_out
9949
-
10229
+ CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
10230
+ CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
10231
+ autogen: _foreach_clamp_min.Scalar_out
9950
10232
 
9951
- - func: _foreach_add.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
10233
+ - func: _foreach_clamp_min.List(Tensor[] self, Tensor[] other) -> Tensor[]
9952
10234
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9953
10235
  variants: function
9954
10236
  dispatch:
9955
- CPU: foreach_tensor_add_scalarlist_kernel_slow
9956
- CUDA: foreach_tensor_add_scalarlist_kernel_cuda
10237
+ CPU: foreach_tensor_clamp_min_list_kernel_slow
10238
+ CUDA: foreach_tensor_clamp_min_list_kernel_cuda
9957
10239
 
9958
- - func: _foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10240
+ - func: _foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> ()
9959
10241
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9960
10242
  variants: function
9961
10243
  dispatch:
9962
- CPU: foreach_tensor_add_scalarlist_kernel_slow_
9963
- CUDA: foreach_tensor_add_scalarlist_kernel_cuda_
9964
- autogen: _foreach_add.ScalarList_out
10244
+ CPU: foreach_tensor_clamp_min_list_kernel_slow_
10245
+ CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
10246
+ autogen: _foreach_clamp_min.List_out
9965
10247
 
9966
- - func: _foreach_sub.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
10248
+ - func: _foreach_clamp_min.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
9967
10249
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9968
10250
  variants: function
9969
10251
  dispatch:
9970
- CPU: foreach_tensor_sub_scalarlist_kernel_slow
9971
- CUDA: foreach_tensor_sub_scalarlist_kernel_cuda
10252
+ CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
10253
+ CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
9972
10254
 
9973
- - func: _foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10255
+ - func: _foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
9974
10256
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9975
10257
  variants: function
9976
10258
  dispatch:
9977
- CPU: foreach_tensor_sub_scalarlist_kernel_slow_
9978
- CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_
9979
- autogen: _foreach_sub.ScalarList_out
10259
+ CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
10260
+ CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
10261
+ autogen: _foreach_clamp_min.ScalarList_out
9980
10262
 
9981
- - func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
10263
+ # foreach_minimum/maximum dispatches to clamp_max/min
10264
+ - func: _foreach_maximum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
9982
10265
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9983
10266
  variants: function
9984
10267
  dispatch:
9985
- CPU: foreach_tensor_div_scalarlist_kernel_slow
9986
- CUDA: foreach_tensor_div_scalarlist_kernel_cuda
10268
+ CPU: foreach_tensor_clamp_min_scalar_kernel_slow
10269
+ CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
9987
10270
 
9988
- - func: _foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10271
+ - func: _foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
9989
10272
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9990
10273
  variants: function
9991
10274
  dispatch:
9992
- CPU: foreach_tensor_div_scalarlist_kernel_slow_
9993
- CUDA: foreach_tensor_div_scalarlist_kernel_cuda_
9994
- autogen: _foreach_div.ScalarList_out
10275
+ CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
10276
+ CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
10277
+ autogen: _foreach_maximum.Scalar_out
9995
10278
 
9996
- - func: _foreach_mul.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
10279
+ # foreach_minimum/maximum dispatches to clamp_max/min
10280
+ - func: _foreach_maximum.List(Tensor[] self, Tensor[] other) -> Tensor[]
9997
10281
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9998
10282
  variants: function
9999
10283
  dispatch:
10000
- CPU: foreach_tensor_mul_scalarlist_kernel_slow
10001
- CUDA: foreach_tensor_mul_scalarlist_kernel_cuda
10284
+ CPU: foreach_tensor_clamp_min_list_kernel_slow
10285
+ CUDA: foreach_tensor_clamp_min_list_kernel_cuda
10002
10286
 
10003
- - func: _foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10287
+ - func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10004
10288
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10005
10289
  variants: function
10006
10290
  dispatch:
10007
- CPU: foreach_tensor_mul_scalarlist_kernel_slow_
10008
- CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_
10009
- autogen: _foreach_mul.ScalarList_out
10291
+ CPU: foreach_tensor_clamp_min_list_kernel_slow_
10292
+ CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
10293
+ autogen: _foreach_maximum.List_out
10010
10294
 
10011
- - func: _foreach_clamp_min.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
10295
+ # foreach_minimum/maximum dispatches to clamp_max/min
10296
+ - func: _foreach_maximum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
10012
10297
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10013
10298
  variants: function
10014
10299
  dispatch:
10015
10300
  CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
10016
10301
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
10017
10302
 
10018
- - func: _foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10303
+ - func: _foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10019
10304
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10020
10305
  variants: function
10021
10306
  dispatch:
10022
10307
  CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
10023
10308
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
10024
- autogen: _foreach_clamp_min.ScalarList_out
10309
+ autogen: _foreach_maximum.ScalarList_out
10025
10310
 
10026
- - func: _foreach_clamp_max.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
10311
+ - func: _foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
10027
10312
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10028
10313
  variants: function
10029
10314
  dispatch:
10030
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow
10031
- CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda
10315
+ CPU: foreach_tensor_clamp_max_scalar_kernel_slow
10316
+ CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
10032
10317
 
10033
- - func: _foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10318
+ - func: _foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10034
10319
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10035
10320
  variants: function
10036
10321
  dispatch:
10037
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
10038
- CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
10039
- autogen: _foreach_clamp_max.ScalarList_out
10322
+ CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
10323
+ CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
10324
+ autogen: _foreach_minimum.Scalar_out
10040
10325
 
10041
- # foreach_minimum/maximum dispatches to clamp_max/min
10042
- - func: _foreach_maximum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
10326
+ - func: _foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[]
10043
10327
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10044
10328
  variants: function
10045
10329
  dispatch:
10046
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
10047
- CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
10330
+ CPU: foreach_tensor_clamp_max_list_kernel_slow
10331
+ CUDA: foreach_tensor_clamp_max_list_kernel_cuda
10048
10332
 
10049
- - func: _foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10333
+ - func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10050
10334
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10051
10335
  variants: function
10052
10336
  dispatch:
10053
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
10054
- CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
10055
- autogen: _foreach_maximum.ScalarList_out
10337
+ CPU: foreach_tensor_clamp_max_list_kernel_slow_
10338
+ CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
10339
+ autogen: _foreach_minimum.List_out
10056
10340
 
10057
10341
  - func: _foreach_minimum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
10058
10342
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
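
The comment retained on the _foreach_maximum/_foreach_minimum entries says they dispatch to the clamp kernels: the Scalar overloads compute elementwise max(t, s) / min(t, s), which is exactly clamp_min / clamp_max. A short sketch of that equivalence, checked against the public per-tensor ops:

    import torch

    ts = [torch.tensor([-1.0, 2.0]), torch.tensor([0.5, -3.0])]
    a = torch._foreach_maximum(ts, 0.0)    # Scalar overload
    b = [t.clamp_min(0.0) for t in ts]     # same values, per the dispatch comment
    assert all(torch.equal(x, y) for x, y in zip(a, b))
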
@@ -10069,43 +10353,95 @@
10069
10353
  CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
10070
10354
  autogen: _foreach_minimum.ScalarList_out
10071
10355
 
10072
- - func: _foreach_exp(Tensor[] self) -> Tensor[]
10356
+ - func: _foreach_addcdiv.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
10073
10357
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10074
10358
  variants: function
10075
10359
  dispatch:
10076
- CPU: foreach_tensor_exp_slow
10077
- CUDA: foreach_tensor_exp_cuda
10360
+ CPU: foreach_tensor_addcdiv_scalar_slow
10361
+ CUDA: foreach_tensor_addcdiv_scalar_cuda
10078
10362
 
10079
- - func: _foreach_zero_(Tensor(a!)[] self) -> ()
10363
+ - func: _foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
10080
10364
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10081
10365
  variants: function
10082
10366
  dispatch:
10083
- CPU: foreach_tensor_zero_slow_
10084
- CUDA: foreach_tensor_zero_cuda_
10085
- autogen: _foreach_zero, _foreach_zero.out
10367
+ CPU: foreach_tensor_addcdiv_scalarlist_slow
10368
+ CUDA: foreach_tensor_addcdiv_scalarlist_cuda
10086
10369
 
10087
- - func: _foreach_exp_(Tensor(a!)[] self) -> ()
10370
+ - func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
10088
10371
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10089
10372
  variants: function
10090
10373
  dispatch:
10091
- CPU: foreach_tensor_exp_slow_
10092
- CUDA: foreach_tensor_exp_cuda_
10093
- autogen: _foreach_exp.out
10374
+ CPU: foreach_tensor_addcdiv_tensor_slow
10375
+ CUDA: foreach_tensor_addcdiv_tensor_cuda
10094
10376
 
10095
- - func: _foreach_sqrt(Tensor[] self) -> Tensor[]
10377
+ - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
10096
10378
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10097
10379
  variants: function
10098
10380
  dispatch:
10099
- CPU: foreach_tensor_sqrt_slow
10100
- CUDA: foreach_tensor_sqrt_cuda
10381
+ CPU: foreach_tensor_addcdiv_scalar_slow_
10382
+ CUDA: foreach_tensor_addcdiv_scalar_cuda_
10383
+ autogen: _foreach_addcdiv.Scalar_out
10101
10384
 
10102
- - func: _foreach_sqrt_(Tensor(a!)[] self) -> ()
10385
+ - func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
10103
10386
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10104
10387
  variants: function
10105
10388
  dispatch:
10106
- CPU: foreach_tensor_sqrt_slow_
10107
- CUDA: foreach_tensor_sqrt_cuda_
10108
- autogen: _foreach_sqrt.out
10389
+ CPU: foreach_tensor_addcdiv_scalarlist_slow_
10390
+ CUDA: foreach_tensor_addcdiv_scalarlist_cuda_
10391
+ autogen: _foreach_addcdiv.ScalarList_out
10392
+
10393
+ - func: _foreach_addcdiv_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
10394
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10395
+ variants: function
10396
+ dispatch:
10397
+ CPU: foreach_tensor_addcdiv_tensor_slow_
10398
+ CUDA: foreach_tensor_addcdiv_tensor_cuda_
10399
+ autogen: _foreach_addcdiv.Tensor_out
10400
+
10401
+ - func: _foreach_addcmul.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
10402
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10403
+ variants: function
10404
+ dispatch:
10405
+ CPU: foreach_tensor_addcmul_scalar_slow
10406
+ CUDA: foreach_tensor_addcmul_scalar_cuda
10407
+
10408
+ - func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
10409
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10410
+ variants: function
10411
+ dispatch:
10412
+ CPU: foreach_tensor_addcmul_scalarlist_slow
10413
+ CUDA: foreach_tensor_addcmul_scalarlist_cuda
10414
+
10415
+ - func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
10416
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10417
+ variants: function
10418
+ dispatch:
10419
+ CPU: foreach_tensor_addcmul_tensor_slow
10420
+ CUDA: foreach_tensor_addcmul_tensor_cuda
10421
+
10422
+ - func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
10423
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10424
+ variants: function
10425
+ dispatch:
10426
+ CPU: foreach_tensor_addcmul_scalar_slow_
10427
+ CUDA: foreach_tensor_addcmul_scalar_cuda_
10428
+ autogen: _foreach_addcmul.Scalar_out
10429
+
10430
+ - func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
10431
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10432
+ variants: function
10433
+ dispatch:
10434
+ CPU: foreach_tensor_addcmul_scalarlist_slow_
10435
+ CUDA: foreach_tensor_addcmul_scalarlist_cuda_
10436
+ autogen: _foreach_addcmul.ScalarList_out
10437
+
10438
+ - func: _foreach_addcmul_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
10439
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10440
+ variants: function
10441
+ dispatch:
10442
+ CPU: foreach_tensor_addcmul_tensor_slow_
10443
+ CUDA: foreach_tensor_addcmul_tensor_cuda_
10444
+ autogen: _foreach_addcmul.Tensor_out
10109
10445
 
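The `_foreach_addcdiv` / `_foreach_addcmul` entries regrouped above (Scalar, ScalarList and Tensor-of-scalars overloads, each with an autogen'd out variant) apply a fused pointwise update to an entire list of tensors in one call, which is the primitive fused optimizers build on. A minimal sketch using the upstream PyTorch Python bindings generated from these schemas (the private `torch._foreach_*` functions; torch-rb itself may not expose them):

    import torch

    # Adam-style second-moment and parameter update over a list of tensors,
    # batched through the private torch._foreach_* bindings.
    params      = [torch.randn(3, 3) for _ in range(4)]
    grads       = [torch.randn_like(p) for p in params]
    exp_avg_sqs = [torch.zeros_like(p) for p in params]

    # exp_avg_sq += 0.1 * grad * grad, for every tensor in one call
    torch._foreach_addcmul_(exp_avg_sqs, grads, grads, value=0.1)

    # param -= lr * grad / (sqrt(exp_avg_sq) + eps), sketched with foreach ops
    denoms = torch._foreach_sqrt(exp_avg_sqs)
    torch._foreach_add_(denoms, 1e-8)
    torch._foreach_addcdiv_(params, grads, denoms, value=-1e-3)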
10110
10446
  - func: _foreach_abs(Tensor[] self) -> Tensor[]
10111
10447
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -10242,6 +10578,21 @@
10242
10578
  CUDA: foreach_tensor_erfc_cuda_
10243
10579
  autogen: _foreach_erfc.out
10244
10580
 
10581
+ - func: _foreach_exp(Tensor[] self) -> Tensor[]
10582
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10583
+ variants: function
10584
+ dispatch:
10585
+ CPU: foreach_tensor_exp_slow
10586
+ CUDA: foreach_tensor_exp_cuda
10587
+
10588
+ - func: _foreach_exp_(Tensor(a!)[] self) -> ()
10589
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10590
+ variants: function
10591
+ dispatch:
10592
+ CPU: foreach_tensor_exp_slow_
10593
+ CUDA: foreach_tensor_exp_cuda_
10594
+ autogen: _foreach_exp.out
10595
+
10245
10596
  - func: _foreach_expm1(Tensor[] self) -> Tensor[]
10246
10597
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10247
10598
  variants: function
@@ -10272,6 +10623,68 @@
10272
10623
  CUDA: foreach_tensor_floor_cuda_
10273
10624
  autogen: _foreach_floor.out
10274
10625
 
10626
+ - func: _foreach_frac(Tensor[] self) -> Tensor[]
10627
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10628
+ variants: function
10629
+ dispatch:
10630
+ CPU: foreach_tensor_frac_slow
10631
+ CUDA: foreach_tensor_frac_cuda
10632
+
10633
+ - func: _foreach_frac_(Tensor(a!)[] self) -> ()
10634
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10635
+ variants: function
10636
+ dispatch:
10637
+ CPU: foreach_tensor_frac_slow_
10638
+ CUDA: foreach_tensor_frac_cuda_
10639
+ autogen: _foreach_frac.out
10640
+
10641
+ - func: _foreach_lerp.List(Tensor[] self, Tensor[] tensors1, Tensor[] weights) -> Tensor[]
10642
+ device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10643
+ variants: function
10644
+ dispatch:
10645
+ CPU: foreach_tensor_ternary_lerp_slow
10646
+ CUDA: foreach_tensor_lerp_ternary_cuda
10647
+ autogen: _foreach_lerp.List_out
10648
+
10649
+ - func: _foreach_lerp_.List(Tensor(a!)[] self, Tensor[] tensors1, Tensor[] weights) -> ()
10650
+ device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10651
+ variants: function
10652
+ dispatch:
10653
+ CPU: foreach_tensor_ternary_lerp_slow_
10654
+ CUDA: foreach_tensor_lerp_ternary_cuda_
10655
+ autogen: _foreach_lerp.List_out
10656
+
10657
+ - func: _foreach_lerp.Scalar(Tensor[] self, Tensor[] tensors1, Scalar weight) -> Tensor[]
10658
+ device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10659
+ variants: function
10660
+ dispatch:
10661
+ CPU: foreach_tensor_lerp_list_kernel_slow
10662
+ CUDA: foreach_tensor_lerp_list_cuda
10663
+ autogen: _foreach_lerp.Scalar_out
10664
+
10665
+ - func: _foreach_lerp_.Scalar(Tensor(a!)[] self, Tensor[] tensors1, Scalar weight) -> ()
10666
+ device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10667
+ variants: function
10668
+ dispatch:
10669
+ CPU: foreach_tensor_lerp_list_kernel_slow_
10670
+ CUDA: foreach_tensor_lerp_list_cuda_
10671
+ autogen: _foreach_lerp.Scalar_out
10672
+
10673
+ - func: _foreach_lgamma(Tensor[] self) -> Tensor[]
10674
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10675
+ variants: function
10676
+ dispatch:
10677
+ CPU: foreach_tensor_lgamma_slow
10678
+ CUDA: foreach_tensor_lgamma_cuda
10679
+
10680
+ - func: _foreach_lgamma_(Tensor(a!)[] self) -> ()
10681
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10682
+ variants: function
10683
+ dispatch:
10684
+ CPU: foreach_tensor_lgamma_slow_
10685
+ CUDA: foreach_tensor_lgamma_cuda_
10686
+ autogen: _foreach_lgamma.out
10687
+
10275
10688
  - func: _foreach_log(Tensor[] self) -> Tensor[]
10276
10689
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10277
10690
  variants: function
@@ -10347,110 +10760,65 @@
10347
10760
  CUDA: foreach_tensor_neg_cuda_
10348
10761
  autogen: _foreach_neg.out
10349
10762
 
10350
- - func: _foreach_tan(Tensor[] self) -> Tensor[]
10351
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10352
- variants: function
10353
- dispatch:
10354
- CPU: foreach_tensor_tan_slow
10355
- CUDA: foreach_tensor_tan_cuda
10356
-
10357
- - func: _foreach_tan_(Tensor(a!)[] self) -> ()
10358
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10359
- variants: function
10360
- dispatch:
10361
- CPU: foreach_tensor_tan_slow_
10362
- CUDA: foreach_tensor_tan_cuda_
10363
- autogen: _foreach_tan.out
10364
-
10365
- - func: _foreach_tanh(Tensor[] self) -> Tensor[]
10366
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10367
- variants: function
10368
- dispatch:
10369
- CPU: foreach_tensor_tanh_slow
10370
- CUDA: foreach_tensor_tanh_cuda
10371
-
10372
- - func: _foreach_tanh_(Tensor(a!)[] self) -> ()
10373
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10374
- variants: function
10375
- dispatch:
10376
- CPU: foreach_tensor_tanh_slow_
10377
- CUDA: foreach_tensor_tanh_cuda_
10378
- autogen: _foreach_tanh.out
10379
-
10380
- - func: _foreach_sin(Tensor[] self) -> Tensor[]
10381
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10382
- variants: function
10383
- dispatch:
10384
- CPU: foreach_tensor_sin_slow
10385
- CUDA: foreach_tensor_sin_cuda
10386
-
10387
- - func: _foreach_sin_(Tensor(a!)[] self) -> ()
10388
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10389
- variants: function
10390
- dispatch:
10391
- CPU: foreach_tensor_sin_slow_
10392
- CUDA: foreach_tensor_sin_cuda_
10393
- autogen: _foreach_sin.out
10394
-
10395
- - func: _foreach_sinh(Tensor[] self) -> Tensor[]
10763
+ - func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[]
10396
10764
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10397
10765
  variants: function
10398
10766
  dispatch:
10399
- CPU: foreach_tensor_sinh_slow
10400
- CUDA: foreach_tensor_sinh_cuda
10767
+ CPU: foreach_tensor_norm_slow
10768
+ CUDA: foreach_tensor_norm_cuda
10769
+ autogen: _foreach_norm.Scalar_out
10401
10770
 
10402
- - func: _foreach_sinh_(Tensor(a!)[] self) -> ()
10771
+ - func: _foreach_pow.List(Tensor[] self, Tensor[] exponent) -> Tensor[]
10403
10772
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10404
10773
  variants: function
10405
10774
  dispatch:
10406
- CPU: foreach_tensor_sinh_slow_
10407
- CUDA: foreach_tensor_sinh_cuda_
10408
- autogen: _foreach_sinh.out
10775
+ CPU: foreach_tensor_pow_list_kernel_slow
10776
+ CUDA: foreach_tensor_pow_list_kernel_cuda
10409
10777
 
10410
- - func: _foreach_round(Tensor[] self) -> Tensor[]
10778
+ - func: _foreach_pow.Scalar(Tensor[] self, Scalar exponent) -> Tensor[]
10411
10779
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10412
10780
  variants: function
10413
10781
  dispatch:
10414
- CPU: foreach_tensor_round_slow
10415
- CUDA: foreach_tensor_round_cuda
10782
+ CPU: foreach_tensor_pow_scalar_kernel_slow
10783
+ CUDA: foreach_tensor_pow_scalar_kernel_cuda
10416
10784
 
10417
- - func: _foreach_round_(Tensor(a!)[] self) -> ()
10785
+ - func: _foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[]
10418
10786
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10419
10787
  variants: function
10420
10788
  dispatch:
10421
- CPU: foreach_tensor_round_slow_
10422
- CUDA: foreach_tensor_round_cuda_
10423
- autogen: _foreach_round.out
10789
+ CPU: foreach_tensor_pow_scalarlist_kernel_slow
10790
+ CUDA: foreach_tensor_pow_scalarlist_kernel_cuda
10424
10791
 
10425
- - func: _foreach_lgamma(Tensor[] self) -> Tensor[]
10792
+ - func: _foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[]
10426
10793
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10427
10794
  variants: function
10428
10795
  dispatch:
10429
- CPU: foreach_tensor_lgamma_slow
10430
- CUDA: foreach_tensor_lgamma_cuda
10796
+ CPU: foreach_scalar_pow_list_kernel_slow
10797
+ CUDA: foreach_scalar_pow_list_kernel_cuda
10431
10798
 
10432
- - func: _foreach_lgamma_(Tensor(a!)[] self) -> ()
10433
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10799
+ - func: _foreach_pow_.List(Tensor(a!)[] self, Tensor[] exponent) -> ()
10800
+ device_check: NoCheck
10434
10801
  variants: function
10435
10802
  dispatch:
10436
- CPU: foreach_tensor_lgamma_slow_
10437
- CUDA: foreach_tensor_lgamma_cuda_
10438
- autogen: _foreach_lgamma.out
10803
+ CPU: foreach_tensor_pow_list_kernel_slow_
10804
+ CUDA: foreach_tensor_pow_list_kernel_cuda_
10805
+ autogen: _foreach_pow.List_out
10439
10806
 
10440
- - func: _foreach_frac(Tensor[] self) -> Tensor[]
10441
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10807
+ - func: _foreach_pow_.Scalar(Tensor(a!)[] self, Scalar exponent) -> ()
10808
+ device_check: NoCheck
10442
10809
  variants: function
10443
10810
  dispatch:
10444
- CPU: foreach_tensor_frac_slow
10445
- CUDA: foreach_tensor_frac_cuda
10811
+ CPU: foreach_tensor_pow_scalar_kernel_slow_
10812
+ CUDA: foreach_tensor_pow_scalar_kernel_cuda_
10813
+ autogen: _foreach_pow.Scalar_out
10446
10814
 
10447
- - func: _foreach_frac_(Tensor(a!)[] self) -> ()
10448
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10815
+ - func: _foreach_pow_.ScalarList(Tensor(a!)[] self, Scalar[] exponent) -> ()
10816
+ device_check: NoCheck
10449
10817
  variants: function
10450
10818
  dispatch:
10451
- CPU: foreach_tensor_frac_slow_
10452
- CUDA: foreach_tensor_frac_cuda_
10453
- autogen: _foreach_frac.out
10819
+ CPU: foreach_tensor_pow_scalarlist_kernel_slow_
10820
+ CUDA: foreach_tensor_pow_scalarlist_kernel_cuda_
10821
+ autogen: _foreach_pow.ScalarList_out
10454
10822
 
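The `_foreach_pow` overloads added here (List, Scalar, ScalarList, ScalarAndTensor, plus in-place variants), together with the relocated `_foreach_norm.Scalar`, batch per-tensor norms and exponentiation over tensor lists — the kind of primitive gradient clipping and optimizers rely on. A hedged sketch with the corresponding private PyTorch Python bindings:

    import torch

    grads = [torch.randn(10), torch.randn(5, 5)]

    # One 0-dim result per input tensor: the per-tensor L2 norms
    norms = torch._foreach_norm(grads, ord=2)

    # Raise every tensor in the list to a power, in place (the new .Scalar overload)
    torch._foreach_pow_(grads, 2)

    # One exponent per tensor (ScalarList overload), out of place
    powered = torch._foreach_pow(grads, [2, 3])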
10455
10823
  - func: _foreach_reciprocal(Tensor[] self) -> Tensor[]
10456
10824
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -10467,6 +10835,21 @@
10467
10835
  CUDA: foreach_tensor_reciprocal_cuda_
10468
10836
  autogen: _foreach_reciprocal.out
10469
10837
 
10838
+ - func: _foreach_round(Tensor[] self) -> Tensor[]
10839
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10840
+ variants: function
10841
+ dispatch:
10842
+ CPU: foreach_tensor_round_slow
10843
+ CUDA: foreach_tensor_round_cuda
10844
+
10845
+ - func: _foreach_round_(Tensor(a!)[] self) -> ()
10846
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10847
+ variants: function
10848
+ dispatch:
10849
+ CPU: foreach_tensor_round_slow_
10850
+ CUDA: foreach_tensor_round_cuda_
10851
+ autogen: _foreach_round.out
10852
+
10470
10853
  - func: _foreach_sigmoid(Tensor[] self) -> Tensor[]
10471
10854
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10472
10855
  variants: function
@@ -10482,150 +10865,126 @@
10482
10865
  CUDA: foreach_tensor_sigmoid_cuda_
10483
10866
  autogen: _foreach_sigmoid.out
10484
10867
 
10485
- - func: _foreach_trunc(Tensor[] self) -> Tensor[]
10868
+ - func: _foreach_sign(Tensor[] self) -> Tensor[]
10486
10869
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10487
10870
  variants: function
10488
10871
  dispatch:
10489
- CPU: foreach_tensor_trunc_slow
10490
- CUDA: foreach_tensor_trunc_cuda
10872
+ CPU: foreach_tensor_sign_slow
10873
+ CUDA: foreach_tensor_sign_cuda
10491
10874
 
10492
- - func: _foreach_trunc_(Tensor(a!)[] self) -> ()
10875
+ - func: _foreach_sign_(Tensor(a!)[] self) -> ()
10493
10876
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10494
10877
  variants: function
10495
10878
  dispatch:
10496
- CPU: foreach_tensor_trunc_slow_
10497
- CUDA: foreach_tensor_trunc_cuda_
10498
- autogen: _foreach_trunc.out
10879
+ CPU: foreach_tensor_sign_slow_
10880
+ CUDA: foreach_tensor_sign_cuda_
10881
+ autogen: _foreach_sign.out
10499
10882
 
10500
- - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
10883
+ - func: _foreach_sin(Tensor[] self) -> Tensor[]
10501
10884
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10502
10885
  variants: function
10503
10886
  dispatch:
10504
- CPU: foreach_tensor_addcdiv_scalar_slow_
10505
- CUDA: foreach_tensor_addcdiv_scalar_cuda_
10506
- autogen: _foreach_addcdiv.Scalar_out
10887
+ CPU: foreach_tensor_sin_slow
10888
+ CUDA: foreach_tensor_sin_cuda
10507
10889
 
10508
- - func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
10890
+ - func: _foreach_sin_(Tensor(a!)[] self) -> ()
10509
10891
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10510
10892
  variants: function
10511
10893
  dispatch:
10512
- CPU: foreach_tensor_addcmul_scalar_slow_
10513
- CUDA: foreach_tensor_addcmul_scalar_cuda_
10514
- autogen: _foreach_addcmul.Scalar_out
10894
+ CPU: foreach_tensor_sin_slow_
10895
+ CUDA: foreach_tensor_sin_cuda_
10896
+ autogen: _foreach_sin.out
10515
10897
 
10516
- - func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
10898
+ - func: _foreach_sinh(Tensor[] self) -> Tensor[]
10517
10899
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10518
10900
  variants: function
10519
10901
  dispatch:
10520
- CPU: foreach_tensor_addcdiv_scalarlist_slow_
10521
- CUDA: foreach_tensor_addcdiv_scalarlist_cuda_
10522
- autogen: _foreach_addcdiv.ScalarList_out
10902
+ CPU: foreach_tensor_sinh_slow
10903
+ CUDA: foreach_tensor_sinh_cuda
10523
10904
 
10524
- - func: _foreach_addcdiv_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
10905
+ - func: _foreach_sinh_(Tensor(a!)[] self) -> ()
10525
10906
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10526
10907
  variants: function
10527
10908
  dispatch:
10528
- CPU: foreach_tensor_addcdiv_tensor_slow_
10529
- CUDA: foreach_tensor_addcdiv_tensor_cuda_
10530
- autogen: _foreach_addcdiv.Tensor_out
10909
+ CPU: foreach_tensor_sinh_slow_
10910
+ CUDA: foreach_tensor_sinh_cuda_
10911
+ autogen: _foreach_sinh.out
10531
10912
 
10532
- - func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
10913
+ - func: _foreach_sqrt(Tensor[] self) -> Tensor[]
10533
10914
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10534
10915
  variants: function
10535
10916
  dispatch:
10536
- CPU: foreach_tensor_addcmul_scalarlist_slow_
10537
- CUDA: foreach_tensor_addcmul_scalarlist_cuda_
10538
- autogen: _foreach_addcmul.ScalarList_out
10917
+ CPU: foreach_tensor_sqrt_slow
10918
+ CUDA: foreach_tensor_sqrt_cuda
10539
10919
 
10540
- - func: _foreach_addcmul_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
10920
+ - func: _foreach_sqrt_(Tensor(a!)[] self) -> ()
10541
10921
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10542
10922
  variants: function
10543
10923
  dispatch:
10544
- CPU: foreach_tensor_addcmul_tensor_slow_
10545
- CUDA: foreach_tensor_addcmul_tensor_cuda_
10546
- autogen: _foreach_addcmul.Tensor_out
10924
+ CPU: foreach_tensor_sqrt_slow_
10925
+ CUDA: foreach_tensor_sqrt_cuda_
10926
+ autogen: _foreach_sqrt.out
10547
10927
 
10548
- - func: _foreach_addcdiv.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
10928
+ - func: _foreach_tan(Tensor[] self) -> Tensor[]
10549
10929
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10550
10930
  variants: function
10551
10931
  dispatch:
10552
- CPU: foreach_tensor_addcdiv_scalar_slow
10553
- CUDA: foreach_tensor_addcdiv_scalar_cuda
10932
+ CPU: foreach_tensor_tan_slow
10933
+ CUDA: foreach_tensor_tan_cuda
10554
10934
 
10555
- - func: _foreach_addcmul.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
10935
+ - func: _foreach_tan_(Tensor(a!)[] self) -> ()
10556
10936
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10557
10937
  variants: function
10558
10938
  dispatch:
10559
- CPU: foreach_tensor_addcmul_scalar_slow
10560
- CUDA: foreach_tensor_addcmul_scalar_cuda
10939
+ CPU: foreach_tensor_tan_slow_
10940
+ CUDA: foreach_tensor_tan_cuda_
10941
+ autogen: _foreach_tan.out
10561
10942
 
10562
- - func: _foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
10943
+ - func: _foreach_tanh(Tensor[] self) -> Tensor[]
10563
10944
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10564
10945
  variants: function
10565
10946
  dispatch:
10566
- CPU: foreach_tensor_addcdiv_scalarlist_slow
10567
- CUDA: foreach_tensor_addcdiv_scalarlist_cuda
10947
+ CPU: foreach_tensor_tanh_slow
10948
+ CUDA: foreach_tensor_tanh_cuda
10568
10949
 
10569
- - func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
10950
+ - func: _foreach_tanh_(Tensor(a!)[] self) -> ()
10570
10951
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10571
10952
  variants: function
10572
10953
  dispatch:
10573
- CPU: foreach_tensor_addcdiv_tensor_slow
10574
- CUDA: foreach_tensor_addcdiv_tensor_cuda
10954
+ CPU: foreach_tensor_tanh_slow_
10955
+ CUDA: foreach_tensor_tanh_cuda_
10956
+ autogen: _foreach_tanh.out
10575
10957
 
10576
- - func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
10958
+ - func: _foreach_trunc(Tensor[] self) -> Tensor[]
10577
10959
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10578
10960
  variants: function
10579
10961
  dispatch:
10580
- CPU: foreach_tensor_addcmul_scalarlist_slow
10581
- CUDA: foreach_tensor_addcmul_scalarlist_cuda
10962
+ CPU: foreach_tensor_trunc_slow
10963
+ CUDA: foreach_tensor_trunc_cuda
10582
10964
 
10583
- - func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
10965
+ - func: _foreach_trunc_(Tensor(a!)[] self) -> ()
10584
10966
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10585
10967
  variants: function
10586
10968
  dispatch:
10587
- CPU: foreach_tensor_addcmul_tensor_slow
10588
- CUDA: foreach_tensor_addcmul_tensor_cuda
10969
+ CPU: foreach_tensor_trunc_slow_
10970
+ CUDA: foreach_tensor_trunc_cuda_
10971
+ autogen: _foreach_trunc.out
10589
10972
 
10590
- - func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[]
10973
+ - func: _foreach_zero_(Tensor(a!)[] self) -> ()
10591
10974
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10592
10975
  variants: function
10593
10976
  dispatch:
10594
- CPU: foreach_tensor_norm_slow
10595
- CUDA: foreach_tensor_norm_cuda
10596
- autogen: _foreach_norm.Scalar_out
10597
-
10598
- - func: _foreach_lerp.List(Tensor[] self, Tensor[] tensors1, Tensor[] weights) -> Tensor[]
10599
- device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10600
- variants: function
10601
- dispatch:
10602
- CPU: foreach_tensor_ternary_lerp_slow
10603
- CUDA: foreach_tensor_lerp_ternary_cuda
10604
- autogen: _foreach_lerp.List_out
10605
-
10606
- - func: _foreach_lerp_.List(Tensor(a!)[] self, Tensor[] tensors1, Tensor[] weights) -> ()
10607
- device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10608
- variants: function
10609
- dispatch:
10610
- CPU: foreach_tensor_ternary_lerp_slow_
10611
- CUDA: foreach_tensor_lerp_ternary_cuda_
10612
- autogen: _foreach_lerp.List_out
10613
-
10614
- - func: _foreach_lerp.Scalar(Tensor[] self, Tensor[] tensors1, Scalar weight) -> Tensor[]
10615
- device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10616
- variants: function
10617
- dispatch:
10618
- CPU: foreach_tensor_lerp_list_kernel_slow
10619
- CUDA: foreach_tensor_lerp_list_cuda
10620
- autogen: _foreach_lerp.Scalar_out
10977
+ CPU: foreach_tensor_zero_slow_
10978
+ CUDA: foreach_tensor_zero_cuda_
10979
+ autogen: _foreach_zero, _foreach_zero.out
10621
10980
 
10622
- - func: _foreach_lerp_.Scalar(Tensor(a!)[] self, Tensor[] tensors1, Scalar weight) -> ()
10623
- device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10981
+ - func: _foreach_copy_(Tensor(a!)[] self, Tensor[] src, bool non_blocking=False) -> ()
10982
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10624
10983
  variants: function
10625
10984
  dispatch:
10626
- CPU: foreach_tensor_lerp_list_kernel_slow_
10627
- CUDA: foreach_tensor_lerp_list_cuda_
10628
- autogen: _foreach_lerp.Scalar_out
10985
+ CPU: foreach_tensor_copy_list_kernel_slow_
10986
+ CUDA: foreach_tensor_copy_list_kernel_cuda_
10987
+ autogen: _foreach_copy, _foreach_copy.out
10629
10988
 
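`_foreach_copy_` is a new in-place op (with autogen'd functional and out forms) that copies a whole list of source tensors into a matching list of destinations in a single call — handy for refreshing an EMA or averaged copy of model weights. A hedged sketch via the private PyTorch binding:

    import torch

    model_params = [torch.randn(4, 4), torch.randn(8)]
    ema_params   = [torch.empty_like(p) for p in model_params]

    # Copy each source tensor into its corresponding destination in one batched call
    torch._foreach_copy_(ema_params, model_params)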
10630
10989
  - func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
10631
10990
  dispatch:
@@ -10657,7 +11016,11 @@
10657
11016
  dispatch:
10658
11017
  CPU: searchsorted_cpu
10659
11018
  CUDA: searchsorted_cuda
10660
- autogen: searchsorted.Scalar_out
11019
+
11020
+ - func: searchsorted.Scalar_out(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!)
11021
+ dispatch:
11022
+ CPU: searchsorted_out_cpu
11023
+ CUDA: searchsorted_out_cuda
10661
11024
 
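The `searchsorted.Scalar_out` entry replaces the previously autogenerated out variant with an explicit schema, so the scalar-query form of `searchsorted` can write into a preallocated result tensor. A hedged sketch of the corresponding public call:

    import torch

    boundaries = torch.tensor([1.0, 3.0, 5.0, 7.0])

    # Scalar query: index where 4.0 would be inserted to keep the sequence sorted
    idx = torch.searchsorted(boundaries, 4.0)        # tensor(2)

    # Same query routed through the explicit out= overload declared above
    out = torch.empty((), dtype=torch.int64)
    torch.searchsorted(boundaries, 4.0, out=out)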
10662
11025
  - func: _convert_indices_from_coo_to_csr(Tensor self, int size, *, bool out_int32=False) -> Tensor
10663
11026
  structured_delegate: _convert_indices_from_coo_to_csr.out
@@ -10981,6 +11344,7 @@
10981
11344
  python_module: nn
10982
11345
  dispatch:
10983
11346
  CPU, CUDA: hardsigmoid_out
11347
+ MPS: hardsigmoid_out_mps
10984
11348
  QuantizedCPU: hardsigmoid_out_quantized_cpu
10985
11349
 
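Several activation kernels in this stretch gain MPS (Apple-GPU) dispatch entries — `hardsigmoid` here, `log_sigmoid` and its backward a little further down. Nothing changes at the call site; these ops simply stop falling back to the CPU on MPS devices. A hedged sketch, assuming a macOS build where the MPS backend is available:

    import torch
    import torch.nn.functional as F

    if torch.backends.mps.is_available():
        x = torch.randn(8, device="mps", requires_grad=True)
        y = F.hardsigmoid(x)      # forward now has a native MPS kernel
        y.sum().backward()        # hardsigmoid_backward gains an MPS kernel too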
10986
11350
  - func: hardsigmoid(Tensor self) -> Tensor
@@ -11001,6 +11365,7 @@
11001
11365
  python_module: nn
11002
11366
  dispatch:
11003
11367
  CPU, CUDA: hardsigmoid_backward_out
11368
+ MPS: hardsigmoid_backward_out_mps
11004
11369
 
11005
11370
  - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor
11006
11371
  structured_delegate: hardsigmoid_backward.grad_input
@@ -11119,6 +11484,7 @@
11119
11484
  dispatch:
11120
11485
  CPU: log_sigmoid_forward_out_cpu
11121
11486
  CUDA: log_sigmoid_forward_out_cuda
11487
+ MPS: log_sigmoid_forward_out_mps
11122
11488
 
11123
11489
  - func: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer)
11124
11490
  device_check: NoCheck # TensorIterator
@@ -11126,18 +11492,21 @@
11126
11492
  dispatch:
11127
11493
  CPU: log_sigmoid_forward_cpu
11128
11494
  CUDA: log_sigmoid_forward_cuda
11495
+ MPS: log_sigmoid_forward_mps
11129
11496
 
11130
11497
  - func: log_sigmoid_backward.grad_input(Tensor grad_output, Tensor self, Tensor buffer, *, Tensor(a!) grad_input) -> Tensor(a!)
11131
11498
  python_module: nn
11132
11499
  dispatch:
11133
11500
  CPU: log_sigmoid_backward_cpu_out
11134
11501
  CUDA: log_sigmoid_backward_cuda_out
11502
+ MPS: log_sigmoid_backward_mps_out
11135
11503
 
11136
11504
  - func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor
11137
11505
  python_module: nn
11138
11506
  dispatch:
11139
11507
  CPU: log_sigmoid_backward_cpu
11140
11508
  CUDA: log_sigmoid_backward_cuda
11509
+ MPS: log_sigmoid_backward_mps
11141
11510
 
11142
11511
  - func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
11143
11512
  python_module: nn
@@ -11279,6 +11648,7 @@
11279
11648
  CUDA: adaptive_avg_pool3d_cuda
11280
11649
  QuantizedCPU: adaptive_avg_pool3d_quantized_cpu
11281
11650
  autogen: _adaptive_avg_pool3d.out
11651
+ tags: core
11282
11652
 
11283
11653
  - func: adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
11284
11654
  python_module: nn
@@ -11394,6 +11764,7 @@
11394
11764
  dispatch:
11395
11765
  MkldnnCPU: mkldnn_avg_pool3d
11396
11766
  QuantizedCPU: avg_pool3d_quantized_cpu
11767
+ tags: core
11397
11768
 
11398
11769
  - func: avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!)
11399
11770
  python_module: nn
@@ -11517,25 +11888,25 @@
11517
11888
  CPU: max_pool3d_with_indices_backward_cpu
11518
11889
  CUDA: max_pool3d_with_indices_backward_cuda
11519
11890
 
11520
- - func: max_unpool2d.out(Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
11891
+ - func: max_unpool2d.out(Tensor self, Tensor indices, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
11521
11892
  python_module: nn
11522
11893
  dispatch:
11523
11894
  CPU: max_unpooling2d_forward_out_cpu
11524
11895
  CUDA: max_unpooling2d_forward_out_cuda
11525
11896
 
11526
- - func: max_unpool2d(Tensor self, Tensor indices, int[2] output_size) -> Tensor
11897
+ - func: max_unpool2d(Tensor self, Tensor indices, SymInt[2] output_size) -> Tensor
11527
11898
  python_module: nn
11528
11899
  dispatch:
11529
11900
  CPU: max_unpooling2d_forward_cpu
11530
11901
  CUDA: max_unpooling2d_forward_cuda
11531
11902
 
11532
- - func: max_unpool3d.out(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!)
11903
+ - func: max_unpool3d.out(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!)
11533
11904
  python_module: nn
11534
11905
  dispatch:
11535
11906
  CPU: max_unpooling3d_forward_out_cpu
11536
11907
  CUDA: max_unpooling3d_forward_out_cuda
11537
11908
 
11538
- - func: max_unpool3d(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor
11909
+ - func: max_unpool3d(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding) -> Tensor
11539
11910
  python_module: nn
11540
11911
  dispatch:
11541
11912
  CPU: max_unpooling3d_forward_cpu
@@ -11553,6 +11924,7 @@
11553
11924
  - func: reflection_pad1d(Tensor self, SymInt[2] padding) -> Tensor
11554
11925
  python_module: nn
11555
11926
  structured_delegate: reflection_pad1d.out
11927
+ tags: core
11556
11928
 
11557
11929
  - func: reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
11558
11930
  python_module: nn
@@ -11607,6 +11979,7 @@
11607
11979
  - func: reflection_pad3d(Tensor self, SymInt[6] padding) -> Tensor
11608
11980
  python_module: nn
11609
11981
  structured_delegate: reflection_pad3d.out
11982
+ tags: core
11610
11983
 
11611
11984
  - func: reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
11612
11985
  python_module: nn
@@ -12069,6 +12442,7 @@
12069
12442
  structured_inherits: TensorIteratorBase
12070
12443
  dispatch:
12071
12444
  CPU, CUDA: logit_backward_out
12445
+ MPS: logit_backward_out_mps
12072
12446
  tags: pointwise
12073
12447
 
12074
12448
  - func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor
@@ -12715,157 +13089,229 @@
12715
13089
 
12716
13090
  # torch.fft.fft
12717
13091
  # NOTE: NOT an alias for torch.fft, which has different semantics
12718
- - func: fft_fft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
13092
+ - func: fft_fft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
12719
13093
  python_module: fft
12720
13094
  variants: function
13095
+ dispatch:
13096
+ CompositeImplicitAutograd: fft_fft_symint
12721
13097
 
12722
- - func: fft_fft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13098
+ - func: fft_fft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12723
13099
  python_module: fft
12724
13100
  variants: function
13101
+ dispatch:
13102
+ CompositeImplicitAutograd: fft_fft_symint_out
12725
13103
 
12726
- - func: fft_ifft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
13104
+ - func: fft_ifft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
12727
13105
  python_module: fft
12728
13106
  variants: function
13107
+ dispatch:
13108
+ CompositeImplicitAutograd: fft_ifft_symint
12729
13109
 
12730
- - func: fft_ifft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13110
+ - func: fft_ifft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12731
13111
  python_module: fft
12732
13112
  variants: function
13113
+ dispatch:
13114
+ CompositeImplicitAutograd: fft_ifft_symint_out
12733
13115
 
12734
- - func: fft_rfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
13116
+ - func: fft_rfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
12735
13117
  python_module: fft
12736
13118
  variants: function
13119
+ dispatch:
13120
+ CompositeImplicitAutograd: fft_rfft_symint
12737
13121
 
12738
- - func: fft_rfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13122
+ - func: fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12739
13123
  python_module: fft
12740
13124
  variants: function
13125
+ dispatch:
13126
+ CompositeImplicitAutograd: fft_rfft_symint_out
12741
13127
 
12742
- - func: fft_irfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
13128
+ - func: fft_irfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
12743
13129
  python_module: fft
12744
13130
  variants: function
13131
+ dispatch:
13132
+ CompositeImplicitAutograd: fft_irfft_symint
12745
13133
 
12746
- - func: fft_irfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13134
+ - func: fft_irfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12747
13135
  python_module: fft
12748
13136
  variants: function
13137
+ dispatch:
13138
+ CompositeImplicitAutograd: fft_irfft_symint_out
12749
13139
 
12750
- - func: fft_hfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
13140
+ - func: fft_hfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
12751
13141
  python_module: fft
12752
13142
  variants: function
13143
+ dispatch:
13144
+ CompositeImplicitAutograd: fft_hfft_symint
12753
13145
 
12754
- - func: fft_hfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13146
+ - func: fft_hfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12755
13147
  python_module: fft
12756
13148
  variants: function
13149
+ dispatch:
13150
+ CompositeImplicitAutograd: fft_hfft_symint_out
12757
13151
 
12758
- - func: fft_ihfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
13152
+ - func: fft_ihfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
12759
13153
  python_module: fft
12760
13154
  variants: function
13155
+ dispatch:
13156
+ CompositeImplicitAutograd: fft_ihfft_symint
12761
13157
 
12762
- - func: fft_ihfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13158
+ - func: fft_ihfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12763
13159
  python_module: fft
12764
13160
  variants: function
13161
+ dispatch:
13162
+ CompositeImplicitAutograd: fft_ihfft_symint_out
12765
13163
 
12766
- - func: fft_fft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
13164
+ - func: fft_fft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
12767
13165
  python_module: fft
12768
13166
  variants: function
13167
+ dispatch:
13168
+ CompositeImplicitAutograd: fft_fft2_symint
12769
13169
 
12770
- - func: fft_fft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13170
+ - func: fft_fft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12771
13171
  python_module: fft
12772
13172
  variants: function
13173
+ dispatch:
13174
+ CompositeImplicitAutograd: fft_fft2_symint_out
12773
13175
 
12774
- - func: fft_ifft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
13176
+ - func: fft_ifft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
12775
13177
  python_module: fft
12776
13178
  variants: function
13179
+ dispatch:
13180
+ CompositeImplicitAutograd: fft_ifft2_symint
12777
13181
 
12778
- - func: fft_ifft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13182
+ - func: fft_ifft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12779
13183
  python_module: fft
12780
13184
  variants: function
13185
+ dispatch:
13186
+ CompositeImplicitAutograd: fft_ifft2_symint_out
12781
13187
 
12782
- - func: fft_rfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
13188
+ - func: fft_rfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
12783
13189
  python_module: fft
12784
13190
  variants: function
13191
+ dispatch:
13192
+ CompositeImplicitAutograd: fft_rfft2_symint
12785
13193
 
12786
- - func: fft_rfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13194
+ - func: fft_rfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12787
13195
  python_module: fft
12788
13196
  variants: function
13197
+ dispatch:
13198
+ CompositeImplicitAutograd: fft_rfft2_symint_out
12789
13199
 
12790
- - func: fft_irfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
13200
+ - func: fft_irfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
12791
13201
  python_module: fft
12792
13202
  variants: function
13203
+ dispatch:
13204
+ CompositeImplicitAutograd: fft_irfft2_symint
12793
13205
 
12794
- - func: fft_irfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13206
+ - func: fft_irfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12795
13207
  python_module: fft
12796
13208
  variants: function
13209
+ dispatch:
13210
+ CompositeImplicitAutograd: fft_irfft2_symint_out
12797
13211
 
12798
- - func: fft_hfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
13212
+ - func: fft_hfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
12799
13213
  use_const_ref_for_mutable_tensors: True
12800
13214
  python_module: fft
12801
13215
  variants: function
13216
+ dispatch:
13217
+ CompositeImplicitAutograd: fft_hfft2_symint
12802
13218
 
12803
- - func: fft_hfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13219
+ - func: fft_hfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12804
13220
  use_const_ref_for_mutable_tensors: True
12805
13221
  python_module: fft
12806
13222
  variants: function
13223
+ dispatch:
13224
+ CompositeImplicitAutograd: fft_hfft2_symint_out
12807
13225
 
12808
- - func: fft_ihfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
13226
+ - func: fft_ihfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
12809
13227
  use_const_ref_for_mutable_tensors: True
12810
13228
  python_module: fft
12811
13229
  variants: function
13230
+ dispatch:
13231
+ CompositeImplicitAutograd: fft_ihfft2_symint
12812
13232
 
12813
- - func: fft_ihfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13233
+ - func: fft_ihfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12814
13234
  use_const_ref_for_mutable_tensors: True
12815
13235
  python_module: fft
12816
13236
  variants: function
13237
+ dispatch:
13238
+ CompositeImplicitAutograd: fft_ihfft2_symint_out
12817
13239
 
12818
- - func: fft_fftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
13240
+ - func: fft_fftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
12819
13241
  python_module: fft
12820
13242
  variants: function
13243
+ dispatch:
13244
+ CompositeImplicitAutograd: fft_fftn_symint
12821
13245
 
12822
- - func: fft_fftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13246
+ - func: fft_fftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12823
13247
  python_module: fft
12824
13248
  variants: function
13249
+ dispatch:
13250
+ CompositeImplicitAutograd: fft_fftn_symint_out
12825
13251
 
12826
- - func: fft_ifftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
13252
+ - func: fft_ifftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
12827
13253
  python_module: fft
12828
13254
  variants: function
13255
+ dispatch:
13256
+ CompositeImplicitAutograd: fft_ifftn_symint
12829
13257
 
12830
- - func: fft_ifftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13258
+ - func: fft_ifftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12831
13259
  python_module: fft
12832
13260
  variants: function
13261
+ dispatch:
13262
+ CompositeImplicitAutograd: fft_ifftn_symint_out
12833
13263
 
12834
- - func: fft_rfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
13264
+ - func: fft_rfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
12835
13265
  python_module: fft
12836
13266
  variants: function
13267
+ dispatch:
13268
+ CompositeImplicitAutograd: fft_rfftn_symint
12837
13269
 
12838
- - func: fft_rfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13270
+ - func: fft_rfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12839
13271
  python_module: fft
12840
13272
  variants: function
13273
+ dispatch:
13274
+ CompositeImplicitAutograd: fft_rfftn_symint_out
12841
13275
 
12842
- - func: fft_irfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
13276
+ - func: fft_irfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
12843
13277
  python_module: fft
12844
13278
  variants: function
13279
+ dispatch:
13280
+ CompositeImplicitAutograd: fft_irfftn_symint
12845
13281
 
12846
- - func: fft_irfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13282
+ - func: fft_irfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12847
13283
  python_module: fft
12848
13284
  variants: function
13285
+ dispatch:
13286
+ CompositeImplicitAutograd: fft_irfftn_symint_out
12849
13287
 
12850
- - func: fft_hfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
13288
+ - func: fft_hfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
12851
13289
  use_const_ref_for_mutable_tensors: True
12852
13290
  python_module: fft
12853
13291
  variants: function
13292
+ dispatch:
13293
+ CompositeImplicitAutograd: fft_hfftn_symint
12854
13294
 
12855
- - func: fft_hfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13295
+ - func: fft_hfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12856
13296
  use_const_ref_for_mutable_tensors: True
12857
13297
  python_module: fft
12858
13298
  variants: function
13299
+ dispatch:
13300
+ CompositeImplicitAutograd: fft_hfftn_symint_out
12859
13301
 
12860
- - func: fft_ihfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
13302
+ - func: fft_ihfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
12861
13303
  use_const_ref_for_mutable_tensors: True
12862
13304
  python_module: fft
12863
13305
  variants: function
13306
+ dispatch:
13307
+ CompositeImplicitAutograd: fft_ihfftn_symint
12864
13308
 
12865
- - func: fft_ihfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13309
+ - func: fft_ihfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12866
13310
  use_const_ref_for_mutable_tensors: True
12867
13311
  python_module: fft
12868
13312
  variants: function
13313
+ dispatch:
13314
+ CompositeImplicitAutograd: fft_ihfftn_symint_out
12869
13315
 
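All the `torch.fft.*` schemas above switch their length arguments (`n` / `s`) from `int` to `SymInt` and route through `*_symint` CompositeImplicitAutograd kernels, so transform lengths can stay symbolic under dynamic-shape tracing. Eager-mode behavior is unchanged; a hedged sketch of what `n` does at the Python level:

    import torch

    x = torch.randn(10)

    # n pads (or truncates) the signal to the requested length before transforming
    X = torch.fft.rfft(x, n=16)   # zero-padded to 16 samples -> 9 complex bins
    Y = torch.fft.fft(x, n=8)     # truncated to the first 8 samples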
12870
13316
  - func: fft_fftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
12871
13317
  python_module: fft
@@ -13210,6 +13656,7 @@
13210
13656
  structured: True
13211
13657
  dispatch:
13212
13658
  CPU, CUDA: linalg_vector_norm_out
13659
+ MPS: linalg_vector_norm_out_mps
13213
13660
 
13214
13661
  - func: linalg_matrix_norm(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
13215
13662
  python_module: linalg
@@ -13788,6 +14235,7 @@
13788
14235
  dispatch:
13789
14236
  NestedTensorCPU: NestedTensor_softmax_dropout
13790
14237
  NestedTensorCUDA: NestedTensor_softmax_dropout_cuda
14238
+ tags: nondeterministic_seeded
13791
14239
 
13792
14240
  # Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is.
13793
14241
  - func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None) -> Tensor
@@ -13803,67 +14251,71 @@
13803
14251
  CUDA, NestedTensorCUDA: native_multi_head_attention_cuda
13804
14252
  autogen: _native_multi_head_attention.out
13805
14253
 
13806
- - func: scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False) -> Tensor
14254
+ - func: scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> Tensor
13807
14255
  python_module: nn
13808
14256
  variants: function
13809
14257
  autogen: scaled_dot_product_attention.out
13810
-
13811
- # TODO: THIS NEEDS TO BE REMOVED BUT PEOPLE HAVE TRAINED THEIR MODELS WITH THIS OP BUILTIN
13812
- - func: _scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> (Tensor, Tensor)
13813
- python_module: nn
13814
- variants: function
13815
- autogen: _scaled_dot_product_attention.out
14258
+ tags: nondeterministic_seeded
13816
14259
 
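`scaled_dot_product_attention` gains a keyword-only `scale` argument (overriding the default 1/sqrt(head_dim) softmax scaling) and a `nondeterministic_seeded` tag for its dropout path, while the legacy `_scaled_dot_product_attention` entry is removed. A hedged sketch of the public call with the new keyword:

    import torch
    import torch.nn.functional as F

    q = torch.randn(2, 4, 16, 64)   # (batch, heads, seq_len, head_dim)
    k = torch.randn(2, 4, 16, 64)
    v = torch.randn(2, 4, 16, 64)

    # Default scaling is 1/sqrt(head_dim); the new keyword lets callers override it
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True, scale=0.1)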
13817
14260
  # This aten function is kept so that we can test the choice function from Python
13818
- - func: _fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False) -> int
14261
+ - func: _fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> int
13819
14262
  dispatch:
13820
14263
  Meta: _fused_sdp_choice_meta
13821
14264
  CPU, NestedTensorCPU: _fused_sdp_choice_cpp
13822
14265
  CUDA, NestedTensorCUDA: _fused_sdp_choice_cuda
14266
+ tags: nondeterministic_seeded
13823
14267
 
13824
- - func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None) -> (Tensor, Tensor)
14268
+ - func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None, *, float? scale=None) -> (Tensor, Tensor)
13825
14269
  variants: function
14270
+ tags: nondeterministic_seeded
13826
14271
 
13827
- - func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False) -> (Tensor ouput, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, int philox_seed, int philox_offset, Tensor debug_attn_mask)
14272
+ - func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor ouput, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
13828
14273
  dispatch:
14274
+ CPU: _scaled_dot_product_flash_attention_cpu
13829
14275
  CUDA: _scaled_dot_product_flash_attention_cuda
13830
14276
  NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda
14277
+ tags: nondeterministic_seeded
13831
14278
 
13832
- - func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, int philox_seed, int philox_offset) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
14279
+ - func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
14280
+ device_check: NoCheck
13833
14281
  variants: function
13834
14282
  dispatch:
14283
+ CPU: _scaled_dot_product_flash_attention_backward_cpu
13835
14284
  CUDA: _scaled_dot_product_flash_attention_backward_cuda
13836
14285
 
13837
- - func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, bool compute_log_sumexp, bool is_causal=False) -> (Tensor, Tensor)
14286
+ - func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)
13838
14287
  dispatch:
13839
14288
  CUDA: _scaled_dot_product_efficient_attention_cuda
13840
14289
  NestedTensorCUDA: _scaled_dot_product_efficient_attention_nestedtensor_cuda
14290
+ tags: nondeterministic_seeded
13841
14291
 
13842
- - func: _scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, bool is_causal=False, bool chunk_grad_outputs=False) -> (Tensor, Tensor, Tensor)
14292
+ - func: _scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor attn_bias, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, float dropout_p, bool[4] grad_input_mask, bool is_causal=False, *, float? scale=None) -> (Tensor, Tensor, Tensor, Tensor)
14293
+ device_check: NoCheck
13843
14294
  dispatch:
13844
14295
  CUDA: _scaled_dot_product_efficient_attention_backward_cuda
14296
+ tags: nondeterministic_seeded
13845
14297
 
13846
- - func: _chunk_grad_outputs_efficient_attention(Tensor query, Tensor key, Tensor value, bool is_causal=False) -> bool
13847
- dispatch:
13848
- CUDA: _chunk_grad_outputs_efficient_attention
13849
-
13850
- - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, bool return_debug_mask) -> (Tensor output, Tensor softmax_logsumexp, int philox_seed, int philox_offset, Tensor debug_attn_mask)
14298
+ - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
13851
14299
  variants: function
13852
14300
  dispatch:
13853
14301
  CUDA: _flash_attention_forward
14302
+ tags: nondeterministic_seeded
13854
14303
 
13855
- - func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, int philox_seed, int philox_offset) -> (Tensor, Tensor, Tensor)
14304
+ - func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
14305
+ device_check: NoCheck
13856
14306
  variants: function
13857
14307
  dispatch:
13858
14308
  CUDA: _flash_attention_backward
13859
14309
 
13860
14310
  # Returns ouput, logsumexp if compute_logsumexp
- - func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor)
+ - func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset)
  variants: function
  dispatch:
  CUDA: _efficient_attention_forward
+ tags: nondeterministic_seeded
 
- - func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, bool is_causal=False, bool chunk_grad_outputs=False) -> (Tensor, Tensor, Tensor)
+ - func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int max_seqlen_k, int max_seqlen_q, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor)
+ device_check: NoCheck
  variants: function
  dispatch:
  CUDA: _efficient_attention_backward
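
Aside on this hunk (not part of the diff itself): every attention overload above gains an optional `*, float? scale=None`, which overrides the default 1/sqrt(head_dim) softmax scaling, and the Philox RNG state moves from `int` to `Tensor` arguments. The sketch below assumes a PyTorch build whose public `torch.nn.functional.scaled_dot_product_attention` already exposes the matching `scale=` keyword; it only illustrates that passing the default scale explicitly reproduces the default behaviour.

```python
import math
import torch
import torch.nn.functional as F

# (batch, heads, seq_len, head_dim)
q = torch.randn(2, 8, 128, 64)
k = torch.randn(2, 8, 128, 64)
v = torch.randn(2, 8, 128, 64)

# Default scaling is 1 / sqrt(head_dim); scale= lets the caller override it
# instead of pre-scaling q by hand.
out_default = F.scaled_dot_product_attention(q, k, v, is_causal=True)
out_explicit = F.scaled_dot_product_attention(
    q, k, v, is_causal=True, scale=1.0 / math.sqrt(64)
)
assert torch.allclose(out_default, out_explicit, atol=1e-6)
```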
@@ -13872,8 +14324,15 @@
  variants: function
  dispatch:
  CUDA: triton_scaled_dot_attention
+ tags: nondeterministic_seeded
  autogen: _triton_scaled_dot_attention.out
 
+ - func: _fill_mem_eff_dropout_mask_(Tensor(a!) self, float dropout_p, int seed, int offset) -> Tensor(a!)
+ variants: function
+ dispatch:
+ CUDA: _fill_mem_eff_dropout_mask_
+ tags: nondeterministic_seeded
+
  - func: _triton_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None) -> Tensor
  variants: function
  dispatch:
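
Several entries in this hunk pick up the `nondeterministic_seeded` tag, which marks ops whose output depends on the global RNG state (here, the dropout mask driven by the Philox seed/offset). A minimal sketch of what that implies for reproducibility, using the CPU math path of `scaled_dot_product_attention` (the seeded CUDA kernels should behave the same way with respect to re-seeding):

```python
import torch
import torch.nn.functional as F

q = torch.randn(1, 4, 16, 8)
k = torch.randn(1, 4, 16, 8)
v = torch.randn(1, 4, 16, 8)

# Ops tagged nondeterministic_seeded draw their dropout mask from the
# global generator; resetting the seed reproduces the same mask.
torch.manual_seed(0)
a = F.scaled_dot_product_attention(q, k, v, dropout_p=0.5)
torch.manual_seed(0)
b = F.scaled_dot_product_attention(q, k, v, dropout_p=0.5)
assert torch.allclose(a, b)
```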
@@ -13895,18 +14354,6 @@
  variants: function
  tags: pointwise
 
- - func: _transformer_decoder_only_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, Tensor? incr_key=None, Tensor? incr_value=None) -> (Tensor, Tensor, Tensor)
- variants: function
- dispatch:
- CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: transformer_decoder_only_layer_forward
- autogen: _transformer_decoder_only_layer_fwd.out
-
- - func: _native_decoder_only_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, Tensor? incr_key=None, Tensor? incr_value=None, bool need_weights=True, bool average_attn_weights=True) -> (Tensor, Tensor, Tensor, Tensor)
- variants: function
- dispatch:
- CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: native_decoder_only_multi_head_attention
- autogen: _native_decoder_only_multi_head_attention.out
-
  - func: special_bessel_j0(Tensor self) -> Tensor
  python_module: special
  structured_delegate: special_bessel_j0.out
@@ -14603,9 +15050,31 @@
  CUDA: _fused_adam_kernel_cuda_
  autogen: _fused_adam, _fused_adam.out
 
+ - func: _fused_adam_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+ # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now),
+ # but still skip the device check as the Tensor LR can be on CPU
+ device_check: NoCheck
+ variants: function
+ dispatch:
+ CUDA: _fused_adam_kernel_cuda_
+ autogen: _fused_adam.tensor_lr, _fused_adam.tensor_lr_out
+
  - func: _fused_adamw_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
  variants: function
  dispatch:
  CUDA: _fused_adamw_kernel_cuda_
  autogen: _fused_adamw, _fused_adamw.out
+
+ - func: _fused_adamw_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+ # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now),
+ # but still skip the device check as the Tensor LR can be on CPU
+ device_check: NoCheck
+ variants: function
+ dispatch:
+ CUDA: _fused_adamw_kernel_cuda_
+ autogen: _fused_adamw.tensor_lr, _fused_adamw.tensor_lr_out
+
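
The new `.tensor_lr` overloads take the learning rate as a `Tensor` rather than a `float`, and the comments explain why the device check is skipped: the LR tensor may stay on the CPU while the parameters live on CUDA. A hedged sketch of how this is typically reached from Python, assuming a PyTorch build whose fused Adam/AdamW accept a Tensor `lr` (the fused kernels themselves require CUDA parameters):

```python
import torch

model = torch.nn.Linear(16, 16).cuda()

# lr as a 0-dim tensor; a scheduler can mutate it in place between steps
# instead of rebuilding the optimizer.
# NOTE: assumes this PyTorch build routes a Tensor lr to the
# _fused_adam_.tensor_lr overload when fused=True.
lr_t = torch.tensor(1e-3)
opt = torch.optim.Adam(model.parameters(), lr=lr_t, fused=True)

loss = model(torch.randn(4, 16, device="cuda")).sum()
loss.backward()
opt.step()
lr_t.mul_(0.9)  # in-place LR decay picked up by the next step
```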
+ # This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts.
+ - func: _propagate_xla_data(Tensor input, Tensor output) -> ()
+ variants: function