torch-rb 0.13.2 → 0.14.1 — changes to the vendored native_functions.yaml

@@ -170,8 +170,36 @@
  CPU: _assert_async_cpu
  CUDA: _assert_async_cuda

+ - func: _assert_async.msg(Tensor self, str assert_msg) -> ()
+ dispatch:
+ CPU: _assert_async_msg_cpu
+ CUDA: _assert_async_msg_cuda
+
+ - func: _functional_assert_async.msg(Tensor self, str assert_msg, Tensor dep_token) -> Tensor
+ dispatch:
+ CPU: _functional_assert_async_msg_cpu
+
+ - func: _assert_tensor_metadata(Tensor a, SymInt[]? size=None, SymInt[]? stride=None, ScalarType? dtype=None) -> ()
+
+ - func: sym_constrain_range(Scalar size, *, int? min=None, int? max=None) -> ()
+ dispatch:
+ CompositeExplicitAutograd: sym_constrain_range
+
+ - func: sym_constrain_range_for_size(Scalar size, *, int? min, int? max) -> ()
+ dispatch:
+ CompositeExplicitAutograd: sym_constrain_range_for_size

- - func: _assert_tensor_metadata(Tensor a, int[]? size=None, int[]? stride=None, ScalarType? dtype=None) -> ()
+ - func: _functional_sym_constrain_range(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor
+ dispatch:
+ CompositeExplicitAutograd: _functional_sym_constrain_range
+
+ - func: _functional_sym_constrain_range_for_size(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor
+ dispatch:
+ CompositeExplicitAutograd: _functional_sym_constrain_range_for_size
+
+ - func: _make_dep_token(*, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+ dispatch:
+ CPU: _make_dep_token_cpu

  - func: refine_names(Tensor(a) self, Dimname[] names) -> Tensor(a)
  variants: method
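The entries above follow the schema of PyTorch's `native_functions.yaml`, which torch-rb vendors for its binding code generation: each `- func:` line declares an operator signature, `dispatch:` maps dispatch keys to kernels, and `tags:`/`autogen:` drive codegen. Any declared operator, including the new internal asserts, is reachable from Python through the generic `torch.ops.aten` namespace as `<name>.<overload>`. A minimal sketch, assuming PyTorch >= 2.1; `_assert_async.msg` is an internal operator, not public API:

```python
import torch

t = torch.tensor(1)  # non-zero, so the runtime assert passes
# "- func: _assert_async.msg(Tensor self, str assert_msg)" is exposed as
# the overload torch.ops.aten._assert_async.msg
torch.ops.aten._assert_async.msg(t, "tensor must be non-zero")
```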
@@ -211,6 +239,7 @@
  dispatch:
  CUDA: _cudnn_rnn
  autogen: _cudnn_rnn.out
+ tags: nondeterministic_seeded

  - func: _cudnn_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[])
  dispatch:
@@ -221,6 +250,7 @@
  dispatch:
  CUDA: _cudnn_init_dropout_state
  autogen: _cudnn_init_dropout_state.out
+ tags: nondeterministic_seeded

  - func: _debug_has_internal_overlap(Tensor self) -> int
  variants: function
@@ -297,6 +327,7 @@
  CompositeExplicitAutograd: abs
  SparseCPU, SparseCUDA: abs_sparse
  SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs
  tags: [core, pointwise]

  - func: abs_(Tensor(a!) self) -> Tensor(a!)
@@ -306,6 +337,7 @@
  CompositeExplicitAutograd: abs_
  SparseCPU, SparseCUDA: abs_sparse_
  SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_abs_

  - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck # TensorIterator
@@ -374,7 +406,7 @@
  - func: view_as_complex(Tensor(a) self) -> Tensor(a)
  variants: function
  dispatch:
- CPU, CUDA, Meta: view_as_complex
+ CPU, CUDA, MPS, Meta: view_as_complex

  - func: sgn(Tensor self) -> Tensor
  variants: function, method
@@ -382,6 +414,7 @@
  dispatch:
  SparseCPU, SparseCUDA: sgn_sparse
  SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn
  tags: pointwise

  - func: sgn_(Tensor(a!) self) -> Tensor(a!)
@@ -390,6 +423,7 @@
  dispatch:
  SparseCPU, SparseCUDA: sgn_sparse_
  SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_sgn_
  tags: pointwise

  - func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -488,8 +522,10 @@
  - func: arccos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)

  - func: avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor
+ tags: core

  - func: adaptive_avg_pool1d(Tensor self, int[1] output_size) -> Tensor
+ tags: core

  # Return: (Tensor output, Tensor indices)
  - func: adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor)
@@ -610,13 +646,13 @@
  MPS: addr_out_mps
  CompositeExplicitAutograd: math_addr_out

- - func: affine_grid_generator(Tensor theta, int[] size, bool align_corners) -> Tensor
+ - func: affine_grid_generator(Tensor theta, SymInt[] size, bool align_corners) -> Tensor
  variants: function
  dispatch:
  CompositeExplicitAutograd: affine_grid_generator
  autogen: affine_grid_generator.out

- - func: affine_grid_generator_backward(Tensor grad, int[] size, bool align_corners) -> Tensor
+ - func: affine_grid_generator_backward(Tensor grad, SymInt[] size, bool align_corners) -> Tensor
  variants: function

  - func: _is_all_true(Tensor self) -> Tensor
@@ -633,6 +669,13 @@
  - func: _test_check_tensor(Tensor self) -> Tensor
  variants: function

+ # Note; this function is only for testing
+ - func: _test_functorch_fallback(Tensor self, Tensor other) -> Tensor
+ variants: function
+ dispatch:
+ CPU: _test_functorch_fallback
+ autogen: _test_functorch_fallback.out
+
  - func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor
  device_check: NoCheck # TensorIterator
  structured_delegate: all.out
@@ -664,6 +707,7 @@
  device_check: NoCheck # TensorIterator
  structured_delegate: any.out
  variants: function, method
+ tags: core

  - func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck # TensorIterator
@@ -1108,6 +1152,7 @@
  structured_inherits: TensorIteratorBase
  dispatch:
  CPU, CUDA: bitwise_not_out
+ MPS: bitwise_not_out_mps
  tags: pointwise

  - func: copysign.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -1115,7 +1160,7 @@
  structured: True
  structured_inherits: TensorIteratorBase
  dispatch:
- CPU, CUDA: copysign_out
+ CPU, CUDA, MPS: copysign_out
  tags: pointwise

  - func: copysign.Tensor(Tensor self, Tensor other) -> Tensor
@@ -1150,6 +1195,7 @@
  variants: function, method
  dispatch:
  CompositeExplicitAutograd: logical_not
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not
  tags: [core, pointwise]

  - func: logical_not_(Tensor(a!) self) -> Tensor(a!)
@@ -1157,6 +1203,7 @@
  variants: method
  dispatch:
  CompositeExplicitAutograd: logical_not_
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not_
  tags: pointwise

  - func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -1171,7 +1218,7 @@
  variants: function, method
  dispatch:
  CompositeExplicitAutograd: logical_xor
- tags: pointwise
+ tags: [core, pointwise]

  - func: logical_xor_(Tensor(a!) self, Tensor other) -> Tensor(a!)
  device_check: NoCheck # TensorIterator
@@ -1326,7 +1373,7 @@
  dispatch:
  SparseCPU, SparseCUDA: ceil_sparse
  SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr
- tags: pointwise
+ tags: [core, pointwise]

  - func: ceil_(Tensor(a!) self) -> Tensor(a!)
  device_check: NoCheck # TensorIterator
@@ -1393,7 +1440,7 @@
  - func: clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor
  variants: function, method
  structured_delegate: clamp.Tensor_out
- tags: pointwise
+ tags: [core, pointwise]

  - func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)
  device_check: NoCheck # TensorIterator
@@ -1552,6 +1599,7 @@
  - func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
  CPU, CUDA: polar_out
+ MPS: polar_out_mps

  - func: constant_pad_nd(Tensor self, SymInt[] pad, Scalar value=0) -> Tensor
  variants: function
@@ -1598,11 +1646,17 @@

  - func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)

- - func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor
+ - func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, SymInt[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor
+ dispatch:
+ CompositeImplicitAutograd: conv1d_symint

- - func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor
+ - func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor
+ dispatch:
+ CompositeImplicitAutograd: conv2d_symint

- - func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor
+ - func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor
+ dispatch:
+ CompositeImplicitAutograd: conv3d_symint

  - func: conv1d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, str padding="valid", int[1] dilation=1, int groups=1) -> Tensor
  cpp_no_default_args: ['bias', 'stride', 'padding']
@@ -1621,11 +1675,17 @@
  - func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int pad) -> (Tensor, Tensor, Tensor)

  # NB: we inherit the goofy argument order from PyTorch torch.nn.functional
- - func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor
+ - func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, SymInt[1] padding=0, SymInt[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor
+ dispatch:
+ CompositeImplicitAutograd: conv_transpose1d_symint

- - func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int groups=1, int[2] dilation=1) -> Tensor
+ - func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int groups=1, int[2] dilation=1) -> Tensor
+ dispatch:
+ CompositeImplicitAutograd: conv_transpose2d_symint

- - func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor
+ - func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor
+ dispatch:
+ CompositeImplicitAutograd: conv_transpose3d_symint

  - func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor
  variants: function
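The convolution wrappers above now declare `SymInt[]` padding (and `output_padding` for the transposed forms) and dispatch to `*_symint` composites, so padding survives symbolic shape tracing instead of being specialized to fixed Python ints. A rough illustration from the Python side, assuming PyTorch 2.1 with `torch.compile`; a dynamic-shape trace is where the SymInt path matters:

```python
import torch
import torch.nn.functional as F

def net(x, w):
    # padding becomes part of the traced graph; with SymInt padding it can
    # stay symbolic under dynamic-shape compilation.
    return F.conv2d(x, w, padding=1)

compiled = torch.compile(net, dynamic=True)
x = torch.randn(2, 3, 16, 16)
w = torch.randn(8, 3, 3, 3)
print(compiled(x, w).shape)  # torch.Size([2, 8, 16, 16])
```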
@@ -1850,6 +1910,7 @@
  device_check: NoCheck # TensorIterator
  dispatch:
  CPU, CUDA: cumprod_out
+ MPS: cumprod_out_mps

  - func: cumprod.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
  device_check: NoCheck # TensorIterator
@@ -1870,6 +1931,7 @@
  structured_delegate: cumsum.out
  device_check: NoCheck # TensorIterator
  variants: function, method
+ tags: core

  - func: cumsum_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!)
  structured_delegate: cumsum.out
@@ -2145,6 +2207,7 @@
  CompositeExplicitAutograd: embedding_symint
  NestedTensorCPU, NestedTensorCUDA: NestedTensor_embedding
  autogen: embedding.out
+ tags: core

  - func: embedding_backward(Tensor grad, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor
  dispatch:
@@ -2202,6 +2265,7 @@
  CPU: _embedding_bag_cpu
  CUDA: _embedding_bag_cuda
  autogen: _embedding_bag.out
+ tags: core

  - func: _embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
  dispatch:
@@ -2240,6 +2304,12 @@
  SparseCPU, SparseCUDA, SparseMeta: empty_sparse
  SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed
  QuantizedCPU, QuantizedCUDA, QuantizedMeta: empty_unknown_quantized
+ tags: core
+
+ - func: empty_permuted(SymInt[] size, int[] physical_layout, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+ dispatch:
+ CompositeExplicitAutograd: empty_permuted_symint
+ autogen: empty_permuted.out

  # We do not make new_empty a composite that calls into new_empty_strided, as the strided version
  # is significantly more difficult to implement by different backends
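`empty_permuted` is a new factory: it allocates an uninitialized tensor with the requested logical `size` but orders the dimensions physically according to `physical_layout`. A small sketch, assuming PyTorch >= 2.1 where `torch.empty_permuted` is exposed:

```python
import torch

# Logical shape (N, C, H, W), physically laid out as (N, H, W, C) --
# effectively a channels-last allocation without a separate permute step.
t = torch.empty_permuted((2, 3, 4, 5), (0, 2, 3, 1))
print(t.shape)     # torch.Size([2, 3, 4, 5])
print(t.stride())  # channels-last-style strides, e.g. (60, 1, 15, 3)
```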
@@ -2280,7 +2350,7 @@
  autogen: new_ones.out

  # other overrides are to provide a more helpful error message that dtype is required
- - func: _empty_affine_quantized(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor
+ - func: _empty_affine_quantized(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor
  dispatch:
  CPU: empty_affine_quantized_other_backends_stub
  QuantizedCPU, QuantizedCUDA: empty_affine_quantized
@@ -2288,7 +2358,7 @@

  # it's a factory function receiving a tensor argument, thus overriding explicitly
  # other overrides are to provide a more helpful error message that dtype is required
- - func: _empty_per_channel_affine_quantized(int[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor
+ - func: _empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor
  category_override: factory
  dispatch:
  CPU: empty_per_channel_affine_quantized_other_backends_stub
@@ -2313,7 +2383,7 @@
  # This is a utility function to enable users to resize out tensor while registering kernels for out variants.
  # Eventually, we can consider exposing `resize_output` as a public API to ship it with python op registration
  # to make it easy to register out variants for ops.
- - func: _resize_output_(Tensor(a!) self, int[] size, Device device) -> Tensor(a!)
+ - func: _resize_output_(Tensor(a!) self, SymInt[] size, Device device) -> Tensor(a!)
  use_const_ref_for_mutable_tensors: True
  variants: function
  dispatch:
@@ -2483,21 +2553,21 @@
  device_guard: False

  # decomposes to eye.m
- - func: eye(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+ - func: eye(SymInt n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  dispatch:
  CompositeExplicitAutograd: eye

- - func: eye.m(int n, int m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+ - func: eye.m(SymInt n, SymInt m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  dispatch:
  CompositeExplicitAutograd: eye

- - func: eye.out(int n, *, Tensor(a!) out) -> Tensor(a!)
+ - func: eye.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
  CPU, Meta: eye_out_cpu
  CUDA: eye_out_cuda
  MPS: eye_out_mps

- - func: eye.m_out(int n, int m, *, Tensor(a!) out) -> Tensor(a!)
+ - func: eye.m_out(SymInt n, SymInt m, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
  CPU, Meta: eye_out_cpu
  CUDA: eye_out_cuda
@@ -2515,11 +2585,15 @@
  - func: flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a)
  variants: function, method

- - func: unflatten.int(Tensor(a) self, int dim, int[] sizes) -> Tensor(a)
+ - func: unflatten.int(Tensor(a) self, int dim, SymInt[] sizes) -> Tensor(a)
  variants: function, method
+ dispatch:
+ CompositeImplicitAutograd: unflatten_symint

- - func: unflatten.Dimname(Tensor(a) self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor(a)
+ - func: unflatten.Dimname(Tensor(a) self, Dimname dim, SymInt[] sizes, Dimname[] names) -> Tensor(a)
  variants: function, method
+ dispatch:
+ CompositeImplicitAutograd: unflatten_dimname_symint

  - func: fill.Scalar(Tensor self, Scalar value) -> Tensor
  variants: function
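The `unflatten` overloads now take `SymInt[] sizes` and route through `unflatten_symint` / `unflatten_dimname_symint`; user-facing behaviour is unchanged — it still splits one dimension into several:

```python
import torch

x = torch.arange(6)
y = x.unflatten(0, (2, 3))  # split dim 0 of length 6 into 2 x 3
print(y.shape)              # torch.Size([2, 3])
```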
@@ -2839,13 +2913,13 @@
  CUDA: _fft_r2c_cufft_out

  # Complex to real inverse FFT
- - func: _fft_c2r(Tensor self, int[] dim, int normalization, int last_dim_size) -> Tensor
+ - func: _fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor
  variants: function
  dispatch:
  CPU: _fft_c2r_mkl
  CUDA: _fft_c2r_cufft

- - func: _fft_c2r.out(Tensor self, int[] dim, int normalization, int last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
+ - func: _fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
  dispatch:
  CPU: _fft_c2r_mkl_out
@@ -2871,13 +2945,13 @@
  CPU: _validate_compressed_sparse_indices_cpu
  CUDA: _validate_compressed_sparse_indices_cuda

- - func: _cufft_get_plan_cache_size(int device_index) -> int
+ - func: _cufft_get_plan_cache_size(DeviceIndex device_index) -> int

- - func: _cufft_get_plan_cache_max_size(int device_index) -> int
+ - func: _cufft_get_plan_cache_max_size(DeviceIndex device_index) -> int

- - func: _cufft_set_plan_cache_max_size(int device_index, int max_size) -> ()
+ - func: _cufft_set_plan_cache_max_size(DeviceIndex device_index, int max_size) -> ()

- - func: _cufft_clear_plan_cache(int device_index) -> ()
+ - func: _cufft_clear_plan_cache(DeviceIndex device_index) -> ()

  - func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
  device_check: NoCheck # TensorIterator
@@ -2885,7 +2959,7 @@
  variants: function, method
  dispatch:
  QuantizedCPU: quantized_index
- tags: dynamic_output_shape
+ tags: [core, dynamic_output_shape]
  # NB: This function is special-cased in tools/autograd/gen_variable_type.py
  # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp:
  # - Tensor Tensor::index(ArrayRef<TensorIndex> indices)
@@ -2900,6 +2974,13 @@
  dispatch:
  CPU, CUDA, MPS: index_out

+ # Used by inductor to signal indexing without bounds checks
+ # Note that we don't support boolean indexing, to avoid dynamic output shapes
+ - func: _unsafe_index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
+ variants: function
+ dispatch:
+ CPU, CUDA: _unsafe_index
+
  - func: index_copy.out(Tensor self, int dim, Tensor index, Tensor source, *, Tensor(a!) out) -> Tensor(a!)
  structured: True
  variants: function
@@ -2939,6 +3020,13 @@
  variants: function, method
  dispatch:
  CompositeExplicitAutograd: index_put
+ tags: core
+
+ - func: _unsafe_index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
+ device_check: NoCheck # delegate to _index_put_impl_ after clone, which leverages TensorIterator
+ variants: function
+ dispatch:
+ CompositeExplicitAutograd: _unsafe_index_put

  - func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!)
  device_check: NoCheck # TensorIterator
@@ -3097,6 +3185,7 @@
  CPU: layer_norm_backward_cpu
  CUDA: layer_norm_backward_cuda
  MPS: layer_norm_backward_mps
+ NestedTensorCPU, NestedTensorCUDA: layer_norm_backward_nested
  autogen: native_layer_norm_backward.out
  tags: core

@@ -3160,6 +3249,18 @@
  MkldnnCPU: mkldnn_linear_backward
  autogen: mkldnn_linear_backward.out

+ - func: _cslt_compress(Tensor input) -> Tensor
+ dispatch:
+ CUDA: _cslt_compress
+
+ - func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, bool transpose_result=False) -> Tensor
+ dispatch:
+ CUDA: _cslt_sparse_mm
+
+ - func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None) -> Tensor
+ dispatch:
+ CUDA: _sparse_semi_structured_linear
+
  - func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor

  - func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
@@ -3355,6 +3456,7 @@
  variants: function
  dispatch:
  CPU, CUDA: xlogy_out
+ MPS: xlogy_out_mps
  tags: pointwise

  - func: xlogy.OutScalar_Self(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -3510,6 +3612,7 @@
  structured: True
  dispatch:
  CPU, CUDA: aminmax_out
+ MPS: aminmax_out_mps

  - func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor
  dispatch:
@@ -3607,6 +3710,11 @@
  QuantizedCUDA: quantized_max_pool2d_cudnn
  autogen: quantized_max_pool2d.out

+ - func: quantized_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
+ dispatch:
+ QuantizedCPU: quantized_max_pool3d
+ autogen: quantized_max_pool3d.out
+
  - func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor

  # The CPU and GPU dispatch variants are named weirdly here because otherwise there
@@ -3616,6 +3724,7 @@
  variants: function, method
  dispatch:
  CompositeExplicitAutograd: mean
+ tags: core

  # For normal naming convention this should be `mean.out`. However since we already have `mean.out` we have to rename this.
  # FIXME: fix CI jobs and re-enable this
@@ -3756,6 +3865,7 @@
  - func: mkldnn_rnn_layer(Tensor input, Tensor weight0, Tensor weight1, Tensor weight2, Tensor weight3, Tensor hx_, Tensor cx_, bool reverse, int[] batch_sizes, int mode, int hidden_size, int num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train) -> (Tensor, Tensor, Tensor, Tensor)
  dispatch:
  CPU: mkldnn_rnn_layer
+ MkldnnCPU: mkldnn_rnn_layer
  autogen: mkldnn_rnn_layer.out

  - func: mkldnn_rnn_layer_backward(Tensor input, Tensor weight1, Tensor weight2, Tensor weight3, Tensor weight4, Tensor hx_, Tensor cx_tmp, Tensor output, Tensor hy_, Tensor cy_, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, bool reverse, int mode, int hidden_size, int num_layers, bool has_biases, bool train, bool bidirectional, int[] batch_sizes, bool batch_first, Tensor workspace) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)
@@ -3800,6 +3910,8 @@
  dispatch:
  CUDA: miopen_rnn
  autogen: miopen_rnn.out
+ tags: nondeterministic_seeded
+

  - func: miopen_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[])
  dispatch:
@@ -3823,6 +3935,14 @@
  SparseCPU, SparseCUDA: _sparse_mm_out
  SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out

+ - func: _int_mm(Tensor self, Tensor mat2) -> Tensor
+ dispatch:
+ CUDA: _int_mm_cuda
+
+ - func: _int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
+ dispatch:
+ CUDA: _int_mm_out_cuda
+
  - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
  python_module: sparse

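`_int_mm` adds a plain int8 x int8 -> int32 matrix multiply with a CUDA kernel; it is an internal helper (e.g. for weight-only int8 paths) rather than public API. A hedged sketch, assuming a CUDA build whose int8 GEMM path is available and sizes that respect the kernel's alignment constraints (the exact constraints vary by version):

```python
import torch

a = torch.randint(-128, 127, (32, 64), dtype=torch.int8, device="cuda")
b = torch.randint(-128, 127, (64, 32), dtype=torch.int8, device="cuda")
c = torch._int_mm(a, b)   # accumulates in int32
print(c.dtype, c.shape)   # torch.int32 torch.Size([32, 32])
```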
@@ -3981,7 +4101,6 @@
  CUDA: batch_norm_cuda
  MPS: batch_norm_mps
  MkldnnCPU: mkldnn_batch_norm
- tags: core

  - func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))
  dispatch:
@@ -3997,6 +4116,16 @@
  MPS: _batch_norm_legit_mps
  MkldnnCPU: _mkldnn_batch_norm_legit
  autogen: _native_batch_norm_legit_functional
+ tags: core
+
+ # HACK: identical to _native_batch_norm_legit, but training is known to be False,
+ # So we known that running stats will not be mutated.
+ # The real fix here is batch norm consolidation.
+ - func: _native_batch_norm_legit_no_training(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+ dispatch:
+ CompositeExplicitAutograd: _batch_norm_legit_no_training
+ autogen: _native_batch_norm_legit_no_training.out
+ tags: core

  - func: _native_batch_norm_legit.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd) -> (Tensor(d!), Tensor(e!), Tensor(f!))
  dispatch:
@@ -4055,7 +4184,7 @@
  CUDA: batch_norm_backward_reduce_cuda
  autogen: batch_norm_backward_reduce.out

- - func: batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor mean_dy, Tensor mean_dy_xmu, Tensor count) -> Tensor
+ - func: batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor sum_dy, Tensor sum_dy_xmu, Tensor count) -> Tensor
  dispatch:
  CUDA: batch_norm_backward_elemt_cuda
  autogen: batch_norm_backward_elemt.out
@@ -4113,6 +4242,7 @@
  CPU, CUDA: _cdist_forward
  MPS: _cdist_forward_mps
  autogen: _cdist_forward.out
+ tags: core

  - func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor
  dispatch:
@@ -4125,6 +4255,7 @@
  dispatch:
  CPU, CUDA: _pdist_forward
  autogen: _pdist_forward.out
+ tags: core

  - func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor
  dispatch:
@@ -4185,6 +4316,7 @@
  CPU: pixel_shuffle_cpu
  CompositeExplicitAutogradNonFunctional: math_pixel_shuffle
  autogen: pixel_shuffle.out
+ tags: core

  - func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor
  dispatch:
@@ -4194,7 +4326,7 @@

  - func: channel_shuffle(Tensor self, int groups) -> Tensor
  dispatch:
- CPU: channel_shuffle
+ CPU, CUDA: channel_shuffle
  QuantizedCPU: channel_shuffle_quantized_cpu
  autogen: channel_shuffle.out

@@ -4294,7 +4426,7 @@
  autogen: rand.generator_with_names_out

  - func: rand(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
- tags: nondeterministic_seeded
+ tags: [core, nondeterministic_seeded]
  dispatch:
  CompositeExplicitAutograd: rand

@@ -4319,47 +4451,47 @@
  CompositeExplicitAutograd: rand_like
  autogen: rand_like.out

- - func: randint(int high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+ - func: randint(SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  tags: nondeterministic_seeded
  dispatch:
  CompositeExplicitAutograd: randint

- - func: randint.generator(int high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+ - func: randint.generator(SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  tags: nondeterministic_seeded
  dispatch:
  CompositeExplicitAutograd: randint

- - func: randint.low(int low, int high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+ - func: randint.low(SymInt low, SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  tags: nondeterministic_seeded
  dispatch:
  CompositeExplicitAutograd: randint

- - func: randint.low_generator(int low, int high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+ - func: randint.low_generator(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  tags: nondeterministic_seeded
  dispatch:
  CompositeExplicitAutograd: randint

- - func: randint.out(int high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+ - func: randint.out(SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
  tags: nondeterministic_seeded
  dispatch:
  CompositeExplicitAutograd: randint_out

- - func: randint.generator_out(int high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+ - func: randint.generator_out(SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
  tags: nondeterministic_seeded
  dispatch:
  CompositeExplicitAutograd: randint_out

- - func: randint.low_out(int low, int high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+ - func: randint.low_out(SymInt low, SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
  tags: nondeterministic_seeded
  dispatch:
  CompositeExplicitAutograd: randint_out

- - func: randint.low_generator_out(int low, int high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+ - func: randint.low_generator_out(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
  tags: nondeterministic_seeded
  dispatch:
  CompositeExplicitAutograd: randint_out

- - func: randint_like(Tensor self, int high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+ - func: randint_like(Tensor self, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
  tags: nondeterministic_seeded
  dispatch:
  # NB: Although this composite mutates on the inside, it is
@@ -4367,7 +4499,7 @@
  CompositeExplicitAutograd: randint_like
  autogen: randint_like.out

- - func: randint_like.low_dtype(Tensor self, int low, int high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+ - func: randint_like.low_dtype(Tensor self, SymInt low, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
  tags: nondeterministic_seeded
  dispatch:
  # NB: Although this composite mutates on the inside, it is
@@ -4376,7 +4508,7 @@
  autogen: randint_like.low_dtype_out

  - func: randn(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
- tags: nondeterministic_seeded
+ tags: [core, nondeterministic_seeded]
  dispatch:
  CompositeExplicitAutograd: randn

@@ -4412,25 +4544,25 @@
  dispatch:
  # NB: Although this composite mutates on the inside, it is
  # non-differentiable so NonFunctional doesn't apply
- CompositeExplicitAutograd: randn_like
+ CompositeExplicitAutograd, CompositeImplicitAutogradNestedTensor: randn_like
  autogen: randn_like.out

- - func: randperm(int n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
- tags: nondeterministic_seeded
+ - func: randperm(SymInt n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+ tags: [core, nondeterministic_seeded]
  dispatch:
  CompositeExplicitAutograd: randperm

- - func: randperm.generator(int n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+ - func: randperm.generator(SymInt n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
  tags: nondeterministic_seeded
  dispatch:
  CompositeExplicitAutograd: randperm

- - func: randperm.out(int n, *, Tensor(a!) out) -> Tensor(a!)
+ - func: randperm.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
  tags: nondeterministic_seeded
  dispatch:
  CompositeExplicitAutograd: randperm_out

- - func: randperm.generator_out(int n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+ - func: randperm.generator_out(SymInt n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
  tags: nondeterministic_seeded
  dispatch:
  CPU: randperm_out_cpu
@@ -4591,7 +4723,7 @@
  dispatch:
  SparseCPU, SparseCUDA: round_sparse
  SparseCsrCPU, SparseCsrCUDA: round_sparse_csr
- tags: pointwise
+ tags: [core, pointwise]

  - func: round_(Tensor(a!) self) -> Tensor(a!)
  device_check: NoCheck # TensorIterator
@@ -4839,10 +4971,14 @@
  - func: silu(Tensor self) -> Tensor
  structured_delegate: silu.out
  python_module: nn
+ dispatch:
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu

  - func: silu_(Tensor(a!) self) -> Tensor(a!)
  structured_delegate: silu.out
  python_module: nn
+ dispatch:
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_silu_

  - func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  structured: True
@@ -4865,6 +5001,7 @@
  python_module: nn
  dispatch:
  CompositeImplicitAutograd: math_silu_backward
+ NestedTensorCPU, NestedTensorCUDA: silu_backward_nested

  - func: mish(Tensor self) -> Tensor
  structured_delegate: mish.out
@@ -4917,6 +5054,7 @@
  variants: function, method
  dispatch:
  CPU, CUDA: logit
+ MPS: logit_mps
  tags: pointwise

  - func: logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!)
@@ -4928,6 +5066,7 @@
  - func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
  CPU, CUDA: logit_out
+ MPS: logit_out_mps
  tags: pointwise

  - func: sin(Tensor self) -> Tensor
@@ -5042,6 +5181,27 @@
  device_check: NoCheck
  device_guard: False

+ - func: sym_size.int(Tensor self, int dim) -> SymInt
+ variants: function
+ device_check: NoCheck
+ device_guard: False
+ tags: core
+ manual_cpp_binding: True
+
+ - func: sym_numel(Tensor self) -> SymInt
+ variants: function
+ device_check: NoCheck
+ device_guard: False
+ tags: core
+ manual_cpp_binding: True
+
+ - func: sym_storage_offset(Tensor self) -> SymInt
+ variants: function
+ device_check: NoCheck
+ device_guard: False
+ tags: core
+ manual_cpp_binding: True
+
  - func: slice.Tensor(Tensor(a) self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
  variants: function, method
  device_check: NoCheck
@@ -5066,7 +5226,7 @@
  device_check: NoCheck
  device_guard: False
  dispatch:
- CompositeExplicitAutograd: slice_scatter
+ CompositeExplicitAutogradNonFunctional: slice_scatter
  autogen: slice_scatter.out
  tags: core

@@ -5075,15 +5235,16 @@
  device_check: NoCheck
  device_guard: False
  dispatch:
- CompositeExplicitAutograd: select_scatter_symint
+ CompositeExplicitAutogradNonFunctional: select_scatter_symint
  autogen: select_scatter.out
+ tags: core

  - func: diagonal_scatter(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1) -> Tensor
  variants: function, method
  device_check: NoCheck
  device_guard: False
  dispatch:
- CompositeExplicitAutograd: diagonal_scatter
+ CompositeExplicitAutogradNonFunctional: diagonal_scatter
  autogen: diagonal_scatter.out

  - func: as_strided_scatter(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor
@@ -5091,7 +5252,7 @@
  device_check: NoCheck
  device_guard: False
  dispatch:
- CompositeExplicitAutograd: as_strided_scatter_symint
+ CompositeExplicitAutogradNonFunctional: as_strided_scatter_symint
  autogen: as_strided_scatter.out

  - func: smm(Tensor self, Tensor mat2) -> Tensor
@@ -5170,6 +5331,8 @@
  device_guard: False
  dispatch:
  CompositeExplicitAutograd: split_with_sizes
+ NestedTensorCPU, NestedTensorCUDA: split_with_sizes_nested
+ tags: core

  - func: hsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]
  variants: function, method
@@ -5316,6 +5479,13 @@
  device_check: NoCheck
  device_guard: False

+ - func: sym_stride.int(Tensor self, int dim) -> SymInt
+ variants: function
+ device_check: NoCheck
+ device_guard: False
+ tags: core
+ manual_cpp_binding: True
+
  - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor
  device_check: NoCheck # TensorIterator
  variants: function, method
@@ -5326,12 +5496,14 @@
  autogen: sum.out

  - func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+ # TODO: Align the signature of sum.dim_IntList and _sparse_csr_sum.dim_dtype
  structured_delegate: sum.IntList_out
  device_check: NoCheck # TensorIterator
  variants: function, method
  dispatch:
  NestedTensorCPU: NestedTensor_sum_dim_CPU
  SparseCPU, SparseCUDA: sum_sparse_coo
+ SparseCsrCPU, SparseCsrCUDA: sum_sparse_compressed
  tags: core

  - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
@@ -5364,10 +5536,12 @@
  CPU, CUDA: nansum_out
  MPS: nansum_out_mps

- - func: sum_to_size(Tensor self, int[] size) -> Tensor
+ - func: sum_to_size(Tensor self, SymInt[] size) -> Tensor
  variants: method
  device_check: NoCheck
  device_guard: False
+ dispatch:
+ CompositeImplicitAutograd: sum_to_size_symint

  - func: sqrt(Tensor self) -> Tensor
  device_check: NoCheck # TensorIterator
@@ -5421,7 +5595,7 @@
  variants: function, method
  cpp_no_default_args: ["unbiased"]

- - func: std.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> Tensor
+ - func: std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
  device_check: NoCheck # TensorIterator
  variants: function, method
  dispatch:
@@ -5439,7 +5613,7 @@
  variants: function
  cpp_no_default_args: ["unbiased"]

- - func: std_mean.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+ - func: std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
  device_check: NoCheck # TensorIterator
  variants: function
  dispatch:
@@ -5451,7 +5625,7 @@
  variants: function
  cpp_no_default_args: ["unbiased"]

- - func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+ - func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
  device_check: NoCheck # TensorIterator
  variants: function

@@ -5459,7 +5633,7 @@
  device_check: NoCheck # TensorIterator
  cpp_no_default_args: ["unbiased"]

- - func: std.correction_out(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+ - func: std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck # TensorIterator
  dispatch:
  CPU, CUDA: std_out
@@ -5474,11 +5648,11 @@
  device_check: NoCheck # TensorIterator
  cpp_no_default_args: ["unbiased"]

- - func: std.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> Tensor
+ - func: std.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
  device_check: NoCheck # TensorIterator
  variants: function, method

- - func: std.correction_names_out(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+ - func: std.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck # TensorIterator
  variants: function

@@ -5489,11 +5663,13 @@
  CPU, CUDA: prod
  MPS: prod_mps
  autogen: prod.out
+ tags: core

  - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
  structured_delegate: prod.int_out
  device_check: NoCheck # TensorIterator
  variants: function, method
+ tags: core

  - func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
  structured: True
@@ -5531,7 +5707,7 @@
  dispatch:
  SparseCPU, SparseCUDA: tan_sparse
  SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr
- tags: pointwise
+ tags: [core, pointwise]

  - func: tan_(Tensor(a!) self) -> Tensor(a!)
  device_check: NoCheck # TensorIterator
@@ -5592,8 +5768,6 @@

  - func: tensordot.out(Tensor self, Tensor other, int[] dims_self, int[] dims_other, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
- dispatch:
- CPU, CUDA: tensordot_out

  # TODO: namespace threshold in 'nn'
  - func: threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor
@@ -5635,8 +5809,10 @@
  NestedTensorCPU, NestedTensorCUDA: threshold_backwards_nested
  tags: pointwise

- - func: tile(Tensor self, int[] dims) -> Tensor
+ - func: tile(Tensor self, SymInt[] dims) -> Tensor
  variants: function, method
+ dispatch:
+ CompositeImplicitAutograd: tile_symint

  - func: transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
  variants: function, method
@@ -5691,12 +5867,13 @@
  - func: flipud(Tensor self) -> Tensor
  variants: function, method

- - func: roll(Tensor self, int[1] shifts, int[1] dims=[]) -> Tensor
+ - func: roll(Tensor self, SymInt[1] shifts, int[1] dims=[]) -> Tensor
  variants: function, method
  dispatch:
- CPU: roll_cpu
+ CPU, MPS: roll
  CUDA: roll_cuda
  autogen: roll.out
+ tags: core

  # default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args

@@ -5750,10 +5927,11 @@
  NestedTensorCPU, NestedTensorCUDA: _nested_tensor_strides
  autogen: _nested_tensor_strides.out

- - func: _nested_tensor_offsets(Tensor self) -> int[]
+ - func: _nested_tensor_storage_offsets(Tensor self) -> Tensor
  variants: method
  dispatch:
- NestedTensorCPU, NestedTensorCUDA: _nested_tensor_offsets
+ NestedTensorCPU, NestedTensorCUDA, NestedTensorMeta: _nested_tensor_storage_offsets
+ autogen: _nested_tensor_storage_offsets.out

  # _nested_from_padded is not usable from Python, so
  # _nested_from_padded_and_nested_example is available for testing.
@@ -5764,13 +5942,13 @@

  # The input arguments' types to this functions are temporary. When nested tensors switch to using SymInts for their metadata representation
  # this will need to be updated
- - func: _nested_view_from_buffer(Tensor(a) self, Tensor nested_size, Tensor nested_strides, int[] offsets) -> Tensor(a)
+ - func: _nested_view_from_buffer(Tensor(a) self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor(a)
  variants: function
  device_check: NoCheck
  dispatch:
  CPU, CUDA: _nested_view_from_buffer

- - func: _nested_view_from_buffer_copy(Tensor self, Tensor nested_size, Tensor nested_strides, int[] offsets) -> Tensor
+ - func: _nested_view_from_buffer_copy(Tensor self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor
  variants: function
  device_check: NoCheck
  tags: view_copy
@@ -5913,18 +6091,19 @@
  tags: core
  cpp_no_default_args: ["unbiased"]

- - func: var.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> Tensor
+ - func: var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
  device_check: NoCheck # TensorIterator
  variants: function, method
  dispatch:
  CPU, CUDA: var
  MPS: var_mps
+ tags: core

  - func: var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck # TensorIterator
  cpp_no_default_args: ["unbiased"]

- - func: var.correction_out(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+ - func: var.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck # TensorIterator
  dispatch:
  CPU, CUDA: var_out
@@ -5938,11 +6117,11 @@
  device_check: NoCheck # TensorIterator
  cpp_no_default_args: ["unbiased"]

- - func: var.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> Tensor
+ - func: var.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
  device_check: NoCheck # TensorIterator
  variants: function, method

- - func: var.correction_names_out(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+ - func: var.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck # TensorIterator
  variants: function

@@ -5956,7 +6135,7 @@
  variants: function
  cpp_no_default_args: ["unbiased"]

- - func: var_mean.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+ - func: var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
  device_check: NoCheck # TensorIterator
  variants: function
  dispatch:
@@ -5968,7 +6147,7 @@
  variants: function
  cpp_no_default_args: ["unbiased"]

- - func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+ - func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
  device_check: NoCheck # TensorIterator
  variants: function

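The `correction` argument of `std`, `var`, `std_mean`, and `var_mean` is widened here from `int?` to `Scalar?`, so fractional corrections are accepted in addition to the usual 0/1. A quick sketch of the public API; the fractional value is what the `Scalar` change enables:

```python
import torch

x = torch.randn(4, 10)
v_unbiased = torch.var(x, dim=1, correction=1)    # Bessel's correction (default)
v_biased   = torch.var(x, dim=1, correction=0)    # population variance
v_frac     = torch.var(x, dim=1, correction=0.5)  # fractional correction, newly allowed
```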
5974
6153
 
@@ -6036,7 +6215,7 @@
6036
6215
  CompositeExplicitAutograd: zeros
6037
6216
  autogen: zeros.names_out
6038
6217
 
6039
- - func: _efficientzerotensor(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
6218
+ - func: _efficientzerotensor(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
6040
6219
  dispatch:
6041
6220
  CPU: _efficientzerotensor
6042
6221
  CUDA: _efficientzerotensor_cuda
@@ -6056,7 +6235,7 @@
6056
6235
  dispatch:
6057
6236
  # NB: Although this composite mutates on the inside, it is
6058
6237
  # non-differentiable so NonFunctional doesn't apply
6059
- CompositeExplicitAutograd: zeros_like
6238
+ CompositeExplicitAutograd, CompositeImplicitAutogradNestedTensor: zeros_like
6060
6239
  autogen: zeros_like.out
6061
6240
 
6062
6241
  - func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor
@@ -6297,7 +6476,7 @@
6297
6476
  QuantizedCPU, QuantizedCUDA: quantized_clone
6298
6477
  NestedTensorCPU, NestedTensorCUDA: clone_nested
6299
6478
  autogen: clone.out
6300
- tags: core
6479
+ tags: [core, pointwise]
6301
6480
 
6302
6481
  - func: positive(Tensor(a) self) -> Tensor(a)
6303
6482
  variants: function, method
@@ -6309,6 +6488,7 @@
6309
6488
  dispatch:
6310
6489
  CompositeExplicitAutograd: resize_as_
6311
6490
  autogen: resize_as, resize_as.out
6491
+ tags: inplace_view
6312
6492
 
6313
6493
  - func: resize_as_sparse_(Tensor(a!) self, Tensor the_template) -> Tensor(a!)
6314
6494
  use_const_ref_for_mutable_tensors: True
@@ -6328,6 +6508,7 @@
6328
6508
  SparseCPU, SparseCUDA, SparseMeta: zero_sparse_
6329
6509
  SparseCsrCPU, SparseCsrCUDA: zero_sparse_csr_
6330
6510
  MkldnnCPU: mkldnn_zero_
6511
+ NestedTensorCPU, NestedTensorCUDA: zero_nested_
6331
6512
  autogen: zero, zero.out
6332
6513
 
6333
6514
  - func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
@@ -6347,6 +6528,7 @@
6347
6528
  dispatch:
6348
6529
  SparseCPU, SparseCUDA: sub_sparse
6349
6530
  ZeroTensor: sub_zerotensor
6531
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_sub_Tensor
6350
6532
  tags: [core, pointwise]
6351
6533
 
6352
6534
  - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
@@ -6493,6 +6675,16 @@
6493
6675
  structured_delegate: _addmm_activation.out
6494
6676
  variants: function, method
6495
6677
 
6678
+ - func: _scaled_mm(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None) -> (Tensor, Tensor)
6679
+ variants: function
6680
+ dispatch:
6681
+ CUDA: _scaled_mm_cuda
6682
+
6683
+ - func: _scaled_mm.out(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None, Tensor(a!) out, Tensor(b!) out_amax) -> (Tensor(a!), Tensor(b!))
6684
+ variants: function
6685
+ dispatch:
6686
+ CUDA: _scaled_mm_out_cuda
6687
+
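The two `_scaled_mm` entries added above are the new FP8 matmul primitive (CUDA-only per the dispatch table), returning both the product and a recorded amax tensor. A hedged sketch of how the schema could be exercised from Python, assuming a build and GPU with float8 support; `torch._scaled_mm` is a private op, and the keyword names below are taken directly from the schema above.

```python
# Hedged sketch of the new _scaled_mm schema above (private op, CUDA-only).
# Assumes float8 dtypes are available and the GPU supports FP8 matmul.
import torch

if torch.cuda.is_available():
    a = torch.randn(32, 32, device="cuda").to(torch.float8_e4m3fn)
    b = torch.randn(32, 32, device="cuda").to(torch.float8_e4m3fn).t()  # column-major operand
    scale_a = torch.tensor(1.0, device="cuda")
    scale_b = torch.tensor(1.0, device="cuda")
    # Returns (Tensor, Tensor) per the schema: the product and the recorded amax.
    out, amax = torch._scaled_mm(a, b, out_dtype=torch.bfloat16,
                                 scale_a=scale_a, scale_b=scale_b)
```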
6496
6688
  # NOTE [ Sparse: autograd and API ]
6497
6689
  #
6498
6690
  #
@@ -6605,12 +6797,17 @@
6605
6797
  # the default would never make sense.
6606
6798
 
6607
6799
  - func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6800
+ dispatch:
6801
+ CompositeExplicitAutograd: sparse_compressed_tensor
6802
+
6608
6803
  - func: sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6609
6804
  - func: sparse_csc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6610
6805
  - func: sparse_bsr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6611
6806
  - func: sparse_bsc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6612
6807
 
6613
6808
  - func: sparse_compressed_tensor.comp_plain_value(Tensor compressed_indices, Tensor plain_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6809
+ dispatch:
6810
+ CompositeExplicitAutograd: sparse_compressed_tensor
6614
6811
  - func: sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6615
6812
  - func: sparse_csc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6616
6813
  - func: sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
@@ -6627,15 +6824,15 @@
6627
6824
  CompositeExplicitAutograd: sparse_coo_tensor
6628
6825
  autogen: sparse_coo_tensor.size_out
6629
6826
 
6630
- - func: sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
6827
+ - func: sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
6631
6828
 
6632
- - func: sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
6829
+ - func: sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
6633
6830
 
6634
- - func: _sparse_coo_tensor_unsafe(Tensor indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
6831
+ - func: _sparse_coo_tensor_unsafe(Tensor indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
6635
6832
  dispatch:
6636
6833
  CompositeImplicitAutograd: _sparse_coo_tensor_unsafe_symint
6637
6834
 
6638
- - func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size) -> ()
6835
+ - func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size, bool? is_coalesced=None) -> ()
6639
6836
 
6640
6837
  - func: _validate_sparse_compressed_tensor_args(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, Layout layout) -> ()
6641
6838
  - func: _validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()
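The COO constructors above gain an optional `is_coalesced` flag, so a caller that already supplies sorted, duplicate-free indices can record that fact and spare a later `coalesce()`. A small sketch, assuming the flag is forwarded by the Python-level `torch.sparse_coo_tensor` (keyword name taken from the schema above):

```python
import torch

# Indices are already sorted with no duplicates, so the result can be
# marked as coalesced up front instead of being re-checked later.
indices = torch.tensor([[0, 1, 2],
                        [0, 1, 2]])
values = torch.tensor([1.0, 2.0, 3.0])
s = torch.sparse_coo_tensor(indices, values, (3, 3), is_coalesced=True)
assert s.is_coalesced()
```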
@@ -6648,7 +6845,7 @@
6648
6845
  SparseCPU, SparseCUDA, SparseMeta, Meta: new_with_dims_sparse
6649
6846
  autogen: _sparse_coo_tensor_with_dims.out
6650
6847
 
6651
- - func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
6848
+ - func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False, bool? is_coalesced=None) -> Tensor
6652
6849
  dispatch:
6653
6850
  SparseCPU, SparseCUDA, SparseMeta, Meta: new_with_dims_and_tensor_sparse_symint
6654
6851
  autogen: _sparse_coo_tensor_with_dims_and_tensors.out
@@ -6671,17 +6868,23 @@
6671
6868
  variants: method
6672
6869
  dispatch:
6673
6870
  SparseCPU, SparseCUDA: sparse_mask
6674
- SparseCsrCPU, SparseCsrCUDA: sparse_mask_sparse_csr
6871
+ SparseCsrCPU, SparseCsrCUDA: sparse_mask_sparse_compressed
6675
6872
  autogen: sparse_mask.out
6676
6873
 
6874
+ - func: _sparse_mask_projection(Tensor self, Tensor mask, bool accumulate_matches=False) -> Tensor
6875
+ variants: method
6876
+ dispatch:
6877
+ SparseCPU, SparseCUDA: sparse_mask_projection
6878
+ autogen: _sparse_mask_projection.out
6879
+
6677
6880
  - func: _to_cpu(Tensor[] tensors) -> Tensor[]
6678
6881
  variants: function
6679
6882
 
6680
- - func: to_dense(Tensor self, ScalarType? dtype=None) -> Tensor
6883
+ - func: to_dense(Tensor self, ScalarType? dtype=None, *, bool? masked_grad=None) -> Tensor
6681
6884
  variants: method
6682
6885
 
6683
6886
  # Special case of to_dense with custom derivative
6684
- - func: _to_dense(Tensor self, ScalarType? dtype=None) -> Tensor
6887
+ - func: _to_dense(Tensor self, ScalarType? dtype=None, bool? masked_grad=None) -> Tensor
6685
6888
  variants: method
6686
6889
  dispatch:
6687
6890
  SparseCPU, SparseCUDA: sparse_to_dense
@@ -6689,7 +6892,7 @@
6689
6892
  MkldnnCPU: mkldnn_to_dense
6690
6893
  autogen: _to_dense.out
6691
6894
 
6692
- - func: to_dense_backward(Tensor grad, Tensor input) -> Tensor
6895
+ - func: to_dense_backward(Tensor grad, Tensor input, bool? masked_grad=None) -> Tensor
6693
6896
 
6694
6897
  - func: sparse_dim(Tensor self) -> int
6695
6898
  variants: method
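`to_dense`, `_to_dense`, and `to_dense_backward` above all gain an optional `masked_grad` argument, which controls whether the backward of the densification produces a full dense gradient or one restricted to the sparse tensor's specified elements. A sketch, under the assumption that the keyword is exposed on the Python `Tensor.to_dense` method:

```python
import torch

s = torch.tensor([[0.0, 2.0],
                  [3.0, 0.0]]).to_sparse().requires_grad_(True)
d = s.to_dense(masked_grad=True)   # backward keeps only the specified elements
d.sum().backward()
print(s.grad)                      # gradient limited to the stored elements
```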
@@ -6859,51 +7062,80 @@
6859
7062
 
6860
7063
  - func: to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor
6861
7064
  variants: method
7065
+
7066
+ # Special case of to_sparse.sparse_dim with custom derivative
7067
+ - func: _to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor
7068
+ variants: method
6862
7069
  dispatch:
6863
7070
  CPU, CUDA: dense_to_sparse
6864
7071
  SparseCPU, SparseCUDA: sparse_coo_to_sparse
6865
7072
  SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse
6866
- autogen: to_sparse.sparse_dim_out
7073
+ autogen: _to_sparse.sparse_dim_out
6867
7074
 
6868
7075
  - func: to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor
6869
7076
  variants: method
7077
+
7078
+ # Special case of to_sparse with custom derivative
7079
+ - func: _to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor
7080
+ variants: method
6870
7081
  dispatch:
6871
7082
  CPU, CUDA: dense_to_sparse
6872
7083
  SparseCPU, SparseCUDA: sparse_coo_to_sparse
6873
7084
  SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse
6874
- autogen: to_sparse.out
7085
+ autogen: _to_sparse.out
6875
7086
 
6876
7087
  - func: to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor
6877
7088
  variants: method
7089
+
7090
+ # Special case of to_sparse_csr with custom derivative
7091
+ - func: _to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor
7092
+ variants: method
6878
7093
  dispatch:
6879
7094
  CPU, CUDA: dense_to_sparse_csr
6880
7095
  SparseCPU, SparseCUDA: coo_to_sparse_csr
6881
7096
  SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csr
6882
- autogen: to_sparse_csr.out
7097
+ autogen: _to_sparse_csr.out
6883
7098
 
6884
7099
  - func: to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor
6885
7100
  variants: method
7101
+
7102
+ # Special case of to_sparse_csc with custom derivative
7103
+ - func: _to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor
7104
+ variants: method
6886
7105
  dispatch:
6887
7106
  CPU, CUDA: dense_to_sparse_csc
6888
7107
  SparseCPU, SparseCUDA: coo_to_sparse_csc
6889
7108
  SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csc
6890
- autogen: to_sparse_csc.out
7109
+ autogen: _to_sparse_csc.out
6891
7110
 
6892
7111
  - func: to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
6893
7112
  variants: method
7113
+
7114
+ # Special case of to_sparse_bsr with custom derivative
7115
+ - func: _to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
7116
+ variants: method
6894
7117
  dispatch:
6895
7118
  CPU, CUDA: dense_to_sparse_bsr
6896
7119
  SparseCPU, SparseCUDA: coo_to_sparse_bsr
6897
7120
  SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsr
6898
- autogen: to_sparse_bsr.out
7121
+ autogen: _to_sparse_bsr.out
6899
7122
 
6900
7123
  - func: to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
6901
7124
  variants: method
7125
+
7126
+ # Special case of to_sparse_bsc with custom derivative
7127
+ - func: _to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
7128
+ variants: method
6902
7129
  dispatch:
6903
7130
  CPU, CUDA: dense_to_sparse_bsc
6904
7131
  SparseCPU, SparseCUDA: coo_to_sparse_bsc
6905
7132
  SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsc
6906
- autogen: to_sparse_bsc.out
7133
+ autogen: _to_sparse_bsc.out
7134
+
7135
+ - func: _to_sparse_semi_structured(Tensor dense) -> (Tensor, Tensor)
7136
+ variants: function
7137
+ dispatch:
7138
+ CUDA: _to_sparse_semi_structured
6907
7139
 
6908
7140
  - func: to_mkldnn(Tensor self, ScalarType? dtype=None) -> Tensor
6909
7141
  variants: method
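The `to_sparse*` entries above are reorganized: each public method keeps its signature but now delegates to a new private `_to_sparse*` op so autograd can attach a custom derivative, and a CUDA-only `_to_sparse_semi_structured` primitive is introduced for semi-structured (2:4) sparsity. User-facing conversions are unchanged, for example:

```python
import torch

x = torch.eye(4)
csr = x.to_sparse_csr()   # now routed through the private _to_sparse_csr op
coo = x.to_sparse()       # now routed through the private _to_sparse op
assert torch.equal(csr.to_dense(), x)
assert torch.equal(coo.to_dense(), x)
```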
@@ -7174,7 +7406,7 @@
7174
7406
 
7175
7407
  # NB: Does NOT check precondition that numel == 1
7176
7408
  - func: _local_scalar_dense(Tensor self) -> Scalar
7177
- tags: data_dependent_output
7409
+ tags: [core, data_dependent_output]
7178
7410
  dispatch:
7179
7411
  CPU: _local_scalar_dense_cpu
7180
7412
  CUDA: _local_scalar_dense_cuda
@@ -7187,8 +7419,9 @@
7187
7419
  dispatch:
7188
7420
  MPS: _lstm_mps
7189
7421
  autogen: _lstm_mps.out
7422
+ tags: nondeterministic_seeded
7190
7423
 
7191
- - func: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
7424
+ - func: lstm_mps_backward(Tensor? grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
7192
7425
  dispatch:
7193
7426
  MPS: lstm_mps_backward
7194
7427
  autogen: lstm_mps_backward.out
@@ -7226,20 +7459,28 @@
7226
7459
 
7227
7460
  # RNN cells and layers
7228
7461
  - func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor)
7462
+ tags: nondeterministic_seeded
7229
7463
 
7230
7464
  - func: lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor)
7465
+ tags: nondeterministic_seeded
7231
7466
 
7232
7467
  - func: gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
7468
+ tags: nondeterministic_seeded
7233
7469
 
7234
7470
  - func: gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
7471
+ tags: nondeterministic_seeded
7235
7472
 
7236
7473
  - func: rnn_tanh.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
7474
+ tags: nondeterministic_seeded
7237
7475
 
7238
7476
  - func: rnn_tanh.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
7477
+ tags: nondeterministic_seeded
7239
7478
 
7240
7479
  - func: rnn_relu.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
7480
+ tags: nondeterministic_seeded
7241
7481
 
7242
7482
  - func: rnn_relu.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
7483
+ tags: nondeterministic_seeded
7243
7484
 
7244
7485
  - func: lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor)
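The RNN entry points above are now tagged `nondeterministic_seeded`, marking them as consumers of the global RNG (their dropout masks); seeding before construction and the forward pass makes runs repeatable. A small illustration:

```python
import torch

x = torch.randn(5, 3, 8)    # (seq_len, batch, input_size)

torch.manual_seed(0)
lstm_a = torch.nn.LSTM(input_size=8, hidden_size=16, num_layers=2, dropout=0.5).train()
out_a, _ = lstm_a(x)

torch.manual_seed(0)
lstm_b = torch.nn.LSTM(input_size=8, hidden_size=16, num_layers=2, dropout=0.5).train()
out_b, _ = lstm_b(x)

# Same seed -> same weight init and same dropout masks -> identical outputs.
assert torch.allclose(out_a, out_b)
```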
7245
7486
 
@@ -7382,6 +7623,7 @@
7382
7623
  variants: function, method
7383
7624
  dispatch:
7384
7625
  CompositeExplicitAutograd: masked_fill
7626
+ NestedTensorCPU, NestedTensorCUDA: NestedTensor_masked_fill
7385
7627
  tags: pointwise
7386
7628
 
7387
7629
  - func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!)
@@ -7406,6 +7648,7 @@
7406
7648
  dispatch:
7407
7649
  CPU: masked_scatter__cpu
7408
7650
  CUDA: masked_scatter__cuda
7651
+ MPS: masked_scatter__mps
7409
7652
  autogen: masked_scatter.out
7410
7653
 
7411
7654
  - func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor
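`masked_scatter_` above gains an MPS kernel (as do `index_fill_`, the `bitwise_*` out variants, `random_`, and several others further down in this diff). The call itself is unchanged; the MPS branch below assumes an Apple-silicon build:

```python
import torch

dev = "mps" if torch.backends.mps.is_available() else "cpu"
t = torch.zeros(5, device=dev)
mask = torch.tensor([True, False, True, False, True], device=dev)
src = torch.tensor([1.0, 2.0, 3.0], device=dev)
t.masked_scatter_(mask, src)   # -> [1., 0., 2., 0., 3.]
```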
@@ -7503,6 +7746,7 @@
7503
7746
  dispatch:
7504
7747
  CPU: index_fill_
7505
7748
  CUDA: index_fill_
7749
+ MPS: index_fill_mps_
7506
7750
  autogen: index_fill.int_Scalar_out
7507
7751
 
7508
7752
  - func: index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
@@ -7516,6 +7760,7 @@
7516
7760
  variants: method
7517
7761
  dispatch:
7518
7762
  CPU, CUDA: index_fill_
7763
+ MPS: index_fill_mps_
7519
7764
  autogen: index_fill.int_Tensor_out
7520
7765
 
7521
7766
  - func: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor
@@ -7543,6 +7788,7 @@
7543
7788
  - func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor
7544
7789
  structured_delegate: scatter.src_out
7545
7790
  variants: function, method
7791
+ tags: core
7546
7792
 
7547
7793
  - func: scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!)
7548
7794
  structured_delegate: scatter.src_out
@@ -7558,6 +7804,7 @@
7558
7804
  - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
7559
7805
  structured_delegate: scatter.value_out
7560
7806
  variants: function, method
7807
+ tags: core
7561
7808
 
7562
7809
  - func: scatter_.value(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!)
7563
7810
  structured_delegate: scatter.value_out
@@ -7657,6 +7904,7 @@
7657
7904
  variants: function
7658
7905
  dispatch:
7659
7906
  CPU, CUDA: bitwise_and_out
7907
+ MPS: bitwise_and_out_mps
7660
7908
  tags: pointwise
7661
7909
 
7662
7910
  - func: bitwise_and.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
@@ -7671,7 +7919,7 @@
7671
7919
  variants: method, function
7672
7920
  dispatch:
7673
7921
  CompositeExplicitAutograd: bitwise_and
7674
- tags: pointwise
7922
+ tags: [core, pointwise]
7675
7923
 
7676
7924
  - func: bitwise_and.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
7677
7925
  device_check: NoCheck # TensorIterator
@@ -7721,6 +7969,7 @@
7721
7969
  variants: function
7722
7970
  dispatch:
7723
7971
  CPU, CUDA: bitwise_or_out
7972
+ MPS: bitwise_or_out_mps
7724
7973
  tags: pointwise
7725
7974
 
7726
7975
  - func: bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
@@ -7733,7 +7982,7 @@
7733
7982
  - func: bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor
7734
7983
  device_check: NoCheck # TensorIterator
7735
7984
  variants: method, function
7736
- tags: pointwise
7985
+ tags: [core, pointwise]
7737
7986
 
7738
7987
  - func: bitwise_or.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
7739
7988
  device_check: NoCheck # TensorIterator
@@ -7783,6 +8032,7 @@
7783
8032
  variants: function
7784
8033
  dispatch:
7785
8034
  CPU, CUDA: bitwise_xor_out
8035
+ MPS: bitwise_xor_out_mps
7786
8036
  tags: pointwise
7787
8037
 
7788
8038
  - func: bitwise_xor.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
@@ -7795,7 +8045,7 @@
7795
8045
  - func: bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor
7796
8046
  device_check: NoCheck # TensorIterator
7797
8047
  variants: method, function
7798
- tags: pointwise
8048
+ tags: [core, pointwise]
7799
8049
 
7800
8050
  - func: bitwise_xor.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
7801
8051
  device_check: NoCheck # TensorIterator
@@ -8067,6 +8317,7 @@
8067
8317
  variants: method
8068
8318
  dispatch:
8069
8319
  CPU, CUDA: random_
8320
+ MPS: random_mps_
8070
8321
  Meta: random_meta_
8071
8322
  autogen: random, random.out
8072
8323
 
@@ -8164,7 +8415,7 @@
8164
8415
  dispatch:
8165
8416
  CPU: trace_cpu
8166
8417
  CUDA: trace_cuda
8167
- MPS: trace_mps_out
8418
+ MPS: trace_mps
8168
8419
  autogen: trace.out
8169
8420
 
8170
8421
  - func: trace_backward(Tensor grad, SymInt[] sizes) -> Tensor
@@ -8604,6 +8855,15 @@
8604
8855
  MPS: nonzero_mps
8605
8856
  tags: [dynamic_output_shape, core]
8606
8857
 
8858
+ - func: nonzero_static.out(Tensor self, *, int size, int fill_value=-1, Tensor(a!) out) -> Tensor(a!)
8859
+ dispatch:
8860
+ CPU: nonzero_static_out_cpu
8861
+
8862
+ - func: nonzero_static(Tensor self, *, int size, int fill_value=-1) -> Tensor
8863
+ variants: method, function
8864
+ dispatch:
8865
+ CPU: nonzero_static_cpu
8866
+
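`nonzero_static` above is a new variant of `nonzero` whose output shape is fixed by the caller: it always returns `size` rows, padding with `fill_value` (or truncating), so the result shape is not data-dependent. CPU-only per the dispatch table; a sketch using the keyword names from the schema:

```python
import torch

x = torch.tensor([0, 3, 0, 5])
idx = torch.nonzero_static(x, size=4, fill_value=-1)
# Two real hits ([1] and [3]) followed by two rows of padding:
# tensor([[ 1],
#         [ 3],
#         [-1],
#         [-1]])
```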
8607
8867
  - func: nonzero_numpy(Tensor self) -> Tensor[]
8608
8868
  variants: method, function
8609
8869
 
@@ -8710,8 +8970,10 @@
8710
8970
  CPU, CUDA: linalg_solve_triangular
8711
8971
  MPS: linalg_solve_triangular_mps
8712
8972
 
8713
- - func: linalg_vander(Tensor x, *, int? N=None) -> Tensor
8973
+ - func: linalg_vander(Tensor x, *, SymInt? N=None) -> Tensor
8714
8974
  python_module: linalg
8975
+ dispatch:
8976
+ CompositeImplicitAutograd: linalg_vander_symint
8715
8977
 
8716
8978
  - func: svd.U(Tensor self, bool some=True, bool compute_uv=True, *, Tensor(a!) U, Tensor(b!) S, Tensor(c!) V) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) V)
8717
8979
 
@@ -8917,6 +9179,7 @@
8917
9179
  structured_inherits: TensorIteratorBase
8918
9180
  dispatch:
8919
9181
  CPU, CUDA: erfinv_out
9182
+ MPS: erfinv_out_mps
8920
9183
  SparseCPU, SparseCUDA: erfinv_sparse_out
8921
9184
  SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr_out
8922
9185
  tags: pointwise
@@ -8999,7 +9262,7 @@
8999
9262
  structured_inherits: TensorIteratorBase
9000
9263
  dispatch:
9001
9264
  CPU, CUDA: atan2_out
9002
- MPS: atan2_mps_out
9265
+ MPS: atan2_out_mps
9003
9266
  tags: pointwise
9004
9267
 
9005
9268
  - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)
@@ -9030,6 +9293,7 @@
9030
9293
  structured_inherits: TensorIteratorBase
9031
9294
  dispatch:
9032
9295
  CPU, CUDA: lerp_Scalar
9296
+ MPS: lerp_Scalar_mps
9033
9297
  tags: pointwise
9034
9298
 
9035
9299
  - func: lerp.Tensor_out(Tensor self, Tensor end, Tensor weight, *, Tensor(a!) out) -> Tensor(a!)
@@ -9038,6 +9302,7 @@
9038
9302
  structured_inherits: TensorIteratorBase
9039
9303
  dispatch:
9040
9304
  CPU, CUDA: lerp_Tensor
9305
+ MPS: lerp_Tensor_mps
9041
9306
  tags: pointwise
9042
9307
 
9043
9308
  - func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor
@@ -9054,46 +9319,46 @@
9054
9319
 
9055
9320
  - func: histc.out(Tensor self, int bins=100, Scalar min=0, Scalar max=0, *, Tensor(a!) out) -> Tensor(a!)
9056
9321
  dispatch:
9057
- CPU: histogram_histc_cpu_out
9322
+ CPU, MPS: histogram_histc_out
9058
9323
  CUDA: _histc_out_cuda
9059
9324
 
9060
9325
  - func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor
9061
9326
  variants: method, function
9062
9327
  dispatch:
9063
- CPU: histogram_histc_cpu
9328
+ CPU, MPS: histogram_histc
9064
9329
  CUDA: _histc_cuda
9065
9330
 
9066
9331
  - func: histogram.bins_tensor_out(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges)
9067
9332
  dispatch:
9068
- CPU: histogram_out_cpu
9333
+ CPU, MPS: histogram_out
9069
9334
 
9070
9335
  - func: histogram.bins_tensor(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges)
9071
9336
  variants: method, function
9072
9337
  dispatch:
9073
- CPU: histogram_cpu
9338
+ CPU, MPS: histogram
9074
9339
 
9075
9340
  - func: histogram.bin_ct_out(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges)
9076
9341
  dispatch:
9077
- CPU: histogram_out_cpu
9342
+ CPU, MPS: histogram_out
9078
9343
 
9079
9344
  - func: histogram.bin_ct(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges)
9080
9345
  variants: method, function
9081
9346
  dispatch:
9082
- CPU: histogram_cpu
9347
+ CPU, MPS: histogram
9083
9348
 
9084
9349
  - func: _histogramdd_bin_edges(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False) -> Tensor[]
9085
9350
  dispatch:
9086
- CPU: histogramdd_bin_edges_cpu
9351
+ CPU, MPS: histogramdd_bin_edges
9087
9352
  autogen: _histogramdd_bin_edges.out
9088
9353
 
9089
9354
  - func: _histogramdd_from_bin_cts(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False) -> Tensor
9090
9355
  dispatch:
9091
- CPU: histogramdd_cpu
9356
+ CPU, MPS: _histogramdd
9092
9357
  autogen: _histogramdd_from_bin_cts.out
9093
9358
 
9094
9359
  - func: _histogramdd_from_bin_tensors(Tensor self, Tensor[] bins, *, Tensor? weight=None, bool density=False) -> Tensor
9095
9360
  dispatch:
9096
- CPU: histogramdd_cpu
9361
+ CPU, MPS: _histogramdd
9097
9362
  autogen: _histogramdd_from_bin_tensors.out
9098
9363
 
9099
9364
  - func: histogramdd(Tensor self, int[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)
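The histogram family above (`histc`, `histogram`, and the `_histogramdd*` helpers) moves from CPU-specific kernels to shared ones that also dispatch on MPS. The Python API is unchanged; the MPS branch below assumes an Apple-silicon build:

```python
import torch

x = torch.tensor([0.1, 0.4, 0.45, 0.9])
hist, edges = torch.histogram(x, bins=4, range=(0.0, 1.0))

if torch.backends.mps.is_available():
    # Same call, now backed by an MPS kernel per the dispatch entries above.
    hist_mps, edges_mps = torch.histogram(x.to("mps"), bins=4, range=(0.0, 1.0))
```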
@@ -9113,7 +9378,7 @@
9113
9378
  variants: method, function
9114
9379
  dispatch:
9115
9380
  CompositeExplicitAutograd: fmod
9116
- tags: pointwise
9381
+ tags: [core, pointwise]
9117
9382
 
9118
9383
  - func: fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
9119
9384
  device_check: NoCheck # TensorIterator
@@ -9148,6 +9413,7 @@
9148
9413
  structured_inherits: TensorIteratorBase
9149
9414
  dispatch:
9150
9415
  CPU, CUDA: hypot_out
9416
+ MPS: hypot_out_mps
9151
9417
  tags: pointwise
9152
9418
 
9153
9419
  - func: hypot(Tensor self, Tensor other) -> Tensor
@@ -9220,7 +9486,7 @@
9220
9486
  variants: method, function
9221
9487
  dispatch:
9222
9488
  CompositeExplicitAutograd: remainder
9223
- tags: pointwise
9489
+ tags: [core, pointwise]
9224
9490
 
9225
9491
  - func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
9226
9492
  variants: method
@@ -9265,12 +9531,11 @@
9265
9531
  MPS: min_mps
9266
9532
  QuantizedCPU: min_quantized_cpu
9267
9533
 
9268
- # Not to be confused with binary op `min.out`. Commented because of failed CI
9269
- # FIXME: enable this
9270
- #- func: min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
9271
- # device_check: NoCheck # TensorIterator
9272
- # dispatch:
9273
- # CompositeExplicitAutograd: min_unary_out
9534
+ - func: min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
9535
+ device_check: NoCheck # TensorIterator
9536
+ dispatch:
9537
+ CPU, CUDA: min_unary_out
9538
+ QuantizedCPU: min_quantized_unary_out
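The previously commented-out `min.unary_out` overload is now enabled, so the full reduction can write into a preallocated 0-d tensor. A sketch, assuming the Python binding for `torch.min` picks up the new overload when only `out=` is passed:

```python
import torch

x = torch.tensor([3.0, 1.0, 2.0])
out = torch.empty(())
torch.min(x, out=out)   # full reduction written in place
print(out)              # tensor(1.)
```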
9274
9539
 
9275
9540
  - func: fmin(Tensor self, Tensor other) -> Tensor
9276
9541
  structured_delegate: fmin.out
@@ -9283,7 +9548,7 @@
9283
9548
  structured_inherits: TensorIteratorBase
9284
9549
  device_check: NoCheck # TensorIterator
9285
9550
  dispatch:
9286
- CPU, CUDA: fmin_out
9551
+ CPU, CUDA, MPS: fmin_out
9287
9552
  tags: pointwise
9288
9553
 
9289
9554
  - func: max(Tensor self) -> Tensor
@@ -9305,7 +9570,7 @@
9305
9570
  structured_inherits: TensorIteratorBase
9306
9571
  device_check: NoCheck # TensorIterator
9307
9572
  dispatch:
9308
- CPU, CUDA: fmax_out
9573
+ CPU, CUDA, MPS: fmax_out
9309
9574
  tags: pointwise
9310
9575
 
9311
9576
  - func: maximum(Tensor self, Tensor other) -> Tensor
@@ -9402,6 +9667,7 @@
9402
9667
  variants: method, function
9403
9668
  dispatch:
9404
9669
  CompositeExplicitAutograd: sort
9670
+ tags: core
9405
9671
 
9406
9672
  - func: sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
9407
9673
  structured_delegate: sort.values_stable
@@ -9438,14 +9704,14 @@
9438
9704
  - func: argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor
9439
9705
  variants: method, function
9440
9706
 
9441
- - func: topk.values(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
9707
+ - func: topk.values(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
9442
9708
  structured: True
9443
9709
  dispatch:
9444
9710
  CPU: topk_out_cpu
9445
9711
  CUDA: topk_out_cuda
9446
9712
  MPS: topk_out_mps
9447
9713
 
9448
- - func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
9714
+ - func: topk(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
9449
9715
  variants: method, function
9450
9716
  structured_delegate: topk.values
9451
9717
  dispatch:
@@ -9470,6 +9736,7 @@
9470
9736
  variants: method, function
9471
9737
  dispatch:
9472
9738
  SparseCPU, SparseCUDA: any_sparse
9739
+ tags: core
9473
9740
 
9474
9741
  - func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
9475
9742
  device_check: NoCheck
@@ -9483,6 +9750,7 @@
9483
9750
  structured: True
9484
9751
  dispatch:
9485
9752
  CPU, CUDA: renorm_out
9753
+ MPS: renorm_out_mps
9486
9754
 
9487
9755
  - func: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor
9488
9756
  device_check: NoCheck # TensorIterator
@@ -9537,6 +9805,7 @@
9537
9805
  structured: True
9538
9806
  dispatch:
9539
9807
  CPU, CUDA: pow_Scalar_out
9808
+ MPS: pow_Scalar_out_mps
9540
9809
  tags: pointwise
9541
9810
 
9542
9811
  - func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor
@@ -9611,6 +9880,7 @@
9611
9880
  MPS: normal_mps_
9612
9881
  Meta: normal_meta_
9613
9882
  SparseCsrCPU, SparseCsrCUDA: normal_sparse_csr_
9883
+ NestedTensorCPU, NestedTensorCUDA: normal_nested_
9614
9884
  autogen: normal.out
9615
9885
 
9616
9886
  # Only used by the functionalization pass.
@@ -9720,156 +9990,155 @@
9720
9990
  CUDA: foreach_tensor_add_scalar_kernel_cuda_
9721
9991
  autogen: _foreach_add.Scalar_out
9722
9992
 
9723
- - func: _foreach_sub.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
9993
+ - func: _foreach_add.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
9724
9994
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9725
9995
  variants: function
9726
9996
  dispatch:
9727
- CPU: foreach_tensor_sub_scalar_kernel_slow
9728
- CUDA: foreach_tensor_sub_scalar_kernel_cuda
9997
+ CPU: foreach_tensor_add_list_kernel_slow
9998
+ CUDA: foreach_tensor_add_list_kernel_cuda
9729
9999
 
9730
- - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10000
+ - func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
9731
10001
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9732
10002
  variants: function
9733
10003
  dispatch:
9734
- CPU: foreach_tensor_sub_scalar_kernel_slow_
9735
- CUDA: foreach_tensor_sub_scalar_kernel_cuda_
9736
- autogen: _foreach_sub.Scalar_out
10004
+ CPU: foreach_tensor_add_list_kernel_slow_
10005
+ CUDA: foreach_tensor_add_list_kernel_cuda_
10006
+ autogen: _foreach_add.List_out
9737
10007
 
9738
- - func: _foreach_mul.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
10008
+ - func: _foreach_add.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
9739
10009
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9740
10010
  variants: function
9741
10011
  dispatch:
9742
- CPU: foreach_tensor_mul_scalar_kernel_slow
9743
- CUDA: foreach_tensor_mul_scalar_kernel_cuda
10012
+ CPU: foreach_tensor_add_scalarlist_kernel_slow
10013
+ CUDA: foreach_tensor_add_scalarlist_kernel_cuda
9744
10014
 
9745
- - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10015
+ - func: _foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
9746
10016
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9747
10017
  variants: function
9748
10018
  dispatch:
9749
- CPU: foreach_tensor_mul_scalar_kernel_slow_
9750
- CUDA: foreach_tensor_mul_scalar_kernel_cuda_
9751
- autogen: _foreach_mul.Scalar_out
10019
+ CPU: foreach_tensor_add_scalarlist_kernel_slow_
10020
+ CUDA: foreach_tensor_add_scalarlist_kernel_cuda_
10021
+ autogen: _foreach_add.ScalarList_out
9752
10022
 
9753
- - func: _foreach_div.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
10023
+ - func: _foreach_sub.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
9754
10024
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9755
10025
  variants: function
9756
10026
  dispatch:
9757
- CPU: foreach_tensor_div_scalar_kernel_slow
9758
- CUDA: foreach_tensor_div_scalar_kernel_cuda
10027
+ CPU: foreach_tensor_sub_scalar_kernel_slow
10028
+ CUDA: foreach_tensor_sub_scalar_kernel_cuda
9759
10029
 
9760
- - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10030
+ - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
9761
10031
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9762
10032
  variants: function
9763
10033
  dispatch:
9764
- CPU: foreach_tensor_div_scalar_kernel_slow_
9765
- CUDA: foreach_tensor_div_scalar_kernel_cuda_
9766
- autogen: _foreach_div.Scalar_out
10034
+ CPU: foreach_tensor_sub_scalar_kernel_slow_
10035
+ CUDA: foreach_tensor_sub_scalar_kernel_cuda_
10036
+ autogen: _foreach_sub.Scalar_out
9767
10037
 
9768
- - func: _foreach_clamp_min.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
10038
+ - func: _foreach_sub.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
9769
10039
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9770
10040
  variants: function
9771
10041
  dispatch:
9772
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow
9773
- CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
10042
+ CPU: foreach_tensor_sub_list_kernel_slow
10043
+ CUDA: foreach_tensor_sub_list_kernel_cuda
9774
10044
 
9775
- - func: _foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10045
+ - func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
9776
10046
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9777
10047
  variants: function
9778
10048
  dispatch:
9779
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
9780
- CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
9781
- autogen: _foreach_clamp_min.Scalar_out
10049
+ CPU: foreach_tensor_sub_list_kernel_slow_
10050
+ CUDA: foreach_tensor_sub_list_kernel_cuda_
10051
+ autogen: _foreach_sub.List_out
9782
10052
 
9783
- - func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
10053
+ - func: _foreach_sub.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
9784
10054
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9785
10055
  variants: function
9786
10056
  dispatch:
9787
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow
9788
- CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
10057
+ CPU: foreach_tensor_sub_scalarlist_kernel_slow
10058
+ CUDA: foreach_tensor_sub_scalarlist_kernel_cuda
9789
10059
 
9790
- - func: _foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10060
+ - func: _foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
9791
10061
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9792
10062
  variants: function
9793
10063
  dispatch:
9794
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
9795
- CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
9796
- autogen: _foreach_clamp_max.Scalar_out
10064
+ CPU: foreach_tensor_sub_scalarlist_kernel_slow_
10065
+ CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_
10066
+ autogen: _foreach_sub.ScalarList_out
9797
10067
 
9798
- # foreach_minimum/maximum dispatches to clamp_max/min
9799
- - func: _foreach_maximum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
10068
+ - func: _foreach_mul.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
9800
10069
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9801
10070
  variants: function
9802
10071
  dispatch:
9803
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow
9804
- CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
10072
+ CPU: foreach_tensor_mul_scalar_kernel_slow
10073
+ CUDA: foreach_tensor_mul_scalar_kernel_cuda
9805
10074
 
9806
- - func: _foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10075
+ - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
9807
10076
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9808
10077
  variants: function
9809
10078
  dispatch:
9810
- CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
9811
- CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
9812
- autogen: _foreach_maximum.Scalar_out
10079
+ CPU: foreach_tensor_mul_scalar_kernel_slow_
10080
+ CUDA: foreach_tensor_mul_scalar_kernel_cuda_
10081
+ autogen: _foreach_mul.Scalar_out
9813
10082
 
9814
- - func: _foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
10083
+ - func: _foreach_mul.List(Tensor[] self, Tensor[] other) -> Tensor[]
9815
10084
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9816
10085
  variants: function
9817
10086
  dispatch:
9818
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow
9819
- CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
10087
+ CPU: foreach_tensor_mul_list_kernel_slow
10088
+ CUDA: foreach_tensor_mul_list_kernel_cuda
9820
10089
 
9821
- - func: _foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10090
+ - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> ()
9822
10091
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9823
10092
  variants: function
9824
10093
  dispatch:
9825
- CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
9826
- CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
9827
- autogen: _foreach_minimum.Scalar_out
10094
+ CPU: foreach_tensor_mul_list_kernel_slow_
10095
+ CUDA: foreach_tensor_mul_list_kernel_cuda_
10096
+ autogen: _foreach_mul.List_out
9828
10097
 
9829
- - func: _foreach_add.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
10098
+ - func: _foreach_mul.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
9830
10099
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9831
10100
  variants: function
9832
10101
  dispatch:
9833
- CPU: foreach_tensor_add_list_kernel_slow
9834
- CUDA: foreach_tensor_add_list_kernel_cuda
10102
+ CPU: foreach_tensor_mul_scalarlist_kernel_slow
10103
+ CUDA: foreach_tensor_mul_scalarlist_kernel_cuda
9835
10104
 
9836
- - func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
10105
+ - func: _foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
9837
10106
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9838
10107
  variants: function
9839
10108
  dispatch:
9840
- CPU: foreach_tensor_add_list_kernel_slow_
9841
- CUDA: foreach_tensor_add_list_kernel_cuda_
9842
- autogen: _foreach_add.List_out
10109
+ CPU: foreach_tensor_mul_scalarlist_kernel_slow_
10110
+ CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_
10111
+ autogen: _foreach_mul.ScalarList_out
9843
10112
 
9844
- - func: _foreach_sub.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
10113
+ - func: _foreach_mul.Tensor(Tensor[] self, Tensor other) -> Tensor[]
9845
10114
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9846
10115
  variants: function
9847
10116
  dispatch:
9848
- CPU: foreach_tensor_sub_list_kernel_slow
9849
- CUDA: foreach_tensor_sub_list_kernel_cuda
10117
+ CPU: foreach_tensor_mul_tensor_kernel_slow
10118
+ CUDA: foreach_tensor_mul_tensor_kernel_cuda
9850
10119
 
9851
- - func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
10120
+ - func: _foreach_mul_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
9852
10121
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9853
10122
  variants: function
9854
10123
  dispatch:
9855
- CPU: foreach_tensor_sub_list_kernel_slow_
9856
- CUDA: foreach_tensor_sub_list_kernel_cuda_
9857
- autogen: _foreach_sub.List_out
10124
+ CPU: foreach_tensor_mul_tensor_kernel_slow_
10125
+ CUDA: foreach_tensor_mul_tensor_kernel_cuda_
10126
+ autogen: _foreach_mul.Tensor_out
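Besides the reordering of the `_foreach_*` entries in this hunk, a new `_foreach_mul.Tensor` overload is added: every tensor in a list is multiplied by one and the same tensor (for example a learning rate kept as a 0-d tensor) in a single fused call. A sketch using the private API:

```python
import torch

params = [torch.ones(3), torch.ones(2, 2)]
lr = torch.tensor(0.5)           # single tensor applied to the whole list
torch._foreach_mul_(params, lr)  # in-place counterpart of _foreach_mul.Tensor
```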
9858
10127
 
9859
- - func: _foreach_mul.List(Tensor[] self, Tensor[] other) -> Tensor[]
10128
+ - func: _foreach_div.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
9860
10129
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9861
10130
  variants: function
9862
10131
  dispatch:
9863
- CPU: foreach_tensor_mul_list_kernel_slow
9864
- CUDA: foreach_tensor_mul_list_kernel_cuda
10132
+ CPU: foreach_tensor_div_scalar_kernel_slow
10133
+ CUDA: foreach_tensor_div_scalar_kernel_cuda
9865
10134
 
9866
- - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10135
+ - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
9867
10136
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9868
10137
  variants: function
9869
10138
  dispatch:
9870
- CPU: foreach_tensor_mul_list_kernel_slow_
9871
- CUDA: foreach_tensor_mul_list_kernel_cuda_
9872
- autogen: _foreach_mul.List_out
10139
+ CPU: foreach_tensor_div_scalar_kernel_slow_
10140
+ CUDA: foreach_tensor_div_scalar_kernel_cuda_
10141
+ autogen: _foreach_div.Scalar_out
9873
10142
 
9874
10143
  - func: _foreach_div.List(Tensor[] self, Tensor[] other) -> Tensor[]
9875
10144
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -9886,20 +10155,35 @@
9886
10155
  CUDA: foreach_tensor_div_list_kernel_cuda_
9887
10156
  autogen: _foreach_div.List_out
9888
10157
 
9889
- - func: _foreach_clamp_min.List(Tensor[] self, Tensor[] other) -> Tensor[]
10158
+ - func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
9890
10159
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9891
10160
  variants: function
9892
10161
  dispatch:
9893
- CPU: foreach_tensor_clamp_min_list_kernel_slow
9894
- CUDA: foreach_tensor_clamp_min_list_kernel_cuda
10162
+ CPU: foreach_tensor_div_scalarlist_kernel_slow
10163
+ CUDA: foreach_tensor_div_scalarlist_kernel_cuda
9895
10164
 
9896
- - func: _foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10165
+ - func: _foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
9897
10166
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9898
10167
  variants: function
9899
10168
  dispatch:
9900
- CPU: foreach_tensor_clamp_min_list_kernel_slow_
9901
- CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
9902
- autogen: _foreach_clamp_min.List_out
10169
+ CPU: foreach_tensor_div_scalarlist_kernel_slow_
10170
+ CUDA: foreach_tensor_div_scalarlist_kernel_cuda_
10171
+ autogen: _foreach_div.ScalarList_out
10172
+
10173
+ - func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
10174
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10175
+ variants: function
10176
+ dispatch:
10177
+ CPU: foreach_tensor_clamp_max_scalar_kernel_slow
10178
+ CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
10179
+
10180
+ - func: _foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10181
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10182
+ variants: function
10183
+ dispatch:
10184
+ CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
10185
+ CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
10186
+ autogen: _foreach_clamp_max.Scalar_out
9903
10187
 
9904
10188
  - func: _foreach_clamp_max.List(Tensor[] self, Tensor[] other) -> Tensor[]
9905
10189
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -9916,143 +10200,143 @@
9916
10200
  CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
9917
10201
  autogen: _foreach_clamp_max.List_out
9918
10202
 
9919
- # foreach_minimum/maximum dispatches to clamp_max/min
9920
- - func: _foreach_maximum.List(Tensor[] self, Tensor[] other) -> Tensor[]
10203
+ - func: _foreach_clamp_max.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
9921
10204
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9922
10205
  variants: function
9923
10206
  dispatch:
9924
- CPU: foreach_tensor_clamp_min_list_kernel_slow
9925
- CUDA: foreach_tensor_clamp_min_list_kernel_cuda
10207
+ CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow
10208
+ CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda
9926
10209
 
9927
- - func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10210
+ - func: _foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
9928
10211
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9929
10212
  variants: function
9930
10213
  dispatch:
9931
- CPU: foreach_tensor_clamp_min_list_kernel_slow_
9932
- CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
9933
- autogen: _foreach_maximum.List_out
10214
+ CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
10215
+ CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
10216
+ autogen: _foreach_clamp_max.ScalarList_out
9934
10217
 
9935
- - func: _foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[]
10218
+ - func: _foreach_clamp_min.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
9936
10219
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9937
10220
  variants: function
9938
10221
  dispatch:
9939
- CPU: foreach_tensor_clamp_max_list_kernel_slow
9940
- CUDA: foreach_tensor_clamp_max_list_kernel_cuda
10222
+ CPU: foreach_tensor_clamp_min_scalar_kernel_slow
10223
+ CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
9941
10224
 
9942
- - func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10225
+ - func: _foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
9943
10226
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9944
10227
  variants: function
9945
10228
  dispatch:
9946
- CPU: foreach_tensor_clamp_max_list_kernel_slow_
9947
- CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
9948
- autogen: _foreach_minimum.List_out
9949
-
10229
+ CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
10230
+ CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
10231
+ autogen: _foreach_clamp_min.Scalar_out
9950
10232
 
9951
- - func: _foreach_add.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
10233
+ - func: _foreach_clamp_min.List(Tensor[] self, Tensor[] other) -> Tensor[]
9952
10234
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9953
10235
  variants: function
9954
10236
  dispatch:
9955
- CPU: foreach_tensor_add_scalarlist_kernel_slow
9956
- CUDA: foreach_tensor_add_scalarlist_kernel_cuda
10237
+ CPU: foreach_tensor_clamp_min_list_kernel_slow
10238
+ CUDA: foreach_tensor_clamp_min_list_kernel_cuda
9957
10239
 
9958
- - func: _foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10240
+ - func: _foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> ()
9959
10241
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9960
10242
  variants: function
9961
10243
  dispatch:
9962
- CPU: foreach_tensor_add_scalarlist_kernel_slow_
9963
- CUDA: foreach_tensor_add_scalarlist_kernel_cuda_
9964
- autogen: _foreach_add.ScalarList_out
10244
+ CPU: foreach_tensor_clamp_min_list_kernel_slow_
10245
+ CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
10246
+ autogen: _foreach_clamp_min.List_out
9965
10247
 
9966
- - func: _foreach_sub.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
10248
+ - func: _foreach_clamp_min.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
9967
10249
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9968
10250
  variants: function
9969
10251
  dispatch:
9970
- CPU: foreach_tensor_sub_scalarlist_kernel_slow
9971
- CUDA: foreach_tensor_sub_scalarlist_kernel_cuda
10252
+ CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
10253
+ CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
9972
10254
 
9973
- - func: _foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10255
+ - func: _foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
9974
10256
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9975
10257
  variants: function
9976
10258
  dispatch:
9977
- CPU: foreach_tensor_sub_scalarlist_kernel_slow_
9978
- CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_
9979
- autogen: _foreach_sub.ScalarList_out
10259
+ CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
10260
+ CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
10261
+ autogen: _foreach_clamp_min.ScalarList_out
9980
10262
 
9981
- - func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
10263
+ # foreach_minimum/maximum dispatches to clamp_max/min
10264
+ - func: _foreach_maximum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
9982
10265
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9983
10266
  variants: function
9984
10267
  dispatch:
9985
- CPU: foreach_tensor_div_scalarlist_kernel_slow
9986
- CUDA: foreach_tensor_div_scalarlist_kernel_cuda
10268
+ CPU: foreach_tensor_clamp_min_scalar_kernel_slow
10269
+ CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda
9987
10270
 
9988
- - func: _foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10271
+ - func: _foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
9989
10272
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9990
10273
  variants: function
9991
10274
  dispatch:
9992
- CPU: foreach_tensor_div_scalarlist_kernel_slow_
9993
- CUDA: foreach_tensor_div_scalarlist_kernel_cuda_
9994
- autogen: _foreach_div.ScalarList_out
10275
+ CPU: foreach_tensor_clamp_min_scalar_kernel_slow_
10276
+ CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
10277
+ autogen: _foreach_maximum.Scalar_out
9995
10278
 
9996
- - func: _foreach_mul.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
10279
+ # foreach_minimum/maximum dispatches to clamp_max/min
10280
+ - func: _foreach_maximum.List(Tensor[] self, Tensor[] other) -> Tensor[]
9997
10281
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
9998
10282
  variants: function
9999
10283
  dispatch:
10000
- CPU: foreach_tensor_mul_scalarlist_kernel_slow
10001
- CUDA: foreach_tensor_mul_scalarlist_kernel_cuda
10284
+ CPU: foreach_tensor_clamp_min_list_kernel_slow
10285
+ CUDA: foreach_tensor_clamp_min_list_kernel_cuda
10002
10286
 
10003
- - func: _foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10287
+ - func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10004
10288
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10005
10289
  variants: function
10006
10290
  dispatch:
10007
- CPU: foreach_tensor_mul_scalarlist_kernel_slow_
10008
- CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_
10009
- autogen: _foreach_mul.ScalarList_out
10291
+ CPU: foreach_tensor_clamp_min_list_kernel_slow_
10292
+ CUDA: foreach_tensor_clamp_min_list_kernel_cuda_
10293
+ autogen: _foreach_maximum.List_out
10010
10294
 
10011
- - func: _foreach_clamp_min.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
10295
+ # foreach_minimum/maximum dispatches to clamp_max/min
10296
+ - func: _foreach_maximum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
10012
10297
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10013
10298
  variants: function
10014
10299
  dispatch:
10015
10300
  CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
10016
10301
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
10017
10302
 
10018
- - func: _foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10303
+ - func: _foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10019
10304
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10020
10305
  variants: function
10021
10306
  dispatch:
10022
10307
  CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
10023
10308
  CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
10024
- autogen: _foreach_clamp_min.ScalarList_out
10309
+ autogen: _foreach_maximum.ScalarList_out
10025
10310
 
10026
- - func: _foreach_clamp_max.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
10311
+ - func: _foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
10027
10312
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10028
10313
  variants: function
10029
10314
  dispatch:
10030
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow
10031
- CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda
10315
+ CPU: foreach_tensor_clamp_max_scalar_kernel_slow
10316
+ CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda
10032
10317
 
10033
- - func: _foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10318
+ - func: _foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
10034
10319
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10035
10320
  variants: function
10036
10321
  dispatch:
10037
- CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_
10038
- CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
10039
- autogen: _foreach_clamp_max.ScalarList_out
10322
+ CPU: foreach_tensor_clamp_max_scalar_kernel_slow_
10323
+ CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_
10324
+ autogen: _foreach_minimum.Scalar_out
10040
10325
 
10041
- # foreach_minimum/maximum dispatches to clamp_max/min
10042
- - func: _foreach_maximum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
10326
+ - func: _foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[]
10043
10327
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10044
10328
  variants: function
10045
10329
  dispatch:
10046
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow
10047
- CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda
10330
+ CPU: foreach_tensor_clamp_max_list_kernel_slow
10331
+ CUDA: foreach_tensor_clamp_max_list_kernel_cuda
10048
10332
 
10049
- - func: _foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
10333
+ - func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
10050
10334
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10051
10335
  variants: function
10052
10336
  dispatch:
10053
- CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_
10054
- CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_
10055
- autogen: _foreach_maximum.ScalarList_out
10337
+ CPU: foreach_tensor_clamp_max_list_kernel_slow_
10338
+ CUDA: foreach_tensor_clamp_max_list_kernel_cuda_
10339
+ autogen: _foreach_minimum.List_out
10056
10340
 
10057
10341
  - func: _foreach_minimum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
10058
10342
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -10069,43 +10353,95 @@
10069
10353
  CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_
10070
10354
  autogen: _foreach_minimum.ScalarList_out
10071
10355
 
10072
- - func: _foreach_exp(Tensor[] self) -> Tensor[]
10356
+ - func: _foreach_addcdiv.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
10073
10357
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10074
10358
  variants: function
10075
10359
  dispatch:
10076
- CPU: foreach_tensor_exp_slow
10077
- CUDA: foreach_tensor_exp_cuda
10360
+ CPU: foreach_tensor_addcdiv_scalar_slow
10361
+ CUDA: foreach_tensor_addcdiv_scalar_cuda
10078
10362
 
10079
- - func: _foreach_zero_(Tensor(a!)[] self) -> ()
10363
+ - func: _foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
10080
10364
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10081
10365
  variants: function
10082
10366
  dispatch:
10083
- CPU: foreach_tensor_zero_slow_
10084
- CUDA: foreach_tensor_zero_cuda_
10085
- autogen: _foreach_zero, _foreach_zero.out
10367
+ CPU: foreach_tensor_addcdiv_scalarlist_slow
10368
+ CUDA: foreach_tensor_addcdiv_scalarlist_cuda
10086
10369
 
10087
- - func: _foreach_exp_(Tensor(a!)[] self) -> ()
10370
+ - func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
10088
10371
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10089
10372
  variants: function
10090
10373
  dispatch:
10091
- CPU: foreach_tensor_exp_slow_
10092
- CUDA: foreach_tensor_exp_cuda_
10093
- autogen: _foreach_exp.out
10374
+ CPU: foreach_tensor_addcdiv_tensor_slow
10375
+ CUDA: foreach_tensor_addcdiv_tensor_cuda
10094
10376
 
10095
- - func: _foreach_sqrt(Tensor[] self) -> Tensor[]
10377
+ - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
10096
10378
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10097
10379
  variants: function
10098
10380
  dispatch:
10099
- CPU: foreach_tensor_sqrt_slow
10100
- CUDA: foreach_tensor_sqrt_cuda
10381
+ CPU: foreach_tensor_addcdiv_scalar_slow_
10382
+ CUDA: foreach_tensor_addcdiv_scalar_cuda_
10383
+ autogen: _foreach_addcdiv.Scalar_out
10101
10384
 
10102
- - func: _foreach_sqrt_(Tensor(a!)[] self) -> ()
10385
+ - func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
10103
10386
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10104
10387
  variants: function
10105
10388
  dispatch:
10106
- CPU: foreach_tensor_sqrt_slow_
10107
- CUDA: foreach_tensor_sqrt_cuda_
10108
- autogen: _foreach_sqrt.out
10389
+ CPU: foreach_tensor_addcdiv_scalarlist_slow_
10390
+ CUDA: foreach_tensor_addcdiv_scalarlist_cuda_
10391
+ autogen: _foreach_addcdiv.ScalarList_out
10392
+
10393
+ - func: _foreach_addcdiv_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
10394
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10395
+ variants: function
10396
+ dispatch:
10397
+ CPU: foreach_tensor_addcdiv_tensor_slow_
10398
+ CUDA: foreach_tensor_addcdiv_tensor_cuda_
10399
+ autogen: _foreach_addcdiv.Tensor_out
10400
+
10401
+ - func: _foreach_addcmul.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
10402
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10403
+ variants: function
10404
+ dispatch:
10405
+ CPU: foreach_tensor_addcmul_scalar_slow
10406
+ CUDA: foreach_tensor_addcmul_scalar_cuda
10407
+
10408
+ - func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
10409
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10410
+ variants: function
10411
+ dispatch:
10412
+ CPU: foreach_tensor_addcmul_scalarlist_slow
10413
+ CUDA: foreach_tensor_addcmul_scalarlist_cuda
10414
+
10415
+ - func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
10416
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10417
+ variants: function
10418
+ dispatch:
10419
+ CPU: foreach_tensor_addcmul_tensor_slow
10420
+ CUDA: foreach_tensor_addcmul_tensor_cuda
10421
+
10422
+ - func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
10423
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10424
+ variants: function
10425
+ dispatch:
10426
+ CPU: foreach_tensor_addcmul_scalar_slow_
10427
+ CUDA: foreach_tensor_addcmul_scalar_cuda_
10428
+ autogen: _foreach_addcmul.Scalar_out
10429
+
10430
+ - func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
10431
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10432
+ variants: function
10433
+ dispatch:
10434
+ CPU: foreach_tensor_addcmul_scalarlist_slow_
10435
+ CUDA: foreach_tensor_addcmul_scalarlist_cuda_
10436
+ autogen: _foreach_addcmul.ScalarList_out
10437
+
10438
+ - func: _foreach_addcmul_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
10439
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10440
+ variants: function
10441
+ dispatch:
10442
+ CPU: foreach_tensor_addcmul_tensor_slow_
10443
+ CUDA: foreach_tensor_addcmul_tensor_cuda_
10444
+ autogen: _foreach_addcmul.Tensor_out
10109
10445
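The _foreach_addcdiv / _foreach_addcmul overloads relocated above are the fused building blocks behind the multi-tensor optimizer paths. A hedged sketch of the Scalar variants through the private torch._foreach_* bindings:

import torch

params  = [torch.randn(3), torch.randn(5)]
exp_avg = [torch.randn(3), torch.randn(5)]
denom   = [torch.ones(3), torch.ones(5)]

# params[i] += value * exp_avg[i] / denom[i], applied across the whole list
torch._foreach_addcdiv_(params, exp_avg, denom, -0.01)

# the out-of-place form returns a new list of tensors
updated = torch._foreach_addcmul(params, exp_avg, denom, 0.5)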
 
10110
10446
  - func: _foreach_abs(Tensor[] self) -> Tensor[]
10111
10447
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -10242,6 +10578,21 @@
10242
10578
  CUDA: foreach_tensor_erfc_cuda_
10243
10579
  autogen: _foreach_erfc.out
10244
10580
 
10581
+ - func: _foreach_exp(Tensor[] self) -> Tensor[]
10582
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10583
+ variants: function
10584
+ dispatch:
10585
+ CPU: foreach_tensor_exp_slow
10586
+ CUDA: foreach_tensor_exp_cuda
10587
+
10588
+ - func: _foreach_exp_(Tensor(a!)[] self) -> ()
10589
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10590
+ variants: function
10591
+ dispatch:
10592
+ CPU: foreach_tensor_exp_slow_
10593
+ CUDA: foreach_tensor_exp_cuda_
10594
+ autogen: _foreach_exp.out
10595
+
10245
10596
  - func: _foreach_expm1(Tensor[] self) -> Tensor[]
10246
10597
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10247
10598
  variants: function
@@ -10272,6 +10623,68 @@
10272
10623
  CUDA: foreach_tensor_floor_cuda_
10273
10624
  autogen: _foreach_floor.out
10274
10625
 
10626
+ - func: _foreach_frac(Tensor[] self) -> Tensor[]
10627
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10628
+ variants: function
10629
+ dispatch:
10630
+ CPU: foreach_tensor_frac_slow
10631
+ CUDA: foreach_tensor_frac_cuda
10632
+
10633
+ - func: _foreach_frac_(Tensor(a!)[] self) -> ()
10634
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10635
+ variants: function
10636
+ dispatch:
10637
+ CPU: foreach_tensor_frac_slow_
10638
+ CUDA: foreach_tensor_frac_cuda_
10639
+ autogen: _foreach_frac.out
10640
+
10641
+ - func: _foreach_lerp.List(Tensor[] self, Tensor[] tensors1, Tensor[] weights) -> Tensor[]
10642
+ device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10643
+ variants: function
10644
+ dispatch:
10645
+ CPU: foreach_tensor_ternary_lerp_slow
10646
+ CUDA: foreach_tensor_lerp_ternary_cuda
10647
+ autogen: _foreach_lerp.List_out
10648
+
10649
+ - func: _foreach_lerp_.List(Tensor(a!)[] self, Tensor[] tensors1, Tensor[] weights) -> ()
10650
+ device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10651
+ variants: function
10652
+ dispatch:
10653
+ CPU: foreach_tensor_ternary_lerp_slow_
10654
+ CUDA: foreach_tensor_lerp_ternary_cuda_
10655
+ autogen: _foreach_lerp.List_out
10656
+
10657
+ - func: _foreach_lerp.Scalar(Tensor[] self, Tensor[] tensors1, Scalar weight) -> Tensor[]
10658
+ device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10659
+ variants: function
10660
+ dispatch:
10661
+ CPU: foreach_tensor_lerp_list_kernel_slow
10662
+ CUDA: foreach_tensor_lerp_list_cuda
10663
+ autogen: _foreach_lerp.Scalar_out
10664
+
10665
+ - func: _foreach_lerp_.Scalar(Tensor(a!)[] self, Tensor[] tensors1, Scalar weight) -> ()
10666
+ device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10667
+ variants: function
10668
+ dispatch:
10669
+ CPU: foreach_tensor_lerp_list_kernel_slow_
10670
+ CUDA: foreach_tensor_lerp_list_cuda_
10671
+ autogen: _foreach_lerp.Scalar_out
10672
+
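The new _foreach_lerp entries batch interpolation over tensor lists, which is what exponential-moving-average updates use. A rough sketch via the private bindings:

import torch

ema   = [torch.zeros(4), torch.zeros(2, 2)]
model = [torch.randn(4), torch.randn(2, 2)]

# ema[i] <- ema[i] + weight * (model[i] - ema[i])
torch._foreach_lerp_(ema, model, 0.1)                      # Scalar weight overload

weights = [torch.full((4,), 0.1), torch.full((2, 2), 0.1)]
torch._foreach_lerp_(ema, model, weights)                  # List (ternary) overload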
10673
+ - func: _foreach_lgamma(Tensor[] self) -> Tensor[]
10674
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10675
+ variants: function
10676
+ dispatch:
10677
+ CPU: foreach_tensor_lgamma_slow
10678
+ CUDA: foreach_tensor_lgamma_cuda
10679
+
10680
+ - func: _foreach_lgamma_(Tensor(a!)[] self) -> ()
10681
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10682
+ variants: function
10683
+ dispatch:
10684
+ CPU: foreach_tensor_lgamma_slow_
10685
+ CUDA: foreach_tensor_lgamma_cuda_
10686
+ autogen: _foreach_lgamma.out
10687
+
10275
10688
  - func: _foreach_log(Tensor[] self) -> Tensor[]
10276
10689
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10277
10690
  variants: function
@@ -10347,110 +10760,65 @@
10347
10760
  CUDA: foreach_tensor_neg_cuda_
10348
10761
  autogen: _foreach_neg.out
10349
10762
 
10350
- - func: _foreach_tan(Tensor[] self) -> Tensor[]
10351
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10352
- variants: function
10353
- dispatch:
10354
- CPU: foreach_tensor_tan_slow
10355
- CUDA: foreach_tensor_tan_cuda
10356
-
10357
- - func: _foreach_tan_(Tensor(a!)[] self) -> ()
10358
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10359
- variants: function
10360
- dispatch:
10361
- CPU: foreach_tensor_tan_slow_
10362
- CUDA: foreach_tensor_tan_cuda_
10363
- autogen: _foreach_tan.out
10364
-
10365
- - func: _foreach_tanh(Tensor[] self) -> Tensor[]
10366
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10367
- variants: function
10368
- dispatch:
10369
- CPU: foreach_tensor_tanh_slow
10370
- CUDA: foreach_tensor_tanh_cuda
10371
-
10372
- - func: _foreach_tanh_(Tensor(a!)[] self) -> ()
10373
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10374
- variants: function
10375
- dispatch:
10376
- CPU: foreach_tensor_tanh_slow_
10377
- CUDA: foreach_tensor_tanh_cuda_
10378
- autogen: _foreach_tanh.out
10379
-
10380
- - func: _foreach_sin(Tensor[] self) -> Tensor[]
10381
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10382
- variants: function
10383
- dispatch:
10384
- CPU: foreach_tensor_sin_slow
10385
- CUDA: foreach_tensor_sin_cuda
10386
-
10387
- - func: _foreach_sin_(Tensor(a!)[] self) -> ()
10388
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10389
- variants: function
10390
- dispatch:
10391
- CPU: foreach_tensor_sin_slow_
10392
- CUDA: foreach_tensor_sin_cuda_
10393
- autogen: _foreach_sin.out
10394
-
10395
- - func: _foreach_sinh(Tensor[] self) -> Tensor[]
10763
+ - func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[]
10396
10764
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10397
10765
  variants: function
10398
10766
  dispatch:
10399
- CPU: foreach_tensor_sinh_slow
10400
- CUDA: foreach_tensor_sinh_cuda
10767
+ CPU: foreach_tensor_norm_slow
10768
+ CUDA: foreach_tensor_norm_cuda
10769
+ autogen: _foreach_norm.Scalar_out
10401
10770
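_foreach_norm.Scalar, relocated here, is the per-tensor norm used on the fast path of gradient clipping. A hedged, hand-rolled equivalent:

import torch

grads = [torch.randn(10), torch.randn(3, 3)]

per_tensor = torch._foreach_norm(grads, 2)                     # one fused call, one norm per tensor
total = torch.linalg.vector_norm(torch.stack(per_tensor), 2)   # combine into a global norm

max_norm = 1.0
if total > max_norm:
    torch._foreach_mul_(grads, float(max_norm / (total + 1e-6)))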
 
10402
- - func: _foreach_sinh_(Tensor(a!)[] self) -> ()
10771
+ - func: _foreach_pow.List(Tensor[] self, Tensor[] exponent) -> Tensor[]
10403
10772
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10404
10773
  variants: function
10405
10774
  dispatch:
10406
- CPU: foreach_tensor_sinh_slow_
10407
- CUDA: foreach_tensor_sinh_cuda_
10408
- autogen: _foreach_sinh.out
10775
+ CPU: foreach_tensor_pow_list_kernel_slow
10776
+ CUDA: foreach_tensor_pow_list_kernel_cuda
10409
10777
 
10410
- - func: _foreach_round(Tensor[] self) -> Tensor[]
10778
+ - func: _foreach_pow.Scalar(Tensor[] self, Scalar exponent) -> Tensor[]
10411
10779
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10412
10780
  variants: function
10413
10781
  dispatch:
10414
- CPU: foreach_tensor_round_slow
10415
- CUDA: foreach_tensor_round_cuda
10782
+ CPU: foreach_tensor_pow_scalar_kernel_slow
10783
+ CUDA: foreach_tensor_pow_scalar_kernel_cuda
10416
10784
 
10417
- - func: _foreach_round_(Tensor(a!)[] self) -> ()
10785
+ - func: _foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[]
10418
10786
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10419
10787
  variants: function
10420
10788
  dispatch:
10421
- CPU: foreach_tensor_round_slow_
10422
- CUDA: foreach_tensor_round_cuda_
10423
- autogen: _foreach_round.out
10789
+ CPU: foreach_tensor_pow_scalarlist_kernel_slow
10790
+ CUDA: foreach_tensor_pow_scalarlist_kernel_cuda
10424
10791
 
10425
- - func: _foreach_lgamma(Tensor[] self) -> Tensor[]
10792
+ - func: _foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[]
10426
10793
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10427
10794
  variants: function
10428
10795
  dispatch:
10429
- CPU: foreach_tensor_lgamma_slow
10430
- CUDA: foreach_tensor_lgamma_cuda
10796
+ CPU: foreach_scalar_pow_list_kernel_slow
10797
+ CUDA: foreach_scalar_pow_list_kernel_cuda
10431
10798
 
10432
- - func: _foreach_lgamma_(Tensor(a!)[] self) -> ()
10433
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10799
+ - func: _foreach_pow_.List(Tensor(a!)[] self, Tensor[] exponent) -> ()
10800
+ device_check: NoCheck
10434
10801
  variants: function
10435
10802
  dispatch:
10436
- CPU: foreach_tensor_lgamma_slow_
10437
- CUDA: foreach_tensor_lgamma_cuda_
10438
- autogen: _foreach_lgamma.out
10803
+ CPU: foreach_tensor_pow_list_kernel_slow_
10804
+ CUDA: foreach_tensor_pow_list_kernel_cuda_
10805
+ autogen: _foreach_pow.List_out
10439
10806
 
10440
- - func: _foreach_frac(Tensor[] self) -> Tensor[]
10441
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10807
+ - func: _foreach_pow_.Scalar(Tensor(a!)[] self, Scalar exponent) -> ()
10808
+ device_check: NoCheck
10442
10809
  variants: function
10443
10810
  dispatch:
10444
- CPU: foreach_tensor_frac_slow
10445
- CUDA: foreach_tensor_frac_cuda
10811
+ CPU: foreach_tensor_pow_scalar_kernel_slow_
10812
+ CUDA: foreach_tensor_pow_scalar_kernel_cuda_
10813
+ autogen: _foreach_pow.Scalar_out
10446
10814
 
10447
- - func: _foreach_frac_(Tensor(a!)[] self) -> ()
10448
- device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10815
+ - func: _foreach_pow_.ScalarList(Tensor(a!)[] self, Scalar[] exponent) -> ()
10816
+ device_check: NoCheck
10449
10817
  variants: function
10450
10818
  dispatch:
10451
- CPU: foreach_tensor_frac_slow_
10452
- CUDA: foreach_tensor_frac_cuda_
10453
- autogen: _foreach_frac.out
10819
+ CPU: foreach_tensor_pow_scalarlist_kernel_slow_
10820
+ CUDA: foreach_tensor_pow_scalarlist_kernel_cuda_
10821
+ autogen: _foreach_pow.ScalarList_out
10454
10822
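The _foreach_pow family added above batches exponentiation over tensor lists; a sketch, assuming the private bindings generated from these schemas:

import torch

xs = [torch.rand(3), torch.rand(5)]

squares = torch._foreach_pow(xs, 2.0)            # Scalar exponent
mixed   = torch._foreach_pow(xs, [2.0, 0.5])     # ScalarList exponent
torch._foreach_pow_(xs, 3.0)                     # in-place Scalar overload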
 
10455
10823
  - func: _foreach_reciprocal(Tensor[] self) -> Tensor[]
10456
10824
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -10467,6 +10835,21 @@
10467
10835
  CUDA: foreach_tensor_reciprocal_cuda_
10468
10836
  autogen: _foreach_reciprocal.out
10469
10837
 
10838
+ - func: _foreach_round(Tensor[] self) -> Tensor[]
10839
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10840
+ variants: function
10841
+ dispatch:
10842
+ CPU: foreach_tensor_round_slow
10843
+ CUDA: foreach_tensor_round_cuda
10844
+
10845
+ - func: _foreach_round_(Tensor(a!)[] self) -> ()
10846
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10847
+ variants: function
10848
+ dispatch:
10849
+ CPU: foreach_tensor_round_slow_
10850
+ CUDA: foreach_tensor_round_cuda_
10851
+ autogen: _foreach_round.out
10852
+
10470
10853
  - func: _foreach_sigmoid(Tensor[] self) -> Tensor[]
10471
10854
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10472
10855
  variants: function
@@ -10482,150 +10865,126 @@
10482
10865
  CUDA: foreach_tensor_sigmoid_cuda_
10483
10866
  autogen: _foreach_sigmoid.out
10484
10867
 
10485
- - func: _foreach_trunc(Tensor[] self) -> Tensor[]
10868
+ - func: _foreach_sign(Tensor[] self) -> Tensor[]
10486
10869
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10487
10870
  variants: function
10488
10871
  dispatch:
10489
- CPU: foreach_tensor_trunc_slow
10490
- CUDA: foreach_tensor_trunc_cuda
10872
+ CPU: foreach_tensor_sign_slow
10873
+ CUDA: foreach_tensor_sign_cuda
10491
10874
 
10492
- - func: _foreach_trunc_(Tensor(a!)[] self) -> ()
10875
+ - func: _foreach_sign_(Tensor(a!)[] self) -> ()
10493
10876
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10494
10877
  variants: function
10495
10878
  dispatch:
10496
- CPU: foreach_tensor_trunc_slow_
10497
- CUDA: foreach_tensor_trunc_cuda_
10498
- autogen: _foreach_trunc.out
10879
+ CPU: foreach_tensor_sign_slow_
10880
+ CUDA: foreach_tensor_sign_cuda_
10881
+ autogen: _foreach_sign.out
10499
10882
 
10500
- - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
10883
+ - func: _foreach_sin(Tensor[] self) -> Tensor[]
10501
10884
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10502
10885
  variants: function
10503
10886
  dispatch:
10504
- CPU: foreach_tensor_addcdiv_scalar_slow_
10505
- CUDA: foreach_tensor_addcdiv_scalar_cuda_
10506
- autogen: _foreach_addcdiv.Scalar_out
10887
+ CPU: foreach_tensor_sin_slow
10888
+ CUDA: foreach_tensor_sin_cuda
10507
10889
 
10508
- - func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
10890
+ - func: _foreach_sin_(Tensor(a!)[] self) -> ()
10509
10891
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10510
10892
  variants: function
10511
10893
  dispatch:
10512
- CPU: foreach_tensor_addcmul_scalar_slow_
10513
- CUDA: foreach_tensor_addcmul_scalar_cuda_
10514
- autogen: _foreach_addcmul.Scalar_out
10894
+ CPU: foreach_tensor_sin_slow_
10895
+ CUDA: foreach_tensor_sin_cuda_
10896
+ autogen: _foreach_sin.out
10515
10897
 
10516
- - func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
10898
+ - func: _foreach_sinh(Tensor[] self) -> Tensor[]
10517
10899
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10518
10900
  variants: function
10519
10901
  dispatch:
10520
- CPU: foreach_tensor_addcdiv_scalarlist_slow_
10521
- CUDA: foreach_tensor_addcdiv_scalarlist_cuda_
10522
- autogen: _foreach_addcdiv.ScalarList_out
10902
+ CPU: foreach_tensor_sinh_slow
10903
+ CUDA: foreach_tensor_sinh_cuda
10523
10904
 
10524
- - func: _foreach_addcdiv_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
10905
+ - func: _foreach_sinh_(Tensor(a!)[] self) -> ()
10525
10906
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10526
10907
  variants: function
10527
10908
  dispatch:
10528
- CPU: foreach_tensor_addcdiv_tensor_slow_
10529
- CUDA: foreach_tensor_addcdiv_tensor_cuda_
10530
- autogen: _foreach_addcdiv.Tensor_out
10909
+ CPU: foreach_tensor_sinh_slow_
10910
+ CUDA: foreach_tensor_sinh_cuda_
10911
+ autogen: _foreach_sinh.out
10531
10912
 
10532
- - func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
10913
+ - func: _foreach_sqrt(Tensor[] self) -> Tensor[]
10533
10914
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10534
10915
  variants: function
10535
10916
  dispatch:
10536
- CPU: foreach_tensor_addcmul_scalarlist_slow_
10537
- CUDA: foreach_tensor_addcmul_scalarlist_cuda_
10538
- autogen: _foreach_addcmul.ScalarList_out
10917
+ CPU: foreach_tensor_sqrt_slow
10918
+ CUDA: foreach_tensor_sqrt_cuda
10539
10919
 
10540
- - func: _foreach_addcmul_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
10920
+ - func: _foreach_sqrt_(Tensor(a!)[] self) -> ()
10541
10921
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10542
10922
  variants: function
10543
10923
  dispatch:
10544
- CPU: foreach_tensor_addcmul_tensor_slow_
10545
- CUDA: foreach_tensor_addcmul_tensor_cuda_
10546
- autogen: _foreach_addcmul.Tensor_out
10924
+ CPU: foreach_tensor_sqrt_slow_
10925
+ CUDA: foreach_tensor_sqrt_cuda_
10926
+ autogen: _foreach_sqrt.out
10547
10927
 
10548
- - func: _foreach_addcdiv.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
10928
+ - func: _foreach_tan(Tensor[] self) -> Tensor[]
10549
10929
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10550
10930
  variants: function
10551
10931
  dispatch:
10552
- CPU: foreach_tensor_addcdiv_scalar_slow
10553
- CUDA: foreach_tensor_addcdiv_scalar_cuda
10932
+ CPU: foreach_tensor_tan_slow
10933
+ CUDA: foreach_tensor_tan_cuda
10554
10934
 
10555
- - func: _foreach_addcmul.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
10935
+ - func: _foreach_tan_(Tensor(a!)[] self) -> ()
10556
10936
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10557
10937
  variants: function
10558
10938
  dispatch:
10559
- CPU: foreach_tensor_addcmul_scalar_slow
10560
- CUDA: foreach_tensor_addcmul_scalar_cuda
10939
+ CPU: foreach_tensor_tan_slow_
10940
+ CUDA: foreach_tensor_tan_cuda_
10941
+ autogen: _foreach_tan.out
10561
10942
 
10562
- - func: _foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
10943
+ - func: _foreach_tanh(Tensor[] self) -> Tensor[]
10563
10944
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10564
10945
  variants: function
10565
10946
  dispatch:
10566
- CPU: foreach_tensor_addcdiv_scalarlist_slow
10567
- CUDA: foreach_tensor_addcdiv_scalarlist_cuda
10947
+ CPU: foreach_tensor_tanh_slow
10948
+ CUDA: foreach_tensor_tanh_cuda
10568
10949
 
10569
- - func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
10950
+ - func: _foreach_tanh_(Tensor(a!)[] self) -> ()
10570
10951
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10571
10952
  variants: function
10572
10953
  dispatch:
10573
- CPU: foreach_tensor_addcdiv_tensor_slow
10574
- CUDA: foreach_tensor_addcdiv_tensor_cuda
10954
+ CPU: foreach_tensor_tanh_slow_
10955
+ CUDA: foreach_tensor_tanh_cuda_
10956
+ autogen: _foreach_tanh.out
10575
10957
 
10576
- - func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
10958
+ - func: _foreach_trunc(Tensor[] self) -> Tensor[]
10577
10959
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10578
10960
  variants: function
10579
10961
  dispatch:
10580
- CPU: foreach_tensor_addcmul_scalarlist_slow
10581
- CUDA: foreach_tensor_addcmul_scalarlist_cuda
10962
+ CPU: foreach_tensor_trunc_slow
10963
+ CUDA: foreach_tensor_trunc_cuda
10582
10964
 
10583
- - func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
10965
+ - func: _foreach_trunc_(Tensor(a!)[] self) -> ()
10584
10966
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10585
10967
  variants: function
10586
10968
  dispatch:
10587
- CPU: foreach_tensor_addcmul_tensor_slow
10588
- CUDA: foreach_tensor_addcmul_tensor_cuda
10969
+ CPU: foreach_tensor_trunc_slow_
10970
+ CUDA: foreach_tensor_trunc_cuda_
10971
+ autogen: _foreach_trunc.out
10589
10972
 
10590
- - func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[]
10973
+ - func: _foreach_zero_(Tensor(a!)[] self) -> ()
10591
10974
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10592
10975
  variants: function
10593
10976
  dispatch:
10594
- CPU: foreach_tensor_norm_slow
10595
- CUDA: foreach_tensor_norm_cuda
10596
- autogen: _foreach_norm.Scalar_out
10597
-
10598
- - func: _foreach_lerp.List(Tensor[] self, Tensor[] tensors1, Tensor[] weights) -> Tensor[]
10599
- device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10600
- variants: function
10601
- dispatch:
10602
- CPU: foreach_tensor_ternary_lerp_slow
10603
- CUDA: foreach_tensor_lerp_ternary_cuda
10604
- autogen: _foreach_lerp.List_out
10605
-
10606
- - func: _foreach_lerp_.List(Tensor(a!)[] self, Tensor[] tensors1, Tensor[] weights) -> ()
10607
- device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10608
- variants: function
10609
- dispatch:
10610
- CPU: foreach_tensor_ternary_lerp_slow_
10611
- CUDA: foreach_tensor_lerp_ternary_cuda_
10612
- autogen: _foreach_lerp.List_out
10613
-
10614
- - func: _foreach_lerp.Scalar(Tensor[] self, Tensor[] tensors1, Scalar weight) -> Tensor[]
10615
- device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10616
- variants: function
10617
- dispatch:
10618
- CPU: foreach_tensor_lerp_list_kernel_slow
10619
- CUDA: foreach_tensor_lerp_list_cuda
10620
- autogen: _foreach_lerp.Scalar_out
10977
+ CPU: foreach_tensor_zero_slow_
10978
+ CUDA: foreach_tensor_zero_cuda_
10979
+ autogen: _foreach_zero, _foreach_zero.out
10621
10980
 
10622
- - func: _foreach_lerp_.Scalar(Tensor(a!)[] self, Tensor[] tensors1, Scalar weight) -> ()
10623
- device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
10981
+ - func: _foreach_copy_(Tensor(a!)[] self, Tensor[] src, bool non_blocking=False) -> ()
10982
+ device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
10624
10983
  variants: function
10625
10984
  dispatch:
10626
- CPU: foreach_tensor_lerp_list_kernel_slow_
10627
- CUDA: foreach_tensor_lerp_list_cuda_
10628
- autogen: _foreach_lerp.Scalar_out
10985
+ CPU: foreach_tensor_copy_list_kernel_slow_
10986
+ CUDA: foreach_tensor_copy_list_kernel_cuda_
10987
+ autogen: _foreach_copy, _foreach_copy.out
10629
10988
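_foreach_copy_ batches many Tensor.copy_ calls into a single op, e.g. for moving an entire parameter list at once. Sketch:

import torch

src = [torch.randn(4), torch.randn(2, 3)]
dst = [torch.empty(4), torch.empty(2, 3)]

# copies src[i] into dst[i] for every i; non_blocking mirrors Tensor.copy_
torch._foreach_copy_(dst, src, non_blocking=False)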
 
10630
10989
  - func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
10631
10990
  dispatch:
@@ -10657,7 +11016,11 @@
10657
11016
  dispatch:
10658
11017
  CPU: searchsorted_cpu
10659
11018
  CUDA: searchsorted_cuda
10660
- autogen: searchsorted.Scalar_out
11019
+
11020
+ - func: searchsorted.Scalar_out(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!)
11021
+ dispatch:
11022
+ CPU: searchsorted_out_cpu
11023
+ CUDA: searchsorted_out_cuda
10661
11024
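The restored searchsorted.Scalar_out entry gives the scalar form its own out= kernel instead of an autogenerated one. Roughly, from Python (the out= call with a scalar input is an assumption based on this schema):

import torch

boundaries = torch.tensor([0.1, 0.5, 0.9])

idx = torch.searchsorted(boundaries, 0.7)        # scalar "self" returns a 0-dim index tensor

out = torch.empty((), dtype=torch.int64)
torch.searchsorted(boundaries, 0.7, out=out)     # writes into a preallocated result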
 
10662
11025
  - func: _convert_indices_from_coo_to_csr(Tensor self, int size, *, bool out_int32=False) -> Tensor
10663
11026
  structured_delegate: _convert_indices_from_coo_to_csr.out
@@ -10981,6 +11344,7 @@
10981
11344
  python_module: nn
10982
11345
  dispatch:
10983
11346
  CPU, CUDA: hardsigmoid_out
11347
+ MPS: hardsigmoid_out_mps
10984
11348
  QuantizedCPU: hardsigmoid_out_quantized_cpu
10985
11349
 
10986
11350
  - func: hardsigmoid(Tensor self) -> Tensor
@@ -11001,6 +11365,7 @@
11001
11365
  python_module: nn
11002
11366
  dispatch:
11003
11367
  CPU, CUDA: hardsigmoid_backward_out
11368
+ MPS: hardsigmoid_backward_out_mps
11004
11369
 
11005
11370
  - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor
11006
11371
  structured_delegate: hardsigmoid_backward.grad_input
@@ -11119,6 +11484,7 @@
11119
11484
  dispatch:
11120
11485
  CPU: log_sigmoid_forward_out_cpu
11121
11486
  CUDA: log_sigmoid_forward_out_cuda
11487
+ MPS: log_sigmoid_forward_out_mps
11122
11488
 
11123
11489
  - func: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer)
11124
11490
  device_check: NoCheck # TensorIterator
@@ -11126,18 +11492,21 @@
11126
11492
  dispatch:
11127
11493
  CPU: log_sigmoid_forward_cpu
11128
11494
  CUDA: log_sigmoid_forward_cuda
11495
+ MPS: log_sigmoid_forward_mps
11129
11496
 
11130
11497
  - func: log_sigmoid_backward.grad_input(Tensor grad_output, Tensor self, Tensor buffer, *, Tensor(a!) grad_input) -> Tensor(a!)
11131
11498
  python_module: nn
11132
11499
  dispatch:
11133
11500
  CPU: log_sigmoid_backward_cpu_out
11134
11501
  CUDA: log_sigmoid_backward_cuda_out
11502
+ MPS: log_sigmoid_backward_mps_out
11135
11503
 
11136
11504
  - func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor
11137
11505
  python_module: nn
11138
11506
  dispatch:
11139
11507
  CPU: log_sigmoid_backward_cpu
11140
11508
  CUDA: log_sigmoid_backward_cuda
11509
+ MPS: log_sigmoid_backward_mps
11141
11510
 
11142
11511
  - func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
11143
11512
  python_module: nn
@@ -11279,6 +11648,7 @@
11279
11648
  CUDA: adaptive_avg_pool3d_cuda
11280
11649
  QuantizedCPU: adaptive_avg_pool3d_quantized_cpu
11281
11650
  autogen: _adaptive_avg_pool3d.out
11651
+ tags: core
11282
11652
 
11283
11653
  - func: adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
11284
11654
  python_module: nn
@@ -11394,6 +11764,7 @@
11394
11764
  dispatch:
11395
11765
  MkldnnCPU: mkldnn_avg_pool3d
11396
11766
  QuantizedCPU: avg_pool3d_quantized_cpu
11767
+ tags: core
11397
11768
 
11398
11769
  - func: avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!)
11399
11770
  python_module: nn
@@ -11517,25 +11888,25 @@
11517
11888
  CPU: max_pool3d_with_indices_backward_cpu
11518
11889
  CUDA: max_pool3d_with_indices_backward_cuda
11519
11890
 
11520
- - func: max_unpool2d.out(Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
11891
+ - func: max_unpool2d.out(Tensor self, Tensor indices, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
11521
11892
  python_module: nn
11522
11893
  dispatch:
11523
11894
  CPU: max_unpooling2d_forward_out_cpu
11524
11895
  CUDA: max_unpooling2d_forward_out_cuda
11525
11896
 
11526
- - func: max_unpool2d(Tensor self, Tensor indices, int[2] output_size) -> Tensor
11897
+ - func: max_unpool2d(Tensor self, Tensor indices, SymInt[2] output_size) -> Tensor
11527
11898
  python_module: nn
11528
11899
  dispatch:
11529
11900
  CPU: max_unpooling2d_forward_cpu
11530
11901
  CUDA: max_unpooling2d_forward_cuda
11531
11902
 
11532
- - func: max_unpool3d.out(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!)
11903
+ - func: max_unpool3d.out(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!)
11533
11904
  python_module: nn
11534
11905
  dispatch:
11535
11906
  CPU: max_unpooling3d_forward_out_cpu
11536
11907
  CUDA: max_unpooling3d_forward_out_cuda
11537
11908
 
11538
- - func: max_unpool3d(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor
11909
+ - func: max_unpool3d(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding) -> Tensor
11539
11910
  python_module: nn
11540
11911
  dispatch:
11541
11912
  CPU: max_unpooling3d_forward_cpu
@@ -11553,6 +11924,7 @@
11553
11924
  - func: reflection_pad1d(Tensor self, SymInt[2] padding) -> Tensor
11554
11925
  python_module: nn
11555
11926
  structured_delegate: reflection_pad1d.out
11927
+ tags: core
11556
11928
 
11557
11929
  - func: reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
11558
11930
  python_module: nn
@@ -11607,6 +11979,7 @@
11607
11979
  - func: reflection_pad3d(Tensor self, SymInt[6] padding) -> Tensor
11608
11980
  python_module: nn
11609
11981
  structured_delegate: reflection_pad3d.out
11982
+ tags: core
11610
11983
 
11611
11984
  - func: reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
11612
11985
  python_module: nn
@@ -12069,6 +12442,7 @@
12069
12442
  structured_inherits: TensorIteratorBase
12070
12443
  dispatch:
12071
12444
  CPU, CUDA: logit_backward_out
12445
+ MPS: logit_backward_out_mps
12072
12446
  tags: pointwise
12073
12447
 
12074
12448
  - func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor
@@ -12715,157 +13089,229 @@
12715
13089
 
12716
13090
  # torch.fft.fft
12717
13091
  # NOTE: NOT an alias for torch.fft, which has different semantics
12718
- - func: fft_fft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
13092
+ - func: fft_fft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
12719
13093
  python_module: fft
12720
13094
  variants: function
13095
+ dispatch:
13096
+ CompositeImplicitAutograd: fft_fft_symint
12721
13097
 
12722
- - func: fft_fft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13098
+ - func: fft_fft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12723
13099
  python_module: fft
12724
13100
  variants: function
13101
+ dispatch:
13102
+ CompositeImplicitAutograd: fft_fft_symint_out
12725
13103
 
12726
- - func: fft_ifft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
13104
+ - func: fft_ifft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
12727
13105
  python_module: fft
12728
13106
  variants: function
13107
+ dispatch:
13108
+ CompositeImplicitAutograd: fft_ifft_symint
12729
13109
 
12730
- - func: fft_ifft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13110
+ - func: fft_ifft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12731
13111
  python_module: fft
12732
13112
  variants: function
13113
+ dispatch:
13114
+ CompositeImplicitAutograd: fft_ifft_symint_out
12733
13115
 
12734
- - func: fft_rfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
13116
+ - func: fft_rfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
12735
13117
  python_module: fft
12736
13118
  variants: function
13119
+ dispatch:
13120
+ CompositeImplicitAutograd: fft_rfft_symint
12737
13121
 
12738
- - func: fft_rfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13122
+ - func: fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12739
13123
  python_module: fft
12740
13124
  variants: function
13125
+ dispatch:
13126
+ CompositeImplicitAutograd: fft_rfft_symint_out
12741
13127
 
12742
- - func: fft_irfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
13128
+ - func: fft_irfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
12743
13129
  python_module: fft
12744
13130
  variants: function
13131
+ dispatch:
13132
+ CompositeImplicitAutograd: fft_irfft_symint
12745
13133
 
12746
- - func: fft_irfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13134
+ - func: fft_irfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12747
13135
  python_module: fft
12748
13136
  variants: function
13137
+ dispatch:
13138
+ CompositeImplicitAutograd: fft_irfft_symint_out
12749
13139
 
12750
- - func: fft_hfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
13140
+ - func: fft_hfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
12751
13141
  python_module: fft
12752
13142
  variants: function
13143
+ dispatch:
13144
+ CompositeImplicitAutograd: fft_hfft_symint
12753
13145
 
12754
- - func: fft_hfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13146
+ - func: fft_hfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12755
13147
  python_module: fft
12756
13148
  variants: function
13149
+ dispatch:
13150
+ CompositeImplicitAutograd: fft_hfft_symint_out
12757
13151
 
12758
- - func: fft_ihfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
13152
+ - func: fft_ihfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
12759
13153
  python_module: fft
12760
13154
  variants: function
13155
+ dispatch:
13156
+ CompositeImplicitAutograd: fft_ihfft_symint
12761
13157
 
12762
- - func: fft_ihfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13158
+ - func: fft_ihfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12763
13159
  python_module: fft
12764
13160
  variants: function
13161
+ dispatch:
13162
+ CompositeImplicitAutograd: fft_ihfft_symint_out
12765
13163
 
12766
- - func: fft_fft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
13164
+ - func: fft_fft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
12767
13165
  python_module: fft
12768
13166
  variants: function
13167
+ dispatch:
13168
+ CompositeImplicitAutograd: fft_fft2_symint
12769
13169
 
12770
- - func: fft_fft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13170
+ - func: fft_fft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12771
13171
  python_module: fft
12772
13172
  variants: function
13173
+ dispatch:
13174
+ CompositeImplicitAutograd: fft_fft2_symint_out
12773
13175
 
12774
- - func: fft_ifft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
13176
+ - func: fft_ifft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
12775
13177
  python_module: fft
12776
13178
  variants: function
13179
+ dispatch:
13180
+ CompositeImplicitAutograd: fft_ifft2_symint
12777
13181
 
12778
- - func: fft_ifft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13182
+ - func: fft_ifft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12779
13183
  python_module: fft
12780
13184
  variants: function
13185
+ dispatch:
13186
+ CompositeImplicitAutograd: fft_ifft2_symint_out
12781
13187
 
12782
- - func: fft_rfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
13188
+ - func: fft_rfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
12783
13189
  python_module: fft
12784
13190
  variants: function
13191
+ dispatch:
13192
+ CompositeImplicitAutograd: fft_rfft2_symint
12785
13193
 
12786
- - func: fft_rfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13194
+ - func: fft_rfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12787
13195
  python_module: fft
12788
13196
  variants: function
13197
+ dispatch:
13198
+ CompositeImplicitAutograd: fft_rfft2_symint_out
12789
13199
 
12790
- - func: fft_irfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
13200
+ - func: fft_irfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
12791
13201
  python_module: fft
12792
13202
  variants: function
13203
+ dispatch:
13204
+ CompositeImplicitAutograd: fft_irfft2_symint
12793
13205
 
12794
- - func: fft_irfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13206
+ - func: fft_irfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12795
13207
  python_module: fft
12796
13208
  variants: function
13209
+ dispatch:
13210
+ CompositeImplicitAutograd: fft_irfft2_symint_out
12797
13211
 
12798
- - func: fft_hfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
13212
+ - func: fft_hfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
12799
13213
  use_const_ref_for_mutable_tensors: True
12800
13214
  python_module: fft
12801
13215
  variants: function
13216
+ dispatch:
13217
+ CompositeImplicitAutograd: fft_hfft2_symint
12802
13218
 
12803
- - func: fft_hfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13219
+ - func: fft_hfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12804
13220
  use_const_ref_for_mutable_tensors: True
12805
13221
  python_module: fft
12806
13222
  variants: function
13223
+ dispatch:
13224
+ CompositeImplicitAutograd: fft_hfft2_symint_out
12807
13225
 
12808
- - func: fft_ihfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
13226
+ - func: fft_ihfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
12809
13227
  use_const_ref_for_mutable_tensors: True
12810
13228
  python_module: fft
12811
13229
  variants: function
13230
+ dispatch:
13231
+ CompositeImplicitAutograd: fft_ihfft2_symint
12812
13232
 
12813
- - func: fft_ihfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13233
+ - func: fft_ihfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12814
13234
  use_const_ref_for_mutable_tensors: True
12815
13235
  python_module: fft
12816
13236
  variants: function
13237
+ dispatch:
13238
+ CompositeImplicitAutograd: fft_ihfft2_symint_out
12817
13239
 
12818
- - func: fft_fftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
13240
+ - func: fft_fftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
12819
13241
  python_module: fft
12820
13242
  variants: function
13243
+ dispatch:
13244
+ CompositeImplicitAutograd: fft_fftn_symint
12821
13245
 
12822
- - func: fft_fftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13246
+ - func: fft_fftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12823
13247
  python_module: fft
12824
13248
  variants: function
13249
+ dispatch:
13250
+ CompositeImplicitAutograd: fft_fftn_symint_out
12825
13251
 
12826
- - func: fft_ifftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
13252
+ - func: fft_ifftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
12827
13253
  python_module: fft
12828
13254
  variants: function
13255
+ dispatch:
13256
+ CompositeImplicitAutograd: fft_ifftn_symint
12829
13257
 
12830
- - func: fft_ifftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13258
+ - func: fft_ifftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12831
13259
  python_module: fft
12832
13260
  variants: function
13261
+ dispatch:
13262
+ CompositeImplicitAutograd: fft_ifftn_symint_out
12833
13263
 
12834
- - func: fft_rfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
13264
+ - func: fft_rfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
12835
13265
  python_module: fft
12836
13266
  variants: function
13267
+ dispatch:
13268
+ CompositeImplicitAutograd: fft_rfftn_symint
12837
13269
 
12838
- - func: fft_rfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13270
+ - func: fft_rfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12839
13271
  python_module: fft
12840
13272
  variants: function
13273
+ dispatch:
13274
+ CompositeImplicitAutograd: fft_rfftn_symint_out
12841
13275
 
12842
- - func: fft_irfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
13276
+ - func: fft_irfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
12843
13277
  python_module: fft
12844
13278
  variants: function
13279
+ dispatch:
13280
+ CompositeImplicitAutograd: fft_irfftn_symint
12845
13281
 
12846
- - func: fft_irfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13282
+ - func: fft_irfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12847
13283
  python_module: fft
12848
13284
  variants: function
13285
+ dispatch:
13286
+ CompositeImplicitAutograd: fft_irfftn_symint_out
12849
13287
 
12850
- - func: fft_hfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
13288
+ - func: fft_hfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
12851
13289
  use_const_ref_for_mutable_tensors: True
12852
13290
  python_module: fft
12853
13291
  variants: function
13292
+ dispatch:
13293
+ CompositeImplicitAutograd: fft_hfftn_symint
12854
13294
 
12855
- - func: fft_hfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13295
+ - func: fft_hfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12856
13296
  use_const_ref_for_mutable_tensors: True
12857
13297
  python_module: fft
12858
13298
  variants: function
13299
+ dispatch:
13300
+ CompositeImplicitAutograd: fft_hfftn_symint_out
12859
13301
 
12860
- - func: fft_ihfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
13302
+ - func: fft_ihfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
12861
13303
  use_const_ref_for_mutable_tensors: True
12862
13304
  python_module: fft
12863
13305
  variants: function
13306
+ dispatch:
13307
+ CompositeImplicitAutograd: fft_ihfftn_symint
12864
13308
 
12865
- - func: fft_ihfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
13309
+ - func: fft_ihfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
12866
13310
  use_const_ref_for_mutable_tensors: True
12867
13311
  python_module: fft
12868
13312
  variants: function
13313
+ dispatch:
13314
+ CompositeImplicitAutograd: fft_ihfftn_symint_out
12869
13315
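Switching the FFT length arguments from int to SymInt does not change the eager-mode signatures; it only lets n / s stay symbolic under torch.compile with dynamic shapes. Usage is unchanged:

import torch

x = torch.randn(100)
X = torch.fft.rfft(x, n=128)                         # n may now be a symbolic size
Y = torch.fft.fft2(torch.randn(8, 8), s=(16, 16))    # same for the multi-dim s argument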
 
12870
13316
  - func: fft_fftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
12871
13317
  python_module: fft
@@ -13210,6 +13656,7 @@
13210
13656
  structured: True
13211
13657
  dispatch:
13212
13658
  CPU, CUDA: linalg_vector_norm_out
13659
+ MPS: linalg_vector_norm_out_mps
13213
13660
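linalg.vector_norm likewise gains a native MPS kernel; a short check:

import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
v = torch.randn(1024, device=device)
print(torch.linalg.vector_norm(v, ord=2))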
 
13214
13661
  - func: linalg_matrix_norm(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
13215
13662
  python_module: linalg
@@ -13788,6 +14235,7 @@
13788
14235
  dispatch:
13789
14236
  NestedTensorCPU: NestedTensor_softmax_dropout
13790
14237
  NestedTensorCUDA: NestedTensor_softmax_dropout_cuda
14238
+ tags: nondeterministic_seeded
13791
14239
 
13792
14240
  # Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is.
13793
14241
  - func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None) -> Tensor
@@ -13803,67 +14251,71 @@
13803
14251
  CUDA, NestedTensorCUDA: native_multi_head_attention_cuda
13804
14252
  autogen: _native_multi_head_attention.out
13805
14253
 
13806
- - func: scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False) -> Tensor
14254
+ - func: scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> Tensor
13807
14255
  python_module: nn
13808
14256
  variants: function
13809
14257
  autogen: scaled_dot_product_attention.out
13810
-
13811
- # TODO: THIS NEEDS TO BE REMOVED BUT PEOPLE HAVE TRAINED THEIR MODELS WITH THIS OP BUILTIN
13812
- - func: _scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> (Tensor, Tensor)
13813
- python_module: nn
13814
- variants: function
13815
- autogen: _scaled_dot_product_attention.out
14258
+ tags: nondeterministic_seeded
13816
14259
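The updated scaled_dot_product_attention schema adds a keyword-only scale argument (None keeps the default 1/sqrt(head_dim) scaling) and tags the op as seeded-nondeterministic because of dropout. A sketch, assuming the torch.nn.functional binding generated from this schema:

import torch
import torch.nn.functional as F

q = torch.randn(2, 4, 16, 64)   # (batch, heads, seq_len, head_dim)
k = torch.randn(2, 4, 16, 64)
v = torch.randn(2, 4, 16, 64)

out_default = F.scaled_dot_product_attention(q, k, v, is_causal=True)   # 1/sqrt(64) scaling
out_scaled  = F.scaled_dot_product_attention(q, k, v, scale=1.0 / 8.0)  # explicit override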
 
13817
14260
  # This aten function is kept so that we can test the choice function from Python
13818
- - func: _fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False) -> int
14261
+ - func: _fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> int
13819
14262
  dispatch:
13820
14263
  Meta: _fused_sdp_choice_meta
13821
14264
  CPU, NestedTensorCPU: _fused_sdp_choice_cpp
13822
14265
  CUDA, NestedTensorCUDA: _fused_sdp_choice_cuda
14266
+ tags: nondeterministic_seeded
13823
14267
 
13824
- - func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None) -> (Tensor, Tensor)
14268
+ - func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None, *, float? scale=None) -> (Tensor, Tensor)
13825
14269
  variants: function
14270
+ tags: nondeterministic_seeded
13826
14271
 
13827
- - func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False) -> (Tensor ouput, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, int philox_seed, int philox_offset, Tensor debug_attn_mask)
14272
+ - func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor ouput, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
13828
14273
  dispatch:
14274
+ CPU: _scaled_dot_product_flash_attention_cpu
13829
14275
  CUDA: _scaled_dot_product_flash_attention_cuda
13830
14276
  NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda
14277
+ tags: nondeterministic_seeded
13831
14278
 
13832
- - func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, int philox_seed, int philox_offset) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
14279
+ - func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
14280
+ device_check: NoCheck
13833
14281
  variants: function
13834
14282
  dispatch:
14283
+ CPU: _scaled_dot_product_flash_attention_backward_cpu
13835
14284
  CUDA: _scaled_dot_product_flash_attention_backward_cuda
13836
14285
 
13837
- - func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, bool compute_log_sumexp, bool is_causal=False) -> (Tensor, Tensor)
14286
+ - func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)
13838
14287
  dispatch:
13839
14288
  CUDA: _scaled_dot_product_efficient_attention_cuda
13840
14289
  NestedTensorCUDA: _scaled_dot_product_efficient_attention_nestedtensor_cuda
14290
+ tags: nondeterministic_seeded
13841
14291
 
13842
- - func: _scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, bool is_causal=False, bool chunk_grad_outputs=False) -> (Tensor, Tensor, Tensor)
14292
+ - func: _scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor attn_bias, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, float dropout_p, bool[4] grad_input_mask, bool is_causal=False, *, float? scale=None) -> (Tensor, Tensor, Tensor, Tensor)
14293
+ device_check: NoCheck
13843
14294
  dispatch:
13844
14295
  CUDA: _scaled_dot_product_efficient_attention_backward_cuda
14296
+ tags: nondeterministic_seeded
13845
14297
 
13846
- - func: _chunk_grad_outputs_efficient_attention(Tensor query, Tensor key, Tensor value, bool is_causal=False) -> bool
13847
- dispatch:
13848
- CUDA: _chunk_grad_outputs_efficient_attention
13849
-
13850
- - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, bool return_debug_mask) -> (Tensor output, Tensor softmax_logsumexp, int philox_seed, int philox_offset, Tensor debug_attn_mask)
14298
+ - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
13851
14299
  variants: function
13852
14300
  dispatch:
13853
14301
  CUDA: _flash_attention_forward
14302
+ tags: nondeterministic_seeded
13854
14303
 
13855
- - func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, int philox_seed, int philox_offset) -> (Tensor, Tensor, Tensor)
14304
+ - func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
14305
+ device_check: NoCheck
13856
14306
  variants: function
13857
14307
  dispatch:
13858
14308
  CUDA: _flash_attention_backward
13859
14309
 
13860
14310
  # Returns ouput, logsumexp if compute_logsumexp
13861
- - func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor)
14311
+ - func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset)
13862
14312
  variants: function
13863
14313
  dispatch:
13864
14314
  CUDA: _efficient_attention_forward
14315
+ tags: nondeterministic_seeded
13865
14316
 
- - func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, bool is_causal=False, bool chunk_grad_outputs=False) -> (Tensor, Tensor, Tensor)
+ - func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int max_seqlen_k, int max_seqlen_q, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor)
+ device_check: NoCheck
  variants: function
  dispatch:
  CUDA: _efficient_attention_backward
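
The forward kernels now return philox_seed/philox_offset as Tensors and carry tags: nondeterministic_seeded: with dropout_p > 0 the result depends on the Philox RNG state. A hedged sketch of making two dropout runs match by re-seeding, assuming the torch.backends.cuda.sdp_kernel context manager is available to pin the memory-efficient backend:

    import torch
    import torch.nn.functional as F

    q, k, v = (torch.randn(1, 4, 32, 64, device="cuda", dtype=torch.float16) for _ in range(3))

    with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_math=False, enable_mem_efficient=True):
        torch.manual_seed(0)
        a = F.scaled_dot_product_attention(q, k, v, dropout_p=0.1)
        torch.manual_seed(0)
        b = F.scaled_dot_product_attention(q, k, v, dropout_p=0.1)
    print(torch.equal(a, b))  # same seed -> same philox_seed/philox_offset -> same dropout mask
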
@@ -13872,8 +14324,15 @@
  variants: function
  dispatch:
  CUDA: triton_scaled_dot_attention
+ tags: nondeterministic_seeded
  autogen: _triton_scaled_dot_attention.out
 
+ - func: _fill_mem_eff_dropout_mask_(Tensor(a!) self, float dropout_p, int seed, int offset) -> Tensor(a!)
+ variants: function
+ dispatch:
+ CUDA: _fill_mem_eff_dropout_mask_
+ tags: nondeterministic_seeded
+
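The new in-place op above can be reached through the generic torch.ops.aten escape hatch. The call below is a sketch based only on the schema shown (CUDA dispatch only); reading the last two arguments as the Philox seed/offset of a prior attention call is an assumption from the name:

    import torch

    # Fill a buffer with the mem-efficient-attention dropout mask for
    # dropout_p=0.1 and Philox seed=42, offset=0 (argument order from the schema).
    mask = torch.empty(2, 8, 128, 128, device="cuda")
    torch.ops.aten._fill_mem_eff_dropout_mask_(mask, 0.1, 42, 0)  # mutates `mask` in place
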
  - func: _triton_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None) -> Tensor
  variants: function
  dispatch:
@@ -13895,18 +14354,6 @@
  variants: function
  tags: pointwise
 
- - func: _transformer_decoder_only_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, Tensor? incr_key=None, Tensor? incr_value=None) -> (Tensor, Tensor, Tensor)
- variants: function
- dispatch:
- CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: transformer_decoder_only_layer_forward
- autogen: _transformer_decoder_only_layer_fwd.out
-
- - func: _native_decoder_only_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, Tensor? incr_key=None, Tensor? incr_value=None, bool need_weights=True, bool average_attn_weights=True) -> (Tensor, Tensor, Tensor, Tensor)
- variants: function
- dispatch:
- CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: native_decoder_only_multi_head_attention
- autogen: _native_decoder_only_multi_head_attention.out
-
  - func: special_bessel_j0(Tensor self) -> Tensor
  python_module: special
  structured_delegate: special_bessel_j0.out
@@ -14603,9 +15050,31 @@
  CUDA: _fused_adam_kernel_cuda_
  autogen: _fused_adam, _fused_adam.out
 
+ - func: _fused_adam_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+ # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now),
+ # but still skip the device check as the Tensor LR can be on CPU
+ device_check: NoCheck
+ variants: function
+ dispatch:
+ CUDA: _fused_adam_kernel_cuda_
+ autogen: _fused_adam.tensor_lr, _fused_adam.tensor_lr_out
+
  - func: _fused_adamw_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
  variants: function
  dispatch:
  CUDA: _fused_adamw_kernel_cuda_
  autogen: _fused_adamw, _fused_adamw.out
+
+ - func: _fused_adamw_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+ # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now),
+ # but still skip the device check as the Tensor LR can be on CPU
+ device_check: NoCheck
+ variants: function
+ dispatch:
+ CUDA: _fused_adamw_kernel_cuda_
+ autogen: _fused_adamw.tensor_lr, _fused_adamw.tensor_lr_out
+
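The .tensor_lr overloads take the learning rate as a Tensor, and device_check: NoCheck is set because that Tensor may live on the CPU while the parameter lists are on CUDA. A hedged sketch of the intended public path, assuming the stock fused Adam/AdamW optimizers in this build accept a Tensor lr; if they do not, the overload can still be reached directly as torch.ops.aten._fused_adamw_.tensor_lr:

    import torch

    model = torch.nn.Linear(16, 16).cuda()
    lr = torch.tensor(1e-3)  # CPU scalar-tensor learning rate
    opt = torch.optim.AdamW(model.parameters(), lr=lr, fused=True)

    model(torch.randn(4, 16, device="cuda")).sum().backward()
    opt.step()  # with a Tensor lr this should route to the _fused_adamw_.tensor_lr kernel
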
+ # This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts.
+ - func: _propagate_xla_data(Tensor input, Tensor output) -> ()
+ variants: function