numba-cuda 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/compiler.py +35 -3
  3. numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
  4. numba_cuda/numba/cuda/cuda_paths.py +2 -0
  5. numba_cuda/numba/cuda/cudadecl.py +0 -42
  6. numba_cuda/numba/cuda/cudadrv/linkable_code.py +11 -2
  7. numba_cuda/numba/cuda/cudadrv/nvrtc.py +10 -3
  8. numba_cuda/numba/cuda/cudaimpl.py +0 -63
  9. numba_cuda/numba/cuda/debuginfo.py +92 -2
  10. numba_cuda/numba/cuda/decorators.py +27 -1
  11. numba_cuda/numba/cuda/device_init.py +4 -5
  12. numba_cuda/numba/cuda/dispatcher.py +4 -3
  13. numba_cuda/numba/cuda/extending.py +54 -0
  14. numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
  15. numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
  16. numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +550 -387
  17. numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +465 -316
  18. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  19. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  20. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  21. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  22. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
  23. numba_cuda/numba/cuda/intrinsics.py +172 -1
  24. numba_cuda/numba/cuda/lowering.py +43 -0
  25. numba_cuda/numba/cuda/stubs.py +0 -11
  26. numba_cuda/numba/cuda/target.py +28 -0
  27. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +4 -2
  28. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +1 -1
  29. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
  30. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +1 -1
  31. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +46 -0
  32. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +18 -0
  33. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +4 -2
  34. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +156 -0
  35. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +1 -1
  36. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +50 -5
  37. numba_cuda/numba/cuda/vector_types.py +3 -1
  38. numba_cuda/numba/cuda/vectorizers.py +1 -1
  39. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/METADATA +1 -1
  40. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/RECORD +43 -33
  41. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/WHEEL +1 -1
  42. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/licenses/LICENSE +0 -0
  43. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.1.dist-info}/top_level.txt +0 -0
@@ -99,6 +99,9 @@
99
99
  #ifndef __CUDA_FP16_H__
100
100
  #define __CUDA_FP16_H__
101
101
 
102
+ #define ___CUDA_FP16_STRINGIFY_INNERMOST(x) #x
103
+ #define __CUDA_FP16_STRINGIFY(x) ___CUDA_FP16_STRINGIFY_INNERMOST(x)
104
+
102
105
  #if defined(__cplusplus)
103
106
  #if defined(__CUDACC__)
104
107
  #define __CUDA_FP16_DECL__ static __device__ __inline__
@@ -151,7 +154,7 @@ struct __half2;
151
154
  * \details Converts double number \p a to half precision in round-to-nearest-even mode.
152
155
  * \param[in] a - double. Is only being read.
153
156
  * \returns half
154
- * \retval a converted to half.
157
+ * - \p a converted to half.
155
158
  * \internal
156
159
  * \exception-guarantee no-throw guarantee
157
160
  * \behavior reentrant, thread safe
@@ -166,7 +169,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a);
166
169
  * \details Converts float number \p a to half precision in round-to-nearest-even mode.
167
170
  * \param[in] a - float. Is only being read.
168
171
  * \returns half
169
- * \retval a converted to half.
172
+ * - \p a converted to half.
170
173
  * \internal
171
174
  * \exception-guarantee no-throw guarantee
172
175
  * \behavior reentrant, thread safe
@@ -181,7 +184,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a);
181
184
  * \details Converts float number \p a to half precision in round-to-nearest-even mode.
182
185
  * \param[in] a - float. Is only being read.
183
186
  * \returns half
184
- * \retval a converted to half.
187
+ * - \p a converted to half.
185
188
  * \internal
186
189
  * \exception-guarantee no-throw guarantee
187
190
  * \behavior reentrant, thread safe
@@ -196,7 +199,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a);
196
199
  * \details Converts float number \p a to half precision in round-towards-zero mode.
197
200
  * \param[in] a - float. Is only being read.
198
201
  * \returns half
199
- * \retval a converted to half.
202
+ * - \p a converted to half.
200
203
  * \internal
201
204
  * \exception-guarantee no-throw guarantee
202
205
  * \behavior reentrant, thread safe
@@ -212,7 +215,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a);
212
215
  * \param[in] a - float. Is only being read.
213
216
  *
214
217
  * \returns half
215
- * \retval a converted to half.
218
+ * - \p a converted to half.
216
219
  * \internal
217
220
  * \exception-guarantee no-throw guarantee
218
221
  * \behavior reentrant, thread safe
@@ -228,7 +231,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a);
228
231
  * \param[in] a - float. Is only being read.
229
232
  *
230
233
  * \returns half
231
- * \retval a converted to half.
234
+ * - \p a converted to half.
232
235
  * \internal
233
236
  * \exception-guarantee no-throw guarantee
234
237
  * \behavior reentrant, thread safe
@@ -243,7 +246,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a);
243
246
  * \param[in] a - float. Is only being read.
244
247
  *
245
248
  * \returns float
246
- * \retval a converted to float.
249
+ * - \p a converted to float.
247
250
  * \internal
248
251
  * \exception-guarantee no-throw guarantee
249
252
  * \behavior reentrant, thread safe
@@ -260,7 +263,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a);
260
263
  * \param[in] a - float. Is only being read.
261
264
  *
262
265
  * \returns half2
263
- * \retval The \p half2 value with both halves equal to the converted half
266
+ * - The \p half2 value with both halves equal to the converted half
264
267
  * precision number.
265
268
  * \internal
266
269
  * \exception-guarantee no-throw guarantee
@@ -281,7 +284,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a);
281
284
  * \param[in] b - float. Is only being read.
282
285
  *
283
286
  * \returns half2
284
- * \retval The \p half2 value with corresponding halves equal to the
287
+ * - The \p half2 value with corresponding halves equal to the
285
288
  * converted input floats.
286
289
  * \internal
287
290
  * \exception-guarantee no-throw guarantee
@@ -298,7 +301,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const flo
298
301
  * \param[in] a - half2. Is only being read.
299
302
  *
300
303
  * \returns float
301
- * \retval The low 16 bits of \p a converted to float.
304
+ * - The low 16 bits of \p a converted to float.
302
305
  * \internal
303
306
  * \exception-guarantee no-throw guarantee
304
307
  * \behavior reentrant, thread safe
@@ -314,13 +317,111 @@ __CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a);
314
317
  * \param[in] a - half2. Is only being read.
315
318
  *
316
319
  * \returns float
317
- * \retval The high 16 bits of \p a converted to float.
320
+ * - The high 16 bits of \p a converted to float.
318
321
  * \internal
319
322
  * \exception-guarantee no-throw guarantee
320
323
  * \behavior reentrant, thread safe
321
324
  * \endinternal
322
325
  */
323
326
  __CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a);
327
+ /**
328
+ * \ingroup CUDA_MATH__HALF_MISC
329
+ * \brief Convert a half to a signed short integer in round-towards-zero mode.
330
+ *
331
+ * \details Convert the half-precision floating-point value \p h to a signed short
332
+ * integer in round-towards-zero mode. NaN inputs are converted to 0.
333
+ * \param[in] h - half. Is only being read.
334
+ *
335
+ * \returns short int
336
+ * - \p h converted to a signed short integer.
337
+ * \internal
338
+ * \exception-guarantee no-throw guarantee
339
+ * \behavior reentrant, thread safe
340
+ * \endinternal
341
+ */
342
+ __CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h);
343
+ /**
344
+ * \ingroup CUDA_MATH__HALF_MISC
345
+ * \brief Convert a half to an unsigned short integer in round-towards-zero
346
+ * mode.
347
+ *
348
+ * \details Convert the half-precision floating-point value \p h to an unsigned short
349
+ * integer in round-towards-zero mode. NaN inputs are converted to 0.
350
+ * \param[in] h - half. Is only being read.
351
+ *
352
+ * \returns unsigned short int
353
+ * - \p h converted to an unsigned short integer.
354
+ * \internal
355
+ * \exception-guarantee no-throw guarantee
356
+ * \behavior reentrant, thread safe
357
+ * \endinternal
358
+ */
359
+ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h);
360
+ /**
361
+ * \ingroup CUDA_MATH__HALF_MISC
362
+ * \brief Convert a half to a signed integer in round-towards-zero mode.
363
+ *
364
+ * \details Convert the half-precision floating-point value \p h to a signed integer in
365
+ * round-towards-zero mode. NaN inputs are converted to 0.
366
+ * \param[in] h - half. Is only being read.
367
+ *
368
+ * \returns int
369
+ * - \p h converted to a signed integer.
370
+ * \internal
371
+ * \exception-guarantee no-throw guarantee
372
+ * \behavior reentrant, thread safe
373
+ * \endinternal
374
+ */
375
+ __CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h);
376
+ /**
377
+ * \ingroup CUDA_MATH__HALF_MISC
378
+ * \brief Convert a half to an unsigned integer in round-towards-zero mode.
379
+ *
380
+ * \details Convert the half-precision floating-point value \p h to an unsigned integer
381
+ * in round-towards-zero mode. NaN inputs are converted to 0.
382
+ * \param[in] h - half. Is only being read.
383
+ *
384
+ * \returns unsigned int
385
+ * - \p h converted to an unsigned integer.
386
+ * \internal
387
+ * \exception-guarantee no-throw guarantee
388
+ * \behavior reentrant, thread safe
389
+ * \endinternal
390
+ */
391
+ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h);
392
+ /**
393
+ * \ingroup CUDA_MATH__HALF_MISC
394
+ * \brief Convert a half to a signed 64-bit integer in round-towards-zero mode.
395
+ *
396
+ * \details Convert the half-precision floating-point value \p h to a signed 64-bit
397
+ * integer in round-towards-zero mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
398
+ * \param[in] h - half. Is only being read.
399
+ *
400
+ * \returns long long int
401
+ * - \p h converted to a signed 64-bit integer.
402
+ * \internal
403
+ * \exception-guarantee no-throw guarantee
404
+ * \behavior reentrant, thread safe
405
+ * \endinternal
406
+ */
407
+ __CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h);
408
+ /**
409
+ * \ingroup CUDA_MATH__HALF_MISC
410
+ * \brief Convert a half to an unsigned 64-bit integer in round-towards-zero
411
+ * mode.
412
+ *
413
+ * \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
414
+ * integer in round-towards-zero mode. NaN inputs return 0x8000000000000000.
415
+ * \param[in] h - half. Is only being read.
416
+ *
417
+ * \returns unsigned long long int
418
+ * - \p h converted to an unsigned 64-bit integer.
419
+ * \internal
420
+ * \exception-guarantee no-throw guarantee
421
+ * \behavior reentrant, thread safe
422
+ * \endinternal
423
+ */
424
+ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h);
324
425
 
325
426
  #if defined(__CUDACC__)
326
427
  /**
@@ -335,7 +436,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a);
335
436
  * \param[in] a - float2. Is only being read.
336
437
  *
337
438
  * \returns half2
338
- * \retval The \p half2 which has corresponding halves equal to the
439
+ * - The \p half2 which has corresponding halves equal to the
339
440
  * converted float2 components.
340
441
  * \internal
341
442
  * \exception-guarantee no-throw guarantee
@@ -352,7 +453,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a);
352
453
  * \param[in] a - half2. Is only being read.
353
454
  *
354
455
  * \returns float2
355
- * \retval a converted to float2.
456
+ * - \p a converted to float2.
356
457
  * \internal
357
458
  * \exception-guarantee no-throw guarantee
358
459
  * \behavior reentrant, thread safe
@@ -364,11 +465,11 @@ __CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a);
364
465
  * \brief Convert a half to a signed integer in round-to-nearest-even mode.
365
466
  *
366
467
  * \details Convert the half-precision floating-point value \p h to a signed integer in
367
- * round-to-nearest-even mode.
468
+ * round-to-nearest-even mode. NaN inputs are converted to 0.
368
469
  * \param[in] h - half. Is only being read.
369
470
  *
370
471
  * \returns int
371
- * \retval h converted to a signed integer.
472
+ * - \p h converted to a signed integer.
372
473
  * \internal
373
474
  * \exception-guarantee no-throw guarantee
374
475
  * \behavior reentrant, thread safe
@@ -377,30 +478,14 @@ __CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a);
377
478
  __CUDA_FP16_DECL__ int __half2int_rn(const __half h);
378
479
  /**
379
480
  * \ingroup CUDA_MATH__HALF_MISC
380
- * \brief Convert a half to a signed integer in round-towards-zero mode.
381
- *
382
- * \details Convert the half-precision floating-point value \p h to a signed integer in
383
- * round-towards-zero mode.
384
- * \param[in] h - half. Is only being read.
385
- *
386
- * \returns int
387
- * \retval h converted to a signed integer.
388
- * \internal
389
- * \exception-guarantee no-throw guarantee
390
- * \behavior reentrant, thread safe
391
- * \endinternal
392
- */
393
- __CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h);
394
- /**
395
- * \ingroup CUDA_MATH__HALF_MISC
396
481
  * \brief Convert a half to a signed integer in round-down mode.
397
482
  *
398
483
  * \details Convert the half-precision floating-point value \p h to a signed integer in
399
- * round-down mode.
484
+ * round-down mode. NaN inputs are converted to 0.
400
485
  * \param[in] h - half. Is only being read.
401
486
  *
402
487
  * \returns int
403
- * \retval h converted to a signed integer.
488
+ * - \p h converted to a signed integer.
404
489
  * \internal
405
490
  * \exception-guarantee no-throw guarantee
406
491
  * \behavior reentrant, thread safe
@@ -412,11 +497,11 @@ __CUDA_FP16_DECL__ int __half2int_rd(const __half h);
412
497
  * \brief Convert a half to a signed integer in round-up mode.
413
498
  *
414
499
  * \details Convert the half-precision floating-point value \p h to a signed integer in
415
- * round-up mode.
500
+ * round-up mode. NaN inputs are converted to 0.
416
501
  * \param[in] h - half. Is only being read.
417
502
  *
418
503
  * \returns int
419
- * \retval h converted to a signed integer.
504
+ * - \p h converted to a signed integer.
420
505
  * \internal
421
506
  * \exception-guarantee no-throw guarantee
422
507
  * \behavior reentrant, thread safe
@@ -433,7 +518,7 @@ __CUDA_FP16_DECL__ int __half2int_ru(const __half h);
433
518
  * \param[in] i - int. Is only being read.
434
519
  *
435
520
  * \returns half
436
- * \retval i converted to half.
521
+ * - \p i converted to half.
437
522
  * \internal
438
523
  * \exception-guarantee no-throw guarantee
439
524
  * \behavior reentrant, thread safe
@@ -449,7 +534,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i);
449
534
  * \param[in] i - int. Is only being read.
450
535
  *
451
536
  * \returns half
452
- * \retval i converted to half.
537
+ * - \p i converted to half.
453
538
  * \internal
454
539
  * \exception-guarantee no-throw guarantee
455
540
  * \behavior reentrant, thread safe
@@ -465,7 +550,7 @@ __CUDA_FP16_DECL__ __half __int2half_rz(const int i);
465
550
  * \param[in] i - int. Is only being read.
466
551
  *
467
552
  * \returns half
468
- * \retval i converted to half.
553
+ * - \p i converted to half.
469
554
  * \internal
470
555
  * \exception-guarantee no-throw guarantee
471
556
  * \behavior reentrant, thread safe
@@ -481,7 +566,7 @@ __CUDA_FP16_DECL__ __half __int2half_rd(const int i);
481
566
  * \param[in] i - int. Is only being read.
482
567
  *
483
568
  * \returns half
484
- * \retval i converted to half.
569
+ * - \p i converted to half.
485
570
  * \internal
486
571
  * \exception-guarantee no-throw guarantee
487
572
  * \behavior reentrant, thread safe
@@ -495,11 +580,11 @@ __CUDA_FP16_DECL__ __half __int2half_ru(const int i);
495
580
  * mode.
496
581
  *
497
582
  * \details Convert the half-precision floating-point value \p h to a signed short
498
- * integer in round-to-nearest-even mode.
583
+ * integer in round-to-nearest-even mode. NaN inputs are converted to 0.
499
584
  * \param[in] h - half. Is only being read.
500
585
  *
501
586
  * \returns short int
502
- * \retval h converted to a signed short integer.
587
+ * - \p h converted to a signed short integer.
503
588
  * \internal
504
589
  * \exception-guarantee no-throw guarantee
505
590
  * \behavior reentrant, thread safe
@@ -508,30 +593,14 @@ __CUDA_FP16_DECL__ __half __int2half_ru(const int i);
508
593
  __CUDA_FP16_DECL__ short int __half2short_rn(const __half h);
509
594
  /**
510
595
  * \ingroup CUDA_MATH__HALF_MISC
511
- * \brief Convert a half to a signed short integer in round-towards-zero mode.
512
- *
513
- * \details Convert the half-precision floating-point value \p h to a signed short
514
- * integer in round-towards-zero mode.
515
- * \param[in] h - half. Is only being read.
516
- *
517
- * \returns short int
518
- * \retval h converted to a signed short integer.
519
- * \internal
520
- * \exception-guarantee no-throw guarantee
521
- * \behavior reentrant, thread safe
522
- * \endinternal
523
- */
524
- __CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h);
525
- /**
526
- * \ingroup CUDA_MATH__HALF_MISC
527
596
  * \brief Convert a half to a signed short integer in round-down mode.
528
597
  *
529
598
  * \details Convert the half-precision floating-point value \p h to a signed short
530
- * integer in round-down mode.
599
+ * integer in round-down mode. NaN inputs are converted to 0.
531
600
  * \param[in] h - half. Is only being read.
532
601
  *
533
602
  * \returns short int
534
- * \retval h converted to a signed short integer.
603
+ * - \p h converted to a signed short integer.
535
604
  * \internal
536
605
  * \exception-guarantee no-throw guarantee
537
606
  * \behavior reentrant, thread safe
@@ -543,11 +612,11 @@ __CUDA_FP16_DECL__ short int __half2short_rd(const __half h);
543
612
  * \brief Convert a half to a signed short integer in round-up mode.
544
613
  *
545
614
  * \details Convert the half-precision floating-point value \p h to a signed short
546
- * integer in round-up mode.
615
+ * integer in round-up mode. NaN inputs are converted to 0.
547
616
  * \param[in] h - half. Is only being read.
548
617
  *
549
618
  * \returns short int
550
- * \retval h converted to a signed short integer.
619
+ * - \p h converted to a signed short integer.
551
620
  * \internal
552
621
  * \exception-guarantee no-throw guarantee
553
622
  * \behavior reentrant, thread safe
@@ -565,7 +634,7 @@ __CUDA_FP16_DECL__ short int __half2short_ru(const __half h);
565
634
  * \param[in] i - short int. Is only being read.
566
635
  *
567
636
  * \returns half
568
- * \retval i converted to half.
637
+ * - \p i converted to half.
569
638
  * \internal
570
639
  * \exception-guarantee no-throw guarantee
571
640
  * \behavior reentrant, thread safe
@@ -581,7 +650,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i);
581
650
  * \param[in] i - short int. Is only being read.
582
651
  *
583
652
  * \returns half
584
- * \retval i converted to half.
653
+ * - \p i converted to half.
585
654
  * \internal
586
655
  * \exception-guarantee no-throw guarantee
587
656
  * \behavior reentrant, thread safe
@@ -597,7 +666,7 @@ __CUDA_FP16_DECL__ __half __short2half_rz(const short int i);
597
666
  * \param[in] i - short int. Is only being read.
598
667
  *
599
668
  * \returns half
600
- * \retval i converted to half.
669
+ * - \p i converted to half.
601
670
  * \internal
602
671
  * \exception-guarantee no-throw guarantee
603
672
  * \behavior reentrant, thread safe
@@ -613,7 +682,7 @@ __CUDA_FP16_DECL__ __half __short2half_rd(const short int i);
613
682
  * \param[in] i - short int. Is only being read.
614
683
  *
615
684
  * \returns half
616
- * \retval i converted to half.
685
+ * - \p i converted to half.
617
686
  * \internal
618
687
  * \exception-guarantee no-throw guarantee
619
688
  * \behavior reentrant, thread safe
@@ -626,11 +695,11 @@ __CUDA_FP16_DECL__ __half __short2half_ru(const short int i);
626
695
  * \brief Convert a half to an unsigned integer in round-to-nearest-even mode.
627
696
  *
628
697
  * \details Convert the half-precision floating-point value \p h to an unsigned integer
629
- * in round-to-nearest-even mode.
698
+ * in round-to-nearest-even mode. NaN inputs are converted to 0.
630
699
  * \param[in] h - half. Is only being read.
631
700
  *
632
701
  * \returns unsigned int
633
- * \retval h converted to an unsigned integer.
702
+ * - \p h converted to an unsigned integer.
634
703
  * \internal
635
704
  * \exception-guarantee no-throw guarantee
636
705
  * \behavior reentrant, thread safe
@@ -639,30 +708,14 @@ __CUDA_FP16_DECL__ __half __short2half_ru(const short int i);
639
708
  __CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h);
640
709
  /**
641
710
  * \ingroup CUDA_MATH__HALF_MISC
642
- * \brief Convert a half to an unsigned integer in round-towards-zero mode.
643
- *
644
- * \details Convert the half-precision floating-point value \p h to an unsigned integer
645
- * in round-towards-zero mode.
646
- * \param[in] h - half. Is only being read.
647
- *
648
- * \returns unsigned int
649
- * \retval h converted to an unsigned integer.
650
- * \internal
651
- * \exception-guarantee no-throw guarantee
652
- * \behavior reentrant, thread safe
653
- * \endinternal
654
- */
655
- __CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h);
656
- /**
657
- * \ingroup CUDA_MATH__HALF_MISC
658
711
  * \brief Convert a half to an unsigned integer in round-down mode.
659
712
  *
660
713
  * \details Convert the half-precision floating-point value \p h to an unsigned integer
661
- * in round-down mode.
714
+ * in round-down mode. NaN inputs are converted to 0.
662
715
  * \param[in] h - half. Is only being read.
663
716
  *
664
717
  * \returns unsigned int
665
- * \retval h converted to an unsigned integer.
718
+ * - \p h converted to an unsigned integer.
666
719
  * \internal
667
720
  * \exception-guarantee no-throw guarantee
668
721
  * \behavior reentrant, thread safe
@@ -674,11 +727,11 @@ __CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h);
674
727
  * \brief Convert a half to an unsigned integer in round-up mode.
675
728
  *
676
729
  * \details Convert the half-precision floating-point value \p h to an unsigned integer
677
- * in round-up mode.
730
+ * in round-up mode. NaN inputs are converted to 0.
678
731
  * \param[in] h - half. Is only being read.
679
732
  *
680
733
  * \returns unsigned int
681
- * \retval h converted to an unsigned integer.
734
+ * - \p h converted to an unsigned integer.
682
735
  * \internal
683
736
  * \exception-guarantee no-throw guarantee
684
737
  * \behavior reentrant, thread safe
@@ -695,7 +748,7 @@ __CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h);
695
748
  * \param[in] i - unsigned int. Is only being read.
696
749
  *
697
750
  * \returns half
698
- * \retval i converted to half.
751
+ * - \p i converted to half.
699
752
  * \internal
700
753
  * \exception-guarantee no-throw guarantee
701
754
  * \behavior reentrant, thread safe
@@ -711,7 +764,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i);
711
764
  * \param[in] i - unsigned int. Is only being read.
712
765
  *
713
766
  * \returns half
714
- * \retval i converted to half.
767
+ * - \p i converted to half.
715
768
  * \internal
716
769
  * \exception-guarantee no-throw guarantee
717
770
  * \behavior reentrant, thread safe
@@ -727,7 +780,7 @@ __CUDA_FP16_DECL__ __half __uint2half_rz(const unsigned int i);
727
780
  * \param[in] i - unsigned int. Is only being read.
728
781
  *
729
782
  * \returns half
730
- * \retval i converted to half.
783
+ * - \p i converted to half.
731
784
  * \internal
732
785
  * \exception-guarantee no-throw guarantee
733
786
  * \behavior reentrant, thread safe
@@ -743,7 +796,7 @@ __CUDA_FP16_DECL__ __half __uint2half_rd(const unsigned int i);
743
796
  * \param[in] i - unsigned int. Is only being read.
744
797
  *
745
798
  * \returns half
746
- * \retval i converted to half.
799
+ * - \p i converted to half.
747
800
  * \internal
748
801
  * \exception-guarantee no-throw guarantee
749
802
  * \behavior reentrant, thread safe
@@ -757,11 +810,11 @@ __CUDA_FP16_DECL__ __half __uint2half_ru(const unsigned int i);
757
810
  * mode.
758
811
  *
759
812
  * \details Convert the half-precision floating-point value \p h to an unsigned short
760
- * integer in round-to-nearest-even mode.
813
+ * integer in round-to-nearest-even mode. NaN inputs are converted to 0.
761
814
  * \param[in] h - half. Is only being read.
762
815
  *
763
816
  * \returns unsigned short int
764
- * \retval h converted to an unsigned short integer.
817
+ * - \p h converted to an unsigned short integer.
765
818
  * \internal
766
819
  * \exception-guarantee no-throw guarantee
767
820
  * \behavior reentrant, thread safe
@@ -770,31 +823,14 @@ __CUDA_FP16_DECL__ __half __uint2half_ru(const unsigned int i);
770
823
  __CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h);
771
824
  /**
772
825
  * \ingroup CUDA_MATH__HALF_MISC
773
- * \brief Convert a half to an unsigned short integer in round-towards-zero
774
- * mode.
775
- *
776
- * \details Convert the half-precision floating-point value \p h to an unsigned short
777
- * integer in round-towards-zero mode.
778
- * \param[in] h - half. Is only being read.
779
- *
780
- * \returns unsigned short int
781
- * \retval h converted to an unsigned short integer.
782
- * \internal
783
- * \exception-guarantee no-throw guarantee
784
- * \behavior reentrant, thread safe
785
- * \endinternal
786
- */
787
- __CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h);
788
- /**
789
- * \ingroup CUDA_MATH__HALF_MISC
790
826
  * \brief Convert a half to an unsigned short integer in round-down mode.
791
827
  *
792
828
  * \details Convert the half-precision floating-point value \p h to an unsigned short
793
- * integer in round-down mode.
829
+ * integer in round-down mode. NaN inputs are converted to 0.
794
830
  * \param[in] h - half. Is only being read.
795
831
  *
796
832
  * \returns unsigned short int
797
- * \retval h converted to an unsigned short integer.
833
+ * - \p h converted to an unsigned short integer.
798
834
  */
799
835
  __CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h);
800
836
  /**
@@ -802,11 +838,11 @@ __CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h);
802
838
  * \brief Convert a half to an unsigned short integer in round-up mode.
803
839
  *
804
840
  * \details Convert the half-precision floating-point value \p h to an unsigned short
805
- * integer in round-up mode.
841
+ * integer in round-up mode. NaN inputs are converted to 0.
806
842
  * \param[in] h - half. Is only being read.
807
843
  *
808
844
  * \returns unsigned short int
809
- * \retval h converted to an unsigned short integer.
845
+ * - \p h converted to an unsigned short integer.
810
846
  */
811
847
  __CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h);
812
848
 
@@ -820,7 +856,7 @@ __CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h);
820
856
  * \param[in] i - unsigned short int. Is only being read.
821
857
  *
822
858
  * \returns half
823
- * \retval i converted to half.
859
+ * - \p i converted to half.
824
860
  * \internal
825
861
  * \exception-guarantee no-throw guarantee
826
862
  * \behavior reentrant, thread safe
@@ -837,7 +873,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i
837
873
  * \param[in] i - unsigned short int. Is only being read.
838
874
  *
839
875
  * \returns half
840
- * \retval i converted to half.
876
+ * - \p i converted to half.
841
877
  * \internal
842
878
  * \exception-guarantee no-throw guarantee
843
879
  * \behavior reentrant, thread safe
@@ -853,7 +889,7 @@ __CUDA_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i);
853
889
  * \param[in] i - unsigned short int. Is only being read.
854
890
  *
855
891
  * \returns half
856
- * \retval i converted to half.
892
+ * - \p i converted to half.
857
893
  * \internal
858
894
  * \exception-guarantee no-throw guarantee
859
895
  * \behavior reentrant, thread safe
@@ -869,7 +905,7 @@ __CUDA_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i);
869
905
  * \param[in] i - unsigned short int. Is only being read.
870
906
  *
871
907
  * \returns half
872
- * \retval i converted to half.
908
+ * - \p i converted to half.
873
909
  * \internal
874
910
  * \exception-guarantee no-throw guarantee
875
911
  * \behavior reentrant, thread safe
@@ -883,11 +919,11 @@ __CUDA_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i);
883
919
  * mode.
884
920
  *
885
921
  * \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
886
- * integer in round-to-nearest-even mode.
922
+ * integer in round-to-nearest-even mode. NaN inputs return 0x8000000000000000.
887
923
  * \param[in] h - half. Is only being read.
888
924
  *
889
925
  * \returns unsigned long long int
890
- * \retval h converted to an unsigned 64-bit integer.
926
+ * - \p h converted to an unsigned 64-bit integer.
891
927
  * \internal
892
928
  * \exception-guarantee no-throw guarantee
893
929
  * \behavior reentrant, thread safe
@@ -896,31 +932,14 @@ __CUDA_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i);
896
932
  __CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h);
897
933
  /**
898
934
  * \ingroup CUDA_MATH__HALF_MISC
899
- * \brief Convert a half to an unsigned 64-bit integer in round-towards-zero
900
- * mode.
901
- *
902
- * \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
903
- * integer in round-towards-zero mode.
904
- * \param[in] h - half. Is only being read.
905
- *
906
- * \returns unsigned long long int
907
- * \retval h converted to an unsigned 64-bit integer.
908
- * \internal
909
- * \exception-guarantee no-throw guarantee
910
- * \behavior reentrant, thread safe
911
- * \endinternal
912
- */
913
- __CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h);
914
- /**
915
- * \ingroup CUDA_MATH__HALF_MISC
916
935
  * \brief Convert a half to an unsigned 64-bit integer in round-down mode.
917
936
  *
918
937
  * \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
919
- * integer in round-down mode.
938
+ * integer in round-down mode. NaN inputs return 0x8000000000000000.
920
939
  * \param[in] h - half. Is only being read.
921
940
  *
922
941
  * \returns unsigned long long int
923
- * \retval h converted to an unsigned 64-bit integer.
942
+ * - \p h converted to an unsigned 64-bit integer.
924
943
  * \internal
925
944
  * \exception-guarantee no-throw guarantee
926
945
  * \behavior reentrant, thread safe
@@ -932,11 +951,11 @@ __CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h);
932
951
  * \brief Convert a half to an unsigned 64-bit integer in round-up mode.
933
952
  *
934
953
  * \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
935
- * integer in round-up mode.
954
+ * integer in round-up mode. NaN inputs return 0x8000000000000000.
936
955
  * \param[in] h - half. Is only being read.
937
956
  *
938
957
  * \returns unsigned long long int
939
- * \retval h converted to an unsigned 64-bit integer.
958
+ * - \p h converted to an unsigned 64-bit integer.
940
959
  * \internal
941
960
  * \exception-guarantee no-throw guarantee
942
961
  * \behavior reentrant, thread safe
@@ -954,7 +973,7 @@ __CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h);
954
973
  * \param[in] i - unsigned long long int. Is only being read.
955
974
  *
956
975
  * \returns half
957
- * \retval i converted to half.
976
+ * - \p i converted to half.
958
977
  * \internal
959
978
  * \exception-guarantee no-throw guarantee
960
979
  * \behavior reentrant, thread safe
@@ -971,7 +990,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int
971
990
  * \param[in] i - unsigned long long int. Is only being read.
972
991
  *
973
992
  * \returns half
974
- * \retval i converted to half.
993
+ * - \p i converted to half.
975
994
  * \internal
976
995
  * \exception-guarantee no-throw guarantee
977
996
  * \behavior reentrant, thread safe
@@ -987,7 +1006,7 @@ __CUDA_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i);
987
1006
  * \param[in] i - unsigned long long int. Is only being read.
988
1007
  *
989
1008
  * \returns half
990
- * \retval i converted to half.
1009
+ * - \p i converted to half.
991
1010
  * \internal
992
1011
  * \exception-guarantee no-throw guarantee
993
1012
  * \behavior reentrant, thread safe
@@ -1003,7 +1022,7 @@ __CUDA_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i);
1003
1022
  * \param[in] i - unsigned long long int. Is only being read.
1004
1023
  *
1005
1024
  * \returns half
1006
- * \retval i converted to half.
1025
+ * - \p i converted to half.
1007
1026
  * \internal
1008
1027
  * \exception-guarantee no-throw guarantee
1009
1028
  * \behavior reentrant, thread safe
@@ -1017,11 +1036,11 @@ __CUDA_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i);
1017
1036
  * mode.
1018
1037
  *
1019
1038
  * \details Convert the half-precision floating-point value \p h to a signed 64-bit
1020
- * integer in round-to-nearest-even mode.
1039
+ * integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
1021
1040
  * \param[in] h - half. Is only being read.
1022
1041
  *
1023
1042
  * \returns long long int
1024
- * \retval h converted to a signed 64-bit integer.
1043
+ * - \p h converted to a signed 64-bit integer.
1025
1044
  * \internal
1026
1045
  * \exception-guarantee no-throw guarantee
1027
1046
  * \behavior reentrant, thread safe
@@ -1030,30 +1049,14 @@ __CUDA_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i);
1030
1049
  __CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h);
1031
1050
  /**
1032
1051
  * \ingroup CUDA_MATH__HALF_MISC
1033
- * \brief Convert a half to a signed 64-bit integer in round-towards-zero mode.
1034
- *
1035
- * \details Convert the half-precision floating-point value \p h to a signed 64-bit
1036
- * integer in round-towards-zero mode.
1037
- * \param[in] h - half. Is only being read.
1038
- *
1039
- * \returns long long int
1040
- * \retval h converted to a signed 64-bit integer.
1041
- * \internal
1042
- * \exception-guarantee no-throw guarantee
1043
- * \behavior reentrant, thread safe
1044
- * \endinternal
1045
- */
1046
- __CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h);
1047
- /**
1048
- * \ingroup CUDA_MATH__HALF_MISC
1049
1052
  * \brief Convert a half to a signed 64-bit integer in round-down mode.
1050
1053
  *
1051
1054
  * \details Convert the half-precision floating-point value \p h to a signed 64-bit
1052
- * integer in round-down mode.
1055
+ * integer in round-down mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
1053
1056
  * \param[in] h - half. Is only being read.
1054
1057
  *
1055
1058
  * \returns long long int
1056
- * \retval h converted to a signed 64-bit integer.
1059
+ * - \p h converted to a signed 64-bit integer.
1057
1060
  * \internal
1058
1061
  * \exception-guarantee no-throw guarantee
1059
1062
  * \behavior reentrant, thread safe
@@ -1065,11 +1068,11 @@ __CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h);
1065
1068
  * \brief Convert a half to a signed 64-bit integer in round-up mode.
1066
1069
  *
1067
1070
  * \details Convert the half-precision floating-point value \p h to a signed 64-bit
1068
- * integer in round-up mode.
1071
+ * integer in round-up mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
1069
1072
  * \param[in] h - half. Is only being read.
1070
1073
  *
1071
1074
  * \returns long long int
1072
- * \retval h converted to a signed 64-bit integer.
1075
+ * - \p h converted to a signed 64-bit integer.
1073
1076
  * \internal
1074
1077
  * \exception-guarantee no-throw guarantee
1075
1078
  * \behavior reentrant, thread safe
@@ -1087,7 +1090,7 @@ __CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h);
1087
1090
  * \param[in] i - long long int. Is only being read.
1088
1091
  *
1089
1092
  * \returns half
1090
- * \retval i converted to half.
1093
+ * - \p i converted to half.
1091
1094
  * \internal
1092
1095
  * \exception-guarantee no-throw guarantee
1093
1096
  * \behavior reentrant, thread safe
@@ -1103,7 +1106,7 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i);
1103
1106
  * \param[in] i - long long int. Is only being read.
1104
1107
  *
1105
1108
  * \returns half
1106
- * \retval i converted to half.
1109
+ * - \p i converted to half.
1107
1110
  */
1108
1111
  __CUDA_FP16_DECL__ __half __ll2half_rz(const long long int i);
1109
1112
  /**
@@ -1115,7 +1118,7 @@ __CUDA_FP16_DECL__ __half __ll2half_rz(const long long int i);
1115
1118
  * \param[in] i - long long int. Is only being read.
1116
1119
  *
1117
1120
  * \returns half
1118
- * \retval i converted to half.
1121
+ * - \p i converted to half.
1119
1122
  * \internal
1120
1123
  * \exception-guarantee no-throw guarantee
1121
1124
  * \behavior reentrant, thread safe
@@ -1131,7 +1134,7 @@ __CUDA_FP16_DECL__ __half __ll2half_rd(const long long int i);
1131
1134
  * \param[in] i - long long int. Is only being read.
1132
1135
  *
1133
1136
  * \returns half
1134
- * \retval i converted to half.
1137
+ * - \p i converted to half.
1135
1138
  * \internal
1136
1139
  * \exception-guarantee no-throw guarantee
1137
1140
  * \behavior reentrant, thread safe
@@ -1148,7 +1151,7 @@ __CUDA_FP16_DECL__ __half __ll2half_ru(const long long int i);
1148
1151
  * \param[in] h - half. Is only being read.
1149
1152
  *
1150
1153
  * \returns half
1151
- * \retval The truncated integer value.
1154
+ * - The truncated integer value.
1152
1155
  * \internal
1153
1156
  * \exception-guarantee no-throw guarantee
1154
1157
  * \behavior reentrant, thread safe
@@ -1163,7 +1166,7 @@ __CUDA_FP16_DECL__ __half htrunc(const __half h);
1163
1166
  * \param[in] h - half. Is only being read.
1164
1167
  *
1165
1168
  * \returns half
1166
- * \retval The smallest integer value not less than \p h.
1169
+ * - The smallest integer value not less than \p h.
1167
1170
  * \internal
1168
1171
  * \exception-guarantee no-throw guarantee
1169
1172
  * \behavior reentrant, thread safe
@@ -1178,7 +1181,7 @@ __CUDA_FP16_DECL__ __half hceil(const __half h);
1178
1181
  * \param[in] h - half. Is only being read.
1179
1182
  *
1180
1183
  * \returns half
1181
- * \retval The largest integer value which is less than or equal to \p h.
1184
+ * - The largest integer value which is less than or equal to \p h.
1182
1185
  * \internal
1183
1186
  * \exception-guarantee no-throw guarantee
1184
1187
  * \behavior reentrant, thread safe
@@ -1195,7 +1198,7 @@ __CUDA_FP16_DECL__ __half hfloor(const __half h);
1195
1198
  * \param[in] h - half. Is only being read.
1196
1199
  *
1197
1200
  * \returns half
1198
- * \retval The nearest integer to \p h.
1201
+ * - The nearest integer to \p h.
1199
1202
  * \internal
1200
1203
  * \exception-guarantee no-throw guarantee
1201
1204
  * \behavior reentrant, thread safe
@@ -1212,7 +1215,7 @@ __CUDA_FP16_DECL__ __half hrint(const __half h);
1212
1215
  * \param[in] h - half2. Is only being read.
1213
1216
  *
1214
1217
  * \returns half2
1215
- * \retval The truncated \p h.
1218
+ * - The truncated \p h.
1216
1219
  * \internal
1217
1220
  * \exception-guarantee no-throw guarantee
1218
1221
  * \behavior reentrant, thread safe
@@ -1228,7 +1231,7 @@ __CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h);
1228
1231
  * \param[in] h - half2. Is only being read.
1229
1232
  *
1230
1233
  * \returns half2
1231
- * \retval The vector of smallest integers not less than \p h.
1234
+ * - The vector of smallest integers not less than \p h.
1232
1235
  * \internal
1233
1236
  * \exception-guarantee no-throw guarantee
1234
1237
  * \behavior reentrant, thread safe
@@ -1244,7 +1247,7 @@ __CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h);
1244
1247
  * \param[in] h - half2. Is only being read.
1245
1248
  *
1246
1249
  * \returns half2
1247
- * \retval The vector of largest integers which is less than or equal to \p h.
1250
+ * - The vector of largest integers which is less than or equal to \p h.
1248
1251
  * \internal
1249
1252
  * \exception-guarantee no-throw guarantee
1250
1253
  * \behavior reentrant, thread safe
@@ -1262,7 +1265,7 @@ __CUDA_FP16_DECL__ __half2 h2floor(const __half2 h);
1262
1265
  * \param[in] h - half2. Is only being read.
1263
1266
  *
1264
1267
  * \returns half2
1265
- * \retval The vector of rounded integer values.
1268
+ * - The vector of rounded integer values.
1266
1269
  * \internal
1267
1270
  * \exception-guarantee no-throw guarantee
1268
1271
  * \behavior reentrant, thread safe
@@ -1279,7 +1282,7 @@ __CUDA_FP16_DECL__ __half2 h2rint(const __half2 h);
1279
1282
  * \param[in] a - half. Is only being read.
1280
1283
  *
1281
1284
  * \returns half2
1282
- * \retval The vector which has both its halves equal to the input \p a.
1285
+ * - The vector which has both its halves equal to the input \p a.
1283
1286
  * \internal
1284
1287
  * \exception-guarantee no-throw guarantee
1285
1288
  * \behavior reentrant, thread safe
@@ -1295,7 +1298,7 @@ __CUDA_FP16_DECL__ __half2 __half2half2(const __half a);
1295
1298
  * \param[in] a - half2. Is only being read.
1296
1299
  *
1297
1300
  * \returns half2
1298
- * \retval a with its halves being swapped.
1301
+ * - \p a with its halves being swapped.
1299
1302
  * \internal
1300
1303
  * \exception-guarantee no-throw guarantee
1301
1304
  * \behavior reentrant, thread safe
@@ -1315,7 +1318,7 @@ __CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a);
1315
1318
  * \param[in] b - half2. Is only being read.
1316
1319
  *
1317
1320
  * \returns half2
1318
- * \retval The low 16 bits of \p a and of \p b.
1321
+ * - The low 16 bits of \p a and of \p b.
1319
1322
  * \internal
1320
1323
  * \exception-guarantee no-throw guarantee
1321
1324
  * \behavior reentrant, thread safe
@@ -1335,7 +1338,7 @@ __CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b);
1335
1338
  * \param[in] b - half2. Is only being read.
1336
1339
  *
1337
1340
  * \returns half2
1338
- * \retval The high 16 bits of \p a and of \p b.
1341
+ * - The high 16 bits of \p a and of \p b.
1339
1342
  * \internal
1340
1343
  * \exception-guarantee no-throw guarantee
1341
1344
  * \behavior reentrant, thread safe
@@ -1350,7 +1353,7 @@ __CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b);
1350
1353
  * \param[in] a - half2. Is only being read.
1351
1354
  *
1352
1355
  * \returns half
1353
- * \retval The high 16 bits of the input.
1356
+ * - The high 16 bits of the input.
1354
1357
  * \internal
1355
1358
  * \exception-guarantee no-throw guarantee
1356
1359
  * \behavior reentrant, thread safe
@@ -1365,7 +1368,7 @@ __CUDA_FP16_DECL__ __half __high2half(const __half2 a);
1365
1368
  * \param[in] a - half2. Is only being read.
1366
1369
  *
1367
1370
  * \returns half
1368
- * \retval Returns \p half which contains low 16 bits of the input \p a.
1371
+ * - Returns \p half which contains low 16 bits of the input \p a.
1369
1372
  * \internal
1370
1373
  * \exception-guarantee no-throw guarantee
1371
1374
  * \behavior reentrant, thread safe
@@ -1380,9 +1383,9 @@ __CUDA_FP16_DECL__ __half __low2half(const __half2 a);
1380
1383
  * \param[in] a - half. Is only being read.
1381
1384
  *
1382
1385
  * \returns int
1383
- * \retval -1 iff \p a is equal to negative infinity,
1384
- * \retval 1 iff \p a is equal to positive infinity,
1385
- * \retval 0 otherwise.
1386
+ * - -1 iff \p a is equal to negative infinity,
1387
+ * - 1 iff \p a is equal to positive infinity,
1388
+ * - 0 otherwise.
1386
1389
  * \internal
1387
1390
  * \exception-guarantee no-throw guarantee
1388
1391
  * \behavior reentrant, thread safe
@@ -1400,7 +1403,7 @@ __CUDA_FP16_DECL__ int __hisinf(const __half a);
1400
1403
  * \param[in] b - half. Is only being read.
1401
1404
  *
1402
1405
  * \returns half2
1403
- * \retval The half2 with one half equal to \p a and the other to \p b.
1406
+ * - The half2 with one half equal to \p a and the other to \p b.
1404
1407
  * \internal
1405
1408
  * \exception-guarantee no-throw guarantee
1406
1409
  * \behavior reentrant, thread safe
@@ -1416,7 +1419,7 @@ __CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b);
1416
1419
  * \param[in] a - half2. Is only being read.
1417
1420
  *
1418
1421
  * \returns half2
1419
- * \retval The half2 with both halves equal to the low 16 bits of the input.
1422
+ * - The half2 with both halves equal to the low 16 bits of the input.
1420
1423
  * \internal
1421
1424
  * \exception-guarantee no-throw guarantee
1422
1425
  * \behavior reentrant, thread safe
@@ -1432,7 +1435,7 @@ __CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a);
1432
1435
  * \param[in] a - half2. Is only being read.
1433
1436
  *
1434
1437
  * \returns half2
1435
- * \retval The half2 with both halves equal to the high 16 bits of the input.
1438
+ * - The half2 with both halves equal to the high 16 bits of the input.
1436
1439
  * \internal
1437
1440
  * \exception-guarantee no-throw guarantee
1438
1441
  * \behavior reentrant, thread safe
@@ -1449,7 +1452,7 @@ __CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a);
1449
1452
  * \param[in] h - half. Is only being read.
1450
1453
  *
1451
1454
  * \returns short int
1452
- * \retval The reinterpreted value.
1455
+ * - The reinterpreted value.
1453
1456
  * \internal
1454
1457
  * \exception-guarantee no-throw guarantee
1455
1458
  * \behavior reentrant, thread safe
@@ -1465,7 +1468,7 @@ __CUDA_FP16_DECL__ short int __half_as_short(const __half h);
1465
1468
  * \param[in] h - half. Is only being read.
1466
1469
  *
1467
1470
  * \returns unsigned short int
1468
- * \retval The reinterpreted value.
1471
+ * - The reinterpreted value.
1469
1472
  * \internal
1470
1473
  * \exception-guarantee no-throw guarantee
1471
1474
  * \behavior reentrant, thread safe
@@ -1481,7 +1484,7 @@ __CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h);
1481
1484
  * \param[in] i - short int. Is only being read.
1482
1485
  *
1483
1486
  * \returns half
1484
- * \retval The reinterpreted value.
1487
+ * - The reinterpreted value.
1485
1488
  * \internal
1486
1489
  * \exception-guarantee no-throw guarantee
1487
1490
  * \behavior reentrant, thread safe
@@ -1497,15 +1500,95 @@ __CUDA_FP16_DECL__ __half __short_as_half(const short int i);
1497
1500
  * \param[in] i - unsigned short int. Is only being read.
1498
1501
  *
1499
1502
  * \returns half
1500
- * \retval The reinterpreted value.
1503
+ * - The reinterpreted value.
1501
1504
  * \internal
1502
1505
  * \exception-guarantee no-throw guarantee
1503
1506
  * \behavior reentrant, thread safe
1504
1507
  * \endinternal
1505
1508
  */
1506
1509
  __CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i);
1510
+ /**
1511
+ * \ingroup CUDA_MATH__HALF_COMPARISON
1512
+ * \brief Calculates \p half maximum of two input values.
1513
+ *
1514
+ * \details Calculates \p half max(\p a, \p b)
1515
+ * defined as (\p a > \p b) ? \p a : \p b.
1516
+ * - If either of inputs is NaN, the other input is returned.
1517
+ * - If both inputs are NaNs, then canonical NaN is returned.
1518
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
1519
+ * \param[in] a - half. Is only being read.
1520
+ * \param[in] b - half. Is only being read.
1521
+ *
1522
+ * \returns half
1523
+ * \internal
1524
+ * \exception-guarantee no-throw guarantee
1525
+ * \behavior reentrant, thread safe
1526
+ * \endinternal
1527
+ */
1528
+ __CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b);
1529
+ /**
1530
+ * \ingroup CUDA_MATH__HALF_COMPARISON
1531
+ * \brief Calculates \p half minimum of two input values.
1532
+ *
1533
+ * \details Calculates \p half min(\p a, \p b)
1534
+ * defined as (\p a < \p b) ? \p a : \p b.
1535
+ * - If either of inputs is NaN, the other input is returned.
1536
+ * - If both inputs are NaNs, then canonical NaN is returned.
1537
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
1538
+ * \param[in] a - half. Is only being read.
1539
+ * \param[in] b - half. Is only being read.
1540
+ *
1541
+ * \returns half
1542
+ * \internal
1543
+ * \exception-guarantee no-throw guarantee
1544
+ * \behavior reentrant, thread safe
1545
+ * \endinternal
1546
+ */
1547
+ __CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b);
1548
+ /**
1549
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
1550
+ * \brief Calculates \p half2 vector maximum of two inputs.
1551
+ *
1552
+ * \details Calculates \p half2 vector max(\p a, \p b).
1553
+ * Elementwise \p half operation is defined as
1554
+ * (\p a > \p b) ? \p a : \p b.
1555
+ * - If either of inputs is NaN, the other input is returned.
1556
+ * - If both inputs are NaNs, then canonical NaN is returned.
1557
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
1558
+ * \param[in] a - half2. Is only being read.
1559
+ * \param[in] b - half2. Is only being read.
1560
+ *
1561
+ * \returns half2
1562
+ * - The result of elementwise maximum of vectors \p a and \p b
1563
+ * \internal
1564
+ * \exception-guarantee no-throw guarantee
1565
+ * \behavior reentrant, thread safe
1566
+ * \endinternal
1567
+ */
1568
+ __CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b);
1569
+ /**
1570
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
1571
+ * \brief Calculates \p half2 vector minimum of two inputs.
1572
+ *
1573
+ * \details Calculates \p half2 vector min(\p a, \p b).
1574
+ * Elementwise \p half operation is defined as
1575
+ * (\p a < \p b) ? \p a : \p b.
1576
+ * - If either of inputs is NaN, the other input is returned.
1577
+ * - If both inputs are NaNs, then canonical NaN is returned.
1578
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
1579
+ * \param[in] a - half2. Is only being read.
1580
+ * \param[in] b - half2. Is only being read.
1581
+ *
1582
+ * \returns half2
1583
+ * - The result of elementwise minimum of vectors \p a and \p b
1584
+ * \internal
1585
+ * \exception-guarantee no-throw guarantee
1586
+ * \behavior reentrant, thread safe
1587
+ * \endinternal
1588
+ */
1589
+ __CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b);
1507
1590
 
1508
- #if __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)
1591
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)
1509
1592
  #if !defined warpSize && !defined __local_warpSize
1510
1593
  #define warpSize 32
1511
1594
  #define __local_warpSize
@@ -1520,7 +1603,7 @@ __CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i);
1520
1603
  #endif
1521
1604
 
1522
1605
  #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
1523
- #define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
1606
+ #define __WSB_DEPRECATION_MESSAGE(x) __CUDA_FP16_STRINGIFY(x) "() is deprecated in favor of " __CUDA_FP16_STRINGIFY(x) "_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
1524
1607
 
1525
1608
  __CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half2 __shfl(const __half2 var, const int delta, const int width = warpSize);
1526
1609
  __CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width = warpSize);
@@ -1574,6 +1657,7 @@ __CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned mask, const __half2 var, c
1574
1657
  *
1575
1658
  * \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
1576
1659
  * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1660
+ * \note_ref_guide_warp_shuffle
1577
1661
  * \internal
1578
1662
  * \exception-guarantee no-throw guarantee
1579
1663
  * \behavior not reentrant, not thread safe
@@ -1598,6 +1682,7 @@ __CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned mask, const __half2 var
1598
1682
  *
1599
1683
  * \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
1600
1684
  * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1685
+ * \note_ref_guide_warp_shuffle
1601
1686
  * \internal
1602
1687
  * \exception-guarantee no-throw guarantee
1603
1688
  * \behavior not reentrant, not thread safe
@@ -1621,6 +1706,7 @@ __CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned mask, const __half2 v
1621
1706
  *
1622
1707
  * \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
1623
1708
  * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1709
+ * \note_ref_guide_warp_shuffle
1624
1710
  * \internal
1625
1711
  * \exception-guarantee no-throw guarantee
1626
1712
  * \behavior not reentrant, not thread safe
@@ -1645,6 +1731,7 @@ __CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 va
1645
1731
  *
1646
1732
  * \returns Returns the 2-byte word referenced by var from the source thread ID as half.
1647
1733
  * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1734
+ * \note_ref_guide_warp_shuffle
1648
1735
  * \internal
1649
1736
  * \exception-guarantee no-throw guarantee
1650
1737
  * \behavior not reentrant, not thread safe
@@ -1668,6 +1755,7 @@ __CUDA_FP16_DECL__ __half __shfl_sync(const unsigned mask, const __half var, con
1668
1755
  *
1669
1756
  * \returns Returns the 2-byte word referenced by var from the source thread ID as half.
1670
1757
  * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1758
+ * \note_ref_guide_warp_shuffle
1671
1759
  * \internal
1672
1760
  * \exception-guarantee no-throw guarantee
1673
1761
  * \behavior not reentrant, not thread safe
@@ -1692,6 +1780,7 @@ __CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned mask, const __half var,
1692
1780
  *
1693
1781
  * \returns Returns the 2-byte word referenced by var from the source thread ID as half.
1694
1782
  * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1783
+ * \note_ref_guide_warp_shuffle
1695
1784
  * \internal
1696
1785
  * \exception-guarantee no-throw guarantee
1697
1786
  * \behavior not reentrant, not thread safe
@@ -1715,6 +1804,7 @@ __CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned mask, const __half var
1715
1804
  *
1716
1805
  * \returns Returns the 2-byte word referenced by var from the source thread ID as half.
1717
1806
  * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1807
+ * \note_ref_guide_warp_shuffle
1718
1808
  * \internal
1719
1809
  * \exception-guarantee no-throw guarantee
1720
1810
  * \behavior not reentrant, not thread safe
@@ -1726,9 +1816,9 @@ __CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned mask, const __half var,
1726
1816
  #undef warpSize
1727
1817
  #undef __local_warpSize
1728
1818
  #endif
1729
- #endif /*__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__) */
1819
+ #endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) */
1730
1820
 
1731
- #if defined(__cplusplus) && ( __CUDA_ARCH__ >=320 || !defined(__CUDA_ARCH__) )
1821
+ #if defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) )
1732
1822
  /**
1733
1823
  * \ingroup CUDA_MATH__HALF_MISC
1734
1824
  * \brief Generates a `ld.global.nc` load instruction.
@@ -1869,9 +1959,9 @@ __CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value);
1869
1959
  * \param[in] value - the value to be stored
1870
1960
  */
1871
1961
  __CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value);
1872
- #endif /*defined(__cplusplus) && ( __CUDA_ARCH__ >=320 || !defined(__CUDA_ARCH__) )*/
1962
+ #endif /*defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) )*/
1873
1963
 
1874
- #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
1964
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
1875
1965
  /**
1876
1966
  * \ingroup CUDA_MATH__HALF2_COMPARISON
1877
1967
  * \brief Performs half2 vector if-equal comparison.
@@ -1883,7 +1973,7 @@ __CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value);
1883
1973
  * \param[in] b - half2. Is only being read.
1884
1974
  *
1885
1975
  * \returns half2
1886
- * \retval The vector result of if-equal comparison of vectors \p a and \p b.
1976
+ * - The vector result of if-equal comparison of vectors \p a and \p b.
1887
1977
  * \internal
1888
1978
  * \exception-guarantee no-throw guarantee
1889
1979
  * \behavior reentrant, thread safe
@@ -1901,7 +1991,7 @@ __CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b);
1901
1991
  * \param[in] b - half2. Is only being read.
1902
1992
  *
1903
1993
  * \returns half2
1904
- * \retval The vector result of not-equal comparison of vectors \p a and \p b.
1994
+ * - The vector result of not-equal comparison of vectors \p a and \p b.
1905
1995
  * \internal
1906
1996
  * \exception-guarantee no-throw guarantee
1907
1997
  * \behavior reentrant, thread safe
@@ -1919,7 +2009,7 @@ __CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b);
1919
2009
  * \param[in] b - half2. Is only being read.
1920
2010
  *
1921
2011
  * \returns half2
1922
- * \retval The \p half2 result of less-equal comparison of vectors \p a and \p b.
2012
+ * - The \p half2 result of less-equal comparison of vectors \p a and \p b.
1923
2013
  * \internal
1924
2014
  * \exception-guarantee no-throw guarantee
1925
2015
  * \behavior reentrant, thread safe
@@ -1937,7 +2027,7 @@ __CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b);
1937
2027
  * \param[in] b - half2. Is only being read.
1938
2028
  *
1939
2029
  * \returns half2
1940
- * \retval The vector result of greater-equal comparison of vectors \p a and \p b.
2030
+ * - The vector result of greater-equal comparison of vectors \p a and \p b.
1941
2031
  * \internal
1942
2032
  * \exception-guarantee no-throw guarantee
1943
2033
  * \behavior reentrant, thread safe
@@ -1955,7 +2045,7 @@ __CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b);
1955
2045
  * \param[in] b - half2. Is only being read.
1956
2046
  *
1957
2047
  * \returns half2
1958
- * \retval The half2 vector result of less-than comparison of vectors \p a and \p b.
2048
+ * - The half2 vector result of less-than comparison of vectors \p a and \p b.
1959
2049
  * \internal
1960
2050
  * \exception-guarantee no-throw guarantee
1961
2051
  * \behavior reentrant, thread safe
@@ -1973,7 +2063,7 @@ __CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b);
1973
2063
  * \param[in] b - half2. Is only being read.
1974
2064
  *
1975
2065
  * \returns half2
1976
- * \retval The vector result of greater-than comparison of vectors \p a and \p b.
2066
+ * - The vector result of greater-than comparison of vectors \p a and \p b.
1977
2067
  * \internal
1978
2068
  * \exception-guarantee no-throw guarantee
1979
2069
  * \behavior reentrant, thread safe
@@ -1991,7 +2081,7 @@ __CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b);
1991
2081
  * \param[in] b - half2. Is only being read.
1992
2082
  *
1993
2083
  * \returns half2
1994
- * \retval The vector result of unordered if-equal comparison of vectors \p a and \p b.
2084
+ * - The vector result of unordered if-equal comparison of vectors \p a and \p b.
1995
2085
  * \internal
1996
2086
  * \exception-guarantee no-throw guarantee
1997
2087
  * \behavior reentrant, thread safe
@@ -2009,7 +2099,7 @@ __CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b);
2009
2099
  * \param[in] b - half2. Is only being read.
2010
2100
  *
2011
2101
  * \returns half2
2012
- * \retval The vector result of unordered not-equal comparison of vectors \p a and \p b.
2102
+ * - The vector result of unordered not-equal comparison of vectors \p a and \p b.
2013
2103
  * \internal
2014
2104
  * \exception-guarantee no-throw guarantee
2015
2105
  * \behavior reentrant, thread safe
@@ -2027,7 +2117,7 @@ __CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b);
2027
2117
  * \param[in] b - half2. Is only being read.
2028
2118
  *
2029
2119
  * \returns half2
2030
- * \retval The vector result of unordered less-equal comparison of vectors \p a and \p b.
2120
+ * - The vector result of unordered less-equal comparison of vectors \p a and \p b.
2031
2121
  * \internal
2032
2122
  * \exception-guarantee no-throw guarantee
2033
2123
  * \behavior reentrant, thread safe
@@ -2045,7 +2135,7 @@ __CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b);
2045
2135
  * \param[in] b - half2. Is only being read.
2046
2136
  *
2047
2137
  * \returns half2
2048
- * \retval The \p half2 vector result of unordered greater-equal comparison of vectors \p a and \p b.
2138
+ * - The \p half2 vector result of unordered greater-equal comparison of vectors \p a and \p b.
2049
2139
  * \internal
2050
2140
  * \exception-guarantee no-throw guarantee
2051
2141
  * \behavior reentrant, thread safe
@@ -2063,7 +2153,7 @@ __CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b);
2063
2153
  * \param[in] b - half2. Is only being read.
2064
2154
  *
2065
2155
  * \returns half2
2066
- * \retval The vector result of unordered less-than comparison of vectors \p a and \p b.
2156
+ * - The vector result of unordered less-than comparison of vectors \p a and \p b.
2067
2157
  * \internal
2068
2158
  * \exception-guarantee no-throw guarantee
2069
2159
  * \behavior reentrant, thread safe
@@ -2081,7 +2171,7 @@ __CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b);
2081
2171
  * \param[in] b - half2. Is only being read.
2082
2172
  *
2083
2173
  * \returns half2
2084
- * \retval The \p half2 vector result of unordered greater-than comparison of vectors \p a and \p b.
2174
+ * - The \p half2 vector result of unordered greater-than comparison of vectors \p a and \p b.
2085
2175
  * \internal
2086
2176
  * \exception-guarantee no-throw guarantee
2087
2177
  * \behavior reentrant, thread safe
@@ -2096,7 +2186,7 @@ __CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b);
2096
2186
  * \param[in] a - half2. Is only being read.
2097
2187
  *
2098
2188
  * \returns half2
2099
- * \retval The half2 with the corresponding \p half results set to
2189
+ * - The half2 with the corresponding \p half results set to
2100
2190
  * 1.0 for NaN, 0.0 otherwise.
2101
2191
  * \internal
2102
2192
  * \exception-guarantee no-throw guarantee
@@ -2117,7 +2207,7 @@ __CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a);
2117
2207
  * \param[in] b - half2. Is only being read.
2118
2208
  *
2119
2209
  * \returns half2
2120
- * \retval The sum of vectors \p a and \p b.
2210
+ * - The sum of vectors \p a and \p b.
2121
2211
  * \internal
2122
2212
  * \exception-guarantee no-throw guarantee
2123
2213
  * \behavior reentrant, thread safe
@@ -2137,7 +2227,7 @@ __CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b);
2137
2227
  * \param[in] b - half2. Is only being read.
2138
2228
  *
2139
2229
  * \returns half2
2140
- * \retval The subtraction of vector \p b from \p a.
2230
+ * - The subtraction of vector \p b from \p a.
2141
2231
  * \internal
2142
2232
  * \exception-guarantee no-throw guarantee
2143
2233
  * \behavior reentrant, thread safe
@@ -2157,7 +2247,7 @@ __CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b);
2157
2247
  * \param[in] b - half2. Is only being read.
2158
2248
  *
2159
2249
  * \returns half2
2160
- * \retval The result of elementwise multiplying the vectors \p a and \p b.
2250
+ * - The result of elementwise multiplying the vectors \p a and \p b.
2161
2251
  * \internal
2162
2252
  * \exception-guarantee no-throw guarantee
2163
2253
  * \behavior reentrant, thread safe
@@ -2166,6 +2256,68 @@ __CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b);
2166
2256
  __CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b);
2167
2257
  /**
2168
2258
  * \ingroup CUDA_MATH__HALF2_ARITHMETIC
2259
+ * \brief Performs \p half2 vector addition in round-to-nearest-even mode.
2260
+ *
2261
+ * \details Performs \p half2 vector addition of inputs \p a and \p b, in round-to-nearest-even
2262
+ * mode. Prevents floating-point contractions of mul+add into fma.
2263
+ * \internal
2264
+ * \req DEEPLEARN-SRM_REQ-95
2265
+ * \endinternal
2266
+ * \param[in] a - half2. Is only being read.
2267
+ * \param[in] b - half2. Is only being read.
2268
+ *
2269
+ * \returns half2
2270
+ * - The sum of vectors \p a and \p b.
2271
+ * \internal
2272
+ * \exception-guarantee no-throw guarantee
2273
+ * \behavior reentrant, thread safe
2274
+ * \endinternal
2275
+ */
2276
+ __CUDA_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b);
2277
+ /**
2278
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
2279
+ * \brief Performs \p half2 vector subtraction in round-to-nearest-even mode.
2280
+ *
2281
+ * \details Subtracts \p half2 input vector \p b from input vector \p a in
2282
+ * round-to-nearest-even mode. Prevents floating-point contractions of mul+sub
2283
+ * into fma.
2284
+ * \internal
2285
+ * \req DEEPLEARN-SRM_REQ-104
2286
+ * \endinternal
2287
+ * \param[in] a - half2. Is only being read.
2288
+ * \param[in] b - half2. Is only being read.
2289
+ *
2290
+ * \returns half2
2291
+ * - The subtraction of vector \p b from \p a.
2292
+ * \internal
2293
+ * \exception-guarantee no-throw guarantee
2294
+ * \behavior reentrant, thread safe
2295
+ * \endinternal
2296
+ */
2297
+ __CUDA_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b);
2298
+ /**
2299
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
2300
+ * \brief Performs \p half2 vector multiplication in round-to-nearest-even mode.
2301
+ *
2302
+ * \details Performs \p half2 vector multiplication of inputs \p a and \p b, in
2303
+ * round-to-nearest-even mode. Prevents floating-point contractions of
2304
+ * mul+add or sub into fma.
2305
+ * \internal
2306
+ * \req DEEPLEARN-SRM_REQ-102
2307
+ * \endinternal
2308
+ * \param[in] a - half2. Is only being read.
2309
+ * \param[in] b - half2. Is only being read.
2310
+ *
2311
+ * \returns half2
2312
+ * - The result of elementwise multiplying the vectors \p a and \p b.
2313
+ * \internal
2314
+ * \exception-guarantee no-throw guarantee
2315
+ * \behavior reentrant, thread safe
2316
+ * \endinternal
2317
+ */
2318
+ __CUDA_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b);
2319
+ /**
2320
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
2169
2321
  * \brief Performs \p half2 vector division in round-to-nearest-even mode.
2170
2322
  *
2171
2323
  * \details Divides \p half2 input vector \p a by input vector \p b in round-to-nearest
@@ -2177,7 +2329,7 @@ __CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b);
2177
2329
  * \param[in] b - half2. Is only being read.
2178
2330
  *
2179
2331
  * \returns half2
2180
- * \retval The elementwise division of \p a with \p b.
2332
+ * - The elementwise division of \p a with \p b.
2181
2333
  * \internal
2182
2334
  * \exception-guarantee no-throw guarantee
2183
2335
  * \behavior reentrant, thread safe
@@ -2194,7 +2346,7 @@ __CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b);
2194
2346
  * \param[in] a - half2. Is only being read.
2195
2347
  *
2196
2348
  * \returns half2
2197
- * \retval Returns \p a with the absolute value of both halves.
2349
+ * - Returns \p a with the absolute value of both halves.
2198
2350
  * \internal
2199
2351
  * \exception-guarantee no-throw guarantee
2200
2352
  * \behavior reentrant, thread safe
@@ -2213,7 +2365,7 @@ __CUDA_FP16_DECL__ __half2 __habs2(const __half2 a);
2213
2365
  * \param[in] b - half2. Is only being read.
2214
2366
  *
2215
2367
  * \returns half2
2216
- * \retval The sum of \p a and \p b, with respect to saturation.
2368
+ * - The sum of \p a and \p b, with respect to saturation.
2217
2369
  * \internal
2218
2370
  * \exception-guarantee no-throw guarantee
2219
2371
  * \behavior reentrant, thread safe
@@ -2232,7 +2384,7 @@ __CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b);
2232
2384
  * \param[in] b - half2. Is only being read.
2233
2385
  *
2234
2386
  * \returns half2
2235
- * \retval The subtraction of vector \p b from \p a, with respect to saturation.
2387
+ * - The subtraction of vector \p b from \p a, with respect to saturation.
2236
2388
  * \internal
2237
2389
  * \exception-guarantee no-throw guarantee
2238
2390
  * \behavior reentrant, thread safe
@@ -2251,7 +2403,7 @@ __CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b);
2251
2403
  * \param[in] b - half2. Is only being read.
2252
2404
  *
2253
2405
  * \returns half2
2254
- * \retval The result of elementwise multiplication of vectors \p a and \p b,
2406
+ * - The result of elementwise multiplication of vectors \p a and \p b,
2255
2407
  * with respect to saturation.
2256
2408
  * \internal
2257
2409
  * \exception-guarantee no-throw guarantee
@@ -2275,7 +2427,7 @@ __CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b);
2275
2427
  * \param[in] c - half2. Is only being read.
2276
2428
  *
2277
2429
  * \returns half2
2278
- * \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c.
2430
+ * - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c.
2279
2431
  * \internal
2280
2432
  * \exception-guarantee no-throw guarantee
2281
2433
  * \behavior reentrant, thread safe
@@ -2296,7 +2448,7 @@ __CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __hal
2296
2448
  * \param[in] c - half2. Is only being read.
2297
2449
  *
2298
2450
  * \returns half2
2299
- * \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c,
2451
+ * - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c,
2300
2452
  * with respect to saturation.
2301
2453
  * \internal
2302
2454
  * \exception-guarantee no-throw guarantee
@@ -2316,7 +2468,7 @@ __CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const _
2316
2468
  * \param[in] a - half2. Is only being read.
2317
2469
  *
2318
2470
  * \returns half2
2319
- * \retval Returns \p a with both halves negated.
2471
+ * - Returns \p a with both halves negated.
2320
2472
  * \internal
2321
2473
  * \exception-guarantee no-throw guarantee
2322
2474
  * \behavior reentrant, thread safe
@@ -2331,7 +2483,7 @@ __CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a);
2331
2483
  * \param[in] a - half. Is only being read.
2332
2484
  *
2333
2485
  * \returns half
2334
- * \retval The absolute value of a.
2486
+ * - The absolute value of \p a.
2335
2487
  * \internal
2336
2488
  * \exception-guarantee no-throw guarantee
2337
2489
  * \behavior reentrant, thread safe
@@ -2351,7 +2503,7 @@ __CUDA_FP16_DECL__ __half __habs(const __half a);
2351
2503
  * \param[in] b - half. Is only being read.
2352
2504
  *
2353
2505
  * \returns half
2354
- * \retval The sum of \p a and \p b.
2506
+ * - The sum of \p a and \p b.
2355
2507
  * \internal
2356
2508
  * \exception-guarantee no-throw guarantee
2357
2509
  * \behavior reentrant, thread safe
@@ -2371,7 +2523,7 @@ __CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b);
2371
2523
  * \param[in] b - half. Is only being read.
2372
2524
  *
2373
2525
  * \returns half
2374
- * \retval The result of subtracting \p b from \p a.
2526
+ * - The result of subtracting \p b from \p a.
2375
2527
  * \internal
2376
2528
  * \exception-guarantee no-throw guarantee
2377
2529
  * \behavior reentrant, thread safe
@@ -2391,11 +2543,67 @@ __CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b);
2391
2543
  * \param[in] b - half. Is only being read.
2392
2544
  *
2393
2545
  * \returns half
2394
- * \retval The result of multiplying \p a and \p b.
2546
+ * - The result of multiplying \p a and \p b.
2395
2547
  */
2396
2548
  __CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b);
2397
2549
  /**
2398
2550
  * \ingroup CUDA_MATH__HALF_ARITHMETIC
2551
+ * \brief Performs \p half addition in round-to-nearest-even mode.
2552
+ *
2553
+ * \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even
2554
+ * mode. Prevents floating-point contractions of mul+add into fma.
2555
+ * \internal
2556
+ * \req DEEPLEARN-SRM_REQ-94
2557
+ * \endinternal
2558
+ * \param[in] a - half. Is only being read.
2559
+ * \param[in] b - half. Is only being read.
2560
+ *
2561
+ * \returns half
2562
+ * - The sum of \p a and \p b.
2563
+ * \internal
2564
+ * \exception-guarantee no-throw guarantee
2565
+ * \behavior reentrant, thread safe
2566
+ * \endinternal
2567
+ */
2568
+ __CUDA_FP16_DECL__ __half __hadd_rn(const __half a, const __half b);
2569
+ /**
2570
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
2571
+ * \brief Performs \p half subtraction in round-to-nearest-even mode.
2572
+ *
2573
+ * \details Subtracts \p half input \p b from input \p a in round-to-nearest-even
2574
+ * mode. Prevents floating-point contractions of mul+sub into fma.
2575
+ * \internal
2576
+ * \req DEEPLEARN-SRM_REQ-97
2577
+ * \endinternal
2578
+ * \param[in] a - half. Is only being read.
2579
+ * \param[in] b - half. Is only being read.
2580
+ *
2581
+ * \returns half
2582
+ * - The result of subtracting \p b from \p a.
2583
+ * \internal
2584
+ * \exception-guarantee no-throw guarantee
2585
+ * \behavior reentrant, thread safe
2586
+ * \endinternal
2587
+ */
2588
+ __CUDA_FP16_DECL__ __half __hsub_rn(const __half a, const __half b);
2589
+ /**
2590
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
2591
+ * \brief Performs \p half multiplication in round-to-nearest-even mode.
2592
+ *
2593
+ * \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even
2594
+ * mode. Prevents floating-point contractions of mul+add or sub into fma.
2595
+ * \internal
2596
+ * \req DEEPLEARN-SRM_REQ-99
2597
+ * \endinternal
2598
+ * \param[in] a - half. Is only being read.
2599
+ * \param[in] b - half. Is only being read.
2600
+ *
2601
+ * \returns half
2602
+ * - The result of multiplying \p a and \p b.
2603
+ */
2604
+ __CUDA_FP16_DECL__ __half __hmul_rn(const __half a, const __half b);
2605
+ /**
2606
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
2399
2607
  * \brief Performs \p half division in round-to-nearest-even mode.
2400
2608
  *
2401
2609
  * \details Divides \p half input \p a by input \p b in round-to-nearest
@@ -2407,7 +2615,7 @@ __CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b);
2407
2615
  * \param[in] b - half. Is only being read.
2408
2616
  *
2409
2617
  * \returns half
2410
- * \retval The result of dividing \p a by \p b.
2618
+ * - The result of dividing \p a by \p b.
2411
2619
  * \internal
2412
2620
  * \exception-guarantee no-throw guarantee
2413
2621
  * \behavior reentrant, thread safe
@@ -2425,7 +2633,7 @@ __CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b);
2425
2633
  * \param[in] b - half. Is only being read.
2426
2634
  *
2427
2635
  * \returns half
2428
- * \retval The sum of \p a and \p b, with respect to saturation.
2636
+ * - The sum of \p a and \p b, with respect to saturation.
2429
2637
  * \internal
2430
2638
  * \exception-guarantee no-throw guarantee
2431
2639
  * \behavior reentrant, thread safe
@@ -2444,7 +2652,7 @@ __CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b);
2444
2652
  * \param[in] b - half. Is only being read.
2445
2653
  *
2446
2654
  * \returns half
2447
- * \retval The result of subtraction of \p b from \p a, with respect to saturation.
2655
+ * - The result of subtraction of \p b from \p a, with respect to saturation.
2448
2656
  * \internal
2449
2657
  * \exception-guarantee no-throw guarantee
2450
2658
  * \behavior reentrant, thread safe
@@ -2463,7 +2671,7 @@ __CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b);
2463
2671
  * \param[in] b - half. Is only being read.
2464
2672
  *
2465
2673
  * \returns half
2466
- * \retval The result of multiplying \p a and \p b, with respect to saturation.
2674
+ * - The result of multiplying \p a and \p b, with respect to saturation.
2467
2675
  * \internal
2468
2676
  * \exception-guarantee no-throw guarantee
2469
2677
  * \behavior reentrant, thread safe
@@ -2485,7 +2693,7 @@ __CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b);
2485
2693
  * \param[in] c - half. Is only being read.
2486
2694
  *
2487
2695
  * \returns half
2488
- * \retval The result of fused multiply-add operation on \p
2696
+ * - The result of fused multiply-add operation on \p
2489
2697
  * a, \p b, and \p c.
2490
2698
  * \internal
2491
2699
  * \exception-guarantee no-throw guarantee
@@ -2507,7 +2715,7 @@ __CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c)
2507
2715
  * \param[in] c - half. Is only being read.
2508
2716
  *
2509
2717
  * \returns half
2510
- * \retval The result of fused multiply-add operation on \p
2718
+ * - The result of fused multiply-add operation on \p
2511
2719
  * a, \p b, and \p c, with respect to saturation.
2512
2720
  * \internal
2513
2721
  * \exception-guarantee no-throw guarantee
@@ -2526,7 +2734,7 @@ __CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __hal
2526
2734
  * \param[in] a - half. Is only being read.
2527
2735
  *
2528
2736
  * \returns half
2529
- * \retval minus a
2737
+ * - minus \p a
2530
2738
  * \internal
2531
2739
  * \exception-guarantee no-throw guarantee
2532
2740
  * \behavior reentrant, thread safe
@@ -2546,9 +2754,9 @@ __CUDA_FP16_DECL__ __half __hneg(const __half a);
2546
2754
  * \param[in] b - half2. Is only being read.
2547
2755
  *
2548
2756
  * \returns bool
2549
- * \retval true if both \p half results of if-equal comparison
2757
+ * - true if both \p half results of if-equal comparison
2550
2758
  * of vectors \p a and \p b are true;
2551
- * \retval false otherwise.
2759
+ * - false otherwise.
2552
2760
  * \internal
2553
2761
  * \exception-guarantee no-throw guarantee
2554
2762
  * \behavior reentrant, thread safe
@@ -2568,9 +2776,9 @@ __CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b);
2568
2776
  * \param[in] b - half2. Is only being read.
2569
2777
  *
2570
2778
  * \returns bool
2571
- * \retval true if both \p half results of not-equal comparison
2779
+ * - true if both \p half results of not-equal comparison
2572
2780
  * of vectors \p a and \p b are true,
2573
- * \retval false otherwise.
2781
+ * - false otherwise.
2574
2782
  * \internal
2575
2783
  * \exception-guarantee no-throw guarantee
2576
2784
  * \behavior reentrant, thread safe
@@ -2590,9 +2798,9 @@ __CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b);
2590
2798
  * \param[in] b - half2. Is only being read.
2591
2799
  *
2592
2800
  * \returns bool
2593
- * \retval true if both \p half results of less-equal comparison
2801
+ * - true if both \p half results of less-equal comparison
2594
2802
  * of vectors \p a and \p b are true;
2595
- * \retval false otherwise.
2803
+ * - false otherwise.
2596
2804
  * \internal
2597
2805
  * \exception-guarantee no-throw guarantee
2598
2806
  * \behavior reentrant, thread safe
@@ -2612,9 +2820,9 @@ __CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b);
2612
2820
  * \param[in] b - half2. Is only being read.
2613
2821
  *
2614
2822
  * \returns bool
2615
- * \retval true if both \p half results of greater-equal
2823
+ * - true if both \p half results of greater-equal
2616
2824
  * comparison of vectors \p a and \p b are true;
2617
- * \retval false otherwise.
2825
+ * - false otherwise.
2618
2826
  * \internal
2619
2827
  * \exception-guarantee no-throw guarantee
2620
2828
  * \behavior reentrant, thread safe
@@ -2634,9 +2842,9 @@ __CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b);
2634
2842
  * \param[in] b - half2. Is only being read.
2635
2843
  *
2636
2844
  * \returns bool
2637
- * \retval true if both \p half results of less-than comparison
2845
+ * - true if both \p half results of less-than comparison
2638
2846
  * of vectors \p a and \p b are true;
2639
- * \retval false otherwise.
2847
+ * - false otherwise.
2640
2848
  * \internal
2641
2849
  * \exception-guarantee no-throw guarantee
2642
2850
  * \behavior reentrant, thread safe
@@ -2656,9 +2864,9 @@ __CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b);
2656
2864
  * \param[in] b - half2. Is only being read.
2657
2865
  *
2658
2866
  * \returns bool
2659
- * \retval true if both \p half results of greater-than
2867
+ * - true if both \p half results of greater-than
2660
2868
  * comparison of vectors \p a and \p b are true;
2661
- * \retval false otherwise.
2869
+ * - false otherwise.
2662
2870
  * \internal
2663
2871
  * \exception-guarantee no-throw guarantee
2664
2872
  * \behavior reentrant, thread safe
@@ -2678,9 +2886,9 @@ __CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b);
2678
2886
  * \param[in] b - half2. Is only being read.
2679
2887
  *
2680
2888
  * \returns bool
2681
- * \retval true if both \p half results of unordered if-equal
2889
+ * - true if both \p half results of unordered if-equal
2682
2890
  * comparison of vectors \p a and \p b are true;
2683
- * \retval false otherwise.
2891
+ * - false otherwise.
2684
2892
  * \internal
2685
2893
  * \exception-guarantee no-throw guarantee
2686
2894
  * \behavior reentrant, thread safe
@@ -2700,9 +2908,9 @@ __CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b);
2700
2908
  * \param[in] b - half2. Is only being read.
2701
2909
  *
2702
2910
  * \returns bool
2703
- * \retval true if both \p half results of unordered not-equal
2911
+ * - true if both \p half results of unordered not-equal
2704
2912
  * comparison of vectors \p a and \p b are true;
2705
- * \retval false otherwise.
2913
+ * - false otherwise.
2706
2914
  * \internal
2707
2915
  * \exception-guarantee no-throw guarantee
2708
2916
  * \behavior reentrant, thread safe
@@ -2722,9 +2930,9 @@ __CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b);
2722
2930
  * \param[in] b - half2. Is only being read.
2723
2931
  *
2724
2932
  * \returns bool
2725
- * \retval true if both \p half results of unordered less-equal
2933
+ * - true if both \p half results of unordered less-equal
2726
2934
  * comparison of vectors \p a and \p b are true;
2727
- * \retval false otherwise.
2935
+ * - false otherwise.
2728
2936
  * \internal
2729
2937
  * \exception-guarantee no-throw guarantee
2730
2938
  * \behavior reentrant, thread safe
@@ -2745,9 +2953,9 @@ __CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b);
2745
2953
  * \param[in] b - half2. Is only being read.
2746
2954
  *
2747
2955
  * \returns bool
2748
- * \retval true if both \p half results of unordered
2956
+ * - true if both \p half results of unordered
2749
2957
  * greater-equal comparison of vectors \p a and \p b are true;
2750
- * \retval false otherwise.
2958
+ * - false otherwise.
2751
2959
  * \internal
2752
2960
  * \exception-guarantee no-throw guarantee
2753
2961
  * \behavior reentrant, thread safe
@@ -2767,9 +2975,9 @@ __CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b);
2767
2975
  * \param[in] b - half2. Is only being read.
2768
2976
  *
2769
2977
  * \returns bool
2770
- * \retval true if both \p half results of unordered less-than comparison of
2978
+ * - true if both \p half results of unordered less-than comparison of
2771
2979
  * vectors \p a and \p b are true;
2772
- * \retval false otherwise.
2980
+ * - false otherwise.
2773
2981
  * \internal
2774
2982
  * \exception-guarantee no-throw guarantee
2775
2983
  * \behavior reentrant, thread safe
@@ -2790,9 +2998,9 @@ __CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b);
2790
2998
  * \param[in] b - half2. Is only being read.
2791
2999
  *
2792
3000
  * \returns bool
2793
- * \retval true if both \p half results of unordered
3001
+ * - true if both \p half results of unordered
2794
3002
  * greater-than comparison of vectors \p a and \p b are true;
2795
- * \retval false otherwise.
3003
+ * - false otherwise.
2796
3004
  * \internal
2797
3005
  * \exception-guarantee no-throw guarantee
2798
3006
  * \behavior reentrant, thread safe
@@ -2809,7 +3017,7 @@ __CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b);
2809
3017
  * \param[in] b - half. Is only being read.
2810
3018
  *
2811
3019
  * \returns bool
2812
- * \retval The boolean result of if-equal comparison of \p a and \p b.
3020
+ * - The boolean result of if-equal comparison of \p a and \p b.
2813
3021
  * \internal
2814
3022
  * \exception-guarantee no-throw guarantee
2815
3023
  * \behavior reentrant, thread safe
@@ -2826,7 +3034,7 @@ __CUDA_FP16_DECL__ bool __heq(const __half a, const __half b);
2826
3034
  * \param[in] b - half. Is only being read.
2827
3035
  *
2828
3036
  * \returns bool
2829
- * \retval The boolean result of not-equal comparison of \p a and \p b.
3037
+ * - The boolean result of not-equal comparison of \p a and \p b.
2830
3038
  * \internal
2831
3039
  * \exception-guarantee no-throw guarantee
2832
3040
  * \behavior reentrant, thread safe
@@ -2843,7 +3051,7 @@ __CUDA_FP16_DECL__ bool __hne(const __half a, const __half b);
2843
3051
  * \param[in] b - half. Is only being read.
2844
3052
  *
2845
3053
  * \returns bool
2846
- * \retval The boolean result of less-equal comparison of \p a and \p b.
3054
+ * - The boolean result of less-equal comparison of \p a and \p b.
2847
3055
  * \internal
2848
3056
  * \exception-guarantee no-throw guarantee
2849
3057
  * \behavior reentrant, thread safe
@@ -2860,7 +3068,7 @@ __CUDA_FP16_DECL__ bool __hle(const __half a, const __half b);
2860
3068
  * \param[in] b - half. Is only being read.
2861
3069
  *
2862
3070
  * \returns bool
2863
- * \retval The boolean result of greater-equal comparison of \p a and \p b.
3071
+ * - The boolean result of greater-equal comparison of \p a and \p b.
2864
3072
  * \internal
2865
3073
  * \exception-guarantee no-throw guarantee
2866
3074
  * \behavior reentrant, thread safe
@@ -2877,7 +3085,7 @@ __CUDA_FP16_DECL__ bool __hge(const __half a, const __half b);
2877
3085
  * \param[in] b - half. Is only being read.
2878
3086
  *
2879
3087
  * \returns bool
2880
- * \retval The boolean result of less-than comparison of \p a and \p b.
3088
+ * - The boolean result of less-than comparison of \p a and \p b.
2881
3089
  * \internal
2882
3090
  * \exception-guarantee no-throw guarantee
2883
3091
  * \behavior reentrant, thread safe
@@ -2894,7 +3102,7 @@ __CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b);
2894
3102
  * \param[in] b - half. Is only being read.
2895
3103
  *
2896
3104
  * \returns bool
2897
- * \retval The boolean result of greater-than comparison of \p a and \p b.
3105
+ * - The boolean result of greater-than comparison of \p a and \p b.
2898
3106
  * \internal
2899
3107
  * \exception-guarantee no-throw guarantee
2900
3108
  * \behavior reentrant, thread safe
@@ -2911,7 +3119,7 @@ __CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b);
2911
3119
  * \param[in] b - half. Is only being read.
2912
3120
  *
2913
3121
  * \returns bool
2914
- * \retval The boolean result of unordered if-equal comparison of \p a and
3122
+ * - The boolean result of unordered if-equal comparison of \p a and
2915
3123
  * \p b.
2916
3124
  * \internal
2917
3125
  * \exception-guarantee no-throw guarantee
@@ -2929,7 +3137,7 @@ __CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b);
2929
3137
  * \param[in] b - half. Is only being read.
2930
3138
  *
2931
3139
  * \returns bool
2932
- * \retval The boolean result of unordered not-equal comparison of \p a and
3140
+ * - The boolean result of unordered not-equal comparison of \p a and
2933
3141
  * \p b.
2934
3142
  * \internal
2935
3143
  * \exception-guarantee no-throw guarantee
@@ -2947,7 +3155,7 @@ __CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b);
2947
3155
  * \param[in] b - half. Is only being read.
2948
3156
  *
2949
3157
  * \returns bool
2950
- * \retval The boolean result of unordered less-equal comparison of \p a and
3158
+ * - The boolean result of unordered less-equal comparison of \p a and
2951
3159
  * \p b.
2952
3160
  * \internal
2953
3161
  * \exception-guarantee no-throw guarantee
@@ -2965,7 +3173,7 @@ __CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b);
2965
3173
  * \param[in] b - half. Is only being read.
2966
3174
  *
2967
3175
  * \returns bool
2968
- * \retval The boolean result of unordered greater-equal comparison of \p a
3176
+ * - The boolean result of unordered greater-equal comparison of \p a
2969
3177
  * and \p b.
2970
3178
  * \internal
2971
3179
  * \exception-guarantee no-throw guarantee
@@ -2983,7 +3191,7 @@ __CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b);
2983
3191
  * \param[in] b - half. Is only being read.
2984
3192
  *
2985
3193
  * \returns bool
2986
- * \retval The boolean result of unordered less-than comparison of \p a and
3194
+ * - The boolean result of unordered less-than comparison of \p a and
2987
3195
  * \p b.
2988
3196
  * \internal
2989
3197
  * \exception-guarantee no-throw guarantee
@@ -3001,7 +3209,7 @@ __CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b);
3001
3209
  * \param[in] b - half. Is only being read.
3002
3210
  *
3003
3211
  * \returns bool
3004
- * \retval The boolean result of unordered greater-than comparison of \p a
3212
+ * - The boolean result of unordered greater-than comparison of \p a
3005
3213
  * and \p b.
3006
3214
  * \internal
3007
3215
  * \exception-guarantee no-throw guarantee
@@ -3017,52 +3225,14 @@ __CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b);
3017
3225
  * \param[in] a - half. Is only being read.
3018
3226
  *
3019
3227
  * \returns bool
3020
- * \retval true iff argument is NaN.
3228
+ * - true iff argument is NaN.
3021
3229
  * \internal
3022
3230
  * \exception-guarantee no-throw guarantee
3023
3231
  * \behavior reentrant, thread safe
3024
3232
  * \endinternal
3025
3233
  */
3026
3234
  __CUDA_FP16_DECL__ bool __hisnan(const __half a);
3027
- #if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)
3028
- /**
3029
- * \ingroup CUDA_MATH__HALF_COMPARISON
3030
- * \brief Calculates \p half maximum of two input values.
3031
- *
3032
- * \details Calculates \p half max(\p a, \p b)
3033
- * defined as (\p a > \p b) ? \p a : \p b.
3034
- * - If either of inputs is NaN, the other input is returned.
3035
- * - If both inputs are NaNs, then canonical NaN is returned.
3036
- * - If values of both inputs are 0.0, then +0.0 > -0.0
3037
- * \param[in] a - half. Is only being read.
3038
- * \param[in] b - half. Is only being read.
3039
- *
3040
- * \returns half
3041
- * \internal
3042
- * \exception-guarantee no-throw guarantee
3043
- * \behavior reentrant, thread safe
3044
- * \endinternal
3045
- */
3046
- __CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b);
3047
- /**
3048
- * \ingroup CUDA_MATH__HALF_COMPARISON
3049
- * \brief Calculates \p half minimum of two input values.
3050
- *
3051
- * \details Calculates \p half min(\p a, \p b)
3052
- * defined as (\p a < \p b) ? \p a : \p b.
3053
- * - If either of inputs is NaN, the other input is returned.
3054
- * - If both inputs are NaNs, then canonical NaN is returned.
3055
- * - If values of both inputs are 0.0, then +0.0 > -0.0
3056
- * \param[in] a - half. Is only being read.
3057
- * \param[in] b - half. Is only being read.
3058
- *
3059
- * \returns half
3060
- * \internal
3061
- * \exception-guarantee no-throw guarantee
3062
- * \behavior reentrant, thread safe
3063
- * \endinternal
3064
- */
3065
- __CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b);
3235
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
3066
3236
  /**
3067
3237
  * \ingroup CUDA_MATH__HALF_COMPARISON
3068
3238
  * \brief Calculates \p half maximum of two input values, NaNs pass through.
@@ -3113,7 +3283,7 @@ __CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b);
3113
3283
  * \param[in] c - half. Is only being read.
3114
3284
  *
3115
3285
  * \returns half
3116
- * \retval The result of fused multiply-add operation on \p
3286
+ * - The result of fused multiply-add operation on \p
3117
3287
  * a, \p b, and \p c with relu saturation.
3118
3288
  * \internal
3119
3289
  * \exception-guarantee no-throw guarantee
@@ -3123,48 +3293,6 @@ __CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b);
3123
3293
  __CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c);
3124
3294
  /**
3125
3295
  * \ingroup CUDA_MATH__HALF2_COMPARISON
3126
- * \brief Calculates \p half2 vector maximum of two inputs.
3127
- *
3128
- * \details Calculates \p half2 vector max(\p a, \p b).
3129
- * Elementwise \p half operation is defined as
3130
- * (\p a > \p b) ? \p a : \p b.
3131
- * - If either of inputs is NaN, the other input is returned.
3132
- * - If both inputs are NaNs, then canonical NaN is returned.
3133
- * - If values of both inputs are 0.0, then +0.0 > -0.0
3134
- * \param[in] a - half2. Is only being read.
3135
- * \param[in] b - half2. Is only being read.
3136
- *
3137
- * \returns half2
3138
- * \retval The result of elementwise maximum of vectors \p a and \p b
3139
- * \internal
3140
- * \exception-guarantee no-throw guarantee
3141
- * \behavior reentrant, thread safe
3142
- * \endinternal
3143
- */
3144
- __CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b);
3145
- /**
3146
- * \ingroup CUDA_MATH__HALF2_COMPARISON
3147
- * \brief Calculates \p half2 vector minimum of two inputs.
3148
- *
3149
- * \details Calculates \p half2 vector min(\p a, \p b).
3150
- * Elementwise \p half operation is defined as
3151
- * (\p a < \p b) ? \p a : \p b.
3152
- * - If either of inputs is NaN, the other input is returned.
3153
- * - If both inputs are NaNs, then canonical NaN is returned.
3154
- * - If values of both inputs are 0.0, then +0.0 > -0.0
3155
- * \param[in] a - half2. Is only being read.
3156
- * \param[in] b - half2. Is only being read.
3157
- *
3158
- * \returns half2
3159
- * \retval The result of elementwise minimum of vectors \p a and \p b
3160
- * \internal
3161
- * \exception-guarantee no-throw guarantee
3162
- * \behavior reentrant, thread safe
3163
- * \endinternal
3164
- */
3165
- __CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b);
3166
- /**
3167
- * \ingroup CUDA_MATH__HALF2_COMPARISON
3168
3296
  * \brief Calculates \p half2 vector maximum of two inputs, NaNs pass through.
3169
3297
  *
3170
3298
  * \details Calculates \p half2 vector max(\p a, \p b).
@@ -3176,7 +3304,7 @@ __CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b);
3176
3304
  * \param[in] b - half2. Is only being read.
3177
3305
  *
3178
3306
  * \returns half2
3179
- * \retval The result of elementwise maximum of vectors \p a and \p b, with NaNs pass through
3307
+ * - The result of elementwise maximum of vectors \p a and \p b, with NaNs passed through
3180
3308
  * \internal
3181
3309
  * \exception-guarantee no-throw guarantee
3182
3310
  * \behavior reentrant, thread safe
@@ -3196,7 +3324,7 @@ __CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b);
3196
3324
  * \param[in] b - half2. Is only being read.
3197
3325
  *
3198
3326
  * \returns half2
3199
- * \retval The result of elementwise minimum of vectors \p a and \p b, with NaNs pass through
3327
+ * - The result of elementwise minimum of vectors \p a and \p b, with NaNs passed through
3200
3328
  * \internal
3201
3329
  * \exception-guarantee no-throw guarantee
3202
3330
  * \behavior reentrant, thread safe
@@ -3218,14 +3346,14 @@ __CUDA_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b);
3218
3346
  * \param[in] c - half2. Is only being read.
3219
3347
  *
3220
3348
  * \returns half2
3221
- * \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation.
3349
+ * - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation.
3222
3350
  * \internal
3223
3351
  * \exception-guarantee no-throw guarantee
3224
3352
  * \behavior reentrant, thread safe
3225
3353
  * \endinternal
3226
3354
  */
3227
3355
  __CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c);
3228
- #endif /*__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)*/
3356
+ #endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800) */
3229
3357
  /**
3230
3358
  * \ingroup CUDA_MATH__HALF2_ARITHMETIC
3231
3359
  * \brief Performs fast complex multiply-accumulate
@@ -3238,7 +3366,7 @@ __CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const
3238
3366
  * \param[in] c - half2. Is only being read.
3239
3367
  *
3240
3368
  * \returns half2
3241
- * \retval The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c
3369
+ * - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c
3242
3370
  * \internal
3243
3371
  * \exception-guarantee no-throw guarantee
3244
3372
  * \behavior reentrant, thread safe
@@ -3253,7 +3381,7 @@ __CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __ha
3253
3381
  * \param[in] a - half. Is only being read.
3254
3382
  *
3255
3383
  * \returns half
3256
- * \retval The square root of \p a.
3384
+ * - The square root of \p a.
3257
3385
  * \internal
3258
3386
  * \exception-guarantee no-throw guarantee
3259
3387
  * \behavior reentrant, thread safe
@@ -3270,7 +3398,7 @@ __CUDA_FP16_DECL__ __half hsqrt(const __half a);
3270
3398
  * \param[in] a - half. Is only being read.
3271
3399
  *
3272
3400
  * \returns half
3273
- * \retval The reciprocal square root of \p a.
3401
+ * - The reciprocal square root of \p a.
3274
3402
  * \internal
3275
3403
  * \exception-guarantee no-throw guarantee
3276
3404
  * \behavior reentrant, thread safe
@@ -3285,7 +3413,7 @@ __CUDA_FP16_DECL__ __half hrsqrt(const __half a);
3285
3413
  * \param[in] a - half. Is only being read.
3286
3414
  *
3287
3415
  * \returns half
3288
- * \retval The reciprocal of \p a.
3416
+ * - The reciprocal of \p a.
3289
3417
  * \internal
3290
3418
  * \exception-guarantee no-throw guarantee
3291
3419
  * \behavior reentrant, thread safe
@@ -3301,7 +3429,7 @@ __CUDA_FP16_DECL__ __half hrcp(const __half a);
3301
3429
  * \param[in] a - half. Is only being read.
3302
3430
  *
3303
3431
  * \returns half
3304
- * \retval The natural logarithm of \p a.
3432
+ * - The natural logarithm of \p a.
3305
3433
  * \internal
3306
3434
  * \exception-guarantee no-throw guarantee
3307
3435
  * \behavior reentrant, thread safe
@@ -3317,7 +3445,7 @@ __CUDA_FP16_DECL__ __half hlog(const __half a);
3317
3445
  * \param[in] a - half. Is only being read.
3318
3446
  *
3319
3447
  * \returns half
3320
- * \retval The binary logarithm of \p a.
3448
+ * - The binary logarithm of \p a.
3321
3449
  * \internal
3322
3450
  * \exception-guarantee no-throw guarantee
3323
3451
  * \behavior reentrant, thread safe
@@ -3333,7 +3461,7 @@ __CUDA_FP16_DECL__ __half hlog2(const __half a);
3333
3461
  * \param[in] a - half. Is only being read.
3334
3462
  *
3335
3463
  * \returns half
3336
- * \retval The decimal logarithm of \p a.
3464
+ * - The decimal logarithm of \p a.
3337
3465
  * \internal
3338
3466
  * \exception-guarantee no-throw guarantee
3339
3467
  * \behavior reentrant, thread safe
@@ -3350,7 +3478,7 @@ __CUDA_FP16_DECL__ __half hlog10(const __half a);
3350
3478
  * \param[in] a - half. Is only being read.
3351
3479
  *
3352
3480
  * \returns half
3353
- * \retval The natural exponential function on \p a.
3481
+ * - The natural exponential function on \p a.
3354
3482
  * \internal
3355
3483
  * \exception-guarantee no-throw guarantee
3356
3484
  * \behavior reentrant, thread safe
@@ -3367,7 +3495,7 @@ __CUDA_FP16_DECL__ __half hexp(const __half a);
3367
3495
  * \param[in] a - half. Is only being read.
3368
3496
  *
3369
3497
  * \returns half
3370
- * \retval The binary exponential function on \p a.
3498
+ * - The binary exponential function on \p a.
3371
3499
  * \internal
3372
3500
  * \exception-guarantee no-throw guarantee
3373
3501
  * \behavior reentrant, thread safe
@@ -3384,7 +3512,7 @@ __CUDA_FP16_DECL__ __half hexp2(const __half a);
3384
3512
  * \param[in] a - half. Is only being read.
3385
3513
  *
3386
3514
  * \returns half
3387
- * \retval The decimal exponential function on \p a.
3515
+ * - The decimal exponential function on \p a.
3388
3516
  * \internal
3389
3517
  * \exception-guarantee no-throw guarantee
3390
3518
  * \behavior reentrant, thread safe
@@ -3399,7 +3527,7 @@ __CUDA_FP16_DECL__ __half hexp10(const __half a);
3399
3527
  * \param[in] a - half. Is only being read.
3400
3528
  *
3401
3529
  * \returns half
3402
- * \retval The cosine of \p a.
3530
+ * - The cosine of \p a.
3403
3531
  * \internal
3404
3532
  * \exception-guarantee no-throw guarantee
3405
3533
  * \behavior reentrant, thread safe
@@ -3414,7 +3542,7 @@ __CUDA_FP16_DECL__ __half hcos(const __half a);
3414
3542
  * \param[in] a - half. Is only being read.
3415
3543
  *
3416
3544
  * \returns half
3417
- * \retval The sine of \p a.
3545
+ * - The sine of \p a.
3418
3546
  * \internal
3419
3547
  * \exception-guarantee no-throw guarantee
3420
3548
  * \behavior reentrant, thread safe
@@ -3430,7 +3558,7 @@ __CUDA_FP16_DECL__ __half hsin(const __half a);
3430
3558
  * \param[in] a - half2. Is only being read.
3431
3559
  *
3432
3560
  * \returns half2
3433
- * \retval The elementwise square root on vector \p a.
3561
+ * - The elementwise square root on vector \p a.
3434
3562
  * \internal
3435
3563
  * \exception-guarantee no-throw guarantee
3436
3564
  * \behavior reentrant, thread safe
@@ -3447,7 +3575,7 @@ __CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a);
3447
3575
  * \param[in] a - half2. Is only being read.
3448
3576
  *
3449
3577
  * \returns half2
3450
- * \retval The elementwise reciprocal square root on vector \p a.
3578
+ * - The elementwise reciprocal square root on vector \p a.
3451
3579
  * \internal
3452
3580
  * \exception-guarantee no-throw guarantee
3453
3581
  * \behavior reentrant, thread safe
@@ -3463,7 +3591,7 @@ __CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a);
3463
3591
  * \param[in] a - half2. Is only being read.
3464
3592
  *
3465
3593
  * \returns half2
3466
- * \retval The elementwise reciprocal on vector \p a.
3594
+ * - The elementwise reciprocal on vector \p a.
3467
3595
  * \internal
3468
3596
  * \exception-guarantee no-throw guarantee
3469
3597
  * \behavior reentrant, thread safe
@@ -3480,7 +3608,7 @@ __CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a);
3480
3608
  * \param[in] a - half2. Is only being read.
3481
3609
  *
3482
3610
  * \returns half2
3483
- * \retval The elementwise natural logarithm on vector \p a.
3611
+ * - The elementwise natural logarithm on vector \p a.
3484
3612
  * \internal
3485
3613
  * \exception-guarantee no-throw guarantee
3486
3614
  * \behavior reentrant, thread safe
@@ -3497,7 +3625,7 @@ __CUDA_FP16_DECL__ __half2 h2log(const __half2 a);
3497
3625
  * \param[in] a - half2. Is only being read.
3498
3626
  *
3499
3627
  * \returns half2
3500
- * \retval The elementwise binary logarithm on vector \p a.
3628
+ * - The elementwise binary logarithm on vector \p a.
3501
3629
  * \internal
3502
3630
  * \exception-guarantee no-throw guarantee
3503
3631
  * \behavior reentrant, thread safe
@@ -3514,7 +3642,7 @@ __CUDA_FP16_DECL__ __half2 h2log2(const __half2 a);
3514
3642
  * \param[in] a - half2. Is only being read.
3515
3643
  *
3516
3644
  * \returns half2
3517
- * \retval The elementwise decimal logarithm on vector \p a.
3645
+ * - The elementwise decimal logarithm on vector \p a.
3518
3646
  * \internal
3519
3647
  * \exception-guarantee no-throw guarantee
3520
3648
  * \behavior reentrant, thread safe
@@ -3531,7 +3659,7 @@ __CUDA_FP16_DECL__ __half2 h2log10(const __half2 a);
3531
3659
  * \param[in] a - half2. Is only being read.
3532
3660
  *
3533
3661
  * \returns half2
3534
- * \retval The elementwise exponential function on vector \p a.
3662
+ * - The elementwise exponential function on vector \p a.
3535
3663
  * \internal
3536
3664
  * \exception-guarantee no-throw guarantee
3537
3665
  * \behavior reentrant, thread safe
@@ -3548,7 +3676,7 @@ __CUDA_FP16_DECL__ __half2 h2exp(const __half2 a);
3548
3676
  * \param[in] a - half2. Is only being read.
3549
3677
  *
3550
3678
  * \returns half2
3551
- * \retval The elementwise binary exponential function on vector \p a.
3679
+ * - The elementwise binary exponential function on vector \p a.
3552
3680
  * \internal
3553
3681
  * \exception-guarantee no-throw guarantee
3554
3682
  * \behavior reentrant, thread safe
@@ -3565,7 +3693,7 @@ __CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a);
3565
3693
  * \param[in] a - half2. Is only being read.
3566
3694
  *
3567
3695
  * \returns half2
3568
- * \retval The elementwise decimal exponential function on vector \p a.
3696
+ * - The elementwise decimal exponential function on vector \p a.
3569
3697
  * \internal
3570
3698
  * \exception-guarantee no-throw guarantee
3571
3699
  * \behavior reentrant, thread safe
@@ -3581,7 +3709,7 @@ __CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a);
3581
3709
  * \param[in] a - half2. Is only being read.
3582
3710
  *
3583
3711
  * \returns half2
3584
- * \retval The elementwise cosine on vector \p a.
3712
+ * - The elementwise cosine on vector \p a.
3585
3713
  * \internal
3586
3714
  * \exception-guarantee no-throw guarantee
3587
3715
  * \behavior reentrant, thread safe
@@ -3596,7 +3724,7 @@ __CUDA_FP16_DECL__ __half2 h2cos(const __half2 a);
3596
3724
  * \param[in] a - half2. Is only being read.
3597
3725
  *
3598
3726
  * \returns half2
3599
- * \retval The elementwise sine on vector \p a.
3727
+ * - The elementwise sine on vector \p a.
3600
3728
  * \internal
3601
3729
  * \exception-guarantee no-throw guarantee
3602
3730
  * \behavior reentrant, thread safe
@@ -3604,19 +3732,52 @@ __CUDA_FP16_DECL__ __half2 h2cos(const __half2 a);
3604
3732
  */
3605
3733
  __CUDA_FP16_DECL__ __half2 h2sin(const __half2 a);
3606
3734
 
3607
- #endif /*if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/
3735
+ #endif /*if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)*/
3608
3736
 
3609
- #if __CUDA_ARCH__ >= 600 || !defined(__CUDA_ARCH__)
3737
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
3610
3738
 
3739
+ /**
3740
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
3741
+ * \brief Performs a vector add of \p val to the value stored at \p address in global or shared memory, and writes the
3742
+ * result back to \p address. The atomicity of the add operation is guaranteed separately for each of the
3743
+ * two __half elements; the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
3744
+ *
3745
+ * \details The location of \p address must be in global or shared memory. This operation has undefined
3746
+ * behavior otherwise. This operation is only supported by devices of compute capability 6.x and higher.
3747
+ *
3748
+ * \param[in] address - half2*. An address in global or shared memory.
3749
+ * \param[in] val - half2. The value to be added.
3750
+ *
3751
+ * \returns half2
3752
+ * - The old value read from \p address.
3753
+ *
3754
+ * \note_ref_guide_atomic
3755
+ */
3611
3756
  __CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val);
3612
3757
 
3613
- #endif /*if __CUDA_ARCH__ >= 600 || !defined(__CUDA_ARCH__)*/
3758
+ #endif /*if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)*/
3614
3759
 
3615
- #if __CUDA_ARCH__ >= 700 || !defined(__CUDA_ARCH__)
3760
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
3616
3761
 
3762
+ /**
3763
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
3764
+ * \brief Adds \p val to the value stored at \p address in global or shared memory, and writes the result
3765
+ * back to \p address. The read-modify-write is performed as one atomic operation.
3766
+ *
3767
+ * \details The location of \p address must be in global or shared memory. This operation has undefined
3768
+ * behavior otherwise. This operation is only supported by devices of compute capability 7.x and higher.
3769
+ *
3770
+ * \param[in] address - half*. An address in global or shared memory.
3771
+ * \param[in] val - half. The value to be added.
3772
+ *
3773
+ * \returns half
3774
+ * - The old value read from \p address.
3775
+ *
3776
+ * \note_ref_guide_atomic
3777
+ */
3617
3778
  __CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val);
3618
3779
 
3619
- #endif /*if __CUDA_ARCH__ >= 700 || !defined(__CUDA_ARCH__)*/
3780
+ #endif /*if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)*/
3620
3781
 
3621
3782
  #endif /* defined(__CUDACC__) */
3622
3783
 
@@ -3627,5 +3788,7 @@ __CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val);
3627
3788
 
3628
3789
  /* Note the .hpp file is included even for host-side compilation, to capture the "half" & "half2" definitions */
3629
3790
  #include "cuda_fp16.hpp"
3791
+ #undef ___CUDA_FP16_STRINGIFY_INNERMOST
3792
+ #undef __CUDA_FP16_STRINGIFY
3630
3793
 
3631
3794
  #endif /* end of include guard: __CUDA_FP16_H__ */