numba-cuda 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Files changed (42)
  1. numba_cuda/VERSION +1 -1
  2. numba_cuda/numba/cuda/compiler.py +14 -1
  3. numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
  4. numba_cuda/numba/cuda/cuda_paths.py +2 -0
  5. numba_cuda/numba/cuda/cudadecl.py +0 -42
  6. numba_cuda/numba/cuda/cudadrv/linkable_code.py +11 -2
  7. numba_cuda/numba/cuda/cudadrv/nvrtc.py +10 -3
  8. numba_cuda/numba/cuda/cudaimpl.py +0 -63
  9. numba_cuda/numba/cuda/debuginfo.py +92 -2
  10. numba_cuda/numba/cuda/decorators.py +13 -1
  11. numba_cuda/numba/cuda/device_init.py +4 -5
  12. numba_cuda/numba/cuda/extending.py +54 -0
  13. numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
  14. numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
  15. numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +550 -387
  16. numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +465 -316
  17. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  18. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  19. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  20. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  21. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
  22. numba_cuda/numba/cuda/intrinsics.py +172 -1
  23. numba_cuda/numba/cuda/lowering.py +43 -0
  24. numba_cuda/numba/cuda/stubs.py +0 -11
  25. numba_cuda/numba/cuda/target.py +28 -0
  26. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +4 -2
  27. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +1 -1
  28. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
  29. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +1 -1
  30. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +46 -0
  31. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +18 -0
  32. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +4 -2
  33. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
  34. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +1 -1
  35. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +50 -5
  36. numba_cuda/numba/cuda/vector_types.py +3 -1
  37. numba_cuda/numba/cuda/vectorizers.py +1 -1
  38. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
  39. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.0.dist-info}/RECORD +42 -32
  40. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
  41. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
  42. {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0
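The headline addition is bfloat16 support: NVIDIA's cuda_bf16.h/cuda_bf16.hpp headers are now vendored for both CUDA 11 and CUDA 12, with a new cuda_bf16.py binding module and a test_bfloat16_bindings.py suite. As rough orientation only (this sketch is not taken from the diff; the kernel and its names are hypothetical), CUDA code compiled against the vendored headers can use the __nv_bfloat16 type and its conversion intrinsics:

    #include <cuda_bf16.h>

    // Hypothetical kernel: scale a bfloat16 buffer by widening to float.
    __global__ void scale_bf16(__nv_bfloat16 *x, float s, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            float v = __bfloat162float(x[i]);  // widen to float for the math
            x[i] = __float2bfloat16(v * s);    // round back to bfloat16
        }
    }

The detailed hunks below are from item 15 in the list, the relocated and updated cuda_fp16.h header.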
numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h}
@@ -1,5 +1,5 @@
  /*
- * Copyright 1993-2020 NVIDIA Corporation. All rights reserved.
+ * Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
  *
  * NOTICE TO LICENSEE:
  *
@@ -118,25 +118,25 @@
  /* Macros for half & half2 binary arithmetic */
  #define __BINARY_OP_HALF_MACRO(name) /* do */ {\
  __half val; \
- asm( "{"#name".f16 %0,%1,%2;\n}" \
+ asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2;\n}" \
  :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \
  return val; \
  } /* while(0) */
  #define __BINARY_OP_HALF2_MACRO(name) /* do */ {\
  __half2 val; \
- asm( "{"#name".f16x2 %0,%1,%2;\n}" \
+ asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2;\n}" \
  :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
  return val; \
  } /* while(0) */
  #define __TERNARY_OP_HALF_MACRO(name) /* do */ {\
  __half val; \
- asm( "{"#name".f16 %0,%1,%2,%3;\n}" \
+ asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2,%3;\n}" \
  :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \
  return val; \
  } /* while(0) */
  #define __TERNARY_OP_HALF2_MACRO(name) /* do */ {\
  __half2 val; \
- asm( "{"#name".f16x2 %0,%1,%2,%3;\n}" \
+ asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2,%3;\n}" \
  :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \
  return val; \
  } /* while(0) */
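Every direct #name stringification in these asm-building macros is replaced with __CUDA_FP16_STRINGIFY(name). This is the standard two-level stringify idiom: routing the token through a function-like macro lets the argument be macro-expanded before # turns it into a string literal, and it keeps the raw # operator out of the operation macros. A minimal standalone illustration (macro names here are illustrative, not copied from the header):

    #include <stdio.h>

    #define STRINGIFY_(x) #x
    #define STRINGIFY(x)  STRINGIFY_(x)

    #define OPCODE add.sat  /* stands in for a `name` macro argument */

    int main(void)
    {
        /* The extra level expands OPCODE before '#' stringizes it, and
           adjacent string literals concatenate, printing: add.sat.f16 */
        printf("%s\n", STRINGIFY(OPCODE) ".f16");
        return 0;
    }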
@@ -247,7 +247,7 @@ public:
  #if defined(__CUDACC__)

  /* Arithmetic FP16 operations only supported on arch >= 5.3 */
- #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
  #if !defined(__CUDA_NO_HALF_OPERATORS__)
  /* Some basic arithmetic operations expected of a builtin */
  __device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); }
@@ -263,8 +263,28 @@ __device__ __forceinline__ __half &operator/=(__half &lh, const __half &rh) { lh
  /* Note for increment and decrement we use the raw value 0x3C00U equating to half(1.0F), to avoid the extra conversion */
  __device__ __forceinline__ __half &operator++(__half &h) { __half_raw one; one.x = 0x3C00U; h += one; return h; }
  __device__ __forceinline__ __half &operator--(__half &h) { __half_raw one; one.x = 0x3C00U; h -= one; return h; }
- __device__ __forceinline__ __half operator++(__half &h, const int ignored) { const __half ret = h; __half_raw one; one.x = 0x3C00U; h += one; return ret; }
- __device__ __forceinline__ __half operator--(__half &h, const int ignored) { const __half ret = h; __half_raw one; one.x = 0x3C00U; h -= one; return ret; }
+ __device__ __forceinline__ __half operator++(__half &h, const int ignored)
+ {
+ // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+ static_cast<void>(ignored);
+
+ const __half ret = h;
+ __half_raw one;
+ one.x = 0x3C00U;
+ h += one;
+ return ret;
+ }
+ __device__ __forceinline__ __half operator--(__half &h, const int ignored)
+ {
+ // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+ static_cast<void>(ignored);
+
+ const __half ret = h;
+ __half_raw one;
+ one.x = 0x3C00U;
+ h -= one;
+ return ret;
+ }

  /* Unary plus and inverse operators */
  __device__ __forceinline__ __half operator+(const __half &h) { return h; }
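The postfix operator++/operator-- rewrite is behavior-preserving: the unused int parameter exists only because that is how C++ distinguishes the postfix overload from the prefix one, and the new static_cast<void>(ignored) silences unused-parameter warnings. In miniature, with a hypothetical type:

    struct Counter {
        int v;
        Counter &operator++()    { ++v; return *this; }                    // prefix
        Counter  operator++(int) { Counter old = *this; ++v; return old; } // postfix
    };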
@@ -278,7 +298,7 @@ __device__ __forceinline__ bool operator< (const __half &lh, const __half &rh) {
  __device__ __forceinline__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); }
  __device__ __forceinline__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); }
  #endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */
- #endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) */
+ #endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) */
  #endif /* defined(__CUDACC__) */

  /* __half2 is visible to non-nvcc host compilers */
@@ -309,7 +329,7 @@ public:
  #if defined(__CUDACC__)

  /* Arithmetic FP16x2 operations only supported on arch >= 5.3 */
- #if (__CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)) && !defined(__CUDA_NO_HALF2_OPERATORS__)
+ #if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) && !defined(__CUDA_NO_HALF2_OPERATORS__)

  __device__ __forceinline__ __half2 operator+(const __half2 &lh, const __half2 &rh) { return __hadd2(lh, rh); }
  __device__ __forceinline__ __half2 operator-(const __half2 &lh, const __half2 &rh) { return __hsub2(lh, rh); }
@@ -323,8 +343,30 @@ __device__ __forceinline__ __half2& operator/=(__half2 &lh, const __half2 &rh) {

  __device__ __forceinline__ __half2 &operator++(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hadd2(h, one); return h; }
  __device__ __forceinline__ __half2 &operator--(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hsub2(h, one); return h; }
- __device__ __forceinline__ __half2 operator++(__half2 &h, const int ignored) { const __half2 ret = h; __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hadd2(h, one); return ret; }
- __device__ __forceinline__ __half2 operator--(__half2 &h, const int ignored) { const __half2 ret = h; __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hsub2(h, one); return ret; }
+ __device__ __forceinline__ __half2 operator++(__half2 &h, const int ignored)
+ {
+ // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+ static_cast<void>(ignored);
+
+ const __half2 ret = h;
+ __half2_raw one;
+ one.x = 0x3C00U;
+ one.y = 0x3C00U;
+ h = __hadd2(h, one);
+ return ret;
+ }
+ __device__ __forceinline__ __half2 operator--(__half2 &h, const int ignored)
+ {
+ // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+ static_cast<void>(ignored);
+
+ const __half2 ret = h;
+ __half2_raw one;
+ one.x = 0x3C00U;
+ one.y = 0x3C00U;
+ h = __hsub2(h, one);
+ return ret;
+ }

  __device__ __forceinline__ __half2 operator+(const __half2 &h) { return h; }
  __device__ __forceinline__ __half2 operator-(const __half2 &h) { return __hneg2(h); }
@@ -336,7 +378,7 @@ __device__ __forceinline__ bool operator<(const __half2 &lh, const __half2 &rh)
  __device__ __forceinline__ bool operator>=(const __half2 &lh, const __half2 &rh) { return __hbge2(lh, rh); }
  __device__ __forceinline__ bool operator<=(const __half2 &lh, const __half2 &rh) { return __hble2(lh, rh); }

- #endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) */
+ #endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) */
  #endif /* defined(__CUDACC__) */

  /* Restore warning for multiple assignment operators */
@@ -388,6 +430,7 @@ static inline unsigned short __internal_float2half(const float f, unsigned int &
  mantissa |= 0x800000U;
  remainder = mantissa << (32U - shift);
  result = (sign | (mantissa >> shift));
+ result &= 0x0000FFFFU;
  }
  return static_cast<unsigned short>(result);
  }
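The added result &= 0x0000FFFFU; keeps the subnormal-path result within the 16 bits of a binary16 encoding before the final cast to unsigned short. For reference, the layout these conversions target (a sketch, not from the diff):

    #include <cstdint>

    // binary16: [15] sign | [14:10] exponent, bias 15 | [9:0] mantissa.
    struct HalfFields { unsigned sign, exponent, mantissa; };

    inline HalfFields decode_half(std::uint16_t bits)
    {
        return { (bits >> 15) & 0x1u, (bits >> 10) & 0x1Fu, bits & 0x3FFu };
    }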
@@ -401,10 +444,12 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a)
  return val;
  #else
  __half result;
+ /*
  // Perform rounding to 11 bits of precision, convert value
  // to float and call existing float to half conversion.
  // By pre-rounding to 11 bits we avoid additional rounding
  // in float to half conversion.
+ */
  unsigned long long int absa;
  unsigned long long int ua;
  #if defined(__CUDACC__)
@@ -415,12 +460,15 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a)
  absa = (ua & 0x7fffffffffffffffULL);
  if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL))
  {
+ /*
  // |a| >= 2^16 or NaN or |a| <= 2^(-25)
  // double-rounding is not a problem
+ */
  result = __float2half(static_cast<float>(a));
  }
  else
  {
+ /*
  // here 2^(-25) < |a| < 2^16
  // prepare shifter value such that a + shifter
  // done in double precision performs round-to-nearest-even
@@ -431,15 +479,22 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a)
  // So need to have |a| capped to avoid overflow in exponent.
  // For inputs that are smaller than half precision minnorm
  // we prepare fixed shifter exponent.
+ */
  unsigned long long shifterBits;
  if (absa >= 0x3f10000000000000ULL)
- { // Here if |a| >= 2^(-14)
+ {
+ /*
+ // Here if |a| >= 2^(-14)
  // add 42 to exponent bits
+ */
  shifterBits = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL;
  }
  else
- { // 2^(-25) < |a| < 2^(-14), potentially results in denormal
+ {
+ /*
+ // 2^(-25) < |a| < 2^(-14), potentially results in denormal
  // set exponent bits to 42 - 14 + bias
+ */
  shifterBits = 0x41B0000000000000ULL;
  }
  // set leading mantissa bit to protect against negative inputs
@@ -452,8 +507,10 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a)
  #endif
  double aShiftRound = a + shifter;

+ /*
  // Prevent the compiler from optimizing away a + shifter - shifter
  // by doing intermediate memcopy and harmless bitwize operation
+ */
  unsigned long long int aShiftRoundBits;
  #if defined(__CUDACC__)
  (void)memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound));
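The shifter machinery implements the classic add-then-subtract rounding trick, and the memcpy round-trip stops the compiler from folding a + shifter - shifter away. The same idiom in miniature (assumes IEEE-754 doubles, round-to-nearest-even, and no FMA contraction or reassociation):

    // Valid for |a| < 2^51: adding 2^52 + 2^51 forces rounding to an
    // integer, and subtracting it back recovers the rounded value.
    double round_to_nearest_even(double a)
    {
        const double shifter = 6755399441055744.0;  // 2^52 + 2^51
        volatile double t = a + shifter;
        return t - shifter;
    }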
@@ -575,10 +632,15 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const flo
  {
  __half2 val;
  #if defined(__CUDA_ARCH__)
+ #if (__CUDA_ARCH__ >= 800)
+ asm("{ cvt.rn.f16x2.f32 %0, %2, %1; }\n"
+ : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
+ #else
  asm("{.reg .f16 low,high;\n"
  " cvt.rn.f16.f32 low, %1;\n"
  " cvt.rn.f16.f32 high, %2;\n"
  " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
+ #endif
  #else
  val = __half2(__float2half_rn(a), __float2half_rn(b));
  #endif
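On sm_80 and newer the two floats are now packed by a single cvt.rn.f16x2.f32 instead of two scalar converts plus a mov; the result is unchanged, one 32-bit word with a rounded into the low half lane and b into the high lane. A device-side sketch of that contract (assumes cuda_fp16.h is included; the helper is hypothetical):

    __device__ unsigned int pack_halves(float a, float b)
    {
        const __half2 h = __floats2half2_rn(a, b);   // low = a, high = b
        return *reinterpret_cast<const unsigned int *>(&h);
    }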
@@ -611,7 +673,7 @@ static inline float __internal_half2float(const unsigned short h)
  } else {
  exponent += 0x70U;
  }
- unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
+ const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
  #if defined(__CUDACC__)
  (void)memcpy(&f, &u, sizeof(u));
  #else
@@ -655,6 +717,168 @@ __CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a)
  #endif
  return val;
  }
+ __CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h)
+ {
+ short int i;
+ #if defined __CUDA_ARCH__
+ asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+ #else
+ const float f = __half2float(h);
+ const short int max_val = (short int)0x7fffU;
+ const short int min_val = (short int)0x8000U;
+ const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+ // saturation fixup
+ if (bits > (unsigned short)0xF800U) {
+ // NaN
+ i = 0;
+ } else if (f > static_cast<float>(max_val)) {
+ // saturate maximum
+ i = max_val;
+ } else if (f < static_cast<float>(min_val)) {
+ // saturate minimum
+ i = min_val;
+ } else {
+ // normal value, conversion is well-defined
+ i = static_cast<short int>(f);
+ }
+ #endif
+ return i;
+ }
+ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h)
+ {
+ unsigned short int i;
+ #if defined __CUDA_ARCH__
+ asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+ #else
+ const float f = __half2float(h);
+ const unsigned short int max_val = 0xffffU;
+ const unsigned short int min_val = 0U;
+ const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+ // saturation fixup
+ if (bits > (unsigned short)0xF800U) {
+ // NaN
+ i = 0U;
+ } else if (f > static_cast<float>(max_val)) {
+ // saturate maximum
+ i = max_val;
+ } else if (f < static_cast<float>(min_val)) {
+ // saturate minimum
+ i = min_val;
+ } else {
+ // normal value, conversion is well-defined
+ i = static_cast<unsigned short int>(f);
+ }
+ #endif
+ return i;
+ }
+ __CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h)
+ {
+ int i;
+ #if defined __CUDA_ARCH__
+ asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+ #else
+ const float f = __half2float(h);
+ const int max_val = (int)0x7fffffffU;
+ const int min_val = (int)0x80000000U;
+ const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+ // saturation fixup
+ if (bits > (unsigned short)0xF800U) {
+ // NaN
+ i = 0;
+ } else if (f > static_cast<float>(max_val)) {
+ // saturate maximum
+ i = max_val;
+ } else if (f < static_cast<float>(min_val)) {
+ // saturate minimum
+ i = min_val;
+ } else {
+ // normal value, conversion is well-defined
+ i = static_cast<int>(f);
+ }
+ #endif
+ return i;
+ }
+ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h)
+ {
+ unsigned int i;
+ #if defined __CUDA_ARCH__
+ asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+ #else
+ const float f = __half2float(h);
+ const unsigned int max_val = 0xffffffffU;
+ const unsigned int min_val = 0U;
+ const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+ // saturation fixup
+ if (bits > (unsigned short)0xF800U) {
+ // NaN
+ i = 0U;
+ } else if (f > static_cast<float>(max_val)) {
+ // saturate maximum
+ i = max_val;
+ } else if (f < static_cast<float>(min_val)) {
+ // saturate minimum
+ i = min_val;
+ } else {
+ // normal value, conversion is well-defined
+ i = static_cast<unsigned int>(f);
+ }
+ #endif
+ return i;
+ }
+ __CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h)
+ {
+ long long int i;
+ #if defined __CUDA_ARCH__
+ asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+ #else
+ const float f = __half2float(h);
+ const long long int max_val = (long long int)0x7fffffffffffffffULL;
+ const long long int min_val = (long long int)0x8000000000000000ULL;
+ const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+ // saturation fixup
+ if (bits > (unsigned short)0xF800U) {
+ // NaN
+ i = min_val;
+ } else if (f > static_cast<float>(max_val)) {
+ // saturate maximum
+ i = max_val;
+ } else if (f < static_cast<float>(min_val)) {
+ // saturate minimum
+ i = min_val;
+ } else {
+ // normal value, conversion is well-defined
+ i = static_cast<long long int>(f);
+ }
+ #endif
+ return i;
+ }
+ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h)
+ {
+ unsigned long long int i;
+ #if defined __CUDA_ARCH__
+ asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+ #else
+ const float f = __half2float(h);
+ const unsigned long long int max_val = 0xffffffffffffffffULL;
+ const unsigned long long int min_val = 0ULL;
+ const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+ // saturation fixup
+ if (bits > (unsigned short)0xF800U) {
+ // NaN
+ i = 0x8000000000000000ULL;
+ } else if (f > static_cast<float>(max_val)) {
+ // saturate maximum
+ i = max_val;
+ } else if (f < static_cast<float>(min_val)) {
+ // saturate minimum
+ i = min_val;
+ } else {
+ // normal value, conversion is well-defined
+ i = static_cast<unsigned long long int>(f);
+ }
+ #endif
+ return i;
+ }

  /* Intrinsic functions only available to nvcc compilers */
  #if defined(__CUDACC__)
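These six __half2*_rz conversions were previously nvcc-only; hoisting them here under __CUDA_HOSTDEVICE_FP16_DECL__ makes them available to host compilers as well. The host fallbacks are also hardened: the NaN test now inspects the raw half bit pattern rather than comparing f != f after the cast, and the float-to-integer cast happens only in the in-range branch, where it is well defined. The bit test in isolation (a sketch):

    #include <cstdint>

    // Shift out the sign; any pattern above 0xF800 then has an all-ones
    // exponent and a non-zero mantissa, i.e. it encodes a NaN.
    inline bool half_is_nan(std::uint16_t raw)
    {
        return static_cast<std::uint16_t>(raw << 1) > 0xF800u;
    }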
@@ -697,30 +921,6 @@ __CUDA_FP16_DECL__ int __half2int_rn(const __half h)
  asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
  return i;
  }
- __CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h)
- {
- int i;
- #if defined __CUDA_ARCH__
- asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
- #else
- const float f = __half2float(h);
- i = static_cast<int>(f);
- const int max_val = (int)0x7fffffffU;
- const int min_val = (int)0x80000000U;
- // saturation fixup
- if (f != f) {
- // NaN
- i = 0;
- } else if (f > static_cast<float>(max_val)) {
- // saturate maximum
- i = max_val;
- } else if (f < static_cast<float>(min_val)) {
- // saturate minimum
- i = min_val;
- }
- #endif
- return i;
- }
  __CUDA_FP16_DECL__ int __half2int_rd(const __half h)
  {
  int i;
@@ -773,30 +973,6 @@ __CUDA_FP16_DECL__ short int __half2short_rn(const __half h)
  asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
  return i;
  }
- __CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h)
- {
- short int i;
- #if defined __CUDA_ARCH__
- asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
- #else
- const float f = __half2float(h);
- i = static_cast<short int>(f);
- const short int max_val = (short int)0x7fffU;
- const short int min_val = (short int)0x8000U;
- // saturation fixup
- if (f != f) {
- // NaN
- i = 0;
- } else if (f > static_cast<float>(max_val)) {
- // saturate maximum
- i = max_val;
- } else if (f < static_cast<float>(min_val)) {
- // saturate minimum
- i = min_val;
- }
- #endif
- return i;
- }
  __CUDA_FP16_DECL__ short int __half2short_rd(const __half h)
  {
  short int i;
@@ -845,30 +1021,6 @@ __CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h)
  asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
  return i;
  }
- __CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h)
- {
- unsigned int i;
- #if defined __CUDA_ARCH__
- asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
- #else
- const float f = __half2float(h);
- i = static_cast<unsigned int>(f);
- const unsigned int max_val = 0xffffffffU;
- const unsigned int min_val = 0U;
- // saturation fixup
- if (f != f) {
- // NaN
- i = 0U;
- } else if (f > static_cast<float>(max_val)) {
- // saturate maximum
- i = max_val;
- } else if (f < static_cast<float>(min_val)) {
- // saturate minimum
- i = min_val;
- }
- #endif
- return i;
- }
  __CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h)
  {
  unsigned int i;
@@ -921,30 +1073,6 @@ __CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h)
  asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
  return i;
  }
- __CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h)
- {
- unsigned short int i;
- #if defined __CUDA_ARCH__
- asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
- #else
- const float f = __half2float(h);
- i = static_cast<unsigned short int>(f);
- const unsigned short int max_val = 0xffffU;
- const unsigned short int min_val = 0U;
- // saturation fixup
- if (f != f) {
- // NaN
- i = 0U;
- } else if (f > static_cast<float>(max_val)) {
- // saturate maximum
- i = max_val;
- } else if (f < static_cast<float>(min_val)) {
- // saturate minimum
- i = min_val;
- }
- #endif
- return i;
- }
  __CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h)
  {
  unsigned short int i;
@@ -993,30 +1121,6 @@ __CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h)
  asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
  return i;
  }
- __CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h)
- {
- unsigned long long int i;
- #if defined __CUDA_ARCH__
- asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
- #else
- const float f = __half2float(h);
- i = static_cast<unsigned long long int>(f);
- const unsigned long long int max_val = 0xffffffffffffffffULL;
- const unsigned long long int min_val = 0ULL;
- // saturation fixup
- if (f != f) {
- // NaN
- i = 0x8000000000000000ULL;
- } else if (f > static_cast<float>(max_val)) {
- // saturate maximum
- i = max_val;
- } else if (f < static_cast<float>(min_val)) {
- // saturate minimum
- i = min_val;
- }
- #endif
- return i;
- }
  __CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h)
  {
  unsigned long long int i;
@@ -1069,30 +1173,6 @@ __CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h)
  asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
  return i;
  }
- __CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h)
- {
- long long int i;
- #if defined __CUDA_ARCH__
- asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
- #else
- const float f = __half2float(h);
- i = static_cast<long long int>(f);
- const long long int max_val = (long long int)0x7fffffffffffffffULL;
- const long long int min_val = (long long int)0x8000000000000000ULL;
- // saturation fixup
- if (f != f) {
- // NaN
- i = min_val;
- } else if (f > static_cast<float>(max_val)) {
- // saturate maximum
- i = max_val;
- } else if (f < static_cast<float>(min_val)) {
- // saturate minimum
- i = min_val;
- }
- #endif
- return i;
- }
  __CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h)
  {
  long long int i;
@@ -1309,20 +1389,89 @@ __CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i)
  return h;
  }

- #if __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)
+ /******************************************************************************
+ * __half arithmetic *
+ ******************************************************************************/
+ __CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b)
+ {
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
+ __BINARY_OP_HALF_MACRO(max)
+ #else
+ const float fa = __half2float(a);
+ const float fb = __half2float(b);
+ float fr;
+ asm("{max.f32 %0,%1,%2;\n}"
+ :"=f"(fr) : "f"(fa), "f"(fb));
+ const __half hr = __float2half(fr);
+ return hr;
+ #endif
+ }
+ __CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b)
+ {
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
+ __BINARY_OP_HALF_MACRO(min)
+ #else
+ const float fa = __half2float(a);
+ const float fb = __half2float(b);
+ float fr;
+ asm("{min.f32 %0,%1,%2;\n}"
+ :"=f"(fr) : "f"(fa), "f"(fb));
+ const __half hr = __float2half(fr);
+ return hr;
+ #endif
+ }
+
+ /******************************************************************************
+ * __half2 arithmetic *
+ ******************************************************************************/
+ __CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b)
+ {
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
+ __BINARY_OP_HALF2_MACRO(max)
+ #else
+ const float2 fa = __half22float2(a);
+ const float2 fb = __half22float2(b);
+ float2 fr;
+ asm("{max.f32 %0,%1,%2;\n}"
+ :"=f"(fr.x) : "f"(fa.x), "f"(fb.x));
+ asm("{max.f32 %0,%1,%2;\n}"
+ :"=f"(fr.y) : "f"(fa.y), "f"(fb.y));
+ const __half2 hr = __float22half2_rn(fr);
+ return hr;
+ #endif
+ }
+ __CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b)
+ {
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
+ __BINARY_OP_HALF2_MACRO(min)
+ #else
+ const float2 fa = __half22float2(a);
+ const float2 fb = __half22float2(b);
+ float2 fr;
+ asm("{min.f32 %0,%1,%2;\n}"
+ :"=f"(fr.x) : "f"(fa.x), "f"(fb.x));
+ asm("{min.f32 %0,%1,%2;\n}"
+ :"=f"(fr.y) : "f"(fa.y), "f"(fb.y));
+ const __half2 hr = __float22half2_rn(fr);
+ return hr;
+ #endif
+ }
+
+
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)
  /******************************************************************************
  * __half, __half2 warp shuffle *
  ******************************************************************************/
  #define __SHUFFLE_HALF2_MACRO(name) /* do */ {\
  __half2 r; \
- asm volatile ("{"#name" %0,%1,%2,%3;\n}" \
+ asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3;\n}" \
  :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \
  return r; \
  } /* while(0) */

  #define __SHUFFLE_SYNC_HALF2_MACRO(name) /* do */ {\
  __half2 r; \
- asm volatile ("{"#name" %0,%1,%2,%3,%4;\n}" \
+ asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \
  :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \
  return r; \
  } /* while(0) */
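__hmax/__hmin and the __half2 forms are now declared for every architecture, using the native f16 max/min instructions on sm_80+ and a float round-trip elsewhere, while the _nan variants further down remain sm_80-only. Per the CUDA math API documentation, the two families differ in NaN handling; a sketch of the difference (compile for sm_80+; kernel name hypothetical):

    #include <cuda_fp16.h>

    __global__ void nan_semantics(__half *out)
    {
        const __half nan = __float2half(__int_as_float(0x7FC00000)); // quiet NaN
        const __half one = __float2half(1.0f);
        out[0] = __hmax(nan, one);      // 1.0: the non-NaN operand wins
        out[1] = __hmax_nan(nan, one);  // NaN: propagated
    }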
@@ -1446,12 +1595,12 @@ __CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned mask, const __half var,
  return __low2half(temp2);
  }

- #endif /*__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)*/
+ #endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)*/
  /******************************************************************************
  * __half and __half2 __ldg,__ldcg,__ldca,__ldcs *
  ******************************************************************************/

- #if defined(__cplusplus) && (__CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__))
+ #if defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))
  #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
  #define __LDG_PTR "l"
  #else
@@ -1562,14 +1711,14 @@ __CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value)
  asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory");
  }
  #undef __LDG_PTR
- #endif /*defined(__cplusplus) && (__CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__))*/
- #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
+ #endif /*defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))*/
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
  /******************************************************************************
  * __half2 comparison *
  ******************************************************************************/
  #define __COMPARISON_OP_HALF2_MACRO(name) /* do */ {\
  __half2 val; \
- asm( "{ "#name".f16x2.f16x2 %0,%1,%2;\n}" \
+ asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \
  :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
  return val; \
  } /* while(0) */
@@ -1625,7 +1774,7 @@ __CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b)
  #define __BOOL_COMPARISON_OP_HALF2_MACRO(name) /* do */ {\
  __half2 val; \
  bool retval; \
- asm( "{ "#name".f16x2.f16x2 %0,%1,%2;\n}" \
+ asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \
  :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
  if (__HALF2_TO_CUI(val) == 0x3C003C00U) {\
  retval = true; \
@@ -1689,7 +1838,7 @@ __CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b)
  #define __COMPARISON_OP_HALF_MACRO(name) /* do */ {\
  unsigned short val; \
  asm( "{ .reg .pred __$temp3;\n" \
- " setp."#name".f16 __$temp3, %1, %2;\n" \
+ " setp." __CUDA_FP16_STRINGIFY(name) ".f16 __$temp3, %1, %2;\n" \
  " selp.u16 %0, 1, 0, __$temp3;}" \
  : "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \
  return (val != 0U) ? true : false; \
@@ -1770,6 +1919,18 @@ __CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b)
  {
  __BINARY_OP_HALF2_MACRO(mul.sat)
  }
+ __CUDA_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b)
+ {
+ __BINARY_OP_HALF2_MACRO(add.rn)
+ }
+ __CUDA_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b)
+ {
+ __BINARY_OP_HALF2_MACRO(sub.rn)
+ }
+ __CUDA_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b)
+ {
+ __BINARY_OP_HALF2_MACRO(mul.rn)
+ }
  __CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c)
  {
  __TERNARY_OP_HALF2_MACRO(fma.rn)
@@ -1818,7 +1979,18 @@ __CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b)
  {
  __BINARY_OP_HALF_MACRO(mul.sat)
  }
-
+ __CUDA_FP16_DECL__ __half __hadd_rn(const __half a, const __half b)
+ {
+ __BINARY_OP_HALF_MACRO(add.rn)
+ }
+ __CUDA_FP16_DECL__ __half __hsub_rn(const __half a, const __half b)
+ {
+ __BINARY_OP_HALF_MACRO(sub.rn)
+ }
+ __CUDA_FP16_DECL__ __half __hmul_rn(const __half a, const __half b)
+ {
+ __BINARY_OP_HALF_MACRO(mul.rn)
+ }
  __CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c)
  {
  __TERNARY_OP_HALF_MACRO(fma.rn)
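The new _rn-suffixed entry points mirror __hadd/__hsub/__hmul but, per the CUDA math API documentation, prevent the compiler from contracting a neighboring multiply-add pair into an fma. A hypothetical helper showing where that matters (sm_53+):

    #include <cuda_fp16.h>

    __device__ __half dot2_unfused(__half a, __half b, __half c, __half d)
    {
        // Plain operators may fuse a*b + c*d into fma, changing the
        // rounding; the _rn forms pin each operation's own rounding.
        return __hadd_rn(__hmul_rn(a, b), __hmul_rn(c, d));
    }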
@@ -1856,23 +2028,23 @@ __CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b) {
  ******************************************************************************/
  #define __SPEC_CASE2(i,r, spc, ulp) \
  "{.reg.b32 spc, ulp, p;\n"\
- " mov.b32 spc,"#spc";\n"\
- " mov.b32 ulp,"#ulp";\n"\
- " set.eq.f16x2.f16x2 p,"#i", spc;\n"\
- " fma.rn.f16x2 "#r",p,ulp,"#r";\n}\n"
+ " mov.b32 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\
+ " mov.b32 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\
+ " set.eq.f16x2.f16x2 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\
+ " fma.rn.f16x2 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n"
  #define __SPEC_CASE(i,r, spc, ulp) \
  "{.reg.b16 spc, ulp, p;\n"\
- " mov.b16 spc,"#spc";\n"\
- " mov.b16 ulp,"#ulp";\n"\
- " set.eq.f16.f16 p,"#i", spc;\n"\
- " fma.rn.f16 "#r",p,ulp,"#r";\n}\n"
+ " mov.b16 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\
+ " mov.b16 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\
+ " set.eq.f16.f16 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\
+ " fma.rn.f16 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n"
  #define __APPROX_FCAST(fun) /* do */ {\
  __half val;\
  asm("{.reg.b32 f; \n"\
  " .reg.b16 r; \n"\
  " mov.b16 r,%1; \n"\
  " cvt.f32.f16 f,r; \n"\
- " "#fun".approx.f32 f,f; \n"\
+ " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 f,f; \n"\
  " cvt.rn.f16.f32 r,f; \n"\
  " mov.b16 %0,r; \n"\
  "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\
@@ -1885,8 +2057,8 @@ __CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b) {
  " mov.b32 {hl, hu}, %1; \n"\
  " cvt.f32.f16 fl, hl; \n"\
  " cvt.f32.f16 fu, hu; \n"\
- " "#fun".approx.f32 fl, fl; \n"\
- " "#fun".approx.f32 fu, fu; \n"\
+ " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fl, fl; \n"\
+ " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fu, fu; \n"\
  " cvt.rn.f16.f32 hl, fl; \n"\
  " cvt.rn.f16.f32 hu, fu; \n"\
  " mov.b32 %0, {hl, hu}; \n"\
@@ -1895,129 +2067,122 @@ __CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b) {
  } /* while(0) */
  static __device__ __forceinline__ float __float_simpl_sinf(float a);
  static __device__ __forceinline__ float __float_simpl_cosf(float a);
- __CUDA_FP16_DECL__ __half __hsin_internal(const __half a) {
- float f = __half2float(a);
- f = __float_simpl_sinf(f);
- return __float2half_rn(f);
- }
  __CUDA_FP16_DECL__ __half hsin(const __half a) {
- __half r = __hsin_internal(a);
+ const float sl = __float_simpl_sinf(__half2float(a));
+ __half r = __float2half_rn(sl);
  asm("{\n\t"
  " .reg.b16 i,r,t; \n\t"
  " mov.b16 r, %0; \n\t"
  " mov.b16 i, %1; \n\t"
- " mov.b16 t, 0x8000U; \n\t"
- " and.b16 t,r,t; \n\t"
+ " and.b16 t, r, 0x8000U; \n\t"
+ " abs.f16 r, r; \n\t"
+ " abs.f16 i, i; \n\t"
  __SPEC_CASE(i, r, 0X32B3U, 0x0800U)
- __SPEC_CASE(i, r, 0X5CB0U, 0x1000U)
- __SPEC_CASE(i, r, 0XB2B3U, 0x8800U)
- __SPEC_CASE(i, r, 0XDCB0U, 0x9000U)
+ __SPEC_CASE(i, r, 0X5CB0U, 0x9000U)
  " or.b16 r,r,t; \n\t"
  " mov.b16 %0, r; \n"
  "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
  return r;
  }
  __CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) {
- const __half l = __low2half(a);
- const __half h = __high2half(a);
- const __half sl = __hsin_internal(l);
- const __half sh = __hsin_internal(h);
- __half2 r = __halves2half2(sl, sh);
+ const float sl = __float_simpl_sinf(__half2float(a.x));
+ const float sh = __float_simpl_sinf(__half2float(a.y));
+ __half2 r = __floats2half2_rn(sl, sh);
  asm("{\n\t"
  " .reg.b32 i,r,t; \n\t"
  " mov.b32 r, %0; \n\t"
  " mov.b32 i, %1; \n\t"
  " and.b32 t, r, 0x80008000U; \n\t"
+ " abs.f16x2 r, r; \n\t"
+ " abs.f16x2 i, i; \n\t"
  __SPEC_CASE2(i, r, 0X32B332B3U, 0x08000800U)
- __SPEC_CASE2(i, r, 0X5CB05CB0U, 0x10001000U)
- __SPEC_CASE2(i, r, 0XB2B3B2B3U, 0x88008800U)
- __SPEC_CASE2(i, r, 0XDCB0DCB0U, 0x90009000U)
+ __SPEC_CASE2(i, r, 0X5CB05CB0U, 0x90009000U)
  " or.b32 r, r, t; \n\t"
  " mov.b32 %0, r; \n"
  "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
  return r;
  }
- __CUDA_FP16_DECL__ __half __hcos_internal(const __half a) {
- float f = __half2float(a);
- f = __float_simpl_cosf(f);
- return __float2half_rn(f);
- }
  __CUDA_FP16_DECL__ __half hcos(const __half a) {
- __half r = __hcos_internal(a);
+ const float cl = __float_simpl_cosf(__half2float(a));
+ __half r = __float2half_rn(cl);
  asm("{\n\t"
  " .reg.b16 i,r; \n\t"
  " mov.b16 r, %0; \n\t"
  " mov.b16 i, %1; \n\t"
+ " abs.f16 i, i; \n\t"
  __SPEC_CASE(i, r, 0X2B7CU, 0x1000U)
- __SPEC_CASE(i, r, 0XAB7CU, 0x1000U)
  " mov.b16 %0, r; \n"
  "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
  return r;
  }
  __CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) {
- const __half l = __low2half(a);
- const __half h = __high2half(a);
- const __half cl = __hcos_internal(l);
- const __half ch = __hcos_internal(h);
- __half2 r = __halves2half2(cl, ch);
+ const float cl = __float_simpl_cosf(__half2float(a.x));
+ const float ch = __float_simpl_cosf(__half2float(a.y));
+ __half2 r = __floats2half2_rn(cl, ch);
  asm("{\n\t"
  " .reg.b32 i,r; \n\t"
  " mov.b32 r, %0; \n\t"
  " mov.b32 i, %1; \n\t"
+ " abs.f16x2 i, i; \n\t"
  __SPEC_CASE2(i, r, 0X2B7C2B7CU, 0x10001000U)
- __SPEC_CASE2(i, r, 0XAB7CAB7CU, 0x10001000U)
  " mov.b32 %0, r; \n"
  "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
  return r;
  }
- static __device__ __forceinline__ float __internal_trig_reduction_kernel(const float a, int *quadrant)
+ static __device__ __forceinline__ float __internal_trig_reduction_kernel(const float a, unsigned int *const quadrant)
  {
- const int q = __float2int_rn(a * 0.636619772F);
- const float j = static_cast<float>(q);
- float t = __fmaf_rn(-j, 1.5707962512969971e+000F, a);
- t = __fmaf_rn(-j, 7.5497894158615964e-008F, t);
+ const float ar = __fmaf_rn(a, 0.636619772F, 12582912.0F);
+ const unsigned q = __float_as_uint(ar);
+ const float j = __fsub_rn(ar, 12582912.0F);
+ float t = __fmaf_rn(j, -1.5707962512969971e+000F, a);
+ t = __fmaf_rn(j, -7.5497894158615964e-008F, t);
  *quadrant = q;
  return t;
  }
- static __device__ __forceinline__ float __internal_sin_cos_kernel(float x, const int i)
+ static __device__ __forceinline__ float __internal_sin_cos_kernel(const float x, const unsigned int i)
  {
  float z;
  const float x2 = x*x;
-
- if ((static_cast<unsigned>(i) & 1U) != 0U) {
- z = 2.44331571e-5F;
- z = __fmaf_rn(z, x2, -1.38873163e-3F);
- }
- else {
- z = -1.95152959e-4F;
- z = __fmaf_rn(z, x2, 8.33216087e-3F);
- }
- if ((static_cast<unsigned>(i) & 1U) != 0U) {
- z = __fmaf_rn(z, x2, 4.16666457e-2F);
- z = __fmaf_rn(z, x2, -5.00000000e-1F);
- }
- else {
- z = __fmaf_rn(z, x2, -1.66666546e-1F);
- z = __fmaf_rn(z, x2, 0.0F);
- }
- if ((static_cast<unsigned>(i) & 1U) != 0U) {
- x = __fmaf_rn(z, x2, 1.0F);
+ float a8;
+ float a6;
+ float a4;
+ float a2;
+ float a1;
+ float a0;
+
+ if ((i & 1U) != 0U) {
+ // cos
+ a8 = 2.44331571e-5F;
+ a6 = -1.38873163e-3F;
+ a4 = 4.16666457e-2F;
+ a2 = -5.00000000e-1F;
+ a1 = x2;
+ a0 = 1.0F;
  }
  else {
- x = __fmaf_rn(z, x, x);
+ // sin
+ a8 = -1.95152959e-4F;
+ a6 = 8.33216087e-3F;
+ a4 = -1.66666546e-1F;
+ a2 = 0.0F;
+ a1 = x;
+ a0 = x;
  }
- if ((static_cast<unsigned>(i) & 2U) != 0U) {
- x = __fmaf_rn(x, -1.0F, 0.0F);
+
+ z = __fmaf_rn(a8, x2, a6);
+ z = __fmaf_rn(z, x2, a4);
+ z = __fmaf_rn(z, x2, a2);
+ z = __fmaf_rn(z, a1, a0);
+
+ if ((i & 2U) != 0U) {
+ z = -z;
  }
- return x;
+ return z;
  }
  static __device__ __forceinline__ float __float_simpl_sinf(float a)
  {
  float z;
- int i;
- if (::isinf(a)) {
- a = a * 0.0F;
- }
+ unsigned i;
  a = __internal_trig_reduction_kernel(a, &i);
  z = __internal_sin_cos_kernel(a, i);
  return z;
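The rewritten reduction swaps __float2int_rn for the float magic-number trick: 12582912.0F is 1.5 * 2^23, so after the fma the integer nearest a * 2/pi sits in the low mantissa bits of ar, yielding the quadrant directly and, via __fsub_rn, the rounded float j without an int-to-float convert. It also drops the explicit ::isinf guard, and the polynomial evaluation is restructured into a single Horner chain with coefficient selection up front. A host model of the quadrant extraction (assumes IEEE float, round-to-nearest):

    #include <cstdint>
    #include <cstring>

    unsigned quadrant_of(float a)  // adequate for the small range of half inputs
    {
        const float ar = a * 0.636619772f + 12582912.0f;  // a*(2/pi) + 1.5*2^23
        std::uint32_t bits;
        std::memcpy(&bits, &ar, sizeof bits);
        return bits & 3u;  // low two bits select the quadrant
    }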
@@ -2025,25 +2190,22 @@ static __device__ __forceinline__ float __float_simpl_sinf(float a)
  static __device__ __forceinline__ float __float_simpl_cosf(float a)
  {
  float z;
- int i;
- if (::isinf(a)) {
- a = a * 0.0F;
- }
+ unsigned i;
  a = __internal_trig_reduction_kernel(a, &i);
- i++;
- z = __internal_sin_cos_kernel(a, i);
+ z = __internal_sin_cos_kernel(a, (i & 0x3U) + 1U);
  return z;
  }

  __CUDA_FP16_DECL__ __half hexp(const __half a) {
  __half val;
- asm("{.reg.b32 f, C; \n"
+ asm("{.reg.b32 f, C, nZ; \n"
  " .reg.b16 h,r; \n"
  " mov.b16 h,%1; \n"
  " cvt.f32.f16 f,h; \n"
- " mov.b32 C, 0x3fb8aa3bU; \n"
- " mul.f32 f,f,C; \n"
- " ex2.approx.f32 f,f; \n"
+ " mov.b32 C, 0x3fb8aa3bU; \n"
+ " mov.b32 nZ, 0x80000000U;\n"
+ " fma.rn.f32 f,f,C,nZ; \n"
+ " ex2.approx.ftz.f32 f,f; \n"
  " cvt.rn.f16.f32 r,f; \n"
  __SPEC_CASE(h, r, 0X1F79U, 0x9400U)
  __SPEC_CASE(h, r, 0X25CFU, 0x9400U)
@@ -2056,16 +2218,17 @@ __CUDA_FP16_DECL__ __half hexp(const __half a) {
  __CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) {
  __half2 val;
  asm("{.reg.b16 hl, hu; \n"
- " .reg.b32 h,r,fl,fu, C; \n"
+ " .reg.b32 h,r,fl,fu,C,nZ; \n"
  " mov.b32 {hl, hu}, %1; \n"
  " mov.b32 h, %1; \n"
  " cvt.f32.f16 fl, hl; \n"
  " cvt.f32.f16 fu, hu; \n"
- " mov.b32 C, 0x3fb8aa3bU; \n"
- " mul.f32 fl,fl,C; \n"
- " mul.f32 fu,fu,C; \n"
- " ex2.approx.f32 fl, fl; \n"
- " ex2.approx.f32 fu, fu; \n"
+ " mov.b32 C, 0x3fb8aa3bU; \n"
+ " mov.b32 nZ, 0x80000000U;\n"
+ " fma.rn.f32 fl,fl,C,nZ; \n"
+ " fma.rn.f32 fu,fu,C,nZ; \n"
+ " ex2.approx.ftz.f32 fl, fl; \n"
+ " ex2.approx.ftz.f32 fu, fu; \n"
  " cvt.rn.f16.f32 hl, fl; \n"
  " cvt.rn.f16.f32 hu, fu; \n"
  " mov.b32 r, {hl, hu}; \n"
@@ -2083,7 +2246,7 @@ __CUDA_FP16_DECL__ __half hexp2(const __half a) {
  " .reg.b16 r; \n"
  " mov.b16 r,%1; \n"
  " cvt.f32.f16 f,r; \n"
- " ex2.approx.f32 f,f; \n"
+ " ex2.approx.ftz.f32 f,f; \n"
  " mov.b32 ULP, 0x33800000U;\n"
  " fma.rn.f32 f,f,ULP,f; \n"
  " cvt.rn.f16.f32 r,f; \n"
@@ -2098,8 +2261,8 @@ __CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) {
  " mov.b32 {hl, hu}, %1; \n"
  " cvt.f32.f16 fl, hl; \n"
  " cvt.f32.f16 fu, hu; \n"
- " ex2.approx.f32 fl, fl; \n"
- " ex2.approx.f32 fu, fu; \n"
+ " ex2.approx.ftz.f32 fl, fl; \n"
+ " ex2.approx.ftz.f32 fu, fu; \n"
  " mov.b32 ULP, 0x33800000U;\n"
  " fma.rn.f32 fl,fl,ULP,fl; \n"
  " fma.rn.f32 fu,fu,ULP,fu; \n"
@@ -2112,12 +2275,13 @@ __CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) {
  __CUDA_FP16_DECL__ __half hexp10(const __half a) {
  __half val;
  asm("{.reg.b16 h,r; \n"
- " .reg.b32 f, C; \n"
+ " .reg.b32 f, C, nZ; \n"
  " mov.b16 h, %1; \n"
  " cvt.f32.f16 f, h; \n"
- " mov.b32 C, 0x40549A78U; \n"
- " mul.f32 f,f,C; \n"
- " ex2.approx.f32 f, f; \n"
+ " mov.b32 C, 0x40549A78U; \n"
+ " mov.b32 nZ, 0x80000000U;\n"
+ " fma.rn.f32 f,f,C,nZ; \n"
+ " ex2.approx.ftz.f32 f, f; \n"
  " cvt.rn.f16.f32 r, f; \n"
  __SPEC_CASE(h, r, 0x34DEU, 0x9800U)
  __SPEC_CASE(h, r, 0x9766U, 0x9000U)
@@ -2131,16 +2295,17 @@ __CUDA_FP16_DECL__ __half hexp10(const __half a) {
  __CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) {
  __half2 val;
  asm("{.reg.b16 hl, hu; \n"
- " .reg.b32 h,r,fl,fu, C; \n"
+ " .reg.b32 h,r,fl,fu,C,nZ; \n"
  " mov.b32 {hl, hu}, %1; \n"
  " mov.b32 h, %1; \n"
  " cvt.f32.f16 fl, hl; \n"
  " cvt.f32.f16 fu, hu; \n"
- " mov.b32 C, 0x40549A78U; \n"
- " mul.f32 fl,fl,C; \n"
- " mul.f32 fu,fu,C; \n"
- " ex2.approx.f32 fl, fl; \n"
- " ex2.approx.f32 fu, fu; \n"
+ " mov.b32 C, 0x40549A78U; \n"
+ " mov.b32 nZ, 0x80000000U;\n"
+ " fma.rn.f32 fl,fl,C,nZ; \n"
+ " fma.rn.f32 fu,fu,C,nZ; \n"
+ " ex2.approx.ftz.f32 fl, fl; \n"
+ " ex2.approx.ftz.f32 fu, fu; \n"
  " cvt.rn.f16.f32 hl, fl; \n"
  " cvt.rn.f16.f32 hu, fu; \n"
  " mov.b32 r, {hl, hu}; \n"
@@ -2159,7 +2324,7 @@ __CUDA_FP16_DECL__ __half hlog2(const __half a) {
  " .reg.b32 f; \n"
  " mov.b16 h, %1; \n"
  " cvt.f32.f16 f, h; \n"
- " lg2.approx.f32 f, f; \n"
+ " lg2.approx.ftz.f32 f, f; \n"
  " cvt.rn.f16.f32 r, f; \n"
  __SPEC_CASE(r, r, 0xA2E2U, 0x8080U)
  __SPEC_CASE(r, r, 0xBF46U, 0x9400U)
@@ -2174,8 +2339,8 @@ __CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) {
  " mov.b32 {hl, hu}, %1; \n"
  " cvt.f32.f16 fl, hl; \n"
  " cvt.f32.f16 fu, hu; \n"
- " lg2.approx.f32 fl, fl; \n"
- " lg2.approx.f32 fu, fu; \n"
+ " lg2.approx.ftz.f32 fl, fl; \n"
+ " lg2.approx.ftz.f32 fu, fu; \n"
  " cvt.rn.f16.f32 hl, fl; \n"
  " cvt.rn.f16.f32 hu, fu; \n"
  " mov.b32 r, {hl, hu}; \n"
@@ -2191,7 +2356,7 @@ __CUDA_FP16_DECL__ __half hlog(const __half a) {
  " .reg.b16 r,h; \n"
  " mov.b16 h,%1; \n"
  " cvt.f32.f16 f,h; \n"
- " lg2.approx.f32 f,f; \n"
+ " lg2.approx.ftz.f32 f,f; \n"
  " mov.b32 C, 0x3f317218U; \n"
  " mul.f32 f,f,C; \n"
  " cvt.rn.f16.f32 r,f; \n"
@@ -2211,8 +2376,8 @@ __CUDA_FP16_DECL__ __half2 h2log(const __half2 a) {
  " mov.b32 h, %1; \n"
  " cvt.f32.f16 fl, hl; \n"
  " cvt.f32.f16 fu, hu; \n"
- " lg2.approx.f32 fl, fl; \n"
- " lg2.approx.f32 fu, fu; \n"
+ " lg2.approx.ftz.f32 fl, fl; \n"
+ " lg2.approx.ftz.f32 fu, fu; \n"
  " mov.b32 C, 0x3f317218U; \n"
  " mul.f32 fl,fl,C; \n"
  " mul.f32 fu,fu,C; \n"
@@ -2233,7 +2398,7 @@ __CUDA_FP16_DECL__ __half hlog10(const __half a) {
  " .reg.b32 f, C; \n"
  " mov.b16 h, %1; \n"
  " cvt.f32.f16 f, h; \n"
- " lg2.approx.f32 f, f; \n"
+ " lg2.approx.ftz.f32 f, f; \n"
  " mov.b32 C, 0x3E9A209BU; \n"
  " mul.f32 f,f,C; \n"
  " cvt.rn.f16.f32 r, f; \n"
@@ -2253,8 +2418,8 @@ __CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) {
  " mov.b32 h, %1; \n"
  " cvt.f32.f16 fl, hl; \n"
  " cvt.f32.f16 fu, hu; \n"
- " lg2.approx.f32 fl, fl; \n"
- " lg2.approx.f32 fu, fu; \n"
+ " lg2.approx.ftz.f32 fl, fl; \n"
+ " lg2.approx.ftz.f32 fu, fu; \n"
  " mov.b32 C, 0x3E9A209BU; \n"
  " mul.f32 fl,fl,C; \n"
  " mul.f32 fu,fu,C; \n"
@@ -2340,27 +2505,16 @@ __CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __ha
  // (a.re, a.im) * (b.re, b.im) + (c.re, c.im)
  // acc.re = (c.re + a.re*b.re) - a.im*b.im
  // acc.im = (c.im + a.re*b.im) + a.im*b.re
- const __half2 a_re = __half2half2(a.x);
- __half2 acc = __hfma2(a_re, b, c);
- const __half2 a_im = __half2half2(a.y);
- const __half2 ib = __halves2half2(__hneg(b.y), b.x);
- acc = __hfma2(a_im, ib, acc);
- return acc;
+ __half real_tmp = __hfma(a.x, b.x, c.x);
+ __half img_tmp = __hfma(a.x, b.y, c.y);
+ real_tmp = __hfma(__hneg(a.y), b.y, real_tmp);
+ img_tmp = __hfma(a.y, b.x, img_tmp);
+ return make_half2(real_tmp, img_tmp);
  }
- #endif /*__CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/

- #if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)
- /******************************************************************************
- * __half arithmetic *
- ******************************************************************************/
- __CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b)
- {
- __BINARY_OP_HALF_MACRO(max)
- }
- __CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b)
- {
- __BINARY_OP_HALF_MACRO(min)
- }
+ #endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)*/
+
+ #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
  __CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b)
  {
  __BINARY_OP_HALF_MACRO(max.NaN)
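__hcmadd still computes a complex multiply-add over __half2 values treated as (real, imag) pairs; the rewrite simply trades the two packed __hfma2 calls for four scalar __hfma calls following the comment's expansion. Reference semantics in float, for clarity (a sketch):

    #include <complex>

    std::complex<float> cmadd_ref(std::complex<float> a,
                                  std::complex<float> b,
                                  std::complex<float> c)
    {
        return a * b + c;  // re = c.re + a.re*b.re - a.im*b.im, etc.
    }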
@@ -2373,17 +2527,7 @@ __CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __ha
  {
  __TERNARY_OP_HALF_MACRO(fma.rn.relu)
  }
- /******************************************************************************
- * __half2 arithmetic *
- ******************************************************************************/
- __CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b)
- {
- __BINARY_OP_HALF2_MACRO(max)
- }
- __CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b)
- {
- __BINARY_OP_HALF2_MACRO(min)
- }
+
  __CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b)
  {
  __BINARY_OP_HALF2_MACRO(max.NaN)
@@ -2396,7 +2540,7 @@ __CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const
  {
  __TERNARY_OP_HALF2_MACRO(fma.rn.relu)
  }
- #endif /*__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)*/
+ #endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)*/

  /* Define __PTR for atomicAdd prototypes below, undef after done */
  #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
@@ -2444,6 +2588,11 @@ __CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val) {
  #undef __CUDA_HOSTDEVICE_FP16_DECL__
  #undef __CUDA_FP16_DECL__

+ #undef __HALF_TO_US
+ #undef __HALF_TO_CUS
+ #undef __HALF2_TO_UI
+ #undef __HALF2_TO_CUI
+
  /* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */
  /* C cannot ever have these types defined here, because __half and __half2 are C++ classes */
  #if defined(__cplusplus) && !defined(CUDA_NO_HALF)
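The four new #undefs stop the header's private reinterpret-cast helpers from leaking into every including translation unit. A quick probe of the new behavior (hypothetical):

    #include <cuda_fp16.h>

    #ifdef __HALF_TO_US
    #error "cuda_fp16.h should no longer leak its helper macros"
    #endif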