numba-cuda 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numba_cuda/VERSION +1 -1
- numba_cuda/numba/cuda/compiler.py +14 -1
- numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
- numba_cuda/numba/cuda/cuda_paths.py +2 -0
- numba_cuda/numba/cuda/cudadecl.py +0 -42
- numba_cuda/numba/cuda/cudadrv/linkable_code.py +11 -2
- numba_cuda/numba/cuda/cudadrv/nvrtc.py +10 -3
- numba_cuda/numba/cuda/cudaimpl.py +0 -63
- numba_cuda/numba/cuda/debuginfo.py +92 -2
- numba_cuda/numba/cuda/decorators.py +13 -1
- numba_cuda/numba/cuda/device_init.py +4 -5
- numba_cuda/numba/cuda/extending.py +54 -0
- numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
- numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
- numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +550 -387
- numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +465 -316
- numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
- numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
- numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
- numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
- numba_cuda/numba/cuda/intrinsics.py +172 -1
- numba_cuda/numba/cuda/lowering.py +43 -0
- numba_cuda/numba/cuda/stubs.py +0 -11
- numba_cuda/numba/cuda/target.py +28 -0
- numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +4 -2
- numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
- numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +46 -0
- numba_cuda/numba/cuda/tests/cudapy/test_enums.py +18 -0
- numba_cuda/numba/cuda/tests/cudapy/test_extending.py +4 -2
- numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
- numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +1 -1
- numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +50 -5
- numba_cuda/numba/cuda/vector_types.py +3 -1
- numba_cuda/numba/cuda/vectorizers.py +1 -1
- {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
- {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.0.dist-info}/RECORD +42 -32
- {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
- {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {numba_cuda-0.9.0.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,5 @@
|
|
1
1
|
/*
|
2
|
-
* Copyright 1993-
|
2
|
+
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
|
3
3
|
*
|
4
4
|
* NOTICE TO LICENSEE:
|
5
5
|
*
|
@@ -118,25 +118,25 @@
|
|
118
118
|
/* Macros for half & half2 binary arithmetic */
|
119
119
|
#define __BINARY_OP_HALF_MACRO(name) /* do */ {\
|
120
120
|
__half val; \
|
121
|
-
asm( "{"
|
121
|
+
asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2;\n}" \
|
122
122
|
:"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \
|
123
123
|
return val; \
|
124
124
|
} /* while(0) */
|
125
125
|
#define __BINARY_OP_HALF2_MACRO(name) /* do */ {\
|
126
126
|
__half2 val; \
|
127
|
-
asm( "{"
|
127
|
+
asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2;\n}" \
|
128
128
|
:"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
|
129
129
|
return val; \
|
130
130
|
} /* while(0) */
|
131
131
|
#define __TERNARY_OP_HALF_MACRO(name) /* do */ {\
|
132
132
|
__half val; \
|
133
|
-
asm( "{"
|
133
|
+
asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2,%3;\n}" \
|
134
134
|
:"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \
|
135
135
|
return val; \
|
136
136
|
} /* while(0) */
|
137
137
|
#define __TERNARY_OP_HALF2_MACRO(name) /* do */ {\
|
138
138
|
__half2 val; \
|
139
|
-
asm( "{"
|
139
|
+
asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2,%3;\n}" \
|
140
140
|
:"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \
|
141
141
|
return val; \
|
142
142
|
} /* while(0) */
|
@@ -247,7 +247,7 @@ public:
|
|
247
247
|
#if defined(__CUDACC__)
|
248
248
|
|
249
249
|
/* Arithmetic FP16 operations only supported on arch >= 5.3 */
|
250
|
-
#if __CUDA_ARCH__
|
250
|
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
251
251
|
#if !defined(__CUDA_NO_HALF_OPERATORS__)
|
252
252
|
/* Some basic arithmetic operations expected of a builtin */
|
253
253
|
__device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); }
|
@@ -263,8 +263,28 @@ __device__ __forceinline__ __half &operator/=(__half &lh, const __half &rh) { lh
|
|
263
263
|
/* Note for increment and decrement we use the raw value 0x3C00U equating to half(1.0F), to avoid the extra conversion */
|
264
264
|
__device__ __forceinline__ __half &operator++(__half &h) { __half_raw one; one.x = 0x3C00U; h += one; return h; }
|
265
265
|
__device__ __forceinline__ __half &operator--(__half &h) { __half_raw one; one.x = 0x3C00U; h -= one; return h; }
|
266
|
-
__device__ __forceinline__ __half operator++(__half &h, const int ignored)
|
267
|
-
|
266
|
+
__device__ __forceinline__ __half operator++(__half &h, const int ignored)
|
267
|
+
{
|
268
|
+
// ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
|
269
|
+
static_cast<void>(ignored);
|
270
|
+
|
271
|
+
const __half ret = h;
|
272
|
+
__half_raw one;
|
273
|
+
one.x = 0x3C00U;
|
274
|
+
h += one;
|
275
|
+
return ret;
|
276
|
+
}
|
277
|
+
__device__ __forceinline__ __half operator--(__half &h, const int ignored)
|
278
|
+
{
|
279
|
+
// ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
|
280
|
+
static_cast<void>(ignored);
|
281
|
+
|
282
|
+
const __half ret = h;
|
283
|
+
__half_raw one;
|
284
|
+
one.x = 0x3C00U;
|
285
|
+
h -= one;
|
286
|
+
return ret;
|
287
|
+
}
|
268
288
|
|
269
289
|
/* Unary plus and inverse operators */
|
270
290
|
__device__ __forceinline__ __half operator+(const __half &h) { return h; }
|
@@ -278,7 +298,7 @@ __device__ __forceinline__ bool operator< (const __half &lh, const __half &rh) {
|
|
278
298
|
__device__ __forceinline__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); }
|
279
299
|
__device__ __forceinline__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); }
|
280
300
|
#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */
|
281
|
-
#endif /* __CUDA_ARCH__
|
301
|
+
#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) */
|
282
302
|
#endif /* defined(__CUDACC__) */
|
283
303
|
|
284
304
|
/* __half2 is visible to non-nvcc host compilers */
|
@@ -309,7 +329,7 @@ public:
|
|
309
329
|
#if defined(__CUDACC__)
|
310
330
|
|
311
331
|
/* Arithmetic FP16x2 operations only supported on arch >= 5.3 */
|
312
|
-
#if (__CUDA_ARCH__
|
332
|
+
#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) && !defined(__CUDA_NO_HALF2_OPERATORS__)
|
313
333
|
|
314
334
|
__device__ __forceinline__ __half2 operator+(const __half2 &lh, const __half2 &rh) { return __hadd2(lh, rh); }
|
315
335
|
__device__ __forceinline__ __half2 operator-(const __half2 &lh, const __half2 &rh) { return __hsub2(lh, rh); }
|
@@ -323,8 +343,30 @@ __device__ __forceinline__ __half2& operator/=(__half2 &lh, const __half2 &rh) {
|
|
323
343
|
|
324
344
|
__device__ __forceinline__ __half2 &operator++(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hadd2(h, one); return h; }
|
325
345
|
__device__ __forceinline__ __half2 &operator--(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hsub2(h, one); return h; }
|
326
|
-
__device__ __forceinline__ __half2 operator++(__half2 &h, const int ignored)
|
327
|
-
|
346
|
+
__device__ __forceinline__ __half2 operator++(__half2 &h, const int ignored)
|
347
|
+
{
|
348
|
+
// ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
|
349
|
+
static_cast<void>(ignored);
|
350
|
+
|
351
|
+
const __half2 ret = h;
|
352
|
+
__half2_raw one;
|
353
|
+
one.x = 0x3C00U;
|
354
|
+
one.y = 0x3C00U;
|
355
|
+
h = __hadd2(h, one);
|
356
|
+
return ret;
|
357
|
+
}
|
358
|
+
__device__ __forceinline__ __half2 operator--(__half2 &h, const int ignored)
|
359
|
+
{
|
360
|
+
// ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
|
361
|
+
static_cast<void>(ignored);
|
362
|
+
|
363
|
+
const __half2 ret = h;
|
364
|
+
__half2_raw one;
|
365
|
+
one.x = 0x3C00U;
|
366
|
+
one.y = 0x3C00U;
|
367
|
+
h = __hsub2(h, one);
|
368
|
+
return ret;
|
369
|
+
}
|
328
370
|
|
329
371
|
__device__ __forceinline__ __half2 operator+(const __half2 &h) { return h; }
|
330
372
|
__device__ __forceinline__ __half2 operator-(const __half2 &h) { return __hneg2(h); }
|
@@ -336,7 +378,7 @@ __device__ __forceinline__ bool operator<(const __half2 &lh, const __half2 &rh)
|
|
336
378
|
__device__ __forceinline__ bool operator>=(const __half2 &lh, const __half2 &rh) { return __hbge2(lh, rh); }
|
337
379
|
__device__ __forceinline__ bool operator<=(const __half2 &lh, const __half2 &rh) { return __hble2(lh, rh); }
|
338
380
|
|
339
|
-
#endif /* __CUDA_ARCH__
|
381
|
+
#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) */
|
340
382
|
#endif /* defined(__CUDACC__) */
|
341
383
|
|
342
384
|
/* Restore warning for multiple assignment operators */
|
@@ -388,6 +430,7 @@ static inline unsigned short __internal_float2half(const float f, unsigned int &
|
|
388
430
|
mantissa |= 0x800000U;
|
389
431
|
remainder = mantissa << (32U - shift);
|
390
432
|
result = (sign | (mantissa >> shift));
|
433
|
+
result &= 0x0000FFFFU;
|
391
434
|
}
|
392
435
|
return static_cast<unsigned short>(result);
|
393
436
|
}
|
@@ -401,10 +444,12 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a)
|
|
401
444
|
return val;
|
402
445
|
#else
|
403
446
|
__half result;
|
447
|
+
/*
|
404
448
|
// Perform rounding to 11 bits of precision, convert value
|
405
449
|
// to float and call existing float to half conversion.
|
406
450
|
// By pre-rounding to 11 bits we avoid additional rounding
|
407
451
|
// in float to half conversion.
|
452
|
+
*/
|
408
453
|
unsigned long long int absa;
|
409
454
|
unsigned long long int ua;
|
410
455
|
#if defined(__CUDACC__)
|
@@ -415,12 +460,15 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a)
|
|
415
460
|
absa = (ua & 0x7fffffffffffffffULL);
|
416
461
|
if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL))
|
417
462
|
{
|
463
|
+
/*
|
418
464
|
// |a| >= 2^16 or NaN or |a| <= 2^(-25)
|
419
465
|
// double-rounding is not a problem
|
466
|
+
*/
|
420
467
|
result = __float2half(static_cast<float>(a));
|
421
468
|
}
|
422
469
|
else
|
423
470
|
{
|
471
|
+
/*
|
424
472
|
// here 2^(-25) < |a| < 2^16
|
425
473
|
// prepare shifter value such that a + shifter
|
426
474
|
// done in double precision performs round-to-nearest-even
|
@@ -431,15 +479,22 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a)
|
|
431
479
|
// So need to have |a| capped to avoid overflow in exponent.
|
432
480
|
// For inputs that are smaller than half precision minnorm
|
433
481
|
// we prepare fixed shifter exponent.
|
482
|
+
*/
|
434
483
|
unsigned long long shifterBits;
|
435
484
|
if (absa >= 0x3f10000000000000ULL)
|
436
|
-
{
|
485
|
+
{
|
486
|
+
/*
|
487
|
+
// Here if |a| >= 2^(-14)
|
437
488
|
// add 42 to exponent bits
|
489
|
+
*/
|
438
490
|
shifterBits = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL;
|
439
491
|
}
|
440
492
|
else
|
441
|
-
{
|
493
|
+
{
|
494
|
+
/*
|
495
|
+
// 2^(-25) < |a| < 2^(-14), potentially results in denormal
|
442
496
|
// set exponent bits to 42 - 14 + bias
|
497
|
+
*/
|
443
498
|
shifterBits = 0x41B0000000000000ULL;
|
444
499
|
}
|
445
500
|
// set leading mantissa bit to protect against negative inputs
|
@@ -452,8 +507,10 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a)
|
|
452
507
|
#endif
|
453
508
|
double aShiftRound = a + shifter;
|
454
509
|
|
510
|
+
/*
|
455
511
|
// Prevent the compiler from optimizing away a + shifter - shifter
|
456
512
|
// by doing intermediate memcopy and harmless bitwize operation
|
513
|
+
*/
|
457
514
|
unsigned long long int aShiftRoundBits;
|
458
515
|
#if defined(__CUDACC__)
|
459
516
|
(void)memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound));
|
@@ -575,10 +632,15 @@ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const flo
|
|
575
632
|
{
|
576
633
|
__half2 val;
|
577
634
|
#if defined(__CUDA_ARCH__)
|
635
|
+
#if (__CUDA_ARCH__ >= 800)
|
636
|
+
asm("{ cvt.rn.f16x2.f32 %0, %2, %1; }\n"
|
637
|
+
: "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
|
638
|
+
#else
|
578
639
|
asm("{.reg .f16 low,high;\n"
|
579
640
|
" cvt.rn.f16.f32 low, %1;\n"
|
580
641
|
" cvt.rn.f16.f32 high, %2;\n"
|
581
642
|
" mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
|
643
|
+
#endif
|
582
644
|
#else
|
583
645
|
val = __half2(__float2half_rn(a), __float2half_rn(b));
|
584
646
|
#endif
|
@@ -611,7 +673,7 @@ static inline float __internal_half2float(const unsigned short h)
|
|
611
673
|
} else {
|
612
674
|
exponent += 0x70U;
|
613
675
|
}
|
614
|
-
unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
|
676
|
+
const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
|
615
677
|
#if defined(__CUDACC__)
|
616
678
|
(void)memcpy(&f, &u, sizeof(u));
|
617
679
|
#else
|
@@ -655,6 +717,168 @@ __CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a)
|
|
655
717
|
#endif
|
656
718
|
return val;
|
657
719
|
}
|
720
|
+
__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h)
|
721
|
+
{
|
722
|
+
short int i;
|
723
|
+
#if defined __CUDA_ARCH__
|
724
|
+
asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
|
725
|
+
#else
|
726
|
+
const float f = __half2float(h);
|
727
|
+
const short int max_val = (short int)0x7fffU;
|
728
|
+
const short int min_val = (short int)0x8000U;
|
729
|
+
const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
|
730
|
+
// saturation fixup
|
731
|
+
if (bits > (unsigned short)0xF800U) {
|
732
|
+
// NaN
|
733
|
+
i = 0;
|
734
|
+
} else if (f > static_cast<float>(max_val)) {
|
735
|
+
// saturate maximum
|
736
|
+
i = max_val;
|
737
|
+
} else if (f < static_cast<float>(min_val)) {
|
738
|
+
// saturate minimum
|
739
|
+
i = min_val;
|
740
|
+
} else {
|
741
|
+
// normal value, conversion is well-defined
|
742
|
+
i = static_cast<short int>(f);
|
743
|
+
}
|
744
|
+
#endif
|
745
|
+
return i;
|
746
|
+
}
|
747
|
+
__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h)
|
748
|
+
{
|
749
|
+
unsigned short int i;
|
750
|
+
#if defined __CUDA_ARCH__
|
751
|
+
asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
|
752
|
+
#else
|
753
|
+
const float f = __half2float(h);
|
754
|
+
const unsigned short int max_val = 0xffffU;
|
755
|
+
const unsigned short int min_val = 0U;
|
756
|
+
const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
|
757
|
+
// saturation fixup
|
758
|
+
if (bits > (unsigned short)0xF800U) {
|
759
|
+
// NaN
|
760
|
+
i = 0U;
|
761
|
+
} else if (f > static_cast<float>(max_val)) {
|
762
|
+
// saturate maximum
|
763
|
+
i = max_val;
|
764
|
+
} else if (f < static_cast<float>(min_val)) {
|
765
|
+
// saturate minimum
|
766
|
+
i = min_val;
|
767
|
+
} else {
|
768
|
+
// normal value, conversion is well-defined
|
769
|
+
i = static_cast<unsigned short int>(f);
|
770
|
+
}
|
771
|
+
#endif
|
772
|
+
return i;
|
773
|
+
}
|
774
|
+
__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h)
|
775
|
+
{
|
776
|
+
int i;
|
777
|
+
#if defined __CUDA_ARCH__
|
778
|
+
asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
|
779
|
+
#else
|
780
|
+
const float f = __half2float(h);
|
781
|
+
const int max_val = (int)0x7fffffffU;
|
782
|
+
const int min_val = (int)0x80000000U;
|
783
|
+
const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
|
784
|
+
// saturation fixup
|
785
|
+
if (bits > (unsigned short)0xF800U) {
|
786
|
+
// NaN
|
787
|
+
i = 0;
|
788
|
+
} else if (f > static_cast<float>(max_val)) {
|
789
|
+
// saturate maximum
|
790
|
+
i = max_val;
|
791
|
+
} else if (f < static_cast<float>(min_val)) {
|
792
|
+
// saturate minimum
|
793
|
+
i = min_val;
|
794
|
+
} else {
|
795
|
+
// normal value, conversion is well-defined
|
796
|
+
i = static_cast<int>(f);
|
797
|
+
}
|
798
|
+
#endif
|
799
|
+
return i;
|
800
|
+
}
|
801
|
+
__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h)
|
802
|
+
{
|
803
|
+
unsigned int i;
|
804
|
+
#if defined __CUDA_ARCH__
|
805
|
+
asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
|
806
|
+
#else
|
807
|
+
const float f = __half2float(h);
|
808
|
+
const unsigned int max_val = 0xffffffffU;
|
809
|
+
const unsigned int min_val = 0U;
|
810
|
+
const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
|
811
|
+
// saturation fixup
|
812
|
+
if (bits > (unsigned short)0xF800U) {
|
813
|
+
// NaN
|
814
|
+
i = 0U;
|
815
|
+
} else if (f > static_cast<float>(max_val)) {
|
816
|
+
// saturate maximum
|
817
|
+
i = max_val;
|
818
|
+
} else if (f < static_cast<float>(min_val)) {
|
819
|
+
// saturate minimum
|
820
|
+
i = min_val;
|
821
|
+
} else {
|
822
|
+
// normal value, conversion is well-defined
|
823
|
+
i = static_cast<unsigned int>(f);
|
824
|
+
}
|
825
|
+
#endif
|
826
|
+
return i;
|
827
|
+
}
|
828
|
+
__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h)
|
829
|
+
{
|
830
|
+
long long int i;
|
831
|
+
#if defined __CUDA_ARCH__
|
832
|
+
asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
|
833
|
+
#else
|
834
|
+
const float f = __half2float(h);
|
835
|
+
const long long int max_val = (long long int)0x7fffffffffffffffULL;
|
836
|
+
const long long int min_val = (long long int)0x8000000000000000ULL;
|
837
|
+
const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
|
838
|
+
// saturation fixup
|
839
|
+
if (bits > (unsigned short)0xF800U) {
|
840
|
+
// NaN
|
841
|
+
i = min_val;
|
842
|
+
} else if (f > static_cast<float>(max_val)) {
|
843
|
+
// saturate maximum
|
844
|
+
i = max_val;
|
845
|
+
} else if (f < static_cast<float>(min_val)) {
|
846
|
+
// saturate minimum
|
847
|
+
i = min_val;
|
848
|
+
} else {
|
849
|
+
// normal value, conversion is well-defined
|
850
|
+
i = static_cast<long long int>(f);
|
851
|
+
}
|
852
|
+
#endif
|
853
|
+
return i;
|
854
|
+
}
|
855
|
+
__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h)
|
856
|
+
{
|
857
|
+
unsigned long long int i;
|
858
|
+
#if defined __CUDA_ARCH__
|
859
|
+
asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
|
860
|
+
#else
|
861
|
+
const float f = __half2float(h);
|
862
|
+
const unsigned long long int max_val = 0xffffffffffffffffULL;
|
863
|
+
const unsigned long long int min_val = 0ULL;
|
864
|
+
const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
|
865
|
+
// saturation fixup
|
866
|
+
if (bits > (unsigned short)0xF800U) {
|
867
|
+
// NaN
|
868
|
+
i = 0x8000000000000000ULL;
|
869
|
+
} else if (f > static_cast<float>(max_val)) {
|
870
|
+
// saturate maximum
|
871
|
+
i = max_val;
|
872
|
+
} else if (f < static_cast<float>(min_val)) {
|
873
|
+
// saturate minimum
|
874
|
+
i = min_val;
|
875
|
+
} else {
|
876
|
+
// normal value, conversion is well-defined
|
877
|
+
i = static_cast<unsigned long long int>(f);
|
878
|
+
}
|
879
|
+
#endif
|
880
|
+
return i;
|
881
|
+
}
|
658
882
|
|
659
883
|
/* Intrinsic functions only available to nvcc compilers */
|
660
884
|
#if defined(__CUDACC__)
|
@@ -697,30 +921,6 @@ __CUDA_FP16_DECL__ int __half2int_rn(const __half h)
|
|
697
921
|
asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
|
698
922
|
return i;
|
699
923
|
}
|
700
|
-
__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h)
|
701
|
-
{
|
702
|
-
int i;
|
703
|
-
#if defined __CUDA_ARCH__
|
704
|
-
asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
|
705
|
-
#else
|
706
|
-
const float f = __half2float(h);
|
707
|
-
i = static_cast<int>(f);
|
708
|
-
const int max_val = (int)0x7fffffffU;
|
709
|
-
const int min_val = (int)0x80000000U;
|
710
|
-
// saturation fixup
|
711
|
-
if (f != f) {
|
712
|
-
// NaN
|
713
|
-
i = 0;
|
714
|
-
} else if (f > static_cast<float>(max_val)) {
|
715
|
-
// saturate maximum
|
716
|
-
i = max_val;
|
717
|
-
} else if (f < static_cast<float>(min_val)) {
|
718
|
-
// saturate minimum
|
719
|
-
i = min_val;
|
720
|
-
}
|
721
|
-
#endif
|
722
|
-
return i;
|
723
|
-
}
|
724
924
|
__CUDA_FP16_DECL__ int __half2int_rd(const __half h)
|
725
925
|
{
|
726
926
|
int i;
|
@@ -773,30 +973,6 @@ __CUDA_FP16_DECL__ short int __half2short_rn(const __half h)
|
|
773
973
|
asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
|
774
974
|
return i;
|
775
975
|
}
|
776
|
-
__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h)
|
777
|
-
{
|
778
|
-
short int i;
|
779
|
-
#if defined __CUDA_ARCH__
|
780
|
-
asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
|
781
|
-
#else
|
782
|
-
const float f = __half2float(h);
|
783
|
-
i = static_cast<short int>(f);
|
784
|
-
const short int max_val = (short int)0x7fffU;
|
785
|
-
const short int min_val = (short int)0x8000U;
|
786
|
-
// saturation fixup
|
787
|
-
if (f != f) {
|
788
|
-
// NaN
|
789
|
-
i = 0;
|
790
|
-
} else if (f > static_cast<float>(max_val)) {
|
791
|
-
// saturate maximum
|
792
|
-
i = max_val;
|
793
|
-
} else if (f < static_cast<float>(min_val)) {
|
794
|
-
// saturate minimum
|
795
|
-
i = min_val;
|
796
|
-
}
|
797
|
-
#endif
|
798
|
-
return i;
|
799
|
-
}
|
800
976
|
__CUDA_FP16_DECL__ short int __half2short_rd(const __half h)
|
801
977
|
{
|
802
978
|
short int i;
|
@@ -845,30 +1021,6 @@ __CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h)
|
|
845
1021
|
asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
|
846
1022
|
return i;
|
847
1023
|
}
|
848
|
-
__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h)
|
849
|
-
{
|
850
|
-
unsigned int i;
|
851
|
-
#if defined __CUDA_ARCH__
|
852
|
-
asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
|
853
|
-
#else
|
854
|
-
const float f = __half2float(h);
|
855
|
-
i = static_cast<unsigned int>(f);
|
856
|
-
const unsigned int max_val = 0xffffffffU;
|
857
|
-
const unsigned int min_val = 0U;
|
858
|
-
// saturation fixup
|
859
|
-
if (f != f) {
|
860
|
-
// NaN
|
861
|
-
i = 0U;
|
862
|
-
} else if (f > static_cast<float>(max_val)) {
|
863
|
-
// saturate maximum
|
864
|
-
i = max_val;
|
865
|
-
} else if (f < static_cast<float>(min_val)) {
|
866
|
-
// saturate minimum
|
867
|
-
i = min_val;
|
868
|
-
}
|
869
|
-
#endif
|
870
|
-
return i;
|
871
|
-
}
|
872
1024
|
__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h)
|
873
1025
|
{
|
874
1026
|
unsigned int i;
|
@@ -921,30 +1073,6 @@ __CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h)
|
|
921
1073
|
asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
|
922
1074
|
return i;
|
923
1075
|
}
|
924
|
-
__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h)
|
925
|
-
{
|
926
|
-
unsigned short int i;
|
927
|
-
#if defined __CUDA_ARCH__
|
928
|
-
asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
|
929
|
-
#else
|
930
|
-
const float f = __half2float(h);
|
931
|
-
i = static_cast<unsigned short int>(f);
|
932
|
-
const unsigned short int max_val = 0xffffU;
|
933
|
-
const unsigned short int min_val = 0U;
|
934
|
-
// saturation fixup
|
935
|
-
if (f != f) {
|
936
|
-
// NaN
|
937
|
-
i = 0U;
|
938
|
-
} else if (f > static_cast<float>(max_val)) {
|
939
|
-
// saturate maximum
|
940
|
-
i = max_val;
|
941
|
-
} else if (f < static_cast<float>(min_val)) {
|
942
|
-
// saturate minimum
|
943
|
-
i = min_val;
|
944
|
-
}
|
945
|
-
#endif
|
946
|
-
return i;
|
947
|
-
}
|
948
1076
|
__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h)
|
949
1077
|
{
|
950
1078
|
unsigned short int i;
|
@@ -993,30 +1121,6 @@ __CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h)
|
|
993
1121
|
asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
|
994
1122
|
return i;
|
995
1123
|
}
|
996
|
-
__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h)
|
997
|
-
{
|
998
|
-
unsigned long long int i;
|
999
|
-
#if defined __CUDA_ARCH__
|
1000
|
-
asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
|
1001
|
-
#else
|
1002
|
-
const float f = __half2float(h);
|
1003
|
-
i = static_cast<unsigned long long int>(f);
|
1004
|
-
const unsigned long long int max_val = 0xffffffffffffffffULL;
|
1005
|
-
const unsigned long long int min_val = 0ULL;
|
1006
|
-
// saturation fixup
|
1007
|
-
if (f != f) {
|
1008
|
-
// NaN
|
1009
|
-
i = 0x8000000000000000ULL;
|
1010
|
-
} else if (f > static_cast<float>(max_val)) {
|
1011
|
-
// saturate maximum
|
1012
|
-
i = max_val;
|
1013
|
-
} else if (f < static_cast<float>(min_val)) {
|
1014
|
-
// saturate minimum
|
1015
|
-
i = min_val;
|
1016
|
-
}
|
1017
|
-
#endif
|
1018
|
-
return i;
|
1019
|
-
}
|
1020
1124
|
__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h)
|
1021
1125
|
{
|
1022
1126
|
unsigned long long int i;
|
@@ -1069,30 +1173,6 @@ __CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h)
|
|
1069
1173
|
asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
|
1070
1174
|
return i;
|
1071
1175
|
}
|
1072
|
-
__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h)
|
1073
|
-
{
|
1074
|
-
long long int i;
|
1075
|
-
#if defined __CUDA_ARCH__
|
1076
|
-
asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
|
1077
|
-
#else
|
1078
|
-
const float f = __half2float(h);
|
1079
|
-
i = static_cast<long long int>(f);
|
1080
|
-
const long long int max_val = (long long int)0x7fffffffffffffffULL;
|
1081
|
-
const long long int min_val = (long long int)0x8000000000000000ULL;
|
1082
|
-
// saturation fixup
|
1083
|
-
if (f != f) {
|
1084
|
-
// NaN
|
1085
|
-
i = min_val;
|
1086
|
-
} else if (f > static_cast<float>(max_val)) {
|
1087
|
-
// saturate maximum
|
1088
|
-
i = max_val;
|
1089
|
-
} else if (f < static_cast<float>(min_val)) {
|
1090
|
-
// saturate minimum
|
1091
|
-
i = min_val;
|
1092
|
-
}
|
1093
|
-
#endif
|
1094
|
-
return i;
|
1095
|
-
}
|
1096
1176
|
__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h)
|
1097
1177
|
{
|
1098
1178
|
long long int i;
|
@@ -1309,20 +1389,89 @@ __CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i)
|
|
1309
1389
|
return h;
|
1310
1390
|
}
|
1311
1391
|
|
1312
|
-
|
1392
|
+
/******************************************************************************
|
1393
|
+
* __half arithmetic *
|
1394
|
+
******************************************************************************/
|
1395
|
+
__CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b)
|
1396
|
+
{
|
1397
|
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
|
1398
|
+
__BINARY_OP_HALF_MACRO(max)
|
1399
|
+
#else
|
1400
|
+
const float fa = __half2float(a);
|
1401
|
+
const float fb = __half2float(b);
|
1402
|
+
float fr;
|
1403
|
+
asm("{max.f32 %0,%1,%2;\n}"
|
1404
|
+
:"=f"(fr) : "f"(fa), "f"(fb));
|
1405
|
+
const __half hr = __float2half(fr);
|
1406
|
+
return hr;
|
1407
|
+
#endif
|
1408
|
+
}
|
1409
|
+
__CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b)
|
1410
|
+
{
|
1411
|
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
|
1412
|
+
__BINARY_OP_HALF_MACRO(min)
|
1413
|
+
#else
|
1414
|
+
const float fa = __half2float(a);
|
1415
|
+
const float fb = __half2float(b);
|
1416
|
+
float fr;
|
1417
|
+
asm("{min.f32 %0,%1,%2;\n}"
|
1418
|
+
:"=f"(fr) : "f"(fa), "f"(fb));
|
1419
|
+
const __half hr = __float2half(fr);
|
1420
|
+
return hr;
|
1421
|
+
#endif
|
1422
|
+
}
|
1423
|
+
|
1424
|
+
/******************************************************************************
|
1425
|
+
* __half2 arithmetic *
|
1426
|
+
******************************************************************************/
|
1427
|
+
__CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b)
|
1428
|
+
{
|
1429
|
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
|
1430
|
+
__BINARY_OP_HALF2_MACRO(max)
|
1431
|
+
#else
|
1432
|
+
const float2 fa = __half22float2(a);
|
1433
|
+
const float2 fb = __half22float2(b);
|
1434
|
+
float2 fr;
|
1435
|
+
asm("{max.f32 %0,%1,%2;\n}"
|
1436
|
+
:"=f"(fr.x) : "f"(fa.x), "f"(fb.x));
|
1437
|
+
asm("{max.f32 %0,%1,%2;\n}"
|
1438
|
+
:"=f"(fr.y) : "f"(fa.y), "f"(fb.y));
|
1439
|
+
const __half2 hr = __float22half2_rn(fr);
|
1440
|
+
return hr;
|
1441
|
+
#endif
|
1442
|
+
}
|
1443
|
+
__CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b)
|
1444
|
+
{
|
1445
|
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
|
1446
|
+
__BINARY_OP_HALF2_MACRO(min)
|
1447
|
+
#else
|
1448
|
+
const float2 fa = __half22float2(a);
|
1449
|
+
const float2 fb = __half22float2(b);
|
1450
|
+
float2 fr;
|
1451
|
+
asm("{min.f32 %0,%1,%2;\n}"
|
1452
|
+
:"=f"(fr.x) : "f"(fa.x), "f"(fb.x));
|
1453
|
+
asm("{min.f32 %0,%1,%2;\n}"
|
1454
|
+
:"=f"(fr.y) : "f"(fa.y), "f"(fb.y));
|
1455
|
+
const __half2 hr = __float22half2_rn(fr);
|
1456
|
+
return hr;
|
1457
|
+
#endif
|
1458
|
+
}
|
1459
|
+
|
1460
|
+
|
1461
|
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)
|
1313
1462
|
/******************************************************************************
|
1314
1463
|
* __half, __half2 warp shuffle *
|
1315
1464
|
******************************************************************************/
|
1316
1465
|
#define __SHUFFLE_HALF2_MACRO(name) /* do */ {\
|
1317
1466
|
__half2 r; \
|
1318
|
-
asm volatile ("{"
|
1467
|
+
asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3;\n}" \
|
1319
1468
|
:"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \
|
1320
1469
|
return r; \
|
1321
1470
|
} /* while(0) */
|
1322
1471
|
|
1323
1472
|
#define __SHUFFLE_SYNC_HALF2_MACRO(name) /* do */ {\
|
1324
1473
|
__half2 r; \
|
1325
|
-
asm volatile ("{"
|
1474
|
+
asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \
|
1326
1475
|
:"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \
|
1327
1476
|
return r; \
|
1328
1477
|
} /* while(0) */
|
@@ -1446,12 +1595,12 @@ __CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned mask, const __half var,
|
|
1446
1595
|
return __low2half(temp2);
|
1447
1596
|
}
|
1448
1597
|
|
1449
|
-
#endif
|
1598
|
+
#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)*/
|
1450
1599
|
/******************************************************************************
|
1451
1600
|
* __half and __half2 __ldg,__ldcg,__ldca,__ldcs *
|
1452
1601
|
******************************************************************************/
|
1453
1602
|
|
1454
|
-
#if defined(__cplusplus) && (__CUDA_ARCH__
|
1603
|
+
#if defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))
|
1455
1604
|
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
|
1456
1605
|
#define __LDG_PTR "l"
|
1457
1606
|
#else
|
@@ -1562,14 +1711,14 @@ __CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value)
|
|
1562
1711
|
asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory");
|
1563
1712
|
}
|
1564
1713
|
#undef __LDG_PTR
|
1565
|
-
#endif /*defined(__cplusplus) && (__CUDA_ARCH__
|
1566
|
-
#if __CUDA_ARCH__
|
1714
|
+
#endif /*defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))*/
|
1715
|
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
|
1567
1716
|
/******************************************************************************
|
1568
1717
|
* __half2 comparison *
|
1569
1718
|
******************************************************************************/
|
1570
1719
|
#define __COMPARISON_OP_HALF2_MACRO(name) /* do */ {\
|
1571
1720
|
__half2 val; \
|
1572
|
-
asm( "{ "
|
1721
|
+
asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \
|
1573
1722
|
:"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
|
1574
1723
|
return val; \
|
1575
1724
|
} /* while(0) */
|
@@ -1625,7 +1774,7 @@ __CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b)
|
|
1625
1774
|
#define __BOOL_COMPARISON_OP_HALF2_MACRO(name) /* do */ {\
|
1626
1775
|
__half2 val; \
|
1627
1776
|
bool retval; \
|
1628
|
-
asm( "{ "
|
1777
|
+
asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \
|
1629
1778
|
:"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
|
1630
1779
|
if (__HALF2_TO_CUI(val) == 0x3C003C00U) {\
|
1631
1780
|
retval = true; \
|
@@ -1689,7 +1838,7 @@ __CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b)
|
|
1689
1838
|
#define __COMPARISON_OP_HALF_MACRO(name) /* do */ {\
|
1690
1839
|
unsigned short val; \
|
1691
1840
|
asm( "{ .reg .pred __$temp3;\n" \
|
1692
|
-
" setp."
|
1841
|
+
" setp." __CUDA_FP16_STRINGIFY(name) ".f16 __$temp3, %1, %2;\n" \
|
1693
1842
|
" selp.u16 %0, 1, 0, __$temp3;}" \
|
1694
1843
|
: "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \
|
1695
1844
|
return (val != 0U) ? true : false; \
|
@@ -1770,6 +1919,18 @@ __CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b)
|
|
1770
1919
|
{
|
1771
1920
|
__BINARY_OP_HALF2_MACRO(mul.sat)
|
1772
1921
|
}
|
1922
|
+
__CUDA_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b)
|
1923
|
+
{
|
1924
|
+
__BINARY_OP_HALF2_MACRO(add.rn)
|
1925
|
+
}
|
1926
|
+
__CUDA_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b)
|
1927
|
+
{
|
1928
|
+
__BINARY_OP_HALF2_MACRO(sub.rn)
|
1929
|
+
}
|
1930
|
+
__CUDA_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b)
|
1931
|
+
{
|
1932
|
+
__BINARY_OP_HALF2_MACRO(mul.rn)
|
1933
|
+
}
|
1773
1934
|
__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c)
|
1774
1935
|
{
|
1775
1936
|
__TERNARY_OP_HALF2_MACRO(fma.rn)
|
@@ -1818,7 +1979,18 @@ __CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b)
|
|
1818
1979
|
{
|
1819
1980
|
__BINARY_OP_HALF_MACRO(mul.sat)
|
1820
1981
|
}
|
1821
|
-
|
1982
|
+
__CUDA_FP16_DECL__ __half __hadd_rn(const __half a, const __half b)
|
1983
|
+
{
|
1984
|
+
__BINARY_OP_HALF_MACRO(add.rn)
|
1985
|
+
}
|
1986
|
+
__CUDA_FP16_DECL__ __half __hsub_rn(const __half a, const __half b)
|
1987
|
+
{
|
1988
|
+
__BINARY_OP_HALF_MACRO(sub.rn)
|
1989
|
+
}
|
1990
|
+
__CUDA_FP16_DECL__ __half __hmul_rn(const __half a, const __half b)
|
1991
|
+
{
|
1992
|
+
__BINARY_OP_HALF_MACRO(mul.rn)
|
1993
|
+
}
|
1822
1994
|
__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c)
|
1823
1995
|
{
|
1824
1996
|
__TERNARY_OP_HALF_MACRO(fma.rn)
|
@@ -1856,23 +2028,23 @@ __CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b) {
|
|
1856
2028
|
******************************************************************************/
|
1857
2029
|
#define __SPEC_CASE2(i,r, spc, ulp) \
|
1858
2030
|
"{.reg.b32 spc, ulp, p;\n"\
|
1859
|
-
" mov.b32 spc,"
|
1860
|
-
" mov.b32 ulp,"
|
1861
|
-
" set.eq.f16x2.f16x2 p,"
|
1862
|
-
" fma.rn.f16x2 "
|
2031
|
+
" mov.b32 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\
|
2032
|
+
" mov.b32 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\
|
2033
|
+
" set.eq.f16x2.f16x2 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\
|
2034
|
+
" fma.rn.f16x2 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n"
|
1863
2035
|
#define __SPEC_CASE(i,r, spc, ulp) \
|
1864
2036
|
"{.reg.b16 spc, ulp, p;\n"\
|
1865
|
-
" mov.b16 spc,"
|
1866
|
-
" mov.b16 ulp,"
|
1867
|
-
" set.eq.f16.f16 p,"
|
1868
|
-
" fma.rn.f16 "
|
2037
|
+
" mov.b16 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\
|
2038
|
+
" mov.b16 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\
|
2039
|
+
" set.eq.f16.f16 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\
|
2040
|
+
" fma.rn.f16 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n"
|
1869
2041
|
#define __APPROX_FCAST(fun) /* do */ {\
|
1870
2042
|
__half val;\
|
1871
2043
|
asm("{.reg.b32 f; \n"\
|
1872
2044
|
" .reg.b16 r; \n"\
|
1873
2045
|
" mov.b16 r,%1; \n"\
|
1874
2046
|
" cvt.f32.f16 f,r; \n"\
|
1875
|
-
" "
|
2047
|
+
" " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 f,f; \n"\
|
1876
2048
|
" cvt.rn.f16.f32 r,f; \n"\
|
1877
2049
|
" mov.b16 %0,r; \n"\
|
1878
2050
|
"}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\
|
@@ -1885,8 +2057,8 @@ __CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b) {
|
|
1885
2057
|
" mov.b32 {hl, hu}, %1; \n"\
|
1886
2058
|
" cvt.f32.f16 fl, hl; \n"\
|
1887
2059
|
" cvt.f32.f16 fu, hu; \n"\
|
1888
|
-
" "
|
1889
|
-
" "
|
2060
|
+
" " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fl, fl; \n"\
|
2061
|
+
" " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fu, fu; \n"\
|
1890
2062
|
" cvt.rn.f16.f32 hl, fl; \n"\
|
1891
2063
|
" cvt.rn.f16.f32 hu, fu; \n"\
|
1892
2064
|
" mov.b32 %0, {hl, hu}; \n"\
|
@@ -1895,129 +2067,122 @@ __CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b) {
|
|
1895
2067
|
} /* while(0) */
|
1896
2068
|
static __device__ __forceinline__ float __float_simpl_sinf(float a);
|
1897
2069
|
static __device__ __forceinline__ float __float_simpl_cosf(float a);
|
1898
|
-
__CUDA_FP16_DECL__ __half __hsin_internal(const __half a) {
|
1899
|
-
float f = __half2float(a);
|
1900
|
-
f = __float_simpl_sinf(f);
|
1901
|
-
return __float2half_rn(f);
|
1902
|
-
}
|
1903
2070
|
__CUDA_FP16_DECL__ __half hsin(const __half a) {
|
1904
|
-
|
2071
|
+
const float sl = __float_simpl_sinf(__half2float(a));
|
2072
|
+
__half r = __float2half_rn(sl);
|
1905
2073
|
asm("{\n\t"
|
1906
2074
|
" .reg.b16 i,r,t; \n\t"
|
1907
2075
|
" mov.b16 r, %0; \n\t"
|
1908
2076
|
" mov.b16 i, %1; \n\t"
|
1909
|
-
"
|
1910
|
-
"
|
2077
|
+
" and.b16 t, r, 0x8000U; \n\t"
|
2078
|
+
" abs.f16 r, r; \n\t"
|
2079
|
+
" abs.f16 i, i; \n\t"
|
1911
2080
|
__SPEC_CASE(i, r, 0X32B3U, 0x0800U)
|
1912
|
-
__SPEC_CASE(i, r, 0X5CB0U,
|
1913
|
-
__SPEC_CASE(i, r, 0XB2B3U, 0x8800U)
|
1914
|
-
__SPEC_CASE(i, r, 0XDCB0U, 0x9000U)
|
2081
|
+
__SPEC_CASE(i, r, 0X5CB0U, 0x9000U)
|
1915
2082
|
" or.b16 r,r,t; \n\t"
|
1916
2083
|
" mov.b16 %0, r; \n"
|
1917
2084
|
"}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
|
1918
2085
|
return r;
|
1919
2086
|
}
|
1920
2087
|
__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) {
|
1921
|
-
const
|
1922
|
-
const
|
1923
|
-
|
1924
|
-
const __half sh = __hsin_internal(h);
|
1925
|
-
__half2 r = __halves2half2(sl, sh);
|
2088
|
+
const float sl = __float_simpl_sinf(__half2float(a.x));
|
2089
|
+
const float sh = __float_simpl_sinf(__half2float(a.y));
|
2090
|
+
__half2 r = __floats2half2_rn(sl, sh);
|
1926
2091
|
asm("{\n\t"
|
1927
2092
|
" .reg.b32 i,r,t; \n\t"
|
1928
2093
|
" mov.b32 r, %0; \n\t"
|
1929
2094
|
" mov.b32 i, %1; \n\t"
|
1930
2095
|
" and.b32 t, r, 0x80008000U; \n\t"
|
2096
|
+
" abs.f16x2 r, r; \n\t"
|
2097
|
+
" abs.f16x2 i, i; \n\t"
|
1931
2098
|
__SPEC_CASE2(i, r, 0X32B332B3U, 0x08000800U)
|
1932
|
-
__SPEC_CASE2(i, r, 0X5CB05CB0U,
|
1933
|
-
__SPEC_CASE2(i, r, 0XB2B3B2B3U, 0x88008800U)
|
1934
|
-
__SPEC_CASE2(i, r, 0XDCB0DCB0U, 0x90009000U)
|
2099
|
+
__SPEC_CASE2(i, r, 0X5CB05CB0U, 0x90009000U)
|
1935
2100
|
" or.b32 r, r, t; \n\t"
|
1936
2101
|
" mov.b32 %0, r; \n"
|
1937
2102
|
"}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
|
1938
2103
|
return r;
|
1939
2104
|
}
|
1940
|
-
__CUDA_FP16_DECL__ __half __hcos_internal(const __half a) {
|
1941
|
-
float f = __half2float(a);
|
1942
|
-
f = __float_simpl_cosf(f);
|
1943
|
-
return __float2half_rn(f);
|
1944
|
-
}
|
1945
2105
|
__CUDA_FP16_DECL__ __half hcos(const __half a) {
|
1946
|
-
|
2106
|
+
const float cl = __float_simpl_cosf(__half2float(a));
|
2107
|
+
__half r = __float2half_rn(cl);
|
1947
2108
|
asm("{\n\t"
|
1948
2109
|
" .reg.b16 i,r; \n\t"
|
1949
2110
|
" mov.b16 r, %0; \n\t"
|
1950
2111
|
" mov.b16 i, %1; \n\t"
|
2112
|
+
" abs.f16 i, i; \n\t"
|
1951
2113
|
__SPEC_CASE(i, r, 0X2B7CU, 0x1000U)
|
1952
|
-
__SPEC_CASE(i, r, 0XAB7CU, 0x1000U)
|
1953
2114
|
" mov.b16 %0, r; \n"
|
1954
2115
|
"}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
|
1955
2116
|
return r;
|
1956
2117
|
}
|
1957
2118
|
__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) {
|
1958
|
-
const
|
1959
|
-
const
|
1960
|
-
|
1961
|
-
const __half ch = __hcos_internal(h);
|
1962
|
-
__half2 r = __halves2half2(cl, ch);
|
2119
|
+
const float cl = __float_simpl_cosf(__half2float(a.x));
|
2120
|
+
const float ch = __float_simpl_cosf(__half2float(a.y));
|
2121
|
+
__half2 r = __floats2half2_rn(cl, ch);
|
1963
2122
|
asm("{\n\t"
|
1964
2123
|
" .reg.b32 i,r; \n\t"
|
1965
2124
|
" mov.b32 r, %0; \n\t"
|
1966
2125
|
" mov.b32 i, %1; \n\t"
|
2126
|
+
" abs.f16x2 i, i; \n\t"
|
1967
2127
|
__SPEC_CASE2(i, r, 0X2B7C2B7CU, 0x10001000U)
|
1968
|
-
__SPEC_CASE2(i, r, 0XAB7CAB7CU, 0x10001000U)
|
1969
2128
|
" mov.b32 %0, r; \n"
|
1970
2129
|
"}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
|
1971
2130
|
return r;
|
1972
2131
|
}
|
1973
|
-
static __device__ __forceinline__ float __internal_trig_reduction_kernel(const float a, int *quadrant)
|
2132
|
+
static __device__ __forceinline__ float __internal_trig_reduction_kernel(const float a, unsigned int *const quadrant)
|
1974
2133
|
{
|
1975
|
-
const
|
1976
|
-
const
|
1977
|
-
float
|
1978
|
-
t = __fmaf_rn(
|
2134
|
+
const float ar = __fmaf_rn(a, 0.636619772F, 12582912.0F);
|
2135
|
+
const unsigned q = __float_as_uint(ar);
|
2136
|
+
const float j = __fsub_rn(ar, 12582912.0F);
|
2137
|
+
float t = __fmaf_rn(j, -1.5707962512969971e+000F, a);
|
2138
|
+
t = __fmaf_rn(j, -7.5497894158615964e-008F, t);
|
1979
2139
|
*quadrant = q;
|
1980
2140
|
return t;
|
1981
2141
|
}
|
1982
|
-
static __device__ __forceinline__ float __internal_sin_cos_kernel(float x, const int i)
|
2142
|
+
static __device__ __forceinline__ float __internal_sin_cos_kernel(const float x, const unsigned int i)
|
1983
2143
|
{
|
1984
2144
|
float z;
|
1985
2145
|
const float x2 = x*x;
|
1986
|
-
|
1987
|
-
|
1988
|
-
|
1989
|
-
|
1990
|
-
|
1991
|
-
|
1992
|
-
|
1993
|
-
|
1994
|
-
|
1995
|
-
|
1996
|
-
|
1997
|
-
|
1998
|
-
|
1999
|
-
|
2000
|
-
|
2001
|
-
z = __fmaf_rn(z, x2, 0.0F);
|
2002
|
-
}
|
2003
|
-
if ((static_cast<unsigned>(i) & 1U) != 0U) {
|
2004
|
-
x = __fmaf_rn(z, x2, 1.0F);
|
2146
|
+
float a8;
|
2147
|
+
float a6;
|
2148
|
+
float a4;
|
2149
|
+
float a2;
|
2150
|
+
float a1;
|
2151
|
+
float a0;
|
2152
|
+
|
2153
|
+
if ((i & 1U) != 0U) {
|
2154
|
+
// cos
|
2155
|
+
a8 = 2.44331571e-5F;
|
2156
|
+
a6 = -1.38873163e-3F;
|
2157
|
+
a4 = 4.16666457e-2F;
|
2158
|
+
a2 = -5.00000000e-1F;
|
2159
|
+
a1 = x2;
|
2160
|
+
a0 = 1.0F;
|
2005
2161
|
}
|
2006
2162
|
else {
|
2007
|
-
|
2163
|
+
// sin
|
2164
|
+
a8 = -1.95152959e-4F;
|
2165
|
+
a6 = 8.33216087e-3F;
|
2166
|
+
a4 = -1.66666546e-1F;
|
2167
|
+
a2 = 0.0F;
|
2168
|
+
a1 = x;
|
2169
|
+
a0 = x;
|
2008
2170
|
}
|
2009
|
-
|
2010
|
-
|
2171
|
+
|
2172
|
+
z = __fmaf_rn(a8, x2, a6);
|
2173
|
+
z = __fmaf_rn(z, x2, a4);
|
2174
|
+
z = __fmaf_rn(z, x2, a2);
|
2175
|
+
z = __fmaf_rn(z, a1, a0);
|
2176
|
+
|
2177
|
+
if ((i & 2U) != 0U) {
|
2178
|
+
z = -z;
|
2011
2179
|
}
|
2012
|
-
return
|
2180
|
+
return z;
|
2013
2181
|
}
|
2014
2182
|
static __device__ __forceinline__ float __float_simpl_sinf(float a)
|
2015
2183
|
{
|
2016
2184
|
float z;
|
2017
|
-
|
2018
|
-
if (::isinf(a)) {
|
2019
|
-
a = a * 0.0F;
|
2020
|
-
}
|
2185
|
+
unsigned i;
|
2021
2186
|
a = __internal_trig_reduction_kernel(a, &i);
|
2022
2187
|
z = __internal_sin_cos_kernel(a, i);
|
2023
2188
|
return z;
|
@@ -2025,25 +2190,22 @@ static __device__ __forceinline__ float __float_simpl_sinf(float a)
|
|
2025
2190
|
static __device__ __forceinline__ float __float_simpl_cosf(float a)
|
2026
2191
|
{
|
2027
2192
|
float z;
|
2028
|
-
|
2029
|
-
if (::isinf(a)) {
|
2030
|
-
a = a * 0.0F;
|
2031
|
-
}
|
2193
|
+
unsigned i;
|
2032
2194
|
a = __internal_trig_reduction_kernel(a, &i);
|
2033
|
-
i
|
2034
|
-
z = __internal_sin_cos_kernel(a, i);
|
2195
|
+
z = __internal_sin_cos_kernel(a, (i & 0x3U) + 1U);
|
2035
2196
|
return z;
|
2036
2197
|
}
|
2037
2198
|
|
2038
2199
|
__CUDA_FP16_DECL__ __half hexp(const __half a) {
|
2039
2200
|
__half val;
|
2040
|
-
asm("{.reg.b32 f, C;
|
2201
|
+
asm("{.reg.b32 f, C, nZ; \n"
|
2041
2202
|
" .reg.b16 h,r; \n"
|
2042
2203
|
" mov.b16 h,%1; \n"
|
2043
2204
|
" cvt.f32.f16 f,h; \n"
|
2044
|
-
" mov.b32 C, 0x3fb8aa3bU;
|
2045
|
-
"
|
2046
|
-
"
|
2205
|
+
" mov.b32 C, 0x3fb8aa3bU; \n"
|
2206
|
+
" mov.b32 nZ, 0x80000000U;\n"
|
2207
|
+
" fma.rn.f32 f,f,C,nZ; \n"
|
2208
|
+
" ex2.approx.ftz.f32 f,f; \n"
|
2047
2209
|
" cvt.rn.f16.f32 r,f; \n"
|
2048
2210
|
__SPEC_CASE(h, r, 0X1F79U, 0x9400U)
|
2049
2211
|
__SPEC_CASE(h, r, 0X25CFU, 0x9400U)
|
@@ -2056,16 +2218,17 @@ __CUDA_FP16_DECL__ __half hexp(const __half a) {
|
|
2056
2218
|
__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) {
|
2057
2219
|
__half2 val;
|
2058
2220
|
asm("{.reg.b16 hl, hu; \n"
|
2059
|
-
" .reg.b32 h,r,fl,fu,
|
2221
|
+
" .reg.b32 h,r,fl,fu,C,nZ; \n"
|
2060
2222
|
" mov.b32 {hl, hu}, %1; \n"
|
2061
2223
|
" mov.b32 h, %1; \n"
|
2062
2224
|
" cvt.f32.f16 fl, hl; \n"
|
2063
2225
|
" cvt.f32.f16 fu, hu; \n"
|
2064
|
-
" mov.b32 C, 0x3fb8aa3bU;
|
2065
|
-
"
|
2066
|
-
"
|
2067
|
-
"
|
2068
|
-
" ex2.approx.f32
|
2226
|
+
" mov.b32 C, 0x3fb8aa3bU; \n"
|
2227
|
+
" mov.b32 nZ, 0x80000000U;\n"
|
2228
|
+
" fma.rn.f32 fl,fl,C,nZ; \n"
|
2229
|
+
" fma.rn.f32 fu,fu,C,nZ; \n"
|
2230
|
+
" ex2.approx.ftz.f32 fl, fl; \n"
|
2231
|
+
" ex2.approx.ftz.f32 fu, fu; \n"
|
2069
2232
|
" cvt.rn.f16.f32 hl, fl; \n"
|
2070
2233
|
" cvt.rn.f16.f32 hu, fu; \n"
|
2071
2234
|
" mov.b32 r, {hl, hu}; \n"
|
@@ -2083,7 +2246,7 @@ __CUDA_FP16_DECL__ __half hexp2(const __half a) {
|
|
2083
2246
|
" .reg.b16 r; \n"
|
2084
2247
|
" mov.b16 r,%1; \n"
|
2085
2248
|
" cvt.f32.f16 f,r; \n"
|
2086
|
-
" ex2.approx.f32 f,f;
|
2249
|
+
" ex2.approx.ftz.f32 f,f; \n"
|
2087
2250
|
" mov.b32 ULP, 0x33800000U;\n"
|
2088
2251
|
" fma.rn.f32 f,f,ULP,f; \n"
|
2089
2252
|
" cvt.rn.f16.f32 r,f; \n"
|
@@ -2098,8 +2261,8 @@ __CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) {
|
|
2098
2261
|
" mov.b32 {hl, hu}, %1; \n"
|
2099
2262
|
" cvt.f32.f16 fl, hl; \n"
|
2100
2263
|
" cvt.f32.f16 fu, hu; \n"
|
2101
|
-
" ex2.approx.f32
|
2102
|
-
" ex2.approx.f32
|
2264
|
+
" ex2.approx.ftz.f32 fl, fl; \n"
|
2265
|
+
" ex2.approx.ftz.f32 fu, fu; \n"
|
2103
2266
|
" mov.b32 ULP, 0x33800000U;\n"
|
2104
2267
|
" fma.rn.f32 fl,fl,ULP,fl; \n"
|
2105
2268
|
" fma.rn.f32 fu,fu,ULP,fu; \n"
|
@@ -2112,12 +2275,13 @@ __CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) {
|
|
2112
2275
|
__CUDA_FP16_DECL__ __half hexp10(const __half a) {
|
2113
2276
|
__half val;
|
2114
2277
|
asm("{.reg.b16 h,r; \n"
|
2115
|
-
" .reg.b32 f, C;
|
2278
|
+
" .reg.b32 f, C, nZ; \n"
|
2116
2279
|
" mov.b16 h, %1; \n"
|
2117
2280
|
" cvt.f32.f16 f, h; \n"
|
2118
|
-
" mov.b32 C, 0x40549A78U;
|
2119
|
-
"
|
2120
|
-
"
|
2281
|
+
" mov.b32 C, 0x40549A78U; \n"
|
2282
|
+
" mov.b32 nZ, 0x80000000U;\n"
|
2283
|
+
" fma.rn.f32 f,f,C,nZ; \n"
|
2284
|
+
" ex2.approx.ftz.f32 f, f; \n"
|
2121
2285
|
" cvt.rn.f16.f32 r, f; \n"
|
2122
2286
|
__SPEC_CASE(h, r, 0x34DEU, 0x9800U)
|
2123
2287
|
__SPEC_CASE(h, r, 0x9766U, 0x9000U)
|
@@ -2131,16 +2295,17 @@ __CUDA_FP16_DECL__ __half hexp10(const __half a) {
|
|
2131
2295
|
__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) {
|
2132
2296
|
__half2 val;
|
2133
2297
|
asm("{.reg.b16 hl, hu; \n"
|
2134
|
-
" .reg.b32 h,r,fl,fu,
|
2298
|
+
" .reg.b32 h,r,fl,fu,C,nZ; \n"
|
2135
2299
|
" mov.b32 {hl, hu}, %1; \n"
|
2136
2300
|
" mov.b32 h, %1; \n"
|
2137
2301
|
" cvt.f32.f16 fl, hl; \n"
|
2138
2302
|
" cvt.f32.f16 fu, hu; \n"
|
2139
|
-
" mov.b32 C, 0x40549A78U;
|
2140
|
-
"
|
2141
|
-
"
|
2142
|
-
"
|
2143
|
-
" ex2.approx.f32
|
2303
|
+
" mov.b32 C, 0x40549A78U; \n"
|
2304
|
+
" mov.b32 nZ, 0x80000000U;\n"
|
2305
|
+
" fma.rn.f32 fl,fl,C,nZ; \n"
|
2306
|
+
" fma.rn.f32 fu,fu,C,nZ; \n"
|
2307
|
+
" ex2.approx.ftz.f32 fl, fl; \n"
|
2308
|
+
" ex2.approx.ftz.f32 fu, fu; \n"
|
2144
2309
|
" cvt.rn.f16.f32 hl, fl; \n"
|
2145
2310
|
" cvt.rn.f16.f32 hu, fu; \n"
|
2146
2311
|
" mov.b32 r, {hl, hu}; \n"
|
@@ -2159,7 +2324,7 @@ __CUDA_FP16_DECL__ __half hlog2(const __half a) {
|
|
2159
2324
|
" .reg.b32 f; \n"
|
2160
2325
|
" mov.b16 h, %1; \n"
|
2161
2326
|
" cvt.f32.f16 f, h; \n"
|
2162
|
-
" lg2.approx.f32
|
2327
|
+
" lg2.approx.ftz.f32 f, f; \n"
|
2163
2328
|
" cvt.rn.f16.f32 r, f; \n"
|
2164
2329
|
__SPEC_CASE(r, r, 0xA2E2U, 0x8080U)
|
2165
2330
|
__SPEC_CASE(r, r, 0xBF46U, 0x9400U)
|
@@ -2174,8 +2339,8 @@ __CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) {
|
|
2174
2339
|
" mov.b32 {hl, hu}, %1; \n"
|
2175
2340
|
" cvt.f32.f16 fl, hl; \n"
|
2176
2341
|
" cvt.f32.f16 fu, hu; \n"
|
2177
|
-
" lg2.approx.f32
|
2178
|
-
" lg2.approx.f32
|
2342
|
+
" lg2.approx.ftz.f32 fl, fl; \n"
|
2343
|
+
" lg2.approx.ftz.f32 fu, fu; \n"
|
2179
2344
|
" cvt.rn.f16.f32 hl, fl; \n"
|
2180
2345
|
" cvt.rn.f16.f32 hu, fu; \n"
|
2181
2346
|
" mov.b32 r, {hl, hu}; \n"
|
@@ -2191,7 +2356,7 @@ __CUDA_FP16_DECL__ __half hlog(const __half a) {
|
|
2191
2356
|
" .reg.b16 r,h; \n"
|
2192
2357
|
" mov.b16 h,%1; \n"
|
2193
2358
|
" cvt.f32.f16 f,h; \n"
|
2194
|
-
" lg2.approx.f32
|
2359
|
+
" lg2.approx.ftz.f32 f,f; \n"
|
2195
2360
|
" mov.b32 C, 0x3f317218U; \n"
|
2196
2361
|
" mul.f32 f,f,C; \n"
|
2197
2362
|
" cvt.rn.f16.f32 r,f; \n"
|
@@ -2211,8 +2376,8 @@ __CUDA_FP16_DECL__ __half2 h2log(const __half2 a) {
|
|
2211
2376
|
" mov.b32 h, %1; \n"
|
2212
2377
|
" cvt.f32.f16 fl, hl; \n"
|
2213
2378
|
" cvt.f32.f16 fu, hu; \n"
|
2214
|
-
" lg2.approx.f32
|
2215
|
-
" lg2.approx.f32
|
2379
|
+
" lg2.approx.ftz.f32 fl, fl; \n"
|
2380
|
+
" lg2.approx.ftz.f32 fu, fu; \n"
|
2216
2381
|
" mov.b32 C, 0x3f317218U; \n"
|
2217
2382
|
" mul.f32 fl,fl,C; \n"
|
2218
2383
|
" mul.f32 fu,fu,C; \n"
|
@@ -2233,7 +2398,7 @@ __CUDA_FP16_DECL__ __half hlog10(const __half a) {
|
|
2233
2398
|
" .reg.b32 f, C; \n"
|
2234
2399
|
" mov.b16 h, %1; \n"
|
2235
2400
|
" cvt.f32.f16 f, h; \n"
|
2236
|
-
" lg2.approx.f32
|
2401
|
+
" lg2.approx.ftz.f32 f, f; \n"
|
2237
2402
|
" mov.b32 C, 0x3E9A209BU; \n"
|
2238
2403
|
" mul.f32 f,f,C; \n"
|
2239
2404
|
" cvt.rn.f16.f32 r, f; \n"
|
@@ -2253,8 +2418,8 @@ __CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) {
|
|
2253
2418
|
" mov.b32 h, %1; \n"
|
2254
2419
|
" cvt.f32.f16 fl, hl; \n"
|
2255
2420
|
" cvt.f32.f16 fu, hu; \n"
|
2256
|
-
" lg2.approx.f32
|
2257
|
-
" lg2.approx.f32
|
2421
|
+
" lg2.approx.ftz.f32 fl, fl; \n"
|
2422
|
+
" lg2.approx.ftz.f32 fu, fu; \n"
|
2258
2423
|
" mov.b32 C, 0x3E9A209BU; \n"
|
2259
2424
|
" mul.f32 fl,fl,C; \n"
|
2260
2425
|
" mul.f32 fu,fu,C; \n"
|
@@ -2340,27 +2505,16 @@ __CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __ha
|
|
2340
2505
|
// (a.re, a.im) * (b.re, b.im) + (c.re, c.im)
|
2341
2506
|
// acc.re = (c.re + a.re*b.re) - a.im*b.im
|
2342
2507
|
// acc.im = (c.im + a.re*b.im) + a.im*b.re
|
2343
|
-
|
2344
|
-
|
2345
|
-
|
2346
|
-
|
2347
|
-
|
2348
|
-
return acc;
|
2508
|
+
__half real_tmp = __hfma(a.x, b.x, c.x);
|
2509
|
+
__half img_tmp = __hfma(a.x, b.y, c.y);
|
2510
|
+
real_tmp = __hfma(__hneg(a.y), b.y, real_tmp);
|
2511
|
+
img_tmp = __hfma(a.y, b.x, img_tmp);
|
2512
|
+
return make_half2(real_tmp, img_tmp);
|
2349
2513
|
}
|
2350
|
-
#endif /*__CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/
|
2351
2514
|
|
2352
|
-
#
|
2353
|
-
|
2354
|
-
|
2355
|
-
******************************************************************************/
|
2356
|
-
__CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b)
|
2357
|
-
{
|
2358
|
-
__BINARY_OP_HALF_MACRO(max)
|
2359
|
-
}
|
2360
|
-
__CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b)
|
2361
|
-
{
|
2362
|
-
__BINARY_OP_HALF_MACRO(min)
|
2363
|
-
}
|
2515
|
+
#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)*/
|
2516
|
+
|
2517
|
+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
|
2364
2518
|
__CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b)
|
2365
2519
|
{
|
2366
2520
|
__BINARY_OP_HALF_MACRO(max.NaN)
|
@@ -2373,17 +2527,7 @@ __CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __ha
|
|
2373
2527
|
{
|
2374
2528
|
__TERNARY_OP_HALF_MACRO(fma.rn.relu)
|
2375
2529
|
}
|
2376
|
-
|
2377
|
-
* __half2 arithmetic *
|
2378
|
-
******************************************************************************/
|
2379
|
-
__CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b)
|
2380
|
-
{
|
2381
|
-
__BINARY_OP_HALF2_MACRO(max)
|
2382
|
-
}
|
2383
|
-
__CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b)
|
2384
|
-
{
|
2385
|
-
__BINARY_OP_HALF2_MACRO(min)
|
2386
|
-
}
|
2530
|
+
|
2387
2531
|
__CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b)
|
2388
2532
|
{
|
2389
2533
|
__BINARY_OP_HALF2_MACRO(max.NaN)
|
@@ -2396,7 +2540,7 @@ __CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const
|
|
2396
2540
|
{
|
2397
2541
|
__TERNARY_OP_HALF2_MACRO(fma.rn.relu)
|
2398
2542
|
}
|
2399
|
-
#endif
|
2543
|
+
#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)*/
|
2400
2544
|
|
2401
2545
|
/* Define __PTR for atomicAdd prototypes below, undef after done */
|
2402
2546
|
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
|
@@ -2444,6 +2588,11 @@ __CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val) {
|
|
2444
2588
|
#undef __CUDA_HOSTDEVICE_FP16_DECL__
|
2445
2589
|
#undef __CUDA_FP16_DECL__
|
2446
2590
|
|
2591
|
+
#undef __HALF_TO_US
|
2592
|
+
#undef __HALF_TO_CUS
|
2593
|
+
#undef __HALF2_TO_UI
|
2594
|
+
#undef __HALF2_TO_CUI
|
2595
|
+
|
2447
2596
|
/* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */
|
2448
2597
|
/* C cannot ever have these types defined here, because __half and __half2 are C++ classes */
|
2449
2598
|
#if defined(__cplusplus) && !defined(CUDA_NO_HALF)
|