numba-cuda 0.0.1__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.12.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.12.dist-info/METADATA +68 -0
  229. numba_cuda-0.0.12.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.1.dist-info/METADATA +0 -10
  232. numba_cuda-0.0.1.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,2465 @@
1
+ /*
2
+ * Copyright 1993-2020 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_FP16_HPP__)
51
+ #define __CUDA_FP16_HPP__
52
+
53
+ #if !defined(__CUDA_FP16_H__)
54
+ #error "Do not include this file directly. Instead, include cuda_fp16.h."
55
+ #endif
56
+
57
+ #if !defined(_MSC_VER) && __cplusplus >= 201103L
58
+ # define __CPP_VERSION_AT_LEAST_11_FP16
59
+ #elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
60
+ # define __CPP_VERSION_AT_LEAST_11_FP16
61
+ #endif
62
+
63
+ /* C++11 header for std::move.
64
+ * In RTC mode, std::move is provided implicitly; don't include the header
65
+ */
66
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__)
67
+ #include <utility>
68
+ #endif /* __cplusplus >= 201103L && !defined(__CUDACC_RTC__) */
69
+
70
+ /* C++ header for std::memcpy (used for type punning in host-side implementations).
71
+ * When compiling as a CUDA source file memcpy is provided implicitly.
72
+ * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
73
+ */
74
+ #if defined(__cplusplus) && !defined(__CUDACC__)
75
+ #include <cstring>
76
+ #endif /* defined(__cplusplus) && !defined(__CUDACC__) */
77
+
78
+
79
+ /* Set up function decorations */
80
+ #if defined(__CUDACC__)
81
+ #define __CUDA_FP16_DECL__ static __device__ __inline__
82
+ #define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__
83
+ #define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
84
+ #define __CUDA_HOSTDEVICE__ __host__ __device__
85
+ #else /* !defined(__CUDACC__) */
86
+ #if defined(__GNUC__)
87
+ #define __CUDA_HOSTDEVICE_FP16_DECL__ static __attribute__ ((unused))
88
+ #else
89
+ #define __CUDA_HOSTDEVICE_FP16_DECL__ static
90
+ #endif /* defined(__GNUC__) */
91
+ #define __CUDA_HOSTDEVICE__
92
+ #endif /* defined(__CUDACC_) */
93
+
94
+ /* Set up structure-alignment attribute */
95
+ #if defined(__CUDACC__)
96
+ #define __CUDA_ALIGN__(align) __align__(align)
97
+ #else
98
+ /* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */
99
+ #if __cplusplus >= 201103L
100
+ #define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */
101
+ #else /* !defined(__CPP_VERSION_AT_LEAST_11_FP16)*/
102
+ #if defined(__GNUC__)
103
+ #define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
104
+ #elif defined(_MSC_VER)
105
+ #define __CUDA_ALIGN__(n) __declspec(align(n))
106
+ #else
107
+ #define __CUDA_ALIGN__(n)
108
+ #endif /* defined(__GNUC__) */
109
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
110
+ #endif /* defined(__CUDACC__) */
111
+
112
+ /* Macros to allow half & half2 to be used by inline assembly */
113
+ #define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
114
+ #define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
115
+ #define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
116
+ #define __HALF2_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var)))
117
+
118
+ /* Macros for half & half2 binary arithmetic */
119
+ #define __BINARY_OP_HALF_MACRO(name) /* do */ {\
120
+ __half val; \
121
+ asm( "{"#name".f16 %0,%1,%2;\n}" \
122
+ :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \
123
+ return val; \
124
+ } /* while(0) */
125
+ #define __BINARY_OP_HALF2_MACRO(name) /* do */ {\
126
+ __half2 val; \
127
+ asm( "{"#name".f16x2 %0,%1,%2;\n}" \
128
+ :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
129
+ return val; \
130
+ } /* while(0) */
131
+ #define __TERNARY_OP_HALF_MACRO(name) /* do */ {\
132
+ __half val; \
133
+ asm( "{"#name".f16 %0,%1,%2,%3;\n}" \
134
+ :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \
135
+ return val; \
136
+ } /* while(0) */
137
+ #define __TERNARY_OP_HALF2_MACRO(name) /* do */ {\
138
+ __half2 val; \
139
+ asm( "{"#name".f16x2 %0,%1,%2,%3;\n}" \
140
+ :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \
141
+ return val; \
142
+ } /* while(0) */
143
+
144
+ /**
145
+ * Types which allow static initialization of "half" and "half2" until
146
+ * these become an actual builtin. Note this initialization is as a
147
+ * bitfield representation of "half", and not a conversion from short->half.
148
+ * Such a representation will be deprecated in a future version of CUDA.
149
+ * (Note these are visible to non-nvcc compilers, including C-only compilation)
150
+ */
151
+ typedef struct __CUDA_ALIGN__(2) {
152
+ unsigned short x;
153
+ } __half_raw;
154
+
155
+ typedef struct __CUDA_ALIGN__(4) {
156
+ unsigned short x;
157
+ unsigned short y;
158
+ } __half2_raw;
159
+
160
+ /* All other definitions in this file are only visible to C++ compilers */
161
+ #if defined(__cplusplus)
162
+
163
+ /* Hide GCC member initialization list warnings because of host/device in-function init requirement */
164
+ #if defined(__GNUC__)
165
+ #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
166
+ #pragma GCC diagnostic push
167
+ #pragma GCC diagnostic ignored "-Wstrict-aliasing"
168
+ #pragma GCC diagnostic ignored "-Weffc++"
169
+ #endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
170
+ #endif /* defined(__GNUC__) */
171
+
172
+ /* class' : multiple assignment operators specified
173
+ The class has multiple assignment operators of a single type. This warning is informational */
174
+ #if defined(_MSC_VER) && _MSC_VER >= 1500
175
+ #pragma warning( push )
176
+ #pragma warning( disable:4522 )
177
+ #endif /* defined(__GNUC__) */
178
+
179
+ struct __CUDA_ALIGN__(2) __half {
180
+ protected:
181
+ unsigned short __x;
182
+
183
+ public:
184
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP16)
185
+ __half() = default;
186
+ #else
187
+ __CUDA_HOSTDEVICE__ __half() { }
188
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
189
+
190
+ /* Convert to/from __half_raw */
191
+ __CUDA_HOSTDEVICE__ __half(const __half_raw &hr) : __x(hr.x) { }
192
+ __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr) { __x = hr.x; return *this; }
193
+ __CUDA_HOSTDEVICE__ volatile __half &operator=(const __half_raw &hr) volatile { __x = hr.x; return *this; }
194
+ __CUDA_HOSTDEVICE__ volatile __half &operator=(const volatile __half_raw &hr) volatile { __x = hr.x; return *this; }
195
+ __CUDA_HOSTDEVICE__ operator __half_raw() const { __half_raw ret; ret.x = __x; return ret; }
196
+ __CUDA_HOSTDEVICE__ operator __half_raw() const volatile { __half_raw ret; ret.x = __x; return ret; }
197
+
198
+ #if !defined(__CUDA_NO_HALF_CONVERSIONS__)
199
+
200
+ /* Construct from float/double */
201
+ __CUDA_HOSTDEVICE__ __half(const float f) { __x = __float2half(f).__x; }
202
+ __CUDA_HOSTDEVICE__ __half(const double f) { __x = __double2half(f).__x; }
203
+
204
+ __CUDA_HOSTDEVICE__ operator float() const { return __half2float(*this); }
205
+ __CUDA_HOSTDEVICE__ __half &operator=(const float f) { __x = __float2half(f).__x; return *this; }
206
+
207
+ /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */
208
+ __CUDA_HOSTDEVICE__ __half &operator=(const double f) { __x = __double2half(f).__x; return *this; }
209
+
210
+ /* Member functions only available to nvcc compilation so far */
211
+ #if defined(__CUDACC__)
212
+ /* Allow automatic construction from types supported natively in hardware */
213
+ /* Note we do avoid constructor init-list because of special host/device compilation rules */
214
+ __CUDA_HOSTDEVICE__ __half(const short val) { __x = __short2half_rn(val).__x; }
215
+ __CUDA_HOSTDEVICE__ __half(const unsigned short val) { __x = __ushort2half_rn(val).__x; }
216
+ __CUDA_HOSTDEVICE__ __half(const int val) { __x = __int2half_rn(val).__x; }
217
+ __CUDA_HOSTDEVICE__ __half(const unsigned int val) { __x = __uint2half_rn(val).__x; }
218
+ __CUDA_HOSTDEVICE__ __half(const long long val) { __x = __ll2half_rn(val).__x; }
219
+ __CUDA_HOSTDEVICE__ __half(const unsigned long long val) { __x = __ull2half_rn(val).__x; }
220
+
221
+ /* Allow automatic casts to supported builtin types, matching all that are permitted with float */
222
+ __CUDA_HOSTDEVICE__ operator short() const { return __half2short_rz(*this); }
223
+ __CUDA_HOSTDEVICE__ __half &operator=(const short val) { __x = __short2half_rn(val).__x; return *this; }
224
+
225
+ __CUDA_HOSTDEVICE__ operator unsigned short() const { return __half2ushort_rz(*this); }
226
+ __CUDA_HOSTDEVICE__ __half &operator=(const unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; }
227
+
228
+ __CUDA_HOSTDEVICE__ operator int() const { return __half2int_rz(*this); }
229
+ __CUDA_HOSTDEVICE__ __half &operator=(const int val) { __x = __int2half_rn(val).__x; return *this; }
230
+
231
+ __CUDA_HOSTDEVICE__ operator unsigned int() const { return __half2uint_rz(*this); }
232
+ __CUDA_HOSTDEVICE__ __half &operator=(const unsigned int val) { __x = __uint2half_rn(val).__x; return *this; }
233
+
234
+ __CUDA_HOSTDEVICE__ operator long long() const { return __half2ll_rz(*this); }
235
+ __CUDA_HOSTDEVICE__ __half &operator=(const long long val) { __x = __ll2half_rn(val).__x; return *this; }
236
+
237
+ __CUDA_HOSTDEVICE__ operator unsigned long long() const { return __half2ull_rz(*this); }
238
+ __CUDA_HOSTDEVICE__ __half &operator=(const unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; }
239
+
240
+ /* Boolean conversion - note both 0 and -0 must return false */
241
+ __CUDA_HOSTDEVICE__ operator bool() const { return (__x & 0x7FFFU) != 0U; }
242
+ #endif /* defined(__CUDACC__) */
243
+ #endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */
244
+ };
245
+
246
+ /* Global-space operator functions are only available to nvcc compilation */
247
+ #if defined(__CUDACC__)
248
+
249
+ /* Arithmetic FP16 operations only supported on arch >= 5.3 */
250
+ #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
251
+ #if !defined(__CUDA_NO_HALF_OPERATORS__)
252
+ /* Some basic arithmetic operations expected of a builtin */
253
+ __device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); }
254
+ __device__ __forceinline__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); }
255
+ __device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); }
256
+ __device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); }
257
+
258
+ __device__ __forceinline__ __half &operator+=(__half &lh, const __half &rh) { lh = __hadd(lh, rh); return lh; }
259
+ __device__ __forceinline__ __half &operator-=(__half &lh, const __half &rh) { lh = __hsub(lh, rh); return lh; }
260
+ __device__ __forceinline__ __half &operator*=(__half &lh, const __half &rh) { lh = __hmul(lh, rh); return lh; }
261
+ __device__ __forceinline__ __half &operator/=(__half &lh, const __half &rh) { lh = __hdiv(lh, rh); return lh; }
262
+
263
+ /* Note for increment and decrement we use the raw value 0x3C00U equating to half(1.0F), to avoid the extra conversion */
264
+ __device__ __forceinline__ __half &operator++(__half &h) { __half_raw one; one.x = 0x3C00U; h += one; return h; }
265
+ __device__ __forceinline__ __half &operator--(__half &h) { __half_raw one; one.x = 0x3C00U; h -= one; return h; }
266
+ __device__ __forceinline__ __half operator++(__half &h, const int ignored) { const __half ret = h; __half_raw one; one.x = 0x3C00U; h += one; return ret; }
267
+ __device__ __forceinline__ __half operator--(__half &h, const int ignored) { const __half ret = h; __half_raw one; one.x = 0x3C00U; h -= one; return ret; }
268
+
269
+ /* Unary plus and inverse operators */
270
+ __device__ __forceinline__ __half operator+(const __half &h) { return h; }
271
+ __device__ __forceinline__ __half operator-(const __half &h) { return __hneg(h); }
272
+
273
+ /* Some basic comparison operations to make it look like a builtin */
274
+ __device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); }
275
+ __device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hneu(lh, rh); }
276
+ __device__ __forceinline__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); }
277
+ __device__ __forceinline__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); }
278
+ __device__ __forceinline__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); }
279
+ __device__ __forceinline__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); }
280
+ #endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */
281
+ #endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) */
282
+ #endif /* defined(__CUDACC__) */
283
+
284
+ /* __half2 is visible to non-nvcc host compilers */
285
+ struct __CUDA_ALIGN__(4) __half2 {
286
+ __half x;
287
+ __half y;
288
+
289
+ // All construct/copy/assign/move
290
+ public:
291
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP16)
292
+ __half2() = default;
293
+ __CUDA_HOSTDEVICE__ __half2(const __half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); }
294
+ __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); return *this; }
295
+ #else
296
+ __CUDA_HOSTDEVICE__ __half2() { }
297
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
298
+ __CUDA_HOSTDEVICE__ __half2(const __half &a, const __half &b) : x(a), y(b) { }
299
+ __CUDA_HOSTDEVICE__ __half2(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); }
300
+ __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); return *this; }
301
+
302
+ /* Convert to/from __half2_raw */
303
+ __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r ) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); }
304
+ __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); return *this; }
305
+ __CUDA_HOSTDEVICE__ operator __half2_raw() const { __half2_raw ret; ret.x = 0U; ret.y = 0U; __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this); return ret; }
306
+ };
307
+
308
+ /* Global-space operator functions are only available to nvcc compilation */
309
+ #if defined(__CUDACC__)
310
+
311
+ /* Arithmetic FP16x2 operations only supported on arch >= 5.3 */
312
+ #if (__CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)) && !defined(__CUDA_NO_HALF2_OPERATORS__)
313
+
314
+ __device__ __forceinline__ __half2 operator+(const __half2 &lh, const __half2 &rh) { return __hadd2(lh, rh); }
315
+ __device__ __forceinline__ __half2 operator-(const __half2 &lh, const __half2 &rh) { return __hsub2(lh, rh); }
316
+ __device__ __forceinline__ __half2 operator*(const __half2 &lh, const __half2 &rh) { return __hmul2(lh, rh); }
317
+ __device__ __forceinline__ __half2 operator/(const __half2 &lh, const __half2 &rh) { return __h2div(lh, rh); }
318
+
319
+ __device__ __forceinline__ __half2& operator+=(__half2 &lh, const __half2 &rh) { lh = __hadd2(lh, rh); return lh; }
320
+ __device__ __forceinline__ __half2& operator-=(__half2 &lh, const __half2 &rh) { lh = __hsub2(lh, rh); return lh; }
321
+ __device__ __forceinline__ __half2& operator*=(__half2 &lh, const __half2 &rh) { lh = __hmul2(lh, rh); return lh; }
322
+ __device__ __forceinline__ __half2& operator/=(__half2 &lh, const __half2 &rh) { lh = __h2div(lh, rh); return lh; }
323
+
324
+ __device__ __forceinline__ __half2 &operator++(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hadd2(h, one); return h; }
325
+ __device__ __forceinline__ __half2 &operator--(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hsub2(h, one); return h; }
326
+ __device__ __forceinline__ __half2 operator++(__half2 &h, const int ignored) { const __half2 ret = h; __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hadd2(h, one); return ret; }
327
+ __device__ __forceinline__ __half2 operator--(__half2 &h, const int ignored) { const __half2 ret = h; __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hsub2(h, one); return ret; }
328
+
329
+ __device__ __forceinline__ __half2 operator+(const __half2 &h) { return h; }
330
+ __device__ __forceinline__ __half2 operator-(const __half2 &h) { return __hneg2(h); }
331
+
332
+ __device__ __forceinline__ bool operator==(const __half2 &lh, const __half2 &rh) { return __hbeq2(lh, rh); }
333
+ __device__ __forceinline__ bool operator!=(const __half2 &lh, const __half2 &rh) { return __hbneu2(lh, rh); }
334
+ __device__ __forceinline__ bool operator>(const __half2 &lh, const __half2 &rh) { return __hbgt2(lh, rh); }
335
+ __device__ __forceinline__ bool operator<(const __half2 &lh, const __half2 &rh) { return __hblt2(lh, rh); }
336
+ __device__ __forceinline__ bool operator>=(const __half2 &lh, const __half2 &rh) { return __hbge2(lh, rh); }
337
+ __device__ __forceinline__ bool operator<=(const __half2 &lh, const __half2 &rh) { return __hble2(lh, rh); }
338
+
339
+ #endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) */
340
+ #endif /* defined(__CUDACC__) */
341
+
342
+ /* Restore warning for multiple assignment operators */
343
+ #if defined(_MSC_VER) && _MSC_VER >= 1500
344
+ #pragma warning( pop )
345
+ #endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
346
+
347
+ /* Restore -Weffc++ warnings from here on */
348
+ #if defined(__GNUC__)
349
+ #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
350
+ #pragma GCC diagnostic pop
351
+ #endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
352
+ #endif /* defined(__GNUC__) */
353
+
354
+ #undef __CUDA_HOSTDEVICE__
355
+ #undef __CUDA_ALIGN__
356
+
357
+ #ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */
358
+ static inline unsigned short __internal_float2half(const float f, unsigned int &sign, unsigned int &remainder)
359
+ {
360
+ unsigned int x;
361
+ unsigned int u;
362
+ unsigned int result;
363
+ #if defined(__CUDACC__)
364
+ (void)memcpy(&x, &f, sizeof(f));
365
+ #else
366
+ (void)std::memcpy(&x, &f, sizeof(f));
367
+ #endif
368
+ u = (x & 0x7fffffffU);
369
+ sign = ((x >> 16U) & 0x8000U);
370
+ // NaN/+Inf/-Inf
371
+ if (u >= 0x7f800000U) {
372
+ remainder = 0U;
373
+ result = ((u == 0x7f800000U) ? (sign | 0x7c00U) : 0x7fffU);
374
+ } else if (u > 0x477fefffU) { // Overflows
375
+ remainder = 0x80000000U;
376
+ result = (sign | 0x7bffU);
377
+ } else if (u >= 0x38800000U) { // Normal numbers
378
+ remainder = u << 19U;
379
+ u -= 0x38000000U;
380
+ result = (sign | (u >> 13U));
381
+ } else if (u < 0x33000001U) { // +0/-0
382
+ remainder = u;
383
+ result = sign;
384
+ } else { // Denormal numbers
385
+ const unsigned int exponent = u >> 23U;
386
+ const unsigned int shift = 0x7eU - exponent;
387
+ unsigned int mantissa = (u & 0x7fffffU);
388
+ mantissa |= 0x800000U;
389
+ remainder = mantissa << (32U - shift);
390
+ result = (sign | (mantissa >> shift));
391
+ }
392
+ return static_cast<unsigned short>(result);
393
+ }
394
+ #endif /* #if !defined(__CUDACC_RTC__) */
395
+
396
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a)
397
+ {
398
+ #if defined(__CUDA_ARCH__)
399
+ __half val;
400
+ asm("{ cvt.rn.f16.f64 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "d"(a));
401
+ return val;
402
+ #else
403
+ __half result;
404
+ // Perform rounding to 11 bits of precision, convert value
405
+ // to float and call existing float to half conversion.
406
+ // By pre-rounding to 11 bits we avoid additional rounding
407
+ // in float to half conversion.
408
+ unsigned long long int absa;
409
+ unsigned long long int ua;
410
+ #if defined(__CUDACC__)
411
+ (void)memcpy(&ua, &a, sizeof(a));
412
+ #else
413
+ (void)std::memcpy(&ua, &a, sizeof(a));
414
+ #endif
415
+ absa = (ua & 0x7fffffffffffffffULL);
416
+ if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL))
417
+ {
418
+ // |a| >= 2^16 or NaN or |a| <= 2^(-25)
419
+ // double-rounding is not a problem
420
+ result = __float2half(static_cast<float>(a));
421
+ }
422
+ else
423
+ {
424
+ // here 2^(-25) < |a| < 2^16
425
+ // prepare shifter value such that a + shifter
426
+ // done in double precision performs round-to-nearest-even
427
+ // and (a + shifter) - shifter results in a rounded to
428
+ // 11 bits of precision. Shifter needs to have exponent of
429
+ // a plus 53 - 11 = 42 and a leading bit in mantissa to guard
430
+ // against negative values.
431
+ // So need to have |a| capped to avoid overflow in exponent.
432
+ // For inputs that are smaller than half precision minnorm
433
+ // we prepare fixed shifter exponent.
434
+ unsigned long long shifterBits;
435
+ if (absa >= 0x3f10000000000000ULL)
436
+ { // Here if |a| >= 2^(-14)
437
+ // add 42 to exponent bits
438
+ shifterBits = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL;
439
+ }
440
+ else
441
+ { // 2^(-25) < |a| < 2^(-14), potentially results in denormal
442
+ // set exponent bits to 42 - 14 + bias
443
+ shifterBits = 0x41B0000000000000ULL;
444
+ }
445
+ // set leading mantissa bit to protect against negative inputs
446
+ shifterBits |= 0x0008000000000000ULL;
447
+ double shifter;
448
+ #if defined(__CUDACC__)
449
+ (void)memcpy(&shifter, &shifterBits, sizeof(shifterBits));
450
+ #else
451
+ (void)std::memcpy(&shifter, &shifterBits, sizeof(shifterBits));
452
+ #endif
453
+ double aShiftRound = a + shifter;
454
+
455
+ // Prevent the compiler from optimizing away a + shifter - shifter
456
+ // by doing intermediate memcopy and harmless bitwize operation
457
+ unsigned long long int aShiftRoundBits;
458
+ #if defined(__CUDACC__)
459
+ (void)memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound));
460
+ #else
461
+ (void)std::memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound));
462
+ #endif
463
+
464
+ // the value is positive, so this operation doesn't change anything
465
+ aShiftRoundBits &= 0x7fffffffffffffffULL;
466
+
467
+ #if defined(__CUDACC__)
468
+ (void)memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound));
469
+ #else
470
+ (void)std::memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound));
471
+ #endif
472
+
473
+ result = __float2half(static_cast<float>(aShiftRound - shifter));
474
+ }
475
+
476
+ return result;
477
+ #endif
478
+ }
479
+
480
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a)
481
+ {
482
+ __half val;
483
+ #if defined(__CUDA_ARCH__)
484
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
485
+ #else
486
+ __half_raw r;
487
+ unsigned int sign = 0U;
488
+ unsigned int remainder = 0U;
489
+ r.x = __internal_float2half(a, sign, remainder);
490
+ if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
491
+ r.x++;
492
+ }
493
+ val = r;
494
+ #endif
495
+ return val;
496
+ }
497
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a)
498
+ {
499
+ __half val;
500
+ #if defined(__CUDA_ARCH__)
501
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
502
+ #else
503
+ __half_raw r;
504
+ unsigned int sign = 0U;
505
+ unsigned int remainder = 0U;
506
+ r.x = __internal_float2half(a, sign, remainder);
507
+ if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
508
+ r.x++;
509
+ }
510
+ val = r;
511
+ #endif
512
+ return val;
513
+ }
514
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a)
515
+ {
516
+ __half val;
517
+ #if defined(__CUDA_ARCH__)
518
+ asm("{ cvt.rz.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
519
+ #else
520
+ __half_raw r;
521
+ unsigned int sign = 0U;
522
+ unsigned int remainder = 0U;
523
+ r.x = __internal_float2half(a, sign, remainder);
524
+ val = r;
525
+ #endif
526
+ return val;
527
+ }
528
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a)
529
+ {
530
+ __half val;
531
+ #if defined(__CUDA_ARCH__)
532
+ asm("{ cvt.rm.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
533
+ #else
534
+ __half_raw r;
535
+ unsigned int sign = 0U;
536
+ unsigned int remainder = 0U;
537
+ r.x = __internal_float2half(a, sign, remainder);
538
+ if ((remainder != 0U) && (sign != 0U)) {
539
+ r.x++;
540
+ }
541
+ val = r;
542
+ #endif
543
+ return val;
544
+ }
545
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a)
546
+ {
547
+ __half val;
548
+ #if defined(__CUDA_ARCH__)
549
+ asm("{ cvt.rp.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
550
+ #else
551
+ __half_raw r;
552
+ unsigned int sign = 0U;
553
+ unsigned int remainder = 0U;
554
+ r.x = __internal_float2half(a, sign, remainder);
555
+ if ((remainder != 0U) && (sign == 0U)) {
556
+ r.x++;
557
+ }
558
+ val = r;
559
+ #endif
560
+ return val;
561
+ }
562
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a)
563
+ {
564
+ __half2 val;
565
+ #if defined(__CUDA_ARCH__)
566
+ asm("{.reg .f16 low;\n"
567
+ " cvt.rn.f16.f32 low, %1;\n"
568
+ " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a));
569
+ #else
570
+ val = __half2(__float2half_rn(a), __float2half_rn(a));
571
+ #endif
572
+ return val;
573
+ }
574
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b)
575
+ {
576
+ __half2 val;
577
+ #if defined(__CUDA_ARCH__)
578
+ asm("{.reg .f16 low,high;\n"
579
+ " cvt.rn.f16.f32 low, %1;\n"
580
+ " cvt.rn.f16.f32 high, %2;\n"
581
+ " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
582
+ #else
583
+ val = __half2(__float2half_rn(a), __float2half_rn(b));
584
+ #endif
585
+ return val;
586
+ }
587
+
588
+ #ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */
589
+ static inline float __internal_half2float(const unsigned short h)
590
+ {
591
+ unsigned int sign = ((static_cast<unsigned int>(h) >> 15U) & 1U);
592
+ unsigned int exponent = ((static_cast<unsigned int>(h) >> 10U) & 0x1fU);
593
+ unsigned int mantissa = ((static_cast<unsigned int>(h) & 0x3ffU) << 13U);
594
+ float f;
595
+ if (exponent == 0x1fU) { /* NaN or Inf */
596
+ /* discard sign of a NaN */
597
+ sign = ((mantissa != 0U) ? (sign >> 1U) : sign);
598
+ mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U);
599
+ exponent = 0xffU;
600
+ } else if (exponent == 0U) { /* Denorm or Zero */
601
+ if (mantissa != 0U) {
602
+ unsigned int msb;
603
+ exponent = 0x71U;
604
+ do {
605
+ msb = (mantissa & 0x400000U);
606
+ mantissa <<= 1U; /* normalize */
607
+ --exponent;
608
+ } while (msb == 0U);
609
+ mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
610
+ }
611
+ } else {
612
+ exponent += 0x70U;
613
+ }
614
+ unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
615
+ #if defined(__CUDACC__)
616
+ (void)memcpy(&f, &u, sizeof(u));
617
+ #else
618
+ (void)std::memcpy(&f, &u, sizeof(u));
619
+ #endif
620
+ return f;
621
+ }
622
+ #endif /* !defined(__CUDACC_RTC__) */
623
+
624
+ __CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a)
625
+ {
626
+ float val;
627
+ #if defined(__CUDA_ARCH__)
628
+ asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(a)));
629
+ #else
630
+ val = __internal_half2float(static_cast<__half_raw>(a).x);
631
+ #endif
632
+ return val;
633
+ }
634
+ __CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a)
635
+ {
636
+ float val;
637
+ #if defined(__CUDA_ARCH__)
638
+ asm("{.reg .f16 low,high;\n"
639
+ " mov.b32 {low,high},%1;\n"
640
+ " cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
641
+ #else
642
+ val = __internal_half2float(static_cast<__half2_raw>(a).x);
643
+ #endif
644
+ return val;
645
+ }
646
+ __CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a)
647
+ {
648
+ float val;
649
+ #if defined(__CUDA_ARCH__)
650
+ asm("{.reg .f16 low,high;\n"
651
+ " mov.b32 {low,high},%1;\n"
652
+ " cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
653
+ #else
654
+ val = __internal_half2float(static_cast<__half2_raw>(a).y);
655
+ #endif
656
+ return val;
657
+ }
658
+
659
+ /* Intrinsic functions only available to nvcc compilers */
660
+ #if defined(__CUDACC__)
661
+
662
+ /* CUDA vector-types compatible vector creation function (note returns __half2, not half2) */
663
+ __VECTOR_FUNCTIONS_DECL__ __half2 make_half2(const __half x, const __half y)
664
+ {
665
+ __half2 t; t.x = x; t.y = y; return t;
666
+ }
667
+ #undef __VECTOR_FUNCTIONS_DECL__
668
+
669
+
670
+ /* Definitions of intrinsics */
671
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a)
672
+ {
673
+ const __half2 val = __floats2half2_rn(a.x, a.y);
674
+ return val;
675
+ }
676
+ __CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a)
677
+ {
678
+ float hi_float;
679
+ float lo_float;
680
+ #if defined(__CUDA_ARCH__)
681
+ asm("{.reg .f16 low,high;\n"
682
+ " mov.b32 {low,high},%1;\n"
683
+ " cvt.f32.f16 %0, low;}\n" : "=f"(lo_float) : "r"(__HALF2_TO_CUI(a)));
684
+
685
+ asm("{.reg .f16 low,high;\n"
686
+ " mov.b32 {low,high},%1;\n"
687
+ " cvt.f32.f16 %0, high;}\n" : "=f"(hi_float) : "r"(__HALF2_TO_CUI(a)));
688
+ #else
689
+ lo_float = __internal_half2float(((__half2_raw)a).x);
690
+ hi_float = __internal_half2float(((__half2_raw)a).y);
691
+ #endif
692
+ return make_float2(lo_float, hi_float);
693
+ }
694
+ __CUDA_FP16_DECL__ int __half2int_rn(const __half h)
695
+ {
696
+ int i;
697
+ asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
698
+ return i;
699
+ }
700
+ __CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h)
701
+ {
702
+ int i;
703
+ #if defined __CUDA_ARCH__
704
+ asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
705
+ #else
706
+ const float f = __half2float(h);
707
+ i = static_cast<int>(f);
708
+ const int max_val = (int)0x7fffffffU;
709
+ const int min_val = (int)0x80000000U;
710
+ // saturation fixup
711
+ if (f != f) {
712
+ // NaN
713
+ i = 0;
714
+ } else if (f > static_cast<float>(max_val)) {
715
+ // saturate maximum
716
+ i = max_val;
717
+ } else if (f < static_cast<float>(min_val)) {
718
+ // saturate minimum
719
+ i = min_val;
720
+ }
721
+ #endif
722
+ return i;
723
+ }
724
+ __CUDA_FP16_DECL__ int __half2int_rd(const __half h)
725
+ {
726
+ int i;
727
+ asm("cvt.rmi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
728
+ return i;
729
+ }
730
+ __CUDA_FP16_DECL__ int __half2int_ru(const __half h)
731
+ {
732
+ int i;
733
+ asm("cvt.rpi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
734
+ return i;
735
+ }
736
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i)
737
+ {
738
+ __half h;
739
+ #if defined(__CUDA_ARCH__)
740
+ asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
741
+ #else
742
+ // double-rounding is not a problem here: if integer
743
+ // has more than 24 bits, it is already too large to
744
+ // be represented in half precision, and result will
745
+ // be infinity.
746
+ const float f = static_cast<float>(i);
747
+ h = __float2half_rn(f);
748
+ #endif
749
+ return h;
750
+ }
751
+ __CUDA_FP16_DECL__ __half __int2half_rz(const int i)
752
+ {
753
+ __half h;
754
+ asm("cvt.rz.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
755
+ return h;
756
+ }
757
+ __CUDA_FP16_DECL__ __half __int2half_rd(const int i)
758
+ {
759
+ __half h;
760
+ asm("cvt.rm.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
761
+ return h;
762
+ }
763
+ __CUDA_FP16_DECL__ __half __int2half_ru(const int i)
764
+ {
765
+ __half h;
766
+ asm("cvt.rp.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
767
+ return h;
768
+ }
769
+
770
+ __CUDA_FP16_DECL__ short int __half2short_rn(const __half h)
771
+ {
772
+ short int i;
773
+ asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
774
+ return i;
775
+ }
776
+ __CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h)
777
+ {
778
+ short int i;
779
+ #if defined __CUDA_ARCH__
780
+ asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
781
+ #else
782
+ const float f = __half2float(h);
783
+ i = static_cast<short int>(f);
784
+ const short int max_val = (short int)0x7fffU;
785
+ const short int min_val = (short int)0x8000U;
786
+ // saturation fixup
787
+ if (f != f) {
788
+ // NaN
789
+ i = 0;
790
+ } else if (f > static_cast<float>(max_val)) {
791
+ // saturate maximum
792
+ i = max_val;
793
+ } else if (f < static_cast<float>(min_val)) {
794
+ // saturate minimum
795
+ i = min_val;
796
+ }
797
+ #endif
798
+ return i;
799
+ }
800
+ __CUDA_FP16_DECL__ short int __half2short_rd(const __half h)
801
+ {
802
+ short int i;
803
+ asm("cvt.rmi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
804
+ return i;
805
+ }
806
+ __CUDA_FP16_DECL__ short int __half2short_ru(const __half h)
807
+ {
808
+ short int i;
809
+ asm("cvt.rpi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
810
+ return i;
811
+ }
812
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i)
813
+ {
814
+ __half h;
815
+ #if defined __CUDA_ARCH__
816
+ asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
817
+ #else
818
+ const float f = static_cast<float>(i);
819
+ h = __float2half_rn(f);
820
+ #endif
821
+ return h;
822
+ }
823
+ __CUDA_FP16_DECL__ __half __short2half_rz(const short int i)
824
+ {
825
+ __half h;
826
+ asm("cvt.rz.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
827
+ return h;
828
+ }
829
+ __CUDA_FP16_DECL__ __half __short2half_rd(const short int i)
830
+ {
831
+ __half h;
832
+ asm("cvt.rm.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
833
+ return h;
834
+ }
835
+ __CUDA_FP16_DECL__ __half __short2half_ru(const short int i)
836
+ {
837
+ __half h;
838
+ asm("cvt.rp.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
839
+ return h;
840
+ }
841
+
842
+ __CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h)
843
+ {
844
+ unsigned int i;
845
+ asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
846
+ return i;
847
+ }
848
+ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h)
849
+ {
850
+ unsigned int i;
851
+ #if defined __CUDA_ARCH__
852
+ asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
853
+ #else
854
+ const float f = __half2float(h);
855
+ i = static_cast<unsigned int>(f);
856
+ const unsigned int max_val = 0xffffffffU;
857
+ const unsigned int min_val = 0U;
858
+ // saturation fixup
859
+ if (f != f) {
860
+ // NaN
861
+ i = 0U;
862
+ } else if (f > static_cast<float>(max_val)) {
863
+ // saturate maximum
864
+ i = max_val;
865
+ } else if (f < static_cast<float>(min_val)) {
866
+ // saturate minimum
867
+ i = min_val;
868
+ }
869
+ #endif
870
+ return i;
871
+ }
872
+ __CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h)
873
+ {
874
+ unsigned int i;
875
+ asm("cvt.rmi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
876
+ return i;
877
+ }
878
+ __CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h)
879
+ {
880
+ unsigned int i;
881
+ asm("cvt.rpi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
882
+ return i;
883
+ }
884
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i)
885
+ {
886
+ __half h;
887
+ #if defined __CUDA_ARCH__
888
+ asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
889
+ #else
890
+ // double-rounding is not a problem here: if integer
891
+ // has more than 24 bits, it is already too large to
892
+ // be represented in half precision, and result will
893
+ // be infinity.
894
+ const float f = static_cast<float>(i);
895
+ h = __float2half_rn(f);
896
+ #endif
897
+ return h;
898
+ }
899
+ __CUDA_FP16_DECL__ __half __uint2half_rz(const unsigned int i)
900
+ {
901
+ __half h;
902
+ asm("cvt.rz.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
903
+ return h;
904
+ }
905
+ __CUDA_FP16_DECL__ __half __uint2half_rd(const unsigned int i)
906
+ {
907
+ __half h;
908
+ asm("cvt.rm.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
909
+ return h;
910
+ }
911
+ __CUDA_FP16_DECL__ __half __uint2half_ru(const unsigned int i)
912
+ {
913
+ __half h;
914
+ asm("cvt.rp.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
915
+ return h;
916
+ }
917
+
918
+ __CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h)
919
+ {
920
+ unsigned short int i;
921
+ asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
922
+ return i;
923
+ }
924
+ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h)
925
+ {
926
+ unsigned short int i;
927
+ #if defined __CUDA_ARCH__
928
+ asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
929
+ #else
930
+ const float f = __half2float(h);
931
+ i = static_cast<unsigned short int>(f);
932
+ const unsigned short int max_val = 0xffffU;
933
+ const unsigned short int min_val = 0U;
934
+ // saturation fixup
935
+ if (f != f) {
936
+ // NaN
937
+ i = 0U;
938
+ } else if (f > static_cast<float>(max_val)) {
939
+ // saturate maximum
940
+ i = max_val;
941
+ } else if (f < static_cast<float>(min_val)) {
942
+ // saturate minimum
943
+ i = min_val;
944
+ }
945
+ #endif
946
+ return i;
947
+ }
948
+ __CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h)
949
+ {
950
+ unsigned short int i;
951
+ asm("cvt.rmi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
952
+ return i;
953
+ }
954
+ __CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h)
955
+ {
956
+ unsigned short int i;
957
+ asm("cvt.rpi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
958
+ return i;
959
+ }
960
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i)
961
+ {
962
+ __half h;
963
+ #if defined __CUDA_ARCH__
964
+ asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
965
+ #else
966
+ const float f = static_cast<float>(i);
967
+ h = __float2half_rn(f);
968
+ #endif
969
+ return h;
970
+ }
971
+ __CUDA_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i)
972
+ {
973
+ __half h;
974
+ asm("cvt.rz.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
975
+ return h;
976
+ }
977
+ __CUDA_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i)
978
+ {
979
+ __half h;
980
+ asm("cvt.rm.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
981
+ return h;
982
+ }
983
+ __CUDA_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i)
984
+ {
985
+ __half h;
986
+ asm("cvt.rp.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
987
+ return h;
988
+ }
989
+
990
+ __CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h)
991
+ {
992
+ unsigned long long int i;
993
+ asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
994
+ return i;
995
+ }
996
+ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h)
997
+ {
998
+ unsigned long long int i;
999
+ #if defined __CUDA_ARCH__
1000
+ asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
1001
+ #else
1002
+ const float f = __half2float(h);
1003
+ i = static_cast<unsigned long long int>(f);
1004
+ const unsigned long long int max_val = 0xffffffffffffffffULL;
1005
+ const unsigned long long int min_val = 0ULL;
1006
+ // saturation fixup
1007
+ if (f != f) {
1008
+ // NaN
1009
+ i = 0x8000000000000000ULL;
1010
+ } else if (f > static_cast<float>(max_val)) {
1011
+ // saturate maximum
1012
+ i = max_val;
1013
+ } else if (f < static_cast<float>(min_val)) {
1014
+ // saturate minimum
1015
+ i = min_val;
1016
+ }
1017
+ #endif
1018
+ return i;
1019
+ }
1020
+ __CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h)
1021
+ {
1022
+ unsigned long long int i;
1023
+ asm("cvt.rmi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
1024
+ return i;
1025
+ }
1026
+ __CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h)
1027
+ {
1028
+ unsigned long long int i;
1029
+ asm("cvt.rpi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
1030
+ return i;
1031
+ }
1032
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i)
1033
+ {
1034
+ __half h;
1035
+ #if defined(__CUDA_ARCH__)
1036
+ asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
1037
+ #else
1038
+ // double-rounding is not a problem here: if integer
1039
+ // has more than 24 bits, it is already too large to
1040
+ // be represented in half precision, and result will
1041
+ // be infinity.
1042
+ const float f = static_cast<float>(i);
1043
+ h = __float2half_rn(f);
1044
+ #endif
1045
+ return h;
1046
+ }
1047
+ __CUDA_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i)
1048
+ {
1049
+ __half h;
1050
+ asm("cvt.rz.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
1051
+ return h;
1052
+ }
1053
+ __CUDA_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i)
1054
+ {
1055
+ __half h;
1056
+ asm("cvt.rm.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
1057
+ return h;
1058
+ }
1059
+ __CUDA_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i)
1060
+ {
1061
+ __half h;
1062
+ asm("cvt.rp.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
1063
+ return h;
1064
+ }
1065
+
1066
+ __CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h)
1067
+ {
1068
+ long long int i;
1069
+ asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
1070
+ return i;
1071
+ }
1072
+ __CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h)
1073
+ {
1074
+ long long int i;
1075
+ #if defined __CUDA_ARCH__
1076
+ asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
1077
+ #else
1078
+ const float f = __half2float(h);
1079
+ i = static_cast<long long int>(f);
1080
+ const long long int max_val = (long long int)0x7fffffffffffffffULL;
1081
+ const long long int min_val = (long long int)0x8000000000000000ULL;
1082
+ // saturation fixup
1083
+ if (f != f) {
1084
+ // NaN
1085
+ i = min_val;
1086
+ } else if (f > static_cast<float>(max_val)) {
1087
+ // saturate maximum
1088
+ i = max_val;
1089
+ } else if (f < static_cast<float>(min_val)) {
1090
+ // saturate minimum
1091
+ i = min_val;
1092
+ }
1093
+ #endif
1094
+ return i;
1095
+ }
1096
+ __CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h)
1097
+ {
1098
+ long long int i;
1099
+ asm("cvt.rmi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
1100
+ return i;
1101
+ }
1102
+ __CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h)
1103
+ {
1104
+ long long int i;
1105
+ asm("cvt.rpi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
1106
+ return i;
1107
+ }
1108
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i)
1109
+ {
1110
+ __half h;
1111
+ #if defined(__CUDA_ARCH__)
1112
+ asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
1113
+ #else
1114
+ // double-rounding is not a problem here: if integer
1115
+ // has more than 24 bits, it is already too large to
1116
+ // be represented in half precision, and result will
1117
+ // be infinity.
1118
+ const float f = static_cast<float>(i);
1119
+ h = __float2half_rn(f);
1120
+ #endif
1121
+ return h;
1122
+ }
1123
+ __CUDA_FP16_DECL__ __half __ll2half_rz(const long long int i)
1124
+ {
1125
+ __half h;
1126
+ asm("cvt.rz.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
1127
+ return h;
1128
+ }
1129
+ __CUDA_FP16_DECL__ __half __ll2half_rd(const long long int i)
1130
+ {
1131
+ __half h;
1132
+ asm("cvt.rm.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
1133
+ return h;
1134
+ }
1135
+ __CUDA_FP16_DECL__ __half __ll2half_ru(const long long int i)
1136
+ {
1137
+ __half h;
1138
+ asm("cvt.rp.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
1139
+ return h;
1140
+ }
1141
+
1142
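Usage note (not part of the header): the _rn/_rz/_rd/_ru suffixes on the conversion intrinsics above select the PTX cvt rounding mode (nearest even, toward zero, toward negative infinity, toward positive infinity), and the __CUDA_HOSTDEVICE_ variants fall back to a float round-trip with explicit saturation on the host. A minimal illustrative sketch, assuming nvcc targeting sm_53 or newer (the kernel name is made up here):

    #include <cuda_fp16.h>
    #include <cstdio>

    __global__ void rounding_demo(void)
    {
        // 1.5 sits exactly between 1 and 2, so the four modes disagree.
        const __half x = __float2half(1.5f);
        printf("rn=%u rz=%u rd=%u ru=%u\n",
               (unsigned)__half2ushort_rn(x),   // 2 (ties to nearest even)
               (unsigned)__half2ushort_rz(x),   // 1 (toward zero)
               (unsigned)__half2ushort_rd(x),   // 1 (toward -inf)
               (unsigned)__half2ushort_ru(x));  // 2 (toward +inf)
    }

    int main() { rounding_demo<<<1, 1>>>(); return (int)cudaDeviceSynchronize(); }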
+ __CUDA_FP16_DECL__ __half htrunc(const __half h)
1143
+ {
1144
+ __half r;
1145
+ asm("cvt.rzi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
1146
+ return r;
1147
+ }
1148
+ __CUDA_FP16_DECL__ __half hceil(const __half h)
1149
+ {
1150
+ __half r;
1151
+ asm("cvt.rpi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
1152
+ return r;
1153
+ }
1154
+ __CUDA_FP16_DECL__ __half hfloor(const __half h)
1155
+ {
1156
+ __half r;
1157
+ asm("cvt.rmi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
1158
+ return r;
1159
+ }
1160
+ __CUDA_FP16_DECL__ __half hrint(const __half h)
1161
+ {
1162
+ __half r;
1163
+ asm("cvt.rni.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
1164
+ return r;
1165
+ }
1166
+
1167
+ __CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h)
1168
+ {
1169
+ __half2 val;
1170
+ asm("{.reg .f16 low,high;\n"
1171
+ " mov.b32 {low,high}, %1;\n"
1172
+ " cvt.rzi.f16.f16 low, low;\n"
1173
+ " cvt.rzi.f16.f16 high, high;\n"
1174
+ " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
1175
+ return val;
1176
+ }
1177
+ __CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h)
1178
+ {
1179
+ __half2 val;
1180
+ asm("{.reg .f16 low,high;\n"
1181
+ " mov.b32 {low,high}, %1;\n"
1182
+ " cvt.rpi.f16.f16 low, low;\n"
1183
+ " cvt.rpi.f16.f16 high, high;\n"
1184
+ " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
1185
+ return val;
1186
+ }
1187
+ __CUDA_FP16_DECL__ __half2 h2floor(const __half2 h)
1188
+ {
1189
+ __half2 val;
1190
+ asm("{.reg .f16 low,high;\n"
1191
+ " mov.b32 {low,high}, %1;\n"
1192
+ " cvt.rmi.f16.f16 low, low;\n"
1193
+ " cvt.rmi.f16.f16 high, high;\n"
1194
+ " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
1195
+ return val;
1196
+ }
1197
+ __CUDA_FP16_DECL__ __half2 h2rint(const __half2 h)
1198
+ {
1199
+ __half2 val;
1200
+ asm("{.reg .f16 low,high;\n"
1201
+ " mov.b32 {low,high}, %1;\n"
1202
+ " cvt.rni.f16.f16 low, low;\n"
1203
+ " cvt.rni.f16.f16 high, high;\n"
1204
+ " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
1205
+ return val;
1206
+ }
1207
+ __CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b)
1208
+ {
1209
+ __half2 val;
1210
+ asm("{.reg .f16 alow,ahigh,blow,bhigh;\n"
1211
+ " mov.b32 {alow,ahigh}, %1;\n"
1212
+ " mov.b32 {blow,bhigh}, %2;\n"
1213
+ " mov.b32 %0, {alow,blow};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b)));
1214
+ return val;
1215
+ }
1216
+ __CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b)
1217
+ {
1218
+ __half2 val;
1219
+ asm("{.reg .f16 alow,ahigh,blow,bhigh;\n"
1220
+ " mov.b32 {alow,ahigh}, %1;\n"
1221
+ " mov.b32 {blow,bhigh}, %2;\n"
1222
+ " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b)));
1223
+ return val;
1224
+ }
1225
+ __CUDA_FP16_DECL__ __half __low2half(const __half2 a)
1226
+ {
1227
+ __half ret;
1228
+ asm("{.reg .f16 low,high;\n"
1229
+ " mov.b32 {low,high}, %1;\n"
1230
+ " mov.b16 %0, low;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a)));
1231
+ return ret;
1232
+ }
1233
+ __CUDA_FP16_DECL__ int __hisinf(const __half a)
1234
+ {
1235
+ int retval;
1236
+ if (__HALF_TO_CUS(a) == 0xFC00U) {
1237
+ retval = -1;
1238
+ } else if (__HALF_TO_CUS(a) == 0x7C00U) {
1239
+ retval = 1;
1240
+ } else {
1241
+ retval = 0;
1242
+ }
1243
+ return retval;
1244
+ }
1245
+ __CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a)
1246
+ {
1247
+ __half2 val;
1248
+ asm("{.reg .f16 low,high;\n"
1249
+ " mov.b32 {low,high}, %1;\n"
1250
+ " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
1251
+ return val;
1252
+ }
1253
+ __CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a)
1254
+ {
1255
+ __half2 val;
1256
+ asm("{.reg .f16 low,high;\n"
1257
+ " mov.b32 {low,high}, %1;\n"
1258
+ " mov.b32 %0, {high,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
1259
+ return val;
1260
+ }
1261
+ __CUDA_FP16_DECL__ __half __high2half(const __half2 a)
1262
+ {
1263
+ __half ret;
1264
+ asm("{.reg .f16 low,high;\n"
1265
+ " mov.b32 {low,high}, %1;\n"
1266
+ " mov.b16 %0, high;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a)));
1267
+ return ret;
1268
+ }
1269
+ __CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b)
1270
+ {
1271
+ __half2 val;
1272
+ asm("{ mov.b32 %0, {%1,%2};}\n"
1273
+ : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b)));
1274
+ return val;
1275
+ }
1276
+ __CUDA_FP16_DECL__ __half2 __half2half2(const __half a)
1277
+ {
1278
+ __half2 val;
1279
+ asm("{ mov.b32 %0, {%1,%1};}\n"
1280
+ : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a)));
1281
+ return val;
1282
+ }
1283
+ __CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a)
1284
+ {
1285
+ __half2 val;
1286
+ asm("{.reg .f16 low,high;\n"
1287
+ " mov.b32 {low,high}, %1;\n"
1288
+ " mov.b32 %0, {high,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
1289
+ return val;
1290
+ }
1291
+ __CUDA_FP16_DECL__ short int __half_as_short(const __half h)
1292
+ {
1293
+ return static_cast<short int>(__HALF_TO_CUS(h));
1294
+ }
1295
+ __CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h)
1296
+ {
1297
+ return __HALF_TO_CUS(h);
1298
+ }
1299
+ __CUDA_FP16_DECL__ __half __short_as_half(const short int i)
1300
+ {
1301
+ __half h;
1302
+ __HALF_TO_US(h) = static_cast<unsigned short int>(i);
1303
+ return h;
1304
+ }
1305
+ __CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i)
1306
+ {
1307
+ __half h;
1308
+ __HALF_TO_US(h) = i;
1309
+ return h;
1310
+ }
1311
+
1312
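Usage note (not part of the header): the intrinsics above pack, split and reinterpret __half/__half2 values without touching the bits themselves; __halves2half2 builds a pair, __low2half/__high2half pull lanes back out, and __half_as_ushort exposes the raw encoding. A small sketch, assuming nvcc and an fp16-capable device (names below are illustrative):

    #include <cuda_fp16.h>
    #include <cstdio>

    __global__ void pack_demo(void)
    {
        const __half2 p = __halves2half2(__float2half(1.0f), __float2half(2.0f));
        const __half2 q = __lowhigh2highlow(p);              // swap the two lanes
        printf("low=%f high=%f bits=0x%04x\n",
               __half2float(__low2half(q)),                  // 2.0
               __half2float(__high2half(q)),                 // 1.0
               (unsigned)__half_as_ushort(__low2half(p)));   // 0x3c00, the fp16 encoding of 1.0
    }

    int main() { pack_demo<<<1, 1>>>(); return (int)cudaDeviceSynchronize(); }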
+ #if __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)
1313
+ /******************************************************************************
1314
+ * __half, __half2 warp shuffle *
1315
+ ******************************************************************************/
1316
+ #define __SHUFFLE_HALF2_MACRO(name) /* do */ {\
1317
+ __half2 r; \
1318
+ asm volatile ("{"#name" %0,%1,%2,%3;\n}" \
1319
+ :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \
1320
+ return r; \
1321
+ } /* while(0) */
1322
+
1323
+ #define __SHUFFLE_SYNC_HALF2_MACRO(name) /* do */ {\
1324
+ __half2 r; \
1325
+ asm volatile ("{"#name" %0,%1,%2,%3,%4;\n}" \
1326
+ :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \
1327
+ return r; \
1328
+ } /* while(0) */
1329
+
1330
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
1331
+
1332
+ __CUDA_FP16_DECL__ __half2 __shfl(const __half2 var, const int delta, const int width)
1333
+ {
1334
+ unsigned int warp_size;
1335
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1336
+ const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
1337
+ __SHUFFLE_HALF2_MACRO(shfl.idx.b32)
1338
+ }
1339
+ __CUDA_FP16_DECL__ __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width)
1340
+ {
1341
+ unsigned int warp_size;
1342
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1343
+ const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U;
1344
+ __SHUFFLE_HALF2_MACRO(shfl.up.b32)
1345
+ }
1346
+ __CUDA_FP16_DECL__ __half2 __shfl_down(const __half2 var, const unsigned int delta, const int width)
1347
+ {
1348
+ unsigned int warp_size;
1349
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1350
+ const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
1351
+ __SHUFFLE_HALF2_MACRO(shfl.down.b32)
1352
+ }
1353
+ __CUDA_FP16_DECL__ __half2 __shfl_xor(const __half2 var, const int delta, const int width)
1354
+ {
1355
+ unsigned int warp_size;
1356
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1357
+ const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
1358
+ __SHUFFLE_HALF2_MACRO(shfl.bfly.b32)
1359
+ }
1360
+
1361
+ #endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
1362
+
1363
+ __CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned mask, const __half2 var, const int delta, const int width)
1364
+ {
1365
+ unsigned int warp_size;
1366
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1367
+ const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
1368
+ __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.idx.b32)
1369
+ }
1370
+ __CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width)
1371
+ {
1372
+ unsigned int warp_size;
1373
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1374
+ const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U;
1375
+ __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.up.b32)
1376
+ }
1377
+ __CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width)
1378
+ {
1379
+ unsigned int warp_size;
1380
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1381
+ const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
1382
+ __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.down.b32)
1383
+ }
1384
+ __CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 var, const int delta, const int width)
1385
+ {
1386
+ unsigned int warp_size;
1387
+ asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
1388
+ const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
1389
+ __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.bfly.b32)
1390
+ }
1391
+
1392
+ #undef __SHUFFLE_HALF2_MACRO
1393
+ #undef __SHUFFLE_SYNC_HALF2_MACRO
1394
+
1395
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
1396
+
1397
+ __CUDA_FP16_DECL__ __half __shfl(const __half var, const int delta, const int width)
1398
+ {
1399
+ const __half2 temp1 = __halves2half2(var, var);
1400
+ const __half2 temp2 = __shfl(temp1, delta, width);
1401
+ return __low2half(temp2);
1402
+ }
1403
+ __CUDA_FP16_DECL__ __half __shfl_up(const __half var, const unsigned int delta, const int width)
1404
+ {
1405
+ const __half2 temp1 = __halves2half2(var, var);
1406
+ const __half2 temp2 = __shfl_up(temp1, delta, width);
1407
+ return __low2half(temp2);
1408
+ }
1409
+ __CUDA_FP16_DECL__ __half __shfl_down(const __half var, const unsigned int delta, const int width)
1410
+ {
1411
+ const __half2 temp1 = __halves2half2(var, var);
1412
+ const __half2 temp2 = __shfl_down(temp1, delta, width);
1413
+ return __low2half(temp2);
1414
+ }
1415
+ __CUDA_FP16_DECL__ __half __shfl_xor(const __half var, const int delta, const int width)
1416
+ {
1417
+ const __half2 temp1 = __halves2half2(var, var);
1418
+ const __half2 temp2 = __shfl_xor(temp1, delta, width);
1419
+ return __low2half(temp2);
1420
+ }
1421
+
1422
+ #endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
1423
+
1424
+ __CUDA_FP16_DECL__ __half __shfl_sync(const unsigned mask, const __half var, const int delta, const int width)
1425
+ {
1426
+ const __half2 temp1 = __halves2half2(var, var);
1427
+ const __half2 temp2 = __shfl_sync(mask, temp1, delta, width);
1428
+ return __low2half(temp2);
1429
+ }
1430
+ __CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned mask, const __half var, const unsigned int delta, const int width)
1431
+ {
1432
+ const __half2 temp1 = __halves2half2(var, var);
1433
+ const __half2 temp2 = __shfl_up_sync(mask, temp1, delta, width);
1434
+ return __low2half(temp2);
1435
+ }
1436
+ __CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned mask, const __half var, const unsigned int delta, const int width)
1437
+ {
1438
+ const __half2 temp1 = __halves2half2(var, var);
1439
+ const __half2 temp2 = __shfl_down_sync(mask, temp1, delta, width);
1440
+ return __low2half(temp2);
1441
+ }
1442
+ __CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned mask, const __half var, const int delta, const int width)
1443
+ {
1444
+ const __half2 temp1 = __halves2half2(var, var);
1445
+ const __half2 temp2 = __shfl_xor_sync(mask, temp1, delta, width);
1446
+ return __low2half(temp2);
1447
+ }
1448
+
1449
+ #endif /*__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)*/
1450
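Usage note (not part of the header): the wrappers above let a plain __half ride through the 32-bit warp shuffles by duplicating it into both lanes of a __half2 and taking the low lane back out. A typical use is a warp-level reduction; this sketch is illustrative and assumes sm_53 or newer so __hadd is available:

    #include <cuda_fp16.h>
    #include <cstdio>

    __global__ void warp_sum(void)
    {
        __half v = __float2half(1.0f);                 // every lane contributes 1.0
        for (int offset = 16; offset > 0; offset >>= 1) {
            v = __hadd(v, __shfl_down_sync(0xffffffffu, v, (unsigned)offset, 32));
        }
        if ((threadIdx.x & 31U) == 0U) {               // lane 0 of each warp holds the sum
            printf("warp %d sum = %f\n", (int)(threadIdx.x >> 5), __half2float(v));
        }
    }

    int main() { warp_sum<<<1, 64>>>(); return (int)cudaDeviceSynchronize(); }   // prints 32.0 twice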
+ /******************************************************************************
1451
+ * __half and __half2 __ldg,__ldcg,__ldca,__ldcs *
1452
+ ******************************************************************************/
1453
+
1454
+ #if defined(__cplusplus) && (__CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__))
1455
+ #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
1456
+ #define __LDG_PTR "l"
1457
+ #else
1458
+ #define __LDG_PTR "r"
1459
+ #endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
1460
+ __CUDA_FP16_DECL__ __half2 __ldg(const __half2 *const ptr)
1461
+ {
1462
+ __half2 ret;
1463
+ asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
1464
+ return ret;
1465
+ }
1466
+ __CUDA_FP16_DECL__ __half __ldg(const __half *const ptr)
1467
+ {
1468
+ __half ret;
1469
+ asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
1470
+ return ret;
1471
+ }
1472
+ __CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *const ptr)
1473
+ {
1474
+ __half2 ret;
1475
+ asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
1476
+ return ret;
1477
+ }
1478
+ __CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr)
1479
+ {
1480
+ __half ret;
1481
+ asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
1482
+ return ret;
1483
+ }
1484
+ __CUDA_FP16_DECL__ __half2 __ldca(const __half2 *const ptr)
1485
+ {
1486
+ __half2 ret;
1487
+ asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
1488
+ return ret;
1489
+ }
1490
+ __CUDA_FP16_DECL__ __half __ldca(const __half *const ptr)
1491
+ {
1492
+ __half ret;
1493
+ asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
1494
+ return ret;
1495
+ }
1496
+ __CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *const ptr)
1497
+ {
1498
+ __half2 ret;
1499
+ asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
1500
+ return ret;
1501
+ }
1502
+ __CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr)
1503
+ {
1504
+ __half ret;
1505
+ asm ("ld.global.cs.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
1506
+ return ret;
1507
+ }
1508
+ __CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *const ptr)
1509
+ {
1510
+ __half2 ret;
1511
+ asm ("ld.global.lu.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
1512
+ return ret;
1513
+ }
1514
+ __CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr)
1515
+ {
1516
+ __half ret;
1517
+ asm ("ld.global.lu.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
1518
+ return ret;
1519
+ }
1520
+ __CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *const ptr)
1521
+ {
1522
+ __half2 ret;
1523
+ asm ("ld.global.cv.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
1524
+ return ret;
1525
+ }
1526
+ __CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr)
1527
+ {
1528
+ __half ret;
1529
+ asm ("ld.global.cv.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
1530
+ return ret;
1531
+ }
1532
+ __CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value)
1533
+ {
1534
+ asm ("st.global.wb.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
1535
+ }
1536
+ __CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value)
1537
+ {
1538
+ asm ("st.global.wb.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory");
1539
+ }
1540
+ __CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value)
1541
+ {
1542
+ asm ("st.global.cg.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
1543
+ }
1544
+ __CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value)
1545
+ {
1546
+ asm ("st.global.cg.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory");
1547
+ }
1548
+ __CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value)
1549
+ {
1550
+ asm ("st.global.cs.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
1551
+ }
1552
+ __CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value)
1553
+ {
1554
+ asm ("st.global.cs.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory");
1555
+ }
1556
+ __CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value)
1557
+ {
1558
+ asm ("st.global.wt.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
1559
+ }
1560
+ __CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value)
1561
+ {
1562
+ asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory");
1563
+ }
1564
+ #undef __LDG_PTR
1565
+ #endif /*defined(__cplusplus) && (__CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__))*/
1566
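Usage note (not part of the header): __ldg/__ldca/__ldcg/__ldcs/__ldlu/__ldcv and the __st* variants expose the PTX cache hints (non-coherent, cache-all, cache-global, streaming, last-use, volatile, write-back, write-through) for half-precision loads and stores. A kernel-only sketch of a streaming elementwise scale, assuming sm_53 or newer; allocation and launch are omitted and the names are illustrative:

    #include <cuda_fp16.h>

    __global__ void stream_scale(__half2 *out, const __half2 *in, int n, const __half2 s)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            const __half2 v = __ldcs(in + i);     // ld.global.cs: evict-first hint for one-pass data
            __stcs(out + i, __hmul2(v, s));       // st.global.cs: streaming store
        }
    }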
+ #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
1567
+ /******************************************************************************
1568
+ * __half2 comparison *
1569
+ ******************************************************************************/
1570
+ #define __COMPARISON_OP_HALF2_MACRO(name) /* do */ {\
1571
+ __half2 val; \
1572
+ asm( "{ "#name".f16x2.f16x2 %0,%1,%2;\n}" \
1573
+ :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
1574
+ return val; \
1575
+ } /* while(0) */
1576
+ __CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b)
1577
+ {
1578
+ __COMPARISON_OP_HALF2_MACRO(set.eq)
1579
+ }
1580
+ __CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b)
1581
+ {
1582
+ __COMPARISON_OP_HALF2_MACRO(set.ne)
1583
+ }
1584
+ __CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b)
1585
+ {
1586
+ __COMPARISON_OP_HALF2_MACRO(set.le)
1587
+ }
1588
+ __CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b)
1589
+ {
1590
+ __COMPARISON_OP_HALF2_MACRO(set.ge)
1591
+ }
1592
+ __CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b)
1593
+ {
1594
+ __COMPARISON_OP_HALF2_MACRO(set.lt)
1595
+ }
1596
+ __CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b)
1597
+ {
1598
+ __COMPARISON_OP_HALF2_MACRO(set.gt)
1599
+ }
1600
+ __CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b)
1601
+ {
1602
+ __COMPARISON_OP_HALF2_MACRO(set.equ)
1603
+ }
1604
+ __CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b)
1605
+ {
1606
+ __COMPARISON_OP_HALF2_MACRO(set.neu)
1607
+ }
1608
+ __CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b)
1609
+ {
1610
+ __COMPARISON_OP_HALF2_MACRO(set.leu)
1611
+ }
1612
+ __CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b)
1613
+ {
1614
+ __COMPARISON_OP_HALF2_MACRO(set.geu)
1615
+ }
1616
+ __CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b)
1617
+ {
1618
+ __COMPARISON_OP_HALF2_MACRO(set.ltu)
1619
+ }
1620
+ __CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b)
1621
+ {
1622
+ __COMPARISON_OP_HALF2_MACRO(set.gtu)
1623
+ }
1624
+ #undef __COMPARISON_OP_HALF2_MACRO
1625
+ #define __BOOL_COMPARISON_OP_HALF2_MACRO(name) /* do */ {\
1626
+ __half2 val; \
1627
+ bool retval; \
1628
+ asm( "{ "#name".f16x2.f16x2 %0,%1,%2;\n}" \
1629
+ :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
1630
+ if (__HALF2_TO_CUI(val) == 0x3C003C00U) {\
1631
+ retval = true; \
1632
+ } else { \
1633
+ retval = false; \
1634
+ }\
1635
+ return retval;\
1636
+ } /* while(0) */
1637
+ __CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b)
1638
+ {
1639
+ __BOOL_COMPARISON_OP_HALF2_MACRO(set.eq)
1640
+ }
1641
+ __CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b)
1642
+ {
1643
+ __BOOL_COMPARISON_OP_HALF2_MACRO(set.ne)
1644
+ }
1645
+ __CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b)
1646
+ {
1647
+ __BOOL_COMPARISON_OP_HALF2_MACRO(set.le)
1648
+ }
1649
+ __CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b)
1650
+ {
1651
+ __BOOL_COMPARISON_OP_HALF2_MACRO(set.ge)
1652
+ }
1653
+ __CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b)
1654
+ {
1655
+ __BOOL_COMPARISON_OP_HALF2_MACRO(set.lt)
1656
+ }
1657
+ __CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b)
1658
+ {
1659
+ __BOOL_COMPARISON_OP_HALF2_MACRO(set.gt)
1660
+ }
1661
+ __CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b)
1662
+ {
1663
+ __BOOL_COMPARISON_OP_HALF2_MACRO(set.equ)
1664
+ }
1665
+ __CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b)
1666
+ {
1667
+ __BOOL_COMPARISON_OP_HALF2_MACRO(set.neu)
1668
+ }
1669
+ __CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b)
1670
+ {
1671
+ __BOOL_COMPARISON_OP_HALF2_MACRO(set.leu)
1672
+ }
1673
+ __CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b)
1674
+ {
1675
+ __BOOL_COMPARISON_OP_HALF2_MACRO(set.geu)
1676
+ }
1677
+ __CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b)
1678
+ {
1679
+ __BOOL_COMPARISON_OP_HALF2_MACRO(set.ltu)
1680
+ }
1681
+ __CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b)
1682
+ {
1683
+ __BOOL_COMPARISON_OP_HALF2_MACRO(set.gtu)
1684
+ }
1685
+ #undef __BOOL_COMPARISON_OP_HALF2_MACRO
1686
+ /******************************************************************************
1687
+ * __half comparison *
1688
+ ******************************************************************************/
1689
+ #define __COMPARISON_OP_HALF_MACRO(name) /* do */ {\
1690
+ unsigned short val; \
1691
+ asm( "{ .reg .pred __$temp3;\n" \
1692
+ " setp."#name".f16 __$temp3, %1, %2;\n" \
1693
+ " selp.u16 %0, 1, 0, __$temp3;}" \
1694
+ : "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \
1695
+ return (val != 0U) ? true : false; \
1696
+ } /* while(0) */
1697
+ __CUDA_FP16_DECL__ bool __heq(const __half a, const __half b)
1698
+ {
1699
+ __COMPARISON_OP_HALF_MACRO(eq)
1700
+ }
1701
+ __CUDA_FP16_DECL__ bool __hne(const __half a, const __half b)
1702
+ {
1703
+ __COMPARISON_OP_HALF_MACRO(ne)
1704
+ }
1705
+ __CUDA_FP16_DECL__ bool __hle(const __half a, const __half b)
1706
+ {
1707
+ __COMPARISON_OP_HALF_MACRO(le)
1708
+ }
1709
+ __CUDA_FP16_DECL__ bool __hge(const __half a, const __half b)
1710
+ {
1711
+ __COMPARISON_OP_HALF_MACRO(ge)
1712
+ }
1713
+ __CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b)
1714
+ {
1715
+ __COMPARISON_OP_HALF_MACRO(lt)
1716
+ }
1717
+ __CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b)
1718
+ {
1719
+ __COMPARISON_OP_HALF_MACRO(gt)
1720
+ }
1721
+ __CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b)
1722
+ {
1723
+ __COMPARISON_OP_HALF_MACRO(equ)
1724
+ }
1725
+ __CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b)
1726
+ {
1727
+ __COMPARISON_OP_HALF_MACRO(neu)
1728
+ }
1729
+ __CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b)
1730
+ {
1731
+ __COMPARISON_OP_HALF_MACRO(leu)
1732
+ }
1733
+ __CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b)
1734
+ {
1735
+ __COMPARISON_OP_HALF_MACRO(geu)
1736
+ }
1737
+ __CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b)
1738
+ {
1739
+ __COMPARISON_OP_HALF_MACRO(ltu)
1740
+ }
1741
+ __CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b)
1742
+ {
1743
+ __COMPARISON_OP_HALF_MACRO(gtu)
1744
+ }
1745
+ #undef __COMPARISON_OP_HALF_MACRO
1746
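Usage note (not part of the header): the plain comparisons above are ordered (false whenever an operand is NaN) while the 'u' variants are unordered (true when an operand is NaN); the __hb*2 forms fold the per-lane __half2 result into a single bool that is true only when both lanes pass. A small sketch, assuming sm_53 or newer (the kernel name is illustrative):

    #include <cuda_fp16.h>
    #include <cstdio>

    __global__ void cmp_demo(void)
    {
        const __half qnan = __ushort_as_half(0x7e00U);   // an fp16 quiet NaN
        const __half one  = __float2half(1.0f);
        printf("__heq(nan,nan)=%d  __hequ(nan,nan)=%d  __hneu(1,nan)=%d\n",
               (int)__heq(qnan, qnan),    // 0: ordered, NaN compares false
               (int)__hequ(qnan, qnan),   // 1: unordered, NaN compares true
               (int)__hneu(one, qnan));   // 1
    }

    int main() { cmp_demo<<<1, 1>>>(); return (int)cudaDeviceSynchronize(); }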
+ /******************************************************************************
1747
+ * __half2 arithmetic *
1748
+ ******************************************************************************/
1749
+ __CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b)
1750
+ {
1751
+ __BINARY_OP_HALF2_MACRO(add)
1752
+ }
1753
+ __CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b)
1754
+ {
1755
+ __BINARY_OP_HALF2_MACRO(sub)
1756
+ }
1757
+ __CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b)
1758
+ {
1759
+ __BINARY_OP_HALF2_MACRO(mul)
1760
+ }
1761
+ __CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b)
1762
+ {
1763
+ __BINARY_OP_HALF2_MACRO(add.sat)
1764
+ }
1765
+ __CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b)
1766
+ {
1767
+ __BINARY_OP_HALF2_MACRO(sub.sat)
1768
+ }
1769
+ __CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b)
1770
+ {
1771
+ __BINARY_OP_HALF2_MACRO(mul.sat)
1772
+ }
1773
+ __CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c)
1774
+ {
1775
+ __TERNARY_OP_HALF2_MACRO(fma.rn)
1776
+ }
1777
+ __CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c)
1778
+ {
1779
+ __TERNARY_OP_HALF2_MACRO(fma.rn.sat)
1780
+ }
1781
+ __CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b) {
1782
+ __half ha = __low2half(a);
1783
+ __half hb = __low2half(b);
1784
+
1785
+ const __half v1 = __hdiv(ha, hb);
1786
+
1787
+ ha = __high2half(a);
1788
+ hb = __high2half(b);
1789
+
1790
+ const __half v2 = __hdiv(ha, hb);
1791
+
1792
+ return __halves2half2(v1, v2);
1793
+ }
1794
+ /******************************************************************************
1795
+ * __half arithmetic *
1796
+ ******************************************************************************/
1797
+ __CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b)
1798
+ {
1799
+ __BINARY_OP_HALF_MACRO(add)
1800
+ }
1801
+ __CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b)
1802
+ {
1803
+ __BINARY_OP_HALF_MACRO(sub)
1804
+ }
1805
+ __CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b)
1806
+ {
1807
+ __BINARY_OP_HALF_MACRO(mul)
1808
+ }
1809
+ __CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b)
1810
+ {
1811
+ __BINARY_OP_HALF_MACRO(add.sat)
1812
+ }
1813
+ __CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b)
1814
+ {
1815
+ __BINARY_OP_HALF_MACRO(sub.sat)
1816
+ }
1817
+ __CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b)
1818
+ {
1819
+ __BINARY_OP_HALF_MACRO(mul.sat)
1820
+ }
1821
+
1822
+ __CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c)
1823
+ {
1824
+ __TERNARY_OP_HALF_MACRO(fma.rn)
1825
+ }
1826
+ __CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c)
1827
+ {
1828
+ __TERNARY_OP_HALF_MACRO(fma.rn.sat)
1829
+ }
1830
+ __CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b) {
1831
+ __half v;
1832
+ __half abs;
1833
+ __half den;
1834
+ __HALF_TO_US(den) = 0x008FU;
1835
+
1836
+ float rcp;
1837
+ const float fa = __half2float(a);
1838
+ const float fb = __half2float(b);
1839
+
1840
+ asm("{rcp.approx.ftz.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb));
1841
+
1842
+ float fv = rcp * fa;
1843
+
1844
+ v = __float2half(fv);
1845
+ __HALF_TO_US(abs) = static_cast<unsigned short>(static_cast<unsigned int>(__HALF_TO_CUS(v)) & 0x00007FFFU);
1846
+ if (__hlt(abs, den) && (!(__HALF_TO_CUS(abs) == 0x0000U))) {
1847
+ const float err = __fmaf_rn(-fb, fv, fa);
1848
+ fv = __fmaf_rn(rcp, err, fv);
1849
+ v = __float2half(fv);
1850
+ }
1851
+ return v;
1852
+ }
1853
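Note (not part of the header): __hdiv above forms y0 = a * rcp(b) from the approximate single-precision reciprocal and, only when the half-rounded result would be tiny but nonzero (magnitude below the subnormal threshold encoded as 0x008F), spends one extra FMA on the residual, y1 = y0 + rcp(b) * (a - b * y0), before the final float-to-half rounding. A host-side sketch of that correction step in plain floats, illustrative only:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const float a = 1.0f, b = 3.0f;
        const float rcp = 1.0f / b;                                 // stands in for rcp.approx.ftz.f32
        const float y0  = rcp * a;                                  // first quotient estimate
        const float y1  = std::fma(rcp, std::fma(-b, y0, a), y0);  // one residual correction
        std::printf("y0=%.9g  y1=%.9g  ref=%.9g\n", y0, y1, a / b);
        return 0;
    }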
+
1854
+ /******************************************************************************
1855
+ * __half2 functions *
1856
+ ******************************************************************************/
1857
+ #define __SPEC_CASE2(i,r, spc, ulp) \
1858
+ "{.reg.b32 spc, ulp, p;\n"\
1859
+ " mov.b32 spc,"#spc";\n"\
1860
+ " mov.b32 ulp,"#ulp";\n"\
1861
+ " set.eq.f16x2.f16x2 p,"#i", spc;\n"\
1862
+ " fma.rn.f16x2 "#r",p,ulp,"#r";\n}\n"
1863
+ #define __SPEC_CASE(i,r, spc, ulp) \
1864
+ "{.reg.b16 spc, ulp, p;\n"\
1865
+ " mov.b16 spc,"#spc";\n"\
1866
+ " mov.b16 ulp,"#ulp";\n"\
1867
+ " set.eq.f16.f16 p,"#i", spc;\n"\
1868
+ " fma.rn.f16 "#r",p,ulp,"#r";\n}\n"
1869
+ #define __APPROX_FCAST(fun) /* do */ {\
1870
+ __half val;\
1871
+ asm("{.reg.b32 f; \n"\
1872
+ " .reg.b16 r; \n"\
1873
+ " mov.b16 r,%1; \n"\
1874
+ " cvt.f32.f16 f,r; \n"\
1875
+ " "#fun".approx.f32 f,f; \n"\
1876
+ " cvt.rn.f16.f32 r,f; \n"\
1877
+ " mov.b16 %0,r; \n"\
1878
+ "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\
1879
+ return val;\
1880
+ } /* while(0) */
1881
+ #define __APPROX_FCAST2(fun) /* do */ {\
1882
+ __half2 val;\
1883
+ asm("{.reg.b16 hl, hu; \n"\
1884
+ " .reg.b32 fl, fu; \n"\
1885
+ " mov.b32 {hl, hu}, %1; \n"\
1886
+ " cvt.f32.f16 fl, hl; \n"\
1887
+ " cvt.f32.f16 fu, hu; \n"\
1888
+ " "#fun".approx.f32 fl, fl; \n"\
1889
+ " "#fun".approx.f32 fu, fu; \n"\
1890
+ " cvt.rn.f16.f32 hl, fl; \n"\
1891
+ " cvt.rn.f16.f32 hu, fu; \n"\
1892
+ " mov.b32 %0, {hl, hu}; \n"\
1893
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); \
1894
+ return val;\
1895
+ } /* while(0) */
1896
+ static __device__ __forceinline__ float __float_simpl_sinf(float a);
1897
+ static __device__ __forceinline__ float __float_simpl_cosf(float a);
1898
+ __CUDA_FP16_DECL__ __half __hsin_internal(const __half a) {
1899
+ float f = __half2float(a);
1900
+ f = __float_simpl_sinf(f);
1901
+ return __float2half_rn(f);
1902
+ }
1903
+ __CUDA_FP16_DECL__ __half hsin(const __half a) {
1904
+ __half r = __hsin_internal(a);
1905
+ asm("{\n\t"
1906
+ " .reg.b16 i,r,t; \n\t"
1907
+ " mov.b16 r, %0; \n\t"
1908
+ " mov.b16 i, %1; \n\t"
1909
+ " mov.b16 t, 0x8000U; \n\t"
1910
+ " and.b16 t,r,t; \n\t"
1911
+ __SPEC_CASE(i, r, 0X32B3U, 0x0800U)
1912
+ __SPEC_CASE(i, r, 0X5CB0U, 0x1000U)
1913
+ __SPEC_CASE(i, r, 0XB2B3U, 0x8800U)
1914
+ __SPEC_CASE(i, r, 0XDCB0U, 0x9000U)
1915
+ " or.b16 r,r,t; \n\t"
1916
+ " mov.b16 %0, r; \n"
1917
+ "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
1918
+ return r;
1919
+ }
1920
+ __CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) {
1921
+ const __half l = __low2half(a);
1922
+ const __half h = __high2half(a);
1923
+ const __half sl = __hsin_internal(l);
1924
+ const __half sh = __hsin_internal(h);
1925
+ __half2 r = __halves2half2(sl, sh);
1926
+ asm("{\n\t"
1927
+ " .reg.b32 i,r,t; \n\t"
1928
+ " mov.b32 r, %0; \n\t"
1929
+ " mov.b32 i, %1; \n\t"
1930
+ " and.b32 t, r, 0x80008000U; \n\t"
1931
+ __SPEC_CASE2(i, r, 0X32B332B3U, 0x08000800U)
1932
+ __SPEC_CASE2(i, r, 0X5CB05CB0U, 0x10001000U)
1933
+ __SPEC_CASE2(i, r, 0XB2B3B2B3U, 0x88008800U)
1934
+ __SPEC_CASE2(i, r, 0XDCB0DCB0U, 0x90009000U)
1935
+ " or.b32 r, r, t; \n\t"
1936
+ " mov.b32 %0, r; \n"
1937
+ "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
1938
+ return r;
1939
+ }
1940
+ __CUDA_FP16_DECL__ __half __hcos_internal(const __half a) {
1941
+ float f = __half2float(a);
1942
+ f = __float_simpl_cosf(f);
1943
+ return __float2half_rn(f);
1944
+ }
1945
+ __CUDA_FP16_DECL__ __half hcos(const __half a) {
1946
+ __half r = __hcos_internal(a);
1947
+ asm("{\n\t"
1948
+ " .reg.b16 i,r; \n\t"
1949
+ " mov.b16 r, %0; \n\t"
1950
+ " mov.b16 i, %1; \n\t"
1951
+ __SPEC_CASE(i, r, 0X2B7CU, 0x1000U)
1952
+ __SPEC_CASE(i, r, 0XAB7CU, 0x1000U)
1953
+ " mov.b16 %0, r; \n"
1954
+ "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
1955
+ return r;
1956
+ }
1957
+ __CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) {
1958
+ const __half l = __low2half(a);
1959
+ const __half h = __high2half(a);
1960
+ const __half cl = __hcos_internal(l);
1961
+ const __half ch = __hcos_internal(h);
1962
+ __half2 r = __halves2half2(cl, ch);
1963
+ asm("{\n\t"
1964
+ " .reg.b32 i,r; \n\t"
1965
+ " mov.b32 r, %0; \n\t"
1966
+ " mov.b32 i, %1; \n\t"
1967
+ __SPEC_CASE2(i, r, 0X2B7C2B7CU, 0x10001000U)
1968
+ __SPEC_CASE2(i, r, 0XAB7CAB7CU, 0x10001000U)
1969
+ " mov.b32 %0, r; \n"
1970
+ "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
1971
+ return r;
1972
+ }
1973
+ static __device__ __forceinline__ float __internal_trig_reduction_kernel(const float a, int *quadrant)
1974
+ {
1975
+ const int q = __float2int_rn(a * 0.636619772F);
1976
+ const float j = static_cast<float>(q);
1977
+ float t = __fmaf_rn(-j, 1.5707962512969971e+000F, a);
1978
+ t = __fmaf_rn(-j, 7.5497894158615964e-008F, t);
1979
+ *quadrant = q;
1980
+ return t;
1981
+ }
1982
+ static __device__ __forceinline__ float __internal_sin_cos_kernel(float x, const int i)
1983
+ {
1984
+ float z;
1985
+ const float x2 = x*x;
1986
+
1987
+ if ((static_cast<unsigned>(i) & 1U) != 0U) {
1988
+ z = 2.44331571e-5F;
1989
+ z = __fmaf_rn(z, x2, -1.38873163e-3F);
1990
+ }
1991
+ else {
1992
+ z = -1.95152959e-4F;
1993
+ z = __fmaf_rn(z, x2, 8.33216087e-3F);
1994
+ }
1995
+ if ((static_cast<unsigned>(i) & 1U) != 0U) {
1996
+ z = __fmaf_rn(z, x2, 4.16666457e-2F);
1997
+ z = __fmaf_rn(z, x2, -5.00000000e-1F);
1998
+ }
1999
+ else {
2000
+ z = __fmaf_rn(z, x2, -1.66666546e-1F);
2001
+ z = __fmaf_rn(z, x2, 0.0F);
2002
+ }
2003
+ if ((static_cast<unsigned>(i) & 1U) != 0U) {
2004
+ x = __fmaf_rn(z, x2, 1.0F);
2005
+ }
2006
+ else {
2007
+ x = __fmaf_rn(z, x, x);
2008
+ }
2009
+ if ((static_cast<unsigned>(i) & 2U) != 0U) {
2010
+ x = __fmaf_rn(x, -1.0F, 0.0F);
2011
+ }
2012
+ return x;
2013
+ }
2014
+ static __device__ __forceinline__ float __float_simpl_sinf(float a)
2015
+ {
2016
+ float z;
2017
+ int i;
2018
+ if (::isinf(a)) {
2019
+ a = a * 0.0F;
2020
+ }
2021
+ a = __internal_trig_reduction_kernel(a, &i);
2022
+ z = __internal_sin_cos_kernel(a, i);
2023
+ return z;
2024
+ }
2025
+ static __device__ __forceinline__ float __float_simpl_cosf(float a)
2026
+ {
2027
+ float z;
2028
+ int i;
2029
+ if (::isinf(a)) {
2030
+ a = a * 0.0F;
2031
+ }
2032
+ a = __internal_trig_reduction_kernel(a, &i);
2033
+ i++;
2034
+ z = __internal_sin_cos_kernel(a, i);
2035
+ return z;
2036
+ }
2037
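Note (not part of the header): __internal_trig_reduction_kernel is a two-constant Cody-Waite reduction: it picks the quadrant q = rint(x * 2/pi) and subtracts q * pi/2 in two FMA steps, with pi/2 split into a high part and a small tail so the cancellation stays accurate over the range fp16 inputs can reach. A host-side sketch of the same arithmetic, illustrative only (the device code uses __float2int_rn and __fmaf_rn):

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const float x = 3.0f;
        const int   q = (int)std::lrintf(x * 0.636619772f);        // round(x * 2/pi)
        float t = std::fmaf(-(float)q, 1.5707962512969971e+0f, x); // subtract the high part of q*pi/2
        t       = std::fmaf(-(float)q, 7.5497894158615964e-8f, t); // subtract the tail
        std::printf("quadrant=%d  reduced=%.9g\n", q, t);          // q=2, t is roughly 3 - pi
        return 0;
    }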
+
2038
+ __CUDA_FP16_DECL__ __half hexp(const __half a) {
2039
+ __half val;
2040
+ asm("{.reg.b32 f, C; \n"
2041
+ " .reg.b16 h,r; \n"
2042
+ " mov.b16 h,%1; \n"
2043
+ " cvt.f32.f16 f,h; \n"
2044
+ " mov.b32 C, 0x3fb8aa3bU; \n"
2045
+ " mul.f32 f,f,C; \n"
2046
+ " ex2.approx.f32 f,f; \n"
2047
+ " cvt.rn.f16.f32 r,f; \n"
2048
+ __SPEC_CASE(h, r, 0X1F79U, 0x9400U)
2049
+ __SPEC_CASE(h, r, 0X25CFU, 0x9400U)
2050
+ __SPEC_CASE(h, r, 0XC13BU, 0x0400U)
2051
+ __SPEC_CASE(h, r, 0XC1EFU, 0x0200U)
2052
+ " mov.b16 %0,r; \n"
2053
+ "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
2054
+ return val;
2055
+ }
2056
+ __CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) {
2057
+ __half2 val;
2058
+ asm("{.reg.b16 hl, hu; \n"
2059
+ " .reg.b32 h,r,fl,fu, C; \n"
2060
+ " mov.b32 {hl, hu}, %1; \n"
2061
+ " mov.b32 h, %1; \n"
2062
+ " cvt.f32.f16 fl, hl; \n"
2063
+ " cvt.f32.f16 fu, hu; \n"
2064
+ " mov.b32 C, 0x3fb8aa3bU; \n"
2065
+ " mul.f32 fl,fl,C; \n"
2066
+ " mul.f32 fu,fu,C; \n"
2067
+ " ex2.approx.f32 fl, fl; \n"
2068
+ " ex2.approx.f32 fu, fu; \n"
2069
+ " cvt.rn.f16.f32 hl, fl; \n"
2070
+ " cvt.rn.f16.f32 hu, fu; \n"
2071
+ " mov.b32 r, {hl, hu}; \n"
2072
+ __SPEC_CASE2(h, r, 0X1F791F79U, 0x94009400U)
2073
+ __SPEC_CASE2(h, r, 0X25CF25CFU, 0x94009400U)
2074
+ __SPEC_CASE2(h, r, 0XC13BC13BU, 0x04000400U)
2075
+ __SPEC_CASE2(h, r, 0XC1EFC1EFU, 0x02000200U)
2076
+ " mov.b32 %0, r; \n"
2077
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
2078
+ return val;
2079
+ }
2080
+ __CUDA_FP16_DECL__ __half hexp2(const __half a) {
2081
+ __half val;
2082
+ asm("{.reg.b32 f, ULP; \n"
2083
+ " .reg.b16 r; \n"
2084
+ " mov.b16 r,%1; \n"
2085
+ " cvt.f32.f16 f,r; \n"
2086
+ " ex2.approx.f32 f,f; \n"
2087
+ " mov.b32 ULP, 0x33800000U;\n"
2088
+ " fma.rn.f32 f,f,ULP,f; \n"
2089
+ " cvt.rn.f16.f32 r,f; \n"
2090
+ " mov.b16 %0,r; \n"
2091
+ "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
2092
+ return val;
2093
+ }
2094
+ __CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) {
2095
+ __half2 val;
2096
+ asm("{.reg.b16 hl, hu; \n"
2097
+ " .reg.b32 fl, fu, ULP; \n"
2098
+ " mov.b32 {hl, hu}, %1; \n"
2099
+ " cvt.f32.f16 fl, hl; \n"
2100
+ " cvt.f32.f16 fu, hu; \n"
2101
+ " ex2.approx.f32 fl, fl; \n"
2102
+ " ex2.approx.f32 fu, fu; \n"
2103
+ " mov.b32 ULP, 0x33800000U;\n"
2104
+ " fma.rn.f32 fl,fl,ULP,fl; \n"
2105
+ " fma.rn.f32 fu,fu,ULP,fu; \n"
2106
+ " cvt.rn.f16.f32 hl, fl; \n"
2107
+ " cvt.rn.f16.f32 hu, fu; \n"
2108
+ " mov.b32 %0, {hl, hu}; \n"
2109
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
2110
+ return val;
2111
+ }
2112
+ __CUDA_FP16_DECL__ __half hexp10(const __half a) {
2113
+ __half val;
2114
+ asm("{.reg.b16 h,r; \n"
2115
+ " .reg.b32 f, C; \n"
2116
+ " mov.b16 h, %1; \n"
2117
+ " cvt.f32.f16 f, h; \n"
2118
+ " mov.b32 C, 0x40549A78U; \n"
2119
+ " mul.f32 f,f,C; \n"
2120
+ " ex2.approx.f32 f, f; \n"
2121
+ " cvt.rn.f16.f32 r, f; \n"
2122
+ __SPEC_CASE(h, r, 0x34DEU, 0x9800U)
2123
+ __SPEC_CASE(h, r, 0x9766U, 0x9000U)
2124
+ __SPEC_CASE(h, r, 0x9972U, 0x1000U)
2125
+ __SPEC_CASE(h, r, 0xA5C4U, 0x1000U)
2126
+ __SPEC_CASE(h, r, 0xBF0AU, 0x8100U)
2127
+ " mov.b16 %0, r; \n"
2128
+ "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
2129
+ return val;
2130
+ }
2131
+ __CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) {
2132
+ __half2 val;
2133
+ asm("{.reg.b16 hl, hu; \n"
2134
+ " .reg.b32 h,r,fl,fu, C; \n"
2135
+ " mov.b32 {hl, hu}, %1; \n"
2136
+ " mov.b32 h, %1; \n"
2137
+ " cvt.f32.f16 fl, hl; \n"
2138
+ " cvt.f32.f16 fu, hu; \n"
2139
+ " mov.b32 C, 0x40549A78U; \n"
2140
+ " mul.f32 fl,fl,C; \n"
2141
+ " mul.f32 fu,fu,C; \n"
2142
+ " ex2.approx.f32 fl, fl; \n"
2143
+ " ex2.approx.f32 fu, fu; \n"
2144
+ " cvt.rn.f16.f32 hl, fl; \n"
2145
+ " cvt.rn.f16.f32 hu, fu; \n"
2146
+ " mov.b32 r, {hl, hu}; \n"
2147
+ __SPEC_CASE2(h, r, 0x34DE34DEU, 0x98009800U)
2148
+ __SPEC_CASE2(h, r, 0x97669766U, 0x90009000U)
2149
+ __SPEC_CASE2(h, r, 0x99729972U, 0x10001000U)
2150
+ __SPEC_CASE2(h, r, 0xA5C4A5C4U, 0x10001000U)
2151
+ __SPEC_CASE2(h, r, 0xBF0ABF0AU, 0x81008100U)
2152
+ " mov.b32 %0, r; \n"
2153
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
2154
+ return val;
2155
+ }
2156
+ __CUDA_FP16_DECL__ __half hlog2(const __half a) {
2157
+ __half val;
2158
+ asm("{.reg.b16 h, r; \n"
2159
+ " .reg.b32 f; \n"
2160
+ " mov.b16 h, %1; \n"
2161
+ " cvt.f32.f16 f, h; \n"
2162
+ " lg2.approx.f32 f, f; \n"
2163
+ " cvt.rn.f16.f32 r, f; \n"
2164
+ __SPEC_CASE(r, r, 0xA2E2U, 0x8080U)
2165
+ __SPEC_CASE(r, r, 0xBF46U, 0x9400U)
2166
+ " mov.b16 %0, r; \n"
2167
+ "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
2168
+ return val;
2169
+ }
2170
+ __CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) {
2171
+ __half2 val;
2172
+ asm("{.reg.b16 hl, hu; \n"
2173
+ " .reg.b32 fl, fu, r, p; \n"
2174
+ " mov.b32 {hl, hu}, %1; \n"
2175
+ " cvt.f32.f16 fl, hl; \n"
2176
+ " cvt.f32.f16 fu, hu; \n"
2177
+ " lg2.approx.f32 fl, fl; \n"
2178
+ " lg2.approx.f32 fu, fu; \n"
2179
+ " cvt.rn.f16.f32 hl, fl; \n"
2180
+ " cvt.rn.f16.f32 hu, fu; \n"
2181
+ " mov.b32 r, {hl, hu}; \n"
2182
+ __SPEC_CASE2(r, r, 0xA2E2A2E2U, 0x80808080U)
2183
+ __SPEC_CASE2(r, r, 0xBF46BF46U, 0x94009400U)
2184
+ " mov.b32 %0, r; \n"
2185
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
2186
+ return val;
2187
+ }
2188
+ __CUDA_FP16_DECL__ __half hlog(const __half a) {
2189
+ __half val;
2190
+ asm("{.reg.b32 f, C; \n"
2191
+ " .reg.b16 r,h; \n"
2192
+ " mov.b16 h,%1; \n"
2193
+ " cvt.f32.f16 f,h; \n"
2194
+ " lg2.approx.f32 f,f; \n"
2195
+ " mov.b32 C, 0x3f317218U; \n"
2196
+ " mul.f32 f,f,C; \n"
2197
+ " cvt.rn.f16.f32 r,f; \n"
2198
+ __SPEC_CASE(h, r, 0X160DU, 0x9C00U)
2199
+ __SPEC_CASE(h, r, 0X3BFEU, 0x8010U)
2200
+ __SPEC_CASE(h, r, 0X3C0BU, 0x8080U)
2201
+ __SPEC_CASE(h, r, 0X6051U, 0x1C00U)
2202
+ " mov.b16 %0,r; \n"
2203
+ "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
2204
+ return val;
2205
+ }
2206
+ __CUDA_FP16_DECL__ __half2 h2log(const __half2 a) {
2207
+ __half2 val;
2208
+ asm("{.reg.b16 hl, hu; \n"
2209
+ " .reg.b32 r, fl, fu, C, h; \n"
2210
+ " mov.b32 {hl, hu}, %1; \n"
2211
+ " mov.b32 h, %1; \n"
2212
+ " cvt.f32.f16 fl, hl; \n"
2213
+ " cvt.f32.f16 fu, hu; \n"
2214
+ " lg2.approx.f32 fl, fl; \n"
2215
+ " lg2.approx.f32 fu, fu; \n"
2216
+ " mov.b32 C, 0x3f317218U; \n"
2217
+ " mul.f32 fl,fl,C; \n"
2218
+ " mul.f32 fu,fu,C; \n"
2219
+ " cvt.rn.f16.f32 hl, fl; \n"
2220
+ " cvt.rn.f16.f32 hu, fu; \n"
2221
+ " mov.b32 r, {hl, hu}; \n"
2222
+ __SPEC_CASE2(h, r, 0X160D160DU, 0x9C009C00U)
2223
+ __SPEC_CASE2(h, r, 0X3BFE3BFEU, 0x80108010U)
2224
+ __SPEC_CASE2(h, r, 0X3C0B3C0BU, 0x80808080U)
2225
+ __SPEC_CASE2(h, r, 0X60516051U, 0x1C001C00U)
2226
+ " mov.b32 %0, r; \n"
2227
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
2228
+ return val;
2229
+ }
2230
+ __CUDA_FP16_DECL__ __half hlog10(const __half a) {
2231
+ __half val;
2232
+ asm("{.reg.b16 h, r; \n"
2233
+ " .reg.b32 f, C; \n"
2234
+ " mov.b16 h, %1; \n"
2235
+ " cvt.f32.f16 f, h; \n"
2236
+ " lg2.approx.f32 f, f; \n"
2237
+ " mov.b32 C, 0x3E9A209BU; \n"
2238
+ " mul.f32 f,f,C; \n"
2239
+ " cvt.rn.f16.f32 r, f; \n"
2240
+ __SPEC_CASE(h, r, 0x338FU, 0x1000U)
2241
+ __SPEC_CASE(h, r, 0x33F8U, 0x9000U)
2242
+ __SPEC_CASE(h, r, 0x57E1U, 0x9800U)
2243
+ __SPEC_CASE(h, r, 0x719DU, 0x9C00U)
2244
+ " mov.b16 %0, r; \n"
2245
+ "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
2246
+ return val;
2247
+ }
2248
+ __CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) {
2249
+ __half2 val;
2250
+ asm("{.reg.b16 hl, hu; \n"
2251
+ " .reg.b32 r, fl, fu, C, h; \n"
2252
+ " mov.b32 {hl, hu}, %1; \n"
2253
+ " mov.b32 h, %1; \n"
2254
+ " cvt.f32.f16 fl, hl; \n"
2255
+ " cvt.f32.f16 fu, hu; \n"
2256
+ " lg2.approx.f32 fl, fl; \n"
2257
+ " lg2.approx.f32 fu, fu; \n"
2258
+ " mov.b32 C, 0x3E9A209BU; \n"
2259
+ " mul.f32 fl,fl,C; \n"
2260
+ " mul.f32 fu,fu,C; \n"
2261
+ " cvt.rn.f16.f32 hl, fl; \n"
2262
+ " cvt.rn.f16.f32 hu, fu; \n"
2263
+ " mov.b32 r, {hl, hu}; \n"
2264
+ __SPEC_CASE2(h, r, 0x338F338FU, 0x10001000U)
2265
+ __SPEC_CASE2(h, r, 0x33F833F8U, 0x90009000U)
2266
+ __SPEC_CASE2(h, r, 0x57E157E1U, 0x98009800U)
2267
+ __SPEC_CASE2(h, r, 0x719D719DU, 0x9C009C00U)
2268
+ " mov.b32 %0, r; \n"
2269
+ "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
2270
+ return val;
2271
+ }
2272
+ #undef __SPEC_CASE2
2273
+ #undef __SPEC_CASE
2274
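Note (not part of the header): hexp/hlog and their relatives scale the argument, run the hardware ex2.approx/lg2.approx in single precision, and round back to half; the __SPEC_CASE/__SPEC_CASE2 fix-ups above patch the handful of inputs where that pipeline lands one ULP off, by matching the exact input bit pattern (set.eq yields 1.0 or 0.0) and adding a signed ULP through an FMA. A usage sketch, assuming sm_53 or newer (the kernel name is illustrative):

    #include <cuda_fp16.h>
    #include <cstdio>
    #include <cmath>

    __global__ void exp_log_demo(void)
    {
        const __half x = __float2half(0.5f);
        printf("hexp=%f (ref %f)   hlog=%f (ref %f)\n",
               __half2float(hexp(x)), expf(0.5f),
               __half2float(hlog(x)), logf(0.5f));
    }

    int main() { exp_log_demo<<<1, 1>>>(); return (int)cudaDeviceSynchronize(); }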
+ __CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a) {
2275
+ __APPROX_FCAST2(rcp)
2276
+ }
2277
+ __CUDA_FP16_DECL__ __half hrcp(const __half a) {
2278
+ __APPROX_FCAST(rcp)
2279
+ }
2280
+ __CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a) {
2281
+ __APPROX_FCAST2(rsqrt)
2282
+ }
2283
+ __CUDA_FP16_DECL__ __half hrsqrt(const __half a) {
2284
+ __APPROX_FCAST(rsqrt)
2285
+ }
2286
+ __CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a) {
2287
+ __APPROX_FCAST2(sqrt)
2288
+ }
2289
+ __CUDA_FP16_DECL__ __half hsqrt(const __half a) {
2290
+ __APPROX_FCAST(sqrt)
2291
+ }
2292
+ #undef __APPROX_FCAST
2293
+ #undef __APPROX_FCAST2
2294
+ __CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a)
2295
+ {
2296
+ __half2 r;
2297
+ asm("{set.nan.f16x2.f16x2 %0,%1,%2;\n}"
2298
+ :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(a)));
2299
+ return r;
2300
+ }
2301
+ __CUDA_FP16_DECL__ bool __hisnan(const __half a)
2302
+ {
2303
+ __half r;
2304
+ asm("{set.nan.f16.f16 %0,%1,%2;\n}"
2305
+ :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(a)));
2306
+ return __HALF_TO_CUS(r) != 0U;
2307
+ }
2308
+ __CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a)
2309
+ {
2310
+ __half2 r;
2311
+ asm("{neg.f16x2 %0,%1;\n}"
2312
+ :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
2313
+ return r;
2314
+ }
2315
+ __CUDA_FP16_DECL__ __half __hneg(const __half a)
2316
+ {
2317
+ __half r;
2318
+ asm("{neg.f16 %0,%1;\n}"
2319
+ :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
2320
+ return r;
2321
+ }
2322
+ __CUDA_FP16_DECL__ __half2 __habs2(const __half2 a)
2323
+ {
2324
+ __half2 r;
2325
+ asm("{abs.f16x2 %0,%1;\n}"
2326
+ :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
2327
+ return r;
2328
+ }
2329
+ __CUDA_FP16_DECL__ __half __habs(const __half a)
2330
+ {
2331
+ __half r;
2332
+ asm("{abs.f16 %0,%1;\n}"
2333
+ :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
2334
+ return r;
2335
+ }
2336
+
2337
+ __CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c)
2338
+ {
2339
+ // fast version of complex multiply-accumulate
2340
+ // (a.re, a.im) * (b.re, b.im) + (c.re, c.im)
2341
+ // acc.re = (c.re + a.re*b.re) - a.im*b.im
2342
+ // acc.im = (c.im + a.re*b.im) + a.im*b.re
2343
+ const __half2 a_re = __half2half2(a.x);
2344
+ __half2 acc = __hfma2(a_re, b, c);
2345
+ const __half2 a_im = __half2half2(a.y);
2346
+ const __half2 ib = __halves2half2(__hneg(b.y), b.x);
2347
+ acc = __hfma2(a_im, ib, acc);
2348
+ return acc;
2349
+ }
2350
+ #endif /*__CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/
2351
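Usage note (not part of the header): __hcmadd treats each __half2 as one complex number, low lane = real part, high lane = imaginary part, and computes a*b + c with two packed FMAs. A small sketch, assuming sm_53 or newer (the kernel name is illustrative):

    #include <cuda_fp16.h>
    #include <cstdio>

    __global__ void cmadd_demo(void)
    {
        const __half2 a = __halves2half2(__float2half(1.0f), __float2half(2.0f)); // 1 + 2i
        const __half2 b = __halves2half2(__float2half(3.0f), __float2half(4.0f)); // 3 + 4i
        const __half2 c = __halves2half2(__float2half(0.0f), __float2half(0.0f));
        const __half2 r = __hcmadd(a, b, c);   // (1+2i)*(3+4i) = -5 + 10i
        printf("re=%f im=%f\n", __half2float(__low2half(r)), __half2float(__high2half(r)));
    }

    int main() { cmadd_demo<<<1, 1>>>(); return (int)cudaDeviceSynchronize(); }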
+
2352
+ #if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)
2353
+ /******************************************************************************
2354
+ * __half arithmetic *
2355
+ ******************************************************************************/
2356
+ __CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b)
2357
+ {
2358
+ __BINARY_OP_HALF_MACRO(max)
2359
+ }
2360
+ __CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b)
2361
+ {
2362
+ __BINARY_OP_HALF_MACRO(min)
2363
+ }
2364
+ __CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b)
2365
+ {
2366
+ __BINARY_OP_HALF_MACRO(max.NaN)
2367
+ }
2368
+ __CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b)
2369
+ {
2370
+ __BINARY_OP_HALF_MACRO(min.NaN)
2371
+ }
2372
+ __CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c)
2373
+ {
2374
+ __TERNARY_OP_HALF_MACRO(fma.rn.relu)
2375
+ }
2376
+ /******************************************************************************
2377
+ * __half2 arithmetic *
2378
+ ******************************************************************************/
2379
+ __CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b)
2380
+ {
2381
+ __BINARY_OP_HALF2_MACRO(max)
2382
+ }
2383
+ __CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b)
2384
+ {
2385
+ __BINARY_OP_HALF2_MACRO(min)
2386
+ }
2387
+ __CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b)
2388
+ {
2389
+ __BINARY_OP_HALF2_MACRO(max.NaN)
2390
+ }
2391
+ __CUDA_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b)
2392
+ {
2393
+ __BINARY_OP_HALF2_MACRO(min.NaN)
2394
+ }
2395
+ __CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c)
2396
+ {
2397
+ __TERNARY_OP_HALF2_MACRO(fma.rn.relu)
2398
+ }
2399
+ #endif /*__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)*/
2400
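Usage note (not part of the header): __hmax/__hmin follow the IEEE maxNum/minNum convention (a NaN operand loses to a number), while the _nan variants propagate NaN, which is often preferable when bad values should poison the result; __hfma_relu clamps the FMA result at zero from below. A small sketch, assuming an sm_80+ device (the kernel name is illustrative):

    #include <cuda_fp16.h>
    #include <cstdio>

    __global__ void minmax_demo(void)
    {
        const __half qnan = __ushort_as_half(0x7e00U);   // an fp16 quiet NaN
        const __half two  = __float2half(2.0f);
        printf("__hmax=%f  __hmax_nan=%f\n",
               __half2float(__hmax(qnan, two)),          // 2.0: the NaN is dropped
               __half2float(__hmax_nan(qnan, two)));     // nan: the NaN propagates
    }

    int main() { minmax_demo<<<1, 1>>>(); return (int)cudaDeviceSynchronize(); }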
+
2401
+ /* Define __PTR for atomicAdd prototypes below, undef after done */
2402
+ #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
2403
+ #define __PTR "l"
2404
+ #else
2405
+ #define __PTR "r"
2406
+ #endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
2407
+
2408
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
2409
+
2410
+ __CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val) {
2411
+ __half2 r;
2412
+ asm volatile ("{ atom.add.noftz.f16x2 %0,[%1],%2; }\n"
2413
+ : "=r"(__HALF2_TO_UI(r)) : __PTR(address), "r"(__HALF2_TO_CUI(val))
2414
+ : "memory");
2415
+ return r;
2416
+ }
2417
+
2418
+ #endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600*/
2419
+
2420
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
2421
+
2422
+ __CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val) {
2423
+ __half r;
2424
+ asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n"
2425
+ : "=h"(__HALF_TO_US(r))
2426
+ : __PTR(address), "h"(__HALF_TO_CUS(val))
2427
+ : "memory");
2428
+ return r;
2429
+ }
2430
+
2431
+ #endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700*/
2432
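Usage note (not part of the header): the two atomicAdd overloads above map to atom.add.noftz.f16x2 (guarded for sm_60+) and atom.add.noftz.f16 (guarded for sm_70+); the __half2 form updates both lanes independently, so it is usually the better choice when the data is already paired. A small end-to-end sketch for the scalar form, assuming nvcc with -arch=sm_70 or newer (names are illustrative):

    #include <cuda_fp16.h>
    #include <cstdio>

    __global__ void accumulate(__half *sum)
    {
        atomicAdd(sum, __float2half(1.0f));     // 256 threads each add 1.0
    }

    int main()
    {
        __half *d_sum = nullptr;
        cudaMalloc(&d_sum, sizeof(__half));
        cudaMemset(d_sum, 0, sizeof(__half));   // 0x0000 is +0.0 in fp16
        accumulate<<<4, 64>>>(d_sum);
        __half h_sum;
        cudaMemcpy(&h_sum, d_sum, sizeof(__half), cudaMemcpyDeviceToHost);
        std::printf("sum = %f\n", __half2float(h_sum));  // 256.0, exactly representable in fp16
        cudaFree(d_sum);
        return 0;
    }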
+
2433
+ #undef __PTR
2434
+
2435
+ #undef __CUDA_FP16_DECL__
2436
+ #endif /* defined(__CUDACC__) */
2437
+ #endif /* defined(__cplusplus) */
2438
+
2439
+ #undef __TERNARY_OP_HALF2_MACRO
2440
+ #undef __TERNARY_OP_HALF_MACRO
2441
+ #undef __BINARY_OP_HALF2_MACRO
2442
+ #undef __BINARY_OP_HALF_MACRO
2443
+
2444
+ #undef __CUDA_HOSTDEVICE_FP16_DECL__
2445
+ #undef __CUDA_FP16_DECL__
2446
+
2447
+ /* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */
2448
+ /* C cannot ever have these types defined here, because __half and __half2 are C++ classes */
2449
+ #if defined(__cplusplus) && !defined(CUDA_NO_HALF)
2450
+ typedef __half half;
2451
+ typedef __half2 half2;
2452
+ // for consistency with __nv_bfloat16
2453
+ typedef __half __nv_half;
2454
+ typedef __half2 __nv_half2;
2455
+ typedef __half_raw __nv_half_raw;
2456
+ typedef __half2_raw __nv_half2_raw;
2457
+ typedef __half nv_half;
2458
+ typedef __half2 nv_half2;
2459
+ #endif /* defined(__cplusplus) && !defined(CUDA_NO_HALF) */
2460
+
2461
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP16)
2462
+ #undef __CPP_VERSION_AT_LEAST_11_FP16
2463
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
2464
+
2465
+ #endif /* end of include guard: __CUDA_FP16_HPP__ */