numba-cuda 0.0.1__py3-none-any.whl → 0.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.13.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.13.dist-info/METADATA +69 -0
  229. numba_cuda-0.0.13.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.13.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.1.dist-info/METADATA +0 -10
  232. numba_cuda-0.0.1.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.13.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,3631 @@
1
+ /*
2
+ * Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /**
51
+ * \defgroup CUDA_MATH_INTRINSIC_HALF Half Precision Intrinsics
52
+ * This section describes half precision intrinsic functions that are
53
+ * only supported in device code.
54
+ * To use these functions, include the header file \p cuda_fp16.h in your program.
55
+ */
56
+
57
+ /**
58
+ * \defgroup CUDA_MATH__HALF_ARITHMETIC Half Arithmetic Functions
59
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
60
+ * To use these functions, include the header file \p cuda_fp16.h in your program.
61
+ */
62
+
63
+ /**
64
+ * \defgroup CUDA_MATH__HALF2_ARITHMETIC Half2 Arithmetic Functions
65
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
66
+ * To use these functions, include the header file \p cuda_fp16.h in your program.
67
+ */
68
+
69
+ /**
70
+ * \defgroup CUDA_MATH__HALF_COMPARISON Half Comparison Functions
71
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
72
+ * To use these functions, include the header file \p cuda_fp16.h in your program.
73
+ */
74
+
75
+ /**
76
+ * \defgroup CUDA_MATH__HALF2_COMPARISON Half2 Comparison Functions
77
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
78
+ * To use these functions, include the header file \p cuda_fp16.h in your program.
79
+ */
80
+
81
+ /**
82
+ * \defgroup CUDA_MATH__HALF_MISC Half Precision Conversion and Data Movement
83
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
84
+ * To use these functions, include the header file \p cuda_fp16.h in your program.
85
+ */
86
+
87
+ /**
88
+ * \defgroup CUDA_MATH__HALF_FUNCTIONS Half Math Functions
89
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
90
+ * To use these functions, include the header file \p cuda_fp16.h in your program.
91
+ */
92
+
93
+ /**
94
+ * \defgroup CUDA_MATH__HALF2_FUNCTIONS Half2 Math Functions
95
+ * \ingroup CUDA_MATH_INTRINSIC_HALF
96
+ * To use these functions, include the header file \p cuda_fp16.h in your program.
97
+ */
98
+
99
+ #ifndef __CUDA_FP16_H__
100
+ #define __CUDA_FP16_H__
101
+
102
+ #if defined(__cplusplus)
103
+ #if defined(__CUDACC__)
104
+ #define __CUDA_FP16_DECL__ static __device__ __inline__
105
+ #define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__
106
+ #else
107
+ #define __CUDA_HOSTDEVICE_FP16_DECL__ static
108
+ #endif /* defined(__CUDACC__) */
109
+
110
+ #define __CUDA_FP16_TYPES_EXIST__
111
+
112
+ /* Forward-declaration of structures defined in "cuda_fp16.hpp" */
113
+
114
+ /**
115
+ * \brief half datatype
116
+ *
117
+ * \details This structure implements the datatype for storing
118
+ * half-precision floating-point numbers. The structure implements
119
+ * assignment operators and type conversions.
120
+ * 16 bits are being used in total: 1 sign bit, 5 bits for the exponent,
121
+ * and the significand is being stored in 10 bits.
122
+ * The total precision is 11 bits. There are 15361 representable
123
+ * numbers within the interval [0.0, 1.0], endpoints included.
124
+ * On average we have log10(2**11) ~ 3.311 decimal digits.
125
+ *
126
+ * \internal
127
+ * \req IEEE 754-2008 compliant implementation of half-precision
128
+ * floating-point numbers.
129
+ * \endinternal
130
+ */
131
+ struct __half;
132
+
133
+ /**
134
+ * \brief half2 datatype
135
+ *
136
+ * \details This structure implements the datatype for storing two
137
+ * half-precision floating-point numbers.
138
+ * The structure implements assignment operators and type conversions.
139
+ *
140
+ * \internal
141
+ * \req Vectorized version of half.
142
+ * \endinternal
143
+ */
144
+ struct __half2;
145
+
146
+ /**
147
+ * \ingroup CUDA_MATH__HALF_MISC
148
+ * \brief Converts double number to half precision in round-to-nearest-even mode
149
+ * and returns \p half with converted value.
150
+ *
151
+ * \details Converts double number \p a to half precision in round-to-nearest-even mode.
152
+ * \param[in] a - double. Is only being read.
153
+ * \returns half
154
+ * \retval a converted to half.
155
+ * \internal
156
+ * \exception-guarantee no-throw guarantee
157
+ * \behavior reentrant, thread safe
158
+ * \endinternal
159
+ */
160
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a);
161
+ /**
162
+ * \ingroup CUDA_MATH__HALF_MISC
163
+ * \brief Converts float number to half precision in round-to-nearest-even mode
164
+ * and returns \p half with converted value.
165
+ *
166
+ * \details Converts float number \p a to half precision in round-to-nearest-even mode.
167
+ * \param[in] a - float. Is only being read.
168
+ * \returns half
169
+ * \retval a converted to half.
170
+ * \internal
171
+ * \exception-guarantee no-throw guarantee
172
+ * \behavior reentrant, thread safe
173
+ * \endinternal
174
+ */
175
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a);
176
+ /**
177
+ * \ingroup CUDA_MATH__HALF_MISC
178
+ * \brief Converts float number to half precision in round-to-nearest-even mode
179
+ * and returns \p half with converted value.
180
+ *
181
+ * \details Converts float number \p a to half precision in round-to-nearest-even mode.
182
+ * \param[in] a - float. Is only being read.
183
+ * \returns half
184
+ * \retval a converted to half.
185
+ * \internal
186
+ * \exception-guarantee no-throw guarantee
187
+ * \behavior reentrant, thread safe
188
+ * \endinternal
189
+ */
190
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a);
191
+ /**
192
+ * \ingroup CUDA_MATH__HALF_MISC
193
+ * \brief Converts float number to half precision in round-towards-zero mode
194
+ * and returns \p half with converted value.
195
+ *
196
+ * \details Converts float number \p a to half precision in round-towards-zero mode.
197
+ * \param[in] a - float. Is only being read.
198
+ * \returns half
199
+ * \retval a converted to half.
200
+ * \internal
201
+ * \exception-guarantee no-throw guarantee
202
+ * \behavior reentrant, thread safe
203
+ * \endinternal
204
+ */
205
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a);
206
+ /**
207
+ * \ingroup CUDA_MATH__HALF_MISC
208
+ * \brief Converts float number to half precision in round-down mode
209
+ * and returns \p half with converted value.
210
+ *
211
+ * \details Converts float number \p a to half precision in round-down mode.
212
+ * \param[in] a - float. Is only being read.
213
+ *
214
+ * \returns half
215
+ * \retval a converted to half.
216
+ * \internal
217
+ * \exception-guarantee no-throw guarantee
218
+ * \behavior reentrant, thread safe
219
+ * \endinternal
220
+ */
221
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a);
222
+ /**
223
+ * \ingroup CUDA_MATH__HALF_MISC
224
+ * \brief Converts float number to half precision in round-up mode
225
+ * and returns \p half with converted value.
226
+ *
227
+ * \details Converts float number \p a to half precision in round-up mode.
228
+ * \param[in] a - float. Is only being read.
229
+ *
230
+ * \returns half
231
+ * \retval a converted to half.
232
+ * \internal
233
+ * \exception-guarantee no-throw guarantee
234
+ * \behavior reentrant, thread safe
235
+ * \endinternal
236
+ */
237
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a);
238
+ /**
239
+ * \ingroup CUDA_MATH__HALF_MISC
240
+ * \brief Converts \p half number to float.
241
+ *
242
+ * \details Converts half number \p a to float.
243
+ * \param[in] a - half. Is only being read.
244
+ *
245
+ * \returns float
246
+ * \retval a converted to float.
247
+ * \internal
248
+ * \exception-guarantee no-throw guarantee
249
+ * \behavior reentrant, thread safe
250
+ * \endinternal
251
+ */
252
+ __CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a);
253
+ /**
254
+ * \ingroup CUDA_MATH__HALF_MISC
255
+ * \brief Converts input to half precision in round-to-nearest-even mode and
256
+ * populates both halves of \p half2 with converted value.
257
+ *
258
+ * \details Converts input \p a to half precision in round-to-nearest-even mode and
259
+ * populates both halves of \p half2 with converted value.
260
+ * \param[in] a - float. Is only being read.
261
+ *
262
+ * \returns half2
263
+ * \retval The \p half2 value with both halves equal to the converted half
264
+ * precision number.
265
+ * \internal
266
+ * \exception-guarantee no-throw guarantee
267
+ * \behavior reentrant, thread safe
268
+ * \endinternal
269
+ */
270
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a);
271
+ /**
272
+ * \ingroup CUDA_MATH__HALF_MISC
273
+ * \brief Converts both input floats to half precision in round-to-nearest-even
274
+ * mode and returns \p half2 with converted values.
275
+ *
276
+ * \details Converts both input floats to half precision in round-to-nearest-even mode
277
+ * and combines the results into one \p half2 number. Low 16 bits of the return
278
+ * value correspond to the input \p a, high 16 bits correspond to the input \p
279
+ * b.
280
+ * \param[in] a - float. Is only being read.
281
+ * \param[in] b - float. Is only being read.
282
+ *
283
+ * \returns half2
284
+ * \retval The \p half2 value with corresponding halves equal to the
285
+ * converted input floats.
286
+ * \internal
287
+ * \exception-guarantee no-throw guarantee
288
+ * \behavior reentrant, thread safe
289
+ * \endinternal
290
+ */
291
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b);
292
+ /**
293
+ * \ingroup CUDA_MATH__HALF_MISC
294
+ * \brief Converts low 16 bits of \p half2 to float and returns the result
295
+ *
296
+ * \details Converts low 16 bits of \p half2 input \p a to 32-bit floating-point number
297
+ * and returns the result.
298
+ * \param[in] a - half2. Is only being read.
299
+ *
300
+ * \returns float
301
+ * \retval The low 16 bits of \p a converted to float.
302
+ * \internal
303
+ * \exception-guarantee no-throw guarantee
304
+ * \behavior reentrant, thread safe
305
+ * \endinternal
306
+ */
307
+ __CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a);
308
+ /**
309
+ * \ingroup CUDA_MATH__HALF_MISC
310
+ * \brief Converts high 16 bits of \p half2 to float and returns the result
311
+ *
312
+ * \details Converts high 16 bits of \p half2 input \p a to 32-bit floating-point number
313
+ * and returns the result.
314
+ * \param[in] a - half2. Is only being read.
315
+ *
316
+ * \returns float
317
+ * \retval The high 16 bits of \p a converted to float.
318
+ * \internal
319
+ * \exception-guarantee no-throw guarantee
320
+ * \behavior reentrant, thread safe
321
+ * \endinternal
322
+ */
323
+ __CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a);
324
+
325
+ #if defined(__CUDACC__)
326
+ /**
327
+ * \ingroup CUDA_MATH__HALF_MISC
328
+ * \brief Converts both components of float2 number to half precision in
329
+ * round-to-nearest-even mode and returns \p half2 with converted values.
330
+ *
331
+ * \details Converts both components of float2 to half precision in round-to-nearest-even
332
+ * mode and combines the results into one \p half2 number. Low 16 bits of the
333
+ * return value correspond to \p a.x and high 16 bits of the return value
334
+ * correspond to \p a.y.
335
+ * \param[in] a - float2. Is only being read.
336
+ *
337
+ * \returns half2
338
+ * \retval The \p half2 which has corresponding halves equal to the
339
+ * converted float2 components.
340
+ * \internal
341
+ * \exception-guarantee no-throw guarantee
342
+ * \behavior reentrant, thread safe
343
+ * \endinternal
344
+ */
345
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a);
346
+ /**
347
+ * \ingroup CUDA_MATH__HALF_MISC
348
+ * \brief Converts both halves of \p half2 to float2 and returns the result.
349
+ *
350
+ * \details Converts both halves of \p half2 input \p a to float2 and returns the
351
+ * result.
352
+ * \param[in] a - half2. Is only being read.
353
+ *
354
+ * \returns float2
355
+ * \retval a converted to float2.
356
+ * \internal
357
+ * \exception-guarantee no-throw guarantee
358
+ * \behavior reentrant, thread safe
359
+ * \endinternal
360
+ */
361
+ __CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a);
362
+ /**
363
+ * \ingroup CUDA_MATH__HALF_MISC
364
+ * \brief Convert a half to a signed integer in round-to-nearest-even mode.
365
+ *
366
+ * \details Convert the half-precision floating-point value \p h to a signed integer in
367
+ * round-to-nearest-even mode.
368
+ * \param[in] h - half. Is only being read.
369
+ *
370
+ * \returns int
371
+ * \retval h converted to a signed integer.
372
+ * \internal
373
+ * \exception-guarantee no-throw guarantee
374
+ * \behavior reentrant, thread safe
375
+ * \endinternal
376
+ */
377
+ __CUDA_FP16_DECL__ int __half2int_rn(const __half h);
378
+ /**
379
+ * \ingroup CUDA_MATH__HALF_MISC
380
+ * \brief Convert a half to a signed integer in round-towards-zero mode.
381
+ *
382
+ * \details Convert the half-precision floating-point value \p h to a signed integer in
383
+ * round-towards-zero mode.
384
+ * \param[in] h - half. Is only being read.
385
+ *
386
+ * \returns int
387
+ * \retval h converted to a signed integer.
388
+ * \internal
389
+ * \exception-guarantee no-throw guarantee
390
+ * \behavior reentrant, thread safe
391
+ * \endinternal
392
+ */
393
+ __CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h);
394
+ /**
395
+ * \ingroup CUDA_MATH__HALF_MISC
396
+ * \brief Convert a half to a signed integer in round-down mode.
397
+ *
398
+ * \details Convert the half-precision floating-point value \p h to a signed integer in
399
+ * round-down mode.
400
+ * \param[in] h - half. Is only being read.
401
+ *
402
+ * \returns int
403
+ * \retval h converted to a signed integer.
404
+ * \internal
405
+ * \exception-guarantee no-throw guarantee
406
+ * \behavior reentrant, thread safe
407
+ * \endinternal
408
+ */
409
+ __CUDA_FP16_DECL__ int __half2int_rd(const __half h);
410
+ /**
411
+ * \ingroup CUDA_MATH__HALF_MISC
412
+ * \brief Convert a half to a signed integer in round-up mode.
413
+ *
414
+ * \details Convert the half-precision floating-point value \p h to a signed integer in
415
+ * round-up mode.
416
+ * \param[in] h - half. Is only being read.
417
+ *
418
+ * \returns int
419
+ * \retval h converted to a signed integer.
420
+ * \internal
421
+ * \exception-guarantee no-throw guarantee
422
+ * \behavior reentrant, thread safe
423
+ * \endinternal
424
+ */
425
+ __CUDA_FP16_DECL__ int __half2int_ru(const __half h);
426
+
427
+ /**
428
+ * \ingroup CUDA_MATH__HALF_MISC
429
+ * \brief Convert a signed integer to a half in round-to-nearest-even mode.
430
+ *
431
+ * \details Convert the signed integer value \p i to a half-precision floating-point
432
+ * value in round-to-nearest-even mode.
433
+ * \param[in] i - int. Is only being read.
434
+ *
435
+ * \returns half
436
+ * \retval i converted to half.
437
+ * \internal
438
+ * \exception-guarantee no-throw guarantee
439
+ * \behavior reentrant, thread safe
440
+ * \endinternal
441
+ */
442
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i);
443
+ /**
444
+ * \ingroup CUDA_MATH__HALF_MISC
445
+ * \brief Convert a signed integer to a half in round-towards-zero mode.
446
+ *
447
+ * \details Convert the signed integer value \p i to a half-precision floating-point
448
+ * value in round-towards-zero mode.
449
+ * \param[in] i - int. Is only being read.
450
+ *
451
+ * \returns half
452
+ * \retval i converted to half.
453
+ * \internal
454
+ * \exception-guarantee no-throw guarantee
455
+ * \behavior reentrant, thread safe
456
+ * \endinternal
457
+ */
458
+ __CUDA_FP16_DECL__ __half __int2half_rz(const int i);
459
+ /**
460
+ * \ingroup CUDA_MATH__HALF_MISC
461
+ * \brief Convert a signed integer to a half in round-down mode.
462
+ *
463
+ * \details Convert the signed integer value \p i to a half-precision floating-point
464
+ * value in round-down mode.
465
+ * \param[in] i - int. Is only being read.
466
+ *
467
+ * \returns half
468
+ * \retval i converted to half.
469
+ * \internal
470
+ * \exception-guarantee no-throw guarantee
471
+ * \behavior reentrant, thread safe
472
+ * \endinternal
473
+ */
474
+ __CUDA_FP16_DECL__ __half __int2half_rd(const int i);
475
+ /**
476
+ * \ingroup CUDA_MATH__HALF_MISC
477
+ * \brief Convert a signed integer to a half in round-up mode.
478
+ *
479
+ * \details Convert the signed integer value \p i to a half-precision floating-point
480
+ * value in round-up mode.
481
+ * \param[in] i - int. Is only being read.
482
+ *
483
+ * \returns half
484
+ * \retval i converted to half.
485
+ * \internal
486
+ * \exception-guarantee no-throw guarantee
487
+ * \behavior reentrant, thread safe
488
+ * \endinternal
489
+ */
490
+ __CUDA_FP16_DECL__ __half __int2half_ru(const int i);
491
+
492
+ /**
493
+ * \ingroup CUDA_MATH__HALF_MISC
494
+ * \brief Convert a half to a signed short integer in round-to-nearest-even
495
+ * mode.
496
+ *
497
+ * \details Convert the half-precision floating-point value \p h to a signed short
498
+ * integer in round-to-nearest-even mode.
499
+ * \param[in] h - half. Is only being read.
500
+ *
501
+ * \returns short int
502
+ * \retval h converted to a signed short integer.
503
+ * \internal
504
+ * \exception-guarantee no-throw guarantee
505
+ * \behavior reentrant, thread safe
506
+ * \endinternal
507
+ */
508
+ __CUDA_FP16_DECL__ short int __half2short_rn(const __half h);
509
+ /**
510
+ * \ingroup CUDA_MATH__HALF_MISC
511
+ * \brief Convert a half to a signed short integer in round-towards-zero mode.
512
+ *
513
+ * \details Convert the half-precision floating-point value \p h to a signed short
514
+ * integer in round-towards-zero mode.
515
+ * \param[in] h - half. Is only being read.
516
+ *
517
+ * \returns short int
518
+ * \retval h converted to a signed short integer.
519
+ * \internal
520
+ * \exception-guarantee no-throw guarantee
521
+ * \behavior reentrant, thread safe
522
+ * \endinternal
523
+ */
524
+ __CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h);
525
+ /**
526
+ * \ingroup CUDA_MATH__HALF_MISC
527
+ * \brief Convert a half to a signed short integer in round-down mode.
528
+ *
529
+ * \details Convert the half-precision floating-point value \p h to a signed short
530
+ * integer in round-down mode.
531
+ * \param[in] h - half. Is only being read.
532
+ *
533
+ * \returns short int
534
+ * \retval h converted to a signed short integer.
535
+ * \internal
536
+ * \exception-guarantee no-throw guarantee
537
+ * \behavior reentrant, thread safe
538
+ * \endinternal
539
+ */
540
+ __CUDA_FP16_DECL__ short int __half2short_rd(const __half h);
541
+ /**
542
+ * \ingroup CUDA_MATH__HALF_MISC
543
+ * \brief Convert a half to a signed short integer in round-up mode.
544
+ *
545
+ * \details Convert the half-precision floating-point value \p h to a signed short
546
+ * integer in round-up mode.
547
+ * \param[in] h - half. Is only being read.
548
+ *
549
+ * \returns short int
550
+ * \retval h converted to a signed short integer.
551
+ * \internal
552
+ * \exception-guarantee no-throw guarantee
553
+ * \behavior reentrant, thread safe
554
+ * \endinternal
555
+ */
556
+ __CUDA_FP16_DECL__ short int __half2short_ru(const __half h);
557
+
558
+ /**
559
+ * \ingroup CUDA_MATH__HALF_MISC
560
+ * \brief Convert a signed short integer to a half in round-to-nearest-even
561
+ * mode.
562
+ *
563
+ * \details Convert the signed short integer value \p i to a half-precision floating-point
564
+ * value in round-to-nearest-even mode.
565
+ * \param[in] i - short int. Is only being read.
566
+ *
567
+ * \returns half
568
+ * \retval i converted to half.
569
+ * \internal
570
+ * \exception-guarantee no-throw guarantee
571
+ * \behavior reentrant, thread safe
572
+ * \endinternal
573
+ */
574
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i);
575
+ /**
576
+ * \ingroup CUDA_MATH__HALF_MISC
577
+ * \brief Convert a signed short integer to a half in round-towards-zero mode.
578
+ *
579
+ * \details Convert the signed short integer value \p i to a half-precision floating-point
580
+ * value in round-towards-zero mode.
581
+ * \param[in] i - short int. Is only being read.
582
+ *
583
+ * \returns half
584
+ * \retval i converted to half.
585
+ * \internal
586
+ * \exception-guarantee no-throw guarantee
587
+ * \behavior reentrant, thread safe
588
+ * \endinternal
589
+ */
590
+ __CUDA_FP16_DECL__ __half __short2half_rz(const short int i);
591
+ /**
592
+ * \ingroup CUDA_MATH__HALF_MISC
593
+ * \brief Convert a signed short integer to a half in round-down mode.
594
+ *
595
+ * \details Convert the signed short integer value \p i to a half-precision floating-point
596
+ * value in round-down mode.
597
+ * \param[in] i - short int. Is only being read.
598
+ *
599
+ * \returns half
600
+ * \retval i converted to half.
601
+ * \internal
602
+ * \exception-guarantee no-throw guarantee
603
+ * \behavior reentrant, thread safe
604
+ * \endinternal
605
+ */
606
+ __CUDA_FP16_DECL__ __half __short2half_rd(const short int i);
607
+ /**
608
+ * \ingroup CUDA_MATH__HALF_MISC
609
+ * \brief Convert a signed short integer to a half in round-up mode.
610
+ *
611
+ * \details Convert the signed short integer value \p i to a half-precision floating-point
612
+ * value in round-up mode.
613
+ * \param[in] i - short int. Is only being read.
614
+ *
615
+ * \returns half
616
+ * \retval i converted to half.
617
+ * \internal
618
+ * \exception-guarantee no-throw guarantee
619
+ * \behavior reentrant, thread safe
620
+ * \endinternal
621
+ */
622
+ __CUDA_FP16_DECL__ __half __short2half_ru(const short int i);
623
+
624
+ /**
625
+ * \ingroup CUDA_MATH__HALF_MISC
626
+ * \brief Convert a half to an unsigned integer in round-to-nearest-even mode.
627
+ *
628
+ * \details Convert the half-precision floating-point value \p h to an unsigned integer
629
+ * in round-to-nearest-even mode.
630
+ * \param[in] h - half. Is only being read.
631
+ *
632
+ * \returns unsigned int
633
+ * \retval h converted to an unsigned integer.
634
+ * \internal
635
+ * \exception-guarantee no-throw guarantee
636
+ * \behavior reentrant, thread safe
637
+ * \endinternal
638
+ */
639
+ __CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h);
640
+ /**
641
+ * \ingroup CUDA_MATH__HALF_MISC
642
+ * \brief Convert a half to an unsigned integer in round-towards-zero mode.
643
+ *
644
+ * \details Convert the half-precision floating-point value \p h to an unsigned integer
645
+ * in round-towards-zero mode.
646
+ * \param[in] h - half. Is only being read.
647
+ *
648
+ * \returns unsigned int
649
+ * \retval h converted to an unsigned integer.
650
+ * \internal
651
+ * \exception-guarantee no-throw guarantee
652
+ * \behavior reentrant, thread safe
653
+ * \endinternal
654
+ */
655
+ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h);
656
+ /**
657
+ * \ingroup CUDA_MATH__HALF_MISC
658
+ * \brief Convert a half to an unsigned integer in round-down mode.
659
+ *
660
+ * \details Convert the half-precision floating-point value \p h to an unsigned integer
661
+ * in round-down mode.
662
+ * \param[in] h - half. Is only being read.
663
+ *
664
+ * \returns unsigned int
665
+ * \retval h converted to an unsigned integer.
666
+ * \internal
667
+ * \exception-guarantee no-throw guarantee
668
+ * \behavior reentrant, thread safe
669
+ * \endinternal
670
+ */
671
+ __CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h);
672
+ /**
673
+ * \ingroup CUDA_MATH__HALF_MISC
674
+ * \brief Convert a half to an unsigned integer in round-up mode.
675
+ *
676
+ * \details Convert the half-precision floating-point value \p h to an unsigned integer
677
+ * in round-up mode.
678
+ * \param[in] h - half. Is only being read.
679
+ *
680
+ * \returns unsigned int
681
+ * \retval h converted to an unsigned integer.
682
+ * \internal
683
+ * \exception-guarantee no-throw guarantee
684
+ * \behavior reentrant, thread safe
685
+ * \endinternal
686
+ */
687
+ __CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h);
688
+
689
+ /**
690
+ * \ingroup CUDA_MATH__HALF_MISC
691
+ * \brief Convert an unsigned integer to a half in round-to-nearest-even mode.
692
+ *
693
+ * \details Convert the unsigned integer value \p i to a half-precision floating-point
694
+ * value in round-to-nearest-even mode.
695
+ * \param[in] i - unsigned int. Is only being read.
696
+ *
697
+ * \returns half
698
+ * \retval i converted to half.
699
+ * \internal
700
+ * \exception-guarantee no-throw guarantee
701
+ * \behavior reentrant, thread safe
702
+ * \endinternal
703
+ */
704
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i);
705
+ /**
706
+ * \ingroup CUDA_MATH__HALF_MISC
707
+ * \brief Convert an unsigned integer to a half in round-towards-zero mode.
708
+ *
709
+ * \details Convert the unsigned integer value \p i to a half-precision floating-point
710
+ * value in round-towards-zero mode.
711
+ * \param[in] i - unsigned int. Is only being read.
712
+ *
713
+ * \returns half
714
+ * \retval i converted to half.
715
+ * \internal
716
+ * \exception-guarantee no-throw guarantee
717
+ * \behavior reentrant, thread safe
718
+ * \endinternal
719
+ */
720
+ __CUDA_FP16_DECL__ __half __uint2half_rz(const unsigned int i);
721
+ /**
722
+ * \ingroup CUDA_MATH__HALF_MISC
723
+ * \brief Convert an unsigned integer to a half in round-down mode.
724
+ *
725
+ * \details Convert the unsigned integer value \p i to a half-precision floating-point
726
+ * value in round-down mode.
727
+ * \param[in] i - unsigned int. Is only being read.
728
+ *
729
+ * \returns half
730
+ * \retval i converted to half.
731
+ * \internal
732
+ * \exception-guarantee no-throw guarantee
733
+ * \behavior reentrant, thread safe
734
+ * \endinternal
735
+ */
736
+ __CUDA_FP16_DECL__ __half __uint2half_rd(const unsigned int i);
737
+ /**
738
+ * \ingroup CUDA_MATH__HALF_MISC
739
+ * \brief Convert an unsigned integer to a half in round-up mode.
740
+ *
741
+ * \details Convert the unsigned integer value \p i to a half-precision floating-point
742
+ * value in round-up mode.
743
+ * \param[in] i - unsigned int. Is only being read.
744
+ *
745
+ * \returns half
746
+ * \retval i converted to half.
747
+ * \internal
748
+ * \exception-guarantee no-throw guarantee
749
+ * \behavior reentrant, thread safe
750
+ * \endinternal
751
+ */
752
+ __CUDA_FP16_DECL__ __half __uint2half_ru(const unsigned int i);
753
+
754
+ /**
755
+ * \ingroup CUDA_MATH__HALF_MISC
756
+ * \brief Convert a half to an unsigned short integer in round-to-nearest-even
757
+ * mode.
758
+ *
759
+ * \details Convert the half-precision floating-point value \p h to an unsigned short
760
+ * integer in round-to-nearest-even mode.
761
+ * \param[in] h - half. Is only being read.
762
+ *
763
+ * \returns unsigned short int
764
+ * \retval h converted to an unsigned short integer.
765
+ * \internal
766
+ * \exception-guarantee no-throw guarantee
767
+ * \behavior reentrant, thread safe
768
+ * \endinternal
769
+ */
770
+ __CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h);
771
+ /**
772
+ * \ingroup CUDA_MATH__HALF_MISC
773
+ * \brief Convert a half to an unsigned short integer in round-towards-zero
774
+ * mode.
775
+ *
776
+ * \details Convert the half-precision floating-point value \p h to an unsigned short
777
+ * integer in round-towards-zero mode.
778
+ * \param[in] h - half. Is only being read.
779
+ *
780
+ * \returns unsigned short int
781
+ * \retval h converted to an unsigned short integer.
782
+ * \internal
783
+ * \exception-guarantee no-throw guarantee
784
+ * \behavior reentrant, thread safe
785
+ * \endinternal
786
+ */
787
+ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h);
788
+ /**
789
+ * \ingroup CUDA_MATH__HALF_MISC
790
+ * \brief Convert a half to an unsigned short integer in round-down mode.
791
+ *
792
+ * \details Convert the half-precision floating-point value \p h to an unsigned short
793
+ * integer in round-down mode.
794
+ * \param[in] h - half. Is only being read.
795
+ *
796
+ * \returns unsigned short int
797
+ * \retval h converted to an unsigned short integer.
798
+ */
799
+ __CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h);
800
+ /**
801
+ * \ingroup CUDA_MATH__HALF_MISC
802
+ * \brief Convert a half to an unsigned short integer in round-up mode.
803
+ *
804
+ * \details Convert the half-precision floating-point value \p h to an unsigned short
805
+ * integer in round-up mode.
806
+ * \param[in] h - half. Is only being read.
807
+ *
808
+ * \returns unsigned short int
809
+ * \retval h converted to an unsigned short integer.
810
+ */
811
+ __CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h);
812
+
813
+ /**
814
+ * \ingroup CUDA_MATH__HALF_MISC
815
+ * \brief Convert an unsigned short integer to a half in round-to-nearest-even
816
+ * mode.
817
+ *
818
+ * \details Convert the unsigned short integer value \p i to a half-precision floating-point
819
+ * value in round-to-nearest-even mode.
820
+ * \param[in] i - unsigned short int. Is only being read.
821
+ *
822
+ * \returns half
823
+ * \retval i converted to half.
824
+ * \internal
825
+ * \exception-guarantee no-throw guarantee
826
+ * \behavior reentrant, thread safe
827
+ * \endinternal
828
+ */
829
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i);
830
+ /**
831
+ * \ingroup CUDA_MATH__HALF_MISC
832
+ * \brief Convert an unsigned short integer to a half in round-towards-zero
833
+ * mode.
834
+ *
835
+ * \details Convert the unsigned short integer value \p i to a half-precision floating-point
836
+ * value in round-towards-zero mode.
837
+ * \param[in] i - unsigned short int. Is only being read.
838
+ *
839
+ * \returns half
840
+ * \retval i converted to half.
841
+ * \internal
842
+ * \exception-guarantee no-throw guarantee
843
+ * \behavior reentrant, thread safe
844
+ * \endinternal
845
+ */
846
+ __CUDA_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i);
847
+ /**
848
+ * \ingroup CUDA_MATH__HALF_MISC
849
+ * \brief Convert an unsigned short integer to a half in round-down mode.
850
+ *
851
+ * \details Convert the unsigned short integer value \p i to a half-precision floating-point
852
+ * value in round-down mode.
853
+ * \param[in] i - unsigned short int. Is only being read.
854
+ *
855
+ * \returns half
856
+ * \retval i converted to half.
857
+ * \internal
858
+ * \exception-guarantee no-throw guarantee
859
+ * \behavior reentrant, thread safe
860
+ * \endinternal
861
+ */
862
+ __CUDA_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i);
863
+ /**
864
+ * \ingroup CUDA_MATH__HALF_MISC
865
+ * \brief Convert an unsigned short integer to a half in round-up mode.
866
+ *
867
+ * \details Convert the unsigned short integer value \p i to a half-precision floating-point
868
+ * value in round-up mode.
869
+ * \param[in] i - unsigned short int. Is only being read.
870
+ *
871
+ * \returns half
872
+ * \retval i converted to half.
873
+ * \internal
874
+ * \exception-guarantee no-throw guarantee
875
+ * \behavior reentrant, thread safe
876
+ * \endinternal
877
+ */
878
+ __CUDA_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i);
879
+
880
+ /**
881
+ * \ingroup CUDA_MATH__HALF_MISC
882
+ * \brief Convert a half to an unsigned 64-bit integer in round-to-nearest-even
883
+ * mode.
884
+ *
885
+ * \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
886
+ * integer in round-to-nearest-even mode.
887
+ * \param[in] h - half. Is only being read.
888
+ *
889
+ * \returns unsigned long long int
890
+ * \retval h converted to an unsigned 64-bit integer.
891
+ * \internal
892
+ * \exception-guarantee no-throw guarantee
893
+ * \behavior reentrant, thread safe
894
+ * \endinternal
895
+ */
896
+ __CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h);
897
+ /**
898
+ * \ingroup CUDA_MATH__HALF_MISC
899
+ * \brief Convert a half to an unsigned 64-bit integer in round-towards-zero
900
+ * mode.
901
+ *
902
+ * \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
903
+ * integer in round-towards-zero mode.
904
+ * \param[in] h - half. Is only being read.
905
+ *
906
+ * \returns unsigned long long int
907
+ * \retval h converted to an unsigned 64-bit integer.
908
+ * \internal
909
+ * \exception-guarantee no-throw guarantee
910
+ * \behavior reentrant, thread safe
911
+ * \endinternal
912
+ */
913
+ __CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h);
914
+ /**
915
+ * \ingroup CUDA_MATH__HALF_MISC
916
+ * \brief Convert a half to an unsigned 64-bit integer in round-down mode.
917
+ *
918
+ * \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
919
+ * integer in round-down mode.
920
+ * \param[in] h - half. Is only being read.
921
+ *
922
+ * \returns unsigned long long int
923
+ * \retval h converted to an unsigned 64-bit integer.
924
+ * \internal
925
+ * \exception-guarantee no-throw guarantee
926
+ * \behavior reentrant, thread safe
927
+ * \endinternal
928
+ */
929
+ __CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h);
930
+ /**
931
+ * \ingroup CUDA_MATH__HALF_MISC
932
+ * \brief Convert a half to an unsigned 64-bit integer in round-up mode.
933
+ *
934
+ * \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
935
+ * integer in round-up mode.
936
+ * \param[in] h - half. Is only being read.
937
+ *
938
+ * \returns unsigned long long int
939
+ * \retval h converted to an unsigned 64-bit integer.
940
+ * \internal
941
+ * \exception-guarantee no-throw guarantee
942
+ * \behavior reentrant, thread safe
943
+ * \endinternal
944
+ */
945
+ __CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h);
946
+
947
+ /**
948
+ * \ingroup CUDA_MATH__HALF_MISC
949
+ * \brief Convert an unsigned 64-bit integer to a half in round-to-nearest-even
950
+ * mode.
951
+ *
952
+ * \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point
953
+ * value in round-to-nearest-even mode.
954
+ * \param[in] i - unsigned long long int. Is only being read.
955
+ *
956
+ * \returns half
957
+ * \retval i converted to half.
958
+ * \internal
959
+ * \exception-guarantee no-throw guarantee
960
+ * \behavior reentrant, thread safe
961
+ * \endinternal
962
+ */
963
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i);
964
+ /**
965
+ * \ingroup CUDA_MATH__HALF_MISC
966
+ * \brief Convert an unsigned 64-bit integer to a half in round-towards-zero
967
+ * mode.
968
+ *
969
+ * \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point
970
+ * value in round-towards-zero mode.
971
+ * \param[in] i - unsigned long long int. Is only being read.
972
+ *
973
+ * \returns half
974
+ * \retval i converted to half.
975
+ * \internal
976
+ * \exception-guarantee no-throw guarantee
977
+ * \behavior reentrant, thread safe
978
+ * \endinternal
979
+ */
980
+ __CUDA_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i);
981
+ /**
982
+ * \ingroup CUDA_MATH__HALF_MISC
983
+ * \brief Convert an unsigned 64-bit integer to a half in round-down mode.
984
+ *
985
+ * \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point
986
+ * value in round-down mode.
987
+ * \param[in] i - unsigned long long int. Is only being read.
988
+ *
989
+ * \returns half
990
+ * \retval i converted to half.
991
+ * \internal
992
+ * \exception-guarantee no-throw guarantee
993
+ * \behavior reentrant, thread safe
994
+ * \endinternal
995
+ */
996
+ __CUDA_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i);
997
+ /**
998
+ * \ingroup CUDA_MATH__HALF_MISC
999
+ * \brief Convert an unsigned 64-bit integer to a half in round-up mode.
1000
+ *
1001
+ * \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point
1002
+ * value in round-up mode.
1003
+ * \param[in] i - unsigned long long int. Is only being read.
1004
+ *
1005
+ * \returns half
1006
+ * \retval i converted to half.
1007
+ * \internal
1008
+ * \exception-guarantee no-throw guarantee
1009
+ * \behavior reentrant, thread safe
1010
+ * \endinternal
1011
+ */
1012
+ __CUDA_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i);
1013
+
1014
+ /**
1015
+ * \ingroup CUDA_MATH__HALF_MISC
1016
+ * \brief Convert a half to a signed 64-bit integer in round-to-nearest-even
1017
+ * mode.
1018
+ *
1019
+ * \details Convert the half-precision floating-point value \p h to a signed 64-bit
1020
+ * integer in round-to-nearest-even mode.
1021
+ * \param[in] h - half. Is only being read.
1022
+ *
1023
+ * \returns long long int
1024
+ * \retval h converted to a signed 64-bit integer.
1025
+ * \internal
1026
+ * \exception-guarantee no-throw guarantee
1027
+ * \behavior reentrant, thread safe
1028
+ * \endinternal
1029
+ */
1030
+ __CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h);
1031
+ /**
1032
+ * \ingroup CUDA_MATH__HALF_MISC
1033
+ * \brief Convert a half to a signed 64-bit integer in round-towards-zero mode.
1034
+ *
1035
+ * \details Convert the half-precision floating-point value \p h to a signed 64-bit
1036
+ * integer in round-towards-zero mode.
1037
+ * \param[in] h - half. Is only being read.
1038
+ *
1039
+ * \returns long long int
1040
+ * \retval h converted to a signed 64-bit integer.
1041
+ * \internal
1042
+ * \exception-guarantee no-throw guarantee
1043
+ * \behavior reentrant, thread safe
1044
+ * \endinternal
1045
+ */
1046
+ __CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h);
1047
+ /**
1048
+ * \ingroup CUDA_MATH__HALF_MISC
1049
+ * \brief Convert a half to a signed 64-bit integer in round-down mode.
1050
+ *
1051
+ * \details Convert the half-precision floating-point value \p h to a signed 64-bit
1052
+ * integer in round-down mode.
1053
+ * \param[in] h - half. Is only being read.
1054
+ *
1055
+ * \returns long long int
1056
+ * \retval h converted to a signed 64-bit integer.
1057
+ * \internal
1058
+ * \exception-guarantee no-throw guarantee
1059
+ * \behavior reentrant, thread safe
1060
+ * \endinternal
1061
+ */
1062
+ __CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h);
1063
+ /**
1064
+ * \ingroup CUDA_MATH__HALF_MISC
1065
+ * \brief Convert a half to a signed 64-bit integer in round-up mode.
1066
+ *
1067
+ * \details Convert the half-precision floating-point value \p h to a signed 64-bit
1068
+ * integer in round-up mode.
1069
+ * \param[in] h - half. Is only being read.
1070
+ *
1071
+ * \returns long long int
1072
+ * \retval h converted to a signed 64-bit integer.
1073
+ * \internal
1074
+ * \exception-guarantee no-throw guarantee
1075
+ * \behavior reentrant, thread safe
1076
+ * \endinternal
1077
+ */
1078
+ __CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h);
1079
+
1080
+ /**
1081
+ * \ingroup CUDA_MATH__HALF_MISC
1082
+ * \brief Convert a signed 64-bit integer to a half in round-to-nearest-even
1083
+ * mode.
1084
+ *
1085
+ * \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
1086
+ * value in round-to-nearest-even mode.
1087
+ * \param[in] i - long long int. Is only being read.
1088
+ *
1089
+ * \returns half
1090
+ * \retval i converted to half.
1091
+ * \internal
1092
+ * \exception-guarantee no-throw guarantee
1093
+ * \behavior reentrant, thread safe
1094
+ * \endinternal
1095
+ */
1096
+ __CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i);
1097
+ /**
1098
+ * \ingroup CUDA_MATH__HALF_MISC
1099
+ * \brief Convert a signed 64-bit integer to a half in round-towards-zero mode.
1100
+ *
1101
+ * \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
1102
+ * value in round-towards-zero mode.
1103
+ * \param[in] i - long long int. Is only being read.
1104
+ *
1105
+ * \returns half
1106
+ * \retval i converted to half.
1107
+ */
1108
+ __CUDA_FP16_DECL__ __half __ll2half_rz(const long long int i);
1109
+ /**
1110
+ * \ingroup CUDA_MATH__HALF_MISC
1111
+ * \brief Convert a signed 64-bit integer to a half in round-down mode.
1112
+ *
1113
+ * \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
1114
+ * value in round-down mode.
1115
+ * \param[in] i - long long int. Is only being read.
1116
+ *
1117
+ * \returns half
1118
+ * \retval i converted to half.
1119
+ * \internal
1120
+ * \exception-guarantee no-throw guarantee
1121
+ * \behavior reentrant, thread safe
1122
+ * \endinternal
1123
+ */
1124
+ __CUDA_FP16_DECL__ __half __ll2half_rd(const long long int i);
1125
+ /**
1126
+ * \ingroup CUDA_MATH__HALF_MISC
1127
+ * \brief Convert a signed 64-bit integer to a half in round-up mode.
1128
+ *
1129
+ * \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
1130
+ * value in round-up mode.
1131
+ * \param[in] i - long long int. Is only being read.
1132
+ *
1133
+ * \returns half
1134
+ * \retval i converted to half.
1135
+ * \internal
1136
+ * \exception-guarantee no-throw guarantee
1137
+ * \behavior reentrant, thread safe
1138
+ * \endinternal
1139
+ */
1140
+ __CUDA_FP16_DECL__ __half __ll2half_ru(const long long int i);
1141
+
1142
+ /**
1143
+ * \ingroup CUDA_MATH__HALF_FUNCTIONS
1144
+ * \brief Truncate input argument to the integral part.
1145
+ *
1146
+ * \details Round \p h to the nearest integer value that does not exceed \p h in
1147
+ * magnitude.
1148
+ * \param[in] h - half. Is only being read.
1149
+ *
1150
+ * \returns half
1151
+ * \retval The truncated integer value.
1152
+ * \internal
1153
+ * \exception-guarantee no-throw guarantee
1154
+ * \behavior reentrant, thread safe
1155
+ * \endinternal
1156
+ */
1157
+ __CUDA_FP16_DECL__ __half htrunc(const __half h);
1158
+ /**
1159
+ * \ingroup CUDA_MATH__HALF_FUNCTIONS
1160
+ * \brief Calculate ceiling of the input argument.
1161
+ *
1162
+ * \details Compute the smallest integer value not less than \p h.
1163
+ * \param[in] h - half. Is only being read.
1164
+ *
1165
+ * \returns half
1166
+ * \retval The smallest integer value not less than \p h.
1167
+ * \internal
1168
+ * \exception-guarantee no-throw guarantee
1169
+ * \behavior reentrant, thread safe
1170
+ * \endinternal
1171
+ */
1172
+ __CUDA_FP16_DECL__ __half hceil(const __half h);
1173
+ /**
1174
+ * \ingroup CUDA_MATH__HALF_FUNCTIONS
1175
+ * \brief Calculate the largest integer less than or equal to \p h.
1176
+ *
1177
+ * \details Calculate the largest integer value which is less than or equal to \p h.
1178
+ * \param[in] h - half. Is only being read.
1179
+ *
1180
+ * \returns half
1181
+ * \retval The largest integer value which is less than or equal to \p h.
1182
+ * \internal
1183
+ * \exception-guarantee no-throw guarantee
1184
+ * \behavior reentrant, thread safe
1185
+ * \endinternal
1186
+ */
1187
+ __CUDA_FP16_DECL__ __half hfloor(const __half h);
1188
+ /**
1189
+ * \ingroup CUDA_MATH__HALF_FUNCTIONS
1190
+ * \brief Round input to the nearest integer value, represented as a half-precision
1191
+ * floating-point number.
1192
+ *
1193
+ * \details Round \p h to the nearest integer value in half-precision floating-point
1194
+ * format, with halfway cases rounded to the nearest even integer value.
1195
+ * \param[in] h - half. Is only being read.
1196
+ *
1197
+ * \returns half
1198
+ * \retval The nearest integer to \p h.
1199
+ * \internal
1200
+ * \exception-guarantee no-throw guarantee
1201
+ * \behavior reentrant, thread safe
1202
+ * \endinternal
1203
+ */
1204
+ __CUDA_FP16_DECL__ __half hrint(const __half h);
1205
+
1206
+ /**
1207
+ * \ingroup CUDA_MATH__HALF2_FUNCTIONS
1208
+ * \brief Truncate \p half2 vector input argument to the integral part.
1209
+ *
1210
+ * \details Round each component of vector \p h to the nearest integer value that does
1211
+ * not exceed \p h in magnitude.
1212
+ * \param[in] h - half2. Is only being read.
1213
+ *
1214
+ * \returns half2
1215
+ * \retval The truncated \p h.
1216
+ * \internal
1217
+ * \exception-guarantee no-throw guarantee
1218
+ * \behavior reentrant, thread safe
1219
+ * \endinternal
1220
+ */
1221
+ __CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h);
1222
+ /**
1223
+ * \ingroup CUDA_MATH__HALF2_FUNCTIONS
1224
+ * \brief Calculate \p half2 vector ceiling of the input argument.
1225
+ *
1226
+ * \details For each component of vector \p h compute the smallest integer value not less
1227
+ * than \p h.
1228
+ * \param[in] h - half2. Is only being read.
1229
+ *
1230
+ * \returns half2
1231
+ * \retval The vector of smallest integers not less than \p h.
1232
+ * \internal
1233
+ * \exception-guarantee no-throw guarantee
1234
+ * \behavior reentrant, thread safe
1235
+ * \endinternal
1236
+ */
1237
+ __CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h);
1238
+ /**
1239
+ * \ingroup CUDA_MATH__HALF2_FUNCTIONS
1240
+ * \brief Calculate \p half2 vector floor of the input argument.
1241
+ *
1242
+ * \details For each component of vector \p h calculate the largest integer value which
1243
+ * is less than or equal to \p h.
1244
+ * \param[in] h - half2. Is only being read.
1245
+ *
1246
+ * \returns half2
1247
+ * \retval The vector of largest integers which are less than or equal to \p h.
1248
+ * \internal
1249
+ * \exception-guarantee no-throw guarantee
1250
+ * \behavior reentrant, thread safe
1251
+ * \endinternal
1252
+ */
1253
+ __CUDA_FP16_DECL__ __half2 h2floor(const __half2 h);
1254
+ /**
1255
+ * \ingroup CUDA_MATH__HALF2_FUNCTIONS
1256
+ * \brief Round each component of the \p half2 input to the nearest integer value,
1257
+ * represented as a half-precision floating-point number.
1258
+ *
1259
+ * \details Round each component of \p half2 vector \p h to the nearest integer value in
1260
+ * half-precision floating-point format, with halfway cases rounded to the
1261
+ * nearest even integer value.
1262
+ * \param[in] h - half2. Is only being read.
1263
+ *
1264
+ * \returns half2
1265
+ * \retval The vector of rounded integer values.
1266
+ * \internal
1267
+ * \exception-guarantee no-throw guarantee
1268
+ * \behavior reentrant, thread safe
1269
+ * \endinternal
1270
+ */
1271
+ __CUDA_FP16_DECL__ __half2 h2rint(const __half2 h);
1272
+
1273
+ /**
1274
+ * \ingroup CUDA_MATH__HALF_MISC
1275
+ * \brief Returns \p half2 with both halves equal to the input value.
1276
+ *
1277
+ * \details Returns a \p half2 number with both halves equal to the input \p half
1278
+ * number \p a.
1279
+ * \param[in] a - half. Is only being read.
1280
+ *
1281
+ * \returns half2
1282
+ * \retval The vector which has both its halves equal to the input \p a.
1283
+ * \internal
1284
+ * \exception-guarantee no-throw guarantee
1285
+ * \behavior reentrant, thread safe
1286
+ * \endinternal
1287
+ */
1288
+ __CUDA_FP16_DECL__ __half2 __half2half2(const __half a);
1289
+ /**
1290
+ * \ingroup CUDA_MATH__HALF_MISC
1291
+ * \brief Swaps both halves of the \p half2 input.
1292
+ *
1293
+ * \details Swaps both halves of the \p half2 input and returns a new \p half2 number
1294
+ * with swapped halves.
1295
+ * \param[in] a - half2. Is only being read.
1296
+ *
1297
+ * \returns half2
1298
+ * \retval a with its halves being swapped.
1299
+ * \internal
1300
+ * \exception-guarantee no-throw guarantee
1301
+ * \behavior reentrant, thread safe
1302
+ * \endinternal
1303
+ */
1304
+ __CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a);
1305
+ /**
1306
+ * \ingroup CUDA_MATH__HALF_MISC
1307
+ * \brief Extracts low 16 bits from each of the two \p half2 inputs and combines
1308
+ * into one \p half2 number.
1309
+ *
1310
+ * \details Extracts low 16 bits from each of the two \p half2 inputs and combines into
1311
+ * one \p half2 number. Low 16 bits from input \p a is stored in low 16 bits of
1312
+ * the return value, low 16 bits from input \p b is stored in high 16 bits of
1313
+ * the return value.
1314
+ * \param[in] a - half2. Is only being read.
1315
+ * \param[in] b - half2. Is only being read.
1316
+ *
1317
+ * \returns half2
1318
+ * \retval The low 16 bits of \p a and of \p b.
1319
+ * \internal
1320
+ * \exception-guarantee no-throw guarantee
1321
+ * \behavior reentrant, thread safe
1322
+ * \endinternal
1323
+ */
1324
+ __CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b);
1325
+ /**
1326
+ * \ingroup CUDA_MATH__HALF_MISC
1327
+ * \brief Extracts high 16 bits from each of the two \p half2 inputs and
1328
+ * combines into one \p half2 number.
1329
+ *
1330
+ * \details Extracts high 16 bits from each of the two \p half2 inputs and combines into
1331
+ * one \p half2 number. High 16 bits from input \p a is stored in low 16 bits of
1332
+ * the return value, high 16 bits from input \p b is stored in high 16 bits of
1333
+ * the return value.
1334
+ * \param[in] a - half2. Is only being read.
1335
+ * \param[in] b - half2. Is only being read.
1336
+ *
1337
+ * \returns half2
1338
+ * \retval The high 16 bits of \p a and of \p b.
1339
+ * \internal
1340
+ * \exception-guarantee no-throw guarantee
1341
+ * \behavior reentrant, thread safe
1342
+ * \endinternal
1343
+ */
1344
+ __CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b);
1345
+ /**
1346
+ * \ingroup CUDA_MATH__HALF_MISC
1347
+ * \brief Returns high 16 bits of \p half2 input.
1348
+ *
1349
+ * \details Returns high 16 bits of \p half2 input \p a.
1350
+ * \param[in] a - half2. Is only being read.
1351
+ *
1352
+ * \returns half
1353
+ * \retval The high 16 bits of the input.
1354
+ * \internal
1355
+ * \exception-guarantee no-throw guarantee
1356
+ * \behavior reentrant, thread safe
1357
+ * \endinternal
1358
+ */
1359
+ __CUDA_FP16_DECL__ __half __high2half(const __half2 a);
1360
+ /**
1361
+ * \ingroup CUDA_MATH__HALF_MISC
1362
+ * \brief Returns low 16 bits of \p half2 input.
1363
+ *
1364
+ * \details Returns low 16 bits of \p half2 input \p a.
1365
+ * \param[in] a - half2. Is only being read.
1366
+ *
1367
+ * \returns half
1368
+ * \retval Returns \p half which contains low 16 bits of the input \p a.
1369
+ * \internal
1370
+ * \exception-guarantee no-throw guarantee
1371
+ * \behavior reentrant, thread safe
1372
+ * \endinternal
1373
+ */
1374
+ __CUDA_FP16_DECL__ __half __low2half(const __half2 a);
1375
+ /**
1376
+ * \ingroup CUDA_MATH__HALF_COMPARISON
1377
+ * \brief Checks if the input \p half number is infinite.
1378
+ *
1379
+ * \details Checks if the input \p half number \p a is infinite.
1380
+ * \param[in] a - half. Is only being read.
1381
+ *
1382
+ * \returns int
1383
+ * \retval -1 iff \p a is equal to negative infinity,
1384
+ * \retval 1 iff \p a is equal to positive infinity,
1385
+ * \retval 0 otherwise.
1386
+ * \internal
1387
+ * \exception-guarantee no-throw guarantee
1388
+ * \behavior reentrant, thread safe
1389
+ * \endinternal
1390
+ */
1391
+ __CUDA_FP16_DECL__ int __hisinf(const __half a);
1392
+ /**
1393
+ * \ingroup CUDA_MATH__HALF_MISC
1394
+ * \brief Combines two \p half numbers into one \p half2 number.
1395
+ *
1396
+ * \details Combines two input \p half number \p a and \p b into one \p half2 number.
1397
+ * Input \p a is stored in low 16 bits of the return value, input \p b is stored
1398
+ * in high 16 bits of the return value.
1399
+ * \param[in] a - half. Is only being read.
1400
+ * \param[in] b - half. Is only being read.
1401
+ *
1402
+ * \returns half2
1403
+ * \retval The half2 with one half equal to \p a and the other to \p b.
1404
+ * \internal
1405
+ * \exception-guarantee no-throw guarantee
1406
+ * \behavior reentrant, thread safe
1407
+ * \endinternal
1408
+ */
1409
+ __CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b);
1410
+ /**
1411
+ * \ingroup CUDA_MATH__HALF_MISC
1412
+ * \brief Extracts low 16 bits from \p half2 input.
1413
+ *
1414
+ * \details Extracts low 16 bits from \p half2 input \p a and returns a new \p half2
1415
+ * number which has both halves equal to the extracted bits.
1416
+ * \param[in] a - half2. Is only being read.
1417
+ *
1418
+ * \returns half2
1419
+ * \retval The half2 with both halves equal to the low 16 bits of the input.
1420
+ * \internal
1421
+ * \exception-guarantee no-throw guarantee
1422
+ * \behavior reentrant, thread safe
1423
+ * \endinternal
1424
+ */
1425
+ __CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a);
1426
+ /**
1427
+ * \ingroup CUDA_MATH__HALF_MISC
1428
+ * \brief Extracts high 16 bits from \p half2 input.
1429
+ *
1430
+ * \details Extracts high 16 bits from \p half2 input \p a and returns a new \p half2
1431
+ * number which has both halves equal to the extracted bits.
1432
+ * \param[in] a - half2. Is only being read.
1433
+ *
1434
+ * \returns half2
1435
+ * \retval The half2 with both halves equal to the high 16 bits of the input.
1436
+ * \internal
1437
+ * \exception-guarantee no-throw guarantee
1438
+ * \behavior reentrant, thread safe
1439
+ * \endinternal
1440
+ */
1441
+ __CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a);
1442
+
1443
+ /**
1444
+ * \ingroup CUDA_MATH__HALF_MISC
1445
+ * \brief Reinterprets bits in a \p half as a signed short integer.
1446
+ *
1447
+ * \details Reinterprets the bits in the half-precision floating-point number \p h
1448
+ * as a signed short integer.
1449
+ * \param[in] h - half. Is only being read.
1450
+ *
1451
+ * \returns short int
1452
+ * \retval The reinterpreted value.
1453
+ * \internal
1454
+ * \exception-guarantee no-throw guarantee
1455
+ * \behavior reentrant, thread safe
1456
+ * \endinternal
1457
+ */
1458
+ __CUDA_FP16_DECL__ short int __half_as_short(const __half h);
1459
+ /**
1460
+ * \ingroup CUDA_MATH__HALF_MISC
1461
+ * \brief Reinterprets bits in a \p half as an unsigned short integer.
1462
+ *
1463
+ * \details Reinterprets the bits in the half-precision floating-point \p h
1464
+ * as an unsigned short number.
1465
+ * \param[in] h - half. Is only being read.
1466
+ *
1467
+ * \returns unsigned short int
1468
+ * \retval The reinterpreted value.
1469
+ * \internal
1470
+ * \exception-guarantee no-throw guarantee
1471
+ * \behavior reentrant, thread safe
1472
+ * \endinternal
1473
+ */
1474
+ __CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h);
1475
+ /**
1476
+ * \ingroup CUDA_MATH__HALF_MISC
1477
+ * \brief Reinterprets bits in a signed short integer as a \p half.
1478
+ *
1479
+ * \details Reinterprets the bits in the signed short integer \p i as a
1480
+ * half-precision floating-point number.
1481
+ * \param[in] i - short int. Is only being read.
1482
+ *
1483
+ * \returns half
1484
+ * \retval The reinterpreted value.
1485
+ * \internal
1486
+ * \exception-guarantee no-throw guarantee
1487
+ * \behavior reentrant, thread safe
1488
+ * \endinternal
1489
+ */
1490
+ __CUDA_FP16_DECL__ __half __short_as_half(const short int i);
1491
+ /**
1492
+ * \ingroup CUDA_MATH__HALF_MISC
1493
+ * \brief Reinterprets bits in an unsigned short integer as a \p half.
1494
+ *
1495
+ * \details Reinterprets the bits in the unsigned short integer \p i as a
1496
+ * half-precision floating-point number.
1497
+ * \param[in] i - unsigned short int. Is only being read.
1498
+ *
1499
+ * \returns half
1500
+ * \retval The reinterpreted value.
1501
+ * \internal
1502
+ * \exception-guarantee no-throw guarantee
1503
+ * \behavior reentrant, thread safe
1504
+ * \endinternal
1505
+ */
1506
+ __CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i);
1507
+
1508
+ #if __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)
1509
+ #if !defined warpSize && !defined __local_warpSize
1510
+ #define warpSize 32
1511
+ #define __local_warpSize
1512
+ #endif
1513
+
1514
+ #if defined(_WIN32)
1515
+ # define __DEPRECATED__(msg) __declspec(deprecated(msg))
1516
+ #elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
1517
+ # define __DEPRECATED__(msg) __attribute__((deprecated))
1518
+ #else
1519
+ # define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
1520
+ #endif
1521
+
1522
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
1523
+ #define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
1524
+
1525
+ __CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half2 __shfl(const __half2 var, const int delta, const int width = warpSize);
1526
+ __CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width = warpSize);
1527
+ __CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down))__half2 __shfl_down(const __half2 var, const unsigned int delta, const int width = warpSize);
1528
+ __CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half2 __shfl_xor(const __half2 var, const int delta, const int width = warpSize);
1529
+ __CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half __shfl(const __half var, const int delta, const int width = warpSize);
1530
+ __CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half __shfl_up(const __half var, const unsigned int delta, const int width = warpSize);
1531
+ __CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) __half __shfl_down(const __half var, const unsigned int delta, const int width = warpSize);
1532
+ __CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half __shfl_xor(const __half var, const int delta, const int width = warpSize);
1533
+ #endif
1534
+
1535
+ /**
1536
+ * \ingroup CUDA_MATH__HALF_MISC
1537
+ * \brief Exchange a variable between threads within a warp. Direct copy from indexed thread.
1538
+ *
1539
+ * \details Returns the value of var held by the thread whose ID is given by delta.
1540
+ * If width is less than warpSize then each subsection of the warp behaves as a separate
1541
+ * entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1],
1542
+ * the value returned corresponds to the value of var held by the thread whose ID is delta modulo width (i.e.
1543
+ * within the same subsection). width must have a value which is a power of 2;
1544
+ * results are undefined if width is not a power of 2, or is a number greater than
1545
+ * warpSize.
1546
+ * \param[in] mask - unsigned int. Is only being read.
1547
+ * \param[in] var - half2. Is only being read.
1548
+ * \param[in] delta - int. Is only being read.
1549
+ * \param[in] width - int. Is only being read.
1550
+ *
1551
+ * \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
1552
+ * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1553
+ * \internal
1554
+ * \exception-guarantee no-throw guarantee
1555
+ * \behavior not reentrant, not thread safe
1556
+ * \endinternal
1557
+ */
1558
+ __CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned mask, const __half2 var, const int delta, const int width = warpSize);
1559
+ /**
1560
+ * \ingroup CUDA_MATH__HALF_MISC
1561
+ * \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller.
1562
+ *
1563
+ * \details Calculates a source thread ID by subtracting delta from the caller's lane ID.
1564
+ * The value of var held by the resulting lane ID is returned: in effect, var is shifted up
1565
+ * the warp by delta threads. If width is less than warpSize then each subsection of the warp
1566
+ * behaves as a separate entity with a starting logical thread ID of 0. The source thread index
1567
+ * will not wrap around the value of width, so effectively the lower delta threads will be unchanged.
1568
+ * width must have a value which is a power of 2; results are undefined if width is not a power of 2,
1569
+ * or is a number greater than warpSize.
1570
+ * \param[in] mask - unsigned int. Is only being read.
1571
+ * \param[in] var - half2. Is only being read.
1572
+ * \param[in] delta - unsigned int. Is only being read.
1573
+ * \param[in] width - int. Is only being read.
1574
+ *
1575
+ * \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
1576
+ * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1577
+ * \internal
1578
+ * \exception-guarantee no-throw guarantee
1579
+ * \behavior not reentrant, not thread safe
1580
+ * \endinternal
1581
+ */
1582
+ __CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width = warpSize);
1583
+ /**
1584
+ * \ingroup CUDA_MATH__HALF_MISC
1585
+ * \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller.
1586
+ *
1587
+ * \details Calculates a source thread ID by adding delta to the caller's thread ID.
1588
+ * The value of var held by the resulting thread ID is returned: this has the effect
1589
+ * of shifting var down the warp by delta threads. If width is less than warpSize then
1590
+ * each subsection of the warp behaves as a separate entity with a starting logical
1591
+ * thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread
1592
+ * will not wrap around the value of width and so the upper delta threads
1593
+ * will remain unchanged.
1594
+ * \param[in] mask - unsigned int. Is only being read.
1595
+ * \param[in] var - half2. Is only being read.
1596
+ * \param[in] delta - unsigned int. Is only being read.
1597
+ * \param[in] width - int. Is only being read.
1598
+ *
1599
+ * \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
1600
+ * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1601
+ * \internal
1602
+ * \exception-guarantee no-throw guarantee
1603
+ * \behavior not reentrant, not thread safe
1604
+ * \endinternal
1605
+ */
1606
+ __CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width = warpSize);
1607
+ /**
1608
+ * \ingroup CUDA_MATH__HALF_MISC
1609
+ * \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID.
1610
+ *
1611
+ * \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with delta:
1612
+ * the value of var held by the resulting thread ID is returned. If width is less than warpSize then each
1613
+ * group of width consecutive threads are able to access elements from earlier groups of threads,
1614
+ * however if they attempt to access elements from later groups of threads their own value of var
1615
+ * will be returned. This mode implements a butterfly addressing pattern such as is used in tree
1616
+ * reduction and broadcast.
1617
+ * \param[in] mask - unsigned int. Is only being read.
1618
+ * \param[in] var - half2. Is only being read.
1619
+ * \param[in] delta - int. Is only being read.
1620
+ * \param[in] width - int. Is only being read.
1621
+ *
1622
+ * \returns Returns the 4-byte word referenced by var from the source thread ID as half2.
1623
+ * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1624
+ * \internal
1625
+ * \exception-guarantee no-throw guarantee
1626
+ * \behavior not reentrant, not thread safe
1627
+ * \endinternal
1628
+ */
1629
+ __CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 var, const int delta, const int width = warpSize);
1630
+ /**
1631
+ * \ingroup CUDA_MATH__HALF_MISC
1632
+ * \brief Exchange a variable between threads within a warp. Direct copy from indexed thread.
1633
+ *
1634
+ * \details Returns the value of var held by the thread whose ID is given by delta.
1635
+ * If width is less than warpSize then each subsection of the warp behaves as a separate
1636
+ * entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1],
1637
+ * the value returned corresponds to the value of var held by the thread whose ID is delta modulo width (i.e.
1638
+ * within the same subsection). width must have a value which is a power of 2;
1639
+ * results are undefined if width is not a power of 2, or is a number greater than
1640
+ * warpSize.
1641
+ * \param[in] mask - unsigned int. Is only being read.
1642
+ * \param[in] var - half. Is only being read.
1643
+ * \param[in] delta - int. Is only being read.
1644
+ * \param[in] width - int. Is only being read.
1645
+ *
1646
+ * \returns Returns the 2-byte word referenced by var from the source thread ID as half.
1647
+ * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1648
+ * \internal
1649
+ * \exception-guarantee no-throw guarantee
1650
+ * \behavior not reentrant, not thread safe
1651
+ * \endinternal
1652
+ */
1653
+ __CUDA_FP16_DECL__ __half __shfl_sync(const unsigned mask, const __half var, const int delta, const int width = warpSize);
1654
+ /**
1655
+ * \ingroup CUDA_MATH__HALF_MISC
1656
+ * \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller.
1657
+ * \details Calculates a source thread ID by subtracting delta from the caller's lane ID.
1658
+ * The value of var held by the resulting lane ID is returned: in effect, var is shifted up
1659
+ * the warp by delta threads. If width is less than warpSize then each subsection of the warp
1660
+ * behaves as a separate entity with a starting logical thread ID of 0. The source thread index
1661
+ * will not wrap around the value of width, so effectively the lower delta threads will be unchanged.
1662
+ * width must have a value which is a power of 2; results are undefined if width is not a power of 2,
1663
+ * or is a number greater than warpSize.
1664
+ * \param[in] mask - unsigned int. Is only being read.
1665
+ * \param[in] var - half. Is only being read.
1666
+ * \param[in] delta - unsigned int. Is only being read.
1667
+ * \param[in] width - int. Is only being read.
1668
+ *
1669
+ * \returns Returns the 2-byte word referenced by var from the source thread ID as half.
1670
+ * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1671
+ * \internal
1672
+ * \exception-guarantee no-throw guarantee
1673
+ * \behavior not reentrant, not thread safe
1674
+ * \endinternal
1675
+ */
1676
+ __CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned mask, const __half var, const unsigned int delta, const int width = warpSize);
1677
+ /**
1678
+ * \ingroup CUDA_MATH__HALF_MISC
1679
+ * \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller.
1680
+ *
1681
+ * \details Calculates a source thread ID by adding delta to the caller's thread ID.
1682
+ * The value of var held by the resulting thread ID is returned: this has the effect
1683
+ * of shifting var down the warp by delta threads. If width is less than warpSize then
1684
+ * each subsection of the warp behaves as a separate entity with a starting logical
1685
+ * thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread
1686
+ * will not wrap around the value of width and so the upper delta threads
1687
+ * will remain unchanged.
1688
+ * \param[in] mask - unsigned int. Is only being read.
1689
+ * \param[in] var - half. Is only being read.
1690
+ * \param[in] delta - unsigned int. Is only being read.
1691
+ * \param[in] width - int. Is only being read.
1692
+ *
1693
+ * \returns Returns the 2-byte word referenced by var from the source thread ID as half.
1694
+ * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1695
+ * \internal
1696
+ * \exception-guarantee no-throw guarantee
1697
+ * \behavior not reentrant, not thread safe
1698
+ * \endinternal
1699
+ */
1700
+ __CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned mask, const __half var, const unsigned int delta, const int width = warpSize);
1701
+ /**
1702
+ * \ingroup CUDA_MATH__HALF_MISC
1703
+ * \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID.
1704
+ *
1705
+ * \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with delta:
1706
+ * the value of var held by the resulting thread ID is returned. If width is less than warpSize then each
1707
+ * group of width consecutive threads are able to access elements from earlier groups of threads,
1708
+ * however if they attempt to access elements from later groups of threads their own value of var
1709
+ * will be returned. This mode implements a butterfly addressing pattern such as is used in tree
1710
+ * reduction and broadcast.
1711
+ * \param[in] mask - unsigned int. Is only being read.
1712
+ * \param[in] var - half. Is only being read.
1713
+ * \param[in] delta - int. Is only being read.
1714
+ * \param[in] width - int. Is only being read.
1715
+ *
1716
+ * \returns Returns the 2-byte word referenced by var from the source thread ID as half.
1717
+ * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1718
+ * \internal
1719
+ * \exception-guarantee no-throw guarantee
1720
+ * \behavior not reentrant, not thread safe
1721
+ * \endinternal
1722
+ */
1723
+ __CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned mask, const __half var, const int delta, const int width = warpSize);
1724
+
1725
+ #if defined(__local_warpSize)
1726
+ #undef warpSize
1727
+ #undef __local_warpSize
1728
+ #endif
1729
+ #endif /*__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__) */
1730
+
1731
+ #if defined(__cplusplus) && ( __CUDA_ARCH__ >=320 || !defined(__CUDA_ARCH__) )
1732
+ /**
1733
+ * \ingroup CUDA_MATH__HALF_MISC
1734
+ * \brief Generates a `ld.global.nc` load instruction.
1735
+ * \param[in] ptr - memory location
1736
+ * \returns The value pointed to by `ptr`
1737
+ */
1738
+ __CUDA_FP16_DECL__ __half2 __ldg(const __half2 *const ptr);
1739
+ /**
1740
+ * \ingroup CUDA_MATH__HALF_MISC
1741
+ * \brief Generates a `ld.global.nc` load instruction.
1742
+ * \param[in] ptr - memory location
1743
+ * \returns The value pointed to by `ptr`
1744
+ */
1745
+ __CUDA_FP16_DECL__ __half __ldg(const __half *const ptr);
1746
+ /**
1747
+ * \ingroup CUDA_MATH__HALF_MISC
1748
+ * \brief Generates a `ld.global.cg` load instruction.
1749
+ * \param[in] ptr - memory location
1750
+ * \returns The value pointed to by `ptr`
1751
+ */
1752
+ __CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *const ptr);
1753
+ /**
1754
+ * \ingroup CUDA_MATH__HALF_MISC
1755
+ * \brief Generates a `ld.global.cg` load instruction.
1756
+ * \param[in] ptr - memory location
1757
+ * \returns The value pointed to by `ptr`
1758
+ */
1759
+ __CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr);
1760
+ /**
1761
+ * \ingroup CUDA_MATH__HALF_MISC
1762
+ * \brief Generates a `ld.global.ca` load instruction.
1763
+ * \param[in] ptr - memory location
1764
+ * \returns The value pointed to by `ptr`
1765
+ */
1766
+ __CUDA_FP16_DECL__ __half2 __ldca(const __half2 *const ptr);
1767
+ /**
1768
+ * \ingroup CUDA_MATH__HALF_MISC
1769
+ * \brief Generates a `ld.global.ca` load instruction.
1770
+ * \param[in] ptr - memory location
1771
+ * \returns The value pointed to by `ptr`
1772
+ */
1773
+ __CUDA_FP16_DECL__ __half __ldca(const __half *const ptr);
1774
+ /**
1775
+ * \ingroup CUDA_MATH__HALF_MISC
1776
+ * \brief Generates a `ld.global.cs` load instruction.
1777
+ * \param[in] ptr - memory location
1778
+ * \returns The value pointed to by `ptr`
1779
+ */
1780
+ __CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *const ptr);
1781
+ /**
1782
+ * \ingroup CUDA_MATH__HALF_MISC
1783
+ * \brief Generates a `ld.global.cs` load instruction.
1784
+ * \param[in] ptr - memory location
1785
+ * \returns The value pointed to by `ptr`
1786
+ */
1787
+ __CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr);
1788
+ /**
1789
+ * \ingroup CUDA_MATH__HALF_MISC
1790
+ * \brief Generates a `ld.global.lu` load instruction.
1791
+ * \param[in] ptr - memory location
1792
+ * \returns The value pointed to by `ptr`
1793
+ */
1794
+ __CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *const ptr);
1795
+ /**
1796
+ * \ingroup CUDA_MATH__HALF_MISC
1797
+ * \brief Generates a `ld.global.lu` load instruction.
1798
+ * \param[in] ptr - memory location
1799
+ * \returns The value pointed to by `ptr`
1800
+ */
1801
+ __CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr);
1802
+ /**
1803
+ * \ingroup CUDA_MATH__HALF_MISC
1804
+ * \brief Generates a `ld.global.cv` load instruction.
1805
+ * \param[in] ptr - memory location
1806
+ * \returns The value pointed to by `ptr`
1807
+ */
1808
+ __CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *const ptr);
1809
+ /**
1810
+ * \ingroup CUDA_MATH__HALF_MISC
1811
+ * \brief Generates a `ld.global.cv` load instruction.
1812
+ * \param[in] ptr - memory location
1813
+ * \returns The value pointed to by `ptr`
1814
+ */
1815
+ __CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr);
1816
+ /**
1817
+ * \ingroup CUDA_MATH__HALF_MISC
1818
+ * \brief Generates a `st.global.wb` store instruction.
1819
+ * \param[out] ptr - memory location
1820
+ * \param[in] value - the value to be stored
1821
+ */
1822
+ __CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value);
1823
+ /**
1824
+ * \ingroup CUDA_MATH__HALF_MISC
1825
+ * \brief Generates a `st.global.wb` store instruction.
1826
+ * \param[out] ptr - memory location
1827
+ * \param[in] value - the value to be stored
1828
+ */
1829
+ __CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value);
1830
+ /**
1831
+ * \ingroup CUDA_MATH__HALF_MISC
1832
+ * \brief Generates a `st.global.cg` store instruction.
1833
+ * \param[out] ptr - memory location
1834
+ * \param[in] value - the value to be stored
1835
+ */
1836
+ __CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value);
1837
+ /**
1838
+ * \ingroup CUDA_MATH__HALF_MISC
1839
+ * \brief Generates a `st.global.cg` store instruction.
1840
+ * \param[out] ptr - memory location
1841
+ * \param[in] value - the value to be stored
1842
+ */
1843
+ __CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value);
1844
+ /**
1845
+ * \ingroup CUDA_MATH__HALF_MISC
1846
+ * \brief Generates a `st.global.cs` store instruction.
1847
+ * \param[out] ptr - memory location
1848
+ * \param[in] value - the value to be stored
1849
+ */
1850
+ __CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value);
1851
+ /**
1852
+ * \ingroup CUDA_MATH__HALF_MISC
1853
+ * \brief Generates a `st.global.cs` store instruction.
1854
+ * \param[out] ptr - memory location
1855
+ * \param[in] value - the value to be stored
1856
+ */
1857
+ __CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value);
1858
+ /**
1859
+ * \ingroup CUDA_MATH__HALF_MISC
1860
+ * \brief Generates a `st.global.wt` store instruction.
1861
+ * \param[out] ptr - memory location
1862
+ * \param[in] value - the value to be stored
1863
+ */
1864
+ __CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value);
1865
+ /**
1866
+ * \ingroup CUDA_MATH__HALF_MISC
1867
+ * \brief Generates a `st.global.wt` store instruction.
1868
+ * \param[out] ptr - memory location
1869
+ * \param[in] value - the value to be stored
1870
+ */
1871
+ __CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value);
1872
+ #endif /*defined(__cplusplus) && ( __CUDA_ARCH__ >=320 || !defined(__CUDA_ARCH__) )*/
1873
+
1874
+ #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
1875
+ /**
1876
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
1877
+ * \brief Performs \p half2 vector if-equal comparison.
1878
+ *
1879
+ * \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
1880
+ * The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
1881
+ * NaN inputs generate false results.
1882
+ * \param[in] a - half2. Is only being read.
1883
+ * \param[in] b - half2. Is only being read.
1884
+ *
1885
+ * \returns half2
1886
+ * \retval The vector result of if-equal comparison of vectors \p a and \p b.
1887
+ * \internal
1888
+ * \exception-guarantee no-throw guarantee
1889
+ * \behavior reentrant, thread safe
1890
+ * \endinternal
1891
+ */
1892
+ __CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b);
1893
+ /**
1894
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
1895
+ * \brief Performs \p half2 vector not-equal comparison.
1896
+ *
1897
+ * \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
1898
+ * The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
1899
+ * NaN inputs generate false results.
1900
+ * \param[in] a - half2. Is only being read.
1901
+ * \param[in] b - half2. Is only being read.
1902
+ *
1903
+ * \returns half2
1904
+ * \retval The vector result of not-equal comparison of vectors \p a and \p b.
1905
+ * \internal
1906
+ * \exception-guarantee no-throw guarantee
1907
+ * \behavior reentrant, thread safe
1908
+ * \endinternal
1909
+ */
1910
+ __CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b);
1911
+ /**
1912
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
1913
+ * \brief Performs \p half2 vector less-equal comparison.
1914
+ *
1915
+ * \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
1916
+ * The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
1917
+ * NaN inputs generate false results.
1918
+ * \param[in] a - half2. Is only being read.
1919
+ * \param[in] b - half2. Is only being read.
1920
+ *
1921
+ * \returns half2
1922
+ * \retval The \p half2 result of less-equal comparison of vectors \p a and \p b.
1923
+ * \internal
1924
+ * \exception-guarantee no-throw guarantee
1925
+ * \behavior reentrant, thread safe
1926
+ * \endinternal
1927
+ */
1928
+ __CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b);
1929
+ /**
1930
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
1931
+ * \brief Performs \p half2 vector greater-equal comparison.
1932
+ *
1933
+ * \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
1934
+ * The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
1935
+ * NaN inputs generate false results.
1936
+ * \param[in] a - half2. Is only being read.
1937
+ * \param[in] b - half2. Is only being read.
1938
+ *
1939
+ * \returns half2
1940
+ * \retval The vector result of greater-equal comparison of vectors \p a and \p b.
1941
+ * \internal
1942
+ * \exception-guarantee no-throw guarantee
1943
+ * \behavior reentrant, thread safe
1944
+ * \endinternal
1945
+ */
1946
+ __CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b);
1947
+ /**
1948
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
1949
+ * \brief Performs \p half2 vector less-than comparison.
1950
+ *
1951
+ * \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
1952
+ * The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
1953
+ * NaN inputs generate false results.
1954
+ * \param[in] a - half2. Is only being read.
1955
+ * \param[in] b - half2. Is only being read.
1956
+ *
1957
+ * \returns half2
1958
+ * \retval The half2 vector result of less-than comparison of vectors \p a and \p b.
1959
+ * \internal
1960
+ * \exception-guarantee no-throw guarantee
1961
+ * \behavior reentrant, thread safe
1962
+ * \endinternal
1963
+ */
1964
+ __CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b);
1965
+ /**
1966
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
1967
+ * \brief Performs \p half2 vector greater-than comparison.
1968
+ *
1969
+ * \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
1970
+ * The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
1971
+ * NaN inputs generate false results.
1972
+ * \param[in] a - half2. Is only being read.
1973
+ * \param[in] b - half2. Is only being read.
1974
+ *
1975
+ * \returns half2
1976
+ * \retval The vector result of greater-than comparison of vectors \p a and \p b.
1977
+ * \internal
1978
+ * \exception-guarantee no-throw guarantee
1979
+ * \behavior reentrant, thread safe
1980
+ * \endinternal
1981
+ */
1982
+ __CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b);
1983
+ /**
1984
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
1985
+ * \brief Performs \p half2 vector unordered if-equal comparison.
1986
+ *
1987
+ * \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
1988
+ * The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
1989
+ * NaN inputs generate true results.
1990
+ * \param[in] a - half2. Is only being read.
1991
+ * \param[in] b - half2. Is only being read.
1992
+ *
1993
+ * \returns half2
1994
+ * \retval The vector result of unordered if-equal comparison of vectors \p a and \p b.
1995
+ * \internal
1996
+ * \exception-guarantee no-throw guarantee
1997
+ * \behavior reentrant, thread safe
1998
+ * \endinternal
1999
+ */
2000
+ __CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b);
2001
+ /**
2002
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
2003
+ * \brief Performs \p half2 vector unordered not-equal comparison.
2004
+ *
2005
+ * \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
2006
+ * The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
2007
+ * NaN inputs generate true results.
2008
+ * \param[in] a - half2. Is only being read.
2009
+ * \param[in] b - half2. Is only being read.
2010
+ *
2011
+ * \returns half2
2012
+ * \retval The vector result of unordered not-equal comparison of vectors \p a and \p b.
2013
+ * \internal
2014
+ * \exception-guarantee no-throw guarantee
2015
+ * \behavior reentrant, thread safe
2016
+ * \endinternal
2017
+ */
2018
+ __CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b);
2019
+ /**
2020
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
2021
+ * \brief Performs \p half2 vector unordered less-equal comparison.
2022
+ *
2023
+ * \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
2024
+ * The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
2025
+ * NaN inputs generate true results.
2026
+ * \param[in] a - half2. Is only being read.
2027
+ * \param[in] b - half2. Is only being read.
2028
+ *
2029
+ * \returns half2
2030
+ * \retval The vector result of unordered less-equal comparison of vectors \p a and \p b.
2031
+ * \internal
2032
+ * \exception-guarantee no-throw guarantee
2033
+ * \behavior reentrant, thread safe
2034
+ * \endinternal
2035
+ */
2036
+ __CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b);
2037
+ /**
2038
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
2039
+ * \brief Performs \p half2 vector unordered greater-equal comparison.
2040
+ *
2041
+ * \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
2042
+ * The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
2043
+ * NaN inputs generate true results.
2044
+ * \param[in] a - half2. Is only being read.
2045
+ * \param[in] b - half2. Is only being read.
2046
+ *
2047
+ * \returns half2
2048
+ * \retval The \p half2 vector result of unordered greater-equal comparison of vectors \p a and \p b.
2049
+ * \internal
2050
+ * \exception-guarantee no-throw guarantee
2051
+ * \behavior reentrant, thread safe
2052
+ * \endinternal
2053
+ */
2054
+ __CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b);
2055
+ /**
2056
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
2057
+ * \brief Performs \p half2 vector unordered less-than comparison.
2058
+ *
2059
+ * \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
2060
+ * The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
2061
+ * NaN inputs generate true results.
2062
+ * \param[in] a - half2. Is only being read.
2063
+ * \param[in] b - half2. Is only being read.
2064
+ *
2065
+ * \returns half2
2066
+ * \retval The vector result of unordered less-than comparison of vectors \p a and \p b.
2067
+ * \internal
2068
+ * \exception-guarantee no-throw guarantee
2069
+ * \behavior reentrant, thread safe
2070
+ * \endinternal
2071
+ */
2072
+ __CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b);
2073
+ /**
2074
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
2075
+ * \brief Performs \p half2 vector unordered greater-than comparison.
2076
+ *
2077
+ * \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
2078
+ * The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
2079
+ * NaN inputs generate true results.
2080
+ * \param[in] a - half2. Is only being read.
2081
+ * \param[in] b - half2. Is only being read.
2082
+ *
2083
+ * \returns half2
2084
+ * \retval The \p half2 vector result of unordered greater-than comparison of vectors \p a and \p b.
2085
+ * \internal
2086
+ * \exception-guarantee no-throw guarantee
2087
+ * \behavior reentrant, thread safe
2088
+ * \endinternal
2089
+ */
2090
+ __CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b);
2091
+ /**
2092
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
2093
+ * \brief Determine whether \p half2 argument is a NaN.
2094
+ *
2095
+ * \details Determine whether each half of input \p half2 number \p a is a NaN.
2096
+ * \param[in] a - half2. Is only being read.
2097
+ *
2098
+ * \returns half2
2099
+ * \retval The half2 with the corresponding \p half results set to
2100
+ * 1.0 for NaN, 0.0 otherwise.
2101
+ * \internal
2102
+ * \exception-guarantee no-throw guarantee
2103
+ * \behavior reentrant, thread safe
2104
+ * \endinternal
2105
+ */
2106
+ __CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a);
2107
+ /**
2108
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
2109
+ * \brief Performs \p half2 vector addition in round-to-nearest-even mode.
2110
+ *
2111
+ * \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest-even
2112
+ * mode.
2113
+ * \internal
2114
+ * \req DEEPLEARN-SRM_REQ-95
2115
+ * \endinternal
2116
+ * \param[in] a - half2. Is only being read.
2117
+ * \param[in] b - half2. Is only being read.
2118
+ *
2119
+ * \returns half2
2120
+ * \retval The sum of vectors \p a and \p b.
2121
+ * \internal
2122
+ * \exception-guarantee no-throw guarantee
2123
+ * \behavior reentrant, thread safe
2124
+ * \endinternal
2125
+ */
2126
+ __CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b);
2127
+ /**
2128
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
2129
+ * \brief Performs \p half2 vector subtraction in round-to-nearest-even mode.
2130
+ *
2131
+ * \details Subtracts \p half2 input vector \p b from input vector \p a in
2132
+ * round-to-nearest-even mode.
2133
+ * \internal
2134
+ * \req DEEPLEARN-SRM_REQ-104
2135
+ * \endinternal
2136
+ * \param[in] a - half2. Is only being read.
2137
+ * \param[in] b - half2. Is only being read.
2138
+ *
2139
+ * \returns half2
2140
+ * \retval The subtraction of vector \p b from \p a.
2141
+ * \internal
2142
+ * \exception-guarantee no-throw guarantee
2143
+ * \behavior reentrant, thread safe
2144
+ * \endinternal
2145
+ */
2146
+ __CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b);
2147
+ /**
2148
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
2149
+ * \brief Performs \p half2 vector multiplication in round-to-nearest-even mode.
2150
+ *
2151
+ * \details Performs \p half2 vector multiplication of inputs \p a and \p b, in
2152
+ * round-to-nearest-even mode.
2153
+ * \internal
2154
+ * \req DEEPLEARN-SRM_REQ-102
2155
+ * \endinternal
2156
+ * \param[in] a - half2. Is only being read.
2157
+ * \param[in] b - half2. Is only being read.
2158
+ *
2159
+ * \returns half2
2160
+ * \retval The result of elementwise multiplying the vectors \p a and \p b.
2161
+ * \internal
2162
+ * \exception-guarantee no-throw guarantee
2163
+ * \behavior reentrant, thread safe
2164
+ * \endinternal
2165
+ */
2166
+ __CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b);
2167
+ /**
2168
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
2169
+ * \brief Performs \p half2 vector division in round-to-nearest-even mode.
2170
+ *
2171
+ * \details Divides \p half2 input vector \p a by input vector \p b in round-to-nearest-even
2172
+ * mode.
2173
+ * \internal
2174
+ * \req DEEPLEARN-SRM_REQ-103
2175
+ * \endinternal
2176
+ * \param[in] a - half2. Is only being read.
2177
+ * \param[in] b - half2. Is only being read.
2178
+ *
2179
+ * \returns half2
2180
+ * \retval The elementwise division of \p a with \p b.
2181
+ * \internal
2182
+ * \exception-guarantee no-throw guarantee
2183
+ * \behavior reentrant, thread safe
2184
+ * \endinternal
2185
+ */
2186
+ __CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b);
2187
+ /**
2188
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
2189
+ * \brief Calculates the absolute value of both halves of the input \p half2 number and
2190
+ * returns the result.
2191
+ *
2192
+ * \details Calculates the absolute value of both halves of the input \p half2 number and
2193
+ * returns the result.
2194
+ * \param[in] a - half2. Is only being read.
2195
+ *
2196
+ * \returns half2
2197
+ * \retval Returns \p a with the absolute value of both halves.
2198
+ * \internal
2199
+ * \exception-guarantee no-throw guarantee
2200
+ * \behavior reentrant, thread safe
2201
+ * \endinternal
2202
+ */
2203
+ __CUDA_FP16_DECL__ __half2 __habs2(const __half2 a);
2204
+ /**
2205
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
2206
+ * \brief Performs \p half2 vector addition in round-to-nearest-even mode, with
2207
+ * saturation to [0.0, 1.0].
2208
+ *
2209
+ * \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest-even
2210
+ * mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to
2211
+ * +0.0.
2212
+ * \param[in] a - half2. Is only being read.
2213
+ * \param[in] b - half2. Is only being read.
2214
+ *
2215
+ * \returns half2
2216
+ * \retval The sum of \p a and \p b, with respect to saturation.
2217
+ * \internal
2218
+ * \exception-guarantee no-throw guarantee
2219
+ * \behavior reentrant, thread safe
2220
+ * \endinternal
2221
+ */
2222
+ __CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b);
2223
+ /**
2224
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
2225
+ * \brief Performs \p half2 vector subtraction in round-to-nearest-even mode,
2226
+ * with saturation to [0.0, 1.0].
2227
+ *
2228
+ * \details Subtracts \p half2 input vector \p b from input vector \p a in
2229
+ * round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
2230
+ * results are flushed to +0.0.
2231
+ * \param[in] a - half2. Is only being read.
2232
+ * \param[in] b - half2. Is only being read.
2233
+ *
2234
+ * \returns half2
2235
+ * \retval The subtraction of vector \p b from \p a, with respect to saturation.
2236
+ * \internal
2237
+ * \exception-guarantee no-throw guarantee
2238
+ * \behavior reentrant, thread safe
2239
+ * \endinternal
2240
+ */
2241
+ __CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b);
2242
+ /**
2243
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
2244
+ * \brief Performs \p half2 vector multiplication in round-to-nearest-even mode,
2245
+ * with saturation to [0.0, 1.0].
2246
+ *
2247
+ * \details Performs \p half2 vector multiplication of inputs \p a and \p b, in
2248
+ * round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
2249
+ * results are flushed to +0.0.
2250
+ * \param[in] a - half2. Is only being read.
2251
+ * \param[in] b - half2. Is only being read.
2252
+ *
2253
+ * \returns half2
2254
+ * \retval The result of elementwise multiplication of vectors \p a and \p b,
2255
+ * with respect to saturation.
2256
+ * \internal
2257
+ * \exception-guarantee no-throw guarantee
2258
+ * \behavior reentrant, thread safe
2259
+ * \endinternal
2260
+ */
2261
+ __CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b);
2262
+ /**
2263
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
2264
+ * \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even
2265
+ * mode.
2266
+ *
2267
+ * \details Performs \p half2 vector multiply on inputs \p a and \p b,
2268
+ * then performs a \p half2 vector add of the result with \p c,
2269
+ * rounding the result once in round-to-nearest-even mode.
2270
+ * \internal
2271
+ * \req DEEPLEARN-SRM_REQ-105
2272
+ * \endinternal
2273
+ * \param[in] a - half2. Is only being read.
2274
+ * \param[in] b - half2. Is only being read.
2275
+ * \param[in] c - half2. Is only being read.
2276
+ *
2277
+ * \returns half2
2278
+ * \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c.
2279
+ * \internal
2280
+ * \exception-guarantee no-throw guarantee
2281
+ * \behavior reentrant, thread safe
2282
+ * \endinternal
2283
+ */
2284
+ __CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c);
2285
+ /**
2286
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
2287
+ * \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even
2288
+ * mode, with saturation to [0.0, 1.0].
2289
+ *
2290
+ * \details Performs \p half2 vector multiply on inputs \p a and \p b,
2291
+ * then performs a \p half2 vector add of the result with \p c,
2292
+ * rounding the result once in round-to-nearest-even mode, and clamps the
2293
+ * results to range [0.0, 1.0]. NaN results are flushed to +0.0.
2294
+ * \param[in] a - half2. Is only being read.
2295
+ * \param[in] b - half2. Is only being read.
2296
+ * \param[in] c - half2. Is only being read.
2297
+ *
2298
+ * \returns half2
2299
+ * \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c,
2300
+ * with respect to saturation.
2301
+ * \internal
2302
+ * \exception-guarantee no-throw guarantee
2303
+ * \behavior reentrant, thread safe
2304
+ * \endinternal
2305
+ */
2306
+ __CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c);
2307
+ /**
2308
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
2309
+ * \brief Negates both halves of the input \p half2 number and returns the
2310
+ * result.
2311
+ *
2312
+ * \details Negates both halves of the input \p half2 number \p a and returns the result.
2313
+ * \internal
2314
+ * \req DEEPLEARN-SRM_REQ-101
2315
+ * \endinternal
2316
+ * \param[in] a - half2. Is only being read.
2317
+ *
2318
+ * \returns half2
2319
+ * \retval Returns \p a with both halves negated.
2320
+ * \internal
2321
+ * \exception-guarantee no-throw guarantee
2322
+ * \behavior reentrant, thread safe
2323
+ * \endinternal
2324
+ */
2325
+ __CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a);
2326
+ /**
2327
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
2328
+ * \brief Calculates the absolute value of input \p half number and returns the result.
2329
+ *
2330
+ * \details Calculates the absolute value of input \p half number and returns the result.
2331
+ * \param[in] a - half. Is only being read.
2332
+ *
2333
+ * \returns half
2334
+ * \retval The absolute value of \p a.
2335
+ * \internal
2336
+ * \exception-guarantee no-throw guarantee
2337
+ * \behavior reentrant, thread safe
2338
+ * \endinternal
2339
+ */
2340
+ __CUDA_FP16_DECL__ __half __habs(const __half a);
2341
+ /**
2342
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
2343
+ * \brief Performs \p half addition in round-to-nearest-even mode.
2344
+ *
2345
+ * \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even
2346
+ * mode.
2347
+ * \internal
2348
+ * \req DEEPLEARN-SRM_REQ-94
2349
+ * \endinternal
2350
+ * \param[in] a - half. Is only being read.
2351
+ * \param[in] b - half. Is only being read.
2352
+ *
2353
+ * \returns half
2354
+ * \retval The sum of \p a and \p b.
2355
+ * \internal
2356
+ * \exception-guarantee no-throw guarantee
2357
+ * \behavior reentrant, thread safe
2358
+ * \endinternal
2359
+ */
2360
+ __CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b);
2361
+ /**
2362
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
2363
+ * \brief Performs \p half subtraction in round-to-nearest-even mode.
2364
+ *
2365
+ * \details Subtracts \p half input \p b from input \p a in round-to-nearest-even
2366
+ * mode.
2367
+ * \internal
2368
+ * \req DEEPLEARN-SRM_REQ-97
2369
+ * \endinternal
2370
+ * \param[in] a - half. Is only being read.
2371
+ * \param[in] b - half. Is only being read.
2372
+ *
2373
+ * \returns half
2374
+ * \retval The result of subtracting \p b from \p a.
2375
+ * \internal
2376
+ * \exception-guarantee no-throw guarantee
2377
+ * \behavior reentrant, thread safe
2378
+ * \endinternal
2379
+ */
2380
+ __CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b);
2381
+ /**
2382
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
2383
+ * \brief Performs \p half multiplication in round-to-nearest-even mode.
2384
+ *
2385
+ * \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even
2386
+ * mode.
2387
+ * \internal
2388
+ * \req DEEPLEARN-SRM_REQ-99
2389
+ * \endinternal
2390
+ * \param[in] a - half. Is only being read.
2391
+ * \param[in] b - half. Is only being read.
2392
+ *
2393
+ * \returns half
2394
+ * \retval The result of multiplying \p a and \p b.
2395
+ */
2396
+ __CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b);
2397
+ /**
2398
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
2399
+ * \brief Performs \p half division in round-to-nearest-even mode.
2400
+ *
2401
+ * \details Divides \p half input \p a by input \p b in round-to-nearest-even
2402
+ * mode.
2403
+ * \internal
2404
+ * \req DEEPLEARN-SRM_REQ-98
2405
+ * \endinternal
2406
+ * \param[in] a - half. Is only being read.
2407
+ * \param[in] b - half. Is only being read.
2408
+ *
2409
+ * \returns half
2410
+ * \retval The result of dividing \p a by \p b.
2411
+ * \internal
2412
+ * \exception-guarantee no-throw guarantee
2413
+ * \behavior reentrant, thread safe
2414
+ * \endinternal
2415
+ */
2416
+ __CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b);
2417
+ /**
2418
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
2419
+ * \brief Performs \p half addition in round-to-nearest-even mode, with
2420
+ * saturation to [0.0, 1.0].
2421
+ *
2422
+ * \details Performs \p half add of inputs \p a and \p b, in round-to-nearest-even mode,
2423
+ * and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0.
2424
+ * \param[in] a - half. Is only being read.
2425
+ * \param[in] b - half. Is only being read.
2426
+ *
2427
+ * \returns half
2428
+ * \retval The sum of \p a and \p b, with respect to saturation.
2429
+ * \internal
2430
+ * \exception-guarantee no-throw guarantee
2431
+ * \behavior reentrant, thread safe
2432
+ * \endinternal
2433
+ */
2434
+ __CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b);
2435
+ /**
2436
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
2437
+ * \brief Performs \p half subtraction in round-to-nearest-even mode, with
2438
+ * saturation to [0.0, 1.0].
2439
+ *
2440
+ * \details Subtracts \p half input \p b from input \p a in round-to-nearest-even
2441
+ * mode,
2442
+ * and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0.
2443
+ * \param[in] a - half. Is only being read.
2444
+ * \param[in] b - half. Is only being read.
2445
+ *
2446
+ * \returns half
2447
+ * \retval The result of subtraction of \p b from \p a, with respect to saturation.
2448
+ * \internal
2449
+ * \exception-guarantee no-throw guarantee
2450
+ * \behavior reentrant, thread safe
2451
+ * \endinternal
2452
+ */
2453
+ __CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b);
2454
+ /**
2455
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
2456
+ * \brief Performs \p half multiplication in round-to-nearest-even mode, with
2457
+ * saturation to [0.0, 1.0].
2458
+ *
2459
+ * \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest-even
2460
+ * mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to
2461
+ * +0.0.
2462
+ * \param[in] a - half. Is only being read.
2463
+ * \param[in] b - half. Is only being read.
2464
+ *
2465
+ * \returns half
2466
+ * \retval The result of multiplying \p a and \p b, with respect to saturation.
2467
+ * \internal
2468
+ * \exception-guarantee no-throw guarantee
2469
+ * \behavior reentrant, thread safe
2470
+ * \endinternal
2471
+ */
2472
+ __CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b);
2473
+ /**
2474
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
2475
+ * \brief Performs \p half fused multiply-add in round-to-nearest-even mode.
2476
+ *
2477
+ * \details Performs \p half multiply on inputs \p a and \p b,
2478
+ * then performs a \p half add of the result with \p c,
2479
+ * rounding the result once in round-to-nearest-even mode.
2480
+ * \internal
2481
+ * \req DEEPLEARN-SRM_REQ-96
2482
+ * \endinternal
2483
+ * \param[in] a - half. Is only being read.
2484
+ * \param[in] b - half. Is only being read.
2485
+ * \param[in] c - half. Is only being read.
2486
+ *
2487
+ * \returns half
2488
+ * \retval The result of fused multiply-add operation on \p
2489
+ * a, \p b, and \p c.
2490
+ * \internal
2491
+ * \exception-guarantee no-throw guarantee
2492
+ * \behavior reentrant, thread safe
2493
+ * \endinternal
2494
+ */
2495
+ __CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c);
2496
+ /**
2497
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
2498
+ * \brief Performs \p half fused multiply-add in round-to-nearest-even mode,
2499
+ * with saturation to [0.0, 1.0].
2500
+ *
2501
+ * \details Performs \p half multiply on inputs \p a and \p b,
2502
+ * then performs a \p half add of the result with \p c,
2503
+ * rounding the result once in round-to-nearest-even mode, and clamps the result
2504
+ * to range [0.0, 1.0]. NaN results are flushed to +0.0.
2505
+ * \param[in] a - half. Is only being read.
2506
+ * \param[in] b - half. Is only being read.
2507
+ * \param[in] c - half. Is only being read.
2508
+ *
2509
+ * \returns half
2510
+ * \retval The result of fused multiply-add operation on \p
2511
+ * a, \p b, and \p c, with respect to saturation.
2512
+ * \internal
2513
+ * \exception-guarantee no-throw guarantee
2514
+ * \behavior reentrant, thread safe
2515
+ * \endinternal
2516
+ */
2517
+ __CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c);
2518
+ /**
2519
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
2520
+ * \brief Negates input \p half number and returns the result.
2521
+ *
2522
+ * \details Negates input \p half number and returns the result.
2523
+ * \internal
2524
+ * \req DEEPLEARN-SRM_REQ-100
2525
+ * \endinternal
2526
+ * \param[in] a - half. Is only being read.
2527
+ *
2528
+ * \returns half
2529
+ * \retval The negated value of \p a.
2530
+ * \internal
2531
+ * \exception-guarantee no-throw guarantee
2532
+ * \behavior reentrant, thread safe
2533
+ * \endinternal
2534
+ */
2535
+ __CUDA_FP16_DECL__ __half __hneg(const __half a);
2536
+ /**
2537
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
2538
+ * \brief Performs \p half2 vector if-equal comparison and returns boolean true
2539
+ * iff both \p half results are true, boolean false otherwise.
2540
+ *
2541
+ * \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
2542
+ * The bool result is set to true only if both \p half if-equal comparisons
2543
+ * evaluate to true, or false otherwise.
2544
+ * NaN inputs generate false results.
2545
+ * \param[in] a - half2. Is only being read.
2546
+ * \param[in] b - half2. Is only being read.
2547
+ *
2548
+ * \returns bool
2549
+ * \retval true if both \p half results of if-equal comparison
2550
+ * of vectors \p a and \p b are true;
2551
+ * \retval false otherwise.
2552
+ * \internal
2553
+ * \exception-guarantee no-throw guarantee
2554
+ * \behavior reentrant, thread safe
2555
+ * \endinternal
2556
+ */
2557
+ __CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b);
2558
+ /**
2559
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
2560
+ * \brief Performs \p half2 vector not-equal comparison and returns boolean
2561
+ * true iff both \p half results are true, boolean false otherwise.
2562
+ *
2563
+ * \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
2564
+ * The bool result is set to true only if both \p half not-equal comparisons
2565
+ * evaluate to true, or false otherwise.
2566
+ * NaN inputs generate false results.
2567
+ * \param[in] a - half2. Is only being read.
2568
+ * \param[in] b - half2. Is only being read.
2569
+ *
2570
+ * \returns bool
2571
+ * \retval true if both \p half results of not-equal comparison
2572
+ * of vectors \p a and \p b are true;
2573
+ * \retval false otherwise.
2574
+ * \internal
2575
+ * \exception-guarantee no-throw guarantee
2576
+ * \behavior reentrant, thread safe
2577
+ * \endinternal
2578
+ */
2579
+ __CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b);
2580
+ /**
2581
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
2582
+ * \brief Performs \p half2 vector less-equal comparison and returns boolean
2583
+ * true iff both \p half results are true, boolean false otherwise.
2584
+ *
2585
+ * \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
2586
+ * The bool result is set to true only if both \p half less-equal comparisons
2587
+ * evaluate to true, or false otherwise.
2588
+ * NaN inputs generate false results.
2589
+ * \param[in] a - half2. Is only being read.
2590
+ * \param[in] b - half2. Is only being read.
2591
+ *
2592
+ * \returns bool
2593
+ * \retval true if both \p half results of less-equal comparison
2594
+ * of vectors \p a and \p b are true;
2595
+ * \retval false otherwise.
2596
+ * \internal
2597
+ * \exception-guarantee no-throw guarantee
2598
+ * \behavior reentrant, thread safe
2599
+ * \endinternal
2600
+ */
2601
+ __CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b);
2602
+ /**
2603
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
2604
+ * \brief Performs \p half2 vector greater-equal comparison and returns boolean
2605
+ * true iff both \p half results are true, boolean false otherwise.
2606
+ *
2607
+ * \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
2608
+ * The bool result is set to true only if both \p half greater-equal comparisons
2609
+ * evaluate to true, or false otherwise.
2610
+ * NaN inputs generate false results.
2611
+ * \param[in] a - half2. Is only being read.
2612
+ * \param[in] b - half2. Is only being read.
2613
+ *
2614
+ * \returns bool
2615
+ * \retval true if both \p half results of greater-equal
2616
+ * comparison of vectors \p a and \p b are true;
2617
+ * \retval false otherwise.
2618
+ * \internal
2619
+ * \exception-guarantee no-throw guarantee
2620
+ * \behavior reentrant, thread safe
2621
+ * \endinternal
2622
+ */
2623
+ __CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b);
2624
+ /**
2625
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
2626
+ * \brief Performs \p half2 vector less-than comparison and returns boolean
2627
+ * true iff both \p half results are true, boolean false otherwise.
2628
+ *
2629
+ * \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
2630
+ * The bool result is set to true only if both \p half less-than comparisons
2631
+ * evaluate to true, or false otherwise.
2632
+ * NaN inputs generate false results.
2633
+ * \param[in] a - half2. Is only being read.
2634
+ * \param[in] b - half2. Is only being read.
2635
+ *
2636
+ * \returns bool
2637
+ * \retval true if both \p half results of less-than comparison
2638
+ * of vectors \p a and \p b are true;
2639
+ * \retval false otherwise.
2640
+ * \internal
2641
+ * \exception-guarantee no-throw guarantee
2642
+ * \behavior reentrant, thread safe
2643
+ * \endinternal
2644
+ */
2645
+ __CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b);
2646
+ /**
2647
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
2648
+ * \brief Performs \p half2 vector greater-than comparison and returns boolean
2649
+ * true iff both \p half results are true, boolean false otherwise.
2650
+ *
2651
+ * \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
2652
+ * The bool result is set to true only if both \p half greater-than comparisons
2653
+ * evaluate to true, or false otherwise.
2654
+ * NaN inputs generate false results.
2655
+ * \param[in] a - half2. Is only being read.
2656
+ * \param[in] b - half2. Is only being read.
2657
+ *
2658
+ * \returns bool
2659
+ * \retval true if both \p half results of greater-than
2660
+ * comparison of vectors \p a and \p b are true;
2661
+ * \retval false otherwise.
2662
+ * \internal
2663
+ * \exception-guarantee no-throw guarantee
2664
+ * \behavior reentrant, thread safe
2665
+ * \endinternal
2666
+ */
2667
+ __CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b);
2668
+ /**
2669
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
2670
+ * \brief Performs \p half2 vector unordered if-equal comparison and returns
2671
+ * boolean true iff both \p half results are true, boolean false otherwise.
2672
+ *
2673
+ * \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
2674
+ * The bool result is set to true only if both \p half if-equal comparisons
2675
+ * evaluate to true, or false otherwise.
2676
+ * NaN inputs generate true results.
2677
+ * \param[in] a - half2. Is only being read.
2678
+ * \param[in] b - half2. Is only being read.
2679
+ *
2680
+ * \returns bool
2681
+ * \retval true if both \p half results of unordered if-equal
2682
+ * comparison of vectors \p a and \p b are true;
2683
+ * \retval false otherwise.
2684
+ * \internal
2685
+ * \exception-guarantee no-throw guarantee
2686
+ * \behavior reentrant, thread safe
2687
+ * \endinternal
2688
+ */
2689
+ __CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b);
2690
+ /**
2691
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
2692
+ * \brief Performs \p half2 vector unordered not-equal comparison and returns
2693
+ * boolean true iff both \p half results are true, boolean false otherwise.
2694
+ *
2695
+ * \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
2696
+ * The bool result is set to true only if both \p half not-equal comparisons
2697
+ * evaluate to true, or false otherwise.
2698
+ * NaN inputs generate true results.
2699
+ * \param[in] a - half2. Is only being read.
2700
+ * \param[in] b - half2. Is only being read.
2701
+ *
2702
+ * \returns bool
2703
+ * \retval true if both \p half results of unordered not-equal
2704
+ * comparison of vectors \p a and \p b are true;
2705
+ * \retval false otherwise.
2706
+ * \internal
2707
+ * \exception-guarantee no-throw guarantee
2708
+ * \behavior reentrant, thread safe
2709
+ * \endinternal
2710
+ */
2711
+ __CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b);
2712
+ /**
2713
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
2714
+ * \brief Performs \p half2 vector unordered less-equal comparison and returns
2715
+ * boolean true iff both \p half results are true, boolean false otherwise.
2716
+ *
2717
+ * \details Performs \p half2 vector unordered less-equal comparison of inputs \p a and \p b.
2718
+ * The bool result is set to true only if both \p half less-equal comparisons
2719
+ * evaluate to true, or false otherwise.
2720
+ * NaN inputs generate true results.
2721
+ * \param[in] a - half2. Is only being read.
2722
+ * \param[in] b - half2. Is only being read.
2723
+ *
2724
+ * \returns bool
2725
+ * \retval true if both \p half results of unordered less-equal
2726
+ * comparison of vectors \p a and \p b are true;
2727
+ * \retval false otherwise.
2728
+ * \internal
2729
+ * \exception-guarantee no-throw guarantee
2730
+ * \behavior reentrant, thread safe
2731
+ * \endinternal
2732
+ */
2733
+ __CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b);
2734
+ /**
2735
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
2736
+ * \brief Performs \p half2 vector unordered greater-equal comparison and
2737
+ * returns boolean true iff both \p half results are true, boolean false
2738
+ * otherwise.
2739
+ *
2740
+ * \details Performs \p half2 vector unordered greater-equal comparison of inputs \p a and \p b.
2741
+ * The bool result is set to true only if both \p half greater-equal comparisons
2742
+ * evaluate to true, or false otherwise.
2743
+ * NaN inputs generate true results.
2744
+ * \param[in] a - half2. Is only being read.
2745
+ * \param[in] b - half2. Is only being read.
2746
+ *
2747
+ * \returns bool
2748
+ * \retval true if both \p half results of unordered
2749
+ * greater-equal comparison of vectors \p a and \p b are true;
2750
+ * \retval false otherwise.
2751
+ * \internal
2752
+ * \exception-guarantee no-throw guarantee
2753
+ * \behavior reentrant, thread safe
2754
+ * \endinternal
2755
+ */
2756
+ __CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b);
2757
+ /**
2758
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
2759
+ * \brief Performs \p half2 vector unordered less-than comparison and returns
2760
+ * boolean true iff both \p half results are true, boolean false otherwise.
2761
+ *
2762
+ * \details Performs \p half2 vector unordered less-than comparison of inputs \p a and \p b.
2763
+ * The bool result is set to true only if both \p half less-than comparisons
2764
+ * evaluate to true, or false otherwise.
2765
+ * NaN inputs generate true results.
2766
+ * \param[in] a - half2. Is only being read.
2767
+ * \param[in] b - half2. Is only being read.
2768
+ *
2769
+ * \returns bool
2770
+ * \retval true if both \p half results of unordered less-than comparison of
2771
+ * vectors \p a and \p b are true;
2772
+ * \retval false otherwise.
2773
+ * \internal
2774
+ * \exception-guarantee no-throw guarantee
2775
+ * \behavior reentrant, thread safe
2776
+ * \endinternal
2777
+ */
2778
+ __CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b);
2779
+ /**
2780
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
2781
+ * \brief Performs \p half2 vector unordered greater-than comparison and
2782
+ * returns boolean true iff both \p half results are true, boolean false
2783
+ * otherwise.
2784
+ *
2785
+ * \details Performs \p half2 vector unordered greater-than comparison of inputs \p a and \p b.
2786
+ * The bool result is set to true only if both \p half greater-than comparisons
2787
+ * evaluate to true, or false otherwise.
2788
+ * NaN inputs generate true results.
2789
+ * \param[in] a - half2. Is only being read.
2790
+ * \param[in] b - half2. Is only being read.
2791
+ *
2792
+ * \returns bool
2793
+ * \retval true if both \p half results of unordered
2794
+ * greater-than comparison of vectors \p a and \p b are true;
2795
+ * \retval false otherwise.
2796
+ * \internal
2797
+ * \exception-guarantee no-throw guarantee
2798
+ * \behavior reentrant, thread safe
2799
+ * \endinternal
2800
+ */
2801
+ __CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b);
2802
+ /**
2803
+ * \ingroup CUDA_MATH__HALF_COMPARISON
2804
+ * \brief Performs \p half if-equal comparison.
2805
+ *
2806
+ * \details Performs \p half if-equal comparison of inputs \p a and \p b.
2807
+ * NaN inputs generate false results.
2808
+ * \param[in] a - half. Is only being read.
2809
+ * \param[in] b - half. Is only being read.
2810
+ *
2811
+ * \returns bool
2812
+ * \retval The boolean result of if-equal comparison of \p a and \p b.
2813
+ * \internal
2814
+ * \exception-guarantee no-throw guarantee
2815
+ * \behavior reentrant, thread safe
2816
+ * \endinternal
2817
+ */
2818
+ __CUDA_FP16_DECL__ bool __heq(const __half a, const __half b);
2819
+ /**
2820
+ * \ingroup CUDA_MATH__HALF_COMPARISON
2821
+ * \brief Performs \p half not-equal comparison.
2822
+ *
2823
+ * \details Performs \p half not-equal comparison of inputs \p a and \p b.
2824
+ * NaN inputs generate false results.
2825
+ * \param[in] a - half. Is only being read.
2826
+ * \param[in] b - half. Is only being read.
2827
+ *
2828
+ * \returns bool
2829
+ * \retval The boolean result of not-equal comparison of \p a and \p b.
2830
+ * \internal
2831
+ * \exception-guarantee no-throw guarantee
2832
+ * \behavior reentrant, thread safe
2833
+ * \endinternal
2834
+ */
2835
+ __CUDA_FP16_DECL__ bool __hne(const __half a, const __half b);
2836
+ /**
2837
+ * \ingroup CUDA_MATH__HALF_COMPARISON
2838
+ * \brief Performs \p half less-equal comparison.
2839
+ *
2840
+ * \details Performs \p half less-equal comparison of inputs \p a and \p b.
2841
+ * NaN inputs generate false results.
2842
+ * \param[in] a - half. Is only being read.
2843
+ * \param[in] b - half. Is only being read.
2844
+ *
2845
+ * \returns bool
2846
+ * \retval The boolean result of less-equal comparison of \p a and \p b.
2847
+ * \internal
2848
+ * \exception-guarantee no-throw guarantee
2849
+ * \behavior reentrant, thread safe
2850
+ * \endinternal
2851
+ */
2852
+ __CUDA_FP16_DECL__ bool __hle(const __half a, const __half b);
2853
+ /**
2854
+ * \ingroup CUDA_MATH__HALF_COMPARISON
2855
+ * \brief Performs \p half greater-equal comparison.
2856
+ *
2857
+ * \details Performs \p half greater-equal comparison of inputs \p a and \p b.
2858
+ * NaN inputs generate false results.
2859
+ * \param[in] a - half. Is only being read.
2860
+ * \param[in] b - half. Is only being read.
2861
+ *
2862
+ * \returns bool
2863
+ * \retval The boolean result of greater-equal comparison of \p a and \p b.
2864
+ * \internal
2865
+ * \exception-guarantee no-throw guarantee
2866
+ * \behavior reentrant, thread safe
2867
+ * \endinternal
2868
+ */
2869
+ __CUDA_FP16_DECL__ bool __hge(const __half a, const __half b);
2870
+ /**
2871
+ * \ingroup CUDA_MATH__HALF_COMPARISON
2872
+ * \brief Performs \p half less-than comparison.
2873
+ *
2874
+ * \details Performs \p half less-than comparison of inputs \p a and \p b.
2875
+ * NaN inputs generate false results.
2876
+ * \param[in] a - half. Is only being read.
2877
+ * \param[in] b - half. Is only being read.
2878
+ *
2879
+ * \returns bool
2880
+ * \retval The boolean result of less-than comparison of \p a and \p b.
2881
+ * \internal
2882
+ * \exception-guarantee no-throw guarantee
2883
+ * \behavior reentrant, thread safe
2884
+ * \endinternal
2885
+ */
2886
+ __CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b);
2887
+ /**
2888
+ * \ingroup CUDA_MATH__HALF_COMPARISON
2889
+ * \brief Performs \p half greater-than comparison.
2890
+ *
2891
+ * \details Performs \p half greater-than comparison of inputs \p a and \p b.
2892
+ * NaN inputs generate false results.
2893
+ * \param[in] a - half. Is only being read.
2894
+ * \param[in] b - half. Is only being read.
2895
+ *
2896
+ * \returns bool
2897
+ * \retval The boolean result of greater-than comparison of \p a and \p b.
2898
+ * \internal
2899
+ * \exception-guarantee no-throw guarantee
2900
+ * \behavior reentrant, thread safe
2901
+ * \endinternal
2902
+ */
2903
+ __CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b);
2904
+ /**
2905
+ * \ingroup CUDA_MATH__HALF_COMPARISON
2906
+ * \brief Performs \p half unordered if-equal comparison.
2907
+ *
2908
+ * \details Performs \p half unordered if-equal comparison of inputs \p a and \p b.
2909
+ * NaN inputs generate true results.
2910
+ * \param[in] a - half. Is only being read.
2911
+ * \param[in] b - half. Is only being read.
2912
+ *
2913
+ * \returns bool
2914
+ * \retval The boolean result of unordered if-equal comparison of \p a and
2915
+ * \p b.
2916
+ * \internal
2917
+ * \exception-guarantee no-throw guarantee
2918
+ * \behavior reentrant, thread safe
2919
+ * \endinternal
2920
+ */
2921
+ __CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b);
2922
+ /**
2923
+ * \ingroup CUDA_MATH__HALF_COMPARISON
2924
+ * \brief Performs \p half unordered not-equal comparison.
2925
+ *
2926
+ * \details Performs \p half unordered not-equal comparison of inputs \p a and \p b.
2927
+ * NaN inputs generate true results.
2928
+ * \param[in] a - half. Is only being read.
2929
+ * \param[in] b - half. Is only being read.
2930
+ *
2931
+ * \returns bool
2932
+ * \retval The boolean result of unordered not-equal comparison of \p a and
2933
+ * \p b.
2934
+ * \internal
2935
+ * \exception-guarantee no-throw guarantee
2936
+ * \behavior reentrant, thread safe
2937
+ * \endinternal
2938
+ */
2939
+ __CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b);
2940
+ /**
2941
+ * \ingroup CUDA_MATH__HALF_COMPARISON
2942
+ * \brief Performs \p half unordered less-equal comparison.
2943
+ *
2944
+ * \details Performs \p half unordered less-equal comparison of inputs \p a and \p b.
2945
+ * NaN inputs generate true results.
2946
+ * \param[in] a - half. Is only being read.
2947
+ * \param[in] b - half. Is only being read.
2948
+ *
2949
+ * \returns bool
2950
+ * \retval The boolean result of unordered less-equal comparison of \p a and
2951
+ * \p b.
2952
+ * \internal
2953
+ * \exception-guarantee no-throw guarantee
2954
+ * \behavior reentrant, thread safe
2955
+ * \endinternal
2956
+ */
2957
+ __CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b);
2958
+ /**
2959
+ * \ingroup CUDA_MATH__HALF_COMPARISON
2960
+ * \brief Performs \p half unordered greater-equal comparison.
2961
+ *
2962
+ * \details Performs \p half unordered greater-equal comparison of inputs \p a and \p b.
2963
+ * NaN inputs generate true results.
2964
+ * \param[in] a - half. Is only being read.
2965
+ * \param[in] b - half. Is only being read.
2966
+ *
2967
+ * \returns bool
2968
+ * \retval The boolean result of unordered greater-equal comparison of \p a
2969
+ * and \p b.
2970
+ * \internal
2971
+ * \exception-guarantee no-throw guarantee
2972
+ * \behavior reentrant, thread safe
2973
+ * \endinternal
2974
+ */
2975
+ __CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b);
2976
+ /**
2977
+ * \ingroup CUDA_MATH__HALF_COMPARISON
2978
+ * \brief Performs \p half unordered less-than comparison.
2979
+ *
2980
+ * \details Performs \p half unordered less-than comparison of inputs \p a and \p b.
2981
+ * NaN inputs generate true results.
2982
+ * \param[in] a - half. Is only being read.
2983
+ * \param[in] b - half. Is only being read.
2984
+ *
2985
+ * \returns bool
2986
+ * \retval The boolean result of unordered less-than comparison of \p a and
2987
+ * \p b.
2988
+ * \internal
2989
+ * \exception-guarantee no-throw guarantee
2990
+ * \behavior reentrant, thread safe
2991
+ * \endinternal
2992
+ */
2993
+ __CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b);
2994
+ /**
2995
+ * \ingroup CUDA_MATH__HALF_COMPARISON
2996
+ * \brief Performs \p half unordered greater-than comparison.
2997
+ *
2998
+ * \details Performs \p half unordered greater-than comparison of inputs \p a and \p b.
2999
+ * NaN inputs generate true results.
3000
+ * \param[in] a - half. Is only being read.
3001
+ * \param[in] b - half. Is only being read.
3002
+ *
3003
+ * \returns bool
3004
+ * \retval The boolean result of unordered greater-than comparison of \p a
3005
+ * and \p b.
3006
+ * \internal
3007
+ * \exception-guarantee no-throw guarantee
3008
+ * \behavior reentrant, thread safe
3009
+ * \endinternal
3010
+ */
3011
+ __CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b);
3012
+ /**
3013
+ * \ingroup CUDA_MATH__HALF_COMPARISON
3014
+ * \brief Determine whether \p half argument is a NaN.
3015
+ *
3016
+ * \details Determine whether \p half value \p a is a NaN.
3017
+ * \param[in] a - half. Is only being read.
3018
+ *
3019
+ * \returns bool
3020
+ * \retval true iff argument is NaN.
3021
+ * \internal
3022
+ * \exception-guarantee no-throw guarantee
3023
+ * \behavior reentrant, thread safe
3024
+ * \endinternal
3025
+ */
3026
+ __CUDA_FP16_DECL__ bool __hisnan(const __half a);
3027
+ #if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)
3028
+ /**
3029
+ * \ingroup CUDA_MATH__HALF_COMPARISON
3030
+ * \brief Calculates \p half maximum of two input values.
3031
+ *
3032
+ * \details Calculates \p half max(\p a, \p b)
3033
+ * defined as (\p a > \p b) ? \p a : \p b.
3034
+ * - If either of inputs is NaN, the other input is returned.
3035
+ * - If both inputs are NaNs, then canonical NaN is returned.
3036
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
3037
+ * \param[in] a - half. Is only being read.
3038
+ * \param[in] b - half. Is only being read.
3039
+ *
3040
+ * \returns half
3041
+ * \internal
3042
+ * \exception-guarantee no-throw guarantee
3043
+ * \behavior reentrant, thread safe
3044
+ * \endinternal
3045
+ */
3046
+ __CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b);
3047
+ /**
3048
+ * \ingroup CUDA_MATH__HALF_COMPARISON
3049
+ * \brief Calculates \p half minimum of two input values.
3050
+ *
3051
+ * \details Calculates \p half min(\p a, \p b)
3052
+ * defined as (\p a < \p b) ? \p a : \p b.
3053
+ * - If either of inputs is NaN, the other input is returned.
3054
+ * - If both inputs are NaNs, then canonical NaN is returned.
3055
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
3056
+ * \param[in] a - half. Is only being read.
3057
+ * \param[in] b - half. Is only being read.
3058
+ *
3059
+ * \returns half
3060
+ * \internal
3061
+ * \exception-guarantee no-throw guarantee
3062
+ * \behavior reentrant, thread safe
3063
+ * \endinternal
3064
+ */
3065
+ __CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b);
3066
+ /**
3067
+ * \ingroup CUDA_MATH__HALF_COMPARISON
3068
+ * \brief Calculates \p half maximum of two input values, NaNs pass through.
3069
+ *
3070
+ * \details Calculates \p half max(\p a, \p b)
3071
+ * defined as (\p a > \p b) ? \p a : \p b.
3072
+ * - If either of inputs is NaN, then canonical NaN is returned.
3073
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
3074
+ * \param[in] a - half. Is only being read.
3075
+ * \param[in] b - half. Is only being read.
3076
+ *
3077
+ * \returns half
3078
+ * \internal
3079
+ * \exception-guarantee no-throw guarantee
3080
+ * \behavior reentrant, thread safe
3081
+ * \endinternal
3082
+ */
3083
+ __CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b);
3084
+ /**
3085
+ * \ingroup CUDA_MATH__HALF_COMPARISON
3086
+ * \brief Calculates \p half minimum of two input values, NaNs pass through.
3087
+ *
3088
+ * \details Calculates \p half min(\p a, \p b)
3089
+ * defined as (\p a < \p b) ? \p a : \p b.
3090
+ * - If either of inputs is NaN, then canonical NaN is returned.
3091
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
3092
+ * \param[in] a - half. Is only being read.
3093
+ * \param[in] b - half. Is only being read.
3094
+ *
3095
+ * \returns half
3096
+ * \internal
3097
+ * \exception-guarantee no-throw guarantee
3098
+ * \behavior reentrant, thread safe
3099
+ * \endinternal
3100
+ */
3101
+ __CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b);
3102
+ /**
3103
+ * \ingroup CUDA_MATH__HALF_ARITHMETIC
3104
+ * \brief Performs \p half fused multiply-add in round-to-nearest-even mode with relu saturation.
3105
+ *
3106
+ * \details Performs \p half multiply on inputs \p a and \p b,
3107
+ * then performs a \p half add of the result with \p c,
3108
+ * rounding the result once in round-to-nearest-even mode.
3109
+ * Then a negative result is clamped to 0.
3110
+ * NaN result is converted to canonical NaN.
3111
+ * \param[in] a - half. Is only being read.
3112
+ * \param[in] b - half. Is only being read.
3113
+ * \param[in] c - half. Is only being read.
3114
+ *
3115
+ * \returns half
3116
+ * \retval The result of fused multiply-add operation on \p
3117
+ * a, \p b, and \p c with relu saturation.
3118
+ * \internal
3119
+ * \exception-guarantee no-throw guarantee
3120
+ * \behavior reentrant, thread safe
3121
+ * \endinternal
3122
+ */
3123
+ __CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c);
3124
+ /**
3125
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
3126
+ * \brief Calculates \p half2 vector maximum of two inputs.
3127
+ *
3128
+ * \details Calculates \p half2 vector max(\p a, \p b).
3129
+ * Elementwise \p half operation is defined as
3130
+ * (\p a > \p b) ? \p a : \p b.
3131
+ * - If either of inputs is NaN, the other input is returned.
3132
+ * - If both inputs are NaNs, then canonical NaN is returned.
3133
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
3134
+ * \param[in] a - half2. Is only being read.
3135
+ * \param[in] b - half2. Is only being read.
3136
+ *
3137
+ * \returns half2
3138
+ * \retval The result of elementwise maximum of vectors \p a and \p b
3139
+ * \internal
3140
+ * \exception-guarantee no-throw guarantee
3141
+ * \behavior reentrant, thread safe
3142
+ * \endinternal
3143
+ */
3144
+ __CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b);
3145
+ /**
3146
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
3147
+ * \brief Calculates \p half2 vector minimum of two inputs.
3148
+ *
3149
+ * \details Calculates \p half2 vector min(\p a, \p b).
3150
+ * Elementwise \p half operation is defined as
3151
+ * (\p a < \p b) ? \p a : \p b.
3152
+ * - If either of inputs is NaN, the other input is returned.
3153
+ * - If both inputs are NaNs, then canonical NaN is returned.
3154
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
3155
+ * \param[in] a - half2. Is only being read.
3156
+ * \param[in] b - half2. Is only being read.
3157
+ *
3158
+ * \returns half2
3159
+ * \retval The result of elementwise minimum of vectors \p a and \p b
3160
+ * \internal
3161
+ * \exception-guarantee no-throw guarantee
3162
+ * \behavior reentrant, thread safe
3163
+ * \endinternal
3164
+ */
3165
+ __CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b);
3166
+ /**
3167
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
3168
+ * \brief Calculates \p half2 vector maximum of two inputs, NaNs pass through.
3169
+ *
3170
+ * \details Calculates \p half2 vector max(\p a, \p b).
3171
+ * Elementwise \p half operation is defined as
3172
+ * (\p a > \p b) ? \p a : \p b.
3173
+ * - If either of inputs is NaN, then canonical NaN is returned.
3174
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
3175
+ * \param[in] a - half2. Is only being read.
3176
+ * \param[in] b - half2. Is only being read.
3177
+ *
3178
+ * \returns half2
3179
+ * \retval The result of elementwise maximum of vectors \p a and \p b, with NaNs pass through
3180
+ * \internal
3181
+ * \exception-guarantee no-throw guarantee
3182
+ * \behavior reentrant, thread safe
3183
+ * \endinternal
3184
+ */
3185
+ __CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b);
3186
+ /**
3187
+ * \ingroup CUDA_MATH__HALF2_COMPARISON
3188
+ * \brief Calculates \p half2 vector minimum of two inputs, NaNs pass through.
3189
+ *
3190
+ * \details Calculates \p half2 vector min(\p a, \p b).
3191
+ * Elementwise \p half operation is defined as
3192
+ * (\p a < \p b) ? \p a : \p b.
3193
+ * - If either of inputs is NaN, then canonical NaN is returned.
3194
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
3195
+ * \param[in] a - half2. Is only being read.
3196
+ * \param[in] b - half2. Is only being read.
3197
+ *
3198
+ * \returns half2
3199
+ * \retval The result of elementwise minimum of vectors \p a and \p b, with NaNs pass through
3200
+ * \internal
3201
+ * \exception-guarantee no-throw guarantee
3202
+ * \behavior reentrant, thread safe
3203
+ * \endinternal
3204
+ */
3205
+ __CUDA_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b);
3206
+ /**
3207
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
3208
+ * \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even
3209
+ * mode with relu saturation.
3210
+ *
3211
+ * \details Performs \p half2 vector multiply on inputs \p a and \p b,
3212
+ * then performs a \p half2 vector add of the result with \p c,
3213
+ * rounding the result once in round-to-nearest-even mode.
3214
+ * Then a negative result is clamped to 0.
3215
+ * NaN result is converted to canonical NaN.
3216
+ * \param[in] a - half2. Is only being read.
3217
+ * \param[in] b - half2. Is only being read.
3218
+ * \param[in] c - half2. Is only being read.
3219
+ *
3220
+ * \returns half2
3221
+ * \retval The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation.
3222
+ * \internal
3223
+ * \exception-guarantee no-throw guarantee
3224
+ * \behavior reentrant, thread safe
3225
+ * \endinternal
3226
+ */
3227
+ __CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c);
3228
+ #endif /*__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)*/
3229
+ /**
3230
+ * \ingroup CUDA_MATH__HALF2_ARITHMETIC
3231
+ * \brief Performs fast complex multiply-accumulate
3232
+ *
3233
+ * \details Interprets vector \p half2 input pairs \p a, \p b, and \p c as
3234
+ * complex numbers in \p half precision and performs
3235
+ * complex multiply-accumulate operation: a*b + c
3236
+ * \param[in] a - half2. Is only being read.
3237
+ * \param[in] b - half2. Is only being read.
3238
+ * \param[in] c - half2. Is only being read.
3239
+ *
3240
+ * \returns half2
3241
+ * \retval The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c
3242
+ * \internal
3243
+ * \exception-guarantee no-throw guarantee
3244
+ * \behavior reentrant, thread safe
3245
+ * \endinternal
3246
+ */
3247
+ __CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c);
3248
+ /**
3249
+ * \ingroup CUDA_MATH__HALF_FUNCTIONS
3250
+ * \brief Calculates \p half square root in round-to-nearest-even mode.
3251
+ *
3252
+ * \details Calculates \p half square root of input \p a in round-to-nearest-even mode.
3253
+ * \param[in] a - half. Is only being read.
3254
+ *
3255
+ * \returns half
3256
+ * \retval The square root of \p a.
3257
+ * \internal
3258
+ * \exception-guarantee no-throw guarantee
3259
+ * \behavior reentrant, thread safe
3260
+ * \endinternal
3261
+ */
3262
+ __CUDA_FP16_DECL__ __half hsqrt(const __half a);
3263
+ /**
3264
+ * \ingroup CUDA_MATH__HALF_FUNCTIONS
3265
+ * \brief Calculates \p half reciprocal square root in round-to-nearest-even
3266
+ * mode.
3267
+ *
3268
+ * \details Calculates \p half reciprocal square root of input \p a in round-to-nearest-even
3269
+ * mode.
3270
+ * \param[in] a - half. Is only being read.
3271
+ *
3272
+ * \returns half
3273
+ * \retval The reciprocal square root of \p a.
3274
+ * \internal
3275
+ * \exception-guarantee no-throw guarantee
3276
+ * \behavior reentrant, thread safe
3277
+ * \endinternal
3278
+ */
3279
+ __CUDA_FP16_DECL__ __half hrsqrt(const __half a);
3280
+ /**
3281
+ * \ingroup CUDA_MATH__HALF_FUNCTIONS
3282
+ * \brief Calculates \p half reciprocal in round-to-nearest-even mode.
3283
+ *
3284
+ * \details Calculates \p half reciprocal of input \p a in round-to-nearest-even mode.
3285
+ * \param[in] a - half. Is only being read.
3286
+ *
3287
+ * \returns half
3288
+ * \retval The reciprocal of \p a.
3289
+ * \internal
3290
+ * \exception-guarantee no-throw guarantee
3291
+ * \behavior reentrant, thread safe
3292
+ * \endinternal
3293
+ */
3294
+ __CUDA_FP16_DECL__ __half hrcp(const __half a);
3295
+ /**
3296
+ * \ingroup CUDA_MATH__HALF_FUNCTIONS
3297
+ * \brief Calculates \p half natural logarithm in round-to-nearest-even mode.
3298
+ *
3299
+ * \details Calculates \p half natural logarithm of input \p a in round-to-nearest-even
3300
+ * mode.
3301
+ * \param[in] a - half. Is only being read.
3302
+ *
3303
+ * \returns half
3304
+ * \retval The natural logarithm of \p a.
3305
+ * \internal
3306
+ * \exception-guarantee no-throw guarantee
3307
+ * \behavior reentrant, thread safe
3308
+ * \endinternal
3309
+ */
3310
+ __CUDA_FP16_DECL__ __half hlog(const __half a);
3311
+ /**
3312
+ * \ingroup CUDA_MATH__HALF_FUNCTIONS
3313
+ * \brief Calculates \p half binary logarithm in round-to-nearest-even mode.
3314
+ *
3315
+ * \details Calculates \p half binary logarithm of input \p a in round-to-nearest-even
3316
+ * mode.
3317
+ * \param[in] a - half. Is only being read.
3318
+ *
3319
+ * \returns half
3320
+ * \retval The binary logarithm of \p a.
3321
+ * \internal
3322
+ * \exception-guarantee no-throw guarantee
3323
+ * \behavior reentrant, thread safe
3324
+ * \endinternal
3325
+ */
3326
+ __CUDA_FP16_DECL__ __half hlog2(const __half a);
3327
+ /**
3328
+ * \ingroup CUDA_MATH__HALF_FUNCTIONS
3329
+ * \brief Calculates \p half decimal logarithm in round-to-nearest-even mode.
3330
+ *
3331
+ * \details Calculates \p half decimal logarithm of input \p a in round-to-nearest-even
3332
+ * mode.
3333
+ * \param[in] a - half. Is only being read.
3334
+ *
3335
+ * \returns half
3336
+ * \retval The decimal logarithm of \p a.
3337
+ * \internal
3338
+ * \exception-guarantee no-throw guarantee
3339
+ * \behavior reentrant, thread safe
3340
+ * \endinternal
3341
+ */
3342
+ __CUDA_FP16_DECL__ __half hlog10(const __half a);
3343
+ /**
3344
+ * \ingroup CUDA_MATH__HALF_FUNCTIONS
3345
+ * \brief Calculates \p half natural exponential function in round-to-nearest-even
3346
+ * mode.
3347
+ *
3348
+ * \details Calculates \p half natural exponential function of input \p a in
3349
+ * round-to-nearest-even mode.
3350
+ * \param[in] a - half. Is only being read.
3351
+ *
3352
+ * \returns half
3353
+ * \retval The natural exponential function on \p a.
3354
+ * \internal
3355
+ * \exception-guarantee no-throw guarantee
3356
+ * \behavior reentrant, thread safe
3357
+ * \endinternal
3358
+ */
3359
+ __CUDA_FP16_DECL__ __half hexp(const __half a);
3360
+ /**
3361
+ * \ingroup CUDA_MATH__HALF_FUNCTIONS
3362
+ * \brief Calculates \p half binary exponential function in round-to-nearest-even
3363
+ * mode.
3364
+ *
3365
+ * \details Calculates \p half binary exponential function of input \p a in
3366
+ * round-to-nearest-even mode.
3367
+ * \param[in] a - half. Is only being read.
3368
+ *
3369
+ * \returns half
3370
+ * \retval The binary exponential function on \p a.
3371
+ * \internal
3372
+ * \exception-guarantee no-throw guarantee
3373
+ * \behavior reentrant, thread safe
3374
+ * \endinternal
3375
+ */
3376
+ __CUDA_FP16_DECL__ __half hexp2(const __half a);
3377
+ /**
3378
+ * \ingroup CUDA_MATH__HALF_FUNCTIONS
3379
+ * \brief Calculates \p half decimal exponential function in round-to-nearest-even
3380
+ * mode.
3381
+ *
3382
+ * \details Calculates \p half decimal exponential function of input \p a in
3383
+ * round-to-nearest-even mode.
3384
+ * \param[in] a - half. Is only being read.
3385
+ *
3386
+ * \returns half
3387
+ * \retval The decimal exponential function on \p a.
3388
+ * \internal
3389
+ * \exception-guarantee no-throw guarantee
3390
+ * \behavior reentrant, thread safe
3391
+ * \endinternal
3392
+ */
3393
+ __CUDA_FP16_DECL__ __half hexp10(const __half a);
3394
+ /**
3395
+ * \ingroup CUDA_MATH__HALF_FUNCTIONS
3396
+ * \brief Calculates \p half cosine in round-to-nearest-even mode.
3397
+ *
3398
+ * \details Calculates \p half cosine of input \p a in round-to-nearest-even mode.
3399
+ * \param[in] a - half. Is only being read.
3400
+ *
3401
+ * \returns half
3402
+ * \retval The cosine of \p a.
3403
+ * \internal
3404
+ * \exception-guarantee no-throw guarantee
3405
+ * \behavior reentrant, thread safe
3406
+ * \endinternal
3407
+ */
3408
+ __CUDA_FP16_DECL__ __half hcos(const __half a);
3409
+ /**
3410
+ * \ingroup CUDA_MATH__HALF_FUNCTIONS
3411
+ * \brief Calculates \p half sine in round-to-nearest-even mode.
3412
+ *
3413
+ * \details Calculates \p half sine of input \p a in round-to-nearest-even mode.
3414
+ * \param[in] a - half. Is only being read.
3415
+ *
3416
+ * \returns half
3417
+ * \retval The sine of \p a.
3418
+ * \internal
3419
+ * \exception-guarantee no-throw guarantee
3420
+ * \behavior reentrant, thread safe
3421
+ * \endinternal
3422
+ */
3423
+ __CUDA_FP16_DECL__ __half hsin(const __half a);
3424
+ /**
3425
+ * \ingroup CUDA_MATH__HALF2_FUNCTIONS
3426
+ * \brief Calculates \p half2 vector square root in round-to-nearest-even mode.
3427
+ *
3428
+ * \details Calculates \p half2 square root of input vector \p a in round-to-nearest-even
3429
+ * mode.
3430
+ * \param[in] a - half2. Is only being read.
3431
+ *
3432
+ * \returns half2
3433
+ * \retval The elementwise square root on vector \p a.
3434
+ * \internal
3435
+ * \exception-guarantee no-throw guarantee
3436
+ * \behavior reentrant, thread safe
3437
+ * \endinternal
3438
+ */
3439
+ __CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a);
3440
+ /**
3441
+ * \ingroup CUDA_MATH__HALF2_FUNCTIONS
3442
+ * \brief Calculates \p half2 vector reciprocal square root in round-to-nearest-even
3443
+ * mode.
3444
+ *
3445
+ * \details Calculates \p half2 reciprocal square root of input vector \p a in
3446
+ * round-to-nearest-even mode.
3447
+ * \param[in] a - half2. Is only being read.
3448
+ *
3449
+ * \returns half2
3450
+ * \retval The elementwise reciprocal square root on vector \p a.
3451
+ * \internal
3452
+ * \exception-guarantee no-throw guarantee
3453
+ * \behavior reentrant, thread safe
3454
+ * \endinternal
3455
+ */
3456
+ __CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a);
3457
+ /**
3458
+ * \ingroup CUDA_MATH__HALF2_FUNCTIONS
3459
+ * \brief Calculates \p half2 vector reciprocal in round-to-nearest-even mode.
3460
+ *
3461
+ * \details Calculates \p half2 reciprocal of input vector \p a in round-to-nearest-even
3462
+ * mode.
3463
+ * \param[in] a - half2. Is only being read.
3464
+ *
3465
+ * \returns half2
3466
+ * \retval The elementwise reciprocal on vector \p a.
3467
+ * \internal
3468
+ * \exception-guarantee no-throw guarantee
3469
+ * \behavior reentrant, thread safe
3470
+ * \endinternal
3471
+ */
3472
+ __CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a);
3473
+ /**
3474
+ * \ingroup CUDA_MATH__HALF2_FUNCTIONS
3475
+ * \brief Calculates \p half2 vector natural logarithm in round-to-nearest-even
3476
+ * mode.
3477
+ *
3478
+ * \details Calculates \p half2 natural logarithm of input vector \p a in
3479
+ * round-to-nearest-even mode.
3480
+ * \param[in] a - half2. Is only being read.
3481
+ *
3482
+ * \returns half2
3483
+ * \retval The elementwise natural logarithm on vector \p a.
3484
+ * \internal
3485
+ * \exception-guarantee no-throw guarantee
3486
+ * \behavior reentrant, thread safe
3487
+ * \endinternal
3488
+ */
3489
+ __CUDA_FP16_DECL__ __half2 h2log(const __half2 a);
3490
+ /**
3491
+ * \ingroup CUDA_MATH__HALF2_FUNCTIONS
3492
+ * \brief Calculates \p half2 vector binary logarithm in round-to-nearest-even
3493
+ * mode.
3494
+ *
3495
+ * \details Calculates \p half2 binary logarithm of input vector \p a in round-to-nearest-even
3496
+ * mode.
3497
+ * \param[in] a - half2. Is only being read.
3498
+ *
3499
+ * \returns half2
3500
+ * \retval The elementwise binary logarithm on vector \p a.
3501
+ * \internal
3502
+ * \exception-guarantee no-throw guarantee
3503
+ * \behavior reentrant, thread safe
3504
+ * \endinternal
3505
+ */
3506
+ __CUDA_FP16_DECL__ __half2 h2log2(const __half2 a);
3507
+ /**
3508
+ * \ingroup CUDA_MATH__HALF2_FUNCTIONS
3509
+ * \brief Calculates \p half2 vector decimal logarithm in round-to-nearest-even
3510
+ * mode.
3511
+ *
3512
+ * \details Calculates \p half2 decimal logarithm of input vector \p a in
3513
+ * round-to-nearest-even mode.
3514
+ * \param[in] a - half2. Is only being read.
3515
+ *
3516
+ * \returns half2
3517
+ * \retval The elementwise decimal logarithm on vector \p a.
3518
+ * \internal
3519
+ * \exception-guarantee no-throw guarantee
3520
+ * \behavior reentrant, thread safe
3521
+ * \endinternal
3522
+ */
3523
+ __CUDA_FP16_DECL__ __half2 h2log10(const __half2 a);
3524
+ /**
3525
+ * \ingroup CUDA_MATH__HALF2_FUNCTIONS
3526
+ * \brief Calculates \p half2 vector exponential function in round-to-nearest-even
3527
+ * mode.
3528
+ *
3529
+ * \details Calculates \p half2 exponential function of input vector \p a in
3530
+ * round-to-nearest-even mode.
3531
+ * \param[in] a - half2. Is only being read.
3532
+ *
3533
+ * \returns half2
3534
+ * \retval The elementwise exponential function on vector \p a.
3535
+ * \internal
3536
+ * \exception-guarantee no-throw guarantee
3537
+ * \behavior reentrant, thread safe
3538
+ * \endinternal
3539
+ */
3540
+ __CUDA_FP16_DECL__ __half2 h2exp(const __half2 a);
3541
+ /**
3542
+ * \ingroup CUDA_MATH__HALF2_FUNCTIONS
3543
+ * \brief Calculates \p half2 vector binary exponential function in
3544
+ * round-to-nearest-even mode.
3545
+ *
3546
+ * \details Calculates \p half2 binary exponential function of input vector \p a in
3547
+ * round-to-nearest-even mode.
3548
+ * \param[in] a - half2. Is only being read.
3549
+ *
3550
+ * \returns half2
3551
+ * \retval The elementwise binary exponential function on vector \p a.
3552
+ * \internal
3553
+ * \exception-guarantee no-throw guarantee
3554
+ * \behavior reentrant, thread safe
3555
+ * \endinternal
3556
+ */
3557
+ __CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a);
3558
+ /**
3559
+ * \ingroup CUDA_MATH__HALF2_FUNCTIONS
3560
+ * \brief Calculates \p half2 vector decimal exponential function in
3561
+ * round-to-nearest-even mode.
3562
+ *
3563
+ * \details Calculates \p half2 decimal exponential function of input vector \p a in
3564
+ * round-to-nearest-even mode.
3565
+ * \param[in] a - half2. Is only being read.
3566
+ *
3567
+ * \returns half2
3568
+ * \retval The elementwise decimal exponential function on vector \p a.
3569
+ * \internal
3570
+ * \exception-guarantee no-throw guarantee
3571
+ * \behavior reentrant, thread safe
3572
+ * \endinternal
3573
+ */
3574
+ __CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a);
3575
+ /**
3576
+ * \ingroup CUDA_MATH__HALF2_FUNCTIONS
3577
+ * \brief Calculates \p half2 vector cosine in round-to-nearest-even mode.
3578
+ *
3579
+ * \details Calculates \p half2 cosine of input vector \p a in round-to-nearest-even
3580
+ * mode.
3581
+ * \param[in] a - half2. Is only being read.
3582
+ *
3583
+ * \returns half2
3584
+ * \retval The elementwise cosine on vector \p a.
3585
+ * \internal
3586
+ * \exception-guarantee no-throw guarantee
3587
+ * \behavior reentrant, thread safe
3588
+ * \endinternal
3589
+ */
3590
+ __CUDA_FP16_DECL__ __half2 h2cos(const __half2 a);
3591
+ /**
3592
+ * \ingroup CUDA_MATH__HALF2_FUNCTIONS
3593
+ * \brief Calculates \p half2 vector sine in round-to-nearest-even mode.
3594
+ *
3595
+ * \details Calculates \p half2 sine of input vector \p a in round-to-nearest-even mode.
3596
+ * \param[in] a - half2. Is only being read.
3597
+ *
3598
+ * \returns half2
3599
+ * \retval The elementwise sine on vector \p a.
3600
+ * \internal
3601
+ * \exception-guarantee no-throw guarantee
3602
+ * \behavior reentrant, thread safe
3603
+ * \endinternal
3604
+ */
3605
+ __CUDA_FP16_DECL__ __half2 h2sin(const __half2 a);
3606
+
3607
+ #endif /*if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/
3608
+
3609
+ #if __CUDA_ARCH__ >= 600 || !defined(__CUDA_ARCH__)
3610
+
3611
+ __CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val);
3612
+
3613
+ #endif /*if __CUDA_ARCH__ >= 600 || !defined(__CUDA_ARCH__)*/
3614
+
3615
+ #if __CUDA_ARCH__ >= 700 || !defined(__CUDA_ARCH__)
3616
+
3617
+ __CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val);
3618
+
3619
+ #endif /*if __CUDA_ARCH__ >= 700 || !defined(__CUDA_ARCH__)*/
3620
+
3621
+ #endif /* defined(__CUDACC__) */
3622
+
3623
+ #undef __CUDA_FP16_DECL__
3624
+ #undef __CUDA_HOSTDEVICE_FP16_DECL__
3625
+
3626
+ #endif /* defined(__cplusplus) */
3627
+
3628
+ /* Note the .hpp file is included even for host-side compilation, to capture the "half" & "half2" definitions */
3629
+ #include "cuda_fp16.hpp"
3630
+
3631
+ #endif /* end of include guard: __CUDA_FP16_H__ */