numba-cuda 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. _numba_cuda_redirector.py +17 -13
  2. numba_cuda/VERSION +1 -1
  3. numba_cuda/_version.py +4 -1
  4. numba_cuda/numba/cuda/__init__.py +6 -2
  5. numba_cuda/numba/cuda/api.py +129 -86
  6. numba_cuda/numba/cuda/api_util.py +3 -3
  7. numba_cuda/numba/cuda/args.py +12 -16
  8. numba_cuda/numba/cuda/cg.py +6 -6
  9. numba_cuda/numba/cuda/codegen.py +74 -43
  10. numba_cuda/numba/cuda/compiler.py +246 -114
  11. numba_cuda/numba/cuda/cpp_function_wrappers.cu +1 -2
  12. numba_cuda/numba/cuda/cuda_bf16.py +5155 -0
  13. numba_cuda/numba/cuda/cuda_paths.py +293 -99
  14. numba_cuda/numba/cuda/cudadecl.py +93 -79
  15. numba_cuda/numba/cuda/cudadrv/__init__.py +3 -1
  16. numba_cuda/numba/cuda/cudadrv/devicearray.py +185 -135
  17. numba_cuda/numba/cuda/cudadrv/devices.py +16 -11
  18. numba_cuda/numba/cuda/cudadrv/driver.py +460 -297
  19. numba_cuda/numba/cuda/cudadrv/drvapi.py +241 -207
  20. numba_cuda/numba/cuda/cudadrv/dummyarray.py +66 -54
  21. numba_cuda/numba/cuda/cudadrv/enums.py +1 -1
  22. numba_cuda/numba/cuda/cudadrv/error.py +6 -2
  23. numba_cuda/numba/cuda/cudadrv/libs.py +67 -63
  24. numba_cuda/numba/cuda/cudadrv/linkable_code.py +27 -3
  25. numba_cuda/numba/cuda/cudadrv/mappings.py +16 -14
  26. numba_cuda/numba/cuda/cudadrv/nvrtc.py +146 -30
  27. numba_cuda/numba/cuda/cudadrv/nvvm.py +296 -161
  28. numba_cuda/numba/cuda/cudadrv/rtapi.py +1 -1
  29. numba_cuda/numba/cuda/cudadrv/runtime.py +20 -8
  30. numba_cuda/numba/cuda/cudaimpl.py +296 -275
  31. numba_cuda/numba/cuda/cudamath.py +1 -1
  32. numba_cuda/numba/cuda/debuginfo.py +99 -7
  33. numba_cuda/numba/cuda/decorators.py +87 -45
  34. numba_cuda/numba/cuda/descriptor.py +1 -1
  35. numba_cuda/numba/cuda/device_init.py +68 -18
  36. numba_cuda/numba/cuda/deviceufunc.py +143 -98
  37. numba_cuda/numba/cuda/dispatcher.py +300 -213
  38. numba_cuda/numba/cuda/errors.py +13 -10
  39. numba_cuda/numba/cuda/extending.py +55 -1
  40. numba_cuda/numba/cuda/include/11/cuda_bf16.h +3749 -0
  41. numba_cuda/numba/cuda/include/11/cuda_bf16.hpp +2683 -0
  42. numba_cuda/numba/cuda/{cuda_fp16.h → include/11/cuda_fp16.h} +1090 -927
  43. numba_cuda/numba/cuda/{cuda_fp16.hpp → include/11/cuda_fp16.hpp} +468 -319
  44. numba_cuda/numba/cuda/include/12/cuda_bf16.h +5118 -0
  45. numba_cuda/numba/cuda/include/12/cuda_bf16.hpp +3865 -0
  46. numba_cuda/numba/cuda/include/12/cuda_fp16.h +5363 -0
  47. numba_cuda/numba/cuda/include/12/cuda_fp16.hpp +3483 -0
  48. numba_cuda/numba/cuda/initialize.py +5 -3
  49. numba_cuda/numba/cuda/intrinsic_wrapper.py +0 -39
  50. numba_cuda/numba/cuda/intrinsics.py +203 -28
  51. numba_cuda/numba/cuda/kernels/reduction.py +13 -13
  52. numba_cuda/numba/cuda/kernels/transpose.py +3 -6
  53. numba_cuda/numba/cuda/libdevice.py +317 -317
  54. numba_cuda/numba/cuda/libdeviceimpl.py +3 -2
  55. numba_cuda/numba/cuda/locks.py +16 -0
  56. numba_cuda/numba/cuda/lowering.py +43 -0
  57. numba_cuda/numba/cuda/mathimpl.py +62 -57
  58. numba_cuda/numba/cuda/models.py +1 -5
  59. numba_cuda/numba/cuda/nvvmutils.py +103 -88
  60. numba_cuda/numba/cuda/printimpl.py +9 -5
  61. numba_cuda/numba/cuda/random.py +46 -36
  62. numba_cuda/numba/cuda/reshape_funcs.cu +1 -1
  63. numba_cuda/numba/cuda/runtime/__init__.py +1 -1
  64. numba_cuda/numba/cuda/runtime/memsys.cu +1 -1
  65. numba_cuda/numba/cuda/runtime/memsys.cuh +1 -1
  66. numba_cuda/numba/cuda/runtime/nrt.cu +3 -3
  67. numba_cuda/numba/cuda/runtime/nrt.py +48 -43
  68. numba_cuda/numba/cuda/simulator/__init__.py +22 -12
  69. numba_cuda/numba/cuda/simulator/api.py +38 -22
  70. numba_cuda/numba/cuda/simulator/compiler.py +2 -2
  71. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +8 -2
  72. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +63 -55
  73. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +13 -11
  74. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +5 -5
  75. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +2 -2
  76. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +1 -1
  77. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +3 -3
  78. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +3 -3
  79. numba_cuda/numba/cuda/simulator/kernel.py +43 -34
  80. numba_cuda/numba/cuda/simulator/kernelapi.py +31 -26
  81. numba_cuda/numba/cuda/simulator/reduction.py +1 -0
  82. numba_cuda/numba/cuda/simulator/vector_types.py +13 -9
  83. numba_cuda/numba/cuda/simulator_init.py +2 -4
  84. numba_cuda/numba/cuda/stubs.py +134 -108
  85. numba_cuda/numba/cuda/target.py +92 -47
  86. numba_cuda/numba/cuda/testing.py +24 -19
  87. numba_cuda/numba/cuda/tests/__init__.py +14 -12
  88. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +16 -17
  89. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +7 -7
  90. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +73 -54
  91. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +1 -1
  92. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +48 -50
  93. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +47 -29
  94. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +3 -3
  95. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +19 -19
  96. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +108 -103
  97. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +20 -11
  98. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +20 -17
  99. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +8 -6
  100. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +1 -1
  101. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +8 -7
  102. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +13 -13
  103. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +12 -9
  104. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +36 -31
  105. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +8 -7
  106. numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py +294 -0
  107. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +10 -7
  108. numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py +24 -15
  109. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +43 -41
  110. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +4 -5
  111. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +2 -2
  112. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +28 -17
  113. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +1 -2
  114. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +22 -14
  115. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +1 -1
  116. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +4 -3
  117. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +10 -4
  118. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +1 -0
  119. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +10 -7
  120. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +0 -2
  121. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +1 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +6 -5
  123. numba_cuda/numba/cuda/tests/cudapy/test_array.py +52 -42
  124. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +5 -6
  125. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +1 -1
  126. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +501 -304
  127. numba_cuda/numba/cuda/tests/cudapy/test_bfloat16_bindings.py +257 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +59 -23
  129. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +3 -3
  130. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +50 -37
  131. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +29 -24
  132. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +11 -6
  133. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +84 -50
  134. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +144 -73
  135. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +2 -2
  136. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +37 -27
  137. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +43 -45
  138. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +21 -14
  139. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +60 -55
  140. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +3 -2
  141. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +26 -22
  142. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +29 -27
  143. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +77 -28
  144. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +52 -45
  145. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +55 -43
  146. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +24 -7
  147. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +30 -15
  148. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +11 -12
  149. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +21 -12
  150. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +77 -66
  151. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +5 -3
  152. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +5 -3
  153. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +1 -1
  154. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +3 -5
  155. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +144 -126
  156. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +23 -18
  157. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +16 -22
  158. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +1 -3
  159. numba_cuda/numba/cuda/tests/cudapy/test_inline.py +59 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +29 -20
  161. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +147 -99
  162. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +50 -36
  163. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +1 -2
  164. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +4 -4
  165. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +7 -7
  166. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +24 -20
  167. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +36 -31
  168. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +13 -13
  169. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +13 -6
  170. numba_cuda/numba/cuda/tests/cudapy/test_math.py +83 -66
  171. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +1 -3
  172. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +19 -58
  173. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +4 -4
  174. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +9 -7
  175. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +9 -8
  176. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +12 -10
  177. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +1 -1
  178. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +180 -96
  179. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +5 -5
  180. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +37 -18
  181. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +7 -7
  182. numba_cuda/numba/cuda/tests/cudapy/test_print.py +9 -7
  183. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +1 -1
  184. numba_cuda/numba/cuda/tests/cudapy/test_random.py +15 -10
  185. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +88 -87
  186. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +12 -10
  187. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +26 -11
  188. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +7 -10
  189. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +4 -6
  190. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +1 -1
  191. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +10 -9
  192. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +62 -43
  193. numba_cuda/numba/cuda/tests/cudapy/test_stream_api.py +7 -3
  194. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +7 -5
  195. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +18 -11
  196. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +111 -88
  197. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +2 -3
  198. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +305 -130
  199. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +33 -36
  200. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +5 -5
  201. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +16 -12
  202. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +7 -7
  203. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +6 -7
  204. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +31 -29
  205. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +81 -30
  206. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +19 -13
  207. numba_cuda/numba/cuda/tests/data/jitlink.cu +1 -1
  208. numba_cuda/numba/cuda/tests/data/jitlink.ptx +0 -2
  209. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +15 -8
  210. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +4 -7
  211. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +14 -9
  212. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +22 -18
  213. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +7 -4
  214. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +2 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +8 -4
  216. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +2 -1
  217. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +94 -19
  218. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +2 -2
  219. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +91 -62
  220. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +14 -5
  221. numba_cuda/numba/cuda/tests/nocuda/test_import.py +25 -25
  222. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +40 -40
  223. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +12 -10
  224. numba_cuda/numba/cuda/tests/nrt/test_nrt.py +16 -20
  225. numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py +12 -10
  226. numba_cuda/numba/cuda/tests/test_binary_generation/generate_raw_ltoir.py +2 -2
  227. numba_cuda/numba/cuda/types.py +5 -2
  228. numba_cuda/numba/cuda/ufuncs.py +382 -362
  229. numba_cuda/numba/cuda/utils.py +2 -2
  230. numba_cuda/numba/cuda/vector_types.py +5 -3
  231. numba_cuda/numba/cuda/vectorizers.py +38 -33
  232. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/METADATA +1 -1
  233. numba_cuda-0.10.0.dist-info/RECORD +263 -0
  234. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/WHEEL +1 -1
  235. numba_cuda-0.8.1.dist-info/RECORD +0 -251
  236. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/licenses/LICENSE +0 -0
  237. {numba_cuda-0.8.1.dist-info → numba_cuda-0.10.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,3749 @@
1
+ /*
2
+ * Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /**
51
+ * \defgroup CUDA_MATH_INTRINSIC_BFLOAT16 Bfloat16 Precision Intrinsics
52
+ * This section describes nv_bfloat16 precision intrinsic functions that are
53
+ * only supported in device code.
54
+ * To use these functions, include the header file \p cuda_bf16.h in your program.
55
+ */
56
+
57
+ /**
58
+ * \defgroup CUDA_MATH__BFLOAT16_ARITHMETIC Bfloat16 Arithmetic Functions
59
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
60
+ * To use these functions, include the header file \p cuda_bf16.h in your program.
61
+ */
62
+
63
+ /**
64
+ * \defgroup CUDA_MATH__BFLOAT162_ARITHMETIC Bfloat162 Arithmetic Functions
65
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
66
+ * To use these functions, include the header file \p cuda_bf16.h in your program.
67
+ */
68
+
69
+ /**
70
+ * \defgroup CUDA_MATH__BFLOAT16_COMPARISON Bfloat16 Comparison Functions
71
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
72
+ * To use these functions, include the header file \p cuda_bf16.h in your program.
73
+ */
74
+
75
+ /**
76
+ * \defgroup CUDA_MATH__BFLOAT162_COMPARISON Bfloat162 Comparison Functions
77
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
78
+ * To use these functions, include the header file \p cuda_bf16.h in your program.
79
+ */
80
+
81
+ /**
82
+ * \defgroup CUDA_MATH__BFLOAT16_MISC Bfloat16 Precision Conversion and Data Movement
83
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
84
+ * To use these functions, include the header file \p cuda_bf16.h in your program.
85
+ */
86
+
87
+ /**
88
+ * \defgroup CUDA_MATH__BFLOAT16_FUNCTIONS Bfloat16 Math Functions
89
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
90
+ * To use these functions, include the header file \p cuda_bf16.h in your program.
91
+ */
92
+
93
+ /**
94
+ * \defgroup CUDA_MATH__BFLOAT162_FUNCTIONS Bfloat162 Math Functions
95
+ * \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
96
+ * To use these functions, include the header file \p cuda_bf16.h in your program.
97
+ */
98
+
99
+ #ifndef __CUDA_BF16_H__
100
+ #define __CUDA_BF16_H__
101
+
102
+ #define ___CUDA_BF16_STRINGIFY_INNERMOST(x) #x
103
+ #define __CUDA_BF16_STRINGIFY(x) ___CUDA_BF16_STRINGIFY_INNERMOST(x)
104
+
105
+ #if defined(__cplusplus)
106
+ #if defined(__CUDACC__)
107
+ #define __CUDA_BF16_DECL__ static __device__ __inline__
108
+ #define __CUDA_HOSTDEVICE_BF16_DECL__ static __host__ __device__ __inline__
109
+ #else
110
+ #define __CUDA_HOSTDEVICE_BF16_DECL__ static
111
+ #endif /* defined(__CUDACC__) */
112
+
113
+ #define __CUDA_BF16_TYPES_EXIST__
114
+
115
+ /* Forward-declaration of structures defined in "cuda_bf16.hpp" */
116
+
117
+ /**
118
+ * \brief nv_bfloat16 datatype
119
+ *
120
+ * \details This structure implements the datatype for storing
121
+ * nv_bfloat16 floating-point numbers. The structure implements
122
+ * assignment operators and type conversions. 16 bits are being
123
+ * used in total: 1 sign bit, 8 bits for the exponent, and
124
+ * the significand is being stored in 7 bits. The total
125
+ * precision is 8 bits (7 stored significand bits plus the implicit leading bit).
126
+ *
127
+ */
128
+ struct __nv_bfloat16;
129
+
130
+ /**
131
+ * \brief nv_bfloat162 datatype
132
+ *
133
+ * \details This structure implements the datatype for storing two
134
+ * nv_bfloat16 floating-point numbers.
135
+ * The structure implements assignment operators and type conversions.
136
+ *
137
+ */
138
+ struct __nv_bfloat162;
139
+
140
+ /**
141
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
142
+ * \brief Converts double number to nv_bfloat16 precision in round-to-nearest-even mode
143
+ * and returns \p nv_bfloat16 with converted value.
144
+ *
145
+ * \details Converts double number \p a to nv_bfloat16 precision in round-to-nearest-even mode.
146
+ * \param[in] a - double. Is only being read.
147
+ * \returns nv_bfloat16
148
+ * - \p a converted to nv_bfloat16.
149
+ * \internal
150
+ * \exception-guarantee no-throw guarantee
151
+ * \behavior reentrant, thread safe
152
+ * \endinternal
153
+ */
154
+ __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double a);
155
+ /**
156
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
157
+ * \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode
158
+ * and returns \p nv_bfloat16 with converted value.
159
+ *
160
+ * \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode.
161
+ * \param[in] a - float. Is only being read.
162
+ * \returns nv_bfloat16
163
+ * - \p a converted to nv_bfloat16.
164
+ * \internal
165
+ * \exception-guarantee no-throw guarantee
166
+ * \behavior reentrant, thread safe
167
+ * \endinternal
168
+ */
169
+ __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a);
170
+ /**
171
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
172
+ * \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode
173
+ * and returns \p nv_bfloat16 with converted value.
174
+ *
175
+ * \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode.
176
+ * \param[in] a - float. Is only being read.
177
+ * \returns nv_bfloat16
178
+ * - \p a converted to nv_bfloat16.
179
+ * \internal
180
+ * \exception-guarantee no-throw guarantee
181
+ * \behavior reentrant, thread safe
182
+ * \endinternal
183
+ */
184
+ __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a);
185
+ /**
186
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
187
+ * \brief Converts float number to nv_bfloat16 precision in round-towards-zero mode
188
+ * and returns \p nv_bfloat16 with converted value.
189
+ *
190
+ * \details Converts float number \p a to nv_bfloat16 precision in round-towards-zero mode.
191
+ * \param[in] a - float. Is only being read.
192
+ * \returns nv_bfloat16
193
+ * - \p a converted to nv_bfloat16.
194
+ * \internal
195
+ * \exception-guarantee no-throw guarantee
196
+ * \behavior reentrant, thread safe
197
+ * \endinternal
198
+ */
199
+ __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a);
200
+ /**
201
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
202
+ * \brief Converts float number to nv_bfloat16 precision in round-down mode
203
+ * and returns \p nv_bfloat16 with converted value.
204
+ *
205
+ * \details Converts float number \p a to nv_bfloat16 precision in round-down mode.
206
+ * \param[in] a - float. Is only being read.
207
+ *
208
+ * \returns nv_bfloat16
209
+ * - \p a converted to nv_bfloat16.
210
+ * \internal
211
+ * \exception-guarantee no-throw guarantee
212
+ * \behavior reentrant, thread safe
213
+ * \endinternal
214
+ */
215
+ __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a);
216
+ /**
217
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
218
+ * \brief Converts float number to nv_bfloat16 precision in round-up mode
219
+ * and returns \p nv_bfloat16 with converted value.
220
+ *
221
+ * \details Converts float number \p a to nv_bfloat16 precision in round-up mode.
222
+ * \param[in] a - float. Is only being read.
223
+ *
224
+ * \returns nv_bfloat16
225
+ * - \p a converted to nv_bfloat16.
226
+ * \internal
227
+ * \exception-guarantee no-throw guarantee
228
+ * \behavior reentrant, thread safe
229
+ * \endinternal
230
+ */
231
+ __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a);
232
+ /**
233
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
234
+ * \brief Converts \p nv_bfloat16 number to float.
235
+ *
236
+ * \details Converts nv_bfloat16 number \p a to float.
237
+ * \param[in] a - nv_bfloat16. Is only being read.
238
+ *
239
+ * \returns float
240
+ * - \p a converted to float.
241
+ * \internal
242
+ * \exception-guarantee no-throw guarantee
243
+ * \behavior reentrant, thread safe
244
+ * \endinternal
245
+ */
246
+ __CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a);
247
+ /**
248
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
249
+ * \brief Converts input to nv_bfloat16 precision in round-to-nearest-even mode and
250
+ * populates both halves of \p nv_bfloat162 with converted value.
251
+ *
252
+ * \details Converts input \p a to nv_bfloat16 precision in round-to-nearest-even mode and
253
+ * populates both halves of \p nv_bfloat162 with converted value.
254
+ * \param[in] a - float. Is only being read.
255
+ *
256
+ * \returns nv_bfloat162
257
+ * - The \p nv_bfloat162 value with both halves equal to the converted nv_bfloat16
258
+ * precision number.
259
+ * \internal
260
+ * \exception-guarantee no-throw guarantee
261
+ * \behavior reentrant, thread safe
262
+ * \endinternal
263
+ */
264
+ __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a);
265
+ /**
266
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
267
+ * \brief Converts both input floats to nv_bfloat16 precision in round-to-nearest-even
268
+ * mode and returns \p nv_bfloat162 with converted values.
269
+ *
270
+ * \details Converts both input floats to nv_bfloat16 precision in round-to-nearest-even mode
271
+ * and combines the results into one \p nv_bfloat162 number. Low 16 bits of the return
272
+ * value correspond to the input \p a, high 16 bits correspond to the input \p
273
+ * b.
274
+ * \param[in] a - float. Is only being read.
275
+ * \param[in] b - float. Is only being read.
276
+ *
277
+ * \returns nv_bfloat162
278
+ * - The \p nv_bfloat162 value with corresponding halves equal to the
279
+ * converted input floats.
280
+ * \internal
281
+ * \exception-guarantee no-throw guarantee
282
+ * \behavior reentrant, thread safe
283
+ * \endinternal
284
+ */
285
+ __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b);
286
+ /**
287
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
288
+ * \brief Converts low 16 bits of \p nv_bfloat162 to float and returns the result.
289
+ *
290
+ * \details Converts low 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number
291
+ * and returns the result.
292
+ * \param[in] a - nv_bfloat162. Is only being read.
293
+ *
294
+ * \returns float
295
+ * - The low 16 bits of \p a converted to float.
296
+ * \internal
297
+ * \exception-guarantee no-throw guarantee
298
+ * \behavior reentrant, thread safe
299
+ * \endinternal
300
+ */
301
+ __CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a);
302
+ /**
303
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
304
+ * \brief Converts high 16 bits of \p nv_bfloat162 to float and returns the result.
305
+ *
306
+ * \details Converts high 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number
307
+ * and returns the result.
308
+ * \param[in] a - nv_bfloat162. Is only being read.
309
+ *
310
+ * \returns float
311
+ * - The high 16 bits of \p a converted to float.
312
+ * \internal
313
+ * \exception-guarantee no-throw guarantee
314
+ * \behavior reentrant, thread safe
315
+ * \endinternal
316
+ */
317
+ __CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a);
318
+
319
+ #if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
320
+ /**
321
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
322
+ * \brief Converts both components of float2 number to nv_bfloat16 precision in
323
+ * round-to-nearest-even mode and returns \p nv_bfloat162 with converted values.
324
+ *
325
+ * \details Converts both components of float2 to nv_bfloat16 precision in round-to-nearest-even
326
+ * mode and combines the results into one \p nv_bfloat162 number. Low 16 bits of the
327
+ * return value correspond to \p a.x and high 16 bits of the return value
328
+ * correspond to \p a.y.
329
+ * \param[in] a - float2. Is only being read.
330
+ *
331
+ * \returns nv_bfloat162
332
+ * - The \p nv_bfloat162 which has corresponding halves equal to the
333
+ * converted float2 components.
334
+ * \internal
335
+ * \exception-guarantee no-throw guarantee
336
+ * \behavior reentrant, thread safe
337
+ * \endinternal
338
+ */
339
+ __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a);
340
+ /**
341
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
342
+ * \brief Converts both halves of \p nv_bfloat162 to float2 and returns the result.
343
+ *
344
+ * \details Converts both halves of \p nv_bfloat162 input \p a to float2 and returns the
345
+ * result.
346
+ * \param[in] a - nv_bfloat162. Is only being read.
347
+ *
348
+ * \returns float2
349
+ * - \p a converted to float2.
350
+ * \internal
351
+ * \exception-guarantee no-throw guarantee
352
+ * \behavior reentrant, thread safe
353
+ * \endinternal
354
+ */
355
+ __CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a);
356
+ /**
357
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
358
+ * \brief Convert a nv_bfloat16 to a signed integer in round-to-nearest-even mode.
359
+ *
360
+ * \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in
361
+ * round-to-nearest-even mode. NaN inputs are converted to 0.
362
+ * \param[in] h - nv_bfloat16. Is only being read.
363
+ *
364
+ * \returns int
365
+ * - \p h converted to a signed integer.
366
+ * \internal
367
+ * \exception-guarantee no-throw guarantee
368
+ * \behavior reentrant, thread safe
369
+ * \endinternal
370
+ */
371
+ __CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h);
372
+ /**
373
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
374
+ * \brief Convert a nv_bfloat16 to a signed integer in round-towards-zero mode.
375
+ *
376
+ * \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in
377
+ * round-towards-zero mode. NaN inputs are converted to 0.
378
+ * \param[in] h - nv_bfloat16. Is only being read.
379
+ *
380
+ * \returns int
381
+ * - \p h converted to a signed integer.
382
+ * \internal
383
+ * \exception-guarantee no-throw guarantee
384
+ * \behavior reentrant, thread safe
385
+ * \endinternal
386
+ */
387
+ __CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h);
388
+ /**
389
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
390
+ * \brief Convert a nv_bfloat16 to a signed integer in round-down mode.
391
+ *
392
+ * \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in
393
+ * round-down mode. NaN inputs are converted to 0.
394
+ * \param[in] h - nv_bfloat16. Is only being read.
395
+ *
396
+ * \returns int
397
+ * - \p h converted to a signed integer.
398
+ * \internal
399
+ * \exception-guarantee no-throw guarantee
400
+ * \behavior reentrant, thread safe
401
+ * \endinternal
402
+ */
403
+ __CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h);
404
+ /**
405
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
406
+ * \brief Convert a nv_bfloat16 to a signed integer in round-up mode.
407
+ *
408
+ * \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in
409
+ * round-up mode. NaN inputs are converted to 0.
410
+ * \param[in] h - nv_bfloat16. Is only being read.
411
+ *
412
+ * \returns int
413
+ * - \p h converted to a signed integer.
414
+ * \internal
415
+ * \exception-guarantee no-throw guarantee
416
+ * \behavior reentrant, thread safe
417
+ * \endinternal
418
+ */
419
+ __CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h);
420
+
421
+ /**
422
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
423
+ * \brief Convert a signed integer to a nv_bfloat16 in round-to-nearest-even mode.
424
+ *
425
+ * \details Convert the signed integer value \p i to a nv_bfloat16 floating-point
426
+ * value in round-to-nearest-even mode.
427
+ * \param[in] i - int. Is only being read.
428
+ *
429
+ * \returns nv_bfloat16
430
+ * - \p i converted to nv_bfloat16.
431
+ * \internal
432
+ * \exception-guarantee no-throw guarantee
433
+ * \behavior reentrant, thread safe
434
+ * \endinternal
435
+ */
436
+ __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i);
437
+ /**
438
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
439
+ * \brief Convert a signed integer to a nv_bfloat16 in round-towards-zero mode.
440
+ *
441
+ * \details Convert the signed integer value \p i to a nv_bfloat16 floating-point
442
+ * value in round-towards-zero mode.
443
+ * \param[in] i - int. Is only being read.
444
+ *
445
+ * \returns nv_bfloat16
446
+ * - \p i converted to nv_bfloat16.
447
+ * \internal
448
+ * \exception-guarantee no-throw guarantee
449
+ * \behavior reentrant, thread safe
450
+ * \endinternal
451
+ */
452
+ __CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i);
453
+ /**
454
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
455
+ * \brief Convert a signed integer to a nv_bfloat16 in round-down mode.
456
+ *
457
+ * \details Convert the signed integer value \p i to a nv_bfloat16 floating-point
458
+ * value in round-down mode.
459
+ * \param[in] i - int. Is only being read.
460
+ *
461
+ * \returns nv_bfloat16
462
+ * - \p i converted to nv_bfloat16.
463
+ * \internal
464
+ * \exception-guarantee no-throw guarantee
465
+ * \behavior reentrant, thread safe
466
+ * \endinternal
467
+ */
468
+ __CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i);
469
+ /**
470
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
471
+ * \brief Convert a signed integer to a nv_bfloat16 in round-up mode.
472
+ *
473
+ * \details Convert the signed integer value \p i to a nv_bfloat16 floating-point
474
+ * value in round-up mode.
475
+ * \param[in] i - int. Is only being read.
476
+ *
477
+ * \returns nv_bfloat16
478
+ * - \p i converted to nv_bfloat16.
479
+ * \internal
480
+ * \exception-guarantee no-throw guarantee
481
+ * \behavior reentrant, thread safe
482
+ * \endinternal
483
+ */
484
+ __CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i);
485
+
486
+ /**
487
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
488
+ * \brief Convert a nv_bfloat16 to a signed short integer in round-to-nearest-even
489
+ * mode.
490
+ *
491
+ * \details Convert the nv_bfloat16 floating-point value \p h to a signed short
492
+ * integer in round-to-nearest-even mode. NaN inputs are converted to 0.
493
+ * \param[in] h - nv_bfloat16. Is only being read.
494
+ *
495
+ * \returns short int
496
+ * - \p h converted to a signed short integer.
497
+ * \internal
498
+ * \exception-guarantee no-throw guarantee
499
+ * \behavior reentrant, thread safe
500
+ * \endinternal
501
+ */
502
+ __CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h);
503
+ /**
504
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
505
+ * \brief Convert a nv_bfloat16 to a signed short integer in round-towards-zero mode.
506
+ *
507
+ * \details Convert the nv_bfloat16 floating-point value \p h to a signed short
508
+ * integer in round-towards-zero mode. NaN inputs are converted to 0.
509
+ * \param[in] h - nv_bfloat16. Is only being read.
510
+ *
511
+ * \returns short int
512
+ * - \p h converted to a signed short integer.
513
+ * \internal
514
+ * \exception-guarantee no-throw guarantee
515
+ * \behavior reentrant, thread safe
516
+ * \endinternal
517
+ */
518
+ __CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h);
519
+ /**
520
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
521
+ * \brief Convert a nv_bfloat16 to a signed short integer in round-down mode.
522
+ *
523
+ * \details Convert the nv_bfloat16 floating-point value \p h to a signed short
524
+ * integer in round-down mode. NaN inputs are converted to 0.
525
+ * \param[in] h - nv_bfloat16. Is only being read.
526
+ *
527
+ * \returns short int
528
+ * - \p h converted to a signed short integer.
529
+ * \internal
530
+ * \exception-guarantee no-throw guarantee
531
+ * \behavior reentrant, thread safe
532
+ * \endinternal
533
+ */
534
+ __CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h);
535
+ /**
536
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
537
+ * \brief Convert a nv_bfloat16 to a signed short integer in round-up mode.
538
+ *
539
+ * \details Convert the nv_bfloat16 floating-point value \p h to a signed short
540
+ * integer in round-up mode. NaN inputs are converted to 0.
541
+ * \param[in] h - nv_bfloat16. Is only being read.
542
+ *
543
+ * \returns short int
544
+ * - \p h converted to a signed short integer.
545
+ * \internal
546
+ * \exception-guarantee no-throw guarantee
547
+ * \behavior reentrant, thread safe
548
+ * \endinternal
549
+ */
550
+ __CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h);
551
+
552
+ /**
553
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
554
+ * \brief Convert a signed short integer to a nv_bfloat16 in round-to-nearest-even
555
+ * mode.
556
+ *
557
+ * \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point
558
+ * value in round-to-nearest-even mode.
559
+ * \param[in] i - short int. Is only being read.
560
+ *
561
+ * \returns nv_bfloat16
562
+ * - \p i converted to nv_bfloat16.
563
+ * \internal
564
+ * \exception-guarantee no-throw guarantee
565
+ * \behavior reentrant, thread safe
566
+ * \endinternal
567
+ */
568
+ __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i);
569
+ /**
570
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
571
+ * \brief Convert a signed short integer to a nv_bfloat16 in round-towards-zero mode.
572
+ *
573
+ * \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point
574
+ * value in round-towards-zero mode.
575
+ * \param[in] i - short int. Is only being read.
576
+ *
577
+ * \returns nv_bfloat16
578
+ * - \p i converted to nv_bfloat16.
579
+ * \internal
580
+ * \exception-guarantee no-throw guarantee
581
+ * \behavior reentrant, thread safe
582
+ * \endinternal
583
+ */
584
+ __CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i);
585
+ /**
586
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
587
+ * \brief Convert a signed short integer to a nv_bfloat16 in round-down mode.
588
+ *
589
+ * \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point
590
+ * value in round-down mode.
591
+ * \param[in] i - short int. Is only being read.
592
+ *
593
+ * \returns nv_bfloat16
594
+ * - \p i converted to nv_bfloat16.
595
+ * \internal
596
+ * \exception-guarantee no-throw guarantee
597
+ * \behavior reentrant, thread safe
598
+ * \endinternal
599
+ */
600
+ __CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i);
601
+ /**
602
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
603
+ * \brief Convert a signed short integer to a nv_bfloat16 in round-up mode.
604
+ *
605
+ * \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point
606
+ * value in round-up mode.
607
+ * \param[in] i - short int. Is only being read.
608
+ *
609
+ * \returns nv_bfloat16
610
+ * - \p i converted to nv_bfloat16.
611
+ * \internal
612
+ * \exception-guarantee no-throw guarantee
613
+ * \behavior reentrant, thread safe
614
+ * \endinternal
615
+ */
616
+ __CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i);
617
+
618
+ /**
619
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
620
+ * \brief Convert a nv_bfloat16 to an unsigned integer in round-to-nearest-even mode.
621
+ *
622
+ * \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer
623
+ * in round-to-nearest-even mode. NaN inputs are converted to 0.
624
+ * \param[in] h - nv_bfloat16. Is only being read.
625
+ *
626
+ * \returns unsigned int
627
+ * - \p h converted to an unsigned integer.
628
+ * \internal
629
+ * \exception-guarantee no-throw guarantee
630
+ * \behavior reentrant, thread safe
631
+ * \endinternal
632
+ */
633
+ __CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h);
634
+ /**
635
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
636
+ * \brief Convert a nv_bfloat16 to an unsigned integer in round-towards-zero mode.
637
+ *
638
+ * \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer
639
+ * in round-towards-zero mode. NaN inputs are converted to 0.
640
+ * \param[in] h - nv_bfloat16. Is only being read.
641
+ *
642
+ * \returns unsigned int
643
+ * - \p h converted to an unsigned integer.
644
+ * \internal
645
+ * \exception-guarantee no-throw guarantee
646
+ * \behavior reentrant, thread safe
647
+ * \endinternal
648
+ */
649
+ __CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h);
650
+ /**
651
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
652
+ * \brief Convert a nv_bfloat16 to an unsigned integer in round-down mode.
653
+ *
654
+ * \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer
655
+ * in round-down mode. NaN inputs are converted to 0.
656
+ * \param[in] h - nv_bfloat16. Is only being read.
657
+ *
658
+ * \returns unsigned int
659
+ * - \p h converted to an unsigned integer.
660
+ * \internal
661
+ * \exception-guarantee no-throw guarantee
662
+ * \behavior reentrant, thread safe
663
+ * \endinternal
664
+ */
665
+ __CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h);
666
+ /**
667
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
668
+ * \brief Convert a nv_bfloat16 to an unsigned integer in round-up mode.
669
+ *
670
+ * \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer
671
+ * in round-up mode. NaN inputs are converted to 0.
672
+ * \param[in] h - nv_bfloat16. Is only being read.
673
+ *
674
+ * \returns unsigned int
675
+ * - \p h converted to an unsigned integer.
676
+ * \internal
677
+ * \exception-guarantee no-throw guarantee
678
+ * \behavior reentrant, thread safe
679
+ * \endinternal
680
+ */
681
+ __CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h);
682
+
683
+ /**
684
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
685
+ * \brief Convert an unsigned integer to a nv_bfloat16 in round-to-nearest-even mode.
686
+ *
687
+ * \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point
688
+ * value in round-to-nearest-even mode.
689
+ * \param[in] i - unsigned int. Is only being read.
690
+ *
691
+ * \returns nv_bfloat16
692
+ * - \p i converted to nv_bfloat16.
693
+ * \internal
694
+ * \exception-guarantee no-throw guarantee
695
+ * \behavior reentrant, thread safe
696
+ * \endinternal
697
+ */
698
+ __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i);
699
+ /**
700
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
701
+ * \brief Convert an unsigned integer to a nv_bfloat16 in round-towards-zero mode.
702
+ *
703
+ * \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point
704
+ * value in round-towards-zero mode.
705
+ * \param[in] i - unsigned int. Is only being read.
706
+ *
707
+ * \returns nv_bfloat16
708
+ * - \p i converted to nv_bfloat16.
709
+ * \internal
710
+ * \exception-guarantee no-throw guarantee
711
+ * \behavior reentrant, thread safe
712
+ * \endinternal
713
+ */
714
+ __CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i);
715
+ /**
716
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
717
+ * \brief Convert an unsigned integer to a nv_bfloat16 in round-down mode.
718
+ *
719
+ * \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point
720
+ * value in round-down mode.
721
+ * \param[in] i - unsigned int. Is only being read.
722
+ *
723
+ * \returns nv_bfloat16
724
+ * - \p i converted to nv_bfloat16.
725
+ * \internal
726
+ * \exception-guarantee no-throw guarantee
727
+ * \behavior reentrant, thread safe
728
+ * \endinternal
729
+ */
730
+ __CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i);
731
+ /**
732
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
733
+ * \brief Convert an unsigned integer to a nv_bfloat16 in round-up mode.
734
+ *
735
+ * \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point
736
+ * value in round-up mode.
737
+ * \param[in] i - unsigned int. Is only being read.
738
+ *
739
+ * \returns nv_bfloat16
740
+ * - \p i converted to nv_bfloat16.
741
+ * \internal
742
+ * \exception-guarantee no-throw guarantee
743
+ * \behavior reentrant, thread safe
744
+ * \endinternal
745
+ */
746
+ __CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i);
747
+
748
+ /**
749
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
750
+ * \brief Convert a nv_bfloat16 to an unsigned short integer in round-to-nearest-even
751
+ * mode.
752
+ *
753
+ * \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short
754
+ * integer in round-to-nearest-even mode. NaN inputs are converted to 0.
755
+ * \param[in] h - nv_bfloat16. Is only being read.
756
+ *
757
+ * \returns unsigned short int
758
+ * - \p h converted to an unsigned short integer.
759
+ * \internal
760
+ * \exception-guarantee no-throw guarantee
761
+ * \behavior reentrant, thread safe
762
+ * \endinternal
763
+ */
764
+ __CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h);
765
+ /**
766
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
767
+ * \brief Convert a nv_bfloat16 to an unsigned short integer in round-towards-zero
768
+ * mode.
769
+ *
770
+ * \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short
771
+ * integer in round-towards-zero mode. NaN inputs are converted to 0.
772
+ * \param[in] h - nv_bfloat16. Is only being read.
773
+ *
774
+ * \returns unsigned short int
775
+ * - \p h converted to an unsigned short integer.
776
+ * \internal
777
+ * \exception-guarantee no-throw guarantee
778
+ * \behavior reentrant, thread safe
779
+ * \endinternal
780
+ */
781
+ __CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h);
782
+ /**
783
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
784
+ * \brief Convert a nv_bfloat16 to an unsigned short integer in round-down mode.
785
+ *
786
+ * \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short
787
+ * integer in round-down mode. NaN inputs are converted to 0.
788
+ * \param[in] h - nv_bfloat16. Is only being read.
789
+ *
790
+ * \returns unsigned short int
791
+ * - \p h converted to an unsigned short integer.
792
+ */
793
+ __CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h);
794
+ /**
795
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
796
+ * \brief Convert a nv_bfloat16 to an unsigned short integer in round-up mode.
797
+ *
798
+ * \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short
799
+ * integer in round-up mode. NaN inputs are converted to 0.
800
+ * \param[in] h - nv_bfloat16. Is only being read.
801
+ *
802
+ * \returns unsigned short int
803
+ * - \p h converted to an unsigned short integer.
804
+ */
805
+ __CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h);
806
+
807
+ /**
808
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
809
+ * \brief Convert an unsigned short integer to a nv_bfloat16 in round-to-nearest-even
810
+ * mode.
811
+ *
812
+ * \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point
813
+ * value in round-to-nearest-even mode.
814
+ * \param[in] i - unsigned short int. Is only being read.
815
+ *
816
+ * \returns nv_bfloat16
817
+ * - \p i converted to nv_bfloat16.
818
+ * \internal
819
+ * \exception-guarantee no-throw guarantee
820
+ * \behavior reentrant, thread safe
821
+ * \endinternal
822
+ */
823
+ __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i);
824
+ /**
825
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
826
+ * \brief Convert an unsigned short integer to a nv_bfloat16 in round-towards-zero
827
+ * mode.
828
+ *
829
+ * \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point
830
+ * value in round-towards-zero mode.
831
+ * \param[in] i - unsigned short int. Is only being read.
832
+ *
833
+ * \returns nv_bfloat16
834
+ * - \p i converted to nv_bfloat16.
835
+ * \internal
836
+ * \exception-guarantee no-throw guarantee
837
+ * \behavior reentrant, thread safe
838
+ * \endinternal
839
+ */
840
+ __CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i);
841
+ /**
842
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
843
+ * \brief Convert an unsigned short integer to a nv_bfloat16 in round-down mode.
844
+ *
845
+ * \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point
846
+ * value in round-down mode.
847
+ * \param[in] i - unsigned short int. Is only being read.
848
+ *
849
+ * \returns nv_bfloat16
850
+ * - \p i converted to nv_bfloat16.
851
+ * \internal
852
+ * \exception-guarantee no-throw guarantee
853
+ * \behavior reentrant, thread safe
854
+ * \endinternal
855
+ */
856
+ __CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i);
857
+ /**
858
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
859
+ * \brief Convert an unsigned short integer to a nv_bfloat16 in round-up mode.
860
+ *
861
+ * \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point
862
+ * value in round-up mode.
863
+ * \param[in] i - unsigned short int. Is only being read.
864
+ *
865
+ * \returns nv_bfloat16
866
+ * - \p i converted to nv_bfloat16.
867
+ * \internal
868
+ * \exception-guarantee no-throw guarantee
869
+ * \behavior reentrant, thread safe
870
+ * \endinternal
871
+ */
872
+ __CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i);
873
+
874
+ /**
875
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
876
+ * \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-to-nearest-even
877
+ * mode.
878
+ *
879
+ * \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit
880
+ * integer in round-to-nearest-even mode. NaN inputs return 0x8000000000000000.
881
+ * \param[in] h - nv_bfloat16. Is only being read.
882
+ *
883
+ * \returns unsigned long long int
884
+ * - \p h converted to an unsigned 64-bit integer.
885
+ * \internal
886
+ * \exception-guarantee no-throw guarantee
887
+ * \behavior reentrant, thread safe
888
+ * \endinternal
889
+ */
890
+ __CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h);
891
+ /**
892
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
893
+ * \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-towards-zero
894
+ * mode.
895
+ *
896
+ * \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit
897
+ * integer in round-towards-zero mode. NaN inputs return 0x8000000000000000.
898
+ * \param[in] h - nv_bfloat16. Is only being read.
899
+ *
900
+ * \returns unsigned long long int
901
+ * - \p h converted to an unsigned 64-bit integer.
902
+ * \internal
903
+ * \exception-guarantee no-throw guarantee
904
+ * \behavior reentrant, thread safe
905
+ * \endinternal
906
+ */
907
+ __CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h);
908
+ /**
909
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
910
+ * \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-down mode.
911
+ *
912
+ * \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit
913
+ * integer in round-down mode. NaN inputs return 0x8000000000000000.
914
+ * \param[in] h - nv_bfloat16. Is only being read.
915
+ *
916
+ * \returns unsigned long long int
917
+ * - \p h converted to an unsigned 64-bit integer.
918
+ * \internal
919
+ * \exception-guarantee no-throw guarantee
920
+ * \behavior reentrant, thread safe
921
+ * \endinternal
922
+ */
923
+ __CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h);
924
+ /**
925
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
926
+ * \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-up mode.
927
+ *
928
+ * \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit
929
+ * integer in round-up mode. NaN inputs return 0x8000000000000000.
930
+ * \param[in] h - nv_bfloat16. Is only being read.
931
+ *
932
+ * \returns unsigned long long int
933
+ * - \p h converted to an unsigned 64-bit integer.
934
+ * \internal
935
+ * \exception-guarantee no-throw guarantee
936
+ * \behavior reentrant, thread safe
937
+ * \endinternal
938
+ */
939
+ __CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h);
940
+
941
+ /**
942
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
943
+ * \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-to-nearest-even
944
+ * mode.
945
+ *
946
+ * \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point
947
+ * value in round-to-nearest-even mode.
948
+ * \param[in] i - unsigned long long int. Is only being read.
949
+ *
950
+ * \returns nv_bfloat16
951
+ * - \p i converted to nv_bfloat16.
952
+ * \internal
953
+ * \exception-guarantee no-throw guarantee
954
+ * \behavior reentrant, thread safe
955
+ * \endinternal
956
+ */
957
+ __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i);
958
+ /**
959
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
960
+ * \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-towards-zero
961
+ * mode.
962
+ *
963
+ * \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point
964
+ * value in round-towards-zero mode.
965
+ * \param[in] i - unsigned long long int. Is only being read.
966
+ *
967
+ * \returns nv_bfloat16
968
+ * - \p i converted to nv_bfloat16.
969
+ * \internal
970
+ * \exception-guarantee no-throw guarantee
971
+ * \behavior reentrant, thread safe
972
+ * \endinternal
973
+ */
974
+ __CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i);
975
+ /**
976
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
977
+ * \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-down mode.
978
+ *
979
+ * \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point
980
+ * value in round-down mode.
981
+ * \param[in] i - unsigned long long int. Is only being read.
982
+ *
983
+ * \returns nv_bfloat16
984
+ * - \p i converted to nv_bfloat16.
985
+ * \internal
986
+ * \exception-guarantee no-throw guarantee
987
+ * \behavior reentrant, thread safe
988
+ * \endinternal
989
+ */
990
+ __CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i);
991
+ /**
992
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
993
+ * \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-up mode.
994
+ *
995
+ * \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point
996
+ * value in round-up mode.
997
+ * \param[in] i - unsigned long long int. Is only being read.
998
+ *
999
+ * \returns nv_bfloat16
1000
+ * - \p i converted to nv_bfloat16.
1001
+ * \internal
1002
+ * \exception-guarantee no-throw guarantee
1003
+ * \behavior reentrant, thread safe
1004
+ * \endinternal
1005
+ */
1006
+ __CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i);
1007
+
1008
+ /**
1009
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1010
+ * \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-to-nearest-even
1011
+ * mode.
1012
+ *
1013
+ * \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit
1014
+ * integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
1015
+ * \param[in] h - nv_bfloat16. Is only being read.
1016
+ *
1017
+ * \returns long long int
1018
+ * - \p h converted to a signed 64-bit integer.
1019
+ * \internal
1020
+ * \exception-guarantee no-throw guarantee
1021
+ * \behavior reentrant, thread safe
1022
+ * \endinternal
1023
+ */
1024
+ __CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h);
1025
+ /**
1026
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1027
+ * \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-towards-zero mode.
1028
+ *
1029
+ * \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit
1030
+ * integer in round-towards-zero mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
1031
+ * \param[in] h - nv_bfloat16. Is only being read.
1032
+ *
1033
+ * \returns long long int
1034
+ * - \p h converted to a signed 64-bit integer.
1035
+ * \internal
1036
+ * \exception-guarantee no-throw guarantee
1037
+ * \behavior reentrant, thread safe
1038
+ * \endinternal
1039
+ */
1040
+ __CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h);
1041
+ /**
1042
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1043
+ * \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-down mode.
1044
+ *
1045
+ * \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit
1046
+ * integer in round-down mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
1047
+ * \param[in] h - nv_bfloat16. Is only being read.
1048
+ *
1049
+ * \returns long long int
1050
+ * - \p h converted to a signed 64-bit integer.
1051
+ * \internal
1052
+ * \exception-guarantee no-throw guarantee
1053
+ * \behavior reentrant, thread safe
1054
+ * \endinternal
1055
+ */
1056
+ __CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h);
1057
+ /**
1058
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1059
+ * \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-up mode.
1060
+ *
1061
+ * \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit
1062
+ * integer in round-up mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
1063
+ * \param[in] h - nv_bfloat16. Is only being read.
1064
+ *
1065
+ * \returns long long int
1066
+ * - \p h converted to a signed 64-bit integer.
1067
+ * \internal
1068
+ * \exception-guarantee no-throw guarantee
1069
+ * \behavior reentrant, thread safe
1070
+ * \endinternal
1071
+ */
1072
+ __CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h);
1073
+
1074
+ /**
1075
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1076
+ * \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-to-nearest-even
1077
+ * mode.
1078
+ *
1079
+ * \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point
1080
+ * value in round-to-nearest-even mode.
1081
+ * \param[in] i - long long int. Is only being read.
1082
+ *
1083
+ * \returns nv_bfloat16
1084
+ * - \p i converted to nv_bfloat16.
1085
+ * \internal
1086
+ * \exception-guarantee no-throw guarantee
1087
+ * \behavior reentrant, thread safe
1088
+ * \endinternal
1089
+ */
1090
+ __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i);
1091
+ /**
1092
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1093
+ * \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-towards-zero mode.
1094
+ *
1095
+ * \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point
1096
+ * value in round-towards-zero mode.
1097
+ * \param[in] i - long long int. Is only being read.
1098
+ *
1099
+ * \returns nv_bfloat16
1100
+ * - \p i converted to nv_bfloat16.
1101
+ */
1102
+ __CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i);
1103
+ /**
1104
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1105
+ * \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-down mode.
1106
+ *
1107
+ * \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point
1108
+ * value in round-down mode.
1109
+ * \param[in] i - long long int. Is only being read.
1110
+ *
1111
+ * \returns nv_bfloat16
1112
+ * - \p i converted to nv_bfloat16.
1113
+ * \internal
1114
+ * \exception-guarantee no-throw guarantee
1115
+ * \behavior reentrant, thread safe
1116
+ * \endinternal
1117
+ */
1118
+ __CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i);
1119
+ /**
1120
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1121
+ * \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-up mode.
1122
+ *
1123
+ * \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point
1124
+ * value in round-up mode.
1125
+ * \param[in] i - long long int. Is only being read.
1126
+ *
1127
+ * \returns nv_bfloat16
1128
+ * - \p i converted to nv_bfloat16.
1129
+ * \internal
1130
+ * \exception-guarantee no-throw guarantee
1131
+ * \behavior reentrant, thread safe
1132
+ * \endinternal
1133
+ */
1134
+ __CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i);
1135
+
1136
+ /**
1137
+ * \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
1138
+ * \brief Truncate input argument to the integral part.
1139
+ *
1140
+ * \details Round \p h to the nearest integer value that does not exceed \p h in
1141
+ * magnitude.
1142
+ * \param[in] h - nv_bfloat16. Is only being read.
1143
+ *
1144
+ * \returns nv_bfloat16
1145
+ * - The truncated integer value.
1146
+ * \internal
1147
+ * \exception-guarantee no-throw guarantee
1148
+ * \behavior reentrant, thread safe
1149
+ * \endinternal
1150
+ */
1151
+ __CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h);
1152
+ /**
1153
+ * \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
1154
+ * \brief Calculate ceiling of the input argument.
1155
+ *
1156
+ * \details Compute the smallest integer value not less than \p h.
1157
+ * \param[in] h - nv_bfloat16. Is only being read.
1158
+ *
1159
+ * \returns nv_bfloat16
1160
+ * - The smallest integer value not less than \p h.
1161
+ * \internal
1162
+ * \exception-guarantee no-throw guarantee
1163
+ * \behavior reentrant, thread safe
1164
+ * \endinternal
1165
+ */
1166
+ __CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h);
1167
+ /**
1168
+ * \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
1169
+ * \brief Calculate the largest integer less than or equal to \p h.
1170
+ *
1171
+ * \details Calculate the largest integer value which is less than or equal to \p h.
1172
+ * \param[in] h - nv_bfloat16. Is only being read.
1173
+ *
1174
+ * \returns nv_bfloat16
1175
+ * - The largest integer value which is less than or equal to \p h.
1176
+ * \internal
1177
+ * \exception-guarantee no-throw guarantee
1178
+ * \behavior reentrant, thread safe
1179
+ * \endinternal
1180
+ */
1181
+ __CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h);
1182
+ /**
1183
+ * \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
1184
+ * \brief Round input to nearest integer value in nv_bfloat16 floating-point
1185
+ * number.
1186
+ *
1187
+ * \details Round \p h to the nearest integer value in nv_bfloat16 floating-point
1188
+ * format, with halfway cases rounded to the nearest even integer value.
1189
+ * \param[in] h - nv_bfloat16. Is only being read.
1190
+ *
1191
+ * \returns nv_bfloat16
1192
+ * - The nearest integer to \p h.
1193
+ * \internal
1194
+ * \exception-guarantee no-throw guarantee
1195
+ * \behavior reentrant, thread safe
1196
+ * \endinternal
1197
+ */
1198
+ __CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h);
1199
+
1200
+ /**
1201
+ * \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
1202
+ * \brief Truncate \p nv_bfloat162 vector input argument to the integral part.
1203
+ *
1204
+ * \details Round each component of vector \p h to the nearest integer value that does
1205
+ * not exceed \p h in magnitude.
1206
+ * \param[in] h - nv_bfloat162. Is only being read.
1207
+ *
1208
+ * \returns nv_bfloat162
1209
+ * - The truncated \p h.
1210
+ * \internal
1211
+ * \exception-guarantee no-throw guarantee
1212
+ * \behavior reentrant, thread safe
1213
+ * \endinternal
1214
+ */
1215
+ __CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h);
1216
+ /**
1217
+ * \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
1218
+ * \brief Calculate \p nv_bfloat162 vector ceiling of the input argument.
1219
+ *
1220
+ * \details For each component of vector \p h compute the smallest integer value not less
1221
+ * than \p h.
1222
+ * \param[in] h - nv_bfloat162. Is only being read.
1223
+ *
1224
+ * \returns nv_bfloat162
1225
+ * - The vector of smallest integers not less than \p h.
1226
+ * \internal
1227
+ * \exception-guarantee no-throw guarantee
1228
+ * \behavior reentrant, thread safe
1229
+ * \endinternal
1230
+ */
1231
+ __CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h);
1232
+ /**
1233
+ * \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
1234
+ * \brief Calculate \p nv_bfloat162 vector floor of the input argument.
1235
+ *
1236
+ * \details For each component of vector \p h calculate the largest integer value which
1237
+ * is less than or equal to \p h.
1238
+ * \param[in] h - nv_bfloat162. Is only being read.
1239
+ *
1240
+ * \returns nv_bfloat162
1241
+ * - The vector of largest integers which is less than or equal to \p h.
1242
+ * \internal
1243
+ * \exception-guarantee no-throw guarantee
1244
+ * \behavior reentrant, thread safe
1245
+ * \endinternal
1246
+ */
1247
+ __CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h);
1248
+ /**
1249
+ * \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
1250
+ * \brief Round input to nearest integer value in nv_bfloat16 floating-point
1251
+ * number.
1252
+ *
1253
+ * \details Round each component of \p nv_bfloat162 vector \p h to the nearest integer value in
1254
+ * nv_bfloat16 floating-point format, with halfway cases rounded to the
1255
+ * nearest even integer value.
1256
+ * \param[in] h - nv_bfloat162. Is only being read.
1257
+ *
1258
+ * \returns nv_bfloat162
1259
+ * - The vector of rounded integer values.
1260
+ * \internal
1261
+ * \exception-guarantee no-throw guarantee
1262
+ * \behavior reentrant, thread safe
1263
+ * \endinternal
1264
+ */
1265
+ __CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h);
1266
+
1267
+ /**
1268
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1269
+ * \brief Returns \p nv_bfloat162 with both halves equal to the input value.
1270
+ *
1271
+ * \details Returns \p nv_bfloat162 number with both halves equal to the input \p a \p nv_bfloat16
1272
+ * number.
1273
+ * \param[in] a - nv_bfloat16. Is only being read.
1274
+ *
1275
+ * \returns nv_bfloat162
1276
+ * - The vector which has both its halves equal to the input \p a.
1277
+ * \internal
1278
+ * \exception-guarantee no-throw guarantee
1279
+ * \behavior reentrant, thread safe
1280
+ * \endinternal
1281
+ */
1282
+ __CUDA_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a);
1283
+ /**
1284
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1285
+ * \brief Swaps both halves of the \p nv_bfloat162 input.
1286
+ *
1287
+ * \details Swaps both halves of the \p nv_bfloat162 input and returns a new \p nv_bfloat162 number
1288
+ * with swapped halves.
1289
+ * \param[in] a - nv_bfloat162. Is only being read.
1290
+ *
1291
+ * \returns nv_bfloat162
1292
+ * - \p a with its halves being swapped.
1293
+ * \internal
1294
+ * \exception-guarantee no-throw guarantee
1295
+ * \behavior reentrant, thread safe
1296
+ * \endinternal
1297
+ */
1298
+ __CUDA_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a);
1299
+ /**
1300
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1301
+ * \brief Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines
1302
+ * into one \p nv_bfloat162 number.
1303
+ *
1304
+ * \details Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines into
1305
+ * one \p nv_bfloat162 number. Low 16 bits from input \p a is stored in low 16 bits of
1306
+ * the return value, low 16 bits from input \p b is stored in high 16 bits of
1307
+ * the return value.
1308
+ * \param[in] a - nv_bfloat162. Is only being read.
1309
+ * \param[in] b - nv_bfloat162. Is only being read.
1310
+ *
1311
+ * \returns nv_bfloat162
1312
+ * - The low 16 bits of \p a and of \p b.
1313
+ * \internal
1314
+ * \exception-guarantee no-throw guarantee
1315
+ * \behavior reentrant, thread safe
1316
+ * \endinternal
1317
+ */
1318
+ __CUDA_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b);
1319
+ /**
1320
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1321
+ * \brief Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and
1322
+ * combines into one \p nv_bfloat162 number.
1323
+ *
1324
+ * \details Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and combines into
1325
+ * one \p nv_bfloat162 number. High 16 bits from input \p a is stored in low 16 bits of
1326
+ * the return value, high 16 bits from input \p b is stored in high 16 bits of
1327
+ * the return value.
1328
+ * \param[in] a - nv_bfloat162. Is only being read.
1329
+ * \param[in] b - nv_bfloat162. Is only being read.
1330
+ *
1331
+ * \returns nv_bfloat162
1332
+ * - The high 16 bits of \p a and of \p b.
1333
+ * \internal
1334
+ * \exception-guarantee no-throw guarantee
1335
+ * \behavior reentrant, thread safe
1336
+ * \endinternal
1337
+ */
1338
+ __CUDA_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b);
1339
+ /**
1340
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1341
+ * \brief Returns high 16 bits of \p nv_bfloat162 input.
1342
+ *
1343
+ * \details Returns high 16 bits of \p nv_bfloat162 input \p a.
1344
+ * \param[in] a - nv_bfloat162. Is only being read.
1345
+ *
1346
+ * \returns nv_bfloat16
1347
+ * - The high 16 bits of the input.
1348
+ * \internal
1349
+ * \exception-guarantee no-throw guarantee
1350
+ * \behavior reentrant, thread safe
1351
+ * \endinternal
1352
+ */
1353
+ __CUDA_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a);
1354
+ /**
1355
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1356
+ * \brief Returns low 16 bits of \p nv_bfloat162 input.
1357
+ *
1358
+ * \details Returns low 16 bits of \p nv_bfloat162 input \p a.
1359
+ * \param[in] a - nv_bfloat162. Is only being read.
1360
+ *
1361
+ * \returns nv_bfloat16
1362
+ * - Returns \p nv_bfloat16 which contains low 16 bits of the input \p a.
1363
+ * \internal
1364
+ * \exception-guarantee no-throw guarantee
1365
+ * \behavior reentrant, thread safe
1366
+ * \endinternal
1367
+ */
1368
+ __CUDA_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a);
1369
+ /**
1370
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
1371
+ * \brief Checks if the input \p nv_bfloat16 number is infinite.
1372
+ *
1373
+ * \details Checks if the input \p nv_bfloat16 number \p a is infinite.
1374
+ * \param[in] a - nv_bfloat16. Is only being read.
1375
+ *
1376
+ * \returns int
1377
+ * - -1 iff \p a is equal to negative infinity,
1378
+ * - 1 iff \p a is equal to positive infinity,
1379
+ * - 0 otherwise.
1380
+ * \internal
1381
+ * \exception-guarantee no-throw guarantee
1382
+ * \behavior reentrant, thread safe
1383
+ * \endinternal
1384
+ */
1385
+ __CUDA_BF16_DECL__ int __hisinf(const __nv_bfloat16 a);
1386
+ /**
1387
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1388
+ * \brief Combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number.
1389
+ *
1390
+ * \details Combines two input \p nv_bfloat16 numbers \p a and \p b into one \p nv_bfloat162 number.
1391
+ * Input \p a is stored in low 16 bits of the return value, input \p b is stored
1392
+ * in high 16 bits of the return value.
1393
+ * \param[in] a - nv_bfloat16. Is only being read.
1394
+ * \param[in] b - nv_bfloat16. Is only being read.
1395
+ *
1396
+ * \returns nv_bfloat162
1397
+ * - The nv_bfloat162 with its low half equal to \p a and its high half equal to \p b.
1398
+ * \internal
1399
+ * \exception-guarantee no-throw guarantee
1400
+ * \behavior reentrant, thread safe
1401
+ * \endinternal
1402
+ */
1403
+ __CUDA_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b);
1404
+ /**
1405
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1406
+ * \brief Extracts low 16 bits from \p nv_bfloat162 input.
1407
+ *
1408
+ * \details Extracts low 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162
1409
+ * number which has both halves equal to the extracted bits.
1410
+ * \param[in] a - nv_bfloat162. Is only being read.
1411
+ *
1412
+ * \returns nv_bfloat162
1413
+ * - The nv_bfloat162 with both halves equal to the low 16 bits of the input.
1414
+ * \internal
1415
+ * \exception-guarantee no-throw guarantee
1416
+ * \behavior reentrant, thread safe
1417
+ * \endinternal
1418
+ */
1419
+ __CUDA_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a);
1420
+ /**
1421
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1422
+ * \brief Extracts high 16 bits from \p nv_bfloat162 input.
1423
+ *
1424
+ * \details Extracts high 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162
1425
+ * number which has both halves equal to the extracted bits.
1426
+ * \param[in] a - nv_bfloat162. Is only being read.
1427
+ *
1428
+ * \returns nv_bfloat162
1429
+ * - The nv_bfloat162 with both halves equal to the high 16 bits of the input.
1430
+ * \internal
1431
+ * \exception-guarantee no-throw guarantee
1432
+ * \behavior reentrant, thread safe
1433
+ * \endinternal
1434
+ */
1435
+ __CUDA_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a);
1436
+
1437
+ /**
1438
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1439
+ * \brief Reinterprets bits in a \p nv_bfloat16 as a signed short integer.
1440
+ *
1441
+ * \details Reinterprets the bits in the nv_bfloat16 floating-point number \p h
1442
+ * as a signed short integer.
1443
+ * \param[in] h - nv_bfloat16. Is only being read.
1444
+ *
1445
+ * \returns short int
1446
+ * - The reinterpreted value.
1447
+ * \internal
1448
+ * \exception-guarantee no-throw guarantee
1449
+ * \behavior reentrant, thread safe
1450
+ * \endinternal
1451
+ */
1452
+ __CUDA_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h);
1453
+ /**
1454
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1455
+ * \brief Reinterprets bits in a \p nv_bfloat16 as an unsigned short integer.
1456
+ *
1457
+ * \details Reinterprets the bits in the nv_bfloat16 floating-point number \p h
1457
+ * as an unsigned short integer.
1459
+ * \param[in] h - nv_bfloat16. Is only being read.
1460
+ *
1461
+ * \returns unsigned short int
1462
+ * - The reinterpreted value.
1463
+ * \internal
1464
+ * \exception-guarantee no-throw guarantee
1465
+ * \behavior reentrant, thread safe
1466
+ * \endinternal
1467
+ */
1468
+ __CUDA_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h);
1469
+ /**
1470
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1471
+ * \brief Reinterprets bits in a signed short integer as a \p nv_bfloat16.
1472
+ *
1473
+ * \details Reinterprets the bits in the signed short integer \p i as a
1474
+ * nv_bfloat16 floating-point number.
1475
+ * \param[in] i - short int. Is only being read.
1476
+ *
1477
+ * \returns nv_bfloat16
1478
+ * - The reinterpreted value.
1479
+ * \internal
1480
+ * \exception-guarantee no-throw guarantee
1481
+ * \behavior reentrant, thread safe
1482
+ * \endinternal
1483
+ */
1484
+ __CUDA_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i);
1485
+ /**
1486
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1487
+ * \brief Reinterprets bits in an unsigned short integer as a \p nv_bfloat16.
1488
+ *
1489
+ * \details Reinterprets the bits in the unsigned short integer \p i as a
1490
+ * nv_bfloat16 floating-point number.
1491
+ * \param[in] i - unsigned short int. Is only being read.
1492
+ *
1493
+ * \returns nv_bfloat16
1494
+ * - The reinterpreted value.
1495
+ * \internal
1496
+ * \exception-guarantee no-throw guarantee
1497
+ * \behavior reentrant, thread safe
1498
+ * \endinternal
1499
+ */
1500
+ __CUDA_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i);
1501
+
1502
+ #if !defined warpSize && !defined __local_warpSize
1503
+ #define warpSize 32
1504
+ #define __local_warpSize
1505
+ #endif
1506
+
1507
+ /**
1508
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1509
+ * \brief Exchange a variable between threads within a warp. Direct copy from indexed thread.
1510
+ *
1511
+ * \details Returns the value of var held by the thread whose ID is given by delta.
1512
+ * If width is less than warpSize then each subsection of the warp behaves as a separate
1513
+ * entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1],
1514
+ * the value returned corresponds to the value of var held by the thread whose ID is delta modulo width (i.e.
1515
+ * within the same subsection). width must have a value which is a power of 2;
1516
+ * results are undefined if width is not a power of 2, or is a number greater than
1517
+ * warpSize.
1518
+ * \param[in] mask - unsigned int. Is only being read.
1519
+ * \param[in] var - nv_bfloat162. Is only being read.
1520
+ * \param[in] delta - int. Is only being read.
1521
+ * \param[in] width - int. Is only being read.
1522
+ *
1523
+ * \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162.
1524
+ * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1525
+ * \note_ref_guide_warp_shuffle
1526
+ * \internal
1527
+ * \exception-guarantee no-throw guarantee
1528
+ * \behavior not reentrant, not thread safe
1529
+ * \endinternal
1530
+ */
1531
+ __CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width = warpSize);
1532
+ /**
1533
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1534
+ * \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller.
1535
+ *
1536
+ * \details Calculates a source thread ID by subtracting delta from the caller's lane ID.
1537
+ * The value of var held by the resulting lane ID is returned: in effect, var is shifted up
1538
+ * the warp by delta threads. If width is less than warpSize then each subsection of the warp
1539
+ * behaves as a separate entity with a starting logical thread ID of 0. The source thread index
1540
+ * will not wrap around the value of width, so effectively the lower delta threads will be unchanged.
1541
+ * width must have a value which is a power of 2; results are undefined if width is not a power of 2,
1542
+ * or is a number greater than warpSize.
1543
+ * \param[in] mask - unsigned int. Is only being read.
1544
+ * \param[in] var - nv_bfloat162. Is only being read.
1545
+ * \param[in] delta - unsigned int. Is only being read.
1546
+ * \param[in] width - int. Is only being read.
1547
+ *
1548
+ * \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162.
1549
+ * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1550
+ * \note_ref_guide_warp_shuffle
1551
+ * \internal
1552
+ * \exception-guarantee no-throw guarantee
1553
+ * \behavior not reentrant, not thread safe
1554
+ * \endinternal
1555
+ */
1556
+ __CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize);
1557
+ /**
1558
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1559
+ * \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller.
1560
+ *
1561
+ * \details Calculates a source thread ID by adding delta to the caller's thread ID.
1562
+ * The value of var held by the resulting thread ID is returned: this has the effect
1563
+ * of shifting var down the warp by delta threads. If width is less than warpSize then
1564
+ * each subsection of the warp behaves as a separate entity with a starting logical
1565
+ * thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread
1566
+ * will not wrap around the value of width and so the upper delta threads
1567
+ * will remain unchanged.
1568
+ * \param[in] mask - unsigned int. Is only being read.
1569
+ * \param[in] var - nv_bfloat162. Is only being read.
1570
+ * \param[in] delta - unsigned int. Is only being read.
1571
+ * \param[in] width - int. Is only being read.
1572
+ *
1573
+ * \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162.
1574
+ * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1575
+ * \note_ref_guide_warp_shuffle
1576
+ * \internal
1577
+ * \exception-guarantee no-throw guarantee
1578
+ * \behavior not reentrant, not thread safe
1579
+ * \endinternal
1580
+ */
1581
+ __CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize);
1582
+ /**
1583
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1584
+ * \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID.
1585
+ *
1586
+ * \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with delta (the lane mask, not the mask argument):
1587
+ * the value of var held by the resulting thread ID is returned. If width is less than warpSize then each
1588
+ * group of width consecutive threads are able to access elements from earlier groups of threads,
1589
+ * however if they attempt to access elements from later groups of threads their own value of var
1590
+ * will be returned. This mode implements a butterfly addressing pattern such as is used in tree
1591
+ * reduction and broadcast.
1592
+ * \param[in] mask - unsigned int. Is only being read.
1593
+ * \param[in] var - nv_bfloat162. Is only being read.
1594
+ * \param[in] delta - int. Is only being read.
1595
+ * \param[in] width - int. Is only being read.
1596
+ *
1597
+ * \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162.
1598
+ * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1599
+ * \note_ref_guide_warp_shuffle
1600
+ * \internal
1601
+ * \exception-guarantee no-throw guarantee
1602
+ * \behavior not reentrant, not thread safe
1603
+ * \endinternal
1604
+ */
1605
+ __CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width = warpSize);
1606
+ /**
1607
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1608
+ * \brief Exchange a variable between threads within a warp. Direct copy from indexed thread.
1609
+ *
1610
+ * \details Returns the value of var held by the thread whose ID is given by delta.
1611
+ * If width is less than warpSize then each subsection of the warp behaves as a separate
1612
+ * entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1],
1613
+ * the value returned corresponds to the value of var held by the thread whose ID is delta modulo width (i.e.
1614
+ * within the same subsection). width must have a value which is a power of 2;
1615
+ * results are undefined if width is not a power of 2, or is a number greater than
1616
+ * warpSize.
1617
+ * \param[in] mask - unsigned int. Is only being read.
1618
+ * \param[in] var - nv_bfloat16. Is only being read.
1619
+ * \param[in] delta - int. Is only being read.
1620
+ * \param[in] width - int. Is only being read.
1621
+ *
1622
+ * \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16.
1623
+ * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1624
+ * \note_ref_guide_warp_shuffle
1625
+ * \internal
1626
+ * \exception-guarantee no-throw guarantee
1627
+ * \behavior not reentrant, not thread safe
1628
+ * \endinternal
1629
+ */
1630
+ __CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width = warpSize);
1631
+ /**
1632
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1633
+ * \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller.
1634
+ * \details Calculates a source thread ID by subtracting delta from the caller's lane ID.
1635
+ * The value of var held by the resulting lane ID is returned: in effect, var is shifted up
1636
+ * the warp by delta threads. If width is less than warpSize then each subsection of the warp
1637
+ * behaves as a separate entity with a starting logical thread ID of 0. The source thread index
1638
+ * will not wrap around the value of width, so effectively the lower delta threads will be unchanged.
1639
+ * width must have a value which is a power of 2; results are undefined if width is not a power of 2,
1640
+ * or is a number greater than warpSize.
1641
+ * \param[in] mask - unsigned int. Is only being read.
1642
+ * \param[in] var - nv_bfloat16. Is only being read.
1643
+ * \param[in] delta - unsigned int. Is only being read.
1644
+ * \param[in] width - int. Is only being read.
1645
+ *
1646
+ * \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16.
1647
+ * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1648
+ * \note_ref_guide_warp_shuffle
1649
+ * \internal
1650
+ * \exception-guarantee no-throw guarantee
1651
+ * \behavior not reentrant, not thread safe
1652
+ * \endinternal
1653
+ */
1654
+ __CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize);
1655
+ /**
1656
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1657
+ * \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller.
1658
+ *
1659
+ * \details Calculates a source thread ID by adding delta to the caller's thread ID.
1660
+ * The value of var held by the resulting thread ID is returned: this has the effect
1661
+ * of shifting var down the warp by delta threads. If width is less than warpSize then
1662
+ * each subsection of the warp behaves as a separate entity with a starting logical
1663
+ * thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread
1664
+ * will not wrap around the value of width and so the upper delta threads
1665
+ * will remain unchanged.
1666
+ * \param[in] mask - unsigned int. Is only being read.
1667
+ * \param[in] var - nv_bfloat16. Is only being read.
1668
+ * \param[in] delta - unsigned int. Is only being read.
1669
+ * \param[in] width - int. Is only being read.
1670
+ *
1671
+ * \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16.
1672
+ * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1673
+ * \note_ref_guide_warp_shuffle
1674
+ * \internal
1675
+ * \exception-guarantee no-throw guarantee
1676
+ * \behavior not reentrant, not thread safe
1677
+ * \endinternal
1678
+ */
1679
+ __CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize);
1680
+ /**
1681
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1682
+ * \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID.
1683
+ *
1684
+ * \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with delta (the lane mask, not the mask argument):
1685
+ * the value of var held by the resulting thread ID is returned. If width is less than warpSize then each
1686
+ * group of width consecutive threads are able to access elements from earlier groups of threads,
1687
+ * however if they attempt to access elements from later groups of threads their own value of var
1688
+ * will be returned. This mode implements a butterfly addressing pattern such as is used in tree
1689
+ * reduction and broadcast.
1690
+ * \param[in] mask - unsigned int. Is only being read.
1691
+ * \param[in] var - nv_bfloat16. Is only being read.
1692
+ * \param[in] delta - int. Is only being read.
1693
+ * \param[in] width - int. Is only being read.
1694
+ *
1695
+ * \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16.
1696
+ * If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned.
1697
+ * \note_ref_guide_warp_shuffle
1698
+ * \internal
1699
+ * \exception-guarantee no-throw guarantee
1700
+ * \behavior not reentrant, not thread safe
1701
+ * \endinternal
1702
+ */
1703
+ __CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width = warpSize);
1704
+
1705
+ #if defined(__local_warpSize)
1706
+ #undef warpSize
1707
+ #undef __local_warpSize
1708
+ #endif
1709
+
1710
+ /**
1711
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1712
+ * \brief Generates a `ld.global.nc` load instruction.
1713
+ * \param[in] ptr - memory location
1714
+ * \returns The value pointed to by `ptr`
1715
+ */
1716
+ __CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const __nv_bfloat162 *const ptr);
1717
+ /**
1718
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1719
+ * \brief Generates a `ld.global.nc` load instruction.
1720
+ * \param[in] ptr - memory location
1721
+ * \returns The value pointed to by `ptr`
1722
+ */
1723
+ __CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr);
1724
+ /**
1725
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1726
+ * \brief Generates a `ld.global.cg` load instruction.
1727
+ * \param[in] ptr - memory location
1728
+ * \returns The value pointed to by `ptr`
1729
+ */
1730
+ __CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr);
1731
+ /**
1732
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1733
+ * \brief Generates a `ld.global.cg` load instruction.
1734
+ * \param[in] ptr - memory location
1735
+ * \returns The value pointed to by `ptr`
1736
+ */
1737
+ __CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr);
1738
+ /**
1739
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1740
+ * \brief Generates a `ld.global.ca` load instruction.
1741
+ * \param[in] ptr - memory location
1742
+ * \returns The value pointed to by `ptr`
1743
+ */
1744
+ __CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const __nv_bfloat162 *const ptr);
1745
+ /**
1746
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1747
+ * \brief Generates a `ld.global.ca` load instruction.
1748
+ * \param[in] ptr - memory location
1749
+ * \returns The value pointed to by `ptr`
1750
+ */
1751
+ __CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr);
1752
+ /**
1753
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1754
+ * \brief Generates a `ld.global.cs` load instruction.
1755
+ * \param[in] ptr - memory location
1756
+ * \returns The value pointed to by `ptr`
1757
+ */
1758
+ __CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const __nv_bfloat162 *const ptr);
1759
+ /**
1760
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1761
+ * \brief Generates a `ld.global.cs` load instruction.
1762
+ * \param[in] ptr - memory location
1763
+ * \returns The value pointed to by `ptr`
1764
+ */
1765
+ __CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr);
1766
+ /**
1767
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1768
+ * \brief Generates a `ld.global.lu` load instruction.
1769
+ * \param[in] ptr - memory location
1770
+ * \returns The value pointed to by `ptr`
1771
+ */
1772
+ __CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const __nv_bfloat162 *const ptr);
1773
+ /**
1774
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1775
+ * \brief Generates a `ld.global.lu` load instruction.
1776
+ * \param[in] ptr - memory location
1777
+ * \returns The value pointed to by `ptr`
1778
+ */
1779
+ __CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr);
1780
+ /**
1781
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1782
+ * \brief Generates a `ld.global.cv` load instruction.
1783
+ * \param[in] ptr - memory location
1784
+ * \returns The value pointed to by `ptr`
1785
+ */
1786
+ __CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const __nv_bfloat162 *const ptr);
1787
+ /**
1788
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1789
+ * \brief Generates a `ld.global.cv` load instruction.
1790
+ * \param[in] ptr - memory location
1791
+ * \returns The value pointed to by `ptr`
1792
+ */
1793
+ __CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr);
1794
+
1795
+ /**
1796
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1797
+ * \brief Generates a `st.global.wb` store instruction.
1798
+ * \param[out] ptr - memory location
1799
+ * \param[in] value - the value to be stored
1800
+ */
1801
+ __CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
1802
+ /**
1803
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1804
+ * \brief Generates a `st.global.wb` store instruction.
1805
+ * \param[out] ptr - memory location
1806
+ * \param[in] value - the value to be stored
1807
+ */
1808
+ __CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
1809
+ /**
1810
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1811
+ * \brief Generates a `st.global.cg` store instruction.
1812
+ * \param[out] ptr - memory location
1813
+ * \param[in] value - the value to be stored
1814
+ */
1815
+ __CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
1816
+ /**
1817
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1818
+ * \brief Generates a `st.global.cg` store instruction.
1819
+ * \param[out] ptr - memory location
1820
+ * \param[in] value - the value to be stored
1821
+ */
1822
+ __CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
1823
+ /**
1824
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1825
+ * \brief Generates a `st.global.cs` store instruction.
1826
+ * \param[out] ptr - memory location
1827
+ * \param[in] value - the value to be stored
1828
+ */
1829
+ __CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
1830
+ /**
1831
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1832
+ * \brief Generates a `st.global.cs` store instruction.
1833
+ * \param[out] ptr - memory location
1834
+ * \param[in] value - the value to be stored
1835
+ */
1836
+ __CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
1837
+ /**
1838
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1839
+ * \brief Generates a `st.global.wt` store instruction.
1840
+ * \param[out] ptr - memory location
1841
+ * \param[in] value - the value to be stored
1842
+ */
1843
+ __CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
1844
+ /**
1845
+ * \ingroup CUDA_MATH__BFLOAT16_MISC
1846
+ * \brief Generates a `st.global.wt` store instruction.
1847
+ * \param[out] ptr - memory location
1848
+ * \param[in] value - the value to be stored
1849
+ */
1850
+ __CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
1851
+
1852
+ /**
1853
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
1854
+ * \brief Performs nv_bfloat162 vector if-equal comparison.
1855
+ *
1856
+ * \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b.
1857
+ * The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
1858
+ * NaN inputs generate false results.
1859
+ * \param[in] a - nv_bfloat162. Is only being read.
1860
+ * \param[in] b - nv_bfloat162. Is only being read.
1861
+ *
1862
+ * \returns nv_bfloat162
1863
+ * - The vector result of if-equal comparison of vectors \p a and \p b.
1864
+ * \internal
1865
+ * \exception-guarantee no-throw guarantee
1866
+ * \behavior reentrant, thread safe
1867
+ * \endinternal
1868
+ */
1869
+ __CUDA_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b);
1870
+ /**
1871
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
1872
+ * \brief Performs \p nv_bfloat162 vector not-equal comparison.
1873
+ *
1874
+ * \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
1875
+ * The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
1876
+ * NaN inputs generate false results.
1877
+ * \param[in] a - nv_bfloat162. Is only being read.
1878
+ * \param[in] b - nv_bfloat162. Is only being read.
1879
+ *
1880
+ * \returns nv_bfloat162
1881
+ * - The vector result of not-equal comparison of vectors \p a and \p b.
1882
+ * \internal
1883
+ * \exception-guarantee no-throw guarantee
1884
+ * \behavior reentrant, thread safe
1885
+ * \endinternal
1886
+ */
1887
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b);
1888
+ /**
1889
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
1890
+ * \brief Performs \p nv_bfloat162 vector less-equal comparison.
1891
+ *
1892
+ * \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
1893
+ * The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
1894
+ * NaN inputs generate false results.
1895
+ * \param[in] a - nv_bfloat162. Is only being read.
1896
+ * \param[in] b - nv_bfloat162. Is only being read.
1897
+ *
1898
+ * \returns nv_bfloat162
1899
+ * - The \p nv_bfloat162 result of less-equal comparison of vectors \p a and \p b.
1900
+ * \internal
1901
+ * \exception-guarantee no-throw guarantee
1902
+ * \behavior reentrant, thread safe
1903
+ * \endinternal
1904
+ */
1905
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b);
1906
+ /**
1907
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
1908
+ * \brief Performs \p nv_bfloat162 vector greater-equal comparison.
1909
+ *
1910
+ * \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
1911
+ * The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
1912
+ * NaN inputs generate false results.
1913
+ * \param[in] a - nv_bfloat162. Is only being read.
1914
+ * \param[in] b - nv_bfloat162. Is only being read.
1915
+ *
1916
+ * \returns nv_bfloat162
1917
+ * - The vector result of greater-equal comparison of vectors \p a and \p b.
1918
+ * \internal
1919
+ * \exception-guarantee no-throw guarantee
1920
+ * \behavior reentrant, thread safe
1921
+ * \endinternal
1922
+ */
1923
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b);
1924
+ /**
1925
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
1926
+ * \brief Performs \p nv_bfloat162 vector less-than comparison.
1927
+ *
1928
+ * \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b.
1929
+ * The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
1930
+ * NaN inputs generate false results.
1931
+ * \param[in] a - nv_bfloat162. Is only being read.
1932
+ * \param[in] b - nv_bfloat162. Is only being read.
1933
+ *
1934
+ * \returns nv_bfloat162
1935
+ * - The nv_bfloat162 vector result of less-than comparison of vectors \p a and \p b.
1936
+ * \internal
1937
+ * \exception-guarantee no-throw guarantee
1938
+ * \behavior reentrant, thread safe
1939
+ * \endinternal
1940
+ */
1941
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
1942
+ /**
1943
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
1944
+ * \brief Performs \p nv_bfloat162 vector greater-than comparison.
1945
+ *
1946
+ * \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b.
1947
+ * The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
1948
+ * NaN inputs generate false results.
1949
+ * \param[in] a - nv_bfloat162. Is only being read.
1950
+ * \param[in] b - nv_bfloat162. Is only being read.
1951
+ *
1952
+ * \returns nv_bfloat162
1953
+ * - The vector result of greater-than comparison of vectors \p a and \p b.
1954
+ * \internal
1955
+ * \exception-guarantee no-throw guarantee
1956
+ * \behavior reentrant, thread safe
1957
+ * \endinternal
1958
+ */
1959
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
1960
+ /**
1961
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
1962
+ * \brief Performs \p nv_bfloat162 vector unordered if-equal comparison.
1963
+ *
1964
+ * \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b.
1965
+ * The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
1966
+ * NaN inputs generate true results.
1967
+ * \param[in] a - nv_bfloat162. Is only being read.
1968
+ * \param[in] b - nv_bfloat162. Is only being read.
1969
+ *
1970
+ * \returns nv_bfloat162
1971
+ * - The vector result of unordered if-equal comparison of vectors \p a and \p b.
1972
+ * \internal
1973
+ * \exception-guarantee no-throw guarantee
1974
+ * \behavior reentrant, thread safe
1975
+ * \endinternal
1976
+ */
1977
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b);
1978
+ /**
1979
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
1980
+ * \brief Performs \p nv_bfloat162 vector unordered not-equal comparison.
1981
+ *
1982
+ * \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
1983
+ * The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
1984
+ * NaN inputs generate true results.
1985
+ * \param[in] a - nv_bfloat162. Is only being read.
1986
+ * \param[in] b - nv_bfloat162. Is only being read.
1987
+ *
1988
+ * \returns nv_bfloat162
1989
+ * - The vector result of unordered not-equal comparison of vectors \p a and \p b.
1990
+ * \internal
1991
+ * \exception-guarantee no-throw guarantee
1992
+ * \behavior reentrant, thread safe
1993
+ * \endinternal
1994
+ */
1995
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
1996
+ /**
1997
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
1998
+ * \brief Performs \p nv_bfloat162 vector unordered less-equal comparison.
1999
+ *
2000
+ * \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
2001
+ * The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
2002
+ * NaN inputs generate true results.
2003
+ * \param[in] a - nv_bfloat162. Is only being read.
2004
+ * \param[in] b - nv_bfloat162. Is only being read.
2005
+ *
2006
+ * \returns nv_bfloat162
2007
+ * - The vector result of unordered less-equal comparison of vectors \p a and \p b.
2008
+ * \internal
2009
+ * \exception-guarantee no-throw guarantee
2010
+ * \behavior reentrant, thread safe
2011
+ * \endinternal
2012
+ */
2013
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2014
+ /**
2015
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
2016
+ * \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison.
2017
+ *
2018
+ * \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
2019
+ * The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
2020
+ * NaN inputs generate true results.
2021
+ * \param[in] a - nv_bfloat162. Is only being read.
2022
+ * \param[in] b - nv_bfloat162. Is only being read.
2023
+ *
2024
+ * \returns nv_bfloat162
2025
+ * - The \p nv_bfloat162 vector result of unordered greater-equal comparison of vectors \p a and \p b.
2026
+ * \internal
2027
+ * \exception-guarantee no-throw guarantee
2028
+ * \behavior reentrant, thread safe
2029
+ * \endinternal
2030
+ */
2031
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2032
+ /**
2033
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
2034
+ * \brief Performs \p nv_bfloat162 vector unordered less-than comparison.
2035
+ *
2036
+ * \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b.
2037
+ * The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
2038
+ * NaN inputs generate true results.
2039
+ * \param[in] a - nv_bfloat162. Is only being read.
2040
+ * \param[in] b - nv_bfloat162. Is only being read.
2041
+ *
2042
+ * \returns nv_bfloat162
2043
+ * - The vector result of unordered less-than comparison of vectors \p a and \p b.
2044
+ * \internal
2045
+ * \exception-guarantee no-throw guarantee
2046
+ * \behavior reentrant, thread safe
2047
+ * \endinternal
2048
+ */
2049
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2050
+ /**
2051
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
2052
+ * \brief Performs \p nv_bfloat162 vector unordered greater-than comparison.
2053
+ *
2054
+ * \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b.
2055
+ * The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
2056
+ * NaN inputs generate true results.
2057
+ * \param[in] a - nv_bfloat162. Is only being read.
2058
+ * \param[in] b - nv_bfloat162. Is only being read.
2059
+ *
2060
+ * \returns nv_bfloat162
2061
+ * - The \p nv_bfloat162 vector result of unordered greater-than comparison of vectors \p a and \p b.
2062
+ * \internal
2063
+ * \exception-guarantee no-throw guarantee
2064
+ * \behavior reentrant, thread safe
2065
+ * \endinternal
2066
+ */
2067
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2068
+ /**
2069
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
2070
+ * \brief Determine whether \p nv_bfloat162 argument is a NaN.
2071
+ *
2072
+ * \details Determine whether each nv_bfloat16 of input \p nv_bfloat162 number \p a is a NaN.
2073
+ * \param[in] a - nv_bfloat162. Is only being read.
2074
+ *
2075
+ * \returns nv_bfloat162
2076
+ * - The nv_bfloat162 with the corresponding \p nv_bfloat16 results set to
2077
+ * 1.0 for NaN, 0.0 otherwise.
2078
+ * \internal
2079
+ * \exception-guarantee no-throw guarantee
2080
+ * \behavior reentrant, thread safe
2081
+ * \endinternal
2082
+ */
2083
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a);
2084
+ /**
2085
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
2086
+ * \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode.
2087
+ *
2088
+ * \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even
2089
+ * mode.
2090
+ * \internal
2091
+ * \req DEEPLEARN-SRM_REQ-95
2092
+ * \endinternal
2093
+ * \param[in] a - nv_bfloat162. Is only being read.
2094
+ * \param[in] b - nv_bfloat162. Is only being read.
2095
+ *
2096
+ * \returns nv_bfloat162
2097
+ * - The sum of vectors \p a and \p b.
2098
+ * \internal
2099
+ * \exception-guarantee no-throw guarantee
2100
+ * \behavior reentrant, thread safe
2101
+ * \endinternal
2102
+ */
2103
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2104
+ /**
2105
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
2106
+ * \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode.
2107
+ *
2108
+ * \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in
2109
+ * round-to-nearest-even mode.
2110
+ * \internal
2111
+ * \req DEEPLEARN-SRM_REQ-104
2112
+ * \endinternal
2113
+ * \param[in] a - nv_bfloat162. Is only being read.
2114
+ * \param[in] b - nv_bfloat162. Is only being read.
2115
+ *
2116
+ * \returns nv_bfloat162
2117
+ * - The subtraction of vector \p b from \p a.
2118
+ * \internal
2119
+ * \exception-guarantee no-throw guarantee
2120
+ * \behavior reentrant, thread safe
2121
+ * \endinternal
2122
+ */
2123
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2124
+ /**
2125
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
2126
+ * \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode.
2127
+ *
2128
+ * \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in
2129
+ * round-to-nearest-even mode.
2130
+ * \internal
2131
+ * \req DEEPLEARN-SRM_REQ-102
2132
+ * \endinternal
2133
+ * \param[in] a - nv_bfloat162. Is only being read.
2134
+ * \param[in] b - nv_bfloat162. Is only being read.
2135
+ *
2136
+ * \returns nv_bfloat162
2137
+ * - The result of elementwise multiplying the vectors \p a and \p b.
2138
+ * \internal
2139
+ * \exception-guarantee no-throw guarantee
2140
+ * \behavior reentrant, thread safe
2141
+ * \endinternal
2142
+ */
2143
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2144
+ /**
2145
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
2146
+ * \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode.
2147
+ *
2148
+ * \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even
2149
+ * mode. Prevents floating-point contractions of mul+add into fma.
2150
+ * \internal
2151
+ * \req DEEPLEARN-SRM_REQ-95
2152
+ * \endinternal
2153
+ * \param[in] a - nv_bfloat162. Is only being read.
2154
+ * \param[in] b - nv_bfloat162. Is only being read.
2155
+ *
2156
+ * \returns nv_bfloat162
2157
+ * - The sum of vectors \p a and \p b.
2158
+ * \internal
2159
+ * \exception-guarantee no-throw guarantee
2160
+ * \behavior reentrant, thread safe
2161
+ * \endinternal
2162
+ */
2163
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b);
2164
+ /**
2165
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
2166
+ * \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode.
2167
+ *
2168
+ * \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in
2169
+ * round-to-nearest-even mode. Prevents floating-point contractions of mul+sub into fma.
2170
+ * \internal
2171
+ * \req DEEPLEARN-SRM_REQ-104
2172
+ * \endinternal
2173
+ * \param[in] a - nv_bfloat162. Is only being read.
2174
+ * \param[in] b - nv_bfloat162. Is only being read.
2175
+ *
2176
+ * \returns nv_bfloat162
2177
+ * - The subtraction of vector \p b from \p a.
2178
+ * \internal
2179
+ * \exception-guarantee no-throw guarantee
2180
+ * \behavior reentrant, thread safe
2181
+ * \endinternal
2182
+ */
2183
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b);
2184
+ /**
2185
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
2186
+ * \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode.
2187
+ *
2188
+ * \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in
2189
+ * round-to-nearest-even mode. Prevents floating-point contractions of mul+add
2190
+ * or sub into fma.
2191
+ * \internal
2192
+ * \req DEEPLEARN-SRM_REQ-102
2193
+ * \endinternal
2194
+ * \param[in] a - nv_bfloat162. Is only being read.
2195
+ * \param[in] b - nv_bfloat162. Is only being read.
2196
+ *
2197
+ * \returns nv_bfloat162
2198
+ * - The result of elementwise multiplying the vectors \p a and \p b.
2199
+ * \internal
2200
+ * \exception-guarantee no-throw guarantee
2201
+ * \behavior reentrant, thread safe
2202
+ * \endinternal
2203
+ */
2204
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b);
2205
+ /**
2206
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
2207
+ * \brief Performs \p nv_bfloat162 vector division in round-to-nearest-even mode.
2208
+ *
2209
+ * \details Divides \p nv_bfloat162 input vector \p a by input vector \p b in round-to-nearest-even
2210
+ * mode.
2211
+ * \internal
2212
+ * \req DEEPLEARN-SRM_REQ-103
2213
+ * \endinternal
2214
+ * \param[in] a - nv_bfloat162. Is only being read.
2215
+ * \param[in] b - nv_bfloat162. Is only being read.
2216
+ *
2217
+ * \returns nv_bfloat162
2218
+ * - The elementwise division of \p a with \p b.
2219
+ * \internal
2220
+ * \exception-guarantee no-throw guarantee
2221
+ * \behavior reentrant, thread safe
2222
+ * \endinternal
2223
+ */
2224
+ __CUDA_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b);
2225
+ /**
2226
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
2227
+ * \brief Calculates the absolute value of both halves of the input \p nv_bfloat162 number and
2228
+ * returns the result.
2229
+ *
2230
+ * \details Calculates the absolute value of both halves of the input \p nv_bfloat162 number and
2231
+ * returns the result.
2232
+ * \param[in] a - nv_bfloat162. Is only being read.
2233
+ *
2234
+ * \returns nv_bfloat162
2235
+ * - Returns \p a with the absolute value of both halves.
2236
+ * \internal
2237
+ * \exception-guarantee no-throw guarantee
2238
+ * \behavior reentrant, thread safe
2239
+ * \endinternal
2240
+ */
2241
+ __CUDA_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a);
2242
+ /**
2243
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
2244
+ * \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode, with
2245
+ * saturation to [0.0, 1.0].
2246
+ *
2247
+ * \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest-even
2248
+ * mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to
2249
+ * +0.0.
2250
+ * \param[in] a - nv_bfloat162. Is only being read.
2251
+ * \param[in] b - nv_bfloat162. Is only being read.
2252
+ *
2253
+ * \returns nv_bfloat162
2254
+ * - The sum of \p a and \p b, with respect to saturation.
2255
+ * \internal
2256
+ * \exception-guarantee no-throw guarantee
2257
+ * \behavior reentrant, thread safe
2258
+ * \endinternal
2259
+ */
2260
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
2261
+ /**
2262
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
2263
+ * \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode,
2264
+ * with saturation to [0.0, 1.0].
2265
+ *
2266
+ * \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in
2267
+ * round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
2268
+ * results are flushed to +0.0.
2269
+ * \param[in] a - nv_bfloat162. Is only being read.
2270
+ * \param[in] b - nv_bfloat162. Is only being read.
2271
+ *
2272
+ * \returns nv_bfloat162
2273
+ * - The subtraction of vector \p b from \p a, with respect to saturation.
2274
+ * \internal
2275
+ * \exception-guarantee no-throw guarantee
2276
+ * \behavior reentrant, thread safe
2277
+ * \endinternal
2278
+ */
2279
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
2280
+ /**
2281
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
2282
+ * \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode,
2283
+ * with saturation to [0.0, 1.0].
2284
+ *
2285
+ * \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in
2286
+ * round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
2287
+ * results are flushed to +0.0.
2288
+ * \param[in] a - nv_bfloat162. Is only being read.
2289
+ * \param[in] b - nv_bfloat162. Is only being read.
2290
+ *
2291
+ * \returns nv_bfloat162
2292
+ * - The result of elementwise multiplication of vectors \p a and \p b,
2293
+ * with respect to saturation.
2294
+ * \internal
2295
+ * \exception-guarantee no-throw guarantee
2296
+ * \behavior reentrant, thread safe
2297
+ * \endinternal
2298
+ */
2299
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
2300
+ /**
2301
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
2302
+ * \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even
2303
+ * mode.
2304
+ *
2305
+ * \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b,
2306
+ * then performs a \p nv_bfloat162 vector add of the result with \p c,
2307
+ * rounding the result once in round-to-nearest-even mode.
2308
+ * \internal
2309
+ * \req DEEPLEARN-SRM_REQ-105
2310
+ * \endinternal
2311
+ * \param[in] a - nv_bfloat162. Is only being read.
2312
+ * \param[in] b - nv_bfloat162. Is only being read.
2313
+ * \param[in] c - nv_bfloat162. Is only being read.
2314
+ *
2315
+ * \returns nv_bfloat162
2316
+ * - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c.
2317
+ * \internal
2318
+ * \exception-guarantee no-throw guarantee
2319
+ * \behavior reentrant, thread safe
2320
+ * \endinternal
2321
+ */
2322
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
2323
+ /**
2324
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
2325
+ * \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even
2326
+ * mode, with saturation to [0.0, 1.0].
2327
+ *
2328
+ * \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b,
2329
+ * then performs a \p nv_bfloat162 vector add of the result with \p c,
2330
+ * rounding the result once in round-to-nearest-even mode, and clamps the
2331
+ * results to range [0.0, 1.0]. NaN results are flushed to +0.0.
2332
+ * \param[in] a - nv_bfloat162. Is only being read.
2333
+ * \param[in] b - nv_bfloat162. Is only being read.
2334
+ * \param[in] c - nv_bfloat162. Is only being read.
2335
+ *
2336
+ * \returns nv_bfloat162
2337
+ * - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c,
2338
+ * with respect to saturation.
2339
+ * \internal
2340
+ * \exception-guarantee no-throw guarantee
2341
+ * \behavior reentrant, thread safe
2342
+ * \endinternal
2343
+ */
2344
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
2345
+ /**
2346
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
2347
+ * \brief Negates both halves of the input \p nv_bfloat162 number and returns the
2348
+ * result.
2349
+ *
2350
+ * \details Negates both halves of the input \p nv_bfloat162 number \p a and returns the result.
2351
+ * \internal
2352
+ * \req DEEPLEARN-SRM_REQ-101
2353
+ * \endinternal
2354
+ * \param[in] a - nv_bfloat162. Is only being read.
2355
+ *
2356
+ * \returns nv_bfloat162
2357
+ * - Returns \p a with both halves negated.
2358
+ * \internal
2359
+ * \exception-guarantee no-throw guarantee
2360
+ * \behavior reentrant, thread safe
2361
+ * \endinternal
2362
+ */
2363
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a);
2364
+ /**
2365
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
2366
+ * \brief Calculates the absolute value of input \p nv_bfloat16 number and returns the result.
2367
+ *
2368
+ * \details Calculates the absolute value of input \p nv_bfloat16 number and returns the result.
2369
+ * \param[in] a - nv_bfloat16. Is only being read.
2370
+ *
2371
+ * \returns nv_bfloat16
2372
+ * - The absolute value of \p a.
2373
+ * \internal
2374
+ * \exception-guarantee no-throw guarantee
2375
+ * \behavior reentrant, thread safe
2376
+ * \endinternal
2377
+ */
2378
+ __CUDA_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a);
2379
+ /**
2380
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
2381
+ * \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode.
2382
+ *
2383
+ * \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even
2384
+ * mode.
2385
+ * \internal
2386
+ * \req DEEPLEARN-SRM_REQ-94
2387
+ * \endinternal
2388
+ * \param[in] a - nv_bfloat16. Is only being read.
2389
+ * \param[in] b - nv_bfloat16. Is only being read.
2390
+ *
2391
+ * \returns nv_bfloat16
2392
+ * - The sum of \p a and \p b.
2393
+ * \internal
2394
+ * \exception-guarantee no-throw guarantee
2395
+ * \behavior reentrant, thread safe
2396
+ * \endinternal
2397
+ */
2398
+ __CUDA_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b);
2399
+ /**
2400
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
2401
+ * \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode.
2402
+ *
2403
+ * \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even
2404
+ * mode.
2405
+ * \internal
2406
+ * \req DEEPLEARN-SRM_REQ-97
2407
+ * \endinternal
2408
+ * \param[in] a - nv_bfloat16. Is only being read.
2409
+ * \param[in] b - nv_bfloat16. Is only being read.
2410
+ *
2411
+ * \returns nv_bfloat16
2412
+ * - The result of subtracting \p b from \p a.
2413
+ * \internal
2414
+ * \exception-guarantee no-throw guarantee
2415
+ * \behavior reentrant, thread safe
2416
+ * \endinternal
2417
+ */
2418
+ __CUDA_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b);
2419
+ /**
2420
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
2421
+ * \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode.
2422
+ *
2423
+ * \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even
2424
+ * mode.
2425
+ * \internal
2426
+ * \req DEEPLEARN-SRM_REQ-99
2427
+ * \endinternal
2428
+ * \param[in] a - nv_bfloat16. Is only being read.
2429
+ * \param[in] b - nv_bfloat16. Is only being read.
2430
+ *
2431
+ * \returns nv_bfloat16
2432
+ * - The result of multiplying \p a and \p b.
2433
+ */
2434
+ __CUDA_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b);
2435
+ /**
2436
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
2437
+ * \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode.
2438
+ *
2439
+ * \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even
2440
+ * mode. Prevents floating-point contractions of mul+add into fma.
2441
+ * \internal
2442
+ * \req DEEPLEARN-SRM_REQ-94
2443
+ * \endinternal
2444
+ * \param[in] a - nv_bfloat16. Is only being read.
2445
+ * \param[in] b - nv_bfloat16. Is only being read.
2446
+ *
2447
+ * \returns nv_bfloat16
2448
+ * - The sum of \p a and \p b.
2449
+ * \internal
2450
+ * \exception-guarantee no-throw guarantee
2451
+ * \behavior reentrant, thread safe
2452
+ * \endinternal
2453
+ */
2454
+ __CUDA_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b);
2455
+ /**
2456
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
2457
+ * \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode.
2458
+ *
2459
+ * \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even
2460
+ * mode. Prevents floating-point contractions of mul+sub into fma.
2461
+ * \internal
2462
+ * \req DEEPLEARN-SRM_REQ-97
2463
+ * \endinternal
2464
+ * \param[in] a - nv_bfloat16. Is only being read.
2465
+ * \param[in] b - nv_bfloat16. Is only being read.
2466
+ *
2467
+ * \returns nv_bfloat16
2468
+ * - The result of subtracting \p b from \p a.
2469
+ * \internal
2470
+ * \exception-guarantee no-throw guarantee
2471
+ * \behavior reentrant, thread safe
2472
+ * \endinternal
2473
+ */
2474
+ __CUDA_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b);
2475
+ /**
2476
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
2477
+ * \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode.
2478
+ *
2479
+ * \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even
2480
+ * mode. Prevents floating-point contractions of mul+add or sub into fma.
2481
+ * \internal
2482
+ * \req DEEPLEARN-SRM_REQ-99
2483
+ * \endinternal
2484
+ * \param[in] a - nv_bfloat16. Is only being read.
2485
+ * \param[in] b - nv_bfloat16. Is only being read.
2486
+ *
2487
+ * \returns nv_bfloat16
2488
+ * - The result of multiplying \p a and \p b.
2489
+ */
2490
+ __CUDA_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b);
2491
+ /**
2492
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
2493
+ * \brief Performs \p nv_bfloat16 division in round-to-nearest-even mode.
2494
+ *
2495
+ * \details Divides \p nv_bfloat16 input \p a by input \p b in round-to-nearest-even
2496
+ * mode.
2497
+ * \internal
2498
+ * \req DEEPLEARN-SRM_REQ-98
2499
+ * \endinternal
2500
+ * \param[in] a - nv_bfloat16. Is only being read.
2501
+ * \param[in] b - nv_bfloat16. Is only being read.
2502
+ *
2503
+ * \returns nv_bfloat16
2504
+ * - The result of dividing \p a by \p b.
2505
+ * \internal
2506
+ * \exception-guarantee no-throw guarantee
2507
+ * \behavior reentrant, thread safe
2508
+ * \endinternal
2509
+ */
2510
+ __CUDA_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b);
2511
+ /**
2512
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
2513
+ * \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode, with
2514
+ * saturation to [0.0, 1.0].
2515
+ *
2516
+ * \details Performs \p nv_bfloat16 add of inputs \p a and \p b, in round-to-nearest-even mode,
2517
+ * and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0.
2518
+ * \param[in] a - nv_bfloat16. Is only being read.
2519
+ * \param[in] b - nv_bfloat16. Is only being read.
2520
+ *
2521
+ * \returns nv_bfloat16
2522
+ * - The sum of \p a and \p b, with respect to saturation.
2523
+ * \internal
2524
+ * \exception-guarantee no-throw guarantee
2525
+ * \behavior reentrant, thread safe
2526
+ * \endinternal
2527
+ */
2528
+ __CUDA_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b);
2529
+ /**
2530
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
2531
+ * \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode, with
2532
+ * saturation to [0.0, 1.0].
2533
+ *
2534
+ * \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest-even
2535
+ * mode, and clamps the result to range [0.0, 1.0].
2536
+ * NaN results are flushed to +0.0.
2537
+ * \param[in] a - nv_bfloat16. Is only being read.
2538
+ * \param[in] b - nv_bfloat16. Is only being read.
2539
+ *
2540
+ * \returns nv_bfloat16
2541
+ * - The result of subtraction of \p b from \p a, with respect to saturation.
2542
+ * \internal
2543
+ * \exception-guarantee no-throw guarantee
2544
+ * \behavior reentrant, thread safe
2545
+ * \endinternal
2546
+ */
2547
+ __CUDA_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b);
2548
+ /**
2549
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
2550
+ * \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode, with
2551
+ * saturation to [0.0, 1.0].
2552
+ *
2553
+ * \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest-even
2554
+ * mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to
2555
+ * +0.0.
2556
+ * \param[in] a - nv_bfloat16. Is only being read.
2557
+ * \param[in] b - nv_bfloat16. Is only being read.
2558
+ *
2559
+ * \returns nv_bfloat16
2560
+ * - The result of multiplying \p a and \p b, with respect to saturation.
2561
+ * \internal
2562
+ * \exception-guarantee no-throw guarantee
2563
+ * \behavior reentrant, thread safe
2564
+ * \endinternal
2565
+ */
2566
+ __CUDA_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b);
2567
+ /**
2568
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
2569
+ * \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode.
2570
+ *
2571
+ * \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b,
2572
+ * then performs a \p nv_bfloat16 add of the result with \p c,
2573
+ * rounding the result once in round-to-nearest-even mode.
2574
+ * \internal
2575
+ * \req DEEPLEARN-SRM_REQ-96
2576
+ * \endinternal
2577
+ * \param[in] a - nv_bfloat16. Is only being read.
2578
+ * \param[in] b - nv_bfloat16. Is only being read.
2579
+ * \param[in] c - nv_bfloat16. Is only being read.
2580
+ *
2581
+ * \returns nv_bfloat16
2582
+ * - The result of fused multiply-add operation on \p
2583
+ * a, \p b, and \p c.
2584
+ * \internal
2585
+ * \exception-guarantee no-throw guarantee
2586
+ * \behavior reentrant, thread safe
2587
+ * \endinternal
2588
+ */
2589
+ __CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
2590
+ /**
2591
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
2592
+ * \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode,
2593
+ * with saturation to [0.0, 1.0].
2594
+ *
2595
+ * \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b,
2596
+ * then performs a \p nv_bfloat16 add of the result with \p c,
2597
+ * rounding the result once in round-to-nearest-even mode, and clamps the result
2598
+ * to range [0.0, 1.0]. NaN results are flushed to +0.0.
2599
+ * \param[in] a - nv_bfloat16. Is only being read.
2600
+ * \param[in] b - nv_bfloat16. Is only being read.
2601
+ * \param[in] c - nv_bfloat16. Is only being read.
2602
+ *
2603
+ * \returns nv_bfloat16
2604
+ * - The result of fused multiply-add operation on \p
2605
+ * a, \p b, and \p c, with respect to saturation.
2606
+ * \internal
2607
+ * \exception-guarantee no-throw guarantee
2608
+ * \behavior reentrant, thread safe
2609
+ * \endinternal
2610
+ */
2611
+ __CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
2612
+ /**
2613
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
2614
+ * \brief Negates input \p nv_bfloat16 number and returns the result.
2615
+ *
2616
+ * \details Negates input \p nv_bfloat16 number and returns the result.
2617
+ * \internal
2618
+ * \req DEEPLEARN-SRM_REQ-100
2619
+ * \endinternal
2620
+ * \param[in] a - nv_bfloat16. Is only being read.
2621
+ *
2622
+ * \returns nv_bfloat16
2623
+ * - The negated value of \p a.
2624
+ * \internal
2625
+ * \exception-guarantee no-throw guarantee
2626
+ * \behavior reentrant, thread safe
2627
+ * \endinternal
2628
+ */
2629
+ __CUDA_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a);
2630
+ /**
2631
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
2632
+ * \brief Performs \p nv_bfloat162 vector if-equal comparison and returns boolean true
2633
+ * iff both \p nv_bfloat16 results are true, boolean false otherwise.
2634
+ *
2635
+ * \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b.
2636
+ * The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons
2637
+ * evaluate to true, or false otherwise.
2638
+ * NaN inputs generate false results.
2639
+ * \param[in] a - nv_bfloat162. Is only being read.
2640
+ * \param[in] b - nv_bfloat162. Is only being read.
2641
+ *
2642
+ * \returns bool
2643
+ * - true if both \p nv_bfloat16 results of if-equal comparison
2644
+ * of vectors \p a and \p b are true;
2645
+ * - false otherwise.
2646
+ * \internal
2647
+ * \exception-guarantee no-throw guarantee
2648
+ * \behavior reentrant, thread safe
2649
+ * \endinternal
2650
+ */
2651
+ __CUDA_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2652
+ /**
2653
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
2654
+ * \brief Performs \p nv_bfloat162 vector not-equal comparison and returns boolean
2655
+ * true iff both \p nv_bfloat16 results are true, boolean false otherwise.
2656
+ *
2657
+ * \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
2658
+ * The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons
2659
+ * evaluate to true, or false otherwise.
2660
+ * NaN inputs generate false results.
2661
+ * \param[in] a - nv_bfloat162. Is only being read.
2662
+ * \param[in] b - nv_bfloat162. Is only being read.
2663
+ *
2664
+ * \returns bool
2665
+ * - true if both \p nv_bfloat16 results of not-equal comparison
2666
+ * of vectors \p a and \p b are true;
2667
+ * - false otherwise.
2668
+ * \internal
2669
+ * \exception-guarantee no-throw guarantee
2670
+ * \behavior reentrant, thread safe
2671
+ * \endinternal
2672
+ */
2673
+ __CUDA_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2674
+ /**
2675
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
2676
+ * \brief Performs \p nv_bfloat162 vector less-equal comparison and returns boolean
2677
+ * true iff both \p nv_bfloat16 results are true, boolean false otherwise.
2678
+ *
2679
+ * \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
2680
+ * The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons
2681
+ * evaluate to true, or false otherwise.
2682
+ * NaN inputs generate false results.
2683
+ * \param[in] a - nv_bfloat162. Is only being read.
2684
+ * \param[in] b - nv_bfloat162. Is only being read.
2685
+ *
2686
+ * \returns bool
2687
+ * - true if both \p nv_bfloat16 results of less-equal comparison
2688
+ * of vectors \p a and \p b are true;
2689
+ * - false otherwise.
2690
+ * \internal
2691
+ * \exception-guarantee no-throw guarantee
2692
+ * \behavior reentrant, thread safe
2693
+ * \endinternal
2694
+ */
2695
+ __CUDA_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2696
+ /**
2697
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
2698
+ * \brief Performs \p nv_bfloat162 vector greater-equal comparison and returns boolean
2699
+ * true iff both \p nv_bfloat16 results are true, boolean false otherwise.
2700
+ *
2701
+ * \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
2702
+ * The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons
2703
+ * evaluate to true, or false otherwise.
2704
+ * NaN inputs generate false results.
2705
+ * \param[in] a - nv_bfloat162. Is only being read.
2706
+ * \param[in] b - nv_bfloat162. Is only being read.
2707
+ *
2708
+ * \returns bool
2709
+ * - true if both \p nv_bfloat16 results of greater-equal
2710
+ * comparison of vectors \p a and \p b are true;
2711
+ * - false otherwise.
2712
+ * \internal
2713
+ * \exception-guarantee no-throw guarantee
2714
+ * \behavior reentrant, thread safe
2715
+ * \endinternal
2716
+ */
2717
+ __CUDA_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2718
+ /**
2719
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
2720
+ * \brief Performs \p nv_bfloat162 vector less-than comparison and returns boolean
2721
+ * true iff both \p nv_bfloat16 results are true, boolean false otherwise.
2722
+ *
2723
+ * \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b.
2724
+ * The bool result is set to true only if both \p nv_bfloat16 less-than comparisons
2725
+ * evaluate to true, or false otherwise.
2726
+ * NaN inputs generate false results.
2727
+ * \param[in] a - nv_bfloat162. Is only being read.
2728
+ * \param[in] b - nv_bfloat162. Is only being read.
2729
+ *
2730
+ * \returns bool
2731
+ * - true if both \p nv_bfloat16 results of less-than comparison
2732
+ * of vectors \p a and \p b are true;
2733
+ * - false otherwise.
2734
+ * \internal
2735
+ * \exception-guarantee no-throw guarantee
2736
+ * \behavior reentrant, thread safe
2737
+ * \endinternal
2738
+ */
2739
+ __CUDA_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2740
+ /**
2741
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
2742
+ * \brief Performs \p nv_bfloat162 vector greater-than comparison and returns boolean
2743
+ * true iff both \p nv_bfloat16 results are true, boolean false otherwise.
2744
+ *
2745
+ * \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b.
2746
+ * The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons
2747
+ * evaluate to true, or false otherwise.
2748
+ * NaN inputs generate false results.
2749
+ * \param[in] a - nv_bfloat162. Is only being read.
2750
+ * \param[in] b - nv_bfloat162. Is only being read.
2751
+ *
2752
+ * \returns bool
2753
+ * - true if both \p nv_bfloat16 results of greater-than
2754
+ * comparison of vectors \p a and \p b are true;
2755
+ * - false otherwise.
2756
+ * \internal
2757
+ * \exception-guarantee no-throw guarantee
2758
+ * \behavior reentrant, thread safe
2759
+ * \endinternal
2760
+ */
2761
+ __CUDA_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2762
+ /**
2763
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
2764
+ * \brief Performs \p nv_bfloat162 vector unordered if-equal comparison and returns
2765
+ * boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise.
2766
+ *
2767
+ * \details Performs \p nv_bfloat162 vector unordered if-equal comparison of inputs \p a and \p b.
2768
+ * The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons
2769
+ * evaluate to true, or false otherwise.
2770
+ * NaN inputs generate true results.
2771
+ * \param[in] a - nv_bfloat162. Is only being read.
2772
+ * \param[in] b - nv_bfloat162. Is only being read.
2773
+ *
2774
+ * \returns bool
2775
+ * - true if both \p nv_bfloat16 results of unordered if-equal
2776
+ * comparison of vectors \p a and \p b are true;
2777
+ * - false otherwise.
2778
+ * \internal
2779
+ * \exception-guarantee no-throw guarantee
2780
+ * \behavior reentrant, thread safe
2781
+ * \endinternal
2782
+ */
2783
+ __CUDA_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2784
+ /**
2785
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
2786
+ * \brief Performs \p nv_bfloat162 vector unordered not-equal comparison and returns
2787
+ * boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise.
2788
+ *
2789
+ * \details Performs \p nv_bfloat162 vector unordered not-equal comparison of inputs \p a and \p b.
2790
+ * The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons
2791
+ * evaluate to true, or false otherwise.
2792
+ * NaN inputs generate true results.
2793
+ * \param[in] a - nv_bfloat162. Is only being read.
2794
+ * \param[in] b - nv_bfloat162. Is only being read.
2795
+ *
2796
+ * \returns bool
2797
+ * - true if both \p nv_bfloat16 results of unordered not-equal
2798
+ * comparison of vectors \p a and \p b are true;
2799
+ * - false otherwise.
2800
+ * \internal
2801
+ * \exception-guarantee no-throw guarantee
2802
+ * \behavior reentrant, thread safe
2803
+ * \endinternal
2804
+ */
2805
+ __CUDA_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2806
+ /**
2807
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
2808
+ * \brief Performs \p nv_bfloat162 vector unordered less-equal comparison and returns
2809
+ * boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise.
2810
+ *
2811
+ * \details Performs \p nv_bfloat162 vector unordered less-equal comparison of inputs \p a and \p b.
2812
+ * The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons
2813
+ * evaluate to true, or false otherwise.
2814
+ * NaN inputs generate true results.
2815
+ * \param[in] a - nv_bfloat162. Is only being read.
2816
+ * \param[in] b - nv_bfloat162. Is only being read.
2817
+ *
2818
+ * \returns bool
2819
+ * - true if both \p nv_bfloat16 results of unordered less-equal
2820
+ * comparison of vectors \p a and \p b are true;
2821
+ * - false otherwise.
2822
+ * \internal
2823
+ * \exception-guarantee no-throw guarantee
2824
+ * \behavior reentrant, thread safe
2825
+ * \endinternal
2826
+ */
2827
+ __CUDA_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2828
+ /**
2829
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
2830
+ * \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison and
2831
+ * returns boolean true iff both \p nv_bfloat16 results are true, boolean false
2832
+ * otherwise.
2833
+ *
2834
+ * \details Performs \p nv_bfloat162 vector unordered greater-equal comparison of inputs \p a and \p b.
2835
+ * The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons
2836
+ * evaluate to true, or false otherwise.
2837
+ * NaN inputs generate true results.
2838
+ * \param[in] a - nv_bfloat162. Is only being read.
2839
+ * \param[in] b - nv_bfloat162. Is only being read.
2840
+ *
2841
+ * \returns bool
2842
+ * - true if both \p nv_bfloat16 results of unordered
2843
+ * greater-equal comparison of vectors \p a and \p b are true;
2844
+ * - false otherwise.
2845
+ * \internal
2846
+ * \exception-guarantee no-throw guarantee
2847
+ * \behavior reentrant, thread safe
2848
+ * \endinternal
2849
+ */
2850
+ __CUDA_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2851
+ /**
2852
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
2853
+ * \brief Performs \p nv_bfloat162 vector unordered less-than comparison and returns
2854
+ * boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise.
2855
+ *
2856
+ * \details Performs \p nv_bfloat162 vector unordered less-than comparison of inputs \p a and \p b.
2857
+ * The bool result is set to true only if both \p nv_bfloat16 less-than comparisons
2858
+ * evaluate to true, or false otherwise.
2859
+ * NaN inputs generate true results.
2860
+ * \param[in] a - nv_bfloat162. Is only being read.
2861
+ * \param[in] b - nv_bfloat162. Is only being read.
2862
+ *
2863
+ * \returns bool
2864
+ * - true if both \p nv_bfloat16 results of unordered less-than comparison of
2865
+ * vectors \p a and \p b are true;
2866
+ * - false otherwise.
2867
+ * \internal
2868
+ * \exception-guarantee no-throw guarantee
2869
+ * \behavior reentrant, thread safe
2870
+ * \endinternal
2871
+ */
2872
+ __CUDA_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2873
+ /**
2874
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
2875
+ * \brief Performs \p nv_bfloat162 vector unordered greater-than comparison and
2876
+ * returns boolean true iff both \p nv_bfloat16 results are true, boolean false
2877
+ * otherwise.
2878
+ *
2879
+ * \details Performs \p nv_bfloat162 vector unordered greater-than comparison of inputs \p a and \p b.
2880
+ * The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons
2881
+ * evaluate to true, or false otherwise.
2882
+ * NaN inputs generate true results.
2883
+ * \param[in] a - nv_bfloat162. Is only being read.
2884
+ * \param[in] b - nv_bfloat162. Is only being read.
2885
+ *
2886
+ * \returns bool
2887
+ * - true if both \p nv_bfloat16 results of unordered
2888
+ * greater-than comparison of vectors \p a and \p b are true;
2889
+ * - false otherwise.
2890
+ * \internal
2891
+ * \exception-guarantee no-throw guarantee
2892
+ * \behavior reentrant, thread safe
2893
+ * \endinternal
2894
+ */
2895
+ __CUDA_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
2896
+ /**
2897
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
2898
+ * \brief Performs \p nv_bfloat16 if-equal comparison.
2899
+ *
2900
+ * \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b.
2901
+ * NaN inputs generate false results.
2902
+ * \param[in] a - nv_bfloat16. Is only being read.
2903
+ * \param[in] b - nv_bfloat16. Is only being read.
2904
+ *
2905
+ * \returns bool
2906
+ * - The boolean result of if-equal comparison of \p a and \p b.
2907
+ * \internal
2908
+ * \exception-guarantee no-throw guarantee
2909
+ * \behavior reentrant, thread safe
2910
+ * \endinternal
2911
+ */
2912
+ __CUDA_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b);
2913
+ /**
2914
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
2915
+ * \brief Performs \p nv_bfloat16 not-equal comparison.
2916
+ *
2917
+ * \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b.
2918
+ * NaN inputs generate false results.
2919
+ * \param[in] a - nv_bfloat16. Is only being read.
2920
+ * \param[in] b - nv_bfloat16. Is only being read.
2921
+ *
2922
+ * \returns bool
2923
+ * - The boolean result of not-equal comparison of \p a and \p b.
2924
+ * \internal
2925
+ * \exception-guarantee no-throw guarantee
2926
+ * \behavior reentrant, thread safe
2927
+ * \endinternal
2928
+ */
2929
+ __CUDA_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b);
2930
+ /**
2931
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
2932
+ * \brief Performs \p nv_bfloat16 less-equal comparison.
2933
+ *
2934
+ * \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b.
2935
+ * NaN inputs generate false results.
2936
+ * \param[in] a - nv_bfloat16. Is only being read.
2937
+ * \param[in] b - nv_bfloat16. Is only being read.
2938
+ *
2939
+ * \returns bool
2940
+ * - The boolean result of less-equal comparison of \p a and \p b.
2941
+ * \internal
2942
+ * \exception-guarantee no-throw guarantee
2943
+ * \behavior reentrant, thread safe
2944
+ * \endinternal
2945
+ */
2946
+ __CUDA_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b);
2947
+ /**
2948
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
2949
+ * \brief Performs \p nv_bfloat16 greater-equal comparison.
2950
+ *
2951
+ * \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b.
2952
+ * NaN inputs generate false results.
2953
+ * \param[in] a - nv_bfloat16. Is only being read.
2954
+ * \param[in] b - nv_bfloat16. Is only being read.
2955
+ *
2956
+ * \returns bool
2957
+ * - The boolean result of greater-equal comparison of \p a and \p b.
2958
+ * \internal
2959
+ * \exception-guarantee no-throw guarantee
2960
+ * \behavior reentrant, thread safe
2961
+ * \endinternal
2962
+ */
2963
+ __CUDA_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b);
2964
+ /**
2965
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
2966
+ * \brief Performs \p nv_bfloat16 less-than comparison.
2967
+ *
2968
+ * \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b.
2969
+ * NaN inputs generate false results.
2970
+ * \param[in] a - nv_bfloat16. Is only being read.
2971
+ * \param[in] b - nv_bfloat16. Is only being read.
2972
+ *
2973
+ * \returns bool
2974
+ * - The boolean result of less-than comparison of \p a and \p b.
2975
+ * \internal
2976
+ * \exception-guarantee no-throw guarantee
2977
+ * \behavior reentrant, thread safe
2978
+ * \endinternal
2979
+ */
2980
+ __CUDA_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b);
2981
+ /**
2982
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
2983
+ * \brief Performs \p nv_bfloat16 greater-than comparison.
2984
+ *
2985
+ * \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b.
2986
+ * NaN inputs generate false results.
2987
+ * \param[in] a - nv_bfloat16. Is only being read.
2988
+ * \param[in] b - nv_bfloat16. Is only being read.
2989
+ *
2990
+ * \returns bool
2991
+ * - The boolean result of greater-than comparison of \p a and \p b.
2992
+ * \internal
2993
+ * \exception-guarantee no-throw guarantee
2994
+ * \behavior reentrant, thread safe
2995
+ * \endinternal
2996
+ */
2997
+ __CUDA_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b);
2998
+ /**
2999
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
3000
+ * \brief Performs \p nv_bfloat16 unordered if-equal comparison.
3001
+ *
3002
+ * \details Performs \p nv_bfloat16 unordered if-equal comparison of inputs \p a and \p b.
3003
+ * NaN inputs generate true results.
3004
+ * \param[in] a - nv_bfloat16. Is only being read.
3005
+ * \param[in] b - nv_bfloat16. Is only being read.
3006
+ *
3007
+ * \returns bool
3008
+ * - The boolean result of unordered if-equal comparison of \p a and
3009
+ * \p b.
3010
+ * \internal
3011
+ * \exception-guarantee no-throw guarantee
3012
+ * \behavior reentrant, thread safe
3013
+ * \endinternal
3014
+ */
3015
+ __CUDA_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b);
3016
+ /**
3017
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
3018
+ * \brief Performs \p nv_bfloat16 unordered not-equal comparison.
3019
+ *
3020
+ * \details Performs \p nv_bfloat16 unordered not-equal comparison of inputs \p a and \p b.
3021
+ * NaN inputs generate true results.
3022
+ * \param[in] a - nv_bfloat16. Is only being read.
3023
+ * \param[in] b - nv_bfloat16. Is only being read.
3024
+ *
3025
+ * \returns bool
3026
+ * - The boolean result of unordered not-equal comparison of \p a and
3027
+ * \p b.
3028
+ * \internal
3029
+ * \exception-guarantee no-throw guarantee
3030
+ * \behavior reentrant, thread safe
3031
+ * \endinternal
3032
+ */
3033
+ __CUDA_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b);
3034
+ /**
3035
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
3036
+ * \brief Performs \p nv_bfloat16 unordered less-equal comparison.
3037
+ *
3038
+ * \details Performs \p nv_bfloat16 unordered less-equal comparison of inputs \p a and \p b.
3039
+ * NaN inputs generate true results.
3040
+ * \param[in] a - nv_bfloat16. Is only being read.
3041
+ * \param[in] b - nv_bfloat16. Is only being read.
3042
+ *
3043
+ * \returns bool
3044
+ * - The boolean result of unordered less-equal comparison of \p a and
3045
+ * \p b.
3046
+ * \internal
3047
+ * \exception-guarantee no-throw guarantee
3048
+ * \behavior reentrant, thread safe
3049
+ * \endinternal
3050
+ */
3051
+ __CUDA_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b);
3052
+ /**
3053
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
3054
+ * \brief Performs \p nv_bfloat16 unordered greater-equal comparison.
3055
+ *
3056
+ * \details Performs \p nv_bfloat16 unordered greater-equal comparison of inputs \p a and \p b.
3057
+ * NaN inputs generate true results.
3058
+ * \param[in] a - nv_bfloat16. Is only being read.
3059
+ * \param[in] b - nv_bfloat16. Is only being read.
3060
+ *
3061
+ * \returns bool
3062
+ * - The boolean result of unordered greater-equal comparison of \p a
3063
+ * and \p b.
3064
+ * \internal
3065
+ * \exception-guarantee no-throw guarantee
3066
+ * \behavior reentrant, thread safe
3067
+ * \endinternal
3068
+ */
3069
+ __CUDA_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b);
3070
+ /**
3071
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
3072
+ * \brief Performs \p nv_bfloat16 unordered less-than comparison.
3073
+ *
3074
+ * \details Performs \p nv_bfloat16 unordered less-than comparison of inputs \p a and \p b.
3075
+ * NaN inputs generate true results.
3076
+ * \param[in] a - nv_bfloat16. Is only being read.
3077
+ * \param[in] b - nv_bfloat16. Is only being read.
3078
+ *
3079
+ * \returns bool
3080
+ * - The boolean result of unordered less-than comparison of \p a and
3081
+ * \p b.
3082
+ * \internal
3083
+ * \exception-guarantee no-throw guarantee
3084
+ * \behavior reentrant, thread safe
3085
+ * \endinternal
3086
+ */
3087
+ __CUDA_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b);
3088
+ /**
3089
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
3090
+ * \brief Performs \p nv_bfloat16 unordered greater-than comparison.
3091
+ *
3092
+ * \details Performs \p nv_bfloat16 unordered greater-than comparison of inputs \p a and \p b.
3093
+ * NaN inputs generate true results.
3094
+ * \param[in] a - nv_bfloat16. Is only being read.
3095
+ * \param[in] b - nv_bfloat16. Is only being read.
3096
+ *
3097
+ * \returns bool
3098
+ * - The boolean result of unordered greater-than comparison of \p a
3099
+ * and \p b.
3100
+ * \internal
3101
+ * \exception-guarantee no-throw guarantee
3102
+ * \behavior reentrant, thread safe
3103
+ * \endinternal
3104
+ */
3105
+ __CUDA_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b);
3106
+ /**
3107
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
3108
+ * \brief Determine whether \p nv_bfloat16 argument is a NaN.
3109
+ *
3110
+ * \details Determine whether \p nv_bfloat16 value \p a is a NaN.
3111
+ * \param[in] a - nv_bfloat16. Is only being read.
3112
+ *
3113
+ * \returns bool
3114
+ * - true iff argument is NaN.
3115
+ * \internal
3116
+ * \exception-guarantee no-throw guarantee
3117
+ * \behavior reentrant, thread safe
3118
+ * \endinternal
3119
+ */
3120
+ __CUDA_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a);
3121
+ /**
3122
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
3123
+ * \brief Calculates \p nv_bfloat16 maximum of two input values.
3124
+ *
3125
+ * \details Calculates \p nv_bfloat16 max(\p a, \p b)
3126
+ * defined as (\p a > \p b) ? \p a : \p b.
3127
+ * - If either of inputs is NaN, the other input is returned.
3128
+ * - If both inputs are NaNs, then canonical NaN is returned.
3129
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
3130
+ * \param[in] a - nv_bfloat16. Is only being read.
3131
+ * \param[in] b - nv_bfloat16. Is only being read.
3132
+ *
3133
+ * \returns nv_bfloat16
3134
+ * \internal
3135
+ * \exception-guarantee no-throw guarantee
3136
+ * \behavior reentrant, thread safe
3137
+ * \endinternal
3138
+ */
3139
+ __CUDA_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b);
3140
+ /**
3141
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
3142
+ * \brief Calculates \p nv_bfloat16 minimum of two input values.
3143
+ *
3144
+ * \details Calculates \p nv_bfloat16 min(\p a, \p b)
3145
+ * defined as (\p a < \p b) ? \p a : \p b.
3146
+ * - If either of inputs is NaN, the other input is returned.
3147
+ * - If both inputs are NaNs, then canonical NaN is returned.
3148
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
3149
+ * \param[in] a - nv_bfloat16. Is only being read.
3150
+ * \param[in] b - nv_bfloat16. Is only being read.
3151
+ *
3152
+ * \returns nv_bfloat16
3153
+ * \internal
3154
+ * \exception-guarantee no-throw guarantee
3155
+ * \behavior reentrant, thread safe
3156
+ * \endinternal
3157
+ */
3158
+ __CUDA_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b);
3159
+ /**
3160
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
3161
+ * \brief Calculates \p nv_bfloat16 maximum of two input values, NaNs pass through.
3162
+ *
3163
+ * \details Calculates \p nv_bfloat16 max(\p a, \p b)
3164
+ * defined as (\p a > \p b) ? \p a : \p b.
3165
+ * - If either of inputs is NaN, then canonical NaN is returned.
3166
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
3167
+ * \param[in] a - nv_bfloat16. Is only being read.
3168
+ * \param[in] b - nv_bfloat16. Is only being read.
3169
+ *
3170
+ * \returns nv_bfloat16
3171
+ * \internal
3172
+ * \exception-guarantee no-throw guarantee
3173
+ * \behavior reentrant, thread safe
3174
+ * \endinternal
3175
+ */
3176
+ __CUDA_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b);
3177
+ /**
3178
+ * \ingroup CUDA_MATH__BFLOAT16_COMPARISON
3179
+ * \brief Calculates \p nv_bfloat16 minimum of two input values, NaNs pass through.
3180
+ *
3181
+ * \details Calculates \p nv_bfloat16 min(\p a, \p b)
3182
+ * defined as (\p a < \p b) ? \p a : \p b.
3183
+ * - If either of inputs is NaN, then canonical NaN is returned.
3184
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
3185
+ * \param[in] a - nv_bfloat16. Is only being read.
3186
+ * \param[in] b - nv_bfloat16. Is only being read.
3187
+ *
3188
+ * \returns nv_bfloat16
3189
+ * \internal
3190
+ * \exception-guarantee no-throw guarantee
3191
+ * \behavior reentrant, thread safe
3192
+ * \endinternal
3193
+ */
3194
+ __CUDA_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b);
3195
+ /**
3196
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
3197
+ * \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode with relu saturation.
3198
+ *
3199
+ * \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b,
3200
+ * then performs a \p nv_bfloat16 add of the result with \p c,
3201
+ * rounding the result once in round-to-nearest-even mode.
3202
+ * A negative result is then clamped to 0.
3203
+ * NaN result is converted to canonical NaN.
3204
+ * \param[in] a - nv_bfloat16. Is only being read.
3205
+ * \param[in] b - nv_bfloat16. Is only being read.
3206
+ * \param[in] c - nv_bfloat16. Is only being read.
3207
+ *
3208
+ * \returns nv_bfloat16
3209
+ * - The result of fused multiply-add operation on \p
3210
+ * a, \p b, and \p c with relu saturation.
3211
+ * \internal
3212
+ * \exception-guarantee no-throw guarantee
3213
+ * \behavior reentrant, thread safe
3214
+ * \endinternal
3215
+ */
3216
+ __CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
3217
+ /**
3218
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
3219
+ * \brief Calculates \p nv_bfloat162 vector maximum of two inputs.
3220
+ *
3221
+ * \details Calculates \p nv_bfloat162 vector max(\p a, \p b).
3222
+ * Elementwise \p nv_bfloat16 operation is defined as
3223
+ * (\p a > \p b) ? \p a : \p b.
3224
+ * - If either of inputs is NaN, the other input is returned.
3225
+ * - If both inputs are NaNs, then canonical NaN is returned.
3226
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
3227
+ * \param[in] a - nv_bfloat162. Is only being read.
3228
+ * \param[in] b - nv_bfloat162. Is only being read.
3229
+ *
3230
+ * \returns nv_bfloat162
3231
+ * - The result of elementwise maximum of vectors \p a and \p b
3232
+ * \internal
3233
+ * \exception-guarantee no-throw guarantee
3234
+ * \behavior reentrant, thread safe
3235
+ * \endinternal
3236
+ */
3237
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b);
3238
+ /**
3239
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
3240
+ * \brief Calculates \p nv_bfloat162 vector minimum of two inputs.
3241
+ *
3242
+ * \details Calculates \p nv_bfloat162 vector min(\p a, \p b).
3243
+ * Elementwise \p nv_bfloat16 operation is defined as
3244
+ * (\p a < \p b) ? \p a : \p b.
3245
+ * - If either of inputs is NaN, the other input is returned.
3246
+ * - If both inputs are NaNs, then canonical NaN is returned.
3247
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
3248
+ * \param[in] a - nv_bfloat162. Is only being read.
3249
+ * \param[in] b - nv_bfloat162. Is only being read.
3250
+ *
3251
+ * \returns nv_bfloat162
3252
+ * - The result of elementwise minimum of vectors \p a and \p b
3253
+ * \internal
3254
+ * \exception-guarantee no-throw guarantee
3255
+ * \behavior reentrant, thread safe
3256
+ * \endinternal
3257
+ */
3258
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b);
3259
+ /**
3260
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
3261
+ * \brief Calculates \p nv_bfloat162 vector maximum of two inputs, NaNs pass through.
3262
+ *
3263
+ * \details Calculates \p nv_bfloat162 vector max(\p a, \p b).
3264
+ * Elementwise \p nv_bfloat16 operation is defined as
3265
+ * (\p a > \p b) ? \p a : \p b.
3266
+ * - If either of inputs is NaN, then canonical NaN is returned.
3267
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
3268
+ * \param[in] a - nv_bfloat162. Is only being read.
3269
+ * \param[in] b - nv_bfloat162. Is only being read.
3270
+ *
3271
+ * \returns nv_bfloat162
3272
+ * - The result of elementwise maximum of vectors \p a and \p b, with NaNs passed through
3273
+ * \internal
3274
+ * \exception-guarantee no-throw guarantee
3275
+ * \behavior reentrant, thread safe
3276
+ * \endinternal
3277
+ */
3278
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b);
3279
+ /**
3280
+ * \ingroup CUDA_MATH__BFLOAT162_COMPARISON
3281
+ * \brief Calculates \p nv_bfloat162 vector minimum of two inputs, NaNs pass through.
3282
+ *
3283
+ * \details Calculates \p nv_bfloat162 vector min(\p a, \p b).
3284
+ * Elementwise \p nv_bfloat16 operation is defined as
3285
+ * (\p a < \p b) ? \p a : \p b.
3286
+ * - If either of inputs is NaN, then canonical NaN is returned.
3287
+ * - If values of both inputs are 0.0, then +0.0 > -0.0
3288
+ * \param[in] a - nv_bfloat162. Is only being read.
3289
+ * \param[in] b - nv_bfloat162. Is only being read.
3290
+ *
3291
+ * \returns nv_bfloat162
3292
+ * - The result of elementwise minimum of vectors \p a and \p b, with NaNs passed through
3293
+ * \internal
3294
+ * \exception-guarantee no-throw guarantee
3295
+ * \behavior reentrant, thread safe
3296
+ * \endinternal
3297
+ */
3298
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b);
3299
+ /**
3300
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
3301
+ * \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even
3302
+ * mode with relu saturation.
3303
+ *
3304
+ * \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b,
3305
+ * then performs a \p nv_bfloat162 vector add of the result with \p c,
3306
+ * rounding the result once in round-to-nearest-even mode.
3307
+ * Any negative result is then clamped to 0.
3308
+ * NaN result is converted to canonical NaN.
3309
+ * \param[in] a - nv_bfloat162. Is only being read.
3310
+ * \param[in] b - nv_bfloat162. Is only being read.
3311
+ * \param[in] c - nv_bfloat162. Is only being read.
3312
+ *
3313
+ * \returns nv_bfloat162
3314
+ * - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation.
3315
+ * \internal
3316
+ * \exception-guarantee no-throw guarantee
3317
+ * \behavior reentrant, thread safe
3318
+ * \endinternal
3319
+ */
3320
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
3321
+ /**
3322
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
3323
+ * \brief Performs fast complex multiply-accumulate
3324
+ *
3325
+ * \details Interprets vector \p nv_bfloat162 input pairs \p a, \p b, and \p c as
3326
+ * complex numbers in \p nv_bfloat16 precision and performs
3327
+ * complex multiply-accumulate operation: a*b + c
3328
+ * \param[in] a - nv_bfloat162. Is only being read.
3329
+ * \param[in] b - nv_bfloat162. Is only being read.
3330
+ * \param[in] c - nv_bfloat162. Is only being read.
3331
+ *
3332
+ * \returns nv_bfloat162
3333
+ * - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c
3334
+ * \internal
3335
+ * \exception-guarantee no-throw guarantee
3336
+ * \behavior reentrant, thread safe
3337
+ * \endinternal
3338
+ */
3339
+ __CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
3340
+
3341
+ /**
3342
+ * \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
3343
+ * \brief Calculates \p nv_bfloat16 square root in round-to-nearest-even mode.
3344
+ *
3345
+ * \details Calculates \p nv_bfloat16 square root of input \p a in round-to-nearest-even mode.
3346
+ * \param[in] a - nv_bfloat16. Is only being read.
3347
+ *
3348
+ * \returns nv_bfloat16
3349
+ * - The square root of \p a.
3350
+ * \internal
3351
+ * \exception-guarantee no-throw guarantee
3352
+ * \behavior reentrant, thread safe
3353
+ * \endinternal
3354
+ */
3355
+ __CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a);
3356
+ /**
3357
+ * \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
3358
+ * \brief Calculates \p nv_bfloat16 reciprocal square root in round-to-nearest-even
3359
+ * mode.
3360
+ *
3361
+ * \details Calculates \p nv_bfloat16 reciprocal square root of input \p a in round-to-nearest-even
3362
+ * mode.
3363
+ * \param[in] a - nv_bfloat16. Is only being read.
3364
+ *
3365
+ * \returns nv_bfloat16
3366
+ * - The reciprocal square root of \p a.
3367
+ * \internal
3368
+ * \exception-guarantee no-throw guarantee
3369
+ * \behavior reentrant, thread safe
3370
+ * \endinternal
3371
+ */
3372
+ __CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a);
3373
+ /**
3374
+ * \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
3375
+ * \brief Calculates \p nv_bfloat16 reciprocal in round-to-nearest-even mode.
3376
+ *
3377
+ * \details Calculates \p nv_bfloat16 reciprocal of input \p a in round-to-nearest-even mode.
3378
+ * \param[in] a - nv_bfloat16. Is only being read.
3379
+ *
3380
+ * \returns nv_bfloat16
3381
+ * - The reciprocal of \p a.
3382
+ * \internal
3383
+ * \exception-guarantee no-throw guarantee
3384
+ * \behavior reentrant, thread safe
3385
+ * \endinternal
3386
+ */
3387
+ __CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a);
3388
+ /**
3389
+ * \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
3390
+ * \brief Calculates \p nv_bfloat16 natural logarithm in round-to-nearest-even mode.
3391
+ *
3392
+ * \details Calculates \p nv_bfloat16 natural logarithm of input \p a in round-to-nearest-even
3393
+ * mode.
3394
+ * \param[in] a - nv_bfloat16. Is only being read.
3395
+ *
3396
+ * \returns nv_bfloat16
3397
+ * - The natural logarithm of \p a.
3398
+ * \internal
3399
+ * \exception-guarantee no-throw guarantee
3400
+ * \behavior reentrant, thread safe
3401
+ * \endinternal
3402
+ */
3403
+ __CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a);
3404
+ /**
3405
+ * \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
3406
+ * \brief Calculates \p nv_bfloat16 binary logarithm in round-to-nearest-even mode.
3407
+ *
3408
+ * \details Calculates \p nv_bfloat16 binary logarithm of input \p a in round-to-nearest-even
3409
+ * mode.
3410
+ * \param[in] a - nv_bfloat16. Is only being read.
3411
+ *
3412
+ * \returns nv_bfloat16
3413
+ * - The binary logarithm of \p a.
3414
+ * \internal
3415
+ * \exception-guarantee no-throw guarantee
3416
+ * \behavior reentrant, thread safe
3417
+ * \endinternal
3418
+ */
3419
+ __CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a);
3420
+ /**
3421
+ * \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
3422
+ * \brief Calculates \p nv_bfloat16 decimal logarithm in round-to-nearest-even mode.
3423
+ *
3424
+ * \details Calculates \p nv_bfloat16 decimal logarithm of input \p a in round-to-nearest-even
3425
+ * mode.
3426
+ * \param[in] a - nv_bfloat16. Is only being read.
3427
+ *
3428
+ * \returns nv_bfloat16
3429
+ * - The decimal logarithm of \p a.
3430
+ * \internal
3431
+ * \exception-guarantee no-throw guarantee
3432
+ * \behavior reentrant, thread safe
3433
+ * \endinternal
3434
+ */
3435
+ __CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a);
3436
+ /**
3437
+ * \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
3438
+ * \brief Calculates \p nv_bfloat16 natural exponential function in round-to-nearest-even
3439
+ * mode.
3440
+ *
3441
+ * \details Calculates \p nv_bfloat16 natural exponential function of input \p a in
3442
+ * round-to-nearest-even mode.
3443
+ * \param[in] a - nv_bfloat16. Is only being read.
3444
+ *
3445
+ * \returns nv_bfloat16
3446
+ * - The natural exponential function on \p a.
3447
+ * \internal
3448
+ * \exception-guarantee no-throw guarantee
3449
+ * \behavior reentrant, thread safe
3450
+ * \endinternal
3451
+ */
3452
+ __CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a);
3453
+ /**
3454
+ * \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
3455
+ * \brief Calculates \p nv_bfloat16 binary exponential function in round-to-nearest-even
3456
+ * mode.
3457
+ *
3458
+ * \details Calculates \p nv_bfloat16 binary exponential function of input \p a in
3459
+ * round-to-nearest-even mode.
3460
+ * \param[in] a - nv_bfloat16. Is only being read.
3461
+ *
3462
+ * \returns nv_bfloat16
3463
+ * - The binary exponential function on \p a.
3464
+ * \internal
3465
+ * \exception-guarantee no-throw guarantee
3466
+ * \behavior reentrant, thread safe
3467
+ * \endinternal
3468
+ */
3469
+ __CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a);
3470
+ /**
3471
+ * \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
3472
+ * \brief Calculates \p nv_bfloat16 decimal exponential function in round-to-nearest-even
3473
+ * mode.
3474
+ *
3475
+ * \details Calculates \p nv_bfloat16 decimal exponential function of input \p a in
3476
+ * round-to-nearest-even mode.
3477
+ * \param[in] a - nv_bfloat16. Is only being read.
3478
+ *
3479
+ * \returns nv_bfloat16
3480
+ * - The decimal exponential function on \p a.
3481
+ * \internal
3482
+ * \exception-guarantee no-throw guarantee
3483
+ * \behavior reentrant, thread safe
3484
+ * \endinternal
3485
+ */
3486
+ __CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a);
3487
+ /**
3488
+ * \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
3489
+ * \brief Calculates \p nv_bfloat16 cosine in round-to-nearest-even mode.
3490
+ *
3491
+ * \details Calculates \p nv_bfloat16 cosine of input \p a in round-to-nearest-even mode.
3492
+ * \param[in] a - nv_bfloat16. Is only being read.
3493
+ *
3494
+ * \returns nv_bfloat16
3495
+ * - The cosine of \p a.
3496
+ * \internal
3497
+ * \exception-guarantee no-throw guarantee
3498
+ * \behavior reentrant, thread safe
3499
+ * \endinternal
3500
+ */
3501
+ __CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a);
3502
+ /**
3503
+ * \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
3504
+ * \brief Calculates \p nv_bfloat16 sine in round-to-nearest-even mode.
3505
+ *
3506
+ * \details Calculates \p nv_bfloat16 sine of input \p a in round-to-nearest-even mode.
3507
+ * \param[in] a - nv_bfloat16. Is only being read.
3508
+ *
3509
+ * \returns nv_bfloat16
3510
+ * - The sine of \p a.
3511
+ * \internal
3512
+ * \exception-guarantee no-throw guarantee
3513
+ * \behavior reentrant, thread safe
3514
+ * \endinternal
3515
+ */
3516
+ __CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a);
3517
+ /**
3518
+ * \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
3519
+ * \brief Calculates \p nv_bfloat162 vector square root in round-to-nearest-even mode.
3520
+ *
3521
+ * \details Calculates \p nv_bfloat162 square root of input vector \p a in round-to-nearest-even
3522
+ * mode.
3523
+ * \param[in] a - nv_bfloat162. Is only being read.
3524
+ *
3525
+ * \returns nv_bfloat162
3526
+ * - The elementwise square root on vector \p a.
3527
+ * \internal
3528
+ * \exception-guarantee no-throw guarantee
3529
+ * \behavior reentrant, thread safe
3530
+ * \endinternal
3531
+ */
3532
+ __CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a);
3533
+ /**
3534
+ * \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
3535
+ * \brief Calculates \p nv_bfloat162 vector reciprocal square root in round-to-nearest-even
3536
+ * mode.
3537
+ *
3538
+ * \details Calculates \p nv_bfloat162 reciprocal square root of input vector \p a in
3539
+ * round-to-nearest-even mode.
3540
+ * \param[in] a - nv_bfloat162. Is only being read.
3541
+ *
3542
+ * \returns nv_bfloat162
3543
+ * - The elementwise reciprocal square root on vector \p a.
3544
+ * \internal
3545
+ * \exception-guarantee no-throw guarantee
3546
+ * \behavior reentrant, thread safe
3547
+ * \endinternal
3548
+ */
3549
+ __CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a);
3550
+ /**
3551
+ * \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
3552
+ * \brief Calculates \p nv_bfloat162 vector reciprocal in round-to-nearest-even mode.
3553
+ *
3554
+ * \details Calculates \p nv_bfloat162 reciprocal of input vector \p a in round-to-nearest-even
3555
+ * mode.
3556
+ * \param[in] a - nv_bfloat162. Is only being read.
3557
+ *
3558
+ * \returns nv_bfloat162
3559
+ * - The elementwise reciprocal on vector \p a.
3560
+ * \internal
3561
+ * \exception-guarantee no-throw guarantee
3562
+ * \behavior reentrant, thread safe
3563
+ * \endinternal
3564
+ */
3565
+ __CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a);
3566
+ /**
3567
+ * \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
3568
+ * \brief Calculates \p nv_bfloat162 vector natural logarithm in round-to-nearest-even
3569
+ * mode.
3570
+ *
3571
+ * \details Calculates \p nv_bfloat162 natural logarithm of input vector \p a in
3572
+ * round-to-nearest-even mode.
3573
+ * \param[in] a - nv_bfloat162. Is only being read.
3574
+ *
3575
+ * \returns nv_bfloat162
3576
+ * - The elementwise natural logarithm on vector \p a.
3577
+ * \internal
3578
+ * \exception-guarantee no-throw guarantee
3579
+ * \behavior reentrant, thread safe
3580
+ * \endinternal
3581
+ */
3582
+ __CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a);
3583
+ /**
3584
+ * \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
3585
+ * \brief Calculates \p nv_bfloat162 vector binary logarithm in round-to-nearest-even
3586
+ * mode.
3587
+ *
3588
+ * \details Calculates \p nv_bfloat162 binary logarithm of input vector \p a in round-to-nearest-even
3589
+ * mode.
3590
+ * \param[in] a - nv_bfloat162. Is only being read.
3591
+ *
3592
+ * \returns nv_bfloat162
3593
+ * - The elementwise binary logarithm on vector \p a.
3594
+ * \internal
3595
+ * \exception-guarantee no-throw guarantee
3596
+ * \behavior reentrant, thread safe
3597
+ * \endinternal
3598
+ */
3599
+ __CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a);
3600
+ /**
3601
+ * \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
3602
+ * \brief Calculates \p nv_bfloat162 vector decimal logarithm in round-to-nearest-even
3603
+ * mode.
3604
+ *
3605
+ * \details Calculates \p nv_bfloat162 decimal logarithm of input vector \p a in
3606
+ * round-to-nearest-even mode.
3607
+ * \param[in] a - nv_bfloat162. Is only being read.
3608
+ *
3609
+ * \returns nv_bfloat162
3610
+ * - The elementwise decimal logarithm on vector \p a.
3611
+ * \internal
3612
+ * \exception-guarantee no-throw guarantee
3613
+ * \behavior reentrant, thread safe
3614
+ * \endinternal
3615
+ */
3616
+ __CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a);
3617
+ /**
3618
+ * \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
3619
+ * \brief Calculates \p nv_bfloat162 vector exponential function in round-to-nearest-even
3620
+ * mode.
3621
+ *
3622
+ * \details Calculates \p nv_bfloat162 exponential function of input vector \p a in
3623
+ * round-to-nearest-even mode.
3624
+ * \param[in] a - nv_bfloat162. Is only being read.
3625
+ *
3626
+ * \returns nv_bfloat162
3627
+ * - The elementwise exponential function on vector \p a.
3628
+ * \internal
3629
+ * \exception-guarantee no-throw guarantee
3630
+ * \behavior reentrant, thread safe
3631
+ * \endinternal
3632
+ */
3633
+ __CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a);
3634
+ /**
3635
+ * \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
3636
+ * \brief Calculates \p nv_bfloat162 vector binary exponential function in
3637
+ * round-to-nearest-even mode.
3638
+ *
3639
+ * \details Calculates \p nv_bfloat162 binary exponential function of input vector \p a in
3640
+ * round-to-nearest-even mode.
3641
+ * \param[in] a - nv_bfloat162. Is only being read.
3642
+ *
3643
+ * \returns nv_bfloat162
3644
+ * - The elementwise binary exponential function on vector \p a.
3645
+ * \internal
3646
+ * \exception-guarantee no-throw guarantee
3647
+ * \behavior reentrant, thread safe
3648
+ * \endinternal
3649
+ */
3650
+ __CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a);
3651
+ /**
3652
+ * \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
3653
+ * \brief Calculates \p nv_bfloat162 vector decimal exponential function in
3654
+ * round-to-nearest-even mode.
3655
+ *
3656
+ * \details Calculates \p nv_bfloat162 decimal exponential function of input vector \p a in
3657
+ * round-to-nearest-even mode.
3658
+ * \param[in] a - nv_bfloat162. Is only being read.
3659
+ *
3660
+ * \returns nv_bfloat162
3661
+ * - The elementwise decimal exponential function on vector \p a.
3662
+ * \internal
3663
+ * \exception-guarantee no-throw guarantee
3664
+ * \behavior reentrant, thread safe
3665
+ * \endinternal
3666
+ */
3667
+ __CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a);
3668
+ /**
3669
+ * \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
3670
+ * \brief Calculates \p nv_bfloat162 vector cosine in round-to-nearest-even mode.
3671
+ *
3672
+ * \details Calculates \p nv_bfloat162 cosine of input vector \p a in round-to-nearest-even
3673
+ * mode.
3674
+ * \param[in] a - nv_bfloat162. Is only being read.
3675
+ *
3676
+ * \returns nv_bfloat162
3677
+ * - The elementwise cosine on vector \p a.
3678
+ * \internal
3679
+ * \exception-guarantee no-throw guarantee
3680
+ * \behavior reentrant, thread safe
3681
+ * \endinternal
3682
+ */
3683
+ __CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a);
3684
+ /**
3685
+ * \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
3686
+ * \brief Calculates \p nv_bfloat162 vector sine in round-to-nearest-even mode.
3687
+ *
3688
+ * \details Calculates \p nv_bfloat162 sine of input vector \p a in round-to-nearest-even mode.
3689
+ * \param[in] a - nv_bfloat162. Is only being read.
3690
+ *
3691
+ * \returns nv_bfloat162
3692
+ * - The elementwise sine on vector \p a.
3693
+ * \internal
3694
+ * \exception-guarantee no-throw guarantee
3695
+ * \behavior reentrant, thread safe
3696
+ * \endinternal
3697
+ */
3698
+ __CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a);
3699
+
3700
+ /**
3701
+ * \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
3702
+ * \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this
3703
+ * value back to \p address. The atomicity of the add operation is guaranteed separately for each of the
3704
+ * two nv_bfloat16 elements; the entire __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access.
3705
+ *
3706
+ * \details The location of \p address must be in global or shared memory. This operation has undefined
3707
+ * behavior otherwise. This operation is only supported by devices of compute capability 8.x and higher.
3708
+ *
3709
+ * \param[in,out] address - __nv_bfloat162*. An address in global or shared memory.
3710
+ * \param[in] val - __nv_bfloat162. The value to be added.
3711
+ *
3712
+ * \returns __nv_bfloat162
3713
+ * - The old value read from \p address.
3714
+ *
3715
+ * \note_ref_guide_atomic
3716
+ */
3717
+ __CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val);
3718
+
3719
+ /**
3720
+ * \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
3721
+ * \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value
3722
+ * back to \p address. This operation is performed in one atomic operation.
3723
+ *
3724
+ * \details The location of \p address must be in global or shared memory. This operation has undefined
3725
+ * behavior otherwise. This operation is only supported by devices of compute capability 8.x and higher.
3726
+ *
3727
+ * \param[in,out] address - __nv_bfloat16*. An address in global or shared memory.
3728
+ * \param[in] val - __nv_bfloat16. The value to be added.
3729
+ *
3730
+ * \returns __nv_bfloat16
3731
+ * - The old value read from \p address.
3732
+ *
3733
+ * \note_ref_guide_atomic
3734
+ */
3735
+ __CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val);
3736
+
3737
+ #endif /* defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) */
3738
+
3739
+ #undef __CUDA_BF16_DECL__
3740
+ #undef __CUDA_HOSTDEVICE_BF16_DECL__
3741
+
3742
+ #endif /* defined(__cplusplus) */
3743
+
3744
+ /* Note the .hpp file is included even for host-side compilation, to capture the "nv_bfloat16" & "nv_bfloat162" definitions */
3745
+ #include "cuda_bf16.hpp"
3746
+ #undef ___CUDA_BF16_STRINGIFY_INNERMOST
3747
+ #undef __CUDA_BF16_STRINGIFY
3748
+
3749
+ #endif /* end of include guard: __CUDA_BF16_H__ */