numba-cuda 0.0.1__py3-none-any.whl → 0.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.13.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.13.dist-info/METADATA +69 -0
  229. numba_cuda-0.0.13.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.13.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.1.dist-info/METADATA +0 -10
  232. numba_cuda-0.0.1.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.13.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1106 @@
1
+ import itertools
2
+ import numpy as np
3
+ import operator
4
+ import re
5
+ from numba import cuda, int64
6
+ from numba.cuda import compile_ptx
7
+ from numba.core.errors import TypingError
8
+ from numba.core.types import f2
9
+ from numba.cuda.testing import (unittest, CUDATestCase, skip_on_cudasim,
10
+ skip_unless_cc_53)
11
+
12
+
13
def simple_threadidx(ary):
    """Store the calling thread's x index in ``ary[0]``."""
    ary[0] = cuda.threadIdx.x
16
+
17
+
18
def fill_threadidx(ary):
    """Write each thread's x index into the element of ``ary`` at that index."""
    tid = cuda.threadIdx.x
    ary[tid] = tid
21
+
22
+
23
def fill3d_threadidx(ary):
    """Set ``ary[x, y, z]`` to ``(x + 1) * (y + 1) * (z + 1)``.

    ``x``, ``y`` and ``z`` are the components of ``cuda.threadIdx``.
    """
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    tz = cuda.threadIdx.z

    ary[tx, ty, tz] = (tx + 1) * (ty + 1) * (tz + 1)
29
+
30
+
31
def simple_grid1d(ary):
    """Store the value of ``cuda.grid(1)`` at that same position of ``ary``."""
    pos = cuda.grid(1)
    ary[pos] = pos
34
+
35
+
36
def simple_grid2d(ary):
    """Set ``ary[i, j]`` to ``i + j`` where ``(i, j)`` is ``cuda.grid(2)``."""
    first, second = cuda.grid(2)
    ary[first, second] = first + second
39
+
40
+
41
def simple_gridsize1d(ary):
    """Have the thread at grid position 0 store ``cuda.gridsize(1)`` in
    ``ary[0]``."""
    pos = cuda.grid(1)
    total = cuda.gridsize(1)
    if pos != 0:
        return
    ary[0] = total
46
+
47
+
48
def simple_gridsize2d(ary):
    """Have the thread at grid position (0, 0) store the two components of
    ``cuda.gridsize(2)`` in ``ary[0]`` and ``ary[1]``."""
    pos_x, pos_y = cuda.grid(2)
    size_x, size_y = cuda.gridsize(2)
    if pos_x != 0 or pos_y != 0:
        return
    ary[0] = size_x
    ary[1] = size_y
54
+
55
+
56
def intrinsic_forloop_step(c):
    """Fill ``c[y, x]`` with ``x + y`` using loops whose step comes from
    intrinsics.

    Each thread starts at its ``cuda.grid(2)`` position and advances by
    ``gridDim * blockDim`` along each axis until it leaves the array.
    """
    first_x, first_y = cuda.grid(2)
    step_x = cuda.gridDim.x * cuda.blockDim.x
    step_y = cuda.gridDim.y * cuda.blockDim.y
    n_rows, n_cols = c.shape

    for col in range(first_x, n_cols, step_x):
        for row in range(first_y, n_rows, step_y):
            c[row, col] = col + row
65
+
66
+
67
def simple_popc(ary, c):
    """Store the result of ``cuda.popc(c)`` in ``ary[0]``."""
    bit_count = cuda.popc(c)
    ary[0] = bit_count
69
+
70
+
71
def simple_fma(ary, a, b, c):
    """Store the result of ``cuda.fma(a, b, c)`` in ``ary[0]``."""
    fused = cuda.fma(a, b, c)
    ary[0] = fused
73
+
74
+
75
def simple_hadd(ary, a, b):
    """Store ``cuda.fp16.hadd`` of the first elements of ``a`` and ``b`` in
    ``ary[0]``."""
    lhs = a[0]
    rhs = b[0]
    ary[0] = cuda.fp16.hadd(lhs, rhs)
77
+
78
+
79
def simple_hadd_scalar(ary, a, b):
    """Store ``cuda.fp16.hadd(a, b)`` for scalar operands in ``ary[0]``."""
    total = cuda.fp16.hadd(a, b)
    ary[0] = total
81
+
82
+
83
def simple_hfma(ary, a, b, c):
    """Store ``cuda.fp16.hfma`` of the first elements of ``a``, ``b`` and
    ``c`` in ``ary[0]``."""
    x = a[0]
    y = b[0]
    z = c[0]
    ary[0] = cuda.fp16.hfma(x, y, z)
85
+
86
+
87
def simple_hfma_scalar(ary, a, b, c):
    """Store ``cuda.fp16.hfma(a, b, c)`` for scalar operands in ``ary[0]``."""
    fused = cuda.fp16.hfma(a, b, c)
    ary[0] = fused
89
+
90
+
91
def simple_hsub(ary, a, b):
    """Store ``cuda.fp16.hsub`` of the first elements of ``a`` and ``b`` in
    ``ary[0]``."""
    lhs = a[0]
    rhs = b[0]
    ary[0] = cuda.fp16.hsub(lhs, rhs)
93
+
94
+
95
def simple_hsub_scalar(ary, a, b):
    """Store ``cuda.fp16.hsub(a, b)`` for scalar operands in ``ary[0]``."""
    difference = cuda.fp16.hsub(a, b)
    ary[0] = difference
97
+
98
+
99
def simple_hmul(ary, a, b):
    """Store ``cuda.fp16.hmul`` of the first elements of ``a`` and ``b`` in
    ``ary[0]``."""
    lhs = a[0]
    rhs = b[0]
    ary[0] = cuda.fp16.hmul(lhs, rhs)
101
+
102
+
103
def simple_hmul_scalar(ary, a, b):
    """Store ``cuda.fp16.hmul(a, b)`` for scalar operands in ``ary[0]``."""
    product = cuda.fp16.hmul(a, b)
    ary[0] = product
105
+
106
+
107
def simple_hdiv_scalar(ary, a, b):
    """Store ``cuda.fp16.hdiv(a, b)`` for scalar operands in ``ary[0]``."""
    quotient = cuda.fp16.hdiv(a, b)
    ary[0] = quotient
109
+
110
+
111
def simple_hdiv_kernel(ary, array_a, array_b):
    """Element-wise ``cuda.fp16.hdiv`` of ``array_a`` by ``array_b`` into
    ``ary``.

    Threads whose grid position is not below ``ary.size`` do nothing.
    """
    idx = cuda.grid(1)
    if idx >= ary.size:
        return
    numerator = array_a[idx]
    denominator = array_b[idx]
    ary[idx] = cuda.fp16.hdiv(numerator, denominator)
117
+
118
+
119
def simple_hneg(ary, a):
    """Store ``cuda.fp16.hneg`` of the first element of ``a`` in ``ary[0]``."""
    operand = a[0]
    ary[0] = cuda.fp16.hneg(operand)
121
+
122
+
123
def simple_hneg_scalar(ary, a):
    """Store ``cuda.fp16.hneg(a)`` for a scalar operand in ``ary[0]``."""
    negated = cuda.fp16.hneg(a)
    ary[0] = negated
125
+
126
+
127
def simple_habs(ary, a):
    """Store ``cuda.fp16.habs`` of the first element of ``a`` in ``ary[0]``."""
    operand = a[0]
    ary[0] = cuda.fp16.habs(operand)
129
+
130
+
131
def simple_habs_scalar(ary, a):
    """Store ``cuda.fp16.habs(a)`` for a scalar operand in ``ary[0]``."""
    magnitude = cuda.fp16.habs(a)
    ary[0] = magnitude
133
+
134
+
135
def simple_heq_scalar(ary, a, b):
    """Store the result of the predicate ``cuda.fp16.heq(a, b)`` in
    ``ary[0]``."""
    outcome = cuda.fp16.heq(a, b)
    ary[0] = outcome
137
+
138
+
139
def simple_hne_scalar(ary, a, b):
    """Store the result of the predicate ``cuda.fp16.hne(a, b)`` in
    ``ary[0]``."""
    outcome = cuda.fp16.hne(a, b)
    ary[0] = outcome
141
+
142
+
143
def simple_hge_scalar(ary, a, b):
    """Store the result of the predicate ``cuda.fp16.hge(a, b)`` in
    ``ary[0]``."""
    outcome = cuda.fp16.hge(a, b)
    ary[0] = outcome
145
+
146
+
147
def simple_hgt_scalar(ary, a, b):
    """Store the result of the predicate ``cuda.fp16.hgt(a, b)`` in
    ``ary[0]``."""
    outcome = cuda.fp16.hgt(a, b)
    ary[0] = outcome
149
+
150
+
151
def simple_hle_scalar(ary, a, b):
    """Store the result of the predicate ``cuda.fp16.hle(a, b)`` in
    ``ary[0]``."""
    outcome = cuda.fp16.hle(a, b)
    ary[0] = outcome
153
+
154
+
155
def simple_hlt_scalar(ary, a, b):
    """Store the result of the predicate ``cuda.fp16.hlt(a, b)`` in
    ``ary[0]``."""
    outcome = cuda.fp16.hlt(a, b)
    ary[0] = outcome
157
+
158
+
159
@cuda.jit(device=True)
def hlt_func_1(x, y):
    """Device function returning ``cuda.fp16.hlt(x, y)``."""
    is_less = cuda.fp16.hlt(x, y)
    return is_less
162
+
163
+
164
@cuda.jit(device=True)
def hlt_func_2(x, y):
    """Second device function returning ``cuda.fp16.hlt(x, y)``.

    Deliberately a separate function with the same body as ``hlt_func_1``.
    """
    is_less = cuda.fp16.hlt(x, y)
    return is_less
167
+
168
+
169
def test_multiple_hcmp_1(r, a, b, c):
    """Use float16 predicates through two separate device functions.

    ``r[0]`` receives ``hlt_func_1(a, b) and hlt_func_2(b, c)``.
    """
    first = hlt_func_1(a, b)
    r[0] = first and hlt_func_2(b, c)
172
+
173
+
174
def test_multiple_hcmp_2(r, a, b, c):
    """Use the same float16 predicate in the caller and in a callee.

    ``r[0]`` receives ``hlt_func_1(a, b) and cuda.fp16.hlt(b, c)``.
    """
    first = hlt_func_1(a, b)
    r[0] = first and cuda.fp16.hlt(b, c)
177
+
178
+
179
def test_multiple_hcmp_3(r, a, b, c):
    """Use different float16 predicates in the caller and in a callee.

    ``r[0]`` receives ``hlt_func_1(a, b) and cuda.fp16.hge(c, b)``.
    """
    first = hlt_func_1(a, b)
    r[0] = first and cuda.fp16.hge(c, b)
182
+
183
+
184
def test_multiple_hcmp_4(r, a, b, c):
    """Use the same float16 predicate twice within one function.

    ``r[0]`` receives ``cuda.fp16.hlt(a, b) and cuda.fp16.hlt(b, c)``.
    """
    first = cuda.fp16.hlt(a, b)
    r[0] = first and cuda.fp16.hlt(b, c)
187
+
188
+
189
def test_multiple_hcmp_5(r, a, b, c):
    """Use two different float16 predicates within one function.

    ``r[0]`` receives ``cuda.fp16.hlt(a, b) and cuda.fp16.hge(c, b)``.
    """
    first = cuda.fp16.hlt(a, b)
    r[0] = first and cuda.fp16.hge(c, b)
192
+
193
+
194
def simple_hmax_scalar(ary, a, b):
    """Store ``cuda.fp16.hmax(a, b)`` in ``ary[0]``."""
    larger = cuda.fp16.hmax(a, b)
    ary[0] = larger
196
+
197
+
198
def simple_hmin_scalar(ary, a, b):
    """Store ``cuda.fp16.hmin(a, b)`` in ``ary[0]``."""
    smaller = cuda.fp16.hmin(a, b)
    ary[0] = smaller
200
+
201
+
202
def simple_hsin(r, x):
    """Element-wise ``cuda.fp16.hsin`` of ``x`` into ``r``.

    Threads positioned at or beyond ``len(r)`` do nothing.
    """
    idx = cuda.grid(1)
    if idx >= len(r):
        return
    r[idx] = cuda.fp16.hsin(x[idx])
207
+
208
+
209
def simple_hcos(r, x):
    """Element-wise ``cuda.fp16.hcos`` of ``x`` into ``r``.

    Threads positioned at or beyond ``len(r)`` do nothing.
    """
    idx = cuda.grid(1)
    if idx >= len(r):
        return
    r[idx] = cuda.fp16.hcos(x[idx])
214
+
215
+
216
def simple_hlog(r, x):
    """Element-wise ``cuda.fp16.hlog`` of ``x`` into ``r``.

    Threads positioned at or beyond ``len(r)`` do nothing.
    """
    idx = cuda.grid(1)
    if idx >= len(r):
        return
    r[idx] = cuda.fp16.hlog(x[idx])
221
+
222
+
223
def simple_hlog2(r, x):
    """Element-wise ``cuda.fp16.hlog2`` of ``x`` into ``r``.

    Threads positioned at or beyond ``len(r)`` do nothing.
    """
    idx = cuda.grid(1)
    if idx >= len(r):
        return
    r[idx] = cuda.fp16.hlog2(x[idx])
228
+
229
+
230
def simple_hlog10(r, x):
    """Element-wise ``cuda.fp16.hlog10`` of ``x`` into ``r``.

    Threads positioned at or beyond ``len(r)`` do nothing.
    """
    idx = cuda.grid(1)
    if idx >= len(r):
        return
    r[idx] = cuda.fp16.hlog10(x[idx])
235
+
236
+
237
def simple_hexp(r, x):
    """Element-wise ``cuda.fp16.hexp`` of ``x`` into ``r``.

    Threads positioned at or beyond ``len(r)`` do nothing.
    """
    idx = cuda.grid(1)
    if idx >= len(r):
        return
    r[idx] = cuda.fp16.hexp(x[idx])
242
+
243
+
244
def simple_hexp2(r, x):
    """Element-wise ``cuda.fp16.hexp2`` of ``x`` into ``r``.

    Threads positioned at or beyond ``len(r)`` do nothing.
    """
    idx = cuda.grid(1)
    if idx >= len(r):
        return
    r[idx] = cuda.fp16.hexp2(x[idx])
249
+
250
+
251
def simple_hsqrt(r, x):
    """Element-wise ``cuda.fp16.hsqrt`` of ``x`` into ``r``.

    Threads positioned at or beyond ``len(r)`` do nothing.
    """
    idx = cuda.grid(1)
    if idx >= len(r):
        return
    r[idx] = cuda.fp16.hsqrt(x[idx])
256
+
257
+
258
def simple_hrsqrt(r, x):
    """Element-wise ``cuda.fp16.hrsqrt`` of ``x`` into ``r``.

    Threads positioned at or beyond ``len(r)`` do nothing.
    """
    idx = cuda.grid(1)
    if idx >= len(r):
        return
    r[idx] = cuda.fp16.hrsqrt(x[idx])
264
+
265
+
266
def numpy_hrsqrt(x, dtype):
    """Host-side reference for the reciprocal square root: ``x ** -0.5``.

    ``dtype`` is accepted but not used by this function.
    """
    return pow(x, -0.5)
268
+
269
+
270
def simple_hceil(r, x):
    """Element-wise ``cuda.fp16.hceil`` of ``x`` into ``r``.

    Threads positioned at or beyond ``len(r)`` do nothing.
    """
    idx = cuda.grid(1)
    if idx >= len(r):
        return
    r[idx] = cuda.fp16.hceil(x[idx])
275
+
276
+
277
def simple_hfloor(r, x):
    """Element-wise ``cuda.fp16.hfloor`` of ``x`` into ``r``.

    Threads positioned at or beyond ``len(r)`` do nothing.
    """
    idx = cuda.grid(1)
    if idx >= len(r):
        return
    r[idx] = cuda.fp16.hfloor(x[idx])
282
+
283
+
284
def simple_hrcp(r, x):
    """Element-wise ``cuda.fp16.hrcp`` of ``x`` into ``r``.

    Threads positioned at or beyond ``len(r)`` do nothing.
    """
    idx = cuda.grid(1)
    if idx >= len(r):
        return
    r[idx] = cuda.fp16.hrcp(x[idx])
289
+
290
+
291
def simple_htrunc(r, x):
    """Element-wise ``cuda.fp16.htrunc`` of ``x`` into ``r``.

    Threads positioned at or beyond ``len(r)`` do nothing.
    """
    idx = cuda.grid(1)
    if idx >= len(r):
        return
    r[idx] = cuda.fp16.htrunc(x[idx])
296
+
297
+
298
def simple_hrint(r, x):
    """Element-wise ``cuda.fp16.hrint`` of ``x`` into ``r``.

    Threads positioned at or beyond ``len(r)`` do nothing.
    """
    idx = cuda.grid(1)
    if idx >= len(r):
        return
    r[idx] = cuda.fp16.hrint(x[idx])
303
+
304
+
305
def simple_cbrt(ary, a):
    """Store ``cuda.cbrt(a)`` in ``ary[0]``."""
    root = cuda.cbrt(a)
    ary[0] = root
307
+
308
+
309
def simple_brev(ary, c):
    """Store ``cuda.brev(c)`` in ``ary[0]``."""
    reversed_bits = cuda.brev(c)
    ary[0] = reversed_bits
311
+
312
+
313
def simple_clz(ary, c):
    """Store ``cuda.clz(c)`` in ``ary[0]``."""
    leading_zeros = cuda.clz(c)
    ary[0] = leading_zeros
315
+
316
+
317
def simple_ffs(ary, c):
    """Store ``cuda.ffs(c)`` in ``ary[0]``."""
    first_set = cuda.ffs(c)
    ary[0] = first_set
319
+
320
+
321
def simple_round(ary, c):
    """Store ``round(c)`` in ``ary[0]``."""
    rounded = round(c)
    ary[0] = rounded
323
+
324
+
325
def simple_round_to(ary, c, ndigits):
    """Store ``round(c, ndigits)`` in ``ary[0]``."""
    rounded = round(c, ndigits)
    ary[0] = rounded
327
+
328
+
329
def branching_with_ifs(a, b, c):
    """Update ``a`` at this thread's grid position using nested ``if``s.

    When ``a[idx] > 4`` the element becomes ``c[idx]`` if ``b`` is even and
    13 otherwise; when ``a[idx] <= 4`` it becomes 3.
    """
    idx = cuda.grid(1)

    # The nested if/else shape is kept deliberately; this function is the
    # branching counterpart of branching_with_selps.
    if a[idx] > 4:
        if b % 2 == 0:
            a[idx] = c[idx]
        else:
            a[idx] = 13
    else:
        a[idx] = 3
339
+
340
+
341
def branching_with_selps(a, b, c):
    """Update ``a`` at this thread's grid position using ``cuda.selp``.

    Selects the same values as ``branching_with_ifs``, expressed as two
    ``cuda.selp`` calls instead of ``if`` statements.
    """
    idx = cuda.grid(1)

    value_when_large = cuda.selp(b % 2 == 0, c[idx], 13)
    a[idx] = cuda.selp(a[idx] > 4, value_when_large, 3)
346
+
347
+
348
def simple_laneid(ary):
    """Store ``cuda.laneid`` at this thread's grid position in ``ary``."""
    pos = cuda.grid(1)
    ary[pos] = cuda.laneid
351
+
352
+
353
def simple_warpsize(ary):
    """Store ``cuda.warpsize`` in ``ary[0]``."""
    size = cuda.warpsize
    ary[0] = size
355
+
356
+
357
def nonliteral_grid(x):
    """Call ``cuda.grid`` with the runtime argument ``x`` instead of a
    literal dimension count."""
    cuda.grid(x)
359
+
360
+
361
def nonliteral_gridsize(x):
    """Call ``cuda.gridsize`` with the runtime argument ``x`` instead of a
    literal dimension count."""
    cuda.gridsize(x)
363
+
364
+
365
+ class TestCudaIntrinsic(CUDATestCase):
366
def setUp(self):
    """Run the base-class set-up, then seed NumPy's global RNG with 0."""
    super().setUp()
    np.random.seed(seed=0)
369
+
370
def test_simple_threadidx(self):
    """A single-thread launch must overwrite the initial 1 with index 0."""
    compiled = cuda.jit("void(int32[:])")(simple_threadidx)
    ary = np.ones(1, dtype=np.int32)
    compiled[1, 1](ary)
    # assertEqual reports the value that was actually written on failure,
    # which assertTrue(ary[0] == 0) does not.
    self.assertEqual(ary[0], 0)
375
+
376
def test_fill_threadidx(self):
    """Each of N threads in a single block must write its own index."""
    compiled = cuda.jit("void(int32[:])")(fill_threadidx)
    N = 10
    ary = np.ones(N, dtype=np.int32)
    exp = np.arange(N, dtype=np.int32)
    compiled[1, N](ary)
    # assert_array_equal shows the mismatching elements on failure.
    np.testing.assert_array_equal(ary, exp)
383
+
384
def test_fill3d_threadidx(self):
    """The 3D fill kernel must give the right values for C and F layouts.

    Both results are compared against the analytically expected array, not
    only against each other, so two identically wrong results cannot pass.
    """
    X, Y, Z = 4, 5, 6

    def c_contiguous():
        compiled = cuda.jit("void(int32[:,:,::1])")(fill3d_threadidx)
        ary = np.zeros((X, Y, Z), dtype=np.int32)
        compiled[1, (X, Y, Z)](ary)
        return ary

    def f_contiguous():
        compiled = cuda.jit("void(int32[::1,:,:])")(fill3d_threadidx)
        ary = np.asfortranarray(np.zeros((X, Y, Z), dtype=np.int32))
        compiled[1, (X, Y, Z)](ary)
        return ary

    # One block of (X, Y, Z) threads covers every element, and the kernel
    # writes (i + 1) * (j + 1) * (k + 1) at [i, j, k].
    i, j, k = np.indices((X, Y, Z))
    expected = (i + 1) * (j + 1) * (k + 1)

    c_res = c_contiguous()
    f_res = f_contiguous()
    np.testing.assert_array_equal(c_res, expected)
    np.testing.assert_array_equal(f_res, expected)
    np.testing.assert_array_equal(c_res, f_res)
402
+
403
@skip_on_cudasim('Cudasim does not check types')
def test_nonliteral_grid_error(self):
    """Compiling ``nonliteral_grid`` must raise a TypingError mentioning
    RequireLiteralValue."""
    expect_failure = self.assertRaisesRegex(TypingError,
                                            'RequireLiteralValue')
    with expect_failure:
        cuda.jit('void(int32)')(nonliteral_grid)
407
+
408
@skip_on_cudasim('Cudasim does not check types')
def test_nonliteral_gridsize_error(self):
    """Compiling ``nonliteral_gridsize`` must raise a TypingError
    mentioning RequireLiteralValue."""
    expect_failure = self.assertRaisesRegex(TypingError,
                                            'RequireLiteralValue')
    with expect_failure:
        cuda.jit('void(int32)')(nonliteral_gridsize)
412
+
413
def test_simple_grid1d(self):
    """With one thread per element, every element must hold its own
    index."""
    compiled = cuda.jit("void(int32[::1])")(simple_grid1d)
    ntid, nctaid = 3, 7
    nelem = ntid * nctaid
    ary = np.empty(nelem, dtype=np.int32)
    compiled[nctaid, ntid](ary)
    # assert_array_equal shows the mismatching elements on failure.
    np.testing.assert_array_equal(ary, np.arange(nelem))
420
+
421
def test_simple_grid2d(self):
    """With one thread per element, ``ary[i, j]`` must equal ``i + j``."""
    compiled = cuda.jit("void(int32[:,::1])")(simple_grid2d)
    ntid = (4, 3)
    nctaid = (5, 6)
    shape = (ntid[0] * nctaid[0], ntid[1] * nctaid[1])
    ary = np.empty(shape, dtype=np.int32)
    compiled[nctaid, ntid](ary)

    # Build the expected i + j table by broadcasting a column of row
    # indices against a row of column indices.
    rows = np.arange(shape[0], dtype=np.int32)[:, np.newaxis]
    cols = np.arange(shape[1], dtype=np.int32)
    exp = rows + cols

    np.testing.assert_array_equal(ary, exp)
435
+
436
def test_simple_gridsize1d(self):
    """``cuda.gridsize(1)`` must equal blocks times threads per block."""
    threads_per_block = 3
    blocks = 7
    kernel = cuda.jit("void(int32[::1])")(simple_gridsize1d)
    out = np.zeros(1, dtype=np.int32)
    kernel[blocks, threads_per_block](out)
    self.assertEqual(out[0], blocks * threads_per_block)
442
+
443
@skip_on_cudasim('Requires too many threads')
def test_issue_9229(self):
    """``cuda.grid`` and ``cuda.gridsize`` must match the hand-computed
    values on a very large launch.

    Issue #9229 showed that they overflowed an int32.
    """
    @cuda.jit
    def compare(grid_flag, gridsize_flag):
        from_grid = cuda.grid(1)
        by_hand = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        size_from_intrinsic = cuda.gridsize(1)
        size_by_hand = cuda.blockDim.x * cuda.gridDim.x
        if from_grid != by_hand:
            grid_flag[0] = 1
        if size_from_intrinsic != size_by_hand:
            gridsize_flag[0] = 1

    grid_error = np.zeros(1, dtype=np.uint64)
    gridsize_error = np.zeros(1, dtype=np.uint64)

    # A large enough grid for thread IDs to overflow an int32
    # (22121216 * 256 = 5663031296, which is greater than 2 ** 32)
    blocks, threads_per_block = 22121216, 256
    compare[blocks, threads_per_block](grid_error, gridsize_error)

    self.assertEqual(grid_error[0], 0)
    self.assertEqual(gridsize_error[0], 0)
467
+
468
@skip_on_cudasim('Tests PTX emission')
def test_selp(self):
    """The ``if`` and ``selp`` kernels must agree on results and differ in
    branch count.

    The ``if`` kernel's PTX is expected to contain two ``bra``
    instructions and the ``selp`` kernel's none.
    """
    sig = (int64[:], int64, int64[:])
    cu_branching_with_ifs = cuda.jit(sig)(branching_with_ifs)
    cu_branching_with_selps = cuda.jit(sig)(branching_with_selps)

    n = 32
    b = 6
    c = np.full(shape=n, fill_value=17, dtype=np.int64)

    # Inputs 0..4 are not greater than 4, so they map to 3; b is even, so
    # every other element takes its value from c.
    expected = c.copy()
    expected[:5] = 3

    cases = (
        (cu_branching_with_ifs, 2, 'branching'),
        (cu_branching_with_selps, 0, 'selp'),
    )
    for kernel, branch_count, label in cases:
        a = np.arange(n, dtype=np.int64)
        kernel[n, 1](a, b, c)
        ptx = kernel.inspect_asm(sig)
        self.assertEqual(branch_count, len(re.findall(r'\s+bra\s+', ptx)))
        np.testing.assert_array_equal(a, expected, err_msg=label)
492
+
493
def test_simple_gridsize2d(self):
    """Each component of ``cuda.gridsize(2)`` must equal blocks times
    threads per block on that axis."""
    threads = (4, 3)
    blocks = (5, 6)
    kernel = cuda.jit("void(int32[::1])")(simple_gridsize2d)
    out = np.zeros(2, dtype=np.int32)
    kernel[blocks, threads](out)

    for axis in range(2):
        self.assertEqual(out[axis], blocks[axis] * threads[axis])
502
+
503
def test_intrinsic_forloop_step(self):
    """Every element written by the strided loops must equal ``x + y``.

    The launch spans 20 threads in x and 18 in y while the array is 20
    rows by 18 columns. All columns are reached directly, and rows 18 and
    19 are reached only through the loop step, so the whole array is
    written with ``ary[y, x] = x + y``.

    The previous check started its loops at ``gridX + i`` and
    ``gridY + j``; the outer range was always empty, so no assertion ever
    executed. The whole array is compared instead.
    """
    compiled = cuda.jit("void(int32[:,::1])")(intrinsic_forloop_step)
    ntid = (4, 3)
    nctaid = (5, 6)
    shape = (ntid[0] * nctaid[0], ntid[1] * nctaid[1])
    ary = np.empty(shape, dtype=np.int32)

    compiled[nctaid, ntid](ary)

    height, width = ary.shape
    # expected[y, x] = y + x, built by broadcasting.
    expected = np.arange(height)[:, np.newaxis] + np.arange(width)
    np.testing.assert_array_equal(ary, expected)
519
+
520
def test_3dgrid(self):
    """On a 9x9x9 launch, the product of the ``cuda.gridsize(3)``
    components must be 9 ** 3 for every thread."""
    @cuda.jit
    def write_volume(out):
        gx, gy, gz = cuda.grid(3)
        sx, sy, sz = cuda.gridsize(3)
        out[gx, gy, gz] = sx * sy * sz

    arr = np.zeros((9, 9, 9), dtype=np.int32)
    write_volume[(3, 3, 3), (3, 3, 3)](arr)

    np.testing.assert_equal(arr, 9 ** 3)
531
+
532
def test_3dgrid_2(self):
    """``cuda.grid(3)`` and ``cuda.gridsize(3)`` must match the values
    computed from threadIdx, blockIdx, blockDim and gridDim."""
    @cuda.jit
    def check(out):
        gx, gy, gz = cuda.grid(3)
        sx, sy, sz = cuda.gridsize(3)
        grid_is_right = (
            gx == cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x and
            gy == cuda.threadIdx.y + cuda.blockIdx.y * cuda.blockDim.y and
            gz == cuda.threadIdx.z + cuda.blockIdx.z * cuda.blockDim.z
        )
        gridsize_is_right = (sx == cuda.blockDim.x * cuda.gridDim.x and
                             sy == cuda.blockDim.y * cuda.gridDim.y and
                             sz == cuda.blockDim.z * cuda.gridDim.z)
        out[gx, gy, gz] = grid_is_right and gridsize_is_right

    # One element per launched thread: blocks (4, 3, 2) of (3, 2, 4).
    dims = (4 * 3, 3 * 2, 2 * 4)
    arr = np.zeros(dims, dtype=np.bool_)
    check[(4, 3, 2), (3, 2, 4)](arr)

    self.assertTrue(np.all(arr))
552
+
553
def test_popc_u4(self):
    """``cuda.popc`` on the uint32 value 0xF0 must give 4."""
    kernel = cuda.jit("void(int32[:], uint32)")(simple_popc)
    out = np.zeros(1, dtype=np.int32)
    value = 0xF0
    kernel[1, 1](out, value)
    self.assertEqual(out[0], 4)
558
+
559
def test_popc_u8(self):
    """``cuda.popc`` on the uint64 value 0xF00000000000 must give 4."""
    kernel = cuda.jit("void(int32[:], uint64)")(simple_popc)
    out = np.zeros(1, dtype=np.int32)
    value = 0xF00000000000
    kernel[1, 1](out, value)
    self.assertEqual(out[0], 4)
564
+
565
def test_fma_f4(self):
    """float32 ``cuda.fma(2, 3, 4)`` must be close to ``2 * 3 + 4``."""
    kernel = cuda.jit("void(f4[:], f4, f4, f4)")(simple_fma)
    out = np.zeros(1, dtype=np.float32)
    kernel[1, 1](out, 2., 3., 4.)
    expected = 2 * 3 + 4
    np.testing.assert_allclose(out[0], expected)
570
+
571
def test_fma_f8(self):
    """float64 ``cuda.fma(2, 3, 4)`` must be close to ``2 * 3 + 4``."""
    kernel = cuda.jit("void(f8[:], f8, f8, f8)")(simple_fma)
    out = np.zeros(1, dtype=np.float64)
    kernel[1, 1](out, 2., 3., 4.)
    expected = 2 * 3 + 4
    np.testing.assert_allclose(out[0], expected)
576
+
577
@skip_unless_cc_53
def test_hadd(self):
    """``hadd`` on one-element float16 arrays must match NumPy addition."""
    kernel = cuda.jit("void(f2[:], f2[:], f2[:])")(simple_hadd)
    out = np.zeros(1, dtype=np.float16)
    lhs = np.array([3.], dtype=np.float16)
    rhs = np.array([4.], dtype=np.float16)
    kernel[1, 1](out, lhs, rhs)
    np.testing.assert_allclose(out[0], lhs + rhs)
585
+
586
@skip_unless_cc_53
def test_hadd_scalar(self):
    """``hadd`` on float16 scalars must match NumPy float16 addition."""
    kernel = cuda.jit("void(f2[:], f2, f2)")(simple_hadd_scalar)
    out = np.zeros(1, dtype=np.float16)
    lhs = np.float16(3.1415926)
    rhs = np.float16(3.)
    kernel[1, 1](out, lhs, rhs)
    np.testing.assert_allclose(out[0], lhs + rhs)
595
+
596
@skip_on_cudasim('Compilation unsupported in the simulator')
def test_hadd_ptx(self):
    """PTX compiled for cc 5.3 from the scalar ``hadd`` kernel must
    contain ``add.f16``."""
    signature = (f2[:], f2, f2)
    ptx = compile_ptx(simple_hadd_scalar, signature, cc=(5, 3))[0]
    self.assertIn('add.f16', ptx)
601
+
602
@skip_unless_cc_53
def test_hfma(self):
    """``hfma`` on one-element float16 arrays must match ``a * b + c``."""
    kernel = cuda.jit("void(f2[:], f2[:], f2[:], f2[:])")(simple_hfma)
    out = np.zeros(1, dtype=np.float16)
    x = np.array([2.], dtype=np.float16)
    y = np.array([3.], dtype=np.float16)
    z = np.array([4.], dtype=np.float16)
    kernel[1, 1](out, x, y, z)
    np.testing.assert_allclose(out[0], x * y + z)
611
+
612
@skip_unless_cc_53
def test_hfma_scalar(self):
    """``hfma`` on float16 scalars must match ``a * b + c``."""
    kernel = cuda.jit("void(f2[:], f2, f2, f2)")(simple_hfma_scalar)
    out = np.zeros(1, dtype=np.float16)
    x = np.float16(2.)
    y = np.float16(3.)
    z = np.float16(4.)
    kernel[1, 1](out, x, y, z)
    np.testing.assert_allclose(out[0], x * y + z)
622
+
623
@skip_on_cudasim('Compilation unsupported in the simulator')
def test_hfma_ptx(self):
    """PTX compiled for cc 5.3 from the scalar ``hfma`` kernel must
    contain ``fma.rn.f16``."""
    signature = (f2[:], f2, f2, f2)
    ptx = compile_ptx(simple_hfma_scalar, signature, cc=(5, 3))[0]
    self.assertIn('fma.rn.f16', ptx)
628
+
629
@skip_unless_cc_53
def test_hsub(self):
    """``hsub`` on one-element float16 arrays must match NumPy
    subtraction."""
    kernel = cuda.jit("void(f2[:], f2[:], f2[:])")(simple_hsub)
    out = np.zeros(1, dtype=np.float16)
    lhs = np.array([3.], dtype=np.float16)
    rhs = np.array([4.], dtype=np.float16)
    kernel[1, 1](out, lhs, rhs)
    np.testing.assert_allclose(out[0], lhs - rhs)
637
+
638
@skip_unless_cc_53
def test_hsub_scalar(self):
    """Run ``simple_hsub_scalar`` on two float16 scalars and compare the
    output with NumPy's ``lhs - rhs``."""
    kernel = cuda.jit("void(f2[:], f2, f2)")(simple_hsub_scalar)
    lhs, rhs = np.float16(3.1415926), np.float16(1.57)
    out = np.zeros(1, dtype=np.float16)
    kernel[1, 1](out, lhs, rhs)
    np.testing.assert_allclose(out[0], lhs - rhs)
647
+
648
@skip_on_cudasim('Compilation unsupported in the simulator')
def test_hsub_ptx(self):
    """Compile ``simple_hsub_scalar`` for cc 5.3 and look for ``sub.f16``
    in the generated PTX."""
    signature = (f2[:], f2, f2)
    generated, _ = compile_ptx(simple_hsub_scalar, signature, cc=(5, 3))
    self.assertIn('sub.f16', generated)
653
+
654
@skip_unless_cc_53
def test_hmul(self):
    """Run ``simple_hmul`` on one-element float16 arrays and compare the
    output with NumPy's ``lhs * rhs``.

    No signature is passed to ``cuda.jit`` here, unlike the sibling
    array tests.
    """
    kernel = cuda.jit()(simple_hmul)
    lhs = np.array([3.], dtype=np.float16)
    rhs = np.array([4.], dtype=np.float16)
    out = np.zeros(1, dtype=np.float16)
    kernel[1, 1](out, lhs, rhs)
    np.testing.assert_allclose(out[0], lhs * rhs)
662
+
663
@skip_unless_cc_53
def test_hmul_scalar(self):
    """Run ``simple_hmul_scalar`` on two float16 scalars and compare the
    output with NumPy's ``lhs * rhs``."""
    kernel = cuda.jit("void(f2[:], f2, f2)")(simple_hmul_scalar)
    lhs, rhs = np.float16(3.1415926), np.float16(1.57)
    out = np.zeros(1, dtype=np.float16)
    kernel[1, 1](out, lhs, rhs)
    np.testing.assert_allclose(out[0], lhs * rhs)
672
+
673
@skip_on_cudasim('Compilation unsupported in the simulator')
def test_hmul_ptx(self):
    """Compile ``simple_hmul_scalar`` for cc 5.3 and look for ``mul.f16``
    in the generated PTX."""
    signature = (f2[:], f2, f2)
    generated, _ = compile_ptx(simple_hmul_scalar, signature, cc=(5, 3))
    self.assertIn('mul.f16', generated)
678
+
679
@skip_unless_cc_53
def test_hdiv_scalar(self):
    """Run ``simple_hdiv_scalar`` on two float16 scalars and compare the
    output with NumPy's ``lhs / rhs``."""
    kernel = cuda.jit("void(f2[:], f2, f2)")(simple_hdiv_scalar)
    lhs, rhs = np.float16(3.1415926), np.float16(1.57)
    out = np.zeros(1, dtype=np.float16)

    kernel[1, 1](out, lhs, rhs)
    np.testing.assert_allclose(out[0], lhs / rhs)
689
+
690
@skip_unless_cc_53
def test_hdiv(self):
    """Run ``simple_hdiv_kernel`` over 500 random float16 pairs.

    Both operand arrays are drawn as integers in [-65504, 65504] and
    cast to float16; the kernel output is compared element-wise with
    NumPy's float16 division of the same arrays.
    """
    kernel = cuda.jit("void(f2[:], f2[:], f2[:])")(simple_hdiv_kernel)
    # Draw order matters for the RNG stream: numerators first.
    numer = np.random.randint(-65504, 65505, size=500).astype(np.float16)
    denom = np.random.randint(-65504, 65505, size=500).astype(np.float16)
    out = np.zeros_like(numer, dtype=np.float16)

    kernel.forall(out.size)(out, numer, denom)
    np.testing.assert_allclose(out, numer / denom)
700
+
701
@skip_unless_cc_53
def test_hneg(self):
    """Run ``simple_hneg`` on a one-element float16 array and compare the
    output with NumPy's unary minus."""
    kernel = cuda.jit("void(f2[:], f2[:])")(simple_hneg)
    src = np.array([3.], dtype=np.float16)
    out = np.zeros(1, dtype=np.float16)
    kernel[1, 1](out, src)
    np.testing.assert_allclose(out[0], -src)
708
+
709
@skip_unless_cc_53
def test_hneg_scalar(self):
    """Run ``simple_hneg_scalar`` on a float16 scalar and compare the
    output with NumPy's unary minus."""
    kernel = cuda.jit("void(f2[:], f2)")(simple_hneg_scalar)
    value = np.float16(3.1415926)
    out = np.zeros(1, dtype=np.float16)
    kernel[1, 1](out, value)
    np.testing.assert_allclose(out[0], -value)
717
+
718
@skip_on_cudasim('Compilation unsupported in the simulator')
def test_hneg_ptx(self):
    """Compile ``simple_hneg_scalar`` for cc 5.3 and look for ``neg.f16``
    in the generated PTX."""
    signature = (f2[:], f2)
    generated, _ = compile_ptx(simple_hneg_scalar, signature, cc=(5, 3))
    self.assertIn('neg.f16', generated)
723
+
724
@skip_unless_cc_53
def test_habs(self):
    """Run ``simple_habs`` on a one-element float16 array holding a
    negative value and compare the output with ``abs`` of that array."""
    kernel = cuda.jit()(simple_habs)
    src = np.array([-3.], dtype=np.float16)
    out = np.zeros(1, dtype=np.float16)
    kernel[1, 1](out, src)
    np.testing.assert_allclose(out[0], abs(src))
731
+
732
@skip_unless_cc_53
def test_habs_scalar(self):
    """Run ``simple_habs_scalar`` on a negative float16 scalar and
    compare the output with ``abs`` of that scalar."""
    kernel = cuda.jit("void(f2[:], f2)")(simple_habs_scalar)
    value = np.float16(-3.1415926)
    out = np.zeros(1, dtype=np.float16)
    kernel[1, 1](out, value)
    np.testing.assert_allclose(out[0], abs(value))
740
+
741
@skip_on_cudasim('Compilation unsupported in the simulator')
def test_habs_ptx(self):
    """Compile ``simple_habs_scalar`` for cc 5.3 and look for ``abs.f16``
    in the generated PTX."""
    signature = (f2[:], f2)
    generated, _ = compile_ptx(simple_habs_scalar, signature, cc=(5, 3))
    self.assertIn('abs.f16', generated)
746
+
747
@skip_unless_cc_53
def test_fp16_intrinsics_common(self):
    """Check a family of unary float16 kernels against NumPy references.

    Each kernel is paired with the NumPy (or module-level) function that
    produces its expected result.  Most kernels run on 32 seeded random
    integers in [1, 65504] cast to float16; the two exponential kernels
    run on integers in [1, 9] instead.  One subtest is reported per
    reference function.
    """
    general_cases = (
        (simple_hsin, np.sin),
        (simple_hcos, np.cos),
        (simple_hlog, np.log),
        (simple_hlog2, np.log2),
        (simple_hlog10, np.log10),
        (simple_hsqrt, np.sqrt),
        (simple_hceil, np.ceil),
        (simple_hfloor, np.floor),
        (simple_hrcp, np.reciprocal),
        (simple_htrunc, np.trunc),
        (simple_hrint, np.rint),
        (simple_hrsqrt, numpy_hrsqrt),
    )
    exp_cases = (
        (simple_hexp, np.exp),
        (simple_hexp2, np.exp2),
    )

    # Generate random data
    count = 32
    np.random.seed(1)
    wide = np.random.randint(1, 65505, size=count).astype(np.float16)
    out = np.zeros_like(wide)

    def check(cases, inputs):
        # One launch per kernel, one thread per input element; the
        # shared output buffer is overwritten by every launch.
        for py_func, reference in cases:
            with self.subTest(fn=reference):
                jitted = cuda.jit("void(f2[:], f2[:])")(py_func)
                jitted[1, count](out, inputs)
                want = reference(inputs, dtype=np.float16)
                np.testing.assert_allclose(out, want)

    check(general_cases, wide)

    # Drawn after the first batch so the RNG stream is consumed in the
    # same order as before.
    narrow = np.random.randint(1, 10, size=count).astype(np.float16)
    check(exp_cases, narrow)
781
+
782
@skip_unless_cc_53
def test_hexp10(self):
    """Apply ``cuda.fp16.hexp10`` element-wise to 32 seeded random
    float16 values and compare the result with ``10 ** x`` from NumPy."""
    @cuda.jit()
    def hexp10_vectors(out, src):
        idx = cuda.grid(1)

        # Threads past the end of the output do nothing.
        if idx < len(out):
            out[idx] = cuda.fp16.hexp10(src[idx])

    # Generate random data
    count = 32
    np.random.seed(1)
    inputs = np.random.rand(count).astype(np.float16)
    results = np.zeros_like(inputs)

    # Run the kernel
    hexp10_vectors[1, count](results, inputs)
    np.testing.assert_allclose(results, 10 ** inputs)
800
+
801
@skip_unless_cc_53
def test_fp16_comparison(self):
    """Check the six float16 scalar comparison kernels against ``operator``.

    Every kernel is paired with the ``operator`` function giving its
    expected result and is launched three times with a fixed left-hand
    side of 3 and right-hand sides 3, 4 and 2, so that the equal,
    LHS < RHS and LHS > RHS cases are each exercised once.  One subtest
    is reported per operator.
    """
    cases = (
        (simple_heq_scalar, operator.eq),
        (simple_hne_scalar, operator.ne),
        (simple_hge_scalar, operator.ge),
        (simple_hgt_scalar, operator.gt),
        (simple_hle_scalar, operator.le),
        (simple_hlt_scalar, operator.lt),
    )

    # Operands do not depend on the operator, so build them once.
    lhs = np.float16(3)
    # Order: equal arguments, LHS < RHS, LHS > RHS.
    rhs_values = (np.float16(3), np.float16(4), np.float16(2))

    for fn, op in cases:
        with self.subTest(op=op):
            kernel = cuda.jit("void(b1[:], f2, f2)")(fn)
            got = np.zeros(1, dtype=np.bool_)

            for rhs in rhs_values:
                kernel[1, 1](got, lhs, rhs)
                self.assertEqual(op(lhs, rhs), got[0])
832
+
833
@skip_unless_cc_53
def test_multiple_float16_comparisons(self):
    """Launch each ``test_multiple_hcmp_*`` function as a kernel with the
    float16 arguments 2, 3 and 4 and expect a true flag in the output."""
    candidates = (
        test_multiple_hcmp_1,
        test_multiple_hcmp_2,
        test_multiple_hcmp_3,
        test_multiple_hcmp_4,
        test_multiple_hcmp_5,
    )
    for fn in candidates:
        with self.subTest(fn=fn):
            kernel = cuda.jit("void(b1[:], f2, f2, f2)")(fn)
            flag = np.zeros(1, dtype=np.bool_)
            a, b, c = np.float16(2.), np.float16(3.), np.float16(4.)
            kernel[1, 1](flag, a, b, c)
            self.assertTrue(flag[0])
849
+
850
@skip_unless_cc_53
def test_hmax(self):
    """Run ``simple_hmax_scalar`` with the larger value in either
    argument position and expect the larger value back each time."""
    kernel = cuda.jit("void(f2[:], f2, f2)")(simple_hmax_scalar)
    out = np.zeros(1, dtype=np.float16)
    three, four, five = np.float16(3.), np.float16(4.), np.float16(5.)

    # Larger value passed second.
    kernel[1, 1](out, three, four)
    np.testing.assert_allclose(out[0], four)

    # Larger value passed first.
    kernel[1, 1](out, five, four)
    np.testing.assert_allclose(out[0], five)
861
+
862
@skip_unless_cc_53
def test_hmin(self):
    """Run ``simple_hmin_scalar`` with the smaller value in either
    argument position and expect the smaller value back each time."""
    kernel = cuda.jit("void(f2[:], f2, f2)")(simple_hmin_scalar)
    out = np.zeros(1, dtype=np.float16)
    three, four, five = np.float16(3.), np.float16(4.), np.float16(5.)

    # Smaller value passed first.
    kernel[1, 1](out, three, four)
    np.testing.assert_allclose(out[0], three)

    # Smaller value passed second.
    kernel[1, 1](out, five, four)
    np.testing.assert_allclose(out[0], four)
873
+
874
def test_cbrt_f32(self):
    """Run ``simple_cbrt`` with a float32 signature on 2.0 and compare
    the output with ``2.0 ** (1 / 3)``."""
    kernel = cuda.jit("void(float32[:], float32)")(simple_cbrt)
    out = np.zeros(1, dtype=np.float32)
    operand = 2.
    kernel[1, 1](out, operand)
    np.testing.assert_allclose(out[0], operand ** (1 / 3))
880
+
881
def test_cbrt_f64(self):
    """Run ``simple_cbrt`` with a float64 signature on 6.0 and compare
    the output with ``6.0 ** (1 / 3)``."""
    kernel = cuda.jit("void(float64[:], float64)")(simple_cbrt)
    out = np.zeros(1, dtype=np.float64)
    operand = 6.
    kernel[1, 1](out, operand)
    np.testing.assert_allclose(out[0], operand ** (1 / 3))
887
+
888
def test_brev_u4(self):
    """Expect ``simple_brev`` to map the uint32 input 0x000030F0 to
    0x0F0C0000."""
    kernel = cuda.jit("void(uint32[:], uint32)")(simple_brev)
    out = np.zeros(1, dtype=np.uint32)
    kernel[1, 1](out, 0x000030F0)
    self.assertEqual(out[0], 0x0F0C0000)
893
+
894
@skip_on_cudasim('only get given a Python "int", assumes 32 bits')
def test_brev_u8(self):
    """Expect ``simple_brev`` to map the uint64 input 0x000030F0000030F0
    to 0x0F0C00000F0C0000."""
    kernel = cuda.jit("void(uint64[:], uint64)")(simple_brev)
    out = np.zeros(1, dtype=np.uint64)
    kernel[1, 1](out, 0x000030F0000030F0)
    self.assertEqual(out[0], 0x0F0C00000F0C0000)
900
+
901
def test_clz_i4(self):
    """Expect ``simple_clz`` to report 11 for the int32 input
    0x00100000."""
    kernel = cuda.jit("void(int32[:], int32)")(simple_clz)
    out = np.zeros(1, dtype=np.int32)
    kernel[1, 1](out, 0x00100000)
    self.assertEqual(out[0], 11)
906
+
907
def test_clz_u4(self):
    """
    Expect ``simple_clz`` to report 11 for the uint32 input 0x00100000.

    The CUDA Math API
    (http://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html)
    only lists int32 & int64 arguments as supported in C code, but the
    LLVM IR input accepts i8, i16, i32 & i64 (LLVM has no unsigned
    integer types, only unsigned operations on integers).
    http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#bit-manipulations-intrinics
    """
    kernel = cuda.jit("void(int32[:], uint32)")(simple_clz)
    out = np.zeros(1, dtype=np.int32)
    kernel[1, 1](out, 0x00100000)
    self.assertEqual(out[0], 11)
920
+
921
def test_clz_i4_1s(self):
    """Expect ``simple_clz`` to report 0 when every bit of the 32-bit
    input is set."""
    kernel = cuda.jit("void(int32[:], int32)")(simple_clz)
    out = np.zeros(1, dtype=np.int32)
    kernel[1, 1](out, 0xFFFFFFFF)
    self.assertEqual(out[0], 0)
926
+
927
def test_clz_i4_0s(self):
    """Expect ``simple_clz`` to report 32 for an all-zero int32 input."""
    kernel = cuda.jit("void(int32[:], int32)")(simple_clz)
    out = np.zeros(1, dtype=np.int32)
    kernel[1, 1](out, 0x0)
    self.assertEqual(out[0], 32, "CUDA semantics")
932
+
933
@skip_on_cudasim('only get given a Python "int", assumes 32 bits')
def test_clz_i8(self):
    """Expect ``simple_clz`` to report 47 for the int64 input 0x10000
    (only bit 16 set)."""
    kernel = cuda.jit("void(int32[:], int64)")(simple_clz)
    out = np.zeros(1, dtype=np.int32)
    kernel[1, 1](out, 0x000000000010000)
    self.assertEqual(out[0], 47)
939
+
940
def test_ffs_i4(self):
    """Expect ``simple_ffs`` on int32 inputs to report 21 for 0x00100000
    and 32 for 0x80000000."""
    kernel = cuda.jit("void(int32[:], int32)")(simple_ffs)
    out = np.zeros(1, dtype=np.int32)
    for bits, position in ((0x00100000, 21), (0x80000000, 32)):
        kernel[1, 1](out, bits)
        self.assertEqual(out[0], position)
947
+
948
def test_ffs_u4(self):
    """Expect ``simple_ffs`` on uint32 inputs to report 21 for 0x00100000
    and 32 for 0x80000000."""
    kernel = cuda.jit("void(int32[:], uint32)")(simple_ffs)
    out = np.zeros(1, dtype=np.int32)
    for bits, position in ((0x00100000, 21), (0x80000000, 32)):
        kernel[1, 1](out, bits)
        self.assertEqual(out[0], position)
955
+
956
def test_ffs_i4_1s(self):
    """Expect ``simple_ffs`` to report 1 when every bit of the 32-bit
    input is set."""
    kernel = cuda.jit("void(int32[:], int32)")(simple_ffs)
    out = np.zeros(1, dtype=np.int32)
    kernel[1, 1](out, 0xFFFFFFFF)
    self.assertEqual(out[0], 1)
961
+
962
def test_ffs_i4_0s(self):
    """Expect ``simple_ffs`` to report 0 for an all-zero int32 input."""
    kernel = cuda.jit("void(int32[:], int32)")(simple_ffs)
    out = np.zeros(1, dtype=np.int32)
    kernel[1, 1](out, 0x0)
    self.assertEqual(out[0], 0)
967
+
968
@skip_on_cudasim('only get given a Python "int", assumes 32 bits')
def test_ffs_i8(self):
    """Expect ``simple_ffs`` on int64 inputs to report 17 for 0x10000 and
    33 for 0x100000000."""
    kernel = cuda.jit("void(int32[:], int64)")(simple_ffs)
    out = np.zeros(1, dtype=np.int32)
    for bits, position in ((0x000000000010000, 17), (0x100000000, 33)):
        kernel[1, 1](out, bits)
        self.assertEqual(out[0], position)
976
+
977
def test_simple_laneid(self):
    """Launch ``simple_laneid`` with 64 threads in one block and expect
    the output to be the sequence 0..31 repeated twice."""
    kernel = cuda.jit("void(int32[:])")(simple_laneid)
    repeats = 2
    n_threads = repeats * 32
    out = np.zeros(n_threads, dtype=np.int32)
    want = np.tile(np.arange(32, dtype=np.int32), repeats)
    kernel[1, n_threads](out)
    self.assertTrue(np.all(out == want))
984
+
985
def test_simple_warpsize(self):
    """Expect ``simple_warpsize`` to write 32 into the output."""
    kernel = cuda.jit("void(int32[:])")(simple_warpsize)
    out = np.zeros(1, dtype=np.int32)
    kernel[1, 1](out)
    self.assertEqual(out[0], 32, "CUDA semantics")
990
+
991
def test_round_f4(self):
    """Compare ``simple_round`` (float32 signature) with Python's
    one-argument ``round`` on a fixed set of values, including halves."""
    kernel = cuda.jit("void(int64[:], float32)")(simple_round)
    out = np.zeros(1, dtype=np.int64)

    samples = (-3.0, -2.5, -2.25, -1.5, 1.5, 2.25, 2.5, 2.75)
    for sample in samples:
        kernel[1, 1](out, sample)
        self.assertEqual(out[0], round(sample))
998
+
999
def test_round_f8(self):
    """Compare ``simple_round`` (float64 signature) with Python's
    one-argument ``round`` on a fixed set of values, including halves."""
    kernel = cuda.jit("void(int64[:], float64)")(simple_round)
    out = np.zeros(1, dtype=np.int64)

    samples = (-3.0, -2.5, -2.25, -1.5, 1.5, 2.25, 2.5, 2.75)
    for sample in samples:
        kernel[1, 1](out, sample)
        self.assertEqual(out[0], round(sample))
1006
+
1007
def test_round_to_f4(self):
    """Compare ``simple_round_to`` on float32 inputs with ``round(val, n)``.

    Inputs are 32 seeded random float32 values in [0, 1) followed by
    +inf, -inf and NaN.  Each input is combined with every digit count
    in ``digits`` and reported as its own subtest; results are compared
    at single precision.
    """
    compiled = cuda.jit("void(float32[:], float32, int32)")(simple_round_to)
    ary = np.zeros(1, dtype=np.float32)
    np.random.seed(123)
    vals = np.random.random(32).astype(np.float32)
    # np.concatenate returns a new array rather than extending ``vals``
    # in place, so its result must be bound for the non-finite inputs to
    # be exercised at all.  The extra values are created as float32 so
    # that ``vals`` keeps the dtype of the kernel argument.
    # NOTE(review): assumes the round-to implementation agrees with
    # round() for inf/nan - confirm on hardware.
    non_finite = np.array([np.inf, -np.inf, np.nan], dtype=np.float32)
    vals = np.concatenate((vals, non_finite))
    digits = (
        # Common case branch of round_to_impl
        -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5,
        # The algorithm currently implemented can only round to 13 digits
        # with single precision. Note that this doesn't trigger the
        # "overflow safe" branch of the implementation, which can only be
        # hit when using double precision.
        13
    )
    for val, ndigits in itertools.product(vals, digits):
        with self.subTest(val=val, ndigits=ndigits):
            compiled[1, 1](ary, val, ndigits)
            self.assertPreciseEqual(ary[0], round(val, ndigits),
                                    prec='single')
1027
+
1028
# On most platforms CPython rounds via dtoa.c, while the CUDA round-to
# implementation follows CPython's fallback algorithm, which behaves
# slightly differently at the edges of the domain. The CUDA simulator
# executes with CPython, so this test is skipped when the simulator is
# active.
@skip_on_cudasim('Overflow behavior differs on CPython')
def test_round_to_f4_overflow(self):
    """Expect the input back unchanged when rounding the largest finite
    float32 to 300 digits, i.e. when y in round_ndigits overflows."""
    kernel = cuda.jit("void(float32[:], float32, int32)")(simple_round_to)
    out = np.zeros(1, dtype=np.float32)
    largest = np.finfo(np.float32).max
    # The typing makes y a float64, so an unusually large digit count is
    # needed to reach the "y overflows" branch of the implementation.
    n_digits = 300
    kernel[1, 1](out, largest, n_digits)
    self.assertEqual(out[0], largest)
1046
+
1047
def test_round_to_f4_halfway(self):
    """Compare ``simple_round_to`` (float32 signature) with
    ``round(0.3425, 3)`` at single precision."""
    kernel = cuda.jit("void(float32[:], float32, int32)")(simple_round_to)
    out = np.zeros(1, dtype=np.float32)
    # Picked so that the "round to even" branch of the implementation is
    # taken.
    halfway, n_digits = 0.3425, 3
    kernel[1, 1](out, halfway, n_digits)
    self.assertPreciseEqual(out[0], round(halfway, n_digits), prec='single')
1056
+
1057
def test_round_to_f8(self):
    """Compare ``simple_round_to`` on float64 inputs with ``round(val, n)``.

    Inputs are 32 seeded random float64 values in [0, 1) followed by
    +inf, -inf and NaN, each combined with digit counts from -5 to 5 and
    compared exactly.  A final case with 23 digits targets the "overflow
    safe" branch of the implementation and is compared at double
    precision.
    """
    compiled = cuda.jit("void(float64[:], float64, int32)")(simple_round_to)
    ary = np.zeros(1, dtype=np.float64)
    np.random.seed(123)
    vals = np.random.random(32)
    # np.concatenate returns a new array rather than extending ``vals``
    # in place, so its result must be bound for the non-finite inputs to
    # be exercised at all.
    # NOTE(review): assumes the round-to implementation agrees with
    # round() for inf/nan - confirm on hardware.
    vals = np.concatenate((vals, np.array([np.inf, -np.inf, np.nan])))
    digits = (-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5)

    for val, ndigits in itertools.product(vals, digits):
        with self.subTest(val=val, ndigits=ndigits):
            compiled[1, 1](ary, val, ndigits)
            self.assertPreciseEqual(ary[0], round(val, ndigits),
                                    prec='exact')

    # Trigger the "overflow safe" branch of the implementation
    val = 0.12345678987654321 * 10e-15
    ndigits = 23
    with self.subTest(val=val, ndigits=ndigits):
        compiled[1, 1](ary, val, ndigits)
        self.assertPreciseEqual(ary[0], round(val, ndigits),
                                prec='double')
1078
+
1079
# Skipped on cudasim for the same reasons as test_round_to_f4_overflow
# above.
@skip_on_cudasim('Overflow behavior differs on CPython')
def test_round_to_f8_overflow(self):
    """Expect the input back unchanged when rounding the largest finite
    float64 to 12 digits, i.e. when y in round_ndigits overflows."""
    kernel = cuda.jit("void(float64[:], float64, int32)")(simple_round_to)
    out = np.zeros(1, dtype=np.float64)
    largest = np.finfo(np.float64).max
    # In contrast to test_round_to_f4_overflow, a modest digit count is
    # enough here to overflow y in round_ndigits.
    n_digits = 12
    kernel[1, 1](out, largest, n_digits)
    self.assertEqual(out[0], largest)
1092
+
1093
def test_round_to_f8_halfway(self):
    """Compare ``simple_round_to`` (float64 signature) with
    ``round(0.5425, 3)`` at double precision."""
    kernel = cuda.jit("void(float64[:], float64, int32)")(simple_round_to)
    out = np.zeros(1, dtype=np.float64)
    # Picked so that the "round to even" branch of the implementation is
    # taken, using a value that is exactly representable only as a
    # float64, not as a float32.
    halfway, n_digits = 0.5425, 3
    kernel[1, 1](out, halfway, n_digits)
    self.assertPreciseEqual(out[0], round(halfway, n_digits), prec='double')
1103
+
1104
+
1105
# Run the tests in this module when it is executed directly as a script.
if __name__ == "__main__":
    unittest.main()