numba-cuda 0.0.1__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.12.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.12.dist-info/METADATA +68 -0
  229. numba_cuda-0.0.12.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.1.dist-info/METADATA +0 -10
  232. numba_cuda-0.0.1.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,401 @@
1
+ import numpy as np
2
+ from numba.cuda.testing import (unittest, CUDATestCase, skip_unless_cc_53,
3
+ skip_on_cudasim)
4
+ from numba import cuda
5
+ from numba.core.types import f2, b1
6
+ from numba.cuda import compile_ptx
7
+ import operator
8
+ import itertools
9
+ from numba.np.numpy_support import from_dtype
10
+
11
+
12
def simple_fp16_div_scalar(ary, a, b):
    """Store ``a / b`` in ``ary[0]``.

    Compiled as a CUDA kernel by the float16 tests in this file.
    """
    ary[0] = a / b
14
+
15
+
16
def simple_fp16add(ary, a, b):
    """Store ``a + b`` in ``ary[0]``.

    Compiled as a CUDA kernel by the float16 tests in this file.
    """
    ary[0] = a + b
18
+
19
+
20
def simple_fp16_iadd(ary, a):
    """Add ``a`` to ``ary[0]`` in place.

    Compiled as a CUDA kernel by the float16 tests in this file.
    """
    ary[0] += a
22
+
23
+
24
def simple_fp16_isub(ary, a):
    """Subtract ``a`` from ``ary[0]`` in place.

    Compiled as a CUDA kernel by the float16 tests in this file.
    """
    ary[0] -= a
26
+
27
+
28
def simple_fp16_imul(ary, a):
    """Multiply ``ary[0]`` by ``a`` in place.

    Compiled as a CUDA kernel by the float16 tests in this file.
    """
    ary[0] *= a
30
+
31
+
32
def simple_fp16_idiv(ary, a):
    """Divide ``ary[0]`` by ``a`` in place (true division).

    Compiled as a CUDA kernel by the float16 tests in this file.
    """
    ary[0] /= a
34
+
35
+
36
def simple_fp16sub(ary, a, b):
    """Store ``a - b`` in ``ary[0]``.

    Compiled as a CUDA kernel by the float16 tests in this file.
    """
    ary[0] = a - b
38
+
39
+
40
def simple_fp16mul(ary, a, b):
    """Store ``a * b`` in ``ary[0]``.

    Compiled as a CUDA kernel by the float16 tests in this file.
    """
    ary[0] = a * b
42
+
43
+
44
def simple_fp16neg(ary, a):
    """Store ``-a`` in ``ary[0]``.

    Compiled as a CUDA kernel by the float16 tests in this file.
    """
    ary[0] = -a
46
+
47
+
48
def simple_fp16abs(ary, a):
    """Store ``abs(a)`` in ``ary[0]``.

    Compiled as a CUDA kernel by the float16 tests in this file.
    """
    ary[0] = abs(a)
50
+
51
+
52
def simple_fp16_gt(ary, a, b):
    """Store the result of ``a > b`` in ``ary[0]``.

    Compiled as a CUDA kernel by the float16 comparison tests in this file.
    """
    ary[0] = a > b
54
+
55
+
56
def simple_fp16_ge(ary, a, b):
    """Store the result of ``a >= b`` in ``ary[0]``.

    Compiled as a CUDA kernel by the float16 comparison tests in this file.
    """
    ary[0] = a >= b
58
+
59
+
60
def simple_fp16_lt(ary, a, b):
    """Store the result of ``a < b`` in ``ary[0]``.

    Compiled as a CUDA kernel by the float16 comparison tests in this file.
    """
    ary[0] = a < b
62
+
63
+
64
def simple_fp16_le(ary, a, b):
    """Store the result of ``a <= b`` in ``ary[0]``.

    Compiled as a CUDA kernel by the float16 comparison tests in this file.
    """
    ary[0] = a <= b
66
+
67
+
68
def simple_fp16_eq(ary, a, b):
    """Store the result of ``a == b`` in ``ary[0]``.

    Compiled as a CUDA kernel by the float16 comparison tests in this file.
    """
    ary[0] = a == b
70
+
71
+
72
def simple_fp16_ne(ary, a, b):
    """Store the result of ``a != b`` in ``ary[0]``.

    Compiled as a CUDA kernel by the float16 comparison tests in this file.
    """
    ary[0] = a != b
74
+
75
+
76
@cuda.jit('b1(f2, f2)', device=True)
def hlt_func_1(x, y):
    """Device function returning ``x < y`` for two float16 operands."""
    return x < y
79
+
80
+
81
@cuda.jit('b1(f2, f2)', device=True)
def hlt_func_2(x, y):
    """Device function returning ``x < y`` for two float16 operands.

    Deliberately a second, separately compiled copy of the same predicate as
    ``hlt_func_1``.
    """
    return x < y
84
+
85
+
86
def test_multiple_hcmp_1(r, a, b, c):
    """Store ``hlt_func_1(a, b) and hlt_func_2(b, c)`` in ``r[0]``."""
    # float16 predicates used in two separate functions
    r[0] = hlt_func_1(a, b) and hlt_func_2(b, c)


# This is a kernel body, not a test case: keep pytest from collecting it
# because of its ``test_`` prefix.
test_multiple_hcmp_1.__test__ = False
89
+
90
+
91
def test_multiple_hcmp_2(r, a, b, c):
    """Store ``hlt_func_1(a, b) and b < c`` in ``r[0]``."""
    # The same float16 predicate used in the caller and callee
    r[0] = hlt_func_1(a, b) and b < c


# This is a kernel body, not a test case: keep pytest from collecting it
# because of its ``test_`` prefix.
test_multiple_hcmp_2.__test__ = False
94
+
95
+
96
def test_multiple_hcmp_3(r, a, b, c):
    """Store ``hlt_func_1(a, b) and c >= b`` in ``r[0]``."""
    # Different float16 predicates used in the caller and callee
    r[0] = hlt_func_1(a, b) and c >= b


# This is a kernel body, not a test case: keep pytest from collecting it
# because of its ``test_`` prefix.
test_multiple_hcmp_3.__test__ = False
99
+
100
+
101
def test_multiple_hcmp_4(r, a, b, c):
    """Store ``a < b and b < c`` in ``r[0]``."""
    # The same float16 predicates used twice in a function
    r[0] = a < b and b < c


# This is a kernel body, not a test case: keep pytest from collecting it
# because of its ``test_`` prefix.
test_multiple_hcmp_4.__test__ = False
104
+
105
+
106
def test_multiple_hcmp_5(r, a, b, c):
    """Store ``a < b and c >= b`` in ``r[0]``."""
    # Different float16 predicates used in a function
    r[0] = a < b and c >= b


# This is a kernel body, not a test case: keep pytest from collecting it
# because of its ``test_`` prefix.
test_multiple_hcmp_5.__test__ = False
109
+
110
+
111
class TestOperatorModule(CUDATestCase):
    """
    Test if operator module is supported by the CUDA target.

    The float16 tests cover arithmetic, unary and comparison operators, both
    by launching kernels and by checking the PTX produced by ``compile_ptx``.
    """

    # (kernel body, reference operator) pairs shared by the float16 tests.
    _FP16_ARITHMETIC = (
        (simple_fp16add, operator.add),
        (simple_fp16sub, operator.sub),
        (simple_fp16mul, operator.mul),
        (simple_fp16_div_scalar, operator.truediv),
    )
    _FP16_COMPARISONS = (
        (simple_fp16_gt, operator.gt),
        (simple_fp16_ge, operator.ge),
        (simple_fp16_lt, operator.lt),
        (simple_fp16_le, operator.le),
        (simple_fp16_eq, operator.eq),
        (simple_fp16_ne, operator.ne),
    )
    # Kernels taking (r, a, b, c) that combine two float16 comparisons.
    _MULTIPLE_HCMP = (
        test_multiple_hcmp_1,
        test_multiple_hcmp_2,
        test_multiple_hcmp_3,
        test_multiple_hcmp_4,
        test_multiple_hcmp_5,
    )
    # dtypes of the second operand in the mixed-type tests.
    _MIXED_TYPES = (np.int8, np.int16, np.int32, np.int64,
                    np.float32, np.float64)

    def setUp(self):
        super().setUp()
        # Fixed seed so the random operands are reproducible.
        np.random.seed(0)

    def operator_template(self, op):
        """Check that ``op`` applied in a kernel matches ``op`` on the host."""
        @cuda.jit
        def foo(a, b):
            i = 0
            a[i] = op(a[i], b[i])

        a = np.ones(1)
        b = np.ones(1)
        res = a.copy()
        foo[1, 1](res, b)

        np.testing.assert_equal(res, op(a, b))

    def test_add(self):
        self.operator_template(operator.add)

    def test_sub(self):
        self.operator_template(operator.sub)

    def test_mul(self):
        self.operator_template(operator.mul)

    def test_truediv(self):
        self.operator_template(operator.truediv)

    def test_floordiv(self):
        self.operator_template(operator.floordiv)

    @skip_unless_cc_53
    def test_fp16_binary(self):
        for fn, op in self._FP16_ARITHMETIC:
            with self.subTest(op=op):
                kernel = cuda.jit("void(f2[:], f2, f2)")(fn)

                got = np.zeros(1, dtype=np.float16)
                arg1 = np.random.random(1).astype(np.float16)
                arg2 = np.random.random(1).astype(np.float16)

                kernel[1, 1](got, arg1[0], arg2[0])
                expected = op(arg1, arg2)
                np.testing.assert_allclose(got, expected)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_fp16_binary_ptx(self):
        cases = ((simple_fp16add, 'add.f16'),
                 (simple_fp16sub, 'sub.f16'),
                 (simple_fp16mul, 'mul.f16'))
        args = (f2[:], f2, f2)
        for fn, instr in cases:
            with self.subTest(instr=instr):
                ptx, _ = compile_ptx(fn, args, cc=(5, 3))
                self.assertIn(instr, ptx)

    @skip_unless_cc_53
    def test_mixed_fp16_binary_arithmetic(self):
        for (fn, op), ty in itertools.product(self._FP16_ARITHMETIC,
                                              self._MIXED_TYPES):
            with self.subTest(op=op, ty=ty):
                kernel = cuda.jit(fn)

                arg1 = np.random.random(1).astype(np.float16)
                arg2 = (np.random.random(1) * 100).astype(ty)
                res_ty = np.result_type(np.float16, ty)

                got = np.zeros(1, dtype=res_ty)
                kernel[1, 1](got, arg1[0], arg2[0])
                expected = op(arg1, arg2)
                np.testing.assert_allclose(got, expected)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_fp16_inplace_binary_ptx(self):
        cases = ((simple_fp16_iadd, 'add.f16'),
                 (simple_fp16_isub, 'sub.f16'),
                 (simple_fp16_imul, 'mul.f16'))
        args = (f2[:], f2)

        for fn, instr in cases:
            with self.subTest(instr=instr):
                ptx, _ = compile_ptx(fn, args, cc=(5, 3))
                self.assertIn(instr, ptx)

    @skip_unless_cc_53
    def test_fp16_inplace_binary(self):
        cases = ((simple_fp16_iadd, operator.iadd),
                 (simple_fp16_isub, operator.isub),
                 (simple_fp16_imul, operator.imul),
                 (simple_fp16_idiv, operator.itruediv))

        for fn, op in cases:
            with self.subTest(op=op):
                kernel = cuda.jit("void(f2[:], f2)")(fn)

                got = np.random.random(1).astype(np.float16)
                expected = got.copy()
                arg = np.random.random(1).astype(np.float16)[0]
                kernel[1, 1](got, arg)
                # The in-place operator updates the ``expected`` array itself.
                op(expected, arg)
                np.testing.assert_allclose(got, expected)

    @skip_unless_cc_53
    def test_fp16_unary(self):
        cases = ((simple_fp16neg, operator.neg),
                 (simple_fp16abs, operator.abs))

        for fn, op in cases:
            with self.subTest(op=op):
                kernel = cuda.jit("void(f2[:], f2)")(fn)

                got = np.zeros(1, dtype=np.float16)
                arg1 = np.random.random(1).astype(np.float16)

                kernel[1, 1](got, arg1[0])
                expected = op(arg1)
                np.testing.assert_allclose(got, expected)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_fp16_neg_ptx(self):
        args = (f2[:], f2)
        ptx, _ = compile_ptx(simple_fp16neg, args, cc=(5, 3))
        self.assertIn('neg.f16', ptx)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_fp16_abs_ptx(self):
        args = (f2[:], f2)
        ptx, _ = compile_ptx(simple_fp16abs, args, cc=(5, 3))

        self.assertIn('abs.f16', ptx)

    @skip_unless_cc_53
    def test_fp16_comparison(self):
        for fn, op in self._FP16_COMPARISONS:
            with self.subTest(op=op):
                kernel = cuda.jit("void(b1[:], f2, f2)")(fn)

                got = np.zeros(1, dtype=np.bool_)
                arg1 = np.random.random(1).astype(np.float16)
                arg2 = np.random.random(1).astype(np.float16)

                kernel[1, 1](got, arg1[0], arg2[0])
                expected = op(arg1, arg2)
                # Compare scalar with scalar rather than with a 1-element
                # array.
                self.assertEqual(got[0], expected[0])

    @skip_unless_cc_53
    def test_mixed_fp16_comparison(self):
        for (fn, op), ty in itertools.product(self._FP16_COMPARISONS,
                                              self._MIXED_TYPES):
            with self.subTest(op=op, ty=ty):
                kernel = cuda.jit(fn)

                got = np.zeros(1, dtype=np.bool_)
                arg1 = np.random.random(1).astype(np.float16)
                arg2 = (np.random.random(1) * 100).astype(ty)

                kernel[1, 1](got, arg1[0], arg2[0])
                expected = op(arg1, arg2)
                self.assertEqual(got[0], expected[0])

    @skip_unless_cc_53
    def test_multiple_float16_comparisons(self):
        for fn in self._MULTIPLE_HCMP:
            with self.subTest(fn=fn):
                compiled = cuda.jit("void(b1[:], f2, f2, f2)")(fn)
                ary = np.zeros(1, dtype=np.bool_)
                arg1 = np.float16(2.)
                arg2 = np.float16(3.)
                arg3 = np.float16(4.)
                compiled[1, 1](ary, arg1, arg2, arg3)
                self.assertTrue(ary[0])

    @skip_unless_cc_53
    def test_multiple_float16_comparisons_false(self):
        for fn in self._MULTIPLE_HCMP:
            with self.subTest(fn=fn):
                compiled = cuda.jit("void(b1[:], f2, f2, f2)")(fn)
                ary = np.zeros(1, dtype=np.bool_)
                arg1 = np.float16(2.)
                arg2 = np.float16(3.)
                arg3 = np.float16(1.)
                compiled[1, 1](ary, arg1, arg2, arg3)
                self.assertFalse(ary[0])

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_fp16_comparison_ptx(self):
        opstring = ('setp.gt.f16', 'setp.ge.f16',
                    'setp.lt.f16', 'setp.le.f16',
                    'setp.eq.f16', 'setp.ne.f16')
        args = (b1[:], f2, f2)

        for (fn, op), s in zip(self._FP16_COMPARISONS, opstring):
            with self.subTest(op=op):
                ptx, _ = compile_ptx(fn, args, cc=(5, 3))
                self.assertIn(s, ptx)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_fp16_int8_comparison_ptx(self):
        # Test that int8 can be safely converted to fp16
        # in a comparison
        opstring = {operator.gt: 'setp.gt.f16',
                    operator.ge: 'setp.ge.f16',
                    operator.lt: 'setp.lt.f16',
                    operator.le: 'setp.le.f16',
                    operator.eq: 'setp.eq.f16',
                    operator.ne: 'setp.ne.f16'}
        args = (b1[:], f2, from_dtype(np.int8))

        for fn, op in self._FP16_COMPARISONS:
            with self.subTest(op=op):
                ptx, _ = compile_ptx(fn, args, cc=(5, 3))
                self.assertIn(opstring[op], ptx)

    @skip_on_cudasim('Compilation unsupported in the simulator')
    def test_mixed_fp16_comparison_promotion_ptx(self):
        types_promote = (np.int16, np.int32, np.int64,
                         np.float32, np.float64)
        opstring = {operator.gt: 'setp.gt.',
                    operator.ge: 'setp.ge.',
                    operator.lt: 'setp.lt.',
                    operator.le: 'setp.le.',
                    operator.eq: 'setp.eq.',
                    operator.ne: 'setp.neu.'}
        # Keyed by the promoted dtype, i.e. the result of
        # np.result_type(np.float16, ty) for each entry of types_promote.
        opsuffix = {np.dtype('float32'): 'f32',
                    np.dtype('float64'): 'f64'}

        for (fn, op), ty in itertools.product(self._FP16_COMPARISONS,
                                              types_promote):
            with self.subTest(op=op, ty=ty):
                arg2_ty = np.result_type(np.float16, ty)
                args = (b1[:], f2, from_dtype(arg2_ty))
                ptx, _ = compile_ptx(fn, args, cc=(5, 3))

                expected_instr = opstring[op] + opsuffix[arg2_ty]
                self.assertIn(expected_instr, ptx)
398
+
399
+
400
# Run this module's tests when it is executed as a script.
if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,86 @@
1
+ import numpy as np
2
+
3
+ from numba.cuda.testing import skip_on_cudasim, CUDATestCase
4
+ from numba import cuda, float64
5
+ import unittest
6
+
7
+
8
def kernel_func(x):
    """Write ``1`` to ``x[0]``; compiled as a kernel by the tests below."""
    x[0] = 1
10
+
11
+
12
def device_func(x, y, z):
    """Return ``x * y + z``.

    The device-function tests in this file look for a fused multiply-add in
    the PTX generated from this expression.
    """
    return x * y + z
14
+
15
+
16
# Fragments of code that are removed from kernel_func's PTX when optimization
# is on
removed_by_opt = ('__local_depot0', 'call.uni', 'st.param.b64')
19
+
20
+
21
@skip_on_cudasim('Simulator does not optimize code')
class TestOptimization(CUDATestCase):
    """Check the effect of the ``opt`` flag of ``cuda.jit`` on generated PTX.

    Kernels are checked for the fragments listed in ``removed_by_opt``;
    device functions are checked for a fused multiply-add instruction.
    """

    def _check_fragments(self, ptx, present):
        """Assert every ``removed_by_opt`` fragment is (or is not) in ``ptx``."""
        check = self.assertIn if present else self.assertNotIn
        for fragment in removed_by_opt:
            with self.subTest(fragment=fragment):
                check(fragment, ptx)

    def _launch_and_get_ptx(self, kernel):
        """Launch a lazily compiled ``kernel`` once and return its PTX."""
        x = np.zeros(1, dtype=np.float64)
        kernel[1, 1](x)

        # Grab the PTX for the one definition that has just been jitted
        return next(iter(kernel.inspect_asm().values()))

    def test_eager_opt(self):
        # Optimization should occur by default
        sig = (float64[::1],)
        kernel = cuda.jit(sig)(kernel_func)
        ptx = kernel.inspect_asm()

        self._check_fragments(ptx[sig], present=False)

    def test_eager_noopt(self):
        # Optimization disabled
        sig = (float64[::1],)
        kernel = cuda.jit(sig, opt=False)(kernel_func)
        ptx = kernel.inspect_asm()

        self._check_fragments(ptx[sig], present=True)

    def test_lazy_opt(self):
        # Optimization should occur by default
        kernel = cuda.jit(kernel_func)
        ptx = self._launch_and_get_ptx(kernel)

        self._check_fragments(ptx, present=False)

    def test_lazy_noopt(self):
        # Optimization disabled
        kernel = cuda.jit(opt=False)(kernel_func)
        ptx = self._launch_and_get_ptx(kernel)

        self._check_fragments(ptx, present=True)

    def test_device_opt(self):
        # Optimization should occur by default
        sig = (float64, float64, float64)
        device = cuda.jit(sig, device=True)(device_func)
        ptx = device.inspect_asm(sig)
        self.assertIn('fma.rn.f64', ptx)

    def test_device_noopt(self):
        # Optimization disabled
        sig = (float64, float64, float64)
        device = cuda.jit(sig, device=True, opt=False)(device_func)
        ptx = device.inspect_asm(sig)
        # Fused-multiply adds should be disabled when not optimizing
        self.assertNotIn('fma.rn.f64', ptx)
83
+
84
+
85
# Run this module's tests when it is executed as a script.
if __name__ == '__main__':
    unittest.main()