numba-cuda 0.0.1__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.12.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.12.dist-info/METADATA +68 -0
  229. numba_cuda-0.0.12.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.1.dist-info/METADATA +0 -10
  232. numba_cuda-0.0.1.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,36 @@
1
+ from numba import vectorize
2
+ from numba import cuda, float32
3
+ import numpy as np
4
+ from numba.cuda.testing import skip_on_cudasim, CUDATestCase
5
+ import unittest
6
+
7
+
8
@skip_on_cudasim('ufunc API unsupported in the simulator')
class TestCudaVectorizeDeviceCall(CUDATestCase):
    """Check that a CUDA-target ufunc can call a CUDA device function."""

    def test_cuda_vectorize_device_call(self):
        """Compare a ufunc wrapping a device function with the NumPy result."""

        @cuda.jit(float32(float32, float32, float32), device=True)
        def cu_device_fn(x, y, z):
            return x ** y / z

        def cu_ufunc(x, y, z):
            return cu_device_fn(x, y, z)

        ufunc = vectorize([float32(float32, float32, float32)], target='cuda')(
            cu_ufunc)

        N = 100

        # A seeded, test-local generator makes a failure reproducible and
        # leaves the global NumPy random state untouched.
        rng = np.random.RandomState(0)
        X = rng.random_sample(N).astype(np.float32)
        Y = rng.random_sample(N).astype(np.float32)
        # The offset keeps every divisor at least 0.1 away from zero.
        Z = rng.random_sample(N).astype(np.float32) + 0.1

        out = ufunc(X, Y, Z)

        gold = (X ** Y) / Z

        # Same tolerances as the np.allclose defaults, but a failure lists
        # the mismatching elements instead of "False is not true".
        np.testing.assert_allclose(out, gold, rtol=1e-5, atol=1e-8)
33
+
34
+
35
+ if __name__ == '__main__':
36
+ unittest.main()
@@ -0,0 +1,37 @@
1
+ import numpy as np
2
+ from numba import vectorize
3
+ from numba import cuda, float64
4
+ from numba.cuda.testing import skip_on_cudasim, CUDATestCase
5
+ import unittest
6
+
7
+ sig = [float64(float64, float64)]
8
+
9
+
10
@skip_on_cudasim('ufunc API unsupported in the simulator')
class TestCUDAVectorizeScalarArg(CUDATestCase):
    """Call a CUDA-target ufunc with scalar operands."""

    def test_vectorize_scalar_arg(self):
        """Add a Python scalar to a device array and check the host copy."""
        def vector_add(a, b):
            return a + b

        # Explicit call form of the ``@vectorize`` decorator.
        vector_add = vectorize(sig, target='cuda')(vector_add)

        host_values = np.arange(10, dtype=np.float64)
        device_values = cuda.to_device(host_values)
        device_sum = vector_add(1.0, device_values)

        expected = np.arange(1, 11, dtype=np.float64)
        np.testing.assert_array_almost_equal(device_sum.copy_to_host(),
                                             expected)

    def test_vectorize_all_scalars(self):
        """Add two Python scalars through the ufunc."""
        def vector_add(a, b):
            return a + b

        vector_add = vectorize(sig, target='cuda')(vector_add)

        total = vector_add(1.0, 1.0)

        np.testing.assert_almost_equal(2.0, total)
34
+
35
+
36
+ if __name__ == '__main__':
37
+ unittest.main()
@@ -0,0 +1,139 @@
1
+ import numpy as np
2
+ from numba import cuda
3
+ from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
4
+ from numba.tests.support import linux_only, override_config
5
+ from numba.core.errors import NumbaPerformanceWarning
6
+ import warnings
7
+
8
+
9
@skip_on_cudasim('cudasim does not raise performance warnings')
class TestWarnings(CUDATestCase):
    """Check when kernel launches and ``cuda.jit`` options emit warnings."""

    def _launch_empty_kernel(self, griddim, blockdim):
        """Launch a no-op kernel and return the warnings recorded meanwhile.

        ``CUDA_LOW_OCCUPANCY_WARNINGS`` is overridden to 1 for the launch.
        """
        @cuda.jit
        def kernel():
            pass

        with override_config('CUDA_LOW_OCCUPANCY_WARNINGS', 1):
            with warnings.catch_warnings(record=True) as w:
                kernel[griddim, blockdim]()

        return w

    def _launch_store_kernel(self, ary, N):
        """Launch ``foo[1, N](ary, N)`` and return the recorded warnings.

        ``CUDA_WARN_ON_IMPLICIT_COPY`` is overridden to 1 for the launch.
        """
        @cuda.jit
        def foo(r, x):
            r[0] = x + 1

        with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1):
            with warnings.catch_warnings(record=True) as w:
                foo[1, N](ary, N)

        return w

    def _record_jit(self, **options):
        """Call ``cuda.jit(**options)`` and return the recorded warnings."""
        with warnings.catch_warnings(record=True) as w:
            cuda.jit(**options)

        return w

    def _assert_performance_warning(self, w, *fragments):
        """Assert ``w[0]`` is a NumbaPerformanceWarning containing fragments.

        The length is checked first so that a missing warning is reported as
        an assertion failure rather than an IndexError.
        """
        self.assertGreaterEqual(len(w), 1, 'no warning was recorded')
        self.assertEqual(w[0].category, NumbaPerformanceWarning)
        message = str(w[0].message)
        for fragment in fragments:
            self.assertIn(fragment, message)

    def test_inefficient_launch_configuration(self):
        w = self._launch_empty_kernel(1, 1)

        self._assert_performance_warning(w, 'Grid size', 'low occupancy')

    def test_efficient_launch_configuration(self):
        w = self._launch_empty_kernel(256, 256)

        self.assertEqual(len(w), 0)

    def test_warn_on_host_array(self):
        N = 10
        arr_f32 = np.zeros(N, dtype=np.float32)

        w = self._launch_store_kernel(arr_f32, N)

        self._assert_performance_warning(
            w, 'Host array used in CUDA kernel will incur', 'copy overhead')

    def test_pinned_warn_on_host_array(self):
        N = 10
        ary = cuda.pinned_array(N, dtype=np.float32)

        w = self._launch_store_kernel(ary, N)

        self._assert_performance_warning(
            w, 'Host array used in CUDA kernel will incur', 'copy overhead')

    def test_nowarn_on_mapped_array(self):
        N = 10
        ary = cuda.mapped_array(N, dtype=np.float32)

        w = self._launch_store_kernel(ary, N)

        self.assertEqual(len(w), 0)

    @linux_only
    def test_nowarn_on_managed_array(self):
        N = 10
        ary = cuda.managed_array(N, dtype=np.float32)

        w = self._launch_store_kernel(ary, N)

        self.assertEqual(len(w), 0)

    def test_nowarn_on_device_array(self):
        N = 10
        ary = cuda.device_array(N, dtype=np.float32)

        w = self._launch_store_kernel(ary, N)

        self.assertEqual(len(w), 0)

    def test_warn_on_debug_and_opt(self):
        w = self._record_jit(debug=True, opt=True)

        self.assertEqual(len(w), 1)
        self.assertIn('not supported by CUDA', str(w[0].message))

    def test_warn_on_debug_and_opt_default(self):
        # ``opt`` is left at its default here; one warning is still expected.
        w = self._record_jit(debug=True)

        self.assertEqual(len(w), 1)
        self.assertIn('not supported by CUDA', str(w[0].message))

    def test_no_warn_on_debug_and_no_opt(self):
        w = self._record_jit(debug=True, opt=False)

        self.assertEqual(len(w), 0)

    def test_no_warn_with_no_debug_and_opt_kwargs(self):
        w = self._record_jit()

        self.assertEqual(len(w), 0)
136
+
137
+
138
+ if __name__ == '__main__':
139
+ unittest.main()
@@ -0,0 +1,276 @@
1
+ import numpy as np
2
+ from numba import cuda, int32, int64, float32, float64
3
+ from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
4
+ from numba.core import config
5
+
6
+
7
def useful_syncwarp(ary):
    """Have thread 0 write 42 to ``ary[0]``, then copy it to every slot."""
    tid = cuda.grid(1)
    if tid == 0:
        ary[0] = 42
    # Warp-level synchronization over the full 32-bit lane mask sits between
    # the write by thread 0 and the reads of ary[0] below.
    cuda.syncwarp(0xFFFFFFFF)
    ary[tid] = ary[0]
13
+
14
+
15
def use_shfl_sync_idx(ary, idx):
    """Store the result of shuffling each grid index from lane ``idx``."""
    tid = cuda.grid(1)
    ary[tid] = cuda.shfl_sync(0xFFFFFFFF, tid, idx)
19
+
20
+
21
def use_shfl_sync_up(ary, delta):
    """Store the result of an up-shuffle of each grid index by ``delta``."""
    tid = cuda.grid(1)
    ary[tid] = cuda.shfl_up_sync(0xFFFFFFFF, tid, delta)
25
+
26
+
27
def use_shfl_sync_down(ary, delta):
    """Store the result of a down-shuffle of each grid index by ``delta``."""
    tid = cuda.grid(1)
    ary[tid] = cuda.shfl_down_sync(0xFFFFFFFF, tid, delta)
31
+
32
+
33
def use_shfl_sync_xor(ary, xor):
    """Store the result of an XOR-shuffle of each grid index with ``xor``."""
    tid = cuda.grid(1)
    ary[tid] = cuda.shfl_xor_sync(0xFFFFFFFF, tid, xor)
37
+
38
+
39
def use_shfl_sync_with_val(ary, into):
    """Shuffle the scalar ``into`` from lane 0 and store it for each thread."""
    tid = cuda.grid(1)
    ary[tid] = cuda.shfl_sync(0xFFFFFFFF, into, 0)
43
+
44
+
45
def use_vote_sync_all(ary_in, ary_out):
    """Store the ``all_sync`` vote over each thread's ``ary_in`` element."""
    tid = cuda.grid(1)
    ary_out[tid] = cuda.all_sync(0xFFFFFFFF, ary_in[tid])
49
+
50
+
51
def use_vote_sync_any(ary_in, ary_out):
    """Store the ``any_sync`` vote over each thread's ``ary_in`` element."""
    tid = cuda.grid(1)
    ary_out[tid] = cuda.any_sync(0xFFFFFFFF, ary_in[tid])
55
+
56
+
57
def use_vote_sync_eq(ary_in, ary_out):
    """Store the ``eq_sync`` vote over each thread's ``ary_in`` element."""
    tid = cuda.grid(1)
    ary_out[tid] = cuda.eq_sync(0xFFFFFFFF, ary_in[tid])
61
+
62
+
63
def use_vote_sync_ballot(ary):
    """Store the ballot of an always-true predicate at ``threadIdx.x``."""
    lane = cuda.threadIdx.x
    ary[lane] = cuda.ballot_sync(0xFFFFFFFF, True)
67
+
68
+
69
def use_match_any_sync(ary_in, ary_out):
    """Store the ``match_any_sync`` result for each ``ary_in`` element."""
    tid = cuda.grid(1)
    ary_out[tid] = cuda.match_any_sync(0xFFFFFFFF, ary_in[tid])
73
+
74
+
75
def use_match_all_sync(ary_in, ary_out):
    """Store the ``match_all_sync`` mask, or 0 when its predicate is false."""
    tid = cuda.grid(1)
    mask, matched = cuda.match_all_sync(0xFFFFFFFF, ary_in[tid])
    if matched:
        ary_out[tid] = mask
    else:
        ary_out[tid] = 0
79
+
80
+
81
def use_independent_scheduling(arr):
    """Run ``ballot_sync`` in four branches chosen by ``threadIdx.x % 4``.

    Each branch passes a different lane mask; the ballot a thread obtains is
    stored at its own ``threadIdx.x`` position in ``arr``.
    """
    lane = cuda.threadIdx.x
    group = lane % 4
    # The four calls are deliberately kept in separate branches rather than
    # folded into one call with a computed mask.
    if group == 0:
        ballot = cuda.ballot_sync(0x11111111, True)
    elif group == 1:
        ballot = cuda.ballot_sync(0x22222222, True)
    elif group == 2:
        ballot = cuda.ballot_sync(0x44444444, True)
    elif group == 3:
        ballot = cuda.ballot_sync(0x88888888, True)
    arr[lane] = ballot
92
+
93
+
94
def _safe_cc_check(cc):
    """Report whether tests that need compute capability ``cc`` may run.

    Returns True when ``config.ENABLE_CUDASIM`` is set; otherwise returns
    whether the current device's compute capability is at least ``cc``.
    """
    if not config.ENABLE_CUDASIM:
        return cuda.get_current_device().compute_capability >= cc
    return True
99
+
100
+
101
@skip_on_cudasim("Warp Operations are not yet implemented on cudasim")
class TestCudaWarpOperations(CUDATestCase):
    """Launch each warp-level kernel on one block and check its output.

    Comparisons use ``np.testing.assert_array_equal`` so that a failure
    reports the mismatching elements.
    """

    def test_useful_syncwarp(self):
        compiled = cuda.jit("void(int32[:])")(useful_syncwarp)
        nelem = 32
        ary = np.empty(nelem, dtype=np.int32)
        compiled[1, nelem](ary)
        np.testing.assert_array_equal(ary, 42)

    def test_shfl_sync_idx(self):
        compiled = cuda.jit("void(int32[:], int32)")(use_shfl_sync_idx)
        nelem = 32
        idx = 4
        ary = np.empty(nelem, dtype=np.int32)
        compiled[1, nelem](ary, idx)
        np.testing.assert_array_equal(ary, idx)

    def test_shfl_sync_up(self):
        compiled = cuda.jit("void(int32[:], int32)")(use_shfl_sync_up)
        nelem = 32
        delta = 4
        ary = np.empty(nelem, dtype=np.int32)
        # Entries from ``delta`` onwards are expected to hold the index that
        # is ``delta`` lower; the first ``delta`` entries keep their own.
        exp = np.arange(nelem, dtype=np.int32)
        exp[delta:] -= delta
        compiled[1, nelem](ary, delta)
        np.testing.assert_array_equal(ary, exp)

    def test_shfl_sync_down(self):
        compiled = cuda.jit("void(int32[:], int32)")(use_shfl_sync_down)
        nelem = 32
        delta = 4
        ary = np.empty(nelem, dtype=np.int32)
        # All but the last ``delta`` entries are expected to hold the index
        # that is ``delta`` higher.
        exp = np.arange(nelem, dtype=np.int32)
        exp[:-delta] += delta
        compiled[1, nelem](ary, delta)
        np.testing.assert_array_equal(ary, exp)

    def test_shfl_sync_xor(self):
        compiled = cuda.jit("void(int32[:], int32)")(use_shfl_sync_xor)
        nelem = 32
        xor = 16
        ary = np.empty(nelem, dtype=np.int32)
        exp = np.arange(nelem, dtype=np.int32) ^ xor
        compiled[1, nelem](ary, xor)
        np.testing.assert_array_equal(ary, exp)

    def test_shfl_sync_types(self):
        types = int32, int64, float32, float64
        values = (np.int32(-1), np.int64(1 << 42),
                  np.float32(np.pi), np.float64(np.pi))
        for typ, val in zip(types, values):
            # One sub-test per type, so a failure names the offending type
            # and the remaining types still run.
            with self.subTest(typ=str(typ)):
                compiled = cuda.jit((typ[:], typ))(use_shfl_sync_with_val)
                nelem = 32
                ary = np.empty(nelem, dtype=val.dtype)
                compiled[1, nelem](ary, val)
                np.testing.assert_array_equal(ary, val)

    def test_vote_sync_all(self):
        compiled = cuda.jit("void(int32[:], int32[:])")(use_vote_sync_all)
        nelem = 32
        ary_in = np.ones(nelem, dtype=np.int32)
        ary_out = np.empty(nelem, dtype=np.int32)
        compiled[1, nelem](ary_in, ary_out)
        np.testing.assert_array_equal(ary_out, 1)
        # A single zero input is expected to flip the vote for every thread.
        ary_in[-1] = 0
        compiled[1, nelem](ary_in, ary_out)
        np.testing.assert_array_equal(ary_out, 0)

    def test_vote_sync_any(self):
        compiled = cuda.jit("void(int32[:], int32[:])")(use_vote_sync_any)
        nelem = 32
        ary_in = np.zeros(nelem, dtype=np.int32)
        ary_out = np.empty(nelem, dtype=np.int32)
        compiled[1, nelem](ary_in, ary_out)
        np.testing.assert_array_equal(ary_out, 0)
        ary_in[2] = 1
        ary_in[5] = 1
        compiled[1, nelem](ary_in, ary_out)
        np.testing.assert_array_equal(ary_out, 1)

    def test_vote_sync_eq(self):
        compiled = cuda.jit("void(int32[:], int32[:])")(use_vote_sync_eq)
        nelem = 32
        ary_in = np.zeros(nelem, dtype=np.int32)
        ary_out = np.empty(nelem, dtype=np.int32)
        compiled[1, nelem](ary_in, ary_out)
        np.testing.assert_array_equal(ary_out, 1)
        # Mixed inputs: the vote is expected to be 0 for every thread.
        ary_in[1] = 1
        compiled[1, nelem](ary_in, ary_out)
        np.testing.assert_array_equal(ary_out, 0)
        # Uniform inputs again, this time all ones.
        ary_in[:] = 1
        compiled[1, nelem](ary_in, ary_out)
        np.testing.assert_array_equal(ary_out, 1)

    def test_vote_sync_ballot(self):
        compiled = cuda.jit("void(uint32[:])")(use_vote_sync_ballot)
        nelem = 32
        ary = np.empty(nelem, dtype=np.uint32)
        compiled[1, nelem](ary)
        np.testing.assert_array_equal(ary, np.uint32(0xffffffff))

    @unittest.skipUnless(_safe_cc_check((7, 0)),
                         "Matching requires at least Volta Architecture")
    def test_match_any_sync(self):
        compiled = cuda.jit("void(int32[:], int32[:])")(use_match_any_sync)
        nelem = 10
        # Inputs alternate 0, 1, 0, 1, ...; the expected masks alternate
        # between the even-position and odd-position bit patterns.
        ary_in = np.arange(nelem, dtype=np.int32) % 2
        ary_out = np.empty(nelem, dtype=np.int32)
        exp = np.tile((0b0101010101, 0b1010101010), 5)
        compiled[1, nelem](ary_in, ary_out)
        np.testing.assert_array_equal(ary_out, exp)

    @unittest.skipUnless(_safe_cc_check((7, 0)),
                         "Matching requires at least Volta Architecture")
    def test_match_all_sync(self):
        compiled = cuda.jit("void(int32[:], int32[:])")(use_match_all_sync)
        nelem = 10
        ary_in = np.zeros(nelem, dtype=np.int32)
        ary_out = np.empty(nelem, dtype=np.int32)
        compiled[1, nelem](ary_in, ary_out)
        np.testing.assert_array_equal(ary_out, 0b1111111111)
        # One differing input: the kernel is expected to store 0 everywhere.
        ary_in[1] = 4
        compiled[1, nelem](ary_in, ary_out)
        np.testing.assert_array_equal(ary_out, 0)

    @unittest.skipUnless(_safe_cc_check((7, 0)),
                         "Independent scheduling requires at least Volta "
                         "Architecture")
    def test_independent_scheduling(self):
        compiled = cuda.jit("void(uint32[:])")(use_independent_scheduling)
        arr = np.empty(32, dtype=np.uint32)
        exp = np.tile((0x11111111, 0x22222222, 0x44444444, 0x88888888), 8)
        compiled[1, 32](arr)
        np.testing.assert_array_equal(arr, exp)

    def test_activemask(self):
        @cuda.jit
        def use_activemask(x):
            i = cuda.grid(1)
            if (i % 2) == 0:
                # Even numbered threads fill in even numbered array entries
                # with binary "...01010101"
                x[i] = cuda.activemask()
            else:
                # Odd numbered threads fill in odd numbered array entries
                # with binary "...10101010"
                x[i] = cuda.activemask()

        out = np.zeros(32, dtype=np.uint32)
        use_activemask[1, 32](out)

        # 0x5 = 0101: The pattern from even-numbered threads
        # 0xA = 1010: The pattern from odd-numbered threads
        expected = np.tile((0x55555555, 0xAAAAAAAA), 16)
        np.testing.assert_array_equal(out, expected)

    def test_lanemask_lt(self):
        @cuda.jit
        def use_lanemask_lt(x):
            i = cuda.grid(1)
            x[i] = cuda.lanemask_lt()

        out = np.zeros(32, dtype=np.uint32)
        use_lanemask_lt[1, 32](out)

        # A string of 1s that grows from the LSB for each entry:
        # 0, 1, 3, 7, F, 1F, 3F, 7F, FF, 1FF, etc.
        # or in binary:
        # ...0001, ...0011, ...0111, etc.
        expected = np.asarray([(2 ** i) - 1 for i in range(32)],
                              dtype=np.uint32)
        np.testing.assert_array_equal(out, expected)
273
+
274
+
275
+ if __name__ == '__main__':
276
+ unittest.main()
@@ -0,0 +1,6 @@
1
+ from numba.cuda.tests import load_testsuite
2
+ import os
3
+
4
+
5
def load_tests(loader, tests, pattern):
    """Build this package's suite via ``load_testsuite``.

    Only ``loader`` is used; ``tests`` and ``pattern`` are accepted but
    ignored.
    """
    package_dir = os.path.dirname(__file__)
    return load_testsuite(loader, package_dir)
@@ -0,0 +1,6 @@
1
+ from numba import cuda
2
+
3
+
4
@cuda.jit(device=True)
def cuda_module_in_device_function():
    """Return ``cuda.threadIdx.x`` as seen from inside a device function."""
    thread_x = cuda.threadIdx.x
    return thread_x
@@ -0,0 +1,102 @@
1
+ import threading
2
+
3
+ import numpy as np
4
+
5
+ from numba import cuda
6
+ from numba.cuda.testing import CUDATestCase, skip_unless_cudasim
7
+ import numba.cuda.simulator as simulator
8
+ import unittest
9
+
10
+
11
class TestCudaSimIssues(CUDATestCase):
    """Regression tests for record access, device functions and deadlocks."""

    def test_record_access(self):
        """Write nested record fields from a kernel and read them on host."""
        backyard_type = [('statue', np.float64),
                         ('newspaper', np.float64, (6,))]

        goose_type = [('garden', np.float64, (12,)),
                      ('town', np.float64, (42,)),
                      ('backyard', backyard_type)]

        goose_np_type = np.dtype(goose_type, align=True)

        @cuda.jit
        def simple_kernel(f):
            f.garden[0] = 45.0
            f.backyard.newspaper[3] = 2.0
            f.backyard.newspaper[3] = f.backyard.newspaper[3] + 3.0

        item = np.recarray(1, dtype=goose_np_type)
        simple_kernel[1, 1](item[0])
        np.testing.assert_equal(item[0]['garden'][0], 45)
        np.testing.assert_equal(item[0]['backyard']['newspaper'][3], 5)

    def test_recarray_setting(self):
        """Assign one whole record to another inside a kernel."""
        recordwith2darray = np.dtype([('i', np.int32),
                                      ('j', np.float32, (3, 2))])
        rec = np.recarray(2, dtype=recordwith2darray)
        rec[0]['i'] = 45

        @cuda.jit
        def simple_kernel(f):
            f[1] = f[0]
        simple_kernel[1, 1](rec)
        np.testing.assert_equal(rec[0]['i'], rec[1]['i'])

    def test_cuda_module_in_device_function(self):
        """
        Discovered in https://github.com/numba/numba/issues/1837.
        When the `cuda` module is referenced in a device function,
        it does not have the kernel API (e.g. cuda.threadIdx, cuda.shared)
        """
        from numba.cuda.tests.cudasim import support

        inner = support.cuda_module_in_device_function

        @cuda.jit
        def outer(out):
            tid = inner()
            if tid < out.size:
                out[tid] = tid

        arr = np.zeros(10, dtype=np.int32)
        # 11 threads for 10 elements: the bounds guard in ``outer`` is
        # exercised by the extra thread.
        outer[1, 11](arr)
        expected = np.arange(arr.size, dtype=np.int32)
        np.testing.assert_equal(expected, arr)

    @skip_unless_cudasim('Only works on CUDASIM')
    def test_deadlock_on_exception(self):
        """No simulator block thread may stay alive after a launch."""
        def assert_no_blockthreads():
            # Collect every simulator block thread that is still running so
            # the final assertion checks a real list and reports them all.
            blockthreads = []
            for t in threading.enumerate():
                if not isinstance(t, simulator.kernel.BlockThread):
                    continue

                # join blockthreads with a short timeout to allow aborted
                # threads to exit
                t.join(1)
                if t.is_alive():
                    blockthreads.append(t)

            self.assertListEqual(blockthreads, [],
                                 "Blocked kernel threads remain")

        @simulator.jit
        def assign_with_sync(x, y):
            i = cuda.grid(1)
            y[i] = x[i]

            cuda.syncthreads()
            cuda.syncthreads()

        x = np.arange(3)
        y = np.empty(3)
        assign_with_sync[1, 3](x, y)
        np.testing.assert_array_equal(x, y)
        assert_no_blockthreads()

        # Six threads index three-element arrays, so the launch must raise;
        # afterwards no block thread may be left waiting at a barrier.
        with self.assertRaises(IndexError):
            assign_with_sync[1, 6](x, y)
        assert_no_blockthreads()
99
+
100
+
101
+ if __name__ == '__main__':
102
+ unittest.main()
File without changes
@@ -0,0 +1,5 @@
1
+ // Not all CUDA includes are safe to include in device code compiled by NVRTC,
2
+ // because it does not have paths to all system include directories. Headers
3
+ // such as cuda_device_runtime_api.h are safe to use in NVRTC without adding
4
+ // additional includes.
5
+ #include <cuda_device_runtime_api.h>
@@ -0,0 +1,7 @@
1
extern "C" __device__
int bar(int* out, int a) {
  // Intentionally invalid tokens: they are here to make compilation fail.
  SYNTAX ERROR
  out[0] = 2 * a;
  return 0;
}
@@ -0,0 +1,23 @@
1
+ // Compile with:
2
+ //
3
+ // nvcc -gencode arch=compute_50,code=compute_50 -rdc true -ptx jitlink.cu
4
+ //
5
+ // using the oldest supported toolkit version (10.2 at the time of writing).
6
+
7
// Writes twice the value of `a` through `out` and returns 0.
extern "C" __device__
int bar(int *out, int a)
{
  out[0] = 2 * a;
  return 0;
}
13
+
14
+
15
// Numba's CUDA calling convention always reserves the first parameter for a
// pointer to the returned value, so `out` must be present even though this
// function returns nothing through it.
//
// Copies the second element of `a` over the first and returns 0.
extern "C" __device__
int array_mutator(void *out, int *a)
{
  *a = *(a + 1);
  return 0;
}
@@ -0,0 +1,51 @@
1
+ //
2
+ // Generated by NVIDIA NVVM Compiler
3
+ //
4
+ // Compiler Build ID: CL-27506705
5
+ // Cuda compilation tools, release 10.2, V10.2.89
6
+ // Based on LLVM 3.4svn
7
+ //
8
+
9
+ .version 6.5
10
+ .target sm_50
11
+ .address_size 64
12
+
13
+ // .globl bar
14
+
15
+ .visible .func (.param .b32 func_retval0) bar(
16
+ .param .b64 bar_param_0,
17
+ .param .b32 bar_param_1
18
+ )
19
+ {
20
+ .reg .b32 %r<4>;
21
+ .reg .b64 %rd<2>;
22
+
23
+
24
+ ld.param.u64 %rd1, [bar_param_0];
25
+ ld.param.u32 %r1, [bar_param_1];
26
+ shl.b32 %r2, %r1, 1;
27
+ st.u32 [%rd1], %r2;
28
+ mov.u32 %r3, 0;
29
+ st.param.b32 [func_retval0+0], %r3;
30
+ ret;
31
+ }
32
+
33
+ // .globl array_mutator
34
+ .visible .func (.param .b32 func_retval0) array_mutator(
35
+ .param .b64 array_mutator_param_0,
36
+ .param .b64 array_mutator_param_1
37
+ )
38
+ {
39
+ .reg .b32 %r<3>;
40
+ .reg .b64 %rd<2>;
41
+
42
+
43
+ ld.param.u64 %rd1, [array_mutator_param_1];
44
+ ld.u32 %r1, [%rd1+4];
45
+ st.u32 [%rd1], %r1;
46
+ mov.u32 %r2, 0;
47
+ st.param.b32 [func_retval0+0], %r2;
48
+ ret;
49
+ }
50
+
51
+