numba-cuda 0.0.1__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233):
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.12.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.12.dist-info/METADATA +68 -0
  229. numba_cuda-0.0.12.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.1.dist-info/METADATA +0 -10
  232. numba_cuda-0.0.1.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,74 @@
1
+ import numpy as np
2
+
3
+ from numba import cuda, float32, void
4
+ from numba.cuda.testing import unittest, CUDATestCase
5
+ from numba.core import config
6
+
7
# Launch geometry: blocks per grid axis and threads per block axis. A much
# smaller problem is used under the CUDA simulator so that the test takes a
# reasonable amount of time there.
bpg, tpb = (2, 8) if config.ENABLE_CUDASIM else (50, 32)

# Edge length of the square matrices, and the shape of one shared-memory tile.
n = tpb * bpg
SM_SIZE = (tpb, tpb)
15
+
16
+
17
class TestCudaMatMul(CUDATestCase):
    """Checks a shared-memory tiled matrix multiply against ``np.dot``."""

    def test_func(self):
        """Multiply two random ``n`` x ``n`` float32 matrices on the device.

        The kernel output is copied back to the host and compared with
        ``np.dot`` using a relative tolerance of ``1e-5``.
        """

        @cuda.jit(void(float32[:, ::1], float32[:, ::1], float32[:, ::1]))
        def cu_square_matrix_mul(A, B, C):
            # One tile takes its shape from the module-level SM_SIZE tuple,
            # the other spells the (tpb, tpb) tuple out inline.
            tile_a = cuda.shared.array(shape=SM_SIZE, dtype=float32)
            tile_b = cuda.shared.array(shape=(tpb, tpb), dtype=float32)

            tx = cuda.threadIdx.x
            ty = cuda.threadIdx.y

            # Global column (x) and row (y) handled by this thread.
            col = tx + cuda.blockIdx.x * cuda.blockDim.x
            row = ty + cuda.blockIdx.y * cuda.blockDim.y

            total = float32(0)  # forces all the math to be f32
            for step in range(bpg):
                # Stage one tile of each operand in shared memory.
                if col < n and row < n:
                    tile_a[ty, tx] = A[row, tx + step * tpb]
                    tile_b[ty, tx] = B[ty + step * tpb, col]

                # Wait until the whole tile has been written.
                cuda.syncthreads()

                if col < n and row < n:
                    for k in range(tpb):
                        total += tile_a[ty, k] * tile_b[k, tx]

                # Wait before the tiles are overwritten by the next step.
                cuda.syncthreads()

            if col < n and row < n:
                C[row, col] = total

        # Fixed seed so that every run multiplies the same inputs.
        np.random.seed(42)
        A = np.random.random((n, n)).astype(np.float32)
        B = np.random.random((n, n)).astype(np.float32)
        C = np.empty_like(A)

        # Transfers and the launch are all queued on a single stream; leaving
        # the ``with`` block synchronizes it.
        stream = cuda.stream()
        with stream.auto_synchronize():
            d_a = cuda.to_device(A, stream)
            d_b = cuda.to_device(B, stream)
            d_c = cuda.to_device(C, stream)
            launch = cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream]
            launch(d_a, d_b, d_c)
            d_c.copy_to_host(C, stream)

        # Reference result computed on the host.
        expected = np.dot(A, B)

        np.testing.assert_allclose(C, expected, rtol=1e-5)
71
+
72
+
73
# Run the tests of this module when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,113 @@
1
+ import numpy as np
2
+
3
+ from numba import cuda, float64
4
+ from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
5
+
6
+
7
def builtin_max(A, B, C):
    """Kernel body: store ``max(A[i], B[i])`` as float64 into ``C[i]``.

    ``i`` is this thread's 1D grid index; threads whose index is not a valid
    position in ``C`` do nothing.
    """
    idx = cuda.grid(1)

    if idx < len(C):
        C[idx] = float64(max(A[idx], B[idx]))
14
+
15
+
16
def builtin_min(A, B, C):
    """Kernel body: store ``min(A[i], B[i])`` as float64 into ``C[i]``.

    ``i`` is this thread's 1D grid index; threads whose index is not a valid
    position in ``C`` do nothing.
    """
    idx = cuda.grid(1)

    if idx < len(C):
        C[idx] = float64(min(A[idx], B[idx]))
23
+
24
+
25
@skip_on_cudasim('Tests PTX emission')
class TestCudaMinMax(CUDATestCase):
    """Checks builtin ``max``/``min`` kernels: numeric result and PTX text."""

    def _run(
            self,
            kernel,
            numpy_equivalent,
            ptx_instruction,
            dtype_left,
            dtype_right,
            n=5):
        """Compile *kernel*, run it on ``n`` elements and check the outcome.

        :param kernel: Python function to compile with ``cuda.jit``.
        :param numpy_equivalent: NumPy callable producing the expected values
            from the same two inputs.
        :param ptx_instruction: substring that must occur in the assembly
            returned by ``inspect_asm()``.
        :param dtype_left: dtype of the first input array.
        :param dtype_right: dtype of the second input array.
        :param n: number of elements (also used as the block size).
        """
        jitted = cuda.jit(kernel)

        lhs = np.arange(n, dtype=dtype_left) + .5
        rhs = np.full(n, fill_value=2, dtype=dtype_right)
        out = np.zeros(n, dtype=np.float64)

        jitted[1, out.shape](lhs, rhs, out)
        np.testing.assert_allclose(out, numpy_equivalent(lhs, rhs))

        # Only the first entry of the inspect_asm() mapping is examined.
        first_ptx = next(iter(jitted.inspect_asm().values()))
        self.assertIn(ptx_instruction, first_ptx)

    def test_max_f8f8(self):
        self._run(
            kernel=builtin_max,
            numpy_equivalent=np.maximum,
            ptx_instruction='max.f64',
            dtype_left=np.float64,
            dtype_right=np.float64)

    def test_max_f4f8(self):
        self._run(
            kernel=builtin_max,
            numpy_equivalent=np.maximum,
            ptx_instruction='max.f64',
            dtype_left=np.float32,
            dtype_right=np.float64)

    def test_max_f8f4(self):
        self._run(
            kernel=builtin_max,
            numpy_equivalent=np.maximum,
            ptx_instruction='max.f64',
            dtype_left=np.float64,
            dtype_right=np.float32)

    def test_max_f4f4(self):
        self._run(
            kernel=builtin_max,
            numpy_equivalent=np.maximum,
            ptx_instruction='max.f32',
            dtype_left=np.float32,
            dtype_right=np.float32)

    def test_min_f8f8(self):
        self._run(
            kernel=builtin_min,
            numpy_equivalent=np.minimum,
            ptx_instruction='min.f64',
            dtype_left=np.float64,
            dtype_right=np.float64)

    def test_min_f4f8(self):
        self._run(
            kernel=builtin_min,
            numpy_equivalent=np.minimum,
            ptx_instruction='min.f64',
            dtype_left=np.float32,
            dtype_right=np.float64)

    def test_min_f8f4(self):
        self._run(
            kernel=builtin_min,
            numpy_equivalent=np.minimum,
            ptx_instruction='min.f64',
            dtype_left=np.float64,
            dtype_right=np.float32)

    def test_min_f4f4(self):
        self._run(
            kernel=builtin_min,
            numpy_equivalent=np.minimum,
            ptx_instruction='min.f32',
            dtype_left=np.float32,
            dtype_right=np.float32)
110
+
111
+
112
# Run the tests of this module when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,22 @@
1
+ import math
2
+ from numba import cuda
3
+ from numba.cuda.testing import unittest, CUDATestCase
4
+
5
+
6
class TestCudaMonteCarlo(CUDATestCase):
    """Compile-only check of a Monte Carlo style path-stepping kernel."""

    def test_montecarlo(self):
        """Just make sure we can compile this

        The kernel is given an explicit signature in the decorator and is
        never launched by this test.
        """

        @cuda.jit(
            'void(double[:], double[:], double, double, double, double[:])')
        def step(last, paths, dt, c0, c1, normdist):
            idx = cuda.grid(1)
            # Threads past the end of ``paths`` have nothing to do.
            if idx < paths.shape[0]:
                noise = normdist[idx]
                paths[idx] = last[idx] * math.exp(c0 * dt + c1 * noise)
19
+
20
+
21
# Run the tests of this module when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,140 @@
1
+ from numba import cuda
2
+ import numpy as np
3
+ from numba.cuda.testing import skip_on_cudasim, CUDATestCase
4
+ import threading
5
+ import unittest
6
+
7
+
8
class TestMultiGPUContext(CUDATestCase):
    """Tests that select devices through ``with cuda.gpus[i]:`` blocks."""

    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
    def test_multigpu_context(self):
        """Launch one kernel inside and outside of GPU context blocks."""

        @cuda.jit("void(float64[:], float64[:])")
        def copy_plus_1(inp, out):
            i = cuda.grid(1)
            if i < out.size:
                out[i] = inp[i] + 1

        def check(inp, out):
            np.testing.assert_equal(inp + 1, out)

        def fresh_pair():
            # Two independent host arrays holding 0..N-1 as float64.
            return (np.arange(N, dtype=np.float64),
                    np.arange(N, dtype=np.float64))

        N = 32
        A, B = fresh_pair()

        # Explicitly on GPU 0.
        with cuda.gpus[0]:
            copy_plus_1[1, N](A, B)

        check(A, B)

        # No explicit device selection.
        copy_plus_1[1, N](A, B)
        check(A, B)

        # NOTE(review): the GPU 1 block is written nested inside the GPU 0
        # block; the nesting was reconstructed from statement order -- confirm
        # against the upstream test.
        with cuda.gpus[0]:
            A0, B0 = fresh_pair()
            copy_plus_1[1, N](A0, B0)

            with cuda.gpus[1]:
                A1, B1 = fresh_pair()
                copy_plus_1[1, N](A1, B1)

        check(A0, B0)
        check(A1, B1)

        # Once more without any explicit device selection.
        A, B = fresh_pair()
        copy_plus_1[1, N](A, B)
        check(A, B)

    @skip_on_cudasim('Simulator does not support multiple threads')
    def test_multithreaded(self):
        """Copy one device array back to the host from ten threads."""

        def work(gpu, dA, results, ridx):
            # Each worker stores either the exception it hit or the outcome
            # of the comparison into its own slot of ``results``.
            try:
                with gpu:
                    arr = dA.copy_to_host()
            except Exception as e:
                results[ridx] = e
            else:
                results[ridx] = np.all(arr == np.arange(10))

        dA = cuda.to_device(np.arange(10))

        nthreads = 10
        results = [None] * nthreads
        threads = []
        for i in range(nthreads):
            worker_args = (cuda.gpus.current, dA, results, i)
            threads.append(threading.Thread(target=work, args=worker_args))

        for th in threads:
            th.start()

        for th in threads:
            th.join()

        # Re-raise the first worker failure in the test thread.
        for outcome in results:
            if isinstance(outcome, BaseException):
                raise outcome
            self.assertTrue(outcome)

    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
    def test_with_context(self):
        """Keep one array per GPU and update/check each under its own GPU."""

        @cuda.jit
        def vector_add_scalar(arr, val):
            i = cuda.grid(1)
            if i < arr.size:
                arr[i] += val

        hostarr = np.arange(10, dtype=np.float32)

        # Upload a copy of the host data under each GPU.
        with cuda.gpus[0]:
            arr1 = cuda.to_device(hostarr)

        with cuda.gpus[1]:
            arr2 = cuda.to_device(hostarr)

        # Add a different scalar to each copy.
        with cuda.gpus[0]:
            vector_add_scalar[1, 10](arr1, 1)

        with cuda.gpus[1]:
            vector_add_scalar[1, 10](arr2, 2)

        # Each copy must carry only its own increment.
        with cuda.gpus[0]:
            np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 1))

        with cuda.gpus[1]:
            np.testing.assert_equal(arr2.copy_to_host(), (hostarr + 2))

    @unittest.skipIf(len(cuda.gpus) < 2, "need more than 1 gpus")
    def test_with_context_peer_copy(self):
        """Copy an array created under GPU 0 into one created under GPU 1."""
        # Peer access is not always possible - for example, with one GPU in TCC
        # mode and one in WDDM - if that is the case, this test would fail so
        # we need to skip it.
        with cuda.gpus[0]:
            ctx = cuda.current_context()
            if not ctx.can_access_peer(1):
                self.skipTest('Peer access between GPUs disabled')

        # 1. Create a range in an array
        hostarr = np.arange(10, dtype=np.float32)

        # 2. Copy range array from host -> GPU 0
        with cuda.gpus[0]:
            arr1 = cuda.to_device(hostarr)

        # 3. Initialize a zero-filled array on GPU 1
        with cuda.gpus[1]:
            arr2 = cuda.to_device(np.zeros_like(hostarr))

        # NOTE(review): steps 4 and 5 are both placed inside the GPU 0 block;
        # the nesting was reconstructed from statement order -- confirm.
        with cuda.gpus[0]:
            # 4. Copy range from GPU 0 -> GPU 1
            arr2.copy_to_device(arr1)

            # 5. Copy range from GPU 1 -> host and check contents
            np.testing.assert_equal(arr2.copy_to_host(), hostarr)
137
+
138
+
139
# Run the tests of this module when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,46 @@
1
+ import os
2
+ import multiprocessing as mp
3
+
4
+ import numpy as np
5
+
6
+ from numba import cuda
7
+ from numba.cuda.testing import skip_on_cudasim, CUDATestCase
8
+ import unittest
9
+
10
# Flags consumed by the skip decorators on the fork test below.
# True when the platform reports itself as POSIX.
is_unix = (os.name == 'posix')
# True when ``multiprocessing`` offers ``get_context``.
has_mp_get_context = hasattr(mp, 'get_context')
12
+
13
+
14
def fork_test(q):
    """Child-process body: attempt a device transfer and report on *q*.

    Exactly one item is put on *q*:

    * the ``CudaDriverError`` raised by ``cuda.to_device``, if there is one;
    * a ``RuntimeError`` carrying the formatted traceback for any other
      exception, so that a parent blocked in ``q.get()`` is always woken up
      instead of waiting forever on a child that died without reporting;
    * ``None`` when the transfer succeeded.

    :param q: object with a ``put`` method used to send the outcome back.
    """
    from numba.cuda.cudadrv.error import CudaDriverError
    try:
        cuda.to_device(np.arange(1))
    except CudaDriverError as e:
        q.put(e)
    except Exception:
        # Unexpected failure: send the traceback as text inside a plain
        # built-in exception rather than the original exception object.
        import traceback
        q.put(RuntimeError(traceback.format_exc()))
    else:
        q.put(None)
22
+
23
+
24
@skip_on_cudasim('disabled for cudasim')
class TestMultiprocessing(CUDATestCase):
    """Checks how CUDA use in a forked child process is reported."""

    @unittest.skipUnless(has_mp_get_context, 'requires mp.get_context')
    @unittest.skipUnless(is_unix, 'requires Unix')
    def test_fork(self):
        """
        Test fork detection.
        """
        # Make sure CUDA is initialized in this process before forking.
        cuda.current_context()

        # Fork a child that also tries to use CUDA.
        fork_ctx = mp.get_context('fork')
        queue = fork_ctx.Queue()
        child = fork_ctx.Process(target=fork_test, args=[queue])
        child.start()
        reported = queue.get()
        child.join()

        # The child must have reported an exception with this message.
        self.assertIsNotNone(reported)
        self.assertIn('CUDA initialized before forking', str(reported))
43
+
44
+
45
# Run the tests of this module when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,101 @@
1
+ import traceback
2
+ import threading
3
+ import multiprocessing
4
+ import numpy as np
5
+ from numba import cuda
6
+ from numba.cuda.testing import (skip_on_cudasim, skip_under_cuda_memcheck,
7
+ CUDATestCase)
8
+ import unittest
9
+
10
# Feature flags consumed by the skip decorators further down.
try:
    from concurrent.futures import ThreadPoolExecutor
    has_concurrent_futures = True
except ImportError:
    has_concurrent_futures = False


# True when ``multiprocessing`` offers ``get_context``.
has_mp_get_context = hasattr(multiprocessing, 'get_context')
19
+
20
+
21
def check_concurrent_compiling():
    """Launch one lazily compiled kernel from a pool of four threads.

    Ten device arrays holding 0..9 are each passed once to a kernel that
    increments element 0; every array handed back by the pool is compared
    with the expected contents.
    """
    @cuda.jit
    def foo(x):
        x[0] += 1

    def use_foo(x):
        foo[1, 1](x)
        return x

    expected = np.arange(10)
    expected[0] += 1

    device_arrays = [cuda.to_device(np.arange(10)) for i in range(10)]
    with ThreadPoolExecutor(max_workers=4) as pool:
        for result in pool.map(use_foo, device_arrays):
            np.testing.assert_equal(result, expected)
36
+
37
+
38
def spawn_process_entry(q):
    """Run ``check_concurrent_compiling`` and report the outcome on *q*.

    Puts ``None`` on success. On any failure, puts a string made of an empty
    line, a rule of 80 ``=`` characters and the formatted traceback.
    """
    try:
        check_concurrent_compiling()
    # Catch anything that goes wrong in the threads
    except BaseException:
        rule = '=' * 80
        q.put('\n'.join(['', rule, traceback.format_exc()]))
    else:
        q.put(None)
47
+
48
+
49
@skip_under_cuda_memcheck('Hangs cuda-memcheck')
@skip_on_cudasim('disabled for cudasim')
class TestMultiThreadCompiling(CUDATestCase):
    """Kernel launches and device copies issued from other threads."""

    @unittest.skipUnless(has_concurrent_futures, "no concurrent.futures")
    def test_concurrent_compiling(self):
        """Run the thread-pool check in this process."""
        check_concurrent_compiling()

    @unittest.skipUnless(has_mp_get_context, "no multiprocessing.get_context")
    def test_spawn_concurrent_compilation(self):
        """Run the thread-pool check in a spawned child process."""
        # force CUDA context init
        cuda.get_current_device()
        # use "spawn" to avoid inheriting the CUDA context
        spawn_ctx = multiprocessing.get_context('spawn')

        queue = spawn_ctx.Queue()
        child = spawn_ctx.Process(target=spawn_process_entry, args=(queue,))
        child.start()
        try:
            failure = queue.get()
        finally:
            # Join the child whether or not reading the queue succeeded.
            child.join()
        if failure is not None:
            raise AssertionError(failure)
        self.assertEqual(child.exitcode, 0, 'test failed in child process')

    def test_invalid_context_error_with_d2h(self):
        """Copy a device array to the host from a second thread."""
        def d2h(arr, out):
            out[:] = arr.copy_to_host()

        host = np.arange(1, 4)
        received = np.zeros_like(host)
        device = cuda.to_device(host)
        worker = threading.Thread(target=d2h, args=[device, received])
        worker.start()
        worker.join()
        np.testing.assert_equal(host, received)

    def test_invalid_context_error_with_d2d(self):
        """Copy one device array into another from a second thread."""
        def d2d(dst, src):
            dst.copy_to_device(src)

        host = np.arange(100)
        source = cuda.to_device(host)
        target = cuda.to_device(np.zeros(source.shape, dtype=source.dtype))
        worker = threading.Thread(target=d2d, args=[target, source])
        worker.start()
        worker.join()
        np.testing.assert_equal(target.copy_to_host(), host)
98
+
99
+
100
# Run the tests of this module when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,49 @@
1
+ import numpy as np
2
+ from numba import cuda, float32, void
3
+ from numba.cuda.testing import unittest, CUDATestCase
4
+
5
+
6
def generate_input(n):
    """Build the deterministic float32 operands used by the kernel test.

    :param n: edge length of the square matrix and length of the vector.
    :returns: a tuple ``(A, B)`` where ``A`` is an ``n`` x ``n`` float32 array
        holding ``0 .. n*n - 1`` in row-major order and ``B`` is a length-``n``
        float32 array holding ``0 .. n - 1``.
    """
    # Convert straight to float32 with astype(); no extra np.array() wrapper
    # or no-op "+ 0" is needed to obtain a fresh array of the right dtype.
    A = np.arange(n * n).reshape(n, n).astype(np.float32)
    B = np.arange(n).astype(A.dtype)
    return A, B
10
+
11
+
12
class TestCudaNonDet(CUDATestCase):
    """Regression test built around a kernel with strided ``range`` loops."""

    def test_for_pre(self):
        """Test issue with loop not running due to bad sign-extension at the for
        loop precondition.
        """

        @cuda.jit(void(float32[:, :], float32[:, :], float32[:]))
        def diagproduct(c, a, b):
            # Each thread starts at its own 2D grid position and advances by
            # gridDim * blockDim along each axis.
            startX, startY = cuda.grid(2)
            gridX = cuda.gridDim.x * cuda.blockDim.x
            gridY = cuda.gridDim.y * cuda.blockDim.y
            height = c.shape[0]
            width = c.shape[1]

            for x in range(startX, width, (gridX)):
                for y in range(startY, height, (gridY)):
                    c[y, x] = a[y, x] * b[x]

        N = 8
        A, B = generate_input(N)

        # Output buffer; it is sent to the device with ``copy=False``.
        F = np.empty(A.shape, dtype=A.dtype)

        griddim = (1, 1)
        blockdim = (32, 8)

        dA = cuda.to_device(A)
        dB = cuda.to_device(B)
        dF = cuda.to_device(F, copy=False)
        diagproduct[griddim, blockdim](dF, dA, dB)

        # Host reference: A multiplied by the diagonal matrix built from B.
        expected = np.dot(A, np.diag(B))
        np.testing.assert_array_almost_equal(dF.copy_to_host(), expected)
46
+
47
+
48
# Run the tests of this module when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()