numba-cuda 0.0.1__py3-none-any.whl → 0.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.13.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.13.dist-info/METADATA +69 -0
  229. numba_cuda-0.0.13.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.13.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.1.dist-info/METADATA +0 -10
  232. numba_cuda-0.0.1.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.13.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,444 @@
1
+ from numba import cuda, int32, float64, void
2
+ from numba.core.errors import TypingError
3
+ from numba.core import types
4
+ from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
5
+
6
+ import numpy as np
7
+ from numba.np import numpy_support as nps
8
+
9
+ from .extensions_usecases import test_struct_model_type, TestStruct
10
+
11
# Record dtype used by the shared-memory recarray test: one int32 scalar
# field 'i' plus a nested (3, 2) float32 array field 'j'.
recordwith2darray = np.dtype([
    ('i', np.int32),
    ('j', np.float32, (3, 2)),
])
13
+
14
+
15
class TestSharedMemoryIssue(CUDATestCase):
    """Regression tests for previously reported shared memory issues."""

    def test_issue_953_sm_linkage_conflict(self):
        """Launch a kernel that declares a shared array and calls a device
        function that declares a shared array of its own."""
        @cuda.jit(device=True)
        def inner():
            inner_arr = cuda.shared.array(1, dtype=int32)  # noqa: F841

        @cuda.jit
        def outer():
            outer_arr = cuda.shared.array(1, dtype=int32)  # noqa: F841
            inner()

        outer[1, 1]()

    def _check_shared_array_size(self, shape, expected, ty=int32,
                                 result_dtype=np.int32):
        """Check the ``size`` attribute of a shared array inside a kernel.

        A single-thread kernel creates a shared array of the given ``shape``
        and element type ``ty``, and stores its ``size`` in a one-element
        host array of ``result_dtype``; that value must equal ``expected``.

        ``ty`` and ``result_dtype`` default to the int32 configuration, so
        existing two-argument callers are unaffected.
        """
        @cuda.jit
        def s(a):
            arr = cuda.shared.array(shape, dtype=ty)
            a[0] = arr.size

        result = np.zeros(1, dtype=result_dtype)
        s[1, 1](result)
        self.assertEqual(result[0], expected)

    def test_issue_1051_shared_size_broken_1d(self):
        self._check_shared_array_size(2, 2)

    def test_issue_1051_shared_size_broken_2d(self):
        self._check_shared_array_size((2, 3), 6)

    def test_issue_1051_shared_size_broken_3d(self):
        self._check_shared_array_size((2, 3, 4), 24)

    def _check_shared_array_size_fp16(self, shape, expected, ty):
        """Same check as ``_check_shared_array_size`` with element type
        ``ty`` and a float16 result array."""
        self._check_shared_array_size(shape, expected, ty=ty,
                                      result_dtype=np.float16)

    def test_issue_fp16_support(self):
        # The dtype may be given as a Numba type or as a NumPy type
        self._check_shared_array_size_fp16(2, 2, types.float16)
        self._check_shared_array_size_fp16(2, 2, np.float16)

    def test_issue_2393(self):
        """
        Test for a misaligned address error in a warp, attributed to NVVM
        not knowing the alignment of the shared arrays (although it should
        have taken the natural alignment of the type).

        There are no assertions; the test passes if the launch and the
        following synchronize complete without raising.
        """
        num_weights = 2
        num_blocks = 48
        examples_per_block = 4
        threads_per_block = 1

        @cuda.jit
        def costs_func(d_block_costs):
            s_features = cuda.shared.array((examples_per_block, num_weights),
                                           float64)
            s_initialcost = cuda.shared.array(7, float64)  # Bug

            threadIdx = cuda.threadIdx.x

            prediction = 0
            for j in range(num_weights):
                prediction += s_features[threadIdx, j]

            d_block_costs[0] = s_initialcost[0] + prediction

        block_costs = np.zeros(num_blocks, dtype=np.float64)
        d_block_costs = cuda.to_device(block_costs)

        costs_func[num_blocks, threads_per_block](d_block_costs)

        cuda.synchronize()
92
+
93
+
94
class TestSharedMemory(CUDATestCase):
    """Tests of static and dynamic shared memory arrays, including slices
    of dynamic shared memory."""

    def _test_shared(self, arr):
        """Copy ``arr`` to a device array via shared memory and check that
        the result equals the input."""
        # Use a kernel that copies via shared memory to check loading and
        # storing different dtypes with shared memory. All threads in a block
        # collaborate to load in values, then the output values are written
        # only by the first thread in the block after synchronization.

        nelem = len(arr)
        nthreads = 16
        # Floor division keeps the block count an exact integer instead of
        # truncating the result of a float division
        nblocks = nelem // nthreads
        dt = nps.from_dtype(arr.dtype)

        @cuda.jit
        def use_sm_chunk_copy(x, y):
            sm = cuda.shared.array(nthreads, dtype=dt)

            tx = cuda.threadIdx.x
            bx = cuda.blockIdx.x
            bd = cuda.blockDim.x

            # Load this block's chunk into shared
            i = bx * bd + tx
            if i < len(x):
                sm[tx] = x[i]

            cuda.syncthreads()

            # One thread per block writes this block's chunk
            if tx == 0:
                for j in range(nthreads):
                    y[bd * bx + j] = sm[j]

        d_result = cuda.device_array_like(arr)
        use_sm_chunk_copy[nblocks, nthreads](arr, d_result)
        host_result = d_result.copy_to_host()
        np.testing.assert_array_equal(arr, host_result)

    def test_shared_recarray(self):
        """Shared memory copy of records containing a nested 2D array."""
        arr = np.recarray(128, dtype=recordwith2darray)
        for x in range(len(arr)):
            arr[x].i = x
            j = np.arange(3 * 2, dtype=np.float32)
            arr[x].j = j.reshape(3, 2) * x

        self._test_shared(arr)

    def test_shared_bool(self):
        """Shared memory copy of boolean values."""
        arr = np.random.randint(2, size=(1024,), dtype=np.bool_)
        self._test_shared(arr)

    def _test_dynshared_slice(self, func, arr, expected):
        """Launch ``func`` on ``arr`` with one thread and enough dynamic
        shared memory to hold ``arr``, then compare ``arr`` to ``expected``.
        """
        # Check that slices of shared memory are correct
        # (See Bug #5073 - prior to the addition of these tests and
        # corresponding fix, slices of dynamic shared arrays all aliased each
        # other)
        nshared = arr.size * arr.dtype.itemsize
        func[1, 1, 0, nshared](arr)
        np.testing.assert_array_equal(expected, arr)

    def test_dynshared_slice_write(self):
        # Test writing values into disjoint slices of dynamic shared memory
        @cuda.jit
        def slice_write(x):
            dynsmem = cuda.shared.array(0, dtype=int32)
            sm1 = dynsmem[0:1]
            sm2 = dynsmem[1:2]

            sm1[0] = 1
            sm2[0] = 2
            x[0] = dynsmem[0]
            x[1] = dynsmem[1]

        arr = np.zeros(2, dtype=np.int32)
        expected = np.array([1, 2], dtype=np.int32)
        self._test_dynshared_slice(slice_write, arr, expected)

    def test_dynshared_slice_read(self):
        # Test reading values from disjoint slices of dynamic shared memory
        @cuda.jit
        def slice_read(x):
            dynsmem = cuda.shared.array(0, dtype=int32)
            sm1 = dynsmem[0:1]
            sm2 = dynsmem[1:2]

            dynsmem[0] = 1
            dynsmem[1] = 2
            x[0] = sm1[0]
            x[1] = sm2[0]

        arr = np.zeros(2, dtype=np.int32)
        expected = np.array([1, 2], dtype=np.int32)
        self._test_dynshared_slice(slice_read, arr, expected)

    def test_dynshared_slice_diff_sizes(self):
        # Test reading values from disjoint slices of dynamic shared memory
        # with different sizes
        @cuda.jit
        def slice_diff_sizes(x):
            dynsmem = cuda.shared.array(0, dtype=int32)
            sm1 = dynsmem[0:1]
            sm2 = dynsmem[1:3]

            dynsmem[0] = 1
            dynsmem[1] = 2
            dynsmem[2] = 3
            x[0] = sm1[0]
            x[1] = sm2[0]
            x[2] = sm2[1]

        arr = np.zeros(3, dtype=np.int32)
        expected = np.array([1, 2, 3], dtype=np.int32)
        self._test_dynshared_slice(slice_diff_sizes, arr, expected)

    def test_dynshared_slice_overlap(self):
        # Test reading values from overlapping slices of dynamic shared memory
        @cuda.jit
        def slice_overlap(x):
            dynsmem = cuda.shared.array(0, dtype=int32)
            sm1 = dynsmem[0:2]
            sm2 = dynsmem[1:4]

            dynsmem[0] = 1
            dynsmem[1] = 2
            dynsmem[2] = 3
            dynsmem[3] = 4
            x[0] = sm1[0]
            x[1] = sm1[1]
            x[2] = sm2[0]
            x[3] = sm2[1]
            x[4] = sm2[2]

        arr = np.zeros(5, dtype=np.int32)
        expected = np.array([1, 2, 2, 3, 4], dtype=np.int32)
        self._test_dynshared_slice(slice_overlap, arr, expected)

    def test_dynshared_slice_gaps(self):
        # Test writing values to slices of dynamic shared memory doesn't write
        # outside the slice
        @cuda.jit
        def slice_gaps(x):
            dynsmem = cuda.shared.array(0, dtype=int32)
            sm1 = dynsmem[1:3]
            sm2 = dynsmem[4:6]

            # Initial values for dynamic shared memory, some to be overwritten
            dynsmem[0] = 99
            dynsmem[1] = 99
            dynsmem[2] = 99
            dynsmem[3] = 99
            dynsmem[4] = 99
            dynsmem[5] = 99
            dynsmem[6] = 99

            sm1[0] = 1
            sm1[1] = 2
            sm2[0] = 3
            sm2[1] = 4

            x[0] = dynsmem[0]
            x[1] = dynsmem[1]
            x[2] = dynsmem[2]
            x[3] = dynsmem[3]
            x[4] = dynsmem[4]
            x[5] = dynsmem[5]
            x[6] = dynsmem[6]

        arr = np.zeros(7, dtype=np.int32)
        expected = np.array([99, 1, 2, 99, 3, 4, 99], dtype=np.int32)
        self._test_dynshared_slice(slice_gaps, arr, expected)

    def test_dynshared_slice_write_backwards(self):
        # Test writing values into disjoint slices of dynamic shared memory
        # with negative steps
        @cuda.jit
        def slice_write_backwards(x):
            dynsmem = cuda.shared.array(0, dtype=int32)
            sm1 = dynsmem[1::-1]
            sm2 = dynsmem[3:1:-1]

            sm1[0] = 1
            sm1[1] = 2
            sm2[0] = 3
            sm2[1] = 4
            x[0] = dynsmem[0]
            x[1] = dynsmem[1]
            x[2] = dynsmem[2]
            x[3] = dynsmem[3]

        arr = np.zeros(4, dtype=np.int32)
        expected = np.array([2, 1, 4, 3], dtype=np.int32)
        self._test_dynshared_slice(slice_write_backwards, arr, expected)

    def test_dynshared_slice_nonunit_stride(self):
        # Test writing values into slice of dynamic shared memory with
        # non-unit stride
        @cuda.jit
        def slice_nonunit_stride(x):
            dynsmem = cuda.shared.array(0, dtype=int32)
            sm1 = dynsmem[::2]

            # Initial values for dynamic shared memory, some to be overwritten
            dynsmem[0] = 99
            dynsmem[1] = 99
            dynsmem[2] = 99
            dynsmem[3] = 99
            dynsmem[4] = 99
            dynsmem[5] = 99

            sm1[0] = 1
            sm1[1] = 2
            sm1[2] = 3

            x[0] = dynsmem[0]
            x[1] = dynsmem[1]
            x[2] = dynsmem[2]
            x[3] = dynsmem[3]
            x[4] = dynsmem[4]
            x[5] = dynsmem[5]

        arr = np.zeros(6, dtype=np.int32)
        expected = np.array([1, 99, 2, 99, 3, 99], dtype=np.int32)
        self._test_dynshared_slice(slice_nonunit_stride, arr, expected)

    def test_dynshared_slice_nonunit_reverse_stride(self):
        # Test writing values into slice of dynamic shared memory with
        # reverse non-unit stride
        @cuda.jit
        def slice_nonunit_reverse_stride(x):
            dynsmem = cuda.shared.array(0, dtype=int32)
            sm1 = dynsmem[-1::-2]

            # Initial values for dynamic shared memory, some to be overwritten
            dynsmem[0] = 99
            dynsmem[1] = 99
            dynsmem[2] = 99
            dynsmem[3] = 99
            dynsmem[4] = 99
            dynsmem[5] = 99

            sm1[0] = 1
            sm1[1] = 2
            sm1[2] = 3

            x[0] = dynsmem[0]
            x[1] = dynsmem[1]
            x[2] = dynsmem[2]
            x[3] = dynsmem[3]
            x[4] = dynsmem[4]
            x[5] = dynsmem[5]

        arr = np.zeros(6, dtype=np.int32)
        expected = np.array([99, 3, 99, 2, 99, 1], dtype=np.int32)
        self._test_dynshared_slice(slice_nonunit_reverse_stride, arr, expected)

    def test_issue_5073(self):
        # An example with which Bug #5073 (slices of dynamic shared memory all
        # alias) was discovered. The kernel uses all threads in the block to
        # load values into slices of dynamic shared memory. One thread per
        # block then writes the loaded values back to a global array after
        # syncthreads().

        arr = np.arange(1024)
        nelem = len(arr)
        nthreads = 16
        # Floor division: these are exact integer counts, so avoid going
        # through float division and truncation
        nblocks = nelem // nthreads
        dt = nps.from_dtype(arr.dtype)
        nshared = nthreads * arr.dtype.itemsize
        chunksize = nthreads // 2

        @cuda.jit
        def sm_slice_copy(x, y, chunksize):
            dynsmem = cuda.shared.array(0, dtype=dt)
            sm1 = dynsmem[0:chunksize]
            sm2 = dynsmem[chunksize:chunksize * 2]

            tx = cuda.threadIdx.x
            bx = cuda.blockIdx.x
            bd = cuda.blockDim.x

            # load this block's chunk into shared
            i = bx * bd + tx
            if i < len(x):
                if tx < chunksize:
                    sm1[tx] = x[i]
                else:
                    sm2[tx - chunksize] = x[i]

            cuda.syncthreads()

            # one thread per block writes this block's chunk
            if tx == 0:
                for j in range(chunksize):
                    y[bd * bx + j] = sm1[j]
                    y[bd * bx + j + chunksize] = sm2[j]

        d_result = cuda.device_array_like(arr)
        sm_slice_copy[nblocks, nthreads, 0, nshared](arr, d_result, chunksize)
        host_result = d_result.copy_to_host()
        np.testing.assert_array_equal(arr, host_result)

    @skip_on_cudasim("Can't check typing in simulator")
    def test_invalid_array_type(self):
        """Unsupported or invalid shared array dtypes raise TypingError."""
        # The local name 'arr' in the functions below appears in the first
        # expected message, so it must not be renamed.
        rgx = ".*Cannot infer the type of variable 'arr'.*"

        def unsupported_type():
            arr = cuda.shared.array(10, dtype=np.dtype('O'))  # noqa: F841
        with self.assertRaisesRegex(TypingError, rgx):
            cuda.jit(void())(unsupported_type)

        rgx = ".*Invalid NumPy dtype specified: 'int33'.*"

        def invalid_string_type():
            arr = cuda.shared.array(10, dtype='int33')  # noqa: F841
        with self.assertRaisesRegex(TypingError, rgx):
            cuda.jit(void())(invalid_string_type)

    @skip_on_cudasim("Struct model array unsupported in simulator")
    def test_struct_model_type_static(self):
        """Store struct-model values in a static shared array and read them
        back in reverse order."""
        nthreads = 64

        @cuda.jit(void(int32[::1], int32[::1]))
        def write_then_reverse_read_static(outx, outy):
            # Test creation
            arr = cuda.shared.array(nthreads, dtype=test_struct_model_type)

            i = cuda.grid(1)
            ri = nthreads - i - 1

            if i < len(outx) and i < len(outy):
                # Test set to arr
                obj = TestStruct(int32(i), int32(i * 2))
                arr[i] = obj

                cuda.syncthreads()
                # Test get from arr
                outx[i] = arr[ri].x
                outy[i] = arr[ri].y

        arrx = np.zeros((nthreads,), dtype="int32")
        arry = np.zeros((nthreads,), dtype="int32")

        write_then_reverse_read_static[1, nthreads](arrx, arry)

        # Thread i reads the element written by thread nthreads - i - 1,
        # whose fields are (index, index * 2)
        reversed_idx = np.arange(nthreads - 1, -1, -1)
        np.testing.assert_array_equal(arrx, reversed_idx)
        np.testing.assert_array_equal(arry, reversed_idx * 2)
441
+
442
+
443
# Allow this test module to be run directly as a script.
if __name__ == "__main__":
    unittest.main()
@@ -0,0 +1,205 @@
1
+ import numpy as np
2
+ from numba import cuda, float32, int32, void
3
+ from numba.core.errors import TypingError
4
+ from numba.cuda.testing import unittest, CUDATestCase
5
+ from numba.cuda.testing import skip_on_cudasim
6
+ from .extensions_usecases import test_struct_model_type
7
+
8
# Module-level values used as shared array shapes by the kernels below:
# two integer scalars and a two-element tuple.
GLOBAL_CONSTANT = 5
GLOBAL_CONSTANT_2 = 6
GLOBAL_CONSTANT_TUPLE = (5, 6)
11
+
12
+
13
def udt_global_constants(A):
    """Read from a 1D shared array whose shape is the module-level integer
    GLOBAL_CONSTANT, copying one element per thread into A."""
    shared = cuda.shared.array(shape=GLOBAL_CONSTANT, dtype=float32)
    idx = cuda.grid(1)
    A[idx] = shared[idx]
17
+
18
+
19
def udt_global_build_tuple(A):
    """Read from a 2D shared array whose shape is a tuple built inline from
    two module-level integers, copying one element per thread into A."""
    shared = cuda.shared.array(shape=(GLOBAL_CONSTANT, GLOBAL_CONSTANT_2),
                               dtype=float32)
    row, col = cuda.grid(2)
    A[row, col] = shared[row, col]
24
+
25
+
26
def udt_global_build_list(A):
    """Request a shared array whose shape is a list (rather than a tuple)
    built inline from two module-level integers."""
    shared = cuda.shared.array(shape=[GLOBAL_CONSTANT, GLOBAL_CONSTANT_2],
                               dtype=float32)
    row, col = cuda.grid(2)
    A[row, col] = shared[row, col]
31
+
32
+
33
def udt_global_constant_tuple(A):
    """Read from a 2D shared array whose shape is the module-level tuple
    GLOBAL_CONSTANT_TUPLE, copying one element per thread into A."""
    shared = cuda.shared.array(shape=GLOBAL_CONSTANT_TUPLE, dtype=float32)
    row, col = cuda.grid(2)
    A[row, col] = shared[row, col]
37
+
38
+
39
def udt_invalid_1(A):
    """Request a shared array whose scalar shape is the runtime value A[0]
    instead of a constant."""
    shared = cuda.shared.array(shape=A[0], dtype=float32)
    idx = cuda.grid(1)
    A[idx] = shared[idx]
43
+
44
+
45
def udt_invalid_2(A):
    """Request a 2D shared array whose shape tuple contains the runtime
    value A[0]; A is indexed with two indices afterwards."""
    shared = cuda.shared.array(shape=(1, A[0]), dtype=float32)
    row, col = cuda.grid(2)
    A[row, col] = shared[row, col]
49
+
50
+
51
def udt_invalid_3(A):
    """Request a 2D shared array whose shape tuple contains the runtime
    value A[0]; A is indexed with a single index afterwards."""
    shared = cuda.shared.array(shape=(1, A[0]), dtype=float32)
    idx = cuda.grid(1)
    A[idx] = shared[idx, 0]
55
+
56
+
57
class TestSharedMemoryCreation(CUDATestCase):
    """Tests of which shape and dtype arguments are accepted when creating
    a shared array."""

    def getarg(self):
        """Return a one-element 1D float32 array."""
        return np.array(100, dtype=np.float32, ndmin=1)

    def getarg2(self):
        """Return the array from ``getarg`` reshaped to (1, 1)."""
        return self.getarg().reshape(1, 1)

    def _assert_no_shared_array_impl(self, raises, signature):
        """Check the typing error captured by ``raises``.

        The exception message must report that no implementation of
        ``shared.array`` was found, and must contain ``signature`` as the
        (possibly truncated) text of the rejected call signature.
        """
        msg = str(raises.exception)
        self.assertIn("No implementation of function "
                      "Function(<function shared.array",
                      msg)
        self.assertIn("found for signature:\n \n "
                      ">>> " + signature,
                      msg)

    def test_global_constants(self):
        udt = cuda.jit((float32[:],))(udt_global_constants)
        udt[1, 1](self.getarg())

    def test_global_build_tuple(self):
        udt = cuda.jit((float32[:, :],))(udt_global_build_tuple)
        udt[1, 1](self.getarg2())

    @skip_on_cudasim('Simulator does not prohibit lists for shared array shape')
    def test_global_build_list(self):
        # A list is not accepted as a shared array shape
        with self.assertRaises(TypingError) as raises:
            cuda.jit((float32[:, :],))(udt_global_build_list)

        self._assert_no_shared_array_impl(
            raises,
            "array(shape=list(int64)<iv=[5, 6]>, "
            "dtype=class(float32)")

    def test_global_constant_tuple(self):
        udt = cuda.jit((float32[:, :],))(udt_global_constant_tuple)
        udt[1, 1](self.getarg2())

    @skip_on_cudasim("Can't check for constants in simulator")
    def test_invalid_1(self):
        # Scalar shape cannot be a floating point value
        with self.assertRaises(TypingError) as raises:
            cuda.jit((float32[:],))(udt_invalid_1)

        self._assert_no_shared_array_impl(
            raises,
            "array(shape=float32, dtype=class(float32))")

    @skip_on_cudasim("Can't check for constants in simulator")
    def test_invalid_2(self):
        # Tuple shape cannot contain a non-integer element: with a 2D
        # float32 argument, A[0] is typed as a 1D float32 array, as the
        # expected signature below shows
        with self.assertRaises(TypingError) as raises:
            cuda.jit((float32[:, :],))(udt_invalid_2)

        self._assert_no_shared_array_impl(
            raises,
            "array(shape=Tuple(Literal[int](1), "
            "array(float32, 1d, A)), dtype=class(float32))")

    @skip_on_cudasim("Can't check for constants in simulator")
    def test_invalid_3(self):
        # Scalar shape must be literal
        with self.assertRaises(TypingError) as raises:
            cuda.jit((int32[:],))(udt_invalid_1)

        self._assert_no_shared_array_impl(
            raises,
            "array(shape=int32, dtype=class(float32))")

    @skip_on_cudasim("Can't check for constants in simulator")
    def test_invalid_4(self):
        # Tuple shape must contain only literals
        with self.assertRaises(TypingError) as raises:
            cuda.jit((int32[:],))(udt_invalid_3)

        self._assert_no_shared_array_impl(
            raises,
            "array(shape=Tuple(Literal[int](1), int32), "
            "dtype=class(float32))")

    def check_dtype(self, f, dtype):
        """Assert that the local variable ``s`` in the first overload of the
        compiled kernel ``f`` was typed with element type ``dtype``.

        The kernel must therefore name its shared array ``s``.
        """
        # Find the typing of the dtype argument to cuda.shared.array
        annotation = next(iter(f.overloads.values()))._type_annotation
        l_dtype = annotation.typemap['s'].dtype
        # Ensure that the typing is correct
        self.assertEqual(l_dtype, dtype)

    @skip_on_cudasim("Can't check typing in simulator")
    def test_numba_dtype(self):
        # Check that Numba types can be used as the dtype of a shared array
        @cuda.jit(void(int32[::1]))
        def f(x):
            s = cuda.shared.array(10, dtype=int32)
            s[0] = x[0]
            x[0] = s[0]

        self.check_dtype(f, int32)

    @skip_on_cudasim("Can't check typing in simulator")
    def test_numpy_dtype(self):
        # Check that NumPy types can be used as the dtype of a shared array
        @cuda.jit(void(int32[::1]))
        def f(x):
            s = cuda.shared.array(10, dtype=np.int32)
            s[0] = x[0]
            x[0] = s[0]

        self.check_dtype(f, int32)

    @skip_on_cudasim("Can't check typing in simulator")
    def test_string_dtype(self):
        # Check that strings can be used to specify the dtype of a shared array
        @cuda.jit(void(int32[::1]))
        def f(x):
            s = cuda.shared.array(10, dtype='int32')
            s[0] = x[0]
            x[0] = s[0]

        self.check_dtype(f, int32)

    @skip_on_cudasim("Can't check typing in simulator")
    def test_invalid_string_dtype(self):
        # Check that strings of invalid dtypes cause a typing error
        # (the pattern is named rgx so it does not shadow the stdlib re module)
        rgx = ".*Invalid NumPy dtype specified: 'int33'.*"
        with self.assertRaisesRegex(TypingError, rgx):
            @cuda.jit(void(int32[::1]))
            def f(x):
                s = cuda.shared.array(10, dtype='int33')
                s[0] = x[0]
                x[0] = s[0]

    @skip_on_cudasim("Can't check typing in simulator")
    def test_type_with_struct_data_model(self):
        # Check that a type with a struct data model can be used as the dtype
        # of a shared array
        @cuda.jit(void(test_struct_model_type[::1]))
        def f(x):
            s = cuda.shared.array(10, dtype=test_struct_model_type)
            s[0] = x[0]
            x[0] = s[0]

        self.check_dtype(f, test_struct_model_type)
202
+
203
+
204
# Allow this test module to be run directly as a script.
if __name__ == "__main__":
    unittest.main()