numba-cuda 0.0.1__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (233)
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.12.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.12.dist-info/METADATA +68 -0
  229. numba_cuda-0.0.12.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.1.dist-info/METADATA +0 -10
  232. numba_cuda-0.0.1.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,7 @@
+ extern "C" __device__
+ int bar(int* out, int a) {
+   // Explicitly placed to generate a warning for testing the NVRTC program log
+   int unused;
+   *out = a * 2;
+   return 0;
+ }
@@ -0,0 +1,6 @@
+ from numba.cuda.tests import load_testsuite
+ import os
+
+
+ def load_tests(loader, tests, pattern):
+     return load_testsuite(loader, os.path.dirname(__file__))
@@ -0,0 +1,49 @@
+ // magictoken.ex_mul_f32_f32.begin
+ // Foreign function example: multiplication of a pair of floats
+
+ extern "C" __device__ int
+ mul_f32_f32(
+   float* return_value,
+   float x,
+   float y)
+ {
+   // Compute result and store in caller-provided slot
+   *return_value = x * y;
+
+   // Signal that no Python exception occurred
+   return 0;
+ }
+ // magictoken.ex_mul_f32_f32.end
+
+
+ // magictoken.ex_sum_reduce_proto.begin
+ extern "C"
+ __device__ int
+ sum_reduce(
+   float* return_value,
+   float* array,
+   int n
+ );
+ // magictoken.ex_sum_reduce_proto.end
+
+
+ // Performs a simple reduction on an array passed by pointer using the
+ // ffi.from_buffer() method. Implements the prototype above.
+ extern "C"
+ __device__ int
+ sum_reduce(
+   float* return_value,
+   float* array,
+   int n
+ )
+ {
+   double sum = 0.0;
+
+   for (size_t i = 0; i < n; ++i) {
+     sum += array[i];
+   }
+
+   *return_value = (float)sum;
+
+   return 0;
+ }
@@ -0,0 +1,77 @@
+ # Contents in this file are referenced from the sphinx-generated docs.
+ # "magictoken" is used for markers as beginning and ending of example text.
+
+ import unittest
+ from numba.cuda.testing import (CUDATestCase, skip_on_cudasim,
+                                 skip_if_cudadevrt_missing, skip_unless_cc_60,
+                                 skip_if_mvc_enabled)
+
+
+ @skip_if_cudadevrt_missing
+ @skip_unless_cc_60
+ @skip_if_mvc_enabled('CG not supported with MVC')
+ @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+ class TestCooperativeGroups(CUDATestCase):
+     def test_ex_grid_sync(self):
+         # magictoken.ex_grid_sync_kernel.begin
+         from numba import cuda, int32
+         import numpy as np
+
+         sig = (int32[:,::1],)
+
+         @cuda.jit(sig)
+         def sequential_rows(M):
+             col = cuda.grid(1)
+             g = cuda.cg.this_grid()
+
+             rows = M.shape[0]
+             cols = M.shape[1]
+
+             for row in range(1, rows):
+                 opposite = cols - col - 1
+                 # Each row's elements are one greater than the previous row
+                 M[row, col] = M[row - 1, opposite] + 1
+                 # Wait until all threads have written their column element,
+                 # and that the write is visible to all other threads
+                 g.sync()
+         # magictoken.ex_grid_sync_kernel.end
+
+         # magictoken.ex_grid_sync_data.begin
+         # Empty input data
+         A = np.zeros((1024, 1024), dtype=np.int32)
+         # A somewhat arbitrary choice (one warp), but generally smaller block sizes
+         # allow more blocks to be launched (noting that other limitations on
+         # occupancy apply such as shared memory size)
+         blockdim = 32
+         griddim = A.shape[1] // blockdim
+         # magictoken.ex_grid_sync_data.end
+
+         # Skip this test if the grid size used in the example is too large for
+         # a cooperative launch on the current GPU
+         mb = sequential_rows.overloads[sig].max_cooperative_grid_blocks(blockdim)
+         if mb < griddim:
+             self.skipTest('Device does not support a large enough coop grid')
+
+         # magictoken.ex_grid_sync_launch.begin
+         # Kernel launch - this is implicitly a cooperative launch
+         sequential_rows[griddim, blockdim](A)
+
+         # What do the results look like?
+         # print(A)
+         #
+         # [[   0    0    0 ...    0    0    0]
+         #  [   1    1    1 ...    1    1    1]
+         #  [   2    2    2 ...    2    2    2]
+         #  ...
+         #  [1021 1021 1021 ... 1021 1021 1021]
+         #  [1022 1022 1022 ... 1022 1022 1022]
+         #  [1023 1023 1023 ... 1023 1023 1023]]
+         # magictoken.ex_grid_sync_launch.end
+
+         # Sanity check - are the results what we expect?
+         reference = np.tile(np.arange(1024), (1024, 1)).T
+         np.testing.assert_equal(A, reference)
+
+
+ if __name__ == '__main__':
+     unittest.main()
@@ -0,0 +1,76 @@
+ import unittest
+
+ from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+ from numba.tests.support import captured_stdout
+ import numpy as np
+
+
+ @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+ class TestCpuGpuCompat(CUDATestCase):
+     """
+     Test compatibility of CPU and GPU functions
+     """
+
+     def setUp(self):
+         # Prevent output from this test showing up when running the test suite
+         self._captured_stdout = captured_stdout()
+         self._captured_stdout.__enter__()
+         super().setUp()
+
+     def tearDown(self):
+         # No exception type, value, or traceback
+         self._captured_stdout.__exit__(None, None, None)
+         super().tearDown()
+
+     def test_ex_cpu_gpu_compat(self):
+         # ex_cpu_gpu_compat.import.begin
+         from math import pi
+
+         import numba
+         from numba import cuda
+         # ex_cpu_gpu_compat.import.end
+
+         # ex_cpu_gpu_compat.allocate.begin
+         X = cuda.to_device([1, 10, 234])
+         Y = cuda.to_device([2, 2, 4014])
+         Z = cuda.to_device([3, 14, 2211])
+         results = cuda.to_device([0.0, 0.0, 0.0])
+         # ex_cpu_gpu_compat.allocate.end
+
+         # ex_cpu_gpu_compat.define.begin
+         @numba.jit
+         def business_logic(x, y, z):
+             return 4 * z * (2 * x - (4 * y) / 2 * pi)
+         # ex_cpu_gpu_compat.define.end
+
+         # ex_cpu_gpu_compat.cpurun.begin
+         print(business_logic(1, 2, 3))  # -126.79644737231007
+         # ex_cpu_gpu_compat.cpurun.end
+
+         # ex_cpu_gpu_compat.usegpu.begin
+         @cuda.jit
+         def f(res, xarr, yarr, zarr):
+             tid = cuda.grid(1)
+             if tid < len(xarr):
+                 # The function decorated with numba.jit may be directly reused
+                 res[tid] = business_logic(xarr[tid], yarr[tid], zarr[tid])
+         # ex_cpu_gpu_compat.usegpu.end
+
+         # ex_cpu_gpu_compat.launch.begin
+         f.forall(len(X))(results, X, Y, Z)
+         print(results)
+         # [-126.79644737231007, 416.28324559588634, -218912930.2987788]
+         # ex_cpu_gpu_compat.launch.end
+
+         expect = [
+             business_logic(x, y, z) for x, y, z in zip(X, Y, Z)
+         ]
+
+         np.testing.assert_equal(
+             expect,
+             results.copy_to_host()
+         )
+
+
+ if __name__ == "__main__":
+     unittest.main()
@@ -0,0 +1,82 @@
+ # Contents in this file are referenced from the sphinx-generated docs.
+ # "magictoken" is used for markers as beginning and ending of example text.
+
+ import unittest
+ from numba.cuda.testing import (CUDATestCase, skip_on_cudasim)
+ from numba.tests.support import skip_unless_cffi
+
+
+ @skip_unless_cffi
+ @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+ class TestFFI(CUDATestCase):
+     def test_ex_linking_cu(self):
+         # magictoken.ex_linking_cu.begin
+         from numba import cuda
+         import numpy as np
+         import os
+
+         # Declaration of the foreign function
+         mul = cuda.declare_device('mul_f32_f32', 'float32(float32, float32)')
+
+         # Path to the source containing the foreign function
+         # (here assumed to be in a subdirectory called "ffi")
+         basedir = os.path.dirname(os.path.abspath(__file__))
+         functions_cu = os.path.join(basedir, 'ffi', 'functions.cu')
+
+         # Kernel that links in functions.cu and calls mul
+         @cuda.jit(link=[functions_cu])
+         def multiply_vectors(r, x, y):
+             i = cuda.grid(1)
+
+             if i < len(r):
+                 r[i] = mul(x[i], y[i])
+
+         # Generate random data
+         N = 32
+         np.random.seed(1)
+         x = np.random.rand(N).astype(np.float32)
+         y = np.random.rand(N).astype(np.float32)
+         r = np.zeros_like(x)
+
+         # Run the kernel
+         multiply_vectors[1, 32](r, x, y)
+
+         # Sanity check - ensure the results match those expected
+         np.testing.assert_array_equal(r, x * y)
+         # magictoken.ex_linking_cu.end
+
+     def test_ex_from_buffer(self):
+         from numba import cuda
+         import os
+
+         basedir = os.path.dirname(os.path.abspath(__file__))
+         functions_cu = os.path.join(basedir, 'ffi', 'functions.cu')
+
+         # magictoken.ex_from_buffer_decl.begin
+         signature = 'float32(CPointer(float32), int32)'
+         sum_reduce = cuda.declare_device('sum_reduce', signature)
+         # magictoken.ex_from_buffer_decl.end
+
+         # magictoken.ex_from_buffer_kernel.begin
+         import cffi
+         ffi = cffi.FFI()
+
+         @cuda.jit(link=[functions_cu])
+         def reduction_caller(result, array):
+             array_ptr = ffi.from_buffer(array)
+             result[()] = sum_reduce(array_ptr, len(array))
+         # magictoken.ex_from_buffer_kernel.end
+
+         import numpy as np
+         x = np.arange(10).astype(np.float32)
+         r = np.ndarray((), dtype=np.float32)
+
+         reduction_caller[1, 1](r, x)
+
+         expected = np.sum(x)
+         actual = r[()]
+         np.testing.assert_allclose(expected, actual)
+
+
+ if __name__ == '__main__':
+     unittest.main()
@@ -0,0 +1,155 @@
+ import unittest
+
+ from numba.cuda.testing import (CUDATestCase, skip_if_cudadevrt_missing,
+                                 skip_on_cudasim, skip_unless_cc_60,
+                                 skip_if_mvc_enabled)
+ from numba.tests.support import captured_stdout
+
+
+ @skip_if_cudadevrt_missing
+ @skip_unless_cc_60
+ @skip_if_mvc_enabled('CG not supported with MVC')
+ @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+ class TestLaplace(CUDATestCase):
+     """
+     Test the heat equation solver (Laplace example)
+     """
+
+     def setUp(self):
+         # Prevent output from this test showing up when running the test suite
+         self._captured_stdout = captured_stdout()
+         self._captured_stdout.__enter__()
+         super().setUp()
+
+     def tearDown(self):
+         # No exception type, value, or traceback
+         self._captured_stdout.__exit__(None, None, None)
+         super().tearDown()
+
+     def test_ex_laplace(self):
+
+         # set True to regenerate the figures that
+         # accompany this example
+         plot = False
+
+         # ex_laplace.import.begin
+         import numpy as np
+         from numba import cuda
+         # ex_laplace.import.end
+
+         # ex_laplace.allocate.begin
+         # Use an odd problem size.
+         # This is so there can be an element truly in the "middle" for symmetry.
+         size = 1001
+         data = np.zeros(size)
+
+         # Middle element is made very hot
+         data[500] = 10000
+         buf_0 = cuda.to_device(data)
+
+         # This extra array is used for synchronization purposes
+         buf_1 = cuda.device_array_like(buf_0)
+
+         niter = 10000
+         # ex_laplace.allocate.end
+
+         if plot:
+             import matplotlib.pyplot as plt
+             fig, ax = plt.subplots(figsize=(16 * 0.66, 9 * 0.66))
+             plt.plot(
+                 np.arange(len(buf_0)),
+                 buf_0.copy_to_host(),
+                 lw=3,
+                 marker="*",
+                 color='black'
+             )
+
+             plt.title('Initial State', fontsize=24)
+             plt.xlabel('Position', fontsize=24)
+             plt.ylabel('Temperature', fontsize=24)
+
+             ax.set_xticks(ax.get_xticks(), fontsize=16)
+             ax.set_yticks(ax.get_yticks(), fontsize=16)
+             plt.xlim(0, len(data))
+             plt.ylim(0, 10001)
+             plt.savefig('laplace_initial.svg')
+
+         # ex_laplace.kernel.begin
+         @cuda.jit
+         def solve_heat_equation(buf_0, buf_1, timesteps, k):
+             i = cuda.grid(1)
+
+             # Don't continue if our index is outside the domain
+             if i >= len(buf_0):
+                 return
+
+             # Prepare to do a grid-wide synchronization later
+             grid = cuda.cg.this_grid()
+
+             for step in range(timesteps):
+                 # Select the buffer from the previous timestep
+                 if (step % 2) == 0:
+                     data = buf_0
+                     next_data = buf_1
+                 else:
+                     data = buf_1
+                     next_data = buf_0
+
+                 # Get the current temperature associated with this point
+                 curr_temp = data[i]
+
+                 # Apply formula from finite difference equation
+                 if i == 0:
+                     # Left wall is held at T = 0
+                     next_temp = curr_temp + k * (data[i + 1] - (2 * curr_temp))
+                 elif i == len(data) - 1:
+                     # Right wall is held at T = 0
+                     next_temp = curr_temp + k * (data[i - 1] - (2 * curr_temp))
+                 else:
+                     # Interior points are a weighted average of their neighbors
+                     next_temp = curr_temp + k * (
+                         data[i - 1] - (2 * curr_temp) + data[i + 1]
+                     )
+
+                 # Write new value to the next buffer
+                 next_data[i] = next_temp
+
+                 # Wait for every thread to write before moving on
+                 grid.sync()
+         # ex_laplace.kernel.end
+
+         # ex_laplace.launch.begin
+         solve_heat_equation.forall(len(data))(
+             buf_0, buf_1, niter, 0.25
+         )
+         # ex_laplace.launch.end
+
+         results = buf_1.copy_to_host()
+         if plot:
+             fig, ax = plt.subplots(figsize=(16 * 0.66, 9 * 0.66))
+             plt.plot(
+                 np.arange(len(results)),
+                 results, lw=3,
+                 marker="*",
+                 color='black'
+             )
+             plt.title(f"T = {niter}", fontsize=24)
+             plt.xlabel('Position', fontsize=24)
+             plt.ylabel('Temperature', fontsize=24)
+
+             ax.set_xticks(ax.get_xticks(), fontsize=16)
+             ax.set_yticks(ax.get_yticks(), fontsize=16)
+
+             plt.ylim(0, max(results))
+             plt.xlim(0, len(results))
+             plt.savefig('laplace_final.svg')
+
+         # Integral over the domain should be equal to its initial value.
+         # Note that this should match the initial value of data[500] above, but
+         # we don't assign it to a variable because that would make the example
+         # code look a bit oddly verbose.
+         np.testing.assert_allclose(results.sum(), 10000)
+
+
+ if __name__ == "__main__":
+     unittest.main()
@@ -0,0 +1,173 @@
+ """
+ Matrix multiplication example via `cuda.jit`.
+
+ Reference: https://stackoverflow.com/a/64198479/13697228 by @RobertCrovella
+
+ Contents in this file are referenced from the sphinx-generated docs.
+ "magictoken" is used for markers as beginning and ending of example text.
+ """
+ import unittest
+ from numba.cuda.testing import CUDATestCase, skip_on_cudasim
+ from numba.tests.support import captured_stdout
+
+
+ @skip_on_cudasim("cudasim doesn't support cuda import at non-top-level")
+ class TestMatMul(CUDATestCase):
+     """
+     Test matrix multiplication using simple, shared memory/square, and shared
+     memory/nonsquare cases.
+     """
+
+     def setUp(self):
+         # Prevent output from this test showing up when running the test suite
+         self._captured_stdout = captured_stdout()
+         self._captured_stdout.__enter__()
+         super().setUp()
+
+     def tearDown(self):
+         # No exception type, value, or traceback
+         self._captured_stdout.__exit__(None, None, None)
+         super().tearDown()
+
+     def test_ex_matmul(self):
+         """Test of matrix multiplication on various cases."""
+         # magictoken.ex_import.begin
+         from numba import cuda, float32
+         import numpy as np
+         import math
+         # magictoken.ex_import.end
+
+         # magictoken.ex_matmul.begin
+         @cuda.jit
+         def matmul(A, B, C):
+             """Perform square matrix multiplication of C = A * B."""
+             i, j = cuda.grid(2)
+             if i < C.shape[0] and j < C.shape[1]:
+                 tmp = 0.
+                 for k in range(A.shape[1]):
+                     tmp += A[i, k] * B[k, j]
+                 C[i, j] = tmp
+         # magictoken.ex_matmul.end
+
+         # magictoken.ex_run_matmul.begin
+         x_h = np.arange(16).reshape([4, 4])
+         y_h = np.ones([4, 4])
+         z_h = np.zeros([4, 4])
+
+         x_d = cuda.to_device(x_h)
+         y_d = cuda.to_device(y_h)
+         z_d = cuda.to_device(z_h)
+
+         threadsperblock = (16, 16)
+         blockspergrid_x = math.ceil(z_h.shape[0] / threadsperblock[0])
+         blockspergrid_y = math.ceil(z_h.shape[1] / threadsperblock[1])
+         blockspergrid = (blockspergrid_x, blockspergrid_y)
+
+         matmul[blockspergrid, threadsperblock](x_d, y_d, z_d)
+         z_h = z_d.copy_to_host()
+         print(z_h)
+         print(x_h @ y_h)
+         # magictoken.ex_run_matmul.end
+
+         # magictoken.ex_fast_matmul.begin
+         # Controls threads per block and shared memory usage.
+         # The computation will be done on blocks of TPBxTPB elements.
+         # TPB should not be larger than 32 in this example
+         TPB = 16
+
+         @cuda.jit
+         def fast_matmul(A, B, C):
+             """
+             Perform matrix multiplication of C = A * B using CUDA shared memory.
+
+             Reference: https://stackoverflow.com/a/64198479/13697228 by @RobertCrovella
+             """
+             # Define an array in the shared memory
+             # The size and type of the arrays must be known at compile time
+             sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
+             sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
+
+             x, y = cuda.grid(2)
+
+             tx = cuda.threadIdx.x
+             ty = cuda.threadIdx.y
+             bpg = cuda.gridDim.x    # blocks per grid
+
+             # Each thread computes one element in the result matrix.
+             # The dot product is chunked into dot products of TPB-long vectors.
+             tmp = float32(0.)
+             for i in range(bpg):
+                 # Preload data into shared memory
+                 sA[ty, tx] = 0
+                 sB[ty, tx] = 0
+                 if y < A.shape[0] and (tx + i * TPB) < A.shape[1]:
+                     sA[ty, tx] = A[y, tx + i * TPB]
+                 if x < B.shape[1] and (ty + i * TPB) < B.shape[0]:
+                     sB[ty, tx] = B[ty + i * TPB, x]
+
+                 # Wait until all threads finish preloading
+                 cuda.syncthreads()
+
+                 # Computes partial product on the shared memory
+                 for j in range(TPB):
+                     tmp += sA[ty, j] * sB[j, tx]
+
+                 # Wait until all threads finish computing
+                 cuda.syncthreads()
+             if y < C.shape[0] and x < C.shape[1]:
+                 C[y, x] = tmp
+         # magictoken.ex_fast_matmul.end
+
+         # magictoken.ex_run_fast_matmul.begin
+         x_h = np.arange(16).reshape([4, 4])
+         y_h = np.ones([4, 4])
+         z_h = np.zeros([4, 4])
+
+         x_d = cuda.to_device(x_h)
+         y_d = cuda.to_device(y_h)
+         z_d = cuda.to_device(z_h)
+
+         threadsperblock = (TPB, TPB)
+         blockspergrid_x = math.ceil(z_h.shape[0] / threadsperblock[0])
+         blockspergrid_y = math.ceil(z_h.shape[1] / threadsperblock[1])
+         blockspergrid = (blockspergrid_x, blockspergrid_y)
+
+         fast_matmul[blockspergrid, threadsperblock](x_d, y_d, z_d)
+         z_h = z_d.copy_to_host()
+         print(z_h)
+         print(x_h @ y_h)
+         # magictoken.ex_run_fast_matmul.end
+
+         # fast_matmul test(s)
+         msg = "fast_matmul incorrect for shared memory, square case."
+         self.assertTrue(np.all(z_h == x_h @ y_h), msg=msg)
+
+         # magictoken.ex_run_nonsquare.begin
+         x_h = np.arange(115).reshape([5, 23])
+         y_h = np.ones([23, 7])
+         z_h = np.zeros([5, 7])
+
+         x_d = cuda.to_device(x_h)
+         y_d = cuda.to_device(y_h)
+         z_d = cuda.to_device(z_h)
+
+         threadsperblock = (TPB, TPB)
+         grid_y_max = max(x_h.shape[0], y_h.shape[0])
+         grid_x_max = max(x_h.shape[1], y_h.shape[1])
+         blockspergrid_x = math.ceil(grid_x_max / threadsperblock[0])
+         blockspergrid_y = math.ceil(grid_y_max / threadsperblock[1])
+         blockspergrid = (blockspergrid_x, blockspergrid_y)
+
+         fast_matmul[blockspergrid, threadsperblock](x_d, y_d, z_d)
+         z_h = z_d.copy_to_host()
+         print(z_h)
+         print(x_h @ y_h)
+         # magictoken.ex_run_nonsquare.end
+
+         # nonsquare fast_matmul test(s)
+         msg = "fast_matmul incorrect for shared memory, non-square case."
+         self.assertTrue(np.all(z_h == x_h @ y_h), msg=msg)
+
+
+ if __name__ == '__main__':
+     unittest.main()