numba-cuda 0.0.1__py3-none-any.whl → 0.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. _numba_cuda_redirector.pth +1 -0
  2. _numba_cuda_redirector.py +74 -0
  3. numba_cuda/VERSION +1 -0
  4. numba_cuda/__init__.py +5 -0
  5. numba_cuda/_version.py +19 -0
  6. numba_cuda/numba/cuda/__init__.py +22 -0
  7. numba_cuda/numba/cuda/api.py +526 -0
  8. numba_cuda/numba/cuda/api_util.py +30 -0
  9. numba_cuda/numba/cuda/args.py +77 -0
  10. numba_cuda/numba/cuda/cg.py +62 -0
  11. numba_cuda/numba/cuda/codegen.py +378 -0
  12. numba_cuda/numba/cuda/compiler.py +422 -0
  13. numba_cuda/numba/cuda/cpp_function_wrappers.cu +47 -0
  14. numba_cuda/numba/cuda/cuda_fp16.h +3631 -0
  15. numba_cuda/numba/cuda/cuda_fp16.hpp +2465 -0
  16. numba_cuda/numba/cuda/cuda_paths.py +258 -0
  17. numba_cuda/numba/cuda/cudadecl.py +806 -0
  18. numba_cuda/numba/cuda/cudadrv/__init__.py +9 -0
  19. numba_cuda/numba/cuda/cudadrv/devicearray.py +904 -0
  20. numba_cuda/numba/cuda/cudadrv/devices.py +248 -0
  21. numba_cuda/numba/cuda/cudadrv/driver.py +3201 -0
  22. numba_cuda/numba/cuda/cudadrv/drvapi.py +398 -0
  23. numba_cuda/numba/cuda/cudadrv/dummyarray.py +452 -0
  24. numba_cuda/numba/cuda/cudadrv/enums.py +607 -0
  25. numba_cuda/numba/cuda/cudadrv/error.py +36 -0
  26. numba_cuda/numba/cuda/cudadrv/libs.py +176 -0
  27. numba_cuda/numba/cuda/cudadrv/ndarray.py +20 -0
  28. numba_cuda/numba/cuda/cudadrv/nvrtc.py +260 -0
  29. numba_cuda/numba/cuda/cudadrv/nvvm.py +707 -0
  30. numba_cuda/numba/cuda/cudadrv/rtapi.py +10 -0
  31. numba_cuda/numba/cuda/cudadrv/runtime.py +142 -0
  32. numba_cuda/numba/cuda/cudaimpl.py +1055 -0
  33. numba_cuda/numba/cuda/cudamath.py +140 -0
  34. numba_cuda/numba/cuda/decorators.py +189 -0
  35. numba_cuda/numba/cuda/descriptor.py +33 -0
  36. numba_cuda/numba/cuda/device_init.py +89 -0
  37. numba_cuda/numba/cuda/deviceufunc.py +908 -0
  38. numba_cuda/numba/cuda/dispatcher.py +1057 -0
  39. numba_cuda/numba/cuda/errors.py +59 -0
  40. numba_cuda/numba/cuda/extending.py +7 -0
  41. numba_cuda/numba/cuda/initialize.py +13 -0
  42. numba_cuda/numba/cuda/intrinsic_wrapper.py +77 -0
  43. numba_cuda/numba/cuda/intrinsics.py +198 -0
  44. numba_cuda/numba/cuda/kernels/__init__.py +0 -0
  45. numba_cuda/numba/cuda/kernels/reduction.py +262 -0
  46. numba_cuda/numba/cuda/kernels/transpose.py +65 -0
  47. numba_cuda/numba/cuda/libdevice.py +3382 -0
  48. numba_cuda/numba/cuda/libdevicedecl.py +17 -0
  49. numba_cuda/numba/cuda/libdevicefuncs.py +1057 -0
  50. numba_cuda/numba/cuda/libdeviceimpl.py +83 -0
  51. numba_cuda/numba/cuda/mathimpl.py +448 -0
  52. numba_cuda/numba/cuda/models.py +48 -0
  53. numba_cuda/numba/cuda/nvvmutils.py +235 -0
  54. numba_cuda/numba/cuda/printimpl.py +86 -0
  55. numba_cuda/numba/cuda/random.py +292 -0
  56. numba_cuda/numba/cuda/simulator/__init__.py +38 -0
  57. numba_cuda/numba/cuda/simulator/api.py +110 -0
  58. numba_cuda/numba/cuda/simulator/compiler.py +9 -0
  59. numba_cuda/numba/cuda/simulator/cudadrv/__init__.py +2 -0
  60. numba_cuda/numba/cuda/simulator/cudadrv/devicearray.py +432 -0
  61. numba_cuda/numba/cuda/simulator/cudadrv/devices.py +117 -0
  62. numba_cuda/numba/cuda/simulator/cudadrv/driver.py +62 -0
  63. numba_cuda/numba/cuda/simulator/cudadrv/drvapi.py +4 -0
  64. numba_cuda/numba/cuda/simulator/cudadrv/dummyarray.py +4 -0
  65. numba_cuda/numba/cuda/simulator/cudadrv/error.py +6 -0
  66. numba_cuda/numba/cuda/simulator/cudadrv/libs.py +2 -0
  67. numba_cuda/numba/cuda/simulator/cudadrv/nvvm.py +29 -0
  68. numba_cuda/numba/cuda/simulator/cudadrv/runtime.py +19 -0
  69. numba_cuda/numba/cuda/simulator/kernel.py +308 -0
  70. numba_cuda/numba/cuda/simulator/kernelapi.py +495 -0
  71. numba_cuda/numba/cuda/simulator/reduction.py +15 -0
  72. numba_cuda/numba/cuda/simulator/vector_types.py +58 -0
  73. numba_cuda/numba/cuda/simulator_init.py +17 -0
  74. numba_cuda/numba/cuda/stubs.py +902 -0
  75. numba_cuda/numba/cuda/target.py +440 -0
  76. numba_cuda/numba/cuda/testing.py +202 -0
  77. numba_cuda/numba/cuda/tests/__init__.py +58 -0
  78. numba_cuda/numba/cuda/tests/cudadrv/__init__.py +8 -0
  79. numba_cuda/numba/cuda/tests/cudadrv/test_array_attr.py +145 -0
  80. numba_cuda/numba/cuda/tests/cudadrv/test_context_stack.py +145 -0
  81. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_array_slicing.py +375 -0
  82. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_auto_context.py +21 -0
  83. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_devicerecord.py +179 -0
  84. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_driver.py +235 -0
  85. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_libraries.py +22 -0
  86. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +193 -0
  87. numba_cuda/numba/cuda/tests/cudadrv/test_cuda_ndarray.py +547 -0
  88. numba_cuda/numba/cuda/tests/cudadrv/test_deallocations.py +249 -0
  89. numba_cuda/numba/cuda/tests/cudadrv/test_detect.py +81 -0
  90. numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +192 -0
  91. numba_cuda/numba/cuda/tests/cudadrv/test_events.py +38 -0
  92. numba_cuda/numba/cuda/tests/cudadrv/test_host_alloc.py +65 -0
  93. numba_cuda/numba/cuda/tests/cudadrv/test_init.py +139 -0
  94. numba_cuda/numba/cuda/tests/cudadrv/test_inline_ptx.py +37 -0
  95. numba_cuda/numba/cuda/tests/cudadrv/test_is_fp16.py +12 -0
  96. numba_cuda/numba/cuda/tests/cudadrv/test_linker.py +317 -0
  97. numba_cuda/numba/cuda/tests/cudadrv/test_managed_alloc.py +127 -0
  98. numba_cuda/numba/cuda/tests/cudadrv/test_mvc.py +54 -0
  99. numba_cuda/numba/cuda/tests/cudadrv/test_nvvm_driver.py +199 -0
  100. numba_cuda/numba/cuda/tests/cudadrv/test_pinned.py +37 -0
  101. numba_cuda/numba/cuda/tests/cudadrv/test_profiler.py +20 -0
  102. numba_cuda/numba/cuda/tests/cudadrv/test_ptds.py +149 -0
  103. numba_cuda/numba/cuda/tests/cudadrv/test_reset_device.py +36 -0
  104. numba_cuda/numba/cuda/tests/cudadrv/test_runtime.py +85 -0
  105. numba_cuda/numba/cuda/tests/cudadrv/test_select_device.py +41 -0
  106. numba_cuda/numba/cuda/tests/cudadrv/test_streams.py +122 -0
  107. numba_cuda/numba/cuda/tests/cudapy/__init__.py +8 -0
  108. numba_cuda/numba/cuda/tests/cudapy/cache_usecases.py +234 -0
  109. numba_cuda/numba/cuda/tests/cudapy/cache_with_cpu_usecases.py +41 -0
  110. numba_cuda/numba/cuda/tests/cudapy/extensions_usecases.py +58 -0
  111. numba_cuda/numba/cuda/tests/cudapy/jitlink.ptx +30 -0
  112. numba_cuda/numba/cuda/tests/cudapy/recursion_usecases.py +100 -0
  113. numba_cuda/numba/cuda/tests/cudapy/test_alignment.py +42 -0
  114. numba_cuda/numba/cuda/tests/cudapy/test_array.py +260 -0
  115. numba_cuda/numba/cuda/tests/cudapy/test_array_args.py +201 -0
  116. numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py +35 -0
  117. numba_cuda/numba/cuda/tests/cudapy/test_atomics.py +1620 -0
  118. numba_cuda/numba/cuda/tests/cudapy/test_blackscholes.py +120 -0
  119. numba_cuda/numba/cuda/tests/cudapy/test_boolean.py +24 -0
  120. numba_cuda/numba/cuda/tests/cudapy/test_caching.py +545 -0
  121. numba_cuda/numba/cuda/tests/cudapy/test_casting.py +257 -0
  122. numba_cuda/numba/cuda/tests/cudapy/test_cffi.py +33 -0
  123. numba_cuda/numba/cuda/tests/cudapy/test_compiler.py +276 -0
  124. numba_cuda/numba/cuda/tests/cudapy/test_complex.py +296 -0
  125. numba_cuda/numba/cuda/tests/cudapy/test_complex_kernel.py +20 -0
  126. numba_cuda/numba/cuda/tests/cudapy/test_const_string.py +129 -0
  127. numba_cuda/numba/cuda/tests/cudapy/test_constmem.py +176 -0
  128. numba_cuda/numba/cuda/tests/cudapy/test_cooperative_groups.py +147 -0
  129. numba_cuda/numba/cuda/tests/cudapy/test_cuda_array_interface.py +435 -0
  130. numba_cuda/numba/cuda/tests/cudapy/test_cuda_jit_no_types.py +90 -0
  131. numba_cuda/numba/cuda/tests/cudapy/test_datetime.py +94 -0
  132. numba_cuda/numba/cuda/tests/cudapy/test_debug.py +101 -0
  133. numba_cuda/numba/cuda/tests/cudapy/test_debuginfo.py +221 -0
  134. numba_cuda/numba/cuda/tests/cudapy/test_device_func.py +222 -0
  135. numba_cuda/numba/cuda/tests/cudapy/test_dispatcher.py +700 -0
  136. numba_cuda/numba/cuda/tests/cudapy/test_enums.py +121 -0
  137. numba_cuda/numba/cuda/tests/cudapy/test_errors.py +79 -0
  138. numba_cuda/numba/cuda/tests/cudapy/test_exception.py +174 -0
  139. numba_cuda/numba/cuda/tests/cudapy/test_extending.py +155 -0
  140. numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py +244 -0
  141. numba_cuda/numba/cuda/tests/cudapy/test_forall.py +52 -0
  142. numba_cuda/numba/cuda/tests/cudapy/test_freevar.py +29 -0
  143. numba_cuda/numba/cuda/tests/cudapy/test_frexp_ldexp.py +66 -0
  144. numba_cuda/numba/cuda/tests/cudapy/test_globals.py +60 -0
  145. numba_cuda/numba/cuda/tests/cudapy/test_gufunc.py +456 -0
  146. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scalar.py +159 -0
  147. numba_cuda/numba/cuda/tests/cudapy/test_gufunc_scheduling.py +95 -0
  148. numba_cuda/numba/cuda/tests/cudapy/test_idiv.py +37 -0
  149. numba_cuda/numba/cuda/tests/cudapy/test_inspect.py +165 -0
  150. numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py +1106 -0
  151. numba_cuda/numba/cuda/tests/cudapy/test_ipc.py +318 -0
  152. numba_cuda/numba/cuda/tests/cudapy/test_iterators.py +99 -0
  153. numba_cuda/numba/cuda/tests/cudapy/test_lang.py +64 -0
  154. numba_cuda/numba/cuda/tests/cudapy/test_laplace.py +119 -0
  155. numba_cuda/numba/cuda/tests/cudapy/test_libdevice.py +187 -0
  156. numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py +199 -0
  157. numba_cuda/numba/cuda/tests/cudapy/test_localmem.py +164 -0
  158. numba_cuda/numba/cuda/tests/cudapy/test_mandel.py +37 -0
  159. numba_cuda/numba/cuda/tests/cudapy/test_math.py +786 -0
  160. numba_cuda/numba/cuda/tests/cudapy/test_matmul.py +74 -0
  161. numba_cuda/numba/cuda/tests/cudapy/test_minmax.py +113 -0
  162. numba_cuda/numba/cuda/tests/cudapy/test_montecarlo.py +22 -0
  163. numba_cuda/numba/cuda/tests/cudapy/test_multigpu.py +140 -0
  164. numba_cuda/numba/cuda/tests/cudapy/test_multiprocessing.py +46 -0
  165. numba_cuda/numba/cuda/tests/cudapy/test_multithreads.py +101 -0
  166. numba_cuda/numba/cuda/tests/cudapy/test_nondet.py +49 -0
  167. numba_cuda/numba/cuda/tests/cudapy/test_operator.py +401 -0
  168. numba_cuda/numba/cuda/tests/cudapy/test_optimization.py +86 -0
  169. numba_cuda/numba/cuda/tests/cudapy/test_overload.py +335 -0
  170. numba_cuda/numba/cuda/tests/cudapy/test_powi.py +124 -0
  171. numba_cuda/numba/cuda/tests/cudapy/test_print.py +128 -0
  172. numba_cuda/numba/cuda/tests/cudapy/test_py2_div_issue.py +33 -0
  173. numba_cuda/numba/cuda/tests/cudapy/test_random.py +104 -0
  174. numba_cuda/numba/cuda/tests/cudapy/test_record_dtype.py +610 -0
  175. numba_cuda/numba/cuda/tests/cudapy/test_recursion.py +125 -0
  176. numba_cuda/numba/cuda/tests/cudapy/test_reduction.py +76 -0
  177. numba_cuda/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +83 -0
  178. numba_cuda/numba/cuda/tests/cudapy/test_serialize.py +85 -0
  179. numba_cuda/numba/cuda/tests/cudapy/test_slicing.py +37 -0
  180. numba_cuda/numba/cuda/tests/cudapy/test_sm.py +444 -0
  181. numba_cuda/numba/cuda/tests/cudapy/test_sm_creation.py +205 -0
  182. numba_cuda/numba/cuda/tests/cudapy/test_sync.py +271 -0
  183. numba_cuda/numba/cuda/tests/cudapy/test_transpose.py +80 -0
  184. numba_cuda/numba/cuda/tests/cudapy/test_ufuncs.py +277 -0
  185. numba_cuda/numba/cuda/tests/cudapy/test_userexc.py +47 -0
  186. numba_cuda/numba/cuda/tests/cudapy/test_vector_type.py +307 -0
  187. numba_cuda/numba/cuda/tests/cudapy/test_vectorize.py +283 -0
  188. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_complex.py +20 -0
  189. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_decor.py +69 -0
  190. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_device.py +36 -0
  191. numba_cuda/numba/cuda/tests/cudapy/test_vectorize_scalar_arg.py +37 -0
  192. numba_cuda/numba/cuda/tests/cudapy/test_warning.py +139 -0
  193. numba_cuda/numba/cuda/tests/cudapy/test_warp_ops.py +276 -0
  194. numba_cuda/numba/cuda/tests/cudasim/__init__.py +6 -0
  195. numba_cuda/numba/cuda/tests/cudasim/support.py +6 -0
  196. numba_cuda/numba/cuda/tests/cudasim/test_cudasim_issues.py +102 -0
  197. numba_cuda/numba/cuda/tests/data/__init__.py +0 -0
  198. numba_cuda/numba/cuda/tests/data/cuda_include.cu +5 -0
  199. numba_cuda/numba/cuda/tests/data/error.cu +7 -0
  200. numba_cuda/numba/cuda/tests/data/jitlink.cu +23 -0
  201. numba_cuda/numba/cuda/tests/data/jitlink.ptx +51 -0
  202. numba_cuda/numba/cuda/tests/data/warn.cu +7 -0
  203. numba_cuda/numba/cuda/tests/doc_examples/__init__.py +6 -0
  204. numba_cuda/numba/cuda/tests/doc_examples/ffi/__init__.py +0 -0
  205. numba_cuda/numba/cuda/tests/doc_examples/ffi/functions.cu +49 -0
  206. numba_cuda/numba/cuda/tests/doc_examples/test_cg.py +77 -0
  207. numba_cuda/numba/cuda/tests/doc_examples/test_cpu_gpu_compat.py +76 -0
  208. numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py +82 -0
  209. numba_cuda/numba/cuda/tests/doc_examples/test_laplace.py +155 -0
  210. numba_cuda/numba/cuda/tests/doc_examples/test_matmul.py +173 -0
  211. numba_cuda/numba/cuda/tests/doc_examples/test_montecarlo.py +109 -0
  212. numba_cuda/numba/cuda/tests/doc_examples/test_random.py +59 -0
  213. numba_cuda/numba/cuda/tests/doc_examples/test_reduction.py +76 -0
  214. numba_cuda/numba/cuda/tests/doc_examples/test_sessionize.py +130 -0
  215. numba_cuda/numba/cuda/tests/doc_examples/test_ufunc.py +50 -0
  216. numba_cuda/numba/cuda/tests/doc_examples/test_vecadd.py +73 -0
  217. numba_cuda/numba/cuda/tests/nocuda/__init__.py +8 -0
  218. numba_cuda/numba/cuda/tests/nocuda/test_dummyarray.py +359 -0
  219. numba_cuda/numba/cuda/tests/nocuda/test_function_resolution.py +36 -0
  220. numba_cuda/numba/cuda/tests/nocuda/test_import.py +49 -0
  221. numba_cuda/numba/cuda/tests/nocuda/test_library_lookup.py +238 -0
  222. numba_cuda/numba/cuda/tests/nocuda/test_nvvm.py +54 -0
  223. numba_cuda/numba/cuda/types.py +37 -0
  224. numba_cuda/numba/cuda/ufuncs.py +662 -0
  225. numba_cuda/numba/cuda/vector_types.py +209 -0
  226. numba_cuda/numba/cuda/vectorizers.py +252 -0
  227. numba_cuda-0.0.13.dist-info/LICENSE +25 -0
  228. numba_cuda-0.0.13.dist-info/METADATA +69 -0
  229. numba_cuda-0.0.13.dist-info/RECORD +231 -0
  230. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.13.dist-info}/WHEEL +1 -1
  231. numba_cuda-0.0.1.dist-info/METADATA +0 -10
  232. numba_cuda-0.0.1.dist-info/RECORD +0 -5
  233. {numba_cuda-0.0.1.dist-info → numba_cuda-0.0.13.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,62 @@
1
+ '''
2
+ Most of the driver API is unsupported in the simulator, but some stubs are
3
+ provided to allow tests to import correctly.
4
+ '''
5
+
6
+
7
def device_memset(dst, val, size, stream=0):
    '''
    Set the first ``size`` bytes of ``dst`` to the byte value ``val``.

    ``dst`` is reinterpreted as unsigned bytes through ``dst.view('u1')``.
    ``val`` goes through ``bytes([val])``, so a value outside ``range(256)``
    raises ``ValueError``. ``stream`` is accepted but not used.
    '''
    target = dst.view('u1')[:size]
    fill_byte = bytes([val])[0]
    target.fill(fill_byte)
9
+
10
+
11
def host_to_device(dst, src, size, stream=0):
    '''
    Copy the first ``size`` bytes of ``src`` into the first ``size`` bytes of
    ``dst``.

    Both arguments are viewed as unsigned bytes (``view('u1')``) so the copy is
    independent of their dtypes. ``stream`` is accepted but not used.
    '''
    payload = src.view('u1')[:size]
    dst.view('u1')[:size] = payload
13
+
14
+
15
def device_to_host(dst, src, size, stream=0):
    '''
    Copy ``size`` bytes from ``src`` to ``dst``.

    The work is delegated to host_to_device(); ``stream`` is accepted but is
    not forwarded.
    '''
    nbytes = size
    host_to_device(dst, src, nbytes)
17
+
18
+
19
def device_memory_size(obj):
    '''
    Return the product of ``obj.itemsize`` and ``obj.size``.
    '''
    bytes_per_item = obj.itemsize
    item_count = obj.size
    return bytes_per_item * item_count
21
+
22
+
23
def device_to_device(dst, src, size, stream=0):
    '''
    Copy ``size`` bytes from ``src`` to ``dst``.

    The work is delegated to host_to_device(); ``stream`` is accepted but is
    not forwarded.
    '''
    nbytes = size
    host_to_device(dst, src, nbytes)
25
+
26
+
27
class FakeDriver(object):
    '''
    Minimal driver stub exposing only get_device_count().
    '''

    def get_device_count(self):
        '''Return the device count, which is always 1 here.'''
        device_count = 1
        return device_count


# Module-level driver instance.
driver = FakeDriver()
33
+
34
+
35
class Linker:
    '''
    Stub linker for the simulator.

    It performs no linking; it only provides the ``new()`` constructor and
    the ``lto`` property.
    '''

    @classmethod
    def new(cls, max_registers=0, lineinfo=False, cc=None):
        '''
        Create a linker instance.

        ``max_registers``, ``lineinfo`` and ``cc`` are accepted but ignored.
        The instance is built from ``cls`` so that calling ``new()`` on a
        subclass returns an instance of that subclass rather than a plain
        ``Linker``.
        '''
        return cls()

    @property
    def lto(self):
        '''Always ``False`` for this stub.'''
        return False
43
+
44
+
45
class LinkerError(RuntimeError):
    '''Placeholder ``LinkerError`` type; a plain ``RuntimeError`` subclass.'''
47
+
48
+
49
class NvrtcError(RuntimeError):
    '''Placeholder ``NvrtcError`` type; a plain ``RuntimeError`` subclass.'''
51
+
52
+
53
class CudaAPIError(RuntimeError):
    '''Placeholder ``CudaAPIError`` type; a plain ``RuntimeError`` subclass.'''
55
+
56
+
57
def launch_kernel(*args, **kwargs):
    '''
    Always raise ``RuntimeError``.

    Any positional or keyword arguments are accepted and ignored.
    '''
    raise RuntimeError(
        'Launching kernels directly is not supported in the simulator'
    )
60
+
61
+
62
+ USE_NV_BINDING = False
@@ -0,0 +1,4 @@
1
+ '''
2
+ drvapi is not implemented in the simulator, but this module exists to allow
3
+ tests to import correctly.
4
+ '''
@@ -0,0 +1,4 @@
1
+ # Dummy arrays are not implemented in the simulator. This file allows the dummy
2
+ # array tests to be imported, but they are skipped on the simulator.
3
+
4
+ Array = None
@@ -0,0 +1,6 @@
1
class CudaSupportError(RuntimeError):
    '''``CudaSupportError`` type; a plain ``RuntimeError`` subclass.'''
3
+
4
+
5
class NvrtcError(Exception):
    '''``NvrtcError`` type; a plain ``Exception`` subclass.'''
@@ -0,0 +1,2 @@
1
def check_static_lib(lib):
    '''
    Always raise ``FileNotFoundError``; ``lib`` is not inspected.
    '''
    message = 'Linking libraries not supported by cudasim'
    raise FileNotFoundError(message)
@@ -0,0 +1,29 @@
1
+ '''
2
+ NVVM is not supported in the simulator, but stubs are provided to allow tests
3
+ to import correctly.
4
+ '''
5
+
6
+
7
class NvvmSupportError(ImportError):
    '''``NvvmSupportError`` type; a plain ``ImportError`` subclass.'''
9
+
10
+
11
class NVVM(object):
    '''
    Stub NVVM class that cannot be instantiated.
    '''

    def __init__(self):
        '''Always raise ``NvvmSupportError``.'''
        reason = 'NVVM not supported in the simulator'
        raise NvvmSupportError(reason)
14
+
15
+
16
+ CompilationUnit = None
17
+ compile_ir = None
18
+ set_cuda_kernel = None
19
+ get_arch_option = None
20
+ LibDevice = None
21
+ NvvmError = None
22
+
23
+
24
def is_available():
    '''Return ``False`` unconditionally.'''
    available = False
    return available
26
+
27
+
28
def get_supported_ccs():
    '''Return an empty tuple.'''
    return tuple()
@@ -0,0 +1,19 @@
1
+ '''
2
+ The runtime API is unsupported in the simulator, but some stubs are
3
+ provided to allow tests to import correctly.
4
+ '''
5
+
6
+
7
class FakeRuntime(object):
    '''
    Runtime stub that reports a placeholder version of ``(-1, -1)``.
    '''

    def get_version(self):
        '''Return the placeholder version pair ``(-1, -1)``.'''
        major, minor = -1, -1
        return (major, minor)

    def is_supported_version(self):
        '''Return ``True`` unconditionally.'''
        return True

    @property
    def supported_versions(self):
        '''A one-element tuple holding the placeholder version pair.'''
        return ((-1, -1),)


# Module-level runtime instance.
runtime = FakeRuntime()
@@ -0,0 +1,308 @@
1
+ from contextlib import contextmanager
2
+ import functools
3
+ import sys
4
+ import threading
5
+
6
+ import numpy as np
7
+
8
+ from .cudadrv.devicearray import FakeCUDAArray, FakeWithinKernelCUDAArray
9
+ from .kernelapi import Dim3, FakeCUDAModule, swapped_cuda_module
10
+ from ..errors import normalize_kernel_dimensions
11
+ from ..args import wrap_arg, ArgHint
12
+
13
+
14
+ """
15
+ Global variable to keep track of the current "kernel context", i.e. the
16
+ FakeCUDAModule. We only support one kernel launch at a time.
17
+ No support for concurrent kernel launch.
18
+ """
19
+ _kernel_context = None
20
+
21
+
22
@contextmanager
def _push_kernel_context(mod):
    """
    Install ``mod`` as the module-level kernel context for the duration of the
    ``with`` block.

    Asserts that no context is currently installed, and resets the context to
    ``None`` on exit even if the body raises.
    """
    global _kernel_context
    assert _kernel_context is None, "concurrent simulated kernel not supported"
    _kernel_context = mod
    try:
        yield None
    finally:
        # Always clear the context so that a later launch can install its own.
        _kernel_context = None
34
+
35
+
36
def _get_kernel_context():
    """
    Return the kernel context currently installed by _push_kernel_context(),
    or ``None`` when there is none.
    """
    current = _kernel_context
    return current
41
+
42
+
43
class FakeOverload:
    '''
    Stand-in overload object whose only feature is the
    max_cooperative_grid_blocks method.
    '''

    def max_cooperative_grid_blocks(self, blockdim):
        '''
        Return 1 regardless of ``blockdim``.

        Only a single block can run in a cooperative grid because there is no
        mechanism for synchronization between different blocks.
        '''
        max_blocks = 1
        return max_blocks
51
+
52
+
53
class FakeOverloadDict(dict):
    '''
    Dictionary whose item lookup ignores the key and never fails.
    '''

    def __getitem__(self, key):
        '''
        Return a new FakeOverload for any ``key``.

        Overloads are not tracked in the simulator, so every signature maps
        to a fresh fake overload.
        '''
        overload = FakeOverload()
        return overload
58
+
59
+
60
class FakeCUDAKernel(object):
    '''
    Wraps a @cuda.jit-ed function.

    When ``device`` is truthy, calling the object runs the wrapped Python
    function directly with the current kernel context swapped in. Otherwise
    a call runs the function one block at a time through BlockManager, using
    the launch configuration last set with ``kernel[griddim, blockdim, ...]``.
    '''

    def __init__(self, fn, device, fastmath=False, extensions=[], debug=False):
        # The shared default list is never mutated: it is copied below.
        self.fn = fn
        self._device = device
        self._fastmath = fastmath
        self._debug = debug
        self.extensions = list(extensions)  # defensive copy
        # Initial configuration: grid unconfigured, stream 0, no dynamic shared
        # memory.
        self.grid_dim = None
        self.block_dim = None
        self.stream = 0
        self.dynshared_size = 0
        functools.update_wrapper(self, fn)

    def __call__(self, *args):
        '''
        Run the wrapped function.

        For a device function the return value of the function is returned.
        For a kernel nothing is returned; the callbacks collected in ``retr``
        while preparing the arguments are invoked after all blocks have run.
        '''
        if self._device:
            with swapped_cuda_module(self.fn, _get_kernel_context()):
                return self.fn(*args)

        # Ensure we've been given a valid grid configuration
        grid_dim, block_dim = normalize_kernel_dimensions(self.grid_dim,
                                                          self.block_dim)

        fake_cuda_module = FakeCUDAModule(grid_dim, block_dim,
                                          self.dynshared_size)
        with _push_kernel_context(fake_cuda_module):
            # fake_args substitutes all numpy arrays for FakeCUDAArrays
            # because they implement some semantics differently
            retr = []

            def fake_arg(arg):
                # map the arguments using any extension you've registered
                _, arg = functools.reduce(
                    lambda ty_val, extension: extension.prepare_args(
                        *ty_val,
                        stream=0,
                        retr=retr),
                    self.extensions,
                    (None, arg)
                )

                if isinstance(arg, np.ndarray) and arg.ndim > 0:
                    ret = wrap_arg(arg).to_device(retr)
                elif isinstance(arg, ArgHint):
                    ret = arg.to_device(retr)
                elif isinstance(arg, np.void):
                    ret = FakeCUDAArray(arg)  # In case a np record comes in.
                else:
                    ret = arg
                if isinstance(ret, FakeCUDAArray):
                    return FakeWithinKernelCUDAArray(ret)
                return ret

            fake_args = [fake_arg(arg) for arg in args]
            with swapped_cuda_module(self.fn, fake_cuda_module):
                # Execute one block at a time
                for grid_point in np.ndindex(*grid_dim):
                    bm = BlockManager(self.fn, grid_dim, block_dim, self._debug)
                    bm.run(grid_point, *fake_args)

            # Run the callbacks registered while the arguments were prepared.
            for wb in retr:
                wb()

    def __getitem__(self, configuration):
        '''
        Configure the launch as ``kernel[griddim, blockdim[, stream[, shmem]]]``
        and return ``self``.

        The configuration is stored on this object rather than on a copy. The
        dynamic shared memory size is taken from the fourth item when four
        items are given and is reset to 0 otherwise, so that a size given in
        an earlier configuration does not leak into a later launch that did
        not ask for any.
        '''
        self.grid_dim, self.block_dim = \
            normalize_kernel_dimensions(*configuration[:2])

        if len(configuration) == 4:
            self.dynshared_size = configuration[3]
        else:
            self.dynshared_size = 0

        return self

    def bind(self):
        '''No-op in the simulator.'''
        pass

    def specialize(self, *args):
        '''Return ``self``; the arguments are ignored.'''
        return self

    def forall(self, ntasks, tpb=0, stream=0, sharedmem=0):
        '''
        Configure a launch of ``ntasks`` blocks of one thread each and return
        the configured kernel.

        ``tpb`` is accepted but not used. Raises ``ValueError`` when ``ntasks``
        is negative.
        '''
        if ntasks < 0:
            raise ValueError("Can't create ForAll with negative task count: %s"
                             % ntasks)
        return self[ntasks, 1, stream, sharedmem]

    @property
    def overloads(self):
        '''A new FakeOverloadDict on every access.'''
        return FakeOverloadDict()

    @property
    def py_func(self):
        '''The wrapped Python function.'''
        return self.fn
156
+
157
+
158
+ # Thread emulation
159
+
160
class BlockThread(threading.Thread):
    '''
    Manages the execution of a function for a single CUDA thread.

    An exception raised by the function is not propagated by the thread
    itself; it is stored in ``self.exception`` as an ``(exception, traceback)``
    pair for the manager to re-raise.
    '''
    def __init__(self, f, manager, blockIdx, threadIdx, debug):
        if debug:
            def debug_wrapper(*args, **kwargs):
                # In debug mode NumPy division errors raise instead of warn.
                np.seterr(divide='raise')
                f(*args, **kwargs)
            target = debug_wrapper
        else:
            target = f

        super(BlockThread, self).__init__(target=target)
        self.syncthreads_event = threading.Event()
        self.syncthreads_blocked = False
        self._manager = manager
        self.blockIdx = Dim3(*blockIdx)
        self.threadIdx = Dim3(*threadIdx)
        self.exception = None
        self.daemon = True
        self.abort = False
        self.debug = debug
        # Linear index of this thread within its block (x varies fastest).
        blockDim = Dim3(*self._manager._block_dim)
        self.thread_id = self.threadIdx.x + (blockDim.x * (self.threadIdx.y +
                                                           blockDim.y *
                                                           self.threadIdx.z))

    def run(self):
        '''
        Run the target and record any exception it raises.

        The recorded exception is a new instance of the same type whose
        message is prefixed with the thread and block indices. If that type
        cannot be constructed from a single message, the original exception
        object is recorded instead, so that the failure is never lost.
        '''
        try:
            super(BlockThread, self).run()
        except Exception as e:
            tid = 'tid=%s' % list(self.threadIdx)
            ctaid = 'ctaid=%s' % list(self.blockIdx)
            if str(e) == '':
                msg = '%s %s' % (tid, ctaid)
            else:
                msg = '%s %s: %s' % (tid, ctaid, e)
            tb = sys.exc_info()[2]
            try:
                annotated = type(e)(msg)
            except Exception:
                # Some exception types need more than one constructor
                # argument. Failing here would leave self.exception unset and
                # hide the error from the manager, so keep the original.
                annotated = e
            # Using `with_traceback` here would cause it to be mutated by
            # future raise statements, which may or may not matter.
            self.exception = (annotated, tb)

    def syncthreads(self):
        '''
        Mark this thread as blocked and wait until its event is set.

        Raises ``RuntimeError`` if the abort flag is set before or after the
        wait.
        '''

        if self.abort:
            raise RuntimeError("abort flag set on syncthreads call")

        self.syncthreads_blocked = True
        self.syncthreads_event.wait()
        self.syncthreads_event.clear()

        if self.abort:
            raise RuntimeError("abort flag set on syncthreads clear")

    def syncthreads_count(self, value):
        '''
        Synchronize and return the number of non-zero entries in the
        manager's ``block_state`` after this thread has stored ``value``.
        '''
        idx = self.threadIdx.x, self.threadIdx.y, self.threadIdx.z
        self._manager.block_state[idx] = value
        self.syncthreads()
        count = np.count_nonzero(self._manager.block_state)
        # Second barrier: presumably keeps block_state stable until every
        # thread has read it.
        self.syncthreads()
        return count

    def syncthreads_and(self, value):
        '''
        Synchronize and return 1 if every entry of the manager's
        ``block_state`` is true after storing ``value``, else 0.
        '''
        idx = self.threadIdx.x, self.threadIdx.y, self.threadIdx.z
        self._manager.block_state[idx] = value
        self.syncthreads()
        test = np.all(self._manager.block_state)
        self.syncthreads()
        return 1 if test else 0

    def syncthreads_or(self, value):
        '''
        Synchronize and return 1 if any entry of the manager's
        ``block_state`` is true after storing ``value``, else 0.
        '''
        idx = self.threadIdx.x, self.threadIdx.y, self.threadIdx.z
        self._manager.block_state[idx] = value
        self.syncthreads()
        test = np.any(self._manager.block_state)
        self.syncthreads()
        return 1 if test else 0

    def __str__(self):
        return 'Thread <<<%s, %s>>>' % (self.blockIdx, self.threadIdx)
241
+
242
+
243
class BlockManager(object):
    '''
    Runs every thread of one thread block.

    run() starts one BlockThread per point of the block. A thread that reaches
    syncthreads() sets its syncthreads_blocked flag and waits on its
    syncthreads_event.

    The manager polls the live threads. Once every live thread is blocked it
    releases them all, by setting each thread's syncthreads_blocked back to
    False and setting its syncthreads_event.

    Polling continues until no thread is alive. If a thread has recorded an
    exception, all threads are told to abort and the exception is re-raised.
    '''
    def __init__(self, f, grid_dim, block_dim, debug):
        self._f = f
        self._grid_dim = grid_dim
        self._block_dim = block_dim
        self._debug = debug
        # One boolean per thread; written by the syncthreads_* methods of
        # BlockThread.
        self.block_state = np.zeros(block_dim, dtype=np.bool_)

    def run(self, grid_point, *args):
        '''
        Execute the function for every thread of the block at ``grid_point``,
        passing ``args`` to each thread.
        '''
        def target():
            self._f(*args)

        # Create and start all threads
        all_threads = set()
        alive = set()
        waiting = set()
        for block_point in np.ndindex(*self._block_dim):
            thread = BlockThread(target, self, grid_point, block_point,
                                 self._debug)
            thread.start()
            all_threads.add(thread)
            alive.add(thread)

        # Potential optimisations:
        # 1. Continue the while loop immediately after finding a blocked thread
        # 2. Don't poll already-blocked threads
        while alive:
            for thread in alive:
                if thread.syncthreads_blocked:
                    waiting.add(thread)
                elif thread.exception:

                    # Abort all other simulator threads on exception,
                    # do *not* join immediately to facilitate debugging.
                    for other in all_threads:
                        other.abort = True
                        other.syncthreads_blocked = False
                        other.syncthreads_event.set()

                    exc, tb = thread.exception
                    raise exc.with_traceback(tb)
            if alive == waiting:
                # Every live thread is blocked: release them all.
                for thread in waiting:
                    thread.syncthreads_blocked = False
                    thread.syncthreads_event.set()
                waiting = set()
            alive = {thread for thread in alive if thread.is_alive()}
        # Final check for exceptions in case any were set prior to thread
        # finishing, before we could check it
        for thread in all_threads:
            if thread.exception:
                exc, tb = thread.exception
                raise exc.with_traceback(tb)